diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,1058 +1,7847 @@ { - "best_global_step": 1038, - "best_metric": 1.032158374786377, - "best_model_checkpoint": "C:\\unity_train\\output\\unity-coder-adapter\\checkpoint-1038", - "epoch": 1.0, + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, "eval_steps": 500, - "global_step": 1038, + "global_step": 7839, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "entropy": 1.0811691626906395, - "epoch": 0.00963623223319682, - "grad_norm": 0.35738348960876465, - "learning_rate": 1.153846153846154e-05, - "loss": 2.42077693939209, - "mean_token_accuracy": 0.5671707078814506, - "num_tokens": 59463.0, + "entropy": 1.0787046998739243, + "epoch": 0.003827018752391887, + "grad_norm": 0.37200024724006653, + "learning_rate": 4.591836734693878e-06, + "loss": 2.590205955505371, + "mean_token_accuracy": 0.5478626236319541, + "num_tokens": 43996.0, "step": 10 }, { - "entropy": 1.0596500962972641, - "epoch": 0.01927246446639364, - "grad_norm": 0.26245468854904175, - "learning_rate": 2.435897435897436e-05, - "loss": 2.3048450469970705, - "mean_token_accuracy": 0.5807798743247986, - "num_tokens": 122441.0, + "entropy": 1.1275236845016479, + "epoch": 0.007654037504783774, + "grad_norm": 0.4282406270503998, + "learning_rate": 9.693877551020408e-06, + "loss": 2.732739067077637, + "mean_token_accuracy": 0.532574575394392, + "num_tokens": 84448.0, "step": 20 }, { - "entropy": 1.1488549172878266, - "epoch": 0.02890869669959046, - "grad_norm": 0.21727126836776733, - "learning_rate": 3.717948717948718e-05, - "loss": 2.15008487701416, - "mean_token_accuracy": 0.5763025932013989, - "num_tokens": 178300.0, + "entropy": 1.1098140180110931, + "epoch": 0.011481056257175661, + "grad_norm": 0.45254817605018616, + "learning_rate": 1.479591836734694e-05, + "loss": 2.595915603637695, + "mean_token_accuracy": 0.5385765254497528, + "num_tokens": 127136.0, "step": 30 }, { - "entropy": 1.324630731344223, - "epoch": 0.03854492893278728, - "grad_norm": 0.09244390577077866, - "learning_rate": 5e-05, - "loss": 1.9429821014404296, - "mean_token_accuracy": 0.595233204215765, - "num_tokens": 240279.0, + "entropy": 1.1594089552760125, + "epoch": 0.015308075009567547, + "grad_norm": 0.26816287636756897, + "learning_rate": 1.989795918367347e-05, + "loss": 2.3643749237060545, + "mean_token_accuracy": 0.5597088657319546, + "num_tokens": 172549.0, "step": 40 }, { - "entropy": 1.54823177754879, - "epoch": 0.0481811611659841, - "grad_norm": 0.07658641040325165, - "learning_rate": 6.282051282051282e-05, - "loss": 1.8167322158813477, - "mean_token_accuracy": 0.6044709533452988, - "num_tokens": 302914.0, + "entropy": 1.2817068248987198, + "epoch": 0.019135093761959432, + "grad_norm": 0.19904343783855438, + "learning_rate": 2.5e-05, + "loss": 2.1694852828979494, + "mean_token_accuracy": 0.5605865910649299, + "num_tokens": 218317.0, "step": 50 }, { - "entropy": 1.6716329783201218, - "epoch": 0.05781739339918092, - "grad_norm": 0.0950954258441925, - "learning_rate": 7.564102564102564e-05, - "loss": 1.7055500030517579, - "mean_token_accuracy": 0.6348149433732033, - "num_tokens": 351888.0, + "entropy": 1.356394973397255, + "epoch": 0.022962112514351322, + "grad_norm": 0.21708081662654877, + "learning_rate": 3.0102040816326533e-05, + "loss": 2.0824514389038087, + "mean_token_accuracy": 0.584179612249136, + "num_tokens": 255107.0, "step": 60 }, { - "entropy": 1.519571453332901, - "epoch": 0.06745362563237774, - "grad_norm": 0.07054093480110168, - "learning_rate": 8.846153846153847e-05, - "loss": 1.5317837715148925, - "mean_token_accuracy": 0.6589120730757714, - "num_tokens": 414923.0, + "entropy": 1.4309053242206573, + "epoch": 0.026789131266743208, + "grad_norm": 0.09860006719827652, + "learning_rate": 3.520408163265306e-05, + "loss": 1.777943229675293, + "mean_token_accuracy": 0.6243460461497307, + "num_tokens": 298973.0, "step": 70 }, { - "entropy": 1.5119123458862305, - "epoch": 0.07708985786557455, - "grad_norm": 0.07167445123195648, - "learning_rate": 0.00010128205128205129, - "loss": 1.4823562622070312, - "mean_token_accuracy": 0.6631396897137165, - "num_tokens": 468593.0, + "entropy": 1.4572882741689681, + "epoch": 0.030616150019135095, + "grad_norm": 0.07813975214958191, + "learning_rate": 4.0306122448979596e-05, + "loss": 1.7085393905639648, + "mean_token_accuracy": 0.641383134573698, + "num_tokens": 343287.0, "step": 80 }, { - "entropy": 1.3805345997214318, - "epoch": 0.08672609009877139, - "grad_norm": 0.10775867104530334, - "learning_rate": 0.0001141025641025641, - "loss": 1.413505458831787, - "mean_token_accuracy": 0.6830234751105309, - "num_tokens": 525914.0, + "entropy": 1.5205285474658012, + "epoch": 0.03444316877152698, + "grad_norm": 0.08015387505292892, + "learning_rate": 4.5408163265306124e-05, + "loss": 1.680305290222168, + "mean_token_accuracy": 0.6431211873888969, + "num_tokens": 376882.0, "step": 90 }, { - "entropy": 1.4891180142760276, - "epoch": 0.0963623223319682, - "grad_norm": 0.05629459396004677, - "learning_rate": 0.00012692307692307693, - "loss": 1.5126714706420898, - "mean_token_accuracy": 0.6582696467638016, - "num_tokens": 589835.0, + "entropy": 1.529197846353054, + "epoch": 0.038270187523918864, + "grad_norm": 0.1533895879983902, + "learning_rate": 5.051020408163265e-05, + "loss": 1.538798999786377, + "mean_token_accuracy": 0.6604955434799195, + "num_tokens": 414084.0, "step": 100 }, { - "entropy": 1.4281011313199996, - "epoch": 0.10599855456516502, - "grad_norm": 0.05547759681940079, - "learning_rate": 0.00013974358974358974, - "loss": 1.44595947265625, - "mean_token_accuracy": 0.6682780802249908, - "num_tokens": 648138.0, + "entropy": 1.454574093222618, + "epoch": 0.04209720627631076, + "grad_norm": 0.08893708884716034, + "learning_rate": 5.561224489795919e-05, + "loss": 1.482753372192383, + "mean_token_accuracy": 0.6748222857713699, + "num_tokens": 451489.0, "step": 110 }, { - "entropy": 1.3978321179747581, - "epoch": 0.11563478679836184, - "grad_norm": 0.05473790690302849, - "learning_rate": 0.00015256410256410255, - "loss": 1.4433627128601074, - "mean_token_accuracy": 0.674263383448124, - "num_tokens": 702219.0, + "entropy": 1.4169663548469544, + "epoch": 0.045924225028702644, + "grad_norm": 0.10994797945022583, + "learning_rate": 6.0714285714285715e-05, + "loss": 1.4192767143249512, + "mean_token_accuracy": 0.6815833821892738, + "num_tokens": 492001.0, "step": 120 }, { - "entropy": 1.4054933816194535, - "epoch": 0.12527101903155866, - "grad_norm": 0.06133154034614563, - "learning_rate": 0.0001653846153846154, - "loss": 1.4284349441528321, - "mean_token_accuracy": 0.6733272626996041, - "num_tokens": 759042.0, + "entropy": 1.3825553365051746, + "epoch": 0.04975124378109453, + "grad_norm": 0.09565065056085587, + "learning_rate": 6.581632653061225e-05, + "loss": 1.4434242248535156, + "mean_token_accuracy": 0.6830388471484184, + "num_tokens": 534229.0, "step": 130 }, { - "entropy": 1.452716739475727, - "epoch": 0.13490725126475547, - "grad_norm": 0.05656077712774277, - "learning_rate": 0.00017820512820512823, - "loss": 1.4834243774414062, - "mean_token_accuracy": 0.6669867470860481, - "num_tokens": 822515.0, + "entropy": 1.3660246580839157, + "epoch": 0.053578262533486416, + "grad_norm": 0.09277962148189545, + "learning_rate": 7.091836734693877e-05, + "loss": 1.3881919860839844, + "mean_token_accuracy": 0.6916770502924919, + "num_tokens": 573333.0, "step": 140 }, { - "entropy": 1.2200356990098953, - "epoch": 0.1445434834979523, - "grad_norm": 0.06002284586429596, - "learning_rate": 0.00019102564102564104, - "loss": 1.2548653602600097, - "mean_token_accuracy": 0.7205198630690575, - "num_tokens": 876506.0, + "entropy": 1.3041961744427681, + "epoch": 0.0574052812858783, + "grad_norm": 0.14179331064224243, + "learning_rate": 7.60204081632653e-05, + "loss": 1.3051923751831054, + "mean_token_accuracy": 0.7079917460680007, + "num_tokens": 612198.0, "step": 150 }, { - "entropy": 1.3374179184436799, - "epoch": 0.1541797157311491, - "grad_norm": 0.07770856469869614, - "learning_rate": 0.00019979716024340772, - "loss": 1.383979606628418, - "mean_token_accuracy": 0.6905814677476882, - "num_tokens": 934413.0, + "entropy": 1.3157715648412704, + "epoch": 0.06123230003827019, + "grad_norm": 0.11061020940542221, + "learning_rate": 8.112244897959184e-05, + "loss": 1.3127019882202149, + "mean_token_accuracy": 0.6986684441566468, + "num_tokens": 654309.0, "step": 160 }, { - "entropy": 1.275234666466713, - "epoch": 0.16381594796434595, - "grad_norm": 0.06369830667972565, - "learning_rate": 0.0001991210277214334, - "loss": 1.2860990524291993, - "mean_token_accuracy": 0.7051964432001114, - "num_tokens": 992594.0, + "entropy": 1.2894421368837357, + "epoch": 0.06505931879066207, + "grad_norm": 0.12903185188770294, + "learning_rate": 8.622448979591838e-05, + "loss": 1.3279677391052247, + "mean_token_accuracy": 0.7028318449854851, + "num_tokens": 694076.0, "step": 170 }, { - "entropy": 1.3504198059439658, - "epoch": 0.17345218019754277, - "grad_norm": 0.07278598099946976, - "learning_rate": 0.0001984448951994591, - "loss": 1.3741103172302247, - "mean_token_accuracy": 0.6862095996737481, - "num_tokens": 1048524.0, + "entropy": 1.2676123276352882, + "epoch": 0.06888633754305395, + "grad_norm": 0.10816285014152527, + "learning_rate": 9.13265306122449e-05, + "loss": 1.3254461288452148, + "mean_token_accuracy": 0.7063754379749299, + "num_tokens": 733806.0, "step": 180 }, { - "entropy": 1.2644047453999518, - "epoch": 0.1830884124307396, - "grad_norm": 0.07138558477163315, - "learning_rate": 0.00019776876267748477, - "loss": 1.2995844841003419, - "mean_token_accuracy": 0.706048458814621, - "num_tokens": 1109157.0, + "entropy": 1.0988808318972587, + "epoch": 0.07271335629544584, + "grad_norm": 0.08472651243209839, + "learning_rate": 9.642857142857143e-05, + "loss": 1.1510659217834474, + "mean_token_accuracy": 0.7411173984408379, + "num_tokens": 771622.0, "step": 190 }, { - "entropy": 1.3248096346855163, - "epoch": 0.1927246446639364, - "grad_norm": 0.08208955079317093, - "learning_rate": 0.00019709263015551048, - "loss": 1.3252597808837892, - "mean_token_accuracy": 0.7000985190272331, - "num_tokens": 1170070.0, + "entropy": 1.2284281507134438, + "epoch": 0.07654037504783773, + "grad_norm": 0.10897475481033325, + "learning_rate": 0.00010153061224489797, + "loss": 1.2724005699157714, + "mean_token_accuracy": 0.7178638219833374, + "num_tokens": 813167.0, "step": 200 }, { - "entropy": 1.2682933300733565, - "epoch": 0.20236087689713322, - "grad_norm": 0.055171046406030655, - "learning_rate": 0.00019641649763353617, - "loss": 1.3352859497070313, - "mean_token_accuracy": 0.700250719487667, - "num_tokens": 1229436.0, + "entropy": 1.2159623876214027, + "epoch": 0.08036739380022963, + "grad_norm": 0.12170197069644928, + "learning_rate": 0.0001066326530612245, + "loss": 1.26397647857666, + "mean_token_accuracy": 0.7138236090540886, + "num_tokens": 856210.0, "step": 210 }, { - "entropy": 1.347843087464571, - "epoch": 0.21199710913033004, - "grad_norm": 0.06492508947849274, - "learning_rate": 0.00019574036511156188, - "loss": 1.3542288780212401, - "mean_token_accuracy": 0.6922637760639191, - "num_tokens": 1287532.0, + "entropy": 1.2309471271932124, + "epoch": 0.08419441255262151, + "grad_norm": 0.08406181633472443, + "learning_rate": 0.00011173469387755102, + "loss": 1.3110918998718262, + "mean_token_accuracy": 0.7173333883285522, + "num_tokens": 893432.0, "step": 220 }, { - "entropy": 1.218190498650074, - "epoch": 0.22163334136352686, - "grad_norm": 0.0640958771109581, - "learning_rate": 0.00019506423258958757, - "loss": 1.2281950950622558, - "mean_token_accuracy": 0.718992106616497, - "num_tokens": 1344357.0, + "entropy": 1.2228039711713792, + "epoch": 0.0880214313050134, + "grad_norm": 0.10588081181049347, + "learning_rate": 0.00011683673469387754, + "loss": 1.2445635795593262, + "mean_token_accuracy": 0.7160170584917068, + "num_tokens": 931919.0, "step": 230 }, { - "entropy": 1.3014725491404533, - "epoch": 0.23126957359672368, - "grad_norm": 0.05718906223773956, - "learning_rate": 0.00019438810006761325, - "loss": 1.3229006767272948, - "mean_token_accuracy": 0.6986020863056183, - "num_tokens": 1405209.0, + "entropy": 1.138296764343977, + "epoch": 0.09184845005740529, + "grad_norm": 0.110760398209095, + "learning_rate": 0.00012193877551020409, + "loss": 1.2083134651184082, + "mean_token_accuracy": 0.7306654810905456, + "num_tokens": 976039.0, "step": 240 }, { - "entropy": 1.227053464204073, - "epoch": 0.2409058058299205, - "grad_norm": 0.05763186141848564, - "learning_rate": 0.00019371196754563896, - "loss": 1.2807598114013672, - "mean_token_accuracy": 0.7116438299417496, - "num_tokens": 1457023.0, + "entropy": 1.1915819495916367, + "epoch": 0.09567546880979717, + "grad_norm": 0.15018120408058167, + "learning_rate": 0.00012704081632653063, + "loss": 1.2230369567871093, + "mean_token_accuracy": 0.719833716750145, + "num_tokens": 1019312.0, "step": 250 }, { - "entropy": 1.2856510818004607, - "epoch": 0.2505420380631173, - "grad_norm": 0.06267844140529633, - "learning_rate": 0.00019303583502366465, - "loss": 1.3038443565368651, - "mean_token_accuracy": 0.7054769903421402, - "num_tokens": 1513975.0, + "entropy": 1.2996815636754036, + "epoch": 0.09950248756218906, + "grad_norm": 0.10838313400745392, + "learning_rate": 0.00013214285714285715, + "loss": 1.3142367362976075, + "mean_token_accuracy": 0.7013067752122879, + "num_tokens": 1062901.0, "step": 260 }, { - "entropy": 1.228683941066265, - "epoch": 0.26017827029631413, - "grad_norm": 0.0609310083091259, - "learning_rate": 0.00019235970250169033, - "loss": 1.2614136695861817, - "mean_token_accuracy": 0.7095189347863198, - "num_tokens": 1579252.0, + "entropy": 1.149668525904417, + "epoch": 0.10332950631458095, + "grad_norm": 0.09911312907934189, + "learning_rate": 0.00013724489795918367, + "loss": 1.1573083877563477, + "mean_token_accuracy": 0.728775355219841, + "num_tokens": 1102630.0, "step": 270 }, { - "entropy": 1.2551469817757606, - "epoch": 0.26981450252951095, - "grad_norm": 0.06873613595962524, - "learning_rate": 0.00019168356997971604, - "loss": 1.2714256286621093, - "mean_token_accuracy": 0.712159389257431, - "num_tokens": 1637995.0, + "entropy": 1.271340447664261, + "epoch": 0.10715652506697283, + "grad_norm": 0.09665267169475555, + "learning_rate": 0.00014234693877551022, + "loss": 1.341374111175537, + "mean_token_accuracy": 0.7027333110570908, + "num_tokens": 1142262.0, "step": 280 }, { - "entropy": 1.2518196165561677, - "epoch": 0.27945073476270776, - "grad_norm": 0.07062622159719467, - "learning_rate": 0.00019100743745774173, - "loss": 1.302776050567627, - "mean_token_accuracy": 0.7063210532069206, - "num_tokens": 1703008.0, + "entropy": 1.225122657418251, + "epoch": 0.11098354381936472, + "grad_norm": 0.13240815699100494, + "learning_rate": 0.00014744897959183674, + "loss": 1.2614737510681153, + "mean_token_accuracy": 0.7198452442884445, + "num_tokens": 1182386.0, "step": 290 }, { - "entropy": 1.2446896508336067, - "epoch": 0.2890869669959046, - "grad_norm": 0.051750048995018005, - "learning_rate": 0.00019033130493576744, - "loss": 1.3015289306640625, - "mean_token_accuracy": 0.7073144510388374, - "num_tokens": 1766072.0, + "entropy": 1.2733432039618493, + "epoch": 0.1148105625717566, + "grad_norm": 0.10651895403862, + "learning_rate": 0.00015255102040816326, + "loss": 1.2933347702026368, + "mean_token_accuracy": 0.7025195896625519, + "num_tokens": 1222805.0, "step": 300 }, { - "entropy": 1.3015983402729034, - "epoch": 0.2987231992291014, - "grad_norm": 0.0565313994884491, - "learning_rate": 0.00018965517241379312, - "loss": 1.311717414855957, - "mean_token_accuracy": 0.7029094457626343, - "num_tokens": 1820923.0, + "entropy": 1.1543171763420106, + "epoch": 0.11863758132414849, + "grad_norm": 0.08577804267406464, + "learning_rate": 0.00015765306122448978, + "loss": 1.197078323364258, + "mean_token_accuracy": 0.7300360783934593, + "num_tokens": 1263121.0, "step": 310 }, { - "entropy": 1.1577203705906869, - "epoch": 0.3083594314622982, - "grad_norm": 0.05471700802445412, - "learning_rate": 0.0001889790398918188, - "loss": 1.2251465797424317, - "mean_token_accuracy": 0.7273583576083184, - "num_tokens": 1877116.0, + "entropy": 1.1908820882439612, + "epoch": 0.12246460007654038, + "grad_norm": 0.11925600469112396, + "learning_rate": 0.00016275510204081633, + "loss": 1.2366827964782714, + "mean_token_accuracy": 0.7277334719896317, + "num_tokens": 1296346.0, "step": 320 }, { - "entropy": 1.205883078277111, - "epoch": 0.31799566369549503, - "grad_norm": 0.05748973786830902, - "learning_rate": 0.00018830290736984452, - "loss": 1.244644546508789, - "mean_token_accuracy": 0.7220181196928024, - "num_tokens": 1935606.0, + "entropy": 1.1711702406406403, + "epoch": 0.12629161882893225, + "grad_norm": 0.12476309388875961, + "learning_rate": 0.00016785714285714288, + "loss": 1.2408350944519042, + "mean_token_accuracy": 0.7282937213778495, + "num_tokens": 1335547.0, "step": 330 }, { - "entropy": 1.258717157691717, - "epoch": 0.3276318959286919, - "grad_norm": 0.05800570175051689, - "learning_rate": 0.0001876267748478702, - "loss": 1.3071101188659668, - "mean_token_accuracy": 0.7034722596406937, - "num_tokens": 1997199.0, + "entropy": 1.1748667433857918, + "epoch": 0.13011863758132414, + "grad_norm": 0.08671289682388306, + "learning_rate": 0.0001729591836734694, + "loss": 1.199030303955078, + "mean_token_accuracy": 0.7312668621540069, + "num_tokens": 1377093.0, "step": 340 }, { - "entropy": 1.22360208183527, - "epoch": 0.3372681281618887, - "grad_norm": 0.09994573146104813, - "learning_rate": 0.00018695064232589589, - "loss": 1.2296308517456054, - "mean_token_accuracy": 0.7146901577711106, - "num_tokens": 2048172.0, + "entropy": 1.153764547407627, + "epoch": 0.13394565633371602, + "grad_norm": 0.10536976903676987, + "learning_rate": 0.00017806122448979592, + "loss": 1.201906967163086, + "mean_token_accuracy": 0.7266524419188499, + "num_tokens": 1417236.0, "step": 350 }, { - "entropy": 1.3035957127809525, - "epoch": 0.34690436039508554, - "grad_norm": 0.07320819795131683, - "learning_rate": 0.00018627450980392157, - "loss": 1.2939067840576173, - "mean_token_accuracy": 0.696925450861454, - "num_tokens": 2109948.0, + "entropy": 1.2303058430552483, + "epoch": 0.1377726750861079, + "grad_norm": 0.09069176018238068, + "learning_rate": 0.00018316326530612247, + "loss": 1.2867681503295898, + "mean_token_accuracy": 0.7198954582214355, + "num_tokens": 1460306.0, "step": 360 }, { - "entropy": 1.212366634607315, - "epoch": 0.35654059262828236, - "grad_norm": 0.06324014812707901, - "learning_rate": 0.00018559837728194725, - "loss": 1.2521446228027344, - "mean_token_accuracy": 0.717373288422823, - "num_tokens": 2171174.0, + "entropy": 1.1944106668233871, + "epoch": 0.1415996938384998, + "grad_norm": 0.08539925515651703, + "learning_rate": 0.000188265306122449, + "loss": 1.2451179504394532, + "mean_token_accuracy": 0.7186401098966598, + "num_tokens": 1505966.0, "step": 370 }, { - "entropy": 1.240752936899662, - "epoch": 0.3661768248614792, - "grad_norm": 0.07091817259788513, - "learning_rate": 0.00018492224475997297, - "loss": 1.2615043640136718, - "mean_token_accuracy": 0.7065044552087784, - "num_tokens": 2228430.0, + "entropy": 1.2239032357931137, + "epoch": 0.14542671259089168, + "grad_norm": 0.08434446156024933, + "learning_rate": 0.0001933673469387755, + "loss": 1.2803629875183105, + "mean_token_accuracy": 0.7178368359804154, + "num_tokens": 1544132.0, "step": 380 }, { - "entropy": 1.2120654836297036, - "epoch": 0.375813057094676, - "grad_norm": 0.05131813511252403, - "learning_rate": 0.00018424611223799865, - "loss": 1.2185423851013184, - "mean_token_accuracy": 0.7187462538480759, - "num_tokens": 2285248.0, + "entropy": 1.1901779979467393, + "epoch": 0.14925373134328357, + "grad_norm": 0.08662886172533035, + "learning_rate": 0.00019846938775510203, + "loss": 1.2282370567321776, + "mean_token_accuracy": 0.7221132159233093, + "num_tokens": 1587355.0, "step": 390 }, { - "entropy": 1.2625371024012566, - "epoch": 0.3854492893278728, - "grad_norm": 0.056896600872278214, - "learning_rate": 0.00018356997971602433, - "loss": 1.3041479110717773, - "mean_token_accuracy": 0.7014564648270607, - "num_tokens": 2345584.0, + "entropy": 1.106569343805313, + "epoch": 0.15308075009567546, + "grad_norm": 0.13149450719356537, + "learning_rate": 0.00019981200483416142, + "loss": 1.136556625366211, + "mean_token_accuracy": 0.7384186327457428, + "num_tokens": 1624638.0, "step": 400 }, { - "entropy": 1.229997194558382, - "epoch": 0.39508552156106963, - "grad_norm": 0.06733101606369019, - "learning_rate": 0.00018289384719405005, - "loss": 1.2783479690551758, - "mean_token_accuracy": 0.7130212724208832, - "num_tokens": 2402073.0, + "entropy": 1.0393452920019626, + "epoch": 0.15690776884806737, + "grad_norm": 0.13831719756126404, + "learning_rate": 0.00019954344031153484, + "loss": 1.074817180633545, + "mean_token_accuracy": 0.7567356958985328, + "num_tokens": 1665215.0, "step": 410 }, { - "entropy": 1.253336711972952, - "epoch": 0.40472175379426645, - "grad_norm": 0.1017632707953453, - "learning_rate": 0.00018221771467207573, - "loss": 1.2827540397644044, - "mean_token_accuracy": 0.703055490553379, - "num_tokens": 2461269.0, + "entropy": 1.1298492863774299, + "epoch": 0.16073478760045926, + "grad_norm": 0.10244159400463104, + "learning_rate": 0.0001992748757889083, + "loss": 1.1741769790649415, + "mean_token_accuracy": 0.7414689466357232, + "num_tokens": 1701543.0, "step": 420 }, { - "entropy": 1.1999207064509392, - "epoch": 0.41435798602746327, - "grad_norm": 0.0675441101193428, - "learning_rate": 0.00018154158215010142, - "loss": 1.1980154037475585, - "mean_token_accuracy": 0.7192850261926651, - "num_tokens": 2520272.0, + "entropy": 1.1646860882639885, + "epoch": 0.16456180635285114, + "grad_norm": 0.09356453269720078, + "learning_rate": 0.00019900631126628174, + "loss": 1.2229989051818848, + "mean_token_accuracy": 0.7278152450919151, + "num_tokens": 1744719.0, "step": 430 }, { - "entropy": 1.1966832533478737, - "epoch": 0.4239942182606601, - "grad_norm": 0.04782295227050781, - "learning_rate": 0.00018086544962812713, - "loss": 1.196799373626709, - "mean_token_accuracy": 0.7202263355255127, - "num_tokens": 2574916.0, + "entropy": 1.1580850452184677, + "epoch": 0.16838882510524303, + "grad_norm": 0.08699047565460205, + "learning_rate": 0.00019873774674365518, + "loss": 1.1999470710754394, + "mean_token_accuracy": 0.7270642057061195, + "num_tokens": 1787999.0, "step": 440 }, { - "entropy": 1.28217963129282, - "epoch": 0.4336304504938569, - "grad_norm": 0.055622052401304245, - "learning_rate": 0.0001801893171061528, - "loss": 1.2780956268310546, - "mean_token_accuracy": 0.6991938084363938, - "num_tokens": 2630803.0, + "entropy": 1.105088683962822, + "epoch": 0.17221584385763491, + "grad_norm": 0.10489863902330399, + "learning_rate": 0.0001984691822210286, + "loss": 1.123628807067871, + "mean_token_accuracy": 0.7375599846243859, + "num_tokens": 1825171.0, "step": 450 }, { - "entropy": 1.1060978904366494, - "epoch": 0.4432666827270537, - "grad_norm": 0.08565858751535416, - "learning_rate": 0.00017951318458417852, - "loss": 1.1542920112609862, - "mean_token_accuracy": 0.7370428621768952, - "num_tokens": 2686122.0, + "entropy": 1.0744315460324287, + "epoch": 0.1760428626100268, + "grad_norm": 0.10170256346464157, + "learning_rate": 0.00019820061769840205, + "loss": 1.1449885368347168, + "mean_token_accuracy": 0.7466330513358116, + "num_tokens": 1863245.0, "step": 460 }, { - "entropy": 1.3041150823235512, - "epoch": 0.45290291496025054, - "grad_norm": 0.05283431336283684, - "learning_rate": 0.0001788370520622042, - "loss": 1.2778042793273925, - "mean_token_accuracy": 0.700305138528347, - "num_tokens": 2751119.0, + "entropy": 1.021899376064539, + "epoch": 0.1798698813624187, + "grad_norm": 0.09046658873558044, + "learning_rate": 0.0001979320531757755, + "loss": 1.0087275505065918, + "mean_token_accuracy": 0.7654190301895142, + "num_tokens": 1902205.0, "step": 470 }, { - "entropy": 1.2362390145659448, - "epoch": 0.46253914719344735, - "grad_norm": 0.05927009880542755, - "learning_rate": 0.0001781609195402299, - "loss": 1.2784390449523926, - "mean_token_accuracy": 0.700747960805893, - "num_tokens": 2809393.0, + "entropy": 1.1595304682850838, + "epoch": 0.18369690011481057, + "grad_norm": 0.09361740201711655, + "learning_rate": 0.00019766348865314892, + "loss": 1.2238757133483886, + "mean_token_accuracy": 0.7275362908840179, + "num_tokens": 1943827.0, "step": 480 }, { - "entropy": 1.2329043842852117, - "epoch": 0.47217537942664417, - "grad_norm": 0.05709557607769966, - "learning_rate": 0.0001774847870182556, - "loss": 1.2346397399902345, - "mean_token_accuracy": 0.7168422609567642, - "num_tokens": 2870246.0, + "entropy": 1.1023890599608421, + "epoch": 0.18752391886720246, + "grad_norm": 0.08471602201461792, + "learning_rate": 0.00019739492413052236, + "loss": 1.1567111015319824, + "mean_token_accuracy": 0.742167092859745, + "num_tokens": 1982630.0, "step": 490 }, { - "entropy": 1.197921334207058, - "epoch": 0.481811611659841, - "grad_norm": 0.051337700337171555, - "learning_rate": 0.0001768086544962813, - "loss": 1.2222982406616212, - "mean_token_accuracy": 0.7162339583039283, - "num_tokens": 2934671.0, + "entropy": 1.1427962884306908, + "epoch": 0.19135093761959435, + "grad_norm": 0.1170063391327858, + "learning_rate": 0.0001971263596078958, + "loss": 1.208080005645752, + "mean_token_accuracy": 0.7308674260973931, + "num_tokens": 2021311.0, "step": 500 }, { - "entropy": 1.2013787925243378, - "epoch": 0.4914478438930378, - "grad_norm": 0.06907735019922256, - "learning_rate": 0.00017613252197430697, - "loss": 1.2295709609985352, - "mean_token_accuracy": 0.7151955872774124, - "num_tokens": 2989014.0, + "entropy": 1.0002792343497275, + "epoch": 0.19517795637198623, + "grad_norm": 0.10567828267812729, + "learning_rate": 0.00019685779508526926, + "loss": 1.026076889038086, + "mean_token_accuracy": 0.7641370877623558, + "num_tokens": 2055396.0, "step": 510 }, { - "entropy": 1.1978842347860337, - "epoch": 0.5010840761262346, - "grad_norm": 0.06152976304292679, - "learning_rate": 0.00017545638945233268, - "loss": 1.2030101776123048, - "mean_token_accuracy": 0.7203953012824058, - "num_tokens": 3045999.0, + "entropy": 1.1096693962812423, + "epoch": 0.19900497512437812, + "grad_norm": 0.08597096055746078, + "learning_rate": 0.00019658923056264268, + "loss": 1.1631651878356934, + "mean_token_accuracy": 0.7389342650771141, + "num_tokens": 2097931.0, "step": 520 }, { - "entropy": 1.3108120203018188, - "epoch": 0.5107203083594315, - "grad_norm": 0.05314662307500839, - "learning_rate": 0.00017478025693035837, - "loss": 1.3376919746398925, - "mean_token_accuracy": 0.6901254534721375, - "num_tokens": 3106021.0, + "entropy": 1.04201333373785, + "epoch": 0.20283199387677, + "grad_norm": 0.1260094940662384, + "learning_rate": 0.00019632066604001613, + "loss": 1.0765873908996582, + "mean_token_accuracy": 0.7539548426866531, + "num_tokens": 2137657.0, "step": 530 }, { - "entropy": 1.269498337060213, - "epoch": 0.5203565405926283, - "grad_norm": 0.052769869565963745, - "learning_rate": 0.00017410412440838405, - "loss": 1.3347968101501464, - "mean_token_accuracy": 0.7055287733674049, - "num_tokens": 3167001.0, + "entropy": 1.0748623803257942, + "epoch": 0.2066590126291619, + "grad_norm": 0.0845552608370781, + "learning_rate": 0.00019605210151738955, + "loss": 1.1419748306274413, + "mean_token_accuracy": 0.7481048628687859, + "num_tokens": 2177343.0, "step": 540 }, { - "entropy": 1.174779912084341, - "epoch": 0.5299927728258251, - "grad_norm": 0.057087887078523636, - "learning_rate": 0.00017342799188640974, - "loss": 1.204134464263916, - "mean_token_accuracy": 0.7242644309997559, - "num_tokens": 3222794.0, + "entropy": 1.0970528617501258, + "epoch": 0.21048603138155378, + "grad_norm": 0.07105763256549835, + "learning_rate": 0.000195783536994763, + "loss": 1.1284755706787108, + "mean_token_accuracy": 0.747196614742279, + "num_tokens": 2211423.0, "step": 550 }, { - "entropy": 1.1798104658722877, - "epoch": 0.5396290050590219, - "grad_norm": 0.06867323815822601, - "learning_rate": 0.00017275185936443542, - "loss": 1.218485164642334, - "mean_token_accuracy": 0.7220648050308227, - "num_tokens": 3282856.0, + "entropy": 1.0551751986145974, + "epoch": 0.21431305013394567, + "grad_norm": 0.12569685280323029, + "learning_rate": 0.00019551497247213644, + "loss": 1.1170102119445802, + "mean_token_accuracy": 0.7472440049052238, + "num_tokens": 2249350.0, "step": 560 }, { - "entropy": 1.2600811369717122, - "epoch": 0.5492652372922188, - "grad_norm": 0.06002269685268402, - "learning_rate": 0.00017207572684246113, - "loss": 1.2951341629028321, - "mean_token_accuracy": 0.7052147269248963, - "num_tokens": 3343572.0, + "entropy": 1.0562219873070717, + "epoch": 0.21814006888633755, + "grad_norm": 0.08452208340167999, + "learning_rate": 0.0001952464079495099, + "loss": 1.0921730995178223, + "mean_token_accuracy": 0.7492805704474449, + "num_tokens": 2289564.0, "step": 570 }, { - "entropy": 1.2493580430746078, - "epoch": 0.5589014695254155, - "grad_norm": 0.06532762199640274, - "learning_rate": 0.00017139959432048682, - "loss": 1.2813149452209474, - "mean_token_accuracy": 0.7067271783947945, - "num_tokens": 3402401.0, + "entropy": 1.022915106266737, + "epoch": 0.22196708763872944, + "grad_norm": 0.08168510347604752, + "learning_rate": 0.00019497784342688333, + "loss": 1.064980697631836, + "mean_token_accuracy": 0.7562290355563164, + "num_tokens": 2332592.0, "step": 580 }, { - "entropy": 1.174679161608219, - "epoch": 0.5685377017586124, - "grad_norm": 0.0596516914665699, - "learning_rate": 0.0001707234617985125, - "loss": 1.2225926399230957, - "mean_token_accuracy": 0.7233842894434929, - "num_tokens": 3459042.0, + "entropy": 1.1041950330138206, + "epoch": 0.22579410639112132, + "grad_norm": 0.07596516609191895, + "learning_rate": 0.00019470927890425675, + "loss": 1.1410536766052246, + "mean_token_accuracy": 0.7350378915667534, + "num_tokens": 2380215.0, "step": 590 }, { - "entropy": 1.2576898023486138, - "epoch": 0.5781739339918092, - "grad_norm": 0.05735331028699875, - "learning_rate": 0.0001700473292765382, - "loss": 1.2835253715515136, - "mean_token_accuracy": 0.7045948460698128, - "num_tokens": 3527637.0, + "entropy": 1.1252056039869784, + "epoch": 0.2296211251435132, + "grad_norm": 0.07240597158670425, + "learning_rate": 0.0001944407143816302, + "loss": 1.1348044395446777, + "mean_token_accuracy": 0.7389790266752243, + "num_tokens": 2417819.0, "step": 600 }, { - "entropy": 1.2473611667752267, - "epoch": 0.587810166225006, - "grad_norm": 0.06076724827289581, - "learning_rate": 0.0001693711967545639, - "loss": 1.2636917114257813, - "mean_token_accuracy": 0.7045381426811218, - "num_tokens": 3589404.0, + "entropy": 1.0117394506931305, + "epoch": 0.2334481438959051, + "grad_norm": 0.08603253215551376, + "learning_rate": 0.00019417214985900362, + "loss": 1.0533303260803222, + "mean_token_accuracy": 0.7578112691640854, + "num_tokens": 2459072.0, "step": 610 }, { - "entropy": 1.2829708829522133, - "epoch": 0.5974463984582028, - "grad_norm": 0.05943462252616882, - "learning_rate": 0.0001686950642325896, - "loss": 1.2889452934265138, - "mean_token_accuracy": 0.7001010566949845, - "num_tokens": 3651124.0, + "entropy": 1.0193642653524875, + "epoch": 0.23727516264829698, + "grad_norm": 0.08400722593069077, + "learning_rate": 0.00019390358533637707, + "loss": 1.0955985069274903, + "mean_token_accuracy": 0.7598358646035195, + "num_tokens": 2497901.0, "step": 620 }, { - "entropy": 1.2208770334720611, - "epoch": 0.6070826306913997, - "grad_norm": 0.04695171117782593, - "learning_rate": 0.0001680189317106153, - "loss": 1.259203052520752, - "mean_token_accuracy": 0.7110297352075576, - "num_tokens": 3705293.0, + "entropy": 1.0488237984478475, + "epoch": 0.24110218140068887, + "grad_norm": 0.07221511751413345, + "learning_rate": 0.00019363502081375052, + "loss": 1.151495361328125, + "mean_token_accuracy": 0.7541669681668282, + "num_tokens": 2536530.0, "step": 630 }, { - "entropy": 1.301590697467327, - "epoch": 0.6167188629245964, - "grad_norm": 0.047615595161914825, - "learning_rate": 0.00016734279918864098, - "loss": 1.322781467437744, - "mean_token_accuracy": 0.6952215626835823, - "num_tokens": 3765429.0, + "entropy": 1.0595403373241425, + "epoch": 0.24492920015308076, + "grad_norm": 0.10258961468935013, + "learning_rate": 0.00019336645629112396, + "loss": 1.0888574600219727, + "mean_token_accuracy": 0.7474928990006446, + "num_tokens": 2571599.0, "step": 640 }, { - "entropy": 1.2000589437782765, - "epoch": 0.6263550951577933, - "grad_norm": 0.057772569358348846, - "learning_rate": 0.0001666666666666667, - "loss": 1.2448016166687013, - "mean_token_accuracy": 0.7192464172840118, - "num_tokens": 3826736.0, + "entropy": 1.0924376487731933, + "epoch": 0.24875621890547264, + "grad_norm": 0.0751282125711441, + "learning_rate": 0.0001930978917684974, + "loss": 1.1468082427978517, + "mean_token_accuracy": 0.7465573191642761, + "num_tokens": 2612401.0, "step": 650 }, { - "entropy": 1.187142127752304, - "epoch": 0.6359913273909901, - "grad_norm": 0.06310118734836578, - "learning_rate": 0.00016599053414469237, - "loss": 1.1989784240722656, - "mean_token_accuracy": 0.7233487412333488, - "num_tokens": 3886196.0, + "entropy": 0.9765479557216168, + "epoch": 0.2525832376578645, + "grad_norm": 0.09039046615362167, + "learning_rate": 0.00019282932724587083, + "loss": 1.054959201812744, + "mean_token_accuracy": 0.7649626806378365, + "num_tokens": 2652042.0, "step": 660 }, { - "entropy": 1.2190307170152663, - "epoch": 0.6456275596241869, - "grad_norm": 0.049476709216833115, - "learning_rate": 0.00016531440162271806, - "loss": 1.2834860801696777, - "mean_token_accuracy": 0.713593752682209, - "num_tokens": 3941611.0, + "entropy": 1.0833388939499855, + "epoch": 0.2564102564102564, + "grad_norm": 0.08905521035194397, + "learning_rate": 0.00019256076272324425, + "loss": 1.0937233924865724, + "mean_token_accuracy": 0.7510278865694999, + "num_tokens": 2692211.0, "step": 670 }, { - "entropy": 1.1955443389713765, - "epoch": 0.6552637918573838, - "grad_norm": 0.06877182424068451, - "learning_rate": 0.00016463826910074377, - "loss": 1.2530131340026855, - "mean_token_accuracy": 0.7211957752704621, - "num_tokens": 3998745.0, + "entropy": 1.0478161230683327, + "epoch": 0.2602372751626483, + "grad_norm": 0.09634676575660706, + "learning_rate": 0.0001922921982006177, + "loss": 1.1077991485595704, + "mean_token_accuracy": 0.7490576148033142, + "num_tokens": 2734562.0, "step": 680 }, { - "entropy": 1.255328443646431, - "epoch": 0.6649000240905806, - "grad_norm": 0.06499036401510239, - "learning_rate": 0.00016396213657876945, - "loss": 1.2712255477905274, - "mean_token_accuracy": 0.7075859263539315, - "num_tokens": 4057693.0, + "entropy": 1.13061283826828, + "epoch": 0.26406429391504016, + "grad_norm": 0.07757367938756943, + "learning_rate": 0.00019202363367799114, + "loss": 1.1330499649047852, + "mean_token_accuracy": 0.7350772902369499, + "num_tokens": 2778990.0, "step": 690 }, { - "entropy": 1.269506113231182, - "epoch": 0.6745362563237775, - "grad_norm": 0.07255974411964417, - "learning_rate": 0.00016328600405679514, - "loss": 1.294202709197998, - "mean_token_accuracy": 0.7018555745482444, - "num_tokens": 4113943.0, + "entropy": 1.0450415380299092, + "epoch": 0.26789131266743205, + "grad_norm": 0.06570328027009964, + "learning_rate": 0.0001917550691553646, + "loss": 1.1425201416015625, + "mean_token_accuracy": 0.7551864832639694, + "num_tokens": 2814475.0, "step": 700 }, { - "entropy": 1.24193774163723, - "epoch": 0.6841724885569742, - "grad_norm": 0.07056763768196106, - "learning_rate": 0.00016260987153482082, - "loss": 1.2850048065185546, - "mean_token_accuracy": 0.714261619746685, - "num_tokens": 4171377.0, + "entropy": 1.0114438571035862, + "epoch": 0.27171833141982393, + "grad_norm": 0.11020322889089584, + "learning_rate": 0.00019148650463273804, + "loss": 1.0655290603637695, + "mean_token_accuracy": 0.7622251763939858, + "num_tokens": 2847751.0, "step": 710 }, { - "entropy": 1.2012858629226684, - "epoch": 0.6938087207901711, - "grad_norm": 0.06263507157564163, - "learning_rate": 0.0001619337390128465, - "loss": 1.2244320869445802, - "mean_token_accuracy": 0.7127483233809471, - "num_tokens": 4224788.0, + "entropy": 1.1008106037974357, + "epoch": 0.2755453501722158, + "grad_norm": 0.07282241433858871, + "learning_rate": 0.00019121794011011146, + "loss": 1.139061450958252, + "mean_token_accuracy": 0.7368278667330742, + "num_tokens": 2889553.0, "step": 720 }, { - "entropy": 1.118230439722538, - "epoch": 0.7034449530233678, - "grad_norm": 0.0841294601559639, - "learning_rate": 0.00016125760649087222, - "loss": 1.1727646827697753, - "mean_token_accuracy": 0.7389020159840584, - "num_tokens": 4273260.0, + "entropy": 1.0155851803719997, + "epoch": 0.2793723689246077, + "grad_norm": 0.11799076199531555, + "learning_rate": 0.0001909493755874849, + "loss": 1.0634971618652345, + "mean_token_accuracy": 0.7578275159001351, + "num_tokens": 2926860.0, "step": 730 }, { - "entropy": 1.1662761889398099, - "epoch": 0.7130811852565647, - "grad_norm": 0.05674638971686363, - "learning_rate": 0.0001605814739688979, - "loss": 1.2040279388427735, - "mean_token_accuracy": 0.7247484371066093, - "num_tokens": 4330262.0, + "entropy": 1.0430648550391197, + "epoch": 0.2831993876769996, + "grad_norm": 0.08702066540718079, + "learning_rate": 0.00019068081106485832, + "loss": 1.0851760864257813, + "mean_token_accuracy": 0.7573095709085464, + "num_tokens": 2964029.0, "step": 740 }, { - "entropy": 1.1847384825348855, - "epoch": 0.7227174174897615, - "grad_norm": 0.0654911920428276, - "learning_rate": 0.00015990534144692359, - "loss": 1.245723056793213, - "mean_token_accuracy": 0.7201158210635186, - "num_tokens": 4387940.0, + "entropy": 1.0908455178141594, + "epoch": 0.2870264064293915, + "grad_norm": 0.06593967229127884, + "learning_rate": 0.00019041224654223177, + "loss": 1.0929256439208985, + "mean_token_accuracy": 0.7440102145075798, + "num_tokens": 3004528.0, "step": 750 }, { - "entropy": 1.2574817538261414, - "epoch": 0.7323536497229584, - "grad_norm": 0.05644860863685608, - "learning_rate": 0.0001592292089249493, - "loss": 1.2513961791992188, - "mean_token_accuracy": 0.7070513799786567, - "num_tokens": 4451167.0, + "entropy": 0.971546346694231, + "epoch": 0.29085342518178336, + "grad_norm": 0.08857332915067673, + "learning_rate": 0.00019014368201960522, + "loss": 1.0575197219848633, + "mean_token_accuracy": 0.7712547823786735, + "num_tokens": 3041203.0, "step": 760 }, { - "entropy": 1.1798933163285255, - "epoch": 0.7419898819561551, - "grad_norm": 0.04677167534828186, - "learning_rate": 0.00015855307640297498, - "loss": 1.2556601524353028, - "mean_token_accuracy": 0.7209865570068359, - "num_tokens": 4514662.0, + "entropy": 1.0496620319783687, + "epoch": 0.29468044393417525, + "grad_norm": 0.07172030210494995, + "learning_rate": 0.00018987511749697867, + "loss": 1.100113582611084, + "mean_token_accuracy": 0.747462597489357, + "num_tokens": 3086263.0, "step": 770 }, { - "entropy": 1.2833519145846366, - "epoch": 0.751626114189352, - "grad_norm": 0.05460638552904129, - "learning_rate": 0.0001578769438810007, - "loss": 1.3391889572143554, - "mean_token_accuracy": 0.699195285141468, - "num_tokens": 4577464.0, + "entropy": 1.0853942684829234, + "epoch": 0.29850746268656714, + "grad_norm": 0.0861373096704483, + "learning_rate": 0.0001896065529743521, + "loss": 1.1116449356079101, + "mean_token_accuracy": 0.7485424548387527, + "num_tokens": 3126741.0, "step": 780 }, { - "entropy": 1.301037323474884, - "epoch": 0.7612623464225488, - "grad_norm": 0.08067753911018372, - "learning_rate": 0.00015720081135902638, - "loss": 1.3280585289001465, - "mean_token_accuracy": 0.6965255320072175, - "num_tokens": 4640394.0, + "entropy": 1.039051755145192, + "epoch": 0.302334481438959, + "grad_norm": 0.07344193756580353, + "learning_rate": 0.00018933798845172553, + "loss": 1.092859935760498, + "mean_token_accuracy": 0.7524166733026505, + "num_tokens": 3164039.0, "step": 790 }, { - "entropy": 1.1654298216104508, - "epoch": 0.7708985786557456, - "grad_norm": 0.06938653439283371, - "learning_rate": 0.00015652467883705206, - "loss": 1.2016778945922852, - "mean_token_accuracy": 0.7259130507707596, - "num_tokens": 4694747.0, + "entropy": 1.021885236352682, + "epoch": 0.3061615001913509, + "grad_norm": 0.09843221306800842, + "learning_rate": 0.00018906942392909895, + "loss": 1.0813950538635253, + "mean_token_accuracy": 0.7612547591328621, + "num_tokens": 3202455.0, "step": 800 }, { - "entropy": 1.2972704201936722, - "epoch": 0.7805348108889424, - "grad_norm": 0.05404666066169739, - "learning_rate": 0.00015584854631507777, - "loss": 1.2715141296386718, - "mean_token_accuracy": 0.7007692798972129, - "num_tokens": 4754920.0, + "entropy": 1.0329275727272034, + "epoch": 0.3099885189437428, + "grad_norm": 0.07059452682733536, + "learning_rate": 0.0001888008594064724, + "loss": 1.0516587257385255, + "mean_token_accuracy": 0.7528441205620766, + "num_tokens": 3239911.0, "step": 810 }, { - "entropy": 1.191229759156704, - "epoch": 0.7901710431221393, - "grad_norm": 0.05804457888007164, - "learning_rate": 0.00015517241379310346, - "loss": 1.2225379943847656, - "mean_token_accuracy": 0.7155083760619163, - "num_tokens": 4813911.0, + "entropy": 1.0202949695289134, + "epoch": 0.31381553769613474, + "grad_norm": 0.07269048690795898, + "learning_rate": 0.00018853229488384585, + "loss": 1.0879244804382324, + "mean_token_accuracy": 0.7542849883437157, + "num_tokens": 3278682.0, "step": 820 }, { - "entropy": 1.1537679880857468, - "epoch": 0.799807275355336, - "grad_norm": 0.06406034529209137, - "learning_rate": 0.00015449628127112914, - "loss": 1.2089889526367188, - "mean_token_accuracy": 0.7305796332657337, - "num_tokens": 4867754.0, + "entropy": 1.0690899170935153, + "epoch": 0.3176425564485266, + "grad_norm": 0.14370054006576538, + "learning_rate": 0.0001882637303612193, + "loss": 1.1063778877258301, + "mean_token_accuracy": 0.75286915153265, + "num_tokens": 3325465.0, "step": 830 }, { - "entropy": 1.3313012823462487, - "epoch": 0.8094435075885329, - "grad_norm": 0.05331671983003616, - "learning_rate": 0.00015382014874915485, - "loss": 1.3202786445617676, - "mean_token_accuracy": 0.6896555438637734, - "num_tokens": 4932220.0, + "entropy": 1.0819261983036994, + "epoch": 0.3214695752009185, + "grad_norm": 0.0973975881934166, + "learning_rate": 0.00018799516583859274, + "loss": 1.0978353500366211, + "mean_token_accuracy": 0.749411192536354, + "num_tokens": 3363479.0, "step": 840 }, { - "entropy": 1.1428007125854491, - "epoch": 0.8190797398217297, - "grad_norm": 0.0543409064412117, - "learning_rate": 0.00015314401622718054, - "loss": 1.1924286842346192, - "mean_token_accuracy": 0.7276274234056472, - "num_tokens": 4991379.0, + "entropy": 1.0502549454569816, + "epoch": 0.3252965939533104, + "grad_norm": 0.11021706461906433, + "learning_rate": 0.0001877266013159662, + "loss": 1.1489330291748048, + "mean_token_accuracy": 0.7476440489292144, + "num_tokens": 3405863.0, "step": 850 }, { - "entropy": 1.1923618368804454, - "epoch": 0.8287159720549265, - "grad_norm": 0.04643765091896057, - "learning_rate": 0.00015246788370520625, - "loss": 1.2740966796875, - "mean_token_accuracy": 0.7193146347999573, - "num_tokens": 5050844.0, + "entropy": 1.1556663788855075, + "epoch": 0.3291236127057023, + "grad_norm": 0.06459799408912659, + "learning_rate": 0.0001874580367933396, + "loss": 1.1840539932250977, + "mean_token_accuracy": 0.7288095027208328, + "num_tokens": 3450829.0, "step": 860 }, { - "entropy": 1.2764008730649947, - "epoch": 0.8383522042881233, - "grad_norm": 0.06100849807262421, - "learning_rate": 0.00015179175118323193, - "loss": 1.2969582557678223, - "mean_token_accuracy": 0.7002251073718071, - "num_tokens": 5111937.0, + "entropy": 1.097336183488369, + "epoch": 0.33295063145809417, + "grad_norm": 0.06765513867139816, + "learning_rate": 0.00018718947227071303, + "loss": 1.1286226272583009, + "mean_token_accuracy": 0.7439226225018502, + "num_tokens": 3490640.0, "step": 870 }, { - "entropy": 1.1852023541927337, - "epoch": 0.8479884365213202, - "grad_norm": 0.08129674941301346, - "learning_rate": 0.00015111561866125762, - "loss": 1.215758991241455, - "mean_token_accuracy": 0.7221846386790276, - "num_tokens": 5173122.0, + "entropy": 1.0772622771561147, + "epoch": 0.33677765021048606, + "grad_norm": 0.08126482367515564, + "learning_rate": 0.00018692090774808648, + "loss": 1.1434885025024415, + "mean_token_accuracy": 0.7434220835566521, + "num_tokens": 3529438.0, "step": 880 }, { - "entropy": 1.2100131824612617, - "epoch": 0.857624668754517, - "grad_norm": 0.05021384358406067, - "learning_rate": 0.0001504394861392833, - "loss": 1.245238971710205, - "mean_token_accuracy": 0.7174671500921249, - "num_tokens": 5237440.0, + "entropy": 1.0091869838535785, + "epoch": 0.34060466896287794, + "grad_norm": 0.0654602199792862, + "learning_rate": 0.00018665234322545992, + "loss": 1.0767542839050293, + "mean_token_accuracy": 0.7644046351313591, + "num_tokens": 3565217.0, "step": 890 }, { - "entropy": 1.2959269508719444, - "epoch": 0.8672609009877138, - "grad_norm": 0.06317298859357834, - "learning_rate": 0.000149763353617309, - "loss": 1.298589324951172, - "mean_token_accuracy": 0.696705661714077, - "num_tokens": 5306311.0, + "entropy": 1.0432863399386405, + "epoch": 0.34443168771526983, + "grad_norm": 0.10025763511657715, + "learning_rate": 0.00018638377870283337, + "loss": 1.0826923370361328, + "mean_token_accuracy": 0.7591656729578972, + "num_tokens": 3603940.0, "step": 900 }, { - "entropy": 1.178437228500843, - "epoch": 0.8768971332209107, - "grad_norm": 0.0590016208589077, - "learning_rate": 0.00014908722109533467, - "loss": 1.2140485763549804, - "mean_token_accuracy": 0.7231360018253327, - "num_tokens": 5365204.0, + "entropy": 1.007722695171833, + "epoch": 0.3482587064676617, + "grad_norm": 0.06779270619153976, + "learning_rate": 0.00018611521418020682, + "loss": 1.0158637046813965, + "mean_token_accuracy": 0.763145099580288, + "num_tokens": 3644547.0, "step": 910 }, { - "entropy": 1.1817002773284913, - "epoch": 0.8865333654541074, - "grad_norm": 0.06499218195676804, - "learning_rate": 0.00014841108857336038, - "loss": 1.2365409851074218, - "mean_token_accuracy": 0.7207681879401207, - "num_tokens": 5425136.0, + "entropy": 1.0556264080107212, + "epoch": 0.3520857252200536, + "grad_norm": 0.07834554463624954, + "learning_rate": 0.00018584664965758026, + "loss": 1.091851806640625, + "mean_token_accuracy": 0.7483858004212379, + "num_tokens": 3691979.0, "step": 920 }, { - "entropy": 1.1871344536542892, - "epoch": 0.8961695976873043, - "grad_norm": 0.05251992866396904, - "learning_rate": 0.00014773495605138607, - "loss": 1.216776180267334, - "mean_token_accuracy": 0.7216181293129921, - "num_tokens": 5485557.0, + "entropy": 1.0730156242847442, + "epoch": 0.3559127439724455, + "grad_norm": 0.10772417485713959, + "learning_rate": 0.00018557808513495368, + "loss": 1.1370153427124023, + "mean_token_accuracy": 0.7466916054487228, + "num_tokens": 3728767.0, "step": 930 }, { - "entropy": 1.2011874541640282, - "epoch": 0.9058058299205011, - "grad_norm": 0.05413221940398216, - "learning_rate": 0.00014705882352941178, - "loss": 1.2213294982910157, - "mean_token_accuracy": 0.7150228247046471, - "num_tokens": 5545105.0, + "entropy": 1.081335111707449, + "epoch": 0.3597397627248374, + "grad_norm": 0.07669705897569656, + "learning_rate": 0.0001853095206123271, + "loss": 1.141366958618164, + "mean_token_accuracy": 0.744081811606884, + "num_tokens": 3772234.0, "step": 940 }, { - "entropy": 1.2181889459490776, - "epoch": 0.9154420621536979, - "grad_norm": 0.056055545806884766, - "learning_rate": 0.00014638269100743746, - "loss": 1.25078125, - "mean_token_accuracy": 0.7119832545518875, - "num_tokens": 5605134.0, + "entropy": 0.9984517656266689, + "epoch": 0.36356678147722926, + "grad_norm": 0.0695272758603096, + "learning_rate": 0.00018504095608970055, + "loss": 1.0501303672790527, + "mean_token_accuracy": 0.7590280339121819, + "num_tokens": 3816970.0, "step": 950 }, { - "entropy": 1.1678502529859542, - "epoch": 0.9250782943868947, - "grad_norm": 0.05342623591423035, - "learning_rate": 0.00014570655848546315, - "loss": 1.2045835494995116, - "mean_token_accuracy": 0.7228824377059937, - "num_tokens": 5663877.0, + "entropy": 0.910194194689393, + "epoch": 0.36739380022962115, + "grad_norm": 0.06411932408809662, + "learning_rate": 0.000184772391567074, + "loss": 0.9656248092651367, + "mean_token_accuracy": 0.7823562085628509, + "num_tokens": 3853816.0, "step": 960 }, { - "entropy": 1.1557280227541924, - "epoch": 0.9347145266200916, - "grad_norm": 0.056248344480991364, - "learning_rate": 0.00014503042596348886, - "loss": 1.175156021118164, - "mean_token_accuracy": 0.7239007547497749, - "num_tokens": 5726645.0, + "entropy": 0.9763211451470852, + "epoch": 0.37122081898201303, + "grad_norm": 0.08389662951231003, + "learning_rate": 0.00018450382704444744, + "loss": 1.0703671455383301, + "mean_token_accuracy": 0.76742093116045, + "num_tokens": 3896404.0, "step": 970 }, { - "entropy": 1.2256763622164726, - "epoch": 0.9443507588532883, - "grad_norm": 0.06137476861476898, - "learning_rate": 0.00014435429344151454, - "loss": 1.234114933013916, - "mean_token_accuracy": 0.7094191774725914, - "num_tokens": 5783468.0, + "entropy": 1.076941692829132, + "epoch": 0.3750478377344049, + "grad_norm": 0.13239043951034546, + "learning_rate": 0.0001842352625218209, + "loss": 1.1353830337524413, + "mean_token_accuracy": 0.7479456245899201, + "num_tokens": 3934187.0, "step": 980 }, { - "entropy": 1.1099035568535327, - "epoch": 0.9539869910864852, - "grad_norm": 0.052310869097709656, - "learning_rate": 0.00014367816091954023, - "loss": 1.158639907836914, - "mean_token_accuracy": 0.7366219267249108, - "num_tokens": 5840101.0, + "entropy": 1.077423833310604, + "epoch": 0.3788748564867968, + "grad_norm": 0.06203702092170715, + "learning_rate": 0.00018396669799919434, + "loss": 1.1363765716552734, + "mean_token_accuracy": 0.7437105163931846, + "num_tokens": 3975766.0, "step": 990 }, { - "entropy": 1.1962413311004638, - "epoch": 0.963623223319682, - "grad_norm": 0.055140018463134766, - "learning_rate": 0.00014300202839756594, - "loss": 1.210787010192871, - "mean_token_accuracy": 0.7173429980874062, - "num_tokens": 5903466.0, + "entropy": 1.009491826593876, + "epoch": 0.3827018752391887, + "grad_norm": 0.06740409135818481, + "learning_rate": 0.00018369813347656776, + "loss": 1.0752355575561523, + "mean_token_accuracy": 0.7598015293478966, + "num_tokens": 4018368.0, "step": 1000 }, { - "entropy": 1.0866067253053189, - "epoch": 0.9732594555528788, - "grad_norm": 0.07032942771911621, - "learning_rate": 0.00014232589587559162, - "loss": 1.1129253387451172, - "mean_token_accuracy": 0.7413682281970978, - "num_tokens": 5958697.0, + "entropy": 0.9744428530335426, + "epoch": 0.3865288939915806, + "grad_norm": 0.07750537246465683, + "learning_rate": 0.00018342956895394118, + "loss": 1.0554892539978027, + "mean_token_accuracy": 0.7682438552379608, + "num_tokens": 4057647.0, "step": 1010 }, { - "entropy": 1.1695112690329552, - "epoch": 0.9828956877860756, - "grad_norm": 0.06965778768062592, - "learning_rate": 0.00014164976335361734, - "loss": 1.2002062797546387, - "mean_token_accuracy": 0.7275112867355347, - "num_tokens": 6018820.0, + "entropy": 1.0335246473550797, + "epoch": 0.39035591274397247, + "grad_norm": 0.07627248764038086, + "learning_rate": 0.00018316100443131463, + "loss": 1.0600407600402832, + "mean_token_accuracy": 0.7552958622574806, + "num_tokens": 4098377.0, "step": 1020 }, { - "entropy": 1.129349136352539, - "epoch": 0.9925319200192725, - "grad_norm": 0.0639411062002182, - "learning_rate": 0.00014097363083164302, - "loss": 1.1613442420959472, - "mean_token_accuracy": 0.7342579141259193, - "num_tokens": 6074337.0, + "entropy": 1.0256185740232469, + "epoch": 0.39418293149636435, + "grad_norm": 0.10117889940738678, + "learning_rate": 0.00018289243990868807, + "loss": 1.0706727027893066, + "mean_token_accuracy": 0.75880047082901, + "num_tokens": 4141633.0, "step": 1030 }, { - "epoch": 1.0, - "eval_entropy": 1.0300818726266783, - "eval_loss": 1.032158374786377, - "eval_mean_token_accuracy": 0.7592671683222431, - "eval_num_tokens": 6115944.0, - "eval_runtime": 513.4443, - "eval_samples_per_second": 1.7, - "eval_steps_per_second": 1.7, - "step": 1038 + "entropy": 0.9883378148078918, + "epoch": 0.39800995024875624, + "grad_norm": 0.064593605697155, + "learning_rate": 0.00018262387538606152, + "loss": 1.007016372680664, + "mean_token_accuracy": 0.764974731206894, + "num_tokens": 4181155.0, + "step": 1040 + }, + { + "entropy": 1.0692595109343528, + "epoch": 0.4018369690011481, + "grad_norm": 0.07493151724338531, + "learning_rate": 0.00018235531086343497, + "loss": 1.123647975921631, + "mean_token_accuracy": 0.7452841177582741, + "num_tokens": 4218175.0, + "step": 1050 + }, + { + "entropy": 0.990117172151804, + "epoch": 0.40566398775354, + "grad_norm": 0.06332839280366898, + "learning_rate": 0.0001820867463408084, + "loss": 1.0538661003112793, + "mean_token_accuracy": 0.7637043848633767, + "num_tokens": 4262326.0, + "step": 1060 + }, + { + "entropy": 1.002436650544405, + "epoch": 0.4094910065059319, + "grad_norm": 0.07898294180631638, + "learning_rate": 0.00018181818181818183, + "loss": 0.9973239898681641, + "mean_token_accuracy": 0.7644759714603424, + "num_tokens": 4300876.0, + "step": 1070 + }, + { + "entropy": 0.9635432817041873, + "epoch": 0.4133180252583238, + "grad_norm": 0.09760674089193344, + "learning_rate": 0.00018154961729555525, + "loss": 1.0411369323730468, + "mean_token_accuracy": 0.7664693981409073, + "num_tokens": 4338887.0, + "step": 1080 + }, + { + "entropy": 0.9780610945075751, + "epoch": 0.41714504401071567, + "grad_norm": 0.08076441287994385, + "learning_rate": 0.0001812810527729287, + "loss": 1.0544751167297364, + "mean_token_accuracy": 0.76624975502491, + "num_tokens": 4380678.0, + "step": 1090 + }, + { + "entropy": 1.0548741944134234, + "epoch": 0.42097206276310756, + "grad_norm": 0.0646439641714096, + "learning_rate": 0.00018101248825030215, + "loss": 1.128230667114258, + "mean_token_accuracy": 0.7504925444722176, + "num_tokens": 4422899.0, + "step": 1100 + }, + { + "entropy": 1.0767812803387642, + "epoch": 0.42479908151549944, + "grad_norm": 0.06994366645812988, + "learning_rate": 0.0001807439237276756, + "loss": 1.1209583282470703, + "mean_token_accuracy": 0.7477620646357537, + "num_tokens": 4461864.0, + "step": 1110 + }, + { + "entropy": 1.062944334745407, + "epoch": 0.42862610026789133, + "grad_norm": 0.11016593873500824, + "learning_rate": 0.00018047535920504904, + "loss": 1.0880105018615722, + "mean_token_accuracy": 0.7455767750740051, + "num_tokens": 4501378.0, + "step": 1120 + }, + { + "entropy": 1.0511136516928672, + "epoch": 0.4324531190202832, + "grad_norm": 0.08707646280527115, + "learning_rate": 0.00018020679468242246, + "loss": 1.0764313697814942, + "mean_token_accuracy": 0.7519838035106658, + "num_tokens": 4541448.0, + "step": 1130 + }, + { + "entropy": 0.9529998056590557, + "epoch": 0.4362801377726751, + "grad_norm": 0.07353853434324265, + "learning_rate": 0.00017993823015979588, + "loss": 1.0098756790161132, + "mean_token_accuracy": 0.7720077604055404, + "num_tokens": 4586147.0, + "step": 1140 + }, + { + "entropy": 1.143844011425972, + "epoch": 0.440107156525067, + "grad_norm": 0.06268489360809326, + "learning_rate": 0.00017966966563716933, + "loss": 1.1934361457824707, + "mean_token_accuracy": 0.7281116575002671, + "num_tokens": 4631906.0, + "step": 1150 + }, + { + "entropy": 1.0761573910713196, + "epoch": 0.4439341752774589, + "grad_norm": 0.07078517228364944, + "learning_rate": 0.00017940110111454278, + "loss": 1.1359615325927734, + "mean_token_accuracy": 0.7409780561923981, + "num_tokens": 4674626.0, + "step": 1160 + }, + { + "entropy": 0.9940036550164223, + "epoch": 0.44776119402985076, + "grad_norm": 0.08054502308368683, + "learning_rate": 0.00017913253659191622, + "loss": 1.033839225769043, + "mean_token_accuracy": 0.7682256817817688, + "num_tokens": 4715717.0, + "step": 1170 + }, + { + "entropy": 0.9752239182591438, + "epoch": 0.45158821278224265, + "grad_norm": 0.08600450307130814, + "learning_rate": 0.00017886397206928967, + "loss": 1.0254844665527343, + "mean_token_accuracy": 0.7688430979847908, + "num_tokens": 4747316.0, + "step": 1180 + }, + { + "entropy": 1.064694558084011, + "epoch": 0.45541523153463453, + "grad_norm": 0.07270248234272003, + "learning_rate": 0.0001785954075466631, + "loss": 1.0806646347045898, + "mean_token_accuracy": 0.7534265503287315, + "num_tokens": 4788606.0, + "step": 1190 + }, + { + "entropy": 0.958549628406763, + "epoch": 0.4592422502870264, + "grad_norm": 0.0644846111536026, + "learning_rate": 0.00017832684302403654, + "loss": 1.0015847206115722, + "mean_token_accuracy": 0.7644492238759995, + "num_tokens": 4831371.0, + "step": 1200 + }, + { + "entropy": 1.0885677203536033, + "epoch": 0.4630692690394183, + "grad_norm": 0.13487283885478973, + "learning_rate": 0.00017805827850140996, + "loss": 1.1495524406433106, + "mean_token_accuracy": 0.7414823487401009, + "num_tokens": 4871231.0, + "step": 1210 + }, + { + "entropy": 1.1117899976670742, + "epoch": 0.4668962877918102, + "grad_norm": 0.08015701174736023, + "learning_rate": 0.0001777897139787834, + "loss": 1.1366958618164062, + "mean_token_accuracy": 0.7351289570331574, + "num_tokens": 4911520.0, + "step": 1220 + }, + { + "entropy": 0.9722193017601967, + "epoch": 0.4707233065442021, + "grad_norm": 0.06839531660079956, + "learning_rate": 0.00017752114945615685, + "loss": 1.0259140968322753, + "mean_token_accuracy": 0.7658233359456063, + "num_tokens": 4950296.0, + "step": 1230 + }, + { + "entropy": 1.0021446757018566, + "epoch": 0.47455032529659397, + "grad_norm": 0.08231978863477707, + "learning_rate": 0.0001772525849335303, + "loss": 1.0437036514282227, + "mean_token_accuracy": 0.7644398525357247, + "num_tokens": 4989688.0, + "step": 1240 + }, + { + "entropy": 0.9640353135764599, + "epoch": 0.47837734404898585, + "grad_norm": 0.11587074398994446, + "learning_rate": 0.00017698402041090375, + "loss": 1.0072126388549805, + "mean_token_accuracy": 0.7740294471383095, + "num_tokens": 5029135.0, + "step": 1250 + }, + { + "entropy": 1.0122342824935913, + "epoch": 0.48220436280137774, + "grad_norm": 0.07646426558494568, + "learning_rate": 0.00017671545588827717, + "loss": 1.0733034133911132, + "mean_token_accuracy": 0.7619047269225121, + "num_tokens": 5066488.0, + "step": 1260 + }, + { + "entropy": 1.0465880073606968, + "epoch": 0.4860313815537696, + "grad_norm": 0.07594821602106094, + "learning_rate": 0.0001764468913656506, + "loss": 1.0953254699707031, + "mean_token_accuracy": 0.7511951208114624, + "num_tokens": 5102103.0, + "step": 1270 + }, + { + "entropy": 1.0104024082422256, + "epoch": 0.4898584003061615, + "grad_norm": 0.07695835083723068, + "learning_rate": 0.00017617832684302403, + "loss": 1.1025714874267578, + "mean_token_accuracy": 0.7583977058529854, + "num_tokens": 5141113.0, + "step": 1280 + }, + { + "entropy": 1.044089037179947, + "epoch": 0.4936854190585534, + "grad_norm": 0.07186906039714813, + "learning_rate": 0.00017590976232039748, + "loss": 1.0713683128356934, + "mean_token_accuracy": 0.7546971932053566, + "num_tokens": 5181454.0, + "step": 1290 + }, + { + "entropy": 0.9154780797660351, + "epoch": 0.4975124378109453, + "grad_norm": 0.08934911340475082, + "learning_rate": 0.00017564119779777093, + "loss": 0.9871469497680664, + "mean_token_accuracy": 0.7756836161017417, + "num_tokens": 5214143.0, + "step": 1300 + }, + { + "entropy": 1.05635926425457, + "epoch": 0.5013394565633371, + "grad_norm": 0.07880513370037079, + "learning_rate": 0.00017537263327514437, + "loss": 1.0985527038574219, + "mean_token_accuracy": 0.7524559125304222, + "num_tokens": 5258310.0, + "step": 1310 + }, + { + "entropy": 1.0448619149625302, + "epoch": 0.505166475315729, + "grad_norm": 0.10507462918758392, + "learning_rate": 0.0001751040687525178, + "loss": 1.1040778160095215, + "mean_token_accuracy": 0.7477249845862388, + "num_tokens": 5296865.0, + "step": 1320 + }, + { + "entropy": 1.0706947155296802, + "epoch": 0.5089934940681209, + "grad_norm": 0.09437765926122665, + "learning_rate": 0.00017483550422989124, + "loss": 1.1372867584228517, + "mean_token_accuracy": 0.7502224639058113, + "num_tokens": 5335301.0, + "step": 1330 + }, + { + "entropy": 0.9736435614526272, + "epoch": 0.5128205128205128, + "grad_norm": 0.07162626087665558, + "learning_rate": 0.0001745669397072647, + "loss": 1.0273897171020507, + "mean_token_accuracy": 0.7711918234825135, + "num_tokens": 5372533.0, + "step": 1340 + }, + { + "entropy": 0.9989161014556884, + "epoch": 0.5166475315729047, + "grad_norm": 0.08805254101753235, + "learning_rate": 0.0001742983751846381, + "loss": 1.0603778839111329, + "mean_token_accuracy": 0.7600028276443481, + "num_tokens": 5412651.0, + "step": 1350 + }, + { + "entropy": 1.063752220571041, + "epoch": 0.5204745503252965, + "grad_norm": 0.08056829869747162, + "learning_rate": 0.00017402981066201156, + "loss": 1.0876687049865723, + "mean_token_accuracy": 0.7487231969833374, + "num_tokens": 5454518.0, + "step": 1360 + }, + { + "entropy": 0.966426993906498, + "epoch": 0.5243015690776884, + "grad_norm": 0.06970727443695068, + "learning_rate": 0.000173761246139385, + "loss": 1.0302441596984864, + "mean_token_accuracy": 0.7640074551105499, + "num_tokens": 5495257.0, + "step": 1370 + }, + { + "entropy": 0.9727898858487606, + "epoch": 0.5281285878300803, + "grad_norm": 0.09694326668977737, + "learning_rate": 0.00017349268161675842, + "loss": 1.0327792167663574, + "mean_token_accuracy": 0.7687779292464256, + "num_tokens": 5527573.0, + "step": 1380 + }, + { + "entropy": 1.043993879854679, + "epoch": 0.5319556065824722, + "grad_norm": 0.05676735192537308, + "learning_rate": 0.00017322411709413187, + "loss": 1.1139988899230957, + "mean_token_accuracy": 0.7597961351275444, + "num_tokens": 5566542.0, + "step": 1390 + }, + { + "entropy": 0.9891408108174801, + "epoch": 0.5357826253348641, + "grad_norm": 0.08670998364686966, + "learning_rate": 0.00017295555257150532, + "loss": 1.0878351211547852, + "mean_token_accuracy": 0.7640718072652817, + "num_tokens": 5604986.0, + "step": 1400 + }, + { + "entropy": 1.0097288101911546, + "epoch": 0.539609644087256, + "grad_norm": 0.09190856665372849, + "learning_rate": 0.00017268698804887876, + "loss": 1.079444408416748, + "mean_token_accuracy": 0.7590912491083145, + "num_tokens": 5642224.0, + "step": 1410 + }, + { + "entropy": 0.9927844725549221, + "epoch": 0.5434366628396479, + "grad_norm": 0.08191007375717163, + "learning_rate": 0.00017241842352625218, + "loss": 1.0661033630371093, + "mean_token_accuracy": 0.7664914444088936, + "num_tokens": 5680912.0, + "step": 1420 + }, + { + "entropy": 0.973179691657424, + "epoch": 0.5472636815920398, + "grad_norm": 0.08161566406488419, + "learning_rate": 0.00017214985900362563, + "loss": 1.078667163848877, + "mean_token_accuracy": 0.7686992704868316, + "num_tokens": 5717171.0, + "step": 1430 + }, + { + "entropy": 1.0467095457017421, + "epoch": 0.5510907003444316, + "grad_norm": 0.09403429925441742, + "learning_rate": 0.00017188129448099908, + "loss": 1.0912303924560547, + "mean_token_accuracy": 0.7550861686468124, + "num_tokens": 5755956.0, + "step": 1440 + }, + { + "entropy": 1.0082954704761504, + "epoch": 0.5549177190968235, + "grad_norm": 0.09858231991529465, + "learning_rate": 0.0001716127299583725, + "loss": 1.0449023246765137, + "mean_token_accuracy": 0.7586705282330513, + "num_tokens": 5798765.0, + "step": 1450 + }, + { + "entropy": 1.0528397418558597, + "epoch": 0.5587447378492154, + "grad_norm": 0.06697855144739151, + "learning_rate": 0.00017134416543574594, + "loss": 1.0901053428649903, + "mean_token_accuracy": 0.7517547190189362, + "num_tokens": 5839833.0, + "step": 1460 + }, + { + "entropy": 1.009619940817356, + "epoch": 0.5625717566016073, + "grad_norm": 0.07271189987659454, + "learning_rate": 0.0001710756009131194, + "loss": 1.0171070098876953, + "mean_token_accuracy": 0.7594649389386177, + "num_tokens": 5880613.0, + "step": 1470 + }, + { + "entropy": 0.9699329622089863, + "epoch": 0.5663987753539992, + "grad_norm": 0.07800697535276413, + "learning_rate": 0.0001708070363904928, + "loss": 1.1077412605285644, + "mean_token_accuracy": 0.7667307928204536, + "num_tokens": 5918218.0, + "step": 1480 + }, + { + "entropy": 0.9389957278966904, + "epoch": 0.5702257941063911, + "grad_norm": 0.08150342851877213, + "learning_rate": 0.00017053847186786626, + "loss": 0.9634763717651367, + "mean_token_accuracy": 0.7806087970733643, + "num_tokens": 5960284.0, + "step": 1490 + }, + { + "entropy": 1.0140163496136665, + "epoch": 0.574052812858783, + "grad_norm": 0.06430503726005554, + "learning_rate": 0.0001702699073452397, + "loss": 1.0751851081848145, + "mean_token_accuracy": 0.7564210310578346, + "num_tokens": 6000562.0, + "step": 1500 + }, + { + "entropy": 1.047791599482298, + "epoch": 0.5778798316111748, + "grad_norm": 0.07922326028347015, + "learning_rate": 0.00017000134282261313, + "loss": 1.1397698402404786, + "mean_token_accuracy": 0.7494736298918724, + "num_tokens": 6045192.0, + "step": 1510 + }, + { + "entropy": 1.1085532158613205, + "epoch": 0.5817068503635667, + "grad_norm": 0.1093953400850296, + "learning_rate": 0.00016973277829998657, + "loss": 1.142368698120117, + "mean_token_accuracy": 0.7414237394928932, + "num_tokens": 6090148.0, + "step": 1520 + }, + { + "entropy": 0.9427969709038735, + "epoch": 0.5855338691159586, + "grad_norm": 0.09579843282699585, + "learning_rate": 0.00016946421377736002, + "loss": 0.9980224609375, + "mean_token_accuracy": 0.7730853497982025, + "num_tokens": 6129845.0, + "step": 1530 + }, + { + "entropy": 1.0571323171257974, + "epoch": 0.5893608878683505, + "grad_norm": 0.09482655674219131, + "learning_rate": 0.00016919564925473347, + "loss": 1.0665513038635255, + "mean_token_accuracy": 0.749831511080265, + "num_tokens": 6171961.0, + "step": 1540 + }, + { + "entropy": 0.978803563863039, + "epoch": 0.5931879066207424, + "grad_norm": 0.08609842509031296, + "learning_rate": 0.0001689270847321069, + "loss": 1.0541341781616211, + "mean_token_accuracy": 0.769312071800232, + "num_tokens": 6210126.0, + "step": 1550 + }, + { + "entropy": 1.0714900024235248, + "epoch": 0.5970149253731343, + "grad_norm": 0.05390879884362221, + "learning_rate": 0.00016865852020948033, + "loss": 1.1057682037353516, + "mean_token_accuracy": 0.7402923837304115, + "num_tokens": 6262394.0, + "step": 1560 + }, + { + "entropy": 0.9234071888029576, + "epoch": 0.6008419441255262, + "grad_norm": 0.09692159295082092, + "learning_rate": 0.00016838995568685378, + "loss": 0.9622941017150879, + "mean_token_accuracy": 0.7832354381680489, + "num_tokens": 6292140.0, + "step": 1570 + }, + { + "entropy": 0.9591108359396457, + "epoch": 0.604668962877918, + "grad_norm": 0.0720645934343338, + "learning_rate": 0.0001681213911642272, + "loss": 1.019169235229492, + "mean_token_accuracy": 0.771478471159935, + "num_tokens": 6332134.0, + "step": 1580 + }, + { + "entropy": 1.0945825845003128, + "epoch": 0.6084959816303099, + "grad_norm": 0.07380460202693939, + "learning_rate": 0.00016785282664160065, + "loss": 1.1463271141052247, + "mean_token_accuracy": 0.7419712334871292, + "num_tokens": 6373967.0, + "step": 1590 + }, + { + "entropy": 1.055589073896408, + "epoch": 0.6123230003827018, + "grad_norm": 0.07209772616624832, + "learning_rate": 0.0001675842621189741, + "loss": 1.1144783973693848, + "mean_token_accuracy": 0.7461996227502823, + "num_tokens": 6418236.0, + "step": 1600 + }, + { + "entropy": 1.1149650782346725, + "epoch": 0.6161500191350937, + "grad_norm": 0.07935164868831635, + "learning_rate": 0.00016731569759634754, + "loss": 1.1727729797363282, + "mean_token_accuracy": 0.7365738078951836, + "num_tokens": 6464589.0, + "step": 1610 + }, + { + "entropy": 1.0068970195949078, + "epoch": 0.6199770378874856, + "grad_norm": 0.0804886594414711, + "learning_rate": 0.00016704713307372096, + "loss": 1.0483062744140625, + "mean_token_accuracy": 0.7632505163550377, + "num_tokens": 6504385.0, + "step": 1620 + }, + { + "entropy": 0.9996877416968346, + "epoch": 0.6238040566398775, + "grad_norm": 0.0723455473780632, + "learning_rate": 0.0001667785685510944, + "loss": 1.0592655181884765, + "mean_token_accuracy": 0.7597218692302704, + "num_tokens": 6545928.0, + "step": 1630 + }, + { + "entropy": 0.9159566629678011, + "epoch": 0.6276310753922695, + "grad_norm": 0.08028513193130493, + "learning_rate": 0.00016651000402846783, + "loss": 0.9434080123901367, + "mean_token_accuracy": 0.7826011970639228, + "num_tokens": 6586166.0, + "step": 1640 + }, + { + "entropy": 1.0089009895920753, + "epoch": 0.6314580941446614, + "grad_norm": 0.09154181182384491, + "learning_rate": 0.00016624143950584128, + "loss": 1.0339744567871094, + "mean_token_accuracy": 0.7604109585285187, + "num_tokens": 6624706.0, + "step": 1650 + }, + { + "entropy": 1.0208431974053382, + "epoch": 0.6352851128970533, + "grad_norm": 0.08039630204439163, + "learning_rate": 0.00016597287498321472, + "loss": 1.0823830604553222, + "mean_token_accuracy": 0.7548153042793274, + "num_tokens": 6665626.0, + "step": 1660 + }, + { + "entropy": 0.9645413011312485, + "epoch": 0.6391121316494451, + "grad_norm": 0.08834270387887955, + "learning_rate": 0.00016570431046058817, + "loss": 1.0401340484619142, + "mean_token_accuracy": 0.7703565835952759, + "num_tokens": 6699532.0, + "step": 1670 + }, + { + "entropy": 1.0028597339987755, + "epoch": 0.642939150401837, + "grad_norm": 0.08974612504243851, + "learning_rate": 0.00016543574593796162, + "loss": 1.0680928230285645, + "mean_token_accuracy": 0.7617413088679313, + "num_tokens": 6740574.0, + "step": 1680 + }, + { + "entropy": 0.8952145710587501, + "epoch": 0.6467661691542289, + "grad_norm": 0.09289242327213287, + "learning_rate": 0.00016516718141533504, + "loss": 0.9696210861206055, + "mean_token_accuracy": 0.7848611980676651, + "num_tokens": 6782208.0, + "step": 1690 + }, + { + "entropy": 0.9520700328052044, + "epoch": 0.6505931879066208, + "grad_norm": 0.07298107445240021, + "learning_rate": 0.00016489861689270848, + "loss": 0.9891908645629883, + "mean_token_accuracy": 0.7725306749343872, + "num_tokens": 6818937.0, + "step": 1700 + }, + { + "entropy": 0.9734554067254066, + "epoch": 0.6544202066590127, + "grad_norm": 0.08233233541250229, + "learning_rate": 0.0001646300523700819, + "loss": 1.0311893463134765, + "mean_token_accuracy": 0.7683428943157196, + "num_tokens": 6851548.0, + "step": 1710 + }, + { + "entropy": 0.9947349905967713, + "epoch": 0.6582472254114046, + "grad_norm": 0.08351403474807739, + "learning_rate": 0.00016436148784745535, + "loss": 1.0382192611694336, + "mean_token_accuracy": 0.7587384819984436, + "num_tokens": 6891553.0, + "step": 1720 + }, + { + "entropy": 1.0383310310542584, + "epoch": 0.6620742441637965, + "grad_norm": 0.07240983843803406, + "learning_rate": 0.0001640929233248288, + "loss": 1.0978598594665527, + "mean_token_accuracy": 0.7515711337327957, + "num_tokens": 6930875.0, + "step": 1730 + }, + { + "entropy": 1.085533195734024, + "epoch": 0.6659012629161883, + "grad_norm": 0.06999973207712173, + "learning_rate": 0.00016382435880220225, + "loss": 1.1412755966186523, + "mean_token_accuracy": 0.7471029132604599, + "num_tokens": 6971721.0, + "step": 1740 + }, + { + "entropy": 0.978867219388485, + "epoch": 0.6697282816685802, + "grad_norm": 0.06091843172907829, + "learning_rate": 0.0001635557942795757, + "loss": 1.023170566558838, + "mean_token_accuracy": 0.7686347916722298, + "num_tokens": 7010340.0, + "step": 1750 + }, + { + "entropy": 1.0823599390685559, + "epoch": 0.6735553004209721, + "grad_norm": 0.07732617110013962, + "learning_rate": 0.0001632872297569491, + "loss": 1.1036816596984864, + "mean_token_accuracy": 0.7404509574174881, + "num_tokens": 7061189.0, + "step": 1760 + }, + { + "entropy": 0.9984334908425808, + "epoch": 0.677382319173364, + "grad_norm": 0.11516186594963074, + "learning_rate": 0.00016301866523432253, + "loss": 1.1070829391479493, + "mean_token_accuracy": 0.7593477964401245, + "num_tokens": 7098345.0, + "step": 1770 + }, + { + "entropy": 1.0023914370685816, + "epoch": 0.6812093379257559, + "grad_norm": 0.08624757081270218, + "learning_rate": 0.00016275010071169598, + "loss": 1.043964958190918, + "mean_token_accuracy": 0.7635682225227356, + "num_tokens": 7136873.0, + "step": 1780 + }, + { + "entropy": 1.0823404759168624, + "epoch": 0.6850363566781478, + "grad_norm": 0.0846925675868988, + "learning_rate": 0.00016248153618906943, + "loss": 1.1333115577697754, + "mean_token_accuracy": 0.7394865393638611, + "num_tokens": 7181553.0, + "step": 1790 + }, + { + "entropy": 1.0224985226988792, + "epoch": 0.6888633754305397, + "grad_norm": 0.060152288526296616, + "learning_rate": 0.00016221297166644287, + "loss": 1.0782301902770997, + "mean_token_accuracy": 0.759938097000122, + "num_tokens": 7223076.0, + "step": 1800 + }, + { + "entropy": 1.0880159534513951, + "epoch": 0.6926903941829315, + "grad_norm": 0.06577905267477036, + "learning_rate": 0.00016194440714381632, + "loss": 1.1103734016418456, + "mean_token_accuracy": 0.7457254812121391, + "num_tokens": 7265594.0, + "step": 1810 + }, + { + "entropy": 1.0144932381808758, + "epoch": 0.6965174129353234, + "grad_norm": 0.07276095449924469, + "learning_rate": 0.00016167584262118974, + "loss": 1.0733102798461913, + "mean_token_accuracy": 0.759276558458805, + "num_tokens": 7305359.0, + "step": 1820 + }, + { + "entropy": 1.0186967477202415, + "epoch": 0.7003444316877153, + "grad_norm": 0.08775337040424347, + "learning_rate": 0.0001614072780985632, + "loss": 1.0672088623046876, + "mean_token_accuracy": 0.7567476496100426, + "num_tokens": 7348060.0, + "step": 1830 + }, + { + "entropy": 0.9723582908511161, + "epoch": 0.7041714504401072, + "grad_norm": 0.09250030666589737, + "learning_rate": 0.0001611387135759366, + "loss": 1.0032880783081055, + "mean_token_accuracy": 0.7693375036120415, + "num_tokens": 7387110.0, + "step": 1840 + }, + { + "entropy": 0.9738463938236237, + "epoch": 0.7079984691924991, + "grad_norm": 0.09884033352136612, + "learning_rate": 0.00016087014905331006, + "loss": 1.0514408111572267, + "mean_token_accuracy": 0.7625621780753136, + "num_tokens": 7428511.0, + "step": 1850 + }, + { + "entropy": 1.0910252556204796, + "epoch": 0.711825487944891, + "grad_norm": 0.09194686263799667, + "learning_rate": 0.0001606015845306835, + "loss": 1.1226488113403321, + "mean_token_accuracy": 0.7452121302485466, + "num_tokens": 7474736.0, + "step": 1860 + }, + { + "entropy": 1.0159518368542195, + "epoch": 0.7156525066972829, + "grad_norm": 0.07921712845563889, + "learning_rate": 0.00016033302000805695, + "loss": 1.061795711517334, + "mean_token_accuracy": 0.761272345483303, + "num_tokens": 7516891.0, + "step": 1870 + }, + { + "entropy": 0.8903906352818012, + "epoch": 0.7194795254496748, + "grad_norm": 0.10288332402706146, + "learning_rate": 0.0001600644554854304, + "loss": 0.9471863746643067, + "mean_token_accuracy": 0.7831206247210503, + "num_tokens": 7551597.0, + "step": 1880 + }, + { + "entropy": 1.0759326584637166, + "epoch": 0.7233065442020666, + "grad_norm": 0.06488945335149765, + "learning_rate": 0.00015979589096280382, + "loss": 1.136262798309326, + "mean_token_accuracy": 0.742707334458828, + "num_tokens": 7597529.0, + "step": 1890 + }, + { + "entropy": 0.9951202683150768, + "epoch": 0.7271335629544585, + "grad_norm": 0.06628359109163284, + "learning_rate": 0.00015952732644017724, + "loss": 1.0448930740356446, + "mean_token_accuracy": 0.7615623638033867, + "num_tokens": 7634291.0, + "step": 1900 + }, + { + "entropy": 1.0593122780323028, + "epoch": 0.7309605817068504, + "grad_norm": 0.08212320506572723, + "learning_rate": 0.00015925876191755068, + "loss": 1.1099414825439453, + "mean_token_accuracy": 0.7455122962594032, + "num_tokens": 7677086.0, + "step": 1910 + }, + { + "entropy": 1.0122868783771992, + "epoch": 0.7347876004592423, + "grad_norm": 0.06458455324172974, + "learning_rate": 0.00015899019739492413, + "loss": 1.0447346687316894, + "mean_token_accuracy": 0.7542453840374946, + "num_tokens": 7721785.0, + "step": 1920 + }, + { + "entropy": 1.0801358975470066, + "epoch": 0.7386146192116342, + "grad_norm": 0.06971931457519531, + "learning_rate": 0.00015872163287229758, + "loss": 1.109630012512207, + "mean_token_accuracy": 0.7450100436806679, + "num_tokens": 7761887.0, + "step": 1930 + }, + { + "entropy": 0.9081138484179974, + "epoch": 0.7424416379640261, + "grad_norm": 0.06223156675696373, + "learning_rate": 0.00015845306834967102, + "loss": 1.0026588439941406, + "mean_token_accuracy": 0.7772255912423134, + "num_tokens": 7807072.0, + "step": 1940 + }, + { + "entropy": 1.0170219503343105, + "epoch": 0.746268656716418, + "grad_norm": 0.0685853511095047, + "learning_rate": 0.00015818450382704447, + "loss": 1.055277442932129, + "mean_token_accuracy": 0.7580551549792289, + "num_tokens": 7845791.0, + "step": 1950 + }, + { + "entropy": 1.0753178864717483, + "epoch": 0.7500956754688098, + "grad_norm": 0.08306553959846497, + "learning_rate": 0.0001579159393044179, + "loss": 1.1332826614379883, + "mean_token_accuracy": 0.7421450033783913, + "num_tokens": 7891091.0, + "step": 1960 + }, + { + "entropy": 0.9297005102038384, + "epoch": 0.7539226942212017, + "grad_norm": 0.08018683642148972, + "learning_rate": 0.0001576473747817913, + "loss": 1.000318431854248, + "mean_token_accuracy": 0.7793557167053222, + "num_tokens": 7928252.0, + "step": 1970 + }, + { + "entropy": 1.0840253300964833, + "epoch": 0.7577497129735936, + "grad_norm": 0.06487595289945602, + "learning_rate": 0.00015737881025916476, + "loss": 1.1166275024414063, + "mean_token_accuracy": 0.7378593400120735, + "num_tokens": 7972071.0, + "step": 1980 + }, + { + "entropy": 1.0406386695802212, + "epoch": 0.7615767317259855, + "grad_norm": 0.0615115687251091, + "learning_rate": 0.0001571102457365382, + "loss": 1.0869349479675292, + "mean_token_accuracy": 0.7490768045186996, + "num_tokens": 8016865.0, + "step": 1990 + }, + { + "entropy": 0.9573215276002884, + "epoch": 0.7654037504783774, + "grad_norm": 0.0715412124991417, + "learning_rate": 0.00015684168121391165, + "loss": 1.0404720306396484, + "mean_token_accuracy": 0.7706617951393128, + "num_tokens": 8055917.0, + "step": 2000 + }, + { + "entropy": 0.9201878193765879, + "epoch": 0.7692307692307693, + "grad_norm": 0.07988248765468597, + "learning_rate": 0.0001565731166912851, + "loss": 0.9380558967590332, + "mean_token_accuracy": 0.782890722155571, + "num_tokens": 8093252.0, + "step": 2010 + }, + { + "entropy": 1.0045961767435074, + "epoch": 0.7730577879831612, + "grad_norm": 0.061089444905519485, + "learning_rate": 0.00015630455216865855, + "loss": 1.0528027534484863, + "mean_token_accuracy": 0.7598949059844017, + "num_tokens": 8135244.0, + "step": 2020 + }, + { + "entropy": 0.9942824639379978, + "epoch": 0.776884806735553, + "grad_norm": 0.06443686783313751, + "learning_rate": 0.00015603598764603197, + "loss": 1.0168493270874024, + "mean_token_accuracy": 0.7590687796473503, + "num_tokens": 8178961.0, + "step": 2030 + }, + { + "entropy": 0.9773981764912605, + "epoch": 0.7807118254879449, + "grad_norm": 0.0818348303437233, + "learning_rate": 0.0001557674231234054, + "loss": 1.0193141937255858, + "mean_token_accuracy": 0.7708378821611405, + "num_tokens": 8217139.0, + "step": 2040 + }, + { + "entropy": 0.9836540646851063, + "epoch": 0.7845388442403368, + "grad_norm": 0.06240411475300789, + "learning_rate": 0.00015549885860077883, + "loss": 1.0662775993347169, + "mean_token_accuracy": 0.7658124819397927, + "num_tokens": 8252825.0, + "step": 2050 + }, + { + "entropy": 1.036501456052065, + "epoch": 0.7883658629927287, + "grad_norm": 0.09231610596179962, + "learning_rate": 0.00015523029407815228, + "loss": 1.112645435333252, + "mean_token_accuracy": 0.7541953936219216, + "num_tokens": 8295113.0, + "step": 2060 + }, + { + "entropy": 0.9800528183579444, + "epoch": 0.7921928817451206, + "grad_norm": 0.08806589245796204, + "learning_rate": 0.00015496172955552573, + "loss": 1.0401280403137207, + "mean_token_accuracy": 0.7672899037599563, + "num_tokens": 8335977.0, + "step": 2070 + }, + { + "entropy": 0.9678378522396087, + "epoch": 0.7960199004975125, + "grad_norm": 0.08777868002653122, + "learning_rate": 0.00015469316503289918, + "loss": 1.0509014129638672, + "mean_token_accuracy": 0.7696513712406159, + "num_tokens": 8374917.0, + "step": 2080 + }, + { + "entropy": 1.042826947569847, + "epoch": 0.7998469192499044, + "grad_norm": 0.09018490463495255, + "learning_rate": 0.00015442460051027262, + "loss": 1.0869378089904784, + "mean_token_accuracy": 0.7507286682724953, + "num_tokens": 8415614.0, + "step": 2090 + }, + { + "entropy": 1.0548966623842717, + "epoch": 0.8036739380022963, + "grad_norm": 0.07267605513334274, + "learning_rate": 0.00015415603598764604, + "loss": 1.0960289001464845, + "mean_token_accuracy": 0.7545556098222732, + "num_tokens": 8455059.0, + "step": 2100 + }, + { + "entropy": 1.044345210492611, + "epoch": 0.8075009567546881, + "grad_norm": 0.08414279669523239, + "learning_rate": 0.00015388747146501946, + "loss": 1.1200661659240723, + "mean_token_accuracy": 0.7490431442856789, + "num_tokens": 8493866.0, + "step": 2110 + }, + { + "entropy": 1.0317029684782029, + "epoch": 0.81132797550708, + "grad_norm": 0.06549747288227081, + "learning_rate": 0.0001536189069423929, + "loss": 1.0583623886108398, + "mean_token_accuracy": 0.7555923700332642, + "num_tokens": 8536147.0, + "step": 2120 + }, + { + "entropy": 0.9694572634994983, + "epoch": 0.8151549942594719, + "grad_norm": 0.08112777769565582, + "learning_rate": 0.00015335034241976636, + "loss": 1.0503274917602539, + "mean_token_accuracy": 0.7646921187639236, + "num_tokens": 8578007.0, + "step": 2130 + }, + { + "entropy": 0.9358880028128624, + "epoch": 0.8189820130118638, + "grad_norm": 0.07176466286182404, + "learning_rate": 0.0001530817778971398, + "loss": 1.000410270690918, + "mean_token_accuracy": 0.773740467429161, + "num_tokens": 8620999.0, + "step": 2140 + }, + { + "entropy": 1.0444137938320637, + "epoch": 0.8228090317642557, + "grad_norm": 0.06355756521224976, + "learning_rate": 0.00015281321337451325, + "loss": 1.0860448837280274, + "mean_token_accuracy": 0.751850588619709, + "num_tokens": 8663354.0, + "step": 2150 + }, + { + "entropy": 0.9044980220496655, + "epoch": 0.8266360505166476, + "grad_norm": 0.080223448574543, + "learning_rate": 0.00015254464885188667, + "loss": 0.9434403419494629, + "mean_token_accuracy": 0.7828752338886261, + "num_tokens": 8699748.0, + "step": 2160 + }, + { + "entropy": 1.0172922544181346, + "epoch": 0.8304630692690395, + "grad_norm": 0.06971501559019089, + "learning_rate": 0.00015227608432926012, + "loss": 1.0325962066650392, + "mean_token_accuracy": 0.7651202365756035, + "num_tokens": 8739901.0, + "step": 2170 + }, + { + "entropy": 0.9639742732048034, + "epoch": 0.8342900880214313, + "grad_norm": 0.06396778672933578, + "learning_rate": 0.00015200751980663354, + "loss": 1.0435317039489747, + "mean_token_accuracy": 0.7667818054556846, + "num_tokens": 8778980.0, + "step": 2180 + }, + { + "entropy": 0.8876220636069775, + "epoch": 0.8381171067738232, + "grad_norm": 0.09910868853330612, + "learning_rate": 0.00015173895528400698, + "loss": 0.9876300811767578, + "mean_token_accuracy": 0.7865215808153152, + "num_tokens": 8815525.0, + "step": 2190 + }, + { + "entropy": 1.0369405087083579, + "epoch": 0.8419441255262151, + "grad_norm": 0.08775259554386139, + "learning_rate": 0.00015147039076138043, + "loss": 1.1244413375854492, + "mean_token_accuracy": 0.7550949841737747, + "num_tokens": 8857085.0, + "step": 2200 + }, + { + "entropy": 0.9762422502040863, + "epoch": 0.845771144278607, + "grad_norm": 0.08659302443265915, + "learning_rate": 0.00015120182623875388, + "loss": 1.0164811134338378, + "mean_token_accuracy": 0.771617329120636, + "num_tokens": 8894271.0, + "step": 2210 + }, + { + "entropy": 0.9543228000402451, + "epoch": 0.8495981630309989, + "grad_norm": 0.09588434547185898, + "learning_rate": 0.00015093326171612733, + "loss": 1.0303520202636718, + "mean_token_accuracy": 0.768992331624031, + "num_tokens": 8934095.0, + "step": 2220 + }, + { + "entropy": 1.1307236567139625, + "epoch": 0.8534251817833908, + "grad_norm": 0.07016360014677048, + "learning_rate": 0.00015066469719350075, + "loss": 1.1526556968688966, + "mean_token_accuracy": 0.7296861469745636, + "num_tokens": 8982341.0, + "step": 2230 + }, + { + "entropy": 1.0867296956479549, + "epoch": 0.8572522005357827, + "grad_norm": 0.07838597148656845, + "learning_rate": 0.00015039613267087417, + "loss": 1.1031158447265625, + "mean_token_accuracy": 0.7445572927594185, + "num_tokens": 9027401.0, + "step": 2240 + }, + { + "entropy": 0.9492381684482097, + "epoch": 0.8610792192881745, + "grad_norm": 0.08416638523340225, + "learning_rate": 0.0001501275681482476, + "loss": 1.0079804420471192, + "mean_token_accuracy": 0.7709973976016045, + "num_tokens": 9069985.0, + "step": 2250 + }, + { + "entropy": 0.9767517909407616, + "epoch": 0.8649062380405664, + "grad_norm": 0.09798935055732727, + "learning_rate": 0.00014985900362562106, + "loss": 1.0394697189331055, + "mean_token_accuracy": 0.7647709026932716, + "num_tokens": 9108246.0, + "step": 2260 + }, + { + "entropy": 0.9779160171747208, + "epoch": 0.8687332567929583, + "grad_norm": 0.08669373393058777, + "learning_rate": 0.0001495904391029945, + "loss": 1.0398100852966308, + "mean_token_accuracy": 0.7669417649507523, + "num_tokens": 9147055.0, + "step": 2270 + }, + { + "entropy": 1.014696953445673, + "epoch": 0.8725602755453502, + "grad_norm": 0.07674991339445114, + "learning_rate": 0.00014932187458036795, + "loss": 1.0742408752441406, + "mean_token_accuracy": 0.7583330690860748, + "num_tokens": 9187727.0, + "step": 2280 + }, + { + "entropy": 0.9619584158062935, + "epoch": 0.8763872942977421, + "grad_norm": 0.09512930363416672, + "learning_rate": 0.00014905331005774137, + "loss": 1.01895112991333, + "mean_token_accuracy": 0.7718996241688728, + "num_tokens": 9228518.0, + "step": 2290 + }, + { + "entropy": 0.8759313493967056, + "epoch": 0.880214313050134, + "grad_norm": 0.06927543133497238, + "learning_rate": 0.00014878474553511482, + "loss": 0.9590776443481446, + "mean_token_accuracy": 0.783099564909935, + "num_tokens": 9269392.0, + "step": 2300 + }, + { + "entropy": 1.0930156745016575, + "epoch": 0.8840413318025259, + "grad_norm": 0.07149595022201538, + "learning_rate": 0.00014851618101248824, + "loss": 1.132398796081543, + "mean_token_accuracy": 0.7445679202675819, + "num_tokens": 9310993.0, + "step": 2310 + }, + { + "entropy": 0.9991384916007519, + "epoch": 0.8878683505549178, + "grad_norm": 0.100126251578331, + "learning_rate": 0.0001482476164898617, + "loss": 1.0395862579345703, + "mean_token_accuracy": 0.7618231356143952, + "num_tokens": 9349210.0, + "step": 2320 + }, + { + "entropy": 0.9891969002783298, + "epoch": 0.8916953693073096, + "grad_norm": 0.07942050695419312, + "learning_rate": 0.00014797905196723514, + "loss": 1.0403067588806152, + "mean_token_accuracy": 0.7636258214712143, + "num_tokens": 9386251.0, + "step": 2330 + }, + { + "entropy": 1.034816125780344, + "epoch": 0.8955223880597015, + "grad_norm": 0.07803855836391449, + "learning_rate": 0.00014771048744460858, + "loss": 1.088371467590332, + "mean_token_accuracy": 0.7585563778877258, + "num_tokens": 9425492.0, + "step": 2340 + }, + { + "entropy": 0.998091223090887, + "epoch": 0.8993494068120934, + "grad_norm": 0.06696243584156036, + "learning_rate": 0.00014744192292198203, + "loss": 1.0410521507263184, + "mean_token_accuracy": 0.7595112159848213, + "num_tokens": 9466862.0, + "step": 2350 + }, + { + "entropy": 0.9615898832678795, + "epoch": 0.9031764255644853, + "grad_norm": 0.07813845574855804, + "learning_rate": 0.00014717335839935545, + "loss": 1.0265610694885254, + "mean_token_accuracy": 0.7707905381917953, + "num_tokens": 9503827.0, + "step": 2360 + }, + { + "entropy": 0.8776158876717091, + "epoch": 0.9070034443168772, + "grad_norm": 0.10287057608366013, + "learning_rate": 0.0001469047938767289, + "loss": 0.9231206893920898, + "mean_token_accuracy": 0.7909859612584114, + "num_tokens": 9536194.0, + "step": 2370 + }, + { + "entropy": 0.980732673406601, + "epoch": 0.9108304630692691, + "grad_norm": 0.06174289435148239, + "learning_rate": 0.00014663622935410232, + "loss": 1.0316704750061034, + "mean_token_accuracy": 0.7596900418400765, + "num_tokens": 9577621.0, + "step": 2380 + }, + { + "entropy": 1.0083129487931728, + "epoch": 0.914657481821661, + "grad_norm": 0.08805451542139053, + "learning_rate": 0.00014636766483147576, + "loss": 1.0296180725097657, + "mean_token_accuracy": 0.7577597886323929, + "num_tokens": 9616522.0, + "step": 2390 + }, + { + "entropy": 1.0002505116164684, + "epoch": 0.9184845005740528, + "grad_norm": 0.07697928696870804, + "learning_rate": 0.0001460991003088492, + "loss": 1.0411831855773925, + "mean_token_accuracy": 0.7589930936694145, + "num_tokens": 9659217.0, + "step": 2400 + }, + { + "entropy": 0.971958789229393, + "epoch": 0.9223115193264447, + "grad_norm": 0.08504882454872131, + "learning_rate": 0.00014583053578622266, + "loss": 1.015835952758789, + "mean_token_accuracy": 0.7664303690195083, + "num_tokens": 9694120.0, + "step": 2410 + }, + { + "entropy": 0.9250703640282154, + "epoch": 0.9261385380788366, + "grad_norm": 0.06279303133487701, + "learning_rate": 0.00014556197126359608, + "loss": 0.9673631668090821, + "mean_token_accuracy": 0.782692727446556, + "num_tokens": 9732460.0, + "step": 2420 + }, + { + "entropy": 1.0777716524899006, + "epoch": 0.9299655568312285, + "grad_norm": 0.06884833425283432, + "learning_rate": 0.00014529340674096952, + "loss": 1.1415311813354492, + "mean_token_accuracy": 0.7447684407234192, + "num_tokens": 9773760.0, + "step": 2430 + }, + { + "entropy": 1.0116477236151695, + "epoch": 0.9337925755836204, + "grad_norm": 0.06346814334392548, + "learning_rate": 0.00014502484221834297, + "loss": 1.0904932975769044, + "mean_token_accuracy": 0.7616935014724732, + "num_tokens": 9808910.0, + "step": 2440 + }, + { + "entropy": 0.9434679664671421, + "epoch": 0.9376195943360123, + "grad_norm": 0.09843038022518158, + "learning_rate": 0.0001447562776957164, + "loss": 1.0111047744750976, + "mean_token_accuracy": 0.774254959821701, + "num_tokens": 9846472.0, + "step": 2450 + }, + { + "entropy": 1.035598163306713, + "epoch": 0.9414466130884042, + "grad_norm": 0.08025770634412766, + "learning_rate": 0.00014448771317308984, + "loss": 1.1550275802612304, + "mean_token_accuracy": 0.7497850373387337, + "num_tokens": 9885082.0, + "step": 2460 + }, + { + "entropy": 1.057615876197815, + "epoch": 0.945273631840796, + "grad_norm": 0.07916443794965744, + "learning_rate": 0.00014421914865046329, + "loss": 1.114585781097412, + "mean_token_accuracy": 0.7495191320776939, + "num_tokens": 9924849.0, + "step": 2470 + }, + { + "entropy": 0.9576205931603908, + "epoch": 0.9491006505931879, + "grad_norm": 0.10745597630739212, + "learning_rate": 0.00014395058412783673, + "loss": 1.0471231460571289, + "mean_token_accuracy": 0.7697127804160118, + "num_tokens": 9969210.0, + "step": 2480 + }, + { + "entropy": 1.012363300472498, + "epoch": 0.9529276693455798, + "grad_norm": 0.09448845684528351, + "learning_rate": 0.00014368201960521015, + "loss": 1.0322566986083985, + "mean_token_accuracy": 0.7568502962589264, + "num_tokens": 10009532.0, + "step": 2490 + }, + { + "entropy": 0.9387446999549866, + "epoch": 0.9567546880979717, + "grad_norm": 0.08835543692111969, + "learning_rate": 0.0001434134550825836, + "loss": 0.9836790084838867, + "mean_token_accuracy": 0.7740270137786865, + "num_tokens": 10051767.0, + "step": 2500 + }, + { + "entropy": 1.043863268941641, + "epoch": 0.9605817068503636, + "grad_norm": 0.0590866394340992, + "learning_rate": 0.00014314489055995705, + "loss": 1.1286373138427734, + "mean_token_accuracy": 0.755294018983841, + "num_tokens": 10093518.0, + "step": 2510 + }, + { + "entropy": 1.068480123579502, + "epoch": 0.9644087256027555, + "grad_norm": 0.06240773946046829, + "learning_rate": 0.00014287632603733047, + "loss": 1.1243531227111816, + "mean_token_accuracy": 0.7457959160208703, + "num_tokens": 10137842.0, + "step": 2520 + }, + { + "entropy": 0.9648511357605457, + "epoch": 0.9682357443551474, + "grad_norm": 0.07577214390039444, + "learning_rate": 0.00014260776151470391, + "loss": 1.0646875381469727, + "mean_token_accuracy": 0.7689151406288147, + "num_tokens": 10177541.0, + "step": 2530 + }, + { + "entropy": 1.0034234993159772, + "epoch": 0.9720627631075393, + "grad_norm": 0.06887607276439667, + "learning_rate": 0.00014233919699207736, + "loss": 1.0736650466918944, + "mean_token_accuracy": 0.7580653995275497, + "num_tokens": 10217056.0, + "step": 2540 + }, + { + "entropy": 0.9054977536201477, + "epoch": 0.9758897818599311, + "grad_norm": 0.12731540203094482, + "learning_rate": 0.00014207063246945078, + "loss": 0.9581779479980469, + "mean_token_accuracy": 0.7800818130373954, + "num_tokens": 10249622.0, + "step": 2550 + }, + { + "entropy": 1.0892111197113992, + "epoch": 0.979716800612323, + "grad_norm": 0.08707671612501144, + "learning_rate": 0.00014180206794682423, + "loss": 1.1551457405090333, + "mean_token_accuracy": 0.7434241071343421, + "num_tokens": 10287483.0, + "step": 2560 + }, + { + "entropy": 0.9462251186370849, + "epoch": 0.9835438193647149, + "grad_norm": 0.10457631945610046, + "learning_rate": 0.00014153350342419768, + "loss": 0.9859563827514648, + "mean_token_accuracy": 0.7729493409395218, + "num_tokens": 10324562.0, + "step": 2570 + }, + { + "entropy": 0.9609014384448529, + "epoch": 0.9873708381171068, + "grad_norm": 0.1095169261097908, + "learning_rate": 0.0001412649389015711, + "loss": 1.00408992767334, + "mean_token_accuracy": 0.769461353123188, + "num_tokens": 10368482.0, + "step": 2580 + }, + { + "entropy": 0.9500531531870365, + "epoch": 0.9911978568694987, + "grad_norm": 0.12787973880767822, + "learning_rate": 0.00014099637437894454, + "loss": 1.0082733154296875, + "mean_token_accuracy": 0.7726384818553924, + "num_tokens": 10407666.0, + "step": 2590 + }, + { + "entropy": 0.9639500208199024, + "epoch": 0.9950248756218906, + "grad_norm": 0.08555731922388077, + "learning_rate": 0.000140727809856318, + "loss": 0.9910324096679688, + "mean_token_accuracy": 0.7700270056724549, + "num_tokens": 10445419.0, + "step": 2600 + }, + { + "entropy": 0.9984636768698693, + "epoch": 0.9988518943742825, + "grad_norm": 0.10294629633426666, + "learning_rate": 0.00014045924533369144, + "loss": 1.0837631225585938, + "mean_token_accuracy": 0.7655858203768731, + "num_tokens": 10483287.0, + "step": 2610 + }, + { + "entropy": 0.940229170024395, + "epoch": 1.0026789131266742, + "grad_norm": 0.10580310225486755, + "learning_rate": 0.00014019068081106486, + "loss": 0.9650541305541992, + "mean_token_accuracy": 0.7728109017014504, + "num_tokens": 10523841.0, + "step": 2620 + }, + { + "entropy": 0.9358184114098549, + "epoch": 1.0065059318790661, + "grad_norm": 0.12460961192846298, + "learning_rate": 0.0001399221162884383, + "loss": 0.9570166587829589, + "mean_token_accuracy": 0.7772100657224655, + "num_tokens": 10561636.0, + "step": 2630 + }, + { + "entropy": 1.010379894077778, + "epoch": 1.010332950631458, + "grad_norm": 0.0781383365392685, + "learning_rate": 0.00013965355176581175, + "loss": 1.0524909019470214, + "mean_token_accuracy": 0.7589353621006012, + "num_tokens": 10605899.0, + "step": 2640 + }, + { + "entropy": 0.977487600594759, + "epoch": 1.01415996938385, + "grad_norm": 0.0902724489569664, + "learning_rate": 0.00013938498724318517, + "loss": 1.0475889205932618, + "mean_token_accuracy": 0.7629667386412621, + "num_tokens": 10642372.0, + "step": 2650 + }, + { + "entropy": 0.9681369736790657, + "epoch": 1.0179869881362418, + "grad_norm": 0.06344746798276901, + "learning_rate": 0.00013911642272055862, + "loss": 1.0268775939941406, + "mean_token_accuracy": 0.7677509978413581, + "num_tokens": 10682308.0, + "step": 2660 + }, + { + "entropy": 0.9013996437191963, + "epoch": 1.0218140068886337, + "grad_norm": 0.09890369325876236, + "learning_rate": 0.00013884785819793206, + "loss": 0.969085693359375, + "mean_token_accuracy": 0.7815661624073982, + "num_tokens": 10720755.0, + "step": 2670 + }, + { + "entropy": 0.9415140472352505, + "epoch": 1.0256410256410255, + "grad_norm": 0.08691754937171936, + "learning_rate": 0.00013857929367530548, + "loss": 0.9783688545227051, + "mean_token_accuracy": 0.7722749456763267, + "num_tokens": 10759842.0, + "step": 2680 + }, + { + "entropy": 0.9437286920845509, + "epoch": 1.0294680443934174, + "grad_norm": 0.06577731668949127, + "learning_rate": 0.00013831072915267893, + "loss": 0.9904938697814941, + "mean_token_accuracy": 0.7716649904847145, + "num_tokens": 10803740.0, + "step": 2690 + }, + { + "entropy": 0.9657303221523762, + "epoch": 1.0332950631458093, + "grad_norm": 0.07847272604703903, + "learning_rate": 0.00013804216463005238, + "loss": 1.0073646545410155, + "mean_token_accuracy": 0.7678608119487762, + "num_tokens": 10841808.0, + "step": 2700 + }, + { + "entropy": 0.881027878075838, + "epoch": 1.0371220818982012, + "grad_norm": 0.12755495309829712, + "learning_rate": 0.00013777360010742583, + "loss": 0.955751895904541, + "mean_token_accuracy": 0.7835927039384842, + "num_tokens": 10880108.0, + "step": 2710 + }, + { + "entropy": 0.8458237417042256, + "epoch": 1.040949100650593, + "grad_norm": 0.07641884684562683, + "learning_rate": 0.00013750503558479925, + "loss": 0.9140083312988281, + "mean_token_accuracy": 0.7939343526959419, + "num_tokens": 10916272.0, + "step": 2720 + }, + { + "entropy": 0.8845301080495119, + "epoch": 1.044776119402985, + "grad_norm": 0.08896184712648392, + "learning_rate": 0.0001372364710621727, + "loss": 0.9332797050476074, + "mean_token_accuracy": 0.7884662911295891, + "num_tokens": 10951932.0, + "step": 2730 + }, + { + "entropy": 0.963884600251913, + "epoch": 1.0486031381553769, + "grad_norm": 0.10196536034345627, + "learning_rate": 0.00013696790653954614, + "loss": 1.0123867988586426, + "mean_token_accuracy": 0.7659088596701622, + "num_tokens": 10991548.0, + "step": 2740 + }, + { + "entropy": 0.9720129862427711, + "epoch": 1.0524301569077688, + "grad_norm": 0.07552212476730347, + "learning_rate": 0.00013669934201691956, + "loss": 1.015409564971924, + "mean_token_accuracy": 0.7689290955662728, + "num_tokens": 11028749.0, + "step": 2750 + }, + { + "entropy": 0.9871743015944958, + "epoch": 1.0562571756601606, + "grad_norm": 0.09255808591842651, + "learning_rate": 0.000136430777494293, + "loss": 1.0351217269897461, + "mean_token_accuracy": 0.7620491668581962, + "num_tokens": 11071336.0, + "step": 2760 + }, + { + "entropy": 0.809666246920824, + "epoch": 1.0600841944125525, + "grad_norm": 0.08891233056783676, + "learning_rate": 0.00013616221297166645, + "loss": 0.8595174789428711, + "mean_token_accuracy": 0.8053640425205231, + "num_tokens": 11107708.0, + "step": 2770 + }, + { + "entropy": 0.9220615286380053, + "epoch": 1.0639112131649444, + "grad_norm": 0.0731620192527771, + "learning_rate": 0.0001358936484490399, + "loss": 0.9694333076477051, + "mean_token_accuracy": 0.7767527863383293, + "num_tokens": 11149005.0, + "step": 2780 + }, + { + "entropy": 0.8744502332061529, + "epoch": 1.0677382319173363, + "grad_norm": 0.0865791067481041, + "learning_rate": 0.00013562508392641332, + "loss": 0.9401009559631348, + "mean_token_accuracy": 0.7854847684502602, + "num_tokens": 11189214.0, + "step": 2790 + }, + { + "entropy": 0.989877526462078, + "epoch": 1.0715652506697282, + "grad_norm": 0.09394430369138718, + "learning_rate": 0.00013535651940378677, + "loss": 1.0487696647644043, + "mean_token_accuracy": 0.7607394486665726, + "num_tokens": 11225161.0, + "step": 2800 + }, + { + "entropy": 0.8656694941222668, + "epoch": 1.07539226942212, + "grad_norm": 0.10940351337194443, + "learning_rate": 0.0001350879548811602, + "loss": 0.9236039161682129, + "mean_token_accuracy": 0.7919901207089424, + "num_tokens": 11261274.0, + "step": 2810 + }, + { + "entropy": 1.063130483776331, + "epoch": 1.079219288174512, + "grad_norm": 0.06853083521127701, + "learning_rate": 0.00013481939035853364, + "loss": 1.0725152015686035, + "mean_token_accuracy": 0.7454188778996468, + "num_tokens": 11302522.0, + "step": 2820 + }, + { + "entropy": 0.92764787748456, + "epoch": 1.0830463069269038, + "grad_norm": 0.10344231128692627, + "learning_rate": 0.00013455082583590708, + "loss": 0.9725144386291504, + "mean_token_accuracy": 0.7810687303543091, + "num_tokens": 11339898.0, + "step": 2830 + }, + { + "entropy": 0.9415482886135578, + "epoch": 1.0868733256792957, + "grad_norm": 0.12117484956979752, + "learning_rate": 0.00013428226131328053, + "loss": 1.0216625213623047, + "mean_token_accuracy": 0.7713929772377014, + "num_tokens": 11380187.0, + "step": 2840 + }, + { + "entropy": 0.9300718136131764, + "epoch": 1.0907003444316876, + "grad_norm": 0.09950343519449234, + "learning_rate": 0.00013401369679065398, + "loss": 0.9862215042114257, + "mean_token_accuracy": 0.7748491272330285, + "num_tokens": 11417351.0, + "step": 2850 + }, + { + "entropy": 0.9016943011432886, + "epoch": 1.0945273631840795, + "grad_norm": 0.10104110836982727, + "learning_rate": 0.0001337451322680274, + "loss": 0.9565576553344727, + "mean_token_accuracy": 0.7823473244905472, + "num_tokens": 11455566.0, + "step": 2860 + }, + { + "entropy": 1.0184541821479798, + "epoch": 1.0983543819364714, + "grad_norm": 0.07055146247148514, + "learning_rate": 0.00013347656774540084, + "loss": 1.0644380569458007, + "mean_token_accuracy": 0.7551941126585007, + "num_tokens": 11499960.0, + "step": 2870 + }, + { + "entropy": 0.9143499568104744, + "epoch": 1.1021814006888633, + "grad_norm": 0.09798481315374374, + "learning_rate": 0.00013320800322277426, + "loss": 0.9477805137634278, + "mean_token_accuracy": 0.778240317106247, + "num_tokens": 11536434.0, + "step": 2880 + }, + { + "entropy": 0.8803758375346661, + "epoch": 1.1060084194412552, + "grad_norm": 0.09720771014690399, + "learning_rate": 0.0001329394387001477, + "loss": 0.9369168281555176, + "mean_token_accuracy": 0.786097663640976, + "num_tokens": 11572420.0, + "step": 2890 + }, + { + "entropy": 0.9127089619636536, + "epoch": 1.109835438193647, + "grad_norm": 0.07493265718221664, + "learning_rate": 0.00013267087417752116, + "loss": 0.9610566139221192, + "mean_token_accuracy": 0.7780416712164879, + "num_tokens": 11607494.0, + "step": 2900 + }, + { + "entropy": 0.9359945230185985, + "epoch": 1.113662456946039, + "grad_norm": 0.09086300432682037, + "learning_rate": 0.0001324023096548946, + "loss": 0.9519670486450196, + "mean_token_accuracy": 0.7745376393198967, + "num_tokens": 11647057.0, + "step": 2910 + }, + { + "entropy": 0.9206651791930198, + "epoch": 1.1174894756984308, + "grad_norm": 0.10007902979850769, + "learning_rate": 0.00013213374513226805, + "loss": 0.9783179283142089, + "mean_token_accuracy": 0.778519794344902, + "num_tokens": 11685762.0, + "step": 2920 + }, + { + "entropy": 0.9937357418239117, + "epoch": 1.1213164944508227, + "grad_norm": 0.0993100181221962, + "learning_rate": 0.00013186518060964147, + "loss": 1.0440019607543944, + "mean_token_accuracy": 0.7590440228581429, + "num_tokens": 11727379.0, + "step": 2930 + }, + { + "entropy": 1.048055526614189, + "epoch": 1.1251435132032146, + "grad_norm": 0.11140380054712296, + "learning_rate": 0.0001315966160870149, + "loss": 1.1046284675598144, + "mean_token_accuracy": 0.7413847833871842, + "num_tokens": 11770734.0, + "step": 2940 + }, + { + "entropy": 0.9562077779322863, + "epoch": 1.1289705319556065, + "grad_norm": 0.11506770551204681, + "learning_rate": 0.00013132805156438834, + "loss": 0.9946146011352539, + "mean_token_accuracy": 0.7750585973262787, + "num_tokens": 11806270.0, + "step": 2950 + }, + { + "entropy": 0.9747304327785968, + "epoch": 1.1327975507079984, + "grad_norm": 0.1126897856593132, + "learning_rate": 0.00013105948704176179, + "loss": 1.061129093170166, + "mean_token_accuracy": 0.7613553464412689, + "num_tokens": 11852779.0, + "step": 2960 + }, + { + "entropy": 1.0132145062088966, + "epoch": 1.1366245694603903, + "grad_norm": 0.08260762691497803, + "learning_rate": 0.00013079092251913523, + "loss": 1.0199948310852052, + "mean_token_accuracy": 0.7617463275790215, + "num_tokens": 11897084.0, + "step": 2970 + }, + { + "entropy": 0.9878915682435035, + "epoch": 1.1404515882127821, + "grad_norm": 0.08098926395177841, + "learning_rate": 0.00013052235799650868, + "loss": 1.0480783462524415, + "mean_token_accuracy": 0.763205036520958, + "num_tokens": 11938987.0, + "step": 2980 + }, + { + "entropy": 1.0176467482000588, + "epoch": 1.144278606965174, + "grad_norm": 0.0966029092669487, + "learning_rate": 0.0001302537934738821, + "loss": 1.093599796295166, + "mean_token_accuracy": 0.7526282608509064, + "num_tokens": 11981156.0, + "step": 2990 + }, + { + "entropy": 1.0054687768220902, + "epoch": 1.148105625717566, + "grad_norm": 0.09327300637960434, + "learning_rate": 0.00012998522895125555, + "loss": 1.039564609527588, + "mean_token_accuracy": 0.7592228040099144, + "num_tokens": 12025389.0, + "step": 3000 + }, + { + "entropy": 0.9626951858401298, + "epoch": 1.1519326444699578, + "grad_norm": 0.06154703348875046, + "learning_rate": 0.00012971666442862897, + "loss": 0.9993762016296387, + "mean_token_accuracy": 0.769777101278305, + "num_tokens": 12069545.0, + "step": 3010 + }, + { + "entropy": 0.9221224367618561, + "epoch": 1.1557596632223497, + "grad_norm": 0.1140643060207367, + "learning_rate": 0.00012944809990600241, + "loss": 0.9887493133544922, + "mean_token_accuracy": 0.7754134178161621, + "num_tokens": 12113892.0, + "step": 3020 + }, + { + "entropy": 1.011741641908884, + "epoch": 1.1595866819747416, + "grad_norm": 0.08721659332513809, + "learning_rate": 0.00012917953538337586, + "loss": 1.068478488922119, + "mean_token_accuracy": 0.7615607067942619, + "num_tokens": 12153746.0, + "step": 3030 + }, + { + "entropy": 0.9926261432468891, + "epoch": 1.1634137007271335, + "grad_norm": 0.07577186822891235, + "learning_rate": 0.0001289109708607493, + "loss": 1.047102451324463, + "mean_token_accuracy": 0.7669480383396149, + "num_tokens": 12199067.0, + "step": 3040 + }, + { + "entropy": 0.945004402846098, + "epoch": 1.1672407194795253, + "grad_norm": 0.08443465083837509, + "learning_rate": 0.00012864240633812276, + "loss": 0.9891506195068359, + "mean_token_accuracy": 0.7756656989455223, + "num_tokens": 12243766.0, + "step": 3050 + }, + { + "entropy": 0.9602406993508339, + "epoch": 1.1710677382319172, + "grad_norm": 0.07647141069173813, + "learning_rate": 0.00012837384181549618, + "loss": 1.0091946601867676, + "mean_token_accuracy": 0.7702717915177345, + "num_tokens": 12279555.0, + "step": 3060 + }, + { + "entropy": 0.9430582121014595, + "epoch": 1.1748947569843091, + "grad_norm": 0.10050038248300552, + "learning_rate": 0.0001281052772928696, + "loss": 1.0251899719238282, + "mean_token_accuracy": 0.7759435445070266, + "num_tokens": 12316974.0, + "step": 3070 + }, + { + "entropy": 1.0339640237390995, + "epoch": 1.178721775736701, + "grad_norm": 0.09026551991701126, + "learning_rate": 0.00012783671277024304, + "loss": 1.0652464866638183, + "mean_token_accuracy": 0.7533303231000901, + "num_tokens": 12358111.0, + "step": 3080 + }, + { + "entropy": 0.9808862328529357, + "epoch": 1.182548794489093, + "grad_norm": 0.08769362419843674, + "learning_rate": 0.0001275681482476165, + "loss": 1.0068347930908204, + "mean_token_accuracy": 0.7660810023546218, + "num_tokens": 12401669.0, + "step": 3090 + }, + { + "entropy": 0.9436531282961369, + "epoch": 1.1863758132414848, + "grad_norm": 0.09366963803768158, + "learning_rate": 0.00012729958372498994, + "loss": 1.0298351287841796, + "mean_token_accuracy": 0.7704201564192772, + "num_tokens": 12442005.0, + "step": 3100 + }, + { + "entropy": 0.8712134130299092, + "epoch": 1.1902028319938767, + "grad_norm": 0.14041900634765625, + "learning_rate": 0.00012703101920236338, + "loss": 0.9094470977783203, + "mean_token_accuracy": 0.7861496224999428, + "num_tokens": 12484476.0, + "step": 3110 + }, + { + "entropy": 0.9474696554243565, + "epoch": 1.1940298507462686, + "grad_norm": 0.10449594259262085, + "learning_rate": 0.00012676245467973683, + "loss": 0.9729720115661621, + "mean_token_accuracy": 0.7746587276458741, + "num_tokens": 12521351.0, + "step": 3120 + }, + { + "entropy": 0.9215874671936035, + "epoch": 1.1978568694986604, + "grad_norm": 0.07733117789030075, + "learning_rate": 0.00012649389015711025, + "loss": 0.992548942565918, + "mean_token_accuracy": 0.7789316549897194, + "num_tokens": 12564603.0, + "step": 3130 + }, + { + "entropy": 0.9349980562925339, + "epoch": 1.2016838882510523, + "grad_norm": 0.06924714148044586, + "learning_rate": 0.00012622532563448367, + "loss": 1.010727596282959, + "mean_token_accuracy": 0.7728876963257789, + "num_tokens": 12606025.0, + "step": 3140 + }, + { + "entropy": 0.9719727545976639, + "epoch": 1.2055109070034442, + "grad_norm": 0.07646770775318146, + "learning_rate": 0.00012595676111185712, + "loss": 1.0482423782348633, + "mean_token_accuracy": 0.7659243881702423, + "num_tokens": 12647703.0, + "step": 3150 + }, + { + "entropy": 1.0236301876604557, + "epoch": 1.209337925755836, + "grad_norm": 0.08547945320606232, + "learning_rate": 0.00012568819658923056, + "loss": 1.0771334648132325, + "mean_token_accuracy": 0.7551302567124367, + "num_tokens": 12692347.0, + "step": 3160 + }, + { + "entropy": 0.9277745552361012, + "epoch": 1.213164944508228, + "grad_norm": 0.10816850513219833, + "learning_rate": 0.000125419632066604, + "loss": 0.9680308341979981, + "mean_token_accuracy": 0.7722468450665474, + "num_tokens": 12729671.0, + "step": 3170 + }, + { + "entropy": 0.9760092988610267, + "epoch": 1.2169919632606199, + "grad_norm": 0.08950033783912659, + "learning_rate": 0.00012515106754397746, + "loss": 1.000643539428711, + "mean_token_accuracy": 0.7665232941508293, + "num_tokens": 12768100.0, + "step": 3180 + }, + { + "entropy": 0.9292771026492119, + "epoch": 1.2208189820130118, + "grad_norm": 0.08686704933643341, + "learning_rate": 0.0001248825030213509, + "loss": 1.019674015045166, + "mean_token_accuracy": 0.7758068069815636, + "num_tokens": 12801323.0, + "step": 3190 + }, + { + "entropy": 0.8500060614198446, + "epoch": 1.2246460007654036, + "grad_norm": 0.07462778687477112, + "learning_rate": 0.00012461393849872433, + "loss": 0.9042973518371582, + "mean_token_accuracy": 0.7897424980998039, + "num_tokens": 12839880.0, + "step": 3200 + }, + { + "entropy": 0.9205234386026859, + "epoch": 1.2284730195177955, + "grad_norm": 0.07027672231197357, + "learning_rate": 0.00012434537397609775, + "loss": 0.9424190521240234, + "mean_token_accuracy": 0.7767854332923889, + "num_tokens": 12878349.0, + "step": 3210 + }, + { + "entropy": 0.9074239492416382, + "epoch": 1.2323000382701874, + "grad_norm": 0.09741132706403732, + "learning_rate": 0.0001240768094534712, + "loss": 0.9651589393615723, + "mean_token_accuracy": 0.7790584430098534, + "num_tokens": 12917588.0, + "step": 3220 + }, + { + "entropy": 0.8874296098947525, + "epoch": 1.2361270570225793, + "grad_norm": 0.08608463406562805, + "learning_rate": 0.00012380824493084464, + "loss": 0.9437139511108399, + "mean_token_accuracy": 0.7854243695735932, + "num_tokens": 12956199.0, + "step": 3230 + }, + { + "entropy": 0.9470510125160218, + "epoch": 1.2399540757749712, + "grad_norm": 0.09247037768363953, + "learning_rate": 0.0001235396804082181, + "loss": 1.032781982421875, + "mean_token_accuracy": 0.7712572082877159, + "num_tokens": 13000822.0, + "step": 3240 + }, + { + "entropy": 0.8850176699459553, + "epoch": 1.243781094527363, + "grad_norm": 0.08397585898637772, + "learning_rate": 0.00012327111588559153, + "loss": 0.9292671203613281, + "mean_token_accuracy": 0.787578609585762, + "num_tokens": 13043532.0, + "step": 3250 + }, + { + "entropy": 0.8605544999241829, + "epoch": 1.247608113279755, + "grad_norm": 0.0952179804444313, + "learning_rate": 0.00012300255136296498, + "loss": 0.8990240097045898, + "mean_token_accuracy": 0.7919793605804444, + "num_tokens": 13081376.0, + "step": 3260 + }, + { + "entropy": 1.003395075351, + "epoch": 1.2514351320321468, + "grad_norm": 0.08914512395858765, + "learning_rate": 0.0001227339868403384, + "loss": 1.1446642875671387, + "mean_token_accuracy": 0.7565032340586185, + "num_tokens": 13119474.0, + "step": 3270 + }, + { + "entropy": 0.9566417217254639, + "epoch": 1.2552621507845387, + "grad_norm": 0.13220350444316864, + "learning_rate": 0.00012246542231771182, + "loss": 0.9976698875427246, + "mean_token_accuracy": 0.7722181305289268, + "num_tokens": 13162637.0, + "step": 3280 + }, + { + "entropy": 0.888442064449191, + "epoch": 1.2590891695369306, + "grad_norm": 0.10493922978639603, + "learning_rate": 0.00012219685779508527, + "loss": 0.916744613647461, + "mean_token_accuracy": 0.7896391779184342, + "num_tokens": 13199412.0, + "step": 3290 + }, + { + "entropy": 0.9262259535491466, + "epoch": 1.2629161882893225, + "grad_norm": 0.09022962301969528, + "learning_rate": 0.00012192829327245872, + "loss": 0.9885137557983399, + "mean_token_accuracy": 0.778158649802208, + "num_tokens": 13240292.0, + "step": 3300 + }, + { + "entropy": 0.9356066003441811, + "epoch": 1.2667432070417144, + "grad_norm": 0.09693239629268646, + "learning_rate": 0.00012165972874983216, + "loss": 0.9731400489807129, + "mean_token_accuracy": 0.7748182758688926, + "num_tokens": 13275876.0, + "step": 3310 + }, + { + "entropy": 0.868951104208827, + "epoch": 1.2705702257941063, + "grad_norm": 0.09237370640039444, + "learning_rate": 0.0001213911642272056, + "loss": 0.9127277374267578, + "mean_token_accuracy": 0.7890144631266593, + "num_tokens": 13314857.0, + "step": 3320 + }, + { + "entropy": 0.9311054348945618, + "epoch": 1.2743972445464982, + "grad_norm": 0.08701436221599579, + "learning_rate": 0.00012112259970457902, + "loss": 0.9666108131408692, + "mean_token_accuracy": 0.7752738267183303, + "num_tokens": 13357039.0, + "step": 3330 + }, + { + "entropy": 0.9256260149180889, + "epoch": 1.27822426329889, + "grad_norm": 0.08751461654901505, + "learning_rate": 0.00012085403518195246, + "loss": 0.9926286697387695, + "mean_token_accuracy": 0.7750931903719902, + "num_tokens": 13397058.0, + "step": 3340 + }, + { + "entropy": 1.0074332721531392, + "epoch": 1.282051282051282, + "grad_norm": 0.07409587502479553, + "learning_rate": 0.00012058547065932591, + "loss": 1.062586498260498, + "mean_token_accuracy": 0.7546869352459907, + "num_tokens": 13441381.0, + "step": 3350 + }, + { + "entropy": 0.9596263833343983, + "epoch": 1.2858783008036738, + "grad_norm": 0.09343665838241577, + "learning_rate": 0.00012031690613669934, + "loss": 1.0023324012756347, + "mean_token_accuracy": 0.7719831839203835, + "num_tokens": 13481914.0, + "step": 3360 + }, + { + "entropy": 0.9313522674143314, + "epoch": 1.2897053195560657, + "grad_norm": 0.0879049226641655, + "learning_rate": 0.00012004834161407279, + "loss": 0.9833806991577149, + "mean_token_accuracy": 0.7737741976976394, + "num_tokens": 13519831.0, + "step": 3370 + }, + { + "entropy": 0.8369917057454586, + "epoch": 1.2935323383084576, + "grad_norm": 0.14339204132556915, + "learning_rate": 0.00011977977709144624, + "loss": 0.9147489547729493, + "mean_token_accuracy": 0.7984762340784073, + "num_tokens": 13559768.0, + "step": 3380 + }, + { + "entropy": 0.9055653363466263, + "epoch": 1.2973593570608495, + "grad_norm": 0.1441742479801178, + "learning_rate": 0.00011951121256881967, + "loss": 0.9521515846252442, + "mean_token_accuracy": 0.7834478095173836, + "num_tokens": 13595966.0, + "step": 3390 + }, + { + "entropy": 0.9677796266973019, + "epoch": 1.3011863758132414, + "grad_norm": 0.11233013868331909, + "learning_rate": 0.00011924264804619309, + "loss": 1.0522055625915527, + "mean_token_accuracy": 0.7664702609181404, + "num_tokens": 13638463.0, + "step": 3400 + }, + { + "entropy": 0.9398517791181803, + "epoch": 1.3050133945656333, + "grad_norm": 0.088468998670578, + "learning_rate": 0.00011897408352356654, + "loss": 0.9618704795837403, + "mean_token_accuracy": 0.7755557060241699, + "num_tokens": 13677769.0, + "step": 3410 + }, + { + "entropy": 0.8900398269295693, + "epoch": 1.3088404133180251, + "grad_norm": 0.09742283076047897, + "learning_rate": 0.00011870551900093999, + "loss": 0.9422917366027832, + "mean_token_accuracy": 0.7865706130862236, + "num_tokens": 13713374.0, + "step": 3420 + }, + { + "entropy": 0.9008657015860081, + "epoch": 1.312667432070417, + "grad_norm": 0.09111864864826202, + "learning_rate": 0.00011843695447831342, + "loss": 0.9726786613464355, + "mean_token_accuracy": 0.7835188135504723, + "num_tokens": 13753165.0, + "step": 3430 + }, + { + "entropy": 0.954158465564251, + "epoch": 1.316494450822809, + "grad_norm": 0.0949985608458519, + "learning_rate": 0.00011816838995568687, + "loss": 1.0072153091430665, + "mean_token_accuracy": 0.7668681025505066, + "num_tokens": 13790265.0, + "step": 3440 + }, + { + "entropy": 0.9259054005146027, + "epoch": 1.3203214695752008, + "grad_norm": 0.09144506603479385, + "learning_rate": 0.00011789982543306031, + "loss": 1.0319811820983886, + "mean_token_accuracy": 0.77575224339962, + "num_tokens": 13830720.0, + "step": 3450 + }, + { + "entropy": 0.9554400585591794, + "epoch": 1.3241484883275927, + "grad_norm": 0.05986972153186798, + "learning_rate": 0.00011763126091043373, + "loss": 0.9840157508850098, + "mean_token_accuracy": 0.7714304268360138, + "num_tokens": 13874024.0, + "step": 3460 + }, + { + "entropy": 0.9618137650191784, + "epoch": 1.3279755070799846, + "grad_norm": 0.08746087551116943, + "learning_rate": 0.00011736269638780717, + "loss": 1.0280908584594726, + "mean_token_accuracy": 0.7679046332836151, + "num_tokens": 13916099.0, + "step": 3470 + }, + { + "entropy": 1.02601458132267, + "epoch": 1.3318025258323765, + "grad_norm": 0.09883694350719452, + "learning_rate": 0.00011709413186518061, + "loss": 1.0893220901489258, + "mean_token_accuracy": 0.7487106472253799, + "num_tokens": 13955163.0, + "step": 3480 + }, + { + "entropy": 1.025067638605833, + "epoch": 1.3356295445847683, + "grad_norm": 0.07656730711460114, + "learning_rate": 0.00011682556734255406, + "loss": 1.0527194023132325, + "mean_token_accuracy": 0.7569629296660423, + "num_tokens": 13996990.0, + "step": 3490 + }, + { + "entropy": 0.8709930831566453, + "epoch": 1.3394565633371602, + "grad_norm": 0.1119026467204094, + "learning_rate": 0.0001165570028199275, + "loss": 0.9183405876159668, + "mean_token_accuracy": 0.784464044868946, + "num_tokens": 14040315.0, + "step": 3500 + }, + { + "entropy": 0.9783565014600754, + "epoch": 1.3432835820895521, + "grad_norm": 0.09997576475143433, + "learning_rate": 0.00011628843829730094, + "loss": 1.0318940162658692, + "mean_token_accuracy": 0.7614112690091133, + "num_tokens": 14083204.0, + "step": 3510 + }, + { + "entropy": 0.9975252889096737, + "epoch": 1.347110600841944, + "grad_norm": 0.10046812891960144, + "learning_rate": 0.00011601987377467437, + "loss": 1.0214290618896484, + "mean_token_accuracy": 0.7584437146782875, + "num_tokens": 14127039.0, + "step": 3520 + }, + { + "entropy": 0.8959422588348389, + "epoch": 1.350937619594336, + "grad_norm": 0.09512703120708466, + "learning_rate": 0.0001157513092520478, + "loss": 0.9528075218200683, + "mean_token_accuracy": 0.7823959946632385, + "num_tokens": 14163989.0, + "step": 3530 + }, + { + "entropy": 0.8903120748698712, + "epoch": 1.3547646383467278, + "grad_norm": 0.10500185191631317, + "learning_rate": 0.00011548274472942124, + "loss": 0.9784683227539063, + "mean_token_accuracy": 0.7854589730501175, + "num_tokens": 14198562.0, + "step": 3540 + }, + { + "entropy": 0.8580869071185588, + "epoch": 1.3585916570991197, + "grad_norm": 0.08716659992933273, + "learning_rate": 0.00011521418020679469, + "loss": 0.9078399658203125, + "mean_token_accuracy": 0.7894850671291351, + "num_tokens": 14236952.0, + "step": 3550 + }, + { + "entropy": 0.9841447554528713, + "epoch": 1.3624186758515116, + "grad_norm": 0.08638570457696915, + "learning_rate": 0.00011494561568416812, + "loss": 1.0438207626342773, + "mean_token_accuracy": 0.7629329964518548, + "num_tokens": 14278208.0, + "step": 3560 + }, + { + "entropy": 0.9100395441055298, + "epoch": 1.3662456946039034, + "grad_norm": 0.09058145433664322, + "learning_rate": 0.00011467705116154157, + "loss": 0.9560261726379394, + "mean_token_accuracy": 0.7807327762246132, + "num_tokens": 14314076.0, + "step": 3570 + }, + { + "entropy": 0.8529263667762279, + "epoch": 1.3700727133562953, + "grad_norm": 0.08847236633300781, + "learning_rate": 0.00011440848663891502, + "loss": 0.9192025184631347, + "mean_token_accuracy": 0.7945622354745865, + "num_tokens": 14349740.0, + "step": 3580 + }, + { + "entropy": 0.8977530397474766, + "epoch": 1.3738997321086872, + "grad_norm": 0.09535886347293854, + "learning_rate": 0.00011413992211628844, + "loss": 0.9331538200378418, + "mean_token_accuracy": 0.7803975984454155, + "num_tokens": 14392492.0, + "step": 3590 + }, + { + "entropy": 1.0430821359157563, + "epoch": 1.377726750861079, + "grad_norm": 0.08564139902591705, + "learning_rate": 0.00011387135759366187, + "loss": 1.0767670631408692, + "mean_token_accuracy": 0.7479040876030922, + "num_tokens": 14436961.0, + "step": 3600 + }, + { + "entropy": 0.8358541168272495, + "epoch": 1.381553769613471, + "grad_norm": 0.09847365319728851, + "learning_rate": 0.00011360279307103532, + "loss": 0.8758580207824707, + "mean_token_accuracy": 0.7964837267994881, + "num_tokens": 14472251.0, + "step": 3610 + }, + { + "entropy": 0.8302674755454064, + "epoch": 1.3853807883658629, + "grad_norm": 0.08570406585931778, + "learning_rate": 0.00011333422854840876, + "loss": 0.9068514823913574, + "mean_token_accuracy": 0.7943103745579719, + "num_tokens": 14509818.0, + "step": 3620 + }, + { + "entropy": 0.9825982883572578, + "epoch": 1.3892078071182548, + "grad_norm": 0.10844281315803528, + "learning_rate": 0.0001130656640257822, + "loss": 1.0484787940979003, + "mean_token_accuracy": 0.7600376740097999, + "num_tokens": 14553567.0, + "step": 3630 + }, + { + "entropy": 1.0431513242423534, + "epoch": 1.3930348258706466, + "grad_norm": 0.0750717744231224, + "learning_rate": 0.00011279709950315564, + "loss": 1.0337225914001464, + "mean_token_accuracy": 0.7504511162638664, + "num_tokens": 14598239.0, + "step": 3640 + }, + { + "entropy": 0.9319969929754734, + "epoch": 1.3968618446230385, + "grad_norm": 0.08307385444641113, + "learning_rate": 0.00011252853498052909, + "loss": 0.9771868705749511, + "mean_token_accuracy": 0.7778135031461716, + "num_tokens": 14638064.0, + "step": 3650 + }, + { + "entropy": 0.9992426164448261, + "epoch": 1.4006888633754304, + "grad_norm": 0.09222020208835602, + "learning_rate": 0.00011225997045790251, + "loss": 1.0516475677490233, + "mean_token_accuracy": 0.7587143570184708, + "num_tokens": 14682012.0, + "step": 3660 + }, + { + "entropy": 0.9670721650123596, + "epoch": 1.4045158821278223, + "grad_norm": 0.09432315081357956, + "learning_rate": 0.00011199140593527595, + "loss": 1.0164658546447753, + "mean_token_accuracy": 0.7670722231268883, + "num_tokens": 14722922.0, + "step": 3670 + }, + { + "entropy": 0.9808389253914356, + "epoch": 1.4083429008802142, + "grad_norm": 0.08502112329006195, + "learning_rate": 0.00011172284141264939, + "loss": 1.0553858757019043, + "mean_token_accuracy": 0.76065753698349, + "num_tokens": 14765083.0, + "step": 3680 + }, + { + "entropy": 1.011240091174841, + "epoch": 1.412169919632606, + "grad_norm": 0.07948844134807587, + "learning_rate": 0.00011145427689002284, + "loss": 1.0446209907531738, + "mean_token_accuracy": 0.75536377876997, + "num_tokens": 14806465.0, + "step": 3690 + }, + { + "entropy": 0.911352240294218, + "epoch": 1.415996938384998, + "grad_norm": 0.08382374793291092, + "learning_rate": 0.00011118571236739627, + "loss": 0.9388965606689453, + "mean_token_accuracy": 0.7807439729571343, + "num_tokens": 14850133.0, + "step": 3700 + }, + { + "entropy": 0.9055514119565486, + "epoch": 1.4198239571373898, + "grad_norm": 0.10713934898376465, + "learning_rate": 0.00011091714784476972, + "loss": 0.9727254867553711, + "mean_token_accuracy": 0.7801795959472656, + "num_tokens": 14887327.0, + "step": 3710 + }, + { + "entropy": 0.9338000696152449, + "epoch": 1.4236509758897817, + "grad_norm": 0.11418487876653671, + "learning_rate": 0.00011064858332214314, + "loss": 0.9989487648010253, + "mean_token_accuracy": 0.7747065275907516, + "num_tokens": 14927730.0, + "step": 3720 + }, + { + "entropy": 0.869029226526618, + "epoch": 1.4274779946421736, + "grad_norm": 0.10778038948774338, + "learning_rate": 0.00011038001879951659, + "loss": 0.9393071174621582, + "mean_token_accuracy": 0.7909289851784707, + "num_tokens": 14964847.0, + "step": 3730 + }, + { + "entropy": 0.8993408516049385, + "epoch": 1.4313050133945655, + "grad_norm": 0.08339972048997879, + "learning_rate": 0.00011011145427689002, + "loss": 0.9511364936828614, + "mean_token_accuracy": 0.7844893127679825, + "num_tokens": 15003449.0, + "step": 3740 + }, + { + "entropy": 0.9478372372686863, + "epoch": 1.4351320321469574, + "grad_norm": 0.07547847181558609, + "learning_rate": 0.00010984288975426347, + "loss": 0.9942925453186036, + "mean_token_accuracy": 0.772410535812378, + "num_tokens": 15046091.0, + "step": 3750 + }, + { + "entropy": 0.8367562972009182, + "epoch": 1.4389590508993493, + "grad_norm": 0.06902482360601425, + "learning_rate": 0.00010957432523163691, + "loss": 0.8951096534729004, + "mean_token_accuracy": 0.7985799089074135, + "num_tokens": 15091826.0, + "step": 3760 + }, + { + "entropy": 0.9437298484146595, + "epoch": 1.4427860696517412, + "grad_norm": 0.10231524705886841, + "learning_rate": 0.00010930576070901035, + "loss": 0.9919009208679199, + "mean_token_accuracy": 0.7663119360804558, + "num_tokens": 15133719.0, + "step": 3770 + }, + { + "entropy": 1.0057852260768414, + "epoch": 1.446613088404133, + "grad_norm": 0.09349844604730606, + "learning_rate": 0.0001090371961863838, + "loss": 1.0667811393737794, + "mean_token_accuracy": 0.757930365204811, + "num_tokens": 15173670.0, + "step": 3780 + }, + { + "entropy": 0.9152357578277588, + "epoch": 1.450440107156525, + "grad_norm": 0.09612533450126648, + "learning_rate": 0.00010876863166375722, + "loss": 0.9641363143920898, + "mean_token_accuracy": 0.7791497871279717, + "num_tokens": 15215154.0, + "step": 3790 + }, + { + "entropy": 0.849637558311224, + "epoch": 1.4542671259089168, + "grad_norm": 0.07079404592514038, + "learning_rate": 0.00010850006714113066, + "loss": 0.8924535751342774, + "mean_token_accuracy": 0.7958060145378113, + "num_tokens": 15261773.0, + "step": 3800 + }, + { + "entropy": 0.9689324770122767, + "epoch": 1.4580941446613087, + "grad_norm": 0.10107272863388062, + "learning_rate": 0.0001082315026185041, + "loss": 1.000623607635498, + "mean_token_accuracy": 0.7690365821123123, + "num_tokens": 15295693.0, + "step": 3810 + }, + { + "entropy": 0.8926774315536022, + "epoch": 1.4619211634137006, + "grad_norm": 0.0883372351527214, + "learning_rate": 0.00010796293809587754, + "loss": 0.9312380790710449, + "mean_token_accuracy": 0.7839185446500778, + "num_tokens": 15332324.0, + "step": 3820 + }, + { + "entropy": 0.9962236389517785, + "epoch": 1.4657481821660925, + "grad_norm": 0.09174945950508118, + "learning_rate": 0.00010769437357325099, + "loss": 1.0419865608215333, + "mean_token_accuracy": 0.7592507138848305, + "num_tokens": 15370812.0, + "step": 3830 + }, + { + "entropy": 1.0249286435544491, + "epoch": 1.4695752009184844, + "grad_norm": 0.07152284681797028, + "learning_rate": 0.00010742580905062442, + "loss": 1.0437363624572753, + "mean_token_accuracy": 0.7567671984434128, + "num_tokens": 15417719.0, + "step": 3840 + }, + { + "entropy": 0.903605168312788, + "epoch": 1.4734022196708763, + "grad_norm": 0.09400783479213715, + "learning_rate": 0.00010715724452799784, + "loss": 0.9410040855407715, + "mean_token_accuracy": 0.7839412048459053, + "num_tokens": 15455856.0, + "step": 3850 + }, + { + "entropy": 1.0259956195950508, + "epoch": 1.4772292384232681, + "grad_norm": 0.08671914041042328, + "learning_rate": 0.00010688868000537129, + "loss": 1.1025453567504884, + "mean_token_accuracy": 0.7507242172956466, + "num_tokens": 15492109.0, + "step": 3860 + }, + { + "entropy": 0.9178053669631481, + "epoch": 1.48105625717566, + "grad_norm": 0.07717446982860565, + "learning_rate": 0.00010662011548274474, + "loss": 0.96353178024292, + "mean_token_accuracy": 0.7797438561916351, + "num_tokens": 15532130.0, + "step": 3870 + }, + { + "entropy": 0.9423278756439686, + "epoch": 1.484883275928052, + "grad_norm": 0.11039029061794281, + "learning_rate": 0.00010635155096011817, + "loss": 0.979669189453125, + "mean_token_accuracy": 0.7755513936281204, + "num_tokens": 15575609.0, + "step": 3880 + }, + { + "entropy": 0.8999218411743641, + "epoch": 1.4887102946804438, + "grad_norm": 0.08974706381559372, + "learning_rate": 0.00010608298643749162, + "loss": 0.9477033615112305, + "mean_token_accuracy": 0.7822227850556374, + "num_tokens": 15621264.0, + "step": 3890 + }, + { + "entropy": 0.8756623603403568, + "epoch": 1.4925373134328357, + "grad_norm": 0.10864510387182236, + "learning_rate": 0.00010581442191486505, + "loss": 0.9711783409118653, + "mean_token_accuracy": 0.7893951386213303, + "num_tokens": 15656959.0, + "step": 3900 + }, + { + "entropy": 0.951158057898283, + "epoch": 1.4963643321852276, + "grad_norm": 0.09398993104696274, + "learning_rate": 0.0001055458573922385, + "loss": 1.0387070655822754, + "mean_token_accuracy": 0.7698590591549873, + "num_tokens": 15700293.0, + "step": 3910 + }, + { + "entropy": 0.9240442231297493, + "epoch": 1.5001913509376195, + "grad_norm": 0.09761729091405869, + "learning_rate": 0.00010527729286961192, + "loss": 0.9758125305175781, + "mean_token_accuracy": 0.7737968236207962, + "num_tokens": 15739304.0, + "step": 3920 + }, + { + "entropy": 0.9025500647723674, + "epoch": 1.5040183696900113, + "grad_norm": 0.08816131204366684, + "learning_rate": 0.00010500872834698537, + "loss": 0.913144302368164, + "mean_token_accuracy": 0.7775477200746537, + "num_tokens": 15785086.0, + "step": 3930 + }, + { + "entropy": 0.8958883471786976, + "epoch": 1.5078453884424032, + "grad_norm": 0.09690563380718231, + "learning_rate": 0.0001047401638243588, + "loss": 0.9484706878662109, + "mean_token_accuracy": 0.7867727875709534, + "num_tokens": 15822631.0, + "step": 3940 + }, + { + "entropy": 0.8738761503249407, + "epoch": 1.5116724071947951, + "grad_norm": 0.08325833082199097, + "learning_rate": 0.00010447159930173225, + "loss": 0.9258977890014648, + "mean_token_accuracy": 0.7862061053514481, + "num_tokens": 15863533.0, + "step": 3950 + }, + { + "entropy": 0.952784775942564, + "epoch": 1.515499425947187, + "grad_norm": 0.09089304506778717, + "learning_rate": 0.0001042030347791057, + "loss": 0.9893428802490234, + "mean_token_accuracy": 0.769037912786007, + "num_tokens": 15903798.0, + "step": 3960 + }, + { + "entropy": 0.9974973328411579, + "epoch": 1.519326444699579, + "grad_norm": 0.06594393402338028, + "learning_rate": 0.00010393447025647913, + "loss": 0.9982621192932128, + "mean_token_accuracy": 0.7653156638145446, + "num_tokens": 15947894.0, + "step": 3970 + }, + { + "entropy": 1.042479208856821, + "epoch": 1.5231534634519708, + "grad_norm": 0.09250905364751816, + "learning_rate": 0.00010366590573385255, + "loss": 1.0862640380859374, + "mean_token_accuracy": 0.7515693128108978, + "num_tokens": 15985609.0, + "step": 3980 + }, + { + "entropy": 0.869631578028202, + "epoch": 1.5269804822043627, + "grad_norm": 0.10154584795236588, + "learning_rate": 0.000103397341211226, + "loss": 0.9275701522827149, + "mean_token_accuracy": 0.7910413116216659, + "num_tokens": 16022339.0, + "step": 3990 + }, + { + "entropy": 0.9228729590773582, + "epoch": 1.5308075009567546, + "grad_norm": 0.08860265463590622, + "learning_rate": 0.00010312877668859944, + "loss": 1.0074289321899415, + "mean_token_accuracy": 0.7770419105887413, + "num_tokens": 16063778.0, + "step": 4000 + }, + { + "entropy": 0.9469372771680356, + "epoch": 1.5346345197091464, + "grad_norm": 0.08613952249288559, + "learning_rate": 0.00010286021216597287, + "loss": 1.0328418731689453, + "mean_token_accuracy": 0.7784339845180511, + "num_tokens": 16103389.0, + "step": 4010 + }, + { + "entropy": 0.9240258730947971, + "epoch": 1.5384615384615383, + "grad_norm": 0.09255630522966385, + "learning_rate": 0.00010259164764334632, + "loss": 0.9813838958740234, + "mean_token_accuracy": 0.779928731918335, + "num_tokens": 16141739.0, + "step": 4020 + }, + { + "entropy": 0.8300335463136435, + "epoch": 1.5422885572139302, + "grad_norm": 0.11173315346240997, + "learning_rate": 0.00010232308312071977, + "loss": 0.8650222778320312, + "mean_token_accuracy": 0.8004546627402306, + "num_tokens": 16179442.0, + "step": 4030 + }, + { + "entropy": 0.970530441403389, + "epoch": 1.546115575966322, + "grad_norm": 0.08758437633514404, + "learning_rate": 0.0001020545185980932, + "loss": 1.029263401031494, + "mean_token_accuracy": 0.7669389978051185, + "num_tokens": 16220502.0, + "step": 4040 + }, + { + "entropy": 0.8929917253553867, + "epoch": 1.549942594718714, + "grad_norm": 0.0840209424495697, + "learning_rate": 0.00010178595407546662, + "loss": 0.9574555397033692, + "mean_token_accuracy": 0.7882895812392234, + "num_tokens": 16263944.0, + "step": 4050 + }, + { + "entropy": 0.9571633011102676, + "epoch": 1.5537696134711059, + "grad_norm": 0.07731885462999344, + "learning_rate": 0.00010151738955284007, + "loss": 1.014600658416748, + "mean_token_accuracy": 0.7691228404641152, + "num_tokens": 16307508.0, + "step": 4060 + }, + { + "entropy": 0.9627384431660175, + "epoch": 1.5575966322234978, + "grad_norm": 0.09968744218349457, + "learning_rate": 0.00010124882503021352, + "loss": 1.0220794677734375, + "mean_token_accuracy": 0.7685489565134048, + "num_tokens": 16349178.0, + "step": 4070 + }, + { + "entropy": 0.8696753971278668, + "epoch": 1.5614236509758896, + "grad_norm": 0.08411276340484619, + "learning_rate": 0.00010098026050758695, + "loss": 0.9325771331787109, + "mean_token_accuracy": 0.7903442814946174, + "num_tokens": 16390375.0, + "step": 4080 + }, + { + "entropy": 0.8790203854441643, + "epoch": 1.5652506697282815, + "grad_norm": 0.0969686210155487, + "learning_rate": 0.0001007116959849604, + "loss": 0.9325167655944824, + "mean_token_accuracy": 0.7890133559703827, + "num_tokens": 16429198.0, + "step": 4090 + }, + { + "entropy": 0.9447548128664494, + "epoch": 1.5690776884806734, + "grad_norm": 0.07992373406887054, + "learning_rate": 0.00010044313146233384, + "loss": 0.9708291053771972, + "mean_token_accuracy": 0.7737105548381805, + "num_tokens": 16472336.0, + "step": 4100 + }, + { + "entropy": 0.974559249728918, + "epoch": 1.5729047072330653, + "grad_norm": 0.09685226529836655, + "learning_rate": 0.00010017456693970726, + "loss": 1.0289334297180175, + "mean_token_accuracy": 0.7674296617507934, + "num_tokens": 16511109.0, + "step": 4110 + }, + { + "entropy": 0.8575489681214095, + "epoch": 1.5767317259854572, + "grad_norm": 0.09298260509967804, + "learning_rate": 9.990600241708071e-05, + "loss": 0.8897696495056152, + "mean_token_accuracy": 0.7952411189675331, + "num_tokens": 16552802.0, + "step": 4120 + }, + { + "entropy": 0.869475956633687, + "epoch": 1.580558744737849, + "grad_norm": 0.129170760512352, + "learning_rate": 9.963743789445414e-05, + "loss": 0.9408356666564941, + "mean_token_accuracy": 0.7868246123194694, + "num_tokens": 16592603.0, + "step": 4130 + }, + { + "entropy": 0.9167623318731785, + "epoch": 1.584385763490241, + "grad_norm": 0.08131655305624008, + "learning_rate": 9.936887337182759e-05, + "loss": 1.005775260925293, + "mean_token_accuracy": 0.7779423877596855, + "num_tokens": 16633674.0, + "step": 4140 + }, + { + "entropy": 0.9069061763584614, + "epoch": 1.5882127822426328, + "grad_norm": 0.07485036551952362, + "learning_rate": 9.910030884920103e-05, + "loss": 0.9540878295898437, + "mean_token_accuracy": 0.7809736356139183, + "num_tokens": 16669966.0, + "step": 4150 + }, + { + "entropy": 1.0095594763755797, + "epoch": 1.5920398009950247, + "grad_norm": 0.11678522825241089, + "learning_rate": 9.883174432657446e-05, + "loss": 1.0742655754089356, + "mean_token_accuracy": 0.7636483564972878, + "num_tokens": 16711538.0, + "step": 4160 + }, + { + "entropy": 0.8342153321951628, + "epoch": 1.5958668197474166, + "grad_norm": 0.09654127061367035, + "learning_rate": 9.85631798039479e-05, + "loss": 0.8637946128845215, + "mean_token_accuracy": 0.7977021634578705, + "num_tokens": 16746947.0, + "step": 4170 + }, + { + "entropy": 0.9147222273051738, + "epoch": 1.5996938384998085, + "grad_norm": 0.10032576322555542, + "learning_rate": 9.829461528132134e-05, + "loss": 0.9848580360412598, + "mean_token_accuracy": 0.7794791385531425, + "num_tokens": 16792089.0, + "step": 4180 + }, + { + "entropy": 0.9350447114557028, + "epoch": 1.6035208572522004, + "grad_norm": 0.11322317272424698, + "learning_rate": 9.802605075869477e-05, + "loss": 0.9632351875305176, + "mean_token_accuracy": 0.7710213780403137, + "num_tokens": 16831782.0, + "step": 4190 + }, + { + "entropy": 0.8924577154219151, + "epoch": 1.6073478760045923, + "grad_norm": 0.08842343091964722, + "learning_rate": 9.775748623606822e-05, + "loss": 0.9661048889160156, + "mean_token_accuracy": 0.7863042891025543, + "num_tokens": 16867851.0, + "step": 4200 + }, + { + "entropy": 0.9452814936637879, + "epoch": 1.6111748947569842, + "grad_norm": 0.10469862073659897, + "learning_rate": 9.748892171344167e-05, + "loss": 1.0315632820129395, + "mean_token_accuracy": 0.769272243976593, + "num_tokens": 16909819.0, + "step": 4210 + }, + { + "entropy": 0.8794655621051788, + "epoch": 1.615001913509376, + "grad_norm": 0.08528223633766174, + "learning_rate": 9.72203571908151e-05, + "loss": 0.9158189773559571, + "mean_token_accuracy": 0.791112196445465, + "num_tokens": 16945241.0, + "step": 4220 + }, + { + "entropy": 0.9216304633766412, + "epoch": 1.618828932261768, + "grad_norm": 0.07684458047151566, + "learning_rate": 9.695179266818853e-05, + "loss": 1.0047569274902344, + "mean_token_accuracy": 0.7764274105429649, + "num_tokens": 16986516.0, + "step": 4230 + }, + { + "entropy": 0.8806056842207909, + "epoch": 1.6226559510141598, + "grad_norm": 0.09925177693367004, + "learning_rate": 9.668322814556198e-05, + "loss": 0.9321705818176269, + "mean_token_accuracy": 0.7873435765504837, + "num_tokens": 17026974.0, + "step": 4240 + }, + { + "entropy": 1.0260133132338525, + "epoch": 1.6264829697665517, + "grad_norm": 0.07781514525413513, + "learning_rate": 9.641466362293541e-05, + "loss": 1.0732348442077637, + "mean_token_accuracy": 0.755302457511425, + "num_tokens": 17063628.0, + "step": 4250 + }, + { + "entropy": 0.8771878894418478, + "epoch": 1.6303099885189436, + "grad_norm": 0.12377400696277618, + "learning_rate": 9.614609910030885e-05, + "loss": 0.9051324844360351, + "mean_token_accuracy": 0.7877693608403206, + "num_tokens": 17102243.0, + "step": 4260 + }, + { + "entropy": 0.9575911372900009, + "epoch": 1.6341370072713355, + "grad_norm": 0.07953961193561554, + "learning_rate": 9.58775345776823e-05, + "loss": 1.0206258773803711, + "mean_token_accuracy": 0.770101509988308, + "num_tokens": 17143256.0, + "step": 4270 + }, + { + "entropy": 0.9909125387668609, + "epoch": 1.6379640260237274, + "grad_norm": 0.09304741024971008, + "learning_rate": 9.560897005505573e-05, + "loss": 1.043109130859375, + "mean_token_accuracy": 0.7598145559430123, + "num_tokens": 17188878.0, + "step": 4280 + }, + { + "entropy": 0.8626054737716913, + "epoch": 1.6417910447761193, + "grad_norm": 0.08982561528682709, + "learning_rate": 9.534040553242916e-05, + "loss": 0.9062054634094239, + "mean_token_accuracy": 0.790125061571598, + "num_tokens": 17224537.0, + "step": 4290 + }, + { + "entropy": 0.919727610051632, + "epoch": 1.6456180635285111, + "grad_norm": 0.11226653307676315, + "learning_rate": 9.507184100980261e-05, + "loss": 0.970013427734375, + "mean_token_accuracy": 0.7747739493846894, + "num_tokens": 17262347.0, + "step": 4300 + }, + { + "entropy": 1.032866196334362, + "epoch": 1.649445082280903, + "grad_norm": 0.09440238773822784, + "learning_rate": 9.480327648717606e-05, + "loss": 1.0287545204162598, + "mean_token_accuracy": 0.7550160124897957, + "num_tokens": 17307578.0, + "step": 4310 + }, + { + "entropy": 0.907962580025196, + "epoch": 1.653272101033295, + "grad_norm": 0.11395370960235596, + "learning_rate": 9.453471196454948e-05, + "loss": 0.9705679893493653, + "mean_token_accuracy": 0.7807198286056518, + "num_tokens": 17342943.0, + "step": 4320 + }, + { + "entropy": 0.8495472550392151, + "epoch": 1.6570991197856868, + "grad_norm": 0.07685171812772751, + "learning_rate": 9.426614744192292e-05, + "loss": 0.9079866409301758, + "mean_token_accuracy": 0.7923004642128945, + "num_tokens": 17378158.0, + "step": 4330 + }, + { + "entropy": 0.8389323726296425, + "epoch": 1.6609261385380787, + "grad_norm": 0.09541229903697968, + "learning_rate": 9.399758291929637e-05, + "loss": 0.9092423439025878, + "mean_token_accuracy": 0.7946408927440644, + "num_tokens": 17412703.0, + "step": 4340 + }, + { + "entropy": 0.9035130314528942, + "epoch": 1.6647531572904706, + "grad_norm": 0.08291888236999512, + "learning_rate": 9.37290183966698e-05, + "loss": 0.9255120277404785, + "mean_token_accuracy": 0.7840688213706016, + "num_tokens": 17456250.0, + "step": 4350 + }, + { + "entropy": 0.8917031817138195, + "epoch": 1.6685801760428625, + "grad_norm": 0.08787538856267929, + "learning_rate": 9.346045387404324e-05, + "loss": 0.9318277359008789, + "mean_token_accuracy": 0.7854569494724274, + "num_tokens": 17492566.0, + "step": 4360 + }, + { + "entropy": 0.8860244527459145, + "epoch": 1.6724071947952543, + "grad_norm": 0.10287550836801529, + "learning_rate": 9.319188935141668e-05, + "loss": 0.9169553756713867, + "mean_token_accuracy": 0.7801365301012992, + "num_tokens": 17530267.0, + "step": 4370 + }, + { + "entropy": 0.8470614090561867, + "epoch": 1.6762342135476462, + "grad_norm": 0.13052308559417725, + "learning_rate": 9.292332482879013e-05, + "loss": 0.9004100799560547, + "mean_token_accuracy": 0.791596457362175, + "num_tokens": 17566336.0, + "step": 4380 + }, + { + "entropy": 0.9627884522080421, + "epoch": 1.6800612323000381, + "grad_norm": 0.09305555373430252, + "learning_rate": 9.265476030616355e-05, + "loss": 0.9837147712707519, + "mean_token_accuracy": 0.7687505498528481, + "num_tokens": 17609294.0, + "step": 4390 + }, + { + "entropy": 0.9614691123366356, + "epoch": 1.68388825105243, + "grad_norm": 0.08118042349815369, + "learning_rate": 9.2386195783537e-05, + "loss": 1.0093948364257812, + "mean_token_accuracy": 0.7686966329813003, + "num_tokens": 17653408.0, + "step": 4400 + }, + { + "entropy": 0.8255576498806476, + "epoch": 1.687715269804822, + "grad_norm": 0.07197146117687225, + "learning_rate": 9.211763126091045e-05, + "loss": 0.9013225555419921, + "mean_token_accuracy": 0.7994248151779175, + "num_tokens": 17693303.0, + "step": 4410 + }, + { + "entropy": 0.9197361193597317, + "epoch": 1.6915422885572138, + "grad_norm": 0.10147208720445633, + "learning_rate": 9.184906673828388e-05, + "loss": 0.966912841796875, + "mean_token_accuracy": 0.774210800230503, + "num_tokens": 17734446.0, + "step": 4420 + }, + { + "entropy": 0.8828513637185097, + "epoch": 1.6953693073096057, + "grad_norm": 0.08126919716596603, + "learning_rate": 9.158050221565731e-05, + "loss": 0.9237348556518554, + "mean_token_accuracy": 0.788974218070507, + "num_tokens": 17776056.0, + "step": 4430 + }, + { + "entropy": 0.8538446951657533, + "epoch": 1.6991963260619976, + "grad_norm": 0.08602278679609299, + "learning_rate": 9.131193769303076e-05, + "loss": 0.9384878158569336, + "mean_token_accuracy": 0.7924654617905617, + "num_tokens": 17814560.0, + "step": 4440 + }, + { + "entropy": 0.9160130321979523, + "epoch": 1.7030233448143894, + "grad_norm": 0.10127890110015869, + "learning_rate": 9.10433731704042e-05, + "loss": 0.9924029350280762, + "mean_token_accuracy": 0.7764815479516983, + "num_tokens": 17852872.0, + "step": 4450 + }, + { + "entropy": 0.8855723738670349, + "epoch": 1.7068503635667813, + "grad_norm": 0.09295201301574707, + "learning_rate": 9.077480864777763e-05, + "loss": 0.9131739616394043, + "mean_token_accuracy": 0.7876585990190506, + "num_tokens": 17893403.0, + "step": 4460 + }, + { + "entropy": 0.8825645297765732, + "epoch": 1.7106773823191732, + "grad_norm": 0.1038793995976448, + "learning_rate": 9.050624412515107e-05, + "loss": 0.9621119499206543, + "mean_token_accuracy": 0.7840767920017242, + "num_tokens": 17933069.0, + "step": 4470 + }, + { + "entropy": 0.9438045337796211, + "epoch": 1.714504401071565, + "grad_norm": 0.08998332172632217, + "learning_rate": 9.023767960252452e-05, + "loss": 1.0042546272277832, + "mean_token_accuracy": 0.7710625112056733, + "num_tokens": 17978081.0, + "step": 4480 + }, + { + "entropy": 0.9605814971029758, + "epoch": 1.718331419823957, + "grad_norm": 0.0936085507273674, + "learning_rate": 8.996911507989794e-05, + "loss": 1.0731863021850585, + "mean_token_accuracy": 0.7675610318779945, + "num_tokens": 18026355.0, + "step": 4490 + }, + { + "entropy": 0.9412197135388851, + "epoch": 1.7221584385763489, + "grad_norm": 0.11693151295185089, + "learning_rate": 8.970055055727139e-05, + "loss": 1.0271482467651367, + "mean_token_accuracy": 0.7749442532658577, + "num_tokens": 18064648.0, + "step": 4500 + }, + { + "entropy": 0.9840309470891953, + "epoch": 1.7259854573287408, + "grad_norm": 0.07721691578626633, + "learning_rate": 8.943198603464484e-05, + "loss": 0.9978925704956054, + "mean_token_accuracy": 0.7662706628441811, + "num_tokens": 18104352.0, + "step": 4510 + }, + { + "entropy": 0.9122109733521938, + "epoch": 1.7298124760811326, + "grad_norm": 0.10790548473596573, + "learning_rate": 8.916342151201827e-05, + "loss": 0.9912397384643554, + "mean_token_accuracy": 0.774566973745823, + "num_tokens": 18145632.0, + "step": 4520 + }, + { + "entropy": 0.8214024558663369, + "epoch": 1.7336394948335245, + "grad_norm": 0.0873790979385376, + "learning_rate": 8.88948569893917e-05, + "loss": 0.9174188613891602, + "mean_token_accuracy": 0.797317324578762, + "num_tokens": 18182216.0, + "step": 4530 + }, + { + "entropy": 0.8851194910705089, + "epoch": 1.7374665135859164, + "grad_norm": 0.08441472053527832, + "learning_rate": 8.862629246676515e-05, + "loss": 0.9345614433288574, + "mean_token_accuracy": 0.7909206628799439, + "num_tokens": 18220152.0, + "step": 4540 + }, + { + "entropy": 0.9045546390116215, + "epoch": 1.7412935323383083, + "grad_norm": 0.09491857141256332, + "learning_rate": 8.835772794413858e-05, + "loss": 1.0261774063110352, + "mean_token_accuracy": 0.7792002618312835, + "num_tokens": 18253615.0, + "step": 4550 + }, + { + "entropy": 0.8957971200346947, + "epoch": 1.7451205510907002, + "grad_norm": 0.07239943742752075, + "learning_rate": 8.808916342151202e-05, + "loss": 0.9220120429992675, + "mean_token_accuracy": 0.7812404081225395, + "num_tokens": 18292565.0, + "step": 4560 + }, + { + "entropy": 0.9733762003481388, + "epoch": 1.748947569843092, + "grad_norm": 0.07816951721906662, + "learning_rate": 8.782059889888546e-05, + "loss": 1.0176166534423827, + "mean_token_accuracy": 0.7636090680956841, + "num_tokens": 18337951.0, + "step": 4570 + }, + { + "entropy": 0.9952755816280842, + "epoch": 1.752774588595484, + "grad_norm": 0.09595679491758347, + "learning_rate": 8.75520343762589e-05, + "loss": 1.0484466552734375, + "mean_token_accuracy": 0.7624279737472535, + "num_tokens": 18378541.0, + "step": 4580 + }, + { + "entropy": 0.9325974151492119, + "epoch": 1.7566016073478758, + "grad_norm": 0.1425638496875763, + "learning_rate": 8.728346985363234e-05, + "loss": 1.007568645477295, + "mean_token_accuracy": 0.7753438904881478, + "num_tokens": 18416147.0, + "step": 4590 + }, + { + "entropy": 0.8879670143127442, + "epoch": 1.7604286261002677, + "grad_norm": 0.08936052024364471, + "learning_rate": 8.701490533100578e-05, + "loss": 0.9574133872985839, + "mean_token_accuracy": 0.7878236457705498, + "num_tokens": 18452658.0, + "step": 4600 + }, + { + "entropy": 0.9596087213605642, + "epoch": 1.7642556448526596, + "grad_norm": 0.08222804218530655, + "learning_rate": 8.674634080837921e-05, + "loss": 1.0134518623352051, + "mean_token_accuracy": 0.7686690568923951, + "num_tokens": 18493806.0, + "step": 4610 + }, + { + "entropy": 0.9412514306604862, + "epoch": 1.7680826636050515, + "grad_norm": 0.08482176810503006, + "learning_rate": 8.647777628575266e-05, + "loss": 0.9830768585205079, + "mean_token_accuracy": 0.7789766594767571, + "num_tokens": 18538740.0, + "step": 4620 + }, + { + "entropy": 0.8279416210949421, + "epoch": 1.7719096823574434, + "grad_norm": 0.12101086974143982, + "learning_rate": 8.620921176312609e-05, + "loss": 0.8422709465026855, + "mean_token_accuracy": 0.8009032368659973, + "num_tokens": 18579991.0, + "step": 4630 + }, + { + "entropy": 0.8889543637633324, + "epoch": 1.7757367011098353, + "grad_norm": 0.09586559236049652, + "learning_rate": 8.594064724049954e-05, + "loss": 0.9579720497131348, + "mean_token_accuracy": 0.7857530102133751, + "num_tokens": 18616195.0, + "step": 4640 + }, + { + "entropy": 0.9021936893463135, + "epoch": 1.7795637198622272, + "grad_norm": 0.0920713022351265, + "learning_rate": 8.567208271787297e-05, + "loss": 0.9568814277648926, + "mean_token_accuracy": 0.7835437625646591, + "num_tokens": 18650396.0, + "step": 4650 + }, + { + "entropy": 0.9605553701519967, + "epoch": 1.783390738614619, + "grad_norm": 0.0752284824848175, + "learning_rate": 8.54035181952464e-05, + "loss": 1.0107332229614259, + "mean_token_accuracy": 0.7712536633014679, + "num_tokens": 18692519.0, + "step": 4660 + }, + { + "entropy": 0.8929145928472281, + "epoch": 1.787217757367011, + "grad_norm": 0.08124406635761261, + "learning_rate": 8.513495367261985e-05, + "loss": 0.9392594337463379, + "mean_token_accuracy": 0.7837390914559365, + "num_tokens": 18730865.0, + "step": 4670 + }, + { + "entropy": 0.8995866551995277, + "epoch": 1.7910447761194028, + "grad_norm": 0.07306879013776779, + "learning_rate": 8.486638914999329e-05, + "loss": 0.9512563705444336, + "mean_token_accuracy": 0.7803288042545319, + "num_tokens": 18774851.0, + "step": 4680 + }, + { + "entropy": 0.9283428456634283, + "epoch": 1.7948717948717947, + "grad_norm": 0.06833672523498535, + "learning_rate": 8.459782462736673e-05, + "loss": 0.9614426612854003, + "mean_token_accuracy": 0.7776324123144149, + "num_tokens": 18815273.0, + "step": 4690 + }, + { + "entropy": 0.8980611331760884, + "epoch": 1.7986988136241866, + "grad_norm": 0.09426148980855942, + "learning_rate": 8.432926010474017e-05, + "loss": 0.9397372245788574, + "mean_token_accuracy": 0.7818324938416481, + "num_tokens": 18854806.0, + "step": 4700 + }, + { + "entropy": 0.9534067753702402, + "epoch": 1.8025258323765785, + "grad_norm": 0.11984719336032867, + "learning_rate": 8.40606955821136e-05, + "loss": 1.0058012008666992, + "mean_token_accuracy": 0.7710662186145782, + "num_tokens": 18893820.0, + "step": 4710 + }, + { + "entropy": 0.9396863542497158, + "epoch": 1.8063528511289704, + "grad_norm": 0.1126495823264122, + "learning_rate": 8.379213105948705e-05, + "loss": 0.9968315124511719, + "mean_token_accuracy": 0.7748499393463135, + "num_tokens": 18932078.0, + "step": 4720 + }, + { + "entropy": 1.0531147465109825, + "epoch": 1.8101798698813623, + "grad_norm": 0.0951380655169487, + "learning_rate": 8.352356653686048e-05, + "loss": 1.1058255195617677, + "mean_token_accuracy": 0.7495945364236831, + "num_tokens": 18974602.0, + "step": 4730 + }, + { + "entropy": 0.8520326256752014, + "epoch": 1.8140068886337541, + "grad_norm": 0.08623862266540527, + "learning_rate": 8.325500201423391e-05, + "loss": 0.8825064659118652, + "mean_token_accuracy": 0.7940022364258766, + "num_tokens": 19014261.0, + "step": 4740 + }, + { + "entropy": 0.979587784409523, + "epoch": 1.817833907386146, + "grad_norm": 0.11787699162960052, + "learning_rate": 8.298643749160736e-05, + "loss": 1.070664405822754, + "mean_token_accuracy": 0.7620177045464516, + "num_tokens": 19058223.0, + "step": 4750 + }, + { + "entropy": 0.8753061652183532, + "epoch": 1.821660926138538, + "grad_norm": 0.130862757563591, + "learning_rate": 8.271787296898081e-05, + "loss": 0.9366108894348144, + "mean_token_accuracy": 0.7874863654375076, + "num_tokens": 19100204.0, + "step": 4760 + }, + { + "entropy": 0.8777839131653309, + "epoch": 1.8254879448909298, + "grad_norm": 0.09261229634284973, + "learning_rate": 8.244930844635424e-05, + "loss": 0.9104420661926269, + "mean_token_accuracy": 0.7895808070898056, + "num_tokens": 19141588.0, + "step": 4770 + }, + { + "entropy": 0.9170226149260998, + "epoch": 1.8293149636433217, + "grad_norm": 0.06741383671760559, + "learning_rate": 8.218074392372768e-05, + "loss": 0.9543824195861816, + "mean_token_accuracy": 0.7765088111162186, + "num_tokens": 19182818.0, + "step": 4780 + }, + { + "entropy": 0.8602717489004135, + "epoch": 1.8331419823957136, + "grad_norm": 0.12861686944961548, + "learning_rate": 8.191217940110112e-05, + "loss": 0.926014518737793, + "mean_token_accuracy": 0.7917162731289864, + "num_tokens": 19217919.0, + "step": 4790 + }, + { + "entropy": 0.9471398882567883, + "epoch": 1.8369690011481055, + "grad_norm": 0.0744423121213913, + "learning_rate": 8.164361487847456e-05, + "loss": 0.9777775764465332, + "mean_token_accuracy": 0.7716371163725853, + "num_tokens": 19263991.0, + "step": 4800 + }, + { + "entropy": 0.8363759070634842, + "epoch": 1.8407960199004973, + "grad_norm": 0.08627327531576157, + "learning_rate": 8.137505035584799e-05, + "loss": 0.887846565246582, + "mean_token_accuracy": 0.7938979223370553, + "num_tokens": 19305441.0, + "step": 4810 + }, + { + "entropy": 0.9200571574270725, + "epoch": 1.8446230386528892, + "grad_norm": 0.08358518034219742, + "learning_rate": 8.110648583322144e-05, + "loss": 0.9703543663024903, + "mean_token_accuracy": 0.777827826142311, + "num_tokens": 19342309.0, + "step": 4820 + }, + { + "entropy": 0.9295372806489468, + "epoch": 1.8484500574052811, + "grad_norm": 0.0970570370554924, + "learning_rate": 8.083792131059487e-05, + "loss": 0.9934672355651856, + "mean_token_accuracy": 0.7739364430308342, + "num_tokens": 19387707.0, + "step": 4830 + }, + { + "entropy": 0.9003355488181114, + "epoch": 1.852277076157673, + "grad_norm": 0.09357219189405441, + "learning_rate": 8.05693567879683e-05, + "loss": 0.9544237136840821, + "mean_token_accuracy": 0.7824135825037957, + "num_tokens": 19428449.0, + "step": 4840 + }, + { + "entropy": 1.0107353992760182, + "epoch": 1.856104094910065, + "grad_norm": 0.08587910234928131, + "learning_rate": 8.030079226534175e-05, + "loss": 1.0694228172302247, + "mean_token_accuracy": 0.7585012704133988, + "num_tokens": 19469625.0, + "step": 4850 + }, + { + "entropy": 0.9866157718002796, + "epoch": 1.8599311136624568, + "grad_norm": 0.11663772910833359, + "learning_rate": 8.00322277427152e-05, + "loss": 1.0394890785217286, + "mean_token_accuracy": 0.7632519364356994, + "num_tokens": 19510480.0, + "step": 4860 + }, + { + "entropy": 0.881837759912014, + "epoch": 1.8637581324148487, + "grad_norm": 0.13599033653736115, + "learning_rate": 7.976366322008862e-05, + "loss": 0.9441938400268555, + "mean_token_accuracy": 0.7848336577415467, + "num_tokens": 19550305.0, + "step": 4870 + }, + { + "entropy": 0.9193019077181817, + "epoch": 1.8675851511672406, + "grad_norm": 0.09272989630699158, + "learning_rate": 7.949509869746207e-05, + "loss": 0.9862917900085449, + "mean_token_accuracy": 0.7736792579293251, + "num_tokens": 19588278.0, + "step": 4880 + }, + { + "entropy": 0.9219990812242032, + "epoch": 1.8714121699196324, + "grad_norm": 0.10006739944219589, + "learning_rate": 7.922653417483551e-05, + "loss": 0.9700265884399414, + "mean_token_accuracy": 0.780723437666893, + "num_tokens": 19625749.0, + "step": 4890 + }, + { + "entropy": 0.8564371943473816, + "epoch": 1.8752391886720245, + "grad_norm": 0.08216696232557297, + "learning_rate": 7.895796965220895e-05, + "loss": 0.9064787864685059, + "mean_token_accuracy": 0.7962334454059601, + "num_tokens": 19663935.0, + "step": 4900 + }, + { + "entropy": 0.9482992745935916, + "epoch": 1.8790662074244164, + "grad_norm": 0.06782303750514984, + "learning_rate": 7.868940512958238e-05, + "loss": 0.9920551300048828, + "mean_token_accuracy": 0.7696940049529075, + "num_tokens": 19704663.0, + "step": 4910 + }, + { + "entropy": 0.884397204965353, + "epoch": 1.8828932261768083, + "grad_norm": 0.06414399296045303, + "learning_rate": 7.842084060695583e-05, + "loss": 0.9083518981933594, + "mean_token_accuracy": 0.7858098462224007, + "num_tokens": 19753438.0, + "step": 4920 + }, + { + "entropy": 0.8019696604460478, + "epoch": 1.8867202449292002, + "grad_norm": 0.08456243574619293, + "learning_rate": 7.815227608432927e-05, + "loss": 0.8896969795227051, + "mean_token_accuracy": 0.8053153440356254, + "num_tokens": 19790404.0, + "step": 4930 + }, + { + "entropy": 0.8008564852178097, + "epoch": 1.890547263681592, + "grad_norm": 0.10543688386678696, + "learning_rate": 7.78837115617027e-05, + "loss": 0.8535223007202148, + "mean_token_accuracy": 0.8059684678912162, + "num_tokens": 19825645.0, + "step": 4940 + }, + { + "entropy": 0.8714719720184803, + "epoch": 1.894374282433984, + "grad_norm": 0.09498755633831024, + "learning_rate": 7.761514703907614e-05, + "loss": 0.9063860893249511, + "mean_token_accuracy": 0.7914781123399734, + "num_tokens": 19866092.0, + "step": 4950 + }, + { + "entropy": 0.9202240366488695, + "epoch": 1.8982013011863759, + "grad_norm": 0.07342597842216492, + "learning_rate": 7.734658251644959e-05, + "loss": 0.9767581939697265, + "mean_token_accuracy": 0.7755973920226097, + "num_tokens": 19907356.0, + "step": 4960 + }, + { + "entropy": 0.9477262906730175, + "epoch": 1.9020283199387678, + "grad_norm": 0.08742880076169968, + "learning_rate": 7.707801799382302e-05, + "loss": 1.0063783645629882, + "mean_token_accuracy": 0.7687567621469498, + "num_tokens": 19952869.0, + "step": 4970 + }, + { + "entropy": 0.977492806315422, + "epoch": 1.9058553386911596, + "grad_norm": 0.10321515798568726, + "learning_rate": 7.680945347119645e-05, + "loss": 1.0323823928833007, + "mean_token_accuracy": 0.7646988987922668, + "num_tokens": 19991372.0, + "step": 4980 + }, + { + "entropy": 0.7999268680810928, + "epoch": 1.9096823574435515, + "grad_norm": 0.08925452828407288, + "learning_rate": 7.65408889485699e-05, + "loss": 0.8391226768493653, + "mean_token_accuracy": 0.8017501994967461, + "num_tokens": 20029189.0, + "step": 4990 + }, + { + "entropy": 0.8757653787732125, + "epoch": 1.9135093761959434, + "grad_norm": 0.1915360540151596, + "learning_rate": 7.627232442594334e-05, + "loss": 0.9243562698364258, + "mean_token_accuracy": 0.7841411307454109, + "num_tokens": 20070611.0, + "step": 5000 + }, + { + "entropy": 0.9357082359492779, + "epoch": 1.9173363949483353, + "grad_norm": 0.08219558745622635, + "learning_rate": 7.600375990331677e-05, + "loss": 0.9772232055664063, + "mean_token_accuracy": 0.7725088000297546, + "num_tokens": 20110392.0, + "step": 5010 + }, + { + "entropy": 0.9191611532121897, + "epoch": 1.9211634137007272, + "grad_norm": 0.07629676163196564, + "learning_rate": 7.573519538069022e-05, + "loss": 0.9754646301269532, + "mean_token_accuracy": 0.7830281540751457, + "num_tokens": 20150683.0, + "step": 5020 + }, + { + "entropy": 0.9279548175632953, + "epoch": 1.924990432453119, + "grad_norm": 0.09845773130655289, + "learning_rate": 7.546663085806366e-05, + "loss": 0.9818471908569336, + "mean_token_accuracy": 0.7738550245761872, + "num_tokens": 20190521.0, + "step": 5030 + }, + { + "entropy": 0.9281142316758633, + "epoch": 1.928817451205511, + "grad_norm": 0.10571245104074478, + "learning_rate": 7.519806633543708e-05, + "loss": 0.999634075164795, + "mean_token_accuracy": 0.7708285465836525, + "num_tokens": 20230615.0, + "step": 5040 + }, + { + "entropy": 0.8793018095195293, + "epoch": 1.9326444699579028, + "grad_norm": 0.11255183815956116, + "learning_rate": 7.492950181281053e-05, + "loss": 0.9399495124816895, + "mean_token_accuracy": 0.7893765285611153, + "num_tokens": 20269332.0, + "step": 5050 + }, + { + "entropy": 0.8188632413744926, + "epoch": 1.9364714887102947, + "grad_norm": 0.08683498203754425, + "learning_rate": 7.466093729018398e-05, + "loss": 0.8760917663574219, + "mean_token_accuracy": 0.800470444560051, + "num_tokens": 20316849.0, + "step": 5060 + }, + { + "entropy": 0.9165158126503229, + "epoch": 1.9402985074626866, + "grad_norm": 0.12123431265354156, + "learning_rate": 7.439237276755741e-05, + "loss": 0.9515151023864746, + "mean_token_accuracy": 0.7772148326039314, + "num_tokens": 20354641.0, + "step": 5070 + }, + { + "entropy": 0.8890400048345327, + "epoch": 1.9441255262150785, + "grad_norm": 0.09551843255758286, + "learning_rate": 7.412380824493084e-05, + "loss": 0.9720385551452637, + "mean_token_accuracy": 0.7855533555150032, + "num_tokens": 20400703.0, + "step": 5080 + }, + { + "entropy": 0.9226945102214813, + "epoch": 1.9479525449674704, + "grad_norm": 0.11462504416704178, + "learning_rate": 7.385524372230429e-05, + "loss": 0.9757321357727051, + "mean_token_accuracy": 0.7739654749631881, + "num_tokens": 20442145.0, + "step": 5090 + }, + { + "entropy": 0.8108384694904089, + "epoch": 1.9517795637198623, + "grad_norm": 0.13017524778842926, + "learning_rate": 7.358667919967772e-05, + "loss": 0.8620017051696778, + "mean_token_accuracy": 0.8028488114476204, + "num_tokens": 20472714.0, + "step": 5100 + }, + { + "entropy": 0.9563053950667382, + "epoch": 1.9556065824722542, + "grad_norm": 0.10588496923446655, + "learning_rate": 7.331811467705116e-05, + "loss": 0.9805202484130859, + "mean_token_accuracy": 0.7729632049798966, + "num_tokens": 20518593.0, + "step": 5110 + }, + { + "entropy": 0.9307407476007938, + "epoch": 1.959433601224646, + "grad_norm": 0.09899015724658966, + "learning_rate": 7.30495501544246e-05, + "loss": 0.998748779296875, + "mean_token_accuracy": 0.7733172833919525, + "num_tokens": 20558008.0, + "step": 5120 + }, + { + "entropy": 0.9505821786820888, + "epoch": 1.963260619977038, + "grad_norm": 0.0943673700094223, + "learning_rate": 7.278098563179804e-05, + "loss": 1.0047925949096679, + "mean_token_accuracy": 0.7691358909010887, + "num_tokens": 20603741.0, + "step": 5130 + }, + { + "entropy": 1.04148171544075, + "epoch": 1.9670876387294298, + "grad_norm": 0.08869694918394089, + "learning_rate": 7.251242110917149e-05, + "loss": 1.0801177024841309, + "mean_token_accuracy": 0.7499634683132171, + "num_tokens": 20645827.0, + "step": 5140 + }, + { + "entropy": 0.7822969853878021, + "epoch": 1.9709146574818217, + "grad_norm": 0.0994991883635521, + "learning_rate": 7.224385658654492e-05, + "loss": 0.8042619705200196, + "mean_token_accuracy": 0.8097834318876267, + "num_tokens": 20684019.0, + "step": 5150 + }, + { + "entropy": 0.918664800748229, + "epoch": 1.9747416762342136, + "grad_norm": 0.11157739907503128, + "learning_rate": 7.197529206391837e-05, + "loss": 0.983153247833252, + "mean_token_accuracy": 0.7776870116591453, + "num_tokens": 20726278.0, + "step": 5160 + }, + { + "entropy": 0.911195681989193, + "epoch": 1.9785686949866055, + "grad_norm": 0.13472694158554077, + "learning_rate": 7.17067275412918e-05, + "loss": 0.9662351608276367, + "mean_token_accuracy": 0.7743990138173104, + "num_tokens": 20759927.0, + "step": 5170 + }, + { + "entropy": 0.8238823972642422, + "epoch": 1.9823957137389974, + "grad_norm": 0.08864834159612656, + "learning_rate": 7.143816301866523e-05, + "loss": 0.8870213508605957, + "mean_token_accuracy": 0.7989589869976044, + "num_tokens": 20798325.0, + "step": 5180 + }, + { + "entropy": 0.9405660286545754, + "epoch": 1.9862227324913893, + "grad_norm": 0.08372621983289719, + "learning_rate": 7.116959849603868e-05, + "loss": 0.9449873924255371, + "mean_token_accuracy": 0.7792889401316643, + "num_tokens": 20837136.0, + "step": 5190 + }, + { + "entropy": 0.8287422813475132, + "epoch": 1.9900497512437811, + "grad_norm": 0.0968240275979042, + "learning_rate": 7.090103397341211e-05, + "loss": 0.8873905181884766, + "mean_token_accuracy": 0.7976622357964516, + "num_tokens": 20877693.0, + "step": 5200 + }, + { + "entropy": 0.9188660819083452, + "epoch": 1.993876769996173, + "grad_norm": 0.09275626391172409, + "learning_rate": 7.063246945078555e-05, + "loss": 0.989016342163086, + "mean_token_accuracy": 0.7755422025918961, + "num_tokens": 20924885.0, + "step": 5210 + }, + { + "entropy": 0.9058490604162216, + "epoch": 1.997703788748565, + "grad_norm": 0.08644875138998032, + "learning_rate": 7.0363904928159e-05, + "loss": 0.9660470008850097, + "mean_token_accuracy": 0.7761533245444298, + "num_tokens": 20966342.0, + "step": 5220 + }, + { + "entropy": 0.7741431064903737, + "epoch": 2.0015308075009566, + "grad_norm": 0.07492107152938843, + "learning_rate": 7.009534040553243e-05, + "loss": 0.8241374015808105, + "mean_token_accuracy": 0.8149536207318306, + "num_tokens": 21004798.0, + "step": 5230 + }, + { + "entropy": 0.8813200116157531, + "epoch": 2.0053578262533485, + "grad_norm": 0.07805436849594116, + "learning_rate": 6.982677588290588e-05, + "loss": 0.921663761138916, + "mean_token_accuracy": 0.7912002876400948, + "num_tokens": 21049021.0, + "step": 5240 + }, + { + "entropy": 0.8896506872028113, + "epoch": 2.0091848450057403, + "grad_norm": 0.13928763568401337, + "learning_rate": 6.955821136027931e-05, + "loss": 0.9278170585632324, + "mean_token_accuracy": 0.7765205070376396, + "num_tokens": 21086531.0, + "step": 5250 + }, + { + "entropy": 0.9149777121841908, + "epoch": 2.0130118637581322, + "grad_norm": 0.06992843002080917, + "learning_rate": 6.928964683765274e-05, + "loss": 0.9667098045349121, + "mean_token_accuracy": 0.7750229969620704, + "num_tokens": 21127453.0, + "step": 5260 + }, + { + "entropy": 0.8076952576637269, + "epoch": 2.016838882510524, + "grad_norm": 0.12632791697978973, + "learning_rate": 6.902108231502619e-05, + "loss": 0.8237466812133789, + "mean_token_accuracy": 0.804887568950653, + "num_tokens": 21165297.0, + "step": 5270 + }, + { + "entropy": 0.8818444184958935, + "epoch": 2.020665901262916, + "grad_norm": 0.08924616128206253, + "learning_rate": 6.875251779239962e-05, + "loss": 0.9049506187438965, + "mean_token_accuracy": 0.7822276562452316, + "num_tokens": 21206219.0, + "step": 5280 + }, + { + "entropy": 0.7953705489635468, + "epoch": 2.024492920015308, + "grad_norm": 0.1111336424946785, + "learning_rate": 6.848395326977307e-05, + "loss": 0.8433744430541992, + "mean_token_accuracy": 0.8049945279955864, + "num_tokens": 21249239.0, + "step": 5290 + }, + { + "entropy": 0.904665675573051, + "epoch": 2.0283199387677, + "grad_norm": 0.09494993835687637, + "learning_rate": 6.82153887471465e-05, + "loss": 0.9693451881408691, + "mean_token_accuracy": 0.779350683093071, + "num_tokens": 21289639.0, + "step": 5300 + }, + { + "entropy": 0.7958274722099304, + "epoch": 2.0321469575200917, + "grad_norm": 0.10396509617567062, + "learning_rate": 6.794682422451995e-05, + "loss": 0.8559811592102051, + "mean_token_accuracy": 0.8057383120059967, + "num_tokens": 21329136.0, + "step": 5310 + }, + { + "entropy": 0.9416906848549843, + "epoch": 2.0359739762724836, + "grad_norm": 0.08166563510894775, + "learning_rate": 6.767825970189338e-05, + "loss": 0.9891387939453125, + "mean_token_accuracy": 0.7737650781869888, + "num_tokens": 21371300.0, + "step": 5320 + }, + { + "entropy": 0.9342201549559832, + "epoch": 2.0398009950248754, + "grad_norm": 0.09459090232849121, + "learning_rate": 6.740969517926682e-05, + "loss": 0.9509946823120117, + "mean_token_accuracy": 0.7751364663243294, + "num_tokens": 21412268.0, + "step": 5330 + }, + { + "entropy": 0.8397190041840077, + "epoch": 2.0436280137772673, + "grad_norm": 0.10005268454551697, + "learning_rate": 6.714113065664026e-05, + "loss": 0.9056560516357421, + "mean_token_accuracy": 0.79336898624897, + "num_tokens": 21451975.0, + "step": 5340 + }, + { + "entropy": 0.9148454248905182, + "epoch": 2.047455032529659, + "grad_norm": 0.10257065296173096, + "learning_rate": 6.68725661340137e-05, + "loss": 0.9611604690551758, + "mean_token_accuracy": 0.7737416908144951, + "num_tokens": 21491818.0, + "step": 5350 + }, + { + "entropy": 0.9010646104812622, + "epoch": 2.051282051282051, + "grad_norm": 0.11826229095458984, + "learning_rate": 6.660400161138713e-05, + "loss": 0.9446893692016601, + "mean_token_accuracy": 0.7851994633674622, + "num_tokens": 21528066.0, + "step": 5360 + }, + { + "entropy": 0.8987722039222718, + "epoch": 2.055109070034443, + "grad_norm": 0.10371451824903488, + "learning_rate": 6.633543708876058e-05, + "loss": 0.9595455169677735, + "mean_token_accuracy": 0.7833559066057205, + "num_tokens": 21562883.0, + "step": 5370 + }, + { + "entropy": 0.8856854721903801, + "epoch": 2.058936088786835, + "grad_norm": 0.1089499220252037, + "learning_rate": 6.606687256613403e-05, + "loss": 0.9219722747802734, + "mean_token_accuracy": 0.7822227373719215, + "num_tokens": 21600910.0, + "step": 5380 + }, + { + "entropy": 0.8720096081495285, + "epoch": 2.0627631075392268, + "grad_norm": 0.09962328523397446, + "learning_rate": 6.579830804350745e-05, + "loss": 0.9654089927673339, + "mean_token_accuracy": 0.7856920391321183, + "num_tokens": 21640445.0, + "step": 5390 + }, + { + "entropy": 0.9440382812172174, + "epoch": 2.0665901262916186, + "grad_norm": 0.08670477569103241, + "learning_rate": 6.552974352088089e-05, + "loss": 0.9934238433837891, + "mean_token_accuracy": 0.7687147289514542, + "num_tokens": 21682432.0, + "step": 5400 + }, + { + "entropy": 0.774172055721283, + "epoch": 2.0704171450440105, + "grad_norm": 0.11862040311098099, + "learning_rate": 6.526117899825434e-05, + "loss": 0.8106603622436523, + "mean_token_accuracy": 0.8135839134454728, + "num_tokens": 21721359.0, + "step": 5410 + }, + { + "entropy": 0.9194908868521452, + "epoch": 2.0742441637964024, + "grad_norm": 0.10227365791797638, + "learning_rate": 6.499261447562777e-05, + "loss": 0.9410523414611817, + "mean_token_accuracy": 0.7788734346628189, + "num_tokens": 21763700.0, + "step": 5420 + }, + { + "entropy": 0.7955736435949803, + "epoch": 2.0780711825487943, + "grad_norm": 0.09657785296440125, + "learning_rate": 6.472404995300121e-05, + "loss": 0.8665301322937011, + "mean_token_accuracy": 0.8067882195115089, + "num_tokens": 21804190.0, + "step": 5430 + }, + { + "entropy": 0.8065498791635036, + "epoch": 2.081898201301186, + "grad_norm": 0.11568085849285126, + "learning_rate": 6.445548543037465e-05, + "loss": 0.8515932083129882, + "mean_token_accuracy": 0.8035058185458184, + "num_tokens": 21839801.0, + "step": 5440 + }, + { + "entropy": 0.9087674509733915, + "epoch": 2.085725220053578, + "grad_norm": 0.09318574517965317, + "learning_rate": 6.418692090774809e-05, + "loss": 0.9387861251831054, + "mean_token_accuracy": 0.77939523011446, + "num_tokens": 21877125.0, + "step": 5450 + }, + { + "entropy": 0.86418566852808, + "epoch": 2.08955223880597, + "grad_norm": 0.08796729892492294, + "learning_rate": 6.391835638512152e-05, + "loss": 0.9152085304260253, + "mean_token_accuracy": 0.7899368211627007, + "num_tokens": 21921493.0, + "step": 5460 + }, + { + "entropy": 0.8593201294541359, + "epoch": 2.093379257558362, + "grad_norm": 0.14465564489364624, + "learning_rate": 6.364979186249497e-05, + "loss": 0.8955412864685058, + "mean_token_accuracy": 0.7898772984743119, + "num_tokens": 21961188.0, + "step": 5470 + }, + { + "entropy": 0.8998314358294011, + "epoch": 2.0972062763107537, + "grad_norm": 0.11634784191846848, + "learning_rate": 6.338122733986842e-05, + "loss": 0.9114861488342285, + "mean_token_accuracy": 0.7838647082448006, + "num_tokens": 22001738.0, + "step": 5480 + }, + { + "entropy": 0.8693659231066704, + "epoch": 2.1010332950631456, + "grad_norm": 0.11536803841590881, + "learning_rate": 6.311266281724184e-05, + "loss": 0.9232154846191406, + "mean_token_accuracy": 0.7881089702248574, + "num_tokens": 22039626.0, + "step": 5490 + }, + { + "entropy": 0.9556272588670254, + "epoch": 2.1048603138155375, + "grad_norm": 0.09614596515893936, + "learning_rate": 6.284409829461528e-05, + "loss": 1.0266177177429199, + "mean_token_accuracy": 0.7646962344646454, + "num_tokens": 22081971.0, + "step": 5500 + }, + { + "entropy": 0.7735307298600673, + "epoch": 2.1086873325679294, + "grad_norm": 0.10002073645591736, + "learning_rate": 6.257553377198873e-05, + "loss": 0.8011887550354004, + "mean_token_accuracy": 0.8088100135326386, + "num_tokens": 22117897.0, + "step": 5510 + }, + { + "entropy": 0.8981072999536991, + "epoch": 2.1125143513203213, + "grad_norm": 0.10524707287549973, + "learning_rate": 6.230696924936216e-05, + "loss": 0.9659936904907227, + "mean_token_accuracy": 0.7843907788395882, + "num_tokens": 22161049.0, + "step": 5520 + }, + { + "entropy": 0.8891891561448574, + "epoch": 2.116341370072713, + "grad_norm": 0.10095740854740143, + "learning_rate": 6.20384047267356e-05, + "loss": 0.9199987411499023, + "mean_token_accuracy": 0.7833669245243072, + "num_tokens": 22201183.0, + "step": 5530 + }, + { + "entropy": 0.9359986830502749, + "epoch": 2.120168388825105, + "grad_norm": 0.08723930269479752, + "learning_rate": 6.176984020410904e-05, + "loss": 0.9635790824890137, + "mean_token_accuracy": 0.7724878415465355, + "num_tokens": 22240779.0, + "step": 5540 + }, + { + "entropy": 0.8017430886626243, + "epoch": 2.123995407577497, + "grad_norm": 0.10579924285411835, + "learning_rate": 6.150127568148249e-05, + "loss": 0.842125129699707, + "mean_token_accuracy": 0.8020379558205605, + "num_tokens": 22279289.0, + "step": 5550 + }, + { + "entropy": 0.7666160762310028, + "epoch": 2.127822426329889, + "grad_norm": 0.09871628880500793, + "learning_rate": 6.123271115885591e-05, + "loss": 0.8378163337707519, + "mean_token_accuracy": 0.8119754999876022, + "num_tokens": 22316715.0, + "step": 5560 + }, + { + "entropy": 0.9505756117403508, + "epoch": 2.1316494450822807, + "grad_norm": 0.11093632131814957, + "learning_rate": 6.096414663622936e-05, + "loss": 0.9677371025085449, + "mean_token_accuracy": 0.7698320209980011, + "num_tokens": 22360112.0, + "step": 5570 + }, + { + "entropy": 0.7982158973813057, + "epoch": 2.1354764638346726, + "grad_norm": 0.11260368674993515, + "learning_rate": 6.06955821136028e-05, + "loss": 0.8571239471435547, + "mean_token_accuracy": 0.804571321606636, + "num_tokens": 22399114.0, + "step": 5580 + }, + { + "entropy": 0.8869463637471199, + "epoch": 2.1393034825870645, + "grad_norm": 0.08550643920898438, + "learning_rate": 6.042701759097623e-05, + "loss": 0.9476675033569336, + "mean_token_accuracy": 0.7807673364877701, + "num_tokens": 22440187.0, + "step": 5590 + }, + { + "entropy": 0.9491269618272782, + "epoch": 2.1431305013394564, + "grad_norm": 0.09019884467124939, + "learning_rate": 6.015845306834967e-05, + "loss": 1.0232599258422852, + "mean_token_accuracy": 0.7681664958596229, + "num_tokens": 22479682.0, + "step": 5600 + }, + { + "entropy": 0.8861779697239399, + "epoch": 2.1469575200918483, + "grad_norm": 0.11756031215190887, + "learning_rate": 5.988988854572312e-05, + "loss": 0.9251557350158691, + "mean_token_accuracy": 0.7849425792694091, + "num_tokens": 22520352.0, + "step": 5610 + }, + { + "entropy": 0.8735060147941113, + "epoch": 2.15078453884424, + "grad_norm": 0.0996679812669754, + "learning_rate": 5.9621324023096546e-05, + "loss": 0.9677264213562011, + "mean_token_accuracy": 0.7881714150309562, + "num_tokens": 22561677.0, + "step": 5620 + }, + { + "entropy": 0.991636025160551, + "epoch": 2.154611557596632, + "grad_norm": 0.10682649165391922, + "learning_rate": 5.935275950046999e-05, + "loss": 1.050811195373535, + "mean_token_accuracy": 0.7574850931763649, + "num_tokens": 22609671.0, + "step": 5630 + }, + { + "entropy": 0.9028345100581646, + "epoch": 2.158438576349024, + "grad_norm": 0.11249802261590958, + "learning_rate": 5.908419497784343e-05, + "loss": 0.9876343727111816, + "mean_token_accuracy": 0.783162035048008, + "num_tokens": 22650924.0, + "step": 5640 + }, + { + "entropy": 0.868353420495987, + "epoch": 2.162265595101416, + "grad_norm": 0.08846433460712433, + "learning_rate": 5.8815630455216867e-05, + "loss": 0.9271388053894043, + "mean_token_accuracy": 0.7898381799459457, + "num_tokens": 22691550.0, + "step": 5650 + }, + { + "entropy": 0.9247912406921387, + "epoch": 2.1660926138538077, + "grad_norm": 0.10013602674007416, + "learning_rate": 5.854706593259031e-05, + "loss": 1.0093653678894043, + "mean_token_accuracy": 0.7723490744829178, + "num_tokens": 22728956.0, + "step": 5660 + }, + { + "entropy": 0.82930968105793, + "epoch": 2.1699196326061996, + "grad_norm": 0.11004043370485306, + "learning_rate": 5.827850140996375e-05, + "loss": 0.8801467895507813, + "mean_token_accuracy": 0.798722094297409, + "num_tokens": 22765064.0, + "step": 5670 + }, + { + "entropy": 0.8950945638120175, + "epoch": 2.1737466513585915, + "grad_norm": 0.09994686394929886, + "learning_rate": 5.800993688733719e-05, + "loss": 0.9781051635742187, + "mean_token_accuracy": 0.7849533364176751, + "num_tokens": 22802213.0, + "step": 5680 + }, + { + "entropy": 0.8847132481634616, + "epoch": 2.1775736701109834, + "grad_norm": 0.09891512989997864, + "learning_rate": 5.774137236471062e-05, + "loss": 0.9338027954101562, + "mean_token_accuracy": 0.7867394030094147, + "num_tokens": 22839400.0, + "step": 5690 + }, + { + "entropy": 0.8212509788572788, + "epoch": 2.1814006888633752, + "grad_norm": 0.10451705008745193, + "learning_rate": 5.747280784208406e-05, + "loss": 0.8740688323974609, + "mean_token_accuracy": 0.7968196496367455, + "num_tokens": 22877771.0, + "step": 5700 + }, + { + "entropy": 0.7856742814183235, + "epoch": 2.185227707615767, + "grad_norm": 0.09351614862680435, + "learning_rate": 5.720424331945751e-05, + "loss": 0.8385543823242188, + "mean_token_accuracy": 0.8064358577132225, + "num_tokens": 22916159.0, + "step": 5710 + }, + { + "entropy": 0.9431014984846116, + "epoch": 2.189054726368159, + "grad_norm": 0.09432144463062286, + "learning_rate": 5.6935678796830935e-05, + "loss": 1.0021851539611817, + "mean_token_accuracy": 0.7693860113620759, + "num_tokens": 22958014.0, + "step": 5720 + }, + { + "entropy": 0.9080683786422015, + "epoch": 2.192881745120551, + "grad_norm": 0.08724278956651688, + "learning_rate": 5.666711427420438e-05, + "loss": 0.9878963470458985, + "mean_token_accuracy": 0.7802156403660774, + "num_tokens": 23003222.0, + "step": 5730 + }, + { + "entropy": 0.8772326201200485, + "epoch": 2.196708763872943, + "grad_norm": 0.1096489354968071, + "learning_rate": 5.639854975157782e-05, + "loss": 0.9326786041259766, + "mean_token_accuracy": 0.7881689593195915, + "num_tokens": 23039512.0, + "step": 5740 + }, + { + "entropy": 0.9084336057305336, + "epoch": 2.2005357826253347, + "grad_norm": 0.11137977987527847, + "learning_rate": 5.6129985228951256e-05, + "loss": 0.9574773788452149, + "mean_token_accuracy": 0.7860094889998436, + "num_tokens": 23078238.0, + "step": 5750 + }, + { + "entropy": 0.836103780195117, + "epoch": 2.2043628013777266, + "grad_norm": 0.11038387566804886, + "learning_rate": 5.5861420706324696e-05, + "loss": 0.88037109375, + "mean_token_accuracy": 0.7916925936937332, + "num_tokens": 23121089.0, + "step": 5760 + }, + { + "entropy": 0.9425606489181518, + "epoch": 2.2081898201301184, + "grad_norm": 0.10270453989505768, + "learning_rate": 5.5592856183698137e-05, + "loss": 0.983431339263916, + "mean_token_accuracy": 0.7715479463338852, + "num_tokens": 23158047.0, + "step": 5770 + }, + { + "entropy": 0.8212515480816365, + "epoch": 2.2120168388825103, + "grad_norm": 0.0880119651556015, + "learning_rate": 5.532429166107157e-05, + "loss": 0.887947940826416, + "mean_token_accuracy": 0.7997770145535469, + "num_tokens": 23204019.0, + "step": 5780 + }, + { + "entropy": 0.8668085850775242, + "epoch": 2.215843857634902, + "grad_norm": 0.11390146613121033, + "learning_rate": 5.505572713844501e-05, + "loss": 0.9010316848754882, + "mean_token_accuracy": 0.7880747586488723, + "num_tokens": 23241922.0, + "step": 5790 + }, + { + "entropy": 0.7907863073050976, + "epoch": 2.219670876387294, + "grad_norm": 0.11713080108165741, + "learning_rate": 5.478716261581846e-05, + "loss": 0.8595284461975098, + "mean_token_accuracy": 0.8068661123514176, + "num_tokens": 23280534.0, + "step": 5800 + }, + { + "entropy": 0.8358560226857662, + "epoch": 2.223497895139686, + "grad_norm": 0.11117064207792282, + "learning_rate": 5.45185980931919e-05, + "loss": 0.8745571136474609, + "mean_token_accuracy": 0.793362820148468, + "num_tokens": 23323119.0, + "step": 5810 + }, + { + "entropy": 0.8238232973963022, + "epoch": 2.227324913892078, + "grad_norm": 0.13185663521289825, + "learning_rate": 5.425003357056533e-05, + "loss": 0.8659845352172851, + "mean_token_accuracy": 0.8025152862071991, + "num_tokens": 23363749.0, + "step": 5820 + }, + { + "entropy": 0.8596846207976341, + "epoch": 2.2311519326444698, + "grad_norm": 0.09360291808843613, + "learning_rate": 5.398146904793877e-05, + "loss": 0.9118245124816895, + "mean_token_accuracy": 0.7882251426577568, + "num_tokens": 23402886.0, + "step": 5830 + }, + { + "entropy": 0.8035648860037327, + "epoch": 2.2349789513968616, + "grad_norm": 0.09347285330295563, + "learning_rate": 5.371290452531221e-05, + "loss": 0.8725827217102051, + "mean_token_accuracy": 0.8045972406864166, + "num_tokens": 23442339.0, + "step": 5840 + }, + { + "entropy": 0.9175308585166931, + "epoch": 2.2388059701492535, + "grad_norm": 0.12336985766887665, + "learning_rate": 5.3444340002685645e-05, + "loss": 0.9388077735900879, + "mean_token_accuracy": 0.7768721342086792, + "num_tokens": 23481344.0, + "step": 5850 + }, + { + "entropy": 0.868817687779665, + "epoch": 2.2426329889016454, + "grad_norm": 0.10311949998140335, + "learning_rate": 5.3175775480059086e-05, + "loss": 0.9337680816650391, + "mean_token_accuracy": 0.7877210825681686, + "num_tokens": 23520637.0, + "step": 5860 + }, + { + "entropy": 0.854228886961937, + "epoch": 2.2464600076540373, + "grad_norm": 0.10659918189048767, + "learning_rate": 5.2907210957432526e-05, + "loss": 0.9077530860900879, + "mean_token_accuracy": 0.7909654468297959, + "num_tokens": 23559877.0, + "step": 5870 + }, + { + "entropy": 0.8457217663526535, + "epoch": 2.250287026406429, + "grad_norm": 0.09633689373731613, + "learning_rate": 5.263864643480596e-05, + "loss": 0.8785475730895996, + "mean_token_accuracy": 0.7941769883036613, + "num_tokens": 23597033.0, + "step": 5880 + }, + { + "entropy": 0.8822055049240589, + "epoch": 2.254114045158821, + "grad_norm": 0.09562286734580994, + "learning_rate": 5.23700819121794e-05, + "loss": 0.8851138114929199, + "mean_token_accuracy": 0.7860250055789948, + "num_tokens": 23634788.0, + "step": 5890 + }, + { + "entropy": 0.8556318368762732, + "epoch": 2.257941063911213, + "grad_norm": 0.08814764767885208, + "learning_rate": 5.210151738955285e-05, + "loss": 0.8866415977478027, + "mean_token_accuracy": 0.7966004252433777, + "num_tokens": 23673283.0, + "step": 5900 + }, + { + "entropy": 0.7395530994981527, + "epoch": 2.261768082663605, + "grad_norm": 0.07671936601400375, + "learning_rate": 5.1832952866926274e-05, + "loss": 0.7680532455444335, + "mean_token_accuracy": 0.8190904691815376, + "num_tokens": 23711540.0, + "step": 5910 + }, + { + "entropy": 0.8898126773536206, + "epoch": 2.2655951014159967, + "grad_norm": 0.06960798799991608, + "learning_rate": 5.156438834429972e-05, + "loss": 1.026920700073242, + "mean_token_accuracy": 0.7816770374774933, + "num_tokens": 23756178.0, + "step": 5920 + }, + { + "entropy": 0.8902945756912232, + "epoch": 2.2694221201683886, + "grad_norm": 0.1114925891160965, + "learning_rate": 5.129582382167316e-05, + "loss": 0.9598423957824707, + "mean_token_accuracy": 0.784630736708641, + "num_tokens": 23792151.0, + "step": 5930 + }, + { + "entropy": 0.8439918398857117, + "epoch": 2.2732491389207805, + "grad_norm": 0.16730423271656036, + "learning_rate": 5.10272592990466e-05, + "loss": 0.851725959777832, + "mean_token_accuracy": 0.7940610617399215, + "num_tokens": 23830309.0, + "step": 5940 + }, + { + "entropy": 0.9178552135825158, + "epoch": 2.2770761576731724, + "grad_norm": 0.16359879076480865, + "learning_rate": 5.0758694776420035e-05, + "loss": 0.9417426109313964, + "mean_token_accuracy": 0.7781487166881561, + "num_tokens": 23874638.0, + "step": 5950 + }, + { + "entropy": 0.9053961969912052, + "epoch": 2.2809031764255643, + "grad_norm": 0.08877693116664886, + "learning_rate": 5.0490130253793475e-05, + "loss": 0.9975083351135254, + "mean_token_accuracy": 0.7837231978774071, + "num_tokens": 23918641.0, + "step": 5960 + }, + { + "entropy": 0.8590337552130223, + "epoch": 2.284730195177956, + "grad_norm": 0.1032002717256546, + "learning_rate": 5.022156573116692e-05, + "loss": 0.8895168304443359, + "mean_token_accuracy": 0.7937395930290222, + "num_tokens": 23964403.0, + "step": 5970 + }, + { + "entropy": 0.8678315542638302, + "epoch": 2.288557213930348, + "grad_norm": 0.12054577469825745, + "learning_rate": 4.9953001208540356e-05, + "loss": 0.9571179389953614, + "mean_token_accuracy": 0.7875312000513077, + "num_tokens": 24001736.0, + "step": 5980 + }, + { + "entropy": 0.8353918489068747, + "epoch": 2.29238423268274, + "grad_norm": 0.1126277968287468, + "learning_rate": 4.9684436685913796e-05, + "loss": 0.927174186706543, + "mean_token_accuracy": 0.7998543947935104, + "num_tokens": 24038494.0, + "step": 5990 + }, + { + "entropy": 0.7281714532524347, + "epoch": 2.296211251435132, + "grad_norm": 0.09404657036066055, + "learning_rate": 4.941587216328723e-05, + "loss": 0.7814407825469971, + "mean_token_accuracy": 0.8194777265191078, + "num_tokens": 24077404.0, + "step": 6000 + }, + { + "entropy": 0.8627386562526226, + "epoch": 2.3000382701875237, + "grad_norm": 0.07272294908761978, + "learning_rate": 4.914730764066067e-05, + "loss": 0.8920239448547364, + "mean_token_accuracy": 0.7905093863606453, + "num_tokens": 24123483.0, + "step": 6010 + }, + { + "entropy": 0.8679380901157856, + "epoch": 2.3038652889399156, + "grad_norm": 0.09443669021129608, + "learning_rate": 4.887874311803411e-05, + "loss": 0.874543571472168, + "mean_token_accuracy": 0.7891486629843711, + "num_tokens": 24165215.0, + "step": 6020 + }, + { + "entropy": 0.8942526787519455, + "epoch": 2.3076923076923075, + "grad_norm": 0.0953405573964119, + "learning_rate": 4.861017859540755e-05, + "loss": 0.9304584503173828, + "mean_token_accuracy": 0.7855148240923882, + "num_tokens": 24204454.0, + "step": 6030 + }, + { + "entropy": 0.7896301347762347, + "epoch": 2.3115193264446994, + "grad_norm": 0.11093971133232117, + "learning_rate": 4.834161407278099e-05, + "loss": 0.8957646369934082, + "mean_token_accuracy": 0.8066290900111198, + "num_tokens": 24245578.0, + "step": 6040 + }, + { + "entropy": 0.9012999664992094, + "epoch": 2.3153463451970913, + "grad_norm": 0.09953141212463379, + "learning_rate": 4.8073049550154424e-05, + "loss": 0.9699124336242676, + "mean_token_accuracy": 0.7792607560753823, + "num_tokens": 24286627.0, + "step": 6050 + }, + { + "entropy": 0.8553815156221389, + "epoch": 2.319173363949483, + "grad_norm": 0.09737669676542282, + "learning_rate": 4.7804485027527864e-05, + "loss": 0.9319831848144531, + "mean_token_accuracy": 0.7943563163280487, + "num_tokens": 24326050.0, + "step": 6060 + }, + { + "entropy": 0.8088245622813701, + "epoch": 2.323000382701875, + "grad_norm": 0.11754145473241806, + "learning_rate": 4.7535920504901305e-05, + "loss": 0.8612746238708496, + "mean_token_accuracy": 0.7998821645975113, + "num_tokens": 24365505.0, + "step": 6070 + }, + { + "entropy": 0.8720655493438244, + "epoch": 2.326827401454267, + "grad_norm": 0.10582665354013443, + "learning_rate": 4.726735598227474e-05, + "loss": 0.9663046836853028, + "mean_token_accuracy": 0.78773233294487, + "num_tokens": 24403619.0, + "step": 6080 + }, + { + "entropy": 0.814146314561367, + "epoch": 2.330654420206659, + "grad_norm": 0.10099766403436661, + "learning_rate": 4.6998791459648185e-05, + "loss": 0.8403602600097656, + "mean_token_accuracy": 0.8022790655493737, + "num_tokens": 24441133.0, + "step": 6090 + }, + { + "entropy": 0.8325122386217118, + "epoch": 2.3344814389590507, + "grad_norm": 0.0968555137515068, + "learning_rate": 4.673022693702162e-05, + "loss": 0.8952775955200195, + "mean_token_accuracy": 0.7972952157258988, + "num_tokens": 24487908.0, + "step": 6100 + }, + { + "entropy": 0.8313679326325655, + "epoch": 2.3383084577114426, + "grad_norm": 0.09856109321117401, + "learning_rate": 4.6461662414395066e-05, + "loss": 0.8740328788757324, + "mean_token_accuracy": 0.7973453208804131, + "num_tokens": 24528859.0, + "step": 6110 + }, + { + "entropy": 0.9734285809099674, + "epoch": 2.3421354764638345, + "grad_norm": 0.08564373850822449, + "learning_rate": 4.61930978917685e-05, + "loss": 1.0028407096862793, + "mean_token_accuracy": 0.761284664273262, + "num_tokens": 24574604.0, + "step": 6120 + }, + { + "entropy": 0.9015337243676186, + "epoch": 2.3459624952162264, + "grad_norm": 0.09626568853855133, + "learning_rate": 4.592453336914194e-05, + "loss": 0.9965445518493652, + "mean_token_accuracy": 0.7804829552769661, + "num_tokens": 24615926.0, + "step": 6130 + }, + { + "entropy": 0.8764280565083027, + "epoch": 2.3497895139686182, + "grad_norm": 0.09104456007480621, + "learning_rate": 4.565596884651538e-05, + "loss": 0.9158814430236817, + "mean_token_accuracy": 0.7859255224466324, + "num_tokens": 24656662.0, + "step": 6140 + }, + { + "entropy": 0.8626538865268231, + "epoch": 2.35361653272101, + "grad_norm": 0.10454346984624863, + "learning_rate": 4.5387404323888814e-05, + "loss": 0.9093445777893067, + "mean_token_accuracy": 0.7909897804260254, + "num_tokens": 24696048.0, + "step": 6150 + }, + { + "entropy": 0.9042750746011734, + "epoch": 2.357443551473402, + "grad_norm": 0.09976542741060257, + "learning_rate": 4.511883980126226e-05, + "loss": 0.9527711868286133, + "mean_token_accuracy": 0.7807446241378784, + "num_tokens": 24738856.0, + "step": 6160 + }, + { + "entropy": 0.892713101953268, + "epoch": 2.361270570225794, + "grad_norm": 0.09778838604688644, + "learning_rate": 4.4850275278635694e-05, + "loss": 0.9142132759094238, + "mean_token_accuracy": 0.7793798848986626, + "num_tokens": 24781940.0, + "step": 6170 + }, + { + "entropy": 0.8652282394468784, + "epoch": 2.365097588978186, + "grad_norm": 0.13737474381923676, + "learning_rate": 4.4581710756009134e-05, + "loss": 0.9030959129333496, + "mean_token_accuracy": 0.7882118329405785, + "num_tokens": 24818476.0, + "step": 6180 + }, + { + "entropy": 0.880942365527153, + "epoch": 2.3689246077305777, + "grad_norm": 0.09460416436195374, + "learning_rate": 4.4313146233382575e-05, + "loss": 0.9684123992919922, + "mean_token_accuracy": 0.7829654842615128, + "num_tokens": 24856950.0, + "step": 6190 + }, + { + "entropy": 0.9563789039850235, + "epoch": 2.3727516264829696, + "grad_norm": 0.10954713076353073, + "learning_rate": 4.404458171075601e-05, + "loss": 1.029030704498291, + "mean_token_accuracy": 0.7727080956101418, + "num_tokens": 24895606.0, + "step": 6200 + }, + { + "entropy": 0.827500730752945, + "epoch": 2.3765786452353614, + "grad_norm": 0.1212112084031105, + "learning_rate": 4.377601718812945e-05, + "loss": 0.8650990486145019, + "mean_token_accuracy": 0.7993797525763512, + "num_tokens": 24932482.0, + "step": 6210 + }, + { + "entropy": 0.8221234314143657, + "epoch": 2.3804056639877533, + "grad_norm": 0.10023710876703262, + "learning_rate": 4.350745266550289e-05, + "loss": 0.8777777671813964, + "mean_token_accuracy": 0.7987013593316078, + "num_tokens": 24975109.0, + "step": 6220 + }, + { + "entropy": 0.8734230428934098, + "epoch": 2.384232682740145, + "grad_norm": 0.09403553605079651, + "learning_rate": 4.323888814287633e-05, + "loss": 0.8978803634643555, + "mean_token_accuracy": 0.7872134670615196, + "num_tokens": 25020916.0, + "step": 6230 + }, + { + "entropy": 0.9003870271146297, + "epoch": 2.388059701492537, + "grad_norm": 0.09854581952095032, + "learning_rate": 4.297032362024977e-05, + "loss": 0.9225659370422363, + "mean_token_accuracy": 0.7807397484779358, + "num_tokens": 25061018.0, + "step": 6240 + }, + { + "entropy": 0.8118300527334213, + "epoch": 2.391886720244929, + "grad_norm": 0.11139514297246933, + "learning_rate": 4.27017590976232e-05, + "loss": 0.8876243591308594, + "mean_token_accuracy": 0.800039604306221, + "num_tokens": 25097954.0, + "step": 6250 + }, + { + "entropy": 0.8419897515326739, + "epoch": 2.395713738997321, + "grad_norm": 0.09123879671096802, + "learning_rate": 4.243319457499664e-05, + "loss": 0.86744384765625, + "mean_token_accuracy": 0.7919191718101501, + "num_tokens": 25134260.0, + "step": 6260 + }, + { + "entropy": 0.9123246632516384, + "epoch": 2.3995407577497128, + "grad_norm": 0.10300562530755997, + "learning_rate": 4.2164630052370084e-05, + "loss": 0.9368386268615723, + "mean_token_accuracy": 0.7797829449176789, + "num_tokens": 25176001.0, + "step": 6270 + }, + { + "entropy": 0.9066010326147079, + "epoch": 2.4033677765021046, + "grad_norm": 0.10231593996286392, + "learning_rate": 4.1896065529743524e-05, + "loss": 0.9637252807617187, + "mean_token_accuracy": 0.7807635113596916, + "num_tokens": 25214450.0, + "step": 6280 + }, + { + "entropy": 0.8680018067359925, + "epoch": 2.4071947952544965, + "grad_norm": 0.09813899546861649, + "learning_rate": 4.162750100711696e-05, + "loss": 0.9405930519104004, + "mean_token_accuracy": 0.7862071350216866, + "num_tokens": 25249019.0, + "step": 6290 + }, + { + "entropy": 0.8444254245609045, + "epoch": 2.4110218140068884, + "grad_norm": 0.09815159440040588, + "learning_rate": 4.1358936484490404e-05, + "loss": 0.9015726089477539, + "mean_token_accuracy": 0.7970604464411736, + "num_tokens": 25287466.0, + "step": 6300 + }, + { + "entropy": 0.9179269846528768, + "epoch": 2.4148488327592803, + "grad_norm": 0.1013285368680954, + "learning_rate": 4.109037196186384e-05, + "loss": 0.9629206657409668, + "mean_token_accuracy": 0.7756785362958908, + "num_tokens": 25325488.0, + "step": 6310 + }, + { + "entropy": 0.8627055402845144, + "epoch": 2.418675851511672, + "grad_norm": 0.09085863828659058, + "learning_rate": 4.082180743923728e-05, + "loss": 0.8825644493103028, + "mean_token_accuracy": 0.7927587017416954, + "num_tokens": 25362470.0, + "step": 6320 + }, + { + "entropy": 0.8909512132406234, + "epoch": 2.422502870264064, + "grad_norm": 0.12609654664993286, + "learning_rate": 4.055324291661072e-05, + "loss": 0.9005517959594727, + "mean_token_accuracy": 0.784729179739952, + "num_tokens": 25405399.0, + "step": 6330 + }, + { + "entropy": 0.8371693149209023, + "epoch": 2.426329889016456, + "grad_norm": 0.09511356055736542, + "learning_rate": 4.028467839398415e-05, + "loss": 0.8819235801696778, + "mean_token_accuracy": 0.7933985084295273, + "num_tokens": 25443537.0, + "step": 6340 + }, + { + "entropy": 0.8452706336975098, + "epoch": 2.430156907768848, + "grad_norm": 0.08440756797790527, + "learning_rate": 4.00161138713576e-05, + "loss": 0.9220956802368164, + "mean_token_accuracy": 0.791832709312439, + "num_tokens": 25482874.0, + "step": 6350 + }, + { + "entropy": 0.8533206440508365, + "epoch": 2.4339839265212397, + "grad_norm": 0.10529948770999908, + "learning_rate": 3.974754934873103e-05, + "loss": 0.8976041793823242, + "mean_token_accuracy": 0.7917203813791275, + "num_tokens": 25523091.0, + "step": 6360 + }, + { + "entropy": 0.8192368470132351, + "epoch": 2.4378109452736316, + "grad_norm": 0.08338342607021332, + "learning_rate": 3.947898482610447e-05, + "loss": 0.8657890319824219, + "mean_token_accuracy": 0.8002077579498291, + "num_tokens": 25566050.0, + "step": 6370 + }, + { + "entropy": 0.9303523369133473, + "epoch": 2.4416379640260235, + "grad_norm": 0.09010683745145798, + "learning_rate": 3.921042030347791e-05, + "loss": 0.9760264396667481, + "mean_token_accuracy": 0.7748634815216064, + "num_tokens": 25608936.0, + "step": 6380 + }, + { + "entropy": 0.7555282160639762, + "epoch": 2.4454649827784154, + "grad_norm": 0.11948851495981216, + "learning_rate": 3.894185578085135e-05, + "loss": 0.8005829811096191, + "mean_token_accuracy": 0.8136610746383667, + "num_tokens": 25647408.0, + "step": 6390 + }, + { + "entropy": 0.8959879912436008, + "epoch": 2.4492920015308073, + "grad_norm": 0.09189214557409286, + "learning_rate": 3.8673291258224794e-05, + "loss": 0.9070920944213867, + "mean_token_accuracy": 0.7838554188609124, + "num_tokens": 25690271.0, + "step": 6400 + }, + { + "entropy": 0.7601668298244476, + "epoch": 2.453119020283199, + "grad_norm": 0.11115460842847824, + "learning_rate": 3.840472673559823e-05, + "loss": 0.837701416015625, + "mean_token_accuracy": 0.8158529132604599, + "num_tokens": 25730098.0, + "step": 6410 + }, + { + "entropy": 0.9026189528405666, + "epoch": 2.456946039035591, + "grad_norm": 0.0951504036784172, + "learning_rate": 3.813616221297167e-05, + "loss": 0.9555998802185058, + "mean_token_accuracy": 0.7768774792551995, + "num_tokens": 25769649.0, + "step": 6420 + }, + { + "entropy": 0.8566267982125282, + "epoch": 2.460773057787983, + "grad_norm": 0.1477993279695511, + "learning_rate": 3.786759769034511e-05, + "loss": 0.901324462890625, + "mean_token_accuracy": 0.7918707326054573, + "num_tokens": 25805906.0, + "step": 6430 + }, + { + "entropy": 0.8576595298945904, + "epoch": 2.464600076540375, + "grad_norm": 0.08643563091754913, + "learning_rate": 3.759903316771854e-05, + "loss": 0.9027094841003418, + "mean_token_accuracy": 0.7925754263997078, + "num_tokens": 25847270.0, + "step": 6440 + }, + { + "entropy": 0.8848195761442185, + "epoch": 2.4684270952927667, + "grad_norm": 0.1148499846458435, + "learning_rate": 3.733046864509199e-05, + "loss": 0.9222222328186035, + "mean_token_accuracy": 0.7866752982139588, + "num_tokens": 25890454.0, + "step": 6450 + }, + { + "entropy": 0.8222585029900074, + "epoch": 2.4722541140451586, + "grad_norm": 0.1051439717411995, + "learning_rate": 3.706190412246542e-05, + "loss": 0.8674264907836914, + "mean_token_accuracy": 0.8014690011739731, + "num_tokens": 25927176.0, + "step": 6460 + }, + { + "entropy": 0.7895723138004541, + "epoch": 2.4760811327975505, + "grad_norm": 0.08904940634965897, + "learning_rate": 3.679333959983886e-05, + "loss": 0.8720718383789062, + "mean_token_accuracy": 0.8032544136047364, + "num_tokens": 25969008.0, + "step": 6470 + }, + { + "entropy": 0.8449521534144878, + "epoch": 2.4799081515499424, + "grad_norm": 0.09109736979007721, + "learning_rate": 3.65247750772123e-05, + "loss": 0.8994977951049805, + "mean_token_accuracy": 0.7939551532268524, + "num_tokens": 26008671.0, + "step": 6480 + }, + { + "entropy": 0.8769714809954167, + "epoch": 2.4837351703023343, + "grad_norm": 0.09221527725458145, + "learning_rate": 3.625621055458574e-05, + "loss": 0.9647493362426758, + "mean_token_accuracy": 0.7877351269125938, + "num_tokens": 26047583.0, + "step": 6490 + }, + { + "entropy": 0.840660959109664, + "epoch": 2.487562189054726, + "grad_norm": 0.0888860896229744, + "learning_rate": 3.598764603195918e-05, + "loss": 0.872824764251709, + "mean_token_accuracy": 0.7932088255882264, + "num_tokens": 26090690.0, + "step": 6500 + }, + { + "entropy": 0.9435165245085955, + "epoch": 2.491389207807118, + "grad_norm": 0.10055243968963623, + "learning_rate": 3.571908150933262e-05, + "loss": 1.008607769012451, + "mean_token_accuracy": 0.7684792190790176, + "num_tokens": 26134620.0, + "step": 6510 + }, + { + "entropy": 0.9596942149102687, + "epoch": 2.49521622655951, + "grad_norm": 0.11321604251861572, + "learning_rate": 3.545051698670606e-05, + "loss": 1.021597957611084, + "mean_token_accuracy": 0.7706323087215423, + "num_tokens": 26176850.0, + "step": 6520 + }, + { + "entropy": 0.9805667255073786, + "epoch": 2.499043245311902, + "grad_norm": 0.13084010779857635, + "learning_rate": 3.51819524640795e-05, + "loss": 1.0418537139892579, + "mean_token_accuracy": 0.763472905755043, + "num_tokens": 26220943.0, + "step": 6530 + }, + { + "entropy": 0.9104986634105444, + "epoch": 2.5028702640642937, + "grad_norm": 0.09176472574472427, + "learning_rate": 3.491338794145294e-05, + "loss": 0.972693920135498, + "mean_token_accuracy": 0.7809211134910583, + "num_tokens": 26262084.0, + "step": 6540 + }, + { + "entropy": 0.8316202580928802, + "epoch": 2.5066972828166856, + "grad_norm": 0.11009900271892548, + "learning_rate": 3.464482341882637e-05, + "loss": 0.8581557273864746, + "mean_token_accuracy": 0.7978575736284256, + "num_tokens": 26302790.0, + "step": 6550 + }, + { + "entropy": 0.9041007287800312, + "epoch": 2.5105243015690775, + "grad_norm": 0.12103740125894547, + "learning_rate": 3.437625889619981e-05, + "loss": 0.9546697616577149, + "mean_token_accuracy": 0.7800753250718117, + "num_tokens": 26347959.0, + "step": 6560 + }, + { + "entropy": 0.8139931574463845, + "epoch": 2.5143513203214694, + "grad_norm": 0.08679619431495667, + "learning_rate": 3.410769437357325e-05, + "loss": 0.8982272148132324, + "mean_token_accuracy": 0.8002956256270408, + "num_tokens": 26388946.0, + "step": 6570 + }, + { + "entropy": 0.838017127290368, + "epoch": 2.5181783390738612, + "grad_norm": 0.12066033482551575, + "learning_rate": 3.383912985094669e-05, + "loss": 0.8589006423950195, + "mean_token_accuracy": 0.7943052783608436, + "num_tokens": 26431191.0, + "step": 6580 + }, + { + "entropy": 0.8299121838063002, + "epoch": 2.522005357826253, + "grad_norm": 0.08988375216722488, + "learning_rate": 3.357056532832013e-05, + "loss": 0.9106943130493164, + "mean_token_accuracy": 0.7972570925951004, + "num_tokens": 26468346.0, + "step": 6590 + }, + { + "entropy": 1.0362544253468513, + "epoch": 2.525832376578645, + "grad_norm": 0.10034547746181488, + "learning_rate": 3.3302000805693566e-05, + "loss": 1.0991132736206055, + "mean_token_accuracy": 0.7502188056707382, + "num_tokens": 26508029.0, + "step": 6600 + }, + { + "entropy": 0.9098232574760914, + "epoch": 2.529659395331037, + "grad_norm": 0.12513861060142517, + "learning_rate": 3.303343628306701e-05, + "loss": 0.9807866096496582, + "mean_token_accuracy": 0.7815383434295654, + "num_tokens": 26549321.0, + "step": 6610 + }, + { + "entropy": 0.8234303712844848, + "epoch": 2.533486414083429, + "grad_norm": 0.08378947526216507, + "learning_rate": 3.2764871760440446e-05, + "loss": 0.8650754928588867, + "mean_token_accuracy": 0.7995569303631782, + "num_tokens": 26589472.0, + "step": 6620 + }, + { + "entropy": 0.769949347153306, + "epoch": 2.5373134328358207, + "grad_norm": 0.12056911736726761, + "learning_rate": 3.249630723781389e-05, + "loss": 0.8480927467346191, + "mean_token_accuracy": 0.818176555633545, + "num_tokens": 26627566.0, + "step": 6630 + }, + { + "entropy": 0.8099306054413319, + "epoch": 2.5411404515882126, + "grad_norm": 0.09869939833879471, + "learning_rate": 3.222774271518733e-05, + "loss": 0.8649662017822266, + "mean_token_accuracy": 0.7981634557247161, + "num_tokens": 26662566.0, + "step": 6640 + }, + { + "entropy": 0.8528701025992632, + "epoch": 2.5449674703406044, + "grad_norm": 0.10336704552173615, + "learning_rate": 3.195917819256076e-05, + "loss": 0.9127251625061035, + "mean_token_accuracy": 0.7928516089916229, + "num_tokens": 26705768.0, + "step": 6650 + }, + { + "entropy": 0.8498493686318398, + "epoch": 2.5487944890929963, + "grad_norm": 0.10704471170902252, + "learning_rate": 3.169061366993421e-05, + "loss": 0.863565731048584, + "mean_token_accuracy": 0.7932710304856301, + "num_tokens": 26743574.0, + "step": 6660 + }, + { + "entropy": 0.8566017836332321, + "epoch": 2.552621507845388, + "grad_norm": 0.12135261297225952, + "learning_rate": 3.142204914730764e-05, + "loss": 0.9187004089355468, + "mean_token_accuracy": 0.7913481816649437, + "num_tokens": 26784127.0, + "step": 6670 + }, + { + "entropy": 0.8302055161446333, + "epoch": 2.55644852659778, + "grad_norm": 0.1430647373199463, + "learning_rate": 3.115348462468108e-05, + "loss": 0.8857596397399903, + "mean_token_accuracy": 0.7965412393212319, + "num_tokens": 26823189.0, + "step": 6680 + }, + { + "entropy": 0.8327139757573605, + "epoch": 2.560275545350172, + "grad_norm": 0.09538804739713669, + "learning_rate": 3.088492010205452e-05, + "loss": 0.9255412101745606, + "mean_token_accuracy": 0.7939359977841377, + "num_tokens": 26861599.0, + "step": 6690 + }, + { + "entropy": 0.8530606523156166, + "epoch": 2.564102564102564, + "grad_norm": 0.09193538129329681, + "learning_rate": 3.0616355579427955e-05, + "loss": 0.9151040077209472, + "mean_token_accuracy": 0.7901859179139137, + "num_tokens": 26901064.0, + "step": 6700 + }, + { + "entropy": 0.794033832848072, + "epoch": 2.5679295828549558, + "grad_norm": 0.1283407062292099, + "learning_rate": 3.03477910568014e-05, + "loss": 0.8441056251525879, + "mean_token_accuracy": 0.8033816903829575, + "num_tokens": 26942161.0, + "step": 6710 + }, + { + "entropy": 0.9340717010200024, + "epoch": 2.5717566016073476, + "grad_norm": 0.09237734973430634, + "learning_rate": 3.0079226534174836e-05, + "loss": 0.9747485160827637, + "mean_token_accuracy": 0.7732965379953385, + "num_tokens": 26982759.0, + "step": 6720 + }, + { + "entropy": 0.8746799558401108, + "epoch": 2.5755836203597395, + "grad_norm": 0.1391710638999939, + "learning_rate": 2.9810662011548273e-05, + "loss": 0.9311764717102051, + "mean_token_accuracy": 0.7883311554789543, + "num_tokens": 27022926.0, + "step": 6730 + }, + { + "entropy": 0.8290158938616514, + "epoch": 2.5794106391121314, + "grad_norm": 0.10442391782999039, + "learning_rate": 2.9542097488921716e-05, + "loss": 0.8346040725708008, + "mean_token_accuracy": 0.7985544398427009, + "num_tokens": 27065028.0, + "step": 6740 + }, + { + "entropy": 0.8574424415826798, + "epoch": 2.5832376578645233, + "grad_norm": 0.13001689314842224, + "learning_rate": 2.9273532966295153e-05, + "loss": 0.906099510192871, + "mean_token_accuracy": 0.790866918861866, + "num_tokens": 27100867.0, + "step": 6750 + }, + { + "entropy": 0.840974472463131, + "epoch": 2.587064676616915, + "grad_norm": 0.1224556565284729, + "learning_rate": 2.9004968443668594e-05, + "loss": 0.8969048500061035, + "mean_token_accuracy": 0.7953185483813285, + "num_tokens": 27137338.0, + "step": 6760 + }, + { + "entropy": 0.8477607406675816, + "epoch": 2.590891695369307, + "grad_norm": 0.09641005098819733, + "learning_rate": 2.873640392104203e-05, + "loss": 0.9569526672363281, + "mean_token_accuracy": 0.7941199511289596, + "num_tokens": 27178308.0, + "step": 6770 + }, + { + "entropy": 0.8317056275904179, + "epoch": 2.594718714121699, + "grad_norm": 0.11853990703821182, + "learning_rate": 2.8467839398415468e-05, + "loss": 0.9125295639038086, + "mean_token_accuracy": 0.7960822626948356, + "num_tokens": 27216898.0, + "step": 6780 + }, + { + "entropy": 0.8558823302388191, + "epoch": 2.598545732874091, + "grad_norm": 0.10477570444345474, + "learning_rate": 2.819927487578891e-05, + "loss": 0.8844131469726563, + "mean_token_accuracy": 0.7940610870718956, + "num_tokens": 27254443.0, + "step": 6790 + }, + { + "entropy": 0.8210954669862985, + "epoch": 2.6023727516264827, + "grad_norm": 0.14100609719753265, + "learning_rate": 2.7930710353162348e-05, + "loss": 0.8684535980224609, + "mean_token_accuracy": 0.7988820597529411, + "num_tokens": 27290079.0, + "step": 6800 + }, + { + "entropy": 0.8657392464578152, + "epoch": 2.6061997703788746, + "grad_norm": 0.09813658148050308, + "learning_rate": 2.7662145830535785e-05, + "loss": 0.9158803939819335, + "mean_token_accuracy": 0.7908033922314643, + "num_tokens": 27328190.0, + "step": 6810 + }, + { + "entropy": 0.8866597019135952, + "epoch": 2.6100267891312665, + "grad_norm": 0.11115613579750061, + "learning_rate": 2.739358130790923e-05, + "loss": 0.9120420455932617, + "mean_token_accuracy": 0.7854148596525192, + "num_tokens": 27369945.0, + "step": 6820 + }, + { + "entropy": 0.7982962183654309, + "epoch": 2.6138538078836584, + "grad_norm": 0.1377696692943573, + "learning_rate": 2.7125016785282666e-05, + "loss": 0.8332090377807617, + "mean_token_accuracy": 0.8022376418113708, + "num_tokens": 27406302.0, + "step": 6830 + }, + { + "entropy": 0.8424798093736172, + "epoch": 2.6176808266360503, + "grad_norm": 0.11442425101995468, + "learning_rate": 2.6856452262656106e-05, + "loss": 0.8876424789428711, + "mean_token_accuracy": 0.7893706291913987, + "num_tokens": 27449733.0, + "step": 6840 + }, + { + "entropy": 0.9239407800137996, + "epoch": 2.621507845388442, + "grad_norm": 0.0799759030342102, + "learning_rate": 2.6587887740029543e-05, + "loss": 0.9658034324645997, + "mean_token_accuracy": 0.7757296651601792, + "num_tokens": 27492884.0, + "step": 6850 + }, + { + "entropy": 0.8720928959548473, + "epoch": 2.625334864140834, + "grad_norm": 0.11632338911294937, + "learning_rate": 2.631932321740298e-05, + "loss": 0.9089359283447266, + "mean_token_accuracy": 0.7913818553090095, + "num_tokens": 27531878.0, + "step": 6860 + }, + { + "entropy": 0.9302754916250706, + "epoch": 2.629161882893226, + "grad_norm": 0.11215951293706894, + "learning_rate": 2.6050758694776423e-05, + "loss": 1.0027677536010742, + "mean_token_accuracy": 0.7739164605736732, + "num_tokens": 27567970.0, + "step": 6870 + }, + { + "entropy": 0.9016003269702196, + "epoch": 2.632988901645618, + "grad_norm": 0.11951353400945663, + "learning_rate": 2.578219417214986e-05, + "loss": 0.9493217468261719, + "mean_token_accuracy": 0.7779877439141274, + "num_tokens": 27609840.0, + "step": 6880 + }, + { + "entropy": 0.8870487026870251, + "epoch": 2.6368159203980097, + "grad_norm": 0.1124744564294815, + "learning_rate": 2.55136296495233e-05, + "loss": 1.0031387329101562, + "mean_token_accuracy": 0.7866110280156136, + "num_tokens": 27649655.0, + "step": 6890 + }, + { + "entropy": 0.9296976864337921, + "epoch": 2.6406429391504016, + "grad_norm": 0.1161704882979393, + "learning_rate": 2.5245065126896738e-05, + "loss": 1.012251853942871, + "mean_token_accuracy": 0.7726465791463852, + "num_tokens": 27694105.0, + "step": 6900 + }, + { + "entropy": 0.8415393102914095, + "epoch": 2.6444699579027935, + "grad_norm": 0.0987096056342125, + "learning_rate": 2.4976500604270178e-05, + "loss": 0.9147520065307617, + "mean_token_accuracy": 0.7973951831459999, + "num_tokens": 27730663.0, + "step": 6910 + }, + { + "entropy": 0.8274203538894653, + "epoch": 2.6482969766551854, + "grad_norm": 0.1101188212633133, + "learning_rate": 2.4707936081643615e-05, + "loss": 0.8873770713806153, + "mean_token_accuracy": 0.7974746853113175, + "num_tokens": 27772881.0, + "step": 6920 + }, + { + "entropy": 0.7984559834003448, + "epoch": 2.6521239954075773, + "grad_norm": 0.10185439884662628, + "learning_rate": 2.4439371559017055e-05, + "loss": 0.8775921821594238, + "mean_token_accuracy": 0.807880648970604, + "num_tokens": 27809534.0, + "step": 6930 + }, + { + "entropy": 0.887981615960598, + "epoch": 2.655951014159969, + "grad_norm": 0.08309295773506165, + "learning_rate": 2.4170807036390495e-05, + "loss": 0.9466443061828613, + "mean_token_accuracy": 0.7859978228807449, + "num_tokens": 27852591.0, + "step": 6940 + }, + { + "entropy": 0.9378888584673405, + "epoch": 2.659778032912361, + "grad_norm": 0.136076882481575, + "learning_rate": 2.3902242513763932e-05, + "loss": 1.0269956588745117, + "mean_token_accuracy": 0.7709244459867477, + "num_tokens": 27892120.0, + "step": 6950 + }, + { + "entropy": 0.9220107842236758, + "epoch": 2.663605051664753, + "grad_norm": 0.08248933404684067, + "learning_rate": 2.363367799113737e-05, + "loss": 0.9726594924926758, + "mean_token_accuracy": 0.7753236919641495, + "num_tokens": 27935380.0, + "step": 6960 + }, + { + "entropy": 0.7793348811566829, + "epoch": 2.667432070417145, + "grad_norm": 0.08308061957359314, + "learning_rate": 2.336511346851081e-05, + "loss": 0.7947993278503418, + "mean_token_accuracy": 0.8088447406888009, + "num_tokens": 27973020.0, + "step": 6970 + }, + { + "entropy": 0.9587450519204139, + "epoch": 2.6712590891695367, + "grad_norm": 0.10263237357139587, + "learning_rate": 2.309654894588425e-05, + "loss": 0.9791707038879395, + "mean_token_accuracy": 0.7663016110658646, + "num_tokens": 28016389.0, + "step": 6980 + }, + { + "entropy": 0.8766636185348033, + "epoch": 2.6750861079219286, + "grad_norm": 0.09917714446783066, + "learning_rate": 2.282798442325769e-05, + "loss": 0.9187355041503906, + "mean_token_accuracy": 0.7864622801542283, + "num_tokens": 28058100.0, + "step": 6990 + }, + { + "entropy": 0.8623256701976061, + "epoch": 2.6789131266743205, + "grad_norm": 0.08802894502878189, + "learning_rate": 2.255941990063113e-05, + "loss": 0.9108509063720703, + "mean_token_accuracy": 0.7891170993447304, + "num_tokens": 28095166.0, + "step": 7000 + }, + { + "entropy": 0.919238954409957, + "epoch": 2.6827401454267124, + "grad_norm": 0.11916540563106537, + "learning_rate": 2.2290855378004567e-05, + "loss": 0.9972674369812011, + "mean_token_accuracy": 0.7765705808997154, + "num_tokens": 28137533.0, + "step": 7010 + }, + { + "entropy": 0.918128065392375, + "epoch": 2.6865671641791042, + "grad_norm": 0.09536208212375641, + "learning_rate": 2.2022290855378004e-05, + "loss": 0.9865476608276367, + "mean_token_accuracy": 0.7736267536878586, + "num_tokens": 28179301.0, + "step": 7020 + }, + { + "entropy": 0.8265572734177112, + "epoch": 2.690394182931496, + "grad_norm": 0.09432680904865265, + "learning_rate": 2.1753726332751444e-05, + "loss": 0.8995939254760742, + "mean_token_accuracy": 0.7947996065020562, + "num_tokens": 28223849.0, + "step": 7030 + }, + { + "entropy": 0.8321899034082889, + "epoch": 2.694221201683888, + "grad_norm": 0.1223755031824112, + "learning_rate": 2.1485161810124885e-05, + "loss": 0.9003139495849609, + "mean_token_accuracy": 0.7975824415683747, + "num_tokens": 28268485.0, + "step": 7040 + }, + { + "entropy": 0.9064472205936909, + "epoch": 2.69804822043628, + "grad_norm": 0.13409113883972168, + "learning_rate": 2.121659728749832e-05, + "loss": 0.9323970794677734, + "mean_token_accuracy": 0.7808707699179649, + "num_tokens": 28307792.0, + "step": 7050 + }, + { + "entropy": 0.9527742668986321, + "epoch": 2.701875239188672, + "grad_norm": 0.09863030910491943, + "learning_rate": 2.0948032764871762e-05, + "loss": 1.0056820869445802, + "mean_token_accuracy": 0.7673134744167328, + "num_tokens": 28355447.0, + "step": 7060 + }, + { + "entropy": 0.8202732041478157, + "epoch": 2.7057022579410637, + "grad_norm": 0.10251973569393158, + "learning_rate": 2.0679468242245202e-05, + "loss": 0.8743599891662598, + "mean_token_accuracy": 0.7957186102867126, + "num_tokens": 28397195.0, + "step": 7070 + }, + { + "entropy": 0.9328485410660505, + "epoch": 2.7095292766934556, + "grad_norm": 0.09044504910707474, + "learning_rate": 2.041090371961864e-05, + "loss": 0.9707870483398438, + "mean_token_accuracy": 0.7739486545324326, + "num_tokens": 28440070.0, + "step": 7080 + }, + { + "entropy": 0.9110265091061592, + "epoch": 2.7133562954458474, + "grad_norm": 0.10417858511209488, + "learning_rate": 2.0142339196992076e-05, + "loss": 0.9495024681091309, + "mean_token_accuracy": 0.7784481555223465, + "num_tokens": 28483039.0, + "step": 7090 + }, + { + "entropy": 0.907703897356987, + "epoch": 2.7171833141982393, + "grad_norm": 0.10365665704011917, + "learning_rate": 1.9873774674365516e-05, + "loss": 0.9539920806884765, + "mean_token_accuracy": 0.7803053423762322, + "num_tokens": 28524922.0, + "step": 7100 + }, + { + "entropy": 0.8090648584067821, + "epoch": 2.721010332950631, + "grad_norm": 0.13015250861644745, + "learning_rate": 1.9605210151738957e-05, + "loss": 0.8559967994689941, + "mean_token_accuracy": 0.7999090999364853, + "num_tokens": 28565638.0, + "step": 7110 + }, + { + "entropy": 0.832624789327383, + "epoch": 2.724837351703023, + "grad_norm": 0.12992241978645325, + "learning_rate": 1.9336645629112397e-05, + "loss": 0.886108112335205, + "mean_token_accuracy": 0.7986625626683235, + "num_tokens": 28603666.0, + "step": 7120 + }, + { + "entropy": 0.8167526118457318, + "epoch": 2.728664370455415, + "grad_norm": 0.0879233330488205, + "learning_rate": 1.9068081106485834e-05, + "loss": 0.8744274139404297, + "mean_token_accuracy": 0.8013173520565033, + "num_tokens": 28647331.0, + "step": 7130 + }, + { + "entropy": 0.8693740144371986, + "epoch": 2.732491389207807, + "grad_norm": 0.11505398899316788, + "learning_rate": 1.879951658385927e-05, + "loss": 0.9142866134643555, + "mean_token_accuracy": 0.7936322972178459, + "num_tokens": 28683073.0, + "step": 7140 + }, + { + "entropy": 0.7896613411605358, + "epoch": 2.7363184079601988, + "grad_norm": 0.10490158945322037, + "learning_rate": 1.853095206123271e-05, + "loss": 0.8762624740600586, + "mean_token_accuracy": 0.8044975116848946, + "num_tokens": 28722340.0, + "step": 7150 + }, + { + "entropy": 0.8261051677167416, + "epoch": 2.7401454267125906, + "grad_norm": 0.10280875116586685, + "learning_rate": 1.826238753860615e-05, + "loss": 0.888590145111084, + "mean_token_accuracy": 0.7989666223526001, + "num_tokens": 28757940.0, + "step": 7160 + }, + { + "entropy": 0.8630577899515629, + "epoch": 2.7439724454649825, + "grad_norm": 0.12757791578769684, + "learning_rate": 1.799382301597959e-05, + "loss": 0.9082697868347168, + "mean_token_accuracy": 0.7890564352273941, + "num_tokens": 28796985.0, + "step": 7170 + }, + { + "entropy": 0.8979216992855072, + "epoch": 2.7477994642173744, + "grad_norm": 0.13048897683620453, + "learning_rate": 1.772525849335303e-05, + "loss": 0.9468406677246094, + "mean_token_accuracy": 0.7829687342047691, + "num_tokens": 28838091.0, + "step": 7180 + }, + { + "entropy": 0.9002114910632372, + "epoch": 2.7516264829697663, + "grad_norm": 0.130500927567482, + "learning_rate": 1.745669397072647e-05, + "loss": 0.9897032737731933, + "mean_token_accuracy": 0.7817048847675323, + "num_tokens": 28879084.0, + "step": 7190 + }, + { + "entropy": 0.861878028512001, + "epoch": 2.755453501722158, + "grad_norm": 0.10523588210344315, + "learning_rate": 1.7188129448099906e-05, + "loss": 0.9628341674804688, + "mean_token_accuracy": 0.7882343173027039, + "num_tokens": 28918018.0, + "step": 7200 + }, + { + "entropy": 0.7814029835164547, + "epoch": 2.75928052047455, + "grad_norm": 0.14345957338809967, + "learning_rate": 1.6919564925473346e-05, + "loss": 0.8377615928649902, + "mean_token_accuracy": 0.8100636526942253, + "num_tokens": 28953674.0, + "step": 7210 + }, + { + "entropy": 0.8798072785139084, + "epoch": 2.763107539226942, + "grad_norm": 0.10911094397306442, + "learning_rate": 1.6651000402846783e-05, + "loss": 0.9405971527099609, + "mean_token_accuracy": 0.7843327835202217, + "num_tokens": 28995212.0, + "step": 7220 + }, + { + "entropy": 0.7432700909674168, + "epoch": 2.766934557979334, + "grad_norm": 0.09271088242530823, + "learning_rate": 1.6382435880220223e-05, + "loss": 0.7987990856170655, + "mean_token_accuracy": 0.8185402989387512, + "num_tokens": 29034878.0, + "step": 7230 + }, + { + "entropy": 0.7937459200620651, + "epoch": 2.7707615767317257, + "grad_norm": 0.11122163385152817, + "learning_rate": 1.6113871357593664e-05, + "loss": 0.8469036102294922, + "mean_token_accuracy": 0.8031805381178856, + "num_tokens": 29074372.0, + "step": 7240 + }, + { + "entropy": 0.8456454008817673, + "epoch": 2.7745885954841176, + "grad_norm": 0.11189702153205872, + "learning_rate": 1.5845306834967104e-05, + "loss": 0.8942484855651855, + "mean_token_accuracy": 0.7923400938510895, + "num_tokens": 29117619.0, + "step": 7250 + }, + { + "entropy": 0.885396859049797, + "epoch": 2.7784156142365095, + "grad_norm": 0.10170719027519226, + "learning_rate": 1.557674231234054e-05, + "loss": 0.9175837516784668, + "mean_token_accuracy": 0.7860854491591454, + "num_tokens": 29156601.0, + "step": 7260 + }, + { + "entropy": 0.8742636401206255, + "epoch": 2.7822426329889014, + "grad_norm": 0.11130956560373306, + "learning_rate": 1.5308177789713978e-05, + "loss": 0.9322646141052247, + "mean_token_accuracy": 0.7902692511677742, + "num_tokens": 29200295.0, + "step": 7270 + }, + { + "entropy": 0.8523757141083479, + "epoch": 2.7860696517412933, + "grad_norm": 0.08611233532428741, + "learning_rate": 1.5039613267087418e-05, + "loss": 0.9210372924804687, + "mean_token_accuracy": 0.7912763133645058, + "num_tokens": 29235323.0, + "step": 7280 + }, + { + "entropy": 0.7804547689855099, + "epoch": 2.789896670493685, + "grad_norm": 0.08091949671506882, + "learning_rate": 1.4771048744460858e-05, + "loss": 0.8202395439147949, + "mean_token_accuracy": 0.8117679923772811, + "num_tokens": 29270182.0, + "step": 7290 + }, + { + "entropy": 0.8199648998677731, + "epoch": 2.793723689246077, + "grad_norm": 0.07486634701490402, + "learning_rate": 1.4502484221834297e-05, + "loss": 0.8396285057067872, + "mean_token_accuracy": 0.8032143607735633, + "num_tokens": 29311588.0, + "step": 7300 + }, + { + "entropy": 0.9650515951216221, + "epoch": 2.797550707998469, + "grad_norm": 0.10391585528850555, + "learning_rate": 1.4233919699207734e-05, + "loss": 1.047046184539795, + "mean_token_accuracy": 0.7648886650800705, + "num_tokens": 29353979.0, + "step": 7310 + }, + { + "entropy": 0.7674700990319252, + "epoch": 2.801377726750861, + "grad_norm": 0.09043332189321518, + "learning_rate": 1.3965355176581174e-05, + "loss": 0.8154891014099122, + "mean_token_accuracy": 0.8105725541710853, + "num_tokens": 29393298.0, + "step": 7320 + }, + { + "entropy": 0.7795201197266579, + "epoch": 2.8052047455032527, + "grad_norm": 0.14624197781085968, + "learning_rate": 1.3696790653954614e-05, + "loss": 0.7968831062316895, + "mean_token_accuracy": 0.808569261431694, + "num_tokens": 29423547.0, + "step": 7330 + }, + { + "entropy": 0.9187626458704472, + "epoch": 2.8090317642556446, + "grad_norm": 0.1368781179189682, + "learning_rate": 1.3428226131328053e-05, + "loss": 0.9583258628845215, + "mean_token_accuracy": 0.7731027945876121, + "num_tokens": 29465593.0, + "step": 7340 + }, + { + "entropy": 0.9403511643409729, + "epoch": 2.8128587830080365, + "grad_norm": 0.10892713069915771, + "learning_rate": 1.315966160870149e-05, + "loss": 0.9621626853942871, + "mean_token_accuracy": 0.767315211892128, + "num_tokens": 29506888.0, + "step": 7350 + }, + { + "entropy": 0.842640140466392, + "epoch": 2.8166858017604284, + "grad_norm": 0.08862321823835373, + "learning_rate": 1.289109708607493e-05, + "loss": 0.9031145095825195, + "mean_token_accuracy": 0.7967306047677993, + "num_tokens": 29550811.0, + "step": 7360 + }, + { + "entropy": 0.8931968793272972, + "epoch": 2.8205128205128203, + "grad_norm": 0.0979296937584877, + "learning_rate": 1.2622532563448369e-05, + "loss": 0.9369117736816406, + "mean_token_accuracy": 0.785995215177536, + "num_tokens": 29587036.0, + "step": 7370 + }, + { + "entropy": 0.8621913805603981, + "epoch": 2.824339839265212, + "grad_norm": 0.08778136223554611, + "learning_rate": 1.2353968040821807e-05, + "loss": 0.884724235534668, + "mean_token_accuracy": 0.790358729660511, + "num_tokens": 29627992.0, + "step": 7380 + }, + { + "entropy": 0.8695362661033869, + "epoch": 2.828166858017604, + "grad_norm": 0.09141552448272705, + "learning_rate": 1.2085403518195248e-05, + "loss": 0.9539263725280762, + "mean_token_accuracy": 0.78631162494421, + "num_tokens": 29668509.0, + "step": 7390 + }, + { + "entropy": 0.8454725466668606, + "epoch": 2.831993876769996, + "grad_norm": 0.10090988874435425, + "learning_rate": 1.1816838995568685e-05, + "loss": 0.9256816864013672, + "mean_token_accuracy": 0.7941092774271965, + "num_tokens": 29706794.0, + "step": 7400 + }, + { + "entropy": 0.8406473740935325, + "epoch": 2.835820895522388, + "grad_norm": 0.12991519272327423, + "learning_rate": 1.1548274472942125e-05, + "loss": 0.8969921112060547, + "mean_token_accuracy": 0.7950825378298759, + "num_tokens": 29745883.0, + "step": 7410 + }, + { + "entropy": 0.8951507560908795, + "epoch": 2.8396479142747797, + "grad_norm": 0.14208164811134338, + "learning_rate": 1.1279709950315565e-05, + "loss": 0.9443653106689454, + "mean_token_accuracy": 0.7820898026227951, + "num_tokens": 29788428.0, + "step": 7420 + }, + { + "entropy": 0.859702505543828, + "epoch": 2.8434749330271716, + "grad_norm": 0.10485101491212845, + "learning_rate": 1.1011145427689002e-05, + "loss": 0.9106481552124024, + "mean_token_accuracy": 0.7910059571266175, + "num_tokens": 29829552.0, + "step": 7430 + }, + { + "entropy": 0.838575328886509, + "epoch": 2.8473019517795635, + "grad_norm": 0.09105801582336426, + "learning_rate": 1.0742580905062442e-05, + "loss": 0.9367799758911133, + "mean_token_accuracy": 0.7953649654984474, + "num_tokens": 29869380.0, + "step": 7440 + }, + { + "entropy": 0.9112126015126705, + "epoch": 2.8511289705319554, + "grad_norm": 0.09724974632263184, + "learning_rate": 1.0474016382435881e-05, + "loss": 0.9621581077575684, + "mean_token_accuracy": 0.7795066565275193, + "num_tokens": 29913977.0, + "step": 7450 + }, + { + "entropy": 0.7964273016899824, + "epoch": 2.8549559892843472, + "grad_norm": 0.09481512755155563, + "learning_rate": 1.020545185980932e-05, + "loss": 0.8208577156066894, + "mean_token_accuracy": 0.8045729547739029, + "num_tokens": 29949229.0, + "step": 7460 + }, + { + "entropy": 0.9103045649826527, + "epoch": 2.858783008036739, + "grad_norm": 0.08678591996431351, + "learning_rate": 9.936887337182758e-06, + "loss": 0.9599167823791503, + "mean_token_accuracy": 0.7792657531797886, + "num_tokens": 29999070.0, + "step": 7470 + }, + { + "entropy": 0.8333844318985939, + "epoch": 2.862610026789131, + "grad_norm": 0.07823742181062698, + "learning_rate": 9.668322814556198e-06, + "loss": 0.8832645416259766, + "mean_token_accuracy": 0.7986885383725166, + "num_tokens": 30041797.0, + "step": 7480 + }, + { + "entropy": 0.8970901295542717, + "epoch": 2.866437045541523, + "grad_norm": 0.11852974444627762, + "learning_rate": 9.399758291929635e-06, + "loss": 0.9755334854125977, + "mean_token_accuracy": 0.7814395651221275, + "num_tokens": 30080534.0, + "step": 7490 + }, + { + "entropy": 0.8733609687536955, + "epoch": 2.870264064293915, + "grad_norm": 0.08307944238185883, + "learning_rate": 9.131193769303076e-06, + "loss": 0.9116435050964355, + "mean_token_accuracy": 0.786429825425148, + "num_tokens": 30123488.0, + "step": 7500 + }, + { + "entropy": 0.7967244807630778, + "epoch": 2.8740910830463067, + "grad_norm": 0.121941938996315, + "learning_rate": 8.862629246676514e-06, + "loss": 0.8209601402282715, + "mean_token_accuracy": 0.8040247783064842, + "num_tokens": 30158076.0, + "step": 7510 + }, + { + "entropy": 0.8655086796730757, + "epoch": 2.8779181017986986, + "grad_norm": 0.10017320513725281, + "learning_rate": 8.594064724049953e-06, + "loss": 0.9246477127075196, + "mean_token_accuracy": 0.7905631095170975, + "num_tokens": 30198329.0, + "step": 7520 + }, + { + "entropy": 0.7916971929371357, + "epoch": 2.8817451205510904, + "grad_norm": 0.08822990953922272, + "learning_rate": 8.325500201423391e-06, + "loss": 0.8695680618286132, + "mean_token_accuracy": 0.8063599601387977, + "num_tokens": 30239990.0, + "step": 7530 + }, + { + "entropy": 0.7693583916872739, + "epoch": 2.8855721393034823, + "grad_norm": 0.1178632378578186, + "learning_rate": 8.056935678796832e-06, + "loss": 0.8029808044433594, + "mean_token_accuracy": 0.808713173866272, + "num_tokens": 30272583.0, + "step": 7540 + }, + { + "entropy": 0.9072235215455293, + "epoch": 2.889399158055874, + "grad_norm": 0.11368006467819214, + "learning_rate": 7.78837115617027e-06, + "loss": 0.9859001159667968, + "mean_token_accuracy": 0.7825366839766502, + "num_tokens": 30314370.0, + "step": 7550 + }, + { + "entropy": 0.909162075817585, + "epoch": 2.893226176808266, + "grad_norm": 0.10643935948610306, + "learning_rate": 7.519806633543709e-06, + "loss": 0.9263824462890625, + "mean_token_accuracy": 0.7813168540596962, + "num_tokens": 30362103.0, + "step": 7560 + }, + { + "entropy": 0.8779693342745304, + "epoch": 2.897053195560658, + "grad_norm": 0.12511365115642548, + "learning_rate": 7.2512421109171484e-06, + "loss": 0.9283166885375976, + "mean_token_accuracy": 0.7876154363155365, + "num_tokens": 30400468.0, + "step": 7570 + }, + { + "entropy": 0.9308112382888794, + "epoch": 2.90088021431305, + "grad_norm": 0.08942066878080368, + "learning_rate": 6.982677588290587e-06, + "loss": 0.9894198417663574, + "mean_token_accuracy": 0.7739586725831031, + "num_tokens": 30444628.0, + "step": 7580 + }, + { + "entropy": 0.8830183774232865, + "epoch": 2.9047072330654418, + "grad_norm": 0.08949998021125793, + "learning_rate": 6.7141130656640265e-06, + "loss": 0.9515928268432617, + "mean_token_accuracy": 0.7846902176737786, + "num_tokens": 30485845.0, + "step": 7590 + }, + { + "entropy": 0.8058773010969162, + "epoch": 2.9085342518178336, + "grad_norm": 0.1035229042172432, + "learning_rate": 6.445548543037465e-06, + "loss": 0.846186637878418, + "mean_token_accuracy": 0.8066700398921967, + "num_tokens": 30523979.0, + "step": 7600 + }, + { + "entropy": 0.9146121144294739, + "epoch": 2.9123612705702255, + "grad_norm": 0.09379884600639343, + "learning_rate": 6.176984020410904e-06, + "loss": 0.9735233306884765, + "mean_token_accuracy": 0.7774886921048164, + "num_tokens": 30564775.0, + "step": 7610 + }, + { + "entropy": 0.8396586284041405, + "epoch": 2.9161882893226174, + "grad_norm": 0.11920839548110962, + "learning_rate": 5.908419497784342e-06, + "loss": 0.9061779022216797, + "mean_token_accuracy": 0.7974281132221221, + "num_tokens": 30609113.0, + "step": 7620 + }, + { + "entropy": 0.8665836162865161, + "epoch": 2.9200153080750093, + "grad_norm": 0.10214731842279434, + "learning_rate": 5.639854975157783e-06, + "loss": 0.9333956718444825, + "mean_token_accuracy": 0.7912585958838463, + "num_tokens": 30652409.0, + "step": 7630 + }, + { + "entropy": 0.8082432024180889, + "epoch": 2.923842326827401, + "grad_norm": 0.09191566705703735, + "learning_rate": 5.371290452531221e-06, + "loss": 0.8443769454956055, + "mean_token_accuracy": 0.797667445242405, + "num_tokens": 30689299.0, + "step": 7640 + }, + { + "entropy": 0.8395522754639387, + "epoch": 2.927669345579793, + "grad_norm": 0.08281564712524414, + "learning_rate": 5.10272592990466e-06, + "loss": 0.8710539817810059, + "mean_token_accuracy": 0.7973509266972542, + "num_tokens": 30724619.0, + "step": 7650 + }, + { + "entropy": 0.8130493897944688, + "epoch": 2.931496364332185, + "grad_norm": 0.0996284931898117, + "learning_rate": 4.834161407278099e-06, + "loss": 0.8514342308044434, + "mean_token_accuracy": 0.800888329744339, + "num_tokens": 30764224.0, + "step": 7660 + }, + { + "entropy": 0.7793916609138251, + "epoch": 2.935323383084577, + "grad_norm": 0.09503267705440521, + "learning_rate": 4.565596884651538e-06, + "loss": 0.8305204391479493, + "mean_token_accuracy": 0.8106261268258095, + "num_tokens": 30800800.0, + "step": 7670 + }, + { + "entropy": 0.817446855083108, + "epoch": 2.9391504018369687, + "grad_norm": 0.13637053966522217, + "learning_rate": 4.2970323620249764e-06, + "loss": 0.839473819732666, + "mean_token_accuracy": 0.8018909886479377, + "num_tokens": 30841481.0, + "step": 7680 + }, + { + "entropy": 0.8140060313045978, + "epoch": 2.9429774205893606, + "grad_norm": 0.13390128314495087, + "learning_rate": 4.028467839398416e-06, + "loss": 0.8653444290161133, + "mean_token_accuracy": 0.8000675857067108, + "num_tokens": 30880001.0, + "step": 7690 + }, + { + "entropy": 0.7898532018065453, + "epoch": 2.9468044393417525, + "grad_norm": 0.11585478484630585, + "learning_rate": 3.7599033167718545e-06, + "loss": 0.8074365615844726, + "mean_token_accuracy": 0.8053972944617271, + "num_tokens": 30915563.0, + "step": 7700 + }, + { + "entropy": 0.8091453645378351, + "epoch": 2.9506314580941444, + "grad_norm": 0.09755035489797592, + "learning_rate": 3.4913387941452935e-06, + "loss": 0.8457134246826172, + "mean_token_accuracy": 0.8031114682555198, + "num_tokens": 30955410.0, + "step": 7710 + }, + { + "entropy": 0.8444364190101623, + "epoch": 2.9544584768465363, + "grad_norm": 0.1297679990530014, + "learning_rate": 3.2227742715187325e-06, + "loss": 0.910922622680664, + "mean_token_accuracy": 0.7976488128304482, + "num_tokens": 30997246.0, + "step": 7720 + }, + { + "entropy": 0.8454434804618358, + "epoch": 2.958285495598928, + "grad_norm": 0.15091662108898163, + "learning_rate": 2.954209748892171e-06, + "loss": 0.8977128982543945, + "mean_token_accuracy": 0.7951600447297096, + "num_tokens": 31042192.0, + "step": 7730 + }, + { + "entropy": 0.838621474429965, + "epoch": 2.96211251435132, + "grad_norm": 0.10101021081209183, + "learning_rate": 2.6856452262656106e-06, + "loss": 0.9142851829528809, + "mean_token_accuracy": 0.7966463148593903, + "num_tokens": 31082777.0, + "step": 7740 + }, + { + "entropy": 0.8021124713122845, + "epoch": 2.965939533103712, + "grad_norm": 0.11373798549175262, + "learning_rate": 2.4170807036390496e-06, + "loss": 0.845030403137207, + "mean_token_accuracy": 0.8039181783795357, + "num_tokens": 31122973.0, + "step": 7750 + }, + { + "entropy": 0.8570070005953312, + "epoch": 2.969766551856104, + "grad_norm": 0.0995812863111496, + "learning_rate": 2.1485161810124882e-06, + "loss": 0.8876262664794922, + "mean_token_accuracy": 0.7908932328224182, + "num_tokens": 31166313.0, + "step": 7760 + }, + { + "entropy": 0.9019658699631691, + "epoch": 2.9735935706084957, + "grad_norm": 0.10546575486660004, + "learning_rate": 1.8799516583859272e-06, + "loss": 0.9777070999145507, + "mean_token_accuracy": 0.7821963891386986, + "num_tokens": 31202060.0, + "step": 7770 + }, + { + "entropy": 0.9346055820584297, + "epoch": 2.9774205893608876, + "grad_norm": 0.11632298678159714, + "learning_rate": 1.6113871357593663e-06, + "loss": 1.017040729522705, + "mean_token_accuracy": 0.7751505836844444, + "num_tokens": 31241536.0, + "step": 7780 + }, + { + "entropy": 0.8882534563541412, + "epoch": 2.9812476081132795, + "grad_norm": 0.13064302504062653, + "learning_rate": 1.3428226131328053e-06, + "loss": 0.9505605697631836, + "mean_token_accuracy": 0.7848831593990326, + "num_tokens": 31278060.0, + "step": 7790 + }, + { + "entropy": 0.8854026839137077, + "epoch": 2.9850746268656714, + "grad_norm": 0.0977831557393074, + "learning_rate": 1.0742580905062441e-06, + "loss": 0.9311306953430176, + "mean_token_accuracy": 0.7847100362181664, + "num_tokens": 31325802.0, + "step": 7800 + }, + { + "entropy": 0.9448695838451385, + "epoch": 2.9889016456180633, + "grad_norm": 0.11724492162466049, + "learning_rate": 8.056935678796831e-07, + "loss": 0.983949089050293, + "mean_token_accuracy": 0.7636413291096688, + "num_tokens": 31367954.0, + "step": 7810 + }, + { + "entropy": 0.8787743166089058, + "epoch": 2.992728664370455, + "grad_norm": 0.09530383348464966, + "learning_rate": 5.371290452531221e-07, + "loss": 0.9605165481567383, + "mean_token_accuracy": 0.7847816556692123, + "num_tokens": 31410151.0, + "step": 7820 + }, + { + "entropy": 0.810061177611351, + "epoch": 2.996555683122847, + "grad_norm": 0.09042539447546005, + "learning_rate": 2.6856452262656103e-07, + "loss": 0.8766719818115234, + "mean_token_accuracy": 0.8047587737441063, + "num_tokens": 31451314.0, + "step": 7830 } ], "logging_steps": 10, - "max_steps": 3114, + "max_steps": 7839, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -1063,12 +7852,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 2.1498282716531098e+18, + "total_flos": 1.17346463002948e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null