{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 7839,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.0787046998739243,
      "epoch": 0.003827018752391887,
      "grad_norm": 0.37200024724006653,
      "learning_rate": 4.591836734693878e-06,
      "loss": 2.590205955505371,
      "mean_token_accuracy": 0.5478626236319541,
      "num_tokens": 43996.0,
      "step": 10
    },
    {
      "entropy": 1.1275236845016479,
      "epoch": 0.007654037504783774,
      "grad_norm": 0.4282406270503998,
      "learning_rate": 9.693877551020408e-06,
      "loss": 2.732739067077637,
      "mean_token_accuracy": 0.532574575394392,
      "num_tokens": 84448.0,
      "step": 20
    },
    {
      "entropy": 1.1098140180110931,
      "epoch": 0.011481056257175661,
      "grad_norm": 0.45254817605018616,
      "learning_rate": 1.479591836734694e-05,
      "loss": 2.595915603637695,
      "mean_token_accuracy": 0.5385765254497528,
      "num_tokens": 127136.0,
      "step": 30
    },
    {
      "entropy": 1.1594089552760125,
      "epoch": 0.015308075009567547,
      "grad_norm": 0.26816287636756897,
      "learning_rate": 1.989795918367347e-05,
      "loss": 2.3643749237060545,
      "mean_token_accuracy": 0.5597088657319546,
      "num_tokens": 172549.0,
      "step": 40
    },
    {
      "entropy": 1.2817068248987198,
      "epoch": 0.019135093761959432,
      "grad_norm": 0.19904343783855438,
      "learning_rate": 2.5e-05,
      "loss": 2.1694852828979494,
      "mean_token_accuracy": 0.5605865910649299,
      "num_tokens": 218317.0,
      "step": 50
    },
    {
      "entropy": 1.356394973397255,
      "epoch": 0.022962112514351322,
      "grad_norm": 0.21708081662654877,
      "learning_rate": 3.0102040816326533e-05,
      "loss": 2.0824514389038087,
      "mean_token_accuracy": 0.584179612249136,
      "num_tokens": 255107.0,
      "step": 60
    },
    {
      "entropy": 1.4309053242206573,
      "epoch": 0.026789131266743208,
      "grad_norm": 0.09860006719827652,
      "learning_rate": 3.520408163265306e-05,
      "loss": 1.777943229675293,
      "mean_token_accuracy": 0.6243460461497307,
      "num_tokens": 298973.0,
      "step": 70
    },
    {
      "entropy": 1.4572882741689681,
      "epoch": 0.030616150019135095,
      "grad_norm": 0.07813975214958191,
      "learning_rate": 4.0306122448979596e-05,
      "loss": 1.7085393905639648,
      "mean_token_accuracy": 0.641383134573698,
      "num_tokens": 343287.0,
      "step": 80
    },
    {
      "entropy": 1.5205285474658012,
      "epoch": 0.03444316877152698,
      "grad_norm": 0.08015387505292892,
      "learning_rate": 4.5408163265306124e-05,
      "loss": 1.680305290222168,
      "mean_token_accuracy": 0.6431211873888969,
      "num_tokens": 376882.0,
      "step": 90
    },
    {
      "entropy": 1.529197846353054,
      "epoch": 0.038270187523918864,
      "grad_norm": 0.1533895879983902,
      "learning_rate": 5.051020408163265e-05,
      "loss": 1.538798999786377,
      "mean_token_accuracy": 0.6604955434799195,
      "num_tokens": 414084.0,
      "step": 100
    },
    {
      "entropy": 1.454574093222618,
      "epoch": 0.04209720627631076,
      "grad_norm": 0.08893708884716034,
      "learning_rate": 5.561224489795919e-05,
      "loss": 1.482753372192383,
      "mean_token_accuracy": 0.6748222857713699,
      "num_tokens": 451489.0,
      "step": 110
    },
    {
      "entropy": 1.4169663548469544,
      "epoch": 0.045924225028702644,
      "grad_norm": 0.10994797945022583,
      "learning_rate": 6.0714285714285715e-05,
      "loss": 1.4192767143249512,
      "mean_token_accuracy": 0.6815833821892738,
      "num_tokens": 492001.0,
      "step": 120
    },
    {
      "entropy": 1.3825553365051746,
      "epoch": 0.04975124378109453,
      "grad_norm": 0.09565065056085587,
      "learning_rate": 6.581632653061225e-05,
      "loss": 1.4434242248535156,
      "mean_token_accuracy": 0.6830388471484184,
      "num_tokens": 534229.0,
      "step": 130
    },
    {
      "entropy": 1.3660246580839157,
      "epoch": 0.053578262533486416,
      "grad_norm": 0.09277962148189545,
      "learning_rate": 7.091836734693877e-05,
      "loss": 1.3881919860839844,
      "mean_token_accuracy": 0.6916770502924919,
      "num_tokens": 573333.0,
      "step": 140
    },
    {
      "entropy": 1.3041961744427681,
      "epoch": 0.0574052812858783,
      "grad_norm": 0.14179331064224243,
      "learning_rate": 7.60204081632653e-05,
      "loss": 1.3051923751831054,
      "mean_token_accuracy": 0.7079917460680007,
      "num_tokens": 612198.0,
      "step": 150
    },
    {
      "entropy": 1.3157715648412704,
      "epoch": 0.06123230003827019,
      "grad_norm": 0.11061020940542221,
      "learning_rate": 8.112244897959184e-05,
      "loss": 1.3127019882202149,
      "mean_token_accuracy": 0.6986684441566468,
      "num_tokens": 654309.0,
      "step": 160
    },
    {
      "entropy": 1.2894421368837357,
      "epoch": 0.06505931879066207,
      "grad_norm": 0.12903185188770294,
      "learning_rate": 8.622448979591838e-05,
      "loss": 1.3279677391052247,
      "mean_token_accuracy": 0.7028318449854851,
      "num_tokens": 694076.0,
      "step": 170
    },
    {
      "entropy": 1.2676123276352882,
      "epoch": 0.06888633754305395,
      "grad_norm": 0.10816285014152527,
      "learning_rate": 9.13265306122449e-05,
      "loss": 1.3254461288452148,
      "mean_token_accuracy": 0.7063754379749299,
      "num_tokens": 733806.0,
      "step": 180
    },
    {
      "entropy": 1.0988808318972587,
      "epoch": 0.07271335629544584,
      "grad_norm": 0.08472651243209839,
      "learning_rate": 9.642857142857143e-05,
      "loss": 1.1510659217834474,
      "mean_token_accuracy": 0.7411173984408379,
      "num_tokens": 771622.0,
      "step": 190
    },
    {
      "entropy": 1.2284281507134438,
      "epoch": 0.07654037504783773,
      "grad_norm": 0.10897475481033325,
      "learning_rate": 0.00010153061224489797,
      "loss": 1.2724005699157714,
      "mean_token_accuracy": 0.7178638219833374,
      "num_tokens": 813167.0,
      "step": 200
    },
    {
      "entropy": 1.2159623876214027,
      "epoch": 0.08036739380022963,
      "grad_norm": 0.12170197069644928,
      "learning_rate": 0.0001066326530612245,
      "loss": 1.26397647857666,
      "mean_token_accuracy": 0.7138236090540886,
      "num_tokens": 856210.0,
      "step": 210
    },
    {
      "entropy": 1.2309471271932124,
      "epoch": 0.08419441255262151,
      "grad_norm": 0.08406181633472443,
      "learning_rate": 0.00011173469387755102,
      "loss": 1.3110918998718262,
      "mean_token_accuracy": 0.7173333883285522,
      "num_tokens": 893432.0,
      "step": 220
    },
    {
      "entropy": 1.2228039711713792,
      "epoch": 0.0880214313050134,
      "grad_norm": 0.10588081181049347,
      "learning_rate": 0.00011683673469387754,
      "loss": 1.2445635795593262,
      "mean_token_accuracy": 0.7160170584917068,
      "num_tokens": 931919.0,
      "step": 230
    },
    {
      "entropy": 1.138296764343977,
      "epoch": 0.09184845005740529,
      "grad_norm": 0.110760398209095,
      "learning_rate": 0.00012193877551020409,
      "loss": 1.2083134651184082,
      "mean_token_accuracy": 0.7306654810905456,
      "num_tokens": 976039.0,
      "step": 240
    },
    {
      "entropy": 1.1915819495916367,
      "epoch": 0.09567546880979717,
      "grad_norm": 0.15018120408058167,
      "learning_rate": 0.00012704081632653063,
      "loss": 1.2230369567871093,
      "mean_token_accuracy": 0.719833716750145,
      "num_tokens": 1019312.0,
      "step": 250
    },
    {
      "entropy": 1.2996815636754036,
      "epoch": 0.09950248756218906,
      "grad_norm": 0.10838313400745392,
      "learning_rate": 0.00013214285714285715,
      "loss": 1.3142367362976075,
      "mean_token_accuracy": 0.7013067752122879,
      "num_tokens": 1062901.0,
      "step": 260
    },
    {
      "entropy": 1.149668525904417,
      "epoch": 0.10332950631458095,
      "grad_norm": 0.09911312907934189,
      "learning_rate": 0.00013724489795918367,
      "loss": 1.1573083877563477,
      "mean_token_accuracy": 0.728775355219841,
      "num_tokens": 1102630.0,
      "step": 270
    },
    {
      "entropy": 1.271340447664261,
      "epoch": 0.10715652506697283,
      "grad_norm": 0.09665267169475555,
      "learning_rate": 0.00014234693877551022,
      "loss": 1.341374111175537,
      "mean_token_accuracy": 0.7027333110570908,
      "num_tokens": 1142262.0,
      "step": 280
    },
    {
      "entropy": 1.225122657418251,
      "epoch": 0.11098354381936472,
      "grad_norm": 0.13240815699100494,
      "learning_rate": 0.00014744897959183674,
      "loss": 1.2614737510681153,
      "mean_token_accuracy": 0.7198452442884445,
      "num_tokens": 1182386.0,
      "step": 290
    },
    {
      "entropy": 1.2733432039618493,
      "epoch": 0.1148105625717566,
      "grad_norm": 0.10651895403862,
      "learning_rate": 0.00015255102040816326,
      "loss": 1.2933347702026368,
      "mean_token_accuracy": 0.7025195896625519,
      "num_tokens": 1222805.0,
      "step": 300
    },
    {
      "entropy": 1.1543171763420106,
      "epoch": 0.11863758132414849,
      "grad_norm": 0.08577804267406464,
      "learning_rate": 0.00015765306122448978,
      "loss": 1.197078323364258,
      "mean_token_accuracy": 0.7300360783934593,
      "num_tokens": 1263121.0,
      "step": 310
    },
    {
      "entropy": 1.1908820882439612,
      "epoch": 0.12246460007654038,
      "grad_norm": 0.11925600469112396,
      "learning_rate": 0.00016275510204081633,
      "loss": 1.2366827964782714,
      "mean_token_accuracy": 0.7277334719896317,
      "num_tokens": 1296346.0,
      "step": 320
    },
    {
      "entropy": 1.1711702406406403,
      "epoch": 0.12629161882893225,
      "grad_norm": 0.12476309388875961,
      "learning_rate": 0.00016785714285714288,
      "loss": 1.2408350944519042,
      "mean_token_accuracy": 0.7282937213778495,
      "num_tokens": 1335547.0,
      "step": 330
    },
    {
      "entropy": 1.1748667433857918,
      "epoch": 0.13011863758132414,
      "grad_norm": 0.08671289682388306,
      "learning_rate": 0.0001729591836734694,
      "loss": 1.199030303955078,
      "mean_token_accuracy": 0.7312668621540069,
      "num_tokens": 1377093.0,
      "step": 340
    },
    {
      "entropy": 1.153764547407627,
      "epoch": 0.13394565633371602,
      "grad_norm": 0.10536976903676987,
      "learning_rate": 0.00017806122448979592,
      "loss": 1.201906967163086,
      "mean_token_accuracy": 0.7266524419188499,
      "num_tokens": 1417236.0,
      "step": 350
    },
    {
      "entropy": 1.2303058430552483,
      "epoch": 0.1377726750861079,
      "grad_norm": 0.09069176018238068,
      "learning_rate": 0.00018316326530612247,
      "loss": 1.2867681503295898,
      "mean_token_accuracy": 0.7198954582214355,
      "num_tokens": 1460306.0,
      "step": 360
    },
    {
      "entropy": 1.1944106668233871,
      "epoch": 0.1415996938384998,
      "grad_norm": 0.08539925515651703,
      "learning_rate": 0.000188265306122449,
      "loss": 1.2451179504394532,
      "mean_token_accuracy": 0.7186401098966598,
      "num_tokens": 1505966.0,
      "step": 370
    },
    {
      "entropy": 1.2239032357931137,
      "epoch": 0.14542671259089168,
      "grad_norm": 0.08434446156024933,
      "learning_rate": 0.0001933673469387755,
      "loss": 1.2803629875183105,
      "mean_token_accuracy": 0.7178368359804154,
      "num_tokens": 1544132.0,
      "step": 380
    },
    {
      "entropy": 1.1901779979467393,
      "epoch": 0.14925373134328357,
      "grad_norm": 0.08662886172533035,
      "learning_rate": 0.00019846938775510203,
      "loss": 1.2282370567321776,
      "mean_token_accuracy": 0.7221132159233093,
      "num_tokens": 1587355.0,
      "step": 390
    },
    {
      "entropy": 1.106569343805313,
      "epoch": 0.15308075009567546,
      "grad_norm": 0.13149450719356537,
      "learning_rate": 0.00019981200483416142,
      "loss": 1.136556625366211,
      "mean_token_accuracy": 0.7384186327457428,
      "num_tokens": 1624638.0,
      "step": 400
    },
    {
      "entropy": 1.0393452920019626,
      "epoch": 0.15690776884806737,
      "grad_norm": 0.13831719756126404,
      "learning_rate": 0.00019954344031153484,
      "loss": 1.074817180633545,
      "mean_token_accuracy": 0.7567356958985328,
      "num_tokens": 1665215.0,
      "step": 410
    },
    {
      "entropy": 1.1298492863774299,
      "epoch": 0.16073478760045926,
      "grad_norm": 0.10244159400463104,
      "learning_rate": 0.0001992748757889083,
      "loss": 1.1741769790649415,
      "mean_token_accuracy": 0.7414689466357232,
      "num_tokens": 1701543.0,
      "step": 420
    },
    {
      "entropy": 1.1646860882639885,
      "epoch": 0.16456180635285114,
      "grad_norm": 0.09356453269720078,
      "learning_rate": 0.00019900631126628174,
      "loss": 1.2229989051818848,
      "mean_token_accuracy": 0.7278152450919151,
      "num_tokens": 1744719.0,
      "step": 430
    },
    {
      "entropy": 1.1580850452184677,
      "epoch": 0.16838882510524303,
      "grad_norm": 0.08699047565460205,
      "learning_rate": 0.00019873774674365518,
      "loss": 1.1999470710754394,
      "mean_token_accuracy": 0.7270642057061195,
      "num_tokens": 1787999.0,
      "step": 440
    },
    {
      "entropy": 1.105088683962822,
      "epoch": 0.17221584385763491,
      "grad_norm": 0.10489863902330399,
      "learning_rate": 0.0001984691822210286,
      "loss": 1.123628807067871,
      "mean_token_accuracy": 0.7375599846243859,
      "num_tokens": 1825171.0,
      "step": 450
    },
    {
      "entropy": 1.0744315460324287,
      "epoch": 0.1760428626100268,
      "grad_norm": 0.10170256346464157,
      "learning_rate": 0.00019820061769840205,
      "loss": 1.1449885368347168,
      "mean_token_accuracy": 0.7466330513358116,
      "num_tokens": 1863245.0,
      "step": 460
    },
    {
      "entropy": 1.021899376064539,
      "epoch": 0.1798698813624187,
      "grad_norm": 0.09046658873558044,
      "learning_rate": 0.0001979320531757755,
      "loss": 1.0087275505065918,
      "mean_token_accuracy": 0.7654190301895142,
      "num_tokens": 1902205.0,
      "step": 470
    },
    {
      "entropy": 1.1595304682850838,
      "epoch": 0.18369690011481057,
      "grad_norm": 0.09361740201711655,
      "learning_rate": 0.00019766348865314892,
      "loss": 1.2238757133483886,
      "mean_token_accuracy": 0.7275362908840179,
      "num_tokens": 1943827.0,
      "step": 480
    },
    {
      "entropy": 1.1023890599608421,
      "epoch": 0.18752391886720246,
      "grad_norm": 0.08471602201461792,
      "learning_rate": 0.00019739492413052236,
      "loss": 1.1567111015319824,
      "mean_token_accuracy": 0.742167092859745,
      "num_tokens": 1982630.0,
      "step": 490
    },
    {
      "entropy": 1.1427962884306908,
      "epoch": 0.19135093761959435,
      "grad_norm": 0.1170063391327858,
      "learning_rate": 0.0001971263596078958,
      "loss": 1.208080005645752,
      "mean_token_accuracy": 0.7308674260973931,
      "num_tokens": 2021311.0,
      "step": 500
    },
    {
      "entropy": 1.0002792343497275,
      "epoch": 0.19517795637198623,
      "grad_norm": 0.10567828267812729,
      "learning_rate": 0.00019685779508526926,
      "loss": 1.026076889038086,
      "mean_token_accuracy": 0.7641370877623558,
      "num_tokens": 2055396.0,
      "step": 510
    },
    {
      "entropy": 1.1096693962812423,
      "epoch": 0.19900497512437812,
      "grad_norm": 0.08597096055746078,
      "learning_rate": 0.00019658923056264268,
      "loss": 1.1631651878356934,
      "mean_token_accuracy": 0.7389342650771141,
      "num_tokens": 2097931.0,
      "step": 520
    },
    {
      "entropy": 1.04201333373785,
      "epoch": 0.20283199387677,
      "grad_norm": 0.1260094940662384,
      "learning_rate": 0.00019632066604001613,
      "loss": 1.0765873908996582,
      "mean_token_accuracy": 0.7539548426866531,
      "num_tokens": 2137657.0,
      "step": 530
    },
    {
      "entropy": 1.0748623803257942,
      "epoch": 0.2066590126291619,
      "grad_norm": 0.0845552608370781,
      "learning_rate": 0.00019605210151738955,
      "loss": 1.1419748306274413,
      "mean_token_accuracy": 0.7481048628687859,
      "num_tokens": 2177343.0,
      "step": 540
    },
    {
      "entropy": 1.0970528617501258,
      "epoch": 0.21048603138155378,
      "grad_norm": 0.07105763256549835,
      "learning_rate": 0.000195783536994763,
      "loss": 1.1284755706787108,
      "mean_token_accuracy": 0.747196614742279,
      "num_tokens": 2211423.0,
      "step": 550
    },
    {
      "entropy": 1.0551751986145974,
      "epoch": 0.21431305013394567,
      "grad_norm": 0.12569685280323029,
      "learning_rate": 0.00019551497247213644,
      "loss": 1.1170102119445802,
      "mean_token_accuracy": 0.7472440049052238,
      "num_tokens": 2249350.0,
      "step": 560
    },
    {
      "entropy": 1.0562219873070717,
      "epoch": 0.21814006888633755,
      "grad_norm": 0.08452208340167999,
      "learning_rate": 0.0001952464079495099,
      "loss": 1.0921730995178223,
      "mean_token_accuracy": 0.7492805704474449,
      "num_tokens": 2289564.0,
      "step": 570
    },
    {
      "entropy": 1.022915106266737,
      "epoch": 0.22196708763872944,
      "grad_norm": 0.08168510347604752,
      "learning_rate": 0.00019497784342688333,
      "loss": 1.064980697631836,
      "mean_token_accuracy": 0.7562290355563164,
      "num_tokens": 2332592.0,
      "step": 580
    },
    {
      "entropy": 1.1041950330138206,
      "epoch": 0.22579410639112132,
      "grad_norm": 0.07596516609191895,
      "learning_rate": 0.00019470927890425675,
      "loss": 1.1410536766052246,
      "mean_token_accuracy": 0.7350378915667534,
      "num_tokens": 2380215.0,
      "step": 590
    },
    {
      "entropy": 1.1252056039869784,
      "epoch": 0.2296211251435132,
      "grad_norm": 0.07240597158670425,
      "learning_rate": 0.0001944407143816302,
      "loss": 1.1348044395446777,
      "mean_token_accuracy": 0.7389790266752243,
      "num_tokens": 2417819.0,
      "step": 600
    },
    {
      "entropy": 1.0117394506931305,
      "epoch": 0.2334481438959051,
      "grad_norm": 0.08603253215551376,
      "learning_rate": 0.00019417214985900362,
      "loss": 1.0533303260803222,
      "mean_token_accuracy": 0.7578112691640854,
      "num_tokens": 2459072.0,
      "step": 610
    },
    {
      "entropy": 1.0193642653524875,
      "epoch": 0.23727516264829698,
      "grad_norm": 0.08400722593069077,
      "learning_rate": 0.00019390358533637707,
      "loss": 1.0955985069274903,
      "mean_token_accuracy": 0.7598358646035195,
      "num_tokens": 2497901.0,
      "step": 620
    },
    {
      "entropy": 1.0488237984478475,
      "epoch": 0.24110218140068887,
      "grad_norm": 0.07221511751413345,
      "learning_rate": 0.00019363502081375052,
      "loss": 1.151495361328125,
      "mean_token_accuracy": 0.7541669681668282,
      "num_tokens": 2536530.0,
      "step": 630
    },
    {
      "entropy": 1.0595403373241425,
      "epoch": 0.24492920015308076,
      "grad_norm": 0.10258961468935013,
      "learning_rate": 0.00019336645629112396,
      "loss": 1.0888574600219727,
      "mean_token_accuracy": 0.7474928990006446,
      "num_tokens": 2571599.0,
      "step": 640
    },
    {
      "entropy": 1.0924376487731933,
      "epoch": 0.24875621890547264,
      "grad_norm": 0.0751282125711441,
      "learning_rate": 0.0001930978917684974,
      "loss": 1.1468082427978517,
      "mean_token_accuracy": 0.7465573191642761,
      "num_tokens": 2612401.0,
      "step": 650
    },
    {
      "entropy": 0.9765479557216168,
      "epoch": 0.2525832376578645,
      "grad_norm": 0.09039046615362167,
      "learning_rate": 0.00019282932724587083,
      "loss": 1.054959201812744,
      "mean_token_accuracy": 0.7649626806378365,
      "num_tokens": 2652042.0,
      "step": 660
    },
    {
      "entropy": 1.0833388939499855,
      "epoch": 0.2564102564102564,
      "grad_norm": 0.08905521035194397,
      "learning_rate": 0.00019256076272324425,
      "loss": 1.0937233924865724,
      "mean_token_accuracy": 0.7510278865694999,
      "num_tokens": 2692211.0,
      "step": 670
    },
    {
      "entropy": 1.0478161230683327,
      "epoch": 0.2602372751626483,
      "grad_norm": 0.09634676575660706,
      "learning_rate": 0.0001922921982006177,
      "loss": 1.1077991485595704,
      "mean_token_accuracy": 0.7490576148033142,
      "num_tokens": 2734562.0,
      "step": 680
    },
    {
      "entropy": 1.13061283826828,
      "epoch": 0.26406429391504016,
      "grad_norm": 0.07757367938756943,
      "learning_rate": 0.00019202363367799114,
      "loss": 1.1330499649047852,
      "mean_token_accuracy": 0.7350772902369499,
      "num_tokens": 2778990.0,
      "step": 690
    },
    {
      "entropy": 1.0450415380299092,
      "epoch": 0.26789131266743205,
      "grad_norm": 0.06570328027009964,
      "learning_rate": 0.0001917550691553646,
      "loss": 1.1425201416015625,
      "mean_token_accuracy": 0.7551864832639694,
      "num_tokens": 2814475.0,
      "step": 700
    },
    {
      "entropy": 1.0114438571035862,
      "epoch": 0.27171833141982393,
      "grad_norm": 0.11020322889089584,
      "learning_rate": 0.00019148650463273804,
      "loss": 1.0655290603637695,
      "mean_token_accuracy": 0.7622251763939858,
      "num_tokens": 2847751.0,
      "step": 710
    },
    {
      "entropy": 1.1008106037974357,
      "epoch": 0.2755453501722158,
      "grad_norm": 0.07282241433858871,
      "learning_rate": 0.00019121794011011146,
      "loss": 1.139061450958252,
      "mean_token_accuracy": 0.7368278667330742,
      "num_tokens": 2889553.0,
      "step": 720
    },
    {
      "entropy": 1.0155851803719997,
      "epoch": 0.2793723689246077,
      "grad_norm": 0.11799076199531555,
      "learning_rate": 0.0001909493755874849,
      "loss": 1.0634971618652345,
      "mean_token_accuracy": 0.7578275159001351,
      "num_tokens": 2926860.0,
      "step": 730
    },
    {
      "entropy": 1.0430648550391197,
      "epoch": 0.2831993876769996,
      "grad_norm": 0.08702066540718079,
      "learning_rate": 0.00019068081106485832,
      "loss": 1.0851760864257813,
      "mean_token_accuracy": 0.7573095709085464,
      "num_tokens": 2964029.0,
      "step": 740
    },
    {
      "entropy": 1.0908455178141594,
      "epoch": 0.2870264064293915,
      "grad_norm": 0.06593967229127884,
      "learning_rate": 0.00019041224654223177,
      "loss": 1.0929256439208985,
      "mean_token_accuracy": 0.7440102145075798,
      "num_tokens": 3004528.0,
      "step": 750
    },
    {
      "entropy": 0.971546346694231,
      "epoch": 0.29085342518178336,
      "grad_norm": 0.08857332915067673,
      "learning_rate": 0.00019014368201960522,
      "loss": 1.0575197219848633,
      "mean_token_accuracy": 0.7712547823786735,
      "num_tokens": 3041203.0,
      "step": 760
    },
    {
      "entropy": 1.0496620319783687,
      "epoch": 0.29468044393417525,
      "grad_norm": 0.07172030210494995,
      "learning_rate": 0.00018987511749697867,
      "loss": 1.100113582611084,
      "mean_token_accuracy": 0.747462597489357,
      "num_tokens": 3086263.0,
      "step": 770
    },
    {
      "entropy": 1.0853942684829234,
      "epoch": 0.29850746268656714,
      "grad_norm": 0.0861373096704483,
      "learning_rate": 0.0001896065529743521,
      "loss": 1.1116449356079101,
      "mean_token_accuracy": 0.7485424548387527,
      "num_tokens": 3126741.0,
      "step": 780
    },
    {
      "entropy": 1.039051755145192,
      "epoch": 0.302334481438959,
      "grad_norm": 0.07344193756580353,
      "learning_rate": 0.00018933798845172553,
      "loss": 1.092859935760498,
      "mean_token_accuracy": 0.7524166733026505,
      "num_tokens": 3164039.0,
      "step": 790
    },
    {
      "entropy": 1.021885236352682,
      "epoch": 0.3061615001913509,
      "grad_norm": 0.09843221306800842,
      "learning_rate": 0.00018906942392909895,
      "loss": 1.0813950538635253,
      "mean_token_accuracy": 0.7612547591328621,
      "num_tokens": 3202455.0,
      "step": 800
    },
    {
      "entropy": 1.0329275727272034,
      "epoch": 0.3099885189437428,
      "grad_norm": 0.07059452682733536,
      "learning_rate": 0.0001888008594064724,
      "loss": 1.0516587257385255,
      "mean_token_accuracy": 0.7528441205620766,
      "num_tokens": 3239911.0,
      "step": 810
    },
    {
      "entropy": 1.0202949695289134,
      "epoch": 0.31381553769613474,
      "grad_norm": 0.07269048690795898,
      "learning_rate": 0.00018853229488384585,
      "loss": 1.0879244804382324,
      "mean_token_accuracy": 0.7542849883437157,
      "num_tokens": 3278682.0,
      "step": 820
    },
    {
      "entropy": 1.0690899170935153,
      "epoch": 0.3176425564485266,
      "grad_norm": 0.14370054006576538,
      "learning_rate": 0.0001882637303612193,
      "loss": 1.1063778877258301,
      "mean_token_accuracy": 0.75286915153265,
      "num_tokens": 3325465.0,
      "step": 830
    },
    {
      "entropy": 1.0819261983036994,
      "epoch": 0.3214695752009185,
      "grad_norm": 0.0973975881934166,
      "learning_rate": 0.00018799516583859274,
      "loss": 1.0978353500366211,
      "mean_token_accuracy": 0.749411192536354,
      "num_tokens": 3363479.0,
      "step": 840
    },
    {
      "entropy": 1.0502549454569816,
      "epoch": 0.3252965939533104,
      "grad_norm": 0.11021706461906433,
      "learning_rate": 0.0001877266013159662,
      "loss": 1.1489330291748048,
      "mean_token_accuracy": 0.7476440489292144,
      "num_tokens": 3405863.0,
      "step": 850
    },
    {
      "entropy": 1.1556663788855075,
      "epoch": 0.3291236127057023,
      "grad_norm": 0.06459799408912659,
      "learning_rate": 0.0001874580367933396,
      "loss": 1.1840539932250977,
      "mean_token_accuracy": 0.7288095027208328,
      "num_tokens": 3450829.0,
      "step": 860
    },
    {
      "entropy": 1.097336183488369,
      "epoch": 0.33295063145809417,
      "grad_norm": 0.06765513867139816,
      "learning_rate": 0.00018718947227071303,
      "loss": 1.1286226272583009,
      "mean_token_accuracy": 0.7439226225018502,
      "num_tokens": 3490640.0,
      "step": 870
    },
    {
      "entropy": 1.0772622771561147,
      "epoch": 0.33677765021048606,
      "grad_norm": 0.08126482367515564,
      "learning_rate": 0.00018692090774808648,
      "loss": 1.1434885025024415,
      "mean_token_accuracy": 0.7434220835566521,
      "num_tokens": 3529438.0,
      "step": 880
    },
    {
      "entropy": 1.0091869838535785,
      "epoch": 0.34060466896287794,
      "grad_norm": 0.0654602199792862,
      "learning_rate": 0.00018665234322545992,
      "loss": 1.0767542839050293,
      "mean_token_accuracy": 0.7644046351313591,
      "num_tokens": 3565217.0,
      "step": 890
    },
    {
      "entropy": 1.0432863399386405,
      "epoch": 0.34443168771526983,
      "grad_norm": 0.10025763511657715,
      "learning_rate": 0.00018638377870283337,
      "loss": 1.0826923370361328,
      "mean_token_accuracy": 0.7591656729578972,
      "num_tokens": 3603940.0,
      "step": 900
    },
    {
      "entropy": 1.007722695171833,
      "epoch": 0.3482587064676617,
      "grad_norm": 0.06779270619153976,
      "learning_rate": 0.00018611521418020682,
      "loss": 1.0158637046813965,
      "mean_token_accuracy": 0.763145099580288,
      "num_tokens": 3644547.0,
      "step": 910
    },
    {
      "entropy": 1.0556264080107212,
      "epoch": 0.3520857252200536,
      "grad_norm": 0.07834554463624954,
      "learning_rate": 0.00018584664965758026,
      "loss": 1.091851806640625,
      "mean_token_accuracy": 0.7483858004212379,
      "num_tokens": 3691979.0,
      "step": 920
    },
    {
      "entropy": 1.0730156242847442,
      "epoch": 0.3559127439724455,
      "grad_norm": 0.10772417485713959,
      "learning_rate": 0.00018557808513495368,
      "loss": 1.1370153427124023,
      "mean_token_accuracy": 0.7466916054487228,
      "num_tokens": 3728767.0,
      "step": 930
    },
    {
      "entropy": 1.081335111707449,
      "epoch": 0.3597397627248374,
      "grad_norm": 0.07669705897569656,
      "learning_rate": 0.0001853095206123271,
      "loss": 1.141366958618164,
      "mean_token_accuracy": 0.744081811606884,
      "num_tokens": 3772234.0,
      "step": 940
    },
    {
      "entropy": 0.9984517656266689,
      "epoch": 0.36356678147722926,
      "grad_norm": 0.0695272758603096,
      "learning_rate": 0.00018504095608970055,
      "loss": 1.0501303672790527,
      "mean_token_accuracy": 0.7590280339121819,
      "num_tokens": 3816970.0,
      "step": 950
    },
    {
      "entropy": 0.910194194689393,
      "epoch": 0.36739380022962115,
      "grad_norm": 0.06411932408809662,
      "learning_rate": 0.000184772391567074,
      "loss": 0.9656248092651367,
      "mean_token_accuracy": 0.7823562085628509,
      "num_tokens": 3853816.0,
      "step": 960
    },
    {
      "entropy": 0.9763211451470852,
      "epoch": 0.37122081898201303,
      "grad_norm": 0.08389662951231003,
      "learning_rate": 0.00018450382704444744,
      "loss": 1.0703671455383301,
      "mean_token_accuracy": 0.76742093116045,
      "num_tokens": 3896404.0,
      "step": 970
    },
    {
      "entropy": 1.076941692829132,
      "epoch": 0.3750478377344049,
      "grad_norm": 0.13239043951034546,
      "learning_rate": 0.0001842352625218209,
      "loss": 1.1353830337524413,
      "mean_token_accuracy": 0.7479456245899201,
      "num_tokens": 3934187.0,
      "step": 980
    },
    {
      "entropy": 1.077423833310604,
      "epoch": 0.3788748564867968,
      "grad_norm": 0.06203702092170715,
      "learning_rate": 0.00018396669799919434,
      "loss": 1.1363765716552734,
      "mean_token_accuracy": 0.7437105163931846,
      "num_tokens": 3975766.0,
      "step": 990
    },
    {
      "entropy": 1.009491826593876,
      "epoch": 0.3827018752391887,
      "grad_norm": 0.06740409135818481,
      "learning_rate": 0.00018369813347656776,
      "loss": 1.0752355575561523,
      "mean_token_accuracy": 0.7598015293478966,
      "num_tokens": 4018368.0,
      "step": 1000
    },
    {
      "entropy": 0.9744428530335426,
      "epoch": 0.3865288939915806,
      "grad_norm": 0.07750537246465683,
      "learning_rate": 0.00018342956895394118,
      "loss": 1.0554892539978027,
      "mean_token_accuracy": 0.7682438552379608,
      "num_tokens": 4057647.0,
      "step": 1010
    },
    {
      "entropy": 1.0335246473550797,
      "epoch": 0.39035591274397247,
      "grad_norm": 0.07627248764038086,
      "learning_rate": 0.00018316100443131463,
      "loss": 1.0600407600402832,
      "mean_token_accuracy": 0.7552958622574806,
      "num_tokens": 4098377.0,
      "step": 1020
    },
    {
      "entropy": 1.0256185740232469,
      "epoch": 0.39418293149636435,
      "grad_norm": 0.10117889940738678,
      "learning_rate": 0.00018289243990868807,
      "loss": 1.0706727027893066,
      "mean_token_accuracy": 0.75880047082901,
      "num_tokens": 4141633.0,
      "step": 1030
    },
    {
      "entropy": 0.9883378148078918,
      "epoch": 0.39800995024875624,
      "grad_norm": 0.064593605697155,
      "learning_rate": 0.00018262387538606152,
      "loss": 1.007016372680664,
      "mean_token_accuracy": 0.764974731206894,
      "num_tokens": 4181155.0,
      "step": 1040
    },
    {
      "entropy": 1.0692595109343528,
      "epoch": 0.4018369690011481,
      "grad_norm": 0.07493151724338531,
      "learning_rate": 0.00018235531086343497,
      "loss": 1.123647975921631,
      "mean_token_accuracy": 0.7452841177582741,
      "num_tokens": 4218175.0,
      "step": 1050
    },
    {
      "entropy": 0.990117172151804,
      "epoch": 0.40566398775354,
      "grad_norm": 0.06332839280366898,
      "learning_rate": 0.0001820867463408084,
      "loss": 1.0538661003112793,
      "mean_token_accuracy": 0.7637043848633767,
      "num_tokens": 4262326.0,
      "step": 1060
    },
    {
      "entropy": 1.002436650544405,
      "epoch": 0.4094910065059319,
      "grad_norm": 0.07898294180631638,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.9973239898681641,
      "mean_token_accuracy": 0.7644759714603424,
      "num_tokens": 4300876.0,
      "step": 1070
    },
    {
      "entropy": 0.9635432817041873,
      "epoch": 0.4133180252583238,
      "grad_norm": 0.09760674089193344,
      "learning_rate": 0.00018154961729555525,
      "loss": 1.0411369323730468,
      "mean_token_accuracy": 0.7664693981409073,
      "num_tokens": 4338887.0,
      "step": 1080
    },
    {
      "entropy": 0.9780610945075751,
      "epoch": 0.41714504401071567,
      "grad_norm": 0.08076441287994385,
      "learning_rate": 0.0001812810527729287,
      "loss": 1.0544751167297364,
      "mean_token_accuracy": 0.76624975502491,
      "num_tokens": 4380678.0,
      "step": 1090
    },
    {
      "entropy": 1.0548741944134234,
      "epoch": 0.42097206276310756,
      "grad_norm": 0.0646439641714096,
      "learning_rate": 0.00018101248825030215,
      "loss": 1.128230667114258,
      "mean_token_accuracy": 0.7504925444722176,
      "num_tokens": 4422899.0,
      "step": 1100
    },
    {
      "entropy": 1.0767812803387642,
      "epoch": 0.42479908151549944,
      "grad_norm": 0.06994366645812988,
      "learning_rate": 0.0001807439237276756,
      "loss": 1.1209583282470703,
      "mean_token_accuracy": 0.7477620646357537,
      "num_tokens": 4461864.0,
      "step": 1110
    },
    {
      "entropy": 1.062944334745407,
      "epoch": 0.42862610026789133,
      "grad_norm": 0.11016593873500824,
      "learning_rate": 0.00018047535920504904,
      "loss": 1.0880105018615722,
      "mean_token_accuracy": 0.7455767750740051,
      "num_tokens": 4501378.0,
      "step": 1120
    },
    {
      "entropy": 1.0511136516928672,
      "epoch": 0.4324531190202832,
      "grad_norm": 0.08707646280527115,
      "learning_rate": 0.00018020679468242246,
      "loss": 1.0764313697814942,
      "mean_token_accuracy": 0.7519838035106658,
      "num_tokens": 4541448.0,
      "step": 1130
    },
    {
      "entropy": 0.9529998056590557,
      "epoch": 0.4362801377726751,
      "grad_norm": 0.07353853434324265,
      "learning_rate": 0.00017993823015979588,
      "loss": 1.0098756790161132,
      "mean_token_accuracy": 0.7720077604055404,
      "num_tokens": 4586147.0,
      "step": 1140
    },
    {
      "entropy": 1.143844011425972,
      "epoch": 0.440107156525067,
      "grad_norm": 0.06268489360809326,
      "learning_rate": 0.00017966966563716933,
      "loss": 1.1934361457824707,
      "mean_token_accuracy": 0.7281116575002671,
      "num_tokens": 4631906.0,
      "step": 1150
    },
    {
      "entropy": 1.0761573910713196,
      "epoch": 0.4439341752774589,
      "grad_norm": 0.07078517228364944,
      "learning_rate": 0.00017940110111454278,
      "loss": 1.1359615325927734,
      "mean_token_accuracy": 0.7409780561923981,
      "num_tokens": 4674626.0,
      "step": 1160
    },
    {
      "entropy": 0.9940036550164223,
      "epoch": 0.44776119402985076,
      "grad_norm": 0.08054502308368683,
      "learning_rate": 0.00017913253659191622,
      "loss": 1.033839225769043,
      "mean_token_accuracy": 0.7682256817817688,
      "num_tokens": 4715717.0,
      "step": 1170
    },
    {
      "entropy": 0.9752239182591438,
      "epoch": 0.45158821278224265,
      "grad_norm": 0.08600450307130814,
      "learning_rate": 0.00017886397206928967,
      "loss": 1.0254844665527343,
      "mean_token_accuracy": 0.7688430979847908,
      "num_tokens": 4747316.0,
      "step": 1180
    },
    {
      "entropy": 1.064694558084011,
      "epoch": 0.45541523153463453,
      "grad_norm": 0.07270248234272003,
      "learning_rate": 0.0001785954075466631,
      "loss": 1.0806646347045898,
      "mean_token_accuracy": 0.7534265503287315,
      "num_tokens": 4788606.0,
      "step": 1190
    },
    {
      "entropy": 0.958549628406763,
      "epoch": 0.4592422502870264,
      "grad_norm": 0.0644846111536026,
      "learning_rate": 0.00017832684302403654,
      "loss": 1.0015847206115722,
      "mean_token_accuracy": 0.7644492238759995,
      "num_tokens": 4831371.0,
      "step": 1200
    },
    {
      "entropy": 1.0885677203536033,
      "epoch": 0.4630692690394183,
      "grad_norm": 0.13487283885478973,
      "learning_rate": 0.00017805827850140996,
      "loss": 1.1495524406433106,
      "mean_token_accuracy": 0.7414823487401009,
      "num_tokens": 4871231.0,
      "step": 1210
    },
    {
      "entropy": 1.1117899976670742,
      "epoch": 0.4668962877918102,
      "grad_norm": 0.08015701174736023,
      "learning_rate": 0.0001777897139787834,
      "loss": 1.1366958618164062,
      "mean_token_accuracy": 0.7351289570331574,
      "num_tokens": 4911520.0,
      "step": 1220
    },
    {
      "entropy": 0.9722193017601967,
      "epoch": 0.4707233065442021,
      "grad_norm": 0.06839531660079956,
      "learning_rate": 0.00017752114945615685,
      "loss": 1.0259140968322753,
      "mean_token_accuracy": 0.7658233359456063,
      "num_tokens": 4950296.0,
      "step": 1230
    },
    {
      "entropy": 1.0021446757018566,
      "epoch": 0.47455032529659397,
      "grad_norm": 0.08231978863477707,
      "learning_rate": 0.0001772525849335303,
      "loss": 1.0437036514282227,
      "mean_token_accuracy": 0.7644398525357247,
      "num_tokens": 4989688.0,
      "step": 1240
    },
    {
      "entropy": 0.9640353135764599,
      "epoch": 0.47837734404898585,
      "grad_norm": 0.11587074398994446,
      "learning_rate": 0.00017698402041090375,
      "loss": 1.0072126388549805,
      "mean_token_accuracy": 0.7740294471383095,
      "num_tokens": 5029135.0,
      "step": 1250
    },
    {
      "entropy": 1.0122342824935913,
      "epoch": 0.48220436280137774,
      "grad_norm": 0.07646426558494568,
      "learning_rate": 0.00017671545588827717,
      "loss": 1.0733034133911132,
      "mean_token_accuracy": 0.7619047269225121,
      "num_tokens": 5066488.0,
      "step": 1260
    },
    {
      "entropy": 1.0465880073606968,
      "epoch": 0.4860313815537696,
      "grad_norm": 0.07594821602106094,
      "learning_rate": 0.0001764468913656506,
      "loss": 1.0953254699707031,
      "mean_token_accuracy": 0.7511951208114624,
      "num_tokens": 5102103.0,
      "step": 1270
    },
    {
      "entropy": 1.0104024082422256,
      "epoch": 0.4898584003061615,
      "grad_norm": 0.07695835083723068,
      "learning_rate": 0.00017617832684302403,
      "loss": 1.1025714874267578,
      "mean_token_accuracy": 0.7583977058529854,
      "num_tokens": 5141113.0,
      "step": 1280
    },
    {
      "entropy": 1.044089037179947,
      "epoch": 0.4936854190585534,
      "grad_norm": 0.07186906039714813,
      "learning_rate": 0.00017590976232039748,
      "loss": 1.0713683128356934,
      "mean_token_accuracy": 0.7546971932053566,
      "num_tokens": 5181454.0,
      "step": 1290
    },
    {
      "entropy": 0.9154780797660351,
      "epoch": 0.4975124378109453,
      "grad_norm": 0.08934911340475082,
      "learning_rate": 0.00017564119779777093,
      "loss": 0.9871469497680664,
      "mean_token_accuracy": 0.7756836161017417,
      "num_tokens": 5214143.0,
      "step": 1300
    },
    {
      "entropy": 1.05635926425457,
      "epoch": 0.5013394565633371,
      "grad_norm": 0.07880513370037079,
      "learning_rate": 0.00017537263327514437,
      "loss": 1.0985527038574219,
      "mean_token_accuracy": 0.7524559125304222,
      "num_tokens": 5258310.0,
      "step": 1310
    },
    {
      "entropy": 1.0448619149625302,
      "epoch": 0.505166475315729,
      "grad_norm": 0.10507462918758392,
      "learning_rate": 0.0001751040687525178,
      "loss": 1.1040778160095215,
      "mean_token_accuracy": 0.7477249845862388,
      "num_tokens": 5296865.0,
      "step": 1320
    },
    {
      "entropy": 1.0706947155296802,
      "epoch": 0.5089934940681209,
      "grad_norm": 0.09437765926122665,
      "learning_rate": 0.00017483550422989124,
      "loss": 1.1372867584228517,
      "mean_token_accuracy": 0.7502224639058113,
      "num_tokens": 5335301.0,
      "step": 1330
    },
    {
      "entropy": 0.9736435614526272,
      "epoch": 0.5128205128205128,
      "grad_norm": 0.07162626087665558,
      "learning_rate": 0.0001745669397072647,
      "loss": 1.0273897171020507,
      "mean_token_accuracy": 0.7711918234825135,
      "num_tokens": 5372533.0,
      "step": 1340
    },
    {
      "entropy": 0.9989161014556884,
      "epoch": 0.5166475315729047,
      "grad_norm": 0.08805254101753235,
      "learning_rate": 0.0001742983751846381,
      "loss": 1.0603778839111329,
      "mean_token_accuracy": 0.7600028276443481,
      "num_tokens": 5412651.0,
      "step": 1350
    },
    {
      "entropy": 1.063752220571041,
      "epoch": 0.5204745503252965,
      "grad_norm": 0.08056829869747162,
      "learning_rate": 0.00017402981066201156,
      "loss": 1.0876687049865723,
      "mean_token_accuracy": 0.7487231969833374,
      "num_tokens": 5454518.0,
      "step": 1360
    },
    {
      "entropy": 0.966426993906498,
      "epoch": 0.5243015690776884,
      "grad_norm": 0.06970727443695068,
      "learning_rate": 0.000173761246139385,
      "loss": 1.0302441596984864,
      "mean_token_accuracy": 0.7640074551105499,
      "num_tokens": 5495257.0,
      "step": 1370
    },
    {
      "entropy": 0.9727898858487606,
      "epoch": 0.5281285878300803,
      "grad_norm": 0.09694326668977737,
      "learning_rate": 0.00017349268161675842,
      "loss": 1.0327792167663574,
      "mean_token_accuracy": 0.7687779292464256,
      "num_tokens": 5527573.0,
      "step": 1380
    },
    {
      "entropy": 1.043993879854679,
      "epoch": 0.5319556065824722,
      "grad_norm": 0.05676735192537308,
      "learning_rate": 0.00017322411709413187,
      "loss": 1.1139988899230957,
      "mean_token_accuracy": 0.7597961351275444,
      "num_tokens": 5566542.0,
      "step": 1390
    },
    {
      "entropy": 0.9891408108174801,
      "epoch": 0.5357826253348641,
      "grad_norm": 0.08670998364686966,
      "learning_rate": 0.00017295555257150532,
      "loss": 1.0878351211547852,
      "mean_token_accuracy": 0.7640718072652817,
      "num_tokens": 5604986.0,
      "step": 1400
    },
    {
      "entropy": 1.0097288101911546,
      "epoch": 0.539609644087256,
      "grad_norm": 0.09190856665372849,
      "learning_rate": 0.00017268698804887876,
      "loss": 1.079444408416748,
      "mean_token_accuracy": 0.7590912491083145,
      "num_tokens": 5642224.0,
      "step": 1410
    },
    {
      "entropy": 0.9927844725549221,
      "epoch": 0.5434366628396479,
      "grad_norm": 0.08191007375717163,
      "learning_rate": 0.00017241842352625218,
      "loss": 1.0661033630371093,
      "mean_token_accuracy": 0.7664914444088936,
      "num_tokens": 5680912.0,
      "step": 1420
    },
    {
      "entropy": 0.973179691657424,
      "epoch": 0.5472636815920398,
      "grad_norm": 0.08161566406488419,
      "learning_rate": 0.00017214985900362563,
      "loss": 1.078667163848877,
      "mean_token_accuracy": 0.7686992704868316,
      "num_tokens": 5717171.0,
      "step": 1430
    },
    {
      "entropy": 1.0467095457017421,
      "epoch": 0.5510907003444316,
      "grad_norm": 0.09403429925441742,
      "learning_rate": 0.00017188129448099908,
      "loss": 1.0912303924560547,
      "mean_token_accuracy": 0.7550861686468124,
      "num_tokens": 5755956.0,
      "step": 1440
    },
    {
      "entropy": 1.0082954704761504,
      "epoch": 0.5549177190968235,
      "grad_norm": 0.09858231991529465,
      "learning_rate": 0.0001716127299583725,
      "loss": 1.0449023246765137,
      "mean_token_accuracy": 0.7586705282330513,
      "num_tokens": 5798765.0,
      "step": 1450
    },
    {
      "entropy": 1.0528397418558597,
      "epoch": 0.5587447378492154,
      "grad_norm": 0.06697855144739151,
      "learning_rate": 0.00017134416543574594,
      "loss": 1.0901053428649903,
      "mean_token_accuracy": 0.7517547190189362,
      "num_tokens": 5839833.0,
      "step": 1460
    },
    {
      "entropy": 1.009619940817356,
      "epoch": 0.5625717566016073,
      "grad_norm": 0.07271189987659454,
      "learning_rate": 0.0001710756009131194,
      "loss": 1.0171070098876953,
      "mean_token_accuracy": 0.7594649389386177,
      "num_tokens": 5880613.0,
      "step": 1470
    },
    {
      "entropy": 0.9699329622089863,
      "epoch": 0.5663987753539992,
      "grad_norm": 0.07800697535276413,
      "learning_rate": 0.0001708070363904928,
      "loss": 1.1077412605285644,
      "mean_token_accuracy": 0.7667307928204536,
      "num_tokens": 5918218.0,
      "step": 1480
    },
    {
      "entropy": 0.9389957278966904,
      "epoch": 0.5702257941063911,
      "grad_norm": 0.08150342851877213,
      "learning_rate": 0.00017053847186786626,
      "loss": 0.9634763717651367,
      "mean_token_accuracy": 0.7806087970733643,
      "num_tokens": 5960284.0,
      "step": 1490
    },
    {
      "entropy": 1.0140163496136665,
      "epoch": 0.574052812858783,
      "grad_norm": 0.06430503726005554,
      "learning_rate": 0.0001702699073452397,
      "loss": 1.0751851081848145,
      "mean_token_accuracy": 0.7564210310578346,
      "num_tokens": 6000562.0,
      "step": 1500
    },
    {
      "entropy": 1.047791599482298,
      "epoch": 0.5778798316111748,
      "grad_norm": 0.07922326028347015,
      "learning_rate": 0.00017000134282261313,
      "loss": 1.1397698402404786,
      "mean_token_accuracy": 0.7494736298918724,
      "num_tokens": 6045192.0,
      "step": 1510
    },
    {
      "entropy": 1.1085532158613205,
      "epoch": 0.5817068503635667,
      "grad_norm": 0.1093953400850296,
      "learning_rate": 0.00016973277829998657,
      "loss": 1.142368698120117,
      "mean_token_accuracy": 0.7414237394928932,
      "num_tokens": 6090148.0,
      "step": 1520
    },
    {
      "entropy": 0.9427969709038735,
      "epoch": 0.5855338691159586,
      "grad_norm": 0.09579843282699585,
      "learning_rate": 0.00016946421377736002,
      "loss": 0.9980224609375,
      "mean_token_accuracy": 0.7730853497982025,
      "num_tokens": 6129845.0,
      "step": 1530
    },
    {
      "entropy": 1.0571323171257974,
      "epoch": 0.5893608878683505,
      "grad_norm": 0.09482655674219131,
      "learning_rate": 0.00016919564925473347,
      "loss": 1.0665513038635255,
      "mean_token_accuracy": 0.749831511080265,
      "num_tokens": 6171961.0,
      "step": 1540
    },
    {
      "entropy": 0.978803563863039,
      "epoch": 0.5931879066207424,
      "grad_norm": 0.08609842509031296,
      "learning_rate": 0.0001689270847321069,
      "loss": 1.0541341781616211,
      "mean_token_accuracy": 0.769312071800232,
      "num_tokens": 6210126.0,
      "step": 1550
    },
    {
      "entropy": 1.0714900024235248,
      "epoch": 0.5970149253731343,
      "grad_norm": 0.05390879884362221,
      "learning_rate": 0.00016865852020948033,
      "loss": 1.1057682037353516,
      "mean_token_accuracy": 0.7402923837304115,
      "num_tokens": 6262394.0,
      "step": 1560
    },
    {
      "entropy": 0.9234071888029576,
      "epoch": 0.6008419441255262,
      "grad_norm": 0.09692159295082092,
      "learning_rate": 0.00016838995568685378,
      "loss": 0.9622941017150879,
      "mean_token_accuracy": 0.7832354381680489,
      "num_tokens": 6292140.0,
      "step": 1570
    },
    {
      "entropy": 0.9591108359396457,
      "epoch": 0.604668962877918,
      "grad_norm": 0.0720645934343338,
      "learning_rate": 0.0001681213911642272,
      "loss": 1.019169235229492,
      "mean_token_accuracy": 0.771478471159935,
      "num_tokens": 6332134.0,
      "step": 1580
    },
    {
      "entropy": 1.0945825845003128,
      "epoch": 0.6084959816303099,
      "grad_norm": 0.07380460202693939,
      "learning_rate": 0.00016785282664160065,
      "loss": 1.1463271141052247,
      "mean_token_accuracy": 0.7419712334871292,
      "num_tokens": 6373967.0,
      "step": 1590
    },
    {
      "entropy": 1.055589073896408,
      "epoch": 0.6123230003827018,
      "grad_norm": 0.07209772616624832,
      "learning_rate": 0.0001675842621189741,
      "loss": 1.1144783973693848,
      "mean_token_accuracy": 0.7461996227502823,
      "num_tokens": 6418236.0,
      "step": 1600
    },
    {
      "entropy": 1.1149650782346725,
      "epoch": 0.6161500191350937,
      "grad_norm": 0.07935164868831635,
      "learning_rate": 0.00016731569759634754,
      "loss": 1.1727729797363282,
      "mean_token_accuracy": 0.7365738078951836,
      "num_tokens": 6464589.0,
      "step": 1610
    },
    {
      "entropy": 1.0068970195949078,
      "epoch": 0.6199770378874856,
      "grad_norm": 0.0804886594414711,
      "learning_rate": 0.00016704713307372096,
      "loss": 1.0483062744140625,
      "mean_token_accuracy": 0.7632505163550377,
      "num_tokens": 6504385.0,
      "step": 1620
    },
    {
      "entropy": 0.9996877416968346,
      "epoch": 0.6238040566398775,
      "grad_norm": 0.0723455473780632,
      "learning_rate": 0.0001667785685510944,
      "loss": 1.0592655181884765,
      "mean_token_accuracy": 0.7597218692302704,
      "num_tokens": 6545928.0,
      "step": 1630
    },
    {
      "entropy": 0.9159566629678011,
      "epoch": 0.6276310753922695,
      "grad_norm": 0.08028513193130493,
      "learning_rate": 0.00016651000402846783,
      "loss": 0.9434080123901367,
      "mean_token_accuracy": 0.7826011970639228,
      "num_tokens": 6586166.0,
      "step": 1640
    },
    {
      "entropy": 1.0089009895920753,
      "epoch": 0.6314580941446614,
      "grad_norm": 0.09154181182384491,
      "learning_rate": 0.00016624143950584128,
      "loss": 1.0339744567871094,
      "mean_token_accuracy": 0.7604109585285187,
      "num_tokens": 6624706.0,
      "step": 1650
    },
    {
      "entropy": 1.0208431974053382,
      "epoch": 0.6352851128970533,
      "grad_norm": 0.08039630204439163,
      "learning_rate": 0.00016597287498321472,
      "loss": 1.0823830604553222,
      "mean_token_accuracy": 0.7548153042793274,
      "num_tokens": 6665626.0,
      "step": 1660
    },
    {
      "entropy": 0.9645413011312485,
      "epoch": 0.6391121316494451,
      "grad_norm": 0.08834270387887955,
      "learning_rate": 0.00016570431046058817,
      "loss": 1.0401340484619142,
      "mean_token_accuracy": 0.7703565835952759,
      "num_tokens": 6699532.0,
      "step": 1670
    },
    {
      "entropy": 1.0028597339987755,
      "epoch": 0.642939150401837,
      "grad_norm": 0.08974612504243851,
      "learning_rate": 0.00016543574593796162,
      "loss": 1.0680928230285645,
      "mean_token_accuracy": 0.7617413088679313,
      "num_tokens": 6740574.0,
      "step": 1680
    },
    {
      "entropy": 0.8952145710587501,
      "epoch": 0.6467661691542289,
      "grad_norm": 0.09289242327213287,
      "learning_rate": 0.00016516718141533504,
      "loss": 0.9696210861206055,
      "mean_token_accuracy": 0.7848611980676651,
      "num_tokens": 6782208.0,
      "step": 1690
    },
    {
      "entropy": 0.9520700328052044,
      "epoch": 0.6505931879066208,
      "grad_norm": 0.07298107445240021,
      "learning_rate": 0.00016489861689270848,
      "loss": 0.9891908645629883,
      "mean_token_accuracy": 0.7725306749343872,
      "num_tokens": 6818937.0,
      "step": 1700
    },
    {
      "entropy": 0.9734554067254066,
      "epoch": 0.6544202066590127,
      "grad_norm": 0.08233233541250229,
      "learning_rate": 0.0001646300523700819,
      "loss": 1.0311893463134765,
      "mean_token_accuracy": 0.7683428943157196,
      "num_tokens": 6851548.0,
      "step": 1710
    },
    {
      "entropy": 0.9947349905967713,
      "epoch": 0.6582472254114046,
      "grad_norm": 0.08351403474807739,
      "learning_rate": 0.00016436148784745535,
      "loss": 1.0382192611694336,
      "mean_token_accuracy": 0.7587384819984436,
      "num_tokens": 6891553.0,
      "step": 1720
    },
    {
      "entropy": 1.0383310310542584,
      "epoch": 0.6620742441637965,
      "grad_norm": 0.07240983843803406,
      "learning_rate": 0.0001640929233248288,
      "loss": 1.0978598594665527,
      "mean_token_accuracy": 0.7515711337327957,
      "num_tokens": 6930875.0,
      "step": 1730
    },
    {
      "entropy": 1.085533195734024,
      "epoch": 0.6659012629161883,
      "grad_norm": 0.06999973207712173,
      "learning_rate": 0.00016382435880220225,
      "loss": 1.1412755966186523,
      "mean_token_accuracy": 0.7471029132604599,
      "num_tokens": 6971721.0,
      "step": 1740
    },
    {
      "entropy": 0.978867219388485,
      "epoch": 0.6697282816685802,
      "grad_norm": 0.06091843172907829,
      "learning_rate": 0.0001635557942795757,
      "loss": 1.023170566558838,
      "mean_token_accuracy": 0.7686347916722298,
      "num_tokens": 7010340.0,
      "step": 1750
    },
    {
      "entropy": 1.0823599390685559,
      "epoch": 0.6735553004209721,
      "grad_norm": 0.07732617110013962,
      "learning_rate": 0.0001632872297569491,
      "loss": 1.1036816596984864,
      "mean_token_accuracy": 0.7404509574174881,
      "num_tokens": 7061189.0,
      "step": 1760
    },
    {
      "entropy": 0.9984334908425808,
      "epoch": 0.677382319173364,
      "grad_norm": 0.11516186594963074,
      "learning_rate": 0.00016301866523432253,
      "loss": 1.1070829391479493,
      "mean_token_accuracy": 0.7593477964401245,
      "num_tokens": 7098345.0,
      "step": 1770
    },
    {
      "entropy": 1.0023914370685816,
      "epoch": 0.6812093379257559,
      "grad_norm": 0.08624757081270218,
      "learning_rate": 0.00016275010071169598,
      "loss": 1.043964958190918,
      "mean_token_accuracy": 0.7635682225227356,
      "num_tokens": 7136873.0,
      "step": 1780
    },
    {
      "entropy": 1.0823404759168624,
      "epoch": 0.6850363566781478,
      "grad_norm": 0.0846925675868988,
      "learning_rate": 0.00016248153618906943,
      "loss": 1.1333115577697754,
      "mean_token_accuracy": 0.7394865393638611,
      "num_tokens": 7181553.0,
      "step": 1790
    },
    {
      "entropy": 1.0224985226988792,
      "epoch": 0.6888633754305397,
      "grad_norm": 0.060152288526296616,
      "learning_rate": 0.00016221297166644287,
      "loss": 1.0782301902770997,
      "mean_token_accuracy": 0.759938097000122,
      "num_tokens": 7223076.0,
      "step": 1800
    },
    {
      "entropy": 1.0880159534513951,
      "epoch": 0.6926903941829315,
      "grad_norm": 0.06577905267477036,
      "learning_rate": 0.00016194440714381632,
      "loss": 1.1103734016418456,
      "mean_token_accuracy": 0.7457254812121391,
      "num_tokens": 7265594.0,
      "step": 1810
    },
    {
      "entropy": 1.0144932381808758,
      "epoch": 0.6965174129353234,
      "grad_norm": 0.07276095449924469,
      "learning_rate": 0.00016167584262118974,
      "loss": 1.0733102798461913,
      "mean_token_accuracy": 0.759276558458805,
      "num_tokens": 7305359.0,
      "step": 1820
    },
    {
      "entropy": 1.0186967477202415,
      "epoch": 0.7003444316877153,
      "grad_norm": 0.08775337040424347,
      "learning_rate": 0.0001614072780985632,
      "loss": 1.0672088623046876,
      "mean_token_accuracy": 0.7567476496100426,
      "num_tokens": 7348060.0,
      "step": 1830
    },
    {
      "entropy": 0.9723582908511161,
      "epoch": 0.7041714504401072,
      "grad_norm": 0.09250030666589737,
      "learning_rate": 0.0001611387135759366,
      "loss": 1.0032880783081055,
      "mean_token_accuracy": 0.7693375036120415,
      "num_tokens": 7387110.0,
      "step": 1840
    },
    {
      "entropy": 0.9738463938236237,
      "epoch": 0.7079984691924991,
      "grad_norm": 0.09884033352136612,
      "learning_rate": 0.00016087014905331006,
      "loss": 1.0514408111572267,
      "mean_token_accuracy": 0.7625621780753136,
      "num_tokens": 7428511.0,
      "step": 1850
    },
    {
      "entropy": 1.0910252556204796,
      "epoch": 0.711825487944891,
      "grad_norm": 0.09194686263799667,
      "learning_rate": 0.0001606015845306835,
      "loss": 1.1226488113403321,
      "mean_token_accuracy": 0.7452121302485466,
      "num_tokens": 7474736.0,
      "step": 1860
    },
    {
      "entropy": 1.0159518368542195,
      "epoch": 0.7156525066972829,
      "grad_norm": 0.07921712845563889,
      "learning_rate": 0.00016033302000805695,
      "loss": 1.061795711517334,
      "mean_token_accuracy": 0.761272345483303,
      "num_tokens": 7516891.0,
      "step": 1870
    },
    {
      "entropy": 0.8903906352818012,
      "epoch": 0.7194795254496748,
      "grad_norm": 0.10288332402706146,
      "learning_rate": 0.0001600644554854304,
      "loss": 0.9471863746643067,
      "mean_token_accuracy": 0.7831206247210503,
      "num_tokens": 7551597.0,
      "step": 1880
    },
    {
      "entropy": 1.0759326584637166,
      "epoch": 0.7233065442020666,
      "grad_norm": 0.06488945335149765,
      "learning_rate": 0.00015979589096280382,
      "loss": 1.136262798309326,
      "mean_token_accuracy": 0.742707334458828,
      "num_tokens": 7597529.0,
      "step": 1890
    },
    {
      "entropy": 0.9951202683150768,
      "epoch": 0.7271335629544585,
      "grad_norm": 0.06628359109163284,
      "learning_rate": 0.00015952732644017724,
      "loss": 1.0448930740356446,
      "mean_token_accuracy": 0.7615623638033867,
      "num_tokens": 7634291.0,
      "step": 1900
    },
    {
      "entropy": 1.0593122780323028,
      "epoch": 0.7309605817068504,
      "grad_norm": 0.08212320506572723,
      "learning_rate": 0.00015925876191755068,
      "loss": 1.1099414825439453,
      "mean_token_accuracy": 0.7455122962594032,
      "num_tokens": 7677086.0,
      "step": 1910
    },
    {
      "entropy": 1.0122868783771992,
      "epoch": 0.7347876004592423,
      "grad_norm": 0.06458455324172974,
      "learning_rate": 0.00015899019739492413,
      "loss": 1.0447346687316894,
      "mean_token_accuracy": 0.7542453840374946,
      "num_tokens": 7721785.0,
      "step": 1920
    },
    {
      "entropy": 1.0801358975470066,
      "epoch": 0.7386146192116342,
      "grad_norm": 0.06971931457519531,
      "learning_rate": 0.00015872163287229758,
      "loss": 1.109630012512207,
      "mean_token_accuracy": 0.7450100436806679,
      "num_tokens": 7761887.0,
      "step": 1930
    },
    {
      "entropy": 0.9081138484179974,
      "epoch": 0.7424416379640261,
      "grad_norm": 0.06223156675696373,
      "learning_rate": 0.00015845306834967102,
      "loss": 1.0026588439941406,
      "mean_token_accuracy": 0.7772255912423134,
      "num_tokens": 7807072.0,
      "step": 1940
    },
    {
      "entropy": 1.0170219503343105,
      "epoch": 0.746268656716418,
      "grad_norm": 0.0685853511095047,
      "learning_rate": 0.00015818450382704447,
      "loss": 1.055277442932129,
      "mean_token_accuracy": 0.7580551549792289,
      "num_tokens": 7845791.0,
      "step": 1950
    },
| { | |
| "entropy": 1.0753178864717483, | |
| "epoch": 0.7500956754688098, | |
| "grad_norm": 0.08306553959846497, | |
| "learning_rate": 0.0001579159393044179, | |
| "loss": 1.1332826614379883, | |
| "mean_token_accuracy": 0.7421450033783913, | |
| "num_tokens": 7891091.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 0.9297005102038384, | |
| "epoch": 0.7539226942212017, | |
| "grad_norm": 0.08018683642148972, | |
| "learning_rate": 0.0001576473747817913, | |
| "loss": 1.000318431854248, | |
| "mean_token_accuracy": 0.7793557167053222, | |
| "num_tokens": 7928252.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 1.0840253300964833, | |
| "epoch": 0.7577497129735936, | |
| "grad_norm": 0.06487595289945602, | |
| "learning_rate": 0.00015737881025916476, | |
| "loss": 1.1166275024414063, | |
| "mean_token_accuracy": 0.7378593400120735, | |
| "num_tokens": 7972071.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 1.0406386695802212, | |
| "epoch": 0.7615767317259855, | |
| "grad_norm": 0.0615115687251091, | |
| "learning_rate": 0.0001571102457365382, | |
| "loss": 1.0869349479675292, | |
| "mean_token_accuracy": 0.7490768045186996, | |
| "num_tokens": 8016865.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 0.9573215276002884, | |
| "epoch": 0.7654037504783774, | |
| "grad_norm": 0.0715412124991417, | |
| "learning_rate": 0.00015684168121391165, | |
| "loss": 1.0404720306396484, | |
| "mean_token_accuracy": 0.7706617951393128, | |
| "num_tokens": 8055917.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 0.9201878193765879, | |
| "epoch": 0.7692307692307693, | |
| "grad_norm": 0.07988248765468597, | |
| "learning_rate": 0.0001565731166912851, | |
| "loss": 0.9380558967590332, | |
| "mean_token_accuracy": 0.782890722155571, | |
| "num_tokens": 8093252.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 1.0045961767435074, | |
| "epoch": 0.7730577879831612, | |
| "grad_norm": 0.061089444905519485, | |
| "learning_rate": 0.00015630455216865855, | |
| "loss": 1.0528027534484863, | |
| "mean_token_accuracy": 0.7598949059844017, | |
| "num_tokens": 8135244.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 0.9942824639379978, | |
| "epoch": 0.776884806735553, | |
| "grad_norm": 0.06443686783313751, | |
| "learning_rate": 0.00015603598764603197, | |
| "loss": 1.0168493270874024, | |
| "mean_token_accuracy": 0.7590687796473503, | |
| "num_tokens": 8178961.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 0.9773981764912605, | |
| "epoch": 0.7807118254879449, | |
| "grad_norm": 0.0818348303437233, | |
| "learning_rate": 0.0001557674231234054, | |
| "loss": 1.0193141937255858, | |
| "mean_token_accuracy": 0.7708378821611405, | |
| "num_tokens": 8217139.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 0.9836540646851063, | |
| "epoch": 0.7845388442403368, | |
| "grad_norm": 0.06240411475300789, | |
| "learning_rate": 0.00015549885860077883, | |
| "loss": 1.0662775993347169, | |
| "mean_token_accuracy": 0.7658124819397927, | |
| "num_tokens": 8252825.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.036501456052065, | |
| "epoch": 0.7883658629927287, | |
| "grad_norm": 0.09231610596179962, | |
| "learning_rate": 0.00015523029407815228, | |
| "loss": 1.112645435333252, | |
| "mean_token_accuracy": 0.7541953936219216, | |
| "num_tokens": 8295113.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 0.9800528183579444, | |
| "epoch": 0.7921928817451206, | |
| "grad_norm": 0.08806589245796204, | |
| "learning_rate": 0.00015496172955552573, | |
| "loss": 1.0401280403137207, | |
| "mean_token_accuracy": 0.7672899037599563, | |
| "num_tokens": 8335977.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 0.9678378522396087, | |
| "epoch": 0.7960199004975125, | |
| "grad_norm": 0.08777868002653122, | |
| "learning_rate": 0.00015469316503289918, | |
| "loss": 1.0509014129638672, | |
| "mean_token_accuracy": 0.7696513712406159, | |
| "num_tokens": 8374917.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 1.042826947569847, | |
| "epoch": 0.7998469192499044, | |
| "grad_norm": 0.09018490463495255, | |
| "learning_rate": 0.00015442460051027262, | |
| "loss": 1.0869378089904784, | |
| "mean_token_accuracy": 0.7507286682724953, | |
| "num_tokens": 8415614.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 1.0548966623842717, | |
| "epoch": 0.8036739380022963, | |
| "grad_norm": 0.07267605513334274, | |
| "learning_rate": 0.00015415603598764604, | |
| "loss": 1.0960289001464845, | |
| "mean_token_accuracy": 0.7545556098222732, | |
| "num_tokens": 8455059.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.044345210492611, | |
| "epoch": 0.8075009567546881, | |
| "grad_norm": 0.08414279669523239, | |
| "learning_rate": 0.00015388747146501946, | |
| "loss": 1.1200661659240723, | |
| "mean_token_accuracy": 0.7490431442856789, | |
| "num_tokens": 8493866.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 1.0317029684782029, | |
| "epoch": 0.81132797550708, | |
| "grad_norm": 0.06549747288227081, | |
| "learning_rate": 0.0001536189069423929, | |
| "loss": 1.0583623886108398, | |
| "mean_token_accuracy": 0.7555923700332642, | |
| "num_tokens": 8536147.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 0.9694572634994983, | |
| "epoch": 0.8151549942594719, | |
| "grad_norm": 0.08112777769565582, | |
| "learning_rate": 0.00015335034241976636, | |
| "loss": 1.0503274917602539, | |
| "mean_token_accuracy": 0.7646921187639236, | |
| "num_tokens": 8578007.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 0.9358880028128624, | |
| "epoch": 0.8189820130118638, | |
| "grad_norm": 0.07176466286182404, | |
| "learning_rate": 0.0001530817778971398, | |
| "loss": 1.000410270690918, | |
| "mean_token_accuracy": 0.773740467429161, | |
| "num_tokens": 8620999.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 1.0444137938320637, | |
| "epoch": 0.8228090317642557, | |
| "grad_norm": 0.06355756521224976, | |
| "learning_rate": 0.00015281321337451325, | |
| "loss": 1.0860448837280274, | |
| "mean_token_accuracy": 0.751850588619709, | |
| "num_tokens": 8663354.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 0.9044980220496655, | |
| "epoch": 0.8266360505166476, | |
| "grad_norm": 0.080223448574543, | |
| "learning_rate": 0.00015254464885188667, | |
| "loss": 0.9434403419494629, | |
| "mean_token_accuracy": 0.7828752338886261, | |
| "num_tokens": 8699748.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 1.0172922544181346, | |
| "epoch": 0.8304630692690395, | |
| "grad_norm": 0.06971501559019089, | |
| "learning_rate": 0.00015227608432926012, | |
| "loss": 1.0325962066650392, | |
| "mean_token_accuracy": 0.7651202365756035, | |
| "num_tokens": 8739901.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 0.9639742732048034, | |
| "epoch": 0.8342900880214313, | |
| "grad_norm": 0.06396778672933578, | |
| "learning_rate": 0.00015200751980663354, | |
| "loss": 1.0435317039489747, | |
| "mean_token_accuracy": 0.7667818054556846, | |
| "num_tokens": 8778980.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 0.8876220636069775, | |
| "epoch": 0.8381171067738232, | |
| "grad_norm": 0.09910868853330612, | |
| "learning_rate": 0.00015173895528400698, | |
| "loss": 0.9876300811767578, | |
| "mean_token_accuracy": 0.7865215808153152, | |
| "num_tokens": 8815525.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 1.0369405087083579, | |
| "epoch": 0.8419441255262151, | |
| "grad_norm": 0.08775259554386139, | |
| "learning_rate": 0.00015147039076138043, | |
| "loss": 1.1244413375854492, | |
| "mean_token_accuracy": 0.7550949841737747, | |
| "num_tokens": 8857085.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 0.9762422502040863, | |
| "epoch": 0.845771144278607, | |
| "grad_norm": 0.08659302443265915, | |
| "learning_rate": 0.00015120182623875388, | |
| "loss": 1.0164811134338378, | |
| "mean_token_accuracy": 0.771617329120636, | |
| "num_tokens": 8894271.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 0.9543228000402451, | |
| "epoch": 0.8495981630309989, | |
| "grad_norm": 0.09588434547185898, | |
| "learning_rate": 0.00015093326171612733, | |
| "loss": 1.0303520202636718, | |
| "mean_token_accuracy": 0.768992331624031, | |
| "num_tokens": 8934095.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 1.1307236567139625, | |
| "epoch": 0.8534251817833908, | |
| "grad_norm": 0.07016360014677048, | |
| "learning_rate": 0.00015066469719350075, | |
| "loss": 1.1526556968688966, | |
| "mean_token_accuracy": 0.7296861469745636, | |
| "num_tokens": 8982341.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 1.0867296956479549, | |
| "epoch": 0.8572522005357827, | |
| "grad_norm": 0.07838597148656845, | |
| "learning_rate": 0.00015039613267087417, | |
| "loss": 1.1031158447265625, | |
| "mean_token_accuracy": 0.7445572927594185, | |
| "num_tokens": 9027401.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 0.9492381684482097, | |
| "epoch": 0.8610792192881745, | |
| "grad_norm": 0.08416638523340225, | |
| "learning_rate": 0.0001501275681482476, | |
| "loss": 1.0079804420471192, | |
| "mean_token_accuracy": 0.7709973976016045, | |
| "num_tokens": 9069985.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 0.9767517909407616, | |
| "epoch": 0.8649062380405664, | |
| "grad_norm": 0.09798935055732727, | |
| "learning_rate": 0.00014985900362562106, | |
| "loss": 1.0394697189331055, | |
| "mean_token_accuracy": 0.7647709026932716, | |
| "num_tokens": 9108246.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 0.9779160171747208, | |
| "epoch": 0.8687332567929583, | |
| "grad_norm": 0.08669373393058777, | |
| "learning_rate": 0.0001495904391029945, | |
| "loss": 1.0398100852966308, | |
| "mean_token_accuracy": 0.7669417649507523, | |
| "num_tokens": 9147055.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 1.014696953445673, | |
| "epoch": 0.8725602755453502, | |
| "grad_norm": 0.07674991339445114, | |
| "learning_rate": 0.00014932187458036795, | |
| "loss": 1.0742408752441406, | |
| "mean_token_accuracy": 0.7583330690860748, | |
| "num_tokens": 9187727.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 0.9619584158062935, | |
| "epoch": 0.8763872942977421, | |
| "grad_norm": 0.09512930363416672, | |
| "learning_rate": 0.00014905331005774137, | |
| "loss": 1.01895112991333, | |
| "mean_token_accuracy": 0.7718996241688728, | |
| "num_tokens": 9228518.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 0.8759313493967056, | |
| "epoch": 0.880214313050134, | |
| "grad_norm": 0.06927543133497238, | |
| "learning_rate": 0.00014878474553511482, | |
| "loss": 0.9590776443481446, | |
| "mean_token_accuracy": 0.783099564909935, | |
| "num_tokens": 9269392.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 1.0930156745016575, | |
| "epoch": 0.8840413318025259, | |
| "grad_norm": 0.07149595022201538, | |
| "learning_rate": 0.00014851618101248824, | |
| "loss": 1.132398796081543, | |
| "mean_token_accuracy": 0.7445679202675819, | |
| "num_tokens": 9310993.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 0.9991384916007519, | |
| "epoch": 0.8878683505549178, | |
| "grad_norm": 0.100126251578331, | |
| "learning_rate": 0.0001482476164898617, | |
| "loss": 1.0395862579345703, | |
| "mean_token_accuracy": 0.7618231356143952, | |
| "num_tokens": 9349210.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 0.9891969002783298, | |
| "epoch": 0.8916953693073096, | |
| "grad_norm": 0.07942050695419312, | |
| "learning_rate": 0.00014797905196723514, | |
| "loss": 1.0403067588806152, | |
| "mean_token_accuracy": 0.7636258214712143, | |
| "num_tokens": 9386251.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "entropy": 1.034816125780344, | |
| "epoch": 0.8955223880597015, | |
| "grad_norm": 0.07803855836391449, | |
| "learning_rate": 0.00014771048744460858, | |
| "loss": 1.088371467590332, | |
| "mean_token_accuracy": 0.7585563778877258, | |
| "num_tokens": 9425492.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 0.998091223090887, | |
| "epoch": 0.8993494068120934, | |
| "grad_norm": 0.06696243584156036, | |
| "learning_rate": 0.00014744192292198203, | |
| "loss": 1.0410521507263184, | |
| "mean_token_accuracy": 0.7595112159848213, | |
| "num_tokens": 9466862.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 0.9615898832678795, | |
| "epoch": 0.9031764255644853, | |
| "grad_norm": 0.07813845574855804, | |
| "learning_rate": 0.00014717335839935545, | |
| "loss": 1.0265610694885254, | |
| "mean_token_accuracy": 0.7707905381917953, | |
| "num_tokens": 9503827.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 0.8776158876717091, | |
| "epoch": 0.9070034443168772, | |
| "grad_norm": 0.10287057608366013, | |
| "learning_rate": 0.0001469047938767289, | |
| "loss": 0.9231206893920898, | |
| "mean_token_accuracy": 0.7909859612584114, | |
| "num_tokens": 9536194.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "entropy": 0.980732673406601, | |
| "epoch": 0.9108304630692691, | |
| "grad_norm": 0.06174289435148239, | |
| "learning_rate": 0.00014663622935410232, | |
| "loss": 1.0316704750061034, | |
| "mean_token_accuracy": 0.7596900418400765, | |
| "num_tokens": 9577621.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 1.0083129487931728, | |
| "epoch": 0.914657481821661, | |
| "grad_norm": 0.08805451542139053, | |
| "learning_rate": 0.00014636766483147576, | |
| "loss": 1.0296180725097657, | |
| "mean_token_accuracy": 0.7577597886323929, | |
| "num_tokens": 9616522.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "entropy": 1.0002505116164684, | |
| "epoch": 0.9184845005740528, | |
| "grad_norm": 0.07697928696870804, | |
| "learning_rate": 0.0001460991003088492, | |
| "loss": 1.0411831855773925, | |
| "mean_token_accuracy": 0.7589930936694145, | |
| "num_tokens": 9659217.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 0.971958789229393, | |
| "epoch": 0.9223115193264447, | |
| "grad_norm": 0.08504882454872131, | |
| "learning_rate": 0.00014583053578622266, | |
| "loss": 1.015835952758789, | |
| "mean_token_accuracy": 0.7664303690195083, | |
| "num_tokens": 9694120.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "entropy": 0.9250703640282154, | |
| "epoch": 0.9261385380788366, | |
| "grad_norm": 0.06279303133487701, | |
| "learning_rate": 0.00014556197126359608, | |
| "loss": 0.9673631668090821, | |
| "mean_token_accuracy": 0.782692727446556, | |
| "num_tokens": 9732460.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 1.0777716524899006, | |
| "epoch": 0.9299655568312285, | |
| "grad_norm": 0.06884833425283432, | |
| "learning_rate": 0.00014529340674096952, | |
| "loss": 1.1415311813354492, | |
| "mean_token_accuracy": 0.7447684407234192, | |
| "num_tokens": 9773760.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "entropy": 1.0116477236151695, | |
| "epoch": 0.9337925755836204, | |
| "grad_norm": 0.06346814334392548, | |
| "learning_rate": 0.00014502484221834297, | |
| "loss": 1.0904932975769044, | |
| "mean_token_accuracy": 0.7616935014724732, | |
| "num_tokens": 9808910.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 0.9434679664671421, | |
| "epoch": 0.9376195943360123, | |
| "grad_norm": 0.09843038022518158, | |
| "learning_rate": 0.0001447562776957164, | |
| "loss": 1.0111047744750976, | |
| "mean_token_accuracy": 0.774254959821701, | |
| "num_tokens": 9846472.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 1.035598163306713, | |
| "epoch": 0.9414466130884042, | |
| "grad_norm": 0.08025770634412766, | |
| "learning_rate": 0.00014448771317308984, | |
| "loss": 1.1550275802612304, | |
| "mean_token_accuracy": 0.7497850373387337, | |
| "num_tokens": 9885082.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 1.057615876197815, | |
| "epoch": 0.945273631840796, | |
| "grad_norm": 0.07916443794965744, | |
| "learning_rate": 0.00014421914865046329, | |
| "loss": 1.114585781097412, | |
| "mean_token_accuracy": 0.7495191320776939, | |
| "num_tokens": 9924849.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "entropy": 0.9576205931603908, | |
| "epoch": 0.9491006505931879, | |
| "grad_norm": 0.10745597630739212, | |
| "learning_rate": 0.00014395058412783673, | |
| "loss": 1.0471231460571289, | |
| "mean_token_accuracy": 0.7697127804160118, | |
| "num_tokens": 9969210.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 1.012363300472498, | |
| "epoch": 0.9529276693455798, | |
| "grad_norm": 0.09448845684528351, | |
| "learning_rate": 0.00014368201960521015, | |
| "loss": 1.0322566986083985, | |
| "mean_token_accuracy": 0.7568502962589264, | |
| "num_tokens": 10009532.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "entropy": 0.9387446999549866, | |
| "epoch": 0.9567546880979717, | |
| "grad_norm": 0.08835543692111969, | |
| "learning_rate": 0.0001434134550825836, | |
| "loss": 0.9836790084838867, | |
| "mean_token_accuracy": 0.7740270137786865, | |
| "num_tokens": 10051767.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 1.043863268941641, | |
| "epoch": 0.9605817068503636, | |
| "grad_norm": 0.0590866394340992, | |
| "learning_rate": 0.00014314489055995705, | |
| "loss": 1.1286373138427734, | |
| "mean_token_accuracy": 0.755294018983841, | |
| "num_tokens": 10093518.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "entropy": 1.068480123579502, | |
| "epoch": 0.9644087256027555, | |
| "grad_norm": 0.06240773946046829, | |
| "learning_rate": 0.00014287632603733047, | |
| "loss": 1.1243531227111816, | |
| "mean_token_accuracy": 0.7457959160208703, | |
| "num_tokens": 10137842.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 0.9648511357605457, | |
| "epoch": 0.9682357443551474, | |
| "grad_norm": 0.07577214390039444, | |
| "learning_rate": 0.00014260776151470391, | |
| "loss": 1.0646875381469727, | |
| "mean_token_accuracy": 0.7689151406288147, | |
| "num_tokens": 10177541.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "entropy": 1.0034234993159772, | |
| "epoch": 0.9720627631075393, | |
| "grad_norm": 0.06887607276439667, | |
| "learning_rate": 0.00014233919699207736, | |
| "loss": 1.0736650466918944, | |
| "mean_token_accuracy": 0.7580653995275497, | |
| "num_tokens": 10217056.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 0.9054977536201477, | |
| "epoch": 0.9758897818599311, | |
| "grad_norm": 0.12731540203094482, | |
| "learning_rate": 0.00014207063246945078, | |
| "loss": 0.9581779479980469, | |
| "mean_token_accuracy": 0.7800818130373954, | |
| "num_tokens": 10249622.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 1.0892111197113992, | |
| "epoch": 0.979716800612323, | |
| "grad_norm": 0.08707671612501144, | |
| "learning_rate": 0.00014180206794682423, | |
| "loss": 1.1551457405090333, | |
| "mean_token_accuracy": 0.7434241071343421, | |
| "num_tokens": 10287483.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 0.9462251186370849, | |
| "epoch": 0.9835438193647149, | |
| "grad_norm": 0.10457631945610046, | |
| "learning_rate": 0.00014153350342419768, | |
| "loss": 0.9859563827514648, | |
| "mean_token_accuracy": 0.7729493409395218, | |
| "num_tokens": 10324562.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "entropy": 0.9609014384448529, | |
| "epoch": 0.9873708381171068, | |
| "grad_norm": 0.1095169261097908, | |
| "learning_rate": 0.0001412649389015711, | |
| "loss": 1.00408992767334, | |
| "mean_token_accuracy": 0.769461353123188, | |
| "num_tokens": 10368482.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 0.9500531531870365, | |
| "epoch": 0.9911978568694987, | |
| "grad_norm": 0.12787973880767822, | |
| "learning_rate": 0.00014099637437894454, | |
| "loss": 1.0082733154296875, | |
| "mean_token_accuracy": 0.7726384818553924, | |
| "num_tokens": 10407666.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "entropy": 0.9639500208199024, | |
| "epoch": 0.9950248756218906, | |
| "grad_norm": 0.08555731922388077, | |
| "learning_rate": 0.000140727809856318, | |
| "loss": 0.9910324096679688, | |
| "mean_token_accuracy": 0.7700270056724549, | |
| "num_tokens": 10445419.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 0.9984636768698693, | |
| "epoch": 0.9988518943742825, | |
| "grad_norm": 0.10294629633426666, | |
| "learning_rate": 0.00014045924533369144, | |
| "loss": 1.0837631225585938, | |
| "mean_token_accuracy": 0.7655858203768731, | |
| "num_tokens": 10483287.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "entropy": 0.940229170024395, | |
| "epoch": 1.0026789131266742, | |
| "grad_norm": 0.10580310225486755, | |
| "learning_rate": 0.00014019068081106486, | |
| "loss": 0.9650541305541992, | |
| "mean_token_accuracy": 0.7728109017014504, | |
| "num_tokens": 10523841.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 0.9358184114098549, | |
| "epoch": 1.0065059318790661, | |
| "grad_norm": 0.12460961192846298, | |
| "learning_rate": 0.0001399221162884383, | |
| "loss": 0.9570166587829589, | |
| "mean_token_accuracy": 0.7772100657224655, | |
| "num_tokens": 10561636.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "entropy": 1.010379894077778, | |
| "epoch": 1.010332950631458, | |
| "grad_norm": 0.0781383365392685, | |
| "learning_rate": 0.00013965355176581175, | |
| "loss": 1.0524909019470214, | |
| "mean_token_accuracy": 0.7589353621006012, | |
| "num_tokens": 10605899.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 0.977487600594759, | |
| "epoch": 1.01415996938385, | |
| "grad_norm": 0.0902724489569664, | |
| "learning_rate": 0.00013938498724318517, | |
| "loss": 1.0475889205932618, | |
| "mean_token_accuracy": 0.7629667386412621, | |
| "num_tokens": 10642372.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 0.9681369736790657, | |
| "epoch": 1.0179869881362418, | |
| "grad_norm": 0.06344746798276901, | |
| "learning_rate": 0.00013911642272055862, | |
| "loss": 1.0268775939941406, | |
| "mean_token_accuracy": 0.7677509978413581, | |
| "num_tokens": 10682308.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 0.9013996437191963, | |
| "epoch": 1.0218140068886337, | |
| "grad_norm": 0.09890369325876236, | |
| "learning_rate": 0.00013884785819793206, | |
| "loss": 0.969085693359375, | |
| "mean_token_accuracy": 0.7815661624073982, | |
| "num_tokens": 10720755.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "entropy": 0.9415140472352505, | |
| "epoch": 1.0256410256410255, | |
| "grad_norm": 0.08691754937171936, | |
| "learning_rate": 0.00013857929367530548, | |
| "loss": 0.9783688545227051, | |
| "mean_token_accuracy": 0.7722749456763267, | |
| "num_tokens": 10759842.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 0.9437286920845509, | |
| "epoch": 1.0294680443934174, | |
| "grad_norm": 0.06577731668949127, | |
| "learning_rate": 0.00013831072915267893, | |
| "loss": 0.9904938697814941, | |
| "mean_token_accuracy": 0.7716649904847145, | |
| "num_tokens": 10803740.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "entropy": 0.9657303221523762, | |
| "epoch": 1.0332950631458093, | |
| "grad_norm": 0.07847272604703903, | |
| "learning_rate": 0.00013804216463005238, | |
| "loss": 1.0073646545410155, | |
| "mean_token_accuracy": 0.7678608119487762, | |
| "num_tokens": 10841808.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 0.881027878075838, | |
| "epoch": 1.0371220818982012, | |
| "grad_norm": 0.12755495309829712, | |
| "learning_rate": 0.00013777360010742583, | |
| "loss": 0.955751895904541, | |
| "mean_token_accuracy": 0.7835927039384842, | |
| "num_tokens": 10880108.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "entropy": 0.8458237417042256, | |
| "epoch": 1.040949100650593, | |
| "grad_norm": 0.07641884684562683, | |
| "learning_rate": 0.00013750503558479925, | |
| "loss": 0.9140083312988281, | |
| "mean_token_accuracy": 0.7939343526959419, | |
| "num_tokens": 10916272.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 0.8845301080495119, | |
| "epoch": 1.044776119402985, | |
| "grad_norm": 0.08896184712648392, | |
| "learning_rate": 0.0001372364710621727, | |
| "loss": 0.9332797050476074, | |
| "mean_token_accuracy": 0.7884662911295891, | |
| "num_tokens": 10951932.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "entropy": 0.963884600251913, | |
| "epoch": 1.0486031381553769, | |
| "grad_norm": 0.10196536034345627, | |
| "learning_rate": 0.00013696790653954614, | |
| "loss": 1.0123867988586426, | |
| "mean_token_accuracy": 0.7659088596701622, | |
| "num_tokens": 10991548.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 0.9720129862427711, | |
| "epoch": 1.0524301569077688, | |
| "grad_norm": 0.07552212476730347, | |
| "learning_rate": 0.00013669934201691956, | |
| "loss": 1.015409564971924, | |
| "mean_token_accuracy": 0.7689290955662728, | |
| "num_tokens": 11028749.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 0.9871743015944958, | |
| "epoch": 1.0562571756601606, | |
| "grad_norm": 0.09255808591842651, | |
| "learning_rate": 0.000136430777494293, | |
| "loss": 1.0351217269897461, | |
| "mean_token_accuracy": 0.7620491668581962, | |
| "num_tokens": 11071336.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 0.809666246920824, | |
| "epoch": 1.0600841944125525, | |
| "grad_norm": 0.08891233056783676, | |
| "learning_rate": 0.00013616221297166645, | |
| "loss": 0.8595174789428711, | |
| "mean_token_accuracy": 0.8053640425205231, | |
| "num_tokens": 11107708.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "entropy": 0.9220615286380053, | |
| "epoch": 1.0639112131649444, | |
| "grad_norm": 0.0731620192527771, | |
| "learning_rate": 0.0001358936484490399, | |
| "loss": 0.9694333076477051, | |
| "mean_token_accuracy": 0.7767527863383293, | |
| "num_tokens": 11149005.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 0.8744502332061529, | |
| "epoch": 1.0677382319173363, | |
| "grad_norm": 0.0865791067481041, | |
| "learning_rate": 0.00013562508392641332, | |
| "loss": 0.9401009559631348, | |
| "mean_token_accuracy": 0.7854847684502602, | |
| "num_tokens": 11189214.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "entropy": 0.989877526462078, | |
| "epoch": 1.0715652506697282, | |
| "grad_norm": 0.09394430369138718, | |
| "learning_rate": 0.00013535651940378677, | |
| "loss": 1.0487696647644043, | |
| "mean_token_accuracy": 0.7607394486665726, | |
| "num_tokens": 11225161.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 0.8656694941222668, | |
| "epoch": 1.07539226942212, | |
| "grad_norm": 0.10940351337194443, | |
| "learning_rate": 0.0001350879548811602, | |
| "loss": 0.9236039161682129, | |
| "mean_token_accuracy": 0.7919901207089424, | |
| "num_tokens": 11261274.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "entropy": 1.063130483776331, | |
| "epoch": 1.079219288174512, | |
| "grad_norm": 0.06853083521127701, | |
| "learning_rate": 0.00013481939035853364, | |
| "loss": 1.0725152015686035, | |
| "mean_token_accuracy": 0.7454188778996468, | |
| "num_tokens": 11302522.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 0.92764787748456, | |
| "epoch": 1.0830463069269038, | |
| "grad_norm": 0.10344231128692627, | |
| "learning_rate": 0.00013455082583590708, | |
| "loss": 0.9725144386291504, | |
| "mean_token_accuracy": 0.7810687303543091, | |
| "num_tokens": 11339898.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "entropy": 0.9415482886135578, | |
| "epoch": 1.0868733256792957, | |
| "grad_norm": 0.12117484956979752, | |
| "learning_rate": 0.00013428226131328053, | |
| "loss": 1.0216625213623047, | |
| "mean_token_accuracy": 0.7713929772377014, | |
| "num_tokens": 11380187.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 0.9300718136131764, | |
| "epoch": 1.0907003444316876, | |
| "grad_norm": 0.09950343519449234, | |
| "learning_rate": 0.00013401369679065398, | |
| "loss": 0.9862215042114257, | |
| "mean_token_accuracy": 0.7748491272330285, | |
| "num_tokens": 11417351.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 0.9016943011432886, | |
| "epoch": 1.0945273631840795, | |
| "grad_norm": 0.10104110836982727, | |
| "learning_rate": 0.0001337451322680274, | |
| "loss": 0.9565576553344727, | |
| "mean_token_accuracy": 0.7823473244905472, | |
| "num_tokens": 11455566.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 1.0184541821479798, | |
| "epoch": 1.0983543819364714, | |
| "grad_norm": 0.07055146247148514, | |
| "learning_rate": 0.00013347656774540084, | |
| "loss": 1.0644380569458007, | |
| "mean_token_accuracy": 0.7551941126585007, | |
| "num_tokens": 11499960.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "entropy": 0.9143499568104744, | |
| "epoch": 1.1021814006888633, | |
| "grad_norm": 0.09798481315374374, | |
| "learning_rate": 0.00013320800322277426, | |
| "loss": 0.9477805137634278, | |
| "mean_token_accuracy": 0.778240317106247, | |
| "num_tokens": 11536434.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 0.8803758375346661, | |
| "epoch": 1.1060084194412552, | |
| "grad_norm": 0.09720771014690399, | |
| "learning_rate": 0.0001329394387001477, | |
| "loss": 0.9369168281555176, | |
| "mean_token_accuracy": 0.786097663640976, | |
| "num_tokens": 11572420.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "entropy": 0.9127089619636536, | |
| "epoch": 1.109835438193647, | |
| "grad_norm": 0.07493265718221664, | |
| "learning_rate": 0.00013267087417752116, | |
| "loss": 0.9610566139221192, | |
| "mean_token_accuracy": 0.7780416712164879, | |
| "num_tokens": 11607494.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 0.9359945230185985, | |
| "epoch": 1.113662456946039, | |
| "grad_norm": 0.09086300432682037, | |
| "learning_rate": 0.0001324023096548946, | |
| "loss": 0.9519670486450196, | |
| "mean_token_accuracy": 0.7745376393198967, | |
| "num_tokens": 11647057.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "entropy": 0.9206651791930198, | |
| "epoch": 1.1174894756984308, | |
| "grad_norm": 0.10007902979850769, | |
| "learning_rate": 0.00013213374513226805, | |
| "loss": 0.9783179283142089, | |
| "mean_token_accuracy": 0.778519794344902, | |
| "num_tokens": 11685762.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 0.9937357418239117, | |
| "epoch": 1.1213164944508227, | |
| "grad_norm": 0.0993100181221962, | |
| "learning_rate": 0.00013186518060964147, | |
| "loss": 1.0440019607543944, | |
| "mean_token_accuracy": 0.7590440228581429, | |
| "num_tokens": 11727379.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "entropy": 1.048055526614189, | |
| "epoch": 1.1251435132032146, | |
| "grad_norm": 0.11140380054712296, | |
| "learning_rate": 0.0001315966160870149, | |
| "loss": 1.1046284675598144, | |
| "mean_token_accuracy": 0.7413847833871842, | |
| "num_tokens": 11770734.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 0.9562077779322863, | |
| "epoch": 1.1289705319556065, | |
| "grad_norm": 0.11506770551204681, | |
| "learning_rate": 0.00013132805156438834, | |
| "loss": 0.9946146011352539, | |
| "mean_token_accuracy": 0.7750585973262787, | |
| "num_tokens": 11806270.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 0.9747304327785968, | |
| "epoch": 1.1327975507079984, | |
| "grad_norm": 0.1126897856593132, | |
| "learning_rate": 0.00013105948704176179, | |
| "loss": 1.061129093170166, | |
| "mean_token_accuracy": 0.7613553464412689, | |
| "num_tokens": 11852779.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 1.0132145062088966, | |
| "epoch": 1.1366245694603903, | |
| "grad_norm": 0.08260762691497803, | |
| "learning_rate": 0.00013079092251913523, | |
| "loss": 1.0199948310852052, | |
| "mean_token_accuracy": 0.7617463275790215, | |
| "num_tokens": 11897084.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "entropy": 0.9878915682435035, | |
| "epoch": 1.1404515882127821, | |
| "grad_norm": 0.08098926395177841, | |
| "learning_rate": 0.00013052235799650868, | |
| "loss": 1.0480783462524415, | |
| "mean_token_accuracy": 0.763205036520958, | |
| "num_tokens": 11938987.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 1.0176467482000588, | |
| "epoch": 1.144278606965174, | |
| "grad_norm": 0.0966029092669487, | |
| "learning_rate": 0.0001302537934738821, | |
| "loss": 1.093599796295166, | |
| "mean_token_accuracy": 0.7526282608509064, | |
| "num_tokens": 11981156.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "entropy": 1.0054687768220902, | |
| "epoch": 1.148105625717566, | |
| "grad_norm": 0.09327300637960434, | |
| "learning_rate": 0.00012998522895125555, | |
| "loss": 1.039564609527588, | |
| "mean_token_accuracy": 0.7592228040099144, | |
| "num_tokens": 12025389.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 0.9626951858401298, | |
| "epoch": 1.1519326444699578, | |
| "grad_norm": 0.06154703348875046, | |
| "learning_rate": 0.00012971666442862897, | |
| "loss": 0.9993762016296387, | |
| "mean_token_accuracy": 0.769777101278305, | |
| "num_tokens": 12069545.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "entropy": 0.9221224367618561, | |
| "epoch": 1.1557596632223497, | |
| "grad_norm": 0.1140643060207367, | |
| "learning_rate": 0.00012944809990600241, | |
| "loss": 0.9887493133544922, | |
| "mean_token_accuracy": 0.7754134178161621, | |
| "num_tokens": 12113892.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 1.011741641908884, | |
| "epoch": 1.1595866819747416, | |
| "grad_norm": 0.08721659332513809, | |
| "learning_rate": 0.00012917953538337586, | |
| "loss": 1.068478488922119, | |
| "mean_token_accuracy": 0.7615607067942619, | |
| "num_tokens": 12153746.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "entropy": 0.9926261432468891, | |
| "epoch": 1.1634137007271335, | |
| "grad_norm": 0.07577186822891235, | |
| "learning_rate": 0.0001289109708607493, | |
| "loss": 1.047102451324463, | |
| "mean_token_accuracy": 0.7669480383396149, | |
| "num_tokens": 12199067.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 0.945004402846098, | |
| "epoch": 1.1672407194795253, | |
| "grad_norm": 0.08443465083837509, | |
| "learning_rate": 0.00012864240633812276, | |
| "loss": 0.9891506195068359, | |
| "mean_token_accuracy": 0.7756656989455223, | |
| "num_tokens": 12243766.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 0.9602406993508339, | |
| "epoch": 1.1710677382319172, | |
| "grad_norm": 0.07647141069173813, | |
| "learning_rate": 0.00012837384181549618, | |
| "loss": 1.0091946601867676, | |
| "mean_token_accuracy": 0.7702717915177345, | |
| "num_tokens": 12279555.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 0.9430582121014595, | |
| "epoch": 1.1748947569843091, | |
| "grad_norm": 0.10050038248300552, | |
| "learning_rate": 0.0001281052772928696, | |
| "loss": 1.0251899719238282, | |
| "mean_token_accuracy": 0.7759435445070266, | |
| "num_tokens": 12316974.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "entropy": 1.0339640237390995, | |
| "epoch": 1.178721775736701, | |
| "grad_norm": 0.09026551991701126, | |
| "learning_rate": 0.00012783671277024304, | |
| "loss": 1.0652464866638183, | |
| "mean_token_accuracy": 0.7533303231000901, | |
| "num_tokens": 12358111.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 0.9808862328529357, | |
| "epoch": 1.182548794489093, | |
| "grad_norm": 0.08769362419843674, | |
| "learning_rate": 0.0001275681482476165, | |
| "loss": 1.0068347930908204, | |
| "mean_token_accuracy": 0.7660810023546218, | |
| "num_tokens": 12401669.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "entropy": 0.9436531282961369, | |
| "epoch": 1.1863758132414848, | |
| "grad_norm": 0.09366963803768158, | |
| "learning_rate": 0.00012729958372498994, | |
| "loss": 1.0298351287841796, | |
| "mean_token_accuracy": 0.7704201564192772, | |
| "num_tokens": 12442005.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 0.8712134130299092, | |
| "epoch": 1.1902028319938767, | |
| "grad_norm": 0.14041900634765625, | |
| "learning_rate": 0.00012703101920236338, | |
| "loss": 0.9094470977783203, | |
| "mean_token_accuracy": 0.7861496224999428, | |
| "num_tokens": 12484476.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "entropy": 0.9474696554243565, | |
| "epoch": 1.1940298507462686, | |
| "grad_norm": 0.10449594259262085, | |
| "learning_rate": 0.00012676245467973683, | |
| "loss": 0.9729720115661621, | |
| "mean_token_accuracy": 0.7746587276458741, | |
| "num_tokens": 12521351.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 0.9215874671936035, | |
| "epoch": 1.1978568694986604, | |
| "grad_norm": 0.07733117789030075, | |
| "learning_rate": 0.00012649389015711025, | |
| "loss": 0.992548942565918, | |
| "mean_token_accuracy": 0.7789316549897194, | |
| "num_tokens": 12564603.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "entropy": 0.9349980562925339, | |
| "epoch": 1.2016838882510523, | |
| "grad_norm": 0.06924714148044586, | |
| "learning_rate": 0.00012622532563448367, | |
| "loss": 1.010727596282959, | |
| "mean_token_accuracy": 0.7728876963257789, | |
| "num_tokens": 12606025.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 0.9719727545976639, | |
| "epoch": 1.2055109070034442, | |
| "grad_norm": 0.07646770775318146, | |
| "learning_rate": 0.00012595676111185712, | |
| "loss": 1.0482423782348633, | |
| "mean_token_accuracy": 0.7659243881702423, | |
| "num_tokens": 12647703.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 1.0236301876604557, | |
| "epoch": 1.209337925755836, | |
| "grad_norm": 0.08547945320606232, | |
| "learning_rate": 0.00012568819658923056, | |
| "loss": 1.0771334648132325, | |
| "mean_token_accuracy": 0.7551302567124367, | |
| "num_tokens": 12692347.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 0.9277745552361012, | |
| "epoch": 1.213164944508228, | |
| "grad_norm": 0.10816850513219833, | |
| "learning_rate": 0.000125419632066604, | |
| "loss": 0.9680308341979981, | |
| "mean_token_accuracy": 0.7722468450665474, | |
| "num_tokens": 12729671.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "entropy": 0.9760092988610267, | |
| "epoch": 1.2169919632606199, | |
| "grad_norm": 0.08950033783912659, | |
| "learning_rate": 0.00012515106754397746, | |
| "loss": 1.000643539428711, | |
| "mean_token_accuracy": 0.7665232941508293, | |
| "num_tokens": 12768100.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 0.9292771026492119, | |
| "epoch": 1.2208189820130118, | |
| "grad_norm": 0.08686704933643341, | |
| "learning_rate": 0.0001248825030213509, | |
| "loss": 1.019674015045166, | |
| "mean_token_accuracy": 0.7758068069815636, | |
| "num_tokens": 12801323.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "entropy": 0.8500060614198446, | |
| "epoch": 1.2246460007654036, | |
| "grad_norm": 0.07462778687477112, | |
| "learning_rate": 0.00012461393849872433, | |
| "loss": 0.9042973518371582, | |
| "mean_token_accuracy": 0.7897424980998039, | |
| "num_tokens": 12839880.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 0.9205234386026859, | |
| "epoch": 1.2284730195177955, | |
| "grad_norm": 0.07027672231197357, | |
| "learning_rate": 0.00012434537397609775, | |
| "loss": 0.9424190521240234, | |
| "mean_token_accuracy": 0.7767854332923889, | |
| "num_tokens": 12878349.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "entropy": 0.9074239492416382, | |
| "epoch": 1.2323000382701874, | |
| "grad_norm": 0.09741132706403732, | |
| "learning_rate": 0.0001240768094534712, | |
| "loss": 0.9651589393615723, | |
| "mean_token_accuracy": 0.7790584430098534, | |
| "num_tokens": 12917588.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 0.8874296098947525, | |
| "epoch": 1.2361270570225793, | |
| "grad_norm": 0.08608463406562805, | |
| "learning_rate": 0.00012380824493084464, | |
| "loss": 0.9437139511108399, | |
| "mean_token_accuracy": 0.7854243695735932, | |
| "num_tokens": 12956199.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "entropy": 0.9470510125160218, | |
| "epoch": 1.2399540757749712, | |
| "grad_norm": 0.09247037768363953, | |
| "learning_rate": 0.0001235396804082181, | |
| "loss": 1.032781982421875, | |
| "mean_token_accuracy": 0.7712572082877159, | |
| "num_tokens": 13000822.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 0.8850176699459553, | |
| "epoch": 1.243781094527363, | |
| "grad_norm": 0.08397585898637772, | |
| "learning_rate": 0.00012327111588559153, | |
| "loss": 0.9292671203613281, | |
| "mean_token_accuracy": 0.787578609585762, | |
| "num_tokens": 13043532.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 0.8605544999241829, | |
| "epoch": 1.247608113279755, | |
| "grad_norm": 0.0952179804444313, | |
| "learning_rate": 0.00012300255136296498, | |
| "loss": 0.8990240097045898, | |
| "mean_token_accuracy": 0.7919793605804444, | |
| "num_tokens": 13081376.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 1.003395075351, | |
| "epoch": 1.2514351320321468, | |
| "grad_norm": 0.08914512395858765, | |
| "learning_rate": 0.0001227339868403384, | |
| "loss": 1.1446642875671387, | |
| "mean_token_accuracy": 0.7565032340586185, | |
| "num_tokens": 13119474.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "entropy": 0.9566417217254639, | |
| "epoch": 1.2552621507845387, | |
| "grad_norm": 0.13220350444316864, | |
| "learning_rate": 0.00012246542231771182, | |
| "loss": 0.9976698875427246, | |
| "mean_token_accuracy": 0.7722181305289268, | |
| "num_tokens": 13162637.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 0.888442064449191, | |
| "epoch": 1.2590891695369306, | |
| "grad_norm": 0.10493922978639603, | |
| "learning_rate": 0.00012219685779508527, | |
| "loss": 0.916744613647461, | |
| "mean_token_accuracy": 0.7896391779184342, | |
| "num_tokens": 13199412.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "entropy": 0.9262259535491466, | |
| "epoch": 1.2629161882893225, | |
| "grad_norm": 0.09022962301969528, | |
| "learning_rate": 0.00012192829327245872, | |
| "loss": 0.9885137557983399, | |
| "mean_token_accuracy": 0.778158649802208, | |
| "num_tokens": 13240292.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 0.9356066003441811, | |
| "epoch": 1.2667432070417144, | |
| "grad_norm": 0.09693239629268646, | |
| "learning_rate": 0.00012165972874983216, | |
| "loss": 0.9731400489807129, | |
| "mean_token_accuracy": 0.7748182758688926, | |
| "num_tokens": 13275876.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "entropy": 0.868951104208827, | |
| "epoch": 1.2705702257941063, | |
| "grad_norm": 0.09237370640039444, | |
| "learning_rate": 0.0001213911642272056, | |
| "loss": 0.9127277374267578, | |
| "mean_token_accuracy": 0.7890144631266593, | |
| "num_tokens": 13314857.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 0.9311054348945618, | |
| "epoch": 1.2743972445464982, | |
| "grad_norm": 0.08701436221599579, | |
| "learning_rate": 0.00012112259970457902, | |
| "loss": 0.9666108131408692, | |
| "mean_token_accuracy": 0.7752738267183303, | |
| "num_tokens": 13357039.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "entropy": 0.9256260149180889, | |
| "epoch": 1.27822426329889, | |
| "grad_norm": 0.08751461654901505, | |
| "learning_rate": 0.00012085403518195246, | |
| "loss": 0.9926286697387695, | |
| "mean_token_accuracy": 0.7750931903719902, | |
| "num_tokens": 13397058.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 1.0074332721531392, | |
| "epoch": 1.282051282051282, | |
| "grad_norm": 0.07409587502479553, | |
| "learning_rate": 0.00012058547065932591, | |
| "loss": 1.062586498260498, | |
| "mean_token_accuracy": 0.7546869352459907, | |
| "num_tokens": 13441381.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 0.9596263833343983, | |
| "epoch": 1.2858783008036738, | |
| "grad_norm": 0.09343665838241577, | |
| "learning_rate": 0.00012031690613669934, | |
| "loss": 1.0023324012756347, | |
| "mean_token_accuracy": 0.7719831839203835, | |
| "num_tokens": 13481914.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 0.9313522674143314, | |
| "epoch": 1.2897053195560657, | |
| "grad_norm": 0.0879049226641655, | |
| "learning_rate": 0.00012004834161407279, | |
| "loss": 0.9833806991577149, | |
| "mean_token_accuracy": 0.7737741976976394, | |
| "num_tokens": 13519831.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "entropy": 0.8369917057454586, | |
| "epoch": 1.2935323383084576, | |
| "grad_norm": 0.14339204132556915, | |
| "learning_rate": 0.00011977977709144624, | |
| "loss": 0.9147489547729493, | |
| "mean_token_accuracy": 0.7984762340784073, | |
| "num_tokens": 13559768.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 0.9055653363466263, | |
| "epoch": 1.2973593570608495, | |
| "grad_norm": 0.1441742479801178, | |
| "learning_rate": 0.00011951121256881967, | |
| "loss": 0.9521515846252442, | |
| "mean_token_accuracy": 0.7834478095173836, | |
| "num_tokens": 13595966.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "entropy": 0.9677796266973019, | |
| "epoch": 1.3011863758132414, | |
| "grad_norm": 0.11233013868331909, | |
| "learning_rate": 0.00011924264804619309, | |
| "loss": 1.0522055625915527, | |
| "mean_token_accuracy": 0.7664702609181404, | |
| "num_tokens": 13638463.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 0.9398517791181803, | |
| "epoch": 1.3050133945656333, | |
| "grad_norm": 0.088468998670578, | |
| "learning_rate": 0.00011897408352356654, | |
| "loss": 0.9618704795837403, | |
| "mean_token_accuracy": 0.7755557060241699, | |
| "num_tokens": 13677769.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "entropy": 0.8900398269295693, | |
| "epoch": 1.3088404133180251, | |
| "grad_norm": 0.09742283076047897, | |
| "learning_rate": 0.00011870551900093999, | |
| "loss": 0.9422917366027832, | |
| "mean_token_accuracy": 0.7865706130862236, | |
| "num_tokens": 13713374.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 0.9008657015860081, | |
| "epoch": 1.312667432070417, | |
| "grad_norm": 0.09111864864826202, | |
| "learning_rate": 0.00011843695447831342, | |
| "loss": 0.9726786613464355, | |
| "mean_token_accuracy": 0.7835188135504723, | |
| "num_tokens": 13753165.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "entropy": 0.954158465564251, | |
| "epoch": 1.316494450822809, | |
| "grad_norm": 0.0949985608458519, | |
| "learning_rate": 0.00011816838995568687, | |
| "loss": 1.0072153091430665, | |
| "mean_token_accuracy": 0.7668681025505066, | |
| "num_tokens": 13790265.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 0.9259054005146027, | |
| "epoch": 1.3203214695752008, | |
| "grad_norm": 0.09144506603479385, | |
| "learning_rate": 0.00011789982543306031, | |
| "loss": 1.0319811820983886, | |
| "mean_token_accuracy": 0.77575224339962, | |
| "num_tokens": 13830720.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 0.9554400585591794, | |
| "epoch": 1.3241484883275927, | |
| "grad_norm": 0.05986972153186798, | |
| "learning_rate": 0.00011763126091043373, | |
| "loss": 0.9840157508850098, | |
| "mean_token_accuracy": 0.7714304268360138, | |
| "num_tokens": 13874024.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 0.9618137650191784, | |
| "epoch": 1.3279755070799846, | |
| "grad_norm": 0.08746087551116943, | |
| "learning_rate": 0.00011736269638780717, | |
| "loss": 1.0280908584594726, | |
| "mean_token_accuracy": 0.7679046332836151, | |
| "num_tokens": 13916099.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "entropy": 1.02601458132267, | |
| "epoch": 1.3318025258323765, | |
| "grad_norm": 0.09883694350719452, | |
| "learning_rate": 0.00011709413186518061, | |
| "loss": 1.0893220901489258, | |
| "mean_token_accuracy": 0.7487106472253799, | |
| "num_tokens": 13955163.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 1.025067638605833, | |
| "epoch": 1.3356295445847683, | |
| "grad_norm": 0.07656730711460114, | |
| "learning_rate": 0.00011682556734255406, | |
| "loss": 1.0527194023132325, | |
| "mean_token_accuracy": 0.7569629296660423, | |
| "num_tokens": 13996990.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "entropy": 0.8709930831566453, | |
| "epoch": 1.3394565633371602, | |
| "grad_norm": 0.1119026467204094, | |
| "learning_rate": 0.0001165570028199275, | |
| "loss": 0.9183405876159668, | |
| "mean_token_accuracy": 0.784464044868946, | |
| "num_tokens": 14040315.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 0.9783565014600754, | |
| "epoch": 1.3432835820895521, | |
| "grad_norm": 0.09997576475143433, | |
| "learning_rate": 0.00011628843829730094, | |
| "loss": 1.0318940162658692, | |
| "mean_token_accuracy": 0.7614112690091133, | |
| "num_tokens": 14083204.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "entropy": 0.9975252889096737, | |
| "epoch": 1.347110600841944, | |
| "grad_norm": 0.10046812891960144, | |
| "learning_rate": 0.00011601987377467437, | |
| "loss": 1.0214290618896484, | |
| "mean_token_accuracy": 0.7584437146782875, | |
| "num_tokens": 14127039.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 0.8959422588348389, | |
| "epoch": 1.350937619594336, | |
| "grad_norm": 0.09512703120708466, | |
| "learning_rate": 0.0001157513092520478, | |
| "loss": 0.9528075218200683, | |
| "mean_token_accuracy": 0.7823959946632385, | |
| "num_tokens": 14163989.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "entropy": 0.8903120748698712, | |
| "epoch": 1.3547646383467278, | |
| "grad_norm": 0.10500185191631317, | |
| "learning_rate": 0.00011548274472942124, | |
| "loss": 0.9784683227539063, | |
| "mean_token_accuracy": 0.7854589730501175, | |
| "num_tokens": 14198562.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 0.8580869071185588, | |
| "epoch": 1.3585916570991197, | |
| "grad_norm": 0.08716659992933273, | |
| "learning_rate": 0.00011521418020679469, | |
| "loss": 0.9078399658203125, | |
| "mean_token_accuracy": 0.7894850671291351, | |
| "num_tokens": 14236952.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 0.9841447554528713, | |
| "epoch": 1.3624186758515116, | |
| "grad_norm": 0.08638570457696915, | |
| "learning_rate": 0.00011494561568416812, | |
| "loss": 1.0438207626342773, | |
| "mean_token_accuracy": 0.7629329964518548, | |
| "num_tokens": 14278208.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 0.9100395441055298, | |
| "epoch": 1.3662456946039034, | |
| "grad_norm": 0.09058145433664322, | |
| "learning_rate": 0.00011467705116154157, | |
| "loss": 0.9560261726379394, | |
| "mean_token_accuracy": 0.7807327762246132, | |
| "num_tokens": 14314076.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "entropy": 0.8529263667762279, | |
| "epoch": 1.3700727133562953, | |
| "grad_norm": 0.08847236633300781, | |
| "learning_rate": 0.00011440848663891502, | |
| "loss": 0.9192025184631347, | |
| "mean_token_accuracy": 0.7945622354745865, | |
| "num_tokens": 14349740.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 0.8977530397474766, | |
| "epoch": 1.3738997321086872, | |
| "grad_norm": 0.09535886347293854, | |
| "learning_rate": 0.00011413992211628844, | |
| "loss": 0.9331538200378418, | |
| "mean_token_accuracy": 0.7803975984454155, | |
| "num_tokens": 14392492.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "entropy": 1.0430821359157563, | |
| "epoch": 1.377726750861079, | |
| "grad_norm": 0.08564139902591705, | |
| "learning_rate": 0.00011387135759366187, | |
| "loss": 1.0767670631408692, | |
| "mean_token_accuracy": 0.7479040876030922, | |
| "num_tokens": 14436961.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 0.8358541168272495, | |
| "epoch": 1.381553769613471, | |
| "grad_norm": 0.09847365319728851, | |
| "learning_rate": 0.00011360279307103532, | |
| "loss": 0.8758580207824707, | |
| "mean_token_accuracy": 0.7964837267994881, | |
| "num_tokens": 14472251.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "entropy": 0.8302674755454064, | |
| "epoch": 1.3853807883658629, | |
| "grad_norm": 0.08570406585931778, | |
| "learning_rate": 0.00011333422854840876, | |
| "loss": 0.9068514823913574, | |
| "mean_token_accuracy": 0.7943103745579719, | |
| "num_tokens": 14509818.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 0.9825982883572578, | |
| "epoch": 1.3892078071182548, | |
| "grad_norm": 0.10844281315803528, | |
| "learning_rate": 0.0001130656640257822, | |
| "loss": 1.0484787940979003, | |
| "mean_token_accuracy": 0.7600376740097999, | |
| "num_tokens": 14553567.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "entropy": 1.0431513242423534, | |
| "epoch": 1.3930348258706466, | |
| "grad_norm": 0.0750717744231224, | |
| "learning_rate": 0.00011279709950315564, | |
| "loss": 1.0337225914001464, | |
| "mean_token_accuracy": 0.7504511162638664, | |
| "num_tokens": 14598239.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 0.9319969929754734, | |
| "epoch": 1.3968618446230385, | |
| "grad_norm": 0.08307385444641113, | |
| "learning_rate": 0.00011252853498052909, | |
| "loss": 0.9771868705749511, | |
| "mean_token_accuracy": 0.7778135031461716, | |
| "num_tokens": 14638064.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 0.9992426164448261, | |
| "epoch": 1.4006888633754304, | |
| "grad_norm": 0.09222020208835602, | |
| "learning_rate": 0.00011225997045790251, | |
| "loss": 1.0516475677490233, | |
| "mean_token_accuracy": 0.7587143570184708, | |
| "num_tokens": 14682012.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 0.9670721650123596, | |
| "epoch": 1.4045158821278223, | |
| "grad_norm": 0.09432315081357956, | |
| "learning_rate": 0.00011199140593527595, | |
| "loss": 1.0164658546447753, | |
| "mean_token_accuracy": 0.7670722231268883, | |
| "num_tokens": 14722922.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "entropy": 0.9808389253914356, | |
| "epoch": 1.4083429008802142, | |
| "grad_norm": 0.08502112329006195, | |
| "learning_rate": 0.00011172284141264939, | |
| "loss": 1.0553858757019043, | |
| "mean_token_accuracy": 0.76065753698349, | |
| "num_tokens": 14765083.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 1.011240091174841, | |
| "epoch": 1.412169919632606, | |
| "grad_norm": 0.07948844134807587, | |
| "learning_rate": 0.00011145427689002284, | |
| "loss": 1.0446209907531738, | |
| "mean_token_accuracy": 0.75536377876997, | |
| "num_tokens": 14806465.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "entropy": 0.911352240294218, | |
| "epoch": 1.415996938384998, | |
| "grad_norm": 0.08382374793291092, | |
| "learning_rate": 0.00011118571236739627, | |
| "loss": 0.9388965606689453, | |
| "mean_token_accuracy": 0.7807439729571343, | |
| "num_tokens": 14850133.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 0.9055514119565486, | |
| "epoch": 1.4198239571373898, | |
| "grad_norm": 0.10713934898376465, | |
| "learning_rate": 0.00011091714784476972, | |
| "loss": 0.9727254867553711, | |
| "mean_token_accuracy": 0.7801795959472656, | |
| "num_tokens": 14887327.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "entropy": 0.9338000696152449, | |
| "epoch": 1.4236509758897817, | |
| "grad_norm": 0.11418487876653671, | |
| "learning_rate": 0.00011064858332214314, | |
| "loss": 0.9989487648010253, | |
| "mean_token_accuracy": 0.7747065275907516, | |
| "num_tokens": 14927730.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 0.869029226526618, | |
| "epoch": 1.4274779946421736, | |
| "grad_norm": 0.10778038948774338, | |
| "learning_rate": 0.00011038001879951659, | |
| "loss": 0.9393071174621582, | |
| "mean_token_accuracy": 0.7909289851784707, | |
| "num_tokens": 14964847.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "entropy": 0.8993408516049385, | |
| "epoch": 1.4313050133945655, | |
| "grad_norm": 0.08339972048997879, | |
| "learning_rate": 0.00011011145427689002, | |
| "loss": 0.9511364936828614, | |
| "mean_token_accuracy": 0.7844893127679825, | |
| "num_tokens": 15003449.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 0.9478372372686863, | |
| "epoch": 1.4351320321469574, | |
| "grad_norm": 0.07547847181558609, | |
| "learning_rate": 0.00010984288975426347, | |
| "loss": 0.9942925453186036, | |
| "mean_token_accuracy": 0.772410535812378, | |
| "num_tokens": 15046091.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 0.8367562972009182, | |
| "epoch": 1.4389590508993493, | |
| "grad_norm": 0.06902482360601425, | |
| "learning_rate": 0.00010957432523163691, | |
| "loss": 0.8951096534729004, | |
| "mean_token_accuracy": 0.7985799089074135, | |
| "num_tokens": 15091826.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 0.9437298484146595, | |
| "epoch": 1.4427860696517412, | |
| "grad_norm": 0.10231524705886841, | |
| "learning_rate": 0.00010930576070901035, | |
| "loss": 0.9919009208679199, | |
| "mean_token_accuracy": 0.7663119360804558, | |
| "num_tokens": 15133719.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "entropy": 1.0057852260768414, | |
| "epoch": 1.446613088404133, | |
| "grad_norm": 0.09349844604730606, | |
| "learning_rate": 0.0001090371961863838, | |
| "loss": 1.0667811393737794, | |
| "mean_token_accuracy": 0.757930365204811, | |
| "num_tokens": 15173670.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 0.9152357578277588, | |
| "epoch": 1.450440107156525, | |
| "grad_norm": 0.09612533450126648, | |
| "learning_rate": 0.00010876863166375722, | |
| "loss": 0.9641363143920898, | |
| "mean_token_accuracy": 0.7791497871279717, | |
| "num_tokens": 15215154.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "entropy": 0.849637558311224, | |
| "epoch": 1.4542671259089168, | |
| "grad_norm": 0.07079404592514038, | |
| "learning_rate": 0.00010850006714113066, | |
| "loss": 0.8924535751342774, | |
| "mean_token_accuracy": 0.7958060145378113, | |
| "num_tokens": 15261773.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 0.9689324770122767, | |
| "epoch": 1.4580941446613087, | |
| "grad_norm": 0.10107272863388062, | |
| "learning_rate": 0.0001082315026185041, | |
| "loss": 1.000623607635498, | |
| "mean_token_accuracy": 0.7690365821123123, | |
| "num_tokens": 15295693.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "entropy": 0.8926774315536022, | |
| "epoch": 1.4619211634137006, | |
| "grad_norm": 0.0883372351527214, | |
| "learning_rate": 0.00010796293809587754, | |
| "loss": 0.9312380790710449, | |
| "mean_token_accuracy": 0.7839185446500778, | |
| "num_tokens": 15332324.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "entropy": 0.9962236389517785, | |
| "epoch": 1.4657481821660925, | |
| "grad_norm": 0.09174945950508118, | |
| "learning_rate": 0.00010769437357325099, | |
| "loss": 1.0419865608215333, | |
| "mean_token_accuracy": 0.7592507138848305, | |
| "num_tokens": 15370812.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "entropy": 1.0249286435544491, | |
| "epoch": 1.4695752009184844, | |
| "grad_norm": 0.07152284681797028, | |
| "learning_rate": 0.00010742580905062442, | |
| "loss": 1.0437363624572753, | |
| "mean_token_accuracy": 0.7567671984434128, | |
| "num_tokens": 15417719.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "entropy": 0.903605168312788, | |
| "epoch": 1.4734022196708763, | |
| "grad_norm": 0.09400783479213715, | |
| "learning_rate": 0.00010715724452799784, | |
| "loss": 0.9410040855407715, | |
| "mean_token_accuracy": 0.7839412048459053, | |
| "num_tokens": 15455856.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 1.0259956195950508, | |
| "epoch": 1.4772292384232681, | |
| "grad_norm": 0.08671914041042328, | |
| "learning_rate": 0.00010688868000537129, | |
| "loss": 1.1025453567504884, | |
| "mean_token_accuracy": 0.7507242172956466, | |
| "num_tokens": 15492109.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "entropy": 0.9178053669631481, | |
| "epoch": 1.48105625717566, | |
| "grad_norm": 0.07717446982860565, | |
| "learning_rate": 0.00010662011548274474, | |
| "loss": 0.96353178024292, | |
| "mean_token_accuracy": 0.7797438561916351, | |
| "num_tokens": 15532130.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "entropy": 0.9423278756439686, | |
| "epoch": 1.484883275928052, | |
| "grad_norm": 0.11039029061794281, | |
| "learning_rate": 0.00010635155096011817, | |
| "loss": 0.979669189453125, | |
| "mean_token_accuracy": 0.7755513936281204, | |
| "num_tokens": 15575609.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "entropy": 0.8999218411743641, | |
| "epoch": 1.4887102946804438, | |
| "grad_norm": 0.08974706381559372, | |
| "learning_rate": 0.00010608298643749162, | |
| "loss": 0.9477033615112305, | |
| "mean_token_accuracy": 0.7822227850556374, | |
| "num_tokens": 15621264.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "entropy": 0.8756623603403568, | |
| "epoch": 1.4925373134328357, | |
| "grad_norm": 0.10864510387182236, | |
| "learning_rate": 0.00010581442191486505, | |
| "loss": 0.9711783409118653, | |
| "mean_token_accuracy": 0.7893951386213303, | |
| "num_tokens": 15656959.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 0.951158057898283, | |
| "epoch": 1.4963643321852276, | |
| "grad_norm": 0.09398993104696274, | |
| "learning_rate": 0.0001055458573922385, | |
| "loss": 1.0387070655822754, | |
| "mean_token_accuracy": 0.7698590591549873, | |
| "num_tokens": 15700293.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "entropy": 0.9240442231297493, | |
| "epoch": 1.5001913509376195, | |
| "grad_norm": 0.09761729091405869, | |
| "learning_rate": 0.00010527729286961192, | |
| "loss": 0.9758125305175781, | |
| "mean_token_accuracy": 0.7737968236207962, | |
| "num_tokens": 15739304.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "entropy": 0.9025500647723674, | |
| "epoch": 1.5040183696900113, | |
| "grad_norm": 0.08816131204366684, | |
| "learning_rate": 0.00010500872834698537, | |
| "loss": 0.913144302368164, | |
| "mean_token_accuracy": 0.7775477200746537, | |
| "num_tokens": 15785086.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "entropy": 0.8958883471786976, | |
| "epoch": 1.5078453884424032, | |
| "grad_norm": 0.09690563380718231, | |
| "learning_rate": 0.0001047401638243588, | |
| "loss": 0.9484706878662109, | |
| "mean_token_accuracy": 0.7867727875709534, | |
| "num_tokens": 15822631.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "entropy": 0.8738761503249407, | |
| "epoch": 1.5116724071947951, | |
| "grad_norm": 0.08325833082199097, | |
| "learning_rate": 0.00010447159930173225, | |
| "loss": 0.9258977890014648, | |
| "mean_token_accuracy": 0.7862061053514481, | |
| "num_tokens": 15863533.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 0.952784775942564, | |
| "epoch": 1.515499425947187, | |
| "grad_norm": 0.09089304506778717, | |
| "learning_rate": 0.0001042030347791057, | |
| "loss": 0.9893428802490234, | |
| "mean_token_accuracy": 0.769037912786007, | |
| "num_tokens": 15903798.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "entropy": 0.9974973328411579, | |
| "epoch": 1.519326444699579, | |
| "grad_norm": 0.06594393402338028, | |
| "learning_rate": 0.00010393447025647913, | |
| "loss": 0.9982621192932128, | |
| "mean_token_accuracy": 0.7653156638145446, | |
| "num_tokens": 15947894.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "entropy": 1.042479208856821, | |
| "epoch": 1.5231534634519708, | |
| "grad_norm": 0.09250905364751816, | |
| "learning_rate": 0.00010366590573385255, | |
| "loss": 1.0862640380859374, | |
| "mean_token_accuracy": 0.7515693128108978, | |
| "num_tokens": 15985609.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "entropy": 0.869631578028202, | |
| "epoch": 1.5269804822043627, | |
| "grad_norm": 0.10154584795236588, | |
| "learning_rate": 0.000103397341211226, | |
| "loss": 0.9275701522827149, | |
| "mean_token_accuracy": 0.7910413116216659, | |
| "num_tokens": 16022339.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "entropy": 0.9228729590773582, | |
| "epoch": 1.5308075009567546, | |
| "grad_norm": 0.08860265463590622, | |
| "learning_rate": 0.00010312877668859944, | |
| "loss": 1.0074289321899415, | |
| "mean_token_accuracy": 0.7770419105887413, | |
| "num_tokens": 16063778.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 0.9469372771680356, | |
| "epoch": 1.5346345197091464, | |
| "grad_norm": 0.08613952249288559, | |
| "learning_rate": 0.00010286021216597287, | |
| "loss": 1.0328418731689453, | |
| "mean_token_accuracy": 0.7784339845180511, | |
| "num_tokens": 16103389.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "entropy": 0.9240258730947971, | |
| "epoch": 1.5384615384615383, | |
| "grad_norm": 0.09255630522966385, | |
| "learning_rate": 0.00010259164764334632, | |
| "loss": 0.9813838958740234, | |
| "mean_token_accuracy": 0.779928731918335, | |
| "num_tokens": 16141739.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "entropy": 0.8300335463136435, | |
| "epoch": 1.5422885572139302, | |
| "grad_norm": 0.11173315346240997, | |
| "learning_rate": 0.00010232308312071977, | |
| "loss": 0.8650222778320312, | |
| "mean_token_accuracy": 0.8004546627402306, | |
| "num_tokens": 16179442.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "entropy": 0.970530441403389, | |
| "epoch": 1.546115575966322, | |
| "grad_norm": 0.08758437633514404, | |
| "learning_rate": 0.0001020545185980932, | |
| "loss": 1.029263401031494, | |
| "mean_token_accuracy": 0.7669389978051185, | |
| "num_tokens": 16220502.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "entropy": 0.8929917253553867, | |
| "epoch": 1.549942594718714, | |
| "grad_norm": 0.0840209424495697, | |
| "learning_rate": 0.00010178595407546662, | |
| "loss": 0.9574555397033692, | |
| "mean_token_accuracy": 0.7882895812392234, | |
| "num_tokens": 16263944.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 0.9571633011102676, | |
| "epoch": 1.5537696134711059, | |
| "grad_norm": 0.07731885462999344, | |
| "learning_rate": 0.00010151738955284007, | |
| "loss": 1.014600658416748, | |
| "mean_token_accuracy": 0.7691228404641152, | |
| "num_tokens": 16307508.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "entropy": 0.9627384431660175, | |
| "epoch": 1.5575966322234978, | |
| "grad_norm": 0.09968744218349457, | |
| "learning_rate": 0.00010124882503021352, | |
| "loss": 1.0220794677734375, | |
| "mean_token_accuracy": 0.7685489565134048, | |
| "num_tokens": 16349178.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "entropy": 0.8696753971278668, | |
| "epoch": 1.5614236509758896, | |
| "grad_norm": 0.08411276340484619, | |
| "learning_rate": 0.00010098026050758695, | |
| "loss": 0.9325771331787109, | |
| "mean_token_accuracy": 0.7903442814946174, | |
| "num_tokens": 16390375.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "entropy": 0.8790203854441643, | |
| "epoch": 1.5652506697282815, | |
| "grad_norm": 0.0969686210155487, | |
| "learning_rate": 0.0001007116959849604, | |
| "loss": 0.9325167655944824, | |
| "mean_token_accuracy": 0.7890133559703827, | |
| "num_tokens": 16429198.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "entropy": 0.9447548128664494, | |
| "epoch": 1.5690776884806734, | |
| "grad_norm": 0.07992373406887054, | |
| "learning_rate": 0.00010044313146233384, | |
| "loss": 0.9708291053771972, | |
| "mean_token_accuracy": 0.7737105548381805, | |
| "num_tokens": 16472336.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 0.974559249728918, | |
| "epoch": 1.5729047072330653, | |
| "grad_norm": 0.09685226529836655, | |
| "learning_rate": 0.00010017456693970726, | |
| "loss": 1.0289334297180175, | |
| "mean_token_accuracy": 0.7674296617507934, | |
| "num_tokens": 16511109.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "entropy": 0.8575489681214095, | |
| "epoch": 1.5767317259854572, | |
| "grad_norm": 0.09298260509967804, | |
| "learning_rate": 9.990600241708071e-05, | |
| "loss": 0.8897696495056152, | |
| "mean_token_accuracy": 0.7952411189675331, | |
| "num_tokens": 16552802.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "entropy": 0.869475956633687, | |
| "epoch": 1.580558744737849, | |
| "grad_norm": 0.129170760512352, | |
| "learning_rate": 9.963743789445414e-05, | |
| "loss": 0.9408356666564941, | |
| "mean_token_accuracy": 0.7868246123194694, | |
| "num_tokens": 16592603.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "entropy": 0.9167623318731785, | |
| "epoch": 1.584385763490241, | |
| "grad_norm": 0.08131655305624008, | |
| "learning_rate": 9.936887337182759e-05, | |
| "loss": 1.005775260925293, | |
| "mean_token_accuracy": 0.7779423877596855, | |
| "num_tokens": 16633674.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "entropy": 0.9069061763584614, | |
| "epoch": 1.5882127822426328, | |
| "grad_norm": 0.07485036551952362, | |
| "learning_rate": 9.910030884920103e-05, | |
| "loss": 0.9540878295898437, | |
| "mean_token_accuracy": 0.7809736356139183, | |
| "num_tokens": 16669966.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 1.0095594763755797, | |
| "epoch": 1.5920398009950247, | |
| "grad_norm": 0.11678522825241089, | |
| "learning_rate": 9.883174432657446e-05, | |
| "loss": 1.0742655754089356, | |
| "mean_token_accuracy": 0.7636483564972878, | |
| "num_tokens": 16711538.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "entropy": 0.8342153321951628, | |
| "epoch": 1.5958668197474166, | |
| "grad_norm": 0.09654127061367035, | |
| "learning_rate": 9.85631798039479e-05, | |
| "loss": 0.8637946128845215, | |
| "mean_token_accuracy": 0.7977021634578705, | |
| "num_tokens": 16746947.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "entropy": 0.9147222273051738, | |
| "epoch": 1.5996938384998085, | |
| "grad_norm": 0.10032576322555542, | |
| "learning_rate": 9.829461528132134e-05, | |
| "loss": 0.9848580360412598, | |
| "mean_token_accuracy": 0.7794791385531425, | |
| "num_tokens": 16792089.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "entropy": 0.9350447114557028, | |
| "epoch": 1.6035208572522004, | |
| "grad_norm": 0.11322317272424698, | |
| "learning_rate": 9.802605075869477e-05, | |
| "loss": 0.9632351875305176, | |
| "mean_token_accuracy": 0.7710213780403137, | |
| "num_tokens": 16831782.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "entropy": 0.8924577154219151, | |
| "epoch": 1.6073478760045923, | |
| "grad_norm": 0.08842343091964722, | |
| "learning_rate": 9.775748623606822e-05, | |
| "loss": 0.9661048889160156, | |
| "mean_token_accuracy": 0.7863042891025543, | |
| "num_tokens": 16867851.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 0.9452814936637879, | |
| "epoch": 1.6111748947569842, | |
| "grad_norm": 0.10469862073659897, | |
| "learning_rate": 9.748892171344167e-05, | |
| "loss": 1.0315632820129395, | |
| "mean_token_accuracy": 0.769272243976593, | |
| "num_tokens": 16909819.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "entropy": 0.8794655621051788, | |
| "epoch": 1.615001913509376, | |
| "grad_norm": 0.08528223633766174, | |
| "learning_rate": 9.72203571908151e-05, | |
| "loss": 0.9158189773559571, | |
| "mean_token_accuracy": 0.791112196445465, | |
| "num_tokens": 16945241.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "entropy": 0.9216304633766412, | |
| "epoch": 1.618828932261768, | |
| "grad_norm": 0.07684458047151566, | |
| "learning_rate": 9.695179266818853e-05, | |
| "loss": 1.0047569274902344, | |
| "mean_token_accuracy": 0.7764274105429649, | |
| "num_tokens": 16986516.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "entropy": 0.8806056842207909, | |
| "epoch": 1.6226559510141598, | |
| "grad_norm": 0.09925177693367004, | |
| "learning_rate": 9.668322814556198e-05, | |
| "loss": 0.9321705818176269, | |
| "mean_token_accuracy": 0.7873435765504837, | |
| "num_tokens": 17026974.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "entropy": 1.0260133132338525, | |
| "epoch": 1.6264829697665517, | |
| "grad_norm": 0.07781514525413513, | |
| "learning_rate": 9.641466362293541e-05, | |
| "loss": 1.0732348442077637, | |
| "mean_token_accuracy": 0.755302457511425, | |
| "num_tokens": 17063628.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 0.8771878894418478, | |
| "epoch": 1.6303099885189436, | |
| "grad_norm": 0.12377400696277618, | |
| "learning_rate": 9.614609910030885e-05, | |
| "loss": 0.9051324844360351, | |
| "mean_token_accuracy": 0.7877693608403206, | |
| "num_tokens": 17102243.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "entropy": 0.9575911372900009, | |
| "epoch": 1.6341370072713355, | |
| "grad_norm": 0.07953961193561554, | |
| "learning_rate": 9.58775345776823e-05, | |
| "loss": 1.0206258773803711, | |
| "mean_token_accuracy": 0.770101509988308, | |
| "num_tokens": 17143256.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "entropy": 0.9909125387668609, | |
| "epoch": 1.6379640260237274, | |
| "grad_norm": 0.09304741024971008, | |
| "learning_rate": 9.560897005505573e-05, | |
| "loss": 1.043109130859375, | |
| "mean_token_accuracy": 0.7598145559430123, | |
| "num_tokens": 17188878.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "entropy": 0.8626054737716913, | |
| "epoch": 1.6417910447761193, | |
| "grad_norm": 0.08982561528682709, | |
| "learning_rate": 9.534040553242916e-05, | |
| "loss": 0.9062054634094239, | |
| "mean_token_accuracy": 0.790125061571598, | |
| "num_tokens": 17224537.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "entropy": 0.919727610051632, | |
| "epoch": 1.6456180635285111, | |
| "grad_norm": 0.11226653307676315, | |
| "learning_rate": 9.507184100980261e-05, | |
| "loss": 0.970013427734375, | |
| "mean_token_accuracy": 0.7747739493846894, | |
| "num_tokens": 17262347.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 1.032866196334362, | |
| "epoch": 1.649445082280903, | |
| "grad_norm": 0.09440238773822784, | |
| "learning_rate": 9.480327648717606e-05, | |
| "loss": 1.0287545204162598, | |
| "mean_token_accuracy": 0.7550160124897957, | |
| "num_tokens": 17307578.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "entropy": 0.907962580025196, | |
| "epoch": 1.653272101033295, | |
| "grad_norm": 0.11395370960235596, | |
| "learning_rate": 9.453471196454948e-05, | |
| "loss": 0.9705679893493653, | |
| "mean_token_accuracy": 0.7807198286056518, | |
| "num_tokens": 17342943.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "entropy": 0.8495472550392151, | |
| "epoch": 1.6570991197856868, | |
| "grad_norm": 0.07685171812772751, | |
| "learning_rate": 9.426614744192292e-05, | |
| "loss": 0.9079866409301758, | |
| "mean_token_accuracy": 0.7923004642128945, | |
| "num_tokens": 17378158.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "entropy": 0.8389323726296425, | |
| "epoch": 1.6609261385380787, | |
| "grad_norm": 0.09541229903697968, | |
| "learning_rate": 9.399758291929637e-05, | |
| "loss": 0.9092423439025878, | |
| "mean_token_accuracy": 0.7946408927440644, | |
| "num_tokens": 17412703.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "entropy": 0.9035130314528942, | |
| "epoch": 1.6647531572904706, | |
| "grad_norm": 0.08291888236999512, | |
| "learning_rate": 9.37290183966698e-05, | |
| "loss": 0.9255120277404785, | |
| "mean_token_accuracy": 0.7840688213706016, | |
| "num_tokens": 17456250.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 0.8917031817138195, | |
| "epoch": 1.6685801760428625, | |
| "grad_norm": 0.08787538856267929, | |
| "learning_rate": 9.346045387404324e-05, | |
| "loss": 0.9318277359008789, | |
| "mean_token_accuracy": 0.7854569494724274, | |
| "num_tokens": 17492566.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "entropy": 0.8860244527459145, | |
| "epoch": 1.6724071947952543, | |
| "grad_norm": 0.10287550836801529, | |
| "learning_rate": 9.319188935141668e-05, | |
| "loss": 0.9169553756713867, | |
| "mean_token_accuracy": 0.7801365301012992, | |
| "num_tokens": 17530267.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "entropy": 0.8470614090561867, | |
| "epoch": 1.6762342135476462, | |
| "grad_norm": 0.13052308559417725, | |
| "learning_rate": 9.292332482879013e-05, | |
| "loss": 0.9004100799560547, | |
| "mean_token_accuracy": 0.791596457362175, | |
| "num_tokens": 17566336.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "entropy": 0.9627884522080421, | |
| "epoch": 1.6800612323000381, | |
| "grad_norm": 0.09305555373430252, | |
| "learning_rate": 9.265476030616355e-05, | |
| "loss": 0.9837147712707519, | |
| "mean_token_accuracy": 0.7687505498528481, | |
| "num_tokens": 17609294.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "entropy": 0.9614691123366356, | |
| "epoch": 1.68388825105243, | |
| "grad_norm": 0.08118042349815369, | |
| "learning_rate": 9.2386195783537e-05, | |
| "loss": 1.0093948364257812, | |
| "mean_token_accuracy": 0.7686966329813003, | |
| "num_tokens": 17653408.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 0.8255576498806476, | |
| "epoch": 1.687715269804822, | |
| "grad_norm": 0.07197146117687225, | |
| "learning_rate": 9.211763126091045e-05, | |
| "loss": 0.9013225555419921, | |
| "mean_token_accuracy": 0.7994248151779175, | |
| "num_tokens": 17693303.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "entropy": 0.9197361193597317, | |
| "epoch": 1.6915422885572138, | |
| "grad_norm": 0.10147208720445633, | |
| "learning_rate": 9.184906673828388e-05, | |
| "loss": 0.966912841796875, | |
| "mean_token_accuracy": 0.774210800230503, | |
| "num_tokens": 17734446.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "entropy": 0.8828513637185097, | |
| "epoch": 1.6953693073096057, | |
| "grad_norm": 0.08126919716596603, | |
| "learning_rate": 9.158050221565731e-05, | |
| "loss": 0.9237348556518554, | |
| "mean_token_accuracy": 0.788974218070507, | |
| "num_tokens": 17776056.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "entropy": 0.8538446951657533, | |
| "epoch": 1.6991963260619976, | |
| "grad_norm": 0.08602278679609299, | |
| "learning_rate": 9.131193769303076e-05, | |
| "loss": 0.9384878158569336, | |
| "mean_token_accuracy": 0.7924654617905617, | |
| "num_tokens": 17814560.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "entropy": 0.9160130321979523, | |
| "epoch": 1.7030233448143894, | |
| "grad_norm": 0.10127890110015869, | |
| "learning_rate": 9.10433731704042e-05, | |
| "loss": 0.9924029350280762, | |
| "mean_token_accuracy": 0.7764815479516983, | |
| "num_tokens": 17852872.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 0.8855723738670349, | |
| "epoch": 1.7068503635667813, | |
| "grad_norm": 0.09295201301574707, | |
| "learning_rate": 9.077480864777763e-05, | |
| "loss": 0.9131739616394043, | |
| "mean_token_accuracy": 0.7876585990190506, | |
| "num_tokens": 17893403.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "entropy": 0.8825645297765732, | |
| "epoch": 1.7106773823191732, | |
| "grad_norm": 0.1038793995976448, | |
| "learning_rate": 9.050624412515107e-05, | |
| "loss": 0.9621119499206543, | |
| "mean_token_accuracy": 0.7840767920017242, | |
| "num_tokens": 17933069.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "entropy": 0.9438045337796211, | |
| "epoch": 1.714504401071565, | |
| "grad_norm": 0.08998332172632217, | |
| "learning_rate": 9.023767960252452e-05, | |
| "loss": 1.0042546272277832, | |
| "mean_token_accuracy": 0.7710625112056733, | |
| "num_tokens": 17978081.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "entropy": 0.9605814971029758, | |
| "epoch": 1.718331419823957, | |
| "grad_norm": 0.0936085507273674, | |
| "learning_rate": 8.996911507989794e-05, | |
| "loss": 1.0731863021850585, | |
| "mean_token_accuracy": 0.7675610318779945, | |
| "num_tokens": 18026355.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "entropy": 0.9412197135388851, | |
| "epoch": 1.7221584385763489, | |
| "grad_norm": 0.11693151295185089, | |
| "learning_rate": 8.970055055727139e-05, | |
| "loss": 1.0271482467651367, | |
| "mean_token_accuracy": 0.7749442532658577, | |
| "num_tokens": 18064648.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 0.9840309470891953, | |
| "epoch": 1.7259854573287408, | |
| "grad_norm": 0.07721691578626633, | |
| "learning_rate": 8.943198603464484e-05, | |
| "loss": 0.9978925704956054, | |
| "mean_token_accuracy": 0.7662706628441811, | |
| "num_tokens": 18104352.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "entropy": 0.9122109733521938, | |
| "epoch": 1.7298124760811326, | |
| "grad_norm": 0.10790548473596573, | |
| "learning_rate": 8.916342151201827e-05, | |
| "loss": 0.9912397384643554, | |
| "mean_token_accuracy": 0.774566973745823, | |
| "num_tokens": 18145632.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "entropy": 0.8214024558663369, | |
| "epoch": 1.7336394948335245, | |
| "grad_norm": 0.0873790979385376, | |
| "learning_rate": 8.88948569893917e-05, | |
| "loss": 0.9174188613891602, | |
| "mean_token_accuracy": 0.797317324578762, | |
| "num_tokens": 18182216.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "entropy": 0.8851194910705089, | |
| "epoch": 1.7374665135859164, | |
| "grad_norm": 0.08441472053527832, | |
| "learning_rate": 8.862629246676515e-05, | |
| "loss": 0.9345614433288574, | |
| "mean_token_accuracy": 0.7909206628799439, | |
| "num_tokens": 18220152.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "entropy": 0.9045546390116215, | |
| "epoch": 1.7412935323383083, | |
| "grad_norm": 0.09491857141256332, | |
| "learning_rate": 8.835772794413858e-05, | |
| "loss": 1.0261774063110352, | |
| "mean_token_accuracy": 0.7792002618312835, | |
| "num_tokens": 18253615.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 0.8957971200346947, | |
| "epoch": 1.7451205510907002, | |
| "grad_norm": 0.07239943742752075, | |
| "learning_rate": 8.808916342151202e-05, | |
| "loss": 0.9220120429992675, | |
| "mean_token_accuracy": 0.7812404081225395, | |
| "num_tokens": 18292565.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "entropy": 0.9733762003481388, | |
| "epoch": 1.748947569843092, | |
| "grad_norm": 0.07816951721906662, | |
| "learning_rate": 8.782059889888546e-05, | |
| "loss": 1.0176166534423827, | |
| "mean_token_accuracy": 0.7636090680956841, | |
| "num_tokens": 18337951.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "entropy": 0.9952755816280842, | |
| "epoch": 1.752774588595484, | |
| "grad_norm": 0.09595679491758347, | |
| "learning_rate": 8.75520343762589e-05, | |
| "loss": 1.0484466552734375, | |
| "mean_token_accuracy": 0.7624279737472535, | |
| "num_tokens": 18378541.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "entropy": 0.9325974151492119, | |
| "epoch": 1.7566016073478758, | |
| "grad_norm": 0.1425638496875763, | |
| "learning_rate": 8.728346985363234e-05, | |
| "loss": 1.007568645477295, | |
| "mean_token_accuracy": 0.7753438904881478, | |
| "num_tokens": 18416147.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "entropy": 0.8879670143127442, | |
| "epoch": 1.7604286261002677, | |
| "grad_norm": 0.08936052024364471, | |
| "learning_rate": 8.701490533100578e-05, | |
| "loss": 0.9574133872985839, | |
| "mean_token_accuracy": 0.7878236457705498, | |
| "num_tokens": 18452658.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 0.9596087213605642, | |
| "epoch": 1.7642556448526596, | |
| "grad_norm": 0.08222804218530655, | |
| "learning_rate": 8.674634080837921e-05, | |
| "loss": 1.0134518623352051, | |
| "mean_token_accuracy": 0.7686690568923951, | |
| "num_tokens": 18493806.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "entropy": 0.9412514306604862, | |
| "epoch": 1.7680826636050515, | |
| "grad_norm": 0.08482176810503006, | |
| "learning_rate": 8.647777628575266e-05, | |
| "loss": 0.9830768585205079, | |
| "mean_token_accuracy": 0.7789766594767571, | |
| "num_tokens": 18538740.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "entropy": 0.8279416210949421, | |
| "epoch": 1.7719096823574434, | |
| "grad_norm": 0.12101086974143982, | |
| "learning_rate": 8.620921176312609e-05, | |
| "loss": 0.8422709465026855, | |
| "mean_token_accuracy": 0.8009032368659973, | |
| "num_tokens": 18579991.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "entropy": 0.8889543637633324, | |
| "epoch": 1.7757367011098353, | |
| "grad_norm": 0.09586559236049652, | |
| "learning_rate": 8.594064724049954e-05, | |
| "loss": 0.9579720497131348, | |
| "mean_token_accuracy": 0.7857530102133751, | |
| "num_tokens": 18616195.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "entropy": 0.9021936893463135, | |
| "epoch": 1.7795637198622272, | |
| "grad_norm": 0.0920713022351265, | |
| "learning_rate": 8.567208271787297e-05, | |
| "loss": 0.9568814277648926, | |
| "mean_token_accuracy": 0.7835437625646591, | |
| "num_tokens": 18650396.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 0.9605553701519967, | |
| "epoch": 1.783390738614619, | |
| "grad_norm": 0.0752284824848175, | |
| "learning_rate": 8.54035181952464e-05, | |
| "loss": 1.0107332229614259, | |
| "mean_token_accuracy": 0.7712536633014679, | |
| "num_tokens": 18692519.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "entropy": 0.8929145928472281, | |
| "epoch": 1.787217757367011, | |
| "grad_norm": 0.08124406635761261, | |
| "learning_rate": 8.513495367261985e-05, | |
| "loss": 0.9392594337463379, | |
| "mean_token_accuracy": 0.7837390914559365, | |
| "num_tokens": 18730865.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "entropy": 0.8995866551995277, | |
| "epoch": 1.7910447761194028, | |
| "grad_norm": 0.07306879013776779, | |
| "learning_rate": 8.486638914999329e-05, | |
| "loss": 0.9512563705444336, | |
| "mean_token_accuracy": 0.7803288042545319, | |
| "num_tokens": 18774851.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "entropy": 0.9283428456634283, | |
| "epoch": 1.7948717948717947, | |
| "grad_norm": 0.06833672523498535, | |
| "learning_rate": 8.459782462736673e-05, | |
| "loss": 0.9614426612854003, | |
| "mean_token_accuracy": 0.7776324123144149, | |
| "num_tokens": 18815273.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "entropy": 0.8980611331760884, | |
| "epoch": 1.7986988136241866, | |
| "grad_norm": 0.09426148980855942, | |
| "learning_rate": 8.432926010474017e-05, | |
| "loss": 0.9397372245788574, | |
| "mean_token_accuracy": 0.7818324938416481, | |
| "num_tokens": 18854806.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 0.9534067753702402, | |
| "epoch": 1.8025258323765785, | |
| "grad_norm": 0.11984719336032867, | |
| "learning_rate": 8.40606955821136e-05, | |
| "loss": 1.0058012008666992, | |
| "mean_token_accuracy": 0.7710662186145782, | |
| "num_tokens": 18893820.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "entropy": 0.9396863542497158, | |
| "epoch": 1.8063528511289704, | |
| "grad_norm": 0.1126495823264122, | |
| "learning_rate": 8.379213105948705e-05, | |
| "loss": 0.9968315124511719, | |
| "mean_token_accuracy": 0.7748499393463135, | |
| "num_tokens": 18932078.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "entropy": 1.0531147465109825, | |
| "epoch": 1.8101798698813623, | |
| "grad_norm": 0.0951380655169487, | |
| "learning_rate": 8.352356653686048e-05, | |
| "loss": 1.1058255195617677, | |
| "mean_token_accuracy": 0.7495945364236831, | |
| "num_tokens": 18974602.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "entropy": 0.8520326256752014, | |
| "epoch": 1.8140068886337541, | |
| "grad_norm": 0.08623862266540527, | |
| "learning_rate": 8.325500201423391e-05, | |
| "loss": 0.8825064659118652, | |
| "mean_token_accuracy": 0.7940022364258766, | |
| "num_tokens": 19014261.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "entropy": 0.979587784409523, | |
| "epoch": 1.817833907386146, | |
| "grad_norm": 0.11787699162960052, | |
| "learning_rate": 8.298643749160736e-05, | |
| "loss": 1.070664405822754, | |
| "mean_token_accuracy": 0.7620177045464516, | |
| "num_tokens": 19058223.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "entropy": 0.8753061652183532, | |
| "epoch": 1.821660926138538, | |
| "grad_norm": 0.130862757563591, | |
| "learning_rate": 8.271787296898081e-05, | |
| "loss": 0.9366108894348144, | |
| "mean_token_accuracy": 0.7874863654375076, | |
| "num_tokens": 19100204.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "entropy": 0.8777839131653309, | |
| "epoch": 1.8254879448909298, | |
| "grad_norm": 0.09261229634284973, | |
| "learning_rate": 8.244930844635424e-05, | |
| "loss": 0.9104420661926269, | |
| "mean_token_accuracy": 0.7895808070898056, | |
| "num_tokens": 19141588.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "entropy": 0.9170226149260998, | |
| "epoch": 1.8293149636433217, | |
| "grad_norm": 0.06741383671760559, | |
| "learning_rate": 8.218074392372768e-05, | |
| "loss": 0.9543824195861816, | |
| "mean_token_accuracy": 0.7765088111162186, | |
| "num_tokens": 19182818.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "entropy": 0.8602717489004135, | |
| "epoch": 1.8331419823957136, | |
| "grad_norm": 0.12861686944961548, | |
| "learning_rate": 8.191217940110112e-05, | |
| "loss": 0.926014518737793, | |
| "mean_token_accuracy": 0.7917162731289864, | |
| "num_tokens": 19217919.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "entropy": 0.9471398882567883, | |
| "epoch": 1.8369690011481055, | |
| "grad_norm": 0.0744423121213913, | |
| "learning_rate": 8.164361487847456e-05, | |
| "loss": 0.9777775764465332, | |
| "mean_token_accuracy": 0.7716371163725853, | |
| "num_tokens": 19263991.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "entropy": 0.8363759070634842, | |
| "epoch": 1.8407960199004973, | |
| "grad_norm": 0.08627327531576157, | |
| "learning_rate": 8.137505035584799e-05, | |
| "loss": 0.887846565246582, | |
| "mean_token_accuracy": 0.7938979223370553, | |
| "num_tokens": 19305441.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "entropy": 0.9200571574270725, | |
| "epoch": 1.8446230386528892, | |
| "grad_norm": 0.08358518034219742, | |
| "learning_rate": 8.110648583322144e-05, | |
| "loss": 0.9703543663024903, | |
| "mean_token_accuracy": 0.777827826142311, | |
| "num_tokens": 19342309.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "entropy": 0.9295372806489468, | |
| "epoch": 1.8484500574052811, | |
| "grad_norm": 0.0970570370554924, | |
| "learning_rate": 8.083792131059487e-05, | |
| "loss": 0.9934672355651856, | |
| "mean_token_accuracy": 0.7739364430308342, | |
| "num_tokens": 19387707.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "entropy": 0.9003355488181114, | |
| "epoch": 1.852277076157673, | |
| "grad_norm": 0.09357219189405441, | |
| "learning_rate": 8.05693567879683e-05, | |
| "loss": 0.9544237136840821, | |
| "mean_token_accuracy": 0.7824135825037957, | |
| "num_tokens": 19428449.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "entropy": 1.0107353992760182, | |
| "epoch": 1.856104094910065, | |
| "grad_norm": 0.08587910234928131, | |
| "learning_rate": 8.030079226534175e-05, | |
| "loss": 1.0694228172302247, | |
| "mean_token_accuracy": 0.7585012704133988, | |
| "num_tokens": 19469625.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "entropy": 0.9866157718002796, | |
| "epoch": 1.8599311136624568, | |
| "grad_norm": 0.11663772910833359, | |
| "learning_rate": 8.00322277427152e-05, | |
| "loss": 1.0394890785217286, | |
| "mean_token_accuracy": 0.7632519364356994, | |
| "num_tokens": 19510480.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "entropy": 0.881837759912014, | |
| "epoch": 1.8637581324148487, | |
| "grad_norm": 0.13599033653736115, | |
| "learning_rate": 7.976366322008862e-05, | |
| "loss": 0.9441938400268555, | |
| "mean_token_accuracy": 0.7848336577415467, | |
| "num_tokens": 19550305.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "entropy": 0.9193019077181817, | |
| "epoch": 1.8675851511672406, | |
| "grad_norm": 0.09272989630699158, | |
| "learning_rate": 7.949509869746207e-05, | |
| "loss": 0.9862917900085449, | |
| "mean_token_accuracy": 0.7736792579293251, | |
| "num_tokens": 19588278.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "entropy": 0.9219990812242032, | |
| "epoch": 1.8714121699196324, | |
| "grad_norm": 0.10006739944219589, | |
| "learning_rate": 7.922653417483551e-05, | |
| "loss": 0.9700265884399414, | |
| "mean_token_accuracy": 0.780723437666893, | |
| "num_tokens": 19625749.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "entropy": 0.8564371943473816, | |
| "epoch": 1.8752391886720245, | |
| "grad_norm": 0.08216696232557297, | |
| "learning_rate": 7.895796965220895e-05, | |
| "loss": 0.9064787864685059, | |
| "mean_token_accuracy": 0.7962334454059601, | |
| "num_tokens": 19663935.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "entropy": 0.9482992745935916, | |
| "epoch": 1.8790662074244164, | |
| "grad_norm": 0.06782303750514984, | |
| "learning_rate": 7.868940512958238e-05, | |
| "loss": 0.9920551300048828, | |
| "mean_token_accuracy": 0.7696940049529075, | |
| "num_tokens": 19704663.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "entropy": 0.884397204965353, | |
| "epoch": 1.8828932261768083, | |
| "grad_norm": 0.06414399296045303, | |
| "learning_rate": 7.842084060695583e-05, | |
| "loss": 0.9083518981933594, | |
| "mean_token_accuracy": 0.7858098462224007, | |
| "num_tokens": 19753438.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "entropy": 0.8019696604460478, | |
| "epoch": 1.8867202449292002, | |
| "grad_norm": 0.08456243574619293, | |
| "learning_rate": 7.815227608432927e-05, | |
| "loss": 0.8896969795227051, | |
| "mean_token_accuracy": 0.8053153440356254, | |
| "num_tokens": 19790404.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "entropy": 0.8008564852178097, | |
| "epoch": 1.890547263681592, | |
| "grad_norm": 0.10543688386678696, | |
| "learning_rate": 7.78837115617027e-05, | |
| "loss": 0.8535223007202148, | |
| "mean_token_accuracy": 0.8059684678912162, | |
| "num_tokens": 19825645.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "entropy": 0.8714719720184803, | |
| "epoch": 1.894374282433984, | |
| "grad_norm": 0.09498755633831024, | |
| "learning_rate": 7.761514703907614e-05, | |
| "loss": 0.9063860893249511, | |
| "mean_token_accuracy": 0.7914781123399734, | |
| "num_tokens": 19866092.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "entropy": 0.9202240366488695, | |
| "epoch": 1.8982013011863759, | |
| "grad_norm": 0.07342597842216492, | |
| "learning_rate": 7.734658251644959e-05, | |
| "loss": 0.9767581939697265, | |
| "mean_token_accuracy": 0.7755973920226097, | |
| "num_tokens": 19907356.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "entropy": 0.9477262906730175, | |
| "epoch": 1.9020283199387678, | |
| "grad_norm": 0.08742880076169968, | |
| "learning_rate": 7.707801799382302e-05, | |
| "loss": 1.0063783645629882, | |
| "mean_token_accuracy": 0.7687567621469498, | |
| "num_tokens": 19952869.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "entropy": 0.977492806315422, | |
| "epoch": 1.9058553386911596, | |
| "grad_norm": 0.10321515798568726, | |
| "learning_rate": 7.680945347119645e-05, | |
| "loss": 1.0323823928833007, | |
| "mean_token_accuracy": 0.7646988987922668, | |
| "num_tokens": 19991372.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "entropy": 0.7999268680810928, | |
| "epoch": 1.9096823574435515, | |
| "grad_norm": 0.08925452828407288, | |
| "learning_rate": 7.65408889485699e-05, | |
| "loss": 0.8391226768493653, | |
| "mean_token_accuracy": 0.8017501994967461, | |
| "num_tokens": 20029189.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "entropy": 0.8757653787732125, | |
| "epoch": 1.9135093761959434, | |
| "grad_norm": 0.1915360540151596, | |
| "learning_rate": 7.627232442594334e-05, | |
| "loss": 0.9243562698364258, | |
| "mean_token_accuracy": 0.7841411307454109, | |
| "num_tokens": 20070611.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "entropy": 0.9357082359492779, | |
| "epoch": 1.9173363949483353, | |
| "grad_norm": 0.08219558745622635, | |
| "learning_rate": 7.600375990331677e-05, | |
| "loss": 0.9772232055664063, | |
| "mean_token_accuracy": 0.7725088000297546, | |
| "num_tokens": 20110392.0, | |
| "step": 5010 | |
| }, | |
| { | |
| "entropy": 0.9191611532121897, | |
| "epoch": 1.9211634137007272, | |
| "grad_norm": 0.07629676163196564, | |
| "learning_rate": 7.573519538069022e-05, | |
| "loss": 0.9754646301269532, | |
| "mean_token_accuracy": 0.7830281540751457, | |
| "num_tokens": 20150683.0, | |
| "step": 5020 | |
| }, | |
| { | |
| "entropy": 0.9279548175632953, | |
| "epoch": 1.924990432453119, | |
| "grad_norm": 0.09845773130655289, | |
| "learning_rate": 7.546663085806366e-05, | |
| "loss": 0.9818471908569336, | |
| "mean_token_accuracy": 0.7738550245761872, | |
| "num_tokens": 20190521.0, | |
| "step": 5030 | |
| }, | |
| { | |
| "entropy": 0.9281142316758633, | |
| "epoch": 1.928817451205511, | |
| "grad_norm": 0.10571245104074478, | |
| "learning_rate": 7.519806633543708e-05, | |
| "loss": 0.999634075164795, | |
| "mean_token_accuracy": 0.7708285465836525, | |
| "num_tokens": 20230615.0, | |
| "step": 5040 | |
| }, | |
| { | |
| "entropy": 0.8793018095195293, | |
| "epoch": 1.9326444699579028, | |
| "grad_norm": 0.11255183815956116, | |
| "learning_rate": 7.492950181281053e-05, | |
| "loss": 0.9399495124816895, | |
| "mean_token_accuracy": 0.7893765285611153, | |
| "num_tokens": 20269332.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "entropy": 0.8188632413744926, | |
| "epoch": 1.9364714887102947, | |
| "grad_norm": 0.08683498203754425, | |
| "learning_rate": 7.466093729018398e-05, | |
| "loss": 0.8760917663574219, | |
| "mean_token_accuracy": 0.800470444560051, | |
| "num_tokens": 20316849.0, | |
| "step": 5060 | |
| }, | |
| { | |
| "entropy": 0.9165158126503229, | |
| "epoch": 1.9402985074626866, | |
| "grad_norm": 0.12123431265354156, | |
| "learning_rate": 7.439237276755741e-05, | |
| "loss": 0.9515151023864746, | |
| "mean_token_accuracy": 0.7772148326039314, | |
| "num_tokens": 20354641.0, | |
| "step": 5070 | |
| }, | |
| { | |
| "entropy": 0.8890400048345327, | |
| "epoch": 1.9441255262150785, | |
| "grad_norm": 0.09551843255758286, | |
| "learning_rate": 7.412380824493084e-05, | |
| "loss": 0.9720385551452637, | |
| "mean_token_accuracy": 0.7855533555150032, | |
| "num_tokens": 20400703.0, | |
| "step": 5080 | |
| }, | |
| { | |
| "entropy": 0.9226945102214813, | |
| "epoch": 1.9479525449674704, | |
| "grad_norm": 0.11462504416704178, | |
| "learning_rate": 7.385524372230429e-05, | |
| "loss": 0.9757321357727051, | |
| "mean_token_accuracy": 0.7739654749631881, | |
| "num_tokens": 20442145.0, | |
| "step": 5090 | |
| }, | |
| { | |
| "entropy": 0.8108384694904089, | |
| "epoch": 1.9517795637198623, | |
| "grad_norm": 0.13017524778842926, | |
| "learning_rate": 7.358667919967772e-05, | |
| "loss": 0.8620017051696778, | |
| "mean_token_accuracy": 0.8028488114476204, | |
| "num_tokens": 20472714.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "entropy": 0.9563053950667382, | |
| "epoch": 1.9556065824722542, | |
| "grad_norm": 0.10588496923446655, | |
| "learning_rate": 7.331811467705116e-05, | |
| "loss": 0.9805202484130859, | |
| "mean_token_accuracy": 0.7729632049798966, | |
| "num_tokens": 20518593.0, | |
| "step": 5110 | |
| }, | |
| { | |
| "entropy": 0.9307407476007938, | |
| "epoch": 1.959433601224646, | |
| "grad_norm": 0.09899015724658966, | |
| "learning_rate": 7.30495501544246e-05, | |
| "loss": 0.998748779296875, | |
| "mean_token_accuracy": 0.7733172833919525, | |
| "num_tokens": 20558008.0, | |
| "step": 5120 | |
| }, | |
| { | |
| "entropy": 0.9505821786820888, | |
| "epoch": 1.963260619977038, | |
| "grad_norm": 0.0943673700094223, | |
| "learning_rate": 7.278098563179804e-05, | |
| "loss": 1.0047925949096679, | |
| "mean_token_accuracy": 0.7691358909010887, | |
| "num_tokens": 20603741.0, | |
| "step": 5130 | |
| }, | |
| { | |
| "entropy": 1.04148171544075, | |
| "epoch": 1.9670876387294298, | |
| "grad_norm": 0.08869694918394089, | |
| "learning_rate": 7.251242110917149e-05, | |
| "loss": 1.0801177024841309, | |
| "mean_token_accuracy": 0.7499634683132171, | |
| "num_tokens": 20645827.0, | |
| "step": 5140 | |
| }, | |
| { | |
| "entropy": 0.7822969853878021, | |
| "epoch": 1.9709146574818217, | |
| "grad_norm": 0.0994991883635521, | |
| "learning_rate": 7.224385658654492e-05, | |
| "loss": 0.8042619705200196, | |
| "mean_token_accuracy": 0.8097834318876267, | |
| "num_tokens": 20684019.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "entropy": 0.918664800748229, | |
| "epoch": 1.9747416762342136, | |
| "grad_norm": 0.11157739907503128, | |
| "learning_rate": 7.197529206391837e-05, | |
| "loss": 0.983153247833252, | |
| "mean_token_accuracy": 0.7776870116591453, | |
| "num_tokens": 20726278.0, | |
| "step": 5160 | |
| }, | |
| { | |
| "entropy": 0.911195681989193, | |
| "epoch": 1.9785686949866055, | |
| "grad_norm": 0.13472694158554077, | |
| "learning_rate": 7.17067275412918e-05, | |
| "loss": 0.9662351608276367, | |
| "mean_token_accuracy": 0.7743990138173104, | |
| "num_tokens": 20759927.0, | |
| "step": 5170 | |
| }, | |
| { | |
| "entropy": 0.8238823972642422, | |
| "epoch": 1.9823957137389974, | |
| "grad_norm": 0.08864834159612656, | |
| "learning_rate": 7.143816301866523e-05, | |
| "loss": 0.8870213508605957, | |
| "mean_token_accuracy": 0.7989589869976044, | |
| "num_tokens": 20798325.0, | |
| "step": 5180 | |
| }, | |
| { | |
| "entropy": 0.9405660286545754, | |
| "epoch": 1.9862227324913893, | |
| "grad_norm": 0.08372621983289719, | |
| "learning_rate": 7.116959849603868e-05, | |
| "loss": 0.9449873924255371, | |
| "mean_token_accuracy": 0.7792889401316643, | |
| "num_tokens": 20837136.0, | |
| "step": 5190 | |
| }, | |
| { | |
| "entropy": 0.8287422813475132, | |
| "epoch": 1.9900497512437811, | |
| "grad_norm": 0.0968240275979042, | |
| "learning_rate": 7.090103397341211e-05, | |
| "loss": 0.8873905181884766, | |
| "mean_token_accuracy": 0.7976622357964516, | |
| "num_tokens": 20877693.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "entropy": 0.9188660819083452, | |
| "epoch": 1.993876769996173, | |
| "grad_norm": 0.09275626391172409, | |
| "learning_rate": 7.063246945078555e-05, | |
| "loss": 0.989016342163086, | |
| "mean_token_accuracy": 0.7755422025918961, | |
| "num_tokens": 20924885.0, | |
| "step": 5210 | |
| }, | |
| { | |
| "entropy": 0.9058490604162216, | |
| "epoch": 1.997703788748565, | |
| "grad_norm": 0.08644875138998032, | |
| "learning_rate": 7.0363904928159e-05, | |
| "loss": 0.9660470008850097, | |
| "mean_token_accuracy": 0.7761533245444298, | |
| "num_tokens": 20966342.0, | |
| "step": 5220 | |
| }, | |
| { | |
| "entropy": 0.7741431064903737, | |
| "epoch": 2.0015308075009566, | |
| "grad_norm": 0.07492107152938843, | |
| "learning_rate": 7.009534040553243e-05, | |
| "loss": 0.8241374015808105, | |
| "mean_token_accuracy": 0.8149536207318306, | |
| "num_tokens": 21004798.0, | |
| "step": 5230 | |
| }, | |
| { | |
| "entropy": 0.8813200116157531, | |
| "epoch": 2.0053578262533485, | |
| "grad_norm": 0.07805436849594116, | |
| "learning_rate": 6.982677588290588e-05, | |
| "loss": 0.921663761138916, | |
| "mean_token_accuracy": 0.7912002876400948, | |
| "num_tokens": 21049021.0, | |
| "step": 5240 | |
| }, | |
| { | |
| "entropy": 0.8896506872028113, | |
| "epoch": 2.0091848450057403, | |
| "grad_norm": 0.13928763568401337, | |
| "learning_rate": 6.955821136027931e-05, | |
| "loss": 0.9278170585632324, | |
| "mean_token_accuracy": 0.7765205070376396, | |
| "num_tokens": 21086531.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "entropy": 0.9149777121841908, | |
| "epoch": 2.0130118637581322, | |
| "grad_norm": 0.06992843002080917, | |
| "learning_rate": 6.928964683765274e-05, | |
| "loss": 0.9667098045349121, | |
| "mean_token_accuracy": 0.7750229969620704, | |
| "num_tokens": 21127453.0, | |
| "step": 5260 | |
| }, | |
| { | |
| "entropy": 0.8076952576637269, | |
| "epoch": 2.016838882510524, | |
| "grad_norm": 0.12632791697978973, | |
| "learning_rate": 6.902108231502619e-05, | |
| "loss": 0.8237466812133789, | |
| "mean_token_accuracy": 0.804887568950653, | |
| "num_tokens": 21165297.0, | |
| "step": 5270 | |
| }, | |
| { | |
| "entropy": 0.8818444184958935, | |
| "epoch": 2.020665901262916, | |
| "grad_norm": 0.08924616128206253, | |
| "learning_rate": 6.875251779239962e-05, | |
| "loss": 0.9049506187438965, | |
| "mean_token_accuracy": 0.7822276562452316, | |
| "num_tokens": 21206219.0, | |
| "step": 5280 | |
| }, | |
| { | |
| "entropy": 0.7953705489635468, | |
| "epoch": 2.024492920015308, | |
| "grad_norm": 0.1111336424946785, | |
| "learning_rate": 6.848395326977307e-05, | |
| "loss": 0.8433744430541992, | |
| "mean_token_accuracy": 0.8049945279955864, | |
| "num_tokens": 21249239.0, | |
| "step": 5290 | |
| }, | |
| { | |
| "entropy": 0.904665675573051, | |
| "epoch": 2.0283199387677, | |
| "grad_norm": 0.09494993835687637, | |
| "learning_rate": 6.82153887471465e-05, | |
| "loss": 0.9693451881408691, | |
| "mean_token_accuracy": 0.779350683093071, | |
| "num_tokens": 21289639.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "entropy": 0.7958274722099304, | |
| "epoch": 2.0321469575200917, | |
| "grad_norm": 0.10396509617567062, | |
| "learning_rate": 6.794682422451995e-05, | |
| "loss": 0.8559811592102051, | |
| "mean_token_accuracy": 0.8057383120059967, | |
| "num_tokens": 21329136.0, | |
| "step": 5310 | |
| }, | |
| { | |
| "entropy": 0.9416906848549843, | |
| "epoch": 2.0359739762724836, | |
| "grad_norm": 0.08166563510894775, | |
| "learning_rate": 6.767825970189338e-05, | |
| "loss": 0.9891387939453125, | |
| "mean_token_accuracy": 0.7737650781869888, | |
| "num_tokens": 21371300.0, | |
| "step": 5320 | |
| }, | |
| { | |
| "entropy": 0.9342201549559832, | |
| "epoch": 2.0398009950248754, | |
| "grad_norm": 0.09459090232849121, | |
| "learning_rate": 6.740969517926682e-05, | |
| "loss": 0.9509946823120117, | |
| "mean_token_accuracy": 0.7751364663243294, | |
| "num_tokens": 21412268.0, | |
| "step": 5330 | |
| }, | |
| { | |
| "entropy": 0.8397190041840077, | |
| "epoch": 2.0436280137772673, | |
| "grad_norm": 0.10005268454551697, | |
| "learning_rate": 6.714113065664026e-05, | |
| "loss": 0.9056560516357421, | |
| "mean_token_accuracy": 0.79336898624897, | |
| "num_tokens": 21451975.0, | |
| "step": 5340 | |
| }, | |
| { | |
| "entropy": 0.9148454248905182, | |
| "epoch": 2.047455032529659, | |
| "grad_norm": 0.10257065296173096, | |
| "learning_rate": 6.68725661340137e-05, | |
| "loss": 0.9611604690551758, | |
| "mean_token_accuracy": 0.7737416908144951, | |
| "num_tokens": 21491818.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "entropy": 0.9010646104812622, | |
| "epoch": 2.051282051282051, | |
| "grad_norm": 0.11826229095458984, | |
| "learning_rate": 6.660400161138713e-05, | |
| "loss": 0.9446893692016601, | |
| "mean_token_accuracy": 0.7851994633674622, | |
| "num_tokens": 21528066.0, | |
| "step": 5360 | |
| }, | |
| { | |
| "entropy": 0.8987722039222718, | |
| "epoch": 2.055109070034443, | |
| "grad_norm": 0.10371451824903488, | |
| "learning_rate": 6.633543708876058e-05, | |
| "loss": 0.9595455169677735, | |
| "mean_token_accuracy": 0.7833559066057205, | |
| "num_tokens": 21562883.0, | |
| "step": 5370 | |
| }, | |
| { | |
| "entropy": 0.8856854721903801, | |
| "epoch": 2.058936088786835, | |
| "grad_norm": 0.1089499220252037, | |
| "learning_rate": 6.606687256613403e-05, | |
| "loss": 0.9219722747802734, | |
| "mean_token_accuracy": 0.7822227373719215, | |
| "num_tokens": 21600910.0, | |
| "step": 5380 | |
| }, | |
| { | |
| "entropy": 0.8720096081495285, | |
| "epoch": 2.0627631075392268, | |
| "grad_norm": 0.09962328523397446, | |
| "learning_rate": 6.579830804350745e-05, | |
| "loss": 0.9654089927673339, | |
| "mean_token_accuracy": 0.7856920391321183, | |
| "num_tokens": 21640445.0, | |
| "step": 5390 | |
| }, | |
| { | |
| "entropy": 0.9440382812172174, | |
| "epoch": 2.0665901262916186, | |
| "grad_norm": 0.08670477569103241, | |
| "learning_rate": 6.552974352088089e-05, | |
| "loss": 0.9934238433837891, | |
| "mean_token_accuracy": 0.7687147289514542, | |
| "num_tokens": 21682432.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "entropy": 0.774172055721283, | |
| "epoch": 2.0704171450440105, | |
| "grad_norm": 0.11862040311098099, | |
| "learning_rate": 6.526117899825434e-05, | |
| "loss": 0.8106603622436523, | |
| "mean_token_accuracy": 0.8135839134454728, | |
| "num_tokens": 21721359.0, | |
| "step": 5410 | |
| }, | |
| { | |
| "entropy": 0.9194908868521452, | |
| "epoch": 2.0742441637964024, | |
| "grad_norm": 0.10227365791797638, | |
| "learning_rate": 6.499261447562777e-05, | |
| "loss": 0.9410523414611817, | |
| "mean_token_accuracy": 0.7788734346628189, | |
| "num_tokens": 21763700.0, | |
| "step": 5420 | |
| }, | |
| { | |
| "entropy": 0.7955736435949803, | |
| "epoch": 2.0780711825487943, | |
| "grad_norm": 0.09657785296440125, | |
| "learning_rate": 6.472404995300121e-05, | |
| "loss": 0.8665301322937011, | |
| "mean_token_accuracy": 0.8067882195115089, | |
| "num_tokens": 21804190.0, | |
| "step": 5430 | |
| }, | |
| { | |
| "entropy": 0.8065498791635036, | |
| "epoch": 2.081898201301186, | |
| "grad_norm": 0.11568085849285126, | |
| "learning_rate": 6.445548543037465e-05, | |
| "loss": 0.8515932083129882, | |
| "mean_token_accuracy": 0.8035058185458184, | |
| "num_tokens": 21839801.0, | |
| "step": 5440 | |
| }, | |
| { | |
| "entropy": 0.9087674509733915, | |
| "epoch": 2.085725220053578, | |
| "grad_norm": 0.09318574517965317, | |
| "learning_rate": 6.418692090774809e-05, | |
| "loss": 0.9387861251831054, | |
| "mean_token_accuracy": 0.77939523011446, | |
| "num_tokens": 21877125.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "entropy": 0.86418566852808, | |
| "epoch": 2.08955223880597, | |
| "grad_norm": 0.08796729892492294, | |
| "learning_rate": 6.391835638512152e-05, | |
| "loss": 0.9152085304260253, | |
| "mean_token_accuracy": 0.7899368211627007, | |
| "num_tokens": 21921493.0, | |
| "step": 5460 | |
| }, | |
| { | |
| "entropy": 0.8593201294541359, | |
| "epoch": 2.093379257558362, | |
| "grad_norm": 0.14465564489364624, | |
| "learning_rate": 6.364979186249497e-05, | |
| "loss": 0.8955412864685058, | |
| "mean_token_accuracy": 0.7898772984743119, | |
| "num_tokens": 21961188.0, | |
| "step": 5470 | |
| }, | |
| { | |
| "entropy": 0.8998314358294011, | |
| "epoch": 2.0972062763107537, | |
| "grad_norm": 0.11634784191846848, | |
| "learning_rate": 6.338122733986842e-05, | |
| "loss": 0.9114861488342285, | |
| "mean_token_accuracy": 0.7838647082448006, | |
| "num_tokens": 22001738.0, | |
| "step": 5480 | |
| }, | |
| { | |
| "entropy": 0.8693659231066704, | |
| "epoch": 2.1010332950631456, | |
| "grad_norm": 0.11536803841590881, | |
| "learning_rate": 6.311266281724184e-05, | |
| "loss": 0.9232154846191406, | |
| "mean_token_accuracy": 0.7881089702248574, | |
| "num_tokens": 22039626.0, | |
| "step": 5490 | |
| }, | |
| { | |
| "entropy": 0.9556272588670254, | |
| "epoch": 2.1048603138155375, | |
| "grad_norm": 0.09614596515893936, | |
| "learning_rate": 6.284409829461528e-05, | |
| "loss": 1.0266177177429199, | |
| "mean_token_accuracy": 0.7646962344646454, | |
| "num_tokens": 22081971.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "entropy": 0.7735307298600673, | |
| "epoch": 2.1086873325679294, | |
| "grad_norm": 0.10002073645591736, | |
| "learning_rate": 6.257553377198873e-05, | |
| "loss": 0.8011887550354004, | |
| "mean_token_accuracy": 0.8088100135326386, | |
| "num_tokens": 22117897.0, | |
| "step": 5510 | |
| }, | |
| { | |
| "entropy": 0.8981072999536991, | |
| "epoch": 2.1125143513203213, | |
| "grad_norm": 0.10524707287549973, | |
| "learning_rate": 6.230696924936216e-05, | |
| "loss": 0.9659936904907227, | |
| "mean_token_accuracy": 0.7843907788395882, | |
| "num_tokens": 22161049.0, | |
| "step": 5520 | |
| }, | |
| { | |
| "entropy": 0.8891891561448574, | |
| "epoch": 2.116341370072713, | |
| "grad_norm": 0.10095740854740143, | |
| "learning_rate": 6.20384047267356e-05, | |
| "loss": 0.9199987411499023, | |
| "mean_token_accuracy": 0.7833669245243072, | |
| "num_tokens": 22201183.0, | |
| "step": 5530 | |
| }, | |
| { | |
| "entropy": 0.9359986830502749, | |
| "epoch": 2.120168388825105, | |
| "grad_norm": 0.08723930269479752, | |
| "learning_rate": 6.176984020410904e-05, | |
| "loss": 0.9635790824890137, | |
| "mean_token_accuracy": 0.7724878415465355, | |
| "num_tokens": 22240779.0, | |
| "step": 5540 | |
| }, | |
| { | |
| "entropy": 0.8017430886626243, | |
| "epoch": 2.123995407577497, | |
| "grad_norm": 0.10579924285411835, | |
| "learning_rate": 6.150127568148249e-05, | |
| "loss": 0.842125129699707, | |
| "mean_token_accuracy": 0.8020379558205605, | |
| "num_tokens": 22279289.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "entropy": 0.7666160762310028, | |
| "epoch": 2.127822426329889, | |
| "grad_norm": 0.09871628880500793, | |
| "learning_rate": 6.123271115885591e-05, | |
| "loss": 0.8378163337707519, | |
| "mean_token_accuracy": 0.8119754999876022, | |
| "num_tokens": 22316715.0, | |
| "step": 5560 | |
| }, | |
| { | |
| "entropy": 0.9505756117403508, | |
| "epoch": 2.1316494450822807, | |
| "grad_norm": 0.11093632131814957, | |
| "learning_rate": 6.096414663622936e-05, | |
| "loss": 0.9677371025085449, | |
| "mean_token_accuracy": 0.7698320209980011, | |
| "num_tokens": 22360112.0, | |
| "step": 5570 | |
| }, | |
| { | |
| "entropy": 0.7982158973813057, | |
| "epoch": 2.1354764638346726, | |
| "grad_norm": 0.11260368674993515, | |
| "learning_rate": 6.06955821136028e-05, | |
| "loss": 0.8571239471435547, | |
| "mean_token_accuracy": 0.804571321606636, | |
| "num_tokens": 22399114.0, | |
| "step": 5580 | |
| }, | |
| { | |
| "entropy": 0.8869463637471199, | |
| "epoch": 2.1393034825870645, | |
| "grad_norm": 0.08550643920898438, | |
| "learning_rate": 6.042701759097623e-05, | |
| "loss": 0.9476675033569336, | |
| "mean_token_accuracy": 0.7807673364877701, | |
| "num_tokens": 22440187.0, | |
| "step": 5590 | |
| }, | |
| { | |
| "entropy": 0.9491269618272782, | |
| "epoch": 2.1431305013394564, | |
| "grad_norm": 0.09019884467124939, | |
| "learning_rate": 6.015845306834967e-05, | |
| "loss": 1.0232599258422852, | |
| "mean_token_accuracy": 0.7681664958596229, | |
| "num_tokens": 22479682.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "entropy": 0.8861779697239399, | |
| "epoch": 2.1469575200918483, | |
| "grad_norm": 0.11756031215190887, | |
| "learning_rate": 5.988988854572312e-05, | |
| "loss": 0.9251557350158691, | |
| "mean_token_accuracy": 0.7849425792694091, | |
| "num_tokens": 22520352.0, | |
| "step": 5610 | |
| }, | |
| { | |
| "entropy": 0.8735060147941113, | |
| "epoch": 2.15078453884424, | |
| "grad_norm": 0.0996679812669754, | |
| "learning_rate": 5.9621324023096546e-05, | |
| "loss": 0.9677264213562011, | |
| "mean_token_accuracy": 0.7881714150309562, | |
| "num_tokens": 22561677.0, | |
| "step": 5620 | |
| }, | |
| { | |
| "entropy": 0.991636025160551, | |
| "epoch": 2.154611557596632, | |
| "grad_norm": 0.10682649165391922, | |
| "learning_rate": 5.935275950046999e-05, | |
| "loss": 1.050811195373535, | |
| "mean_token_accuracy": 0.7574850931763649, | |
| "num_tokens": 22609671.0, | |
| "step": 5630 | |
| }, | |
| { | |
| "entropy": 0.9028345100581646, | |
| "epoch": 2.158438576349024, | |
| "grad_norm": 0.11249802261590958, | |
| "learning_rate": 5.908419497784343e-05, | |
| "loss": 0.9876343727111816, | |
| "mean_token_accuracy": 0.783162035048008, | |
| "num_tokens": 22650924.0, | |
| "step": 5640 | |
| }, | |
| { | |
| "entropy": 0.868353420495987, | |
| "epoch": 2.162265595101416, | |
| "grad_norm": 0.08846433460712433, | |
| "learning_rate": 5.8815630455216867e-05, | |
| "loss": 0.9271388053894043, | |
| "mean_token_accuracy": 0.7898381799459457, | |
| "num_tokens": 22691550.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "entropy": 0.9247912406921387, | |
| "epoch": 2.1660926138538077, | |
| "grad_norm": 0.10013602674007416, | |
| "learning_rate": 5.854706593259031e-05, | |
| "loss": 1.0093653678894043, | |
| "mean_token_accuracy": 0.7723490744829178, | |
| "num_tokens": 22728956.0, | |
| "step": 5660 | |
| }, | |
| { | |
| "entropy": 0.82930968105793, | |
| "epoch": 2.1699196326061996, | |
| "grad_norm": 0.11004043370485306, | |
| "learning_rate": 5.827850140996375e-05, | |
| "loss": 0.8801467895507813, | |
| "mean_token_accuracy": 0.798722094297409, | |
| "num_tokens": 22765064.0, | |
| "step": 5670 | |
| }, | |
| { | |
| "entropy": 0.8950945638120175, | |
| "epoch": 2.1737466513585915, | |
| "grad_norm": 0.09994686394929886, | |
| "learning_rate": 5.800993688733719e-05, | |
| "loss": 0.9781051635742187, | |
| "mean_token_accuracy": 0.7849533364176751, | |
| "num_tokens": 22802213.0, | |
| "step": 5680 | |
| }, | |
| { | |
| "entropy": 0.8847132481634616, | |
| "epoch": 2.1775736701109834, | |
| "grad_norm": 0.09891512989997864, | |
| "learning_rate": 5.774137236471062e-05, | |
| "loss": 0.9338027954101562, | |
| "mean_token_accuracy": 0.7867394030094147, | |
| "num_tokens": 22839400.0, | |
| "step": 5690 | |
| }, | |
| { | |
| "entropy": 0.8212509788572788, | |
| "epoch": 2.1814006888633752, | |
| "grad_norm": 0.10451705008745193, | |
| "learning_rate": 5.747280784208406e-05, | |
| "loss": 0.8740688323974609, | |
| "mean_token_accuracy": 0.7968196496367455, | |
| "num_tokens": 22877771.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "entropy": 0.7856742814183235, | |
| "epoch": 2.185227707615767, | |
| "grad_norm": 0.09351614862680435, | |
| "learning_rate": 5.720424331945751e-05, | |
| "loss": 0.8385543823242188, | |
| "mean_token_accuracy": 0.8064358577132225, | |
| "num_tokens": 22916159.0, | |
| "step": 5710 | |
| }, | |
| { | |
| "entropy": 0.9431014984846116, | |
| "epoch": 2.189054726368159, | |
| "grad_norm": 0.09432144463062286, | |
| "learning_rate": 5.6935678796830935e-05, | |
| "loss": 1.0021851539611817, | |
| "mean_token_accuracy": 0.7693860113620759, | |
| "num_tokens": 22958014.0, | |
| "step": 5720 | |
| }, | |
| { | |
| "entropy": 0.9080683786422015, | |
| "epoch": 2.192881745120551, | |
| "grad_norm": 0.08724278956651688, | |
| "learning_rate": 5.666711427420438e-05, | |
| "loss": 0.9878963470458985, | |
| "mean_token_accuracy": 0.7802156403660774, | |
| "num_tokens": 23003222.0, | |
| "step": 5730 | |
| }, | |
| { | |
| "entropy": 0.8772326201200485, | |
| "epoch": 2.196708763872943, | |
| "grad_norm": 0.1096489354968071, | |
| "learning_rate": 5.639854975157782e-05, | |
| "loss": 0.9326786041259766, | |
| "mean_token_accuracy": 0.7881689593195915, | |
| "num_tokens": 23039512.0, | |
| "step": 5740 | |
| }, | |
| { | |
| "entropy": 0.9084336057305336, | |
| "epoch": 2.2005357826253347, | |
| "grad_norm": 0.11137977987527847, | |
| "learning_rate": 5.6129985228951256e-05, | |
| "loss": 0.9574773788452149, | |
| "mean_token_accuracy": 0.7860094889998436, | |
| "num_tokens": 23078238.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "entropy": 0.836103780195117, | |
| "epoch": 2.2043628013777266, | |
| "grad_norm": 0.11038387566804886, | |
| "learning_rate": 5.5861420706324696e-05, | |
| "loss": 0.88037109375, | |
| "mean_token_accuracy": 0.7916925936937332, | |
| "num_tokens": 23121089.0, | |
| "step": 5760 | |
| }, | |
| { | |
| "entropy": 0.9425606489181518, | |
| "epoch": 2.2081898201301184, | |
| "grad_norm": 0.10270453989505768, | |
| "learning_rate": 5.5592856183698137e-05, | |
| "loss": 0.983431339263916, | |
| "mean_token_accuracy": 0.7715479463338852, | |
| "num_tokens": 23158047.0, | |
| "step": 5770 | |
| }, | |
| { | |
| "entropy": 0.8212515480816365, | |
| "epoch": 2.2120168388825103, | |
| "grad_norm": 0.0880119651556015, | |
| "learning_rate": 5.532429166107157e-05, | |
| "loss": 0.887947940826416, | |
| "mean_token_accuracy": 0.7997770145535469, | |
| "num_tokens": 23204019.0, | |
| "step": 5780 | |
| }, | |
| { | |
| "entropy": 0.8668085850775242, | |
| "epoch": 2.215843857634902, | |
| "grad_norm": 0.11390146613121033, | |
| "learning_rate": 5.505572713844501e-05, | |
| "loss": 0.9010316848754882, | |
| "mean_token_accuracy": 0.7880747586488723, | |
| "num_tokens": 23241922.0, | |
| "step": 5790 | |
| }, | |
| { | |
| "entropy": 0.7907863073050976, | |
| "epoch": 2.219670876387294, | |
| "grad_norm": 0.11713080108165741, | |
| "learning_rate": 5.478716261581846e-05, | |
| "loss": 0.8595284461975098, | |
| "mean_token_accuracy": 0.8068661123514176, | |
| "num_tokens": 23280534.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "entropy": 0.8358560226857662, | |
| "epoch": 2.223497895139686, | |
| "grad_norm": 0.11117064207792282, | |
| "learning_rate": 5.45185980931919e-05, | |
| "loss": 0.8745571136474609, | |
| "mean_token_accuracy": 0.793362820148468, | |
| "num_tokens": 23323119.0, | |
| "step": 5810 | |
| }, | |
| { | |
| "entropy": 0.8238232973963022, | |
| "epoch": 2.227324913892078, | |
| "grad_norm": 0.13185663521289825, | |
| "learning_rate": 5.425003357056533e-05, | |
| "loss": 0.8659845352172851, | |
| "mean_token_accuracy": 0.8025152862071991, | |
| "num_tokens": 23363749.0, | |
| "step": 5820 | |
| }, | |
| { | |
| "entropy": 0.8596846207976341, | |
| "epoch": 2.2311519326444698, | |
| "grad_norm": 0.09360291808843613, | |
| "learning_rate": 5.398146904793877e-05, | |
| "loss": 0.9118245124816895, | |
| "mean_token_accuracy": 0.7882251426577568, | |
| "num_tokens": 23402886.0, | |
| "step": 5830 | |
| }, | |
| { | |
| "entropy": 0.8035648860037327, | |
| "epoch": 2.2349789513968616, | |
| "grad_norm": 0.09347285330295563, | |
| "learning_rate": 5.371290452531221e-05, | |
| "loss": 0.8725827217102051, | |
| "mean_token_accuracy": 0.8045972406864166, | |
| "num_tokens": 23442339.0, | |
| "step": 5840 | |
| }, | |
| { | |
| "entropy": 0.9175308585166931, | |
| "epoch": 2.2388059701492535, | |
| "grad_norm": 0.12336985766887665, | |
| "learning_rate": 5.3444340002685645e-05, | |
| "loss": 0.9388077735900879, | |
| "mean_token_accuracy": 0.7768721342086792, | |
| "num_tokens": 23481344.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "entropy": 0.868817687779665, | |
| "epoch": 2.2426329889016454, | |
| "grad_norm": 0.10311949998140335, | |
| "learning_rate": 5.3175775480059086e-05, | |
| "loss": 0.9337680816650391, | |
| "mean_token_accuracy": 0.7877210825681686, | |
| "num_tokens": 23520637.0, | |
| "step": 5860 | |
| }, | |
| { | |
| "entropy": 0.854228886961937, | |
| "epoch": 2.2464600076540373, | |
| "grad_norm": 0.10659918189048767, | |
| "learning_rate": 5.2907210957432526e-05, | |
| "loss": 0.9077530860900879, | |
| "mean_token_accuracy": 0.7909654468297959, | |
| "num_tokens": 23559877.0, | |
| "step": 5870 | |
| }, | |
| { | |
| "entropy": 0.8457217663526535, | |
| "epoch": 2.250287026406429, | |
| "grad_norm": 0.09633689373731613, | |
| "learning_rate": 5.263864643480596e-05, | |
| "loss": 0.8785475730895996, | |
| "mean_token_accuracy": 0.7941769883036613, | |
| "num_tokens": 23597033.0, | |
| "step": 5880 | |
| }, | |
| { | |
| "entropy": 0.8822055049240589, | |
| "epoch": 2.254114045158821, | |
| "grad_norm": 0.09562286734580994, | |
| "learning_rate": 5.23700819121794e-05, | |
| "loss": 0.8851138114929199, | |
| "mean_token_accuracy": 0.7860250055789948, | |
| "num_tokens": 23634788.0, | |
| "step": 5890 | |
| }, | |
| { | |
| "entropy": 0.8556318368762732, | |
| "epoch": 2.257941063911213, | |
| "grad_norm": 0.08814764767885208, | |
| "learning_rate": 5.210151738955285e-05, | |
| "loss": 0.8866415977478027, | |
| "mean_token_accuracy": 0.7966004252433777, | |
| "num_tokens": 23673283.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "entropy": 0.7395530994981527, | |
| "epoch": 2.261768082663605, | |
| "grad_norm": 0.07671936601400375, | |
| "learning_rate": 5.1832952866926274e-05, | |
| "loss": 0.7680532455444335, | |
| "mean_token_accuracy": 0.8190904691815376, | |
| "num_tokens": 23711540.0, | |
| "step": 5910 | |
| }, | |
| { | |
| "entropy": 0.8898126773536206, | |
| "epoch": 2.2655951014159967, | |
| "grad_norm": 0.06960798799991608, | |
| "learning_rate": 5.156438834429972e-05, | |
| "loss": 1.026920700073242, | |
| "mean_token_accuracy": 0.7816770374774933, | |
| "num_tokens": 23756178.0, | |
| "step": 5920 | |
| }, | |
| { | |
| "entropy": 0.8902945756912232, | |
| "epoch": 2.2694221201683886, | |
| "grad_norm": 0.1114925891160965, | |
| "learning_rate": 5.129582382167316e-05, | |
| "loss": 0.9598423957824707, | |
| "mean_token_accuracy": 0.784630736708641, | |
| "num_tokens": 23792151.0, | |
| "step": 5930 | |
| }, | |
| { | |
| "entropy": 0.8439918398857117, | |
| "epoch": 2.2732491389207805, | |
| "grad_norm": 0.16730423271656036, | |
| "learning_rate": 5.10272592990466e-05, | |
| "loss": 0.851725959777832, | |
| "mean_token_accuracy": 0.7940610617399215, | |
| "num_tokens": 23830309.0, | |
| "step": 5940 | |
| }, | |
| { | |
| "entropy": 0.9178552135825158, | |
| "epoch": 2.2770761576731724, | |
| "grad_norm": 0.16359879076480865, | |
| "learning_rate": 5.0758694776420035e-05, | |
| "loss": 0.9417426109313964, | |
| "mean_token_accuracy": 0.7781487166881561, | |
| "num_tokens": 23874638.0, | |
| "step": 5950 | |
| }, | |
| { | |
| "entropy": 0.9053961969912052, | |
| "epoch": 2.2809031764255643, | |
| "grad_norm": 0.08877693116664886, | |
| "learning_rate": 5.0490130253793475e-05, | |
| "loss": 0.9975083351135254, | |
| "mean_token_accuracy": 0.7837231978774071, | |
| "num_tokens": 23918641.0, | |
| "step": 5960 | |
| }, | |
| { | |
| "entropy": 0.8590337552130223, | |
| "epoch": 2.284730195177956, | |
| "grad_norm": 0.1032002717256546, | |
| "learning_rate": 5.022156573116692e-05, | |
| "loss": 0.8895168304443359, | |
| "mean_token_accuracy": 0.7937395930290222, | |
| "num_tokens": 23964403.0, | |
| "step": 5970 | |
| }, | |
| { | |
| "entropy": 0.8678315542638302, | |
| "epoch": 2.288557213930348, | |
| "grad_norm": 0.12054577469825745, | |
| "learning_rate": 4.9953001208540356e-05, | |
| "loss": 0.9571179389953614, | |
| "mean_token_accuracy": 0.7875312000513077, | |
| "num_tokens": 24001736.0, | |
| "step": 5980 | |
| }, | |
| { | |
| "entropy": 0.8353918489068747, | |
| "epoch": 2.29238423268274, | |
| "grad_norm": 0.1126277968287468, | |
| "learning_rate": 4.9684436685913796e-05, | |
| "loss": 0.927174186706543, | |
| "mean_token_accuracy": 0.7998543947935104, | |
| "num_tokens": 24038494.0, | |
| "step": 5990 | |
| }, | |
| { | |
| "entropy": 0.7281714532524347, | |
| "epoch": 2.296211251435132, | |
| "grad_norm": 0.09404657036066055, | |
| "learning_rate": 4.941587216328723e-05, | |
| "loss": 0.7814407825469971, | |
| "mean_token_accuracy": 0.8194777265191078, | |
| "num_tokens": 24077404.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "entropy": 0.8627386562526226, | |
| "epoch": 2.3000382701875237, | |
| "grad_norm": 0.07272294908761978, | |
| "learning_rate": 4.914730764066067e-05, | |
| "loss": 0.8920239448547364, | |
| "mean_token_accuracy": 0.7905093863606453, | |
| "num_tokens": 24123483.0, | |
| "step": 6010 | |
| }, | |
| { | |
| "entropy": 0.8679380901157856, | |
| "epoch": 2.3038652889399156, | |
| "grad_norm": 0.09443669021129608, | |
| "learning_rate": 4.887874311803411e-05, | |
| "loss": 0.874543571472168, | |
| "mean_token_accuracy": 0.7891486629843711, | |
| "num_tokens": 24165215.0, | |
| "step": 6020 | |
| }, | |
| { | |
| "entropy": 0.8942526787519455, | |
| "epoch": 2.3076923076923075, | |
| "grad_norm": 0.0953405573964119, | |
| "learning_rate": 4.861017859540755e-05, | |
| "loss": 0.9304584503173828, | |
| "mean_token_accuracy": 0.7855148240923882, | |
| "num_tokens": 24204454.0, | |
| "step": 6030 | |
| }, | |
| { | |
| "entropy": 0.7896301347762347, | |
| "epoch": 2.3115193264446994, | |
| "grad_norm": 0.11093971133232117, | |
| "learning_rate": 4.834161407278099e-05, | |
| "loss": 0.8957646369934082, | |
| "mean_token_accuracy": 0.8066290900111198, | |
| "num_tokens": 24245578.0, | |
| "step": 6040 | |
| }, | |
| { | |
| "entropy": 0.9012999664992094, | |
| "epoch": 2.3153463451970913, | |
| "grad_norm": 0.09953141212463379, | |
| "learning_rate": 4.8073049550154424e-05, | |
| "loss": 0.9699124336242676, | |
| "mean_token_accuracy": 0.7792607560753823, | |
| "num_tokens": 24286627.0, | |
| "step": 6050 | |
| }, | |
| { | |
| "entropy": 0.8553815156221389, | |
| "epoch": 2.319173363949483, | |
| "grad_norm": 0.09737669676542282, | |
| "learning_rate": 4.7804485027527864e-05, | |
| "loss": 0.9319831848144531, | |
| "mean_token_accuracy": 0.7943563163280487, | |
| "num_tokens": 24326050.0, | |
| "step": 6060 | |
| }, | |
| { | |
| "entropy": 0.8088245622813701, | |
| "epoch": 2.323000382701875, | |
| "grad_norm": 0.11754145473241806, | |
| "learning_rate": 4.7535920504901305e-05, | |
| "loss": 0.8612746238708496, | |
| "mean_token_accuracy": 0.7998821645975113, | |
| "num_tokens": 24365505.0, | |
| "step": 6070 | |
| }, | |
| { | |
| "entropy": 0.8720655493438244, | |
| "epoch": 2.326827401454267, | |
| "grad_norm": 0.10582665354013443, | |
| "learning_rate": 4.726735598227474e-05, | |
| "loss": 0.9663046836853028, | |
| "mean_token_accuracy": 0.78773233294487, | |
| "num_tokens": 24403619.0, | |
| "step": 6080 | |
| }, | |
| { | |
| "entropy": 0.814146314561367, | |
| "epoch": 2.330654420206659, | |
| "grad_norm": 0.10099766403436661, | |
| "learning_rate": 4.6998791459648185e-05, | |
| "loss": 0.8403602600097656, | |
| "mean_token_accuracy": 0.8022790655493737, | |
| "num_tokens": 24441133.0, | |
| "step": 6090 | |
| }, | |
| { | |
| "entropy": 0.8325122386217118, | |
| "epoch": 2.3344814389590507, | |
| "grad_norm": 0.0968555137515068, | |
| "learning_rate": 4.673022693702162e-05, | |
| "loss": 0.8952775955200195, | |
| "mean_token_accuracy": 0.7972952157258988, | |
| "num_tokens": 24487908.0, | |
| "step": 6100 | |
| }, | |
| { | |
| "entropy": 0.8313679326325655, | |
| "epoch": 2.3383084577114426, | |
| "grad_norm": 0.09856109321117401, | |
| "learning_rate": 4.6461662414395066e-05, | |
| "loss": 0.8740328788757324, | |
| "mean_token_accuracy": 0.7973453208804131, | |
| "num_tokens": 24528859.0, | |
| "step": 6110 | |
| }, | |
| { | |
| "entropy": 0.9734285809099674, | |
| "epoch": 2.3421354764638345, | |
| "grad_norm": 0.08564373850822449, | |
| "learning_rate": 4.61930978917685e-05, | |
| "loss": 1.0028407096862793, | |
| "mean_token_accuracy": 0.761284664273262, | |
| "num_tokens": 24574604.0, | |
| "step": 6120 | |
| }, | |
| { | |
| "entropy": 0.9015337243676186, | |
| "epoch": 2.3459624952162264, | |
| "grad_norm": 0.09626568853855133, | |
| "learning_rate": 4.592453336914194e-05, | |
| "loss": 0.9965445518493652, | |
| "mean_token_accuracy": 0.7804829552769661, | |
| "num_tokens": 24615926.0, | |
| "step": 6130 | |
| }, | |
| { | |
| "entropy": 0.8764280565083027, | |
| "epoch": 2.3497895139686182, | |
| "grad_norm": 0.09104456007480621, | |
| "learning_rate": 4.565596884651538e-05, | |
| "loss": 0.9158814430236817, | |
| "mean_token_accuracy": 0.7859255224466324, | |
| "num_tokens": 24656662.0, | |
| "step": 6140 | |
| }, | |
| { | |
| "entropy": 0.8626538865268231, | |
| "epoch": 2.35361653272101, | |
| "grad_norm": 0.10454346984624863, | |
| "learning_rate": 4.5387404323888814e-05, | |
| "loss": 0.9093445777893067, | |
| "mean_token_accuracy": 0.7909897804260254, | |
| "num_tokens": 24696048.0, | |
| "step": 6150 | |
| }, | |
| { | |
| "entropy": 0.9042750746011734, | |
| "epoch": 2.357443551473402, | |
| "grad_norm": 0.09976542741060257, | |
| "learning_rate": 4.511883980126226e-05, | |
| "loss": 0.9527711868286133, | |
| "mean_token_accuracy": 0.7807446241378784, | |
| "num_tokens": 24738856.0, | |
| "step": 6160 | |
| }, | |
| { | |
| "entropy": 0.892713101953268, | |
| "epoch": 2.361270570225794, | |
| "grad_norm": 0.09778838604688644, | |
| "learning_rate": 4.4850275278635694e-05, | |
| "loss": 0.9142132759094238, | |
| "mean_token_accuracy": 0.7793798848986626, | |
| "num_tokens": 24781940.0, | |
| "step": 6170 | |
| }, | |
| { | |
| "entropy": 0.8652282394468784, | |
| "epoch": 2.365097588978186, | |
| "grad_norm": 0.13737474381923676, | |
| "learning_rate": 4.4581710756009134e-05, | |
| "loss": 0.9030959129333496, | |
| "mean_token_accuracy": 0.7882118329405785, | |
| "num_tokens": 24818476.0, | |
| "step": 6180 | |
| }, | |
| { | |
| "entropy": 0.880942365527153, | |
| "epoch": 2.3689246077305777, | |
| "grad_norm": 0.09460416436195374, | |
| "learning_rate": 4.4313146233382575e-05, | |
| "loss": 0.9684123992919922, | |
| "mean_token_accuracy": 0.7829654842615128, | |
| "num_tokens": 24856950.0, | |
| "step": 6190 | |
| }, | |
| { | |
| "entropy": 0.9563789039850235, | |
| "epoch": 2.3727516264829696, | |
| "grad_norm": 0.10954713076353073, | |
| "learning_rate": 4.404458171075601e-05, | |
| "loss": 1.029030704498291, | |
| "mean_token_accuracy": 0.7727080956101418, | |
| "num_tokens": 24895606.0, | |
| "step": 6200 | |
| }, | |
| { | |
| "entropy": 0.827500730752945, | |
| "epoch": 2.3765786452353614, | |
| "grad_norm": 0.1212112084031105, | |
| "learning_rate": 4.377601718812945e-05, | |
| "loss": 0.8650990486145019, | |
| "mean_token_accuracy": 0.7993797525763512, | |
| "num_tokens": 24932482.0, | |
| "step": 6210 | |
| }, | |
| { | |
| "entropy": 0.8221234314143657, | |
| "epoch": 2.3804056639877533, | |
| "grad_norm": 0.10023710876703262, | |
| "learning_rate": 4.350745266550289e-05, | |
| "loss": 0.8777777671813964, | |
| "mean_token_accuracy": 0.7987013593316078, | |
| "num_tokens": 24975109.0, | |
| "step": 6220 | |
| }, | |
| { | |
| "entropy": 0.8734230428934098, | |
| "epoch": 2.384232682740145, | |
| "grad_norm": 0.09403553605079651, | |
| "learning_rate": 4.323888814287633e-05, | |
| "loss": 0.8978803634643555, | |
| "mean_token_accuracy": 0.7872134670615196, | |
| "num_tokens": 25020916.0, | |
| "step": 6230 | |
| }, | |
| { | |
| "entropy": 0.9003870271146297, | |
| "epoch": 2.388059701492537, | |
| "grad_norm": 0.09854581952095032, | |
| "learning_rate": 4.297032362024977e-05, | |
| "loss": 0.9225659370422363, | |
| "mean_token_accuracy": 0.7807397484779358, | |
| "num_tokens": 25061018.0, | |
| "step": 6240 | |
| }, | |
| { | |
| "entropy": 0.8118300527334213, | |
| "epoch": 2.391886720244929, | |
| "grad_norm": 0.11139514297246933, | |
| "learning_rate": 4.27017590976232e-05, | |
| "loss": 0.8876243591308594, | |
| "mean_token_accuracy": 0.800039604306221, | |
| "num_tokens": 25097954.0, | |
| "step": 6250 | |
| }, | |
| { | |
| "entropy": 0.8419897515326739, | |
| "epoch": 2.395713738997321, | |
| "grad_norm": 0.09123879671096802, | |
| "learning_rate": 4.243319457499664e-05, | |
| "loss": 0.86744384765625, | |
| "mean_token_accuracy": 0.7919191718101501, | |
| "num_tokens": 25134260.0, | |
| "step": 6260 | |
| }, | |
| { | |
| "entropy": 0.9123246632516384, | |
| "epoch": 2.3995407577497128, | |
| "grad_norm": 0.10300562530755997, | |
| "learning_rate": 4.2164630052370084e-05, | |
| "loss": 0.9368386268615723, | |
| "mean_token_accuracy": 0.7797829449176789, | |
| "num_tokens": 25176001.0, | |
| "step": 6270 | |
| }, | |
| { | |
| "entropy": 0.9066010326147079, | |
| "epoch": 2.4033677765021046, | |
| "grad_norm": 0.10231593996286392, | |
| "learning_rate": 4.1896065529743524e-05, | |
| "loss": 0.9637252807617187, | |
| "mean_token_accuracy": 0.7807635113596916, | |
| "num_tokens": 25214450.0, | |
| "step": 6280 | |
| }, | |
| { | |
| "entropy": 0.8680018067359925, | |
| "epoch": 2.4071947952544965, | |
| "grad_norm": 0.09813899546861649, | |
| "learning_rate": 4.162750100711696e-05, | |
| "loss": 0.9405930519104004, | |
| "mean_token_accuracy": 0.7862071350216866, | |
| "num_tokens": 25249019.0, | |
| "step": 6290 | |
| }, | |
| { | |
| "entropy": 0.8444254245609045, | |
| "epoch": 2.4110218140068884, | |
| "grad_norm": 0.09815159440040588, | |
| "learning_rate": 4.1358936484490404e-05, | |
| "loss": 0.9015726089477539, | |
| "mean_token_accuracy": 0.7970604464411736, | |
| "num_tokens": 25287466.0, | |
| "step": 6300 | |
| }, | |
| { | |
| "entropy": 0.9179269846528768, | |
| "epoch": 2.4148488327592803, | |
| "grad_norm": 0.1013285368680954, | |
| "learning_rate": 4.109037196186384e-05, | |
| "loss": 0.9629206657409668, | |
| "mean_token_accuracy": 0.7756785362958908, | |
| "num_tokens": 25325488.0, | |
| "step": 6310 | |
| }, | |
| { | |
| "entropy": 0.8627055402845144, | |
| "epoch": 2.418675851511672, | |
| "grad_norm": 0.09085863828659058, | |
| "learning_rate": 4.082180743923728e-05, | |
| "loss": 0.8825644493103028, | |
| "mean_token_accuracy": 0.7927587017416954, | |
| "num_tokens": 25362470.0, | |
| "step": 6320 | |
| }, | |
| { | |
| "entropy": 0.8909512132406234, | |
| "epoch": 2.422502870264064, | |
| "grad_norm": 0.12609654664993286, | |
| "learning_rate": 4.055324291661072e-05, | |
| "loss": 0.9005517959594727, | |
| "mean_token_accuracy": 0.784729179739952, | |
| "num_tokens": 25405399.0, | |
| "step": 6330 | |
| }, | |
| { | |
| "entropy": 0.8371693149209023, | |
| "epoch": 2.426329889016456, | |
| "grad_norm": 0.09511356055736542, | |
| "learning_rate": 4.028467839398415e-05, | |
| "loss": 0.8819235801696778, | |
| "mean_token_accuracy": 0.7933985084295273, | |
| "num_tokens": 25443537.0, | |
| "step": 6340 | |
| }, | |
| { | |
| "entropy": 0.8452706336975098, | |
| "epoch": 2.430156907768848, | |
| "grad_norm": 0.08440756797790527, | |
| "learning_rate": 4.00161138713576e-05, | |
| "loss": 0.9220956802368164, | |
| "mean_token_accuracy": 0.791832709312439, | |
| "num_tokens": 25482874.0, | |
| "step": 6350 | |
| }, | |
| { | |
| "entropy": 0.8533206440508365, | |
| "epoch": 2.4339839265212397, | |
| "grad_norm": 0.10529948770999908, | |
| "learning_rate": 3.974754934873103e-05, | |
| "loss": 0.8976041793823242, | |
| "mean_token_accuracy": 0.7917203813791275, | |
| "num_tokens": 25523091.0, | |
| "step": 6360 | |
| }, | |
| { | |
| "entropy": 0.8192368470132351, | |
| "epoch": 2.4378109452736316, | |
| "grad_norm": 0.08338342607021332, | |
| "learning_rate": 3.947898482610447e-05, | |
| "loss": 0.8657890319824219, | |
| "mean_token_accuracy": 0.8002077579498291, | |
| "num_tokens": 25566050.0, | |
| "step": 6370 | |
| }, | |
| { | |
| "entropy": 0.9303523369133473, | |
| "epoch": 2.4416379640260235, | |
| "grad_norm": 0.09010683745145798, | |
| "learning_rate": 3.921042030347791e-05, | |
| "loss": 0.9760264396667481, | |
| "mean_token_accuracy": 0.7748634815216064, | |
| "num_tokens": 25608936.0, | |
| "step": 6380 | |
| }, | |
| { | |
| "entropy": 0.7555282160639762, | |
| "epoch": 2.4454649827784154, | |
| "grad_norm": 0.11948851495981216, | |
| "learning_rate": 3.894185578085135e-05, | |
| "loss": 0.8005829811096191, | |
| "mean_token_accuracy": 0.8136610746383667, | |
| "num_tokens": 25647408.0, | |
| "step": 6390 | |
| }, | |
| { | |
| "entropy": 0.8959879912436008, | |
| "epoch": 2.4492920015308073, | |
| "grad_norm": 0.09189214557409286, | |
| "learning_rate": 3.8673291258224794e-05, | |
| "loss": 0.9070920944213867, | |
| "mean_token_accuracy": 0.7838554188609124, | |
| "num_tokens": 25690271.0, | |
| "step": 6400 | |
| }, | |
| { | |
| "entropy": 0.7601668298244476, | |
| "epoch": 2.453119020283199, | |
| "grad_norm": 0.11115460842847824, | |
| "learning_rate": 3.840472673559823e-05, | |
| "loss": 0.837701416015625, | |
| "mean_token_accuracy": 0.8158529132604599, | |
| "num_tokens": 25730098.0, | |
| "step": 6410 | |
| }, | |
| { | |
| "entropy": 0.9026189528405666, | |
| "epoch": 2.456946039035591, | |
| "grad_norm": 0.0951504036784172, | |
| "learning_rate": 3.813616221297167e-05, | |
| "loss": 0.9555998802185058, | |
| "mean_token_accuracy": 0.7768774792551995, | |
| "num_tokens": 25769649.0, | |
| "step": 6420 | |
| }, | |
| { | |
| "entropy": 0.8566267982125282, | |
| "epoch": 2.460773057787983, | |
| "grad_norm": 0.1477993279695511, | |
| "learning_rate": 3.786759769034511e-05, | |
| "loss": 0.901324462890625, | |
| "mean_token_accuracy": 0.7918707326054573, | |
| "num_tokens": 25805906.0, | |
| "step": 6430 | |
| }, | |
| { | |
| "entropy": 0.8576595298945904, | |
| "epoch": 2.464600076540375, | |
| "grad_norm": 0.08643563091754913, | |
| "learning_rate": 3.759903316771854e-05, | |
| "loss": 0.9027094841003418, | |
| "mean_token_accuracy": 0.7925754263997078, | |
| "num_tokens": 25847270.0, | |
| "step": 6440 | |
| }, | |
| { | |
| "entropy": 0.8848195761442185, | |
| "epoch": 2.4684270952927667, | |
| "grad_norm": 0.1148499846458435, | |
| "learning_rate": 3.733046864509199e-05, | |
| "loss": 0.9222222328186035, | |
| "mean_token_accuracy": 0.7866752982139588, | |
| "num_tokens": 25890454.0, | |
| "step": 6450 | |
| }, | |
| { | |
| "entropy": 0.8222585029900074, | |
| "epoch": 2.4722541140451586, | |
| "grad_norm": 0.1051439717411995, | |
| "learning_rate": 3.706190412246542e-05, | |
| "loss": 0.8674264907836914, | |
| "mean_token_accuracy": 0.8014690011739731, | |
| "num_tokens": 25927176.0, | |
| "step": 6460 | |
| }, | |
| { | |
| "entropy": 0.7895723138004541, | |
| "epoch": 2.4760811327975505, | |
| "grad_norm": 0.08904940634965897, | |
| "learning_rate": 3.679333959983886e-05, | |
| "loss": 0.8720718383789062, | |
| "mean_token_accuracy": 0.8032544136047364, | |
| "num_tokens": 25969008.0, | |
| "step": 6470 | |
| }, | |
| { | |
| "entropy": 0.8449521534144878, | |
| "epoch": 2.4799081515499424, | |
| "grad_norm": 0.09109736979007721, | |
| "learning_rate": 3.65247750772123e-05, | |
| "loss": 0.8994977951049805, | |
| "mean_token_accuracy": 0.7939551532268524, | |
| "num_tokens": 26008671.0, | |
| "step": 6480 | |
| }, | |
| { | |
| "entropy": 0.8769714809954167, | |
| "epoch": 2.4837351703023343, | |
| "grad_norm": 0.09221527725458145, | |
| "learning_rate": 3.625621055458574e-05, | |
| "loss": 0.9647493362426758, | |
| "mean_token_accuracy": 0.7877351269125938, | |
| "num_tokens": 26047583.0, | |
| "step": 6490 | |
| }, | |
| { | |
| "entropy": 0.840660959109664, | |
| "epoch": 2.487562189054726, | |
| "grad_norm": 0.0888860896229744, | |
| "learning_rate": 3.598764603195918e-05, | |
| "loss": 0.872824764251709, | |
| "mean_token_accuracy": 0.7932088255882264, | |
| "num_tokens": 26090690.0, | |
| "step": 6500 | |
| }, | |
| { | |
| "entropy": 0.9435165245085955, | |
| "epoch": 2.491389207807118, | |
| "grad_norm": 0.10055243968963623, | |
| "learning_rate": 3.571908150933262e-05, | |
| "loss": 1.008607769012451, | |
| "mean_token_accuracy": 0.7684792190790176, | |
| "num_tokens": 26134620.0, | |
| "step": 6510 | |
| }, | |
| { | |
| "entropy": 0.9596942149102687, | |
| "epoch": 2.49521622655951, | |
| "grad_norm": 0.11321604251861572, | |
| "learning_rate": 3.545051698670606e-05, | |
| "loss": 1.021597957611084, | |
| "mean_token_accuracy": 0.7706323087215423, | |
| "num_tokens": 26176850.0, | |
| "step": 6520 | |
| }, | |
| { | |
| "entropy": 0.9805667255073786, | |
| "epoch": 2.499043245311902, | |
| "grad_norm": 0.13084010779857635, | |
| "learning_rate": 3.51819524640795e-05, | |
| "loss": 1.0418537139892579, | |
| "mean_token_accuracy": 0.763472905755043, | |
| "num_tokens": 26220943.0, | |
| "step": 6530 | |
| }, | |
| { | |
| "entropy": 0.9104986634105444, | |
| "epoch": 2.5028702640642937, | |
| "grad_norm": 0.09176472574472427, | |
| "learning_rate": 3.491338794145294e-05, | |
| "loss": 0.972693920135498, | |
| "mean_token_accuracy": 0.7809211134910583, | |
| "num_tokens": 26262084.0, | |
| "step": 6540 | |
| }, | |
| { | |
| "entropy": 0.8316202580928802, | |
| "epoch": 2.5066972828166856, | |
| "grad_norm": 0.11009900271892548, | |
| "learning_rate": 3.464482341882637e-05, | |
| "loss": 0.8581557273864746, | |
| "mean_token_accuracy": 0.7978575736284256, | |
| "num_tokens": 26302790.0, | |
| "step": 6550 | |
| }, | |
| { | |
| "entropy": 0.9041007287800312, | |
| "epoch": 2.5105243015690775, | |
| "grad_norm": 0.12103740125894547, | |
| "learning_rate": 3.437625889619981e-05, | |
| "loss": 0.9546697616577149, | |
| "mean_token_accuracy": 0.7800753250718117, | |
| "num_tokens": 26347959.0, | |
| "step": 6560 | |
| }, | |
| { | |
| "entropy": 0.8139931574463845, | |
| "epoch": 2.5143513203214694, | |
| "grad_norm": 0.08679619431495667, | |
| "learning_rate": 3.410769437357325e-05, | |
| "loss": 0.8982272148132324, | |
| "mean_token_accuracy": 0.8002956256270408, | |
| "num_tokens": 26388946.0, | |
| "step": 6570 | |
| }, | |
| { | |
| "entropy": 0.838017127290368, | |
| "epoch": 2.5181783390738612, | |
| "grad_norm": 0.12066033482551575, | |
| "learning_rate": 3.383912985094669e-05, | |
| "loss": 0.8589006423950195, | |
| "mean_token_accuracy": 0.7943052783608436, | |
| "num_tokens": 26431191.0, | |
| "step": 6580 | |
| }, | |
| { | |
| "entropy": 0.8299121838063002, | |
| "epoch": 2.522005357826253, | |
| "grad_norm": 0.08988375216722488, | |
| "learning_rate": 3.357056532832013e-05, | |
| "loss": 0.9106943130493164, | |
| "mean_token_accuracy": 0.7972570925951004, | |
| "num_tokens": 26468346.0, | |
| "step": 6590 | |
| }, | |
| { | |
| "entropy": 1.0362544253468513, | |
| "epoch": 2.525832376578645, | |
| "grad_norm": 0.10034547746181488, | |
| "learning_rate": 3.3302000805693566e-05, | |
| "loss": 1.0991132736206055, | |
| "mean_token_accuracy": 0.7502188056707382, | |
| "num_tokens": 26508029.0, | |
| "step": 6600 | |
| }, | |
| { | |
| "entropy": 0.9098232574760914, | |
| "epoch": 2.529659395331037, | |
| "grad_norm": 0.12513861060142517, | |
| "learning_rate": 3.303343628306701e-05, | |
| "loss": 0.9807866096496582, | |
| "mean_token_accuracy": 0.7815383434295654, | |
| "num_tokens": 26549321.0, | |
| "step": 6610 | |
| }, | |
| { | |
| "entropy": 0.8234303712844848, | |
| "epoch": 2.533486414083429, | |
| "grad_norm": 0.08378947526216507, | |
| "learning_rate": 3.2764871760440446e-05, | |
| "loss": 0.8650754928588867, | |
| "mean_token_accuracy": 0.7995569303631782, | |
| "num_tokens": 26589472.0, | |
| "step": 6620 | |
| }, | |
| { | |
| "entropy": 0.769949347153306, | |
| "epoch": 2.5373134328358207, | |
| "grad_norm": 0.12056911736726761, | |
| "learning_rate": 3.249630723781389e-05, | |
| "loss": 0.8480927467346191, | |
| "mean_token_accuracy": 0.818176555633545, | |
| "num_tokens": 26627566.0, | |
| "step": 6630 | |
| }, | |
| { | |
| "entropy": 0.8099306054413319, | |
| "epoch": 2.5411404515882126, | |
| "grad_norm": 0.09869939833879471, | |
| "learning_rate": 3.222774271518733e-05, | |
| "loss": 0.8649662017822266, | |
| "mean_token_accuracy": 0.7981634557247161, | |
| "num_tokens": 26662566.0, | |
| "step": 6640 | |
| }, | |
| { | |
| "entropy": 0.8528701025992632, | |
| "epoch": 2.5449674703406044, | |
| "grad_norm": 0.10336704552173615, | |
| "learning_rate": 3.195917819256076e-05, | |
| "loss": 0.9127251625061035, | |
| "mean_token_accuracy": 0.7928516089916229, | |
| "num_tokens": 26705768.0, | |
| "step": 6650 | |
| }, | |
| { | |
| "entropy": 0.8498493686318398, | |
| "epoch": 2.5487944890929963, | |
| "grad_norm": 0.10704471170902252, | |
| "learning_rate": 3.169061366993421e-05, | |
| "loss": 0.863565731048584, | |
| "mean_token_accuracy": 0.7932710304856301, | |
| "num_tokens": 26743574.0, | |
| "step": 6660 | |
| }, | |
| { | |
| "entropy": 0.8566017836332321, | |
| "epoch": 2.552621507845388, | |
| "grad_norm": 0.12135261297225952, | |
| "learning_rate": 3.142204914730764e-05, | |
| "loss": 0.9187004089355468, | |
| "mean_token_accuracy": 0.7913481816649437, | |
| "num_tokens": 26784127.0, | |
| "step": 6670 | |
| }, | |
| { | |
| "entropy": 0.8302055161446333, | |
| "epoch": 2.55644852659778, | |
| "grad_norm": 0.1430647373199463, | |
| "learning_rate": 3.115348462468108e-05, | |
| "loss": 0.8857596397399903, | |
| "mean_token_accuracy": 0.7965412393212319, | |
| "num_tokens": 26823189.0, | |
| "step": 6680 | |
| }, | |
| { | |
| "entropy": 0.8327139757573605, | |
| "epoch": 2.560275545350172, | |
| "grad_norm": 0.09538804739713669, | |
| "learning_rate": 3.088492010205452e-05, | |
| "loss": 0.9255412101745606, | |
| "mean_token_accuracy": 0.7939359977841377, | |
| "num_tokens": 26861599.0, | |
| "step": 6690 | |
| }, | |
| { | |
| "entropy": 0.8530606523156166, | |
| "epoch": 2.564102564102564, | |
| "grad_norm": 0.09193538129329681, | |
| "learning_rate": 3.0616355579427955e-05, | |
| "loss": 0.9151040077209472, | |
| "mean_token_accuracy": 0.7901859179139137, | |
| "num_tokens": 26901064.0, | |
| "step": 6700 | |
| }, | |
| { | |
| "entropy": 0.794033832848072, | |
| "epoch": 2.5679295828549558, | |
| "grad_norm": 0.1283407062292099, | |
| "learning_rate": 3.03477910568014e-05, | |
| "loss": 0.8441056251525879, | |
| "mean_token_accuracy": 0.8033816903829575, | |
| "num_tokens": 26942161.0, | |
| "step": 6710 | |
| }, | |
| { | |
| "entropy": 0.9340717010200024, | |
| "epoch": 2.5717566016073476, | |
| "grad_norm": 0.09237734973430634, | |
| "learning_rate": 3.0079226534174836e-05, | |
| "loss": 0.9747485160827637, | |
| "mean_token_accuracy": 0.7732965379953385, | |
| "num_tokens": 26982759.0, | |
| "step": 6720 | |
| }, | |
| { | |
| "entropy": 0.8746799558401108, | |
| "epoch": 2.5755836203597395, | |
| "grad_norm": 0.1391710638999939, | |
| "learning_rate": 2.9810662011548273e-05, | |
| "loss": 0.9311764717102051, | |
| "mean_token_accuracy": 0.7883311554789543, | |
| "num_tokens": 27022926.0, | |
| "step": 6730 | |
| }, | |
| { | |
| "entropy": 0.8290158938616514, | |
| "epoch": 2.5794106391121314, | |
| "grad_norm": 0.10442391782999039, | |
| "learning_rate": 2.9542097488921716e-05, | |
| "loss": 0.8346040725708008, | |
| "mean_token_accuracy": 0.7985544398427009, | |
| "num_tokens": 27065028.0, | |
| "step": 6740 | |
| }, | |
| { | |
| "entropy": 0.8574424415826798, | |
| "epoch": 2.5832376578645233, | |
| "grad_norm": 0.13001689314842224, | |
| "learning_rate": 2.9273532966295153e-05, | |
| "loss": 0.906099510192871, | |
| "mean_token_accuracy": 0.790866918861866, | |
| "num_tokens": 27100867.0, | |
| "step": 6750 | |
| }, | |
| { | |
| "entropy": 0.840974472463131, | |
| "epoch": 2.587064676616915, | |
| "grad_norm": 0.1224556565284729, | |
| "learning_rate": 2.9004968443668594e-05, | |
| "loss": 0.8969048500061035, | |
| "mean_token_accuracy": 0.7953185483813285, | |
| "num_tokens": 27137338.0, | |
| "step": 6760 | |
| }, | |
| { | |
| "entropy": 0.8477607406675816, | |
| "epoch": 2.590891695369307, | |
| "grad_norm": 0.09641005098819733, | |
| "learning_rate": 2.873640392104203e-05, | |
| "loss": 0.9569526672363281, | |
| "mean_token_accuracy": 0.7941199511289596, | |
| "num_tokens": 27178308.0, | |
| "step": 6770 | |
| }, | |
| { | |
| "entropy": 0.8317056275904179, | |
| "epoch": 2.594718714121699, | |
| "grad_norm": 0.11853990703821182, | |
| "learning_rate": 2.8467839398415468e-05, | |
| "loss": 0.9125295639038086, | |
| "mean_token_accuracy": 0.7960822626948356, | |
| "num_tokens": 27216898.0, | |
| "step": 6780 | |
| }, | |
| { | |
| "entropy": 0.8558823302388191, | |
| "epoch": 2.598545732874091, | |
| "grad_norm": 0.10477570444345474, | |
| "learning_rate": 2.819927487578891e-05, | |
| "loss": 0.8844131469726563, | |
| "mean_token_accuracy": 0.7940610870718956, | |
| "num_tokens": 27254443.0, | |
| "step": 6790 | |
| }, | |
| { | |
| "entropy": 0.8210954669862985, | |
| "epoch": 2.6023727516264827, | |
| "grad_norm": 0.14100609719753265, | |
| "learning_rate": 2.7930710353162348e-05, | |
| "loss": 0.8684535980224609, | |
| "mean_token_accuracy": 0.7988820597529411, | |
| "num_tokens": 27290079.0, | |
| "step": 6800 | |
| }, | |
| { | |
| "entropy": 0.8657392464578152, | |
| "epoch": 2.6061997703788746, | |
| "grad_norm": 0.09813658148050308, | |
| "learning_rate": 2.7662145830535785e-05, | |
| "loss": 0.9158803939819335, | |
| "mean_token_accuracy": 0.7908033922314643, | |
| "num_tokens": 27328190.0, | |
| "step": 6810 | |
| }, | |
| { | |
| "entropy": 0.8866597019135952, | |
| "epoch": 2.6100267891312665, | |
| "grad_norm": 0.11115613579750061, | |
| "learning_rate": 2.739358130790923e-05, | |
| "loss": 0.9120420455932617, | |
| "mean_token_accuracy": 0.7854148596525192, | |
| "num_tokens": 27369945.0, | |
| "step": 6820 | |
| }, | |
| { | |
| "entropy": 0.7982962183654309, | |
| "epoch": 2.6138538078836584, | |
| "grad_norm": 0.1377696692943573, | |
| "learning_rate": 2.7125016785282666e-05, | |
| "loss": 0.8332090377807617, | |
| "mean_token_accuracy": 0.8022376418113708, | |
| "num_tokens": 27406302.0, | |
| "step": 6830 | |
| }, | |
| { | |
| "entropy": 0.8424798093736172, | |
| "epoch": 2.6176808266360503, | |
| "grad_norm": 0.11442425101995468, | |
| "learning_rate": 2.6856452262656106e-05, | |
| "loss": 0.8876424789428711, | |
| "mean_token_accuracy": 0.7893706291913987, | |
| "num_tokens": 27449733.0, | |
| "step": 6840 | |
| }, | |
| { | |
| "entropy": 0.9239407800137996, | |
| "epoch": 2.621507845388442, | |
| "grad_norm": 0.0799759030342102, | |
| "learning_rate": 2.6587887740029543e-05, | |
| "loss": 0.9658034324645997, | |
| "mean_token_accuracy": 0.7757296651601792, | |
| "num_tokens": 27492884.0, | |
| "step": 6850 | |
| }, | |
| { | |
| "entropy": 0.8720928959548473, | |
| "epoch": 2.625334864140834, | |
| "grad_norm": 0.11632338911294937, | |
| "learning_rate": 2.631932321740298e-05, | |
| "loss": 0.9089359283447266, | |
| "mean_token_accuracy": 0.7913818553090095, | |
| "num_tokens": 27531878.0, | |
| "step": 6860 | |
| }, | |
| { | |
| "entropy": 0.9302754916250706, | |
| "epoch": 2.629161882893226, | |
| "grad_norm": 0.11215951293706894, | |
| "learning_rate": 2.6050758694776423e-05, | |
| "loss": 1.0027677536010742, | |
| "mean_token_accuracy": 0.7739164605736732, | |
| "num_tokens": 27567970.0, | |
| "step": 6870 | |
| }, | |
| { | |
| "entropy": 0.9016003269702196, | |
| "epoch": 2.632988901645618, | |
| "grad_norm": 0.11951353400945663, | |
| "learning_rate": 2.578219417214986e-05, | |
| "loss": 0.9493217468261719, | |
| "mean_token_accuracy": 0.7779877439141274, | |
| "num_tokens": 27609840.0, | |
| "step": 6880 | |
| }, | |
| { | |
| "entropy": 0.8870487026870251, | |
| "epoch": 2.6368159203980097, | |
| "grad_norm": 0.1124744564294815, | |
| "learning_rate": 2.55136296495233e-05, | |
| "loss": 1.0031387329101562, | |
| "mean_token_accuracy": 0.7866110280156136, | |
| "num_tokens": 27649655.0, | |
| "step": 6890 | |
| }, | |
| { | |
| "entropy": 0.9296976864337921, | |
| "epoch": 2.6406429391504016, | |
| "grad_norm": 0.1161704882979393, | |
| "learning_rate": 2.5245065126896738e-05, | |
| "loss": 1.012251853942871, | |
| "mean_token_accuracy": 0.7726465791463852, | |
| "num_tokens": 27694105.0, | |
| "step": 6900 | |
| }, | |
| { | |
| "entropy": 0.8415393102914095, | |
| "epoch": 2.6444699579027935, | |
| "grad_norm": 0.0987096056342125, | |
| "learning_rate": 2.4976500604270178e-05, | |
| "loss": 0.9147520065307617, | |
| "mean_token_accuracy": 0.7973951831459999, | |
| "num_tokens": 27730663.0, | |
| "step": 6910 | |
| }, | |
| { | |
| "entropy": 0.8274203538894653, | |
| "epoch": 2.6482969766551854, | |
| "grad_norm": 0.1101188212633133, | |
| "learning_rate": 2.4707936081643615e-05, | |
| "loss": 0.8873770713806153, | |
| "mean_token_accuracy": 0.7974746853113175, | |
| "num_tokens": 27772881.0, | |
| "step": 6920 | |
| }, | |
| { | |
| "entropy": 0.7984559834003448, | |
| "epoch": 2.6521239954075773, | |
| "grad_norm": 0.10185439884662628, | |
| "learning_rate": 2.4439371559017055e-05, | |
| "loss": 0.8775921821594238, | |
| "mean_token_accuracy": 0.807880648970604, | |
| "num_tokens": 27809534.0, | |
| "step": 6930 | |
| }, | |
| { | |
| "entropy": 0.887981615960598, | |
| "epoch": 2.655951014159969, | |
| "grad_norm": 0.08309295773506165, | |
| "learning_rate": 2.4170807036390495e-05, | |
| "loss": 0.9466443061828613, | |
| "mean_token_accuracy": 0.7859978228807449, | |
| "num_tokens": 27852591.0, | |
| "step": 6940 | |
| }, | |
| { | |
| "entropy": 0.9378888584673405, | |
| "epoch": 2.659778032912361, | |
| "grad_norm": 0.136076882481575, | |
| "learning_rate": 2.3902242513763932e-05, | |
| "loss": 1.0269956588745117, | |
| "mean_token_accuracy": 0.7709244459867477, | |
| "num_tokens": 27892120.0, | |
| "step": 6950 | |
| }, | |
| { | |
| "entropy": 0.9220107842236758, | |
| "epoch": 2.663605051664753, | |
| "grad_norm": 0.08248933404684067, | |
| "learning_rate": 2.363367799113737e-05, | |
| "loss": 0.9726594924926758, | |
| "mean_token_accuracy": 0.7753236919641495, | |
| "num_tokens": 27935380.0, | |
| "step": 6960 | |
| }, | |
| { | |
| "entropy": 0.7793348811566829, | |
| "epoch": 2.667432070417145, | |
| "grad_norm": 0.08308061957359314, | |
| "learning_rate": 2.336511346851081e-05, | |
| "loss": 0.7947993278503418, | |
| "mean_token_accuracy": 0.8088447406888009, | |
| "num_tokens": 27973020.0, | |
| "step": 6970 | |
| }, | |
| { | |
| "entropy": 0.9587450519204139, | |
| "epoch": 2.6712590891695367, | |
| "grad_norm": 0.10263237357139587, | |
| "learning_rate": 2.309654894588425e-05, | |
| "loss": 0.9791707038879395, | |
| "mean_token_accuracy": 0.7663016110658646, | |
| "num_tokens": 28016389.0, | |
| "step": 6980 | |
| }, | |
| { | |
| "entropy": 0.8766636185348033, | |
| "epoch": 2.6750861079219286, | |
| "grad_norm": 0.09917714446783066, | |
| "learning_rate": 2.282798442325769e-05, | |
| "loss": 0.9187355041503906, | |
| "mean_token_accuracy": 0.7864622801542283, | |
| "num_tokens": 28058100.0, | |
| "step": 6990 | |
| }, | |
| { | |
| "entropy": 0.8623256701976061, | |
| "epoch": 2.6789131266743205, | |
| "grad_norm": 0.08802894502878189, | |
| "learning_rate": 2.255941990063113e-05, | |
| "loss": 0.9108509063720703, | |
| "mean_token_accuracy": 0.7891170993447304, | |
| "num_tokens": 28095166.0, | |
| "step": 7000 | |
| }, | |
| { | |
| "entropy": 0.919238954409957, | |
| "epoch": 2.6827401454267124, | |
| "grad_norm": 0.11916540563106537, | |
| "learning_rate": 2.2290855378004567e-05, | |
| "loss": 0.9972674369812011, | |
| "mean_token_accuracy": 0.7765705808997154, | |
| "num_tokens": 28137533.0, | |
| "step": 7010 | |
| }, | |
| { | |
| "entropy": 0.918128065392375, | |
| "epoch": 2.6865671641791042, | |
| "grad_norm": 0.09536208212375641, | |
| "learning_rate": 2.2022290855378004e-05, | |
| "loss": 0.9865476608276367, | |
| "mean_token_accuracy": 0.7736267536878586, | |
| "num_tokens": 28179301.0, | |
| "step": 7020 | |
| }, | |
| { | |
| "entropy": 0.8265572734177112, | |
| "epoch": 2.690394182931496, | |
| "grad_norm": 0.09432680904865265, | |
| "learning_rate": 2.1753726332751444e-05, | |
| "loss": 0.8995939254760742, | |
| "mean_token_accuracy": 0.7947996065020562, | |
| "num_tokens": 28223849.0, | |
| "step": 7030 | |
| }, | |
| { | |
| "entropy": 0.8321899034082889, | |
| "epoch": 2.694221201683888, | |
| "grad_norm": 0.1223755031824112, | |
| "learning_rate": 2.1485161810124885e-05, | |
| "loss": 0.9003139495849609, | |
| "mean_token_accuracy": 0.7975824415683747, | |
| "num_tokens": 28268485.0, | |
| "step": 7040 | |
| }, | |
| { | |
| "entropy": 0.9064472205936909, | |
| "epoch": 2.69804822043628, | |
| "grad_norm": 0.13409113883972168, | |
| "learning_rate": 2.121659728749832e-05, | |
| "loss": 0.9323970794677734, | |
| "mean_token_accuracy": 0.7808707699179649, | |
| "num_tokens": 28307792.0, | |
| "step": 7050 | |
| }, | |
| { | |
| "entropy": 0.9527742668986321, | |
| "epoch": 2.701875239188672, | |
| "grad_norm": 0.09863030910491943, | |
| "learning_rate": 2.0948032764871762e-05, | |
| "loss": 1.0056820869445802, | |
| "mean_token_accuracy": 0.7673134744167328, | |
| "num_tokens": 28355447.0, | |
| "step": 7060 | |
| }, | |
| { | |
| "entropy": 0.8202732041478157, | |
| "epoch": 2.7057022579410637, | |
| "grad_norm": 0.10251973569393158, | |
| "learning_rate": 2.0679468242245202e-05, | |
| "loss": 0.8743599891662598, | |
| "mean_token_accuracy": 0.7957186102867126, | |
| "num_tokens": 28397195.0, | |
| "step": 7070 | |
| }, | |
| { | |
| "entropy": 0.9328485410660505, | |
| "epoch": 2.7095292766934556, | |
| "grad_norm": 0.09044504910707474, | |
| "learning_rate": 2.041090371961864e-05, | |
| "loss": 0.9707870483398438, | |
| "mean_token_accuracy": 0.7739486545324326, | |
| "num_tokens": 28440070.0, | |
| "step": 7080 | |
| }, | |
| { | |
| "entropy": 0.9110265091061592, | |
| "epoch": 2.7133562954458474, | |
| "grad_norm": 0.10417858511209488, | |
| "learning_rate": 2.0142339196992076e-05, | |
| "loss": 0.9495024681091309, | |
| "mean_token_accuracy": 0.7784481555223465, | |
| "num_tokens": 28483039.0, | |
| "step": 7090 | |
| }, | |
| { | |
| "entropy": 0.907703897356987, | |
| "epoch": 2.7171833141982393, | |
| "grad_norm": 0.10365665704011917, | |
| "learning_rate": 1.9873774674365516e-05, | |
| "loss": 0.9539920806884765, | |
| "mean_token_accuracy": 0.7803053423762322, | |
| "num_tokens": 28524922.0, | |
| "step": 7100 | |
| }, | |
| { | |
| "entropy": 0.8090648584067821, | |
| "epoch": 2.721010332950631, | |
| "grad_norm": 0.13015250861644745, | |
| "learning_rate": 1.9605210151738957e-05, | |
| "loss": 0.8559967994689941, | |
| "mean_token_accuracy": 0.7999090999364853, | |
| "num_tokens": 28565638.0, | |
| "step": 7110 | |
| }, | |
| { | |
| "entropy": 0.832624789327383, | |
| "epoch": 2.724837351703023, | |
| "grad_norm": 0.12992241978645325, | |
| "learning_rate": 1.9336645629112397e-05, | |
| "loss": 0.886108112335205, | |
| "mean_token_accuracy": 0.7986625626683235, | |
| "num_tokens": 28603666.0, | |
| "step": 7120 | |
| }, | |
| { | |
| "entropy": 0.8167526118457318, | |
| "epoch": 2.728664370455415, | |
| "grad_norm": 0.0879233330488205, | |
| "learning_rate": 1.9068081106485834e-05, | |
| "loss": 0.8744274139404297, | |
| "mean_token_accuracy": 0.8013173520565033, | |
| "num_tokens": 28647331.0, | |
| "step": 7130 | |
| }, | |
| { | |
| "entropy": 0.8693740144371986, | |
| "epoch": 2.732491389207807, | |
| "grad_norm": 0.11505398899316788, | |
| "learning_rate": 1.879951658385927e-05, | |
| "loss": 0.9142866134643555, | |
| "mean_token_accuracy": 0.7936322972178459, | |
| "num_tokens": 28683073.0, | |
| "step": 7140 | |
| }, | |
| { | |
| "entropy": 0.7896613411605358, | |
| "epoch": 2.7363184079601988, | |
| "grad_norm": 0.10490158945322037, | |
| "learning_rate": 1.853095206123271e-05, | |
| "loss": 0.8762624740600586, | |
| "mean_token_accuracy": 0.8044975116848946, | |
| "num_tokens": 28722340.0, | |
| "step": 7150 | |
| }, | |
| { | |
| "entropy": 0.8261051677167416, | |
| "epoch": 2.7401454267125906, | |
| "grad_norm": 0.10280875116586685, | |
| "learning_rate": 1.826238753860615e-05, | |
| "loss": 0.888590145111084, | |
| "mean_token_accuracy": 0.7989666223526001, | |
| "num_tokens": 28757940.0, | |
| "step": 7160 | |
| }, | |
| { | |
| "entropy": 0.8630577899515629, | |
| "epoch": 2.7439724454649825, | |
| "grad_norm": 0.12757791578769684, | |
| "learning_rate": 1.799382301597959e-05, | |
| "loss": 0.9082697868347168, | |
| "mean_token_accuracy": 0.7890564352273941, | |
| "num_tokens": 28796985.0, | |
| "step": 7170 | |
| }, | |
| { | |
| "entropy": 0.8979216992855072, | |
| "epoch": 2.7477994642173744, | |
| "grad_norm": 0.13048897683620453, | |
| "learning_rate": 1.772525849335303e-05, | |
| "loss": 0.9468406677246094, | |
| "mean_token_accuracy": 0.7829687342047691, | |
| "num_tokens": 28838091.0, | |
| "step": 7180 | |
| }, | |
| { | |
| "entropy": 0.9002114910632372, | |
| "epoch": 2.7516264829697663, | |
| "grad_norm": 0.130500927567482, | |
| "learning_rate": 1.745669397072647e-05, | |
| "loss": 0.9897032737731933, | |
| "mean_token_accuracy": 0.7817048847675323, | |
| "num_tokens": 28879084.0, | |
| "step": 7190 | |
| }, | |
| { | |
| "entropy": 0.861878028512001, | |
| "epoch": 2.755453501722158, | |
| "grad_norm": 0.10523588210344315, | |
| "learning_rate": 1.7188129448099906e-05, | |
| "loss": 0.9628341674804688, | |
| "mean_token_accuracy": 0.7882343173027039, | |
| "num_tokens": 28918018.0, | |
| "step": 7200 | |
| }, | |
| { | |
| "entropy": 0.7814029835164547, | |
| "epoch": 2.75928052047455, | |
| "grad_norm": 0.14345957338809967, | |
| "learning_rate": 1.6919564925473346e-05, | |
| "loss": 0.8377615928649902, | |
| "mean_token_accuracy": 0.8100636526942253, | |
| "num_tokens": 28953674.0, | |
| "step": 7210 | |
| }, | |
| { | |
| "entropy": 0.8798072785139084, | |
| "epoch": 2.763107539226942, | |
| "grad_norm": 0.10911094397306442, | |
| "learning_rate": 1.6651000402846783e-05, | |
| "loss": 0.9405971527099609, | |
| "mean_token_accuracy": 0.7843327835202217, | |
| "num_tokens": 28995212.0, | |
| "step": 7220 | |
| }, | |
| { | |
| "entropy": 0.7432700909674168, | |
| "epoch": 2.766934557979334, | |
| "grad_norm": 0.09271088242530823, | |
| "learning_rate": 1.6382435880220223e-05, | |
| "loss": 0.7987990856170655, | |
| "mean_token_accuracy": 0.8185402989387512, | |
| "num_tokens": 29034878.0, | |
| "step": 7230 | |
| }, | |
| { | |
| "entropy": 0.7937459200620651, | |
| "epoch": 2.7707615767317257, | |
| "grad_norm": 0.11122163385152817, | |
| "learning_rate": 1.6113871357593664e-05, | |
| "loss": 0.8469036102294922, | |
| "mean_token_accuracy": 0.8031805381178856, | |
| "num_tokens": 29074372.0, | |
| "step": 7240 | |
| }, | |
| { | |
| "entropy": 0.8456454008817673, | |
| "epoch": 2.7745885954841176, | |
| "grad_norm": 0.11189702153205872, | |
| "learning_rate": 1.5845306834967104e-05, | |
| "loss": 0.8942484855651855, | |
| "mean_token_accuracy": 0.7923400938510895, | |
| "num_tokens": 29117619.0, | |
| "step": 7250 | |
| }, | |
| { | |
| "entropy": 0.885396859049797, | |
| "epoch": 2.7784156142365095, | |
| "grad_norm": 0.10170719027519226, | |
| "learning_rate": 1.557674231234054e-05, | |
| "loss": 0.9175837516784668, | |
| "mean_token_accuracy": 0.7860854491591454, | |
| "num_tokens": 29156601.0, | |
| "step": 7260 | |
| }, | |
| { | |
| "entropy": 0.8742636401206255, | |
| "epoch": 2.7822426329889014, | |
| "grad_norm": 0.11130956560373306, | |
| "learning_rate": 1.5308177789713978e-05, | |
| "loss": 0.9322646141052247, | |
| "mean_token_accuracy": 0.7902692511677742, | |
| "num_tokens": 29200295.0, | |
| "step": 7270 | |
| }, | |
| { | |
| "entropy": 0.8523757141083479, | |
| "epoch": 2.7860696517412933, | |
| "grad_norm": 0.08611233532428741, | |
| "learning_rate": 1.5039613267087418e-05, | |
| "loss": 0.9210372924804687, | |
| "mean_token_accuracy": 0.7912763133645058, | |
| "num_tokens": 29235323.0, | |
| "step": 7280 | |
| }, | |
| { | |
| "entropy": 0.7804547689855099, | |
| "epoch": 2.789896670493685, | |
| "grad_norm": 0.08091949671506882, | |
| "learning_rate": 1.4771048744460858e-05, | |
| "loss": 0.8202395439147949, | |
| "mean_token_accuracy": 0.8117679923772811, | |
| "num_tokens": 29270182.0, | |
| "step": 7290 | |
| }, | |
| { | |
| "entropy": 0.8199648998677731, | |
| "epoch": 2.793723689246077, | |
| "grad_norm": 0.07486634701490402, | |
| "learning_rate": 1.4502484221834297e-05, | |
| "loss": 0.8396285057067872, | |
| "mean_token_accuracy": 0.8032143607735633, | |
| "num_tokens": 29311588.0, | |
| "step": 7300 | |
| }, | |
| { | |
| "entropy": 0.9650515951216221, | |
| "epoch": 2.797550707998469, | |
| "grad_norm": 0.10391585528850555, | |
| "learning_rate": 1.4233919699207734e-05, | |
| "loss": 1.047046184539795, | |
| "mean_token_accuracy": 0.7648886650800705, | |
| "num_tokens": 29353979.0, | |
| "step": 7310 | |
| }, | |
| { | |
| "entropy": 0.7674700990319252, | |
| "epoch": 2.801377726750861, | |
| "grad_norm": 0.09043332189321518, | |
| "learning_rate": 1.3965355176581174e-05, | |
| "loss": 0.8154891014099122, | |
| "mean_token_accuracy": 0.8105725541710853, | |
| "num_tokens": 29393298.0, | |
| "step": 7320 | |
| }, | |
| { | |
| "entropy": 0.7795201197266579, | |
| "epoch": 2.8052047455032527, | |
| "grad_norm": 0.14624197781085968, | |
| "learning_rate": 1.3696790653954614e-05, | |
| "loss": 0.7968831062316895, | |
| "mean_token_accuracy": 0.808569261431694, | |
| "num_tokens": 29423547.0, | |
| "step": 7330 | |
| }, | |
| { | |
| "entropy": 0.9187626458704472, | |
| "epoch": 2.8090317642556446, | |
| "grad_norm": 0.1368781179189682, | |
| "learning_rate": 1.3428226131328053e-05, | |
| "loss": 0.9583258628845215, | |
| "mean_token_accuracy": 0.7731027945876121, | |
| "num_tokens": 29465593.0, | |
| "step": 7340 | |
| }, | |
| { | |
| "entropy": 0.9403511643409729, | |
| "epoch": 2.8128587830080365, | |
| "grad_norm": 0.10892713069915771, | |
| "learning_rate": 1.315966160870149e-05, | |
| "loss": 0.9621626853942871, | |
| "mean_token_accuracy": 0.767315211892128, | |
| "num_tokens": 29506888.0, | |
| "step": 7350 | |
| }, | |
| { | |
| "entropy": 0.842640140466392, | |
| "epoch": 2.8166858017604284, | |
| "grad_norm": 0.08862321823835373, | |
| "learning_rate": 1.289109708607493e-05, | |
| "loss": 0.9031145095825195, | |
| "mean_token_accuracy": 0.7967306047677993, | |
| "num_tokens": 29550811.0, | |
| "step": 7360 | |
| }, | |
| { | |
| "entropy": 0.8931968793272972, | |
| "epoch": 2.8205128205128203, | |
| "grad_norm": 0.0979296937584877, | |
| "learning_rate": 1.2622532563448369e-05, | |
| "loss": 0.9369117736816406, | |
| "mean_token_accuracy": 0.785995215177536, | |
| "num_tokens": 29587036.0, | |
| "step": 7370 | |
| }, | |
| { | |
| "entropy": 0.8621913805603981, | |
| "epoch": 2.824339839265212, | |
| "grad_norm": 0.08778136223554611, | |
| "learning_rate": 1.2353968040821807e-05, | |
| "loss": 0.884724235534668, | |
| "mean_token_accuracy": 0.790358729660511, | |
| "num_tokens": 29627992.0, | |
| "step": 7380 | |
| }, | |
| { | |
| "entropy": 0.8695362661033869, | |
| "epoch": 2.828166858017604, | |
| "grad_norm": 0.09141552448272705, | |
| "learning_rate": 1.2085403518195248e-05, | |
| "loss": 0.9539263725280762, | |
| "mean_token_accuracy": 0.78631162494421, | |
| "num_tokens": 29668509.0, | |
| "step": 7390 | |
| }, | |
| { | |
| "entropy": 0.8454725466668606, | |
| "epoch": 2.831993876769996, | |
| "grad_norm": 0.10090988874435425, | |
| "learning_rate": 1.1816838995568685e-05, | |
| "loss": 0.9256816864013672, | |
| "mean_token_accuracy": 0.7941092774271965, | |
| "num_tokens": 29706794.0, | |
| "step": 7400 | |
| }, | |
| { | |
| "entropy": 0.8406473740935325, | |
| "epoch": 2.835820895522388, | |
| "grad_norm": 0.12991519272327423, | |
| "learning_rate": 1.1548274472942125e-05, | |
| "loss": 0.8969921112060547, | |
| "mean_token_accuracy": 0.7950825378298759, | |
| "num_tokens": 29745883.0, | |
| "step": 7410 | |
| }, | |
| { | |
| "entropy": 0.8951507560908795, | |
| "epoch": 2.8396479142747797, | |
| "grad_norm": 0.14208164811134338, | |
| "learning_rate": 1.1279709950315565e-05, | |
| "loss": 0.9443653106689454, | |
| "mean_token_accuracy": 0.7820898026227951, | |
| "num_tokens": 29788428.0, | |
| "step": 7420 | |
| }, | |
| { | |
| "entropy": 0.859702505543828, | |
| "epoch": 2.8434749330271716, | |
| "grad_norm": 0.10485101491212845, | |
| "learning_rate": 1.1011145427689002e-05, | |
| "loss": 0.9106481552124024, | |
| "mean_token_accuracy": 0.7910059571266175, | |
| "num_tokens": 29829552.0, | |
| "step": 7430 | |
| }, | |
| { | |
| "entropy": 0.838575328886509, | |
| "epoch": 2.8473019517795635, | |
| "grad_norm": 0.09105801582336426, | |
| "learning_rate": 1.0742580905062442e-05, | |
| "loss": 0.9367799758911133, | |
| "mean_token_accuracy": 0.7953649654984474, | |
| "num_tokens": 29869380.0, | |
| "step": 7440 | |
| }, | |
| { | |
| "entropy": 0.9112126015126705, | |
| "epoch": 2.8511289705319554, | |
| "grad_norm": 0.09724974632263184, | |
| "learning_rate": 1.0474016382435881e-05, | |
| "loss": 0.9621581077575684, | |
| "mean_token_accuracy": 0.7795066565275193, | |
| "num_tokens": 29913977.0, | |
| "step": 7450 | |
| }, | |
| { | |
| "entropy": 0.7964273016899824, | |
| "epoch": 2.8549559892843472, | |
| "grad_norm": 0.09481512755155563, | |
| "learning_rate": 1.020545185980932e-05, | |
| "loss": 0.8208577156066894, | |
| "mean_token_accuracy": 0.8045729547739029, | |
| "num_tokens": 29949229.0, | |
| "step": 7460 | |
| }, | |
| { | |
| "entropy": 0.9103045649826527, | |
| "epoch": 2.858783008036739, | |
| "grad_norm": 0.08678591996431351, | |
| "learning_rate": 9.936887337182758e-06, | |
| "loss": 0.9599167823791503, | |
| "mean_token_accuracy": 0.7792657531797886, | |
| "num_tokens": 29999070.0, | |
| "step": 7470 | |
| }, | |
| { | |
| "entropy": 0.8333844318985939, | |
| "epoch": 2.862610026789131, | |
| "grad_norm": 0.07823742181062698, | |
| "learning_rate": 9.668322814556198e-06, | |
| "loss": 0.8832645416259766, | |
| "mean_token_accuracy": 0.7986885383725166, | |
| "num_tokens": 30041797.0, | |
| "step": 7480 | |
| }, | |
| { | |
| "entropy": 0.8970901295542717, | |
| "epoch": 2.866437045541523, | |
| "grad_norm": 0.11852974444627762, | |
| "learning_rate": 9.399758291929635e-06, | |
| "loss": 0.9755334854125977, | |
| "mean_token_accuracy": 0.7814395651221275, | |
| "num_tokens": 30080534.0, | |
| "step": 7490 | |
| }, | |
| { | |
| "entropy": 0.8733609687536955, | |
| "epoch": 2.870264064293915, | |
| "grad_norm": 0.08307944238185883, | |
| "learning_rate": 9.131193769303076e-06, | |
| "loss": 0.9116435050964355, | |
| "mean_token_accuracy": 0.786429825425148, | |
| "num_tokens": 30123488.0, | |
| "step": 7500 | |
| }, | |
| { | |
| "entropy": 0.7967244807630778, | |
| "epoch": 2.8740910830463067, | |
| "grad_norm": 0.121941938996315, | |
| "learning_rate": 8.862629246676514e-06, | |
| "loss": 0.8209601402282715, | |
| "mean_token_accuracy": 0.8040247783064842, | |
| "num_tokens": 30158076.0, | |
| "step": 7510 | |
| }, | |
| { | |
| "entropy": 0.8655086796730757, | |
| "epoch": 2.8779181017986986, | |
| "grad_norm": 0.10017320513725281, | |
| "learning_rate": 8.594064724049953e-06, | |
| "loss": 0.9246477127075196, | |
| "mean_token_accuracy": 0.7905631095170975, | |
| "num_tokens": 30198329.0, | |
| "step": 7520 | |
| }, | |
| { | |
| "entropy": 0.7916971929371357, | |
| "epoch": 2.8817451205510904, | |
| "grad_norm": 0.08822990953922272, | |
| "learning_rate": 8.325500201423391e-06, | |
| "loss": 0.8695680618286132, | |
| "mean_token_accuracy": 0.8063599601387977, | |
| "num_tokens": 30239990.0, | |
| "step": 7530 | |
| }, | |
| { | |
| "entropy": 0.7693583916872739, | |
| "epoch": 2.8855721393034823, | |
| "grad_norm": 0.1178632378578186, | |
| "learning_rate": 8.056935678796832e-06, | |
| "loss": 0.8029808044433594, | |
| "mean_token_accuracy": 0.808713173866272, | |
| "num_tokens": 30272583.0, | |
| "step": 7540 | |
| }, | |
| { | |
| "entropy": 0.9072235215455293, | |
| "epoch": 2.889399158055874, | |
| "grad_norm": 0.11368006467819214, | |
| "learning_rate": 7.78837115617027e-06, | |
| "loss": 0.9859001159667968, | |
| "mean_token_accuracy": 0.7825366839766502, | |
| "num_tokens": 30314370.0, | |
| "step": 7550 | |
| }, | |
| { | |
| "entropy": 0.909162075817585, | |
| "epoch": 2.893226176808266, | |
| "grad_norm": 0.10643935948610306, | |
| "learning_rate": 7.519806633543709e-06, | |
| "loss": 0.9263824462890625, | |
| "mean_token_accuracy": 0.7813168540596962, | |
| "num_tokens": 30362103.0, | |
| "step": 7560 | |
| }, | |
| { | |
| "entropy": 0.8779693342745304, | |
| "epoch": 2.897053195560658, | |
| "grad_norm": 0.12511365115642548, | |
| "learning_rate": 7.2512421109171484e-06, | |
| "loss": 0.9283166885375976, | |
| "mean_token_accuracy": 0.7876154363155365, | |
| "num_tokens": 30400468.0, | |
| "step": 7570 | |
| }, | |
| { | |
| "entropy": 0.9308112382888794, | |
| "epoch": 2.90088021431305, | |
| "grad_norm": 0.08942066878080368, | |
| "learning_rate": 6.982677588290587e-06, | |
| "loss": 0.9894198417663574, | |
| "mean_token_accuracy": 0.7739586725831031, | |
| "num_tokens": 30444628.0, | |
| "step": 7580 | |
| }, | |
| { | |
| "entropy": 0.8830183774232865, | |
| "epoch": 2.9047072330654418, | |
| "grad_norm": 0.08949998021125793, | |
| "learning_rate": 6.7141130656640265e-06, | |
| "loss": 0.9515928268432617, | |
| "mean_token_accuracy": 0.7846902176737786, | |
| "num_tokens": 30485845.0, | |
| "step": 7590 | |
| }, | |
| { | |
| "entropy": 0.8058773010969162, | |
| "epoch": 2.9085342518178336, | |
| "grad_norm": 0.1035229042172432, | |
| "learning_rate": 6.445548543037465e-06, | |
| "loss": 0.846186637878418, | |
| "mean_token_accuracy": 0.8066700398921967, | |
| "num_tokens": 30523979.0, | |
| "step": 7600 | |
| }, | |
| { | |
| "entropy": 0.9146121144294739, | |
| "epoch": 2.9123612705702255, | |
| "grad_norm": 0.09379884600639343, | |
| "learning_rate": 6.176984020410904e-06, | |
| "loss": 0.9735233306884765, | |
| "mean_token_accuracy": 0.7774886921048164, | |
| "num_tokens": 30564775.0, | |
| "step": 7610 | |
| }, | |
| { | |
| "entropy": 0.8396586284041405, | |
| "epoch": 2.9161882893226174, | |
| "grad_norm": 0.11920839548110962, | |
| "learning_rate": 5.908419497784342e-06, | |
| "loss": 0.9061779022216797, | |
| "mean_token_accuracy": 0.7974281132221221, | |
| "num_tokens": 30609113.0, | |
| "step": 7620 | |
| }, | |
| { | |
| "entropy": 0.8665836162865161, | |
| "epoch": 2.9200153080750093, | |
| "grad_norm": 0.10214731842279434, | |
| "learning_rate": 5.639854975157783e-06, | |
| "loss": 0.9333956718444825, | |
| "mean_token_accuracy": 0.7912585958838463, | |
| "num_tokens": 30652409.0, | |
| "step": 7630 | |
| }, | |
| { | |
| "entropy": 0.8082432024180889, | |
| "epoch": 2.923842326827401, | |
| "grad_norm": 0.09191566705703735, | |
| "learning_rate": 5.371290452531221e-06, | |
| "loss": 0.8443769454956055, | |
| "mean_token_accuracy": 0.797667445242405, | |
| "num_tokens": 30689299.0, | |
| "step": 7640 | |
| }, | |
| { | |
| "entropy": 0.8395522754639387, | |
| "epoch": 2.927669345579793, | |
| "grad_norm": 0.08281564712524414, | |
| "learning_rate": 5.10272592990466e-06, | |
| "loss": 0.8710539817810059, | |
| "mean_token_accuracy": 0.7973509266972542, | |
| "num_tokens": 30724619.0, | |
| "step": 7650 | |
| }, | |
| { | |
| "entropy": 0.8130493897944688, | |
| "epoch": 2.931496364332185, | |
| "grad_norm": 0.0996284931898117, | |
| "learning_rate": 4.834161407278099e-06, | |
| "loss": 0.8514342308044434, | |
| "mean_token_accuracy": 0.800888329744339, | |
| "num_tokens": 30764224.0, | |
| "step": 7660 | |
| }, | |
| { | |
| "entropy": 0.7793916609138251, | |
| "epoch": 2.935323383084577, | |
| "grad_norm": 0.09503267705440521, | |
| "learning_rate": 4.565596884651538e-06, | |
| "loss": 0.8305204391479493, | |
| "mean_token_accuracy": 0.8106261268258095, | |
| "num_tokens": 30800800.0, | |
| "step": 7670 | |
| }, | |
| { | |
| "entropy": 0.817446855083108, | |
| "epoch": 2.9391504018369687, | |
| "grad_norm": 0.13637053966522217, | |
| "learning_rate": 4.2970323620249764e-06, | |
| "loss": 0.839473819732666, | |
| "mean_token_accuracy": 0.8018909886479377, | |
| "num_tokens": 30841481.0, | |
| "step": 7680 | |
| }, | |
| { | |
| "entropy": 0.8140060313045978, | |
| "epoch": 2.9429774205893606, | |
| "grad_norm": 0.13390128314495087, | |
| "learning_rate": 4.028467839398416e-06, | |
| "loss": 0.8653444290161133, | |
| "mean_token_accuracy": 0.8000675857067108, | |
| "num_tokens": 30880001.0, | |
| "step": 7690 | |
| }, | |
| { | |
| "entropy": 0.7898532018065453, | |
| "epoch": 2.9468044393417525, | |
| "grad_norm": 0.11585478484630585, | |
| "learning_rate": 3.7599033167718545e-06, | |
| "loss": 0.8074365615844726, | |
| "mean_token_accuracy": 0.8053972944617271, | |
| "num_tokens": 30915563.0, | |
| "step": 7700 | |
| }, | |
| { | |
| "entropy": 0.8091453645378351, | |
| "epoch": 2.9506314580941444, | |
| "grad_norm": 0.09755035489797592, | |
| "learning_rate": 3.4913387941452935e-06, | |
| "loss": 0.8457134246826172, | |
| "mean_token_accuracy": 0.8031114682555198, | |
| "num_tokens": 30955410.0, | |
| "step": 7710 | |
| }, | |
| { | |
| "entropy": 0.8444364190101623, | |
| "epoch": 2.9544584768465363, | |
| "grad_norm": 0.1297679990530014, | |
| "learning_rate": 3.2227742715187325e-06, | |
| "loss": 0.910922622680664, | |
| "mean_token_accuracy": 0.7976488128304482, | |
| "num_tokens": 30997246.0, | |
| "step": 7720 | |
| }, | |
| { | |
| "entropy": 0.8454434804618358, | |
| "epoch": 2.958285495598928, | |
| "grad_norm": 0.15091662108898163, | |
| "learning_rate": 2.954209748892171e-06, | |
| "loss": 0.8977128982543945, | |
| "mean_token_accuracy": 0.7951600447297096, | |
| "num_tokens": 31042192.0, | |
| "step": 7730 | |
| }, | |
| { | |
| "entropy": 0.838621474429965, | |
| "epoch": 2.96211251435132, | |
| "grad_norm": 0.10101021081209183, | |
| "learning_rate": 2.6856452262656106e-06, | |
| "loss": 0.9142851829528809, | |
| "mean_token_accuracy": 0.7966463148593903, | |
| "num_tokens": 31082777.0, | |
| "step": 7740 | |
| }, | |
| { | |
| "entropy": 0.8021124713122845, | |
| "epoch": 2.965939533103712, | |
| "grad_norm": 0.11373798549175262, | |
| "learning_rate": 2.4170807036390496e-06, | |
| "loss": 0.845030403137207, | |
| "mean_token_accuracy": 0.8039181783795357, | |
| "num_tokens": 31122973.0, | |
| "step": 7750 | |
| }, | |
| { | |
| "entropy": 0.8570070005953312, | |
| "epoch": 2.969766551856104, | |
| "grad_norm": 0.0995812863111496, | |
| "learning_rate": 2.1485161810124882e-06, | |
| "loss": 0.8876262664794922, | |
| "mean_token_accuracy": 0.7908932328224182, | |
| "num_tokens": 31166313.0, | |
| "step": 7760 | |
| }, | |
| { | |
| "entropy": 0.9019658699631691, | |
| "epoch": 2.9735935706084957, | |
| "grad_norm": 0.10546575486660004, | |
| "learning_rate": 1.8799516583859272e-06, | |
| "loss": 0.9777070999145507, | |
| "mean_token_accuracy": 0.7821963891386986, | |
| "num_tokens": 31202060.0, | |
| "step": 7770 | |
| }, | |
| { | |
| "entropy": 0.9346055820584297, | |
| "epoch": 2.9774205893608876, | |
| "grad_norm": 0.11632298678159714, | |
| "learning_rate": 1.6113871357593663e-06, | |
| "loss": 1.017040729522705, | |
| "mean_token_accuracy": 0.7751505836844444, | |
| "num_tokens": 31241536.0, | |
| "step": 7780 | |
| }, | |
| { | |
| "entropy": 0.8882534563541412, | |
| "epoch": 2.9812476081132795, | |
| "grad_norm": 0.13064302504062653, | |
| "learning_rate": 1.3428226131328053e-06, | |
| "loss": 0.9505605697631836, | |
| "mean_token_accuracy": 0.7848831593990326, | |
| "num_tokens": 31278060.0, | |
| "step": 7790 | |
| }, | |
| { | |
| "entropy": 0.8854026839137077, | |
| "epoch": 2.9850746268656714, | |
| "grad_norm": 0.0977831557393074, | |
| "learning_rate": 1.0742580905062441e-06, | |
| "loss": 0.9311306953430176, | |
| "mean_token_accuracy": 0.7847100362181664, | |
| "num_tokens": 31325802.0, | |
| "step": 7800 | |
| }, | |
| { | |
| "entropy": 0.9448695838451385, | |
| "epoch": 2.9889016456180633, | |
| "grad_norm": 0.11724492162466049, | |
| "learning_rate": 8.056935678796831e-07, | |
| "loss": 0.983949089050293, | |
| "mean_token_accuracy": 0.7636413291096688, | |
| "num_tokens": 31367954.0, | |
| "step": 7810 | |
| }, | |
| { | |
| "entropy": 0.8787743166089058, | |
| "epoch": 2.992728664370455, | |
| "grad_norm": 0.09530383348464966, | |
| "learning_rate": 5.371290452531221e-07, | |
| "loss": 0.9605165481567383, | |
| "mean_token_accuracy": 0.7847816556692123, | |
| "num_tokens": 31410151.0, | |
| "step": 7820 | |
| }, | |
| { | |
| "entropy": 0.810061177611351, | |
| "epoch": 2.996555683122847, | |
| "grad_norm": 0.09042539447546005, | |
| "learning_rate": 2.6856452262656103e-07, | |
| "loss": 0.8766719818115234, | |
| "mean_token_accuracy": 0.8047587737441063, | |
| "num_tokens": 31451314.0, | |
| "step": 7830 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 7839, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.17346463002948e+19, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
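
The structure above matches the `trainer_state.json` that the Hugging Face `Trainer` writes alongside checkpoints, with one `log_history` entry per `logging_steps` (10 here). Below is a minimal sketch of how such a log could be loaded and plotted; the file path, figure layout, and output filename are illustrative assumptions, not part of the original state file.

```python
import json

import matplotlib.pyplot as plt

# Illustrative path: adjust to wherever this trainer_state.json was saved.
STATE_PATH = "trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# Each log_history entry is one logging event; keep only entries that carry
# training metrics (evaluation entries, if any, use different keys).
history = [h for h in state["log_history"] if "loss" in h]

steps = [h["step"] for h in history]
loss = [h["loss"] for h in history]
acc = [h["mean_token_accuracy"] for h in history]
lr = [h["learning_rate"] for h in history]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: loss and mean token accuracy over training steps.
ax1.plot(steps, loss, label="loss")
ax1.plot(steps, acc, label="mean_token_accuracy")
ax1.set_xlabel("step")
ax1.legend()

# Right panel: the learning-rate schedule (warmup then linear decay to ~0).
ax2.plot(steps, lr, color="tab:orange")
ax2.set_xlabel("step")
ax2.set_ylabel("learning_rate")

fig.tight_layout()
fig.savefig("training_curves.png")
```

Run against this log, the left panel would show the loss falling from roughly 2.6 at step 10 to below 1.0 by the final epoch, while the right panel traces the warmup to 5e-5 followed by the linear decay visible in the final entries.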