diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8074 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 804, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.136738121509552, + "epoch": 0.003738317757009346, + "grad_norm": 0.42455214262008667, + "learning_rate": 0.0002, + "loss": 2.479450225830078, + "mean_token_accuracy": 0.5328910201787949, + "num_tokens": 16274.0, + "step": 1 + }, + { + "entropy": 1.2458838522434235, + "epoch": 0.007476635514018692, + "grad_norm": 0.37535905838012695, + "learning_rate": 0.0002, + "loss": 2.177677869796753, + "mean_token_accuracy": 0.5608605146408081, + "num_tokens": 32608.0, + "step": 2 + }, + { + "entropy": 1.4101199209690094, + "epoch": 0.011214953271028037, + "grad_norm": 0.2836460769176483, + "learning_rate": 0.0002, + "loss": 1.7183680534362793, + "mean_token_accuracy": 0.5886513292789459, + "num_tokens": 49069.0, + "step": 3 + }, + { + "entropy": 1.3778190314769745, + "epoch": 0.014953271028037384, + "grad_norm": 0.234278604388237, + "learning_rate": 0.0002, + "loss": 1.4014493227005005, + "mean_token_accuracy": 0.6345515549182892, + "num_tokens": 65545.0, + "step": 4 + }, + { + "entropy": 1.3580558598041534, + "epoch": 0.018691588785046728, + "grad_norm": 0.25148847699165344, + "learning_rate": 0.0002, + "loss": 1.2977453470230103, + "mean_token_accuracy": 0.6410679519176483, + "num_tokens": 81930.0, + "step": 5 + }, + { + "entropy": 1.2881037592887878, + "epoch": 0.022429906542056073, + "grad_norm": 0.1429636925458908, + "learning_rate": 0.0002, + "loss": 1.200700283050537, + "mean_token_accuracy": 0.6533316373825073, + "num_tokens": 97974.0, + "step": 6 + }, + { + "entropy": 1.2034580409526825, + "epoch": 0.026168224299065422, + "grad_norm": 0.10399583727121353, + "learning_rate": 0.0002, + "loss": 1.099328637123108, + "mean_token_accuracy": 0.667132779955864, + "num_tokens": 114155.0, + "step": 7 + }, + { + "entropy": 1.1255775392055511, + "epoch": 0.029906542056074768, + "grad_norm": 0.11593594402074814, + "learning_rate": 0.0002, + "loss": 1.0287823677062988, + "mean_token_accuracy": 0.676301121711731, + "num_tokens": 130551.0, + "step": 8 + }, + { + "entropy": 1.035294532775879, + "epoch": 0.03364485981308411, + "grad_norm": 0.1378099024295807, + "learning_rate": 0.0002, + "loss": 0.9908202290534973, + "mean_token_accuracy": 0.6857471317052841, + "num_tokens": 146573.0, + "step": 9 + }, + { + "entropy": 0.9605472087860107, + "epoch": 0.037383177570093455, + "grad_norm": 0.14775174856185913, + "learning_rate": 0.0002, + "loss": 0.9240690469741821, + "mean_token_accuracy": 0.6948150098323822, + "num_tokens": 162866.0, + "step": 10 + }, + { + "entropy": 0.9512569606304169, + "epoch": 0.041121495327102804, + "grad_norm": 0.11748532950878143, + "learning_rate": 0.0002, + "loss": 0.8622909784317017, + "mean_token_accuracy": 0.7091559618711472, + "num_tokens": 179220.0, + "step": 11 + }, + { + "entropy": 0.9225088059902191, + "epoch": 0.044859813084112146, + "grad_norm": 0.12350241094827652, + "learning_rate": 0.0002, + "loss": 0.822359025478363, + "mean_token_accuracy": 0.7142422199249268, + "num_tokens": 195296.0, + "step": 12 + }, + { + "entropy": 0.8315692692995071, + "epoch": 0.048598130841121495, + "grad_norm": 1.498412013053894, + "learning_rate": 0.0002, + "loss": 0.857573390007019, + "mean_token_accuracy": 0.7114518880844116, + "num_tokens": 211304.0, + "step": 13 + }, + { + "entropy": 0.8135845959186554, + "epoch": 0.052336448598130844, + "grad_norm": 0.24739782512187958, + "learning_rate": 0.0002, + "loss": 0.7810372114181519, + "mean_token_accuracy": 0.7160922139883041, + "num_tokens": 227628.0, + "step": 14 + }, + { + "entropy": 0.7939814329147339, + "epoch": 0.056074766355140186, + "grad_norm": 0.09629958122968674, + "learning_rate": 0.0002, + "loss": 0.7519910335540771, + "mean_token_accuracy": 0.7263199836015701, + "num_tokens": 243754.0, + "step": 15 + }, + { + "entropy": 0.7665929198265076, + "epoch": 0.059813084112149535, + "grad_norm": 0.097053661942482, + "learning_rate": 0.0002, + "loss": 0.7232163548469543, + "mean_token_accuracy": 0.7332944422960281, + "num_tokens": 260330.0, + "step": 16 + }, + { + "entropy": 0.725548505783081, + "epoch": 0.06355140186915888, + "grad_norm": 0.07506023347377777, + "learning_rate": 0.0002, + "loss": 0.7072620987892151, + "mean_token_accuracy": 0.7361972779035568, + "num_tokens": 276558.0, + "step": 17 + }, + { + "entropy": 0.6787595748901367, + "epoch": 0.06728971962616823, + "grad_norm": 0.09339383244514465, + "learning_rate": 0.0002, + "loss": 0.6842125654220581, + "mean_token_accuracy": 0.7451010048389435, + "num_tokens": 293015.0, + "step": 18 + }, + { + "entropy": 0.6606214493513107, + "epoch": 0.07102803738317758, + "grad_norm": 0.1005033329129219, + "learning_rate": 0.0002, + "loss": 0.67589271068573, + "mean_token_accuracy": 0.7439846247434616, + "num_tokens": 309125.0, + "step": 19 + }, + { + "entropy": 0.6459693312644958, + "epoch": 0.07476635514018691, + "grad_norm": 0.08065377175807953, + "learning_rate": 0.0002, + "loss": 0.6449049711227417, + "mean_token_accuracy": 0.7484360784292221, + "num_tokens": 325340.0, + "step": 20 + }, + { + "entropy": 0.6585913002490997, + "epoch": 0.07850467289719626, + "grad_norm": 0.0826243907213211, + "learning_rate": 0.0002, + "loss": 0.6479358673095703, + "mean_token_accuracy": 0.7465608268976212, + "num_tokens": 341526.0, + "step": 21 + }, + { + "entropy": 0.6610395163297653, + "epoch": 0.08224299065420561, + "grad_norm": 0.11156535893678665, + "learning_rate": 0.0002, + "loss": 0.6428132057189941, + "mean_token_accuracy": 0.7519424259662628, + "num_tokens": 357723.0, + "step": 22 + }, + { + "entropy": 0.6467725187540054, + "epoch": 0.08598130841121496, + "grad_norm": 0.07777491956949234, + "learning_rate": 0.0002, + "loss": 0.6323320269584656, + "mean_token_accuracy": 0.7518283724784851, + "num_tokens": 374088.0, + "step": 23 + }, + { + "entropy": 0.6338784694671631, + "epoch": 0.08971962616822429, + "grad_norm": 0.067124143242836, + "learning_rate": 0.0002, + "loss": 0.6243886351585388, + "mean_token_accuracy": 0.7597910910844803, + "num_tokens": 390225.0, + "step": 24 + }, + { + "entropy": 0.6436891853809357, + "epoch": 0.09345794392523364, + "grad_norm": 0.07950877398252487, + "learning_rate": 0.0002, + "loss": 0.6342618465423584, + "mean_token_accuracy": 0.7521715611219406, + "num_tokens": 406768.0, + "step": 25 + }, + { + "entropy": 0.6160638332366943, + "epoch": 0.09719626168224299, + "grad_norm": 0.07700544595718384, + "learning_rate": 0.0002, + "loss": 0.6131497621536255, + "mean_token_accuracy": 0.7559348195791245, + "num_tokens": 422877.0, + "step": 26 + }, + { + "entropy": 0.628534808754921, + "epoch": 0.10093457943925234, + "grad_norm": 0.06708905845880508, + "learning_rate": 0.0002, + "loss": 0.6215195059776306, + "mean_token_accuracy": 0.7550520598888397, + "num_tokens": 439151.0, + "step": 27 + }, + { + "entropy": 0.6049717515707016, + "epoch": 0.10467289719626169, + "grad_norm": 0.06930189579725266, + "learning_rate": 0.0002, + "loss": 0.6006554961204529, + "mean_token_accuracy": 0.7655697911977768, + "num_tokens": 455376.0, + "step": 28 + }, + { + "entropy": 0.6179223954677582, + "epoch": 0.10841121495327102, + "grad_norm": 0.07139614224433899, + "learning_rate": 0.0002, + "loss": 0.6130858659744263, + "mean_token_accuracy": 0.757579579949379, + "num_tokens": 471522.0, + "step": 29 + }, + { + "entropy": 0.5762445032596588, + "epoch": 0.11214953271028037, + "grad_norm": 0.06326469033956528, + "learning_rate": 0.0002, + "loss": 0.5740499496459961, + "mean_token_accuracy": 0.7769385576248169, + "num_tokens": 487795.0, + "step": 30 + }, + { + "entropy": 0.5995292961597443, + "epoch": 0.11588785046728972, + "grad_norm": 0.05662618950009346, + "learning_rate": 0.0002, + "loss": 0.5942716002464294, + "mean_token_accuracy": 0.762609601020813, + "num_tokens": 503903.0, + "step": 31 + }, + { + "entropy": 0.564709946513176, + "epoch": 0.11962616822429907, + "grad_norm": 0.05710163712501526, + "learning_rate": 0.0002, + "loss": 0.5685403347015381, + "mean_token_accuracy": 0.7732207477092743, + "num_tokens": 520130.0, + "step": 32 + }, + { + "entropy": 0.5898115634918213, + "epoch": 0.1233644859813084, + "grad_norm": 0.06266085058450699, + "learning_rate": 0.0002, + "loss": 0.5901201963424683, + "mean_token_accuracy": 0.7677827328443527, + "num_tokens": 536535.0, + "step": 33 + }, + { + "entropy": 0.5797764509916306, + "epoch": 0.12710280373831775, + "grad_norm": 0.053195368498563766, + "learning_rate": 0.0002, + "loss": 0.5766412615776062, + "mean_token_accuracy": 0.7718744575977325, + "num_tokens": 552762.0, + "step": 34 + }, + { + "entropy": 0.5923163294792175, + "epoch": 0.1308411214953271, + "grad_norm": 0.058649640530347824, + "learning_rate": 0.0002, + "loss": 0.588531494140625, + "mean_token_accuracy": 0.7659970670938492, + "num_tokens": 569081.0, + "step": 35 + }, + { + "entropy": 0.598005622625351, + "epoch": 0.13457943925233645, + "grad_norm": 0.05565391853451729, + "learning_rate": 0.0002, + "loss": 0.5939058661460876, + "mean_token_accuracy": 0.7645552158355713, + "num_tokens": 585378.0, + "step": 36 + }, + { + "entropy": 0.568940594792366, + "epoch": 0.1383177570093458, + "grad_norm": 0.05046789348125458, + "learning_rate": 0.0002, + "loss": 0.5695131421089172, + "mean_token_accuracy": 0.76941779255867, + "num_tokens": 601818.0, + "step": 37 + }, + { + "entropy": 0.5911677330732346, + "epoch": 0.14205607476635515, + "grad_norm": 0.05749228596687317, + "learning_rate": 0.0002, + "loss": 0.595350444316864, + "mean_token_accuracy": 0.7632527947425842, + "num_tokens": 617971.0, + "step": 38 + }, + { + "entropy": 0.578888863325119, + "epoch": 0.14579439252336449, + "grad_norm": 0.05733935162425041, + "learning_rate": 0.0002, + "loss": 0.5821264982223511, + "mean_token_accuracy": 0.7635951936244965, + "num_tokens": 634366.0, + "step": 39 + }, + { + "entropy": 0.580509215593338, + "epoch": 0.14953271028037382, + "grad_norm": 0.04922669008374214, + "learning_rate": 0.0002, + "loss": 0.5817846059799194, + "mean_token_accuracy": 0.7654395997524261, + "num_tokens": 650944.0, + "step": 40 + }, + { + "entropy": 0.581391915678978, + "epoch": 0.15327102803738318, + "grad_norm": 0.05423928052186966, + "learning_rate": 0.0002, + "loss": 0.5844112634658813, + "mean_token_accuracy": 0.7646835446357727, + "num_tokens": 667338.0, + "step": 41 + }, + { + "entropy": 0.5903676152229309, + "epoch": 0.15700934579439252, + "grad_norm": 0.04594385251402855, + "learning_rate": 0.0002, + "loss": 0.5891512036323547, + "mean_token_accuracy": 0.7634948194026947, + "num_tokens": 683821.0, + "step": 42 + }, + { + "entropy": 0.5767852067947388, + "epoch": 0.16074766355140188, + "grad_norm": 0.04914949834346771, + "learning_rate": 0.0002, + "loss": 0.5805380344390869, + "mean_token_accuracy": 0.7673552930355072, + "num_tokens": 700040.0, + "step": 43 + }, + { + "entropy": 0.5833797305822372, + "epoch": 0.16448598130841122, + "grad_norm": 0.044859349727630615, + "learning_rate": 0.0002, + "loss": 0.5833380818367004, + "mean_token_accuracy": 0.7672955095767975, + "num_tokens": 716503.0, + "step": 44 + }, + { + "entropy": 0.5525845289230347, + "epoch": 0.16822429906542055, + "grad_norm": 0.04274710640311241, + "learning_rate": 0.0002, + "loss": 0.5535459518432617, + "mean_token_accuracy": 0.7772536426782608, + "num_tokens": 732556.0, + "step": 45 + }, + { + "entropy": 0.591767281293869, + "epoch": 0.17196261682242991, + "grad_norm": 0.047692377120256424, + "learning_rate": 0.0002, + "loss": 0.5856772661209106, + "mean_token_accuracy": 0.7700498551130295, + "num_tokens": 749181.0, + "step": 46 + }, + { + "entropy": 0.5679502487182617, + "epoch": 0.17570093457943925, + "grad_norm": 0.04246673360466957, + "learning_rate": 0.0002, + "loss": 0.5660921931266785, + "mean_token_accuracy": 0.7716551572084427, + "num_tokens": 765518.0, + "step": 47 + }, + { + "entropy": 0.5837526619434357, + "epoch": 0.17943925233644858, + "grad_norm": 0.04593057557940483, + "learning_rate": 0.0002, + "loss": 0.584723174571991, + "mean_token_accuracy": 0.7659307718276978, + "num_tokens": 782078.0, + "step": 48 + }, + { + "entropy": 0.5635000020265579, + "epoch": 0.18317757009345795, + "grad_norm": 0.04482231289148331, + "learning_rate": 0.0002, + "loss": 0.5647243857383728, + "mean_token_accuracy": 0.7718922346830368, + "num_tokens": 798351.0, + "step": 49 + }, + { + "entropy": 0.5844053775072098, + "epoch": 0.18691588785046728, + "grad_norm": 0.037112969905138016, + "learning_rate": 0.0002, + "loss": 0.583613395690918, + "mean_token_accuracy": 0.7636143714189529, + "num_tokens": 814901.0, + "step": 50 + }, + { + "entropy": 0.5725089907646179, + "epoch": 0.19065420560747665, + "grad_norm": 0.04044761508703232, + "learning_rate": 0.0002, + "loss": 0.572070300579071, + "mean_token_accuracy": 0.7681350558996201, + "num_tokens": 831260.0, + "step": 51 + }, + { + "entropy": 0.5514933615922928, + "epoch": 0.19439252336448598, + "grad_norm": 0.048450976610183716, + "learning_rate": 0.0002, + "loss": 0.5633155107498169, + "mean_token_accuracy": 0.7741293609142303, + "num_tokens": 847432.0, + "step": 52 + }, + { + "entropy": 0.5666041970252991, + "epoch": 0.19813084112149532, + "grad_norm": 0.037479840219020844, + "learning_rate": 0.0002, + "loss": 0.5648237466812134, + "mean_token_accuracy": 0.7711822837591171, + "num_tokens": 863773.0, + "step": 53 + }, + { + "entropy": 0.5806857347488403, + "epoch": 0.20186915887850468, + "grad_norm": 0.041213128715753555, + "learning_rate": 0.0002, + "loss": 0.5815138816833496, + "mean_token_accuracy": 0.7682100683450699, + "num_tokens": 880158.0, + "step": 54 + }, + { + "entropy": 0.5745600759983063, + "epoch": 0.205607476635514, + "grad_norm": 0.043696995824575424, + "learning_rate": 0.0002, + "loss": 0.568037211894989, + "mean_token_accuracy": 0.7706614434719086, + "num_tokens": 896711.0, + "step": 55 + }, + { + "entropy": 0.5852600038051605, + "epoch": 0.20934579439252338, + "grad_norm": 0.04500531032681465, + "learning_rate": 0.0002, + "loss": 0.5849421620368958, + "mean_token_accuracy": 0.7663431912660599, + "num_tokens": 913375.0, + "step": 56 + }, + { + "entropy": 0.5551716238260269, + "epoch": 0.2130841121495327, + "grad_norm": 0.04126116633415222, + "learning_rate": 0.0002, + "loss": 0.5583643913269043, + "mean_token_accuracy": 0.7752658873796463, + "num_tokens": 929701.0, + "step": 57 + }, + { + "entropy": 0.5559567362070084, + "epoch": 0.21682242990654205, + "grad_norm": 0.03906504437327385, + "learning_rate": 0.0002, + "loss": 0.5586991310119629, + "mean_token_accuracy": 0.7782035171985626, + "num_tokens": 945982.0, + "step": 58 + }, + { + "entropy": 0.5741323977708817, + "epoch": 0.2205607476635514, + "grad_norm": 0.044159967452287674, + "learning_rate": 0.0002, + "loss": 0.5750066041946411, + "mean_token_accuracy": 0.7701967805624008, + "num_tokens": 962111.0, + "step": 59 + }, + { + "entropy": 0.585385873913765, + "epoch": 0.22429906542056074, + "grad_norm": 0.03850168734788895, + "learning_rate": 0.0002, + "loss": 0.5819919109344482, + "mean_token_accuracy": 0.7681078165769577, + "num_tokens": 978456.0, + "step": 60 + }, + { + "entropy": 0.5729155838489532, + "epoch": 0.22803738317757008, + "grad_norm": 0.04180893301963806, + "learning_rate": 0.0002, + "loss": 0.5612373948097229, + "mean_token_accuracy": 0.7728097885847092, + "num_tokens": 994829.0, + "step": 61 + }, + { + "entropy": 0.5742225646972656, + "epoch": 0.23177570093457944, + "grad_norm": 0.0462227538228035, + "learning_rate": 0.0002, + "loss": 0.5630587339401245, + "mean_token_accuracy": 0.7737539410591125, + "num_tokens": 1011036.0, + "step": 62 + }, + { + "entropy": 0.5770351439714432, + "epoch": 0.23551401869158878, + "grad_norm": 0.03771355003118515, + "learning_rate": 0.0002, + "loss": 0.5766980051994324, + "mean_token_accuracy": 0.7672136276960373, + "num_tokens": 1027230.0, + "step": 63 + }, + { + "entropy": 0.5691997706890106, + "epoch": 0.23925233644859814, + "grad_norm": 0.04262315854430199, + "learning_rate": 0.0002, + "loss": 0.574401319026947, + "mean_token_accuracy": 0.768265426158905, + "num_tokens": 1043682.0, + "step": 64 + }, + { + "entropy": 0.5537833720445633, + "epoch": 0.24299065420560748, + "grad_norm": 0.04550046846270561, + "learning_rate": 0.0002, + "loss": 0.562879741191864, + "mean_token_accuracy": 0.7726933360099792, + "num_tokens": 1059653.0, + "step": 65 + }, + { + "entropy": 0.5562547147274017, + "epoch": 0.2467289719626168, + "grad_norm": 0.039495520293712616, + "learning_rate": 0.0002, + "loss": 0.5617212653160095, + "mean_token_accuracy": 0.7734608948230743, + "num_tokens": 1076229.0, + "step": 66 + }, + { + "entropy": 0.555762991309166, + "epoch": 0.2504672897196262, + "grad_norm": 0.04136459901928902, + "learning_rate": 0.0002, + "loss": 0.5561961531639099, + "mean_token_accuracy": 0.7754806876182556, + "num_tokens": 1092742.0, + "step": 67 + }, + { + "entropy": 0.560298278927803, + "epoch": 0.2542056074766355, + "grad_norm": 0.036972060799598694, + "learning_rate": 0.0002, + "loss": 0.5573002696037292, + "mean_token_accuracy": 0.7789498865604401, + "num_tokens": 1109145.0, + "step": 68 + }, + { + "entropy": 0.5759327560663223, + "epoch": 0.25794392523364484, + "grad_norm": 0.03908069431781769, + "learning_rate": 0.0002, + "loss": 0.5715816617012024, + "mean_token_accuracy": 0.7693368047475815, + "num_tokens": 1125662.0, + "step": 69 + }, + { + "entropy": 0.570976510643959, + "epoch": 0.2616822429906542, + "grad_norm": 0.036351628601551056, + "learning_rate": 0.0002, + "loss": 0.5671893358230591, + "mean_token_accuracy": 0.7697141170501709, + "num_tokens": 1141777.0, + "step": 70 + }, + { + "entropy": 0.5589511841535568, + "epoch": 0.26542056074766357, + "grad_norm": 0.042053237557411194, + "learning_rate": 0.0002, + "loss": 0.5577408075332642, + "mean_token_accuracy": 0.774786502122879, + "num_tokens": 1158296.0, + "step": 71 + }, + { + "entropy": 0.5530008673667908, + "epoch": 0.2691588785046729, + "grad_norm": 0.03951295465230942, + "learning_rate": 0.0002, + "loss": 0.557781994342804, + "mean_token_accuracy": 0.7735090106725693, + "num_tokens": 1174440.0, + "step": 72 + }, + { + "entropy": 0.5726254433393478, + "epoch": 0.27289719626168224, + "grad_norm": 0.0380430705845356, + "learning_rate": 0.0002, + "loss": 0.5711798667907715, + "mean_token_accuracy": 0.7688230872154236, + "num_tokens": 1190746.0, + "step": 73 + }, + { + "entropy": 0.5716652125120163, + "epoch": 0.2766355140186916, + "grad_norm": 0.04010101035237312, + "learning_rate": 0.0002, + "loss": 0.5822720527648926, + "mean_token_accuracy": 0.7654355019330978, + "num_tokens": 1207069.0, + "step": 74 + }, + { + "entropy": 0.5660725682973862, + "epoch": 0.2803738317757009, + "grad_norm": 0.03960711508989334, + "learning_rate": 0.0002, + "loss": 0.5682789087295532, + "mean_token_accuracy": 0.7714986652135849, + "num_tokens": 1223466.0, + "step": 75 + }, + { + "entropy": 0.5635482668876648, + "epoch": 0.2841121495327103, + "grad_norm": 0.033172570168972015, + "learning_rate": 0.0002, + "loss": 0.5611684918403625, + "mean_token_accuracy": 0.7727522701025009, + "num_tokens": 1239902.0, + "step": 76 + }, + { + "entropy": 0.5699751675128937, + "epoch": 0.28785046728971964, + "grad_norm": 0.04123734310269356, + "learning_rate": 0.0002, + "loss": 0.5646215081214905, + "mean_token_accuracy": 0.7738035619258881, + "num_tokens": 1256190.0, + "step": 77 + }, + { + "entropy": 0.5798220336437225, + "epoch": 0.29158878504672897, + "grad_norm": 0.03779308497905731, + "learning_rate": 0.0002, + "loss": 0.5753127932548523, + "mean_token_accuracy": 0.769858792424202, + "num_tokens": 1272670.0, + "step": 78 + }, + { + "entropy": 0.5637505650520325, + "epoch": 0.2953271028037383, + "grad_norm": 0.036078162491321564, + "learning_rate": 0.0002, + "loss": 0.5640622973442078, + "mean_token_accuracy": 0.7736664712429047, + "num_tokens": 1288842.0, + "step": 79 + }, + { + "entropy": 0.5597875267267227, + "epoch": 0.29906542056074764, + "grad_norm": 0.032451264560222626, + "learning_rate": 0.0002, + "loss": 0.563526451587677, + "mean_token_accuracy": 0.7725878953933716, + "num_tokens": 1305088.0, + "step": 80 + }, + { + "entropy": 0.5821113288402557, + "epoch": 0.30280373831775703, + "grad_norm": 0.038235142827034, + "learning_rate": 0.0002, + "loss": 0.5853811502456665, + "mean_token_accuracy": 0.7640060931444168, + "num_tokens": 1321504.0, + "step": 81 + }, + { + "entropy": 0.5438373982906342, + "epoch": 0.30654205607476637, + "grad_norm": 0.03746205195784569, + "learning_rate": 0.0002, + "loss": 0.5479013919830322, + "mean_token_accuracy": 0.7800798416137695, + "num_tokens": 1337868.0, + "step": 82 + }, + { + "entropy": 0.5653156489133835, + "epoch": 0.3102803738317757, + "grad_norm": 0.03490284085273743, + "learning_rate": 0.0002, + "loss": 0.5649343132972717, + "mean_token_accuracy": 0.7717723995447159, + "num_tokens": 1354004.0, + "step": 83 + }, + { + "entropy": 0.5832313299179077, + "epoch": 0.31401869158878504, + "grad_norm": 0.03734894096851349, + "learning_rate": 0.0002, + "loss": 0.5807749032974243, + "mean_token_accuracy": 0.7655426859855652, + "num_tokens": 1370558.0, + "step": 84 + }, + { + "entropy": 0.5402187407016754, + "epoch": 0.3177570093457944, + "grad_norm": 0.03854204714298248, + "learning_rate": 0.0002, + "loss": 0.5464577078819275, + "mean_token_accuracy": 0.7792229354381561, + "num_tokens": 1386854.0, + "step": 85 + }, + { + "entropy": 0.5570175051689148, + "epoch": 0.32149532710280376, + "grad_norm": 0.03692445531487465, + "learning_rate": 0.0002, + "loss": 0.5628695487976074, + "mean_token_accuracy": 0.7712178230285645, + "num_tokens": 1403119.0, + "step": 86 + }, + { + "entropy": 0.544977530837059, + "epoch": 0.3252336448598131, + "grad_norm": 0.0364147312939167, + "learning_rate": 0.0002, + "loss": 0.5444334149360657, + "mean_token_accuracy": 0.7836048603057861, + "num_tokens": 1419561.0, + "step": 87 + }, + { + "entropy": 0.5631776601076126, + "epoch": 0.32897196261682243, + "grad_norm": 0.03956674784421921, + "learning_rate": 0.0002, + "loss": 0.5600812435150146, + "mean_token_accuracy": 0.7744863480329514, + "num_tokens": 1435885.0, + "step": 88 + }, + { + "entropy": 0.5635630786418915, + "epoch": 0.33271028037383177, + "grad_norm": 0.033574752509593964, + "learning_rate": 0.0002, + "loss": 0.559590756893158, + "mean_token_accuracy": 0.7763379067182541, + "num_tokens": 1452239.0, + "step": 89 + }, + { + "entropy": 0.5845580548048019, + "epoch": 0.3364485981308411, + "grad_norm": 0.04166650027036667, + "learning_rate": 0.0002, + "loss": 0.5896680355072021, + "mean_token_accuracy": 0.7613022774457932, + "num_tokens": 1469012.0, + "step": 90 + }, + { + "entropy": 0.5831678807735443, + "epoch": 0.3401869158878505, + "grad_norm": 0.03430558741092682, + "learning_rate": 0.0002, + "loss": 0.5816903114318848, + "mean_token_accuracy": 0.7649097442626953, + "num_tokens": 1485787.0, + "step": 91 + }, + { + "entropy": 0.5729665160179138, + "epoch": 0.34392523364485983, + "grad_norm": 0.035805486142635345, + "learning_rate": 0.0002, + "loss": 0.5738450884819031, + "mean_token_accuracy": 0.769588515162468, + "num_tokens": 1502079.0, + "step": 92 + }, + { + "entropy": 0.5653799027204514, + "epoch": 0.34766355140186916, + "grad_norm": 0.03914204239845276, + "learning_rate": 0.0002, + "loss": 0.5674072504043579, + "mean_token_accuracy": 0.7691812217235565, + "num_tokens": 1518311.0, + "step": 93 + }, + { + "entropy": 0.5616943389177322, + "epoch": 0.3514018691588785, + "grad_norm": 0.03653113916516304, + "learning_rate": 0.0002, + "loss": 0.5629323720932007, + "mean_token_accuracy": 0.7708428353071213, + "num_tokens": 1534540.0, + "step": 94 + }, + { + "entropy": 0.5549721717834473, + "epoch": 0.35514018691588783, + "grad_norm": 0.03406484052538872, + "learning_rate": 0.0002, + "loss": 0.5544697046279907, + "mean_token_accuracy": 0.7769979536533356, + "num_tokens": 1550933.0, + "step": 95 + }, + { + "entropy": 0.5604167878627777, + "epoch": 0.35887850467289717, + "grad_norm": 0.03333809971809387, + "learning_rate": 0.0002, + "loss": 0.5594492554664612, + "mean_token_accuracy": 0.7748987078666687, + "num_tokens": 1567207.0, + "step": 96 + }, + { + "entropy": 0.546283945441246, + "epoch": 0.36261682242990656, + "grad_norm": 0.03933919966220856, + "learning_rate": 0.0002, + "loss": 0.5514240264892578, + "mean_token_accuracy": 0.7776303142309189, + "num_tokens": 1583311.0, + "step": 97 + }, + { + "entropy": 0.5612452626228333, + "epoch": 0.3663551401869159, + "grad_norm": 0.04003525897860527, + "learning_rate": 0.0002, + "loss": 0.5653887987136841, + "mean_token_accuracy": 0.7751908302307129, + "num_tokens": 1599670.0, + "step": 98 + }, + { + "entropy": 0.5733049809932709, + "epoch": 0.37009345794392523, + "grad_norm": 0.038050130009651184, + "learning_rate": 0.0002, + "loss": 0.5774834752082825, + "mean_token_accuracy": 0.7684330493211746, + "num_tokens": 1616055.0, + "step": 99 + }, + { + "entropy": 0.5716951340436935, + "epoch": 0.37383177570093457, + "grad_norm": 0.03439132496714592, + "learning_rate": 0.0002, + "loss": 0.5752108097076416, + "mean_token_accuracy": 0.7674903124570847, + "num_tokens": 1632632.0, + "step": 100 + }, + { + "entropy": 0.5594859421253204, + "epoch": 0.3775700934579439, + "grad_norm": 0.03533682972192764, + "learning_rate": 0.0002, + "loss": 0.568997323513031, + "mean_token_accuracy": 0.7706844657659531, + "num_tokens": 1648909.0, + "step": 101 + }, + { + "entropy": 0.5699178278446198, + "epoch": 0.3813084112149533, + "grad_norm": 0.03436512500047684, + "learning_rate": 0.0002, + "loss": 0.5691055059432983, + "mean_token_accuracy": 0.7695006430149078, + "num_tokens": 1665325.0, + "step": 102 + }, + { + "entropy": 0.540950670838356, + "epoch": 0.3850467289719626, + "grad_norm": 0.03944160416722298, + "learning_rate": 0.0002, + "loss": 0.5404485464096069, + "mean_token_accuracy": 0.781610980629921, + "num_tokens": 1681319.0, + "step": 103 + }, + { + "entropy": 0.5614301711320877, + "epoch": 0.38878504672897196, + "grad_norm": 0.03688221052289009, + "learning_rate": 0.0002, + "loss": 0.5563839077949524, + "mean_token_accuracy": 0.7734142690896988, + "num_tokens": 1697654.0, + "step": 104 + }, + { + "entropy": 0.5428010523319244, + "epoch": 0.3925233644859813, + "grad_norm": 0.03438030928373337, + "learning_rate": 0.0002, + "loss": 0.5420697331428528, + "mean_token_accuracy": 0.7806366533041, + "num_tokens": 1713877.0, + "step": 105 + }, + { + "entropy": 0.5636697560548782, + "epoch": 0.39626168224299063, + "grad_norm": 0.039323464035987854, + "learning_rate": 0.0002, + "loss": 0.5671778321266174, + "mean_token_accuracy": 0.7705285251140594, + "num_tokens": 1730348.0, + "step": 106 + }, + { + "entropy": 0.5588130354881287, + "epoch": 0.4, + "grad_norm": 0.038590364158153534, + "learning_rate": 0.0002, + "loss": 0.5618171095848083, + "mean_token_accuracy": 0.772266685962677, + "num_tokens": 1746757.0, + "step": 107 + }, + { + "entropy": 0.561880424618721, + "epoch": 0.40373831775700936, + "grad_norm": 0.03582950308918953, + "learning_rate": 0.0002, + "loss": 0.5647022128105164, + "mean_token_accuracy": 0.771363690495491, + "num_tokens": 1763220.0, + "step": 108 + }, + { + "entropy": 0.5577187687158585, + "epoch": 0.4074766355140187, + "grad_norm": 0.0341983400285244, + "learning_rate": 0.0002, + "loss": 0.5510934591293335, + "mean_token_accuracy": 0.7795996069908142, + "num_tokens": 1779600.0, + "step": 109 + }, + { + "entropy": 0.5766544491052628, + "epoch": 0.411214953271028, + "grad_norm": 0.03449225798249245, + "learning_rate": 0.0002, + "loss": 0.567363977432251, + "mean_token_accuracy": 0.7708986699581146, + "num_tokens": 1795971.0, + "step": 110 + }, + { + "entropy": 0.5705956965684891, + "epoch": 0.41495327102803736, + "grad_norm": 0.033156272023916245, + "learning_rate": 0.0002, + "loss": 0.5640801191329956, + "mean_token_accuracy": 0.7707958519458771, + "num_tokens": 1812546.0, + "step": 111 + }, + { + "entropy": 0.5591244697570801, + "epoch": 0.41869158878504675, + "grad_norm": 0.029204925522208214, + "learning_rate": 0.0002, + "loss": 0.5617936849594116, + "mean_token_accuracy": 0.7732467949390411, + "num_tokens": 1828919.0, + "step": 112 + }, + { + "entropy": 0.553697258234024, + "epoch": 0.4224299065420561, + "grad_norm": 0.03737286105751991, + "learning_rate": 0.0002, + "loss": 0.558631420135498, + "mean_token_accuracy": 0.777028888463974, + "num_tokens": 1845572.0, + "step": 113 + }, + { + "entropy": 0.5312661975622177, + "epoch": 0.4261682242990654, + "grad_norm": 0.03722711279988289, + "learning_rate": 0.0002, + "loss": 0.5394684672355652, + "mean_token_accuracy": 0.7818541079759598, + "num_tokens": 1861653.0, + "step": 114 + }, + { + "entropy": 0.546062633395195, + "epoch": 0.42990654205607476, + "grad_norm": 0.03108621947467327, + "learning_rate": 0.0002, + "loss": 0.5509571433067322, + "mean_token_accuracy": 0.7786260843276978, + "num_tokens": 1877945.0, + "step": 115 + }, + { + "entropy": 0.5667106062173843, + "epoch": 0.4336448598130841, + "grad_norm": 0.029234400019049644, + "learning_rate": 0.0002, + "loss": 0.5700245499610901, + "mean_token_accuracy": 0.7707543075084686, + "num_tokens": 1894419.0, + "step": 116 + }, + { + "entropy": 0.5690203160047531, + "epoch": 0.4373831775700935, + "grad_norm": 0.03232111781835556, + "learning_rate": 0.0002, + "loss": 0.5682043433189392, + "mean_token_accuracy": 0.7720295935869217, + "num_tokens": 1910917.0, + "step": 117 + }, + { + "entropy": 0.5568649768829346, + "epoch": 0.4411214953271028, + "grad_norm": 0.031728681176900864, + "learning_rate": 0.0002, + "loss": 0.5546939969062805, + "mean_token_accuracy": 0.7736632823944092, + "num_tokens": 1927308.0, + "step": 118 + }, + { + "entropy": 0.5677070319652557, + "epoch": 0.44485981308411215, + "grad_norm": 0.038115937262773514, + "learning_rate": 0.0002, + "loss": 0.569874107837677, + "mean_token_accuracy": 0.7713787406682968, + "num_tokens": 1943291.0, + "step": 119 + }, + { + "entropy": 0.5715564787387848, + "epoch": 0.4485981308411215, + "grad_norm": 0.031203260645270348, + "learning_rate": 0.0002, + "loss": 0.5690625309944153, + "mean_token_accuracy": 0.7683161348104477, + "num_tokens": 1959628.0, + "step": 120 + }, + { + "entropy": 0.5510662794113159, + "epoch": 0.4523364485981308, + "grad_norm": 0.034137893468141556, + "learning_rate": 0.0002, + "loss": 0.552444338798523, + "mean_token_accuracy": 0.7768227756023407, + "num_tokens": 1976103.0, + "step": 121 + }, + { + "entropy": 0.5565275996923447, + "epoch": 0.45607476635514016, + "grad_norm": 0.039453648030757904, + "learning_rate": 0.0002, + "loss": 0.5519005656242371, + "mean_token_accuracy": 0.7769535332918167, + "num_tokens": 1992472.0, + "step": 122 + }, + { + "entropy": 0.5421037673950195, + "epoch": 0.45981308411214955, + "grad_norm": 0.03410222753882408, + "learning_rate": 0.0002, + "loss": 0.5471655130386353, + "mean_token_accuracy": 0.7754515707492828, + "num_tokens": 2008555.0, + "step": 123 + }, + { + "entropy": 0.5602420270442963, + "epoch": 0.4635514018691589, + "grad_norm": 0.034221749752759933, + "learning_rate": 0.0002, + "loss": 0.561537504196167, + "mean_token_accuracy": 0.7700601816177368, + "num_tokens": 2025061.0, + "step": 124 + }, + { + "entropy": 0.5571303218603134, + "epoch": 0.4672897196261682, + "grad_norm": 0.036211200058460236, + "learning_rate": 0.0002, + "loss": 0.5621545910835266, + "mean_token_accuracy": 0.7712312340736389, + "num_tokens": 2041431.0, + "step": 125 + }, + { + "entropy": 0.5687634646892548, + "epoch": 0.47102803738317756, + "grad_norm": 0.03822026401758194, + "learning_rate": 0.0002, + "loss": 0.5733909010887146, + "mean_token_accuracy": 0.7690061926841736, + "num_tokens": 2057835.0, + "step": 126 + }, + { + "entropy": 0.561365082859993, + "epoch": 0.4747663551401869, + "grad_norm": 0.03744380176067352, + "learning_rate": 0.0002, + "loss": 0.561904788017273, + "mean_token_accuracy": 0.7733899652957916, + "num_tokens": 2073971.0, + "step": 127 + }, + { + "entropy": 0.5848346501588821, + "epoch": 0.4785046728971963, + "grad_norm": 0.04521637409925461, + "learning_rate": 0.0002, + "loss": 0.5786125063896179, + "mean_token_accuracy": 0.7650021761655807, + "num_tokens": 2090449.0, + "step": 128 + }, + { + "entropy": 0.547502264380455, + "epoch": 0.4822429906542056, + "grad_norm": 0.030477922409772873, + "learning_rate": 0.0002, + "loss": 0.5489711761474609, + "mean_token_accuracy": 0.7773880660533905, + "num_tokens": 2106721.0, + "step": 129 + }, + { + "entropy": 0.5484167486429214, + "epoch": 0.48598130841121495, + "grad_norm": 0.031105147674679756, + "learning_rate": 0.0002, + "loss": 0.5544927716255188, + "mean_token_accuracy": 0.7739315629005432, + "num_tokens": 2123019.0, + "step": 130 + }, + { + "entropy": 0.5533222556114197, + "epoch": 0.4897196261682243, + "grad_norm": 0.0377504825592041, + "learning_rate": 0.0002, + "loss": 0.5580374002456665, + "mean_token_accuracy": 0.7739338725805283, + "num_tokens": 2139430.0, + "step": 131 + }, + { + "entropy": 0.5609411746263504, + "epoch": 0.4934579439252336, + "grad_norm": 0.04023468494415283, + "learning_rate": 0.0002, + "loss": 0.566258430480957, + "mean_token_accuracy": 0.7712711840867996, + "num_tokens": 2155890.0, + "step": 132 + }, + { + "entropy": 0.5359190106391907, + "epoch": 0.497196261682243, + "grad_norm": 0.03230973705649376, + "learning_rate": 0.0002, + "loss": 0.5397756695747375, + "mean_token_accuracy": 0.7778284996747971, + "num_tokens": 2172146.0, + "step": 133 + }, + { + "entropy": 0.5826548039913177, + "epoch": 0.5009345794392523, + "grad_norm": 0.03292697295546532, + "learning_rate": 0.0002, + "loss": 0.5766130685806274, + "mean_token_accuracy": 0.7655939012765884, + "num_tokens": 2188654.0, + "step": 134 + }, + { + "entropy": 0.5718280375003815, + "epoch": 0.5046728971962616, + "grad_norm": 0.03372567892074585, + "learning_rate": 0.0002, + "loss": 0.5630422830581665, + "mean_token_accuracy": 0.7710609585046768, + "num_tokens": 2205019.0, + "step": 135 + }, + { + "entropy": 0.5586309432983398, + "epoch": 0.508411214953271, + "grad_norm": 0.033095572143793106, + "learning_rate": 0.0002, + "loss": 0.5580739974975586, + "mean_token_accuracy": 0.7730830013751984, + "num_tokens": 2221412.0, + "step": 136 + }, + { + "entropy": 0.555738240480423, + "epoch": 0.5121495327102804, + "grad_norm": 0.03156888484954834, + "learning_rate": 0.0002, + "loss": 0.5593985319137573, + "mean_token_accuracy": 0.7678060680627823, + "num_tokens": 2237712.0, + "step": 137 + }, + { + "entropy": 0.5371007025241852, + "epoch": 0.5158878504672897, + "grad_norm": 0.03322545811533928, + "learning_rate": 0.0002, + "loss": 0.5458611845970154, + "mean_token_accuracy": 0.7752078175544739, + "num_tokens": 2253827.0, + "step": 138 + }, + { + "entropy": 0.5338983088731766, + "epoch": 0.5196261682242991, + "grad_norm": 0.03455954045057297, + "learning_rate": 0.0002, + "loss": 0.545717179775238, + "mean_token_accuracy": 0.7804258912801743, + "num_tokens": 2270009.0, + "step": 139 + }, + { + "entropy": 0.5622379928827286, + "epoch": 0.5233644859813084, + "grad_norm": 0.030741458758711815, + "learning_rate": 0.0002, + "loss": 0.5627423524856567, + "mean_token_accuracy": 0.771199107170105, + "num_tokens": 2286480.0, + "step": 140 + }, + { + "entropy": 0.5630074888467789, + "epoch": 0.5271028037383177, + "grad_norm": 0.03147517144680023, + "learning_rate": 0.0002, + "loss": 0.5579048991203308, + "mean_token_accuracy": 0.7733362317085266, + "num_tokens": 2302838.0, + "step": 141 + }, + { + "entropy": 0.5669132769107819, + "epoch": 0.5308411214953271, + "grad_norm": 0.03270615637302399, + "learning_rate": 0.0002, + "loss": 0.5602433085441589, + "mean_token_accuracy": 0.7726781070232391, + "num_tokens": 2318805.0, + "step": 142 + }, + { + "entropy": 0.5551939755678177, + "epoch": 0.5345794392523364, + "grad_norm": 0.03414132818579674, + "learning_rate": 0.0002, + "loss": 0.5483931303024292, + "mean_token_accuracy": 0.7783452570438385, + "num_tokens": 2335157.0, + "step": 143 + }, + { + "entropy": 0.5692463368177414, + "epoch": 0.5383177570093458, + "grad_norm": 0.037064556032419205, + "learning_rate": 0.0002, + "loss": 0.5773426294326782, + "mean_token_accuracy": 0.7641823440790176, + "num_tokens": 2351384.0, + "step": 144 + }, + { + "entropy": 0.5659217685461044, + "epoch": 0.5420560747663551, + "grad_norm": 0.037828508764505386, + "learning_rate": 0.0002, + "loss": 0.5750471353530884, + "mean_token_accuracy": 0.765160858631134, + "num_tokens": 2367801.0, + "step": 145 + }, + { + "entropy": 0.561379611492157, + "epoch": 0.5457943925233645, + "grad_norm": 0.03333268687129021, + "learning_rate": 0.0002, + "loss": 0.5598861575126648, + "mean_token_accuracy": 0.7742918580770493, + "num_tokens": 2384482.0, + "step": 146 + }, + { + "entropy": 0.5676955878734589, + "epoch": 0.5495327102803739, + "grad_norm": 0.0296424962580204, + "learning_rate": 0.0002, + "loss": 0.560563325881958, + "mean_token_accuracy": 0.774119034409523, + "num_tokens": 2400736.0, + "step": 147 + }, + { + "entropy": 0.5658537149429321, + "epoch": 0.5532710280373832, + "grad_norm": 0.0320475734770298, + "learning_rate": 0.0002, + "loss": 0.5580253601074219, + "mean_token_accuracy": 0.7784629613161087, + "num_tokens": 2417222.0, + "step": 148 + }, + { + "entropy": 0.5581035912036896, + "epoch": 0.5570093457943925, + "grad_norm": 0.03125445544719696, + "learning_rate": 0.0002, + "loss": 0.5552960634231567, + "mean_token_accuracy": 0.7758258730173111, + "num_tokens": 2433798.0, + "step": 149 + }, + { + "entropy": 0.5418982058763504, + "epoch": 0.5607476635514018, + "grad_norm": 0.0312193613499403, + "learning_rate": 0.0002, + "loss": 0.5517758727073669, + "mean_token_accuracy": 0.775468572974205, + "num_tokens": 2450186.0, + "step": 150 + }, + { + "entropy": 0.5608511418104172, + "epoch": 0.5644859813084112, + "grad_norm": 0.032521311193704605, + "learning_rate": 0.0002, + "loss": 0.574313759803772, + "mean_token_accuracy": 0.7643774896860123, + "num_tokens": 2466450.0, + "step": 151 + }, + { + "entropy": 0.5575774312019348, + "epoch": 0.5682242990654206, + "grad_norm": 0.02971738576889038, + "learning_rate": 0.0002, + "loss": 0.5584424138069153, + "mean_token_accuracy": 0.7738354802131653, + "num_tokens": 2482922.0, + "step": 152 + }, + { + "entropy": 0.5496693998575211, + "epoch": 0.5719626168224299, + "grad_norm": 0.0331539623439312, + "learning_rate": 0.0002, + "loss": 0.5571150183677673, + "mean_token_accuracy": 0.7751006782054901, + "num_tokens": 2499215.0, + "step": 153 + }, + { + "entropy": 0.5449398458003998, + "epoch": 0.5757009345794393, + "grad_norm": 0.028877289965748787, + "learning_rate": 0.0002, + "loss": 0.5457977652549744, + "mean_token_accuracy": 0.7815146297216415, + "num_tokens": 2515544.0, + "step": 154 + }, + { + "entropy": 0.5479637384414673, + "epoch": 0.5794392523364486, + "grad_norm": 0.03473028913140297, + "learning_rate": 0.0002, + "loss": 0.5433490872383118, + "mean_token_accuracy": 0.7794239521026611, + "num_tokens": 2531678.0, + "step": 155 + }, + { + "entropy": 0.5400601774454117, + "epoch": 0.5831775700934579, + "grad_norm": 0.030446792021393776, + "learning_rate": 0.0002, + "loss": 0.5424179434776306, + "mean_token_accuracy": 0.7802795916795731, + "num_tokens": 2547759.0, + "step": 156 + }, + { + "entropy": 0.5537837445735931, + "epoch": 0.5869158878504673, + "grad_norm": 0.030434170737862587, + "learning_rate": 0.0002, + "loss": 0.5575878024101257, + "mean_token_accuracy": 0.7747874706983566, + "num_tokens": 2564234.0, + "step": 157 + }, + { + "entropy": 0.5445020198822021, + "epoch": 0.5906542056074766, + "grad_norm": 0.031060006469488144, + "learning_rate": 0.0002, + "loss": 0.5391767024993896, + "mean_token_accuracy": 0.7812654078006744, + "num_tokens": 2580415.0, + "step": 158 + }, + { + "entropy": 0.5522216707468033, + "epoch": 0.594392523364486, + "grad_norm": 0.03142917901277542, + "learning_rate": 0.0002, + "loss": 0.5536507964134216, + "mean_token_accuracy": 0.7752956748008728, + "num_tokens": 2596612.0, + "step": 159 + }, + { + "entropy": 0.5727219432592392, + "epoch": 0.5981308411214953, + "grad_norm": 0.033186040818691254, + "learning_rate": 0.0002, + "loss": 0.5693776607513428, + "mean_token_accuracy": 0.7679739594459534, + "num_tokens": 2613027.0, + "step": 160 + }, + { + "entropy": 0.5479543507099152, + "epoch": 0.6018691588785047, + "grad_norm": 0.029086332768201828, + "learning_rate": 0.0002, + "loss": 0.5436608195304871, + "mean_token_accuracy": 0.7767124027013779, + "num_tokens": 2629513.0, + "step": 161 + }, + { + "entropy": 0.5365437120199203, + "epoch": 0.6056074766355141, + "grad_norm": 0.03190625086426735, + "learning_rate": 0.0002, + "loss": 0.539345383644104, + "mean_token_accuracy": 0.7788415402173996, + "num_tokens": 2645517.0, + "step": 162 + }, + { + "entropy": 0.5434722602367401, + "epoch": 0.6093457943925233, + "grad_norm": 0.03164217248558998, + "learning_rate": 0.0002, + "loss": 0.5468231439590454, + "mean_token_accuracy": 0.7775093168020248, + "num_tokens": 2661809.0, + "step": 163 + }, + { + "entropy": 0.5627670586109161, + "epoch": 0.6130841121495327, + "grad_norm": 0.030292298644781113, + "learning_rate": 0.0002, + "loss": 0.5634962916374207, + "mean_token_accuracy": 0.77353835105896, + "num_tokens": 2678296.0, + "step": 164 + }, + { + "entropy": 0.5599590986967087, + "epoch": 0.616822429906542, + "grad_norm": 0.029385516420006752, + "learning_rate": 0.0002, + "loss": 0.5621803402900696, + "mean_token_accuracy": 0.7726689726114273, + "num_tokens": 2694312.0, + "step": 165 + }, + { + "entropy": 0.5674590021371841, + "epoch": 0.6205607476635514, + "grad_norm": 0.03131924942135811, + "learning_rate": 0.0002, + "loss": 0.5683683753013611, + "mean_token_accuracy": 0.7695939242839813, + "num_tokens": 2710615.0, + "step": 166 + }, + { + "entropy": 0.5440984219312668, + "epoch": 0.6242990654205608, + "grad_norm": 0.030983613803982735, + "learning_rate": 0.0002, + "loss": 0.5437349081039429, + "mean_token_accuracy": 0.7795680165290833, + "num_tokens": 2726995.0, + "step": 167 + }, + { + "entropy": 0.5603994876146317, + "epoch": 0.6280373831775701, + "grad_norm": 0.028368143364787102, + "learning_rate": 0.0002, + "loss": 0.5631406903266907, + "mean_token_accuracy": 0.7677181214094162, + "num_tokens": 2743576.0, + "step": 168 + }, + { + "entropy": 0.5566483587026596, + "epoch": 0.6317757009345795, + "grad_norm": 0.03626226633787155, + "learning_rate": 0.0002, + "loss": 0.5647379159927368, + "mean_token_accuracy": 0.7732031494379044, + "num_tokens": 2760174.0, + "step": 169 + }, + { + "entropy": 0.564967080950737, + "epoch": 0.6355140186915887, + "grad_norm": 0.03088950738310814, + "learning_rate": 0.0002, + "loss": 0.5664903521537781, + "mean_token_accuracy": 0.7681966125965118, + "num_tokens": 2776574.0, + "step": 170 + }, + { + "entropy": 0.5679657459259033, + "epoch": 0.6392523364485981, + "grad_norm": 0.03709281235933304, + "learning_rate": 0.0002, + "loss": 0.5604699850082397, + "mean_token_accuracy": 0.7735682725906372, + "num_tokens": 2792828.0, + "step": 171 + }, + { + "entropy": 0.581437885761261, + "epoch": 0.6429906542056075, + "grad_norm": 0.026090197265148163, + "learning_rate": 0.0002, + "loss": 0.5791311264038086, + "mean_token_accuracy": 0.762902557849884, + "num_tokens": 2809131.0, + "step": 172 + }, + { + "entropy": 0.5521832853555679, + "epoch": 0.6467289719626168, + "grad_norm": 0.029906727373600006, + "learning_rate": 0.0002, + "loss": 0.5577026605606079, + "mean_token_accuracy": 0.7733243852853775, + "num_tokens": 2825112.0, + "step": 173 + }, + { + "entropy": 0.555216521024704, + "epoch": 0.6504672897196262, + "grad_norm": 0.033849943429231644, + "learning_rate": 0.0002, + "loss": 0.5657702684402466, + "mean_token_accuracy": 0.7707085460424423, + "num_tokens": 2841339.0, + "step": 174 + }, + { + "entropy": 0.551365852355957, + "epoch": 0.6542056074766355, + "grad_norm": 0.031996551901102066, + "learning_rate": 0.0002, + "loss": 0.5525310039520264, + "mean_token_accuracy": 0.7775240540504456, + "num_tokens": 2857512.0, + "step": 175 + }, + { + "entropy": 0.549150824546814, + "epoch": 0.6579439252336449, + "grad_norm": 0.029290124773979187, + "learning_rate": 0.0002, + "loss": 0.5493086576461792, + "mean_token_accuracy": 0.7772240936756134, + "num_tokens": 2873797.0, + "step": 176 + }, + { + "entropy": 0.5804181694984436, + "epoch": 0.6616822429906543, + "grad_norm": 0.02999035269021988, + "learning_rate": 0.0002, + "loss": 0.5758557319641113, + "mean_token_accuracy": 0.766997441649437, + "num_tokens": 2890362.0, + "step": 177 + }, + { + "entropy": 0.5632698237895966, + "epoch": 0.6654205607476635, + "grad_norm": 0.02878536470234394, + "learning_rate": 0.0002, + "loss": 0.559673011302948, + "mean_token_accuracy": 0.7719457000494003, + "num_tokens": 2906508.0, + "step": 178 + }, + { + "entropy": 0.5700209885835648, + "epoch": 0.6691588785046729, + "grad_norm": 0.028609320521354675, + "learning_rate": 0.0002, + "loss": 0.5671200752258301, + "mean_token_accuracy": 0.7697011083364487, + "num_tokens": 2922954.0, + "step": 179 + }, + { + "entropy": 0.5542242079973221, + "epoch": 0.6728971962616822, + "grad_norm": 0.028964772820472717, + "learning_rate": 0.0002, + "loss": 0.5530304908752441, + "mean_token_accuracy": 0.776089608669281, + "num_tokens": 2939138.0, + "step": 180 + }, + { + "entropy": 0.570984736084938, + "epoch": 0.6766355140186916, + "grad_norm": 0.030573254451155663, + "learning_rate": 0.0002, + "loss": 0.5777382850646973, + "mean_token_accuracy": 0.7673922181129456, + "num_tokens": 2955869.0, + "step": 181 + }, + { + "entropy": 0.5505239069461823, + "epoch": 0.680373831775701, + "grad_norm": 0.02849324606359005, + "learning_rate": 0.0002, + "loss": 0.5563762784004211, + "mean_token_accuracy": 0.7741904556751251, + "num_tokens": 2972440.0, + "step": 182 + }, + { + "entropy": 0.5634328275918961, + "epoch": 0.6841121495327103, + "grad_norm": 0.029679251834750175, + "learning_rate": 0.0002, + "loss": 0.5653568506240845, + "mean_token_accuracy": 0.7715763002634048, + "num_tokens": 2988878.0, + "step": 183 + }, + { + "entropy": 0.5458139479160309, + "epoch": 0.6878504672897197, + "grad_norm": 0.028634849935770035, + "learning_rate": 0.0002, + "loss": 0.5491960048675537, + "mean_token_accuracy": 0.7766706198453903, + "num_tokens": 3004892.0, + "step": 184 + }, + { + "entropy": 0.566186711192131, + "epoch": 0.6915887850467289, + "grad_norm": 0.03155568614602089, + "learning_rate": 0.0002, + "loss": 0.5609148740768433, + "mean_token_accuracy": 0.77189701795578, + "num_tokens": 3021081.0, + "step": 185 + }, + { + "entropy": 0.5414428561925888, + "epoch": 0.6953271028037383, + "grad_norm": 0.027838880196213722, + "learning_rate": 0.0002, + "loss": 0.5334644317626953, + "mean_token_accuracy": 0.7823236584663391, + "num_tokens": 3037358.0, + "step": 186 + }, + { + "entropy": 0.5581627041101456, + "epoch": 0.6990654205607477, + "grad_norm": 0.028703616932034492, + "learning_rate": 0.0002, + "loss": 0.5619620680809021, + "mean_token_accuracy": 0.7744630128145218, + "num_tokens": 3053550.0, + "step": 187 + }, + { + "entropy": 0.5253149345517159, + "epoch": 0.702803738317757, + "grad_norm": 0.036868445575237274, + "learning_rate": 0.0002, + "loss": 0.5375533103942871, + "mean_token_accuracy": 0.7792882919311523, + "num_tokens": 3069739.0, + "step": 188 + }, + { + "entropy": 0.5389144420623779, + "epoch": 0.7065420560747664, + "grad_norm": 0.03131941705942154, + "learning_rate": 0.0002, + "loss": 0.5459954738616943, + "mean_token_accuracy": 0.7803291976451874, + "num_tokens": 3085960.0, + "step": 189 + }, + { + "entropy": 0.5600542724132538, + "epoch": 0.7102803738317757, + "grad_norm": 0.029567614197731018, + "learning_rate": 0.0002, + "loss": 0.5498960614204407, + "mean_token_accuracy": 0.7771395742893219, + "num_tokens": 3102054.0, + "step": 190 + }, + { + "entropy": 0.5614524185657501, + "epoch": 0.7140186915887851, + "grad_norm": 0.02720000222325325, + "learning_rate": 0.0002, + "loss": 0.5563923716545105, + "mean_token_accuracy": 0.7744134813547134, + "num_tokens": 3118447.0, + "step": 191 + }, + { + "entropy": 0.5513639450073242, + "epoch": 0.7177570093457943, + "grad_norm": 0.03140437230467796, + "learning_rate": 0.0002, + "loss": 0.5556260347366333, + "mean_token_accuracy": 0.7725710570812225, + "num_tokens": 3134824.0, + "step": 192 + }, + { + "entropy": 0.5523516684770584, + "epoch": 0.7214953271028037, + "grad_norm": 0.027633987367153168, + "learning_rate": 0.0002, + "loss": 0.5527896285057068, + "mean_token_accuracy": 0.775657445192337, + "num_tokens": 3151098.0, + "step": 193 + }, + { + "entropy": 0.5488898605108261, + "epoch": 0.7252336448598131, + "grad_norm": 0.02825162373483181, + "learning_rate": 0.0002, + "loss": 0.5509235858917236, + "mean_token_accuracy": 0.7764269262552261, + "num_tokens": 3167503.0, + "step": 194 + }, + { + "entropy": 0.5719494670629501, + "epoch": 0.7289719626168224, + "grad_norm": 0.03426945582032204, + "learning_rate": 0.0002, + "loss": 0.581325888633728, + "mean_token_accuracy": 0.7639684081077576, + "num_tokens": 3183932.0, + "step": 195 + }, + { + "entropy": 0.5686557441949844, + "epoch": 0.7327102803738318, + "grad_norm": 0.0296348724514246, + "learning_rate": 0.0002, + "loss": 0.5693853497505188, + "mean_token_accuracy": 0.7690740376710892, + "num_tokens": 3200339.0, + "step": 196 + }, + { + "entropy": 0.545837864279747, + "epoch": 0.7364485981308411, + "grad_norm": 0.03093736432492733, + "learning_rate": 0.0002, + "loss": 0.5460183024406433, + "mean_token_accuracy": 0.7779112309217453, + "num_tokens": 3216460.0, + "step": 197 + }, + { + "entropy": 0.5601817220449448, + "epoch": 0.7401869158878505, + "grad_norm": 0.02910701371729374, + "learning_rate": 0.0002, + "loss": 0.5535531044006348, + "mean_token_accuracy": 0.7790419608354568, + "num_tokens": 3232857.0, + "step": 198 + }, + { + "entropy": 0.5590776205062866, + "epoch": 0.7439252336448599, + "grad_norm": 0.02963181957602501, + "learning_rate": 0.0002, + "loss": 0.5586906671524048, + "mean_token_accuracy": 0.771443635225296, + "num_tokens": 3249103.0, + "step": 199 + }, + { + "entropy": 0.5511395633220673, + "epoch": 0.7476635514018691, + "grad_norm": 0.031875208020210266, + "learning_rate": 0.0002, + "loss": 0.5533637404441833, + "mean_token_accuracy": 0.7755965292453766, + "num_tokens": 3265378.0, + "step": 200 + }, + { + "entropy": 0.5485802739858627, + "epoch": 0.7514018691588785, + "grad_norm": 0.027761714532971382, + "learning_rate": 0.0002, + "loss": 0.5528791546821594, + "mean_token_accuracy": 0.7763257622718811, + "num_tokens": 3281575.0, + "step": 201 + }, + { + "entropy": 0.5456139892339706, + "epoch": 0.7551401869158878, + "grad_norm": 0.030668726190924644, + "learning_rate": 0.0002, + "loss": 0.5525423884391785, + "mean_token_accuracy": 0.7762203961610794, + "num_tokens": 3297643.0, + "step": 202 + }, + { + "entropy": 0.5445601046085358, + "epoch": 0.7588785046728972, + "grad_norm": 0.029748164117336273, + "learning_rate": 0.0002, + "loss": 0.5478560924530029, + "mean_token_accuracy": 0.7770299166440964, + "num_tokens": 3314080.0, + "step": 203 + }, + { + "entropy": 0.5609561204910278, + "epoch": 0.7626168224299066, + "grad_norm": 0.03164554014801979, + "learning_rate": 0.0002, + "loss": 0.5568655729293823, + "mean_token_accuracy": 0.7736087292432785, + "num_tokens": 3330614.0, + "step": 204 + }, + { + "entropy": 0.5710489153862, + "epoch": 0.7663551401869159, + "grad_norm": 0.028819117695093155, + "learning_rate": 0.0002, + "loss": 0.5648672580718994, + "mean_token_accuracy": 0.7728124856948853, + "num_tokens": 3347098.0, + "step": 205 + }, + { + "entropy": 0.5394908934831619, + "epoch": 0.7700934579439253, + "grad_norm": 0.027154654264450073, + "learning_rate": 0.0002, + "loss": 0.5411190986633301, + "mean_token_accuracy": 0.7803514152765274, + "num_tokens": 3363492.0, + "step": 206 + }, + { + "entropy": 0.5685155987739563, + "epoch": 0.7738317757009345, + "grad_norm": 0.029519978910684586, + "learning_rate": 0.0002, + "loss": 0.5696613192558289, + "mean_token_accuracy": 0.7668858170509338, + "num_tokens": 3380119.0, + "step": 207 + }, + { + "entropy": 0.5677367150783539, + "epoch": 0.7775700934579439, + "grad_norm": 0.030474061146378517, + "learning_rate": 0.0002, + "loss": 0.5646864771842957, + "mean_token_accuracy": 0.770299568772316, + "num_tokens": 3396414.0, + "step": 208 + }, + { + "entropy": 0.550197958946228, + "epoch": 0.7813084112149533, + "grad_norm": 0.03337560594081879, + "learning_rate": 0.0002, + "loss": 0.5529907941818237, + "mean_token_accuracy": 0.7742650210857391, + "num_tokens": 3412594.0, + "step": 209 + }, + { + "entropy": 0.5500561147928238, + "epoch": 0.7850467289719626, + "grad_norm": 0.029891593381762505, + "learning_rate": 0.0002, + "loss": 0.5457524657249451, + "mean_token_accuracy": 0.7752271890640259, + "num_tokens": 3428832.0, + "step": 210 + }, + { + "entropy": 0.5407024472951889, + "epoch": 0.788785046728972, + "grad_norm": 0.03455657884478569, + "learning_rate": 0.0002, + "loss": 0.5474290251731873, + "mean_token_accuracy": 0.7768412232398987, + "num_tokens": 3444963.0, + "step": 211 + }, + { + "entropy": 0.5501608401536942, + "epoch": 0.7925233644859813, + "grad_norm": 0.03011375665664673, + "learning_rate": 0.0002, + "loss": 0.5565037131309509, + "mean_token_accuracy": 0.775830090045929, + "num_tokens": 3461474.0, + "step": 212 + }, + { + "entropy": 0.5669280886650085, + "epoch": 0.7962616822429907, + "grad_norm": 0.026809366419911385, + "learning_rate": 0.0002, + "loss": 0.5677840113639832, + "mean_token_accuracy": 0.7668448388576508, + "num_tokens": 3477897.0, + "step": 213 + }, + { + "entropy": 0.5445934683084488, + "epoch": 0.8, + "grad_norm": 0.029105374589562416, + "learning_rate": 0.0002, + "loss": 0.5453011989593506, + "mean_token_accuracy": 0.7774604558944702, + "num_tokens": 3494383.0, + "step": 214 + }, + { + "entropy": 0.5411728769540787, + "epoch": 0.8037383177570093, + "grad_norm": 0.030234847217798233, + "learning_rate": 0.0002, + "loss": 0.5382598042488098, + "mean_token_accuracy": 0.7800557911396027, + "num_tokens": 3510715.0, + "step": 215 + }, + { + "entropy": 0.5646664798259735, + "epoch": 0.8074766355140187, + "grad_norm": 0.026832353323698044, + "learning_rate": 0.0002, + "loss": 0.5600794553756714, + "mean_token_accuracy": 0.7739932984113693, + "num_tokens": 3527178.0, + "step": 216 + }, + { + "entropy": 0.5487450510263443, + "epoch": 0.811214953271028, + "grad_norm": 0.03036404214799404, + "learning_rate": 0.0002, + "loss": 0.5507562160491943, + "mean_token_accuracy": 0.7767736315727234, + "num_tokens": 3543310.0, + "step": 217 + }, + { + "entropy": 0.5374595075845718, + "epoch": 0.8149532710280374, + "grad_norm": 0.02880716696381569, + "learning_rate": 0.0002, + "loss": 0.5401508808135986, + "mean_token_accuracy": 0.7788828462362289, + "num_tokens": 3559560.0, + "step": 218 + }, + { + "entropy": 0.5644234865903854, + "epoch": 0.8186915887850468, + "grad_norm": 0.028643809258937836, + "learning_rate": 0.0002, + "loss": 0.5721843242645264, + "mean_token_accuracy": 0.766772523522377, + "num_tokens": 3575647.0, + "step": 219 + }, + { + "entropy": 0.5402477234601974, + "epoch": 0.822429906542056, + "grad_norm": 0.029598018154501915, + "learning_rate": 0.0002, + "loss": 0.544965922832489, + "mean_token_accuracy": 0.7768107801675797, + "num_tokens": 3591846.0, + "step": 220 + }, + { + "entropy": 0.5643706917762756, + "epoch": 0.8261682242990654, + "grad_norm": 0.026736166328191757, + "learning_rate": 0.0002, + "loss": 0.5665444731712341, + "mean_token_accuracy": 0.7706874907016754, + "num_tokens": 3608162.0, + "step": 221 + }, + { + "entropy": 0.5756243169307709, + "epoch": 0.8299065420560747, + "grad_norm": 0.03206360712647438, + "learning_rate": 0.0002, + "loss": 0.5674804449081421, + "mean_token_accuracy": 0.7733075469732285, + "num_tokens": 3624379.0, + "step": 222 + }, + { + "entropy": 0.5734997987747192, + "epoch": 0.8336448598130841, + "grad_norm": 0.03715579956769943, + "learning_rate": 0.0002, + "loss": 0.5701056718826294, + "mean_token_accuracy": 0.768852710723877, + "num_tokens": 3640765.0, + "step": 223 + }, + { + "entropy": 0.5725346952676773, + "epoch": 0.8373831775700935, + "grad_norm": 0.030887214466929436, + "learning_rate": 0.0002, + "loss": 0.573249340057373, + "mean_token_accuracy": 0.767242893576622, + "num_tokens": 3657061.0, + "step": 224 + }, + { + "entropy": 0.5410794317722321, + "epoch": 0.8411214953271028, + "grad_norm": 0.035770442336797714, + "learning_rate": 0.0002, + "loss": 0.5480395555496216, + "mean_token_accuracy": 0.7784460484981537, + "num_tokens": 3673262.0, + "step": 225 + }, + { + "entropy": 0.5552464425563812, + "epoch": 0.8448598130841122, + "grad_norm": 0.02601957693696022, + "learning_rate": 0.0002, + "loss": 0.5526980757713318, + "mean_token_accuracy": 0.7752740979194641, + "num_tokens": 3689556.0, + "step": 226 + }, + { + "entropy": 0.5603229254484177, + "epoch": 0.8485981308411215, + "grad_norm": 0.02927876077592373, + "learning_rate": 0.0002, + "loss": 0.5548710823059082, + "mean_token_accuracy": 0.7742596417665482, + "num_tokens": 3705909.0, + "step": 227 + }, + { + "entropy": 0.5540427714586258, + "epoch": 0.8523364485981308, + "grad_norm": 0.027498876675963402, + "learning_rate": 0.0002, + "loss": 0.551406741142273, + "mean_token_accuracy": 0.7762021422386169, + "num_tokens": 3722220.0, + "step": 228 + }, + { + "entropy": 0.547235295176506, + "epoch": 0.8560747663551402, + "grad_norm": 0.03035261482000351, + "learning_rate": 0.0002, + "loss": 0.5486667156219482, + "mean_token_accuracy": 0.7744798958301544, + "num_tokens": 3738434.0, + "step": 229 + }, + { + "entropy": 0.5418660938739777, + "epoch": 0.8598130841121495, + "grad_norm": 0.027413956820964813, + "learning_rate": 0.0002, + "loss": 0.5479095578193665, + "mean_token_accuracy": 0.7781544178724289, + "num_tokens": 3755054.0, + "step": 230 + }, + { + "entropy": 0.5522046238183975, + "epoch": 0.8635514018691589, + "grad_norm": 0.030631685629487038, + "learning_rate": 0.0002, + "loss": 0.5644456148147583, + "mean_token_accuracy": 0.7745120227336884, + "num_tokens": 3771586.0, + "step": 231 + }, + { + "entropy": 0.551509827375412, + "epoch": 0.8672897196261682, + "grad_norm": 0.026732027530670166, + "learning_rate": 0.0002, + "loss": 0.5578428506851196, + "mean_token_accuracy": 0.7758689671754837, + "num_tokens": 3787722.0, + "step": 232 + }, + { + "entropy": 0.5497246235609055, + "epoch": 0.8710280373831776, + "grad_norm": 0.026703782379627228, + "learning_rate": 0.0002, + "loss": 0.5488728284835815, + "mean_token_accuracy": 0.7787783890962601, + "num_tokens": 3804140.0, + "step": 233 + }, + { + "entropy": 0.5564501583576202, + "epoch": 0.874766355140187, + "grad_norm": 0.03062787838280201, + "learning_rate": 0.0002, + "loss": 0.5527218580245972, + "mean_token_accuracy": 0.773425742983818, + "num_tokens": 3820337.0, + "step": 234 + }, + { + "entropy": 0.5788596719503403, + "epoch": 0.8785046728971962, + "grad_norm": 0.02753940224647522, + "learning_rate": 0.0002, + "loss": 0.5711097121238708, + "mean_token_accuracy": 0.7668430060148239, + "num_tokens": 3836808.0, + "step": 235 + }, + { + "entropy": 0.5476598590612411, + "epoch": 0.8822429906542056, + "grad_norm": 0.027400346472859383, + "learning_rate": 0.0002, + "loss": 0.5463334918022156, + "mean_token_accuracy": 0.7781798541545868, + "num_tokens": 3853299.0, + "step": 236 + }, + { + "entropy": 0.5600839555263519, + "epoch": 0.8859813084112149, + "grad_norm": 0.028775498270988464, + "learning_rate": 0.0002, + "loss": 0.5644516348838806, + "mean_token_accuracy": 0.7703653573989868, + "num_tokens": 3869775.0, + "step": 237 + }, + { + "entropy": 0.5513971447944641, + "epoch": 0.8897196261682243, + "grad_norm": 0.03031282313168049, + "learning_rate": 0.0002, + "loss": 0.5573993921279907, + "mean_token_accuracy": 0.7723653614521027, + "num_tokens": 3885744.0, + "step": 238 + }, + { + "entropy": 0.5429326295852661, + "epoch": 0.8934579439252337, + "grad_norm": 0.033718034625053406, + "learning_rate": 0.0002, + "loss": 0.5453519821166992, + "mean_token_accuracy": 0.7795106470584869, + "num_tokens": 3902195.0, + "step": 239 + }, + { + "entropy": 0.5441034436225891, + "epoch": 0.897196261682243, + "grad_norm": 0.02875681221485138, + "learning_rate": 0.0002, + "loss": 0.5451952219009399, + "mean_token_accuracy": 0.7779187709093094, + "num_tokens": 3918423.0, + "step": 240 + }, + { + "entropy": 0.5620324909687042, + "epoch": 0.9009345794392524, + "grad_norm": 0.03552895039319992, + "learning_rate": 0.0002, + "loss": 0.5609052181243896, + "mean_token_accuracy": 0.7701629996299744, + "num_tokens": 3934679.0, + "step": 241 + }, + { + "entropy": 0.5460164248943329, + "epoch": 0.9046728971962616, + "grad_norm": 0.029886359348893166, + "learning_rate": 0.0002, + "loss": 0.5491501688957214, + "mean_token_accuracy": 0.7752289175987244, + "num_tokens": 3950961.0, + "step": 242 + }, + { + "entropy": 0.546056404709816, + "epoch": 0.908411214953271, + "grad_norm": 0.030852871015667915, + "learning_rate": 0.0002, + "loss": 0.5487393140792847, + "mean_token_accuracy": 0.7780929207801819, + "num_tokens": 3966886.0, + "step": 243 + }, + { + "entropy": 0.5609732866287231, + "epoch": 0.9121495327102803, + "grad_norm": 0.030140889808535576, + "learning_rate": 0.0002, + "loss": 0.5606030225753784, + "mean_token_accuracy": 0.7716473340988159, + "num_tokens": 3983151.0, + "step": 244 + }, + { + "entropy": 0.5604758709669113, + "epoch": 0.9158878504672897, + "grad_norm": 0.029578525573015213, + "learning_rate": 0.0002, + "loss": 0.5585839748382568, + "mean_token_accuracy": 0.7732879519462585, + "num_tokens": 3999795.0, + "step": 245 + }, + { + "entropy": 0.5532268136739731, + "epoch": 0.9196261682242991, + "grad_norm": 0.027891919016838074, + "learning_rate": 0.0002, + "loss": 0.5495057106018066, + "mean_token_accuracy": 0.7766385078430176, + "num_tokens": 4016175.0, + "step": 246 + }, + { + "entropy": 0.5554448813199997, + "epoch": 0.9233644859813084, + "grad_norm": 0.02818435989320278, + "learning_rate": 0.0002, + "loss": 0.5564701557159424, + "mean_token_accuracy": 0.7740474194288254, + "num_tokens": 4032347.0, + "step": 247 + }, + { + "entropy": 0.5707967579364777, + "epoch": 0.9271028037383178, + "grad_norm": 0.03303643688559532, + "learning_rate": 0.0002, + "loss": 0.5724941492080688, + "mean_token_accuracy": 0.7658564150333405, + "num_tokens": 4048630.0, + "step": 248 + }, + { + "entropy": 0.5627187788486481, + "epoch": 0.930841121495327, + "grad_norm": 0.028818685561418533, + "learning_rate": 0.0002, + "loss": 0.558162271976471, + "mean_token_accuracy": 0.7722532004117966, + "num_tokens": 4065016.0, + "step": 249 + }, + { + "entropy": 0.559718668460846, + "epoch": 0.9345794392523364, + "grad_norm": 0.028685420751571655, + "learning_rate": 0.0002, + "loss": 0.5579565763473511, + "mean_token_accuracy": 0.7721400260925293, + "num_tokens": 4081500.0, + "step": 250 + }, + { + "entropy": 0.5709707289934158, + "epoch": 0.9383177570093458, + "grad_norm": 0.02683587744832039, + "learning_rate": 0.0002, + "loss": 0.5758322477340698, + "mean_token_accuracy": 0.767963632941246, + "num_tokens": 4097612.0, + "step": 251 + }, + { + "entropy": 0.5344300121068954, + "epoch": 0.9420560747663551, + "grad_norm": 0.03321561962366104, + "learning_rate": 0.0002, + "loss": 0.5469601154327393, + "mean_token_accuracy": 0.7776601910591125, + "num_tokens": 4113985.0, + "step": 252 + }, + { + "entropy": 0.5629133135080338, + "epoch": 0.9457943925233645, + "grad_norm": 0.029224324971437454, + "learning_rate": 0.0002, + "loss": 0.5662878751754761, + "mean_token_accuracy": 0.7696039378643036, + "num_tokens": 4130584.0, + "step": 253 + }, + { + "entropy": 0.5419919043779373, + "epoch": 0.9495327102803738, + "grad_norm": 0.02663436345756054, + "learning_rate": 0.0002, + "loss": 0.5382669568061829, + "mean_token_accuracy": 0.7805986851453781, + "num_tokens": 4146982.0, + "step": 254 + }, + { + "entropy": 0.5534703135490417, + "epoch": 0.9532710280373832, + "grad_norm": 0.031037772074341774, + "learning_rate": 0.0002, + "loss": 0.5461369752883911, + "mean_token_accuracy": 0.7764857709407806, + "num_tokens": 4163123.0, + "step": 255 + }, + { + "entropy": 0.5555615872144699, + "epoch": 0.9570093457943926, + "grad_norm": 0.030655430629849434, + "learning_rate": 0.0002, + "loss": 0.547662615776062, + "mean_token_accuracy": 0.7764341235160828, + "num_tokens": 4179406.0, + "step": 256 + }, + { + "entropy": 0.5540354400873184, + "epoch": 0.9607476635514018, + "grad_norm": 0.026620658114552498, + "learning_rate": 0.0002, + "loss": 0.5554948449134827, + "mean_token_accuracy": 0.7734782248735428, + "num_tokens": 4195892.0, + "step": 257 + }, + { + "entropy": 0.5525697767734528, + "epoch": 0.9644859813084112, + "grad_norm": 0.034786809235811234, + "learning_rate": 0.0002, + "loss": 0.5611132383346558, + "mean_token_accuracy": 0.7743758261203766, + "num_tokens": 4212488.0, + "step": 258 + }, + { + "entropy": 0.5380009859800339, + "epoch": 0.9682242990654205, + "grad_norm": 0.030664261430501938, + "learning_rate": 0.0002, + "loss": 0.5456718802452087, + "mean_token_accuracy": 0.7777111083269119, + "num_tokens": 4228671.0, + "step": 259 + }, + { + "entropy": 0.5611264109611511, + "epoch": 0.9719626168224299, + "grad_norm": 0.03532060608267784, + "learning_rate": 0.0002, + "loss": 0.5694712400436401, + "mean_token_accuracy": 0.7701623737812042, + "num_tokens": 4244945.0, + "step": 260 + }, + { + "entropy": 0.5638745427131653, + "epoch": 0.9757009345794393, + "grad_norm": 0.030665291473269463, + "learning_rate": 0.0002, + "loss": 0.5605924129486084, + "mean_token_accuracy": 0.773237332701683, + "num_tokens": 4261322.0, + "step": 261 + }, + { + "entropy": 0.5428778678178787, + "epoch": 0.9794392523364486, + "grad_norm": 0.02648600935935974, + "learning_rate": 0.0002, + "loss": 0.5397819876670837, + "mean_token_accuracy": 0.7809075862169266, + "num_tokens": 4277549.0, + "step": 262 + }, + { + "entropy": 0.5510172247886658, + "epoch": 0.983177570093458, + "grad_norm": 0.03357943519949913, + "learning_rate": 0.0002, + "loss": 0.5478790998458862, + "mean_token_accuracy": 0.776640459895134, + "num_tokens": 4293936.0, + "step": 263 + }, + { + "entropy": 0.574288085103035, + "epoch": 0.9869158878504672, + "grad_norm": 0.034169699996709824, + "learning_rate": 0.0002, + "loss": 0.5746400952339172, + "mean_token_accuracy": 0.7657656222581863, + "num_tokens": 4310409.0, + "step": 264 + }, + { + "entropy": 0.5594803243875504, + "epoch": 0.9906542056074766, + "grad_norm": 0.02862885594367981, + "learning_rate": 0.0002, + "loss": 0.5636025071144104, + "mean_token_accuracy": 0.7713442891836166, + "num_tokens": 4326712.0, + "step": 265 + }, + { + "entropy": 0.539836198091507, + "epoch": 0.994392523364486, + "grad_norm": 0.029507551342248917, + "learning_rate": 0.0002, + "loss": 0.5382612943649292, + "mean_token_accuracy": 0.7807378023862839, + "num_tokens": 4343219.0, + "step": 266 + }, + { + "entropy": 0.5551609694957733, + "epoch": 0.9981308411214953, + "grad_norm": 0.03200124204158783, + "learning_rate": 0.0002, + "loss": 0.5617719888687134, + "mean_token_accuracy": 0.773328885436058, + "num_tokens": 4359527.0, + "step": 267 + }, + { + "entropy": 0.5970158576965332, + "epoch": 1.0, + "grad_norm": 0.037066467106342316, + "learning_rate": 0.0002, + "loss": 0.5663654208183289, + "mean_token_accuracy": 0.7600691318511963, + "num_tokens": 4365381.0, + "step": 268 + }, + { + "entropy": 0.5525570958852768, + "epoch": 1.0037383177570094, + "grad_norm": 0.03900527581572533, + "learning_rate": 0.0002, + "loss": 0.5462143421173096, + "mean_token_accuracy": 0.778084933757782, + "num_tokens": 4381629.0, + "step": 269 + }, + { + "entropy": 0.54682257771492, + "epoch": 1.0074766355140188, + "grad_norm": 0.035536471754312515, + "learning_rate": 0.0002, + "loss": 0.545343816280365, + "mean_token_accuracy": 0.7801588326692581, + "num_tokens": 4398070.0, + "step": 270 + }, + { + "entropy": 0.5365859419107437, + "epoch": 1.011214953271028, + "grad_norm": 0.034878168255090714, + "learning_rate": 0.0002, + "loss": 0.5464543104171753, + "mean_token_accuracy": 0.7776919007301331, + "num_tokens": 4414453.0, + "step": 271 + }, + { + "entropy": 0.5360404700040817, + "epoch": 1.0149532710280373, + "grad_norm": 0.048785679042339325, + "learning_rate": 0.0002, + "loss": 0.5453898906707764, + "mean_token_accuracy": 0.7778858244419098, + "num_tokens": 4430869.0, + "step": 272 + }, + { + "entropy": 0.5594298243522644, + "epoch": 1.0186915887850467, + "grad_norm": 0.03072672337293625, + "learning_rate": 0.0002, + "loss": 0.5592078566551208, + "mean_token_accuracy": 0.7696069329977036, + "num_tokens": 4447216.0, + "step": 273 + }, + { + "entropy": 0.5412543565034866, + "epoch": 1.0224299065420561, + "grad_norm": 0.02756349742412567, + "learning_rate": 0.0002, + "loss": 0.531460702419281, + "mean_token_accuracy": 0.7837116718292236, + "num_tokens": 4463360.0, + "step": 274 + }, + { + "entropy": 0.5926365852355957, + "epoch": 1.0261682242990655, + "grad_norm": 0.03326818346977234, + "learning_rate": 0.0002, + "loss": 0.5784746408462524, + "mean_token_accuracy": 0.766831248998642, + "num_tokens": 4479796.0, + "step": 275 + }, + { + "entropy": 0.5474714189767838, + "epoch": 1.0299065420560747, + "grad_norm": 0.03355902433395386, + "learning_rate": 0.0002, + "loss": 0.5428712368011475, + "mean_token_accuracy": 0.7803516983985901, + "num_tokens": 4495955.0, + "step": 276 + }, + { + "entropy": 0.5455677509307861, + "epoch": 1.033644859813084, + "grad_norm": 0.030776510015130043, + "learning_rate": 0.0002, + "loss": 0.5518208742141724, + "mean_token_accuracy": 0.7767827361822128, + "num_tokens": 4512316.0, + "step": 277 + }, + { + "entropy": 0.526500478386879, + "epoch": 1.0373831775700935, + "grad_norm": 0.033801671117544174, + "learning_rate": 0.0002, + "loss": 0.5366923809051514, + "mean_token_accuracy": 0.7806129455566406, + "num_tokens": 4528811.0, + "step": 278 + }, + { + "entropy": 0.541542574763298, + "epoch": 1.0411214953271029, + "grad_norm": 0.02768930047750473, + "learning_rate": 0.0002, + "loss": 0.5472083687782288, + "mean_token_accuracy": 0.7802270948886871, + "num_tokens": 4545459.0, + "step": 279 + }, + { + "entropy": 0.5428692698478699, + "epoch": 1.0448598130841122, + "grad_norm": 0.026535486802458763, + "learning_rate": 0.0002, + "loss": 0.5472521185874939, + "mean_token_accuracy": 0.7790202796459198, + "num_tokens": 4561743.0, + "step": 280 + }, + { + "entropy": 0.5543521493673325, + "epoch": 1.0485981308411214, + "grad_norm": 0.02853293903172016, + "learning_rate": 0.0002, + "loss": 0.5497039556503296, + "mean_token_accuracy": 0.7779083847999573, + "num_tokens": 4577971.0, + "step": 281 + }, + { + "entropy": 0.5532562881708145, + "epoch": 1.0523364485981308, + "grad_norm": 0.032453302294015884, + "learning_rate": 0.0002, + "loss": 0.5507426261901855, + "mean_token_accuracy": 0.7767385840415955, + "num_tokens": 4594320.0, + "step": 282 + }, + { + "entropy": 0.5443340390920639, + "epoch": 1.0560747663551402, + "grad_norm": 0.02682569809257984, + "learning_rate": 0.0002, + "loss": 0.5412614345550537, + "mean_token_accuracy": 0.77817003428936, + "num_tokens": 4610559.0, + "step": 283 + }, + { + "entropy": 0.5440877228975296, + "epoch": 1.0598130841121496, + "grad_norm": 0.0319623202085495, + "learning_rate": 0.0002, + "loss": 0.5511168241500854, + "mean_token_accuracy": 0.7755201905965805, + "num_tokens": 4627054.0, + "step": 284 + }, + { + "entropy": 0.5398871749639511, + "epoch": 1.063551401869159, + "grad_norm": 0.03265109285712242, + "learning_rate": 0.0002, + "loss": 0.5518860220909119, + "mean_token_accuracy": 0.7781514823436737, + "num_tokens": 4643292.0, + "step": 285 + }, + { + "entropy": 0.5381411910057068, + "epoch": 1.0672897196261681, + "grad_norm": 0.03156265616416931, + "learning_rate": 0.0002, + "loss": 0.5409814119338989, + "mean_token_accuracy": 0.778824657201767, + "num_tokens": 4659607.0, + "step": 286 + }, + { + "entropy": 0.5548661798238754, + "epoch": 1.0710280373831775, + "grad_norm": 0.02918156050145626, + "learning_rate": 0.0002, + "loss": 0.5548707246780396, + "mean_token_accuracy": 0.7780612111091614, + "num_tokens": 4676232.0, + "step": 287 + }, + { + "entropy": 0.5623743981122971, + "epoch": 1.074766355140187, + "grad_norm": 0.03226201981306076, + "learning_rate": 0.0002, + "loss": 0.554677426815033, + "mean_token_accuracy": 0.7718137502670288, + "num_tokens": 4692488.0, + "step": 288 + }, + { + "entropy": 0.5557957291603088, + "epoch": 1.0785046728971963, + "grad_norm": 0.03029857762157917, + "learning_rate": 0.0002, + "loss": 0.5494679808616638, + "mean_token_accuracy": 0.7741122990846634, + "num_tokens": 4709044.0, + "step": 289 + }, + { + "entropy": 0.5503045320510864, + "epoch": 1.0822429906542057, + "grad_norm": 0.0307212695479393, + "learning_rate": 0.0002, + "loss": 0.5489369630813599, + "mean_token_accuracy": 0.7767912298440933, + "num_tokens": 4725364.0, + "step": 290 + }, + { + "entropy": 0.539084866642952, + "epoch": 1.0859813084112149, + "grad_norm": 0.03190942481160164, + "learning_rate": 0.0002, + "loss": 0.5426544547080994, + "mean_token_accuracy": 0.7790433466434479, + "num_tokens": 4741941.0, + "step": 291 + }, + { + "entropy": 0.5557795614004135, + "epoch": 1.0897196261682243, + "grad_norm": 0.029419884085655212, + "learning_rate": 0.0002, + "loss": 0.5554101467132568, + "mean_token_accuracy": 0.7740319818258286, + "num_tokens": 4758205.0, + "step": 292 + }, + { + "entropy": 0.5505965203046799, + "epoch": 1.0934579439252337, + "grad_norm": 0.03429512679576874, + "learning_rate": 0.0002, + "loss": 0.5480504035949707, + "mean_token_accuracy": 0.7799728959798813, + "num_tokens": 4774531.0, + "step": 293 + }, + { + "entropy": 0.5202708393335342, + "epoch": 1.097196261682243, + "grad_norm": 0.03249349445104599, + "learning_rate": 0.0002, + "loss": 0.5263637900352478, + "mean_token_accuracy": 0.7847210466861725, + "num_tokens": 4790852.0, + "step": 294 + }, + { + "entropy": 0.5314037203788757, + "epoch": 1.1009345794392524, + "grad_norm": 0.03021075204014778, + "learning_rate": 0.0002, + "loss": 0.5385116934776306, + "mean_token_accuracy": 0.7801961302757263, + "num_tokens": 4807025.0, + "step": 295 + }, + { + "entropy": 0.5293320417404175, + "epoch": 1.1046728971962616, + "grad_norm": 0.028836827725172043, + "learning_rate": 0.0002, + "loss": 0.5360400676727295, + "mean_token_accuracy": 0.7814829498529434, + "num_tokens": 4823214.0, + "step": 296 + }, + { + "entropy": 0.5449055731296539, + "epoch": 1.108411214953271, + "grad_norm": 0.03287169337272644, + "learning_rate": 0.0002, + "loss": 0.5433195233345032, + "mean_token_accuracy": 0.7783774435520172, + "num_tokens": 4839406.0, + "step": 297 + }, + { + "entropy": 0.5480804145336151, + "epoch": 1.1121495327102804, + "grad_norm": 0.02733924239873886, + "learning_rate": 0.0002, + "loss": 0.5476067066192627, + "mean_token_accuracy": 0.7756602764129639, + "num_tokens": 4855715.0, + "step": 298 + }, + { + "entropy": 0.5223022475838661, + "epoch": 1.1158878504672898, + "grad_norm": 0.030849482864141464, + "learning_rate": 0.0002, + "loss": 0.5228248834609985, + "mean_token_accuracy": 0.7864928692579269, + "num_tokens": 4871905.0, + "step": 299 + }, + { + "entropy": 0.5403697192668915, + "epoch": 1.1196261682242992, + "grad_norm": 0.04067312180995941, + "learning_rate": 0.0002, + "loss": 0.5488058924674988, + "mean_token_accuracy": 0.7756592333316803, + "num_tokens": 4888255.0, + "step": 300 + }, + { + "entropy": 0.5412193536758423, + "epoch": 1.1233644859813083, + "grad_norm": 0.0385926179587841, + "learning_rate": 0.0002, + "loss": 0.5371171832084656, + "mean_token_accuracy": 0.7830539792776108, + "num_tokens": 4904401.0, + "step": 301 + }, + { + "entropy": 0.5478123128414154, + "epoch": 1.1271028037383177, + "grad_norm": 0.034694232046604156, + "learning_rate": 0.0002, + "loss": 0.5448811054229736, + "mean_token_accuracy": 0.7775193750858307, + "num_tokens": 4920569.0, + "step": 302 + }, + { + "entropy": 0.5659691095352173, + "epoch": 1.1308411214953271, + "grad_norm": 0.0340200699865818, + "learning_rate": 0.0002, + "loss": 0.5552433729171753, + "mean_token_accuracy": 0.7739417254924774, + "num_tokens": 4936990.0, + "step": 303 + }, + { + "entropy": 0.5497810244560242, + "epoch": 1.1345794392523365, + "grad_norm": 0.03331321105360985, + "learning_rate": 0.0002, + "loss": 0.5419542193412781, + "mean_token_accuracy": 0.7797157317399979, + "num_tokens": 4953234.0, + "step": 304 + }, + { + "entropy": 0.5460351407527924, + "epoch": 1.1383177570093457, + "grad_norm": 0.028563540428876877, + "learning_rate": 0.0002, + "loss": 0.5521276593208313, + "mean_token_accuracy": 0.7758445143699646, + "num_tokens": 4969315.0, + "step": 305 + }, + { + "entropy": 0.5569003522396088, + "epoch": 1.142056074766355, + "grad_norm": 0.037444427609443665, + "learning_rate": 0.0002, + "loss": 0.5681867003440857, + "mean_token_accuracy": 0.768034890294075, + "num_tokens": 4985515.0, + "step": 306 + }, + { + "entropy": 0.5470724403858185, + "epoch": 1.1457943925233645, + "grad_norm": 0.03636348247528076, + "learning_rate": 0.0002, + "loss": 0.557773768901825, + "mean_token_accuracy": 0.7707736641168594, + "num_tokens": 5001866.0, + "step": 307 + }, + { + "entropy": 0.5416572839021683, + "epoch": 1.1495327102803738, + "grad_norm": 0.03566127270460129, + "learning_rate": 0.0002, + "loss": 0.5432877540588379, + "mean_token_accuracy": 0.7794886082410812, + "num_tokens": 5018262.0, + "step": 308 + }, + { + "entropy": 0.5430650115013123, + "epoch": 1.1532710280373832, + "grad_norm": 0.03256046026945114, + "learning_rate": 0.0002, + "loss": 0.5351021885871887, + "mean_token_accuracy": 0.7833839505910873, + "num_tokens": 5034628.0, + "step": 309 + }, + { + "entropy": 0.5461114794015884, + "epoch": 1.1570093457943926, + "grad_norm": 0.029036300256848335, + "learning_rate": 0.0002, + "loss": 0.5413210391998291, + "mean_token_accuracy": 0.7808763384819031, + "num_tokens": 5050585.0, + "step": 310 + }, + { + "entropy": 0.554639920592308, + "epoch": 1.1607476635514018, + "grad_norm": 0.03361903131008148, + "learning_rate": 0.0002, + "loss": 0.5511503219604492, + "mean_token_accuracy": 0.7750130593776703, + "num_tokens": 5067128.0, + "step": 311 + }, + { + "entropy": 0.548187181353569, + "epoch": 1.1644859813084112, + "grad_norm": 0.031710732728242874, + "learning_rate": 0.0002, + "loss": 0.5527835488319397, + "mean_token_accuracy": 0.7745499759912491, + "num_tokens": 5083711.0, + "step": 312 + }, + { + "entropy": 0.5482472777366638, + "epoch": 1.1682242990654206, + "grad_norm": 0.02857045829296112, + "learning_rate": 0.0002, + "loss": 0.5483166575431824, + "mean_token_accuracy": 0.7769645005464554, + "num_tokens": 5100016.0, + "step": 313 + }, + { + "entropy": 0.5292567014694214, + "epoch": 1.17196261682243, + "grad_norm": 0.03421966731548309, + "learning_rate": 0.0002, + "loss": 0.5375955104827881, + "mean_token_accuracy": 0.7807260155677795, + "num_tokens": 5116491.0, + "step": 314 + }, + { + "entropy": 0.5369534343481064, + "epoch": 1.1757009345794391, + "grad_norm": 0.03103984147310257, + "learning_rate": 0.0002, + "loss": 0.5436944365501404, + "mean_token_accuracy": 0.7778706103563309, + "num_tokens": 5132827.0, + "step": 315 + }, + { + "entropy": 0.5408389419317245, + "epoch": 1.1794392523364485, + "grad_norm": 0.030202720314264297, + "learning_rate": 0.0002, + "loss": 0.5370794534683228, + "mean_token_accuracy": 0.7810905873775482, + "num_tokens": 5149320.0, + "step": 316 + }, + { + "entropy": 0.5496339946985245, + "epoch": 1.183177570093458, + "grad_norm": 0.051472123712301254, + "learning_rate": 0.0002, + "loss": 0.5469900965690613, + "mean_token_accuracy": 0.7799471020698547, + "num_tokens": 5165783.0, + "step": 317 + }, + { + "entropy": 0.5685789883136749, + "epoch": 1.1869158878504673, + "grad_norm": 0.02587290294468403, + "learning_rate": 0.0002, + "loss": 0.5606150031089783, + "mean_token_accuracy": 0.7720179110765457, + "num_tokens": 5182220.0, + "step": 318 + }, + { + "entropy": 0.562404528260231, + "epoch": 1.1906542056074767, + "grad_norm": 0.02697838842868805, + "learning_rate": 0.0002, + "loss": 0.5576499700546265, + "mean_token_accuracy": 0.773972287774086, + "num_tokens": 5198785.0, + "step": 319 + }, + { + "entropy": 0.5579852014780045, + "epoch": 1.194392523364486, + "grad_norm": 0.03318994492292404, + "learning_rate": 0.0002, + "loss": 0.5587575435638428, + "mean_token_accuracy": 0.7752004414796829, + "num_tokens": 5215306.0, + "step": 320 + }, + { + "entropy": 0.5371066629886627, + "epoch": 1.1981308411214953, + "grad_norm": 0.03372027724981308, + "learning_rate": 0.0002, + "loss": 0.5434669852256775, + "mean_token_accuracy": 0.7805782556533813, + "num_tokens": 5231543.0, + "step": 321 + }, + { + "entropy": 0.5399051606655121, + "epoch": 1.2018691588785047, + "grad_norm": 0.02985556237399578, + "learning_rate": 0.0002, + "loss": 0.535866379737854, + "mean_token_accuracy": 0.7829108238220215, + "num_tokens": 5248118.0, + "step": 322 + }, + { + "entropy": 0.5591253787279129, + "epoch": 1.205607476635514, + "grad_norm": 0.036681629717350006, + "learning_rate": 0.0002, + "loss": 0.5629308223724365, + "mean_token_accuracy": 0.7753039449453354, + "num_tokens": 5264354.0, + "step": 323 + }, + { + "entropy": 0.530854269862175, + "epoch": 1.2093457943925234, + "grad_norm": 0.03271258994936943, + "learning_rate": 0.0002, + "loss": 0.5393017530441284, + "mean_token_accuracy": 0.779604122042656, + "num_tokens": 5280420.0, + "step": 324 + }, + { + "entropy": 0.5350240170955658, + "epoch": 1.2130841121495326, + "grad_norm": 0.03653952106833458, + "learning_rate": 0.0002, + "loss": 0.5409165620803833, + "mean_token_accuracy": 0.7822408974170685, + "num_tokens": 5296606.0, + "step": 325 + }, + { + "entropy": 0.5648403018712997, + "epoch": 1.216822429906542, + "grad_norm": 0.03994280472397804, + "learning_rate": 0.0002, + "loss": 0.5743881464004517, + "mean_token_accuracy": 0.7659939229488373, + "num_tokens": 5313007.0, + "step": 326 + }, + { + "entropy": 0.5432536900043488, + "epoch": 1.2205607476635514, + "grad_norm": 0.02927422896027565, + "learning_rate": 0.0002, + "loss": 0.5374885201454163, + "mean_token_accuracy": 0.7827716916799545, + "num_tokens": 5329527.0, + "step": 327 + }, + { + "entropy": 0.5514259338378906, + "epoch": 1.2242990654205608, + "grad_norm": 0.0344291552901268, + "learning_rate": 0.0002, + "loss": 0.5449932217597961, + "mean_token_accuracy": 0.7758236229419708, + "num_tokens": 5345773.0, + "step": 328 + }, + { + "entropy": 0.5402681678533554, + "epoch": 1.2280373831775702, + "grad_norm": 0.030332623049616814, + "learning_rate": 0.0002, + "loss": 0.5353883504867554, + "mean_token_accuracy": 0.7829276025295258, + "num_tokens": 5362127.0, + "step": 329 + }, + { + "entropy": 0.5448068082332611, + "epoch": 1.2317757009345796, + "grad_norm": 0.03531905263662338, + "learning_rate": 0.0002, + "loss": 0.5491797924041748, + "mean_token_accuracy": 0.7790112793445587, + "num_tokens": 5378581.0, + "step": 330 + }, + { + "entropy": 0.5472903549671173, + "epoch": 1.2355140186915887, + "grad_norm": 0.031015828251838684, + "learning_rate": 0.0002, + "loss": 0.5477415323257446, + "mean_token_accuracy": 0.7792595028877258, + "num_tokens": 5395212.0, + "step": 331 + }, + { + "entropy": 0.5454255491495132, + "epoch": 1.2392523364485981, + "grad_norm": 0.029113125056028366, + "learning_rate": 0.0002, + "loss": 0.5511353611946106, + "mean_token_accuracy": 0.7783665657043457, + "num_tokens": 5411639.0, + "step": 332 + }, + { + "entropy": 0.540988489985466, + "epoch": 1.2429906542056075, + "grad_norm": 0.03799515590071678, + "learning_rate": 0.0002, + "loss": 0.5437558889389038, + "mean_token_accuracy": 0.7799660414457321, + "num_tokens": 5428149.0, + "step": 333 + }, + { + "entropy": 0.5526563823223114, + "epoch": 1.246728971962617, + "grad_norm": 0.031356047838926315, + "learning_rate": 0.0002, + "loss": 0.5517259836196899, + "mean_token_accuracy": 0.7754609137773514, + "num_tokens": 5444823.0, + "step": 334 + }, + { + "entropy": 0.5555935055017471, + "epoch": 1.250467289719626, + "grad_norm": 0.029837489128112793, + "learning_rate": 0.0002, + "loss": 0.5508607029914856, + "mean_token_accuracy": 0.7758114188909531, + "num_tokens": 5461092.0, + "step": 335 + }, + { + "entropy": 0.5559129118919373, + "epoch": 1.2542056074766355, + "grad_norm": 0.033403050154447556, + "learning_rate": 0.0002, + "loss": 0.5595272183418274, + "mean_token_accuracy": 0.7721016854047775, + "num_tokens": 5477562.0, + "step": 336 + }, + { + "entropy": 0.5547732263803482, + "epoch": 1.2579439252336448, + "grad_norm": 0.030496301129460335, + "learning_rate": 0.0002, + "loss": 0.5562547445297241, + "mean_token_accuracy": 0.7715321332216263, + "num_tokens": 5493726.0, + "step": 337 + }, + { + "entropy": 0.5420037060976028, + "epoch": 1.2616822429906542, + "grad_norm": 0.029967380687594414, + "learning_rate": 0.0002, + "loss": 0.5383363962173462, + "mean_token_accuracy": 0.7796787321567535, + "num_tokens": 5510133.0, + "step": 338 + }, + { + "entropy": 0.5514421463012695, + "epoch": 1.2654205607476636, + "grad_norm": 0.028999267145991325, + "learning_rate": 0.0002, + "loss": 0.5496451258659363, + "mean_token_accuracy": 0.775850385427475, + "num_tokens": 5526472.0, + "step": 339 + }, + { + "entropy": 0.5502642691135406, + "epoch": 1.269158878504673, + "grad_norm": 0.031769949942827225, + "learning_rate": 0.0002, + "loss": 0.5571942925453186, + "mean_token_accuracy": 0.7752085626125336, + "num_tokens": 5542630.0, + "step": 340 + }, + { + "entropy": 0.5570239722728729, + "epoch": 1.2728971962616822, + "grad_norm": 0.027039222419261932, + "learning_rate": 0.0002, + "loss": 0.5579078197479248, + "mean_token_accuracy": 0.771331325173378, + "num_tokens": 5559029.0, + "step": 341 + }, + { + "entropy": 0.5515656173229218, + "epoch": 1.2766355140186916, + "grad_norm": 0.029703624546527863, + "learning_rate": 0.0002, + "loss": 0.5461840033531189, + "mean_token_accuracy": 0.7807486355304718, + "num_tokens": 5575356.0, + "step": 342 + }, + { + "entropy": 0.5342657566070557, + "epoch": 1.280373831775701, + "grad_norm": 0.03180944547057152, + "learning_rate": 0.0002, + "loss": 0.5324912071228027, + "mean_token_accuracy": 0.7840546667575836, + "num_tokens": 5591449.0, + "step": 343 + }, + { + "entropy": 0.5342217683792114, + "epoch": 1.2841121495327104, + "grad_norm": 0.029021048918366432, + "learning_rate": 0.0002, + "loss": 0.5414597392082214, + "mean_token_accuracy": 0.780342310667038, + "num_tokens": 5607833.0, + "step": 344 + }, + { + "entropy": 0.55648373067379, + "epoch": 1.2878504672897195, + "grad_norm": 0.046803366392850876, + "learning_rate": 0.0002, + "loss": 0.5738747119903564, + "mean_token_accuracy": 0.7661998569965363, + "num_tokens": 5624322.0, + "step": 345 + }, + { + "entropy": 0.5502487570047379, + "epoch": 1.291588785046729, + "grad_norm": 0.03315203636884689, + "learning_rate": 0.0002, + "loss": 0.5500799417495728, + "mean_token_accuracy": 0.7736621350049973, + "num_tokens": 5640389.0, + "step": 346 + }, + { + "entropy": 0.5614677965641022, + "epoch": 1.2953271028037383, + "grad_norm": 0.03268338739871979, + "learning_rate": 0.0002, + "loss": 0.5521507263183594, + "mean_token_accuracy": 0.7767487168312073, + "num_tokens": 5657037.0, + "step": 347 + }, + { + "entropy": 0.5565991103649139, + "epoch": 1.2990654205607477, + "grad_norm": 0.03224696218967438, + "learning_rate": 0.0002, + "loss": 0.5461902022361755, + "mean_token_accuracy": 0.7800005227327347, + "num_tokens": 5673177.0, + "step": 348 + }, + { + "entropy": 0.5575346350669861, + "epoch": 1.302803738317757, + "grad_norm": 0.03355446085333824, + "learning_rate": 0.0002, + "loss": 0.5535120368003845, + "mean_token_accuracy": 0.7763462960720062, + "num_tokens": 5689487.0, + "step": 349 + }, + { + "entropy": 0.5268712043762207, + "epoch": 1.3065420560747665, + "grad_norm": 0.03238300606608391, + "learning_rate": 0.0002, + "loss": 0.5330650806427002, + "mean_token_accuracy": 0.7824077904224396, + "num_tokens": 5705730.0, + "step": 350 + }, + { + "entropy": 0.5374382436275482, + "epoch": 1.3102803738317756, + "grad_norm": 0.03740353882312775, + "learning_rate": 0.0002, + "loss": 0.547337532043457, + "mean_token_accuracy": 0.7766657322645187, + "num_tokens": 5722106.0, + "step": 351 + }, + { + "entropy": 0.5559621900320053, + "epoch": 1.314018691588785, + "grad_norm": 0.030790183693170547, + "learning_rate": 0.0002, + "loss": 0.5596389770507812, + "mean_token_accuracy": 0.7734881490468979, + "num_tokens": 5738765.0, + "step": 352 + }, + { + "entropy": 0.5731736719608307, + "epoch": 1.3177570093457944, + "grad_norm": 0.029810264706611633, + "learning_rate": 0.0002, + "loss": 0.5712341070175171, + "mean_token_accuracy": 0.7706846296787262, + "num_tokens": 5755390.0, + "step": 353 + }, + { + "entropy": 0.5471203178167343, + "epoch": 1.3214953271028038, + "grad_norm": 0.031591080129146576, + "learning_rate": 0.0002, + "loss": 0.5398964881896973, + "mean_token_accuracy": 0.7803937494754791, + "num_tokens": 5771544.0, + "step": 354 + }, + { + "entropy": 0.54648657143116, + "epoch": 1.325233644859813, + "grad_norm": 0.02947516180574894, + "learning_rate": 0.0002, + "loss": 0.5480178594589233, + "mean_token_accuracy": 0.7766480296850204, + "num_tokens": 5787883.0, + "step": 355 + }, + { + "entropy": 0.547360748052597, + "epoch": 1.3289719626168224, + "grad_norm": 0.03545290604233742, + "learning_rate": 0.0002, + "loss": 0.5540775656700134, + "mean_token_accuracy": 0.7739741057157516, + "num_tokens": 5804479.0, + "step": 356 + }, + { + "entropy": 0.5666938722133636, + "epoch": 1.3327102803738318, + "grad_norm": 0.030594807118177414, + "learning_rate": 0.0002, + "loss": 0.5684746503829956, + "mean_token_accuracy": 0.7727518826723099, + "num_tokens": 5820828.0, + "step": 357 + }, + { + "entropy": 0.5427974164485931, + "epoch": 1.3364485981308412, + "grad_norm": 0.028014780953526497, + "learning_rate": 0.0002, + "loss": 0.5475256443023682, + "mean_token_accuracy": 0.7783348560333252, + "num_tokens": 5837279.0, + "step": 358 + }, + { + "entropy": 0.5367537513375282, + "epoch": 1.3401869158878505, + "grad_norm": 0.02776091918349266, + "learning_rate": 0.0002, + "loss": 0.5400804877281189, + "mean_token_accuracy": 0.7791878134012222, + "num_tokens": 5853587.0, + "step": 359 + }, + { + "entropy": 0.5417804718017578, + "epoch": 1.34392523364486, + "grad_norm": 0.030707573518157005, + "learning_rate": 0.0002, + "loss": 0.5384262204170227, + "mean_token_accuracy": 0.7809765785932541, + "num_tokens": 5869870.0, + "step": 360 + }, + { + "entropy": 0.5336113572120667, + "epoch": 1.347663551401869, + "grad_norm": 0.02800143137574196, + "learning_rate": 0.0002, + "loss": 0.5343897342681885, + "mean_token_accuracy": 0.7851298898458481, + "num_tokens": 5886161.0, + "step": 361 + }, + { + "entropy": 0.5595630407333374, + "epoch": 1.3514018691588785, + "grad_norm": 0.033487141132354736, + "learning_rate": 0.0002, + "loss": 0.5617246031761169, + "mean_token_accuracy": 0.7718270570039749, + "num_tokens": 5902403.0, + "step": 362 + }, + { + "entropy": 0.5564123690128326, + "epoch": 1.355140186915888, + "grad_norm": 0.030184373259544373, + "learning_rate": 0.0002, + "loss": 0.5559881925582886, + "mean_token_accuracy": 0.7759525775909424, + "num_tokens": 5918682.0, + "step": 363 + }, + { + "entropy": 0.5504587888717651, + "epoch": 1.358878504672897, + "grad_norm": 0.030958138406276703, + "learning_rate": 0.0002, + "loss": 0.5547590255737305, + "mean_token_accuracy": 0.7714252471923828, + "num_tokens": 5935040.0, + "step": 364 + }, + { + "entropy": 0.5442556142807007, + "epoch": 1.3626168224299064, + "grad_norm": 0.02721753716468811, + "learning_rate": 0.0002, + "loss": 0.5465304851531982, + "mean_token_accuracy": 0.7787513434886932, + "num_tokens": 5951319.0, + "step": 365 + }, + { + "entropy": 0.5471279621124268, + "epoch": 1.3663551401869158, + "grad_norm": 0.02792942337691784, + "learning_rate": 0.0002, + "loss": 0.5474483966827393, + "mean_token_accuracy": 0.7781872451305389, + "num_tokens": 5967748.0, + "step": 366 + }, + { + "entropy": 0.5490291863679886, + "epoch": 1.3700934579439252, + "grad_norm": 0.028123004361987114, + "learning_rate": 0.0002, + "loss": 0.543886125087738, + "mean_token_accuracy": 0.7788564115762711, + "num_tokens": 5984234.0, + "step": 367 + }, + { + "entropy": 0.5531343668699265, + "epoch": 1.3738317757009346, + "grad_norm": 0.028855035081505775, + "learning_rate": 0.0002, + "loss": 0.5493640303611755, + "mean_token_accuracy": 0.7753023356199265, + "num_tokens": 6000373.0, + "step": 368 + }, + { + "entropy": 0.5437296479940414, + "epoch": 1.377570093457944, + "grad_norm": 0.03395684063434601, + "learning_rate": 0.0002, + "loss": 0.5439614057540894, + "mean_token_accuracy": 0.7777044326066971, + "num_tokens": 6016881.0, + "step": 369 + }, + { + "entropy": 0.5404936373233795, + "epoch": 1.3813084112149534, + "grad_norm": 0.032147135585546494, + "learning_rate": 0.0002, + "loss": 0.545538604259491, + "mean_token_accuracy": 0.7780790776014328, + "num_tokens": 6033034.0, + "step": 370 + }, + { + "entropy": 0.5291552245616913, + "epoch": 1.3850467289719626, + "grad_norm": 0.033786509186029434, + "learning_rate": 0.0002, + "loss": 0.5318939685821533, + "mean_token_accuracy": 0.7833190411329269, + "num_tokens": 6049080.0, + "step": 371 + }, + { + "entropy": 0.567609116435051, + "epoch": 1.388785046728972, + "grad_norm": 0.028979122638702393, + "learning_rate": 0.0002, + "loss": 0.5664098858833313, + "mean_token_accuracy": 0.7676924616098404, + "num_tokens": 6065561.0, + "step": 372 + }, + { + "entropy": 0.5492920726537704, + "epoch": 1.3925233644859814, + "grad_norm": 0.03035721927881241, + "learning_rate": 0.0002, + "loss": 0.5452470779418945, + "mean_token_accuracy": 0.7780449539422989, + "num_tokens": 6081826.0, + "step": 373 + }, + { + "entropy": 0.5530673414468765, + "epoch": 1.3962616822429905, + "grad_norm": 0.028820699080824852, + "learning_rate": 0.0002, + "loss": 0.54978346824646, + "mean_token_accuracy": 0.7757824063301086, + "num_tokens": 6098304.0, + "step": 374 + }, + { + "entropy": 0.5440555065870285, + "epoch": 1.4, + "grad_norm": 0.032084014266729355, + "learning_rate": 0.0002, + "loss": 0.5414038896560669, + "mean_token_accuracy": 0.7783534377813339, + "num_tokens": 6114548.0, + "step": 375 + }, + { + "entropy": 0.5330861955881119, + "epoch": 1.4037383177570093, + "grad_norm": 0.03771211951971054, + "learning_rate": 0.0002, + "loss": 0.5359234809875488, + "mean_token_accuracy": 0.7809047400951385, + "num_tokens": 6130775.0, + "step": 376 + }, + { + "entropy": 0.5356210172176361, + "epoch": 1.4074766355140187, + "grad_norm": 0.031670473515987396, + "learning_rate": 0.0002, + "loss": 0.539705753326416, + "mean_token_accuracy": 0.7816208004951477, + "num_tokens": 6147119.0, + "step": 377 + }, + { + "entropy": 0.5264314264059067, + "epoch": 1.411214953271028, + "grad_norm": 0.03143749758601189, + "learning_rate": 0.0002, + "loss": 0.5309686660766602, + "mean_token_accuracy": 0.7844567745923996, + "num_tokens": 6163184.0, + "step": 378 + }, + { + "entropy": 0.5401338934898376, + "epoch": 1.4149532710280375, + "grad_norm": 0.03694392740726471, + "learning_rate": 0.0002, + "loss": 0.5471729040145874, + "mean_token_accuracy": 0.7767902612686157, + "num_tokens": 6179419.0, + "step": 379 + }, + { + "entropy": 0.5529890209436417, + "epoch": 1.4186915887850469, + "grad_norm": 0.033812835812568665, + "learning_rate": 0.0002, + "loss": 0.565970778465271, + "mean_token_accuracy": 0.7665126621723175, + "num_tokens": 6195649.0, + "step": 380 + }, + { + "entropy": 0.5637273043394089, + "epoch": 1.422429906542056, + "grad_norm": 0.02983681485056877, + "learning_rate": 0.0002, + "loss": 0.5626575350761414, + "mean_token_accuracy": 0.7695245891809464, + "num_tokens": 6212102.0, + "step": 381 + }, + { + "entropy": 0.5482936352491379, + "epoch": 1.4261682242990654, + "grad_norm": 0.026917651295661926, + "learning_rate": 0.0002, + "loss": 0.54402095079422, + "mean_token_accuracy": 0.7796863317489624, + "num_tokens": 6228396.0, + "step": 382 + }, + { + "entropy": 0.5501932799816132, + "epoch": 1.4299065420560748, + "grad_norm": 0.029674382880330086, + "learning_rate": 0.0002, + "loss": 0.5445118546485901, + "mean_token_accuracy": 0.7807586491107941, + "num_tokens": 6244754.0, + "step": 383 + }, + { + "entropy": 0.5524531006813049, + "epoch": 1.433644859813084, + "grad_norm": 0.031190212815999985, + "learning_rate": 0.0002, + "loss": 0.5460107922554016, + "mean_token_accuracy": 0.7775425314903259, + "num_tokens": 6261129.0, + "step": 384 + }, + { + "entropy": 0.5559191405773163, + "epoch": 1.4373831775700934, + "grad_norm": 0.03291701897978783, + "learning_rate": 0.0002, + "loss": 0.5565997362136841, + "mean_token_accuracy": 0.7729406207799911, + "num_tokens": 6277295.0, + "step": 385 + }, + { + "entropy": 0.5387507379055023, + "epoch": 1.4411214953271028, + "grad_norm": 0.029946424067020416, + "learning_rate": 0.0002, + "loss": 0.5400161147117615, + "mean_token_accuracy": 0.7815364450216293, + "num_tokens": 6293740.0, + "step": 386 + }, + { + "entropy": 0.5453538447618484, + "epoch": 1.4448598130841122, + "grad_norm": 0.03343931958079338, + "learning_rate": 0.0002, + "loss": 0.5513398051261902, + "mean_token_accuracy": 0.7751688063144684, + "num_tokens": 6309953.0, + "step": 387 + }, + { + "entropy": 0.5543301105499268, + "epoch": 1.4485981308411215, + "grad_norm": 0.02918226271867752, + "learning_rate": 0.0002, + "loss": 0.5551657676696777, + "mean_token_accuracy": 0.7708156853914261, + "num_tokens": 6326235.0, + "step": 388 + }, + { + "entropy": 0.5655284225940704, + "epoch": 1.452336448598131, + "grad_norm": 0.031265538185834885, + "learning_rate": 0.0002, + "loss": 0.5654095411300659, + "mean_token_accuracy": 0.7689371109008789, + "num_tokens": 6342382.0, + "step": 389 + }, + { + "entropy": 0.543906643986702, + "epoch": 1.45607476635514, + "grad_norm": 0.0303683839738369, + "learning_rate": 0.0002, + "loss": 0.5439735651016235, + "mean_token_accuracy": 0.7809619605541229, + "num_tokens": 6358481.0, + "step": 390 + }, + { + "entropy": 0.5395262986421585, + "epoch": 1.4598130841121495, + "grad_norm": 0.029894739389419556, + "learning_rate": 0.0002, + "loss": 0.5406848192214966, + "mean_token_accuracy": 0.7806549370288849, + "num_tokens": 6374676.0, + "step": 391 + }, + { + "entropy": 0.5415726751089096, + "epoch": 1.4635514018691589, + "grad_norm": 0.03674378991127014, + "learning_rate": 0.0002, + "loss": 0.5463482141494751, + "mean_token_accuracy": 0.777136966586113, + "num_tokens": 6390831.0, + "step": 392 + }, + { + "entropy": 0.5493590235710144, + "epoch": 1.4672897196261683, + "grad_norm": 0.028417574241757393, + "learning_rate": 0.0002, + "loss": 0.5471122860908508, + "mean_token_accuracy": 0.7766950279474258, + "num_tokens": 6407316.0, + "step": 393 + }, + { + "entropy": 0.5292122960090637, + "epoch": 1.4710280373831774, + "grad_norm": 0.03298041224479675, + "learning_rate": 0.0002, + "loss": 0.5297983884811401, + "mean_token_accuracy": 0.7847895622253418, + "num_tokens": 6423557.0, + "step": 394 + }, + { + "entropy": 0.5528764873743057, + "epoch": 1.4747663551401868, + "grad_norm": 0.02988980896770954, + "learning_rate": 0.0002, + "loss": 0.5556773543357849, + "mean_token_accuracy": 0.7742319852113724, + "num_tokens": 6439634.0, + "step": 395 + }, + { + "entropy": 0.551476001739502, + "epoch": 1.4785046728971962, + "grad_norm": 0.030642125755548477, + "learning_rate": 0.0002, + "loss": 0.550857663154602, + "mean_token_accuracy": 0.7747069448232651, + "num_tokens": 6455686.0, + "step": 396 + }, + { + "entropy": 0.5380145758390427, + "epoch": 1.4822429906542056, + "grad_norm": 0.02931920997798443, + "learning_rate": 0.0002, + "loss": 0.5356473326683044, + "mean_token_accuracy": 0.7809695601463318, + "num_tokens": 6471935.0, + "step": 397 + }, + { + "entropy": 0.5300455242395401, + "epoch": 1.485981308411215, + "grad_norm": 0.035832446068525314, + "learning_rate": 0.0002, + "loss": 0.5344862937927246, + "mean_token_accuracy": 0.783536359667778, + "num_tokens": 6488278.0, + "step": 398 + }, + { + "entropy": 0.5476836711168289, + "epoch": 1.4897196261682244, + "grad_norm": 0.0333210714161396, + "learning_rate": 0.0002, + "loss": 0.5524132251739502, + "mean_token_accuracy": 0.7725930511951447, + "num_tokens": 6504288.0, + "step": 399 + }, + { + "entropy": 0.535921260714531, + "epoch": 1.4934579439252336, + "grad_norm": 0.03744916617870331, + "learning_rate": 0.0002, + "loss": 0.5385454297065735, + "mean_token_accuracy": 0.7788458317518234, + "num_tokens": 6520498.0, + "step": 400 + }, + { + "entropy": 0.5604094713926315, + "epoch": 1.497196261682243, + "grad_norm": 0.029392311349511147, + "learning_rate": 0.0002, + "loss": 0.5529255867004395, + "mean_token_accuracy": 0.7746162861585617, + "num_tokens": 6536660.0, + "step": 401 + }, + { + "entropy": 0.5519481748342514, + "epoch": 1.5009345794392523, + "grad_norm": 0.031238850206136703, + "learning_rate": 0.0002, + "loss": 0.5494250655174255, + "mean_token_accuracy": 0.7768653184175491, + "num_tokens": 6553126.0, + "step": 402 + }, + { + "entropy": 0.5527535974979401, + "epoch": 1.5046728971962615, + "grad_norm": 0.03839221969246864, + "learning_rate": 0.0002, + "loss": 0.5625566840171814, + "mean_token_accuracy": 0.7720162123441696, + "num_tokens": 6569143.0, + "step": 403 + }, + { + "entropy": 0.5520975738763809, + "epoch": 1.508411214953271, + "grad_norm": 0.03356786444783211, + "learning_rate": 0.0002, + "loss": 0.5538414716720581, + "mean_token_accuracy": 0.7732953429222107, + "num_tokens": 6585541.0, + "step": 404 + }, + { + "entropy": 0.5360509008169174, + "epoch": 1.5121495327102803, + "grad_norm": 0.03467942029237747, + "learning_rate": 0.0002, + "loss": 0.538615345954895, + "mean_token_accuracy": 0.781312882900238, + "num_tokens": 6602274.0, + "step": 405 + }, + { + "entropy": 0.5343838483095169, + "epoch": 1.5158878504672897, + "grad_norm": 0.031671129167079926, + "learning_rate": 0.0002, + "loss": 0.5385384559631348, + "mean_token_accuracy": 0.7813232839107513, + "num_tokens": 6618593.0, + "step": 406 + }, + { + "entropy": 0.5469245910644531, + "epoch": 1.519626168224299, + "grad_norm": 0.032783299684524536, + "learning_rate": 0.0002, + "loss": 0.5508422255516052, + "mean_token_accuracy": 0.7776025533676147, + "num_tokens": 6634867.0, + "step": 407 + }, + { + "entropy": 0.5412425100803375, + "epoch": 1.5233644859813085, + "grad_norm": 0.031013811007142067, + "learning_rate": 0.0002, + "loss": 0.5350072383880615, + "mean_token_accuracy": 0.7846681326627731, + "num_tokens": 6651176.0, + "step": 408 + }, + { + "entropy": 0.5564695447683334, + "epoch": 1.5271028037383179, + "grad_norm": 0.029242711141705513, + "learning_rate": 0.0002, + "loss": 0.5514513254165649, + "mean_token_accuracy": 0.7769511342048645, + "num_tokens": 6667349.0, + "step": 409 + }, + { + "entropy": 0.5430487841367722, + "epoch": 1.5308411214953273, + "grad_norm": 0.034237101674079895, + "learning_rate": 0.0002, + "loss": 0.5410158038139343, + "mean_token_accuracy": 0.7824828028678894, + "num_tokens": 6683579.0, + "step": 410 + }, + { + "entropy": 0.540764793753624, + "epoch": 1.5345794392523364, + "grad_norm": 0.033801931887865067, + "learning_rate": 0.0002, + "loss": 0.5481416583061218, + "mean_token_accuracy": 0.7755506336688995, + "num_tokens": 6700088.0, + "step": 411 + }, + { + "entropy": 0.5592468529939651, + "epoch": 1.5383177570093458, + "grad_norm": 0.03298957645893097, + "learning_rate": 0.0002, + "loss": 0.568514347076416, + "mean_token_accuracy": 0.768169105052948, + "num_tokens": 6716293.0, + "step": 412 + }, + { + "entropy": 0.5506596267223358, + "epoch": 1.542056074766355, + "grad_norm": 0.02886568009853363, + "learning_rate": 0.0002, + "loss": 0.5483353137969971, + "mean_token_accuracy": 0.7779833972454071, + "num_tokens": 6732868.0, + "step": 413 + }, + { + "entropy": 0.5599238723516464, + "epoch": 1.5457943925233644, + "grad_norm": 0.030115319415926933, + "learning_rate": 0.0002, + "loss": 0.5546934008598328, + "mean_token_accuracy": 0.7749194204807281, + "num_tokens": 6749345.0, + "step": 414 + }, + { + "entropy": 0.5543588250875473, + "epoch": 1.5495327102803738, + "grad_norm": 0.0297012310475111, + "learning_rate": 0.0002, + "loss": 0.5521113276481628, + "mean_token_accuracy": 0.7790784388780594, + "num_tokens": 6765790.0, + "step": 415 + }, + { + "entropy": 0.5414643287658691, + "epoch": 1.5532710280373832, + "grad_norm": 0.031022896990180016, + "learning_rate": 0.0002, + "loss": 0.5424535870552063, + "mean_token_accuracy": 0.7811086177825928, + "num_tokens": 6781960.0, + "step": 416 + }, + { + "entropy": 0.5437638312578201, + "epoch": 1.5570093457943925, + "grad_norm": 0.03145059570670128, + "learning_rate": 0.0002, + "loss": 0.5443586707115173, + "mean_token_accuracy": 0.7771809101104736, + "num_tokens": 6798095.0, + "step": 417 + }, + { + "entropy": 0.5528093725442886, + "epoch": 1.560747663551402, + "grad_norm": 0.03482895717024803, + "learning_rate": 0.0002, + "loss": 0.5612707138061523, + "mean_token_accuracy": 0.769527867436409, + "num_tokens": 6814364.0, + "step": 418 + }, + { + "entropy": 0.5420306473970413, + "epoch": 1.5644859813084113, + "grad_norm": 0.028292428702116013, + "learning_rate": 0.0002, + "loss": 0.5386024117469788, + "mean_token_accuracy": 0.779121145606041, + "num_tokens": 6830638.0, + "step": 419 + }, + { + "entropy": 0.5656863749027252, + "epoch": 1.5682242990654207, + "grad_norm": 0.03678108751773834, + "learning_rate": 0.0002, + "loss": 0.556216835975647, + "mean_token_accuracy": 0.7773893773555756, + "num_tokens": 6846999.0, + "step": 420 + }, + { + "entropy": 0.5444925278425217, + "epoch": 1.5719626168224299, + "grad_norm": 0.027032526209950447, + "learning_rate": 0.0002, + "loss": 0.5408859848976135, + "mean_token_accuracy": 0.779949277639389, + "num_tokens": 6863623.0, + "step": 421 + }, + { + "entropy": 0.5538263916969299, + "epoch": 1.5757009345794393, + "grad_norm": 0.030989233404397964, + "learning_rate": 0.0002, + "loss": 0.5511465072631836, + "mean_token_accuracy": 0.7773707062005997, + "num_tokens": 6880063.0, + "step": 422 + }, + { + "entropy": 0.5507351458072662, + "epoch": 1.5794392523364484, + "grad_norm": 0.031385209411382675, + "learning_rate": 0.0002, + "loss": 0.5568031668663025, + "mean_token_accuracy": 0.7725684642791748, + "num_tokens": 6896314.0, + "step": 423 + }, + { + "entropy": 0.5659656077623367, + "epoch": 1.5831775700934578, + "grad_norm": 0.03111201897263527, + "learning_rate": 0.0002, + "loss": 0.5699700713157654, + "mean_token_accuracy": 0.7707180082798004, + "num_tokens": 6912699.0, + "step": 424 + }, + { + "entropy": 0.544001892209053, + "epoch": 1.5869158878504672, + "grad_norm": 0.028569631278514862, + "learning_rate": 0.0002, + "loss": 0.5452186465263367, + "mean_token_accuracy": 0.7768339067697525, + "num_tokens": 6928999.0, + "step": 425 + }, + { + "entropy": 0.5471579134464264, + "epoch": 1.5906542056074766, + "grad_norm": 0.0320747047662735, + "learning_rate": 0.0002, + "loss": 0.5472685694694519, + "mean_token_accuracy": 0.7775517106056213, + "num_tokens": 6945430.0, + "step": 426 + }, + { + "entropy": 0.5361240059137344, + "epoch": 1.594392523364486, + "grad_norm": 0.030863573774695396, + "learning_rate": 0.0002, + "loss": 0.5366886258125305, + "mean_token_accuracy": 0.7811662554740906, + "num_tokens": 6961737.0, + "step": 427 + }, + { + "entropy": 0.5404235273599625, + "epoch": 1.5981308411214954, + "grad_norm": 0.03127690777182579, + "learning_rate": 0.0002, + "loss": 0.542918860912323, + "mean_token_accuracy": 0.7802147716283798, + "num_tokens": 6978071.0, + "step": 428 + }, + { + "entropy": 0.5443604737520218, + "epoch": 1.6018691588785048, + "grad_norm": 0.0312102772295475, + "learning_rate": 0.0002, + "loss": 0.5430488586425781, + "mean_token_accuracy": 0.7779473662376404, + "num_tokens": 6994482.0, + "step": 429 + }, + { + "entropy": 0.5391067713499069, + "epoch": 1.6056074766355142, + "grad_norm": 0.030182059854269028, + "learning_rate": 0.0002, + "loss": 0.5479601621627808, + "mean_token_accuracy": 0.7749738842248917, + "num_tokens": 7010796.0, + "step": 430 + }, + { + "entropy": 0.5425884425640106, + "epoch": 1.6093457943925233, + "grad_norm": 0.03449074178934097, + "learning_rate": 0.0002, + "loss": 0.5474182367324829, + "mean_token_accuracy": 0.7784157991409302, + "num_tokens": 7026912.0, + "step": 431 + }, + { + "entropy": 0.5485401153564453, + "epoch": 1.6130841121495327, + "grad_norm": 0.028809968382120132, + "learning_rate": 0.0002, + "loss": 0.545950710773468, + "mean_token_accuracy": 0.7787201404571533, + "num_tokens": 7043303.0, + "step": 432 + }, + { + "entropy": 0.5581236183643341, + "epoch": 1.616822429906542, + "grad_norm": 0.03381406515836716, + "learning_rate": 0.0002, + "loss": 0.551964521408081, + "mean_token_accuracy": 0.7750234007835388, + "num_tokens": 7059825.0, + "step": 433 + }, + { + "entropy": 0.5686533004045486, + "epoch": 1.6205607476635513, + "grad_norm": 0.03113553300499916, + "learning_rate": 0.0002, + "loss": 0.565824031829834, + "mean_token_accuracy": 0.7702126502990723, + "num_tokens": 7076166.0, + "step": 434 + }, + { + "entropy": 0.541614830493927, + "epoch": 1.6242990654205607, + "grad_norm": 0.02829281985759735, + "learning_rate": 0.0002, + "loss": 0.5373748540878296, + "mean_token_accuracy": 0.7802039682865143, + "num_tokens": 7092453.0, + "step": 435 + }, + { + "entropy": 0.5370374619960785, + "epoch": 1.62803738317757, + "grad_norm": 0.038676466792821884, + "learning_rate": 0.0002, + "loss": 0.5463578701019287, + "mean_token_accuracy": 0.777619257569313, + "num_tokens": 7108489.0, + "step": 436 + }, + { + "entropy": 0.5232041031122208, + "epoch": 1.6317757009345795, + "grad_norm": 0.03186805173754692, + "learning_rate": 0.0002, + "loss": 0.5275849103927612, + "mean_token_accuracy": 0.7832721769809723, + "num_tokens": 7124721.0, + "step": 437 + }, + { + "entropy": 0.553793340921402, + "epoch": 1.6355140186915889, + "grad_norm": 0.034783560782670975, + "learning_rate": 0.0002, + "loss": 0.5545384287834167, + "mean_token_accuracy": 0.7748959958553314, + "num_tokens": 7141085.0, + "step": 438 + }, + { + "entropy": 0.5291094183921814, + "epoch": 1.6392523364485982, + "grad_norm": 0.032007865607738495, + "learning_rate": 0.0002, + "loss": 0.5233455896377563, + "mean_token_accuracy": 0.7880895286798477, + "num_tokens": 7157324.0, + "step": 439 + }, + { + "entropy": 0.5520578771829605, + "epoch": 1.6429906542056076, + "grad_norm": 0.03354523330926895, + "learning_rate": 0.0002, + "loss": 0.546679675579071, + "mean_token_accuracy": 0.7785679250955582, + "num_tokens": 7173748.0, + "step": 440 + }, + { + "entropy": 0.556230291724205, + "epoch": 1.6467289719626168, + "grad_norm": 0.029538605362176895, + "learning_rate": 0.0002, + "loss": 0.5578120946884155, + "mean_token_accuracy": 0.7732451260089874, + "num_tokens": 7190279.0, + "step": 441 + }, + { + "entropy": 0.5558031052350998, + "epoch": 1.6504672897196262, + "grad_norm": 0.032964110374450684, + "learning_rate": 0.0002, + "loss": 0.5642295479774475, + "mean_token_accuracy": 0.7702266126871109, + "num_tokens": 7206573.0, + "step": 442 + }, + { + "entropy": 0.5462649762630463, + "epoch": 1.6542056074766354, + "grad_norm": 0.03030145727097988, + "learning_rate": 0.0002, + "loss": 0.5472926497459412, + "mean_token_accuracy": 0.7779358178377151, + "num_tokens": 7222876.0, + "step": 443 + }, + { + "entropy": 0.5564112961292267, + "epoch": 1.6579439252336448, + "grad_norm": 0.03184789791703224, + "learning_rate": 0.0002, + "loss": 0.5510616898536682, + "mean_token_accuracy": 0.778466135263443, + "num_tokens": 7239236.0, + "step": 444 + }, + { + "entropy": 0.5630057454109192, + "epoch": 1.6616822429906541, + "grad_norm": 0.029552895575761795, + "learning_rate": 0.0002, + "loss": 0.5598853826522827, + "mean_token_accuracy": 0.7684319913387299, + "num_tokens": 7255686.0, + "step": 445 + }, + { + "entropy": 0.5392823964357376, + "epoch": 1.6654205607476635, + "grad_norm": 0.03244130313396454, + "learning_rate": 0.0002, + "loss": 0.535351574420929, + "mean_token_accuracy": 0.782013326883316, + "num_tokens": 7271954.0, + "step": 446 + }, + { + "entropy": 0.5588325262069702, + "epoch": 1.669158878504673, + "grad_norm": 0.03356410935521126, + "learning_rate": 0.0002, + "loss": 0.5625864267349243, + "mean_token_accuracy": 0.7675231099128723, + "num_tokens": 7288380.0, + "step": 447 + }, + { + "entropy": 0.5461699217557907, + "epoch": 1.6728971962616823, + "grad_norm": 0.03438694775104523, + "learning_rate": 0.0002, + "loss": 0.5539066195487976, + "mean_token_accuracy": 0.7729915380477905, + "num_tokens": 7304727.0, + "step": 448 + }, + { + "entropy": 0.5323803722858429, + "epoch": 1.6766355140186917, + "grad_norm": 0.031167054548859596, + "learning_rate": 0.0002, + "loss": 0.5316429138183594, + "mean_token_accuracy": 0.7824341952800751, + "num_tokens": 7321168.0, + "step": 449 + }, + { + "entropy": 0.5499107241630554, + "epoch": 1.680373831775701, + "grad_norm": 0.02761506475508213, + "learning_rate": 0.0002, + "loss": 0.5486537218093872, + "mean_token_accuracy": 0.7810203582048416, + "num_tokens": 7337539.0, + "step": 450 + }, + { + "entropy": 0.5529857277870178, + "epoch": 1.6841121495327103, + "grad_norm": 0.027268333360552788, + "learning_rate": 0.0002, + "loss": 0.5520575642585754, + "mean_token_accuracy": 0.7759742587804794, + "num_tokens": 7353841.0, + "step": 451 + }, + { + "entropy": 0.5497375279664993, + "epoch": 1.6878504672897197, + "grad_norm": 0.03056921809911728, + "learning_rate": 0.0002, + "loss": 0.5536624789237976, + "mean_token_accuracy": 0.7728510946035385, + "num_tokens": 7370087.0, + "step": 452 + }, + { + "entropy": 0.5587235540151596, + "epoch": 1.6915887850467288, + "grad_norm": 0.029484033584594727, + "learning_rate": 0.0002, + "loss": 0.5550922751426697, + "mean_token_accuracy": 0.7721101641654968, + "num_tokens": 7386564.0, + "step": 453 + }, + { + "entropy": 0.5490273237228394, + "epoch": 1.6953271028037382, + "grad_norm": 0.030445117503404617, + "learning_rate": 0.0002, + "loss": 0.5525585412979126, + "mean_token_accuracy": 0.7748779058456421, + "num_tokens": 7402899.0, + "step": 454 + }, + { + "entropy": 0.5428823530673981, + "epoch": 1.6990654205607476, + "grad_norm": 0.03236573934555054, + "learning_rate": 0.0002, + "loss": 0.5434162020683289, + "mean_token_accuracy": 0.7772506028413773, + "num_tokens": 7419333.0, + "step": 455 + }, + { + "entropy": 0.5431548655033112, + "epoch": 1.702803738317757, + "grad_norm": 0.02851344272494316, + "learning_rate": 0.0002, + "loss": 0.548797607421875, + "mean_token_accuracy": 0.7761964499950409, + "num_tokens": 7435551.0, + "step": 456 + }, + { + "entropy": 0.5437126010656357, + "epoch": 1.7065420560747664, + "grad_norm": 0.028895648196339607, + "learning_rate": 0.0002, + "loss": 0.5451984405517578, + "mean_token_accuracy": 0.7779933959245682, + "num_tokens": 7451825.0, + "step": 457 + }, + { + "entropy": 0.5545783340930939, + "epoch": 1.7102803738317758, + "grad_norm": 0.029966024681925774, + "learning_rate": 0.0002, + "loss": 0.5465367436408997, + "mean_token_accuracy": 0.7784044593572617, + "num_tokens": 7468353.0, + "step": 458 + }, + { + "entropy": 0.5588552802801132, + "epoch": 1.7140186915887852, + "grad_norm": 0.03131483122706413, + "learning_rate": 0.0002, + "loss": 0.5527791976928711, + "mean_token_accuracy": 0.7737509310245514, + "num_tokens": 7484697.0, + "step": 459 + }, + { + "entropy": 0.544754758477211, + "epoch": 1.7177570093457943, + "grad_norm": 0.032681871205568314, + "learning_rate": 0.0002, + "loss": 0.544547975063324, + "mean_token_accuracy": 0.7755615264177322, + "num_tokens": 7500910.0, + "step": 460 + }, + { + "entropy": 0.5452838689088821, + "epoch": 1.7214953271028037, + "grad_norm": 0.03389411419630051, + "learning_rate": 0.0002, + "loss": 0.5545741319656372, + "mean_token_accuracy": 0.7773826718330383, + "num_tokens": 7517229.0, + "step": 461 + }, + { + "entropy": 0.5361888408660889, + "epoch": 1.7252336448598131, + "grad_norm": 0.03197981417179108, + "learning_rate": 0.0002, + "loss": 0.5412315726280212, + "mean_token_accuracy": 0.7807294726371765, + "num_tokens": 7533496.0, + "step": 462 + }, + { + "entropy": 0.5551300644874573, + "epoch": 1.7289719626168223, + "grad_norm": 0.029581068083643913, + "learning_rate": 0.0002, + "loss": 0.5573985576629639, + "mean_token_accuracy": 0.7714389115571976, + "num_tokens": 7549945.0, + "step": 463 + }, + { + "entropy": 0.561688169836998, + "epoch": 1.7327102803738317, + "grad_norm": 0.06545328348875046, + "learning_rate": 0.0002, + "loss": 0.5578382611274719, + "mean_token_accuracy": 0.7731240689754486, + "num_tokens": 7566295.0, + "step": 464 + }, + { + "entropy": 0.546341672539711, + "epoch": 1.736448598130841, + "grad_norm": 0.033630188554525375, + "learning_rate": 0.0002, + "loss": 0.5406599044799805, + "mean_token_accuracy": 0.7778750658035278, + "num_tokens": 7582430.0, + "step": 465 + }, + { + "entropy": 0.5360140800476074, + "epoch": 1.7401869158878505, + "grad_norm": 0.0722091943025589, + "learning_rate": 0.0002, + "loss": 0.5544889569282532, + "mean_token_accuracy": 0.7800410985946655, + "num_tokens": 7598724.0, + "step": 466 + }, + { + "entropy": 0.5511822551488876, + "epoch": 1.7439252336448599, + "grad_norm": 0.0453655868768692, + "learning_rate": 0.0002, + "loss": 0.5510369539260864, + "mean_token_accuracy": 0.7753463685512543, + "num_tokens": 7615066.0, + "step": 467 + }, + { + "entropy": 0.5692480951547623, + "epoch": 1.7476635514018692, + "grad_norm": 0.03852085396647453, + "learning_rate": 0.0002, + "loss": 0.5623142123222351, + "mean_token_accuracy": 0.7731426060199738, + "num_tokens": 7631470.0, + "step": 468 + }, + { + "entropy": 0.5454059541225433, + "epoch": 1.7514018691588786, + "grad_norm": 0.03272124007344246, + "learning_rate": 0.0002, + "loss": 0.5412269830703735, + "mean_token_accuracy": 0.7835355550050735, + "num_tokens": 7647653.0, + "step": 469 + }, + { + "entropy": 0.5629090666770935, + "epoch": 1.7551401869158878, + "grad_norm": 0.03476416692137718, + "learning_rate": 0.0002, + "loss": 0.5631899833679199, + "mean_token_accuracy": 0.7717723101377487, + "num_tokens": 7664168.0, + "step": 470 + }, + { + "entropy": 0.5542246699333191, + "epoch": 1.7588785046728972, + "grad_norm": 0.04324596747756004, + "learning_rate": 0.0002, + "loss": 0.5525742769241333, + "mean_token_accuracy": 0.7750708311796188, + "num_tokens": 7680515.0, + "step": 471 + }, + { + "entropy": 0.5454518496990204, + "epoch": 1.7626168224299066, + "grad_norm": 0.029530908912420273, + "learning_rate": 0.0002, + "loss": 0.5429126024246216, + "mean_token_accuracy": 0.7794791609048843, + "num_tokens": 7696701.0, + "step": 472 + }, + { + "entropy": 0.537155881524086, + "epoch": 1.7663551401869158, + "grad_norm": 0.03453315421938896, + "learning_rate": 0.0002, + "loss": 0.5391392111778259, + "mean_token_accuracy": 0.7809355705976486, + "num_tokens": 7712858.0, + "step": 473 + }, + { + "entropy": 0.5679727643728256, + "epoch": 1.7700934579439251, + "grad_norm": 0.033509086817502975, + "learning_rate": 0.0002, + "loss": 0.5633649826049805, + "mean_token_accuracy": 0.7738876938819885, + "num_tokens": 7729346.0, + "step": 474 + }, + { + "entropy": 0.5437932014465332, + "epoch": 1.7738317757009345, + "grad_norm": 0.03146712854504585, + "learning_rate": 0.0002, + "loss": 0.5468876361846924, + "mean_token_accuracy": 0.7783123552799225, + "num_tokens": 7745581.0, + "step": 475 + }, + { + "entropy": 0.5403594225645065, + "epoch": 1.777570093457944, + "grad_norm": 0.03930533304810524, + "learning_rate": 0.0002, + "loss": 0.5513323545455933, + "mean_token_accuracy": 0.7714991271495819, + "num_tokens": 7761661.0, + "step": 476 + }, + { + "entropy": 0.5275924056768417, + "epoch": 1.7813084112149533, + "grad_norm": 0.029612882062792778, + "learning_rate": 0.0002, + "loss": 0.5271086096763611, + "mean_token_accuracy": 0.7869246900081635, + "num_tokens": 7777776.0, + "step": 477 + }, + { + "entropy": 0.5542174875736237, + "epoch": 1.7850467289719627, + "grad_norm": 0.03353600203990936, + "learning_rate": 0.0002, + "loss": 0.5515888333320618, + "mean_token_accuracy": 0.7757923156023026, + "num_tokens": 7794208.0, + "step": 478 + }, + { + "entropy": 0.5686136484146118, + "epoch": 1.788785046728972, + "grad_norm": 0.0359848290681839, + "learning_rate": 0.0002, + "loss": 0.5625109672546387, + "mean_token_accuracy": 0.770489439368248, + "num_tokens": 7810470.0, + "step": 479 + }, + { + "entropy": 0.5591467618942261, + "epoch": 1.7925233644859813, + "grad_norm": 0.03523039445281029, + "learning_rate": 0.0002, + "loss": 0.5659374594688416, + "mean_token_accuracy": 0.7701008170843124, + "num_tokens": 7826783.0, + "step": 480 + }, + { + "entropy": 0.5457279831171036, + "epoch": 1.7962616822429907, + "grad_norm": 0.03323599696159363, + "learning_rate": 0.0002, + "loss": 0.5534178614616394, + "mean_token_accuracy": 0.7753042280673981, + "num_tokens": 7843052.0, + "step": 481 + }, + { + "entropy": 0.5539824962615967, + "epoch": 1.8, + "grad_norm": 0.03477081283926964, + "learning_rate": 0.0002, + "loss": 0.5533367395401001, + "mean_token_accuracy": 0.7745995223522186, + "num_tokens": 7859471.0, + "step": 482 + }, + { + "entropy": 0.5645311027765274, + "epoch": 1.8037383177570092, + "grad_norm": 0.03863651677966118, + "learning_rate": 0.0002, + "loss": 0.5656211376190186, + "mean_token_accuracy": 0.7659904807806015, + "num_tokens": 7875935.0, + "step": 483 + }, + { + "entropy": 0.5520930737257004, + "epoch": 1.8074766355140186, + "grad_norm": 0.030585726723074913, + "learning_rate": 0.0002, + "loss": 0.5455970168113708, + "mean_token_accuracy": 0.7777505069971085, + "num_tokens": 7892341.0, + "step": 484 + }, + { + "entropy": 0.5476150959730148, + "epoch": 1.811214953271028, + "grad_norm": 0.03229457512497902, + "learning_rate": 0.0002, + "loss": 0.5434688329696655, + "mean_token_accuracy": 0.7792231887578964, + "num_tokens": 7908634.0, + "step": 485 + }, + { + "entropy": 0.5417618900537491, + "epoch": 1.8149532710280374, + "grad_norm": 0.036823540925979614, + "learning_rate": 0.0002, + "loss": 0.5497291684150696, + "mean_token_accuracy": 0.7776121497154236, + "num_tokens": 7924838.0, + "step": 486 + }, + { + "entropy": 0.5513382405042648, + "epoch": 1.8186915887850468, + "grad_norm": 0.03352766111493111, + "learning_rate": 0.0002, + "loss": 0.5549485683441162, + "mean_token_accuracy": 0.7709111571311951, + "num_tokens": 7941280.0, + "step": 487 + }, + { + "entropy": 0.5313259810209274, + "epoch": 1.8224299065420562, + "grad_norm": 0.03318050131201744, + "learning_rate": 0.0002, + "loss": 0.5327860116958618, + "mean_token_accuracy": 0.7831867635250092, + "num_tokens": 7957629.0, + "step": 488 + }, + { + "entropy": 0.5544183105230331, + "epoch": 1.8261682242990656, + "grad_norm": 0.031777020543813705, + "learning_rate": 0.0002, + "loss": 0.5501781702041626, + "mean_token_accuracy": 0.7757555842399597, + "num_tokens": 7974007.0, + "step": 489 + }, + { + "entropy": 0.5466530621051788, + "epoch": 1.8299065420560747, + "grad_norm": 0.03351164236664772, + "learning_rate": 0.0002, + "loss": 0.5517051815986633, + "mean_token_accuracy": 0.7755817025899887, + "num_tokens": 7990357.0, + "step": 490 + }, + { + "entropy": 0.5637551099061966, + "epoch": 1.8336448598130841, + "grad_norm": 0.029702039435505867, + "learning_rate": 0.0002, + "loss": 0.5635112524032593, + "mean_token_accuracy": 0.7732028961181641, + "num_tokens": 8006595.0, + "step": 491 + }, + { + "entropy": 0.5468285977840424, + "epoch": 1.8373831775700935, + "grad_norm": 0.03702588006854057, + "learning_rate": 0.0002, + "loss": 0.5488995909690857, + "mean_token_accuracy": 0.7771299928426743, + "num_tokens": 8022970.0, + "step": 492 + }, + { + "entropy": 0.5305586457252502, + "epoch": 1.8411214953271027, + "grad_norm": 0.03400636091828346, + "learning_rate": 0.0002, + "loss": 0.5370339155197144, + "mean_token_accuracy": 0.7821398675441742, + "num_tokens": 8038831.0, + "step": 493 + }, + { + "entropy": 0.5582829564809799, + "epoch": 1.844859813084112, + "grad_norm": 0.034549374133348465, + "learning_rate": 0.0002, + "loss": 0.5647696852684021, + "mean_token_accuracy": 0.7711978554725647, + "num_tokens": 8055116.0, + "step": 494 + }, + { + "entropy": 0.5546866208314896, + "epoch": 1.8485981308411215, + "grad_norm": 0.030837178230285645, + "learning_rate": 0.0002, + "loss": 0.5502550601959229, + "mean_token_accuracy": 0.7736955285072327, + "num_tokens": 8071294.0, + "step": 495 + }, + { + "entropy": 0.5573176741600037, + "epoch": 1.8523364485981308, + "grad_norm": 0.030723506584763527, + "learning_rate": 0.0002, + "loss": 0.5504124164581299, + "mean_token_accuracy": 0.7768528610467911, + "num_tokens": 8087507.0, + "step": 496 + }, + { + "entropy": 0.5464990586042404, + "epoch": 1.8560747663551402, + "grad_norm": 0.031615249812603, + "learning_rate": 0.0002, + "loss": 0.5419277548789978, + "mean_token_accuracy": 0.7811318039894104, + "num_tokens": 8103739.0, + "step": 497 + }, + { + "entropy": 0.5507507473230362, + "epoch": 1.8598130841121496, + "grad_norm": 0.031241271644830704, + "learning_rate": 0.0002, + "loss": 0.5549343824386597, + "mean_token_accuracy": 0.7766365706920624, + "num_tokens": 8120195.0, + "step": 498 + }, + { + "entropy": 0.528255894780159, + "epoch": 1.863551401869159, + "grad_norm": 0.03131619840860367, + "learning_rate": 0.0002, + "loss": 0.5327234268188477, + "mean_token_accuracy": 0.7834469825029373, + "num_tokens": 8136237.0, + "step": 499 + }, + { + "entropy": 0.5606546849012375, + "epoch": 1.8672897196261682, + "grad_norm": 0.029506012797355652, + "learning_rate": 0.0002, + "loss": 0.5612553954124451, + "mean_token_accuracy": 0.7706436961889267, + "num_tokens": 8152593.0, + "step": 500 + }, + { + "entropy": 0.5505183190107346, + "epoch": 1.8710280373831776, + "grad_norm": 0.027479592710733414, + "learning_rate": 0.0002, + "loss": 0.5496048331260681, + "mean_token_accuracy": 0.7758743315935135, + "num_tokens": 8168904.0, + "step": 501 + }, + { + "entropy": 0.5521711856126785, + "epoch": 1.874766355140187, + "grad_norm": 0.029544338583946228, + "learning_rate": 0.0002, + "loss": 0.5500134825706482, + "mean_token_accuracy": 0.7751173824071884, + "num_tokens": 8185076.0, + "step": 502 + }, + { + "entropy": 0.5535303503274918, + "epoch": 1.8785046728971961, + "grad_norm": 0.03169948607683182, + "learning_rate": 0.0002, + "loss": 0.5551643967628479, + "mean_token_accuracy": 0.7735145539045334, + "num_tokens": 8201211.0, + "step": 503 + }, + { + "entropy": 0.5526206642389297, + "epoch": 1.8822429906542055, + "grad_norm": 0.029640454798936844, + "learning_rate": 0.0002, + "loss": 0.5438597202301025, + "mean_token_accuracy": 0.7798311114311218, + "num_tokens": 8217307.0, + "step": 504 + }, + { + "entropy": 0.5524756759405136, + "epoch": 1.885981308411215, + "grad_norm": 0.036887627094984055, + "learning_rate": 0.0002, + "loss": 0.5504904985427856, + "mean_token_accuracy": 0.7746451199054718, + "num_tokens": 8233547.0, + "step": 505 + }, + { + "entropy": 0.5518786162137985, + "epoch": 1.8897196261682243, + "grad_norm": 0.03335992246866226, + "learning_rate": 0.0002, + "loss": 0.5517621040344238, + "mean_token_accuracy": 0.7776313275098801, + "num_tokens": 8249917.0, + "step": 506 + }, + { + "entropy": 0.5617891550064087, + "epoch": 1.8934579439252337, + "grad_norm": 0.032016802579164505, + "learning_rate": 0.0002, + "loss": 0.5628353357315063, + "mean_token_accuracy": 0.7710386514663696, + "num_tokens": 8266109.0, + "step": 507 + }, + { + "entropy": 0.542267769575119, + "epoch": 1.897196261682243, + "grad_norm": 0.030068468302488327, + "learning_rate": 0.0002, + "loss": 0.5358614921569824, + "mean_token_accuracy": 0.7818213999271393, + "num_tokens": 8282620.0, + "step": 508 + }, + { + "entropy": 0.5325959101319313, + "epoch": 1.9009345794392525, + "grad_norm": 0.026692470535635948, + "learning_rate": 0.0002, + "loss": 0.5332909822463989, + "mean_token_accuracy": 0.7836195677518845, + "num_tokens": 8299102.0, + "step": 509 + }, + { + "entropy": 0.5525154024362564, + "epoch": 1.9046728971962616, + "grad_norm": 0.029202645644545555, + "learning_rate": 0.0002, + "loss": 0.5564330816268921, + "mean_token_accuracy": 0.7734202891588211, + "num_tokens": 8315257.0, + "step": 510 + }, + { + "entropy": 0.5515794008970261, + "epoch": 1.908411214953271, + "grad_norm": 0.03252064809203148, + "learning_rate": 0.0002, + "loss": 0.5536593198776245, + "mean_token_accuracy": 0.7732348889112473, + "num_tokens": 8331546.0, + "step": 511 + }, + { + "entropy": 0.548397034406662, + "epoch": 1.9121495327102802, + "grad_norm": 0.02674887888133526, + "learning_rate": 0.0002, + "loss": 0.5501974821090698, + "mean_token_accuracy": 0.7732282876968384, + "num_tokens": 8347949.0, + "step": 512 + }, + { + "entropy": 0.5453347116708755, + "epoch": 1.9158878504672896, + "grad_norm": 0.031093353405594826, + "learning_rate": 0.0002, + "loss": 0.5438382625579834, + "mean_token_accuracy": 0.7786152511835098, + "num_tokens": 8364391.0, + "step": 513 + }, + { + "entropy": 0.5397911220788956, + "epoch": 1.919626168224299, + "grad_norm": 0.03151440620422363, + "learning_rate": 0.0002, + "loss": 0.5433223247528076, + "mean_token_accuracy": 0.782426580786705, + "num_tokens": 8380708.0, + "step": 514 + }, + { + "entropy": 0.5369860827922821, + "epoch": 1.9233644859813084, + "grad_norm": 0.035960424691438675, + "learning_rate": 0.0002, + "loss": 0.5434173345565796, + "mean_token_accuracy": 0.7791251987218857, + "num_tokens": 8397064.0, + "step": 515 + }, + { + "entropy": 0.5706264823675156, + "epoch": 1.9271028037383178, + "grad_norm": 0.032570187002420425, + "learning_rate": 0.0002, + "loss": 0.5725105404853821, + "mean_token_accuracy": 0.7670982778072357, + "num_tokens": 8413343.0, + "step": 516 + }, + { + "entropy": 0.5545293837785721, + "epoch": 1.9308411214953272, + "grad_norm": 0.02958519756793976, + "learning_rate": 0.0002, + "loss": 0.5506482124328613, + "mean_token_accuracy": 0.7761605083942413, + "num_tokens": 8429612.0, + "step": 517 + }, + { + "entropy": 0.5615670382976532, + "epoch": 1.9345794392523366, + "grad_norm": 0.033964622765779495, + "learning_rate": 0.0002, + "loss": 0.5493220686912537, + "mean_token_accuracy": 0.7761300206184387, + "num_tokens": 8446212.0, + "step": 518 + }, + { + "entropy": 0.5410606861114502, + "epoch": 1.938317757009346, + "grad_norm": 0.03190803527832031, + "learning_rate": 0.0002, + "loss": 0.5400397181510925, + "mean_token_accuracy": 0.7820769846439362, + "num_tokens": 8462334.0, + "step": 519 + }, + { + "entropy": 0.5581596940755844, + "epoch": 1.9420560747663551, + "grad_norm": 0.03331439197063446, + "learning_rate": 0.0002, + "loss": 0.5643476843833923, + "mean_token_accuracy": 0.7706414759159088, + "num_tokens": 8478855.0, + "step": 520 + }, + { + "entropy": 0.5354294329881668, + "epoch": 1.9457943925233645, + "grad_norm": 0.03460956737399101, + "learning_rate": 0.0002, + "loss": 0.5405067801475525, + "mean_token_accuracy": 0.7803503423929214, + "num_tokens": 8495299.0, + "step": 521 + }, + { + "entropy": 0.5444121658802032, + "epoch": 1.9495327102803737, + "grad_norm": 0.03110821172595024, + "learning_rate": 0.0002, + "loss": 0.5491779446601868, + "mean_token_accuracy": 0.7754928916692734, + "num_tokens": 8512049.0, + "step": 522 + }, + { + "entropy": 0.5544265806674957, + "epoch": 1.953271028037383, + "grad_norm": 0.03355475887656212, + "learning_rate": 0.0002, + "loss": 0.5517175197601318, + "mean_token_accuracy": 0.7744346410036087, + "num_tokens": 8528391.0, + "step": 523 + }, + { + "entropy": 0.5565292239189148, + "epoch": 1.9570093457943925, + "grad_norm": 0.029388394206762314, + "learning_rate": 0.0002, + "loss": 0.5525711178779602, + "mean_token_accuracy": 0.7771426290273666, + "num_tokens": 8544951.0, + "step": 524 + }, + { + "entropy": 0.5562525689601898, + "epoch": 1.9607476635514018, + "grad_norm": 0.03225899115204811, + "learning_rate": 0.0002, + "loss": 0.5521144866943359, + "mean_token_accuracy": 0.7747148424386978, + "num_tokens": 8561319.0, + "step": 525 + }, + { + "entropy": 0.5570588409900665, + "epoch": 1.9644859813084112, + "grad_norm": 0.03741025924682617, + "learning_rate": 0.0002, + "loss": 0.557031512260437, + "mean_token_accuracy": 0.7722350209951401, + "num_tokens": 8577515.0, + "step": 526 + }, + { + "entropy": 0.5351222157478333, + "epoch": 1.9682242990654206, + "grad_norm": 0.029640430584549904, + "learning_rate": 0.0002, + "loss": 0.5402463674545288, + "mean_token_accuracy": 0.7800589948892593, + "num_tokens": 8593870.0, + "step": 527 + }, + { + "entropy": 0.5618735998868942, + "epoch": 1.97196261682243, + "grad_norm": 0.03617403656244278, + "learning_rate": 0.0002, + "loss": 0.5702189803123474, + "mean_token_accuracy": 0.7701396048069, + "num_tokens": 8610566.0, + "step": 528 + }, + { + "entropy": 0.5421914011240005, + "epoch": 1.9757009345794394, + "grad_norm": 0.0348467081785202, + "learning_rate": 0.0002, + "loss": 0.5410540699958801, + "mean_token_accuracy": 0.7805227339267731, + "num_tokens": 8626999.0, + "step": 529 + }, + { + "entropy": 0.5460681766271591, + "epoch": 1.9794392523364486, + "grad_norm": 0.03002713993191719, + "learning_rate": 0.0002, + "loss": 0.5420626997947693, + "mean_token_accuracy": 0.7775750756263733, + "num_tokens": 8643379.0, + "step": 530 + }, + { + "entropy": 0.5560031235218048, + "epoch": 1.983177570093458, + "grad_norm": 0.02867712639272213, + "learning_rate": 0.0002, + "loss": 0.5502682328224182, + "mean_token_accuracy": 0.7767003029584885, + "num_tokens": 8659814.0, + "step": 531 + }, + { + "entropy": 0.5697564780712128, + "epoch": 1.9869158878504671, + "grad_norm": 0.034316740930080414, + "learning_rate": 0.0002, + "loss": 0.5676635503768921, + "mean_token_accuracy": 0.7674341350793839, + "num_tokens": 8676132.0, + "step": 532 + }, + { + "entropy": 0.5442752093076706, + "epoch": 1.9906542056074765, + "grad_norm": 0.032586514949798584, + "learning_rate": 0.0002, + "loss": 0.5461133718490601, + "mean_token_accuracy": 0.7766181975603104, + "num_tokens": 8692605.0, + "step": 533 + }, + { + "entropy": 0.5408132523298264, + "epoch": 1.994392523364486, + "grad_norm": 0.028621118515729904, + "learning_rate": 0.0002, + "loss": 0.5469282865524292, + "mean_token_accuracy": 0.7724520564079285, + "num_tokens": 8708747.0, + "step": 534 + }, + { + "entropy": 0.5602825433015823, + "epoch": 1.9981308411214953, + "grad_norm": 0.030758850276470184, + "learning_rate": 0.0002, + "loss": 0.561021625995636, + "mean_token_accuracy": 0.7700503617525101, + "num_tokens": 8725013.0, + "step": 535 + }, + { + "entropy": 0.5262570530176163, + "epoch": 2.0, + "grad_norm": 0.041106510907411575, + "learning_rate": 0.0002, + "loss": 0.5266788601875305, + "mean_token_accuracy": 0.7866178452968597, + "num_tokens": 8730698.0, + "step": 536 + }, + { + "entropy": 0.5550900399684906, + "epoch": 2.0037383177570094, + "grad_norm": 0.031247610226273537, + "learning_rate": 0.0002, + "loss": 0.5427648425102234, + "mean_token_accuracy": 0.7791824787855148, + "num_tokens": 8747042.0, + "step": 537 + }, + { + "entropy": 0.5459851175546646, + "epoch": 2.007476635514019, + "grad_norm": 0.03372490033507347, + "learning_rate": 0.0002, + "loss": 0.5406405925750732, + "mean_token_accuracy": 0.7772052884101868, + "num_tokens": 8763334.0, + "step": 538 + }, + { + "entropy": 0.5324701964855194, + "epoch": 2.011214953271028, + "grad_norm": 0.035976309329271317, + "learning_rate": 0.0002, + "loss": 0.5377114415168762, + "mean_token_accuracy": 0.7791852504014969, + "num_tokens": 8779449.0, + "step": 539 + }, + { + "entropy": 0.5410480201244354, + "epoch": 2.0149532710280376, + "grad_norm": 0.03491639345884323, + "learning_rate": 0.0002, + "loss": 0.546131432056427, + "mean_token_accuracy": 0.7801298946142197, + "num_tokens": 8795796.0, + "step": 540 + }, + { + "entropy": 0.5370665192604065, + "epoch": 2.0186915887850465, + "grad_norm": 0.04541780799627304, + "learning_rate": 0.0002, + "loss": 0.5394240617752075, + "mean_token_accuracy": 0.7802438586950302, + "num_tokens": 8812239.0, + "step": 541 + }, + { + "entropy": 0.5596486777067184, + "epoch": 2.022429906542056, + "grad_norm": 0.03784856200218201, + "learning_rate": 0.0002, + "loss": 0.5519159436225891, + "mean_token_accuracy": 0.7757291346788406, + "num_tokens": 8828405.0, + "step": 542 + }, + { + "entropy": 0.5454950630664825, + "epoch": 2.0261682242990653, + "grad_norm": 0.044418152421712875, + "learning_rate": 0.0002, + "loss": 0.5402126908302307, + "mean_token_accuracy": 0.7813611626625061, + "num_tokens": 8844636.0, + "step": 543 + }, + { + "entropy": 0.5182391032576561, + "epoch": 2.0299065420560747, + "grad_norm": 0.03811662271618843, + "learning_rate": 0.0002, + "loss": 0.5167620778083801, + "mean_token_accuracy": 0.7919700294733047, + "num_tokens": 8860774.0, + "step": 544 + }, + { + "entropy": 0.5337931364774704, + "epoch": 2.033644859813084, + "grad_norm": 0.04127589613199234, + "learning_rate": 0.0002, + "loss": 0.5322132110595703, + "mean_token_accuracy": 0.78387551009655, + "num_tokens": 8876930.0, + "step": 545 + }, + { + "entropy": 0.5331794023513794, + "epoch": 2.0373831775700935, + "grad_norm": 0.03687538579106331, + "learning_rate": 0.0002, + "loss": 0.5393625497817993, + "mean_token_accuracy": 0.7809632271528244, + "num_tokens": 8893177.0, + "step": 546 + }, + { + "entropy": 0.5235881805419922, + "epoch": 2.041121495327103, + "grad_norm": 0.04371653124690056, + "learning_rate": 0.0002, + "loss": 0.5326187610626221, + "mean_token_accuracy": 0.7816834598779678, + "num_tokens": 8909522.0, + "step": 547 + }, + { + "entropy": 0.5280764102935791, + "epoch": 2.0448598130841122, + "grad_norm": 0.03337106481194496, + "learning_rate": 0.0002, + "loss": 0.531906247138977, + "mean_token_accuracy": 0.7849632948637009, + "num_tokens": 8925662.0, + "step": 548 + }, + { + "entropy": 0.5238441824913025, + "epoch": 2.0485981308411216, + "grad_norm": 0.04188013821840286, + "learning_rate": 0.0002, + "loss": 0.5119596123695374, + "mean_token_accuracy": 0.7923747897148132, + "num_tokens": 8942074.0, + "step": 549 + }, + { + "entropy": 0.5489809960126877, + "epoch": 2.052336448598131, + "grad_norm": 0.03609222173690796, + "learning_rate": 0.0002, + "loss": 0.5403028726577759, + "mean_token_accuracy": 0.7791068702936172, + "num_tokens": 8958329.0, + "step": 550 + }, + { + "entropy": 0.5437890142202377, + "epoch": 2.05607476635514, + "grad_norm": 0.03745613619685173, + "learning_rate": 0.0002, + "loss": 0.5450887084007263, + "mean_token_accuracy": 0.7779961079359055, + "num_tokens": 8974422.0, + "step": 551 + }, + { + "entropy": 0.5333922803401947, + "epoch": 2.0598130841121494, + "grad_norm": 0.042939819395542145, + "learning_rate": 0.0002, + "loss": 0.5377407670021057, + "mean_token_accuracy": 0.7828031182289124, + "num_tokens": 8990622.0, + "step": 552 + }, + { + "entropy": 0.5367026776075363, + "epoch": 2.0635514018691588, + "grad_norm": 0.03213382139801979, + "learning_rate": 0.0002, + "loss": 0.5339721441268921, + "mean_token_accuracy": 0.7822674959897995, + "num_tokens": 9006957.0, + "step": 553 + }, + { + "entropy": 0.5521921962499619, + "epoch": 2.067289719626168, + "grad_norm": 0.043271131813526154, + "learning_rate": 0.0002, + "loss": 0.5421251654624939, + "mean_token_accuracy": 0.7799997925758362, + "num_tokens": 9023385.0, + "step": 554 + }, + { + "entropy": 0.536955714225769, + "epoch": 2.0710280373831775, + "grad_norm": 0.036944299936294556, + "learning_rate": 0.0002, + "loss": 0.53342604637146, + "mean_token_accuracy": 0.7826140820980072, + "num_tokens": 9039979.0, + "step": 555 + }, + { + "entropy": 0.5285215377807617, + "epoch": 2.074766355140187, + "grad_norm": 0.03607345372438431, + "learning_rate": 0.0002, + "loss": 0.5354245901107788, + "mean_token_accuracy": 0.7834082543849945, + "num_tokens": 9056411.0, + "step": 556 + }, + { + "entropy": 0.5265194773674011, + "epoch": 2.0785046728971963, + "grad_norm": 0.03231218829751015, + "learning_rate": 0.0002, + "loss": 0.5345092415809631, + "mean_token_accuracy": 0.7815968543291092, + "num_tokens": 9072925.0, + "step": 557 + }, + { + "entropy": 0.528659924864769, + "epoch": 2.0822429906542057, + "grad_norm": 0.0367112010717392, + "learning_rate": 0.0002, + "loss": 0.5276532173156738, + "mean_token_accuracy": 0.7867192178964615, + "num_tokens": 9089043.0, + "step": 558 + }, + { + "entropy": 0.5355315059423447, + "epoch": 2.085981308411215, + "grad_norm": 0.03834950551390648, + "learning_rate": 0.0002, + "loss": 0.5311352014541626, + "mean_token_accuracy": 0.7842549979686737, + "num_tokens": 9105540.0, + "step": 559 + }, + { + "entropy": 0.5209446102380753, + "epoch": 2.0897196261682245, + "grad_norm": 0.03739333152770996, + "learning_rate": 0.0002, + "loss": 0.5185686945915222, + "mean_token_accuracy": 0.7894220799207687, + "num_tokens": 9121845.0, + "step": 560 + }, + { + "entropy": 0.521641194820404, + "epoch": 2.0934579439252334, + "grad_norm": 0.04100764915347099, + "learning_rate": 0.0002, + "loss": 0.5293187499046326, + "mean_token_accuracy": 0.7825994938611984, + "num_tokens": 9138058.0, + "step": 561 + }, + { + "entropy": 0.5365670919418335, + "epoch": 2.097196261682243, + "grad_norm": 0.036154504865407944, + "learning_rate": 0.0002, + "loss": 0.5414365530014038, + "mean_token_accuracy": 0.780719056725502, + "num_tokens": 9154400.0, + "step": 562 + }, + { + "entropy": 0.5273338481783867, + "epoch": 2.100934579439252, + "grad_norm": 0.04028065875172615, + "learning_rate": 0.0002, + "loss": 0.529691755771637, + "mean_token_accuracy": 0.7834713310003281, + "num_tokens": 9170963.0, + "step": 563 + }, + { + "entropy": 0.5385439097881317, + "epoch": 2.1046728971962616, + "grad_norm": 0.03361035883426666, + "learning_rate": 0.0002, + "loss": 0.5270369052886963, + "mean_token_accuracy": 0.7854906022548676, + "num_tokens": 9187292.0, + "step": 564 + }, + { + "entropy": 0.5487733483314514, + "epoch": 2.108411214953271, + "grad_norm": 0.03783544525504112, + "learning_rate": 0.0002, + "loss": 0.5383955240249634, + "mean_token_accuracy": 0.7841213643550873, + "num_tokens": 9203564.0, + "step": 565 + }, + { + "entropy": 0.5291211605072021, + "epoch": 2.1121495327102804, + "grad_norm": 0.037037670612335205, + "learning_rate": 0.0002, + "loss": 0.5292587280273438, + "mean_token_accuracy": 0.7856062203645706, + "num_tokens": 9219726.0, + "step": 566 + }, + { + "entropy": 0.5350653678178787, + "epoch": 2.1158878504672898, + "grad_norm": 0.045786142349243164, + "learning_rate": 0.0002, + "loss": 0.5461608171463013, + "mean_token_accuracy": 0.7790984213352203, + "num_tokens": 9235906.0, + "step": 567 + }, + { + "entropy": 0.5272955000400543, + "epoch": 2.119626168224299, + "grad_norm": 0.03238094225525856, + "learning_rate": 0.0002, + "loss": 0.5275669097900391, + "mean_token_accuracy": 0.7858364135026932, + "num_tokens": 9252138.0, + "step": 568 + }, + { + "entropy": 0.5282166749238968, + "epoch": 2.1233644859813086, + "grad_norm": 0.040294334292411804, + "learning_rate": 0.0002, + "loss": 0.5235229730606079, + "mean_token_accuracy": 0.7853135764598846, + "num_tokens": 9268479.0, + "step": 569 + }, + { + "entropy": 0.5460007339715958, + "epoch": 2.127102803738318, + "grad_norm": 0.033559415489435196, + "learning_rate": 0.0002, + "loss": 0.543584406375885, + "mean_token_accuracy": 0.780407503247261, + "num_tokens": 9284881.0, + "step": 570 + }, + { + "entropy": 0.5487277060747147, + "epoch": 2.130841121495327, + "grad_norm": 0.04158215597271919, + "learning_rate": 0.0002, + "loss": 0.5564124584197998, + "mean_token_accuracy": 0.7759140133857727, + "num_tokens": 9301347.0, + "step": 571 + }, + { + "entropy": 0.5200594365596771, + "epoch": 2.1345794392523363, + "grad_norm": 0.03448987007141113, + "learning_rate": 0.0002, + "loss": 0.5191783905029297, + "mean_token_accuracy": 0.7882550060749054, + "num_tokens": 9317467.0, + "step": 572 + }, + { + "entropy": 0.5137363150715828, + "epoch": 2.1383177570093457, + "grad_norm": 0.04049127548933029, + "learning_rate": 0.0002, + "loss": 0.5194275975227356, + "mean_token_accuracy": 0.7879694402217865, + "num_tokens": 9333343.0, + "step": 573 + }, + { + "entropy": 0.5200390294194221, + "epoch": 2.142056074766355, + "grad_norm": 0.036729056388139725, + "learning_rate": 0.0002, + "loss": 0.5214616060256958, + "mean_token_accuracy": 0.788168340921402, + "num_tokens": 9349842.0, + "step": 574 + }, + { + "entropy": 0.5292311161756516, + "epoch": 2.1457943925233645, + "grad_norm": 0.0415414534509182, + "learning_rate": 0.0002, + "loss": 0.5297327041625977, + "mean_token_accuracy": 0.7849121540784836, + "num_tokens": 9366012.0, + "step": 575 + }, + { + "entropy": 0.5386081859469414, + "epoch": 2.149532710280374, + "grad_norm": 0.03256456181406975, + "learning_rate": 0.0002, + "loss": 0.5289144515991211, + "mean_token_accuracy": 0.7840749174356461, + "num_tokens": 9382392.0, + "step": 576 + }, + { + "entropy": 0.5395008027553558, + "epoch": 2.1532710280373832, + "grad_norm": 0.04578758776187897, + "learning_rate": 0.0002, + "loss": 0.5399198532104492, + "mean_token_accuracy": 0.7795116752386093, + "num_tokens": 9398608.0, + "step": 577 + }, + { + "entropy": 0.5388733297586441, + "epoch": 2.1570093457943926, + "grad_norm": 0.04348094016313553, + "learning_rate": 0.0002, + "loss": 0.5477968454360962, + "mean_token_accuracy": 0.7775707393884659, + "num_tokens": 9414864.0, + "step": 578 + }, + { + "entropy": 0.5495359748601913, + "epoch": 2.160747663551402, + "grad_norm": 0.04143773764371872, + "learning_rate": 0.0002, + "loss": 0.5492925643920898, + "mean_token_accuracy": 0.7751609981060028, + "num_tokens": 9431018.0, + "step": 579 + }, + { + "entropy": 0.5342540368437767, + "epoch": 2.1644859813084114, + "grad_norm": 0.0339248850941658, + "learning_rate": 0.0002, + "loss": 0.5319566130638123, + "mean_token_accuracy": 0.7832492738962173, + "num_tokens": 9447324.0, + "step": 580 + }, + { + "entropy": 0.5374208092689514, + "epoch": 2.1682242990654204, + "grad_norm": 0.04187169671058655, + "learning_rate": 0.0002, + "loss": 0.532558023929596, + "mean_token_accuracy": 0.7825674563646317, + "num_tokens": 9463630.0, + "step": 581 + }, + { + "entropy": 0.5422970801591873, + "epoch": 2.1719626168224297, + "grad_norm": 0.035590704530477524, + "learning_rate": 0.0002, + "loss": 0.5381810665130615, + "mean_token_accuracy": 0.7824431657791138, + "num_tokens": 9480262.0, + "step": 582 + }, + { + "entropy": 0.5083005726337433, + "epoch": 2.175700934579439, + "grad_norm": 0.04746020957827568, + "learning_rate": 0.0002, + "loss": 0.5185438990592957, + "mean_token_accuracy": 0.7896369099617004, + "num_tokens": 9496413.0, + "step": 583 + }, + { + "entropy": 0.538268506526947, + "epoch": 2.1794392523364485, + "grad_norm": 0.036502279341220856, + "learning_rate": 0.0002, + "loss": 0.5346280336380005, + "mean_token_accuracy": 0.7869013249874115, + "num_tokens": 9512790.0, + "step": 584 + }, + { + "entropy": 0.5377851128578186, + "epoch": 2.183177570093458, + "grad_norm": 0.04309968277812004, + "learning_rate": 0.0002, + "loss": 0.5360051989555359, + "mean_token_accuracy": 0.7825826555490494, + "num_tokens": 9529078.0, + "step": 585 + }, + { + "entropy": 0.5475880205631256, + "epoch": 2.1869158878504673, + "grad_norm": 0.035980913788080215, + "learning_rate": 0.0002, + "loss": 0.5418691635131836, + "mean_token_accuracy": 0.7812779098749161, + "num_tokens": 9545620.0, + "step": 586 + }, + { + "entropy": 0.5415029525756836, + "epoch": 2.1906542056074767, + "grad_norm": 0.03402319550514221, + "learning_rate": 0.0002, + "loss": 0.5380699038505554, + "mean_token_accuracy": 0.7777233719825745, + "num_tokens": 9561847.0, + "step": 587 + }, + { + "entropy": 0.5402775406837463, + "epoch": 2.194392523364486, + "grad_norm": 0.03904155641794205, + "learning_rate": 0.0002, + "loss": 0.5427509546279907, + "mean_token_accuracy": 0.7772547751665115, + "num_tokens": 9578299.0, + "step": 588 + }, + { + "entropy": 0.5400314331054688, + "epoch": 2.1981308411214955, + "grad_norm": 0.040362391620874405, + "learning_rate": 0.0002, + "loss": 0.5459818840026855, + "mean_token_accuracy": 0.7782857865095139, + "num_tokens": 9594690.0, + "step": 589 + }, + { + "entropy": 0.5383201092481613, + "epoch": 2.201869158878505, + "grad_norm": 0.03448455408215523, + "learning_rate": 0.0002, + "loss": 0.5378321409225464, + "mean_token_accuracy": 0.7824592739343643, + "num_tokens": 9611075.0, + "step": 590 + }, + { + "entropy": 0.5350049138069153, + "epoch": 2.205607476635514, + "grad_norm": 0.03910663723945618, + "learning_rate": 0.0002, + "loss": 0.5314739346504211, + "mean_token_accuracy": 0.7833352535963058, + "num_tokens": 9627390.0, + "step": 591 + }, + { + "entropy": 0.5416655540466309, + "epoch": 2.209345794392523, + "grad_norm": 0.035276249051094055, + "learning_rate": 0.0002, + "loss": 0.5419362187385559, + "mean_token_accuracy": 0.7800017446279526, + "num_tokens": 9643588.0, + "step": 592 + }, + { + "entropy": 0.5288861393928528, + "epoch": 2.2130841121495326, + "grad_norm": 0.03930996358394623, + "learning_rate": 0.0002, + "loss": 0.5276063680648804, + "mean_token_accuracy": 0.7874087691307068, + "num_tokens": 9660028.0, + "step": 593 + }, + { + "entropy": 0.5260862559080124, + "epoch": 2.216822429906542, + "grad_norm": 0.0427854023873806, + "learning_rate": 0.0002, + "loss": 0.5307914614677429, + "mean_token_accuracy": 0.7824005782604218, + "num_tokens": 9676398.0, + "step": 594 + }, + { + "entropy": 0.5367477387189865, + "epoch": 2.2205607476635514, + "grad_norm": 0.04066091775894165, + "learning_rate": 0.0002, + "loss": 0.5374104380607605, + "mean_token_accuracy": 0.7815362513065338, + "num_tokens": 9692902.0, + "step": 595 + }, + { + "entropy": 0.5327900499105453, + "epoch": 2.2242990654205608, + "grad_norm": 0.03865550830960274, + "learning_rate": 0.0002, + "loss": 0.5303129553794861, + "mean_token_accuracy": 0.7837819904088974, + "num_tokens": 9709300.0, + "step": 596 + }, + { + "entropy": 0.5307036936283112, + "epoch": 2.22803738317757, + "grad_norm": 0.03896172344684601, + "learning_rate": 0.0002, + "loss": 0.5255040526390076, + "mean_token_accuracy": 0.7894146144390106, + "num_tokens": 9725629.0, + "step": 597 + }, + { + "entropy": 0.555603951215744, + "epoch": 2.2317757009345796, + "grad_norm": 0.042428482323884964, + "learning_rate": 0.0002, + "loss": 0.5537799000740051, + "mean_token_accuracy": 0.7759463936090469, + "num_tokens": 9742301.0, + "step": 598 + }, + { + "entropy": 0.5576256364583969, + "epoch": 2.235514018691589, + "grad_norm": 0.0402006059885025, + "learning_rate": 0.0002, + "loss": 0.554506242275238, + "mean_token_accuracy": 0.7767222672700882, + "num_tokens": 9758260.0, + "step": 599 + }, + { + "entropy": 0.5121647417545319, + "epoch": 2.2392523364485983, + "grad_norm": 0.043138571083545685, + "learning_rate": 0.0002, + "loss": 0.5151476860046387, + "mean_token_accuracy": 0.790720209479332, + "num_tokens": 9774531.0, + "step": 600 + }, + { + "entropy": 0.5213710218667984, + "epoch": 2.2429906542056073, + "grad_norm": 0.04871455207467079, + "learning_rate": 0.0002, + "loss": 0.5321851968765259, + "mean_token_accuracy": 0.785635232925415, + "num_tokens": 9790856.0, + "step": 601 + }, + { + "entropy": 0.5272484123706818, + "epoch": 2.2467289719626167, + "grad_norm": 0.040508877485990524, + "learning_rate": 0.0002, + "loss": 0.5326333045959473, + "mean_token_accuracy": 0.7852048426866531, + "num_tokens": 9807182.0, + "step": 602 + }, + { + "entropy": 0.5392047762870789, + "epoch": 2.250467289719626, + "grad_norm": 0.041053518652915955, + "learning_rate": 0.0002, + "loss": 0.5345852375030518, + "mean_token_accuracy": 0.7831785529851913, + "num_tokens": 9823474.0, + "step": 603 + }, + { + "entropy": 0.5716013610363007, + "epoch": 2.2542056074766355, + "grad_norm": 0.03574617952108383, + "learning_rate": 0.0002, + "loss": 0.5612589120864868, + "mean_token_accuracy": 0.7706786692142487, + "num_tokens": 9840032.0, + "step": 604 + }, + { + "entropy": 0.5285372212529182, + "epoch": 2.257943925233645, + "grad_norm": 0.03970465064048767, + "learning_rate": 0.0002, + "loss": 0.5292190909385681, + "mean_token_accuracy": 0.7823670506477356, + "num_tokens": 9856156.0, + "step": 605 + }, + { + "entropy": 0.5217142850160599, + "epoch": 2.2616822429906542, + "grad_norm": 0.04674631357192993, + "learning_rate": 0.0002, + "loss": 0.5359308123588562, + "mean_token_accuracy": 0.7826663255691528, + "num_tokens": 9872455.0, + "step": 606 + }, + { + "entropy": 0.5451616048812866, + "epoch": 2.2654205607476636, + "grad_norm": 0.03408370912075043, + "learning_rate": 0.0002, + "loss": 0.5420879125595093, + "mean_token_accuracy": 0.7793671786785126, + "num_tokens": 9888926.0, + "step": 607 + }, + { + "entropy": 0.5499323606491089, + "epoch": 2.269158878504673, + "grad_norm": 0.042992595583200455, + "learning_rate": 0.0002, + "loss": 0.5445036292076111, + "mean_token_accuracy": 0.7789521962404251, + "num_tokens": 9905440.0, + "step": 608 + }, + { + "entropy": 0.5252562239766121, + "epoch": 2.2728971962616824, + "grad_norm": 0.035366203635931015, + "learning_rate": 0.0002, + "loss": 0.5255159139633179, + "mean_token_accuracy": 0.7839477211236954, + "num_tokens": 9921478.0, + "step": 609 + }, + { + "entropy": 0.544396385550499, + "epoch": 2.2766355140186914, + "grad_norm": 0.03851408511400223, + "learning_rate": 0.0002, + "loss": 0.5386646389961243, + "mean_token_accuracy": 0.783714771270752, + "num_tokens": 9937700.0, + "step": 610 + }, + { + "entropy": 0.5345716625452042, + "epoch": 2.2803738317757007, + "grad_norm": 0.03978222236037254, + "learning_rate": 0.0002, + "loss": 0.5340750813484192, + "mean_token_accuracy": 0.7818202525377274, + "num_tokens": 9953970.0, + "step": 611 + }, + { + "entropy": 0.5345568954944611, + "epoch": 2.28411214953271, + "grad_norm": 0.0438290499150753, + "learning_rate": 0.0002, + "loss": 0.5399565100669861, + "mean_token_accuracy": 0.7790825515985489, + "num_tokens": 9970196.0, + "step": 612 + }, + { + "entropy": 0.5261162966489792, + "epoch": 2.2878504672897195, + "grad_norm": 0.0412151962518692, + "learning_rate": 0.0002, + "loss": 0.5271809101104736, + "mean_token_accuracy": 0.7853071689605713, + "num_tokens": 9986604.0, + "step": 613 + }, + { + "entropy": 0.5332745313644409, + "epoch": 2.291588785046729, + "grad_norm": 0.03914888948202133, + "learning_rate": 0.0002, + "loss": 0.5373774766921997, + "mean_token_accuracy": 0.7806718051433563, + "num_tokens": 10003320.0, + "step": 614 + }, + { + "entropy": 0.5465565472841263, + "epoch": 2.2953271028037383, + "grad_norm": 0.036169324070215225, + "learning_rate": 0.0002, + "loss": 0.5407333970069885, + "mean_token_accuracy": 0.7835270166397095, + "num_tokens": 10019823.0, + "step": 615 + }, + { + "entropy": 0.5345954746007919, + "epoch": 2.2990654205607477, + "grad_norm": 0.0409700982272625, + "learning_rate": 0.0002, + "loss": 0.5321113467216492, + "mean_token_accuracy": 0.7842907607555389, + "num_tokens": 10036092.0, + "step": 616 + }, + { + "entropy": 0.5495668053627014, + "epoch": 2.302803738317757, + "grad_norm": 0.03986911475658417, + "learning_rate": 0.0002, + "loss": 0.5555334687232971, + "mean_token_accuracy": 0.7753788381814957, + "num_tokens": 10052693.0, + "step": 617 + }, + { + "entropy": 0.5311184674501419, + "epoch": 2.3065420560747665, + "grad_norm": 0.04703551530838013, + "learning_rate": 0.0002, + "loss": 0.5360814929008484, + "mean_token_accuracy": 0.7818758338689804, + "num_tokens": 10069207.0, + "step": 618 + }, + { + "entropy": 0.5312536582350731, + "epoch": 2.310280373831776, + "grad_norm": 0.042258135974407196, + "learning_rate": 0.0002, + "loss": 0.5263288021087646, + "mean_token_accuracy": 0.7859921306371689, + "num_tokens": 10085277.0, + "step": 619 + }, + { + "entropy": 0.5470355451107025, + "epoch": 2.3140186915887853, + "grad_norm": 0.03973059356212616, + "learning_rate": 0.0002, + "loss": 0.545122504234314, + "mean_token_accuracy": 0.7791093438863754, + "num_tokens": 10101813.0, + "step": 620 + }, + { + "entropy": 0.5233904868364334, + "epoch": 2.317757009345794, + "grad_norm": 0.042754027992486954, + "learning_rate": 0.0002, + "loss": 0.5272444486618042, + "mean_token_accuracy": 0.7844137996435165, + "num_tokens": 10118006.0, + "step": 621 + }, + { + "entropy": 0.5061093345284462, + "epoch": 2.3214953271028036, + "grad_norm": 0.038454048335552216, + "learning_rate": 0.0002, + "loss": 0.5092731714248657, + "mean_token_accuracy": 0.7937048375606537, + "num_tokens": 10134010.0, + "step": 622 + }, + { + "entropy": 0.5244115591049194, + "epoch": 2.325233644859813, + "grad_norm": 0.04292070120573044, + "learning_rate": 0.0002, + "loss": 0.531722366809845, + "mean_token_accuracy": 0.7844948768615723, + "num_tokens": 10150394.0, + "step": 623 + }, + { + "entropy": 0.5242590308189392, + "epoch": 2.3289719626168224, + "grad_norm": 0.041382841765880585, + "learning_rate": 0.0002, + "loss": 0.5197622776031494, + "mean_token_accuracy": 0.7893139272928238, + "num_tokens": 10166971.0, + "step": 624 + }, + { + "entropy": 0.5171145796775818, + "epoch": 2.3327102803738318, + "grad_norm": 0.04326708987355232, + "learning_rate": 0.0002, + "loss": 0.5186761021614075, + "mean_token_accuracy": 0.7864463478326797, + "num_tokens": 10183110.0, + "step": 625 + }, + { + "entropy": 0.537827268242836, + "epoch": 2.336448598130841, + "grad_norm": 0.03789723291993141, + "learning_rate": 0.0002, + "loss": 0.533248245716095, + "mean_token_accuracy": 0.7839608788490295, + "num_tokens": 10199350.0, + "step": 626 + }, + { + "entropy": 0.5234524011611938, + "epoch": 2.3401869158878505, + "grad_norm": 0.04052973911166191, + "learning_rate": 0.0002, + "loss": 0.5173730254173279, + "mean_token_accuracy": 0.7916721105575562, + "num_tokens": 10215685.0, + "step": 627 + }, + { + "entropy": 0.5470897704362869, + "epoch": 2.34392523364486, + "grad_norm": 0.03785538300871849, + "learning_rate": 0.0002, + "loss": 0.5449895262718201, + "mean_token_accuracy": 0.7797056883573532, + "num_tokens": 10231984.0, + "step": 628 + }, + { + "entropy": 0.5209166556596756, + "epoch": 2.3476635514018693, + "grad_norm": 0.04008970037102699, + "learning_rate": 0.0002, + "loss": 0.5221812129020691, + "mean_token_accuracy": 0.7847004383802414, + "num_tokens": 10248210.0, + "step": 629 + }, + { + "entropy": 0.5186321288347244, + "epoch": 2.3514018691588783, + "grad_norm": 0.04004783183336258, + "learning_rate": 0.0002, + "loss": 0.5238394141197205, + "mean_token_accuracy": 0.7874705046415329, + "num_tokens": 10264273.0, + "step": 630 + }, + { + "entropy": 0.5370597541332245, + "epoch": 2.3551401869158877, + "grad_norm": 0.04393518716096878, + "learning_rate": 0.0002, + "loss": 0.5417079925537109, + "mean_token_accuracy": 0.7808088809251785, + "num_tokens": 10280306.0, + "step": 631 + }, + { + "entropy": 0.5319319814443588, + "epoch": 2.358878504672897, + "grad_norm": 0.041104018688201904, + "learning_rate": 0.0002, + "loss": 0.5323211550712585, + "mean_token_accuracy": 0.7829127311706543, + "num_tokens": 10296761.0, + "step": 632 + }, + { + "entropy": 0.5280259251594543, + "epoch": 2.3626168224299064, + "grad_norm": 0.04328769072890282, + "learning_rate": 0.0002, + "loss": 0.5300282835960388, + "mean_token_accuracy": 0.7860163003206253, + "num_tokens": 10312760.0, + "step": 633 + }, + { + "entropy": 0.5409206449985504, + "epoch": 2.366355140186916, + "grad_norm": 0.03652770817279816, + "learning_rate": 0.0002, + "loss": 0.5389151573181152, + "mean_token_accuracy": 0.7825797498226166, + "num_tokens": 10328967.0, + "step": 634 + }, + { + "entropy": 0.5524943023920059, + "epoch": 2.3700934579439252, + "grad_norm": 0.03295084461569786, + "learning_rate": 0.0002, + "loss": 0.5457990169525146, + "mean_token_accuracy": 0.7791374921798706, + "num_tokens": 10345465.0, + "step": 635 + }, + { + "entropy": 0.5282382369041443, + "epoch": 2.3738317757009346, + "grad_norm": 0.04183012619614601, + "learning_rate": 0.0002, + "loss": 0.5293847322463989, + "mean_token_accuracy": 0.7866223007440567, + "num_tokens": 10361693.0, + "step": 636 + }, + { + "entropy": 0.5491131991147995, + "epoch": 2.377570093457944, + "grad_norm": 0.04285868629813194, + "learning_rate": 0.0002, + "loss": 0.5491060614585876, + "mean_token_accuracy": 0.7811785042285919, + "num_tokens": 10377893.0, + "step": 637 + }, + { + "entropy": 0.5452415496110916, + "epoch": 2.3813084112149534, + "grad_norm": 0.03759608045220375, + "learning_rate": 0.0002, + "loss": 0.542472779750824, + "mean_token_accuracy": 0.7795874774456024, + "num_tokens": 10394225.0, + "step": 638 + }, + { + "entropy": 0.5412954837083817, + "epoch": 2.385046728971963, + "grad_norm": 0.04048043116927147, + "learning_rate": 0.0002, + "loss": 0.5419274568557739, + "mean_token_accuracy": 0.7792576551437378, + "num_tokens": 10410592.0, + "step": 639 + }, + { + "entropy": 0.5429834425449371, + "epoch": 2.388785046728972, + "grad_norm": 0.03450307622551918, + "learning_rate": 0.0002, + "loss": 0.5450208187103271, + "mean_token_accuracy": 0.7783068269491196, + "num_tokens": 10427270.0, + "step": 640 + }, + { + "entropy": 0.5542936772108078, + "epoch": 2.392523364485981, + "grad_norm": 0.038160763680934906, + "learning_rate": 0.0002, + "loss": 0.5524436831474304, + "mean_token_accuracy": 0.7759971767663956, + "num_tokens": 10443620.0, + "step": 641 + }, + { + "entropy": 0.5481744855642319, + "epoch": 2.3962616822429905, + "grad_norm": 0.045667361468076706, + "learning_rate": 0.0002, + "loss": 0.5538838505744934, + "mean_token_accuracy": 0.7768992632627487, + "num_tokens": 10459970.0, + "step": 642 + }, + { + "entropy": 0.5398150980472565, + "epoch": 2.4, + "grad_norm": 0.03911864385008812, + "learning_rate": 0.0002, + "loss": 0.5386427044868469, + "mean_token_accuracy": 0.7801983505487442, + "num_tokens": 10476045.0, + "step": 643 + }, + { + "entropy": 0.539577066898346, + "epoch": 2.4037383177570093, + "grad_norm": 0.038023967295885086, + "learning_rate": 0.0002, + "loss": 0.5344752669334412, + "mean_token_accuracy": 0.7824069559574127, + "num_tokens": 10492232.0, + "step": 644 + }, + { + "entropy": 0.5577429980039597, + "epoch": 2.4074766355140187, + "grad_norm": 0.04197937622666359, + "learning_rate": 0.0002, + "loss": 0.5572798252105713, + "mean_token_accuracy": 0.7743511646986008, + "num_tokens": 10508664.0, + "step": 645 + }, + { + "entropy": 0.5277721136808395, + "epoch": 2.411214953271028, + "grad_norm": 0.04219021648168564, + "learning_rate": 0.0002, + "loss": 0.5340980887413025, + "mean_token_accuracy": 0.784222811460495, + "num_tokens": 10524893.0, + "step": 646 + }, + { + "entropy": 0.5434385687112808, + "epoch": 2.4149532710280375, + "grad_norm": 0.04173292592167854, + "learning_rate": 0.0002, + "loss": 0.5448347330093384, + "mean_token_accuracy": 0.7814051806926727, + "num_tokens": 10541450.0, + "step": 647 + }, + { + "entropy": 0.542335107922554, + "epoch": 2.418691588785047, + "grad_norm": 0.04221229627728462, + "learning_rate": 0.0002, + "loss": 0.539567232131958, + "mean_token_accuracy": 0.778713047504425, + "num_tokens": 10557626.0, + "step": 648 + }, + { + "entropy": 0.5323275178670883, + "epoch": 2.4224299065420563, + "grad_norm": 0.04274986311793327, + "learning_rate": 0.0002, + "loss": 0.5353638529777527, + "mean_token_accuracy": 0.7840659618377686, + "num_tokens": 10573896.0, + "step": 649 + }, + { + "entropy": 0.53394415974617, + "epoch": 2.426168224299065, + "grad_norm": 0.03369283676147461, + "learning_rate": 0.0002, + "loss": 0.5374135971069336, + "mean_token_accuracy": 0.7838425934314728, + "num_tokens": 10590327.0, + "step": 650 + }, + { + "entropy": 0.5415229946374893, + "epoch": 2.4299065420560746, + "grad_norm": 0.041201673448085785, + "learning_rate": 0.0002, + "loss": 0.5444363355636597, + "mean_token_accuracy": 0.7790240347385406, + "num_tokens": 10606679.0, + "step": 651 + }, + { + "entropy": 0.5405219197273254, + "epoch": 2.433644859813084, + "grad_norm": 0.03498642519116402, + "learning_rate": 0.0002, + "loss": 0.5373417139053345, + "mean_token_accuracy": 0.780651792883873, + "num_tokens": 10622823.0, + "step": 652 + }, + { + "entropy": 0.552605539560318, + "epoch": 2.4373831775700934, + "grad_norm": 0.035641010850667953, + "learning_rate": 0.0002, + "loss": 0.5453448295593262, + "mean_token_accuracy": 0.778374508023262, + "num_tokens": 10639286.0, + "step": 653 + }, + { + "entropy": 0.5386696010828018, + "epoch": 2.4411214953271028, + "grad_norm": 0.03597128391265869, + "learning_rate": 0.0002, + "loss": 0.5370036959648132, + "mean_token_accuracy": 0.7805056869983673, + "num_tokens": 10655659.0, + "step": 654 + }, + { + "entropy": 0.5298606008291245, + "epoch": 2.444859813084112, + "grad_norm": 0.044264063239097595, + "learning_rate": 0.0002, + "loss": 0.5362837314605713, + "mean_token_accuracy": 0.783015176653862, + "num_tokens": 10672206.0, + "step": 655 + }, + { + "entropy": 0.5373793393373489, + "epoch": 2.4485981308411215, + "grad_norm": 0.03741737827658653, + "learning_rate": 0.0002, + "loss": 0.5384023189544678, + "mean_token_accuracy": 0.7820392400026321, + "num_tokens": 10688480.0, + "step": 656 + }, + { + "entropy": 0.5373315960168839, + "epoch": 2.452336448598131, + "grad_norm": 0.03572740778326988, + "learning_rate": 0.0002, + "loss": 0.5341760516166687, + "mean_token_accuracy": 0.7831654995679855, + "num_tokens": 10704724.0, + "step": 657 + }, + { + "entropy": 0.5349675416946411, + "epoch": 2.4560747663551403, + "grad_norm": 0.033913351595401764, + "learning_rate": 0.0002, + "loss": 0.5286991000175476, + "mean_token_accuracy": 0.7848687022924423, + "num_tokens": 10720999.0, + "step": 658 + }, + { + "entropy": 0.5428586453199387, + "epoch": 2.4598130841121497, + "grad_norm": 0.03939831256866455, + "learning_rate": 0.0002, + "loss": 0.5347580313682556, + "mean_token_accuracy": 0.7854909747838974, + "num_tokens": 10737393.0, + "step": 659 + }, + { + "entropy": 0.5376365929841995, + "epoch": 2.463551401869159, + "grad_norm": 0.040565796196460724, + "learning_rate": 0.0002, + "loss": 0.5377070903778076, + "mean_token_accuracy": 0.7797142714262009, + "num_tokens": 10753719.0, + "step": 660 + }, + { + "entropy": 0.5290745496749878, + "epoch": 2.467289719626168, + "grad_norm": 0.042844053357839584, + "learning_rate": 0.0002, + "loss": 0.5402673482894897, + "mean_token_accuracy": 0.7787514328956604, + "num_tokens": 10770023.0, + "step": 661 + }, + { + "entropy": 0.5171011686325073, + "epoch": 2.4710280373831774, + "grad_norm": 0.03879300504922867, + "learning_rate": 0.0002, + "loss": 0.5276088714599609, + "mean_token_accuracy": 0.7832213789224625, + "num_tokens": 10786225.0, + "step": 662 + }, + { + "entropy": 0.5425639152526855, + "epoch": 2.474766355140187, + "grad_norm": 0.039429496973752975, + "learning_rate": 0.0002, + "loss": 0.5366618633270264, + "mean_token_accuracy": 0.7817561626434326, + "num_tokens": 10802664.0, + "step": 663 + }, + { + "entropy": 0.5487925857305527, + "epoch": 2.4785046728971962, + "grad_norm": 0.037153951823711395, + "learning_rate": 0.0002, + "loss": 0.5378848314285278, + "mean_token_accuracy": 0.7842334508895874, + "num_tokens": 10818791.0, + "step": 664 + }, + { + "entropy": 0.5683897584676743, + "epoch": 2.4822429906542056, + "grad_norm": 0.037182942032814026, + "learning_rate": 0.0002, + "loss": 0.558560848236084, + "mean_token_accuracy": 0.7739170640707016, + "num_tokens": 10835365.0, + "step": 665 + }, + { + "entropy": 0.522308886051178, + "epoch": 2.485981308411215, + "grad_norm": 0.038330383598804474, + "learning_rate": 0.0002, + "loss": 0.5248692035675049, + "mean_token_accuracy": 0.786837100982666, + "num_tokens": 10851372.0, + "step": 666 + }, + { + "entropy": 0.5347889512777328, + "epoch": 2.4897196261682244, + "grad_norm": 0.0479077473282814, + "learning_rate": 0.0002, + "loss": 0.5449220538139343, + "mean_token_accuracy": 0.7782158553600311, + "num_tokens": 10867792.0, + "step": 667 + }, + { + "entropy": 0.5346285253763199, + "epoch": 2.493457943925234, + "grad_norm": 0.03586270660161972, + "learning_rate": 0.0002, + "loss": 0.5363122224807739, + "mean_token_accuracy": 0.7790896743535995, + "num_tokens": 10883867.0, + "step": 668 + }, + { + "entropy": 0.543928936123848, + "epoch": 2.497196261682243, + "grad_norm": 0.034230347722768784, + "learning_rate": 0.0002, + "loss": 0.5394496321678162, + "mean_token_accuracy": 0.7808743715286255, + "num_tokens": 10900252.0, + "step": 669 + }, + { + "entropy": 0.5318520069122314, + "epoch": 2.500934579439252, + "grad_norm": 0.03817044571042061, + "learning_rate": 0.0002, + "loss": 0.5254085063934326, + "mean_token_accuracy": 0.7874796390533447, + "num_tokens": 10916581.0, + "step": 670 + }, + { + "entropy": 0.5420865267515182, + "epoch": 2.5046728971962615, + "grad_norm": 0.040148667991161346, + "learning_rate": 0.0002, + "loss": 0.5419960021972656, + "mean_token_accuracy": 0.7817145586013794, + "num_tokens": 10932966.0, + "step": 671 + }, + { + "entropy": 0.5323408842086792, + "epoch": 2.508411214953271, + "grad_norm": 0.03530094772577286, + "learning_rate": 0.0002, + "loss": 0.5375533103942871, + "mean_token_accuracy": 0.7799390703439713, + "num_tokens": 10949355.0, + "step": 672 + }, + { + "entropy": 0.5461747795343399, + "epoch": 2.5121495327102803, + "grad_norm": 0.044817496091127396, + "learning_rate": 0.0002, + "loss": 0.5525781512260437, + "mean_token_accuracy": 0.7765567153692245, + "num_tokens": 10965822.0, + "step": 673 + }, + { + "entropy": 0.5261489972472191, + "epoch": 2.5158878504672897, + "grad_norm": 0.03769567608833313, + "learning_rate": 0.0002, + "loss": 0.5308763980865479, + "mean_token_accuracy": 0.7855723053216934, + "num_tokens": 10982147.0, + "step": 674 + }, + { + "entropy": 0.5603571683168411, + "epoch": 2.519626168224299, + "grad_norm": 0.03521028161048889, + "learning_rate": 0.0002, + "loss": 0.5556987524032593, + "mean_token_accuracy": 0.7740853279829025, + "num_tokens": 10998452.0, + "step": 675 + }, + { + "entropy": 0.5460142344236374, + "epoch": 2.5233644859813085, + "grad_norm": 0.04217759519815445, + "learning_rate": 0.0002, + "loss": 0.537194013595581, + "mean_token_accuracy": 0.7822094410657883, + "num_tokens": 11014939.0, + "step": 676 + }, + { + "entropy": 0.5280861929059029, + "epoch": 2.527102803738318, + "grad_norm": 0.0387798547744751, + "learning_rate": 0.0002, + "loss": 0.5307282209396362, + "mean_token_accuracy": 0.7829688042402267, + "num_tokens": 11031382.0, + "step": 677 + }, + { + "entropy": 0.540501594543457, + "epoch": 2.5308411214953273, + "grad_norm": 0.04039468243718147, + "learning_rate": 0.0002, + "loss": 0.5451690554618835, + "mean_token_accuracy": 0.7794826477766037, + "num_tokens": 11047827.0, + "step": 678 + }, + { + "entropy": 0.5370010584592819, + "epoch": 2.5345794392523366, + "grad_norm": 0.04067126661539078, + "learning_rate": 0.0002, + "loss": 0.5422821044921875, + "mean_token_accuracy": 0.7765499353408813, + "num_tokens": 11063880.0, + "step": 679 + }, + { + "entropy": 0.5253852158784866, + "epoch": 2.538317757009346, + "grad_norm": 0.0398363396525383, + "learning_rate": 0.0002, + "loss": 0.5209527611732483, + "mean_token_accuracy": 0.7851253598928452, + "num_tokens": 11079962.0, + "step": 680 + }, + { + "entropy": 0.5486203581094742, + "epoch": 2.542056074766355, + "grad_norm": 0.036812763661146164, + "learning_rate": 0.0002, + "loss": 0.5471652746200562, + "mean_token_accuracy": 0.7765830755233765, + "num_tokens": 11096121.0, + "step": 681 + }, + { + "entropy": 0.5477603673934937, + "epoch": 2.5457943925233644, + "grad_norm": 0.036881882697343826, + "learning_rate": 0.0002, + "loss": 0.5459142327308655, + "mean_token_accuracy": 0.7773014456033707, + "num_tokens": 11112554.0, + "step": 682 + }, + { + "entropy": 0.5485733300447464, + "epoch": 2.5495327102803738, + "grad_norm": 0.038766611367464066, + "learning_rate": 0.0002, + "loss": 0.5489373803138733, + "mean_token_accuracy": 0.7768855541944504, + "num_tokens": 11128948.0, + "step": 683 + }, + { + "entropy": 0.5454789996147156, + "epoch": 2.553271028037383, + "grad_norm": 0.03894040733575821, + "learning_rate": 0.0002, + "loss": 0.5461348295211792, + "mean_token_accuracy": 0.7770956754684448, + "num_tokens": 11145269.0, + "step": 684 + }, + { + "entropy": 0.5280539393424988, + "epoch": 2.5570093457943925, + "grad_norm": 0.03855814412236214, + "learning_rate": 0.0002, + "loss": 0.5288204550743103, + "mean_token_accuracy": 0.7839067131280899, + "num_tokens": 11161868.0, + "step": 685 + }, + { + "entropy": 0.5194002389907837, + "epoch": 2.560747663551402, + "grad_norm": 0.04192323610186577, + "learning_rate": 0.0002, + "loss": 0.5202341079711914, + "mean_token_accuracy": 0.7879376262426376, + "num_tokens": 11178075.0, + "step": 686 + }, + { + "entropy": 0.530809260904789, + "epoch": 2.5644859813084113, + "grad_norm": 0.0389430895447731, + "learning_rate": 0.0002, + "loss": 0.5309376120567322, + "mean_token_accuracy": 0.7828936278820038, + "num_tokens": 11194469.0, + "step": 687 + }, + { + "entropy": 0.5557933151721954, + "epoch": 2.5682242990654207, + "grad_norm": 0.04179178923368454, + "learning_rate": 0.0002, + "loss": 0.5567797422409058, + "mean_token_accuracy": 0.77239590883255, + "num_tokens": 11210862.0, + "step": 688 + }, + { + "entropy": 0.5501526147127151, + "epoch": 2.5719626168224297, + "grad_norm": 0.037524376064538956, + "learning_rate": 0.0002, + "loss": 0.5423179864883423, + "mean_token_accuracy": 0.7853735685348511, + "num_tokens": 11227307.0, + "step": 689 + }, + { + "entropy": 0.5341353267431259, + "epoch": 2.575700934579439, + "grad_norm": 0.03998028114438057, + "learning_rate": 0.0002, + "loss": 0.5339105129241943, + "mean_token_accuracy": 0.7838831543922424, + "num_tokens": 11243496.0, + "step": 690 + }, + { + "entropy": 0.5265656411647797, + "epoch": 2.5794392523364484, + "grad_norm": 0.037673015147447586, + "learning_rate": 0.0002, + "loss": 0.532894492149353, + "mean_token_accuracy": 0.7829709053039551, + "num_tokens": 11259619.0, + "step": 691 + }, + { + "entropy": 0.5275788754224777, + "epoch": 2.583177570093458, + "grad_norm": 0.042666979134082794, + "learning_rate": 0.0002, + "loss": 0.5304786562919617, + "mean_token_accuracy": 0.786617174744606, + "num_tokens": 11275881.0, + "step": 692 + }, + { + "entropy": 0.5386812686920166, + "epoch": 2.586915887850467, + "grad_norm": 0.03469294682145119, + "learning_rate": 0.0002, + "loss": 0.5371942520141602, + "mean_token_accuracy": 0.7833464443683624, + "num_tokens": 11292531.0, + "step": 693 + }, + { + "entropy": 0.5397353172302246, + "epoch": 2.5906542056074766, + "grad_norm": 0.03771025687456131, + "learning_rate": 0.0002, + "loss": 0.5416728854179382, + "mean_token_accuracy": 0.7781106233596802, + "num_tokens": 11308718.0, + "step": 694 + }, + { + "entropy": 0.5285734534263611, + "epoch": 2.594392523364486, + "grad_norm": 0.03823580965399742, + "learning_rate": 0.0002, + "loss": 0.5330255627632141, + "mean_token_accuracy": 0.7819734811782837, + "num_tokens": 11324754.0, + "step": 695 + }, + { + "entropy": 0.5268194079399109, + "epoch": 2.5981308411214954, + "grad_norm": 0.04260648787021637, + "learning_rate": 0.0002, + "loss": 0.5239391326904297, + "mean_token_accuracy": 0.7846912294626236, + "num_tokens": 11340968.0, + "step": 696 + }, + { + "entropy": 0.5480171293020248, + "epoch": 2.601869158878505, + "grad_norm": 0.036302078515291214, + "learning_rate": 0.0002, + "loss": 0.5469081997871399, + "mean_token_accuracy": 0.7779969573020935, + "num_tokens": 11357426.0, + "step": 697 + }, + { + "entropy": 0.5506609529256821, + "epoch": 2.605607476635514, + "grad_norm": 0.03509664908051491, + "learning_rate": 0.0002, + "loss": 0.5489600300788879, + "mean_token_accuracy": 0.7774904668331146, + "num_tokens": 11373998.0, + "step": 698 + }, + { + "entropy": 0.5326560884714127, + "epoch": 2.6093457943925236, + "grad_norm": 0.04218761622905731, + "learning_rate": 0.0002, + "loss": 0.5333682894706726, + "mean_token_accuracy": 0.7844813168048859, + "num_tokens": 11390325.0, + "step": 699 + }, + { + "entropy": 0.5372011959552765, + "epoch": 2.613084112149533, + "grad_norm": 0.033543672412633896, + "learning_rate": 0.0002, + "loss": 0.5339676737785339, + "mean_token_accuracy": 0.7814666479825974, + "num_tokens": 11406819.0, + "step": 700 + }, + { + "entropy": 0.5389137715101242, + "epoch": 2.616822429906542, + "grad_norm": 0.041952140629291534, + "learning_rate": 0.0002, + "loss": 0.544516921043396, + "mean_token_accuracy": 0.777948647737503, + "num_tokens": 11423114.0, + "step": 701 + }, + { + "entropy": 0.5339553654193878, + "epoch": 2.6205607476635513, + "grad_norm": 0.04165533185005188, + "learning_rate": 0.0002, + "loss": 0.5357546210289001, + "mean_token_accuracy": 0.7815313339233398, + "num_tokens": 11439501.0, + "step": 702 + }, + { + "entropy": 0.5327703803777695, + "epoch": 2.6242990654205607, + "grad_norm": 0.04676680266857147, + "learning_rate": 0.0002, + "loss": 0.5317723155021667, + "mean_token_accuracy": 0.7842179238796234, + "num_tokens": 11455968.0, + "step": 703 + }, + { + "entropy": 0.5289216041564941, + "epoch": 2.62803738317757, + "grad_norm": 0.04229076951742172, + "learning_rate": 0.0002, + "loss": 0.5229822397232056, + "mean_token_accuracy": 0.7897761762142181, + "num_tokens": 11472189.0, + "step": 704 + }, + { + "entropy": 0.5446136146783829, + "epoch": 2.6317757009345795, + "grad_norm": 0.045102689415216446, + "learning_rate": 0.0002, + "loss": 0.5427557229995728, + "mean_token_accuracy": 0.780061662197113, + "num_tokens": 11488564.0, + "step": 705 + }, + { + "entropy": 0.5221227407455444, + "epoch": 2.635514018691589, + "grad_norm": 0.048957664519548416, + "learning_rate": 0.0002, + "loss": 0.5266565680503845, + "mean_token_accuracy": 0.7874798774719238, + "num_tokens": 11505006.0, + "step": 706 + }, + { + "entropy": 0.5404656529426575, + "epoch": 2.6392523364485982, + "grad_norm": 0.04282474145293236, + "learning_rate": 0.0002, + "loss": 0.5407752394676208, + "mean_token_accuracy": 0.7841375470161438, + "num_tokens": 11521778.0, + "step": 707 + }, + { + "entropy": 0.5307196229696274, + "epoch": 2.6429906542056076, + "grad_norm": 0.03666854277253151, + "learning_rate": 0.0002, + "loss": 0.5308942198753357, + "mean_token_accuracy": 0.7844790369272232, + "num_tokens": 11538182.0, + "step": 708 + }, + { + "entropy": 0.5377398878335953, + "epoch": 2.6467289719626166, + "grad_norm": 0.039178650826215744, + "learning_rate": 0.0002, + "loss": 0.5345498919487, + "mean_token_accuracy": 0.7794453948736191, + "num_tokens": 11554684.0, + "step": 709 + }, + { + "entropy": 0.5536644905805588, + "epoch": 2.650467289719626, + "grad_norm": 0.03832559287548065, + "learning_rate": 0.0002, + "loss": 0.5499898195266724, + "mean_token_accuracy": 0.7778642326593399, + "num_tokens": 11571201.0, + "step": 710 + }, + { + "entropy": 0.5420532822608948, + "epoch": 2.6542056074766354, + "grad_norm": 0.03739701583981514, + "learning_rate": 0.0002, + "loss": 0.5493305921554565, + "mean_token_accuracy": 0.7773284465074539, + "num_tokens": 11587368.0, + "step": 711 + }, + { + "entropy": 0.5395905524492264, + "epoch": 2.6579439252336448, + "grad_norm": 0.04595664143562317, + "learning_rate": 0.0002, + "loss": 0.544034481048584, + "mean_token_accuracy": 0.7786346226930618, + "num_tokens": 11603772.0, + "step": 712 + }, + { + "entropy": 0.5372798144817352, + "epoch": 2.661682242990654, + "grad_norm": 0.04127715900540352, + "learning_rate": 0.0002, + "loss": 0.5426503419876099, + "mean_token_accuracy": 0.7785855233669281, + "num_tokens": 11620154.0, + "step": 713 + }, + { + "entropy": 0.5510691106319427, + "epoch": 2.6654205607476635, + "grad_norm": 0.03742792084813118, + "learning_rate": 0.0002, + "loss": 0.5547200441360474, + "mean_token_accuracy": 0.7739700675010681, + "num_tokens": 11636423.0, + "step": 714 + }, + { + "entropy": 0.5472855418920517, + "epoch": 2.669158878504673, + "grad_norm": 0.04350201040506363, + "learning_rate": 0.0002, + "loss": 0.5393113493919373, + "mean_token_accuracy": 0.7822133600711823, + "num_tokens": 11652714.0, + "step": 715 + }, + { + "entropy": 0.5539711713790894, + "epoch": 2.6728971962616823, + "grad_norm": 0.03917175903916359, + "learning_rate": 0.0002, + "loss": 0.5469767451286316, + "mean_token_accuracy": 0.7737536281347275, + "num_tokens": 11668901.0, + "step": 716 + }, + { + "entropy": 0.532858207821846, + "epoch": 2.6766355140186917, + "grad_norm": 0.04417381435632706, + "learning_rate": 0.0002, + "loss": 0.5299221873283386, + "mean_token_accuracy": 0.7851613610982895, + "num_tokens": 11685245.0, + "step": 717 + }, + { + "entropy": 0.5360149517655373, + "epoch": 2.680373831775701, + "grad_norm": 0.045524608343839645, + "learning_rate": 0.0002, + "loss": 0.5396956205368042, + "mean_token_accuracy": 0.7826401442289352, + "num_tokens": 11701586.0, + "step": 718 + }, + { + "entropy": 0.5417494922876358, + "epoch": 2.6841121495327105, + "grad_norm": 0.04371016472578049, + "learning_rate": 0.0002, + "loss": 0.547548770904541, + "mean_token_accuracy": 0.7796981632709503, + "num_tokens": 11717869.0, + "step": 719 + }, + { + "entropy": 0.5364665389060974, + "epoch": 2.68785046728972, + "grad_norm": 0.04112569987773895, + "learning_rate": 0.0002, + "loss": 0.5360340476036072, + "mean_token_accuracy": 0.7829563021659851, + "num_tokens": 11733931.0, + "step": 720 + }, + { + "entropy": 0.5393244326114655, + "epoch": 2.691588785046729, + "grad_norm": 0.03485904261469841, + "learning_rate": 0.0002, + "loss": 0.5365700721740723, + "mean_token_accuracy": 0.7832135111093521, + "num_tokens": 11750251.0, + "step": 721 + }, + { + "entropy": 0.5402418822050095, + "epoch": 2.695327102803738, + "grad_norm": 0.035655390471220016, + "learning_rate": 0.0002, + "loss": 0.5399341583251953, + "mean_token_accuracy": 0.7821518778800964, + "num_tokens": 11766402.0, + "step": 722 + }, + { + "entropy": 0.5645921379327774, + "epoch": 2.6990654205607476, + "grad_norm": 0.03827064484357834, + "learning_rate": 0.0002, + "loss": 0.5571991205215454, + "mean_token_accuracy": 0.7733338475227356, + "num_tokens": 11783003.0, + "step": 723 + }, + { + "entropy": 0.5261956825852394, + "epoch": 2.702803738317757, + "grad_norm": 0.03958537429571152, + "learning_rate": 0.0002, + "loss": 0.5266394019126892, + "mean_token_accuracy": 0.7840381115674973, + "num_tokens": 11799129.0, + "step": 724 + }, + { + "entropy": 0.5451592206954956, + "epoch": 2.7065420560747664, + "grad_norm": 0.03517312929034233, + "learning_rate": 0.0002, + "loss": 0.5486286878585815, + "mean_token_accuracy": 0.7751190662384033, + "num_tokens": 11815476.0, + "step": 725 + }, + { + "entropy": 0.5393543839454651, + "epoch": 2.710280373831776, + "grad_norm": 0.0371185727417469, + "learning_rate": 0.0002, + "loss": 0.5353912115097046, + "mean_token_accuracy": 0.7856109440326691, + "num_tokens": 11831984.0, + "step": 726 + }, + { + "entropy": 0.5295817404985428, + "epoch": 2.714018691588785, + "grad_norm": 0.038498733192682266, + "learning_rate": 0.0002, + "loss": 0.5308818221092224, + "mean_token_accuracy": 0.7821252197027206, + "num_tokens": 11848466.0, + "step": 727 + }, + { + "entropy": 0.5493554025888443, + "epoch": 2.717757009345794, + "grad_norm": 0.039601247757673264, + "learning_rate": 0.0002, + "loss": 0.547062873840332, + "mean_token_accuracy": 0.7771877348423004, + "num_tokens": 11864918.0, + "step": 728 + }, + { + "entropy": 0.5261873155832291, + "epoch": 2.7214953271028035, + "grad_norm": 0.038875121623277664, + "learning_rate": 0.0002, + "loss": 0.5303913950920105, + "mean_token_accuracy": 0.7834369987249374, + "num_tokens": 11881212.0, + "step": 729 + }, + { + "entropy": 0.5295650064945221, + "epoch": 2.725233644859813, + "grad_norm": 0.03564633056521416, + "learning_rate": 0.0002, + "loss": 0.5287110805511475, + "mean_token_accuracy": 0.7855493873357773, + "num_tokens": 11897285.0, + "step": 730 + }, + { + "entropy": 0.5161215513944626, + "epoch": 2.7289719626168223, + "grad_norm": 0.03593486547470093, + "learning_rate": 0.0002, + "loss": 0.5092670321464539, + "mean_token_accuracy": 0.7932390123605728, + "num_tokens": 11913860.0, + "step": 731 + }, + { + "entropy": 0.5438321828842163, + "epoch": 2.7327102803738317, + "grad_norm": 0.03703992813825607, + "learning_rate": 0.0002, + "loss": 0.5476446151733398, + "mean_token_accuracy": 0.7795855104923248, + "num_tokens": 11930261.0, + "step": 732 + }, + { + "entropy": 0.5311425775289536, + "epoch": 2.736448598130841, + "grad_norm": 0.037695612758398056, + "learning_rate": 0.0002, + "loss": 0.534140408039093, + "mean_token_accuracy": 0.7818547487258911, + "num_tokens": 11946560.0, + "step": 733 + }, + { + "entropy": 0.5209683775901794, + "epoch": 2.7401869158878505, + "grad_norm": 0.03890440613031387, + "learning_rate": 0.0002, + "loss": 0.5241660475730896, + "mean_token_accuracy": 0.787934735417366, + "num_tokens": 11962956.0, + "step": 734 + }, + { + "entropy": 0.5443778336048126, + "epoch": 2.74392523364486, + "grad_norm": 0.03788384422659874, + "learning_rate": 0.0002, + "loss": 0.5421298742294312, + "mean_token_accuracy": 0.7814332842826843, + "num_tokens": 11979290.0, + "step": 735 + }, + { + "entropy": 0.5518642365932465, + "epoch": 2.7476635514018692, + "grad_norm": 0.040383294224739075, + "learning_rate": 0.0002, + "loss": 0.555992603302002, + "mean_token_accuracy": 0.7762507796287537, + "num_tokens": 11995487.0, + "step": 736 + }, + { + "entropy": 0.5241853296756744, + "epoch": 2.7514018691588786, + "grad_norm": 0.039098046720027924, + "learning_rate": 0.0002, + "loss": 0.52538001537323, + "mean_token_accuracy": 0.7879568636417389, + "num_tokens": 12011811.0, + "step": 737 + }, + { + "entropy": 0.5365500450134277, + "epoch": 2.755140186915888, + "grad_norm": 0.041129641234874725, + "learning_rate": 0.0002, + "loss": 0.5400822162628174, + "mean_token_accuracy": 0.7799027115106583, + "num_tokens": 12028077.0, + "step": 738 + }, + { + "entropy": 0.5531625598669052, + "epoch": 2.7588785046728974, + "grad_norm": 0.040683627128601074, + "learning_rate": 0.0002, + "loss": 0.5487805604934692, + "mean_token_accuracy": 0.7768402397632599, + "num_tokens": 12044438.0, + "step": 739 + }, + { + "entropy": 0.5442669987678528, + "epoch": 2.762616822429907, + "grad_norm": 0.040092833340168, + "learning_rate": 0.0002, + "loss": 0.547822117805481, + "mean_token_accuracy": 0.7773446589708328, + "num_tokens": 12060591.0, + "step": 740 + }, + { + "entropy": 0.5772512406110764, + "epoch": 2.7663551401869158, + "grad_norm": 0.04026317968964577, + "learning_rate": 0.0002, + "loss": 0.5646360516548157, + "mean_token_accuracy": 0.771668940782547, + "num_tokens": 12076995.0, + "step": 741 + }, + { + "entropy": 0.550237238407135, + "epoch": 2.770093457943925, + "grad_norm": 0.03907431662082672, + "learning_rate": 0.0002, + "loss": 0.5490835905075073, + "mean_token_accuracy": 0.7739729285240173, + "num_tokens": 12093381.0, + "step": 742 + }, + { + "entropy": 0.5247115790843964, + "epoch": 2.7738317757009345, + "grad_norm": 0.038372889161109924, + "learning_rate": 0.0002, + "loss": 0.5327771902084351, + "mean_token_accuracy": 0.7830088138580322, + "num_tokens": 12109606.0, + "step": 743 + }, + { + "entropy": 0.5228423178195953, + "epoch": 2.777570093457944, + "grad_norm": 0.044808950275182724, + "learning_rate": 0.0002, + "loss": 0.5320113301277161, + "mean_token_accuracy": 0.7844864279031754, + "num_tokens": 12125852.0, + "step": 744 + }, + { + "entropy": 0.530124694108963, + "epoch": 2.7813084112149533, + "grad_norm": 0.03803320229053497, + "learning_rate": 0.0002, + "loss": 0.5259105563163757, + "mean_token_accuracy": 0.784927174448967, + "num_tokens": 12142284.0, + "step": 745 + }, + { + "entropy": 0.5511182993650436, + "epoch": 2.7850467289719627, + "grad_norm": 0.03542547672986984, + "learning_rate": 0.0002, + "loss": 0.5479939579963684, + "mean_token_accuracy": 0.7765052765607834, + "num_tokens": 12158595.0, + "step": 746 + }, + { + "entropy": 0.5446942672133446, + "epoch": 2.788785046728972, + "grad_norm": 0.04419056326150894, + "learning_rate": 0.0002, + "loss": 0.5443319082260132, + "mean_token_accuracy": 0.7793864756822586, + "num_tokens": 12174701.0, + "step": 747 + }, + { + "entropy": 0.5594473034143448, + "epoch": 2.792523364485981, + "grad_norm": 0.034595683217048645, + "learning_rate": 0.0002, + "loss": 0.5563086271286011, + "mean_token_accuracy": 0.7736584842205048, + "num_tokens": 12191251.0, + "step": 748 + }, + { + "entropy": 0.5360135138034821, + "epoch": 2.7962616822429904, + "grad_norm": 0.037246719002723694, + "learning_rate": 0.0002, + "loss": 0.53227299451828, + "mean_token_accuracy": 0.7829789221286774, + "num_tokens": 12207768.0, + "step": 749 + }, + { + "entropy": 0.5398263484239578, + "epoch": 2.8, + "grad_norm": 0.040100134909152985, + "learning_rate": 0.0002, + "loss": 0.5421459078788757, + "mean_token_accuracy": 0.7809022068977356, + "num_tokens": 12224318.0, + "step": 750 + }, + { + "entropy": 0.5231769531965256, + "epoch": 2.803738317757009, + "grad_norm": 0.04384174942970276, + "learning_rate": 0.0002, + "loss": 0.532194972038269, + "mean_token_accuracy": 0.7854811102151871, + "num_tokens": 12240560.0, + "step": 751 + }, + { + "entropy": 0.5372914969921112, + "epoch": 2.8074766355140186, + "grad_norm": 0.041690364480018616, + "learning_rate": 0.0002, + "loss": 0.5358984470367432, + "mean_token_accuracy": 0.7798957526683807, + "num_tokens": 12256698.0, + "step": 752 + }, + { + "entropy": 0.5490759164094925, + "epoch": 2.811214953271028, + "grad_norm": 0.035977087914943695, + "learning_rate": 0.0002, + "loss": 0.5457599759101868, + "mean_token_accuracy": 0.7804635167121887, + "num_tokens": 12273196.0, + "step": 753 + }, + { + "entropy": 0.5454043298959732, + "epoch": 2.8149532710280374, + "grad_norm": 0.038070011883974075, + "learning_rate": 0.0002, + "loss": 0.5440967082977295, + "mean_token_accuracy": 0.7805526405572891, + "num_tokens": 12289448.0, + "step": 754 + }, + { + "entropy": 0.5171327888965607, + "epoch": 2.8186915887850468, + "grad_norm": 0.043416958302259445, + "learning_rate": 0.0002, + "loss": 0.5235368609428406, + "mean_token_accuracy": 0.7856585532426834, + "num_tokens": 12305728.0, + "step": 755 + }, + { + "entropy": 0.5519092679023743, + "epoch": 2.822429906542056, + "grad_norm": 0.04086895287036896, + "learning_rate": 0.0002, + "loss": 0.5498278737068176, + "mean_token_accuracy": 0.776623860001564, + "num_tokens": 12322183.0, + "step": 756 + }, + { + "entropy": 0.5254665613174438, + "epoch": 2.8261682242990656, + "grad_norm": 0.03841882944107056, + "learning_rate": 0.0002, + "loss": 0.5266950130462646, + "mean_token_accuracy": 0.786298468708992, + "num_tokens": 12338635.0, + "step": 757 + }, + { + "entropy": 0.5411494970321655, + "epoch": 2.829906542056075, + "grad_norm": 0.0401926264166832, + "learning_rate": 0.0002, + "loss": 0.5370864272117615, + "mean_token_accuracy": 0.7828312367200851, + "num_tokens": 12355089.0, + "step": 758 + }, + { + "entropy": 0.5513712912797928, + "epoch": 2.8336448598130843, + "grad_norm": 0.0333644300699234, + "learning_rate": 0.0002, + "loss": 0.5467706918716431, + "mean_token_accuracy": 0.7782435566186905, + "num_tokens": 12371887.0, + "step": 759 + }, + { + "entropy": 0.5323017537593842, + "epoch": 2.8373831775700937, + "grad_norm": 0.04663057625293732, + "learning_rate": 0.0002, + "loss": 0.5380391478538513, + "mean_token_accuracy": 0.7797932773828506, + "num_tokens": 12388141.0, + "step": 760 + }, + { + "entropy": 0.5405762195587158, + "epoch": 2.8411214953271027, + "grad_norm": 0.03440416231751442, + "learning_rate": 0.0002, + "loss": 0.5396413207054138, + "mean_token_accuracy": 0.7808928042650223, + "num_tokens": 12404651.0, + "step": 761 + }, + { + "entropy": 0.5396543145179749, + "epoch": 2.844859813084112, + "grad_norm": 0.035920772701501846, + "learning_rate": 0.0002, + "loss": 0.5427154898643494, + "mean_token_accuracy": 0.7765078693628311, + "num_tokens": 12420963.0, + "step": 762 + }, + { + "entropy": 0.5548029094934464, + "epoch": 2.8485981308411215, + "grad_norm": 0.039103955030441284, + "learning_rate": 0.0002, + "loss": 0.5556834936141968, + "mean_token_accuracy": 0.7767738401889801, + "num_tokens": 12437531.0, + "step": 763 + }, + { + "entropy": 0.5415983200073242, + "epoch": 2.852336448598131, + "grad_norm": 0.042144615203142166, + "learning_rate": 0.0002, + "loss": 0.5447718501091003, + "mean_token_accuracy": 0.7775606215000153, + "num_tokens": 12454010.0, + "step": 764 + }, + { + "entropy": 0.5518344044685364, + "epoch": 2.8560747663551402, + "grad_norm": 0.04103001952171326, + "learning_rate": 0.0002, + "loss": 0.5544916391372681, + "mean_token_accuracy": 0.7738406360149384, + "num_tokens": 12470274.0, + "step": 765 + }, + { + "entropy": 0.5313066393136978, + "epoch": 2.8598130841121496, + "grad_norm": 0.032009996473789215, + "learning_rate": 0.0002, + "loss": 0.5202063918113708, + "mean_token_accuracy": 0.7903139889240265, + "num_tokens": 12486731.0, + "step": 766 + }, + { + "entropy": 0.5384960919618607, + "epoch": 2.863551401869159, + "grad_norm": 0.03478989377617836, + "learning_rate": 0.0002, + "loss": 0.5340580344200134, + "mean_token_accuracy": 0.7825128734111786, + "num_tokens": 12503030.0, + "step": 767 + }, + { + "entropy": 0.5224665403366089, + "epoch": 2.867289719626168, + "grad_norm": 0.04253165051341057, + "learning_rate": 0.0002, + "loss": 0.5197573900222778, + "mean_token_accuracy": 0.7862243801355362, + "num_tokens": 12519459.0, + "step": 768 + }, + { + "entropy": 0.5370153039693832, + "epoch": 2.8710280373831774, + "grad_norm": 0.043792687356472015, + "learning_rate": 0.0002, + "loss": 0.5484573245048523, + "mean_token_accuracy": 0.7781466245651245, + "num_tokens": 12535908.0, + "step": 769 + }, + { + "entropy": 0.5293686985969543, + "epoch": 2.8747663551401867, + "grad_norm": 0.040938977152109146, + "learning_rate": 0.0002, + "loss": 0.536544680595398, + "mean_token_accuracy": 0.7815448939800262, + "num_tokens": 12552278.0, + "step": 770 + }, + { + "entropy": 0.5587695837020874, + "epoch": 2.878504672897196, + "grad_norm": 0.03950825333595276, + "learning_rate": 0.0002, + "loss": 0.5560339689254761, + "mean_token_accuracy": 0.7734224796295166, + "num_tokens": 12568976.0, + "step": 771 + }, + { + "entropy": 0.5375785976648331, + "epoch": 2.8822429906542055, + "grad_norm": 0.039299093186855316, + "learning_rate": 0.0002, + "loss": 0.5312560796737671, + "mean_token_accuracy": 0.7845937460660934, + "num_tokens": 12585119.0, + "step": 772 + }, + { + "entropy": 0.5369968712329865, + "epoch": 2.885981308411215, + "grad_norm": 0.03820907697081566, + "learning_rate": 0.0002, + "loss": 0.5376330614089966, + "mean_token_accuracy": 0.7842772305011749, + "num_tokens": 12601276.0, + "step": 773 + }, + { + "entropy": 0.544855386018753, + "epoch": 2.8897196261682243, + "grad_norm": 0.03554603457450867, + "learning_rate": 0.0002, + "loss": 0.5421188473701477, + "mean_token_accuracy": 0.777969166636467, + "num_tokens": 12617527.0, + "step": 774 + }, + { + "entropy": 0.5378414988517761, + "epoch": 2.8934579439252337, + "grad_norm": 0.040478140115737915, + "learning_rate": 0.0002, + "loss": 0.5390060544013977, + "mean_token_accuracy": 0.7802563160657883, + "num_tokens": 12633807.0, + "step": 775 + }, + { + "entropy": 0.5441332012414932, + "epoch": 2.897196261682243, + "grad_norm": 0.043334268033504486, + "learning_rate": 0.0002, + "loss": 0.5497444272041321, + "mean_token_accuracy": 0.7784465253353119, + "num_tokens": 12650362.0, + "step": 776 + }, + { + "entropy": 0.5244786143302917, + "epoch": 2.9009345794392525, + "grad_norm": 0.04438484460115433, + "learning_rate": 0.0002, + "loss": 0.5322083830833435, + "mean_token_accuracy": 0.7820873856544495, + "num_tokens": 12666671.0, + "step": 777 + }, + { + "entropy": 0.5577758997678757, + "epoch": 2.904672897196262, + "grad_norm": 0.0378209687769413, + "learning_rate": 0.0002, + "loss": 0.5586643815040588, + "mean_token_accuracy": 0.7750083953142166, + "num_tokens": 12682921.0, + "step": 778 + }, + { + "entropy": 0.5417321473360062, + "epoch": 2.9084112149532713, + "grad_norm": 0.04056469723582268, + "learning_rate": 0.0002, + "loss": 0.5351645350456238, + "mean_token_accuracy": 0.7857696861028671, + "num_tokens": 12699271.0, + "step": 779 + }, + { + "entropy": 0.5629414021968842, + "epoch": 2.91214953271028, + "grad_norm": 0.03783043846487999, + "learning_rate": 0.0002, + "loss": 0.5545192956924438, + "mean_token_accuracy": 0.7746420204639435, + "num_tokens": 12715410.0, + "step": 780 + }, + { + "entropy": 0.5410288572311401, + "epoch": 2.9158878504672896, + "grad_norm": 0.03987415134906769, + "learning_rate": 0.0002, + "loss": 0.5423367619514465, + "mean_token_accuracy": 0.778704360127449, + "num_tokens": 12731587.0, + "step": 781 + }, + { + "entropy": 0.5282145440578461, + "epoch": 2.919626168224299, + "grad_norm": 0.03972848504781723, + "learning_rate": 0.0002, + "loss": 0.5305967330932617, + "mean_token_accuracy": 0.7833620458841324, + "num_tokens": 12747966.0, + "step": 782 + }, + { + "entropy": 0.5176219195127487, + "epoch": 2.9233644859813084, + "grad_norm": 0.0456930510699749, + "learning_rate": 0.0002, + "loss": 0.5257712602615356, + "mean_token_accuracy": 0.7853413671255112, + "num_tokens": 12764231.0, + "step": 783 + }, + { + "entropy": 0.5378240942955017, + "epoch": 2.9271028037383178, + "grad_norm": 0.04101483151316643, + "learning_rate": 0.0002, + "loss": 0.5421459674835205, + "mean_token_accuracy": 0.7807480245828629, + "num_tokens": 12780553.0, + "step": 784 + }, + { + "entropy": 0.557637631893158, + "epoch": 2.930841121495327, + "grad_norm": 0.04411512240767479, + "learning_rate": 0.0002, + "loss": 0.554862380027771, + "mean_token_accuracy": 0.7744756042957306, + "num_tokens": 12796795.0, + "step": 785 + }, + { + "entropy": 0.5379236787557602, + "epoch": 2.9345794392523366, + "grad_norm": 0.03611760586500168, + "learning_rate": 0.0002, + "loss": 0.533431887626648, + "mean_token_accuracy": 0.7804027199745178, + "num_tokens": 12813009.0, + "step": 786 + }, + { + "entropy": 0.5566578209400177, + "epoch": 2.938317757009346, + "grad_norm": 0.03715398907661438, + "learning_rate": 0.0002, + "loss": 0.5433220267295837, + "mean_token_accuracy": 0.7798871845006943, + "num_tokens": 12829448.0, + "step": 787 + }, + { + "entropy": 0.5410407036542892, + "epoch": 2.942056074766355, + "grad_norm": 0.03546489402651787, + "learning_rate": 0.0002, + "loss": 0.5368283987045288, + "mean_token_accuracy": 0.780596062541008, + "num_tokens": 12845740.0, + "step": 788 + }, + { + "entropy": 0.5211344361305237, + "epoch": 2.9457943925233643, + "grad_norm": 0.04108017310500145, + "learning_rate": 0.0002, + "loss": 0.5253114104270935, + "mean_token_accuracy": 0.7879950255155563, + "num_tokens": 12861833.0, + "step": 789 + }, + { + "entropy": 0.5430938005447388, + "epoch": 2.9495327102803737, + "grad_norm": 0.04749761149287224, + "learning_rate": 0.0002, + "loss": 0.5581343770027161, + "mean_token_accuracy": 0.7737741619348526, + "num_tokens": 12878195.0, + "step": 790 + }, + { + "entropy": 0.5290612280368805, + "epoch": 2.953271028037383, + "grad_norm": 0.04016729071736336, + "learning_rate": 0.0002, + "loss": 0.5378360152244568, + "mean_token_accuracy": 0.782782569527626, + "num_tokens": 12894573.0, + "step": 791 + }, + { + "entropy": 0.5361242145299911, + "epoch": 2.9570093457943925, + "grad_norm": 0.03678933158516884, + "learning_rate": 0.0002, + "loss": 0.5257135629653931, + "mean_token_accuracy": 0.7893773168325424, + "num_tokens": 12910863.0, + "step": 792 + }, + { + "entropy": 0.5259503424167633, + "epoch": 2.960747663551402, + "grad_norm": 0.03752489387989044, + "learning_rate": 0.0002, + "loss": 0.5143717527389526, + "mean_token_accuracy": 0.7932046353816986, + "num_tokens": 12927225.0, + "step": 793 + }, + { + "entropy": 0.5639631152153015, + "epoch": 2.9644859813084112, + "grad_norm": 0.040728434920310974, + "learning_rate": 0.0002, + "loss": 0.5542000532150269, + "mean_token_accuracy": 0.7755206525325775, + "num_tokens": 12943436.0, + "step": 794 + }, + { + "entropy": 0.5300319939851761, + "epoch": 2.9682242990654206, + "grad_norm": 0.03866998106241226, + "learning_rate": 0.0002, + "loss": 0.536708652973175, + "mean_token_accuracy": 0.7822965830564499, + "num_tokens": 12959780.0, + "step": 795 + }, + { + "entropy": 0.5109827667474747, + "epoch": 2.97196261682243, + "grad_norm": 0.05049736052751541, + "learning_rate": 0.0002, + "loss": 0.5270419716835022, + "mean_token_accuracy": 0.785243034362793, + "num_tokens": 12976091.0, + "step": 796 + }, + { + "entropy": 0.5446041822433472, + "epoch": 2.9757009345794394, + "grad_norm": 0.03969959914684296, + "learning_rate": 0.0002, + "loss": 0.5448250770568848, + "mean_token_accuracy": 0.7787258327007294, + "num_tokens": 12992521.0, + "step": 797 + }, + { + "entropy": 0.5244547426700592, + "epoch": 2.979439252336449, + "grad_norm": 0.03810025751590729, + "learning_rate": 0.0002, + "loss": 0.5291001796722412, + "mean_token_accuracy": 0.784002810716629, + "num_tokens": 13008703.0, + "step": 798 + }, + { + "entropy": 0.5672779381275177, + "epoch": 2.983177570093458, + "grad_norm": 0.039007145911455154, + "learning_rate": 0.0002, + "loss": 0.5631389021873474, + "mean_token_accuracy": 0.7706998288631439, + "num_tokens": 13025057.0, + "step": 799 + }, + { + "entropy": 0.5293317213654518, + "epoch": 2.986915887850467, + "grad_norm": 0.037154458463191986, + "learning_rate": 0.0002, + "loss": 0.5227472186088562, + "mean_token_accuracy": 0.7874658554792404, + "num_tokens": 13041252.0, + "step": 800 + }, + { + "entropy": 0.537149466574192, + "epoch": 2.9906542056074765, + "grad_norm": 0.03552801162004471, + "learning_rate": 0.0002, + "loss": 0.5343438982963562, + "mean_token_accuracy": 0.7848320156335831, + "num_tokens": 13057526.0, + "step": 801 + }, + { + "entropy": 0.530741959810257, + "epoch": 2.994392523364486, + "grad_norm": 0.043317440897226334, + "learning_rate": 0.0002, + "loss": 0.5383322238922119, + "mean_token_accuracy": 0.7800941169261932, + "num_tokens": 13073756.0, + "step": 802 + }, + { + "entropy": 0.5432422608137131, + "epoch": 2.9981308411214953, + "grad_norm": 0.044908758252859116, + "learning_rate": 0.0002, + "loss": 0.5521294474601746, + "mean_token_accuracy": 0.7733398079872131, + "num_tokens": 13090164.0, + "step": 803 + }, + { + "entropy": 0.5437783300876617, + "epoch": 3.0, + "grad_norm": 0.04327746853232384, + "learning_rate": 0.0002, + "loss": 0.5394353270530701, + "mean_token_accuracy": 0.7844643592834473, + "num_tokens": 13096259.0, + "step": 804 + } + ], + "logging_steps": 1, + "max_steps": 804, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2208281383809843e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}