{ "best_global_step": 3800, "best_metric": 1.0250593423843384, "best_model_checkpoint": "/workspace/adversarial-rlhf/runs/sft-OpenHermes-2.5-Mistral-7B-20251108-1943/checkpoints/checkpoint-3800", "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1123480509966612, "epoch": 0.0013085793735176249, "grad_norm": 0.6217469573020935, "learning_rate": 1.9979063072494113e-05, "loss": 1.4467, "mean_token_accuracy": 0.6513771142810583, "num_tokens": 38846.0, "step": 5 }, { "entropy": 1.1004129022359848, "epoch": 0.0026171587470352497, "grad_norm": 0.4485588073730469, "learning_rate": 1.995289191311175e-05, "loss": 1.3538, "mean_token_accuracy": 0.6656047463417053, "num_tokens": 77083.0, "step": 10 }, { "entropy": 1.1241086978465318, "epoch": 0.003925738120552875, "grad_norm": 0.4486566483974457, "learning_rate": 1.9926720753729393e-05, "loss": 1.2136, "mean_token_accuracy": 0.6849155448377132, "num_tokens": 113871.0, "step": 15 }, { "entropy": 1.2270274113863706, "epoch": 0.005234317494070499, "grad_norm": 0.3619006276130676, "learning_rate": 1.990054959434703e-05, "loss": 1.1882, "mean_token_accuracy": 0.6774788822978735, "num_tokens": 149409.0, "step": 20 }, { "entropy": 1.21558427400887, "epoch": 0.006542896867588125, "grad_norm": 0.3003220558166504, "learning_rate": 1.987437843496467e-05, "loss": 1.2674, "mean_token_accuracy": 0.6861417829990387, "num_tokens": 191753.0, "step": 25 }, { "entropy": 1.2555227734148502, "epoch": 0.00785147624110575, "grad_norm": 0.28489917516708374, "learning_rate": 1.9848207275582308e-05, "loss": 1.1575, "mean_token_accuracy": 0.6786608777940273, "num_tokens": 236354.0, "step": 30 }, { "entropy": 1.2973661322146655, "epoch": 0.009160055614623375, "grad_norm": 0.2774274945259094, "learning_rate": 1.982203611619995e-05, "loss": 1.2217, "mean_token_accuracy": 0.6799626011401415, "num_tokens": 276048.0, "step": 35 }, { "entropy": 1.196922979876399, "epoch": 0.010468634988140999, "grad_norm": 0.2974745035171509, "learning_rate": 1.979586495681759e-05, "loss": 1.1239, "mean_token_accuracy": 0.6995673455297947, "num_tokens": 313936.0, "step": 40 }, { "entropy": 1.2235115006566049, "epoch": 0.011777214361658624, "grad_norm": 0.2803105115890503, "learning_rate": 1.9769693797435227e-05, "loss": 1.1753, "mean_token_accuracy": 0.6886466249823571, "num_tokens": 354248.0, "step": 45 }, { "entropy": 1.2047650370746852, "epoch": 0.01308579373517625, "grad_norm": 0.24354718625545502, "learning_rate": 1.974352263805287e-05, "loss": 1.1418, "mean_token_accuracy": 0.6953188866376877, "num_tokens": 392556.0, "step": 50 }, { "entropy": 1.174419540166855, "epoch": 0.014394373108693874, "grad_norm": 0.2286166548728943, "learning_rate": 1.9717351478670507e-05, "loss": 1.1289, "mean_token_accuracy": 0.6958644594997168, "num_tokens": 429553.0, "step": 55 }, { "entropy": 1.2164320405572653, "epoch": 0.0157029524822115, "grad_norm": 0.24150270223617554, "learning_rate": 1.9691180319288145e-05, "loss": 1.1822, "mean_token_accuracy": 0.6899411961436271, "num_tokens": 471479.0, "step": 60 }, { "entropy": 1.3109304085373878, "epoch": 0.017011531855729123, "grad_norm": 0.2634604275226593, "learning_rate": 1.9665009159905787e-05, "loss": 1.2606, "mean_token_accuracy": 0.6797668907791377, "num_tokens": 507880.0, "step": 65 }, { "entropy": 1.3245886493474246, "epoch": 0.01832011122924675, "grad_norm": 0.2704319953918457, "learning_rate": 1.9638838000523425e-05, "loss": 1.1799, "mean_token_accuracy": 0.6835815001279115, "num_tokens": 545408.0, "step": 70 }, { "entropy": 1.2262448083609343, "epoch": 0.019628690602764374, "grad_norm": 0.2790036201477051, "learning_rate": 1.9612666841141064e-05, "loss": 1.1106, "mean_token_accuracy": 0.6929917622357606, "num_tokens": 583634.0, "step": 75 }, { "entropy": 1.265508808940649, "epoch": 0.020937269976281998, "grad_norm": 0.22567331790924072, "learning_rate": 1.9586495681758702e-05, "loss": 1.2072, "mean_token_accuracy": 0.6882555760443211, "num_tokens": 619048.0, "step": 80 }, { "entropy": 1.2399516824632884, "epoch": 0.022245849349799625, "grad_norm": 0.26959890127182007, "learning_rate": 1.9560324522376344e-05, "loss": 1.1765, "mean_token_accuracy": 0.6866359710693359, "num_tokens": 657325.0, "step": 85 }, { "entropy": 1.2313102029263974, "epoch": 0.02355442872331725, "grad_norm": 0.31447833776474, "learning_rate": 1.9534153362993982e-05, "loss": 1.1615, "mean_token_accuracy": 0.6929020721465349, "num_tokens": 692549.0, "step": 90 }, { "entropy": 1.29357091486454, "epoch": 0.024863008096834872, "grad_norm": 0.27703842520713806, "learning_rate": 1.950798220361162e-05, "loss": 1.2535, "mean_token_accuracy": 0.6816725376993418, "num_tokens": 730459.0, "step": 95 }, { "entropy": 1.1631319765001535, "epoch": 0.0261715874703525, "grad_norm": 0.30061009526252747, "learning_rate": 1.9481811044229262e-05, "loss": 1.0757, "mean_token_accuracy": 0.7062662675976753, "num_tokens": 770154.0, "step": 100 }, { "epoch": 0.0261715874703525, "eval_entropy": 1.138489418029785, "eval_loss": 1.1535744667053223, "eval_mean_token_accuracy": 0.7070313329696656, "eval_num_tokens": 770154.0, "eval_runtime": 96.9826, "eval_samples_per_second": 10.311, "eval_steps_per_second": 1.289, "step": 100 }, { "entropy": 1.2952044211328029, "epoch": 0.027480166843870123, "grad_norm": 0.3016025125980377, "learning_rate": 1.94556398848469e-05, "loss": 1.1798, "mean_token_accuracy": 0.6826052766293287, "num_tokens": 806341.0, "step": 105 }, { "entropy": 1.2129868753254414, "epoch": 0.028788746217387747, "grad_norm": 0.2947443425655365, "learning_rate": 1.942946872546454e-05, "loss": 1.1327, "mean_token_accuracy": 0.6927057210355997, "num_tokens": 843316.0, "step": 110 }, { "entropy": 1.290729960054159, "epoch": 0.030097325590905374, "grad_norm": 0.3131401240825653, "learning_rate": 1.940329756608218e-05, "loss": 1.1927, "mean_token_accuracy": 0.6800491612404584, "num_tokens": 879843.0, "step": 115 }, { "entropy": 1.1782143030315637, "epoch": 0.031405904964423, "grad_norm": 0.2727121114730835, "learning_rate": 1.937712640669982e-05, "loss": 1.0659, "mean_token_accuracy": 0.700430641323328, "num_tokens": 920743.0, "step": 120 }, { "entropy": 1.1695528313517571, "epoch": 0.03271448433794062, "grad_norm": 0.26583895087242126, "learning_rate": 1.9350955247317458e-05, "loss": 1.0232, "mean_token_accuracy": 0.7075442411005497, "num_tokens": 959096.0, "step": 125 }, { "entropy": 1.2109053663909435, "epoch": 0.034023063711458246, "grad_norm": 0.3991076946258545, "learning_rate": 1.93247840879351e-05, "loss": 1.1003, "mean_token_accuracy": 0.699517872184515, "num_tokens": 992352.0, "step": 130 }, { "entropy": 1.2005993926897645, "epoch": 0.035331643084975876, "grad_norm": 0.22760024666786194, "learning_rate": 1.9298612928552734e-05, "loss": 1.0873, "mean_token_accuracy": 0.7063136156648397, "num_tokens": 1031994.0, "step": 135 }, { "entropy": 1.2139225870370864, "epoch": 0.0366402224584935, "grad_norm": 0.2275068610906601, "learning_rate": 1.9272441769170376e-05, "loss": 1.2156, "mean_token_accuracy": 0.6952568493783474, "num_tokens": 1074569.0, "step": 140 }, { "entropy": 1.1324009403586388, "epoch": 0.037948801832011124, "grad_norm": 0.29410237073898315, "learning_rate": 1.9246270609788015e-05, "loss": 1.0779, "mean_token_accuracy": 0.7172829706221819, "num_tokens": 1113952.0, "step": 145 }, { "entropy": 1.2293724507093429, "epoch": 0.03925738120552875, "grad_norm": 0.37723538279533386, "learning_rate": 1.9220099450405653e-05, "loss": 1.146, "mean_token_accuracy": 0.6888038989156484, "num_tokens": 1148309.0, "step": 150 }, { "entropy": 1.2574020594358444, "epoch": 0.04056596057904637, "grad_norm": 0.2657281756401062, "learning_rate": 1.9193928291023295e-05, "loss": 1.1935, "mean_token_accuracy": 0.6899809587746859, "num_tokens": 1189282.0, "step": 155 }, { "entropy": 1.2693338964134455, "epoch": 0.041874539952563995, "grad_norm": 0.3656497895717621, "learning_rate": 1.9167757131640933e-05, "loss": 1.1972, "mean_token_accuracy": 0.6868170712143182, "num_tokens": 1227946.0, "step": 160 }, { "entropy": 1.2286424834281207, "epoch": 0.043183119326081626, "grad_norm": 0.3276367485523224, "learning_rate": 1.914158597225857e-05, "loss": 1.2015, "mean_token_accuracy": 0.6974392894655466, "num_tokens": 1263944.0, "step": 165 }, { "entropy": 1.1861646961420775, "epoch": 0.04449169869959925, "grad_norm": 0.23723381757736206, "learning_rate": 1.9115414812876213e-05, "loss": 1.0972, "mean_token_accuracy": 0.7079249806702137, "num_tokens": 1304585.0, "step": 170 }, { "entropy": 1.208771700784564, "epoch": 0.045800278073116873, "grad_norm": 0.26714691519737244, "learning_rate": 1.908924365349385e-05, "loss": 1.1027, "mean_token_accuracy": 0.6939801312983036, "num_tokens": 1348683.0, "step": 175 }, { "entropy": 1.2366201855242251, "epoch": 0.0471088574466345, "grad_norm": 0.2907228171825409, "learning_rate": 1.906307249411149e-05, "loss": 1.1874, "mean_token_accuracy": 0.6881870869547129, "num_tokens": 1383690.0, "step": 180 }, { "entropy": 1.1898431174457074, "epoch": 0.04841743682015212, "grad_norm": 0.307679146528244, "learning_rate": 1.9036901334729128e-05, "loss": 1.0673, "mean_token_accuracy": 0.6996020458638668, "num_tokens": 1421508.0, "step": 185 }, { "entropy": 1.187743027880788, "epoch": 0.049726016193669745, "grad_norm": 0.3183029890060425, "learning_rate": 1.901073017534677e-05, "loss": 1.0849, "mean_token_accuracy": 0.7036747265607118, "num_tokens": 1462197.0, "step": 190 }, { "entropy": 1.2066817820072173, "epoch": 0.051034595567187375, "grad_norm": 0.3657572865486145, "learning_rate": 1.898455901596441e-05, "loss": 1.1581, "mean_token_accuracy": 0.6929362419992685, "num_tokens": 1495983.0, "step": 195 }, { "entropy": 1.2126169182360171, "epoch": 0.052343174940705, "grad_norm": 0.29915717244148254, "learning_rate": 1.8958387856582047e-05, "loss": 1.209, "mean_token_accuracy": 0.6987330242991447, "num_tokens": 1533227.0, "step": 200 }, { "epoch": 0.052343174940705, "eval_entropy": 1.1249961452484132, "eval_loss": 1.1296290159225464, "eval_mean_token_accuracy": 0.7111005320549011, "eval_num_tokens": 1533227.0, "eval_runtime": 96.9069, "eval_samples_per_second": 10.319, "eval_steps_per_second": 1.29, "step": 200 }, { "entropy": 1.2599914189428092, "epoch": 0.05365175431422262, "grad_norm": 0.27243492007255554, "learning_rate": 1.893221669719969e-05, "loss": 1.1688, "mean_token_accuracy": 0.6900306183844804, "num_tokens": 1571122.0, "step": 205 }, { "entropy": 1.182131139561534, "epoch": 0.05496033368774025, "grad_norm": 0.3210395276546478, "learning_rate": 1.8906045537817327e-05, "loss": 1.0958, "mean_token_accuracy": 0.7051124095916748, "num_tokens": 1610597.0, "step": 210 }, { "entropy": 1.1748668916523457, "epoch": 0.05626891306125787, "grad_norm": 0.289655864238739, "learning_rate": 1.8879874378434965e-05, "loss": 1.1017, "mean_token_accuracy": 0.704343120381236, "num_tokens": 1647358.0, "step": 215 }, { "entropy": 1.2121441032737494, "epoch": 0.057577492434775494, "grad_norm": 0.27067476511001587, "learning_rate": 1.8853703219052607e-05, "loss": 1.0814, "mean_token_accuracy": 0.7023848608136177, "num_tokens": 1683267.0, "step": 220 }, { "entropy": 1.2210756927728652, "epoch": 0.058886071808293125, "grad_norm": 0.2811789810657501, "learning_rate": 1.8827532059670245e-05, "loss": 1.1921, "mean_token_accuracy": 0.6895555172115564, "num_tokens": 1720402.0, "step": 225 }, { "entropy": 1.2133880224078895, "epoch": 0.06019465118181075, "grad_norm": 0.2670990526676178, "learning_rate": 1.8801360900287884e-05, "loss": 1.1452, "mean_token_accuracy": 0.6956219498068095, "num_tokens": 1760874.0, "step": 230 }, { "entropy": 1.1936849866062402, "epoch": 0.06150323055532837, "grad_norm": 0.23748992383480072, "learning_rate": 1.8775189740905526e-05, "loss": 1.1095, "mean_token_accuracy": 0.701577215641737, "num_tokens": 1798253.0, "step": 235 }, { "entropy": 1.1502909563481807, "epoch": 0.062811809928846, "grad_norm": 0.26068389415740967, "learning_rate": 1.8749018581523164e-05, "loss": 1.158, "mean_token_accuracy": 0.7133214943110943, "num_tokens": 1833443.0, "step": 240 }, { "entropy": 1.3078780982643365, "epoch": 0.06412038930236362, "grad_norm": 0.4137086272239685, "learning_rate": 1.8722847422140802e-05, "loss": 1.2878, "mean_token_accuracy": 0.6780555315315724, "num_tokens": 1871910.0, "step": 245 }, { "entropy": 1.2249418403953314, "epoch": 0.06542896867588124, "grad_norm": 0.2942444384098053, "learning_rate": 1.869667626275844e-05, "loss": 1.1637, "mean_token_accuracy": 0.6950553271919488, "num_tokens": 1909524.0, "step": 250 }, { "entropy": 1.211066035553813, "epoch": 0.06673754804939887, "grad_norm": 0.3243419826030731, "learning_rate": 1.8670505103376082e-05, "loss": 1.1516, "mean_token_accuracy": 0.6988872833549976, "num_tokens": 1949325.0, "step": 255 }, { "entropy": 1.1659034360200167, "epoch": 0.06804612742291649, "grad_norm": 0.2724755108356476, "learning_rate": 1.864433394399372e-05, "loss": 1.1041, "mean_token_accuracy": 0.7077850546687842, "num_tokens": 1992509.0, "step": 260 }, { "entropy": 1.1442662578076124, "epoch": 0.06935470679643412, "grad_norm": 0.22777320444583893, "learning_rate": 1.861816278461136e-05, "loss": 1.0722, "mean_token_accuracy": 0.7042363248765469, "num_tokens": 2025452.0, "step": 265 }, { "entropy": 1.1255293242633342, "epoch": 0.07066328616995175, "grad_norm": 0.30172210931777954, "learning_rate": 1.8591991625229e-05, "loss": 1.039, "mean_token_accuracy": 0.7133398558944464, "num_tokens": 2066169.0, "step": 270 }, { "entropy": 1.1561335507780313, "epoch": 0.07197186554346938, "grad_norm": 0.2582629323005676, "learning_rate": 1.856582046584664e-05, "loss": 1.1285, "mean_token_accuracy": 0.6994227156043052, "num_tokens": 2107302.0, "step": 275 }, { "entropy": 1.1572829529643058, "epoch": 0.073280444916987, "grad_norm": 0.3104718029499054, "learning_rate": 1.8539649306464278e-05, "loss": 1.0956, "mean_token_accuracy": 0.706654816865921, "num_tokens": 2143551.0, "step": 280 }, { "entropy": 1.1630447298288344, "epoch": 0.07458902429050462, "grad_norm": 0.23650068044662476, "learning_rate": 1.851347814708192e-05, "loss": 1.1061, "mean_token_accuracy": 0.7114994272589683, "num_tokens": 2184046.0, "step": 285 }, { "entropy": 1.1605928294360637, "epoch": 0.07589760366402225, "grad_norm": 0.2913525104522705, "learning_rate": 1.8487306987699554e-05, "loss": 1.1521, "mean_token_accuracy": 0.7072546981275082, "num_tokens": 2223868.0, "step": 290 }, { "entropy": 1.169267463684082, "epoch": 0.07720618303753987, "grad_norm": 0.2545417547225952, "learning_rate": 1.8461135828317196e-05, "loss": 1.1464, "mean_token_accuracy": 0.7014396741986275, "num_tokens": 2265467.0, "step": 295 }, { "entropy": 1.170828291401267, "epoch": 0.0785147624110575, "grad_norm": 0.3250999450683594, "learning_rate": 1.8434964668934835e-05, "loss": 1.1287, "mean_token_accuracy": 0.7041407331824303, "num_tokens": 2304656.0, "step": 300 }, { "epoch": 0.0785147624110575, "eval_entropy": 1.1109629173278808, "eval_loss": 1.1124978065490723, "eval_mean_token_accuracy": 0.7139446325302125, "eval_num_tokens": 2304656.0, "eval_runtime": 96.7866, "eval_samples_per_second": 10.332, "eval_steps_per_second": 1.292, "step": 300 }, { "entropy": 1.2286480199545622, "epoch": 0.07982334178457512, "grad_norm": 0.2837753891944885, "learning_rate": 1.8408793509552473e-05, "loss": 1.1965, "mean_token_accuracy": 0.6960091460496187, "num_tokens": 2346109.0, "step": 305 }, { "entropy": 1.225469772517681, "epoch": 0.08113192115809274, "grad_norm": 0.3345058262348175, "learning_rate": 1.8382622350170115e-05, "loss": 1.2391, "mean_token_accuracy": 0.6988137848675251, "num_tokens": 2387084.0, "step": 310 }, { "entropy": 1.076605834439397, "epoch": 0.08244050053161037, "grad_norm": 0.2856815755367279, "learning_rate": 1.8356451190787753e-05, "loss": 1.0384, "mean_token_accuracy": 0.7207759529352188, "num_tokens": 2428351.0, "step": 315 }, { "entropy": 1.1501711413264275, "epoch": 0.08374907990512799, "grad_norm": 0.3421330451965332, "learning_rate": 1.833028003140539e-05, "loss": 1.1022, "mean_token_accuracy": 0.7128844160586596, "num_tokens": 2465278.0, "step": 320 }, { "entropy": 1.1183946415781976, "epoch": 0.08505765927864561, "grad_norm": 0.26048743724823, "learning_rate": 1.8304108872023033e-05, "loss": 1.1056, "mean_token_accuracy": 0.7164099890738725, "num_tokens": 2505150.0, "step": 325 }, { "entropy": 1.2478729356080294, "epoch": 0.08636623865216325, "grad_norm": 0.3070579767227173, "learning_rate": 1.827793771264067e-05, "loss": 1.1858, "mean_token_accuracy": 0.6890709776431322, "num_tokens": 2545535.0, "step": 330 }, { "entropy": 1.1905897859483958, "epoch": 0.08767481802568088, "grad_norm": 0.29979345202445984, "learning_rate": 1.825176655325831e-05, "loss": 1.1911, "mean_token_accuracy": 0.6996586941182613, "num_tokens": 2587914.0, "step": 335 }, { "entropy": 1.1719540372490882, "epoch": 0.0889833973991985, "grad_norm": 0.3220307528972626, "learning_rate": 1.822559539387595e-05, "loss": 1.1214, "mean_token_accuracy": 0.7045478235930205, "num_tokens": 2625193.0, "step": 340 }, { "entropy": 1.1586624164134265, "epoch": 0.09029197677271612, "grad_norm": 0.2599794566631317, "learning_rate": 1.819942423449359e-05, "loss": 1.0431, "mean_token_accuracy": 0.7106049194931984, "num_tokens": 2665612.0, "step": 345 }, { "entropy": 1.1819968853145837, "epoch": 0.09160055614623375, "grad_norm": 0.29061323404312134, "learning_rate": 1.817325307511123e-05, "loss": 1.1262, "mean_token_accuracy": 0.7002443138509988, "num_tokens": 2706946.0, "step": 350 }, { "entropy": 1.1458000589162112, "epoch": 0.09290913551975137, "grad_norm": 0.347755491733551, "learning_rate": 1.8147081915728867e-05, "loss": 1.0993, "mean_token_accuracy": 0.7002494022250175, "num_tokens": 2747375.0, "step": 355 }, { "entropy": 1.1966037943959236, "epoch": 0.094217714893269, "grad_norm": 0.37435945868492126, "learning_rate": 1.812091075634651e-05, "loss": 1.1504, "mean_token_accuracy": 0.7029601544141769, "num_tokens": 2790158.0, "step": 360 }, { "entropy": 1.1675037112087012, "epoch": 0.09552629426678662, "grad_norm": 0.32143616676330566, "learning_rate": 1.8094739596964147e-05, "loss": 1.0943, "mean_token_accuracy": 0.7053755812346936, "num_tokens": 2827819.0, "step": 365 }, { "entropy": 1.1707987286150456, "epoch": 0.09683487364030424, "grad_norm": 0.35289084911346436, "learning_rate": 1.8068568437581785e-05, "loss": 1.1038, "mean_token_accuracy": 0.7076830513775348, "num_tokens": 2869307.0, "step": 370 }, { "entropy": 1.1168341800570487, "epoch": 0.09814345301382187, "grad_norm": 0.3782387673854828, "learning_rate": 1.8042397278199427e-05, "loss": 1.0558, "mean_token_accuracy": 0.7170991696417331, "num_tokens": 2914107.0, "step": 375 }, { "entropy": 1.1111165493726731, "epoch": 0.09945203238733949, "grad_norm": 0.3120217025279999, "learning_rate": 1.8016226118817065e-05, "loss": 1.0232, "mean_token_accuracy": 0.7135602042078972, "num_tokens": 2952137.0, "step": 380 }, { "entropy": 1.1221987165510654, "epoch": 0.10076061176085711, "grad_norm": 0.3564794659614563, "learning_rate": 1.7990054959434704e-05, "loss": 1.0421, "mean_token_accuracy": 0.712994397431612, "num_tokens": 2992220.0, "step": 385 }, { "entropy": 1.1915016632527113, "epoch": 0.10206919113437475, "grad_norm": 0.3401761054992676, "learning_rate": 1.7963883800052346e-05, "loss": 1.1218, "mean_token_accuracy": 0.704281483963132, "num_tokens": 3032423.0, "step": 390 }, { "entropy": 1.1808374393731356, "epoch": 0.10337777050789237, "grad_norm": 0.32225707173347473, "learning_rate": 1.7937712640669984e-05, "loss": 1.1646, "mean_token_accuracy": 0.6994639791548252, "num_tokens": 3068229.0, "step": 395 }, { "entropy": 1.1816586170345544, "epoch": 0.10468634988141, "grad_norm": 0.29380306601524353, "learning_rate": 1.7911541481287622e-05, "loss": 1.1252, "mean_token_accuracy": 0.7064340703189373, "num_tokens": 3108153.0, "step": 400 }, { "epoch": 0.10468634988141, "eval_entropy": 1.1162484121322631, "eval_loss": 1.1014440059661865, "eval_mean_token_accuracy": 0.7159673566818238, "eval_num_tokens": 3108153.0, "eval_runtime": 96.8755, "eval_samples_per_second": 10.323, "eval_steps_per_second": 1.29, "step": 400 }, { "entropy": 1.1671059928834437, "epoch": 0.10599492925492762, "grad_norm": 0.34220021963119507, "learning_rate": 1.788537032190526e-05, "loss": 1.0648, "mean_token_accuracy": 0.7134943757206201, "num_tokens": 3144634.0, "step": 405 }, { "entropy": 1.1742517005652189, "epoch": 0.10730350862844525, "grad_norm": 0.3622688949108124, "learning_rate": 1.7859199162522902e-05, "loss": 1.1752, "mean_token_accuracy": 0.7035086948424578, "num_tokens": 3184006.0, "step": 410 }, { "entropy": 1.193211056292057, "epoch": 0.10861208800196287, "grad_norm": 0.29637637734413147, "learning_rate": 1.783302800314054e-05, "loss": 1.1365, "mean_token_accuracy": 0.6987239971756936, "num_tokens": 3224565.0, "step": 415 }, { "entropy": 1.251770207285881, "epoch": 0.1099206673754805, "grad_norm": 0.2511119544506073, "learning_rate": 1.780685684375818e-05, "loss": 1.2718, "mean_token_accuracy": 0.6891444839537144, "num_tokens": 3265562.0, "step": 420 }, { "entropy": 1.23516104221344, "epoch": 0.11122924674899812, "grad_norm": 0.32558682560920715, "learning_rate": 1.778068568437582e-05, "loss": 1.2006, "mean_token_accuracy": 0.6901515744626522, "num_tokens": 3302848.0, "step": 425 }, { "entropy": 1.132848785072565, "epoch": 0.11253782612251574, "grad_norm": 0.26255035400390625, "learning_rate": 1.775451452499346e-05, "loss": 1.0432, "mean_token_accuracy": 0.7118600439280272, "num_tokens": 3342127.0, "step": 430 }, { "entropy": 1.180259444192052, "epoch": 0.11384640549603336, "grad_norm": 0.3205581307411194, "learning_rate": 1.7728343365611098e-05, "loss": 1.1224, "mean_token_accuracy": 0.7004977215081454, "num_tokens": 3381062.0, "step": 435 }, { "entropy": 1.249288512021303, "epoch": 0.11515498486955099, "grad_norm": 0.31854113936424255, "learning_rate": 1.770217220622874e-05, "loss": 1.2151, "mean_token_accuracy": 0.6909528046846389, "num_tokens": 3418674.0, "step": 440 }, { "entropy": 1.1658391878008842, "epoch": 0.11646356424306861, "grad_norm": 0.31040823459625244, "learning_rate": 1.7676001046846374e-05, "loss": 1.1628, "mean_token_accuracy": 0.7044171277433634, "num_tokens": 3459660.0, "step": 445 }, { "entropy": 1.1223286792635918, "epoch": 0.11777214361658625, "grad_norm": 0.387347936630249, "learning_rate": 1.7649829887464016e-05, "loss": 1.0412, "mean_token_accuracy": 0.716064503788948, "num_tokens": 3498595.0, "step": 450 }, { "entropy": 1.1415202051401139, "epoch": 0.11908072299010387, "grad_norm": 0.28736481070518494, "learning_rate": 1.7623658728081658e-05, "loss": 1.1526, "mean_token_accuracy": 0.7126259196549654, "num_tokens": 3538259.0, "step": 455 }, { "entropy": 1.1327776111662389, "epoch": 0.1203893023636215, "grad_norm": 0.2839507460594177, "learning_rate": 1.7597487568699293e-05, "loss": 1.0416, "mean_token_accuracy": 0.7203302428126335, "num_tokens": 3577846.0, "step": 460 }, { "entropy": 1.1565908435732126, "epoch": 0.12169788173713912, "grad_norm": 0.27812105417251587, "learning_rate": 1.7571316409316935e-05, "loss": 1.1039, "mean_token_accuracy": 0.7012306869029998, "num_tokens": 3621788.0, "step": 465 }, { "entropy": 1.2063092295080424, "epoch": 0.12300646111065675, "grad_norm": 0.38487356901168823, "learning_rate": 1.7545145249934573e-05, "loss": 1.1509, "mean_token_accuracy": 0.7023797459900379, "num_tokens": 3658439.0, "step": 470 }, { "entropy": 1.0865493860095738, "epoch": 0.12431504048417437, "grad_norm": 0.2500401735305786, "learning_rate": 1.751897409055221e-05, "loss": 1.003, "mean_token_accuracy": 0.7198965962976217, "num_tokens": 3698868.0, "step": 475 }, { "entropy": 1.1908767573535441, "epoch": 0.125623619857692, "grad_norm": 0.3962502181529999, "learning_rate": 1.7492802931169853e-05, "loss": 1.1297, "mean_token_accuracy": 0.7053800851106644, "num_tokens": 3741081.0, "step": 480 }, { "entropy": 1.1331688780337572, "epoch": 0.12693219923120963, "grad_norm": 0.2883553206920624, "learning_rate": 1.746663177178749e-05, "loss": 1.0536, "mean_token_accuracy": 0.7061506755650043, "num_tokens": 3780895.0, "step": 485 }, { "entropy": 1.097454984486103, "epoch": 0.12824077860472724, "grad_norm": 0.3009795844554901, "learning_rate": 1.744046061240513e-05, "loss": 1.0696, "mean_token_accuracy": 0.7190258864313364, "num_tokens": 3821605.0, "step": 490 }, { "entropy": 1.2135265406221152, "epoch": 0.12954935797824488, "grad_norm": 0.25389495491981506, "learning_rate": 1.741428945302277e-05, "loss": 1.2089, "mean_token_accuracy": 0.693002799153328, "num_tokens": 3861658.0, "step": 495 }, { "entropy": 1.2169387113302945, "epoch": 0.1308579373517625, "grad_norm": 0.2983970642089844, "learning_rate": 1.738811829364041e-05, "loss": 1.1668, "mean_token_accuracy": 0.7012713517993688, "num_tokens": 3903328.0, "step": 500 }, { "epoch": 0.1308579373517625, "eval_entropy": 1.0979133324623107, "eval_loss": 1.0918148756027222, "eval_mean_token_accuracy": 0.7178149290084839, "eval_num_tokens": 3903328.0, "eval_runtime": 96.8747, "eval_samples_per_second": 10.323, "eval_steps_per_second": 1.29, "step": 500 }, { "entropy": 1.1749178305268289, "epoch": 0.13216651672528013, "grad_norm": 0.38501033186912537, "learning_rate": 1.736194713425805e-05, "loss": 1.1413, "mean_token_accuracy": 0.707249428331852, "num_tokens": 3944540.0, "step": 505 }, { "entropy": 1.1787702813744545, "epoch": 0.13347509609879774, "grad_norm": 0.43281009793281555, "learning_rate": 1.7335775974875687e-05, "loss": 1.1217, "mean_token_accuracy": 0.7034583434462547, "num_tokens": 3980926.0, "step": 510 }, { "entropy": 1.0645542204380036, "epoch": 0.13478367547231537, "grad_norm": 0.3150987923145294, "learning_rate": 1.730960481549333e-05, "loss": 1.05, "mean_token_accuracy": 0.7132887698709964, "num_tokens": 4021891.0, "step": 515 }, { "entropy": 1.1106295462697744, "epoch": 0.13609225484583298, "grad_norm": 0.3438737094402313, "learning_rate": 1.7283433656110967e-05, "loss": 1.0003, "mean_token_accuracy": 0.7212319687008858, "num_tokens": 4060166.0, "step": 520 }, { "entropy": 1.1168012116104364, "epoch": 0.13740083421935062, "grad_norm": 0.3113572895526886, "learning_rate": 1.7257262496728605e-05, "loss": 1.0543, "mean_token_accuracy": 0.7209181990474463, "num_tokens": 4094716.0, "step": 525 }, { "entropy": 1.175467026978731, "epoch": 0.13870941359286823, "grad_norm": 0.302837073802948, "learning_rate": 1.7231091337346247e-05, "loss": 1.1289, "mean_token_accuracy": 0.7042849272489548, "num_tokens": 4141339.0, "step": 530 }, { "entropy": 1.0599542949348688, "epoch": 0.14001799296638587, "grad_norm": 0.2495729923248291, "learning_rate": 1.7204920177963885e-05, "loss": 1.052, "mean_token_accuracy": 0.7274259474128485, "num_tokens": 4185050.0, "step": 535 }, { "entropy": 1.1991025242954492, "epoch": 0.1413265723399035, "grad_norm": 0.4198792576789856, "learning_rate": 1.7178749018581524e-05, "loss": 1.1902, "mean_token_accuracy": 0.6947382442653179, "num_tokens": 4223794.0, "step": 540 }, { "entropy": 1.227722565829754, "epoch": 0.14263515171342112, "grad_norm": 0.3961770236492157, "learning_rate": 1.7152577859199166e-05, "loss": 1.1986, "mean_token_accuracy": 0.6865826688706875, "num_tokens": 4255882.0, "step": 545 }, { "entropy": 1.1813674479722978, "epoch": 0.14394373108693875, "grad_norm": 0.33829399943351746, "learning_rate": 1.7126406699816804e-05, "loss": 1.1159, "mean_token_accuracy": 0.7054100755602122, "num_tokens": 4293931.0, "step": 550 }, { "entropy": 1.1598187141120433, "epoch": 0.14525231046045636, "grad_norm": 0.29768019914627075, "learning_rate": 1.7100235540434442e-05, "loss": 1.1149, "mean_token_accuracy": 0.7081062331795692, "num_tokens": 4331281.0, "step": 555 }, { "entropy": 1.1658945206552744, "epoch": 0.146560889833974, "grad_norm": 0.47267428040504456, "learning_rate": 1.707406438105208e-05, "loss": 1.126, "mean_token_accuracy": 0.7110950201749802, "num_tokens": 4367666.0, "step": 560 }, { "entropy": 1.107292691990733, "epoch": 0.1478694692074916, "grad_norm": 0.30985575914382935, "learning_rate": 1.7047893221669722e-05, "loss": 1.0491, "mean_token_accuracy": 0.7133530873805285, "num_tokens": 4404165.0, "step": 565 }, { "entropy": 1.0999628335237503, "epoch": 0.14917804858100925, "grad_norm": 0.38928133249282837, "learning_rate": 1.702172206228736e-05, "loss": 1.0697, "mean_token_accuracy": 0.7206672951579094, "num_tokens": 4444145.0, "step": 570 }, { "entropy": 1.0466942593455315, "epoch": 0.15048662795452686, "grad_norm": 0.3638196289539337, "learning_rate": 1.6995550902905e-05, "loss": 0.9879, "mean_token_accuracy": 0.7333550229668617, "num_tokens": 4486612.0, "step": 575 }, { "entropy": 1.1943504758179189, "epoch": 0.1517952073280445, "grad_norm": 0.35094064474105835, "learning_rate": 1.696937974352264e-05, "loss": 1.173, "mean_token_accuracy": 0.7010251984000206, "num_tokens": 4527971.0, "step": 580 }, { "entropy": 1.1672972977161407, "epoch": 0.1531037867015621, "grad_norm": 0.3714410662651062, "learning_rate": 1.694320858414028e-05, "loss": 1.1864, "mean_token_accuracy": 0.6985016442835331, "num_tokens": 4565626.0, "step": 585 }, { "entropy": 1.183141415938735, "epoch": 0.15441236607507974, "grad_norm": 0.3710881173610687, "learning_rate": 1.6917037424757918e-05, "loss": 1.1097, "mean_token_accuracy": 0.709296465292573, "num_tokens": 4601255.0, "step": 590 }, { "entropy": 1.1613754861056804, "epoch": 0.15572094544859735, "grad_norm": 0.4122501015663147, "learning_rate": 1.689086626537556e-05, "loss": 1.0919, "mean_token_accuracy": 0.7126895777881146, "num_tokens": 4635025.0, "step": 595 }, { "entropy": 1.1225487384945154, "epoch": 0.157029524822115, "grad_norm": 0.260109007358551, "learning_rate": 1.6864695105993198e-05, "loss": 1.074, "mean_token_accuracy": 0.71206672526896, "num_tokens": 4675929.0, "step": 600 }, { "epoch": 0.157029524822115, "eval_entropy": 1.078299481868744, "eval_loss": 1.0839170217514038, "eval_mean_token_accuracy": 0.7196547708511353, "eval_num_tokens": 4675929.0, "eval_runtime": 96.9266, "eval_samples_per_second": 10.317, "eval_steps_per_second": 1.29, "step": 600 }, { "entropy": 1.214971611648798, "epoch": 0.15833810419563263, "grad_norm": 0.4801560044288635, "learning_rate": 1.6838523946610836e-05, "loss": 1.1404, "mean_token_accuracy": 0.6989750042557716, "num_tokens": 4714005.0, "step": 605 }, { "entropy": 1.1845928117632867, "epoch": 0.15964668356915024, "grad_norm": 0.30919569730758667, "learning_rate": 1.6812352787228478e-05, "loss": 1.2036, "mean_token_accuracy": 0.698324055224657, "num_tokens": 4755777.0, "step": 610 }, { "entropy": 1.0673103269189597, "epoch": 0.16095526294266788, "grad_norm": 0.2788703143596649, "learning_rate": 1.6786181627846113e-05, "loss": 0.9835, "mean_token_accuracy": 0.719642236456275, "num_tokens": 4795809.0, "step": 615 }, { "entropy": 1.1469231896102428, "epoch": 0.16226384231618549, "grad_norm": 0.3465251624584198, "learning_rate": 1.6760010468463755e-05, "loss": 1.0888, "mean_token_accuracy": 0.7080798294395209, "num_tokens": 4837156.0, "step": 620 }, { "entropy": 1.1547171102836729, "epoch": 0.16357242168970312, "grad_norm": 0.25486519932746887, "learning_rate": 1.6733839309081393e-05, "loss": 1.0928, "mean_token_accuracy": 0.7026465550065041, "num_tokens": 4877484.0, "step": 625 }, { "entropy": 1.1483009189367295, "epoch": 0.16488100106322073, "grad_norm": 0.2877664566040039, "learning_rate": 1.670766814969903e-05, "loss": 1.0912, "mean_token_accuracy": 0.7045180670917034, "num_tokens": 4916567.0, "step": 630 }, { "entropy": 1.057371946424246, "epoch": 0.16618958043673837, "grad_norm": 0.29588788747787476, "learning_rate": 1.6681496990316673e-05, "loss": 1.0147, "mean_token_accuracy": 0.733672034740448, "num_tokens": 4955375.0, "step": 635 }, { "entropy": 1.105701708048582, "epoch": 0.16749815981025598, "grad_norm": 0.305503785610199, "learning_rate": 1.665532583093431e-05, "loss": 1.0907, "mean_token_accuracy": 0.718507794290781, "num_tokens": 4994274.0, "step": 640 }, { "entropy": 1.1482198983430862, "epoch": 0.16880673918377362, "grad_norm": 0.34045520424842834, "learning_rate": 1.662915467155195e-05, "loss": 1.0737, "mean_token_accuracy": 0.7120114967226983, "num_tokens": 5028739.0, "step": 645 }, { "entropy": 1.1510255340486766, "epoch": 0.17011531855729123, "grad_norm": 0.3617549240589142, "learning_rate": 1.660298351216959e-05, "loss": 1.0989, "mean_token_accuracy": 0.7049180768430233, "num_tokens": 5067269.0, "step": 650 }, { "entropy": 1.1354583825916051, "epoch": 0.17142389793080887, "grad_norm": 0.2900081276893616, "learning_rate": 1.657681235278723e-05, "loss": 1.07, "mean_token_accuracy": 0.7162535794079303, "num_tokens": 5105740.0, "step": 655 }, { "entropy": 1.1124658603221178, "epoch": 0.1727324773043265, "grad_norm": 0.31473371386528015, "learning_rate": 1.655064119340487e-05, "loss": 1.0722, "mean_token_accuracy": 0.7151258502155542, "num_tokens": 5144288.0, "step": 660 }, { "entropy": 1.1872437849640847, "epoch": 0.1740410566778441, "grad_norm": 0.33180883526802063, "learning_rate": 1.6524470034022507e-05, "loss": 1.1771, "mean_token_accuracy": 0.6998372994363308, "num_tokens": 5182259.0, "step": 665 }, { "entropy": 1.140797757729888, "epoch": 0.17534963605136175, "grad_norm": 0.3464796245098114, "learning_rate": 1.649829887464015e-05, "loss": 1.088, "mean_token_accuracy": 0.7082854315638543, "num_tokens": 5221128.0, "step": 670 }, { "entropy": 1.1274819798767566, "epoch": 0.17665821542487936, "grad_norm": 0.3252648711204529, "learning_rate": 1.6472127715257787e-05, "loss": 1.0973, "mean_token_accuracy": 0.7148232467472553, "num_tokens": 5256205.0, "step": 675 }, { "entropy": 1.1311011631041765, "epoch": 0.177966794798397, "grad_norm": 0.6462693214416504, "learning_rate": 1.6445956555875425e-05, "loss": 1.0543, "mean_token_accuracy": 0.7152682617306709, "num_tokens": 5292320.0, "step": 680 }, { "entropy": 1.0446813367307186, "epoch": 0.1792753741719146, "grad_norm": 0.28639963269233704, "learning_rate": 1.6419785396493067e-05, "loss": 0.9461, "mean_token_accuracy": 0.7340242311358451, "num_tokens": 5334639.0, "step": 685 }, { "entropy": 1.1492939487099647, "epoch": 0.18058395354543225, "grad_norm": 0.34751448035240173, "learning_rate": 1.6393614237110705e-05, "loss": 1.0876, "mean_token_accuracy": 0.71434987783432, "num_tokens": 5371003.0, "step": 690 }, { "entropy": 1.1387122191488743, "epoch": 0.18189253291894986, "grad_norm": 0.32928481698036194, "learning_rate": 1.6367443077728344e-05, "loss": 1.1427, "mean_token_accuracy": 0.7110460348427295, "num_tokens": 5408983.0, "step": 695 }, { "entropy": 1.202421073615551, "epoch": 0.1832011122924675, "grad_norm": 0.36161890625953674, "learning_rate": 1.6341271918345986e-05, "loss": 1.17, "mean_token_accuracy": 0.6983021099120379, "num_tokens": 5449270.0, "step": 700 }, { "epoch": 0.1832011122924675, "eval_entropy": 1.0868224625587464, "eval_loss": 1.078864336013794, "eval_mean_token_accuracy": 0.7203193211555481, "eval_num_tokens": 5449270.0, "eval_runtime": 96.8653, "eval_samples_per_second": 10.324, "eval_steps_per_second": 1.29, "step": 700 }, { "entropy": 1.139945473894477, "epoch": 0.1845096916659851, "grad_norm": 0.34418225288391113, "learning_rate": 1.6315100758963624e-05, "loss": 1.0577, "mean_token_accuracy": 0.7123764835298061, "num_tokens": 5486900.0, "step": 705 }, { "entropy": 1.2072007820010184, "epoch": 0.18581827103950274, "grad_norm": 0.2719118595123291, "learning_rate": 1.6288929599581262e-05, "loss": 1.165, "mean_token_accuracy": 0.6938376247882843, "num_tokens": 5526941.0, "step": 710 }, { "entropy": 1.254874274134636, "epoch": 0.18712685041302035, "grad_norm": 0.42757540941238403, "learning_rate": 1.6262758440198904e-05, "loss": 1.2349, "mean_token_accuracy": 0.6856517255306244, "num_tokens": 5563702.0, "step": 715 }, { "entropy": 1.0055264480412007, "epoch": 0.188435429786538, "grad_norm": 0.3122069537639618, "learning_rate": 1.6236587280816542e-05, "loss": 0.9459, "mean_token_accuracy": 0.7404189445078373, "num_tokens": 5607678.0, "step": 720 }, { "entropy": 1.1009038139134646, "epoch": 0.18974400916005563, "grad_norm": 0.31834590435028076, "learning_rate": 1.621041612143418e-05, "loss": 1.0656, "mean_token_accuracy": 0.7191543571650982, "num_tokens": 5647310.0, "step": 725 }, { "entropy": 1.0673759788274766, "epoch": 0.19105258853357324, "grad_norm": 0.2675130069255829, "learning_rate": 1.618424496205182e-05, "loss": 1.0109, "mean_token_accuracy": 0.7255190499126911, "num_tokens": 5685639.0, "step": 730 }, { "entropy": 1.1511727664619684, "epoch": 0.19236116790709087, "grad_norm": 0.37566685676574707, "learning_rate": 1.615807380266946e-05, "loss": 1.0625, "mean_token_accuracy": 0.7034743718802929, "num_tokens": 5720880.0, "step": 735 }, { "entropy": 1.0888585902750492, "epoch": 0.19366974728060848, "grad_norm": 0.25365257263183594, "learning_rate": 1.61319026432871e-05, "loss": 1.049, "mean_token_accuracy": 0.7239378560334444, "num_tokens": 5762559.0, "step": 740 }, { "entropy": 1.1175981510430575, "epoch": 0.19497832665412612, "grad_norm": 0.43423643708229065, "learning_rate": 1.6105731483904738e-05, "loss": 1.0806, "mean_token_accuracy": 0.7193359076976776, "num_tokens": 5803927.0, "step": 745 }, { "entropy": 1.103783832490444, "epoch": 0.19628690602764373, "grad_norm": 0.38531434535980225, "learning_rate": 1.607956032452238e-05, "loss": 1.0503, "mean_token_accuracy": 0.722858403250575, "num_tokens": 5845809.0, "step": 750 }, { "entropy": 1.1102315738797188, "epoch": 0.19759548540116137, "grad_norm": 0.3240512013435364, "learning_rate": 1.6053389165140018e-05, "loss": 1.074, "mean_token_accuracy": 0.7143935695290565, "num_tokens": 5884671.0, "step": 755 }, { "entropy": 1.1394222188740968, "epoch": 0.19890406477467898, "grad_norm": 0.35714927315711975, "learning_rate": 1.6027218005757656e-05, "loss": 1.0505, "mean_token_accuracy": 0.7142257861793041, "num_tokens": 5923161.0, "step": 760 }, { "entropy": 1.10749419555068, "epoch": 0.20021264414819662, "grad_norm": 0.28041279315948486, "learning_rate": 1.6001046846375298e-05, "loss": 1.0247, "mean_token_accuracy": 0.7158019613474608, "num_tokens": 5964592.0, "step": 765 }, { "entropy": 1.1163969319313765, "epoch": 0.20152122352171423, "grad_norm": 0.3620986342430115, "learning_rate": 1.5974875686992933e-05, "loss": 1.1243, "mean_token_accuracy": 0.7199557162821293, "num_tokens": 6002121.0, "step": 770 }, { "entropy": 1.1485939621925354, "epoch": 0.20282980289523186, "grad_norm": 0.28997066617012024, "learning_rate": 1.5948704527610575e-05, "loss": 1.1103, "mean_token_accuracy": 0.7058781541883945, "num_tokens": 6042026.0, "step": 775 }, { "entropy": 1.163365462049842, "epoch": 0.2041383822687495, "grad_norm": 0.3916630148887634, "learning_rate": 1.5922533368228213e-05, "loss": 1.1024, "mean_token_accuracy": 0.7097311306744813, "num_tokens": 6081711.0, "step": 780 }, { "entropy": 1.1414872355759145, "epoch": 0.2054469616422671, "grad_norm": 0.3654806315898895, "learning_rate": 1.589636220884585e-05, "loss": 1.1151, "mean_token_accuracy": 0.7132655665278435, "num_tokens": 6124022.0, "step": 785 }, { "entropy": 1.1237910758703946, "epoch": 0.20675554101578475, "grad_norm": 0.30389606952667236, "learning_rate": 1.5870191049463493e-05, "loss": 1.071, "mean_token_accuracy": 0.7095677696168423, "num_tokens": 6163807.0, "step": 790 }, { "entropy": 1.108309706300497, "epoch": 0.20806412038930236, "grad_norm": 0.3702613413333893, "learning_rate": 1.584401989008113e-05, "loss": 1.0634, "mean_token_accuracy": 0.7195238072425127, "num_tokens": 6201204.0, "step": 795 }, { "entropy": 1.114043417200446, "epoch": 0.20937269976282, "grad_norm": 0.35386136174201965, "learning_rate": 1.581784873069877e-05, "loss": 1.0579, "mean_token_accuracy": 0.7175327345728875, "num_tokens": 6241249.0, "step": 800 }, { "epoch": 0.20937269976282, "eval_entropy": 1.0886374835968018, "eval_loss": 1.0736411809921265, "eval_mean_token_accuracy": 0.7216972298622132, "eval_num_tokens": 6241249.0, "eval_runtime": 96.8086, "eval_samples_per_second": 10.33, "eval_steps_per_second": 1.291, "step": 800 }, { "entropy": 1.1533120274543762, "epoch": 0.2106812791363376, "grad_norm": 0.37774205207824707, "learning_rate": 1.579167757131641e-05, "loss": 1.1071, "mean_token_accuracy": 0.7110585495829582, "num_tokens": 6280255.0, "step": 805 }, { "entropy": 1.1299995694309473, "epoch": 0.21198985850985524, "grad_norm": 0.3383270502090454, "learning_rate": 1.576550641193405e-05, "loss": 1.1167, "mean_token_accuracy": 0.7196012400090694, "num_tokens": 6320822.0, "step": 810 }, { "entropy": 1.132519034296274, "epoch": 0.21329843788337285, "grad_norm": 0.4295906126499176, "learning_rate": 1.573933525255169e-05, "loss": 1.0924, "mean_token_accuracy": 0.7042114794254303, "num_tokens": 6356129.0, "step": 815 }, { "entropy": 1.0832153180614115, "epoch": 0.2146070172568905, "grad_norm": 0.3158726394176483, "learning_rate": 1.571316409316933e-05, "loss": 1.0307, "mean_token_accuracy": 0.7189163245260716, "num_tokens": 6396277.0, "step": 820 }, { "entropy": 1.1116347532719373, "epoch": 0.2159155966304081, "grad_norm": 0.41325512528419495, "learning_rate": 1.568699293378697e-05, "loss": 1.0844, "mean_token_accuracy": 0.7163894921541214, "num_tokens": 6437087.0, "step": 825 }, { "entropy": 1.143670390546322, "epoch": 0.21722417600392574, "grad_norm": 0.4080452620983124, "learning_rate": 1.5660821774404607e-05, "loss": 1.1135, "mean_token_accuracy": 0.7132163152098656, "num_tokens": 6476092.0, "step": 830 }, { "entropy": 1.0523447673767805, "epoch": 0.21853275537744335, "grad_norm": 0.4133390784263611, "learning_rate": 1.5634650615022245e-05, "loss": 0.9998, "mean_token_accuracy": 0.7262923561036587, "num_tokens": 6513582.0, "step": 835 }, { "entropy": 1.1258170191198587, "epoch": 0.219841334750961, "grad_norm": 0.30965185165405273, "learning_rate": 1.5608479455639887e-05, "loss": 1.0456, "mean_token_accuracy": 0.7124281242489815, "num_tokens": 6548126.0, "step": 840 }, { "entropy": 0.9893411479890346, "epoch": 0.22114991412447862, "grad_norm": 0.2973790168762207, "learning_rate": 1.5582308296257525e-05, "loss": 0.9799, "mean_token_accuracy": 0.7382751934230327, "num_tokens": 6586710.0, "step": 845 }, { "entropy": 1.0379876332357525, "epoch": 0.22245849349799623, "grad_norm": 0.3356630206108093, "learning_rate": 1.5556137136875164e-05, "loss": 0.996, "mean_token_accuracy": 0.7286129504442215, "num_tokens": 6619347.0, "step": 850 }, { "entropy": 1.1024742640554905, "epoch": 0.22376707287151387, "grad_norm": 0.2636205554008484, "learning_rate": 1.5529965977492806e-05, "loss": 1.0735, "mean_token_accuracy": 0.7184439640492201, "num_tokens": 6659706.0, "step": 855 }, { "entropy": 1.0713739044964314, "epoch": 0.22507565224503148, "grad_norm": 0.31522178649902344, "learning_rate": 1.5503794818110444e-05, "loss": 1.0024, "mean_token_accuracy": 0.7307705961167812, "num_tokens": 6697262.0, "step": 860 }, { "entropy": 1.104831437766552, "epoch": 0.22638423161854912, "grad_norm": 0.4077572822570801, "learning_rate": 1.5477623658728082e-05, "loss": 1.0503, "mean_token_accuracy": 0.714413607865572, "num_tokens": 6736296.0, "step": 865 }, { "entropy": 1.111717356555164, "epoch": 0.22769281099206673, "grad_norm": 0.29983147978782654, "learning_rate": 1.5451452499345724e-05, "loss": 1.0415, "mean_token_accuracy": 0.7202403925359249, "num_tokens": 6771577.0, "step": 870 }, { "entropy": 1.0804973103106021, "epoch": 0.22900139036558437, "grad_norm": 0.3723820149898529, "learning_rate": 1.5425281339963362e-05, "loss": 1.0643, "mean_token_accuracy": 0.7192892145365477, "num_tokens": 6809238.0, "step": 875 }, { "entropy": 1.1236145053058864, "epoch": 0.23030996973910198, "grad_norm": 0.3762100338935852, "learning_rate": 1.5399110180581e-05, "loss": 1.1046, "mean_token_accuracy": 0.7134812079370022, "num_tokens": 6849361.0, "step": 880 }, { "entropy": 1.1618900313973426, "epoch": 0.23161854911261961, "grad_norm": 0.30742284655570984, "learning_rate": 1.537293902119864e-05, "loss": 1.1275, "mean_token_accuracy": 0.7040756050497293, "num_tokens": 6888648.0, "step": 885 }, { "entropy": 1.0937749415636062, "epoch": 0.23292712848613722, "grad_norm": 0.3493000864982605, "learning_rate": 1.534676786181628e-05, "loss": 0.9861, "mean_token_accuracy": 0.7299770966172219, "num_tokens": 6930330.0, "step": 890 }, { "entropy": 1.1693835414946079, "epoch": 0.23423570785965486, "grad_norm": 0.4292133152484894, "learning_rate": 1.532059670243392e-05, "loss": 1.1322, "mean_token_accuracy": 0.7005651116371154, "num_tokens": 6967178.0, "step": 895 }, { "entropy": 1.1351615231484176, "epoch": 0.2355442872331725, "grad_norm": 0.375841349363327, "learning_rate": 1.5294425543051558e-05, "loss": 1.0663, "mean_token_accuracy": 0.7153576008975506, "num_tokens": 7004358.0, "step": 900 }, { "epoch": 0.2355442872331725, "eval_entropy": 1.0517715344429015, "eval_loss": 1.0706533193588257, "eval_mean_token_accuracy": 0.7221157221794129, "eval_num_tokens": 7004358.0, "eval_runtime": 97.0639, "eval_samples_per_second": 10.302, "eval_steps_per_second": 1.288, "step": 900 }, { "entropy": 1.0132040306925774, "epoch": 0.2368528666066901, "grad_norm": 0.3356113135814667, "learning_rate": 1.52682543836692e-05, "loss": 1.0007, "mean_token_accuracy": 0.7337090380489826, "num_tokens": 7045486.0, "step": 905 }, { "entropy": 1.0505661655217409, "epoch": 0.23816144598020775, "grad_norm": 0.3069000244140625, "learning_rate": 1.5242083224286836e-05, "loss": 1.0113, "mean_token_accuracy": 0.7300900310277939, "num_tokens": 7087578.0, "step": 910 }, { "entropy": 1.0948154201731086, "epoch": 0.23947002535372536, "grad_norm": 0.26865532994270325, "learning_rate": 1.5215912064904476e-05, "loss": 0.996, "mean_token_accuracy": 0.7241026997566223, "num_tokens": 7125621.0, "step": 915 }, { "entropy": 1.0862870909273625, "epoch": 0.240778604727243, "grad_norm": 0.3076420724391937, "learning_rate": 1.5189740905522116e-05, "loss": 1.0411, "mean_token_accuracy": 0.7218753002583981, "num_tokens": 7163248.0, "step": 920 }, { "entropy": 1.0920290663838386, "epoch": 0.2420871841007606, "grad_norm": 0.35735777020454407, "learning_rate": 1.5163569746139755e-05, "loss": 0.9907, "mean_token_accuracy": 0.7253928408026695, "num_tokens": 7198313.0, "step": 925 }, { "entropy": 1.0561616513878107, "epoch": 0.24339576347427824, "grad_norm": 0.3752039670944214, "learning_rate": 1.5137398586757395e-05, "loss": 1.0076, "mean_token_accuracy": 0.7270361024886369, "num_tokens": 7236629.0, "step": 930 }, { "entropy": 1.192596261948347, "epoch": 0.24470434284779585, "grad_norm": 0.3303165137767792, "learning_rate": 1.5111227427375035e-05, "loss": 1.1446, "mean_token_accuracy": 0.7021023813635111, "num_tokens": 7278903.0, "step": 935 }, { "entropy": 1.1245649460703135, "epoch": 0.2460129222213135, "grad_norm": 0.43512144684791565, "learning_rate": 1.5085056267992673e-05, "loss": 1.0783, "mean_token_accuracy": 0.7149506479501724, "num_tokens": 7314092.0, "step": 940 }, { "entropy": 1.1568658344447613, "epoch": 0.2473215015948311, "grad_norm": 0.2600148320198059, "learning_rate": 1.5058885108610313e-05, "loss": 1.1493, "mean_token_accuracy": 0.7064739305526018, "num_tokens": 7354852.0, "step": 945 }, { "entropy": 1.1390374675393105, "epoch": 0.24863008096834874, "grad_norm": 0.4230680465698242, "learning_rate": 1.5032713949227953e-05, "loss": 1.051, "mean_token_accuracy": 0.7147066026926041, "num_tokens": 7391988.0, "step": 950 }, { "entropy": 1.1157769251614809, "epoch": 0.24993866034186635, "grad_norm": 0.31027132272720337, "learning_rate": 1.500654278984559e-05, "loss": 1.1055, "mean_token_accuracy": 0.7114299058914184, "num_tokens": 7430698.0, "step": 955 }, { "entropy": 1.1084700468927622, "epoch": 0.251247239715384, "grad_norm": 0.3883765935897827, "learning_rate": 1.498037163046323e-05, "loss": 1.0371, "mean_token_accuracy": 0.7152060814201832, "num_tokens": 7466746.0, "step": 960 }, { "entropy": 1.1067252047359943, "epoch": 0.2525558190889016, "grad_norm": 0.3586679697036743, "learning_rate": 1.495420047108087e-05, "loss": 1.091, "mean_token_accuracy": 0.7141210325062275, "num_tokens": 7508827.0, "step": 965 }, { "entropy": 1.1788568448275327, "epoch": 0.25386439846241926, "grad_norm": 0.28028714656829834, "learning_rate": 1.4928029311698508e-05, "loss": 1.1108, "mean_token_accuracy": 0.7068845115602016, "num_tokens": 7549412.0, "step": 970 }, { "entropy": 1.1229758808389305, "epoch": 0.25517297783593684, "grad_norm": 0.25545772910118103, "learning_rate": 1.4901858152316149e-05, "loss": 1.0713, "mean_token_accuracy": 0.7074509769678116, "num_tokens": 7587152.0, "step": 975 }, { "entropy": 1.0951409205794334, "epoch": 0.2564815572094545, "grad_norm": 0.4516986608505249, "learning_rate": 1.4875686992933789e-05, "loss": 0.9969, "mean_token_accuracy": 0.723530513048172, "num_tokens": 7623760.0, "step": 980 }, { "entropy": 1.1083605211228131, "epoch": 0.2577901365829721, "grad_norm": 0.28635019063949585, "learning_rate": 1.4849515833551427e-05, "loss": 1.0367, "mean_token_accuracy": 0.7231942892074585, "num_tokens": 7670277.0, "step": 985 }, { "entropy": 1.140053043141961, "epoch": 0.25909871595648976, "grad_norm": 0.29921141266822815, "learning_rate": 1.4823344674169067e-05, "loss": 1.0606, "mean_token_accuracy": 0.7078870553523302, "num_tokens": 7711405.0, "step": 990 }, { "entropy": 1.1648910887539388, "epoch": 0.26040729533000734, "grad_norm": 0.44739824533462524, "learning_rate": 1.4797173514786707e-05, "loss": 1.0833, "mean_token_accuracy": 0.7147607050836087, "num_tokens": 7745597.0, "step": 995 }, { "entropy": 1.1015649508684873, "epoch": 0.261715874703525, "grad_norm": 0.3616364598274231, "learning_rate": 1.4771002355404345e-05, "loss": 1.102, "mean_token_accuracy": 0.7119447905570269, "num_tokens": 7778271.0, "step": 1000 }, { "epoch": 0.261715874703525, "eval_entropy": 1.0673753538131714, "eval_loss": 1.0669485330581665, "eval_mean_token_accuracy": 0.7229559454917908, "eval_num_tokens": 7778271.0, "eval_runtime": 96.9367, "eval_samples_per_second": 10.316, "eval_steps_per_second": 1.29, "step": 1000 }, { "entropy": 1.1813726715743542, "epoch": 0.2630244540770426, "grad_norm": 0.34745144844055176, "learning_rate": 1.4744831196021986e-05, "loss": 1.1606, "mean_token_accuracy": 0.7018857698887586, "num_tokens": 7820880.0, "step": 1005 }, { "entropy": 1.1542737431824208, "epoch": 0.26433303345056025, "grad_norm": 0.37115049362182617, "learning_rate": 1.4718660036639626e-05, "loss": 1.102, "mean_token_accuracy": 0.7048508331179619, "num_tokens": 7858918.0, "step": 1010 }, { "entropy": 1.0566180381923913, "epoch": 0.26564161282407783, "grad_norm": 0.3299511671066284, "learning_rate": 1.4692488877257262e-05, "loss": 0.9832, "mean_token_accuracy": 0.7326587349176407, "num_tokens": 7896026.0, "step": 1015 }, { "entropy": 1.131032995507121, "epoch": 0.26695019219759547, "grad_norm": 0.3021943271160126, "learning_rate": 1.4666317717874902e-05, "loss": 1.1154, "mean_token_accuracy": 0.7120835833251477, "num_tokens": 7936716.0, "step": 1020 }, { "entropy": 1.1042870879173279, "epoch": 0.2682587715711131, "grad_norm": 0.47742941975593567, "learning_rate": 1.4640146558492542e-05, "loss": 1.047, "mean_token_accuracy": 0.7116286411881447, "num_tokens": 7973759.0, "step": 1025 }, { "entropy": 1.1014587018638848, "epoch": 0.26956735094463075, "grad_norm": 0.36218902468681335, "learning_rate": 1.461397539911018e-05, "loss": 1.0494, "mean_token_accuracy": 0.7173497840762139, "num_tokens": 8012730.0, "step": 1030 }, { "entropy": 1.1734120152890681, "epoch": 0.2708759303181484, "grad_norm": 0.38173866271972656, "learning_rate": 1.458780423972782e-05, "loss": 1.0801, "mean_token_accuracy": 0.7082495357841253, "num_tokens": 8053786.0, "step": 1035 }, { "entropy": 1.102143411897123, "epoch": 0.27218450969166597, "grad_norm": 0.32196053862571716, "learning_rate": 1.4561633080345461e-05, "loss": 1.0723, "mean_token_accuracy": 0.7233434859663248, "num_tokens": 8092132.0, "step": 1040 }, { "entropy": 1.0193447671830653, "epoch": 0.2734930890651836, "grad_norm": 0.3907255530357361, "learning_rate": 1.4535461920963101e-05, "loss": 0.9895, "mean_token_accuracy": 0.7300974868237973, "num_tokens": 8131337.0, "step": 1045 }, { "entropy": 1.105736630409956, "epoch": 0.27480166843870124, "grad_norm": 0.564200758934021, "learning_rate": 1.450929076158074e-05, "loss": 1.0985, "mean_token_accuracy": 0.7164596065878868, "num_tokens": 8170291.0, "step": 1050 }, { "entropy": 1.0954801928251983, "epoch": 0.2761102478122189, "grad_norm": 0.3417541980743408, "learning_rate": 1.448311960219838e-05, "loss": 1.0295, "mean_token_accuracy": 0.7208357889205217, "num_tokens": 8205888.0, "step": 1055 }, { "entropy": 1.164321320876479, "epoch": 0.27741882718573646, "grad_norm": 0.450543075799942, "learning_rate": 1.445694844281602e-05, "loss": 1.0975, "mean_token_accuracy": 0.7097847059369087, "num_tokens": 8242915.0, "step": 1060 }, { "entropy": 1.1172142535448075, "epoch": 0.2787274065592541, "grad_norm": 0.3983106315135956, "learning_rate": 1.4430777283433656e-05, "loss": 1.0618, "mean_token_accuracy": 0.7233660608530045, "num_tokens": 8280049.0, "step": 1065 }, { "entropy": 1.1097337387502193, "epoch": 0.28003598593277174, "grad_norm": 0.3336636424064636, "learning_rate": 1.4404606124051296e-05, "loss": 1.0713, "mean_token_accuracy": 0.7141048096120357, "num_tokens": 8320724.0, "step": 1070 }, { "entropy": 1.143090207502246, "epoch": 0.2813445653062894, "grad_norm": 0.47186407446861267, "learning_rate": 1.4378434964668936e-05, "loss": 1.1169, "mean_token_accuracy": 0.7122757855802775, "num_tokens": 8355846.0, "step": 1075 }, { "entropy": 1.116348212212324, "epoch": 0.282653144679807, "grad_norm": 0.3499598205089569, "learning_rate": 1.4352263805286575e-05, "loss": 1.0442, "mean_token_accuracy": 0.7156722906976938, "num_tokens": 8394668.0, "step": 1080 }, { "entropy": 1.2300225652754306, "epoch": 0.2839617240533246, "grad_norm": 0.3478023111820221, "learning_rate": 1.4326092645904215e-05, "loss": 1.1722, "mean_token_accuracy": 0.6905759438872338, "num_tokens": 8431587.0, "step": 1085 }, { "entropy": 1.0534675564616918, "epoch": 0.28527030342684223, "grad_norm": 0.38736996054649353, "learning_rate": 1.4299921486521855e-05, "loss": 1.0075, "mean_token_accuracy": 0.7261179834604263, "num_tokens": 8469819.0, "step": 1090 }, { "entropy": 1.0744053564965725, "epoch": 0.28657888280035987, "grad_norm": 0.32573196291923523, "learning_rate": 1.4273750327139493e-05, "loss": 1.0065, "mean_token_accuracy": 0.727313157171011, "num_tokens": 8507289.0, "step": 1095 }, { "entropy": 1.1115598980337382, "epoch": 0.2878874621738775, "grad_norm": 0.2983573079109192, "learning_rate": 1.4247579167757133e-05, "loss": 1.0318, "mean_token_accuracy": 0.7211863253265619, "num_tokens": 8541734.0, "step": 1100 }, { "epoch": 0.2878874621738775, "eval_entropy": 1.0663088278770447, "eval_loss": 1.0640983581542969, "eval_mean_token_accuracy": 0.7233748874664306, "eval_num_tokens": 8541734.0, "eval_runtime": 97.2334, "eval_samples_per_second": 10.285, "eval_steps_per_second": 1.286, "step": 1100 }, { "entropy": 1.179681045934558, "epoch": 0.2891960415473951, "grad_norm": 0.34421974420547485, "learning_rate": 1.4221408008374773e-05, "loss": 1.1522, "mean_token_accuracy": 0.7020174067467451, "num_tokens": 8579590.0, "step": 1105 }, { "entropy": 1.23528895676136, "epoch": 0.2905046209209127, "grad_norm": 0.4845016300678253, "learning_rate": 1.4195236848992412e-05, "loss": 1.2052, "mean_token_accuracy": 0.6924681778997183, "num_tokens": 8616216.0, "step": 1110 }, { "entropy": 1.071528448536992, "epoch": 0.29181320029443036, "grad_norm": 0.3960973620414734, "learning_rate": 1.4169065689610052e-05, "loss": 0.9994, "mean_token_accuracy": 0.7225042834877968, "num_tokens": 8654775.0, "step": 1115 }, { "entropy": 1.2246194936335086, "epoch": 0.293121779667948, "grad_norm": 0.42761850357055664, "learning_rate": 1.4142894530227692e-05, "loss": 1.1149, "mean_token_accuracy": 0.6936006706207991, "num_tokens": 8690533.0, "step": 1120 }, { "entropy": 1.0734559996053576, "epoch": 0.2944303590414656, "grad_norm": 0.3908097445964813, "learning_rate": 1.4116723370845328e-05, "loss": 1.0524, "mean_token_accuracy": 0.7204558227211237, "num_tokens": 8727564.0, "step": 1125 }, { "entropy": 1.0594063133001328, "epoch": 0.2957389384149832, "grad_norm": 0.28768423199653625, "learning_rate": 1.4090552211462969e-05, "loss": 1.0328, "mean_token_accuracy": 0.7205640614032746, "num_tokens": 8774139.0, "step": 1130 }, { "entropy": 1.0696671523153782, "epoch": 0.29704751778850086, "grad_norm": 0.4257560968399048, "learning_rate": 1.4064381052080609e-05, "loss": 1.0058, "mean_token_accuracy": 0.7245784349739551, "num_tokens": 8810357.0, "step": 1135 }, { "entropy": 1.1237424857914449, "epoch": 0.2983560971620185, "grad_norm": 0.3298150599002838, "learning_rate": 1.4038209892698247e-05, "loss": 1.1134, "mean_token_accuracy": 0.7139571741223335, "num_tokens": 8850219.0, "step": 1140 }, { "entropy": 1.0590598836541176, "epoch": 0.29966467653553613, "grad_norm": 0.36441755294799805, "learning_rate": 1.4012038733315887e-05, "loss": 1.0017, "mean_token_accuracy": 0.72717116586864, "num_tokens": 8890728.0, "step": 1145 }, { "entropy": 1.0748774517327546, "epoch": 0.3009732559090537, "grad_norm": 0.5299174189567566, "learning_rate": 1.3985867573933527e-05, "loss": 1.0444, "mean_token_accuracy": 0.7247283719480038, "num_tokens": 8931167.0, "step": 1150 }, { "entropy": 1.2320478869602085, "epoch": 0.30228183528257135, "grad_norm": 0.31828537583351135, "learning_rate": 1.3959696414551165e-05, "loss": 1.1425, "mean_token_accuracy": 0.7029214788228273, "num_tokens": 8971179.0, "step": 1155 }, { "entropy": 1.0877997063100338, "epoch": 0.303590414656089, "grad_norm": 0.3202095329761505, "learning_rate": 1.3933525255168806e-05, "loss": 1.0868, "mean_token_accuracy": 0.7259886477142572, "num_tokens": 9012122.0, "step": 1160 }, { "entropy": 1.1524120923131704, "epoch": 0.30489899402960663, "grad_norm": 0.44416385889053345, "learning_rate": 1.3907354095786446e-05, "loss": 1.1781, "mean_token_accuracy": 0.7089681778103113, "num_tokens": 9051879.0, "step": 1165 }, { "entropy": 1.142632443830371, "epoch": 0.3062075734031242, "grad_norm": 0.29014092683792114, "learning_rate": 1.3881182936404082e-05, "loss": 1.1183, "mean_token_accuracy": 0.7098891779780387, "num_tokens": 9095080.0, "step": 1170 }, { "entropy": 1.0738845858722925, "epoch": 0.30751615277664185, "grad_norm": 0.5643961429595947, "learning_rate": 1.3855011777021722e-05, "loss": 1.0341, "mean_token_accuracy": 0.7218465633690357, "num_tokens": 9134553.0, "step": 1175 }, { "entropy": 1.0997203588485718, "epoch": 0.3088247321501595, "grad_norm": 0.3256031274795532, "learning_rate": 1.3828840617639362e-05, "loss": 1.0443, "mean_token_accuracy": 0.7164226226508618, "num_tokens": 9171451.0, "step": 1180 }, { "entropy": 1.1001708198338747, "epoch": 0.3101333115236771, "grad_norm": 0.38000190258026123, "learning_rate": 1.3802669458257e-05, "loss": 1.0945, "mean_token_accuracy": 0.7193384803831577, "num_tokens": 9212655.0, "step": 1185 }, { "entropy": 1.1442682899534702, "epoch": 0.3114418908971947, "grad_norm": 0.38140004873275757, "learning_rate": 1.377649829887464e-05, "loss": 1.1116, "mean_token_accuracy": 0.7097606465220452, "num_tokens": 9251378.0, "step": 1190 }, { "entropy": 1.0966169007122517, "epoch": 0.31275047027071234, "grad_norm": 0.30173954367637634, "learning_rate": 1.3750327139492281e-05, "loss": 1.0569, "mean_token_accuracy": 0.7154310643672943, "num_tokens": 9290376.0, "step": 1195 }, { "entropy": 1.112706720456481, "epoch": 0.31405904964423, "grad_norm": 0.3677958846092224, "learning_rate": 1.372415598010992e-05, "loss": 1.1293, "mean_token_accuracy": 0.7175452932715416, "num_tokens": 9335450.0, "step": 1200 }, { "epoch": 0.31405904964423, "eval_entropy": 1.0616401386260987, "eval_loss": 1.0616909265518188, "eval_mean_token_accuracy": 0.7236422438621521, "eval_num_tokens": 9335450.0, "eval_runtime": 97.3061, "eval_samples_per_second": 10.277, "eval_steps_per_second": 1.285, "step": 1200 }, { "entropy": 1.0197914224117994, "epoch": 0.3153676290177476, "grad_norm": 0.303828626871109, "learning_rate": 1.369798482072756e-05, "loss": 0.9773, "mean_token_accuracy": 0.7335654478520155, "num_tokens": 9376757.0, "step": 1205 }, { "entropy": 1.0958842366933823, "epoch": 0.31667620839126526, "grad_norm": 0.31567996740341187, "learning_rate": 1.36718136613452e-05, "loss": 1.1095, "mean_token_accuracy": 0.7230709217488765, "num_tokens": 9412399.0, "step": 1210 }, { "entropy": 1.0366449020802975, "epoch": 0.31798478776478284, "grad_norm": 0.28812670707702637, "learning_rate": 1.364564250196284e-05, "loss": 0.9809, "mean_token_accuracy": 0.7372792772948742, "num_tokens": 9459659.0, "step": 1215 }, { "entropy": 1.16514153778553, "epoch": 0.3192933671383005, "grad_norm": 0.32817167043685913, "learning_rate": 1.3619471342580476e-05, "loss": 1.1274, "mean_token_accuracy": 0.7073730494827032, "num_tokens": 9501119.0, "step": 1220 }, { "entropy": 1.1919928897172212, "epoch": 0.3206019465118181, "grad_norm": 0.4222601056098938, "learning_rate": 1.3593300183198118e-05, "loss": 1.1383, "mean_token_accuracy": 0.7020205046981574, "num_tokens": 9541572.0, "step": 1225 }, { "entropy": 1.0665121775120496, "epoch": 0.32191052588533575, "grad_norm": 0.37065136432647705, "learning_rate": 1.3567129023815758e-05, "loss": 1.0282, "mean_token_accuracy": 0.7204833775758743, "num_tokens": 9582686.0, "step": 1230 }, { "entropy": 1.1947314314544202, "epoch": 0.32321910525885333, "grad_norm": 0.34813007712364197, "learning_rate": 1.3540957864433395e-05, "loss": 1.1388, "mean_token_accuracy": 0.6964055716991424, "num_tokens": 9623500.0, "step": 1235 }, { "entropy": 1.0864916004240512, "epoch": 0.32452768463237097, "grad_norm": 0.2981452941894531, "learning_rate": 1.3514786705051035e-05, "loss": 1.0402, "mean_token_accuracy": 0.7253331996500492, "num_tokens": 9667330.0, "step": 1240 }, { "entropy": 1.088518501445651, "epoch": 0.3258362640058886, "grad_norm": 0.27413734793663025, "learning_rate": 1.3488615545668675e-05, "loss": 1.0039, "mean_token_accuracy": 0.7201201483607292, "num_tokens": 9707928.0, "step": 1245 }, { "entropy": 1.0232492092996837, "epoch": 0.32714484337940625, "grad_norm": 0.3049759864807129, "learning_rate": 1.3462444386286313e-05, "loss": 1.0155, "mean_token_accuracy": 0.731953501701355, "num_tokens": 9749414.0, "step": 1250 }, { "entropy": 1.1411474995315074, "epoch": 0.3284534227529239, "grad_norm": 0.30817753076553345, "learning_rate": 1.3436273226903953e-05, "loss": 1.1117, "mean_token_accuracy": 0.7109464205801487, "num_tokens": 9787037.0, "step": 1255 }, { "entropy": 1.0607344008982182, "epoch": 0.32976200212644147, "grad_norm": 0.2996397316455841, "learning_rate": 1.3410102067521593e-05, "loss": 0.9988, "mean_token_accuracy": 0.7263243019580841, "num_tokens": 9823917.0, "step": 1260 }, { "entropy": 1.1400364696979524, "epoch": 0.3310705814999591, "grad_norm": 0.394529789686203, "learning_rate": 1.3383930908139232e-05, "loss": 1.1346, "mean_token_accuracy": 0.7115379109978676, "num_tokens": 9863897.0, "step": 1265 }, { "entropy": 1.1153028715401887, "epoch": 0.33237916087347674, "grad_norm": 0.38152188062667847, "learning_rate": 1.3357759748756872e-05, "loss": 1.0826, "mean_token_accuracy": 0.7159910671412945, "num_tokens": 9907161.0, "step": 1270 }, { "entropy": 1.1335538432002068, "epoch": 0.3336877402469944, "grad_norm": 0.3795076012611389, "learning_rate": 1.3331588589374512e-05, "loss": 1.062, "mean_token_accuracy": 0.712515490502119, "num_tokens": 9941050.0, "step": 1275 }, { "entropy": 1.201156435534358, "epoch": 0.33499631962051196, "grad_norm": 0.31430941820144653, "learning_rate": 1.3305417429992148e-05, "loss": 1.1755, "mean_token_accuracy": 0.7030155394226313, "num_tokens": 9982204.0, "step": 1280 }, { "entropy": 1.0916207611560822, "epoch": 0.3363048989940296, "grad_norm": 0.3562738597393036, "learning_rate": 1.3279246270609789e-05, "loss": 1.03, "mean_token_accuracy": 0.7254784423857927, "num_tokens": 10023122.0, "step": 1285 }, { "entropy": 1.1606000121682882, "epoch": 0.33761347836754724, "grad_norm": 0.4184325337409973, "learning_rate": 1.3253075111227429e-05, "loss": 1.1373, "mean_token_accuracy": 0.7046990938484669, "num_tokens": 10064911.0, "step": 1290 }, { "entropy": 1.0977379951626063, "epoch": 0.3389220577410649, "grad_norm": 0.3197747766971588, "learning_rate": 1.3226903951845067e-05, "loss": 1.0689, "mean_token_accuracy": 0.7257760524749756, "num_tokens": 10100407.0, "step": 1295 }, { "entropy": 1.074541300162673, "epoch": 0.34023063711458246, "grad_norm": 0.40503931045532227, "learning_rate": 1.3200732792462707e-05, "loss": 0.967, "mean_token_accuracy": 0.7171443022787571, "num_tokens": 10134360.0, "step": 1300 }, { "epoch": 0.34023063711458246, "eval_entropy": 1.04598343706131, "eval_loss": 1.0586909055709839, "eval_mean_token_accuracy": 0.7240785593986512, "eval_num_tokens": 10134360.0, "eval_runtime": 97.2108, "eval_samples_per_second": 10.287, "eval_steps_per_second": 1.286, "step": 1300 }, { "entropy": 1.093527865409851, "epoch": 0.3415392164881001, "grad_norm": 0.30255845189094543, "learning_rate": 1.3174561633080347e-05, "loss": 1.0996, "mean_token_accuracy": 0.7116521965712309, "num_tokens": 10174760.0, "step": 1305 }, { "entropy": 1.0523213766515256, "epoch": 0.34284779586161773, "grad_norm": 0.2793543040752411, "learning_rate": 1.3148390473697985e-05, "loss": 0.9752, "mean_token_accuracy": 0.7246817748993635, "num_tokens": 10216071.0, "step": 1310 }, { "entropy": 1.0620372116565704, "epoch": 0.34415637523513537, "grad_norm": 0.31549394130706787, "learning_rate": 1.3122219314315626e-05, "loss": 0.992, "mean_token_accuracy": 0.7261651441454887, "num_tokens": 10256598.0, "step": 1315 }, { "entropy": 1.0895736794918776, "epoch": 0.345464954608653, "grad_norm": 0.3750867545604706, "learning_rate": 1.3096048154933266e-05, "loss": 1.051, "mean_token_accuracy": 0.7179807629436255, "num_tokens": 10294800.0, "step": 1320 }, { "entropy": 1.0776633590459823, "epoch": 0.3467735339821706, "grad_norm": 0.4244779944419861, "learning_rate": 1.3069876995550902e-05, "loss": 1.007, "mean_token_accuracy": 0.7257129170000554, "num_tokens": 10334846.0, "step": 1325 }, { "entropy": 1.1380624793469907, "epoch": 0.3480821133556882, "grad_norm": 0.3876670300960541, "learning_rate": 1.3043705836168542e-05, "loss": 1.1296, "mean_token_accuracy": 0.7143970631062985, "num_tokens": 10374248.0, "step": 1330 }, { "entropy": 1.055607095360756, "epoch": 0.34939069272920587, "grad_norm": 0.5531731247901917, "learning_rate": 1.3017534676786182e-05, "loss": 0.9648, "mean_token_accuracy": 0.7317230701446533, "num_tokens": 10410746.0, "step": 1335 }, { "entropy": 1.134307411313057, "epoch": 0.3506992721027235, "grad_norm": 0.3122623860836029, "learning_rate": 1.299136351740382e-05, "loss": 1.078, "mean_token_accuracy": 0.7120365314185619, "num_tokens": 10452938.0, "step": 1340 }, { "entropy": 1.0759996108710765, "epoch": 0.3520078514762411, "grad_norm": 0.35110828280448914, "learning_rate": 1.296519235802146e-05, "loss": 0.9849, "mean_token_accuracy": 0.7297365363687277, "num_tokens": 10491903.0, "step": 1345 }, { "entropy": 1.1492443475872278, "epoch": 0.3533164308497587, "grad_norm": 0.35788699984550476, "learning_rate": 1.2939021198639101e-05, "loss": 1.1878, "mean_token_accuracy": 0.7069006145000458, "num_tokens": 10527375.0, "step": 1350 }, { "entropy": 1.0579143922775984, "epoch": 0.35462501022327636, "grad_norm": 0.35787340998649597, "learning_rate": 1.291285003925674e-05, "loss": 1.0007, "mean_token_accuracy": 0.7237559407949448, "num_tokens": 10568018.0, "step": 1355 }, { "entropy": 1.1339404121041299, "epoch": 0.355933589596794, "grad_norm": 0.4546626806259155, "learning_rate": 1.288667887987438e-05, "loss": 1.1295, "mean_token_accuracy": 0.7201502379029989, "num_tokens": 10607681.0, "step": 1360 }, { "entropy": 1.0893705368041993, "epoch": 0.3572421689703116, "grad_norm": 0.45456886291503906, "learning_rate": 1.286050772049202e-05, "loss": 1.0304, "mean_token_accuracy": 0.7344392716884613, "num_tokens": 10641980.0, "step": 1365 }, { "entropy": 1.0794930804520846, "epoch": 0.3585507483438292, "grad_norm": 0.3907780945301056, "learning_rate": 1.283433656110966e-05, "loss": 1.0827, "mean_token_accuracy": 0.7246263407170772, "num_tokens": 10683159.0, "step": 1370 }, { "entropy": 1.0798350811004638, "epoch": 0.35985932771734686, "grad_norm": 0.49161893129348755, "learning_rate": 1.2808165401727298e-05, "loss": 1.0915, "mean_token_accuracy": 0.7175786912441253, "num_tokens": 10719560.0, "step": 1375 }, { "entropy": 1.131149498745799, "epoch": 0.3611679070908645, "grad_norm": 0.4854622185230255, "learning_rate": 1.2781994242344938e-05, "loss": 1.1285, "mean_token_accuracy": 0.7150968369096518, "num_tokens": 10759050.0, "step": 1380 }, { "entropy": 1.0728477615863086, "epoch": 0.36247648646438213, "grad_norm": 0.42540574073791504, "learning_rate": 1.2755823082962578e-05, "loss": 1.0358, "mean_token_accuracy": 0.724468483030796, "num_tokens": 10800630.0, "step": 1385 }, { "entropy": 1.1225707672536374, "epoch": 0.3637850658378997, "grad_norm": 0.44094985723495483, "learning_rate": 1.2729651923580215e-05, "loss": 1.0344, "mean_token_accuracy": 0.7132398471236229, "num_tokens": 10835582.0, "step": 1390 }, { "entropy": 1.127060130983591, "epoch": 0.36509364521141735, "grad_norm": 0.3640246093273163, "learning_rate": 1.2703480764197855e-05, "loss": 1.0752, "mean_token_accuracy": 0.717224034294486, "num_tokens": 10873989.0, "step": 1395 }, { "entropy": 1.110959093272686, "epoch": 0.366402224584935, "grad_norm": 0.3808118402957916, "learning_rate": 1.2677309604815495e-05, "loss": 1.0352, "mean_token_accuracy": 0.7157318696379662, "num_tokens": 10914540.0, "step": 1400 }, { "epoch": 0.366402224584935, "eval_entropy": 1.057036506652832, "eval_loss": 1.056433081626892, "eval_mean_token_accuracy": 0.724943666934967, "eval_num_tokens": 10914540.0, "eval_runtime": 97.2457, "eval_samples_per_second": 10.283, "eval_steps_per_second": 1.285, "step": 1400 }, { "entropy": 1.1179429598152637, "epoch": 0.3677108039584526, "grad_norm": 0.3942883610725403, "learning_rate": 1.2651138445433133e-05, "loss": 1.0674, "mean_token_accuracy": 0.7157964497804642, "num_tokens": 10951332.0, "step": 1405 }, { "entropy": 1.144387823715806, "epoch": 0.3690193833319702, "grad_norm": 0.35214006900787354, "learning_rate": 1.2624967286050773e-05, "loss": 1.1239, "mean_token_accuracy": 0.7134873129427433, "num_tokens": 10990644.0, "step": 1410 }, { "entropy": 1.057513090223074, "epoch": 0.37032796270548785, "grad_norm": 0.3838125169277191, "learning_rate": 1.2598796126668413e-05, "loss": 1.0198, "mean_token_accuracy": 0.7325119793415069, "num_tokens": 11032177.0, "step": 1415 }, { "entropy": 1.088934962451458, "epoch": 0.3716365420790055, "grad_norm": 0.26814350485801697, "learning_rate": 1.2572624967286052e-05, "loss": 1.0808, "mean_token_accuracy": 0.7172140318900346, "num_tokens": 11072735.0, "step": 1420 }, { "entropy": 1.1592457950115205, "epoch": 0.3729451214525231, "grad_norm": 0.45993542671203613, "learning_rate": 1.2546453807903692e-05, "loss": 1.1143, "mean_token_accuracy": 0.7036133039742708, "num_tokens": 11111309.0, "step": 1425 }, { "entropy": 1.1127744242548943, "epoch": 0.3742537008260407, "grad_norm": 0.3038633167743683, "learning_rate": 1.2520282648521332e-05, "loss": 1.0659, "mean_token_accuracy": 0.725579933822155, "num_tokens": 11150368.0, "step": 1430 }, { "entropy": 1.135872596874833, "epoch": 0.37556228019955834, "grad_norm": 0.32042160630226135, "learning_rate": 1.2494111489138968e-05, "loss": 1.1148, "mean_token_accuracy": 0.7169803373515606, "num_tokens": 11186513.0, "step": 1435 }, { "entropy": 1.087304012849927, "epoch": 0.376870859573076, "grad_norm": 0.3336869180202484, "learning_rate": 1.2467940329756609e-05, "loss": 1.0445, "mean_token_accuracy": 0.7192518580704927, "num_tokens": 11225174.0, "step": 1440 }, { "entropy": 1.0319055780768394, "epoch": 0.3781794389465936, "grad_norm": 0.43524104356765747, "learning_rate": 1.2441769170374249e-05, "loss": 0.9855, "mean_token_accuracy": 0.7250500712543726, "num_tokens": 11261841.0, "step": 1445 }, { "entropy": 1.0503356985747814, "epoch": 0.37948801832011125, "grad_norm": 0.34029972553253174, "learning_rate": 1.2415598010991887e-05, "loss": 0.9427, "mean_token_accuracy": 0.7291811019182205, "num_tokens": 11301122.0, "step": 1450 }, { "entropy": 1.135501291975379, "epoch": 0.38079659769362884, "grad_norm": 0.34890979528427124, "learning_rate": 1.2389426851609527e-05, "loss": 1.0706, "mean_token_accuracy": 0.707598365843296, "num_tokens": 11341491.0, "step": 1455 }, { "entropy": 1.012824462354183, "epoch": 0.3821051770671465, "grad_norm": 0.44555002450942993, "learning_rate": 1.2363255692227167e-05, "loss": 0.986, "mean_token_accuracy": 0.7334959991276264, "num_tokens": 11382277.0, "step": 1460 }, { "entropy": 1.084730239585042, "epoch": 0.3834137564406641, "grad_norm": 0.34672921895980835, "learning_rate": 1.2337084532844805e-05, "loss": 1.0185, "mean_token_accuracy": 0.7293158903717994, "num_tokens": 11420312.0, "step": 1465 }, { "entropy": 1.0931822203099728, "epoch": 0.38472233581418175, "grad_norm": 0.4738580882549286, "learning_rate": 1.2310913373462446e-05, "loss": 1.0744, "mean_token_accuracy": 0.7275880422443152, "num_tokens": 11458742.0, "step": 1470 }, { "entropy": 1.0959295086562633, "epoch": 0.38603091518769933, "grad_norm": 0.3458891212940216, "learning_rate": 1.2284742214080086e-05, "loss": 1.0425, "mean_token_accuracy": 0.7294268280267715, "num_tokens": 11494946.0, "step": 1475 }, { "entropy": 1.1744199082255364, "epoch": 0.38733949456121697, "grad_norm": 0.3247629404067993, "learning_rate": 1.2258571054697724e-05, "loss": 1.1929, "mean_token_accuracy": 0.6963723137974739, "num_tokens": 11531053.0, "step": 1480 }, { "entropy": 1.022578400745988, "epoch": 0.3886480739347346, "grad_norm": 0.37479135394096375, "learning_rate": 1.2232399895315364e-05, "loss": 0.9592, "mean_token_accuracy": 0.7337148085236549, "num_tokens": 11569356.0, "step": 1485 }, { "entropy": 1.0597552750259638, "epoch": 0.38995665330825224, "grad_norm": 0.34326809644699097, "learning_rate": 1.2206228735933004e-05, "loss": 1.03, "mean_token_accuracy": 0.7236659746617079, "num_tokens": 11609142.0, "step": 1490 }, { "entropy": 1.038928609713912, "epoch": 0.3912652326817699, "grad_norm": 0.33052825927734375, "learning_rate": 1.218005757655064e-05, "loss": 1.0297, "mean_token_accuracy": 0.7271129328757524, "num_tokens": 11649979.0, "step": 1495 }, { "entropy": 1.149324530735612, "epoch": 0.39257381205528746, "grad_norm": 0.44736945629119873, "learning_rate": 1.2153886417168281e-05, "loss": 1.0566, "mean_token_accuracy": 0.7146747212857008, "num_tokens": 11685504.0, "step": 1500 }, { "epoch": 0.39257381205528746, "eval_entropy": 1.061660517692566, "eval_loss": 1.0545203685760498, "eval_mean_token_accuracy": 0.7252409801483154, "eval_num_tokens": 11685504.0, "eval_runtime": 97.242, "eval_samples_per_second": 10.284, "eval_steps_per_second": 1.285, "step": 1500 }, { "entropy": 1.1069697327911854, "epoch": 0.3938823914288051, "grad_norm": 0.7664462327957153, "learning_rate": 1.2127715257785921e-05, "loss": 1.0101, "mean_token_accuracy": 0.7186468083411455, "num_tokens": 11720415.0, "step": 1505 }, { "entropy": 1.0142726931720971, "epoch": 0.39519097080232274, "grad_norm": 0.3830406069755554, "learning_rate": 1.210154409840356e-05, "loss": 0.974, "mean_token_accuracy": 0.736117148399353, "num_tokens": 11756577.0, "step": 1510 }, { "entropy": 1.112984489277005, "epoch": 0.3964995501758404, "grad_norm": 0.41094282269477844, "learning_rate": 1.20753729390212e-05, "loss": 1.1002, "mean_token_accuracy": 0.7189074017107486, "num_tokens": 11794872.0, "step": 1515 }, { "entropy": 1.0922920301556587, "epoch": 0.39780812954935796, "grad_norm": 0.37885573506355286, "learning_rate": 1.204920177963884e-05, "loss": 1.0302, "mean_token_accuracy": 0.7164245635271073, "num_tokens": 11833530.0, "step": 1520 }, { "entropy": 1.1610772363841533, "epoch": 0.3991167089228756, "grad_norm": 0.44092655181884766, "learning_rate": 1.2023030620256478e-05, "loss": 1.1124, "mean_token_accuracy": 0.6998836103826761, "num_tokens": 11865444.0, "step": 1525 }, { "entropy": 1.1576842796057463, "epoch": 0.40042528829639323, "grad_norm": 0.44296759366989136, "learning_rate": 1.1996859460874118e-05, "loss": 1.1521, "mean_token_accuracy": 0.7068853817880154, "num_tokens": 11901975.0, "step": 1530 }, { "entropy": 1.0967234898358584, "epoch": 0.40173386766991087, "grad_norm": 0.33487918972969055, "learning_rate": 1.1970688301491758e-05, "loss": 1.0064, "mean_token_accuracy": 0.7251658782362937, "num_tokens": 11939696.0, "step": 1535 }, { "entropy": 1.086240155249834, "epoch": 0.40304244704342845, "grad_norm": 0.40726640820503235, "learning_rate": 1.1944517142109398e-05, "loss": 1.0593, "mean_token_accuracy": 0.725240446254611, "num_tokens": 11982185.0, "step": 1540 }, { "entropy": 1.1482622426003217, "epoch": 0.4043510264169461, "grad_norm": 0.4323379397392273, "learning_rate": 1.1918345982727035e-05, "loss": 1.128, "mean_token_accuracy": 0.7092867191880942, "num_tokens": 12017859.0, "step": 1545 }, { "entropy": 1.2151045732200145, "epoch": 0.40565960579046373, "grad_norm": 0.3303086459636688, "learning_rate": 1.1892174823344675e-05, "loss": 1.2045, "mean_token_accuracy": 0.6960071615874768, "num_tokens": 12056052.0, "step": 1550 }, { "entropy": 1.0982510183006524, "epoch": 0.40696818516398137, "grad_norm": 0.3505324721336365, "learning_rate": 1.1866003663962315e-05, "loss": 1.067, "mean_token_accuracy": 0.7211838360875845, "num_tokens": 12093787.0, "step": 1555 }, { "entropy": 1.0351288244128227, "epoch": 0.408276764537499, "grad_norm": 0.4866773188114166, "learning_rate": 1.1839832504579953e-05, "loss": 1.0177, "mean_token_accuracy": 0.7380380086600781, "num_tokens": 12135954.0, "step": 1560 }, { "entropy": 1.0588519033044577, "epoch": 0.4095853439110166, "grad_norm": 0.35260552167892456, "learning_rate": 1.1813661345197593e-05, "loss": 0.9948, "mean_token_accuracy": 0.7311153151094913, "num_tokens": 12177684.0, "step": 1565 }, { "entropy": 1.09910380654037, "epoch": 0.4108939232845342, "grad_norm": 0.3076630234718323, "learning_rate": 1.1787490185815233e-05, "loss": 1.0572, "mean_token_accuracy": 0.7178509868681431, "num_tokens": 12217097.0, "step": 1570 }, { "entropy": 1.0710554268211125, "epoch": 0.41220250265805186, "grad_norm": 0.6353772878646851, "learning_rate": 1.1761319026432872e-05, "loss": 1.0566, "mean_token_accuracy": 0.7280903398990631, "num_tokens": 12254118.0, "step": 1575 }, { "entropy": 1.0847718585282564, "epoch": 0.4135110820315695, "grad_norm": 0.5102863311767578, "learning_rate": 1.1735147867050512e-05, "loss": 1.0074, "mean_token_accuracy": 0.7276991017162799, "num_tokens": 12290736.0, "step": 1580 }, { "entropy": 1.1992057055234908, "epoch": 0.4148196614050871, "grad_norm": 0.46092355251312256, "learning_rate": 1.1708976707668152e-05, "loss": 1.1854, "mean_token_accuracy": 0.7027558118104935, "num_tokens": 12327361.0, "step": 1585 }, { "entropy": 1.1261769212782382, "epoch": 0.4161282407786047, "grad_norm": 0.2733989357948303, "learning_rate": 1.168280554828579e-05, "loss": 1.0773, "mean_token_accuracy": 0.7125700250267982, "num_tokens": 12364890.0, "step": 1590 }, { "entropy": 1.116732070967555, "epoch": 0.41743682015212236, "grad_norm": 0.24640727043151855, "learning_rate": 1.165663438890343e-05, "loss": 1.1195, "mean_token_accuracy": 0.7179068107157945, "num_tokens": 12408137.0, "step": 1595 }, { "entropy": 1.119725486636162, "epoch": 0.41874539952564, "grad_norm": 0.38413721323013306, "learning_rate": 1.163046322952107e-05, "loss": 1.02, "mean_token_accuracy": 0.7142953380942345, "num_tokens": 12448886.0, "step": 1600 }, { "epoch": 0.41874539952564, "eval_entropy": 1.0526617636680604, "eval_loss": 1.0522558689117432, "eval_mean_token_accuracy": 0.7257419657707215, "eval_num_tokens": 12448886.0, "eval_runtime": 97.091, "eval_samples_per_second": 10.3, "eval_steps_per_second": 1.287, "step": 1600 }, { "entropy": 1.052151035144925, "epoch": 0.4200539788991576, "grad_norm": 0.3078325390815735, "learning_rate": 1.1604292070138707e-05, "loss": 1.0539, "mean_token_accuracy": 0.7285561367869378, "num_tokens": 12488545.0, "step": 1605 }, { "entropy": 1.143933541327715, "epoch": 0.4213625582726752, "grad_norm": 0.4508214592933655, "learning_rate": 1.1578120910756347e-05, "loss": 1.1383, "mean_token_accuracy": 0.7101572744548321, "num_tokens": 12524775.0, "step": 1610 }, { "entropy": 1.1741201549768447, "epoch": 0.42267113764619285, "grad_norm": 0.4099365770816803, "learning_rate": 1.1551949751373987e-05, "loss": 1.1825, "mean_token_accuracy": 0.705308374390006, "num_tokens": 12560784.0, "step": 1615 }, { "entropy": 1.0800459958612918, "epoch": 0.4239797170197105, "grad_norm": 0.3584926128387451, "learning_rate": 1.1525778591991625e-05, "loss": 1.0387, "mean_token_accuracy": 0.7256841942667961, "num_tokens": 12603010.0, "step": 1620 }, { "entropy": 1.1276030663400889, "epoch": 0.4252882963932281, "grad_norm": 0.5926274657249451, "learning_rate": 1.1499607432609266e-05, "loss": 1.0812, "mean_token_accuracy": 0.718872818350792, "num_tokens": 12639637.0, "step": 1625 }, { "entropy": 1.0730494752526283, "epoch": 0.4265968757667457, "grad_norm": 0.3367457389831543, "learning_rate": 1.1473436273226906e-05, "loss": 1.0183, "mean_token_accuracy": 0.7222284369170666, "num_tokens": 12685003.0, "step": 1630 }, { "entropy": 1.1390976216644049, "epoch": 0.42790545514026335, "grad_norm": 0.3196088373661041, "learning_rate": 1.1447265113844544e-05, "loss": 1.1352, "mean_token_accuracy": 0.7116539262235164, "num_tokens": 12725882.0, "step": 1635 }, { "entropy": 1.1160973783582449, "epoch": 0.429214034513781, "grad_norm": 0.30303776264190674, "learning_rate": 1.1421093954462184e-05, "loss": 1.045, "mean_token_accuracy": 0.7158118285238743, "num_tokens": 12769557.0, "step": 1640 }, { "entropy": 1.1582866314798594, "epoch": 0.4305226138872986, "grad_norm": 0.3513560891151428, "learning_rate": 1.1394922795079824e-05, "loss": 1.1353, "mean_token_accuracy": 0.705193604901433, "num_tokens": 12810697.0, "step": 1645 }, { "entropy": 1.1496058501303197, "epoch": 0.4318311932608162, "grad_norm": 0.2923238277435303, "learning_rate": 1.136875163569746e-05, "loss": 1.0884, "mean_token_accuracy": 0.7144812878221274, "num_tokens": 12851256.0, "step": 1650 }, { "entropy": 1.1143131528049708, "epoch": 0.43313977263433384, "grad_norm": 0.3065350651741028, "learning_rate": 1.1342580476315101e-05, "loss": 1.0523, "mean_token_accuracy": 0.7173210971057415, "num_tokens": 12888965.0, "step": 1655 }, { "entropy": 1.0787921320647, "epoch": 0.4344483520078515, "grad_norm": 0.46374326944351196, "learning_rate": 1.1316409316932741e-05, "loss": 1.0793, "mean_token_accuracy": 0.7249518293887377, "num_tokens": 12929333.0, "step": 1660 }, { "entropy": 1.057104966789484, "epoch": 0.4357569313813691, "grad_norm": 0.313374400138855, "learning_rate": 1.129023815755038e-05, "loss": 1.0395, "mean_token_accuracy": 0.7259028911590576, "num_tokens": 12968877.0, "step": 1665 }, { "entropy": 1.1018395122140645, "epoch": 0.4370655107548867, "grad_norm": 0.5729189515113831, "learning_rate": 1.126406699816802e-05, "loss": 1.0302, "mean_token_accuracy": 0.7193635780364275, "num_tokens": 13009336.0, "step": 1670 }, { "entropy": 1.090810688957572, "epoch": 0.43837409012840434, "grad_norm": 0.34041526913642883, "learning_rate": 1.123789583878566e-05, "loss": 1.0004, "mean_token_accuracy": 0.7279005717486143, "num_tokens": 13050299.0, "step": 1675 }, { "entropy": 0.9902068875730038, "epoch": 0.439682669501922, "grad_norm": 0.3354148864746094, "learning_rate": 1.1211724679403298e-05, "loss": 0.9542, "mean_token_accuracy": 0.7408786326646805, "num_tokens": 13090891.0, "step": 1680 }, { "entropy": 0.9622725404798984, "epoch": 0.4409912488754396, "grad_norm": 0.36764782667160034, "learning_rate": 1.1185553520020938e-05, "loss": 0.9176, "mean_token_accuracy": 0.7487996619194746, "num_tokens": 13132101.0, "step": 1685 }, { "entropy": 1.045450020954013, "epoch": 0.44229982824895725, "grad_norm": 0.37813687324523926, "learning_rate": 1.1159382360638578e-05, "loss": 0.9415, "mean_token_accuracy": 0.7361387588083744, "num_tokens": 13170933.0, "step": 1690 }, { "entropy": 1.0522997014224529, "epoch": 0.44360840762247483, "grad_norm": 0.382914274930954, "learning_rate": 1.1133211201256216e-05, "loss": 1.0357, "mean_token_accuracy": 0.7225256580859423, "num_tokens": 13209472.0, "step": 1695 }, { "entropy": 1.0627464193850755, "epoch": 0.44491698699599247, "grad_norm": 0.6186394691467285, "learning_rate": 1.1107040041873856e-05, "loss": 0.9805, "mean_token_accuracy": 0.7301313698291778, "num_tokens": 13243159.0, "step": 1700 }, { "epoch": 0.44491698699599247, "eval_entropy": 1.048878930568695, "eval_loss": 1.0501657724380493, "eval_mean_token_accuracy": 0.7263216118812561, "eval_num_tokens": 13243159.0, "eval_runtime": 97.3502, "eval_samples_per_second": 10.272, "eval_steps_per_second": 1.284, "step": 1700 }, { "entropy": 1.0235866460949183, "epoch": 0.4462255663695101, "grad_norm": 0.3209091126918793, "learning_rate": 1.1080868882491496e-05, "loss": 1.0014, "mean_token_accuracy": 0.739182323589921, "num_tokens": 13280837.0, "step": 1705 }, { "entropy": 1.0558000404387713, "epoch": 0.44753414574302774, "grad_norm": 0.30199235677719116, "learning_rate": 1.1054697723109137e-05, "loss": 0.9891, "mean_token_accuracy": 0.7261330507695675, "num_tokens": 13322890.0, "step": 1710 }, { "entropy": 1.121560701727867, "epoch": 0.4488427251165453, "grad_norm": 0.4387257993221283, "learning_rate": 1.1028526563726773e-05, "loss": 1.1191, "mean_token_accuracy": 0.7114489011466503, "num_tokens": 13360229.0, "step": 1715 }, { "entropy": 1.0835831377655267, "epoch": 0.45015130449006296, "grad_norm": 0.44212666153907776, "learning_rate": 1.1002355404344413e-05, "loss": 1.0534, "mean_token_accuracy": 0.7208398774266243, "num_tokens": 13401284.0, "step": 1720 }, { "entropy": 1.0752512661740183, "epoch": 0.4514598838635806, "grad_norm": 0.304918110370636, "learning_rate": 1.0976184244962053e-05, "loss": 1.0347, "mean_token_accuracy": 0.725695389136672, "num_tokens": 13444485.0, "step": 1725 }, { "entropy": 1.021533490344882, "epoch": 0.45276846323709824, "grad_norm": 0.3151404857635498, "learning_rate": 1.0950013085579692e-05, "loss": 0.9656, "mean_token_accuracy": 0.7328935824334621, "num_tokens": 13486444.0, "step": 1730 }, { "entropy": 1.1548058629035949, "epoch": 0.4540770426106159, "grad_norm": 0.4337512254714966, "learning_rate": 1.0923841926197332e-05, "loss": 1.1178, "mean_token_accuracy": 0.7139813594520092, "num_tokens": 13527431.0, "step": 1735 }, { "entropy": 1.142104067094624, "epoch": 0.45538562198413346, "grad_norm": 0.36979472637176514, "learning_rate": 1.0897670766814972e-05, "loss": 1.1315, "mean_token_accuracy": 0.7120837710797787, "num_tokens": 13567478.0, "step": 1740 }, { "entropy": 1.1160997115075588, "epoch": 0.4566942013576511, "grad_norm": 0.3845500946044922, "learning_rate": 1.087149960743261e-05, "loss": 1.1022, "mean_token_accuracy": 0.712579957395792, "num_tokens": 13605160.0, "step": 1745 }, { "entropy": 0.9988198220729828, "epoch": 0.45800278073116873, "grad_norm": 0.38837000727653503, "learning_rate": 1.084532844805025e-05, "loss": 0.9329, "mean_token_accuracy": 0.7446095444262028, "num_tokens": 13644314.0, "step": 1750 }, { "entropy": 1.130447569489479, "epoch": 0.4593113601046864, "grad_norm": 0.3902071714401245, "learning_rate": 1.081915728866789e-05, "loss": 1.0794, "mean_token_accuracy": 0.7083112396299839, "num_tokens": 13681672.0, "step": 1755 }, { "entropy": 1.0452111046761274, "epoch": 0.46061993947820395, "grad_norm": 0.4102812111377716, "learning_rate": 1.0792986129285527e-05, "loss": 1.0083, "mean_token_accuracy": 0.7271155752241611, "num_tokens": 13723858.0, "step": 1760 }, { "entropy": 0.9923233393579721, "epoch": 0.4619285188517216, "grad_norm": 0.3408261239528656, "learning_rate": 1.0766814969903167e-05, "loss": 0.9552, "mean_token_accuracy": 0.7382691666483879, "num_tokens": 13762274.0, "step": 1765 }, { "entropy": 1.0694189239293337, "epoch": 0.46323709822523923, "grad_norm": 0.33313289284706116, "learning_rate": 1.0740643810520807e-05, "loss": 1.0309, "mean_token_accuracy": 0.728821600228548, "num_tokens": 13804541.0, "step": 1770 }, { "entropy": 1.148886876925826, "epoch": 0.46454567759875687, "grad_norm": 0.4039045572280884, "learning_rate": 1.0714472651138445e-05, "loss": 1.1428, "mean_token_accuracy": 0.7021496020257473, "num_tokens": 13842698.0, "step": 1775 }, { "entropy": 1.059429519250989, "epoch": 0.46585425697227445, "grad_norm": 0.3325613737106323, "learning_rate": 1.0688301491756086e-05, "loss": 1.0277, "mean_token_accuracy": 0.7256958331912756, "num_tokens": 13883697.0, "step": 1780 }, { "entropy": 1.1187500689178704, "epoch": 0.4671628363457921, "grad_norm": 0.3969448208808899, "learning_rate": 1.0662130332373726e-05, "loss": 1.0818, "mean_token_accuracy": 0.7165641494095325, "num_tokens": 13922509.0, "step": 1785 }, { "entropy": 1.1411572575569153, "epoch": 0.4684714157193097, "grad_norm": 0.391754150390625, "learning_rate": 1.0635959172991364e-05, "loss": 1.1149, "mean_token_accuracy": 0.7164013959467411, "num_tokens": 13958198.0, "step": 1790 }, { "entropy": 1.0621652530506254, "epoch": 0.46977999509282736, "grad_norm": 0.3251922130584717, "learning_rate": 1.0609788013609004e-05, "loss": 1.0348, "mean_token_accuracy": 0.7337204076349735, "num_tokens": 13997488.0, "step": 1795 }, { "entropy": 1.0552761495113372, "epoch": 0.471088574466345, "grad_norm": 0.28842347860336304, "learning_rate": 1.0583616854226644e-05, "loss": 0.9837, "mean_token_accuracy": 0.732194972038269, "num_tokens": 14038479.0, "step": 1800 }, { "epoch": 0.471088574466345, "eval_entropy": 1.056775134563446, "eval_loss": 1.048877477645874, "eval_mean_token_accuracy": 0.7264609928131104, "eval_num_tokens": 14038479.0, "eval_runtime": 97.5713, "eval_samples_per_second": 10.249, "eval_steps_per_second": 1.281, "step": 1800 }, { "entropy": 1.1512902732938528, "epoch": 0.4723971538398626, "grad_norm": 0.4646497964859009, "learning_rate": 1.055744569484428e-05, "loss": 1.1312, "mean_token_accuracy": 0.7179503016173839, "num_tokens": 14074174.0, "step": 1805 }, { "entropy": 1.0646660778671504, "epoch": 0.4737057332133802, "grad_norm": 0.29955199360847473, "learning_rate": 1.0531274535461921e-05, "loss": 1.0278, "mean_token_accuracy": 0.7245239842683077, "num_tokens": 14115408.0, "step": 1810 }, { "entropy": 1.0254539363086224, "epoch": 0.47501431258689786, "grad_norm": 0.31829652190208435, "learning_rate": 1.0505103376079561e-05, "loss": 0.9724, "mean_token_accuracy": 0.7322143398225307, "num_tokens": 14154697.0, "step": 1815 }, { "entropy": 1.1258851181715728, "epoch": 0.4763228919604155, "grad_norm": 0.45105260610580444, "learning_rate": 1.04789322166972e-05, "loss": 1.1154, "mean_token_accuracy": 0.7129046030342578, "num_tokens": 14194725.0, "step": 1820 }, { "entropy": 1.098579701408744, "epoch": 0.4776314713339331, "grad_norm": 0.37159058451652527, "learning_rate": 1.045276105731484e-05, "loss": 1.0148, "mean_token_accuracy": 0.7232114516198636, "num_tokens": 14236159.0, "step": 1825 }, { "entropy": 1.0738941352814435, "epoch": 0.4789400507074507, "grad_norm": 0.34484973549842834, "learning_rate": 1.042658989793248e-05, "loss": 1.0382, "mean_token_accuracy": 0.7244183134287596, "num_tokens": 14275868.0, "step": 1830 }, { "entropy": 1.0540620289742946, "epoch": 0.48024863008096835, "grad_norm": 0.4253799617290497, "learning_rate": 1.0400418738550118e-05, "loss": 0.9825, "mean_token_accuracy": 0.7318723525851965, "num_tokens": 14314897.0, "step": 1835 }, { "entropy": 1.1317568343132733, "epoch": 0.481557209454486, "grad_norm": 0.3978184163570404, "learning_rate": 1.0374247579167758e-05, "loss": 1.1196, "mean_token_accuracy": 0.7122768431901931, "num_tokens": 14353468.0, "step": 1840 }, { "entropy": 0.9921163013204932, "epoch": 0.4828657888280036, "grad_norm": 0.6404853463172913, "learning_rate": 1.0348076419785398e-05, "loss": 0.9584, "mean_token_accuracy": 0.7398792453110218, "num_tokens": 14393542.0, "step": 1845 }, { "entropy": 1.0703047849237919, "epoch": 0.4841743682015212, "grad_norm": 0.4562324285507202, "learning_rate": 1.0321905260403036e-05, "loss": 1.0268, "mean_token_accuracy": 0.7257931806147099, "num_tokens": 14438299.0, "step": 1850 }, { "entropy": 1.098178230971098, "epoch": 0.48548294757503885, "grad_norm": 0.5880526900291443, "learning_rate": 1.0295734101020676e-05, "loss": 1.0345, "mean_token_accuracy": 0.719204118102789, "num_tokens": 14472012.0, "step": 1855 }, { "entropy": 0.9670162990689277, "epoch": 0.4867915269485565, "grad_norm": 0.6454201340675354, "learning_rate": 1.0269562941638316e-05, "loss": 0.9095, "mean_token_accuracy": 0.7467180170118809, "num_tokens": 14508320.0, "step": 1860 }, { "entropy": 1.0095211911946536, "epoch": 0.4881001063220741, "grad_norm": 0.42344188690185547, "learning_rate": 1.0243391782255957e-05, "loss": 0.9624, "mean_token_accuracy": 0.7334278732538223, "num_tokens": 14545217.0, "step": 1865 }, { "entropy": 1.055738451331854, "epoch": 0.4894086856955917, "grad_norm": 0.35296040773391724, "learning_rate": 1.0217220622873593e-05, "loss": 0.9997, "mean_token_accuracy": 0.7282113581895828, "num_tokens": 14587523.0, "step": 1870 }, { "entropy": 1.0467969175428151, "epoch": 0.49071726506910934, "grad_norm": 0.3616087734699249, "learning_rate": 1.0191049463491233e-05, "loss": 1.0211, "mean_token_accuracy": 0.7238417502492667, "num_tokens": 14629386.0, "step": 1875 }, { "entropy": 1.0537079151719808, "epoch": 0.492025844442627, "grad_norm": 0.3368701934814453, "learning_rate": 1.0164878304108873e-05, "loss": 1.0216, "mean_token_accuracy": 0.7280502840876579, "num_tokens": 14672059.0, "step": 1880 }, { "entropy": 1.1863497417420148, "epoch": 0.4933344238161446, "grad_norm": 0.35971367359161377, "learning_rate": 1.0138707144726512e-05, "loss": 1.1773, "mean_token_accuracy": 0.7055241405963898, "num_tokens": 14713139.0, "step": 1885 }, { "entropy": 1.0729142222553492, "epoch": 0.4946430031896622, "grad_norm": 0.3640763461589813, "learning_rate": 1.0112535985344152e-05, "loss": 1.0327, "mean_token_accuracy": 0.7191622793674469, "num_tokens": 14751005.0, "step": 1890 }, { "entropy": 1.06036187261343, "epoch": 0.49595158256317984, "grad_norm": 0.529114305973053, "learning_rate": 1.0086364825961792e-05, "loss": 0.9994, "mean_token_accuracy": 0.7312576837837697, "num_tokens": 14791363.0, "step": 1895 }, { "entropy": 1.0985333036631346, "epoch": 0.4972601619366975, "grad_norm": 0.655299723148346, "learning_rate": 1.006019366657943e-05, "loss": 1.0157, "mean_token_accuracy": 0.715160496532917, "num_tokens": 14824965.0, "step": 1900 }, { "epoch": 0.4972601619366975, "eval_entropy": 1.049722795009613, "eval_loss": 1.0476868152618408, "eval_mean_token_accuracy": 0.7267285461425781, "eval_num_tokens": 14824965.0, "eval_runtime": 97.3332, "eval_samples_per_second": 10.274, "eval_steps_per_second": 1.284, "step": 1900 }, { "entropy": 1.1007526509463788, "epoch": 0.4985687413102151, "grad_norm": 0.3946051001548767, "learning_rate": 1.003402250719707e-05, "loss": 1.0781, "mean_token_accuracy": 0.7194148235023021, "num_tokens": 14861751.0, "step": 1905 }, { "entropy": 1.0730507016181945, "epoch": 0.4998773206837327, "grad_norm": 0.4027865529060364, "learning_rate": 1.000785134781471e-05, "loss": 1.0453, "mean_token_accuracy": 0.7289649330079555, "num_tokens": 14899381.0, "step": 1910 }, { "entropy": 1.0322015034034848, "epoch": 0.5011859000572504, "grad_norm": 0.3154545724391937, "learning_rate": 9.981680188432349e-06, "loss": 0.9092, "mean_token_accuracy": 0.7354275353252888, "num_tokens": 14937444.0, "step": 1915 }, { "entropy": 1.070195358619094, "epoch": 0.502494479430768, "grad_norm": 0.6190849542617798, "learning_rate": 9.955509029049987e-06, "loss": 0.9885, "mean_token_accuracy": 0.7289122432470322, "num_tokens": 14973078.0, "step": 1920 }, { "entropy": 1.1444286353886128, "epoch": 0.5038030588042856, "grad_norm": 0.3921438455581665, "learning_rate": 9.929337869667627e-06, "loss": 1.1402, "mean_token_accuracy": 0.7071823209524155, "num_tokens": 15013712.0, "step": 1925 }, { "entropy": 1.1046634882688522, "epoch": 0.5051116381778032, "grad_norm": 0.35547974705696106, "learning_rate": 9.903166710285267e-06, "loss": 1.0808, "mean_token_accuracy": 0.7203411210328341, "num_tokens": 15056268.0, "step": 1930 }, { "entropy": 1.1063471656292676, "epoch": 0.5064202175513208, "grad_norm": 0.4154115915298462, "learning_rate": 9.876995550902906e-06, "loss": 1.1305, "mean_token_accuracy": 0.7238013096153736, "num_tokens": 15095616.0, "step": 1935 }, { "entropy": 1.093080758303404, "epoch": 0.5077287969248385, "grad_norm": 0.33288487792015076, "learning_rate": 9.850824391520546e-06, "loss": 1.0423, "mean_token_accuracy": 0.7236416719853878, "num_tokens": 15135823.0, "step": 1940 }, { "entropy": 1.0675853207707404, "epoch": 0.5090373762983561, "grad_norm": 0.4353385269641876, "learning_rate": 9.824653232138186e-06, "loss": 1.027, "mean_token_accuracy": 0.721454057097435, "num_tokens": 15173689.0, "step": 1945 }, { "entropy": 0.9674046196043491, "epoch": 0.5103459556718737, "grad_norm": 0.6318839192390442, "learning_rate": 9.798482072755824e-06, "loss": 0.8977, "mean_token_accuracy": 0.7534206889569759, "num_tokens": 15215084.0, "step": 1950 }, { "entropy": 1.029712200164795, "epoch": 0.5116545350453914, "grad_norm": 0.4194331765174866, "learning_rate": 9.772310913373462e-06, "loss": 0.9468, "mean_token_accuracy": 0.7411890789866448, "num_tokens": 15255554.0, "step": 1955 }, { "entropy": 1.0968198750168086, "epoch": 0.512963114418909, "grad_norm": 0.5084384679794312, "learning_rate": 9.746139753991103e-06, "loss": 1.1293, "mean_token_accuracy": 0.7243112053722143, "num_tokens": 15291594.0, "step": 1960 }, { "entropy": 1.1029110010713339, "epoch": 0.5142716937924265, "grad_norm": 0.39724844694137573, "learning_rate": 9.719968594608743e-06, "loss": 1.1192, "mean_token_accuracy": 0.7196755088865757, "num_tokens": 15331102.0, "step": 1965 }, { "entropy": 1.088718469440937, "epoch": 0.5155802731659442, "grad_norm": 0.29742664098739624, "learning_rate": 9.693797435226381e-06, "loss": 1.0498, "mean_token_accuracy": 0.718688540160656, "num_tokens": 15370084.0, "step": 1970 }, { "entropy": 1.0800972525030375, "epoch": 0.5168888525394618, "grad_norm": 0.4087739586830139, "learning_rate": 9.667626275844021e-06, "loss": 1.0431, "mean_token_accuracy": 0.7267526693642139, "num_tokens": 15407968.0, "step": 1975 }, { "entropy": 1.1198014959692955, "epoch": 0.5181974319129795, "grad_norm": 0.3880080282688141, "learning_rate": 9.64145511646166e-06, "loss": 1.1213, "mean_token_accuracy": 0.7201451409608126, "num_tokens": 15445003.0, "step": 1980 }, { "entropy": 1.098936778306961, "epoch": 0.5195060112864971, "grad_norm": 0.3442750871181488, "learning_rate": 9.6152839570793e-06, "loss": 1.0513, "mean_token_accuracy": 0.7280969373881817, "num_tokens": 15486123.0, "step": 1985 }, { "entropy": 1.106550743058324, "epoch": 0.5208145906600147, "grad_norm": 0.34958696365356445, "learning_rate": 9.58911279769694e-06, "loss": 1.0701, "mean_token_accuracy": 0.7197918314486742, "num_tokens": 15525274.0, "step": 1990 }, { "entropy": 1.0225239008665086, "epoch": 0.5221231700335324, "grad_norm": 0.2728900909423828, "learning_rate": 9.562941638314578e-06, "loss": 0.9536, "mean_token_accuracy": 0.7333686843514442, "num_tokens": 15567557.0, "step": 1995 }, { "entropy": 1.015058632940054, "epoch": 0.52343174940705, "grad_norm": 0.42618778347969055, "learning_rate": 9.536770478932218e-06, "loss": 0.9551, "mean_token_accuracy": 0.7347531169652939, "num_tokens": 15608023.0, "step": 2000 }, { "epoch": 0.52343174940705, "eval_entropy": 1.0467440810203552, "eval_loss": 1.0457295179367065, "eval_mean_token_accuracy": 0.727197292804718, "eval_num_tokens": 15608023.0, "eval_runtime": 97.3484, "eval_samples_per_second": 10.272, "eval_steps_per_second": 1.284, "step": 2000 }, { "entropy": 1.1194863107055426, "epoch": 0.5247403287805676, "grad_norm": 0.43802356719970703, "learning_rate": 9.510599319549856e-06, "loss": 1.1008, "mean_token_accuracy": 0.7199479583650827, "num_tokens": 15646573.0, "step": 2005 }, { "entropy": 1.123955625295639, "epoch": 0.5260489081540852, "grad_norm": 0.3263470232486725, "learning_rate": 9.484428160167496e-06, "loss": 1.0753, "mean_token_accuracy": 0.7194419991225004, "num_tokens": 15688301.0, "step": 2010 }, { "entropy": 1.075960063934326, "epoch": 0.5273574875276028, "grad_norm": 0.43562212586402893, "learning_rate": 9.458257000785136e-06, "loss": 1.0737, "mean_token_accuracy": 0.7178402289748191, "num_tokens": 15725214.0, "step": 2015 }, { "entropy": 1.048216313868761, "epoch": 0.5286660669011205, "grad_norm": 0.2702055871486664, "learning_rate": 9.432085841402775e-06, "loss": 1.0164, "mean_token_accuracy": 0.7296169806271792, "num_tokens": 15764282.0, "step": 2020 }, { "entropy": 1.0972533270716667, "epoch": 0.5299746462746381, "grad_norm": 0.38811036944389343, "learning_rate": 9.405914682020413e-06, "loss": 1.0683, "mean_token_accuracy": 0.7193491701036692, "num_tokens": 15807279.0, "step": 2025 }, { "entropy": 1.0837439749389888, "epoch": 0.5312832256481557, "grad_norm": 0.38090547919273376, "learning_rate": 9.379743522638053e-06, "loss": 1.0179, "mean_token_accuracy": 0.7216418325901032, "num_tokens": 15849485.0, "step": 2030 }, { "entropy": 1.0280620481818914, "epoch": 0.5325918050216734, "grad_norm": 0.44307711720466614, "learning_rate": 9.353572363255693e-06, "loss": 0.9705, "mean_token_accuracy": 0.7349147848784924, "num_tokens": 15888255.0, "step": 2035 }, { "entropy": 1.1240373719483614, "epoch": 0.5339003843951909, "grad_norm": 0.41446390748023987, "learning_rate": 9.327401203873332e-06, "loss": 1.1051, "mean_token_accuracy": 0.7141484066843986, "num_tokens": 15929388.0, "step": 2040 }, { "entropy": 1.1305542685091496, "epoch": 0.5352089637687086, "grad_norm": 0.749841034412384, "learning_rate": 9.301230044490972e-06, "loss": 1.126, "mean_token_accuracy": 0.7178158447146415, "num_tokens": 15971137.0, "step": 2045 }, { "entropy": 1.0323502123355865, "epoch": 0.5365175431422262, "grad_norm": 0.48051634430885315, "learning_rate": 9.27505888510861e-06, "loss": 0.9978, "mean_token_accuracy": 0.7359145298600197, "num_tokens": 16011804.0, "step": 2050 }, { "entropy": 1.0280569117516278, "epoch": 0.5378261225157438, "grad_norm": 0.2908135950565338, "learning_rate": 9.24888772572625e-06, "loss": 0.9785, "mean_token_accuracy": 0.7374115951359272, "num_tokens": 16050785.0, "step": 2055 }, { "entropy": 1.0569781139492989, "epoch": 0.5391347018892615, "grad_norm": 0.35007575154304504, "learning_rate": 9.22271656634389e-06, "loss": 0.9877, "mean_token_accuracy": 0.7339214488863945, "num_tokens": 16090836.0, "step": 2060 }, { "entropy": 1.0800054289400578, "epoch": 0.5404432812627791, "grad_norm": 0.32629233598709106, "learning_rate": 9.196545406961529e-06, "loss": 1.0375, "mean_token_accuracy": 0.7278057150542736, "num_tokens": 16131547.0, "step": 2065 }, { "entropy": 1.0801922511309385, "epoch": 0.5417518606362968, "grad_norm": 0.43629997968673706, "learning_rate": 9.170374247579169e-06, "loss": 1.0448, "mean_token_accuracy": 0.7214881278574466, "num_tokens": 16168637.0, "step": 2070 }, { "entropy": 1.10114951916039, "epoch": 0.5430604400098143, "grad_norm": 0.3292621076107025, "learning_rate": 9.144203088196809e-06, "loss": 1.0632, "mean_token_accuracy": 0.7205091584473848, "num_tokens": 16210603.0, "step": 2075 }, { "entropy": 1.0731607031077146, "epoch": 0.5443690193833319, "grad_norm": 0.3450382351875305, "learning_rate": 9.118031928814447e-06, "loss": 0.985, "mean_token_accuracy": 0.7285647377371788, "num_tokens": 16248842.0, "step": 2080 }, { "entropy": 1.110469686985016, "epoch": 0.5456775987568496, "grad_norm": 0.5088993310928345, "learning_rate": 9.091860769432087e-06, "loss": 1.0761, "mean_token_accuracy": 0.7160798791795969, "num_tokens": 16282386.0, "step": 2085 }, { "entropy": 1.066830562800169, "epoch": 0.5469861781303672, "grad_norm": 0.5230424404144287, "learning_rate": 9.065689610049726e-06, "loss": 1.0209, "mean_token_accuracy": 0.7298058848828077, "num_tokens": 16315017.0, "step": 2090 }, { "entropy": 1.0682125560939313, "epoch": 0.5482947575038849, "grad_norm": 0.29241734743118286, "learning_rate": 9.039518450667366e-06, "loss": 1.0398, "mean_token_accuracy": 0.7212686751037836, "num_tokens": 16356673.0, "step": 2095 }, { "entropy": 1.0166700199246406, "epoch": 0.5496033368774025, "grad_norm": 0.37928712368011475, "learning_rate": 9.013347291285006e-06, "loss": 0.9511, "mean_token_accuracy": 0.7362009316682816, "num_tokens": 16397408.0, "step": 2100 }, { "epoch": 0.5496033368774025, "eval_entropy": 1.0475300779342651, "eval_loss": 1.0435727834701538, "eval_mean_token_accuracy": 0.7274958577156067, "eval_num_tokens": 16397408.0, "eval_runtime": 97.2217, "eval_samples_per_second": 10.286, "eval_steps_per_second": 1.286, "step": 2100 }, { "entropy": 1.0761947065591813, "epoch": 0.5509119162509201, "grad_norm": 0.4101719558238983, "learning_rate": 8.987176131902644e-06, "loss": 1.0654, "mean_token_accuracy": 0.7192930780351162, "num_tokens": 16434511.0, "step": 2105 }, { "entropy": 1.097949853539467, "epoch": 0.5522204956244378, "grad_norm": 0.4837743043899536, "learning_rate": 8.961004972520282e-06, "loss": 1.0397, "mean_token_accuracy": 0.7156211558729411, "num_tokens": 16472214.0, "step": 2110 }, { "entropy": 1.0977317329496146, "epoch": 0.5535290749979553, "grad_norm": 0.39783498644828796, "learning_rate": 8.934833813137923e-06, "loss": 1.1348, "mean_token_accuracy": 0.71487329415977, "num_tokens": 16512818.0, "step": 2115 }, { "entropy": 1.1164306864142417, "epoch": 0.5548376543714729, "grad_norm": 0.5113677978515625, "learning_rate": 8.908662653755563e-06, "loss": 1.0433, "mean_token_accuracy": 0.7190901465713978, "num_tokens": 16549988.0, "step": 2120 }, { "entropy": 1.118570750951767, "epoch": 0.5561462337449906, "grad_norm": 0.457736074924469, "learning_rate": 8.882491494373201e-06, "loss": 1.0624, "mean_token_accuracy": 0.7195118874311447, "num_tokens": 16588026.0, "step": 2125 }, { "entropy": 1.0895183108747006, "epoch": 0.5574548131185082, "grad_norm": 0.443959504365921, "learning_rate": 8.856320334990841e-06, "loss": 1.0314, "mean_token_accuracy": 0.7270146373659372, "num_tokens": 16626368.0, "step": 2130 }, { "entropy": 1.091917496919632, "epoch": 0.5587633924920259, "grad_norm": 0.3934486210346222, "learning_rate": 8.83014917560848e-06, "loss": 1.0449, "mean_token_accuracy": 0.7249642074108124, "num_tokens": 16658226.0, "step": 2135 }, { "entropy": 1.0808244206011295, "epoch": 0.5600719718655435, "grad_norm": 0.4523155093193054, "learning_rate": 8.80397801622612e-06, "loss": 1.0726, "mean_token_accuracy": 0.725436444580555, "num_tokens": 16694803.0, "step": 2140 }, { "entropy": 1.0595968872308732, "epoch": 0.561380551239061, "grad_norm": 0.6529179215431213, "learning_rate": 8.77780685684376e-06, "loss": 1.0853, "mean_token_accuracy": 0.728100998699665, "num_tokens": 16731046.0, "step": 2145 }, { "entropy": 1.1225625108927488, "epoch": 0.5626891306125787, "grad_norm": 0.5509600043296814, "learning_rate": 8.751635697461398e-06, "loss": 1.0464, "mean_token_accuracy": 0.7140719693154096, "num_tokens": 16770258.0, "step": 2150 }, { "entropy": 1.0641490031033753, "epoch": 0.5639977099860963, "grad_norm": 0.401279091835022, "learning_rate": 8.725464538079038e-06, "loss": 1.006, "mean_token_accuracy": 0.7247611582279205, "num_tokens": 16808152.0, "step": 2155 }, { "entropy": 1.077583958208561, "epoch": 0.565306289359614, "grad_norm": 0.3935731053352356, "learning_rate": 8.699293378696676e-06, "loss": 1.0364, "mean_token_accuracy": 0.7233245447278023, "num_tokens": 16846703.0, "step": 2160 }, { "entropy": 1.1116807896643877, "epoch": 0.5666148687331316, "grad_norm": 0.31971287727355957, "learning_rate": 8.673122219314316e-06, "loss": 1.1141, "mean_token_accuracy": 0.7137772016227245, "num_tokens": 16885531.0, "step": 2165 }, { "entropy": 1.1279645822942257, "epoch": 0.5679234481066492, "grad_norm": 0.33220037817955017, "learning_rate": 8.646951059931956e-06, "loss": 1.1373, "mean_token_accuracy": 0.7136048063635826, "num_tokens": 16922160.0, "step": 2170 }, { "entropy": 1.0512098886072636, "epoch": 0.5692320274801669, "grad_norm": 0.41083812713623047, "learning_rate": 8.620779900549595e-06, "loss": 1.0153, "mean_token_accuracy": 0.7320755422115326, "num_tokens": 16961964.0, "step": 2175 }, { "entropy": 1.0632092259824275, "epoch": 0.5705406068536845, "grad_norm": 0.3310851752758026, "learning_rate": 8.594608741167235e-06, "loss": 1.0625, "mean_token_accuracy": 0.7241213608533144, "num_tokens": 17005710.0, "step": 2180 }, { "entropy": 1.0541639674454928, "epoch": 0.571849186227202, "grad_norm": 0.5147233009338379, "learning_rate": 8.568437581784875e-06, "loss": 1.0132, "mean_token_accuracy": 0.7322873834520578, "num_tokens": 17043166.0, "step": 2185 }, { "entropy": 1.1003672912716866, "epoch": 0.5731577656007197, "grad_norm": 0.6766669154167175, "learning_rate": 8.542266422402513e-06, "loss": 1.1054, "mean_token_accuracy": 0.7199019856750966, "num_tokens": 17077200.0, "step": 2190 }, { "entropy": 1.0057383235543966, "epoch": 0.5744663449742373, "grad_norm": 0.41832664608955383, "learning_rate": 8.516095263020152e-06, "loss": 0.9413, "mean_token_accuracy": 0.7480561546981335, "num_tokens": 17116788.0, "step": 2195 }, { "entropy": 1.0743875481188296, "epoch": 0.575774924347755, "grad_norm": 0.38505351543426514, "learning_rate": 8.489924103637792e-06, "loss": 0.9846, "mean_token_accuracy": 0.7285970754921436, "num_tokens": 17156595.0, "step": 2200 }, { "epoch": 0.575774924347755, "eval_entropy": 1.047807973384857, "eval_loss": 1.0419152975082397, "eval_mean_token_accuracy": 0.7281912508010864, "eval_num_tokens": 17156595.0, "eval_runtime": 97.2111, "eval_samples_per_second": 10.287, "eval_steps_per_second": 1.286, "step": 2200 }, { "entropy": 1.018094291538, "epoch": 0.5770835037212726, "grad_norm": 0.47084370255470276, "learning_rate": 8.463752944255432e-06, "loss": 0.9611, "mean_token_accuracy": 0.7393688425421715, "num_tokens": 17190391.0, "step": 2205 }, { "entropy": 1.0440906029194594, "epoch": 0.5783920830947902, "grad_norm": 0.611663818359375, "learning_rate": 8.43758178487307e-06, "loss": 1.0335, "mean_token_accuracy": 0.7285435609519482, "num_tokens": 17228559.0, "step": 2210 }, { "entropy": 1.033133103698492, "epoch": 0.5797006624683079, "grad_norm": 0.4824153482913971, "learning_rate": 8.41141062549071e-06, "loss": 1.013, "mean_token_accuracy": 0.7390854120254516, "num_tokens": 17264262.0, "step": 2215 }, { "entropy": 1.0519094690680504, "epoch": 0.5810092418418255, "grad_norm": 0.35666635632514954, "learning_rate": 8.385239466108349e-06, "loss": 1.0353, "mean_token_accuracy": 0.7265864860266447, "num_tokens": 17298474.0, "step": 2220 }, { "entropy": 1.1184704467654227, "epoch": 0.5823178212153431, "grad_norm": 0.4187191426753998, "learning_rate": 8.359068306725989e-06, "loss": 1.0395, "mean_token_accuracy": 0.7226161871105432, "num_tokens": 17342249.0, "step": 2225 }, { "entropy": 1.109714150428772, "epoch": 0.5836264005888607, "grad_norm": 0.424214243888855, "learning_rate": 8.332897147343629e-06, "loss": 1.0727, "mean_token_accuracy": 0.7170622806996108, "num_tokens": 17381434.0, "step": 2230 }, { "entropy": 1.0706456020474433, "epoch": 0.5849349799623783, "grad_norm": 0.3002157211303711, "learning_rate": 8.306725987961267e-06, "loss": 1.046, "mean_token_accuracy": 0.7237683780491352, "num_tokens": 17419527.0, "step": 2235 }, { "entropy": 1.0564652543514967, "epoch": 0.586243559335896, "grad_norm": 0.3851909041404724, "learning_rate": 8.280554828578907e-06, "loss": 0.9903, "mean_token_accuracy": 0.731862674281001, "num_tokens": 17456332.0, "step": 2240 }, { "entropy": 1.059485612437129, "epoch": 0.5875521387094136, "grad_norm": 0.30845922231674194, "learning_rate": 8.254383669196546e-06, "loss": 1.0166, "mean_token_accuracy": 0.7287565600126982, "num_tokens": 17503670.0, "step": 2245 }, { "entropy": 1.0321750465780497, "epoch": 0.5888607180829312, "grad_norm": 0.35590362548828125, "learning_rate": 8.228212509814186e-06, "loss": 1.0185, "mean_token_accuracy": 0.7324472777545452, "num_tokens": 17541709.0, "step": 2250 }, { "entropy": 1.029836729541421, "epoch": 0.5901692974564489, "grad_norm": 0.5374951362609863, "learning_rate": 8.202041350431826e-06, "loss": 0.9625, "mean_token_accuracy": 0.7331007912755012, "num_tokens": 17578411.0, "step": 2255 }, { "entropy": 1.0388061545789242, "epoch": 0.5914778768299664, "grad_norm": 0.26899445056915283, "learning_rate": 8.175870191049464e-06, "loss": 0.9839, "mean_token_accuracy": 0.7304695654660464, "num_tokens": 17618221.0, "step": 2260 }, { "entropy": 1.0612275712192059, "epoch": 0.5927864562034841, "grad_norm": 0.47736746072769165, "learning_rate": 8.149699031667102e-06, "loss": 1.0262, "mean_token_accuracy": 0.725922341644764, "num_tokens": 17656712.0, "step": 2265 }, { "entropy": 1.1566721752285958, "epoch": 0.5940950355770017, "grad_norm": 0.37901201844215393, "learning_rate": 8.123527872284743e-06, "loss": 1.1165, "mean_token_accuracy": 0.7078841663897038, "num_tokens": 17692983.0, "step": 2270 }, { "entropy": 1.1062884621322155, "epoch": 0.5954036149505193, "grad_norm": 0.5285326838493347, "learning_rate": 8.097356712902383e-06, "loss": 1.06, "mean_token_accuracy": 0.7236683152616024, "num_tokens": 17732335.0, "step": 2275 }, { "entropy": 1.093737093731761, "epoch": 0.596712194324037, "grad_norm": 0.3490510582923889, "learning_rate": 8.071185553520021e-06, "loss": 1.0661, "mean_token_accuracy": 0.7216609187424183, "num_tokens": 17767130.0, "step": 2280 }, { "entropy": 1.108936246484518, "epoch": 0.5980207736975546, "grad_norm": 0.5806106925010681, "learning_rate": 8.045014394137661e-06, "loss": 1.1176, "mean_token_accuracy": 0.7208809830248356, "num_tokens": 17803730.0, "step": 2285 }, { "entropy": 1.168883834220469, "epoch": 0.5993293530710723, "grad_norm": 0.36321160197257996, "learning_rate": 8.0188432347553e-06, "loss": 1.1264, "mean_token_accuracy": 0.707911616563797, "num_tokens": 17844864.0, "step": 2290 }, { "entropy": 1.0555749453604222, "epoch": 0.6006379324445899, "grad_norm": 0.37642714381217957, "learning_rate": 7.99267207537294e-06, "loss": 1.0828, "mean_token_accuracy": 0.7319136694073677, "num_tokens": 17888044.0, "step": 2295 }, { "entropy": 1.0490516472607851, "epoch": 0.6019465118181074, "grad_norm": 1.011374831199646, "learning_rate": 7.96650091599058e-06, "loss": 1.0583, "mean_token_accuracy": 0.7280811902135611, "num_tokens": 17925845.0, "step": 2300 }, { "epoch": 0.6019465118181074, "eval_entropy": 1.038636552810669, "eval_loss": 1.0401142835617065, "eval_mean_token_accuracy": 0.7285269827842712, "eval_num_tokens": 17925845.0, "eval_runtime": 97.1825, "eval_samples_per_second": 10.29, "eval_steps_per_second": 1.286, "step": 2300 }, { "entropy": 1.060953125730157, "epoch": 0.6032550911916251, "grad_norm": 0.3888475000858307, "learning_rate": 7.940329756608218e-06, "loss": 1.0127, "mean_token_accuracy": 0.7261522125452757, "num_tokens": 17967227.0, "step": 2305 }, { "entropy": 1.108807889930904, "epoch": 0.6045636705651427, "grad_norm": 0.4732019901275635, "learning_rate": 7.914158597225858e-06, "loss": 1.048, "mean_token_accuracy": 0.7257308520376682, "num_tokens": 18000648.0, "step": 2310 }, { "entropy": 1.0645822279155255, "epoch": 0.6058722499386603, "grad_norm": 0.5344901084899902, "learning_rate": 7.887987437843498e-06, "loss": 1.0637, "mean_token_accuracy": 0.726478286832571, "num_tokens": 18041076.0, "step": 2315 }, { "entropy": 1.0792375735938549, "epoch": 0.607180829312178, "grad_norm": 0.4639453589916229, "learning_rate": 7.861816278461136e-06, "loss": 1.0164, "mean_token_accuracy": 0.7243579313158989, "num_tokens": 18078141.0, "step": 2320 }, { "entropy": 1.1244300540536643, "epoch": 0.6084894086856956, "grad_norm": 0.4256502389907837, "learning_rate": 7.835645119078776e-06, "loss": 1.0715, "mean_token_accuracy": 0.721744529902935, "num_tokens": 18115889.0, "step": 2325 }, { "entropy": 1.0580251537263394, "epoch": 0.6097979880592133, "grad_norm": 0.3264947235584259, "learning_rate": 7.809473959696415e-06, "loss": 1.0605, "mean_token_accuracy": 0.7277645215392112, "num_tokens": 18157055.0, "step": 2330 }, { "entropy": 1.0746600326150655, "epoch": 0.6111065674327308, "grad_norm": 0.2848566472530365, "learning_rate": 7.783302800314055e-06, "loss": 1.0606, "mean_token_accuracy": 0.7236160174012184, "num_tokens": 18194754.0, "step": 2335 }, { "entropy": 1.0729442548006773, "epoch": 0.6124151468062484, "grad_norm": 0.34202495217323303, "learning_rate": 7.757131640931695e-06, "loss": 1.0274, "mean_token_accuracy": 0.7167530670762062, "num_tokens": 18239971.0, "step": 2340 }, { "entropy": 1.1377609726041555, "epoch": 0.6137237261797661, "grad_norm": 0.5917489528656006, "learning_rate": 7.730960481549333e-06, "loss": 1.0862, "mean_token_accuracy": 0.7097996596246958, "num_tokens": 18277572.0, "step": 2345 }, { "entropy": 1.0688726913183928, "epoch": 0.6150323055532837, "grad_norm": 0.3864566683769226, "learning_rate": 7.704789322166972e-06, "loss": 0.9895, "mean_token_accuracy": 0.7335865460336208, "num_tokens": 18318117.0, "step": 2350 }, { "entropy": 1.0338464200496673, "epoch": 0.6163408849268014, "grad_norm": 0.3866376280784607, "learning_rate": 7.678618162784612e-06, "loss": 0.9989, "mean_token_accuracy": 0.7306736633181572, "num_tokens": 18357427.0, "step": 2355 }, { "entropy": 1.0776856757700444, "epoch": 0.617649464300319, "grad_norm": 0.37204793095588684, "learning_rate": 7.652447003402252e-06, "loss": 1.0805, "mean_token_accuracy": 0.7231950167566538, "num_tokens": 18398690.0, "step": 2360 }, { "entropy": 1.0225880276411772, "epoch": 0.6189580436738366, "grad_norm": 0.4354966878890991, "learning_rate": 7.62627584401989e-06, "loss": 0.9557, "mean_token_accuracy": 0.7365483164787292, "num_tokens": 18436962.0, "step": 2365 }, { "entropy": 1.056931370869279, "epoch": 0.6202666230473542, "grad_norm": 0.29399538040161133, "learning_rate": 7.60010468463753e-06, "loss": 1.0455, "mean_token_accuracy": 0.7246545068919659, "num_tokens": 18479590.0, "step": 2370 }, { "entropy": 1.0880108322948217, "epoch": 0.6215752024208718, "grad_norm": 0.26629939675331116, "learning_rate": 7.5739335252551695e-06, "loss": 1.0752, "mean_token_accuracy": 0.7159001469612122, "num_tokens": 18520588.0, "step": 2375 }, { "entropy": 1.0517158180475235, "epoch": 0.6228837817943894, "grad_norm": 0.4959399998188019, "learning_rate": 7.547762365872809e-06, "loss": 0.9754, "mean_token_accuracy": 0.7284245084971189, "num_tokens": 18556912.0, "step": 2380 }, { "entropy": 1.0565275117754935, "epoch": 0.6241923611679071, "grad_norm": 0.6061561107635498, "learning_rate": 7.521591206490449e-06, "loss": 0.9693, "mean_token_accuracy": 0.7295171424746514, "num_tokens": 18591545.0, "step": 2385 }, { "entropy": 1.1225751712918282, "epoch": 0.6255009405414247, "grad_norm": 0.43570899963378906, "learning_rate": 7.495420047108087e-06, "loss": 1.0837, "mean_token_accuracy": 0.7186510719358921, "num_tokens": 18632236.0, "step": 2390 }, { "entropy": 1.1181074380874634, "epoch": 0.6268095199149424, "grad_norm": 0.49738365411758423, "learning_rate": 7.469248887725726e-06, "loss": 1.0293, "mean_token_accuracy": 0.7210543874651194, "num_tokens": 18671653.0, "step": 2395 }, { "entropy": 1.092723872885108, "epoch": 0.62811809928846, "grad_norm": 0.5354933142662048, "learning_rate": 7.4430777283433664e-06, "loss": 1.0792, "mean_token_accuracy": 0.7276735674589873, "num_tokens": 18712711.0, "step": 2400 }, { "epoch": 0.62811809928846, "eval_entropy": 1.0314421949386596, "eval_loss": 1.0386921167373657, "eval_mean_token_accuracy": 0.7286318821907043, "eval_num_tokens": 18712711.0, "eval_runtime": 97.152, "eval_samples_per_second": 10.293, "eval_steps_per_second": 1.287, "step": 2400 }, { "entropy": 1.1883727714419365, "epoch": 0.6294266786619775, "grad_norm": 0.3528256416320801, "learning_rate": 7.416906568961006e-06, "loss": 1.1798, "mean_token_accuracy": 0.7113151207566262, "num_tokens": 18752528.0, "step": 2405 }, { "entropy": 1.1015507124364377, "epoch": 0.6307352580354952, "grad_norm": 0.421294242143631, "learning_rate": 7.390735409578646e-06, "loss": 1.058, "mean_token_accuracy": 0.7168937616050244, "num_tokens": 18791480.0, "step": 2410 }, { "entropy": 1.040089299157262, "epoch": 0.6320438374090128, "grad_norm": 0.34841352701187134, "learning_rate": 7.364564250196284e-06, "loss": 0.9257, "mean_token_accuracy": 0.7292000584304332, "num_tokens": 18827672.0, "step": 2415 }, { "entropy": 1.0772018309682607, "epoch": 0.6333524167825305, "grad_norm": 0.39158549904823303, "learning_rate": 7.338393090813923e-06, "loss": 1.0312, "mean_token_accuracy": 0.7295682705938816, "num_tokens": 18867541.0, "step": 2420 }, { "entropy": 1.191065814346075, "epoch": 0.6346609961560481, "grad_norm": 0.5301610231399536, "learning_rate": 7.312221931431563e-06, "loss": 1.214, "mean_token_accuracy": 0.7043293602764606, "num_tokens": 18907080.0, "step": 2425 }, { "entropy": 1.0250558149069549, "epoch": 0.6359695755295657, "grad_norm": 0.46418026089668274, "learning_rate": 7.286050772049203e-06, "loss": 0.9982, "mean_token_accuracy": 0.7344914440065622, "num_tokens": 18945688.0, "step": 2430 }, { "entropy": 1.1069321628659963, "epoch": 0.6372781549030834, "grad_norm": 0.45496806502342224, "learning_rate": 7.259879612666841e-06, "loss": 1.016, "mean_token_accuracy": 0.7186556220054626, "num_tokens": 18982398.0, "step": 2435 }, { "entropy": 1.0202362570911645, "epoch": 0.638586734276601, "grad_norm": 0.46681129932403564, "learning_rate": 7.233708453284481e-06, "loss": 1.0134, "mean_token_accuracy": 0.7437549475580454, "num_tokens": 19022263.0, "step": 2440 }, { "entropy": 1.026650892943144, "epoch": 0.6398953136501185, "grad_norm": 0.2987666428089142, "learning_rate": 7.20753729390212e-06, "loss": 0.9835, "mean_token_accuracy": 0.7353227615356446, "num_tokens": 19065353.0, "step": 2445 }, { "entropy": 1.0794596791267395, "epoch": 0.6412038930236362, "grad_norm": 0.6045072674751282, "learning_rate": 7.1813661345197595e-06, "loss": 1.0901, "mean_token_accuracy": 0.7167344830930233, "num_tokens": 19106128.0, "step": 2450 }, { "entropy": 1.1413863252848386, "epoch": 0.6425124723971538, "grad_norm": 0.408202588558197, "learning_rate": 7.1551949751373995e-06, "loss": 1.07, "mean_token_accuracy": 0.709665472060442, "num_tokens": 19144518.0, "step": 2455 }, { "entropy": 1.018723826110363, "epoch": 0.6438210517706715, "grad_norm": 0.7658793330192566, "learning_rate": 7.129023815755039e-06, "loss": 1.0139, "mean_token_accuracy": 0.7404938109219075, "num_tokens": 19186225.0, "step": 2460 }, { "entropy": 1.035357365384698, "epoch": 0.6451296311441891, "grad_norm": 0.27525585889816284, "learning_rate": 7.102852656372677e-06, "loss": 0.9998, "mean_token_accuracy": 0.7318320117890835, "num_tokens": 19227399.0, "step": 2465 }, { "entropy": 0.9564740158617496, "epoch": 0.6464382105177067, "grad_norm": 0.4672304093837738, "learning_rate": 7.076681496990317e-06, "loss": 0.9195, "mean_token_accuracy": 0.7519853860139847, "num_tokens": 19267194.0, "step": 2470 }, { "entropy": 1.0981974124908447, "epoch": 0.6477467898912244, "grad_norm": 0.5112661719322205, "learning_rate": 7.050510337607956e-06, "loss": 1.0416, "mean_token_accuracy": 0.7168846629559994, "num_tokens": 19303499.0, "step": 2475 }, { "entropy": 1.1099361848086118, "epoch": 0.6490553692647419, "grad_norm": 0.6969805359840393, "learning_rate": 7.0243391782255965e-06, "loss": 1.0715, "mean_token_accuracy": 0.7152190506458282, "num_tokens": 19341950.0, "step": 2480 }, { "entropy": 1.0465524211525916, "epoch": 0.6503639486382596, "grad_norm": 0.26998811960220337, "learning_rate": 6.998168018843236e-06, "loss": 1.0512, "mean_token_accuracy": 0.7323187306523323, "num_tokens": 19386737.0, "step": 2485 }, { "entropy": 0.9889295142143965, "epoch": 0.6516725280117772, "grad_norm": 0.38220036029815674, "learning_rate": 6.971996859460874e-06, "loss": 0.9593, "mean_token_accuracy": 0.7394830010831356, "num_tokens": 19428001.0, "step": 2490 }, { "entropy": 1.0677450440824032, "epoch": 0.6529811073852948, "grad_norm": 0.4677034914493561, "learning_rate": 6.945825700078514e-06, "loss": 0.9954, "mean_token_accuracy": 0.7268911335617304, "num_tokens": 19465393.0, "step": 2495 }, { "entropy": 1.049769724905491, "epoch": 0.6542896867588125, "grad_norm": 0.6634142398834229, "learning_rate": 6.919654540696153e-06, "loss": 1.039, "mean_token_accuracy": 0.7285342015326023, "num_tokens": 19501464.0, "step": 2500 }, { "epoch": 0.6542896867588125, "eval_entropy": 1.035133779525757, "eval_loss": 1.0362484455108643, "eval_mean_token_accuracy": 0.7291943707466125, "eval_num_tokens": 19501464.0, "eval_runtime": 97.1896, "eval_samples_per_second": 10.289, "eval_steps_per_second": 1.286, "step": 2500 }, { "entropy": 1.0178256921470166, "epoch": 0.6555982661323301, "grad_norm": 0.3480639159679413, "learning_rate": 6.8934833813137926e-06, "loss": 0.9423, "mean_token_accuracy": 0.7350625194609165, "num_tokens": 19541127.0, "step": 2505 }, { "entropy": 1.1573872964829206, "epoch": 0.6569068455058478, "grad_norm": 0.3416019678115845, "learning_rate": 6.867312221931433e-06, "loss": 1.1634, "mean_token_accuracy": 0.7034627720713615, "num_tokens": 19577837.0, "step": 2510 }, { "entropy": 1.08112654350698, "epoch": 0.6582154248793654, "grad_norm": 0.5420477390289307, "learning_rate": 6.841141062549072e-06, "loss": 1.016, "mean_token_accuracy": 0.7271684132516384, "num_tokens": 19608622.0, "step": 2515 }, { "entropy": 0.9906784903258086, "epoch": 0.6595240042528829, "grad_norm": 0.46363091468811035, "learning_rate": 6.81496990316671e-06, "loss": 0.9497, "mean_token_accuracy": 0.7543116796761751, "num_tokens": 19643843.0, "step": 2520 }, { "entropy": 1.001466591656208, "epoch": 0.6608325836264006, "grad_norm": 0.37773942947387695, "learning_rate": 6.78879874378435e-06, "loss": 0.9092, "mean_token_accuracy": 0.7434275582432747, "num_tokens": 19681055.0, "step": 2525 }, { "entropy": 1.1146674454212189, "epoch": 0.6621411629999182, "grad_norm": 0.33123865723609924, "learning_rate": 6.7626275844019895e-06, "loss": 1.1595, "mean_token_accuracy": 0.721651129052043, "num_tokens": 19722916.0, "step": 2530 }, { "entropy": 1.0137363068759442, "epoch": 0.6634497423734358, "grad_norm": 0.2801375091075897, "learning_rate": 6.736456425019629e-06, "loss": 1.0089, "mean_token_accuracy": 0.7334360480308533, "num_tokens": 19764966.0, "step": 2535 }, { "entropy": 1.0639188002794981, "epoch": 0.6647583217469535, "grad_norm": 0.3247496485710144, "learning_rate": 6.710285265637269e-06, "loss": 1.0763, "mean_token_accuracy": 0.7175609428435564, "num_tokens": 19804025.0, "step": 2540 }, { "entropy": 1.1048081483691932, "epoch": 0.6660669011204711, "grad_norm": 0.4086282551288605, "learning_rate": 6.684114106254907e-06, "loss": 1.1058, "mean_token_accuracy": 0.7259872317314148, "num_tokens": 19843774.0, "step": 2545 }, { "entropy": 1.1255848862230777, "epoch": 0.6673754804939888, "grad_norm": 0.44439879059791565, "learning_rate": 6.657942946872546e-06, "loss": 1.0964, "mean_token_accuracy": 0.7162352904677391, "num_tokens": 19881825.0, "step": 2550 }, { "entropy": 1.1439194109290838, "epoch": 0.6686840598675063, "grad_norm": 0.29299938678741455, "learning_rate": 6.6317717874901865e-06, "loss": 1.1223, "mean_token_accuracy": 0.7133913572877646, "num_tokens": 19921618.0, "step": 2555 }, { "entropy": 1.029699081927538, "epoch": 0.6699926392410239, "grad_norm": 0.40280821919441223, "learning_rate": 6.605600628107826e-06, "loss": 0.9621, "mean_token_accuracy": 0.7391855999827385, "num_tokens": 19960586.0, "step": 2560 }, { "entropy": 1.003632688894868, "epoch": 0.6713012186145416, "grad_norm": 0.2862677276134491, "learning_rate": 6.579429468725466e-06, "loss": 0.9738, "mean_token_accuracy": 0.7375729866325855, "num_tokens": 20000547.0, "step": 2565 }, { "entropy": 1.1057576406747103, "epoch": 0.6726097979880592, "grad_norm": 0.28728872537612915, "learning_rate": 6.553258309343105e-06, "loss": 1.0678, "mean_token_accuracy": 0.7252844780683517, "num_tokens": 20045934.0, "step": 2570 }, { "entropy": 1.0278929378837347, "epoch": 0.6739183773615769, "grad_norm": 0.3557819724082947, "learning_rate": 6.527087149960743e-06, "loss": 0.9804, "mean_token_accuracy": 0.7362292345613242, "num_tokens": 20088247.0, "step": 2575 }, { "entropy": 1.0297599412500857, "epoch": 0.6752269567350945, "grad_norm": 0.3730546534061432, "learning_rate": 6.500915990578383e-06, "loss": 1.0108, "mean_token_accuracy": 0.7349158681929111, "num_tokens": 20130988.0, "step": 2580 }, { "entropy": 1.0113672140985728, "epoch": 0.6765355361086121, "grad_norm": 0.3825022876262665, "learning_rate": 6.474744831196023e-06, "loss": 0.9893, "mean_token_accuracy": 0.7319887422025204, "num_tokens": 20169034.0, "step": 2585 }, { "entropy": 1.1008230350911616, "epoch": 0.6778441154821297, "grad_norm": 0.5840580463409424, "learning_rate": 6.448573671813662e-06, "loss": 1.0848, "mean_token_accuracy": 0.716325681656599, "num_tokens": 20206269.0, "step": 2590 }, { "entropy": 1.0766204059123994, "epoch": 0.6791526948556473, "grad_norm": 0.4373389482498169, "learning_rate": 6.422402512431302e-06, "loss": 1.0217, "mean_token_accuracy": 0.7277089573442936, "num_tokens": 20241866.0, "step": 2595 }, { "entropy": 1.0236305307596922, "epoch": 0.6804612742291649, "grad_norm": 0.6429861783981323, "learning_rate": 6.39623135304894e-06, "loss": 0.9714, "mean_token_accuracy": 0.7359235771000385, "num_tokens": 20279405.0, "step": 2600 }, { "epoch": 0.6804612742291649, "eval_entropy": 1.0339577412605285, "eval_loss": 1.03507399559021, "eval_mean_token_accuracy": 0.7294820728302002, "eval_num_tokens": 20279405.0, "eval_runtime": 97.2889, "eval_samples_per_second": 10.279, "eval_steps_per_second": 1.285, "step": 2600 }, { "entropy": 1.1064529620110988, "epoch": 0.6817698536026826, "grad_norm": 0.6711133718490601, "learning_rate": 6.3700601936665795e-06, "loss": 1.0578, "mean_token_accuracy": 0.7143168658018112, "num_tokens": 20318358.0, "step": 2605 }, { "entropy": 1.0551523761823773, "epoch": 0.6830784329762002, "grad_norm": 0.6876797080039978, "learning_rate": 6.3438890342842196e-06, "loss": 1.0351, "mean_token_accuracy": 0.7304907165467739, "num_tokens": 20355919.0, "step": 2610 }, { "entropy": 1.0216753158718348, "epoch": 0.6843870123497179, "grad_norm": 0.43939870595932007, "learning_rate": 6.317717874901859e-06, "loss": 0.9788, "mean_token_accuracy": 0.7326975151896477, "num_tokens": 20395739.0, "step": 2615 }, { "entropy": 1.0042755860835313, "epoch": 0.6856955917232355, "grad_norm": 0.45771950483322144, "learning_rate": 6.291546715519498e-06, "loss": 1.0214, "mean_token_accuracy": 0.7377393446862698, "num_tokens": 20439063.0, "step": 2620 }, { "entropy": 1.08444495908916, "epoch": 0.687004171096753, "grad_norm": 0.328112930059433, "learning_rate": 6.265375556137138e-06, "loss": 1.0578, "mean_token_accuracy": 0.7258397076278925, "num_tokens": 20480446.0, "step": 2625 }, { "entropy": 1.053849907964468, "epoch": 0.6883127504702707, "grad_norm": 0.4496280550956726, "learning_rate": 6.2392043967547764e-06, "loss": 1.0282, "mean_token_accuracy": 0.7329249262809754, "num_tokens": 20519257.0, "step": 2630 }, { "entropy": 1.0985306035727262, "epoch": 0.6896213298437883, "grad_norm": 0.43236038088798523, "learning_rate": 6.213033237372416e-06, "loss": 1.0809, "mean_token_accuracy": 0.7229348205029964, "num_tokens": 20560703.0, "step": 2635 }, { "entropy": 1.039340978488326, "epoch": 0.690929909217306, "grad_norm": 0.592126727104187, "learning_rate": 6.186862077990056e-06, "loss": 0.9885, "mean_token_accuracy": 0.7413885131478309, "num_tokens": 20602363.0, "step": 2640 }, { "entropy": 1.040134682506323, "epoch": 0.6922384885908236, "grad_norm": 0.5198757648468018, "learning_rate": 6.160690918607695e-06, "loss": 1.0226, "mean_token_accuracy": 0.7368385374546051, "num_tokens": 20639437.0, "step": 2645 }, { "entropy": 1.1080417584627866, "epoch": 0.6935470679643412, "grad_norm": 0.2894703149795532, "learning_rate": 6.134519759225335e-06, "loss": 1.1345, "mean_token_accuracy": 0.7155061598867178, "num_tokens": 20680275.0, "step": 2650 }, { "entropy": 1.0198019824922084, "epoch": 0.6948556473378589, "grad_norm": 0.3462557792663574, "learning_rate": 6.108348599842973e-06, "loss": 1.0143, "mean_token_accuracy": 0.7402982845902443, "num_tokens": 20718417.0, "step": 2655 }, { "entropy": 1.0144409965723753, "epoch": 0.6961642267113765, "grad_norm": 0.3350953757762909, "learning_rate": 6.082177440460613e-06, "loss": 0.9838, "mean_token_accuracy": 0.7343977279961109, "num_tokens": 20762227.0, "step": 2660 }, { "entropy": 1.0715901810675859, "epoch": 0.697472806084894, "grad_norm": 0.6075988411903381, "learning_rate": 6.056006281078253e-06, "loss": 1.0055, "mean_token_accuracy": 0.7305215656757355, "num_tokens": 20798122.0, "step": 2665 }, { "entropy": 1.1007160846143962, "epoch": 0.6987813854584117, "grad_norm": 0.4501047134399414, "learning_rate": 6.029835121695892e-06, "loss": 1.058, "mean_token_accuracy": 0.7208323903381825, "num_tokens": 20834210.0, "step": 2670 }, { "entropy": 1.0522528853267432, "epoch": 0.7000899648319293, "grad_norm": 0.36250752210617065, "learning_rate": 6.00366396231353e-06, "loss": 0.9972, "mean_token_accuracy": 0.7273079171776772, "num_tokens": 20872418.0, "step": 2675 }, { "entropy": 1.054316360130906, "epoch": 0.701398544205447, "grad_norm": 0.3319310247898102, "learning_rate": 5.977492802931171e-06, "loss": 1.0515, "mean_token_accuracy": 0.7271708060055971, "num_tokens": 20911504.0, "step": 2680 }, { "entropy": 1.0553948923945426, "epoch": 0.7027071235789646, "grad_norm": 0.46403029561042786, "learning_rate": 5.9513216435488095e-06, "loss": 1.0058, "mean_token_accuracy": 0.733677176013589, "num_tokens": 20947438.0, "step": 2685 }, { "entropy": 1.135214940458536, "epoch": 0.7040157029524822, "grad_norm": 0.2997356355190277, "learning_rate": 5.925150484166449e-06, "loss": 1.0205, "mean_token_accuracy": 0.7149875804781913, "num_tokens": 20984843.0, "step": 2690 }, { "entropy": 1.067294117808342, "epoch": 0.7053242823259999, "grad_norm": 0.7109698057174683, "learning_rate": 5.898979324784089e-06, "loss": 1.0424, "mean_token_accuracy": 0.7298292949795723, "num_tokens": 21024317.0, "step": 2695 }, { "entropy": 1.0000967428088188, "epoch": 0.7066328616995174, "grad_norm": 0.4540089964866638, "learning_rate": 5.872808165401728e-06, "loss": 1.0007, "mean_token_accuracy": 0.734975878149271, "num_tokens": 21064116.0, "step": 2700 }, { "epoch": 0.7066328616995174, "eval_entropy": 1.033733127117157, "eval_loss": 1.0336334705352783, "eval_mean_token_accuracy": 0.7298680348396301, "eval_num_tokens": 21064116.0, "eval_runtime": 97.2545, "eval_samples_per_second": 10.282, "eval_steps_per_second": 1.285, "step": 2700 }, { "entropy": 1.0902955643832684, "epoch": 0.7079414410730351, "grad_norm": 0.4074270725250244, "learning_rate": 5.846637006019366e-06, "loss": 1.103, "mean_token_accuracy": 0.7196738637983799, "num_tokens": 21106096.0, "step": 2705 }, { "entropy": 0.9893661700189114, "epoch": 0.7092500204465527, "grad_norm": 0.43556877970695496, "learning_rate": 5.8204658466370065e-06, "loss": 0.9469, "mean_token_accuracy": 0.7477334760129452, "num_tokens": 21143151.0, "step": 2710 }, { "entropy": 1.0833146207034587, "epoch": 0.7105585998200703, "grad_norm": 0.6449909806251526, "learning_rate": 5.794294687254646e-06, "loss": 1.0748, "mean_token_accuracy": 0.7204306960105896, "num_tokens": 21179840.0, "step": 2715 }, { "entropy": 1.0693892493844033, "epoch": 0.711867179193588, "grad_norm": 0.2570124864578247, "learning_rate": 5.768123527872285e-06, "loss": 1.0456, "mean_token_accuracy": 0.722801336273551, "num_tokens": 21217194.0, "step": 2720 }, { "entropy": 1.0765843842178584, "epoch": 0.7131757585671056, "grad_norm": 0.5316287279129028, "learning_rate": 5.741952368489925e-06, "loss": 1.0263, "mean_token_accuracy": 0.7246413186192513, "num_tokens": 21251605.0, "step": 2725 }, { "entropy": 1.0620105125010013, "epoch": 0.7144843379406232, "grad_norm": 0.3342624306678772, "learning_rate": 5.715781209107563e-06, "loss": 1.0065, "mean_token_accuracy": 0.7301050588488579, "num_tokens": 21294871.0, "step": 2730 }, { "entropy": 1.0648996368050576, "epoch": 0.7157929173141409, "grad_norm": 0.5223391056060791, "learning_rate": 5.689610049725203e-06, "loss": 1.0258, "mean_token_accuracy": 0.7259559325873852, "num_tokens": 21326670.0, "step": 2735 }, { "entropy": 1.0742478474974633, "epoch": 0.7171014966876584, "grad_norm": 0.5522735118865967, "learning_rate": 5.663438890342843e-06, "loss": 1.0703, "mean_token_accuracy": 0.7225585378706455, "num_tokens": 21367325.0, "step": 2740 }, { "entropy": 0.9710161112248897, "epoch": 0.7184100760611761, "grad_norm": 0.33066603541374207, "learning_rate": 5.637267730960482e-06, "loss": 0.9442, "mean_token_accuracy": 0.7384205959737301, "num_tokens": 21408334.0, "step": 2745 }, { "entropy": 1.0664944287389517, "epoch": 0.7197186554346937, "grad_norm": 0.4889247417449951, "learning_rate": 5.611096571578122e-06, "loss": 1.112, "mean_token_accuracy": 0.7264700572937727, "num_tokens": 21449839.0, "step": 2750 }, { "entropy": 1.0030995801091194, "epoch": 0.7210272348082113, "grad_norm": 0.3732108771800995, "learning_rate": 5.584925412195761e-06, "loss": 0.9111, "mean_token_accuracy": 0.7403148956596851, "num_tokens": 21487742.0, "step": 2755 }, { "entropy": 1.0033697545528413, "epoch": 0.722335814181729, "grad_norm": 0.43490299582481384, "learning_rate": 5.5587542528133995e-06, "loss": 0.9582, "mean_token_accuracy": 0.7397238194942475, "num_tokens": 21526844.0, "step": 2760 }, { "entropy": 1.1253223221749067, "epoch": 0.7236443935552466, "grad_norm": 0.46164193749427795, "learning_rate": 5.5325830934310396e-06, "loss": 1.1347, "mean_token_accuracy": 0.7200637456029654, "num_tokens": 21568539.0, "step": 2765 }, { "entropy": 1.0714545018970967, "epoch": 0.7249529729287643, "grad_norm": 0.39674556255340576, "learning_rate": 5.506411934048679e-06, "loss": 1.0253, "mean_token_accuracy": 0.7276010327041149, "num_tokens": 21603326.0, "step": 2770 }, { "entropy": 1.06046348772943, "epoch": 0.7262615523022818, "grad_norm": 0.4999883770942688, "learning_rate": 5.480240774666318e-06, "loss": 1.0495, "mean_token_accuracy": 0.7262998007237911, "num_tokens": 21642529.0, "step": 2775 }, { "entropy": 1.0288800846785306, "epoch": 0.7275701316757994, "grad_norm": 0.45190781354904175, "learning_rate": 5.454069615283958e-06, "loss": 1.0062, "mean_token_accuracy": 0.7358099550008774, "num_tokens": 21675950.0, "step": 2780 }, { "entropy": 1.064651170000434, "epoch": 0.7288787110493171, "grad_norm": 0.6091992259025574, "learning_rate": 5.4278984559015964e-06, "loss": 1.0302, "mean_token_accuracy": 0.7322472549974919, "num_tokens": 21712389.0, "step": 2785 }, { "entropy": 1.067555221542716, "epoch": 0.7301872904228347, "grad_norm": 0.3587604761123657, "learning_rate": 5.401727296519236e-06, "loss": 1.0269, "mean_token_accuracy": 0.7269019264727831, "num_tokens": 21750233.0, "step": 2790 }, { "entropy": 1.0527180466800927, "epoch": 0.7314958697963523, "grad_norm": 0.6426623463630676, "learning_rate": 5.375556137136876e-06, "loss": 1.045, "mean_token_accuracy": 0.7300838761031627, "num_tokens": 21788770.0, "step": 2795 }, { "entropy": 1.115583229251206, "epoch": 0.73280444916987, "grad_norm": 0.5922237634658813, "learning_rate": 5.349384977754515e-06, "loss": 1.0523, "mean_token_accuracy": 0.7175281938165426, "num_tokens": 21826397.0, "step": 2800 }, { "epoch": 0.73280444916987, "eval_entropy": 1.0408368062973024, "eval_loss": 1.0325729846954346, "eval_mean_token_accuracy": 0.73026052236557, "eval_num_tokens": 21826397.0, "eval_runtime": 97.1897, "eval_samples_per_second": 10.289, "eval_steps_per_second": 1.286, "step": 2800 }, { "entropy": 1.1054367668926717, "epoch": 0.7341130285433876, "grad_norm": 0.5465598106384277, "learning_rate": 5.323213818372154e-06, "loss": 1.08, "mean_token_accuracy": 0.720421589165926, "num_tokens": 21863424.0, "step": 2805 }, { "entropy": 1.1147074215114117, "epoch": 0.7354216079169053, "grad_norm": 0.4170718193054199, "learning_rate": 5.297042658989794e-06, "loss": 1.0508, "mean_token_accuracy": 0.7232054278254509, "num_tokens": 21900462.0, "step": 2810 }, { "entropy": 1.0979057766497136, "epoch": 0.7367301872904228, "grad_norm": 0.4706130027770996, "learning_rate": 5.270871499607433e-06, "loss": 1.0168, "mean_token_accuracy": 0.719353062659502, "num_tokens": 21933354.0, "step": 2815 }, { "entropy": 0.9744918283075095, "epoch": 0.7380387666639404, "grad_norm": 0.5279991626739502, "learning_rate": 5.244700340225073e-06, "loss": 0.9217, "mean_token_accuracy": 0.7506507318466902, "num_tokens": 21970100.0, "step": 2820 }, { "entropy": 1.0728078037500381, "epoch": 0.7393473460374581, "grad_norm": 0.5053285956382751, "learning_rate": 5.218529180842712e-06, "loss": 1.0283, "mean_token_accuracy": 0.7311844080686569, "num_tokens": 22004093.0, "step": 2825 }, { "entropy": 1.0765360169112683, "epoch": 0.7406559254109757, "grad_norm": 0.36004361510276794, "learning_rate": 5.192358021460351e-06, "loss": 1.0092, "mean_token_accuracy": 0.7210248097777366, "num_tokens": 22045336.0, "step": 2830 }, { "entropy": 1.017987198382616, "epoch": 0.7419645047844934, "grad_norm": 0.36846044659614563, "learning_rate": 5.166186862077991e-06, "loss": 0.9839, "mean_token_accuracy": 0.7387872941792011, "num_tokens": 22086534.0, "step": 2835 }, { "entropy": 1.0489673603326082, "epoch": 0.743273084158011, "grad_norm": 0.46179378032684326, "learning_rate": 5.1400157026956295e-06, "loss": 0.9415, "mean_token_accuracy": 0.7353867895901203, "num_tokens": 22121230.0, "step": 2840 }, { "entropy": 1.1243290316313506, "epoch": 0.7445816635315285, "grad_norm": 0.5417360663414001, "learning_rate": 5.113844543313269e-06, "loss": 1.0578, "mean_token_accuracy": 0.7204208463430405, "num_tokens": 22159032.0, "step": 2845 }, { "entropy": 1.0939917907118797, "epoch": 0.7458902429050462, "grad_norm": 0.48897913098335266, "learning_rate": 5.087673383930909e-06, "loss": 1.0063, "mean_token_accuracy": 0.7165301315486431, "num_tokens": 22196955.0, "step": 2850 }, { "entropy": 1.0858067644760012, "epoch": 0.7471988222785638, "grad_norm": 0.5564597845077515, "learning_rate": 5.061502224548548e-06, "loss": 1.0764, "mean_token_accuracy": 0.7208572022616864, "num_tokens": 22234201.0, "step": 2855 }, { "entropy": 1.0590477578341961, "epoch": 0.7485074016520814, "grad_norm": 0.3768922686576843, "learning_rate": 5.035331065166187e-06, "loss": 0.9812, "mean_token_accuracy": 0.7291722763329744, "num_tokens": 22272779.0, "step": 2860 }, { "entropy": 1.1251448437571525, "epoch": 0.7498159810255991, "grad_norm": 0.3560837507247925, "learning_rate": 5.009159905783827e-06, "loss": 1.0892, "mean_token_accuracy": 0.7106486916542053, "num_tokens": 22308851.0, "step": 2865 }, { "entropy": 1.119718360155821, "epoch": 0.7511245603991167, "grad_norm": 0.5380016565322876, "learning_rate": 4.982988746401466e-06, "loss": 1.0983, "mean_token_accuracy": 0.7206918202340603, "num_tokens": 22343884.0, "step": 2870 }, { "entropy": 1.0894725158810616, "epoch": 0.7524331397726344, "grad_norm": 0.3485453426837921, "learning_rate": 4.956817587019106e-06, "loss": 1.0491, "mean_token_accuracy": 0.7222230531275272, "num_tokens": 22384017.0, "step": 2875 }, { "entropy": 1.0093999858945608, "epoch": 0.753741719146152, "grad_norm": 0.4127729535102844, "learning_rate": 4.930646427636745e-06, "loss": 0.9502, "mean_token_accuracy": 0.7418296955525875, "num_tokens": 22418122.0, "step": 2880 }, { "entropy": 1.1371208924800158, "epoch": 0.7550502985196695, "grad_norm": 0.43648311495780945, "learning_rate": 4.904475268254384e-06, "loss": 1.1494, "mean_token_accuracy": 0.7121193654835224, "num_tokens": 22459009.0, "step": 2885 }, { "entropy": 1.0378689534962178, "epoch": 0.7563588778931872, "grad_norm": 0.44997143745422363, "learning_rate": 4.878304108872023e-06, "loss": 0.9618, "mean_token_accuracy": 0.7308846026659012, "num_tokens": 22493139.0, "step": 2890 }, { "entropy": 1.0220247332006693, "epoch": 0.7576674572667048, "grad_norm": 0.561329185962677, "learning_rate": 4.852132949489663e-06, "loss": 0.9283, "mean_token_accuracy": 0.7339823555201292, "num_tokens": 22529225.0, "step": 2895 }, { "entropy": 1.0556524885818361, "epoch": 0.7589760366402225, "grad_norm": 0.3141125738620758, "learning_rate": 4.825961790107302e-06, "loss": 1.0225, "mean_token_accuracy": 0.732053418457508, "num_tokens": 22565967.0, "step": 2900 }, { "epoch": 0.7589760366402225, "eval_entropy": 1.0340708026885985, "eval_loss": 1.0317820310592651, "eval_mean_token_accuracy": 0.7304344477653504, "eval_num_tokens": 22565967.0, "eval_runtime": 97.2034, "eval_samples_per_second": 10.288, "eval_steps_per_second": 1.286, "step": 2900 }, { "entropy": 1.0628287993371486, "epoch": 0.7602846160137401, "grad_norm": 0.5528143048286438, "learning_rate": 4.799790630724941e-06, "loss": 1.0052, "mean_token_accuracy": 0.7236215595155955, "num_tokens": 22605849.0, "step": 2905 }, { "entropy": 1.0544053295627236, "epoch": 0.7615931953872577, "grad_norm": 0.5025759339332581, "learning_rate": 4.773619471342581e-06, "loss": 0.995, "mean_token_accuracy": 0.7306290067732334, "num_tokens": 22647439.0, "step": 2910 }, { "entropy": 1.0855362933129071, "epoch": 0.7629017747607754, "grad_norm": 0.5423142910003662, "learning_rate": 4.74744831196022e-06, "loss": 1.0644, "mean_token_accuracy": 0.729085548222065, "num_tokens": 22686489.0, "step": 2915 }, { "entropy": 1.1064934462308884, "epoch": 0.764210354134293, "grad_norm": 0.324994295835495, "learning_rate": 4.7212771525778596e-06, "loss": 1.0572, "mean_token_accuracy": 0.7155943371355533, "num_tokens": 22729097.0, "step": 2920 }, { "entropy": 1.0514110021293164, "epoch": 0.7655189335078105, "grad_norm": 0.48154741525650024, "learning_rate": 4.695105993195499e-06, "loss": 0.9801, "mean_token_accuracy": 0.728981614857912, "num_tokens": 22767464.0, "step": 2925 }, { "entropy": 1.0362781658768654, "epoch": 0.7668275128813282, "grad_norm": 0.5839325189590454, "learning_rate": 4.668934833813139e-06, "loss": 0.9951, "mean_token_accuracy": 0.7349088959395885, "num_tokens": 22806605.0, "step": 2930 }, { "entropy": 1.0683909840881824, "epoch": 0.7681360922548458, "grad_norm": 0.37387317419052124, "learning_rate": 4.642763674430777e-06, "loss": 1.0933, "mean_token_accuracy": 0.7265824314206839, "num_tokens": 22844597.0, "step": 2935 }, { "entropy": 1.1125381592661143, "epoch": 0.7694446716283635, "grad_norm": 0.6262072324752808, "learning_rate": 4.616592515048417e-06, "loss": 1.0776, "mean_token_accuracy": 0.7169565990567207, "num_tokens": 22883020.0, "step": 2940 }, { "entropy": 1.1716872084885837, "epoch": 0.7707532510018811, "grad_norm": 0.4011521637439728, "learning_rate": 4.5904213556660565e-06, "loss": 1.1654, "mean_token_accuracy": 0.7077478947117924, "num_tokens": 22922580.0, "step": 2945 }, { "entropy": 1.0219284979626537, "epoch": 0.7720618303753987, "grad_norm": 0.6057654619216919, "learning_rate": 4.564250196283696e-06, "loss": 1.0102, "mean_token_accuracy": 0.7341908976435662, "num_tokens": 22961570.0, "step": 2950 }, { "entropy": 1.045608853548765, "epoch": 0.7733704097489164, "grad_norm": 0.5418355464935303, "learning_rate": 4.538079036901335e-06, "loss": 0.9675, "mean_token_accuracy": 0.731929623708129, "num_tokens": 22998884.0, "step": 2955 }, { "entropy": 1.1297316145151854, "epoch": 0.7746789891224339, "grad_norm": 0.469908207654953, "learning_rate": 4.511907877518974e-06, "loss": 1.1027, "mean_token_accuracy": 0.7112006828188896, "num_tokens": 23036097.0, "step": 2960 }, { "entropy": 1.0829615991562604, "epoch": 0.7759875684959516, "grad_norm": 0.42076605558395386, "learning_rate": 4.485736718136614e-06, "loss": 0.9541, "mean_token_accuracy": 0.7311103120446205, "num_tokens": 23071198.0, "step": 2965 }, { "entropy": 1.0218299932777881, "epoch": 0.7772961478694692, "grad_norm": 0.6354956030845642, "learning_rate": 4.459565558754253e-06, "loss": 0.94, "mean_token_accuracy": 0.7375240176916122, "num_tokens": 23105690.0, "step": 2970 }, { "entropy": 1.0296128673478961, "epoch": 0.7786047272429868, "grad_norm": 0.5609083771705627, "learning_rate": 4.433394399371893e-06, "loss": 0.9641, "mean_token_accuracy": 0.7355793751776218, "num_tokens": 23149341.0, "step": 2975 }, { "entropy": 1.0164244651794434, "epoch": 0.7799133066165045, "grad_norm": 0.4903509318828583, "learning_rate": 4.407223239989532e-06, "loss": 0.9678, "mean_token_accuracy": 0.7360415138304234, "num_tokens": 23184633.0, "step": 2980 }, { "entropy": 1.0966936625540256, "epoch": 0.7812218859900221, "grad_norm": 0.39143356680870056, "learning_rate": 4.381052080607171e-06, "loss": 1.0806, "mean_token_accuracy": 0.7219570934772491, "num_tokens": 23221750.0, "step": 2985 }, { "entropy": 1.042829731106758, "epoch": 0.7825304653635398, "grad_norm": 0.4407028257846832, "learning_rate": 4.35488092122481e-06, "loss": 1.0816, "mean_token_accuracy": 0.7296155303716659, "num_tokens": 23263629.0, "step": 2990 }, { "entropy": 1.067452398315072, "epoch": 0.7838390447370573, "grad_norm": 0.4685254991054535, "learning_rate": 4.32870976184245e-06, "loss": 1.0394, "mean_token_accuracy": 0.7196490127593279, "num_tokens": 23299548.0, "step": 2995 }, { "entropy": 1.0206131491810084, "epoch": 0.7851476241105749, "grad_norm": 0.4271470010280609, "learning_rate": 4.30253860246009e-06, "loss": 0.9486, "mean_token_accuracy": 0.7346534051001072, "num_tokens": 23340920.0, "step": 3000 }, { "epoch": 0.7851476241105749, "eval_entropy": 1.0305686144828796, "eval_loss": 1.03084135055542, "eval_mean_token_accuracy": 0.7306465420722962, "eval_num_tokens": 23340920.0, "eval_runtime": 97.1158, "eval_samples_per_second": 10.297, "eval_steps_per_second": 1.287, "step": 3000 }, { "entropy": 1.0277832087129355, "epoch": 0.7864562034840926, "grad_norm": 0.44500380754470825, "learning_rate": 4.276367443077729e-06, "loss": 0.987, "mean_token_accuracy": 0.7308167792856693, "num_tokens": 23381557.0, "step": 3005 }, { "entropy": 1.086503054201603, "epoch": 0.7877647828576102, "grad_norm": 0.3013031780719757, "learning_rate": 4.250196283695368e-06, "loss": 1.1165, "mean_token_accuracy": 0.721335730701685, "num_tokens": 23422282.0, "step": 3010 }, { "entropy": 1.1198953952640296, "epoch": 0.7890733622311278, "grad_norm": 0.5127637982368469, "learning_rate": 4.224025124313007e-06, "loss": 1.0667, "mean_token_accuracy": 0.716464214771986, "num_tokens": 23462818.0, "step": 3015 }, { "entropy": 1.0935983188450336, "epoch": 0.7903819416046455, "grad_norm": 0.7555818557739258, "learning_rate": 4.1978539649306465e-06, "loss": 1.0789, "mean_token_accuracy": 0.7195366825908422, "num_tokens": 23500331.0, "step": 3020 }, { "entropy": 1.092792271077633, "epoch": 0.7916905209781631, "grad_norm": 0.37865763902664185, "learning_rate": 4.171682805548286e-06, "loss": 1.0283, "mean_token_accuracy": 0.7241503912955523, "num_tokens": 23537976.0, "step": 3025 }, { "entropy": 1.0759663056582212, "epoch": 0.7929991003516808, "grad_norm": 0.5228607058525085, "learning_rate": 4.145511646165926e-06, "loss": 1.0736, "mean_token_accuracy": 0.7280153460800648, "num_tokens": 23581747.0, "step": 3030 }, { "entropy": 1.1236386895179749, "epoch": 0.7943076797251983, "grad_norm": 0.6752633452415466, "learning_rate": 4.119340486783565e-06, "loss": 1.0768, "mean_token_accuracy": 0.7147096544504166, "num_tokens": 23615855.0, "step": 3035 }, { "entropy": 1.1543139606714248, "epoch": 0.7956162590987159, "grad_norm": 0.402425616979599, "learning_rate": 4.093169327401204e-06, "loss": 1.1301, "mean_token_accuracy": 0.7124893587082625, "num_tokens": 23653681.0, "step": 3040 }, { "entropy": 1.1155839897692204, "epoch": 0.7969248384722336, "grad_norm": 0.3531131148338318, "learning_rate": 4.066998168018843e-06, "loss": 1.1011, "mean_token_accuracy": 0.7147599868476391, "num_tokens": 23691707.0, "step": 3045 }, { "entropy": 1.0141445618122815, "epoch": 0.7982334178457512, "grad_norm": 0.8314666152000427, "learning_rate": 4.0408270086364835e-06, "loss": 0.9637, "mean_token_accuracy": 0.7401646625250577, "num_tokens": 23731289.0, "step": 3050 }, { "entropy": 1.1060345830395817, "epoch": 0.7995419972192689, "grad_norm": 0.39150726795196533, "learning_rate": 4.014655849254122e-06, "loss": 1.1086, "mean_token_accuracy": 0.7131407152861357, "num_tokens": 23769110.0, "step": 3055 }, { "entropy": 1.1247680716216564, "epoch": 0.8008505765927865, "grad_norm": 0.393403023481369, "learning_rate": 3.988484689871762e-06, "loss": 1.0878, "mean_token_accuracy": 0.7203011158853769, "num_tokens": 23805247.0, "step": 3060 }, { "entropy": 1.0936600599437951, "epoch": 0.802159155966304, "grad_norm": 0.40403568744659424, "learning_rate": 3.962313530489401e-06, "loss": 1.0812, "mean_token_accuracy": 0.7204918511211872, "num_tokens": 23846787.0, "step": 3065 }, { "entropy": 1.1725836582481861, "epoch": 0.8034677353398217, "grad_norm": 0.4385111629962921, "learning_rate": 3.93614237110704e-06, "loss": 1.1878, "mean_token_accuracy": 0.7098725575953722, "num_tokens": 23884749.0, "step": 3070 }, { "entropy": 1.056993416696787, "epoch": 0.8047763147133393, "grad_norm": 0.4194464087486267, "learning_rate": 3.90997121172468e-06, "loss": 1.0316, "mean_token_accuracy": 0.7291632607579231, "num_tokens": 23925854.0, "step": 3075 }, { "entropy": 1.122009427472949, "epoch": 0.8060848940868569, "grad_norm": 0.34683725237846375, "learning_rate": 3.883800052342319e-06, "loss": 1.0402, "mean_token_accuracy": 0.7219870649278164, "num_tokens": 23965584.0, "step": 3080 }, { "entropy": 0.982495572976768, "epoch": 0.8073934734603746, "grad_norm": 0.7075260281562805, "learning_rate": 3.857628892959959e-06, "loss": 0.9437, "mean_token_accuracy": 0.750491987913847, "num_tokens": 24004600.0, "step": 3085 }, { "entropy": 1.0386792987585067, "epoch": 0.8087020528338922, "grad_norm": 0.3854619562625885, "learning_rate": 3.831457733577597e-06, "loss": 0.9687, "mean_token_accuracy": 0.7319545924663544, "num_tokens": 24039464.0, "step": 3090 }, { "entropy": 1.0844634655863046, "epoch": 0.8100106322074099, "grad_norm": 0.47892338037490845, "learning_rate": 3.8052865741952373e-06, "loss": 1.0465, "mean_token_accuracy": 0.7266070485115051, "num_tokens": 24076409.0, "step": 3095 }, { "entropy": 1.0820760104805232, "epoch": 0.8113192115809275, "grad_norm": 0.43651270866394043, "learning_rate": 3.7791154148128765e-06, "loss": 1.0007, "mean_token_accuracy": 0.7236479226499796, "num_tokens": 24118481.0, "step": 3100 }, { "epoch": 0.8113192115809275, "eval_entropy": 1.036030520915985, "eval_loss": 1.028921365737915, "eval_mean_token_accuracy": 0.7310774421691895, "eval_num_tokens": 24118481.0, "eval_runtime": 97.2385, "eval_samples_per_second": 10.284, "eval_steps_per_second": 1.285, "step": 3100 }, { "entropy": 1.1077489994466305, "epoch": 0.812627790954445, "grad_norm": 0.42022934556007385, "learning_rate": 3.7529442554305157e-06, "loss": 1.0661, "mean_token_accuracy": 0.7255024470388889, "num_tokens": 24158004.0, "step": 3105 }, { "entropy": 1.0772877372801304, "epoch": 0.8139363703279627, "grad_norm": 0.3519681990146637, "learning_rate": 3.7267730960481554e-06, "loss": 1.0428, "mean_token_accuracy": 0.7230064436793328, "num_tokens": 24194320.0, "step": 3110 }, { "entropy": 1.0549692034721374, "epoch": 0.8152449497014803, "grad_norm": 0.5634621381759644, "learning_rate": 3.7006019366657946e-06, "loss": 1.0893, "mean_token_accuracy": 0.7291611786931753, "num_tokens": 24233260.0, "step": 3115 }, { "entropy": 0.9838933110237121, "epoch": 0.816553529074998, "grad_norm": 0.3864845037460327, "learning_rate": 3.6744307772834342e-06, "loss": 0.9414, "mean_token_accuracy": 0.7454824268817901, "num_tokens": 24270814.0, "step": 3120 }, { "entropy": 1.0704244837164878, "epoch": 0.8178621084485156, "grad_norm": 0.5081256628036499, "learning_rate": 3.648259617901073e-06, "loss": 1.0257, "mean_token_accuracy": 0.723455361276865, "num_tokens": 24309235.0, "step": 3125 }, { "entropy": 1.022134768590331, "epoch": 0.8191706878220332, "grad_norm": 0.42314413189888, "learning_rate": 3.6220884585187127e-06, "loss": 0.9948, "mean_token_accuracy": 0.7342596560716629, "num_tokens": 24348724.0, "step": 3130 }, { "entropy": 1.0905879106372596, "epoch": 0.8204792671955509, "grad_norm": 0.30757448077201843, "learning_rate": 3.5959172991363523e-06, "loss": 1.0555, "mean_token_accuracy": 0.724842281267047, "num_tokens": 24388628.0, "step": 3135 }, { "entropy": 1.0407748512923718, "epoch": 0.8217878465690684, "grad_norm": 0.7021998167037964, "learning_rate": 3.569746139753991e-06, "loss": 0.9854, "mean_token_accuracy": 0.7305317636579275, "num_tokens": 24427341.0, "step": 3140 }, { "entropy": 1.1128646079450846, "epoch": 0.823096425942586, "grad_norm": 0.3260432779788971, "learning_rate": 3.5435749803716308e-06, "loss": 1.0951, "mean_token_accuracy": 0.712972067296505, "num_tokens": 24462200.0, "step": 3145 }, { "entropy": 1.1428508426994086, "epoch": 0.8244050053161037, "grad_norm": 0.37607815861701965, "learning_rate": 3.5174038209892704e-06, "loss": 1.1074, "mean_token_accuracy": 0.7139423068612814, "num_tokens": 24503955.0, "step": 3150 }, { "entropy": 1.0314291812479497, "epoch": 0.8257135846896213, "grad_norm": 0.5070587992668152, "learning_rate": 3.491232661606909e-06, "loss": 1.0294, "mean_token_accuracy": 0.7355128142982721, "num_tokens": 24543206.0, "step": 3155 }, { "entropy": 1.1036609571427107, "epoch": 0.827022164063139, "grad_norm": 0.41178247332572937, "learning_rate": 3.465061502224549e-06, "loss": 1.0937, "mean_token_accuracy": 0.7211760215461254, "num_tokens": 24579806.0, "step": 3160 }, { "entropy": 1.1116086110472678, "epoch": 0.8283307434366566, "grad_norm": 0.5788869857788086, "learning_rate": 3.438890342842188e-06, "loss": 1.0604, "mean_token_accuracy": 0.721147482842207, "num_tokens": 24616984.0, "step": 3165 }, { "entropy": 1.1319230053573848, "epoch": 0.8296393228101742, "grad_norm": 0.6010908484458923, "learning_rate": 3.4127191834598277e-06, "loss": 1.0807, "mean_token_accuracy": 0.7168502196669578, "num_tokens": 24654055.0, "step": 3170 }, { "entropy": 1.1207389827817678, "epoch": 0.8309479021836919, "grad_norm": 0.5787162780761719, "learning_rate": 3.386548024077467e-06, "loss": 1.0741, "mean_token_accuracy": 0.7188828401267529, "num_tokens": 24696566.0, "step": 3175 }, { "entropy": 1.0550225611776114, "epoch": 0.8322564815572094, "grad_norm": 0.4683392643928528, "learning_rate": 3.360376864695106e-06, "loss": 1.0378, "mean_token_accuracy": 0.7269921116530895, "num_tokens": 24735302.0, "step": 3180 }, { "entropy": 1.064477189257741, "epoch": 0.8335650609307271, "grad_norm": 0.43787601590156555, "learning_rate": 3.3342057053127458e-06, "loss": 1.029, "mean_token_accuracy": 0.7256352994590998, "num_tokens": 24777394.0, "step": 3185 }, { "entropy": 0.9952436108142138, "epoch": 0.8348736403042447, "grad_norm": 0.619921863079071, "learning_rate": 3.3080345459303846e-06, "loss": 0.9214, "mean_token_accuracy": 0.7489484068006277, "num_tokens": 24820138.0, "step": 3190 }, { "entropy": 1.0670820290222764, "epoch": 0.8361822196777623, "grad_norm": 0.4853121340274811, "learning_rate": 3.281863386548024e-06, "loss": 1.0733, "mean_token_accuracy": 0.7309010468423367, "num_tokens": 24857394.0, "step": 3195 }, { "entropy": 1.0793268527835607, "epoch": 0.83749079905128, "grad_norm": 0.3238151967525482, "learning_rate": 3.255692227165664e-06, "loss": 1.0963, "mean_token_accuracy": 0.7233440794050694, "num_tokens": 24894819.0, "step": 3200 }, { "epoch": 0.83749079905128, "eval_entropy": 1.0310797443389892, "eval_loss": 1.0281308889389038, "eval_mean_token_accuracy": 0.7311860795021057, "eval_num_tokens": 24894819.0, "eval_runtime": 97.2362, "eval_samples_per_second": 10.284, "eval_steps_per_second": 1.286, "step": 3200 }, { "entropy": 1.0457366950809956, "epoch": 0.8387993784247976, "grad_norm": 0.44579482078552246, "learning_rate": 3.2295210677833035e-06, "loss": 0.995, "mean_token_accuracy": 0.7295556962490082, "num_tokens": 24934467.0, "step": 3205 }, { "entropy": 1.0695260234177113, "epoch": 0.8401079577983152, "grad_norm": 0.4931753873825073, "learning_rate": 3.2033499084009423e-06, "loss": 1.0579, "mean_token_accuracy": 0.7268257789313793, "num_tokens": 24973181.0, "step": 3210 }, { "entropy": 1.0737264335155488, "epoch": 0.8414165371718328, "grad_norm": 0.542246401309967, "learning_rate": 3.177178749018582e-06, "loss": 1.044, "mean_token_accuracy": 0.7273680947721004, "num_tokens": 25008234.0, "step": 3215 }, { "entropy": 1.0925125159323215, "epoch": 0.8427251165453504, "grad_norm": 0.7821519374847412, "learning_rate": 3.151007589636221e-06, "loss": 1.0926, "mean_token_accuracy": 0.7236344940960407, "num_tokens": 25042804.0, "step": 3220 }, { "entropy": 1.0435322053730487, "epoch": 0.8440336959188681, "grad_norm": 0.5571607351303101, "learning_rate": 3.1248364302538604e-06, "loss": 1.0314, "mean_token_accuracy": 0.7397142663598061, "num_tokens": 25081713.0, "step": 3225 }, { "entropy": 1.0707201555371284, "epoch": 0.8453422752923857, "grad_norm": 0.46003878116607666, "learning_rate": 3.0986652708715e-06, "loss": 1.0396, "mean_token_accuracy": 0.7285445638000965, "num_tokens": 25115761.0, "step": 3230 }, { "entropy": 1.0421096246689558, "epoch": 0.8466508546659033, "grad_norm": 0.5213029980659485, "learning_rate": 3.0724941114891392e-06, "loss": 0.9743, "mean_token_accuracy": 0.729071582481265, "num_tokens": 25158298.0, "step": 3235 }, { "entropy": 0.9816816847771406, "epoch": 0.847959434039421, "grad_norm": 0.2949255704879761, "learning_rate": 3.0463229521067784e-06, "loss": 0.8791, "mean_token_accuracy": 0.7419180549681187, "num_tokens": 25195031.0, "step": 3240 }, { "entropy": 1.1069608356803655, "epoch": 0.8492680134129386, "grad_norm": 0.4634745121002197, "learning_rate": 3.0201517927244177e-06, "loss": 1.0572, "mean_token_accuracy": 0.7191666610538959, "num_tokens": 25233843.0, "step": 3245 }, { "entropy": 1.0450120318681002, "epoch": 0.8505765927864563, "grad_norm": 0.41588935256004333, "learning_rate": 2.9939806333420573e-06, "loss": 1.004, "mean_token_accuracy": 0.7298405677080154, "num_tokens": 25277199.0, "step": 3250 }, { "entropy": 1.0192270964384078, "epoch": 0.8518851721599738, "grad_norm": 0.3328169584274292, "learning_rate": 2.967809473959697e-06, "loss": 1.0314, "mean_token_accuracy": 0.7342530839145184, "num_tokens": 25313963.0, "step": 3255 }, { "entropy": 1.068826200440526, "epoch": 0.8531937515334914, "grad_norm": 0.4146538972854614, "learning_rate": 2.9416383145773357e-06, "loss": 1.0426, "mean_token_accuracy": 0.7297728694975376, "num_tokens": 25354460.0, "step": 3260 }, { "entropy": 1.0722479410469532, "epoch": 0.8545023309070091, "grad_norm": 0.41407662630081177, "learning_rate": 2.9154671551949754e-06, "loss": 1.038, "mean_token_accuracy": 0.7312197633087635, "num_tokens": 25391042.0, "step": 3265 }, { "entropy": 1.0916271772235633, "epoch": 0.8558109102805267, "grad_norm": 0.36934009194374084, "learning_rate": 2.889295995812615e-06, "loss": 1.1389, "mean_token_accuracy": 0.7267607122659683, "num_tokens": 25435161.0, "step": 3270 }, { "entropy": 1.051083530113101, "epoch": 0.8571194896540443, "grad_norm": 0.35780689120292664, "learning_rate": 2.863124836430254e-06, "loss": 1.0192, "mean_token_accuracy": 0.7258496068418026, "num_tokens": 25479291.0, "step": 3275 }, { "entropy": 1.0690735118463635, "epoch": 0.858428069027562, "grad_norm": 0.3563995659351349, "learning_rate": 2.8369536770478935e-06, "loss": 1.0232, "mean_token_accuracy": 0.7209200043231249, "num_tokens": 25518588.0, "step": 3280 }, { "entropy": 1.0599579058587552, "epoch": 0.8597366484010796, "grad_norm": 0.50145423412323, "learning_rate": 2.8107825176655327e-06, "loss": 1.0418, "mean_token_accuracy": 0.7253329046070576, "num_tokens": 25555559.0, "step": 3285 }, { "entropy": 1.0637880939990283, "epoch": 0.8610452277745972, "grad_norm": 0.32374799251556396, "learning_rate": 2.7846113582831723e-06, "loss": 1.0589, "mean_token_accuracy": 0.729487194865942, "num_tokens": 25599141.0, "step": 3290 }, { "entropy": 1.121987307816744, "epoch": 0.8623538071481148, "grad_norm": 0.7996993064880371, "learning_rate": 2.7584401989008115e-06, "loss": 1.1568, "mean_token_accuracy": 0.71376275382936, "num_tokens": 25635193.0, "step": 3295 }, { "entropy": 1.075880871899426, "epoch": 0.8636623865216324, "grad_norm": 0.3166949450969696, "learning_rate": 2.7322690395184508e-06, "loss": 1.0328, "mean_token_accuracy": 0.7281475432217122, "num_tokens": 25671967.0, "step": 3300 }, { "epoch": 0.8636623865216324, "eval_entropy": 1.0315785236358643, "eval_loss": 1.0274593830108643, "eval_mean_token_accuracy": 0.7313339967727661, "eval_num_tokens": 25671967.0, "eval_runtime": 97.2667, "eval_samples_per_second": 10.281, "eval_steps_per_second": 1.285, "step": 3300 }, { "entropy": 1.0423275344073772, "epoch": 0.8649709658951501, "grad_norm": 0.5029602646827698, "learning_rate": 2.7060978801360904e-06, "loss": 1.0144, "mean_token_accuracy": 0.7371830597519875, "num_tokens": 25714593.0, "step": 3305 }, { "entropy": 1.019960217177868, "epoch": 0.8662795452686677, "grad_norm": 0.3714219629764557, "learning_rate": 2.679926720753729e-06, "loss": 0.9591, "mean_token_accuracy": 0.7337398871779441, "num_tokens": 25753622.0, "step": 3310 }, { "entropy": 1.0438139468431473, "epoch": 0.8675881246421854, "grad_norm": 0.49829211831092834, "learning_rate": 2.653755561371369e-06, "loss": 1.0152, "mean_token_accuracy": 0.7405216414481401, "num_tokens": 25789764.0, "step": 3315 }, { "entropy": 1.0796325359493495, "epoch": 0.868896704015703, "grad_norm": 0.9016978144645691, "learning_rate": 2.6275844019890085e-06, "loss": 1.0369, "mean_token_accuracy": 0.7271438419818879, "num_tokens": 25827139.0, "step": 3320 }, { "entropy": 1.1373902840539813, "epoch": 0.8702052833892205, "grad_norm": 0.5277310609817505, "learning_rate": 2.601413242606648e-06, "loss": 1.1125, "mean_token_accuracy": 0.7127894099801779, "num_tokens": 25866714.0, "step": 3325 }, { "entropy": 1.0553706608712674, "epoch": 0.8715138627627382, "grad_norm": 0.39588603377342224, "learning_rate": 2.575242083224287e-06, "loss": 1.0219, "mean_token_accuracy": 0.7270961362868548, "num_tokens": 25909686.0, "step": 3330 }, { "entropy": 1.065532573312521, "epoch": 0.8728224421362558, "grad_norm": 0.5084222555160522, "learning_rate": 2.5490709238419266e-06, "loss": 1.0483, "mean_token_accuracy": 0.7294468872249127, "num_tokens": 25950054.0, "step": 3335 }, { "entropy": 1.0791085483506322, "epoch": 0.8741310215097734, "grad_norm": 0.5307091474533081, "learning_rate": 2.5228997644595658e-06, "loss": 1.0317, "mean_token_accuracy": 0.7308267526328563, "num_tokens": 25988785.0, "step": 3340 }, { "entropy": 1.0206924524158238, "epoch": 0.8754396008832911, "grad_norm": 0.44742336869239807, "learning_rate": 2.4967286050772054e-06, "loss": 0.9997, "mean_token_accuracy": 0.7365788772702218, "num_tokens": 26031548.0, "step": 3345 }, { "entropy": 1.072092866897583, "epoch": 0.8767481802568087, "grad_norm": 0.4204552471637726, "learning_rate": 2.4705574456948446e-06, "loss": 1.0516, "mean_token_accuracy": 0.7274888075888157, "num_tokens": 26073312.0, "step": 3350 }, { "entropy": 1.0556320391595364, "epoch": 0.8780567596303264, "grad_norm": 0.4932871162891388, "learning_rate": 2.444386286312484e-06, "loss": 1.0326, "mean_token_accuracy": 0.7380503460764885, "num_tokens": 26114431.0, "step": 3355 }, { "entropy": 1.091795524954796, "epoch": 0.879365339003844, "grad_norm": 0.561880350112915, "learning_rate": 2.418215126930123e-06, "loss": 1.0774, "mean_token_accuracy": 0.7251040559262037, "num_tokens": 26153886.0, "step": 3360 }, { "entropy": 1.07560763284564, "epoch": 0.8806739183773615, "grad_norm": 0.48657548427581787, "learning_rate": 2.3920439675477623e-06, "loss": 1.0658, "mean_token_accuracy": 0.7262101206928492, "num_tokens": 26186389.0, "step": 3365 }, { "entropy": 1.0190218634903432, "epoch": 0.8819824977508792, "grad_norm": 0.7028672099113464, "learning_rate": 2.365872808165402e-06, "loss": 0.9896, "mean_token_accuracy": 0.7349074624478817, "num_tokens": 26216742.0, "step": 3370 }, { "entropy": 1.0444237791001796, "epoch": 0.8832910771243968, "grad_norm": 0.3169384300708771, "learning_rate": 2.339701648783041e-06, "loss": 0.9722, "mean_token_accuracy": 0.7366019688546658, "num_tokens": 26252709.0, "step": 3375 }, { "entropy": 1.0996058486402034, "epoch": 0.8845996564979145, "grad_norm": 0.45103803277015686, "learning_rate": 2.313530489400681e-06, "loss": 1.0965, "mean_token_accuracy": 0.7159675225615502, "num_tokens": 26290405.0, "step": 3380 }, { "entropy": 1.120962193235755, "epoch": 0.8859082358714321, "grad_norm": 0.4903068244457245, "learning_rate": 2.28735933001832e-06, "loss": 1.133, "mean_token_accuracy": 0.7219177789986133, "num_tokens": 26328036.0, "step": 3385 }, { "entropy": 1.015447051078081, "epoch": 0.8872168152449497, "grad_norm": 0.5176340341567993, "learning_rate": 2.2611881706359592e-06, "loss": 1.0307, "mean_token_accuracy": 0.7406690739095211, "num_tokens": 26370299.0, "step": 3390 }, { "entropy": 1.0935089591890574, "epoch": 0.8885253946184674, "grad_norm": 0.29109635949134827, "learning_rate": 2.235017011253599e-06, "loss": 1.0479, "mean_token_accuracy": 0.7220962606370449, "num_tokens": 26410839.0, "step": 3395 }, { "entropy": 1.073088315874338, "epoch": 0.8898339739919849, "grad_norm": 0.41668152809143066, "learning_rate": 2.208845851871238e-06, "loss": 1.0365, "mean_token_accuracy": 0.7313703790307045, "num_tokens": 26451902.0, "step": 3400 }, { "epoch": 0.8898339739919849, "eval_entropy": 1.0298821225166321, "eval_loss": 1.0266761779785156, "eval_mean_token_accuracy": 0.7314435024261474, "eval_num_tokens": 26451902.0, "eval_runtime": 97.2178, "eval_samples_per_second": 10.286, "eval_steps_per_second": 1.286, "step": 3400 }, { "entropy": 1.1048546012490987, "epoch": 0.8911425533655025, "grad_norm": 0.33564722537994385, "learning_rate": 2.1826746924888777e-06, "loss": 1.1361, "mean_token_accuracy": 0.7248132299631834, "num_tokens": 26494266.0, "step": 3405 }, { "entropy": 1.0767640516161918, "epoch": 0.8924511327390202, "grad_norm": 0.34754645824432373, "learning_rate": 2.156503533106517e-06, "loss": 1.0538, "mean_token_accuracy": 0.7239204317331314, "num_tokens": 26538961.0, "step": 3410 }, { "entropy": 1.1368263632059097, "epoch": 0.8937597121125378, "grad_norm": 0.42159193754196167, "learning_rate": 2.130332373724156e-06, "loss": 1.0626, "mean_token_accuracy": 0.7145794078707695, "num_tokens": 26576209.0, "step": 3415 }, { "entropy": 1.1049470301717519, "epoch": 0.8950682914860555, "grad_norm": 0.41963493824005127, "learning_rate": 2.1041612143417954e-06, "loss": 1.078, "mean_token_accuracy": 0.7214818011969328, "num_tokens": 26615242.0, "step": 3420 }, { "entropy": 1.0639567028731107, "epoch": 0.8963768708595731, "grad_norm": 0.27538204193115234, "learning_rate": 2.0779900549594346e-06, "loss": 1.0215, "mean_token_accuracy": 0.7226650364696979, "num_tokens": 26655742.0, "step": 3425 }, { "entropy": 1.041541761904955, "epoch": 0.8976854502330907, "grad_norm": 0.3388216197490692, "learning_rate": 2.0518188955770743e-06, "loss": 1.0363, "mean_token_accuracy": 0.7356565605849028, "num_tokens": 26693766.0, "step": 3430 }, { "entropy": 1.0412372374907135, "epoch": 0.8989940296066083, "grad_norm": 0.3273700773715973, "learning_rate": 2.0256477361947135e-06, "loss": 0.9961, "mean_token_accuracy": 0.7328124668449163, "num_tokens": 26731172.0, "step": 3435 }, { "entropy": 1.0402305848896503, "epoch": 0.9003026089801259, "grad_norm": 0.5662808418273926, "learning_rate": 1.999476576812353e-06, "loss": 1.0287, "mean_token_accuracy": 0.7347014851868152, "num_tokens": 26770030.0, "step": 3440 }, { "entropy": 1.0628270111978053, "epoch": 0.9016111883536436, "grad_norm": 0.39010319113731384, "learning_rate": 1.9733054174299923e-06, "loss": 0.9835, "mean_token_accuracy": 0.7256053328514099, "num_tokens": 26811239.0, "step": 3445 }, { "entropy": 1.0397684559226037, "epoch": 0.9029197677271612, "grad_norm": 0.5970498323440552, "learning_rate": 1.9471342580476316e-06, "loss": 1.0163, "mean_token_accuracy": 0.7326931424438954, "num_tokens": 26850997.0, "step": 3450 }, { "entropy": 1.0377406038343906, "epoch": 0.9042283471006788, "grad_norm": 0.45579779148101807, "learning_rate": 1.920963098665271e-06, "loss": 1.0484, "mean_token_accuracy": 0.7341817907989026, "num_tokens": 26889505.0, "step": 3455 }, { "entropy": 1.020282133296132, "epoch": 0.9055369264741965, "grad_norm": 0.40827953815460205, "learning_rate": 1.8947919392829104e-06, "loss": 1.0153, "mean_token_accuracy": 0.737718590721488, "num_tokens": 26927519.0, "step": 3460 }, { "entropy": 1.053854411840439, "epoch": 0.9068455058477141, "grad_norm": 0.45387595891952515, "learning_rate": 1.8686207799005498e-06, "loss": 0.9982, "mean_token_accuracy": 0.7261220943182707, "num_tokens": 26966277.0, "step": 3465 }, { "entropy": 1.017658719792962, "epoch": 0.9081540852212318, "grad_norm": 0.36533382534980774, "learning_rate": 1.842449620518189e-06, "loss": 0.9675, "mean_token_accuracy": 0.7362812142819166, "num_tokens": 27002691.0, "step": 3470 }, { "entropy": 1.053343753516674, "epoch": 0.9094626645947493, "grad_norm": 0.3158053755760193, "learning_rate": 1.8162784611358283e-06, "loss": 1.0161, "mean_token_accuracy": 0.7329125508666039, "num_tokens": 27040733.0, "step": 3475 }, { "entropy": 0.9715707473456859, "epoch": 0.9107712439682669, "grad_norm": 0.30097201466560364, "learning_rate": 1.790107301753468e-06, "loss": 0.9108, "mean_token_accuracy": 0.741458211094141, "num_tokens": 27083118.0, "step": 3480 }, { "entropy": 1.1143366053700448, "epoch": 0.9120798233417846, "grad_norm": 0.5649247765541077, "learning_rate": 1.7639361423711071e-06, "loss": 1.0844, "mean_token_accuracy": 0.7136627584695816, "num_tokens": 27118214.0, "step": 3485 }, { "entropy": 1.053029465302825, "epoch": 0.9133884027153022, "grad_norm": 0.41902416944503784, "learning_rate": 1.7377649829887466e-06, "loss": 0.9994, "mean_token_accuracy": 0.7316753648221492, "num_tokens": 27154246.0, "step": 3490 }, { "entropy": 1.0596496284008026, "epoch": 0.9146969820888198, "grad_norm": 0.4515739977359772, "learning_rate": 1.7115938236063858e-06, "loss": 1.0266, "mean_token_accuracy": 0.7284657262265682, "num_tokens": 27190566.0, "step": 3495 }, { "entropy": 1.1401704950258136, "epoch": 0.9160055614623375, "grad_norm": 0.5285766124725342, "learning_rate": 1.6854226642240254e-06, "loss": 1.1079, "mean_token_accuracy": 0.7140452340245247, "num_tokens": 27231234.0, "step": 3500 }, { "epoch": 0.9160055614623375, "eval_entropy": 1.0293893675804138, "eval_loss": 1.0261290073394775, "eval_mean_token_accuracy": 0.7316831665039063, "eval_num_tokens": 27231234.0, "eval_runtime": 97.2439, "eval_samples_per_second": 10.283, "eval_steps_per_second": 1.285, "step": 3500 }, { "entropy": 1.060255604609847, "epoch": 0.917314140835855, "grad_norm": 0.5222223401069641, "learning_rate": 1.6592515048416647e-06, "loss": 1.0422, "mean_token_accuracy": 0.7291194565594197, "num_tokens": 27274088.0, "step": 3505 }, { "entropy": 1.0341279707849025, "epoch": 0.9186227202093727, "grad_norm": 0.46550068259239197, "learning_rate": 1.6330803454593039e-06, "loss": 1.0274, "mean_token_accuracy": 0.736503117159009, "num_tokens": 27314182.0, "step": 3510 }, { "entropy": 1.0762779969722032, "epoch": 0.9199312995828903, "grad_norm": 0.4758652150630951, "learning_rate": 1.6069091860769433e-06, "loss": 1.0411, "mean_token_accuracy": 0.7197697997093201, "num_tokens": 27353165.0, "step": 3515 }, { "entropy": 1.0701686941087245, "epoch": 0.9212398789564079, "grad_norm": 0.49974560737609863, "learning_rate": 1.5807380266945827e-06, "loss": 1.0625, "mean_token_accuracy": 0.7291940327733755, "num_tokens": 27387526.0, "step": 3520 }, { "entropy": 1.083727527782321, "epoch": 0.9225484583299256, "grad_norm": 0.5067974328994751, "learning_rate": 1.5545668673122222e-06, "loss": 1.0838, "mean_token_accuracy": 0.719190240651369, "num_tokens": 27424821.0, "step": 3525 }, { "entropy": 1.0643510822206736, "epoch": 0.9238570377034432, "grad_norm": 0.6037015914916992, "learning_rate": 1.5283957079298614e-06, "loss": 1.08, "mean_token_accuracy": 0.7282540425658226, "num_tokens": 27462609.0, "step": 3530 }, { "entropy": 1.0500620037317276, "epoch": 0.9251656170769609, "grad_norm": 0.6834056377410889, "learning_rate": 1.5022245485475006e-06, "loss": 1.0026, "mean_token_accuracy": 0.7320256743580102, "num_tokens": 27500178.0, "step": 3535 }, { "entropy": 1.0600904449820518, "epoch": 0.9264741964504785, "grad_norm": 0.5821903347969055, "learning_rate": 1.4760533891651402e-06, "loss": 1.0701, "mean_token_accuracy": 0.7213537424802781, "num_tokens": 27539087.0, "step": 3540 }, { "entropy": 1.057712410390377, "epoch": 0.927782775823996, "grad_norm": 0.28830403089523315, "learning_rate": 1.4498822297827795e-06, "loss": 0.9984, "mean_token_accuracy": 0.7261978283524513, "num_tokens": 27575481.0, "step": 3545 }, { "entropy": 1.0745814580470323, "epoch": 0.9290913551975137, "grad_norm": 0.5320100784301758, "learning_rate": 1.4237110704004189e-06, "loss": 1.0255, "mean_token_accuracy": 0.7240989219397307, "num_tokens": 27610121.0, "step": 3550 }, { "entropy": 1.1143604163080454, "epoch": 0.9303999345710313, "grad_norm": 0.3023637533187866, "learning_rate": 1.3975399110180581e-06, "loss": 1.1067, "mean_token_accuracy": 0.7260128542780876, "num_tokens": 27648696.0, "step": 3555 }, { "entropy": 1.0836713276803493, "epoch": 0.9317085139445489, "grad_norm": 0.6137750148773193, "learning_rate": 1.3713687516356975e-06, "loss": 1.0442, "mean_token_accuracy": 0.7201635256409645, "num_tokens": 27682007.0, "step": 3560 }, { "entropy": 1.1006150282919407, "epoch": 0.9330170933180666, "grad_norm": 0.44908300042152405, "learning_rate": 1.345197592253337e-06, "loss": 1.1144, "mean_token_accuracy": 0.7233285054564476, "num_tokens": 27719353.0, "step": 3565 }, { "entropy": 1.0796324241906405, "epoch": 0.9343256726915842, "grad_norm": 0.4785999655723572, "learning_rate": 1.3190264328709762e-06, "loss": 1.0587, "mean_token_accuracy": 0.725184840708971, "num_tokens": 27760487.0, "step": 3570 }, { "entropy": 1.0421875026077032, "epoch": 0.9356342520651019, "grad_norm": 0.5122357606887817, "learning_rate": 1.2928552734886158e-06, "loss": 1.025, "mean_token_accuracy": 0.7302470609545708, "num_tokens": 27803053.0, "step": 3575 }, { "entropy": 1.0390013061463832, "epoch": 0.9369428314386195, "grad_norm": 0.5996967554092407, "learning_rate": 1.266684114106255e-06, "loss": 0.9942, "mean_token_accuracy": 0.7326175101101399, "num_tokens": 27841890.0, "step": 3580 }, { "entropy": 1.0458267323672772, "epoch": 0.938251410812137, "grad_norm": 0.4176723062992096, "learning_rate": 1.2405129547238943e-06, "loss": 1.0177, "mean_token_accuracy": 0.724136520922184, "num_tokens": 27882510.0, "step": 3585 }, { "entropy": 1.0069429714232683, "epoch": 0.9395599901856547, "grad_norm": 0.40060707926750183, "learning_rate": 1.2143417953415337e-06, "loss": 0.9671, "mean_token_accuracy": 0.7366093017160893, "num_tokens": 27923388.0, "step": 3590 }, { "entropy": 1.16991759352386, "epoch": 0.9408685695591723, "grad_norm": 0.3461279571056366, "learning_rate": 1.1881706359591731e-06, "loss": 1.1462, "mean_token_accuracy": 0.7035489819943905, "num_tokens": 27964084.0, "step": 3595 }, { "entropy": 1.0686412911862135, "epoch": 0.94217714893269, "grad_norm": 0.2957308888435364, "learning_rate": 1.1619994765768126e-06, "loss": 1.1066, "mean_token_accuracy": 0.7225402969866991, "num_tokens": 28001682.0, "step": 3600 }, { "epoch": 0.94217714893269, "eval_entropy": 1.0279519958496093, "eval_loss": 1.0255744457244873, "eval_mean_token_accuracy": 0.7317811703681946, "eval_num_tokens": 28001682.0, "eval_runtime": 97.2947, "eval_samples_per_second": 10.278, "eval_steps_per_second": 1.285, "step": 3600 }, { "entropy": 1.0206676226109266, "epoch": 0.9434857283062076, "grad_norm": 0.35788989067077637, "learning_rate": 1.1358283171944518e-06, "loss": 1.0176, "mean_token_accuracy": 0.7300234287977219, "num_tokens": 28042579.0, "step": 3605 }, { "entropy": 1.1715757973492145, "epoch": 0.9447943076797252, "grad_norm": 0.6498376131057739, "learning_rate": 1.1096571578120912e-06, "loss": 1.2274, "mean_token_accuracy": 0.7090075612068176, "num_tokens": 28077521.0, "step": 3610 }, { "entropy": 1.0543236445635558, "epoch": 0.9461028870532429, "grad_norm": 0.3194473385810852, "learning_rate": 1.0834859984297304e-06, "loss": 1.0491, "mean_token_accuracy": 0.72547093257308, "num_tokens": 28116840.0, "step": 3615 }, { "entropy": 1.1037466993555427, "epoch": 0.9474114664267604, "grad_norm": 0.6278566718101501, "learning_rate": 1.0573148390473699e-06, "loss": 1.0452, "mean_token_accuracy": 0.7226000647991896, "num_tokens": 28153551.0, "step": 3620 }, { "entropy": 1.0038308002054692, "epoch": 0.948720045800278, "grad_norm": 0.48984530568122864, "learning_rate": 1.0311436796650093e-06, "loss": 0.9795, "mean_token_accuracy": 0.7407138921320439, "num_tokens": 28189324.0, "step": 3625 }, { "entropy": 1.0743804059922695, "epoch": 0.9500286251737957, "grad_norm": 0.35918810963630676, "learning_rate": 1.0049725202826487e-06, "loss": 1.1096, "mean_token_accuracy": 0.7247271824628114, "num_tokens": 28230753.0, "step": 3630 }, { "entropy": 1.0661400273442267, "epoch": 0.9513372045473133, "grad_norm": 0.45430681109428406, "learning_rate": 9.78801360900288e-07, "loss": 1.0431, "mean_token_accuracy": 0.7261486738920212, "num_tokens": 28267961.0, "step": 3635 }, { "entropy": 1.0930729560554027, "epoch": 0.952645783920831, "grad_norm": 0.5693202018737793, "learning_rate": 9.526302015179273e-07, "loss": 1.0501, "mean_token_accuracy": 0.7252973057329655, "num_tokens": 28303967.0, "step": 3640 }, { "entropy": 1.124368030577898, "epoch": 0.9539543632943486, "grad_norm": 0.5943740010261536, "learning_rate": 9.264590421355667e-07, "loss": 1.148, "mean_token_accuracy": 0.7163769982755184, "num_tokens": 28339080.0, "step": 3645 }, { "entropy": 1.0308339811861515, "epoch": 0.9552629426678662, "grad_norm": 0.4916401505470276, "learning_rate": 9.00287882753206e-07, "loss": 0.9571, "mean_token_accuracy": 0.7355926677584648, "num_tokens": 28376833.0, "step": 3650 }, { "entropy": 1.0489124123007059, "epoch": 0.9565715220413838, "grad_norm": 0.5581033229827881, "learning_rate": 8.741167233708454e-07, "loss": 1.0203, "mean_token_accuracy": 0.7289952971041203, "num_tokens": 28413990.0, "step": 3655 }, { "entropy": 1.027711633220315, "epoch": 0.9578801014149014, "grad_norm": 0.5208143591880798, "learning_rate": 8.479455639884849e-07, "loss": 0.985, "mean_token_accuracy": 0.7333911582827568, "num_tokens": 28451250.0, "step": 3660 }, { "entropy": 1.0729194439947605, "epoch": 0.9591886807884191, "grad_norm": 0.6603464484214783, "learning_rate": 8.217744046061241e-07, "loss": 1.1221, "mean_token_accuracy": 0.7259935919195414, "num_tokens": 28488287.0, "step": 3665 }, { "entropy": 1.1133702561259269, "epoch": 0.9604972601619367, "grad_norm": 0.45469507575035095, "learning_rate": 7.956032452237634e-07, "loss": 1.0952, "mean_token_accuracy": 0.7223341464996338, "num_tokens": 28523839.0, "step": 3670 }, { "entropy": 1.0385285302996636, "epoch": 0.9618058395354543, "grad_norm": 0.4402262270450592, "learning_rate": 7.694320858414028e-07, "loss": 1.0786, "mean_token_accuracy": 0.7375961367040873, "num_tokens": 28561415.0, "step": 3675 }, { "entropy": 1.0912371952086688, "epoch": 0.963114418908972, "grad_norm": 0.4388654828071594, "learning_rate": 7.432609264590422e-07, "loss": 1.1011, "mean_token_accuracy": 0.7208574242889881, "num_tokens": 28598012.0, "step": 3680 }, { "entropy": 1.0840743597596885, "epoch": 0.9644229982824896, "grad_norm": 0.5213295221328735, "learning_rate": 7.170897670766816e-07, "loss": 1.0878, "mean_token_accuracy": 0.7260554354637861, "num_tokens": 28640120.0, "step": 3685 }, { "entropy": 1.0481572005897761, "epoch": 0.9657315776560071, "grad_norm": 0.6161360740661621, "learning_rate": 6.90918607694321e-07, "loss": 0.9982, "mean_token_accuracy": 0.7346723277121783, "num_tokens": 28680039.0, "step": 3690 }, { "entropy": 1.081262205913663, "epoch": 0.9670401570295248, "grad_norm": 0.5245823264122009, "learning_rate": 6.647474483119602e-07, "loss": 1.0661, "mean_token_accuracy": 0.7203562073409557, "num_tokens": 28720731.0, "step": 3695 }, { "entropy": 1.0632508873939515, "epoch": 0.9683487364030424, "grad_norm": 0.3406558334827423, "learning_rate": 6.385762889295996e-07, "loss": 1.011, "mean_token_accuracy": 0.7315301813185215, "num_tokens": 28758379.0, "step": 3700 }, { "epoch": 0.9683487364030424, "eval_entropy": 1.0293308172225952, "eval_loss": 1.0250952243804932, "eval_mean_token_accuracy": 0.7318736510276794, "eval_num_tokens": 28758379.0, "eval_runtime": 97.2597, "eval_samples_per_second": 10.282, "eval_steps_per_second": 1.285, "step": 3700 }, { "entropy": 1.0361842691898346, "epoch": 0.9696573157765601, "grad_norm": 0.3829176127910614, "learning_rate": 6.12405129547239e-07, "loss": 1.0297, "mean_token_accuracy": 0.738416987657547, "num_tokens": 28801046.0, "step": 3705 }, { "entropy": 0.9786388706415892, "epoch": 0.9709658951500777, "grad_norm": 0.42764556407928467, "learning_rate": 5.862339701648783e-07, "loss": 0.897, "mean_token_accuracy": 0.7432105515152216, "num_tokens": 28841846.0, "step": 3710 }, { "entropy": 1.0343420587480068, "epoch": 0.9722744745235953, "grad_norm": 0.3440265357494354, "learning_rate": 5.600628107825177e-07, "loss": 1.0067, "mean_token_accuracy": 0.7312737897038459, "num_tokens": 28888566.0, "step": 3715 }, { "entropy": 1.0930624961853028, "epoch": 0.973583053897113, "grad_norm": 0.26113247871398926, "learning_rate": 5.338916514001571e-07, "loss": 1.1205, "mean_token_accuracy": 0.7230574164539576, "num_tokens": 28926022.0, "step": 3720 }, { "entropy": 1.0122245840728283, "epoch": 0.9748916332706306, "grad_norm": 0.4856005012989044, "learning_rate": 5.077204920177964e-07, "loss": 1.0115, "mean_token_accuracy": 0.7327728264033795, "num_tokens": 28967341.0, "step": 3725 }, { "entropy": 1.0293555181473493, "epoch": 0.9762002126441482, "grad_norm": 0.4436942934989929, "learning_rate": 4.815493326354357e-07, "loss": 1.0318, "mean_token_accuracy": 0.7331165555864573, "num_tokens": 29011545.0, "step": 3730 }, { "entropy": 1.0567047752439975, "epoch": 0.9775087920176658, "grad_norm": 0.46222445368766785, "learning_rate": 4.5537817325307516e-07, "loss": 1.0437, "mean_token_accuracy": 0.7290311522781849, "num_tokens": 29050339.0, "step": 3735 }, { "entropy": 1.0223099350929261, "epoch": 0.9788173713911834, "grad_norm": 0.6509369611740112, "learning_rate": 4.2920701387071454e-07, "loss": 1.0271, "mean_token_accuracy": 0.743386908620596, "num_tokens": 29090275.0, "step": 3740 }, { "entropy": 1.0612529110163451, "epoch": 0.9801259507647011, "grad_norm": 0.5703464150428772, "learning_rate": 4.0303585448835386e-07, "loss": 1.0539, "mean_token_accuracy": 0.7298849746584892, "num_tokens": 29129073.0, "step": 3745 }, { "entropy": 1.0348493970930577, "epoch": 0.9814345301382187, "grad_norm": 0.35511285066604614, "learning_rate": 3.7686469510599324e-07, "loss": 1.0283, "mean_token_accuracy": 0.7344648085534573, "num_tokens": 29171021.0, "step": 3750 }, { "entropy": 1.1379429820924998, "epoch": 0.9827431095117363, "grad_norm": 0.3323829174041748, "learning_rate": 3.506935357236326e-07, "loss": 1.1105, "mean_token_accuracy": 0.7117891266942025, "num_tokens": 29206905.0, "step": 3755 }, { "entropy": 1.0437415711581708, "epoch": 0.984051688885254, "grad_norm": 0.40716320276260376, "learning_rate": 3.2452237634127194e-07, "loss": 1.0219, "mean_token_accuracy": 0.7391361027956009, "num_tokens": 29247960.0, "step": 3760 }, { "entropy": 1.1265955708920956, "epoch": 0.9853602682587715, "grad_norm": 0.6306870579719543, "learning_rate": 2.983512169589113e-07, "loss": 1.062, "mean_token_accuracy": 0.7148844588547945, "num_tokens": 29282043.0, "step": 3765 }, { "entropy": 1.0719765815883875, "epoch": 0.9866688476322892, "grad_norm": 0.35759237408638, "learning_rate": 2.7218005757655065e-07, "loss": 1.0682, "mean_token_accuracy": 0.7296040445566178, "num_tokens": 29323533.0, "step": 3770 }, { "entropy": 1.0471963623538614, "epoch": 0.9879774270058068, "grad_norm": 0.4440222382545471, "learning_rate": 2.4600889819419e-07, "loss": 0.9946, "mean_token_accuracy": 0.7308863338083029, "num_tokens": 29364067.0, "step": 3775 }, { "entropy": 1.0608526386320591, "epoch": 0.9892860063793244, "grad_norm": 0.41301101446151733, "learning_rate": 2.198377388118294e-07, "loss": 1.0721, "mean_token_accuracy": 0.730213138461113, "num_tokens": 29411394.0, "step": 3780 }, { "entropy": 1.042181348055601, "epoch": 0.9905945857528421, "grad_norm": 0.7013579607009888, "learning_rate": 1.9366657942946875e-07, "loss": 0.9976, "mean_token_accuracy": 0.7371749997138977, "num_tokens": 29447775.0, "step": 3785 }, { "entropy": 1.0719102643430234, "epoch": 0.9919031651263597, "grad_norm": 0.6451042890548706, "learning_rate": 1.6749542004710808e-07, "loss": 1.0273, "mean_token_accuracy": 0.725009347498417, "num_tokens": 29482789.0, "step": 3790 }, { "entropy": 1.0650145791471004, "epoch": 0.9932117444998774, "grad_norm": 0.46040284633636475, "learning_rate": 1.4132426066474745e-07, "loss": 1.0511, "mean_token_accuracy": 0.7309038709849119, "num_tokens": 29522469.0, "step": 3795 }, { "entropy": 1.099685497954488, "epoch": 0.994520323873395, "grad_norm": 0.47909924387931824, "learning_rate": 1.1515310128238682e-07, "loss": 1.0294, "mean_token_accuracy": 0.7283653065562248, "num_tokens": 29558695.0, "step": 3800 }, { "epoch": 0.994520323873395, "eval_entropy": 1.027329535484314, "eval_loss": 1.0250593423843384, "eval_mean_token_accuracy": 0.7319152827262878, "eval_num_tokens": 29558695.0, "eval_runtime": 97.2485, "eval_samples_per_second": 10.283, "eval_steps_per_second": 1.285, "step": 3800 }, { "entropy": 1.0765836983919144, "epoch": 0.9958289032469125, "grad_norm": 0.46970200538635254, "learning_rate": 8.898194190002618e-08, "loss": 1.0639, "mean_token_accuracy": 0.7184258241206407, "num_tokens": 29599289.0, "step": 3805 }, { "entropy": 1.0330897614359855, "epoch": 0.9971374826204302, "grad_norm": 0.34983229637145996, "learning_rate": 6.281078251766554e-08, "loss": 1.0025, "mean_token_accuracy": 0.7342484682798386, "num_tokens": 29637188.0, "step": 3810 }, { "entropy": 1.04201779961586, "epoch": 0.9984460619939478, "grad_norm": 0.39102649688720703, "learning_rate": 3.6639623135304896e-08, "loss": 0.9961, "mean_token_accuracy": 0.7322810731828213, "num_tokens": 29671533.0, "step": 3815 }, { "entropy": 1.1104622691869737, "epoch": 0.9997546413674654, "grad_norm": 0.39447513222694397, "learning_rate": 1.0468463752944255e-08, "loss": 1.1002, "mean_token_accuracy": 0.7095972750335932, "num_tokens": 29712231.0, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 1.297831890260091e+18, "train_loss": 1.0613669765554028, "train_runtime": 22265.1627, "train_samples_per_second": 2.746, "train_steps_per_second": 0.172 } ], "logging_steps": 5, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.297831890260091e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }