| { | |
| "best_global_step": 3800, | |
| "best_metric": 1.0250593423843384, | |
| "best_model_checkpoint": "/workspace/adversarial-rlhf/runs/sft-OpenHermes-2.5-Mistral-7B-20251108-1943/checkpoints/checkpoint-3800", | |
| "epoch": 1.0, | |
| "eval_steps": 100, | |
| "global_step": 3821, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.1123480509966612, | |
| "epoch": 0.0013085793735176249, | |
| "grad_norm": 0.6217469573020935, | |
| "learning_rate": 1.9979063072494113e-05, | |
| "loss": 1.4467, | |
| "mean_token_accuracy": 0.6513771142810583, | |
| "num_tokens": 38846.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.1004129022359848, | |
| "epoch": 0.0026171587470352497, | |
| "grad_norm": 0.4485588073730469, | |
| "learning_rate": 1.995289191311175e-05, | |
| "loss": 1.3538, | |
| "mean_token_accuracy": 0.6656047463417053, | |
| "num_tokens": 77083.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.1241086978465318, | |
| "epoch": 0.003925738120552875, | |
| "grad_norm": 0.4486566483974457, | |
| "learning_rate": 1.9926720753729393e-05, | |
| "loss": 1.2136, | |
| "mean_token_accuracy": 0.6849155448377132, | |
| "num_tokens": 113871.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 1.2270274113863706, | |
| "epoch": 0.005234317494070499, | |
| "grad_norm": 0.3619006276130676, | |
| "learning_rate": 1.990054959434703e-05, | |
| "loss": 1.1882, | |
| "mean_token_accuracy": 0.6774788822978735, | |
| "num_tokens": 149409.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.21558427400887, | |
| "epoch": 0.006542896867588125, | |
| "grad_norm": 0.3003220558166504, | |
| "learning_rate": 1.987437843496467e-05, | |
| "loss": 1.2674, | |
| "mean_token_accuracy": 0.6861417829990387, | |
| "num_tokens": 191753.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 1.2555227734148502, | |
| "epoch": 0.00785147624110575, | |
| "grad_norm": 0.28489917516708374, | |
| "learning_rate": 1.9848207275582308e-05, | |
| "loss": 1.1575, | |
| "mean_token_accuracy": 0.6786608777940273, | |
| "num_tokens": 236354.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.2973661322146655, | |
| "epoch": 0.009160055614623375, | |
| "grad_norm": 0.2774274945259094, | |
| "learning_rate": 1.982203611619995e-05, | |
| "loss": 1.2217, | |
| "mean_token_accuracy": 0.6799626011401415, | |
| "num_tokens": 276048.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 1.196922979876399, | |
| "epoch": 0.010468634988140999, | |
| "grad_norm": 0.2974745035171509, | |
| "learning_rate": 1.979586495681759e-05, | |
| "loss": 1.1239, | |
| "mean_token_accuracy": 0.6995673455297947, | |
| "num_tokens": 313936.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.2235115006566049, | |
| "epoch": 0.011777214361658624, | |
| "grad_norm": 0.2803105115890503, | |
| "learning_rate": 1.9769693797435227e-05, | |
| "loss": 1.1753, | |
| "mean_token_accuracy": 0.6886466249823571, | |
| "num_tokens": 354248.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 1.2047650370746852, | |
| "epoch": 0.01308579373517625, | |
| "grad_norm": 0.24354718625545502, | |
| "learning_rate": 1.974352263805287e-05, | |
| "loss": 1.1418, | |
| "mean_token_accuracy": 0.6953188866376877, | |
| "num_tokens": 392556.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.174419540166855, | |
| "epoch": 0.014394373108693874, | |
| "grad_norm": 0.2286166548728943, | |
| "learning_rate": 1.9717351478670507e-05, | |
| "loss": 1.1289, | |
| "mean_token_accuracy": 0.6958644594997168, | |
| "num_tokens": 429553.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 1.2164320405572653, | |
| "epoch": 0.0157029524822115, | |
| "grad_norm": 0.24150270223617554, | |
| "learning_rate": 1.9691180319288145e-05, | |
| "loss": 1.1822, | |
| "mean_token_accuracy": 0.6899411961436271, | |
| "num_tokens": 471479.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.3109304085373878, | |
| "epoch": 0.017011531855729123, | |
| "grad_norm": 0.2634604275226593, | |
| "learning_rate": 1.9665009159905787e-05, | |
| "loss": 1.2606, | |
| "mean_token_accuracy": 0.6797668907791377, | |
| "num_tokens": 507880.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 1.3245886493474246, | |
| "epoch": 0.01832011122924675, | |
| "grad_norm": 0.2704319953918457, | |
| "learning_rate": 1.9638838000523425e-05, | |
| "loss": 1.1799, | |
| "mean_token_accuracy": 0.6835815001279115, | |
| "num_tokens": 545408.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.2262448083609343, | |
| "epoch": 0.019628690602764374, | |
| "grad_norm": 0.2790036201477051, | |
| "learning_rate": 1.9612666841141064e-05, | |
| "loss": 1.1106, | |
| "mean_token_accuracy": 0.6929917622357606, | |
| "num_tokens": 583634.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 1.265508808940649, | |
| "epoch": 0.020937269976281998, | |
| "grad_norm": 0.22567331790924072, | |
| "learning_rate": 1.9586495681758702e-05, | |
| "loss": 1.2072, | |
| "mean_token_accuracy": 0.6882555760443211, | |
| "num_tokens": 619048.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.2399516824632884, | |
| "epoch": 0.022245849349799625, | |
| "grad_norm": 0.26959890127182007, | |
| "learning_rate": 1.9560324522376344e-05, | |
| "loss": 1.1765, | |
| "mean_token_accuracy": 0.6866359710693359, | |
| "num_tokens": 657325.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 1.2313102029263974, | |
| "epoch": 0.02355442872331725, | |
| "grad_norm": 0.31447833776474, | |
| "learning_rate": 1.9534153362993982e-05, | |
| "loss": 1.1615, | |
| "mean_token_accuracy": 0.6929020721465349, | |
| "num_tokens": 692549.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.29357091486454, | |
| "epoch": 0.024863008096834872, | |
| "grad_norm": 0.27703842520713806, | |
| "learning_rate": 1.950798220361162e-05, | |
| "loss": 1.2535, | |
| "mean_token_accuracy": 0.6816725376993418, | |
| "num_tokens": 730459.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 1.1631319765001535, | |
| "epoch": 0.0261715874703525, | |
| "grad_norm": 0.30061009526252747, | |
| "learning_rate": 1.9481811044229262e-05, | |
| "loss": 1.0757, | |
| "mean_token_accuracy": 0.7062662675976753, | |
| "num_tokens": 770154.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0261715874703525, | |
| "eval_entropy": 1.138489418029785, | |
| "eval_loss": 1.1535744667053223, | |
| "eval_mean_token_accuracy": 0.7070313329696656, | |
| "eval_num_tokens": 770154.0, | |
| "eval_runtime": 96.9826, | |
| "eval_samples_per_second": 10.311, | |
| "eval_steps_per_second": 1.289, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.2952044211328029, | |
| "epoch": 0.027480166843870123, | |
| "grad_norm": 0.3016025125980377, | |
| "learning_rate": 1.94556398848469e-05, | |
| "loss": 1.1798, | |
| "mean_token_accuracy": 0.6826052766293287, | |
| "num_tokens": 806341.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 1.2129868753254414, | |
| "epoch": 0.028788746217387747, | |
| "grad_norm": 0.2947443425655365, | |
| "learning_rate": 1.942946872546454e-05, | |
| "loss": 1.1327, | |
| "mean_token_accuracy": 0.6927057210355997, | |
| "num_tokens": 843316.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.290729960054159, | |
| "epoch": 0.030097325590905374, | |
| "grad_norm": 0.3131401240825653, | |
| "learning_rate": 1.940329756608218e-05, | |
| "loss": 1.1927, | |
| "mean_token_accuracy": 0.6800491612404584, | |
| "num_tokens": 879843.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 1.1782143030315637, | |
| "epoch": 0.031405904964423, | |
| "grad_norm": 0.2727121114730835, | |
| "learning_rate": 1.937712640669982e-05, | |
| "loss": 1.0659, | |
| "mean_token_accuracy": 0.700430641323328, | |
| "num_tokens": 920743.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.1695528313517571, | |
| "epoch": 0.03271448433794062, | |
| "grad_norm": 0.26583895087242126, | |
| "learning_rate": 1.9350955247317458e-05, | |
| "loss": 1.0232, | |
| "mean_token_accuracy": 0.7075442411005497, | |
| "num_tokens": 959096.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 1.2109053663909435, | |
| "epoch": 0.034023063711458246, | |
| "grad_norm": 0.3991076946258545, | |
| "learning_rate": 1.93247840879351e-05, | |
| "loss": 1.1003, | |
| "mean_token_accuracy": 0.699517872184515, | |
| "num_tokens": 992352.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.2005993926897645, | |
| "epoch": 0.035331643084975876, | |
| "grad_norm": 0.22760024666786194, | |
| "learning_rate": 1.9298612928552734e-05, | |
| "loss": 1.0873, | |
| "mean_token_accuracy": 0.7063136156648397, | |
| "num_tokens": 1031994.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 1.2139225870370864, | |
| "epoch": 0.0366402224584935, | |
| "grad_norm": 0.2275068610906601, | |
| "learning_rate": 1.9272441769170376e-05, | |
| "loss": 1.2156, | |
| "mean_token_accuracy": 0.6952568493783474, | |
| "num_tokens": 1074569.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.1324009403586388, | |
| "epoch": 0.037948801832011124, | |
| "grad_norm": 0.29410237073898315, | |
| "learning_rate": 1.9246270609788015e-05, | |
| "loss": 1.0779, | |
| "mean_token_accuracy": 0.7172829706221819, | |
| "num_tokens": 1113952.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 1.2293724507093429, | |
| "epoch": 0.03925738120552875, | |
| "grad_norm": 0.37723538279533386, | |
| "learning_rate": 1.9220099450405653e-05, | |
| "loss": 1.146, | |
| "mean_token_accuracy": 0.6888038989156484, | |
| "num_tokens": 1148309.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.2574020594358444, | |
| "epoch": 0.04056596057904637, | |
| "grad_norm": 0.2657281756401062, | |
| "learning_rate": 1.9193928291023295e-05, | |
| "loss": 1.1935, | |
| "mean_token_accuracy": 0.6899809587746859, | |
| "num_tokens": 1189282.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 1.2693338964134455, | |
| "epoch": 0.041874539952563995, | |
| "grad_norm": 0.3656497895717621, | |
| "learning_rate": 1.9167757131640933e-05, | |
| "loss": 1.1972, | |
| "mean_token_accuracy": 0.6868170712143182, | |
| "num_tokens": 1227946.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.2286424834281207, | |
| "epoch": 0.043183119326081626, | |
| "grad_norm": 0.3276367485523224, | |
| "learning_rate": 1.914158597225857e-05, | |
| "loss": 1.2015, | |
| "mean_token_accuracy": 0.6974392894655466, | |
| "num_tokens": 1263944.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 1.1861646961420775, | |
| "epoch": 0.04449169869959925, | |
| "grad_norm": 0.23723381757736206, | |
| "learning_rate": 1.9115414812876213e-05, | |
| "loss": 1.0972, | |
| "mean_token_accuracy": 0.7079249806702137, | |
| "num_tokens": 1304585.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.208771700784564, | |
| "epoch": 0.045800278073116873, | |
| "grad_norm": 0.26714691519737244, | |
| "learning_rate": 1.908924365349385e-05, | |
| "loss": 1.1027, | |
| "mean_token_accuracy": 0.6939801312983036, | |
| "num_tokens": 1348683.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 1.2366201855242251, | |
| "epoch": 0.0471088574466345, | |
| "grad_norm": 0.2907228171825409, | |
| "learning_rate": 1.906307249411149e-05, | |
| "loss": 1.1874, | |
| "mean_token_accuracy": 0.6881870869547129, | |
| "num_tokens": 1383690.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.1898431174457074, | |
| "epoch": 0.04841743682015212, | |
| "grad_norm": 0.307679146528244, | |
| "learning_rate": 1.9036901334729128e-05, | |
| "loss": 1.0673, | |
| "mean_token_accuracy": 0.6996020458638668, | |
| "num_tokens": 1421508.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 1.187743027880788, | |
| "epoch": 0.049726016193669745, | |
| "grad_norm": 0.3183029890060425, | |
| "learning_rate": 1.901073017534677e-05, | |
| "loss": 1.0849, | |
| "mean_token_accuracy": 0.7036747265607118, | |
| "num_tokens": 1462197.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.2066817820072173, | |
| "epoch": 0.051034595567187375, | |
| "grad_norm": 0.3657572865486145, | |
| "learning_rate": 1.898455901596441e-05, | |
| "loss": 1.1581, | |
| "mean_token_accuracy": 0.6929362419992685, | |
| "num_tokens": 1495983.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 1.2126169182360171, | |
| "epoch": 0.052343174940705, | |
| "grad_norm": 0.29915717244148254, | |
| "learning_rate": 1.8958387856582047e-05, | |
| "loss": 1.209, | |
| "mean_token_accuracy": 0.6987330242991447, | |
| "num_tokens": 1533227.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.052343174940705, | |
| "eval_entropy": 1.1249961452484132, | |
| "eval_loss": 1.1296290159225464, | |
| "eval_mean_token_accuracy": 0.7111005320549011, | |
| "eval_num_tokens": 1533227.0, | |
| "eval_runtime": 96.9069, | |
| "eval_samples_per_second": 10.319, | |
| "eval_steps_per_second": 1.29, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.2599914189428092, | |
| "epoch": 0.05365175431422262, | |
| "grad_norm": 0.27243492007255554, | |
| "learning_rate": 1.893221669719969e-05, | |
| "loss": 1.1688, | |
| "mean_token_accuracy": 0.6900306183844804, | |
| "num_tokens": 1571122.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 1.182131139561534, | |
| "epoch": 0.05496033368774025, | |
| "grad_norm": 0.3210395276546478, | |
| "learning_rate": 1.8906045537817327e-05, | |
| "loss": 1.0958, | |
| "mean_token_accuracy": 0.7051124095916748, | |
| "num_tokens": 1610597.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.1748668916523457, | |
| "epoch": 0.05626891306125787, | |
| "grad_norm": 0.289655864238739, | |
| "learning_rate": 1.8879874378434965e-05, | |
| "loss": 1.1017, | |
| "mean_token_accuracy": 0.704343120381236, | |
| "num_tokens": 1647358.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 1.2121441032737494, | |
| "epoch": 0.057577492434775494, | |
| "grad_norm": 0.27067476511001587, | |
| "learning_rate": 1.8853703219052607e-05, | |
| "loss": 1.0814, | |
| "mean_token_accuracy": 0.7023848608136177, | |
| "num_tokens": 1683267.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.2210756927728652, | |
| "epoch": 0.058886071808293125, | |
| "grad_norm": 0.2811789810657501, | |
| "learning_rate": 1.8827532059670245e-05, | |
| "loss": 1.1921, | |
| "mean_token_accuracy": 0.6895555172115564, | |
| "num_tokens": 1720402.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 1.2133880224078895, | |
| "epoch": 0.06019465118181075, | |
| "grad_norm": 0.2670990526676178, | |
| "learning_rate": 1.8801360900287884e-05, | |
| "loss": 1.1452, | |
| "mean_token_accuracy": 0.6956219498068095, | |
| "num_tokens": 1760874.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.1936849866062402, | |
| "epoch": 0.06150323055532837, | |
| "grad_norm": 0.23748992383480072, | |
| "learning_rate": 1.8775189740905526e-05, | |
| "loss": 1.1095, | |
| "mean_token_accuracy": 0.701577215641737, | |
| "num_tokens": 1798253.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 1.1502909563481807, | |
| "epoch": 0.062811809928846, | |
| "grad_norm": 0.26068389415740967, | |
| "learning_rate": 1.8749018581523164e-05, | |
| "loss": 1.158, | |
| "mean_token_accuracy": 0.7133214943110943, | |
| "num_tokens": 1833443.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.3078780982643365, | |
| "epoch": 0.06412038930236362, | |
| "grad_norm": 0.4137086272239685, | |
| "learning_rate": 1.8722847422140802e-05, | |
| "loss": 1.2878, | |
| "mean_token_accuracy": 0.6780555315315724, | |
| "num_tokens": 1871910.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 1.2249418403953314, | |
| "epoch": 0.06542896867588124, | |
| "grad_norm": 0.2942444384098053, | |
| "learning_rate": 1.869667626275844e-05, | |
| "loss": 1.1637, | |
| "mean_token_accuracy": 0.6950553271919488, | |
| "num_tokens": 1909524.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.211066035553813, | |
| "epoch": 0.06673754804939887, | |
| "grad_norm": 0.3243419826030731, | |
| "learning_rate": 1.8670505103376082e-05, | |
| "loss": 1.1516, | |
| "mean_token_accuracy": 0.6988872833549976, | |
| "num_tokens": 1949325.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 1.1659034360200167, | |
| "epoch": 0.06804612742291649, | |
| "grad_norm": 0.2724755108356476, | |
| "learning_rate": 1.864433394399372e-05, | |
| "loss": 1.1041, | |
| "mean_token_accuracy": 0.7077850546687842, | |
| "num_tokens": 1992509.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.1442662578076124, | |
| "epoch": 0.06935470679643412, | |
| "grad_norm": 0.22777320444583893, | |
| "learning_rate": 1.861816278461136e-05, | |
| "loss": 1.0722, | |
| "mean_token_accuracy": 0.7042363248765469, | |
| "num_tokens": 2025452.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 1.1255293242633342, | |
| "epoch": 0.07066328616995175, | |
| "grad_norm": 0.30172210931777954, | |
| "learning_rate": 1.8591991625229e-05, | |
| "loss": 1.039, | |
| "mean_token_accuracy": 0.7133398558944464, | |
| "num_tokens": 2066169.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.1561335507780313, | |
| "epoch": 0.07197186554346938, | |
| "grad_norm": 0.2582629323005676, | |
| "learning_rate": 1.856582046584664e-05, | |
| "loss": 1.1285, | |
| "mean_token_accuracy": 0.6994227156043052, | |
| "num_tokens": 2107302.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 1.1572829529643058, | |
| "epoch": 0.073280444916987, | |
| "grad_norm": 0.3104718029499054, | |
| "learning_rate": 1.8539649306464278e-05, | |
| "loss": 1.0956, | |
| "mean_token_accuracy": 0.706654816865921, | |
| "num_tokens": 2143551.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.1630447298288344, | |
| "epoch": 0.07458902429050462, | |
| "grad_norm": 0.23650068044662476, | |
| "learning_rate": 1.851347814708192e-05, | |
| "loss": 1.1061, | |
| "mean_token_accuracy": 0.7114994272589683, | |
| "num_tokens": 2184046.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 1.1605928294360637, | |
| "epoch": 0.07589760366402225, | |
| "grad_norm": 0.2913525104522705, | |
| "learning_rate": 1.8487306987699554e-05, | |
| "loss": 1.1521, | |
| "mean_token_accuracy": 0.7072546981275082, | |
| "num_tokens": 2223868.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.169267463684082, | |
| "epoch": 0.07720618303753987, | |
| "grad_norm": 0.2545417547225952, | |
| "learning_rate": 1.8461135828317196e-05, | |
| "loss": 1.1464, | |
| "mean_token_accuracy": 0.7014396741986275, | |
| "num_tokens": 2265467.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 1.170828291401267, | |
| "epoch": 0.0785147624110575, | |
| "grad_norm": 0.3250999450683594, | |
| "learning_rate": 1.8434964668934835e-05, | |
| "loss": 1.1287, | |
| "mean_token_accuracy": 0.7041407331824303, | |
| "num_tokens": 2304656.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0785147624110575, | |
| "eval_entropy": 1.1109629173278808, | |
| "eval_loss": 1.1124978065490723, | |
| "eval_mean_token_accuracy": 0.7139446325302125, | |
| "eval_num_tokens": 2304656.0, | |
| "eval_runtime": 96.7866, | |
| "eval_samples_per_second": 10.332, | |
| "eval_steps_per_second": 1.292, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.2286480199545622, | |
| "epoch": 0.07982334178457512, | |
| "grad_norm": 0.2837753891944885, | |
| "learning_rate": 1.8408793509552473e-05, | |
| "loss": 1.1965, | |
| "mean_token_accuracy": 0.6960091460496187, | |
| "num_tokens": 2346109.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 1.225469772517681, | |
| "epoch": 0.08113192115809274, | |
| "grad_norm": 0.3345058262348175, | |
| "learning_rate": 1.8382622350170115e-05, | |
| "loss": 1.2391, | |
| "mean_token_accuracy": 0.6988137848675251, | |
| "num_tokens": 2387084.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.076605834439397, | |
| "epoch": 0.08244050053161037, | |
| "grad_norm": 0.2856815755367279, | |
| "learning_rate": 1.8356451190787753e-05, | |
| "loss": 1.0384, | |
| "mean_token_accuracy": 0.7207759529352188, | |
| "num_tokens": 2428351.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 1.1501711413264275, | |
| "epoch": 0.08374907990512799, | |
| "grad_norm": 0.3421330451965332, | |
| "learning_rate": 1.833028003140539e-05, | |
| "loss": 1.1022, | |
| "mean_token_accuracy": 0.7128844160586596, | |
| "num_tokens": 2465278.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.1183946415781976, | |
| "epoch": 0.08505765927864561, | |
| "grad_norm": 0.26048743724823, | |
| "learning_rate": 1.8304108872023033e-05, | |
| "loss": 1.1056, | |
| "mean_token_accuracy": 0.7164099890738725, | |
| "num_tokens": 2505150.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 1.2478729356080294, | |
| "epoch": 0.08636623865216325, | |
| "grad_norm": 0.3070579767227173, | |
| "learning_rate": 1.827793771264067e-05, | |
| "loss": 1.1858, | |
| "mean_token_accuracy": 0.6890709776431322, | |
| "num_tokens": 2545535.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.1905897859483958, | |
| "epoch": 0.08767481802568088, | |
| "grad_norm": 0.29979345202445984, | |
| "learning_rate": 1.825176655325831e-05, | |
| "loss": 1.1911, | |
| "mean_token_accuracy": 0.6996586941182613, | |
| "num_tokens": 2587914.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 1.1719540372490882, | |
| "epoch": 0.0889833973991985, | |
| "grad_norm": 0.3220307528972626, | |
| "learning_rate": 1.822559539387595e-05, | |
| "loss": 1.1214, | |
| "mean_token_accuracy": 0.7045478235930205, | |
| "num_tokens": 2625193.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.1586624164134265, | |
| "epoch": 0.09029197677271612, | |
| "grad_norm": 0.2599794566631317, | |
| "learning_rate": 1.819942423449359e-05, | |
| "loss": 1.0431, | |
| "mean_token_accuracy": 0.7106049194931984, | |
| "num_tokens": 2665612.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 1.1819968853145837, | |
| "epoch": 0.09160055614623375, | |
| "grad_norm": 0.29061323404312134, | |
| "learning_rate": 1.817325307511123e-05, | |
| "loss": 1.1262, | |
| "mean_token_accuracy": 0.7002443138509988, | |
| "num_tokens": 2706946.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.1458000589162112, | |
| "epoch": 0.09290913551975137, | |
| "grad_norm": 0.347755491733551, | |
| "learning_rate": 1.8147081915728867e-05, | |
| "loss": 1.0993, | |
| "mean_token_accuracy": 0.7002494022250175, | |
| "num_tokens": 2747375.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 1.1966037943959236, | |
| "epoch": 0.094217714893269, | |
| "grad_norm": 0.37435945868492126, | |
| "learning_rate": 1.812091075634651e-05, | |
| "loss": 1.1504, | |
| "mean_token_accuracy": 0.7029601544141769, | |
| "num_tokens": 2790158.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.1675037112087012, | |
| "epoch": 0.09552629426678662, | |
| "grad_norm": 0.32143616676330566, | |
| "learning_rate": 1.8094739596964147e-05, | |
| "loss": 1.0943, | |
| "mean_token_accuracy": 0.7053755812346936, | |
| "num_tokens": 2827819.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 1.1707987286150456, | |
| "epoch": 0.09683487364030424, | |
| "grad_norm": 0.35289084911346436, | |
| "learning_rate": 1.8068568437581785e-05, | |
| "loss": 1.1038, | |
| "mean_token_accuracy": 0.7076830513775348, | |
| "num_tokens": 2869307.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.1168341800570487, | |
| "epoch": 0.09814345301382187, | |
| "grad_norm": 0.3782387673854828, | |
| "learning_rate": 1.8042397278199427e-05, | |
| "loss": 1.0558, | |
| "mean_token_accuracy": 0.7170991696417331, | |
| "num_tokens": 2914107.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 1.1111165493726731, | |
| "epoch": 0.09945203238733949, | |
| "grad_norm": 0.3120217025279999, | |
| "learning_rate": 1.8016226118817065e-05, | |
| "loss": 1.0232, | |
| "mean_token_accuracy": 0.7135602042078972, | |
| "num_tokens": 2952137.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.1221987165510654, | |
| "epoch": 0.10076061176085711, | |
| "grad_norm": 0.3564794659614563, | |
| "learning_rate": 1.7990054959434704e-05, | |
| "loss": 1.0421, | |
| "mean_token_accuracy": 0.712994397431612, | |
| "num_tokens": 2992220.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 1.1915016632527113, | |
| "epoch": 0.10206919113437475, | |
| "grad_norm": 0.3401761054992676, | |
| "learning_rate": 1.7963883800052346e-05, | |
| "loss": 1.1218, | |
| "mean_token_accuracy": 0.704281483963132, | |
| "num_tokens": 3032423.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.1808374393731356, | |
| "epoch": 0.10337777050789237, | |
| "grad_norm": 0.32225707173347473, | |
| "learning_rate": 1.7937712640669984e-05, | |
| "loss": 1.1646, | |
| "mean_token_accuracy": 0.6994639791548252, | |
| "num_tokens": 3068229.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 1.1816586170345544, | |
| "epoch": 0.10468634988141, | |
| "grad_norm": 0.29380306601524353, | |
| "learning_rate": 1.7911541481287622e-05, | |
| "loss": 1.1252, | |
| "mean_token_accuracy": 0.7064340703189373, | |
| "num_tokens": 3108153.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.10468634988141, | |
| "eval_entropy": 1.1162484121322631, | |
| "eval_loss": 1.1014440059661865, | |
| "eval_mean_token_accuracy": 0.7159673566818238, | |
| "eval_num_tokens": 3108153.0, | |
| "eval_runtime": 96.8755, | |
| "eval_samples_per_second": 10.323, | |
| "eval_steps_per_second": 1.29, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.1671059928834437, | |
| "epoch": 0.10599492925492762, | |
| "grad_norm": 0.34220021963119507, | |
| "learning_rate": 1.788537032190526e-05, | |
| "loss": 1.0648, | |
| "mean_token_accuracy": 0.7134943757206201, | |
| "num_tokens": 3144634.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 1.1742517005652189, | |
| "epoch": 0.10730350862844525, | |
| "grad_norm": 0.3622688949108124, | |
| "learning_rate": 1.7859199162522902e-05, | |
| "loss": 1.1752, | |
| "mean_token_accuracy": 0.7035086948424578, | |
| "num_tokens": 3184006.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.193211056292057, | |
| "epoch": 0.10861208800196287, | |
| "grad_norm": 0.29637637734413147, | |
| "learning_rate": 1.783302800314054e-05, | |
| "loss": 1.1365, | |
| "mean_token_accuracy": 0.6987239971756936, | |
| "num_tokens": 3224565.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 1.251770207285881, | |
| "epoch": 0.1099206673754805, | |
| "grad_norm": 0.2511119544506073, | |
| "learning_rate": 1.780685684375818e-05, | |
| "loss": 1.2718, | |
| "mean_token_accuracy": 0.6891444839537144, | |
| "num_tokens": 3265562.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.23516104221344, | |
| "epoch": 0.11122924674899812, | |
| "grad_norm": 0.32558682560920715, | |
| "learning_rate": 1.778068568437582e-05, | |
| "loss": 1.2006, | |
| "mean_token_accuracy": 0.6901515744626522, | |
| "num_tokens": 3302848.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 1.132848785072565, | |
| "epoch": 0.11253782612251574, | |
| "grad_norm": 0.26255035400390625, | |
| "learning_rate": 1.775451452499346e-05, | |
| "loss": 1.0432, | |
| "mean_token_accuracy": 0.7118600439280272, | |
| "num_tokens": 3342127.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.180259444192052, | |
| "epoch": 0.11384640549603336, | |
| "grad_norm": 0.3205581307411194, | |
| "learning_rate": 1.7728343365611098e-05, | |
| "loss": 1.1224, | |
| "mean_token_accuracy": 0.7004977215081454, | |
| "num_tokens": 3381062.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 1.249288512021303, | |
| "epoch": 0.11515498486955099, | |
| "grad_norm": 0.31854113936424255, | |
| "learning_rate": 1.770217220622874e-05, | |
| "loss": 1.2151, | |
| "mean_token_accuracy": 0.6909528046846389, | |
| "num_tokens": 3418674.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.1658391878008842, | |
| "epoch": 0.11646356424306861, | |
| "grad_norm": 0.31040823459625244, | |
| "learning_rate": 1.7676001046846374e-05, | |
| "loss": 1.1628, | |
| "mean_token_accuracy": 0.7044171277433634, | |
| "num_tokens": 3459660.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 1.1223286792635918, | |
| "epoch": 0.11777214361658625, | |
| "grad_norm": 0.387347936630249, | |
| "learning_rate": 1.7649829887464016e-05, | |
| "loss": 1.0412, | |
| "mean_token_accuracy": 0.716064503788948, | |
| "num_tokens": 3498595.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.1415202051401139, | |
| "epoch": 0.11908072299010387, | |
| "grad_norm": 0.28736481070518494, | |
| "learning_rate": 1.7623658728081658e-05, | |
| "loss": 1.1526, | |
| "mean_token_accuracy": 0.7126259196549654, | |
| "num_tokens": 3538259.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 1.1327776111662389, | |
| "epoch": 0.1203893023636215, | |
| "grad_norm": 0.2839507460594177, | |
| "learning_rate": 1.7597487568699293e-05, | |
| "loss": 1.0416, | |
| "mean_token_accuracy": 0.7203302428126335, | |
| "num_tokens": 3577846.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.1565908435732126, | |
| "epoch": 0.12169788173713912, | |
| "grad_norm": 0.27812105417251587, | |
| "learning_rate": 1.7571316409316935e-05, | |
| "loss": 1.1039, | |
| "mean_token_accuracy": 0.7012306869029998, | |
| "num_tokens": 3621788.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 1.2063092295080424, | |
| "epoch": 0.12300646111065675, | |
| "grad_norm": 0.38487356901168823, | |
| "learning_rate": 1.7545145249934573e-05, | |
| "loss": 1.1509, | |
| "mean_token_accuracy": 0.7023797459900379, | |
| "num_tokens": 3658439.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.0865493860095738, | |
| "epoch": 0.12431504048417437, | |
| "grad_norm": 0.2500401735305786, | |
| "learning_rate": 1.751897409055221e-05, | |
| "loss": 1.003, | |
| "mean_token_accuracy": 0.7198965962976217, | |
| "num_tokens": 3698868.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 1.1908767573535441, | |
| "epoch": 0.125623619857692, | |
| "grad_norm": 0.3962502181529999, | |
| "learning_rate": 1.7492802931169853e-05, | |
| "loss": 1.1297, | |
| "mean_token_accuracy": 0.7053800851106644, | |
| "num_tokens": 3741081.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.1331688780337572, | |
| "epoch": 0.12693219923120963, | |
| "grad_norm": 0.2883553206920624, | |
| "learning_rate": 1.746663177178749e-05, | |
| "loss": 1.0536, | |
| "mean_token_accuracy": 0.7061506755650043, | |
| "num_tokens": 3780895.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 1.097454984486103, | |
| "epoch": 0.12824077860472724, | |
| "grad_norm": 0.3009795844554901, | |
| "learning_rate": 1.744046061240513e-05, | |
| "loss": 1.0696, | |
| "mean_token_accuracy": 0.7190258864313364, | |
| "num_tokens": 3821605.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.2135265406221152, | |
| "epoch": 0.12954935797824488, | |
| "grad_norm": 0.25389495491981506, | |
| "learning_rate": 1.741428945302277e-05, | |
| "loss": 1.2089, | |
| "mean_token_accuracy": 0.693002799153328, | |
| "num_tokens": 3861658.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 1.2169387113302945, | |
| "epoch": 0.1308579373517625, | |
| "grad_norm": 0.2983970642089844, | |
| "learning_rate": 1.738811829364041e-05, | |
| "loss": 1.1668, | |
| "mean_token_accuracy": 0.7012713517993688, | |
| "num_tokens": 3903328.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1308579373517625, | |
| "eval_entropy": 1.0979133324623107, | |
| "eval_loss": 1.0918148756027222, | |
| "eval_mean_token_accuracy": 0.7178149290084839, | |
| "eval_num_tokens": 3903328.0, | |
| "eval_runtime": 96.8747, | |
| "eval_samples_per_second": 10.323, | |
| "eval_steps_per_second": 1.29, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.1749178305268289, | |
| "epoch": 0.13216651672528013, | |
| "grad_norm": 0.38501033186912537, | |
| "learning_rate": 1.736194713425805e-05, | |
| "loss": 1.1413, | |
| "mean_token_accuracy": 0.707249428331852, | |
| "num_tokens": 3944540.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 1.1787702813744545, | |
| "epoch": 0.13347509609879774, | |
| "grad_norm": 0.43281009793281555, | |
| "learning_rate": 1.7335775974875687e-05, | |
| "loss": 1.1217, | |
| "mean_token_accuracy": 0.7034583434462547, | |
| "num_tokens": 3980926.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.0645542204380036, | |
| "epoch": 0.13478367547231537, | |
| "grad_norm": 0.3150987923145294, | |
| "learning_rate": 1.730960481549333e-05, | |
| "loss": 1.05, | |
| "mean_token_accuracy": 0.7132887698709964, | |
| "num_tokens": 4021891.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 1.1106295462697744, | |
| "epoch": 0.13609225484583298, | |
| "grad_norm": 0.3438737094402313, | |
| "learning_rate": 1.7283433656110967e-05, | |
| "loss": 1.0003, | |
| "mean_token_accuracy": 0.7212319687008858, | |
| "num_tokens": 4060166.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.1168012116104364, | |
| "epoch": 0.13740083421935062, | |
| "grad_norm": 0.3113572895526886, | |
| "learning_rate": 1.7257262496728605e-05, | |
| "loss": 1.0543, | |
| "mean_token_accuracy": 0.7209181990474463, | |
| "num_tokens": 4094716.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 1.175467026978731, | |
| "epoch": 0.13870941359286823, | |
| "grad_norm": 0.302837073802948, | |
| "learning_rate": 1.7231091337346247e-05, | |
| "loss": 1.1289, | |
| "mean_token_accuracy": 0.7042849272489548, | |
| "num_tokens": 4141339.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.0599542949348688, | |
| "epoch": 0.14001799296638587, | |
| "grad_norm": 0.2495729923248291, | |
| "learning_rate": 1.7204920177963885e-05, | |
| "loss": 1.052, | |
| "mean_token_accuracy": 0.7274259474128485, | |
| "num_tokens": 4185050.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 1.1991025242954492, | |
| "epoch": 0.1413265723399035, | |
| "grad_norm": 0.4198792576789856, | |
| "learning_rate": 1.7178749018581524e-05, | |
| "loss": 1.1902, | |
| "mean_token_accuracy": 0.6947382442653179, | |
| "num_tokens": 4223794.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.227722565829754, | |
| "epoch": 0.14263515171342112, | |
| "grad_norm": 0.3961770236492157, | |
| "learning_rate": 1.7152577859199166e-05, | |
| "loss": 1.1986, | |
| "mean_token_accuracy": 0.6865826688706875, | |
| "num_tokens": 4255882.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 1.1813674479722978, | |
| "epoch": 0.14394373108693875, | |
| "grad_norm": 0.33829399943351746, | |
| "learning_rate": 1.7126406699816804e-05, | |
| "loss": 1.1159, | |
| "mean_token_accuracy": 0.7054100755602122, | |
| "num_tokens": 4293931.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.1598187141120433, | |
| "epoch": 0.14525231046045636, | |
| "grad_norm": 0.29768019914627075, | |
| "learning_rate": 1.7100235540434442e-05, | |
| "loss": 1.1149, | |
| "mean_token_accuracy": 0.7081062331795692, | |
| "num_tokens": 4331281.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 1.1658945206552744, | |
| "epoch": 0.146560889833974, | |
| "grad_norm": 0.47267428040504456, | |
| "learning_rate": 1.707406438105208e-05, | |
| "loss": 1.126, | |
| "mean_token_accuracy": 0.7110950201749802, | |
| "num_tokens": 4367666.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.107292691990733, | |
| "epoch": 0.1478694692074916, | |
| "grad_norm": 0.30985575914382935, | |
| "learning_rate": 1.7047893221669722e-05, | |
| "loss": 1.0491, | |
| "mean_token_accuracy": 0.7133530873805285, | |
| "num_tokens": 4404165.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 1.0999628335237503, | |
| "epoch": 0.14917804858100925, | |
| "grad_norm": 0.38928133249282837, | |
| "learning_rate": 1.702172206228736e-05, | |
| "loss": 1.0697, | |
| "mean_token_accuracy": 0.7206672951579094, | |
| "num_tokens": 4444145.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.0466942593455315, | |
| "epoch": 0.15048662795452686, | |
| "grad_norm": 0.3638196289539337, | |
| "learning_rate": 1.6995550902905e-05, | |
| "loss": 0.9879, | |
| "mean_token_accuracy": 0.7333550229668617, | |
| "num_tokens": 4486612.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 1.1943504758179189, | |
| "epoch": 0.1517952073280445, | |
| "grad_norm": 0.35094064474105835, | |
| "learning_rate": 1.696937974352264e-05, | |
| "loss": 1.173, | |
| "mean_token_accuracy": 0.7010251984000206, | |
| "num_tokens": 4527971.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.1672972977161407, | |
| "epoch": 0.1531037867015621, | |
| "grad_norm": 0.3714410662651062, | |
| "learning_rate": 1.694320858414028e-05, | |
| "loss": 1.1864, | |
| "mean_token_accuracy": 0.6985016442835331, | |
| "num_tokens": 4565626.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 1.183141415938735, | |
| "epoch": 0.15441236607507974, | |
| "grad_norm": 0.3710881173610687, | |
| "learning_rate": 1.6917037424757918e-05, | |
| "loss": 1.1097, | |
| "mean_token_accuracy": 0.709296465292573, | |
| "num_tokens": 4601255.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.1613754861056804, | |
| "epoch": 0.15572094544859735, | |
| "grad_norm": 0.4122501015663147, | |
| "learning_rate": 1.689086626537556e-05, | |
| "loss": 1.0919, | |
| "mean_token_accuracy": 0.7126895777881146, | |
| "num_tokens": 4635025.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 1.1225487384945154, | |
| "epoch": 0.157029524822115, | |
| "grad_norm": 0.260109007358551, | |
| "learning_rate": 1.6864695105993198e-05, | |
| "loss": 1.074, | |
| "mean_token_accuracy": 0.71206672526896, | |
| "num_tokens": 4675929.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.157029524822115, | |
| "eval_entropy": 1.078299481868744, | |
| "eval_loss": 1.0839170217514038, | |
| "eval_mean_token_accuracy": 0.7196547708511353, | |
| "eval_num_tokens": 4675929.0, | |
| "eval_runtime": 96.9266, | |
| "eval_samples_per_second": 10.317, | |
| "eval_steps_per_second": 1.29, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.214971611648798, | |
| "epoch": 0.15833810419563263, | |
| "grad_norm": 0.4801560044288635, | |
| "learning_rate": 1.6838523946610836e-05, | |
| "loss": 1.1404, | |
| "mean_token_accuracy": 0.6989750042557716, | |
| "num_tokens": 4714005.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 1.1845928117632867, | |
| "epoch": 0.15964668356915024, | |
| "grad_norm": 0.30919569730758667, | |
| "learning_rate": 1.6812352787228478e-05, | |
| "loss": 1.2036, | |
| "mean_token_accuracy": 0.698324055224657, | |
| "num_tokens": 4755777.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.0673103269189597, | |
| "epoch": 0.16095526294266788, | |
| "grad_norm": 0.2788703143596649, | |
| "learning_rate": 1.6786181627846113e-05, | |
| "loss": 0.9835, | |
| "mean_token_accuracy": 0.719642236456275, | |
| "num_tokens": 4795809.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 1.1469231896102428, | |
| "epoch": 0.16226384231618549, | |
| "grad_norm": 0.3465251624584198, | |
| "learning_rate": 1.6760010468463755e-05, | |
| "loss": 1.0888, | |
| "mean_token_accuracy": 0.7080798294395209, | |
| "num_tokens": 4837156.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.1547171102836729, | |
| "epoch": 0.16357242168970312, | |
| "grad_norm": 0.25486519932746887, | |
| "learning_rate": 1.6733839309081393e-05, | |
| "loss": 1.0928, | |
| "mean_token_accuracy": 0.7026465550065041, | |
| "num_tokens": 4877484.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 1.1483009189367295, | |
| "epoch": 0.16488100106322073, | |
| "grad_norm": 0.2877664566040039, | |
| "learning_rate": 1.670766814969903e-05, | |
| "loss": 1.0912, | |
| "mean_token_accuracy": 0.7045180670917034, | |
| "num_tokens": 4916567.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.057371946424246, | |
| "epoch": 0.16618958043673837, | |
| "grad_norm": 0.29588788747787476, | |
| "learning_rate": 1.6681496990316673e-05, | |
| "loss": 1.0147, | |
| "mean_token_accuracy": 0.733672034740448, | |
| "num_tokens": 4955375.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 1.105701708048582, | |
| "epoch": 0.16749815981025598, | |
| "grad_norm": 0.305503785610199, | |
| "learning_rate": 1.665532583093431e-05, | |
| "loss": 1.0907, | |
| "mean_token_accuracy": 0.718507794290781, | |
| "num_tokens": 4994274.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.1482198983430862, | |
| "epoch": 0.16880673918377362, | |
| "grad_norm": 0.34045520424842834, | |
| "learning_rate": 1.662915467155195e-05, | |
| "loss": 1.0737, | |
| "mean_token_accuracy": 0.7120114967226983, | |
| "num_tokens": 5028739.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 1.1510255340486766, | |
| "epoch": 0.17011531855729123, | |
| "grad_norm": 0.3617549240589142, | |
| "learning_rate": 1.660298351216959e-05, | |
| "loss": 1.0989, | |
| "mean_token_accuracy": 0.7049180768430233, | |
| "num_tokens": 5067269.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.1354583825916051, | |
| "epoch": 0.17142389793080887, | |
| "grad_norm": 0.2900081276893616, | |
| "learning_rate": 1.657681235278723e-05, | |
| "loss": 1.07, | |
| "mean_token_accuracy": 0.7162535794079303, | |
| "num_tokens": 5105740.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 1.1124658603221178, | |
| "epoch": 0.1727324773043265, | |
| "grad_norm": 0.31473371386528015, | |
| "learning_rate": 1.655064119340487e-05, | |
| "loss": 1.0722, | |
| "mean_token_accuracy": 0.7151258502155542, | |
| "num_tokens": 5144288.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.1872437849640847, | |
| "epoch": 0.1740410566778441, | |
| "grad_norm": 0.33180883526802063, | |
| "learning_rate": 1.6524470034022507e-05, | |
| "loss": 1.1771, | |
| "mean_token_accuracy": 0.6998372994363308, | |
| "num_tokens": 5182259.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 1.140797757729888, | |
| "epoch": 0.17534963605136175, | |
| "grad_norm": 0.3464796245098114, | |
| "learning_rate": 1.649829887464015e-05, | |
| "loss": 1.088, | |
| "mean_token_accuracy": 0.7082854315638543, | |
| "num_tokens": 5221128.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.1274819798767566, | |
| "epoch": 0.17665821542487936, | |
| "grad_norm": 0.3252648711204529, | |
| "learning_rate": 1.6472127715257787e-05, | |
| "loss": 1.0973, | |
| "mean_token_accuracy": 0.7148232467472553, | |
| "num_tokens": 5256205.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 1.1311011631041765, | |
| "epoch": 0.177966794798397, | |
| "grad_norm": 0.6462693214416504, | |
| "learning_rate": 1.6445956555875425e-05, | |
| "loss": 1.0543, | |
| "mean_token_accuracy": 0.7152682617306709, | |
| "num_tokens": 5292320.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.0446813367307186, | |
| "epoch": 0.1792753741719146, | |
| "grad_norm": 0.28639963269233704, | |
| "learning_rate": 1.6419785396493067e-05, | |
| "loss": 0.9461, | |
| "mean_token_accuracy": 0.7340242311358451, | |
| "num_tokens": 5334639.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 1.1492939487099647, | |
| "epoch": 0.18058395354543225, | |
| "grad_norm": 0.34751448035240173, | |
| "learning_rate": 1.6393614237110705e-05, | |
| "loss": 1.0876, | |
| "mean_token_accuracy": 0.71434987783432, | |
| "num_tokens": 5371003.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.1387122191488743, | |
| "epoch": 0.18189253291894986, | |
| "grad_norm": 0.32928481698036194, | |
| "learning_rate": 1.6367443077728344e-05, | |
| "loss": 1.1427, | |
| "mean_token_accuracy": 0.7110460348427295, | |
| "num_tokens": 5408983.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 1.202421073615551, | |
| "epoch": 0.1832011122924675, | |
| "grad_norm": 0.36161890625953674, | |
| "learning_rate": 1.6341271918345986e-05, | |
| "loss": 1.17, | |
| "mean_token_accuracy": 0.6983021099120379, | |
| "num_tokens": 5449270.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.1832011122924675, | |
| "eval_entropy": 1.0868224625587464, | |
| "eval_loss": 1.078864336013794, | |
| "eval_mean_token_accuracy": 0.7203193211555481, | |
| "eval_num_tokens": 5449270.0, | |
| "eval_runtime": 96.8653, | |
| "eval_samples_per_second": 10.324, | |
| "eval_steps_per_second": 1.29, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.139945473894477, | |
| "epoch": 0.1845096916659851, | |
| "grad_norm": 0.34418225288391113, | |
| "learning_rate": 1.6315100758963624e-05, | |
| "loss": 1.0577, | |
| "mean_token_accuracy": 0.7123764835298061, | |
| "num_tokens": 5486900.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 1.2072007820010184, | |
| "epoch": 0.18581827103950274, | |
| "grad_norm": 0.2719118595123291, | |
| "learning_rate": 1.6288929599581262e-05, | |
| "loss": 1.165, | |
| "mean_token_accuracy": 0.6938376247882843, | |
| "num_tokens": 5526941.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.254874274134636, | |
| "epoch": 0.18712685041302035, | |
| "grad_norm": 0.42757540941238403, | |
| "learning_rate": 1.6262758440198904e-05, | |
| "loss": 1.2349, | |
| "mean_token_accuracy": 0.6856517255306244, | |
| "num_tokens": 5563702.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 1.0055264480412007, | |
| "epoch": 0.188435429786538, | |
| "grad_norm": 0.3122069537639618, | |
| "learning_rate": 1.6236587280816542e-05, | |
| "loss": 0.9459, | |
| "mean_token_accuracy": 0.7404189445078373, | |
| "num_tokens": 5607678.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.1009038139134646, | |
| "epoch": 0.18974400916005563, | |
| "grad_norm": 0.31834590435028076, | |
| "learning_rate": 1.621041612143418e-05, | |
| "loss": 1.0656, | |
| "mean_token_accuracy": 0.7191543571650982, | |
| "num_tokens": 5647310.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 1.0673759788274766, | |
| "epoch": 0.19105258853357324, | |
| "grad_norm": 0.2675130069255829, | |
| "learning_rate": 1.618424496205182e-05, | |
| "loss": 1.0109, | |
| "mean_token_accuracy": 0.7255190499126911, | |
| "num_tokens": 5685639.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.1511727664619684, | |
| "epoch": 0.19236116790709087, | |
| "grad_norm": 0.37566685676574707, | |
| "learning_rate": 1.615807380266946e-05, | |
| "loss": 1.0625, | |
| "mean_token_accuracy": 0.7034743718802929, | |
| "num_tokens": 5720880.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 1.0888585902750492, | |
| "epoch": 0.19366974728060848, | |
| "grad_norm": 0.25365257263183594, | |
| "learning_rate": 1.61319026432871e-05, | |
| "loss": 1.049, | |
| "mean_token_accuracy": 0.7239378560334444, | |
| "num_tokens": 5762559.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.1175981510430575, | |
| "epoch": 0.19497832665412612, | |
| "grad_norm": 0.43423643708229065, | |
| "learning_rate": 1.6105731483904738e-05, | |
| "loss": 1.0806, | |
| "mean_token_accuracy": 0.7193359076976776, | |
| "num_tokens": 5803927.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 1.103783832490444, | |
| "epoch": 0.19628690602764373, | |
| "grad_norm": 0.38531434535980225, | |
| "learning_rate": 1.607956032452238e-05, | |
| "loss": 1.0503, | |
| "mean_token_accuracy": 0.722858403250575, | |
| "num_tokens": 5845809.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.1102315738797188, | |
| "epoch": 0.19759548540116137, | |
| "grad_norm": 0.3240512013435364, | |
| "learning_rate": 1.6053389165140018e-05, | |
| "loss": 1.074, | |
| "mean_token_accuracy": 0.7143935695290565, | |
| "num_tokens": 5884671.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 1.1394222188740968, | |
| "epoch": 0.19890406477467898, | |
| "grad_norm": 0.35714927315711975, | |
| "learning_rate": 1.6027218005757656e-05, | |
| "loss": 1.0505, | |
| "mean_token_accuracy": 0.7142257861793041, | |
| "num_tokens": 5923161.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.10749419555068, | |
| "epoch": 0.20021264414819662, | |
| "grad_norm": 0.28041279315948486, | |
| "learning_rate": 1.6001046846375298e-05, | |
| "loss": 1.0247, | |
| "mean_token_accuracy": 0.7158019613474608, | |
| "num_tokens": 5964592.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 1.1163969319313765, | |
| "epoch": 0.20152122352171423, | |
| "grad_norm": 0.3620986342430115, | |
| "learning_rate": 1.5974875686992933e-05, | |
| "loss": 1.1243, | |
| "mean_token_accuracy": 0.7199557162821293, | |
| "num_tokens": 6002121.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.1485939621925354, | |
| "epoch": 0.20282980289523186, | |
| "grad_norm": 0.28997066617012024, | |
| "learning_rate": 1.5948704527610575e-05, | |
| "loss": 1.1103, | |
| "mean_token_accuracy": 0.7058781541883945, | |
| "num_tokens": 6042026.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 1.163365462049842, | |
| "epoch": 0.2041383822687495, | |
| "grad_norm": 0.3916630148887634, | |
| "learning_rate": 1.5922533368228213e-05, | |
| "loss": 1.1024, | |
| "mean_token_accuracy": 0.7097311306744813, | |
| "num_tokens": 6081711.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.1414872355759145, | |
| "epoch": 0.2054469616422671, | |
| "grad_norm": 0.3654806315898895, | |
| "learning_rate": 1.589636220884585e-05, | |
| "loss": 1.1151, | |
| "mean_token_accuracy": 0.7132655665278435, | |
| "num_tokens": 6124022.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 1.1237910758703946, | |
| "epoch": 0.20675554101578475, | |
| "grad_norm": 0.30389606952667236, | |
| "learning_rate": 1.5870191049463493e-05, | |
| "loss": 1.071, | |
| "mean_token_accuracy": 0.7095677696168423, | |
| "num_tokens": 6163807.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.108309706300497, | |
| "epoch": 0.20806412038930236, | |
| "grad_norm": 0.3702613413333893, | |
| "learning_rate": 1.584401989008113e-05, | |
| "loss": 1.0634, | |
| "mean_token_accuracy": 0.7195238072425127, | |
| "num_tokens": 6201204.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 1.114043417200446, | |
| "epoch": 0.20937269976282, | |
| "grad_norm": 0.35386136174201965, | |
| "learning_rate": 1.581784873069877e-05, | |
| "loss": 1.0579, | |
| "mean_token_accuracy": 0.7175327345728875, | |
| "num_tokens": 6241249.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.20937269976282, | |
| "eval_entropy": 1.0886374835968018, | |
| "eval_loss": 1.0736411809921265, | |
| "eval_mean_token_accuracy": 0.7216972298622132, | |
| "eval_num_tokens": 6241249.0, | |
| "eval_runtime": 96.8086, | |
| "eval_samples_per_second": 10.33, | |
| "eval_steps_per_second": 1.291, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.1533120274543762, | |
| "epoch": 0.2106812791363376, | |
| "grad_norm": 0.37774205207824707, | |
| "learning_rate": 1.579167757131641e-05, | |
| "loss": 1.1071, | |
| "mean_token_accuracy": 0.7110585495829582, | |
| "num_tokens": 6280255.0, | |
| "step": 805 | |
| }, | |
| { | |
| "entropy": 1.1299995694309473, | |
| "epoch": 0.21198985850985524, | |
| "grad_norm": 0.3383270502090454, | |
| "learning_rate": 1.576550641193405e-05, | |
| "loss": 1.1167, | |
| "mean_token_accuracy": 0.7196012400090694, | |
| "num_tokens": 6320822.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.132519034296274, | |
| "epoch": 0.21329843788337285, | |
| "grad_norm": 0.4295906126499176, | |
| "learning_rate": 1.573933525255169e-05, | |
| "loss": 1.0924, | |
| "mean_token_accuracy": 0.7042114794254303, | |
| "num_tokens": 6356129.0, | |
| "step": 815 | |
| }, | |
| { | |
| "entropy": 1.0832153180614115, | |
| "epoch": 0.2146070172568905, | |
| "grad_norm": 0.3158726394176483, | |
| "learning_rate": 1.571316409316933e-05, | |
| "loss": 1.0307, | |
| "mean_token_accuracy": 0.7189163245260716, | |
| "num_tokens": 6396277.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.1116347532719373, | |
| "epoch": 0.2159155966304081, | |
| "grad_norm": 0.41325512528419495, | |
| "learning_rate": 1.568699293378697e-05, | |
| "loss": 1.0844, | |
| "mean_token_accuracy": 0.7163894921541214, | |
| "num_tokens": 6437087.0, | |
| "step": 825 | |
| }, | |
| { | |
| "entropy": 1.143670390546322, | |
| "epoch": 0.21722417600392574, | |
| "grad_norm": 0.4080452620983124, | |
| "learning_rate": 1.5660821774404607e-05, | |
| "loss": 1.1135, | |
| "mean_token_accuracy": 0.7132163152098656, | |
| "num_tokens": 6476092.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.0523447673767805, | |
| "epoch": 0.21853275537744335, | |
| "grad_norm": 0.4133390784263611, | |
| "learning_rate": 1.5634650615022245e-05, | |
| "loss": 0.9998, | |
| "mean_token_accuracy": 0.7262923561036587, | |
| "num_tokens": 6513582.0, | |
| "step": 835 | |
| }, | |
| { | |
| "entropy": 1.1258170191198587, | |
| "epoch": 0.219841334750961, | |
| "grad_norm": 0.30965185165405273, | |
| "learning_rate": 1.5608479455639887e-05, | |
| "loss": 1.0456, | |
| "mean_token_accuracy": 0.7124281242489815, | |
| "num_tokens": 6548126.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 0.9893411479890346, | |
| "epoch": 0.22114991412447862, | |
| "grad_norm": 0.2973790168762207, | |
| "learning_rate": 1.5582308296257525e-05, | |
| "loss": 0.9799, | |
| "mean_token_accuracy": 0.7382751934230327, | |
| "num_tokens": 6586710.0, | |
| "step": 845 | |
| }, | |
| { | |
| "entropy": 1.0379876332357525, | |
| "epoch": 0.22245849349799623, | |
| "grad_norm": 0.3356630206108093, | |
| "learning_rate": 1.5556137136875164e-05, | |
| "loss": 0.996, | |
| "mean_token_accuracy": 0.7286129504442215, | |
| "num_tokens": 6619347.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.1024742640554905, | |
| "epoch": 0.22376707287151387, | |
| "grad_norm": 0.2636205554008484, | |
| "learning_rate": 1.5529965977492806e-05, | |
| "loss": 1.0735, | |
| "mean_token_accuracy": 0.7184439640492201, | |
| "num_tokens": 6659706.0, | |
| "step": 855 | |
| }, | |
| { | |
| "entropy": 1.0713739044964314, | |
| "epoch": 0.22507565224503148, | |
| "grad_norm": 0.31522178649902344, | |
| "learning_rate": 1.5503794818110444e-05, | |
| "loss": 1.0024, | |
| "mean_token_accuracy": 0.7307705961167812, | |
| "num_tokens": 6697262.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.104831437766552, | |
| "epoch": 0.22638423161854912, | |
| "grad_norm": 0.4077572822570801, | |
| "learning_rate": 1.5477623658728082e-05, | |
| "loss": 1.0503, | |
| "mean_token_accuracy": 0.714413607865572, | |
| "num_tokens": 6736296.0, | |
| "step": 865 | |
| }, | |
| { | |
| "entropy": 1.111717356555164, | |
| "epoch": 0.22769281099206673, | |
| "grad_norm": 0.29983147978782654, | |
| "learning_rate": 1.5451452499345724e-05, | |
| "loss": 1.0415, | |
| "mean_token_accuracy": 0.7202403925359249, | |
| "num_tokens": 6771577.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.0804973103106021, | |
| "epoch": 0.22900139036558437, | |
| "grad_norm": 0.3723820149898529, | |
| "learning_rate": 1.5425281339963362e-05, | |
| "loss": 1.0643, | |
| "mean_token_accuracy": 0.7192892145365477, | |
| "num_tokens": 6809238.0, | |
| "step": 875 | |
| }, | |
| { | |
| "entropy": 1.1236145053058864, | |
| "epoch": 0.23030996973910198, | |
| "grad_norm": 0.3762100338935852, | |
| "learning_rate": 1.5399110180581e-05, | |
| "loss": 1.1046, | |
| "mean_token_accuracy": 0.7134812079370022, | |
| "num_tokens": 6849361.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.1618900313973426, | |
| "epoch": 0.23161854911261961, | |
| "grad_norm": 0.30742284655570984, | |
| "learning_rate": 1.537293902119864e-05, | |
| "loss": 1.1275, | |
| "mean_token_accuracy": 0.7040756050497293, | |
| "num_tokens": 6888648.0, | |
| "step": 885 | |
| }, | |
| { | |
| "entropy": 1.0937749415636062, | |
| "epoch": 0.23292712848613722, | |
| "grad_norm": 0.3493000864982605, | |
| "learning_rate": 1.534676786181628e-05, | |
| "loss": 0.9861, | |
| "mean_token_accuracy": 0.7299770966172219, | |
| "num_tokens": 6930330.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.1693835414946079, | |
| "epoch": 0.23423570785965486, | |
| "grad_norm": 0.4292133152484894, | |
| "learning_rate": 1.532059670243392e-05, | |
| "loss": 1.1322, | |
| "mean_token_accuracy": 0.7005651116371154, | |
| "num_tokens": 6967178.0, | |
| "step": 895 | |
| }, | |
| { | |
| "entropy": 1.1351615231484176, | |
| "epoch": 0.2355442872331725, | |
| "grad_norm": 0.375841349363327, | |
| "learning_rate": 1.5294425543051558e-05, | |
| "loss": 1.0663, | |
| "mean_token_accuracy": 0.7153576008975506, | |
| "num_tokens": 7004358.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2355442872331725, | |
| "eval_entropy": 1.0517715344429015, | |
| "eval_loss": 1.0706533193588257, | |
| "eval_mean_token_accuracy": 0.7221157221794129, | |
| "eval_num_tokens": 7004358.0, | |
| "eval_runtime": 97.0639, | |
| "eval_samples_per_second": 10.302, | |
| "eval_steps_per_second": 1.288, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.0132040306925774, | |
| "epoch": 0.2368528666066901, | |
| "grad_norm": 0.3356113135814667, | |
| "learning_rate": 1.52682543836692e-05, | |
| "loss": 1.0007, | |
| "mean_token_accuracy": 0.7337090380489826, | |
| "num_tokens": 7045486.0, | |
| "step": 905 | |
| }, | |
| { | |
| "entropy": 1.0505661655217409, | |
| "epoch": 0.23816144598020775, | |
| "grad_norm": 0.3069000244140625, | |
| "learning_rate": 1.5242083224286836e-05, | |
| "loss": 1.0113, | |
| "mean_token_accuracy": 0.7300900310277939, | |
| "num_tokens": 7087578.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.0948154201731086, | |
| "epoch": 0.23947002535372536, | |
| "grad_norm": 0.26865532994270325, | |
| "learning_rate": 1.5215912064904476e-05, | |
| "loss": 0.996, | |
| "mean_token_accuracy": 0.7241026997566223, | |
| "num_tokens": 7125621.0, | |
| "step": 915 | |
| }, | |
| { | |
| "entropy": 1.0862870909273625, | |
| "epoch": 0.240778604727243, | |
| "grad_norm": 0.3076420724391937, | |
| "learning_rate": 1.5189740905522116e-05, | |
| "loss": 1.0411, | |
| "mean_token_accuracy": 0.7218753002583981, | |
| "num_tokens": 7163248.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.0920290663838386, | |
| "epoch": 0.2420871841007606, | |
| "grad_norm": 0.35735777020454407, | |
| "learning_rate": 1.5163569746139755e-05, | |
| "loss": 0.9907, | |
| "mean_token_accuracy": 0.7253928408026695, | |
| "num_tokens": 7198313.0, | |
| "step": 925 | |
| }, | |
| { | |
| "entropy": 1.0561616513878107, | |
| "epoch": 0.24339576347427824, | |
| "grad_norm": 0.3752039670944214, | |
| "learning_rate": 1.5137398586757395e-05, | |
| "loss": 1.0076, | |
| "mean_token_accuracy": 0.7270361024886369, | |
| "num_tokens": 7236629.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.192596261948347, | |
| "epoch": 0.24470434284779585, | |
| "grad_norm": 0.3303165137767792, | |
| "learning_rate": 1.5111227427375035e-05, | |
| "loss": 1.1446, | |
| "mean_token_accuracy": 0.7021023813635111, | |
| "num_tokens": 7278903.0, | |
| "step": 935 | |
| }, | |
| { | |
| "entropy": 1.1245649460703135, | |
| "epoch": 0.2460129222213135, | |
| "grad_norm": 0.43512144684791565, | |
| "learning_rate": 1.5085056267992673e-05, | |
| "loss": 1.0783, | |
| "mean_token_accuracy": 0.7149506479501724, | |
| "num_tokens": 7314092.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.1568658344447613, | |
| "epoch": 0.2473215015948311, | |
| "grad_norm": 0.2600148320198059, | |
| "learning_rate": 1.5058885108610313e-05, | |
| "loss": 1.1493, | |
| "mean_token_accuracy": 0.7064739305526018, | |
| "num_tokens": 7354852.0, | |
| "step": 945 | |
| }, | |
| { | |
| "entropy": 1.1390374675393105, | |
| "epoch": 0.24863008096834874, | |
| "grad_norm": 0.4230680465698242, | |
| "learning_rate": 1.5032713949227953e-05, | |
| "loss": 1.051, | |
| "mean_token_accuracy": 0.7147066026926041, | |
| "num_tokens": 7391988.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.1157769251614809, | |
| "epoch": 0.24993866034186635, | |
| "grad_norm": 0.31027132272720337, | |
| "learning_rate": 1.500654278984559e-05, | |
| "loss": 1.1055, | |
| "mean_token_accuracy": 0.7114299058914184, | |
| "num_tokens": 7430698.0, | |
| "step": 955 | |
| }, | |
| { | |
| "entropy": 1.1084700468927622, | |
| "epoch": 0.251247239715384, | |
| "grad_norm": 0.3883765935897827, | |
| "learning_rate": 1.498037163046323e-05, | |
| "loss": 1.0371, | |
| "mean_token_accuracy": 0.7152060814201832, | |
| "num_tokens": 7466746.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.1067252047359943, | |
| "epoch": 0.2525558190889016, | |
| "grad_norm": 0.3586679697036743, | |
| "learning_rate": 1.495420047108087e-05, | |
| "loss": 1.091, | |
| "mean_token_accuracy": 0.7141210325062275, | |
| "num_tokens": 7508827.0, | |
| "step": 965 | |
| }, | |
| { | |
| "entropy": 1.1788568448275327, | |
| "epoch": 0.25386439846241926, | |
| "grad_norm": 0.28028714656829834, | |
| "learning_rate": 1.4928029311698508e-05, | |
| "loss": 1.1108, | |
| "mean_token_accuracy": 0.7068845115602016, | |
| "num_tokens": 7549412.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.1229758808389305, | |
| "epoch": 0.25517297783593684, | |
| "grad_norm": 0.25545772910118103, | |
| "learning_rate": 1.4901858152316149e-05, | |
| "loss": 1.0713, | |
| "mean_token_accuracy": 0.7074509769678116, | |
| "num_tokens": 7587152.0, | |
| "step": 975 | |
| }, | |
| { | |
| "entropy": 1.0951409205794334, | |
| "epoch": 0.2564815572094545, | |
| "grad_norm": 0.4516986608505249, | |
| "learning_rate": 1.4875686992933789e-05, | |
| "loss": 0.9969, | |
| "mean_token_accuracy": 0.723530513048172, | |
| "num_tokens": 7623760.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.1083605211228131, | |
| "epoch": 0.2577901365829721, | |
| "grad_norm": 0.28635019063949585, | |
| "learning_rate": 1.4849515833551427e-05, | |
| "loss": 1.0367, | |
| "mean_token_accuracy": 0.7231942892074585, | |
| "num_tokens": 7670277.0, | |
| "step": 985 | |
| }, | |
| { | |
| "entropy": 1.140053043141961, | |
| "epoch": 0.25909871595648976, | |
| "grad_norm": 0.29921141266822815, | |
| "learning_rate": 1.4823344674169067e-05, | |
| "loss": 1.0606, | |
| "mean_token_accuracy": 0.7078870553523302, | |
| "num_tokens": 7711405.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.1648910887539388, | |
| "epoch": 0.26040729533000734, | |
| "grad_norm": 0.44739824533462524, | |
| "learning_rate": 1.4797173514786707e-05, | |
| "loss": 1.0833, | |
| "mean_token_accuracy": 0.7147607050836087, | |
| "num_tokens": 7745597.0, | |
| "step": 995 | |
| }, | |
| { | |
| "entropy": 1.1015649508684873, | |
| "epoch": 0.261715874703525, | |
| "grad_norm": 0.3616364598274231, | |
| "learning_rate": 1.4771002355404345e-05, | |
| "loss": 1.102, | |
| "mean_token_accuracy": 0.7119447905570269, | |
| "num_tokens": 7778271.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.261715874703525, | |
| "eval_entropy": 1.0673753538131714, | |
| "eval_loss": 1.0669485330581665, | |
| "eval_mean_token_accuracy": 0.7229559454917908, | |
| "eval_num_tokens": 7778271.0, | |
| "eval_runtime": 96.9367, | |
| "eval_samples_per_second": 10.316, | |
| "eval_steps_per_second": 1.29, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.1813726715743542, | |
| "epoch": 0.2630244540770426, | |
| "grad_norm": 0.34745144844055176, | |
| "learning_rate": 1.4744831196021986e-05, | |
| "loss": 1.1606, | |
| "mean_token_accuracy": 0.7018857698887586, | |
| "num_tokens": 7820880.0, | |
| "step": 1005 | |
| }, | |
| { | |
| "entropy": 1.1542737431824208, | |
| "epoch": 0.26433303345056025, | |
| "grad_norm": 0.37115049362182617, | |
| "learning_rate": 1.4718660036639626e-05, | |
| "loss": 1.102, | |
| "mean_token_accuracy": 0.7048508331179619, | |
| "num_tokens": 7858918.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 1.0566180381923913, | |
| "epoch": 0.26564161282407783, | |
| "grad_norm": 0.3299511671066284, | |
| "learning_rate": 1.4692488877257262e-05, | |
| "loss": 0.9832, | |
| "mean_token_accuracy": 0.7326587349176407, | |
| "num_tokens": 7896026.0, | |
| "step": 1015 | |
| }, | |
| { | |
| "entropy": 1.131032995507121, | |
| "epoch": 0.26695019219759547, | |
| "grad_norm": 0.3021943271160126, | |
| "learning_rate": 1.4666317717874902e-05, | |
| "loss": 1.1154, | |
| "mean_token_accuracy": 0.7120835833251477, | |
| "num_tokens": 7936716.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 1.1042870879173279, | |
| "epoch": 0.2682587715711131, | |
| "grad_norm": 0.47742941975593567, | |
| "learning_rate": 1.4640146558492542e-05, | |
| "loss": 1.047, | |
| "mean_token_accuracy": 0.7116286411881447, | |
| "num_tokens": 7973759.0, | |
| "step": 1025 | |
| }, | |
| { | |
| "entropy": 1.1014587018638848, | |
| "epoch": 0.26956735094463075, | |
| "grad_norm": 0.36218902468681335, | |
| "learning_rate": 1.461397539911018e-05, | |
| "loss": 1.0494, | |
| "mean_token_accuracy": 0.7173497840762139, | |
| "num_tokens": 8012730.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 1.1734120152890681, | |
| "epoch": 0.2708759303181484, | |
| "grad_norm": 0.38173866271972656, | |
| "learning_rate": 1.458780423972782e-05, | |
| "loss": 1.0801, | |
| "mean_token_accuracy": 0.7082495357841253, | |
| "num_tokens": 8053786.0, | |
| "step": 1035 | |
| }, | |
| { | |
| "entropy": 1.102143411897123, | |
| "epoch": 0.27218450969166597, | |
| "grad_norm": 0.32196053862571716, | |
| "learning_rate": 1.4561633080345461e-05, | |
| "loss": 1.0723, | |
| "mean_token_accuracy": 0.7233434859663248, | |
| "num_tokens": 8092132.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 1.0193447671830653, | |
| "epoch": 0.2734930890651836, | |
| "grad_norm": 0.3907255530357361, | |
| "learning_rate": 1.4535461920963101e-05, | |
| "loss": 0.9895, | |
| "mean_token_accuracy": 0.7300974868237973, | |
| "num_tokens": 8131337.0, | |
| "step": 1045 | |
| }, | |
| { | |
| "entropy": 1.105736630409956, | |
| "epoch": 0.27480166843870124, | |
| "grad_norm": 0.564200758934021, | |
| "learning_rate": 1.450929076158074e-05, | |
| "loss": 1.0985, | |
| "mean_token_accuracy": 0.7164596065878868, | |
| "num_tokens": 8170291.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.0954801928251983, | |
| "epoch": 0.2761102478122189, | |
| "grad_norm": 0.3417541980743408, | |
| "learning_rate": 1.448311960219838e-05, | |
| "loss": 1.0295, | |
| "mean_token_accuracy": 0.7208357889205217, | |
| "num_tokens": 8205888.0, | |
| "step": 1055 | |
| }, | |
| { | |
| "entropy": 1.164321320876479, | |
| "epoch": 0.27741882718573646, | |
| "grad_norm": 0.450543075799942, | |
| "learning_rate": 1.445694844281602e-05, | |
| "loss": 1.0975, | |
| "mean_token_accuracy": 0.7097847059369087, | |
| "num_tokens": 8242915.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 1.1172142535448075, | |
| "epoch": 0.2787274065592541, | |
| "grad_norm": 0.3983106315135956, | |
| "learning_rate": 1.4430777283433656e-05, | |
| "loss": 1.0618, | |
| "mean_token_accuracy": 0.7233660608530045, | |
| "num_tokens": 8280049.0, | |
| "step": 1065 | |
| }, | |
| { | |
| "entropy": 1.1097337387502193, | |
| "epoch": 0.28003598593277174, | |
| "grad_norm": 0.3336636424064636, | |
| "learning_rate": 1.4404606124051296e-05, | |
| "loss": 1.0713, | |
| "mean_token_accuracy": 0.7141048096120357, | |
| "num_tokens": 8320724.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 1.143090207502246, | |
| "epoch": 0.2813445653062894, | |
| "grad_norm": 0.47186407446861267, | |
| "learning_rate": 1.4378434964668936e-05, | |
| "loss": 1.1169, | |
| "mean_token_accuracy": 0.7122757855802775, | |
| "num_tokens": 8355846.0, | |
| "step": 1075 | |
| }, | |
| { | |
| "entropy": 1.116348212212324, | |
| "epoch": 0.282653144679807, | |
| "grad_norm": 0.3499598205089569, | |
| "learning_rate": 1.4352263805286575e-05, | |
| "loss": 1.0442, | |
| "mean_token_accuracy": 0.7156722906976938, | |
| "num_tokens": 8394668.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 1.2300225652754306, | |
| "epoch": 0.2839617240533246, | |
| "grad_norm": 0.3478023111820221, | |
| "learning_rate": 1.4326092645904215e-05, | |
| "loss": 1.1722, | |
| "mean_token_accuracy": 0.6905759438872338, | |
| "num_tokens": 8431587.0, | |
| "step": 1085 | |
| }, | |
| { | |
| "entropy": 1.0534675564616918, | |
| "epoch": 0.28527030342684223, | |
| "grad_norm": 0.38736996054649353, | |
| "learning_rate": 1.4299921486521855e-05, | |
| "loss": 1.0075, | |
| "mean_token_accuracy": 0.7261179834604263, | |
| "num_tokens": 8469819.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 1.0744053564965725, | |
| "epoch": 0.28657888280035987, | |
| "grad_norm": 0.32573196291923523, | |
| "learning_rate": 1.4273750327139493e-05, | |
| "loss": 1.0065, | |
| "mean_token_accuracy": 0.727313157171011, | |
| "num_tokens": 8507289.0, | |
| "step": 1095 | |
| }, | |
| { | |
| "entropy": 1.1115598980337382, | |
| "epoch": 0.2878874621738775, | |
| "grad_norm": 0.2983573079109192, | |
| "learning_rate": 1.4247579167757133e-05, | |
| "loss": 1.0318, | |
| "mean_token_accuracy": 0.7211863253265619, | |
| "num_tokens": 8541734.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.2878874621738775, | |
| "eval_entropy": 1.0663088278770447, | |
| "eval_loss": 1.0640983581542969, | |
| "eval_mean_token_accuracy": 0.7233748874664306, | |
| "eval_num_tokens": 8541734.0, | |
| "eval_runtime": 97.2334, | |
| "eval_samples_per_second": 10.285, | |
| "eval_steps_per_second": 1.286, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.179681045934558, | |
| "epoch": 0.2891960415473951, | |
| "grad_norm": 0.34421974420547485, | |
| "learning_rate": 1.4221408008374773e-05, | |
| "loss": 1.1522, | |
| "mean_token_accuracy": 0.7020174067467451, | |
| "num_tokens": 8579590.0, | |
| "step": 1105 | |
| }, | |
| { | |
| "entropy": 1.23528895676136, | |
| "epoch": 0.2905046209209127, | |
| "grad_norm": 0.4845016300678253, | |
| "learning_rate": 1.4195236848992412e-05, | |
| "loss": 1.2052, | |
| "mean_token_accuracy": 0.6924681778997183, | |
| "num_tokens": 8616216.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 1.071528448536992, | |
| "epoch": 0.29181320029443036, | |
| "grad_norm": 0.3960973620414734, | |
| "learning_rate": 1.4169065689610052e-05, | |
| "loss": 0.9994, | |
| "mean_token_accuracy": 0.7225042834877968, | |
| "num_tokens": 8654775.0, | |
| "step": 1115 | |
| }, | |
| { | |
| "entropy": 1.2246194936335086, | |
| "epoch": 0.293121779667948, | |
| "grad_norm": 0.42761850357055664, | |
| "learning_rate": 1.4142894530227692e-05, | |
| "loss": 1.1149, | |
| "mean_token_accuracy": 0.6936006706207991, | |
| "num_tokens": 8690533.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 1.0734559996053576, | |
| "epoch": 0.2944303590414656, | |
| "grad_norm": 0.3908097445964813, | |
| "learning_rate": 1.4116723370845328e-05, | |
| "loss": 1.0524, | |
| "mean_token_accuracy": 0.7204558227211237, | |
| "num_tokens": 8727564.0, | |
| "step": 1125 | |
| }, | |
| { | |
| "entropy": 1.0594063133001328, | |
| "epoch": 0.2957389384149832, | |
| "grad_norm": 0.28768423199653625, | |
| "learning_rate": 1.4090552211462969e-05, | |
| "loss": 1.0328, | |
| "mean_token_accuracy": 0.7205640614032746, | |
| "num_tokens": 8774139.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.0696671523153782, | |
| "epoch": 0.29704751778850086, | |
| "grad_norm": 0.4257560968399048, | |
| "learning_rate": 1.4064381052080609e-05, | |
| "loss": 1.0058, | |
| "mean_token_accuracy": 0.7245784349739551, | |
| "num_tokens": 8810357.0, | |
| "step": 1135 | |
| }, | |
| { | |
| "entropy": 1.1237424857914449, | |
| "epoch": 0.2983560971620185, | |
| "grad_norm": 0.3298150599002838, | |
| "learning_rate": 1.4038209892698247e-05, | |
| "loss": 1.1134, | |
| "mean_token_accuracy": 0.7139571741223335, | |
| "num_tokens": 8850219.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 1.0590598836541176, | |
| "epoch": 0.29966467653553613, | |
| "grad_norm": 0.36441755294799805, | |
| "learning_rate": 1.4012038733315887e-05, | |
| "loss": 1.0017, | |
| "mean_token_accuracy": 0.72717116586864, | |
| "num_tokens": 8890728.0, | |
| "step": 1145 | |
| }, | |
| { | |
| "entropy": 1.0748774517327546, | |
| "epoch": 0.3009732559090537, | |
| "grad_norm": 0.5299174189567566, | |
| "learning_rate": 1.3985867573933527e-05, | |
| "loss": 1.0444, | |
| "mean_token_accuracy": 0.7247283719480038, | |
| "num_tokens": 8931167.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.2320478869602085, | |
| "epoch": 0.30228183528257135, | |
| "grad_norm": 0.31828537583351135, | |
| "learning_rate": 1.3959696414551165e-05, | |
| "loss": 1.1425, | |
| "mean_token_accuracy": 0.7029214788228273, | |
| "num_tokens": 8971179.0, | |
| "step": 1155 | |
| }, | |
| { | |
| "entropy": 1.0877997063100338, | |
| "epoch": 0.303590414656089, | |
| "grad_norm": 0.3202095329761505, | |
| "learning_rate": 1.3933525255168806e-05, | |
| "loss": 1.0868, | |
| "mean_token_accuracy": 0.7259886477142572, | |
| "num_tokens": 9012122.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.1524120923131704, | |
| "epoch": 0.30489899402960663, | |
| "grad_norm": 0.44416385889053345, | |
| "learning_rate": 1.3907354095786446e-05, | |
| "loss": 1.1781, | |
| "mean_token_accuracy": 0.7089681778103113, | |
| "num_tokens": 9051879.0, | |
| "step": 1165 | |
| }, | |
| { | |
| "entropy": 1.142632443830371, | |
| "epoch": 0.3062075734031242, | |
| "grad_norm": 0.29014092683792114, | |
| "learning_rate": 1.3881182936404082e-05, | |
| "loss": 1.1183, | |
| "mean_token_accuracy": 0.7098891779780387, | |
| "num_tokens": 9095080.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 1.0738845858722925, | |
| "epoch": 0.30751615277664185, | |
| "grad_norm": 0.5643961429595947, | |
| "learning_rate": 1.3855011777021722e-05, | |
| "loss": 1.0341, | |
| "mean_token_accuracy": 0.7218465633690357, | |
| "num_tokens": 9134553.0, | |
| "step": 1175 | |
| }, | |
| { | |
| "entropy": 1.0997203588485718, | |
| "epoch": 0.3088247321501595, | |
| "grad_norm": 0.3256031274795532, | |
| "learning_rate": 1.3828840617639362e-05, | |
| "loss": 1.0443, | |
| "mean_token_accuracy": 0.7164226226508618, | |
| "num_tokens": 9171451.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 1.1001708198338747, | |
| "epoch": 0.3101333115236771, | |
| "grad_norm": 0.38000190258026123, | |
| "learning_rate": 1.3802669458257e-05, | |
| "loss": 1.0945, | |
| "mean_token_accuracy": 0.7193384803831577, | |
| "num_tokens": 9212655.0, | |
| "step": 1185 | |
| }, | |
| { | |
| "entropy": 1.1442682899534702, | |
| "epoch": 0.3114418908971947, | |
| "grad_norm": 0.38140004873275757, | |
| "learning_rate": 1.377649829887464e-05, | |
| "loss": 1.1116, | |
| "mean_token_accuracy": 0.7097606465220452, | |
| "num_tokens": 9251378.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 1.0966169007122517, | |
| "epoch": 0.31275047027071234, | |
| "grad_norm": 0.30173954367637634, | |
| "learning_rate": 1.3750327139492281e-05, | |
| "loss": 1.0569, | |
| "mean_token_accuracy": 0.7154310643672943, | |
| "num_tokens": 9290376.0, | |
| "step": 1195 | |
| }, | |
| { | |
| "entropy": 1.112706720456481, | |
| "epoch": 0.31405904964423, | |
| "grad_norm": 0.3677958846092224, | |
| "learning_rate": 1.372415598010992e-05, | |
| "loss": 1.1293, | |
| "mean_token_accuracy": 0.7175452932715416, | |
| "num_tokens": 9335450.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.31405904964423, | |
| "eval_entropy": 1.0616401386260987, | |
| "eval_loss": 1.0616909265518188, | |
| "eval_mean_token_accuracy": 0.7236422438621521, | |
| "eval_num_tokens": 9335450.0, | |
| "eval_runtime": 97.3061, | |
| "eval_samples_per_second": 10.277, | |
| "eval_steps_per_second": 1.285, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.0197914224117994, | |
| "epoch": 0.3153676290177476, | |
| "grad_norm": 0.303828626871109, | |
| "learning_rate": 1.369798482072756e-05, | |
| "loss": 0.9773, | |
| "mean_token_accuracy": 0.7335654478520155, | |
| "num_tokens": 9376757.0, | |
| "step": 1205 | |
| }, | |
| { | |
| "entropy": 1.0958842366933823, | |
| "epoch": 0.31667620839126526, | |
| "grad_norm": 0.31567996740341187, | |
| "learning_rate": 1.36718136613452e-05, | |
| "loss": 1.1095, | |
| "mean_token_accuracy": 0.7230709217488765, | |
| "num_tokens": 9412399.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 1.0366449020802975, | |
| "epoch": 0.31798478776478284, | |
| "grad_norm": 0.28812670707702637, | |
| "learning_rate": 1.364564250196284e-05, | |
| "loss": 0.9809, | |
| "mean_token_accuracy": 0.7372792772948742, | |
| "num_tokens": 9459659.0, | |
| "step": 1215 | |
| }, | |
| { | |
| "entropy": 1.16514153778553, | |
| "epoch": 0.3192933671383005, | |
| "grad_norm": 0.32817167043685913, | |
| "learning_rate": 1.3619471342580476e-05, | |
| "loss": 1.1274, | |
| "mean_token_accuracy": 0.7073730494827032, | |
| "num_tokens": 9501119.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 1.1919928897172212, | |
| "epoch": 0.3206019465118181, | |
| "grad_norm": 0.4222601056098938, | |
| "learning_rate": 1.3593300183198118e-05, | |
| "loss": 1.1383, | |
| "mean_token_accuracy": 0.7020205046981574, | |
| "num_tokens": 9541572.0, | |
| "step": 1225 | |
| }, | |
| { | |
| "entropy": 1.0665121775120496, | |
| "epoch": 0.32191052588533575, | |
| "grad_norm": 0.37065136432647705, | |
| "learning_rate": 1.3567129023815758e-05, | |
| "loss": 1.0282, | |
| "mean_token_accuracy": 0.7204833775758743, | |
| "num_tokens": 9582686.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 1.1947314314544202, | |
| "epoch": 0.32321910525885333, | |
| "grad_norm": 0.34813007712364197, | |
| "learning_rate": 1.3540957864433395e-05, | |
| "loss": 1.1388, | |
| "mean_token_accuracy": 0.6964055716991424, | |
| "num_tokens": 9623500.0, | |
| "step": 1235 | |
| }, | |
| { | |
| "entropy": 1.0864916004240512, | |
| "epoch": 0.32452768463237097, | |
| "grad_norm": 0.2981452941894531, | |
| "learning_rate": 1.3514786705051035e-05, | |
| "loss": 1.0402, | |
| "mean_token_accuracy": 0.7253331996500492, | |
| "num_tokens": 9667330.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 1.088518501445651, | |
| "epoch": 0.3258362640058886, | |
| "grad_norm": 0.27413734793663025, | |
| "learning_rate": 1.3488615545668675e-05, | |
| "loss": 1.0039, | |
| "mean_token_accuracy": 0.7201201483607292, | |
| "num_tokens": 9707928.0, | |
| "step": 1245 | |
| }, | |
| { | |
| "entropy": 1.0232492092996837, | |
| "epoch": 0.32714484337940625, | |
| "grad_norm": 0.3049759864807129, | |
| "learning_rate": 1.3462444386286313e-05, | |
| "loss": 1.0155, | |
| "mean_token_accuracy": 0.731953501701355, | |
| "num_tokens": 9749414.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.1411474995315074, | |
| "epoch": 0.3284534227529239, | |
| "grad_norm": 0.30817753076553345, | |
| "learning_rate": 1.3436273226903953e-05, | |
| "loss": 1.1117, | |
| "mean_token_accuracy": 0.7109464205801487, | |
| "num_tokens": 9787037.0, | |
| "step": 1255 | |
| }, | |
| { | |
| "entropy": 1.0607344008982182, | |
| "epoch": 0.32976200212644147, | |
| "grad_norm": 0.2996397316455841, | |
| "learning_rate": 1.3410102067521593e-05, | |
| "loss": 0.9988, | |
| "mean_token_accuracy": 0.7263243019580841, | |
| "num_tokens": 9823917.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 1.1400364696979524, | |
| "epoch": 0.3310705814999591, | |
| "grad_norm": 0.394529789686203, | |
| "learning_rate": 1.3383930908139232e-05, | |
| "loss": 1.1346, | |
| "mean_token_accuracy": 0.7115379109978676, | |
| "num_tokens": 9863897.0, | |
| "step": 1265 | |
| }, | |
| { | |
| "entropy": 1.1153028715401887, | |
| "epoch": 0.33237916087347674, | |
| "grad_norm": 0.38152188062667847, | |
| "learning_rate": 1.3357759748756872e-05, | |
| "loss": 1.0826, | |
| "mean_token_accuracy": 0.7159910671412945, | |
| "num_tokens": 9907161.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 1.1335538432002068, | |
| "epoch": 0.3336877402469944, | |
| "grad_norm": 0.3795076012611389, | |
| "learning_rate": 1.3331588589374512e-05, | |
| "loss": 1.062, | |
| "mean_token_accuracy": 0.712515490502119, | |
| "num_tokens": 9941050.0, | |
| "step": 1275 | |
| }, | |
| { | |
| "entropy": 1.201156435534358, | |
| "epoch": 0.33499631962051196, | |
| "grad_norm": 0.31430941820144653, | |
| "learning_rate": 1.3305417429992148e-05, | |
| "loss": 1.1755, | |
| "mean_token_accuracy": 0.7030155394226313, | |
| "num_tokens": 9982204.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 1.0916207611560822, | |
| "epoch": 0.3363048989940296, | |
| "grad_norm": 0.3562738597393036, | |
| "learning_rate": 1.3279246270609789e-05, | |
| "loss": 1.03, | |
| "mean_token_accuracy": 0.7254784423857927, | |
| "num_tokens": 10023122.0, | |
| "step": 1285 | |
| }, | |
| { | |
| "entropy": 1.1606000121682882, | |
| "epoch": 0.33761347836754724, | |
| "grad_norm": 0.4184325337409973, | |
| "learning_rate": 1.3253075111227429e-05, | |
| "loss": 1.1373, | |
| "mean_token_accuracy": 0.7046990938484669, | |
| "num_tokens": 10064911.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 1.0977379951626063, | |
| "epoch": 0.3389220577410649, | |
| "grad_norm": 0.3197747766971588, | |
| "learning_rate": 1.3226903951845067e-05, | |
| "loss": 1.0689, | |
| "mean_token_accuracy": 0.7257760524749756, | |
| "num_tokens": 10100407.0, | |
| "step": 1295 | |
| }, | |
| { | |
| "entropy": 1.074541300162673, | |
| "epoch": 0.34023063711458246, | |
| "grad_norm": 0.40503931045532227, | |
| "learning_rate": 1.3200732792462707e-05, | |
| "loss": 0.967, | |
| "mean_token_accuracy": 0.7171443022787571, | |
| "num_tokens": 10134360.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.34023063711458246, | |
| "eval_entropy": 1.04598343706131, | |
| "eval_loss": 1.0586909055709839, | |
| "eval_mean_token_accuracy": 0.7240785593986512, | |
| "eval_num_tokens": 10134360.0, | |
| "eval_runtime": 97.2108, | |
| "eval_samples_per_second": 10.287, | |
| "eval_steps_per_second": 1.286, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.093527865409851, | |
| "epoch": 0.3415392164881001, | |
| "grad_norm": 0.30255845189094543, | |
| "learning_rate": 1.3174561633080347e-05, | |
| "loss": 1.0996, | |
| "mean_token_accuracy": 0.7116521965712309, | |
| "num_tokens": 10174760.0, | |
| "step": 1305 | |
| }, | |
| { | |
| "entropy": 1.0523213766515256, | |
| "epoch": 0.34284779586161773, | |
| "grad_norm": 0.2793543040752411, | |
| "learning_rate": 1.3148390473697985e-05, | |
| "loss": 0.9752, | |
| "mean_token_accuracy": 0.7246817748993635, | |
| "num_tokens": 10216071.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 1.0620372116565704, | |
| "epoch": 0.34415637523513537, | |
| "grad_norm": 0.31549394130706787, | |
| "learning_rate": 1.3122219314315626e-05, | |
| "loss": 0.992, | |
| "mean_token_accuracy": 0.7261651441454887, | |
| "num_tokens": 10256598.0, | |
| "step": 1315 | |
| }, | |
| { | |
| "entropy": 1.0895736794918776, | |
| "epoch": 0.345464954608653, | |
| "grad_norm": 0.3750867545604706, | |
| "learning_rate": 1.3096048154933266e-05, | |
| "loss": 1.051, | |
| "mean_token_accuracy": 0.7179807629436255, | |
| "num_tokens": 10294800.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 1.0776633590459823, | |
| "epoch": 0.3467735339821706, | |
| "grad_norm": 0.4244779944419861, | |
| "learning_rate": 1.3069876995550902e-05, | |
| "loss": 1.007, | |
| "mean_token_accuracy": 0.7257129170000554, | |
| "num_tokens": 10334846.0, | |
| "step": 1325 | |
| }, | |
| { | |
| "entropy": 1.1380624793469907, | |
| "epoch": 0.3480821133556882, | |
| "grad_norm": 0.3876670300960541, | |
| "learning_rate": 1.3043705836168542e-05, | |
| "loss": 1.1296, | |
| "mean_token_accuracy": 0.7143970631062985, | |
| "num_tokens": 10374248.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 1.055607095360756, | |
| "epoch": 0.34939069272920587, | |
| "grad_norm": 0.5531731247901917, | |
| "learning_rate": 1.3017534676786182e-05, | |
| "loss": 0.9648, | |
| "mean_token_accuracy": 0.7317230701446533, | |
| "num_tokens": 10410746.0, | |
| "step": 1335 | |
| }, | |
| { | |
| "entropy": 1.134307411313057, | |
| "epoch": 0.3506992721027235, | |
| "grad_norm": 0.3122623860836029, | |
| "learning_rate": 1.299136351740382e-05, | |
| "loss": 1.078, | |
| "mean_token_accuracy": 0.7120365314185619, | |
| "num_tokens": 10452938.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 1.0759996108710765, | |
| "epoch": 0.3520078514762411, | |
| "grad_norm": 0.35110828280448914, | |
| "learning_rate": 1.296519235802146e-05, | |
| "loss": 0.9849, | |
| "mean_token_accuracy": 0.7297365363687277, | |
| "num_tokens": 10491903.0, | |
| "step": 1345 | |
| }, | |
| { | |
| "entropy": 1.1492443475872278, | |
| "epoch": 0.3533164308497587, | |
| "grad_norm": 0.35788699984550476, | |
| "learning_rate": 1.2939021198639101e-05, | |
| "loss": 1.1878, | |
| "mean_token_accuracy": 0.7069006145000458, | |
| "num_tokens": 10527375.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.0579143922775984, | |
| "epoch": 0.35462501022327636, | |
| "grad_norm": 0.35787340998649597, | |
| "learning_rate": 1.291285003925674e-05, | |
| "loss": 1.0007, | |
| "mean_token_accuracy": 0.7237559407949448, | |
| "num_tokens": 10568018.0, | |
| "step": 1355 | |
| }, | |
| { | |
| "entropy": 1.1339404121041299, | |
| "epoch": 0.355933589596794, | |
| "grad_norm": 0.4546626806259155, | |
| "learning_rate": 1.288667887987438e-05, | |
| "loss": 1.1295, | |
| "mean_token_accuracy": 0.7201502379029989, | |
| "num_tokens": 10607681.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 1.0893705368041993, | |
| "epoch": 0.3572421689703116, | |
| "grad_norm": 0.45456886291503906, | |
| "learning_rate": 1.286050772049202e-05, | |
| "loss": 1.0304, | |
| "mean_token_accuracy": 0.7344392716884613, | |
| "num_tokens": 10641980.0, | |
| "step": 1365 | |
| }, | |
| { | |
| "entropy": 1.0794930804520846, | |
| "epoch": 0.3585507483438292, | |
| "grad_norm": 0.3907780945301056, | |
| "learning_rate": 1.283433656110966e-05, | |
| "loss": 1.0827, | |
| "mean_token_accuracy": 0.7246263407170772, | |
| "num_tokens": 10683159.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 1.0798350811004638, | |
| "epoch": 0.35985932771734686, | |
| "grad_norm": 0.49161893129348755, | |
| "learning_rate": 1.2808165401727298e-05, | |
| "loss": 1.0915, | |
| "mean_token_accuracy": 0.7175786912441253, | |
| "num_tokens": 10719560.0, | |
| "step": 1375 | |
| }, | |
| { | |
| "entropy": 1.131149498745799, | |
| "epoch": 0.3611679070908645, | |
| "grad_norm": 0.4854622185230255, | |
| "learning_rate": 1.2781994242344938e-05, | |
| "loss": 1.1285, | |
| "mean_token_accuracy": 0.7150968369096518, | |
| "num_tokens": 10759050.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 1.0728477615863086, | |
| "epoch": 0.36247648646438213, | |
| "grad_norm": 0.42540574073791504, | |
| "learning_rate": 1.2755823082962578e-05, | |
| "loss": 1.0358, | |
| "mean_token_accuracy": 0.724468483030796, | |
| "num_tokens": 10800630.0, | |
| "step": 1385 | |
| }, | |
| { | |
| "entropy": 1.1225707672536374, | |
| "epoch": 0.3637850658378997, | |
| "grad_norm": 0.44094985723495483, | |
| "learning_rate": 1.2729651923580215e-05, | |
| "loss": 1.0344, | |
| "mean_token_accuracy": 0.7132398471236229, | |
| "num_tokens": 10835582.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 1.127060130983591, | |
| "epoch": 0.36509364521141735, | |
| "grad_norm": 0.3640246093273163, | |
| "learning_rate": 1.2703480764197855e-05, | |
| "loss": 1.0752, | |
| "mean_token_accuracy": 0.717224034294486, | |
| "num_tokens": 10873989.0, | |
| "step": 1395 | |
| }, | |
| { | |
| "entropy": 1.110959093272686, | |
| "epoch": 0.366402224584935, | |
| "grad_norm": 0.3808118402957916, | |
| "learning_rate": 1.2677309604815495e-05, | |
| "loss": 1.0352, | |
| "mean_token_accuracy": 0.7157318696379662, | |
| "num_tokens": 10914540.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.366402224584935, | |
| "eval_entropy": 1.057036506652832, | |
| "eval_loss": 1.056433081626892, | |
| "eval_mean_token_accuracy": 0.724943666934967, | |
| "eval_num_tokens": 10914540.0, | |
| "eval_runtime": 97.2457, | |
| "eval_samples_per_second": 10.283, | |
| "eval_steps_per_second": 1.285, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.1179429598152637, | |
| "epoch": 0.3677108039584526, | |
| "grad_norm": 0.3942883610725403, | |
| "learning_rate": 1.2651138445433133e-05, | |
| "loss": 1.0674, | |
| "mean_token_accuracy": 0.7157964497804642, | |
| "num_tokens": 10951332.0, | |
| "step": 1405 | |
| }, | |
| { | |
| "entropy": 1.144387823715806, | |
| "epoch": 0.3690193833319702, | |
| "grad_norm": 0.35214006900787354, | |
| "learning_rate": 1.2624967286050773e-05, | |
| "loss": 1.1239, | |
| "mean_token_accuracy": 0.7134873129427433, | |
| "num_tokens": 10990644.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 1.057513090223074, | |
| "epoch": 0.37032796270548785, | |
| "grad_norm": 0.3838125169277191, | |
| "learning_rate": 1.2598796126668413e-05, | |
| "loss": 1.0198, | |
| "mean_token_accuracy": 0.7325119793415069, | |
| "num_tokens": 11032177.0, | |
| "step": 1415 | |
| }, | |
| { | |
| "entropy": 1.088934962451458, | |
| "epoch": 0.3716365420790055, | |
| "grad_norm": 0.26814350485801697, | |
| "learning_rate": 1.2572624967286052e-05, | |
| "loss": 1.0808, | |
| "mean_token_accuracy": 0.7172140318900346, | |
| "num_tokens": 11072735.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 1.1592457950115205, | |
| "epoch": 0.3729451214525231, | |
| "grad_norm": 0.45993542671203613, | |
| "learning_rate": 1.2546453807903692e-05, | |
| "loss": 1.1143, | |
| "mean_token_accuracy": 0.7036133039742708, | |
| "num_tokens": 11111309.0, | |
| "step": 1425 | |
| }, | |
| { | |
| "entropy": 1.1127744242548943, | |
| "epoch": 0.3742537008260407, | |
| "grad_norm": 0.3038633167743683, | |
| "learning_rate": 1.2520282648521332e-05, | |
| "loss": 1.0659, | |
| "mean_token_accuracy": 0.725579933822155, | |
| "num_tokens": 11150368.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 1.135872596874833, | |
| "epoch": 0.37556228019955834, | |
| "grad_norm": 0.32042160630226135, | |
| "learning_rate": 1.2494111489138968e-05, | |
| "loss": 1.1148, | |
| "mean_token_accuracy": 0.7169803373515606, | |
| "num_tokens": 11186513.0, | |
| "step": 1435 | |
| }, | |
| { | |
| "entropy": 1.087304012849927, | |
| "epoch": 0.376870859573076, | |
| "grad_norm": 0.3336869180202484, | |
| "learning_rate": 1.2467940329756609e-05, | |
| "loss": 1.0445, | |
| "mean_token_accuracy": 0.7192518580704927, | |
| "num_tokens": 11225174.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 1.0319055780768394, | |
| "epoch": 0.3781794389465936, | |
| "grad_norm": 0.43524104356765747, | |
| "learning_rate": 1.2441769170374249e-05, | |
| "loss": 0.9855, | |
| "mean_token_accuracy": 0.7250500712543726, | |
| "num_tokens": 11261841.0, | |
| "step": 1445 | |
| }, | |
| { | |
| "entropy": 1.0503356985747814, | |
| "epoch": 0.37948801832011125, | |
| "grad_norm": 0.34029972553253174, | |
| "learning_rate": 1.2415598010991887e-05, | |
| "loss": 0.9427, | |
| "mean_token_accuracy": 0.7291811019182205, | |
| "num_tokens": 11301122.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.135501291975379, | |
| "epoch": 0.38079659769362884, | |
| "grad_norm": 0.34890979528427124, | |
| "learning_rate": 1.2389426851609527e-05, | |
| "loss": 1.0706, | |
| "mean_token_accuracy": 0.707598365843296, | |
| "num_tokens": 11341491.0, | |
| "step": 1455 | |
| }, | |
| { | |
| "entropy": 1.012824462354183, | |
| "epoch": 0.3821051770671465, | |
| "grad_norm": 0.44555002450942993, | |
| "learning_rate": 1.2363255692227167e-05, | |
| "loss": 0.986, | |
| "mean_token_accuracy": 0.7334959991276264, | |
| "num_tokens": 11382277.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 1.084730239585042, | |
| "epoch": 0.3834137564406641, | |
| "grad_norm": 0.34672921895980835, | |
| "learning_rate": 1.2337084532844805e-05, | |
| "loss": 1.0185, | |
| "mean_token_accuracy": 0.7293158903717994, | |
| "num_tokens": 11420312.0, | |
| "step": 1465 | |
| }, | |
| { | |
| "entropy": 1.0931822203099728, | |
| "epoch": 0.38472233581418175, | |
| "grad_norm": 0.4738580882549286, | |
| "learning_rate": 1.2310913373462446e-05, | |
| "loss": 1.0744, | |
| "mean_token_accuracy": 0.7275880422443152, | |
| "num_tokens": 11458742.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 1.0959295086562633, | |
| "epoch": 0.38603091518769933, | |
| "grad_norm": 0.3458891212940216, | |
| "learning_rate": 1.2284742214080086e-05, | |
| "loss": 1.0425, | |
| "mean_token_accuracy": 0.7294268280267715, | |
| "num_tokens": 11494946.0, | |
| "step": 1475 | |
| }, | |
| { | |
| "entropy": 1.1744199082255364, | |
| "epoch": 0.38733949456121697, | |
| "grad_norm": 0.3247629404067993, | |
| "learning_rate": 1.2258571054697724e-05, | |
| "loss": 1.1929, | |
| "mean_token_accuracy": 0.6963723137974739, | |
| "num_tokens": 11531053.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 1.022578400745988, | |
| "epoch": 0.3886480739347346, | |
| "grad_norm": 0.37479135394096375, | |
| "learning_rate": 1.2232399895315364e-05, | |
| "loss": 0.9592, | |
| "mean_token_accuracy": 0.7337148085236549, | |
| "num_tokens": 11569356.0, | |
| "step": 1485 | |
| }, | |
| { | |
| "entropy": 1.0597552750259638, | |
| "epoch": 0.38995665330825224, | |
| "grad_norm": 0.34326809644699097, | |
| "learning_rate": 1.2206228735933004e-05, | |
| "loss": 1.03, | |
| "mean_token_accuracy": 0.7236659746617079, | |
| "num_tokens": 11609142.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 1.038928609713912, | |
| "epoch": 0.3912652326817699, | |
| "grad_norm": 0.33052825927734375, | |
| "learning_rate": 1.218005757655064e-05, | |
| "loss": 1.0297, | |
| "mean_token_accuracy": 0.7271129328757524, | |
| "num_tokens": 11649979.0, | |
| "step": 1495 | |
| }, | |
| { | |
| "entropy": 1.149324530735612, | |
| "epoch": 0.39257381205528746, | |
| "grad_norm": 0.44736945629119873, | |
| "learning_rate": 1.2153886417168281e-05, | |
| "loss": 1.0566, | |
| "mean_token_accuracy": 0.7146747212857008, | |
| "num_tokens": 11685504.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.39257381205528746, | |
| "eval_entropy": 1.061660517692566, | |
| "eval_loss": 1.0545203685760498, | |
| "eval_mean_token_accuracy": 0.7252409801483154, | |
| "eval_num_tokens": 11685504.0, | |
| "eval_runtime": 97.242, | |
| "eval_samples_per_second": 10.284, | |
| "eval_steps_per_second": 1.285, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.1069697327911854, | |
| "epoch": 0.3938823914288051, | |
| "grad_norm": 0.7664462327957153, | |
| "learning_rate": 1.2127715257785921e-05, | |
| "loss": 1.0101, | |
| "mean_token_accuracy": 0.7186468083411455, | |
| "num_tokens": 11720415.0, | |
| "step": 1505 | |
| }, | |
| { | |
| "entropy": 1.0142726931720971, | |
| "epoch": 0.39519097080232274, | |
| "grad_norm": 0.3830406069755554, | |
| "learning_rate": 1.210154409840356e-05, | |
| "loss": 0.974, | |
| "mean_token_accuracy": 0.736117148399353, | |
| "num_tokens": 11756577.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 1.112984489277005, | |
| "epoch": 0.3964995501758404, | |
| "grad_norm": 0.41094282269477844, | |
| "learning_rate": 1.20753729390212e-05, | |
| "loss": 1.1002, | |
| "mean_token_accuracy": 0.7189074017107486, | |
| "num_tokens": 11794872.0, | |
| "step": 1515 | |
| }, | |
| { | |
| "entropy": 1.0922920301556587, | |
| "epoch": 0.39780812954935796, | |
| "grad_norm": 0.37885573506355286, | |
| "learning_rate": 1.204920177963884e-05, | |
| "loss": 1.0302, | |
| "mean_token_accuracy": 0.7164245635271073, | |
| "num_tokens": 11833530.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 1.1610772363841533, | |
| "epoch": 0.3991167089228756, | |
| "grad_norm": 0.44092655181884766, | |
| "learning_rate": 1.2023030620256478e-05, | |
| "loss": 1.1124, | |
| "mean_token_accuracy": 0.6998836103826761, | |
| "num_tokens": 11865444.0, | |
| "step": 1525 | |
| }, | |
| { | |
| "entropy": 1.1576842796057463, | |
| "epoch": 0.40042528829639323, | |
| "grad_norm": 0.44296759366989136, | |
| "learning_rate": 1.1996859460874118e-05, | |
| "loss": 1.1521, | |
| "mean_token_accuracy": 0.7068853817880154, | |
| "num_tokens": 11901975.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 1.0967234898358584, | |
| "epoch": 0.40173386766991087, | |
| "grad_norm": 0.33487918972969055, | |
| "learning_rate": 1.1970688301491758e-05, | |
| "loss": 1.0064, | |
| "mean_token_accuracy": 0.7251658782362937, | |
| "num_tokens": 11939696.0, | |
| "step": 1535 | |
| }, | |
| { | |
| "entropy": 1.086240155249834, | |
| "epoch": 0.40304244704342845, | |
| "grad_norm": 0.40726640820503235, | |
| "learning_rate": 1.1944517142109398e-05, | |
| "loss": 1.0593, | |
| "mean_token_accuracy": 0.725240446254611, | |
| "num_tokens": 11982185.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 1.1482622426003217, | |
| "epoch": 0.4043510264169461, | |
| "grad_norm": 0.4323379397392273, | |
| "learning_rate": 1.1918345982727035e-05, | |
| "loss": 1.128, | |
| "mean_token_accuracy": 0.7092867191880942, | |
| "num_tokens": 12017859.0, | |
| "step": 1545 | |
| }, | |
| { | |
| "entropy": 1.2151045732200145, | |
| "epoch": 0.40565960579046373, | |
| "grad_norm": 0.3303086459636688, | |
| "learning_rate": 1.1892174823344675e-05, | |
| "loss": 1.2045, | |
| "mean_token_accuracy": 0.6960071615874768, | |
| "num_tokens": 12056052.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.0982510183006524, | |
| "epoch": 0.40696818516398137, | |
| "grad_norm": 0.3505324721336365, | |
| "learning_rate": 1.1866003663962315e-05, | |
| "loss": 1.067, | |
| "mean_token_accuracy": 0.7211838360875845, | |
| "num_tokens": 12093787.0, | |
| "step": 1555 | |
| }, | |
| { | |
| "entropy": 1.0351288244128227, | |
| "epoch": 0.408276764537499, | |
| "grad_norm": 0.4866773188114166, | |
| "learning_rate": 1.1839832504579953e-05, | |
| "loss": 1.0177, | |
| "mean_token_accuracy": 0.7380380086600781, | |
| "num_tokens": 12135954.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 1.0588519033044577, | |
| "epoch": 0.4095853439110166, | |
| "grad_norm": 0.35260552167892456, | |
| "learning_rate": 1.1813661345197593e-05, | |
| "loss": 0.9948, | |
| "mean_token_accuracy": 0.7311153151094913, | |
| "num_tokens": 12177684.0, | |
| "step": 1565 | |
| }, | |
| { | |
| "entropy": 1.09910380654037, | |
| "epoch": 0.4108939232845342, | |
| "grad_norm": 0.3076630234718323, | |
| "learning_rate": 1.1787490185815233e-05, | |
| "loss": 1.0572, | |
| "mean_token_accuracy": 0.7178509868681431, | |
| "num_tokens": 12217097.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 1.0710554268211125, | |
| "epoch": 0.41220250265805186, | |
| "grad_norm": 0.6353772878646851, | |
| "learning_rate": 1.1761319026432872e-05, | |
| "loss": 1.0566, | |
| "mean_token_accuracy": 0.7280903398990631, | |
| "num_tokens": 12254118.0, | |
| "step": 1575 | |
| }, | |
| { | |
| "entropy": 1.0847718585282564, | |
| "epoch": 0.4135110820315695, | |
| "grad_norm": 0.5102863311767578, | |
| "learning_rate": 1.1735147867050512e-05, | |
| "loss": 1.0074, | |
| "mean_token_accuracy": 0.7276991017162799, | |
| "num_tokens": 12290736.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 1.1992057055234908, | |
| "epoch": 0.4148196614050871, | |
| "grad_norm": 0.46092355251312256, | |
| "learning_rate": 1.1708976707668152e-05, | |
| "loss": 1.1854, | |
| "mean_token_accuracy": 0.7027558118104935, | |
| "num_tokens": 12327361.0, | |
| "step": 1585 | |
| }, | |
| { | |
| "entropy": 1.1261769212782382, | |
| "epoch": 0.4161282407786047, | |
| "grad_norm": 0.2733989357948303, | |
| "learning_rate": 1.168280554828579e-05, | |
| "loss": 1.0773, | |
| "mean_token_accuracy": 0.7125700250267982, | |
| "num_tokens": 12364890.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 1.116732070967555, | |
| "epoch": 0.41743682015212236, | |
| "grad_norm": 0.24640727043151855, | |
| "learning_rate": 1.165663438890343e-05, | |
| "loss": 1.1195, | |
| "mean_token_accuracy": 0.7179068107157945, | |
| "num_tokens": 12408137.0, | |
| "step": 1595 | |
| }, | |
| { | |
| "entropy": 1.119725486636162, | |
| "epoch": 0.41874539952564, | |
| "grad_norm": 0.38413721323013306, | |
| "learning_rate": 1.163046322952107e-05, | |
| "loss": 1.02, | |
| "mean_token_accuracy": 0.7142953380942345, | |
| "num_tokens": 12448886.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.41874539952564, | |
| "eval_entropy": 1.0526617636680604, | |
| "eval_loss": 1.0522558689117432, | |
| "eval_mean_token_accuracy": 0.7257419657707215, | |
| "eval_num_tokens": 12448886.0, | |
| "eval_runtime": 97.091, | |
| "eval_samples_per_second": 10.3, | |
| "eval_steps_per_second": 1.287, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.052151035144925, | |
| "epoch": 0.4200539788991576, | |
| "grad_norm": 0.3078325390815735, | |
| "learning_rate": 1.1604292070138707e-05, | |
| "loss": 1.0539, | |
| "mean_token_accuracy": 0.7285561367869378, | |
| "num_tokens": 12488545.0, | |
| "step": 1605 | |
| }, | |
| { | |
| "entropy": 1.143933541327715, | |
| "epoch": 0.4213625582726752, | |
| "grad_norm": 0.4508214592933655, | |
| "learning_rate": 1.1578120910756347e-05, | |
| "loss": 1.1383, | |
| "mean_token_accuracy": 0.7101572744548321, | |
| "num_tokens": 12524775.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 1.1741201549768447, | |
| "epoch": 0.42267113764619285, | |
| "grad_norm": 0.4099365770816803, | |
| "learning_rate": 1.1551949751373987e-05, | |
| "loss": 1.1825, | |
| "mean_token_accuracy": 0.705308374390006, | |
| "num_tokens": 12560784.0, | |
| "step": 1615 | |
| }, | |
| { | |
| "entropy": 1.0800459958612918, | |
| "epoch": 0.4239797170197105, | |
| "grad_norm": 0.3584926128387451, | |
| "learning_rate": 1.1525778591991625e-05, | |
| "loss": 1.0387, | |
| "mean_token_accuracy": 0.7256841942667961, | |
| "num_tokens": 12603010.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 1.1276030663400889, | |
| "epoch": 0.4252882963932281, | |
| "grad_norm": 0.5926274657249451, | |
| "learning_rate": 1.1499607432609266e-05, | |
| "loss": 1.0812, | |
| "mean_token_accuracy": 0.718872818350792, | |
| "num_tokens": 12639637.0, | |
| "step": 1625 | |
| }, | |
| { | |
| "entropy": 1.0730494752526283, | |
| "epoch": 0.4265968757667457, | |
| "grad_norm": 0.3367457389831543, | |
| "learning_rate": 1.1473436273226906e-05, | |
| "loss": 1.0183, | |
| "mean_token_accuracy": 0.7222284369170666, | |
| "num_tokens": 12685003.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 1.1390976216644049, | |
| "epoch": 0.42790545514026335, | |
| "grad_norm": 0.3196088373661041, | |
| "learning_rate": 1.1447265113844544e-05, | |
| "loss": 1.1352, | |
| "mean_token_accuracy": 0.7116539262235164, | |
| "num_tokens": 12725882.0, | |
| "step": 1635 | |
| }, | |
| { | |
| "entropy": 1.1160973783582449, | |
| "epoch": 0.429214034513781, | |
| "grad_norm": 0.30303776264190674, | |
| "learning_rate": 1.1421093954462184e-05, | |
| "loss": 1.045, | |
| "mean_token_accuracy": 0.7158118285238743, | |
| "num_tokens": 12769557.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 1.1582866314798594, | |
| "epoch": 0.4305226138872986, | |
| "grad_norm": 0.3513560891151428, | |
| "learning_rate": 1.1394922795079824e-05, | |
| "loss": 1.1353, | |
| "mean_token_accuracy": 0.705193604901433, | |
| "num_tokens": 12810697.0, | |
| "step": 1645 | |
| }, | |
| { | |
| "entropy": 1.1496058501303197, | |
| "epoch": 0.4318311932608162, | |
| "grad_norm": 0.2923238277435303, | |
| "learning_rate": 1.136875163569746e-05, | |
| "loss": 1.0884, | |
| "mean_token_accuracy": 0.7144812878221274, | |
| "num_tokens": 12851256.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.1143131528049708, | |
| "epoch": 0.43313977263433384, | |
| "grad_norm": 0.3065350651741028, | |
| "learning_rate": 1.1342580476315101e-05, | |
| "loss": 1.0523, | |
| "mean_token_accuracy": 0.7173210971057415, | |
| "num_tokens": 12888965.0, | |
| "step": 1655 | |
| }, | |
| { | |
| "entropy": 1.0787921320647, | |
| "epoch": 0.4344483520078515, | |
| "grad_norm": 0.46374326944351196, | |
| "learning_rate": 1.1316409316932741e-05, | |
| "loss": 1.0793, | |
| "mean_token_accuracy": 0.7249518293887377, | |
| "num_tokens": 12929333.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 1.057104966789484, | |
| "epoch": 0.4357569313813691, | |
| "grad_norm": 0.313374400138855, | |
| "learning_rate": 1.129023815755038e-05, | |
| "loss": 1.0395, | |
| "mean_token_accuracy": 0.7259028911590576, | |
| "num_tokens": 12968877.0, | |
| "step": 1665 | |
| }, | |
| { | |
| "entropy": 1.1018395122140645, | |
| "epoch": 0.4370655107548867, | |
| "grad_norm": 0.5729189515113831, | |
| "learning_rate": 1.126406699816802e-05, | |
| "loss": 1.0302, | |
| "mean_token_accuracy": 0.7193635780364275, | |
| "num_tokens": 13009336.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 1.090810688957572, | |
| "epoch": 0.43837409012840434, | |
| "grad_norm": 0.34041526913642883, | |
| "learning_rate": 1.123789583878566e-05, | |
| "loss": 1.0004, | |
| "mean_token_accuracy": 0.7279005717486143, | |
| "num_tokens": 13050299.0, | |
| "step": 1675 | |
| }, | |
| { | |
| "entropy": 0.9902068875730038, | |
| "epoch": 0.439682669501922, | |
| "grad_norm": 0.3354148864746094, | |
| "learning_rate": 1.1211724679403298e-05, | |
| "loss": 0.9542, | |
| "mean_token_accuracy": 0.7408786326646805, | |
| "num_tokens": 13090891.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 0.9622725404798984, | |
| "epoch": 0.4409912488754396, | |
| "grad_norm": 0.36764782667160034, | |
| "learning_rate": 1.1185553520020938e-05, | |
| "loss": 0.9176, | |
| "mean_token_accuracy": 0.7487996619194746, | |
| "num_tokens": 13132101.0, | |
| "step": 1685 | |
| }, | |
| { | |
| "entropy": 1.045450020954013, | |
| "epoch": 0.44229982824895725, | |
| "grad_norm": 0.37813687324523926, | |
| "learning_rate": 1.1159382360638578e-05, | |
| "loss": 0.9415, | |
| "mean_token_accuracy": 0.7361387588083744, | |
| "num_tokens": 13170933.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 1.0522997014224529, | |
| "epoch": 0.44360840762247483, | |
| "grad_norm": 0.382914274930954, | |
| "learning_rate": 1.1133211201256216e-05, | |
| "loss": 1.0357, | |
| "mean_token_accuracy": 0.7225256580859423, | |
| "num_tokens": 13209472.0, | |
| "step": 1695 | |
| }, | |
| { | |
| "entropy": 1.0627464193850755, | |
| "epoch": 0.44491698699599247, | |
| "grad_norm": 0.6186394691467285, | |
| "learning_rate": 1.1107040041873856e-05, | |
| "loss": 0.9805, | |
| "mean_token_accuracy": 0.7301313698291778, | |
| "num_tokens": 13243159.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.44491698699599247, | |
| "eval_entropy": 1.048878930568695, | |
| "eval_loss": 1.0501657724380493, | |
| "eval_mean_token_accuracy": 0.7263216118812561, | |
| "eval_num_tokens": 13243159.0, | |
| "eval_runtime": 97.3502, | |
| "eval_samples_per_second": 10.272, | |
| "eval_steps_per_second": 1.284, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.0235866460949183, | |
| "epoch": 0.4462255663695101, | |
| "grad_norm": 0.3209091126918793, | |
| "learning_rate": 1.1080868882491496e-05, | |
| "loss": 1.0014, | |
| "mean_token_accuracy": 0.739182323589921, | |
| "num_tokens": 13280837.0, | |
| "step": 1705 | |
| }, | |
| { | |
| "entropy": 1.0558000404387713, | |
| "epoch": 0.44753414574302774, | |
| "grad_norm": 0.30199235677719116, | |
| "learning_rate": 1.1054697723109137e-05, | |
| "loss": 0.9891, | |
| "mean_token_accuracy": 0.7261330507695675, | |
| "num_tokens": 13322890.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 1.121560701727867, | |
| "epoch": 0.4488427251165453, | |
| "grad_norm": 0.4387257993221283, | |
| "learning_rate": 1.1028526563726773e-05, | |
| "loss": 1.1191, | |
| "mean_token_accuracy": 0.7114489011466503, | |
| "num_tokens": 13360229.0, | |
| "step": 1715 | |
| }, | |
| { | |
| "entropy": 1.0835831377655267, | |
| "epoch": 0.45015130449006296, | |
| "grad_norm": 0.44212666153907776, | |
| "learning_rate": 1.1002355404344413e-05, | |
| "loss": 1.0534, | |
| "mean_token_accuracy": 0.7208398774266243, | |
| "num_tokens": 13401284.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 1.0752512661740183, | |
| "epoch": 0.4514598838635806, | |
| "grad_norm": 0.304918110370636, | |
| "learning_rate": 1.0976184244962053e-05, | |
| "loss": 1.0347, | |
| "mean_token_accuracy": 0.725695389136672, | |
| "num_tokens": 13444485.0, | |
| "step": 1725 | |
| }, | |
| { | |
| "entropy": 1.021533490344882, | |
| "epoch": 0.45276846323709824, | |
| "grad_norm": 0.3151404857635498, | |
| "learning_rate": 1.0950013085579692e-05, | |
| "loss": 0.9656, | |
| "mean_token_accuracy": 0.7328935824334621, | |
| "num_tokens": 13486444.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 1.1548058629035949, | |
| "epoch": 0.4540770426106159, | |
| "grad_norm": 0.4337512254714966, | |
| "learning_rate": 1.0923841926197332e-05, | |
| "loss": 1.1178, | |
| "mean_token_accuracy": 0.7139813594520092, | |
| "num_tokens": 13527431.0, | |
| "step": 1735 | |
| }, | |
| { | |
| "entropy": 1.142104067094624, | |
| "epoch": 0.45538562198413346, | |
| "grad_norm": 0.36979472637176514, | |
| "learning_rate": 1.0897670766814972e-05, | |
| "loss": 1.1315, | |
| "mean_token_accuracy": 0.7120837710797787, | |
| "num_tokens": 13567478.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 1.1160997115075588, | |
| "epoch": 0.4566942013576511, | |
| "grad_norm": 0.3845500946044922, | |
| "learning_rate": 1.087149960743261e-05, | |
| "loss": 1.1022, | |
| "mean_token_accuracy": 0.712579957395792, | |
| "num_tokens": 13605160.0, | |
| "step": 1745 | |
| }, | |
| { | |
| "entropy": 0.9988198220729828, | |
| "epoch": 0.45800278073116873, | |
| "grad_norm": 0.38837000727653503, | |
| "learning_rate": 1.084532844805025e-05, | |
| "loss": 0.9329, | |
| "mean_token_accuracy": 0.7446095444262028, | |
| "num_tokens": 13644314.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.130447569489479, | |
| "epoch": 0.4593113601046864, | |
| "grad_norm": 0.3902071714401245, | |
| "learning_rate": 1.081915728866789e-05, | |
| "loss": 1.0794, | |
| "mean_token_accuracy": 0.7083112396299839, | |
| "num_tokens": 13681672.0, | |
| "step": 1755 | |
| }, | |
| { | |
| "entropy": 1.0452111046761274, | |
| "epoch": 0.46061993947820395, | |
| "grad_norm": 0.4102812111377716, | |
| "learning_rate": 1.0792986129285527e-05, | |
| "loss": 1.0083, | |
| "mean_token_accuracy": 0.7271155752241611, | |
| "num_tokens": 13723858.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 0.9923233393579721, | |
| "epoch": 0.4619285188517216, | |
| "grad_norm": 0.3408261239528656, | |
| "learning_rate": 1.0766814969903167e-05, | |
| "loss": 0.9552, | |
| "mean_token_accuracy": 0.7382691666483879, | |
| "num_tokens": 13762274.0, | |
| "step": 1765 | |
| }, | |
| { | |
| "entropy": 1.0694189239293337, | |
| "epoch": 0.46323709822523923, | |
| "grad_norm": 0.33313289284706116, | |
| "learning_rate": 1.0740643810520807e-05, | |
| "loss": 1.0309, | |
| "mean_token_accuracy": 0.728821600228548, | |
| "num_tokens": 13804541.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 1.148886876925826, | |
| "epoch": 0.46454567759875687, | |
| "grad_norm": 0.4039045572280884, | |
| "learning_rate": 1.0714472651138445e-05, | |
| "loss": 1.1428, | |
| "mean_token_accuracy": 0.7021496020257473, | |
| "num_tokens": 13842698.0, | |
| "step": 1775 | |
| }, | |
| { | |
| "entropy": 1.059429519250989, | |
| "epoch": 0.46585425697227445, | |
| "grad_norm": 0.3325613737106323, | |
| "learning_rate": 1.0688301491756086e-05, | |
| "loss": 1.0277, | |
| "mean_token_accuracy": 0.7256958331912756, | |
| "num_tokens": 13883697.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 1.1187500689178704, | |
| "epoch": 0.4671628363457921, | |
| "grad_norm": 0.3969448208808899, | |
| "learning_rate": 1.0662130332373726e-05, | |
| "loss": 1.0818, | |
| "mean_token_accuracy": 0.7165641494095325, | |
| "num_tokens": 13922509.0, | |
| "step": 1785 | |
| }, | |
| { | |
| "entropy": 1.1411572575569153, | |
| "epoch": 0.4684714157193097, | |
| "grad_norm": 0.391754150390625, | |
| "learning_rate": 1.0635959172991364e-05, | |
| "loss": 1.1149, | |
| "mean_token_accuracy": 0.7164013959467411, | |
| "num_tokens": 13958198.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 1.0621652530506254, | |
| "epoch": 0.46977999509282736, | |
| "grad_norm": 0.3251922130584717, | |
| "learning_rate": 1.0609788013609004e-05, | |
| "loss": 1.0348, | |
| "mean_token_accuracy": 0.7337204076349735, | |
| "num_tokens": 13997488.0, | |
| "step": 1795 | |
| }, | |
| { | |
| "entropy": 1.0552761495113372, | |
| "epoch": 0.471088574466345, | |
| "grad_norm": 0.28842347860336304, | |
| "learning_rate": 1.0583616854226644e-05, | |
| "loss": 0.9837, | |
| "mean_token_accuracy": 0.732194972038269, | |
| "num_tokens": 14038479.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.471088574466345, | |
| "eval_entropy": 1.056775134563446, | |
| "eval_loss": 1.048877477645874, | |
| "eval_mean_token_accuracy": 0.7264609928131104, | |
| "eval_num_tokens": 14038479.0, | |
| "eval_runtime": 97.5713, | |
| "eval_samples_per_second": 10.249, | |
| "eval_steps_per_second": 1.281, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.1512902732938528, | |
| "epoch": 0.4723971538398626, | |
| "grad_norm": 0.4646497964859009, | |
| "learning_rate": 1.055744569484428e-05, | |
| "loss": 1.1312, | |
| "mean_token_accuracy": 0.7179503016173839, | |
| "num_tokens": 14074174.0, | |
| "step": 1805 | |
| }, | |
| { | |
| "entropy": 1.0646660778671504, | |
| "epoch": 0.4737057332133802, | |
| "grad_norm": 0.29955199360847473, | |
| "learning_rate": 1.0531274535461921e-05, | |
| "loss": 1.0278, | |
| "mean_token_accuracy": 0.7245239842683077, | |
| "num_tokens": 14115408.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 1.0254539363086224, | |
| "epoch": 0.47501431258689786, | |
| "grad_norm": 0.31829652190208435, | |
| "learning_rate": 1.0505103376079561e-05, | |
| "loss": 0.9724, | |
| "mean_token_accuracy": 0.7322143398225307, | |
| "num_tokens": 14154697.0, | |
| "step": 1815 | |
| }, | |
| { | |
| "entropy": 1.1258851181715728, | |
| "epoch": 0.4763228919604155, | |
| "grad_norm": 0.45105260610580444, | |
| "learning_rate": 1.04789322166972e-05, | |
| "loss": 1.1154, | |
| "mean_token_accuracy": 0.7129046030342578, | |
| "num_tokens": 14194725.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 1.098579701408744, | |
| "epoch": 0.4776314713339331, | |
| "grad_norm": 0.37159058451652527, | |
| "learning_rate": 1.045276105731484e-05, | |
| "loss": 1.0148, | |
| "mean_token_accuracy": 0.7232114516198636, | |
| "num_tokens": 14236159.0, | |
| "step": 1825 | |
| }, | |
| { | |
| "entropy": 1.0738941352814435, | |
| "epoch": 0.4789400507074507, | |
| "grad_norm": 0.34484973549842834, | |
| "learning_rate": 1.042658989793248e-05, | |
| "loss": 1.0382, | |
| "mean_token_accuracy": 0.7244183134287596, | |
| "num_tokens": 14275868.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 1.0540620289742946, | |
| "epoch": 0.48024863008096835, | |
| "grad_norm": 0.4253799617290497, | |
| "learning_rate": 1.0400418738550118e-05, | |
| "loss": 0.9825, | |
| "mean_token_accuracy": 0.7318723525851965, | |
| "num_tokens": 14314897.0, | |
| "step": 1835 | |
| }, | |
| { | |
| "entropy": 1.1317568343132733, | |
| "epoch": 0.481557209454486, | |
| "grad_norm": 0.3978184163570404, | |
| "learning_rate": 1.0374247579167758e-05, | |
| "loss": 1.1196, | |
| "mean_token_accuracy": 0.7122768431901931, | |
| "num_tokens": 14353468.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 0.9921163013204932, | |
| "epoch": 0.4828657888280036, | |
| "grad_norm": 0.6404853463172913, | |
| "learning_rate": 1.0348076419785398e-05, | |
| "loss": 0.9584, | |
| "mean_token_accuracy": 0.7398792453110218, | |
| "num_tokens": 14393542.0, | |
| "step": 1845 | |
| }, | |
| { | |
| "entropy": 1.0703047849237919, | |
| "epoch": 0.4841743682015212, | |
| "grad_norm": 0.4562324285507202, | |
| "learning_rate": 1.0321905260403036e-05, | |
| "loss": 1.0268, | |
| "mean_token_accuracy": 0.7257931806147099, | |
| "num_tokens": 14438299.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.098178230971098, | |
| "epoch": 0.48548294757503885, | |
| "grad_norm": 0.5880526900291443, | |
| "learning_rate": 1.0295734101020676e-05, | |
| "loss": 1.0345, | |
| "mean_token_accuracy": 0.719204118102789, | |
| "num_tokens": 14472012.0, | |
| "step": 1855 | |
| }, | |
| { | |
| "entropy": 0.9670162990689277, | |
| "epoch": 0.4867915269485565, | |
| "grad_norm": 0.6454201340675354, | |
| "learning_rate": 1.0269562941638316e-05, | |
| "loss": 0.9095, | |
| "mean_token_accuracy": 0.7467180170118809, | |
| "num_tokens": 14508320.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 1.0095211911946536, | |
| "epoch": 0.4881001063220741, | |
| "grad_norm": 0.42344188690185547, | |
| "learning_rate": 1.0243391782255957e-05, | |
| "loss": 0.9624, | |
| "mean_token_accuracy": 0.7334278732538223, | |
| "num_tokens": 14545217.0, | |
| "step": 1865 | |
| }, | |
| { | |
| "entropy": 1.055738451331854, | |
| "epoch": 0.4894086856955917, | |
| "grad_norm": 0.35296040773391724, | |
| "learning_rate": 1.0217220622873593e-05, | |
| "loss": 0.9997, | |
| "mean_token_accuracy": 0.7282113581895828, | |
| "num_tokens": 14587523.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 1.0467969175428151, | |
| "epoch": 0.49071726506910934, | |
| "grad_norm": 0.3616087734699249, | |
| "learning_rate": 1.0191049463491233e-05, | |
| "loss": 1.0211, | |
| "mean_token_accuracy": 0.7238417502492667, | |
| "num_tokens": 14629386.0, | |
| "step": 1875 | |
| }, | |
| { | |
| "entropy": 1.0537079151719808, | |
| "epoch": 0.492025844442627, | |
| "grad_norm": 0.3368701934814453, | |
| "learning_rate": 1.0164878304108873e-05, | |
| "loss": 1.0216, | |
| "mean_token_accuracy": 0.7280502840876579, | |
| "num_tokens": 14672059.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 1.1863497417420148, | |
| "epoch": 0.4933344238161446, | |
| "grad_norm": 0.35971367359161377, | |
| "learning_rate": 1.0138707144726512e-05, | |
| "loss": 1.1773, | |
| "mean_token_accuracy": 0.7055241405963898, | |
| "num_tokens": 14713139.0, | |
| "step": 1885 | |
| }, | |
| { | |
| "entropy": 1.0729142222553492, | |
| "epoch": 0.4946430031896622, | |
| "grad_norm": 0.3640763461589813, | |
| "learning_rate": 1.0112535985344152e-05, | |
| "loss": 1.0327, | |
| "mean_token_accuracy": 0.7191622793674469, | |
| "num_tokens": 14751005.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 1.06036187261343, | |
| "epoch": 0.49595158256317984, | |
| "grad_norm": 0.529114305973053, | |
| "learning_rate": 1.0086364825961792e-05, | |
| "loss": 0.9994, | |
| "mean_token_accuracy": 0.7312576837837697, | |
| "num_tokens": 14791363.0, | |
| "step": 1895 | |
| }, | |
| { | |
| "entropy": 1.0985333036631346, | |
| "epoch": 0.4972601619366975, | |
| "grad_norm": 0.655299723148346, | |
| "learning_rate": 1.006019366657943e-05, | |
| "loss": 1.0157, | |
| "mean_token_accuracy": 0.715160496532917, | |
| "num_tokens": 14824965.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.4972601619366975, | |
| "eval_entropy": 1.049722795009613, | |
| "eval_loss": 1.0476868152618408, | |
| "eval_mean_token_accuracy": 0.7267285461425781, | |
| "eval_num_tokens": 14824965.0, | |
| "eval_runtime": 97.3332, | |
| "eval_samples_per_second": 10.274, | |
| "eval_steps_per_second": 1.284, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.1007526509463788, | |
| "epoch": 0.4985687413102151, | |
| "grad_norm": 0.3946051001548767, | |
| "learning_rate": 1.003402250719707e-05, | |
| "loss": 1.0781, | |
| "mean_token_accuracy": 0.7194148235023021, | |
| "num_tokens": 14861751.0, | |
| "step": 1905 | |
| }, | |
| { | |
| "entropy": 1.0730507016181945, | |
| "epoch": 0.4998773206837327, | |
| "grad_norm": 0.4027865529060364, | |
| "learning_rate": 1.000785134781471e-05, | |
| "loss": 1.0453, | |
| "mean_token_accuracy": 0.7289649330079555, | |
| "num_tokens": 14899381.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 1.0322015034034848, | |
| "epoch": 0.5011859000572504, | |
| "grad_norm": 0.3154545724391937, | |
| "learning_rate": 9.981680188432349e-06, | |
| "loss": 0.9092, | |
| "mean_token_accuracy": 0.7354275353252888, | |
| "num_tokens": 14937444.0, | |
| "step": 1915 | |
| }, | |
| { | |
| "entropy": 1.070195358619094, | |
| "epoch": 0.502494479430768, | |
| "grad_norm": 0.6190849542617798, | |
| "learning_rate": 9.955509029049987e-06, | |
| "loss": 0.9885, | |
| "mean_token_accuracy": 0.7289122432470322, | |
| "num_tokens": 14973078.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 1.1444286353886128, | |
| "epoch": 0.5038030588042856, | |
| "grad_norm": 0.3921438455581665, | |
| "learning_rate": 9.929337869667627e-06, | |
| "loss": 1.1402, | |
| "mean_token_accuracy": 0.7071823209524155, | |
| "num_tokens": 15013712.0, | |
| "step": 1925 | |
| }, | |
| { | |
| "entropy": 1.1046634882688522, | |
| "epoch": 0.5051116381778032, | |
| "grad_norm": 0.35547974705696106, | |
| "learning_rate": 9.903166710285267e-06, | |
| "loss": 1.0808, | |
| "mean_token_accuracy": 0.7203411210328341, | |
| "num_tokens": 15056268.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 1.1063471656292676, | |
| "epoch": 0.5064202175513208, | |
| "grad_norm": 0.4154115915298462, | |
| "learning_rate": 9.876995550902906e-06, | |
| "loss": 1.1305, | |
| "mean_token_accuracy": 0.7238013096153736, | |
| "num_tokens": 15095616.0, | |
| "step": 1935 | |
| }, | |
| { | |
| "entropy": 1.093080758303404, | |
| "epoch": 0.5077287969248385, | |
| "grad_norm": 0.33288487792015076, | |
| "learning_rate": 9.850824391520546e-06, | |
| "loss": 1.0423, | |
| "mean_token_accuracy": 0.7236416719853878, | |
| "num_tokens": 15135823.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 1.0675853207707404, | |
| "epoch": 0.5090373762983561, | |
| "grad_norm": 0.4353385269641876, | |
| "learning_rate": 9.824653232138186e-06, | |
| "loss": 1.027, | |
| "mean_token_accuracy": 0.721454057097435, | |
| "num_tokens": 15173689.0, | |
| "step": 1945 | |
| }, | |
| { | |
| "entropy": 0.9674046196043491, | |
| "epoch": 0.5103459556718737, | |
| "grad_norm": 0.6318839192390442, | |
| "learning_rate": 9.798482072755824e-06, | |
| "loss": 0.8977, | |
| "mean_token_accuracy": 0.7534206889569759, | |
| "num_tokens": 15215084.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.029712200164795, | |
| "epoch": 0.5116545350453914, | |
| "grad_norm": 0.4194331765174866, | |
| "learning_rate": 9.772310913373462e-06, | |
| "loss": 0.9468, | |
| "mean_token_accuracy": 0.7411890789866448, | |
| "num_tokens": 15255554.0, | |
| "step": 1955 | |
| }, | |
| { | |
| "entropy": 1.0968198750168086, | |
| "epoch": 0.512963114418909, | |
| "grad_norm": 0.5084384679794312, | |
| "learning_rate": 9.746139753991103e-06, | |
| "loss": 1.1293, | |
| "mean_token_accuracy": 0.7243112053722143, | |
| "num_tokens": 15291594.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 1.1029110010713339, | |
| "epoch": 0.5142716937924265, | |
| "grad_norm": 0.39724844694137573, | |
| "learning_rate": 9.719968594608743e-06, | |
| "loss": 1.1192, | |
| "mean_token_accuracy": 0.7196755088865757, | |
| "num_tokens": 15331102.0, | |
| "step": 1965 | |
| }, | |
| { | |
| "entropy": 1.088718469440937, | |
| "epoch": 0.5155802731659442, | |
| "grad_norm": 0.29742664098739624, | |
| "learning_rate": 9.693797435226381e-06, | |
| "loss": 1.0498, | |
| "mean_token_accuracy": 0.718688540160656, | |
| "num_tokens": 15370084.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 1.0800972525030375, | |
| "epoch": 0.5168888525394618, | |
| "grad_norm": 0.4087739586830139, | |
| "learning_rate": 9.667626275844021e-06, | |
| "loss": 1.0431, | |
| "mean_token_accuracy": 0.7267526693642139, | |
| "num_tokens": 15407968.0, | |
| "step": 1975 | |
| }, | |
| { | |
| "entropy": 1.1198014959692955, | |
| "epoch": 0.5181974319129795, | |
| "grad_norm": 0.3880080282688141, | |
| "learning_rate": 9.64145511646166e-06, | |
| "loss": 1.1213, | |
| "mean_token_accuracy": 0.7201451409608126, | |
| "num_tokens": 15445003.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 1.098936778306961, | |
| "epoch": 0.5195060112864971, | |
| "grad_norm": 0.3442750871181488, | |
| "learning_rate": 9.6152839570793e-06, | |
| "loss": 1.0513, | |
| "mean_token_accuracy": 0.7280969373881817, | |
| "num_tokens": 15486123.0, | |
| "step": 1985 | |
| }, | |
| { | |
| "entropy": 1.106550743058324, | |
| "epoch": 0.5208145906600147, | |
| "grad_norm": 0.34958696365356445, | |
| "learning_rate": 9.58911279769694e-06, | |
| "loss": 1.0701, | |
| "mean_token_accuracy": 0.7197918314486742, | |
| "num_tokens": 15525274.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 1.0225239008665086, | |
| "epoch": 0.5221231700335324, | |
| "grad_norm": 0.2728900909423828, | |
| "learning_rate": 9.562941638314578e-06, | |
| "loss": 0.9536, | |
| "mean_token_accuracy": 0.7333686843514442, | |
| "num_tokens": 15567557.0, | |
| "step": 1995 | |
| }, | |
| { | |
| "entropy": 1.015058632940054, | |
| "epoch": 0.52343174940705, | |
| "grad_norm": 0.42618778347969055, | |
| "learning_rate": 9.536770478932218e-06, | |
| "loss": 0.9551, | |
| "mean_token_accuracy": 0.7347531169652939, | |
| "num_tokens": 15608023.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.52343174940705, | |
| "eval_entropy": 1.0467440810203552, | |
| "eval_loss": 1.0457295179367065, | |
| "eval_mean_token_accuracy": 0.727197292804718, | |
| "eval_num_tokens": 15608023.0, | |
| "eval_runtime": 97.3484, | |
| "eval_samples_per_second": 10.272, | |
| "eval_steps_per_second": 1.284, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 1.1194863107055426, | |
| "epoch": 0.5247403287805676, | |
| "grad_norm": 0.43802356719970703, | |
| "learning_rate": 9.510599319549856e-06, | |
| "loss": 1.1008, | |
| "mean_token_accuracy": 0.7199479583650827, | |
| "num_tokens": 15646573.0, | |
| "step": 2005 | |
| }, | |
| { | |
| "entropy": 1.123955625295639, | |
| "epoch": 0.5260489081540852, | |
| "grad_norm": 0.3263470232486725, | |
| "learning_rate": 9.484428160167496e-06, | |
| "loss": 1.0753, | |
| "mean_token_accuracy": 0.7194419991225004, | |
| "num_tokens": 15688301.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 1.075960063934326, | |
| "epoch": 0.5273574875276028, | |
| "grad_norm": 0.43562212586402893, | |
| "learning_rate": 9.458257000785136e-06, | |
| "loss": 1.0737, | |
| "mean_token_accuracy": 0.7178402289748191, | |
| "num_tokens": 15725214.0, | |
| "step": 2015 | |
| }, | |
| { | |
| "entropy": 1.048216313868761, | |
| "epoch": 0.5286660669011205, | |
| "grad_norm": 0.2702055871486664, | |
| "learning_rate": 9.432085841402775e-06, | |
| "loss": 1.0164, | |
| "mean_token_accuracy": 0.7296169806271792, | |
| "num_tokens": 15764282.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 1.0972533270716667, | |
| "epoch": 0.5299746462746381, | |
| "grad_norm": 0.38811036944389343, | |
| "learning_rate": 9.405914682020413e-06, | |
| "loss": 1.0683, | |
| "mean_token_accuracy": 0.7193491701036692, | |
| "num_tokens": 15807279.0, | |
| "step": 2025 | |
| }, | |
| { | |
| "entropy": 1.0837439749389888, | |
| "epoch": 0.5312832256481557, | |
| "grad_norm": 0.38090547919273376, | |
| "learning_rate": 9.379743522638053e-06, | |
| "loss": 1.0179, | |
| "mean_token_accuracy": 0.7216418325901032, | |
| "num_tokens": 15849485.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 1.0280620481818914, | |
| "epoch": 0.5325918050216734, | |
| "grad_norm": 0.44307711720466614, | |
| "learning_rate": 9.353572363255693e-06, | |
| "loss": 0.9705, | |
| "mean_token_accuracy": 0.7349147848784924, | |
| "num_tokens": 15888255.0, | |
| "step": 2035 | |
| }, | |
| { | |
| "entropy": 1.1240373719483614, | |
| "epoch": 0.5339003843951909, | |
| "grad_norm": 0.41446390748023987, | |
| "learning_rate": 9.327401203873332e-06, | |
| "loss": 1.1051, | |
| "mean_token_accuracy": 0.7141484066843986, | |
| "num_tokens": 15929388.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 1.1305542685091496, | |
| "epoch": 0.5352089637687086, | |
| "grad_norm": 0.749841034412384, | |
| "learning_rate": 9.301230044490972e-06, | |
| "loss": 1.126, | |
| "mean_token_accuracy": 0.7178158447146415, | |
| "num_tokens": 15971137.0, | |
| "step": 2045 | |
| }, | |
| { | |
| "entropy": 1.0323502123355865, | |
| "epoch": 0.5365175431422262, | |
| "grad_norm": 0.48051634430885315, | |
| "learning_rate": 9.27505888510861e-06, | |
| "loss": 0.9978, | |
| "mean_token_accuracy": 0.7359145298600197, | |
| "num_tokens": 16011804.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.0280569117516278, | |
| "epoch": 0.5378261225157438, | |
| "grad_norm": 0.2908135950565338, | |
| "learning_rate": 9.24888772572625e-06, | |
| "loss": 0.9785, | |
| "mean_token_accuracy": 0.7374115951359272, | |
| "num_tokens": 16050785.0, | |
| "step": 2055 | |
| }, | |
| { | |
| "entropy": 1.0569781139492989, | |
| "epoch": 0.5391347018892615, | |
| "grad_norm": 0.35007575154304504, | |
| "learning_rate": 9.22271656634389e-06, | |
| "loss": 0.9877, | |
| "mean_token_accuracy": 0.7339214488863945, | |
| "num_tokens": 16090836.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 1.0800054289400578, | |
| "epoch": 0.5404432812627791, | |
| "grad_norm": 0.32629233598709106, | |
| "learning_rate": 9.196545406961529e-06, | |
| "loss": 1.0375, | |
| "mean_token_accuracy": 0.7278057150542736, | |
| "num_tokens": 16131547.0, | |
| "step": 2065 | |
| }, | |
| { | |
| "entropy": 1.0801922511309385, | |
| "epoch": 0.5417518606362968, | |
| "grad_norm": 0.43629997968673706, | |
| "learning_rate": 9.170374247579169e-06, | |
| "loss": 1.0448, | |
| "mean_token_accuracy": 0.7214881278574466, | |
| "num_tokens": 16168637.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 1.10114951916039, | |
| "epoch": 0.5430604400098143, | |
| "grad_norm": 0.3292621076107025, | |
| "learning_rate": 9.144203088196809e-06, | |
| "loss": 1.0632, | |
| "mean_token_accuracy": 0.7205091584473848, | |
| "num_tokens": 16210603.0, | |
| "step": 2075 | |
| }, | |
| { | |
| "entropy": 1.0731607031077146, | |
| "epoch": 0.5443690193833319, | |
| "grad_norm": 0.3450382351875305, | |
| "learning_rate": 9.118031928814447e-06, | |
| "loss": 0.985, | |
| "mean_token_accuracy": 0.7285647377371788, | |
| "num_tokens": 16248842.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 1.110469686985016, | |
| "epoch": 0.5456775987568496, | |
| "grad_norm": 0.5088993310928345, | |
| "learning_rate": 9.091860769432087e-06, | |
| "loss": 1.0761, | |
| "mean_token_accuracy": 0.7160798791795969, | |
| "num_tokens": 16282386.0, | |
| "step": 2085 | |
| }, | |
| { | |
| "entropy": 1.066830562800169, | |
| "epoch": 0.5469861781303672, | |
| "grad_norm": 0.5230424404144287, | |
| "learning_rate": 9.065689610049726e-06, | |
| "loss": 1.0209, | |
| "mean_token_accuracy": 0.7298058848828077, | |
| "num_tokens": 16315017.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 1.0682125560939313, | |
| "epoch": 0.5482947575038849, | |
| "grad_norm": 0.29241734743118286, | |
| "learning_rate": 9.039518450667366e-06, | |
| "loss": 1.0398, | |
| "mean_token_accuracy": 0.7212686751037836, | |
| "num_tokens": 16356673.0, | |
| "step": 2095 | |
| }, | |
| { | |
| "entropy": 1.0166700199246406, | |
| "epoch": 0.5496033368774025, | |
| "grad_norm": 0.37928712368011475, | |
| "learning_rate": 9.013347291285006e-06, | |
| "loss": 0.9511, | |
| "mean_token_accuracy": 0.7362009316682816, | |
| "num_tokens": 16397408.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.5496033368774025, | |
| "eval_entropy": 1.0475300779342651, | |
| "eval_loss": 1.0435727834701538, | |
| "eval_mean_token_accuracy": 0.7274958577156067, | |
| "eval_num_tokens": 16397408.0, | |
| "eval_runtime": 97.2217, | |
| "eval_samples_per_second": 10.286, | |
| "eval_steps_per_second": 1.286, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.0761947065591813, | |
| "epoch": 0.5509119162509201, | |
| "grad_norm": 0.4101719558238983, | |
| "learning_rate": 8.987176131902644e-06, | |
| "loss": 1.0654, | |
| "mean_token_accuracy": 0.7192930780351162, | |
| "num_tokens": 16434511.0, | |
| "step": 2105 | |
| }, | |
| { | |
| "entropy": 1.097949853539467, | |
| "epoch": 0.5522204956244378, | |
| "grad_norm": 0.4837743043899536, | |
| "learning_rate": 8.961004972520282e-06, | |
| "loss": 1.0397, | |
| "mean_token_accuracy": 0.7156211558729411, | |
| "num_tokens": 16472214.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 1.0977317329496146, | |
| "epoch": 0.5535290749979553, | |
| "grad_norm": 0.39783498644828796, | |
| "learning_rate": 8.934833813137923e-06, | |
| "loss": 1.1348, | |
| "mean_token_accuracy": 0.71487329415977, | |
| "num_tokens": 16512818.0, | |
| "step": 2115 | |
| }, | |
| { | |
| "entropy": 1.1164306864142417, | |
| "epoch": 0.5548376543714729, | |
| "grad_norm": 0.5113677978515625, | |
| "learning_rate": 8.908662653755563e-06, | |
| "loss": 1.0433, | |
| "mean_token_accuracy": 0.7190901465713978, | |
| "num_tokens": 16549988.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 1.118570750951767, | |
| "epoch": 0.5561462337449906, | |
| "grad_norm": 0.457736074924469, | |
| "learning_rate": 8.882491494373201e-06, | |
| "loss": 1.0624, | |
| "mean_token_accuracy": 0.7195118874311447, | |
| "num_tokens": 16588026.0, | |
| "step": 2125 | |
| }, | |
| { | |
| "entropy": 1.0895183108747006, | |
| "epoch": 0.5574548131185082, | |
| "grad_norm": 0.443959504365921, | |
| "learning_rate": 8.856320334990841e-06, | |
| "loss": 1.0314, | |
| "mean_token_accuracy": 0.7270146373659372, | |
| "num_tokens": 16626368.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 1.091917496919632, | |
| "epoch": 0.5587633924920259, | |
| "grad_norm": 0.3934486210346222, | |
| "learning_rate": 8.83014917560848e-06, | |
| "loss": 1.0449, | |
| "mean_token_accuracy": 0.7249642074108124, | |
| "num_tokens": 16658226.0, | |
| "step": 2135 | |
| }, | |
| { | |
| "entropy": 1.0808244206011295, | |
| "epoch": 0.5600719718655435, | |
| "grad_norm": 0.4523155093193054, | |
| "learning_rate": 8.80397801622612e-06, | |
| "loss": 1.0726, | |
| "mean_token_accuracy": 0.725436444580555, | |
| "num_tokens": 16694803.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 1.0595968872308732, | |
| "epoch": 0.561380551239061, | |
| "grad_norm": 0.6529179215431213, | |
| "learning_rate": 8.77780685684376e-06, | |
| "loss": 1.0853, | |
| "mean_token_accuracy": 0.728100998699665, | |
| "num_tokens": 16731046.0, | |
| "step": 2145 | |
| }, | |
| { | |
| "entropy": 1.1225625108927488, | |
| "epoch": 0.5626891306125787, | |
| "grad_norm": 0.5509600043296814, | |
| "learning_rate": 8.751635697461398e-06, | |
| "loss": 1.0464, | |
| "mean_token_accuracy": 0.7140719693154096, | |
| "num_tokens": 16770258.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 1.0641490031033753, | |
| "epoch": 0.5639977099860963, | |
| "grad_norm": 0.401279091835022, | |
| "learning_rate": 8.725464538079038e-06, | |
| "loss": 1.006, | |
| "mean_token_accuracy": 0.7247611582279205, | |
| "num_tokens": 16808152.0, | |
| "step": 2155 | |
| }, | |
| { | |
| "entropy": 1.077583958208561, | |
| "epoch": 0.565306289359614, | |
| "grad_norm": 0.3935731053352356, | |
| "learning_rate": 8.699293378696676e-06, | |
| "loss": 1.0364, | |
| "mean_token_accuracy": 0.7233245447278023, | |
| "num_tokens": 16846703.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 1.1116807896643877, | |
| "epoch": 0.5666148687331316, | |
| "grad_norm": 0.31971287727355957, | |
| "learning_rate": 8.673122219314316e-06, | |
| "loss": 1.1141, | |
| "mean_token_accuracy": 0.7137772016227245, | |
| "num_tokens": 16885531.0, | |
| "step": 2165 | |
| }, | |
| { | |
| "entropy": 1.1279645822942257, | |
| "epoch": 0.5679234481066492, | |
| "grad_norm": 0.33220037817955017, | |
| "learning_rate": 8.646951059931956e-06, | |
| "loss": 1.1373, | |
| "mean_token_accuracy": 0.7136048063635826, | |
| "num_tokens": 16922160.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 1.0512098886072636, | |
| "epoch": 0.5692320274801669, | |
| "grad_norm": 0.41083812713623047, | |
| "learning_rate": 8.620779900549595e-06, | |
| "loss": 1.0153, | |
| "mean_token_accuracy": 0.7320755422115326, | |
| "num_tokens": 16961964.0, | |
| "step": 2175 | |
| }, | |
| { | |
| "entropy": 1.0632092259824275, | |
| "epoch": 0.5705406068536845, | |
| "grad_norm": 0.3310851752758026, | |
| "learning_rate": 8.594608741167235e-06, | |
| "loss": 1.0625, | |
| "mean_token_accuracy": 0.7241213608533144, | |
| "num_tokens": 17005710.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 1.0541639674454928, | |
| "epoch": 0.571849186227202, | |
| "grad_norm": 0.5147233009338379, | |
| "learning_rate": 8.568437581784875e-06, | |
| "loss": 1.0132, | |
| "mean_token_accuracy": 0.7322873834520578, | |
| "num_tokens": 17043166.0, | |
| "step": 2185 | |
| }, | |
| { | |
| "entropy": 1.1003672912716866, | |
| "epoch": 0.5731577656007197, | |
| "grad_norm": 0.6766669154167175, | |
| "learning_rate": 8.542266422402513e-06, | |
| "loss": 1.1054, | |
| "mean_token_accuracy": 0.7199019856750966, | |
| "num_tokens": 17077200.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 1.0057383235543966, | |
| "epoch": 0.5744663449742373, | |
| "grad_norm": 0.41832664608955383, | |
| "learning_rate": 8.516095263020152e-06, | |
| "loss": 0.9413, | |
| "mean_token_accuracy": 0.7480561546981335, | |
| "num_tokens": 17116788.0, | |
| "step": 2195 | |
| }, | |
| { | |
| "entropy": 1.0743875481188296, | |
| "epoch": 0.575774924347755, | |
| "grad_norm": 0.38505351543426514, | |
| "learning_rate": 8.489924103637792e-06, | |
| "loss": 0.9846, | |
| "mean_token_accuracy": 0.7285970754921436, | |
| "num_tokens": 17156595.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.575774924347755, | |
| "eval_entropy": 1.047807973384857, | |
| "eval_loss": 1.0419152975082397, | |
| "eval_mean_token_accuracy": 0.7281912508010864, | |
| "eval_num_tokens": 17156595.0, | |
| "eval_runtime": 97.2111, | |
| "eval_samples_per_second": 10.287, | |
| "eval_steps_per_second": 1.286, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 1.018094291538, | |
| "epoch": 0.5770835037212726, | |
| "grad_norm": 0.47084370255470276, | |
| "learning_rate": 8.463752944255432e-06, | |
| "loss": 0.9611, | |
| "mean_token_accuracy": 0.7393688425421715, | |
| "num_tokens": 17190391.0, | |
| "step": 2205 | |
| }, | |
| { | |
| "entropy": 1.0440906029194594, | |
| "epoch": 0.5783920830947902, | |
| "grad_norm": 0.611663818359375, | |
| "learning_rate": 8.43758178487307e-06, | |
| "loss": 1.0335, | |
| "mean_token_accuracy": 0.7285435609519482, | |
| "num_tokens": 17228559.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 1.033133103698492, | |
| "epoch": 0.5797006624683079, | |
| "grad_norm": 0.4824153482913971, | |
| "learning_rate": 8.41141062549071e-06, | |
| "loss": 1.013, | |
| "mean_token_accuracy": 0.7390854120254516, | |
| "num_tokens": 17264262.0, | |
| "step": 2215 | |
| }, | |
| { | |
| "entropy": 1.0519094690680504, | |
| "epoch": 0.5810092418418255, | |
| "grad_norm": 0.35666635632514954, | |
| "learning_rate": 8.385239466108349e-06, | |
| "loss": 1.0353, | |
| "mean_token_accuracy": 0.7265864860266447, | |
| "num_tokens": 17298474.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 1.1184704467654227, | |
| "epoch": 0.5823178212153431, | |
| "grad_norm": 0.4187191426753998, | |
| "learning_rate": 8.359068306725989e-06, | |
| "loss": 1.0395, | |
| "mean_token_accuracy": 0.7226161871105432, | |
| "num_tokens": 17342249.0, | |
| "step": 2225 | |
| }, | |
| { | |
| "entropy": 1.109714150428772, | |
| "epoch": 0.5836264005888607, | |
| "grad_norm": 0.424214243888855, | |
| "learning_rate": 8.332897147343629e-06, | |
| "loss": 1.0727, | |
| "mean_token_accuracy": 0.7170622806996108, | |
| "num_tokens": 17381434.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 1.0706456020474433, | |
| "epoch": 0.5849349799623783, | |
| "grad_norm": 0.3002157211303711, | |
| "learning_rate": 8.306725987961267e-06, | |
| "loss": 1.046, | |
| "mean_token_accuracy": 0.7237683780491352, | |
| "num_tokens": 17419527.0, | |
| "step": 2235 | |
| }, | |
| { | |
| "entropy": 1.0564652543514967, | |
| "epoch": 0.586243559335896, | |
| "grad_norm": 0.3851909041404724, | |
| "learning_rate": 8.280554828578907e-06, | |
| "loss": 0.9903, | |
| "mean_token_accuracy": 0.731862674281001, | |
| "num_tokens": 17456332.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 1.059485612437129, | |
| "epoch": 0.5875521387094136, | |
| "grad_norm": 0.30845922231674194, | |
| "learning_rate": 8.254383669196546e-06, | |
| "loss": 1.0166, | |
| "mean_token_accuracy": 0.7287565600126982, | |
| "num_tokens": 17503670.0, | |
| "step": 2245 | |
| }, | |
| { | |
| "entropy": 1.0321750465780497, | |
| "epoch": 0.5888607180829312, | |
| "grad_norm": 0.35590362548828125, | |
| "learning_rate": 8.228212509814186e-06, | |
| "loss": 1.0185, | |
| "mean_token_accuracy": 0.7324472777545452, | |
| "num_tokens": 17541709.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 1.029836729541421, | |
| "epoch": 0.5901692974564489, | |
| "grad_norm": 0.5374951362609863, | |
| "learning_rate": 8.202041350431826e-06, | |
| "loss": 0.9625, | |
| "mean_token_accuracy": 0.7331007912755012, | |
| "num_tokens": 17578411.0, | |
| "step": 2255 | |
| }, | |
| { | |
| "entropy": 1.0388061545789242, | |
| "epoch": 0.5914778768299664, | |
| "grad_norm": 0.26899445056915283, | |
| "learning_rate": 8.175870191049464e-06, | |
| "loss": 0.9839, | |
| "mean_token_accuracy": 0.7304695654660464, | |
| "num_tokens": 17618221.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 1.0612275712192059, | |
| "epoch": 0.5927864562034841, | |
| "grad_norm": 0.47736746072769165, | |
| "learning_rate": 8.149699031667102e-06, | |
| "loss": 1.0262, | |
| "mean_token_accuracy": 0.725922341644764, | |
| "num_tokens": 17656712.0, | |
| "step": 2265 | |
| }, | |
| { | |
| "entropy": 1.1566721752285958, | |
| "epoch": 0.5940950355770017, | |
| "grad_norm": 0.37901201844215393, | |
| "learning_rate": 8.123527872284743e-06, | |
| "loss": 1.1165, | |
| "mean_token_accuracy": 0.7078841663897038, | |
| "num_tokens": 17692983.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 1.1062884621322155, | |
| "epoch": 0.5954036149505193, | |
| "grad_norm": 0.5285326838493347, | |
| "learning_rate": 8.097356712902383e-06, | |
| "loss": 1.06, | |
| "mean_token_accuracy": 0.7236683152616024, | |
| "num_tokens": 17732335.0, | |
| "step": 2275 | |
| }, | |
| { | |
| "entropy": 1.093737093731761, | |
| "epoch": 0.596712194324037, | |
| "grad_norm": 0.3490510582923889, | |
| "learning_rate": 8.071185553520021e-06, | |
| "loss": 1.0661, | |
| "mean_token_accuracy": 0.7216609187424183, | |
| "num_tokens": 17767130.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 1.108936246484518, | |
| "epoch": 0.5980207736975546, | |
| "grad_norm": 0.5806106925010681, | |
| "learning_rate": 8.045014394137661e-06, | |
| "loss": 1.1176, | |
| "mean_token_accuracy": 0.7208809830248356, | |
| "num_tokens": 17803730.0, | |
| "step": 2285 | |
| }, | |
| { | |
| "entropy": 1.168883834220469, | |
| "epoch": 0.5993293530710723, | |
| "grad_norm": 0.36321160197257996, | |
| "learning_rate": 8.0188432347553e-06, | |
| "loss": 1.1264, | |
| "mean_token_accuracy": 0.707911616563797, | |
| "num_tokens": 17844864.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 1.0555749453604222, | |
| "epoch": 0.6006379324445899, | |
| "grad_norm": 0.37642714381217957, | |
| "learning_rate": 7.99267207537294e-06, | |
| "loss": 1.0828, | |
| "mean_token_accuracy": 0.7319136694073677, | |
| "num_tokens": 17888044.0, | |
| "step": 2295 | |
| }, | |
| { | |
| "entropy": 1.0490516472607851, | |
| "epoch": 0.6019465118181074, | |
| "grad_norm": 1.011374831199646, | |
| "learning_rate": 7.96650091599058e-06, | |
| "loss": 1.0583, | |
| "mean_token_accuracy": 0.7280811902135611, | |
| "num_tokens": 17925845.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6019465118181074, | |
| "eval_entropy": 1.038636552810669, | |
| "eval_loss": 1.0401142835617065, | |
| "eval_mean_token_accuracy": 0.7285269827842712, | |
| "eval_num_tokens": 17925845.0, | |
| "eval_runtime": 97.1825, | |
| "eval_samples_per_second": 10.29, | |
| "eval_steps_per_second": 1.286, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 1.060953125730157, | |
| "epoch": 0.6032550911916251, | |
| "grad_norm": 0.3888475000858307, | |
| "learning_rate": 7.940329756608218e-06, | |
| "loss": 1.0127, | |
| "mean_token_accuracy": 0.7261522125452757, | |
| "num_tokens": 17967227.0, | |
| "step": 2305 | |
| }, | |
| { | |
| "entropy": 1.108807889930904, | |
| "epoch": 0.6045636705651427, | |
| "grad_norm": 0.4732019901275635, | |
| "learning_rate": 7.914158597225858e-06, | |
| "loss": 1.048, | |
| "mean_token_accuracy": 0.7257308520376682, | |
| "num_tokens": 18000648.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 1.0645822279155255, | |
| "epoch": 0.6058722499386603, | |
| "grad_norm": 0.5344901084899902, | |
| "learning_rate": 7.887987437843498e-06, | |
| "loss": 1.0637, | |
| "mean_token_accuracy": 0.726478286832571, | |
| "num_tokens": 18041076.0, | |
| "step": 2315 | |
| }, | |
| { | |
| "entropy": 1.0792375735938549, | |
| "epoch": 0.607180829312178, | |
| "grad_norm": 0.4639453589916229, | |
| "learning_rate": 7.861816278461136e-06, | |
| "loss": 1.0164, | |
| "mean_token_accuracy": 0.7243579313158989, | |
| "num_tokens": 18078141.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 1.1244300540536643, | |
| "epoch": 0.6084894086856956, | |
| "grad_norm": 0.4256502389907837, | |
| "learning_rate": 7.835645119078776e-06, | |
| "loss": 1.0715, | |
| "mean_token_accuracy": 0.721744529902935, | |
| "num_tokens": 18115889.0, | |
| "step": 2325 | |
| }, | |
| { | |
| "entropy": 1.0580251537263394, | |
| "epoch": 0.6097979880592133, | |
| "grad_norm": 0.3264947235584259, | |
| "learning_rate": 7.809473959696415e-06, | |
| "loss": 1.0605, | |
| "mean_token_accuracy": 0.7277645215392112, | |
| "num_tokens": 18157055.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "entropy": 1.0746600326150655, | |
| "epoch": 0.6111065674327308, | |
| "grad_norm": 0.2848566472530365, | |
| "learning_rate": 7.783302800314055e-06, | |
| "loss": 1.0606, | |
| "mean_token_accuracy": 0.7236160174012184, | |
| "num_tokens": 18194754.0, | |
| "step": 2335 | |
| }, | |
| { | |
| "entropy": 1.0729442548006773, | |
| "epoch": 0.6124151468062484, | |
| "grad_norm": 0.34202495217323303, | |
| "learning_rate": 7.757131640931695e-06, | |
| "loss": 1.0274, | |
| "mean_token_accuracy": 0.7167530670762062, | |
| "num_tokens": 18239971.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 1.1377609726041555, | |
| "epoch": 0.6137237261797661, | |
| "grad_norm": 0.5917489528656006, | |
| "learning_rate": 7.730960481549333e-06, | |
| "loss": 1.0862, | |
| "mean_token_accuracy": 0.7097996596246958, | |
| "num_tokens": 18277572.0, | |
| "step": 2345 | |
| }, | |
| { | |
| "entropy": 1.0688726913183928, | |
| "epoch": 0.6150323055532837, | |
| "grad_norm": 0.3864566683769226, | |
| "learning_rate": 7.704789322166972e-06, | |
| "loss": 0.9895, | |
| "mean_token_accuracy": 0.7335865460336208, | |
| "num_tokens": 18318117.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 1.0338464200496673, | |
| "epoch": 0.6163408849268014, | |
| "grad_norm": 0.3866376280784607, | |
| "learning_rate": 7.678618162784612e-06, | |
| "loss": 0.9989, | |
| "mean_token_accuracy": 0.7306736633181572, | |
| "num_tokens": 18357427.0, | |
| "step": 2355 | |
| }, | |
| { | |
| "entropy": 1.0776856757700444, | |
| "epoch": 0.617649464300319, | |
| "grad_norm": 0.37204793095588684, | |
| "learning_rate": 7.652447003402252e-06, | |
| "loss": 1.0805, | |
| "mean_token_accuracy": 0.7231950167566538, | |
| "num_tokens": 18398690.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 1.0225880276411772, | |
| "epoch": 0.6189580436738366, | |
| "grad_norm": 0.4354966878890991, | |
| "learning_rate": 7.62627584401989e-06, | |
| "loss": 0.9557, | |
| "mean_token_accuracy": 0.7365483164787292, | |
| "num_tokens": 18436962.0, | |
| "step": 2365 | |
| }, | |
| { | |
| "entropy": 1.056931370869279, | |
| "epoch": 0.6202666230473542, | |
| "grad_norm": 0.29399538040161133, | |
| "learning_rate": 7.60010468463753e-06, | |
| "loss": 1.0455, | |
| "mean_token_accuracy": 0.7246545068919659, | |
| "num_tokens": 18479590.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "entropy": 1.0880108322948217, | |
| "epoch": 0.6215752024208718, | |
| "grad_norm": 0.26629939675331116, | |
| "learning_rate": 7.5739335252551695e-06, | |
| "loss": 1.0752, | |
| "mean_token_accuracy": 0.7159001469612122, | |
| "num_tokens": 18520588.0, | |
| "step": 2375 | |
| }, | |
| { | |
| "entropy": 1.0517158180475235, | |
| "epoch": 0.6228837817943894, | |
| "grad_norm": 0.4959399998188019, | |
| "learning_rate": 7.547762365872809e-06, | |
| "loss": 0.9754, | |
| "mean_token_accuracy": 0.7284245084971189, | |
| "num_tokens": 18556912.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 1.0565275117754935, | |
| "epoch": 0.6241923611679071, | |
| "grad_norm": 0.6061561107635498, | |
| "learning_rate": 7.521591206490449e-06, | |
| "loss": 0.9693, | |
| "mean_token_accuracy": 0.7295171424746514, | |
| "num_tokens": 18591545.0, | |
| "step": 2385 | |
| }, | |
| { | |
| "entropy": 1.1225751712918282, | |
| "epoch": 0.6255009405414247, | |
| "grad_norm": 0.43570899963378906, | |
| "learning_rate": 7.495420047108087e-06, | |
| "loss": 1.0837, | |
| "mean_token_accuracy": 0.7186510719358921, | |
| "num_tokens": 18632236.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "entropy": 1.1181074380874634, | |
| "epoch": 0.6268095199149424, | |
| "grad_norm": 0.49738365411758423, | |
| "learning_rate": 7.469248887725726e-06, | |
| "loss": 1.0293, | |
| "mean_token_accuracy": 0.7210543874651194, | |
| "num_tokens": 18671653.0, | |
| "step": 2395 | |
| }, | |
| { | |
| "entropy": 1.092723872885108, | |
| "epoch": 0.62811809928846, | |
| "grad_norm": 0.5354933142662048, | |
| "learning_rate": 7.4430777283433664e-06, | |
| "loss": 1.0792, | |
| "mean_token_accuracy": 0.7276735674589873, | |
| "num_tokens": 18712711.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.62811809928846, | |
| "eval_entropy": 1.0314421949386596, | |
| "eval_loss": 1.0386921167373657, | |
| "eval_mean_token_accuracy": 0.7286318821907043, | |
| "eval_num_tokens": 18712711.0, | |
| "eval_runtime": 97.152, | |
| "eval_samples_per_second": 10.293, | |
| "eval_steps_per_second": 1.287, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 1.1883727714419365, | |
| "epoch": 0.6294266786619775, | |
| "grad_norm": 0.3528256416320801, | |
| "learning_rate": 7.416906568961006e-06, | |
| "loss": 1.1798, | |
| "mean_token_accuracy": 0.7113151207566262, | |
| "num_tokens": 18752528.0, | |
| "step": 2405 | |
| }, | |
| { | |
| "entropy": 1.1015507124364377, | |
| "epoch": 0.6307352580354952, | |
| "grad_norm": 0.421294242143631, | |
| "learning_rate": 7.390735409578646e-06, | |
| "loss": 1.058, | |
| "mean_token_accuracy": 0.7168937616050244, | |
| "num_tokens": 18791480.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "entropy": 1.040089299157262, | |
| "epoch": 0.6320438374090128, | |
| "grad_norm": 0.34841352701187134, | |
| "learning_rate": 7.364564250196284e-06, | |
| "loss": 0.9257, | |
| "mean_token_accuracy": 0.7292000584304332, | |
| "num_tokens": 18827672.0, | |
| "step": 2415 | |
| }, | |
| { | |
| "entropy": 1.0772018309682607, | |
| "epoch": 0.6333524167825305, | |
| "grad_norm": 0.39158549904823303, | |
| "learning_rate": 7.338393090813923e-06, | |
| "loss": 1.0312, | |
| "mean_token_accuracy": 0.7295682705938816, | |
| "num_tokens": 18867541.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 1.191065814346075, | |
| "epoch": 0.6346609961560481, | |
| "grad_norm": 0.5301610231399536, | |
| "learning_rate": 7.312221931431563e-06, | |
| "loss": 1.214, | |
| "mean_token_accuracy": 0.7043293602764606, | |
| "num_tokens": 18907080.0, | |
| "step": 2425 | |
| }, | |
| { | |
| "entropy": 1.0250558149069549, | |
| "epoch": 0.6359695755295657, | |
| "grad_norm": 0.46418026089668274, | |
| "learning_rate": 7.286050772049203e-06, | |
| "loss": 0.9982, | |
| "mean_token_accuracy": 0.7344914440065622, | |
| "num_tokens": 18945688.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "entropy": 1.1069321628659963, | |
| "epoch": 0.6372781549030834, | |
| "grad_norm": 0.45496806502342224, | |
| "learning_rate": 7.259879612666841e-06, | |
| "loss": 1.016, | |
| "mean_token_accuracy": 0.7186556220054626, | |
| "num_tokens": 18982398.0, | |
| "step": 2435 | |
| }, | |
| { | |
| "entropy": 1.0202362570911645, | |
| "epoch": 0.638586734276601, | |
| "grad_norm": 0.46681129932403564, | |
| "learning_rate": 7.233708453284481e-06, | |
| "loss": 1.0134, | |
| "mean_token_accuracy": 0.7437549475580454, | |
| "num_tokens": 19022263.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 1.026650892943144, | |
| "epoch": 0.6398953136501185, | |
| "grad_norm": 0.2987666428089142, | |
| "learning_rate": 7.20753729390212e-06, | |
| "loss": 0.9835, | |
| "mean_token_accuracy": 0.7353227615356446, | |
| "num_tokens": 19065353.0, | |
| "step": 2445 | |
| }, | |
| { | |
| "entropy": 1.0794596791267395, | |
| "epoch": 0.6412038930236362, | |
| "grad_norm": 0.6045072674751282, | |
| "learning_rate": 7.1813661345197595e-06, | |
| "loss": 1.0901, | |
| "mean_token_accuracy": 0.7167344830930233, | |
| "num_tokens": 19106128.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 1.1413863252848386, | |
| "epoch": 0.6425124723971538, | |
| "grad_norm": 0.408202588558197, | |
| "learning_rate": 7.1551949751373995e-06, | |
| "loss": 1.07, | |
| "mean_token_accuracy": 0.709665472060442, | |
| "num_tokens": 19144518.0, | |
| "step": 2455 | |
| }, | |
| { | |
| "entropy": 1.018723826110363, | |
| "epoch": 0.6438210517706715, | |
| "grad_norm": 0.7658793330192566, | |
| "learning_rate": 7.129023815755039e-06, | |
| "loss": 1.0139, | |
| "mean_token_accuracy": 0.7404938109219075, | |
| "num_tokens": 19186225.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 1.035357365384698, | |
| "epoch": 0.6451296311441891, | |
| "grad_norm": 0.27525585889816284, | |
| "learning_rate": 7.102852656372677e-06, | |
| "loss": 0.9998, | |
| "mean_token_accuracy": 0.7318320117890835, | |
| "num_tokens": 19227399.0, | |
| "step": 2465 | |
| }, | |
| { | |
| "entropy": 0.9564740158617496, | |
| "epoch": 0.6464382105177067, | |
| "grad_norm": 0.4672304093837738, | |
| "learning_rate": 7.076681496990317e-06, | |
| "loss": 0.9195, | |
| "mean_token_accuracy": 0.7519853860139847, | |
| "num_tokens": 19267194.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "entropy": 1.0981974124908447, | |
| "epoch": 0.6477467898912244, | |
| "grad_norm": 0.5112661719322205, | |
| "learning_rate": 7.050510337607956e-06, | |
| "loss": 1.0416, | |
| "mean_token_accuracy": 0.7168846629559994, | |
| "num_tokens": 19303499.0, | |
| "step": 2475 | |
| }, | |
| { | |
| "entropy": 1.1099361848086118, | |
| "epoch": 0.6490553692647419, | |
| "grad_norm": 0.6969805359840393, | |
| "learning_rate": 7.0243391782255965e-06, | |
| "loss": 1.0715, | |
| "mean_token_accuracy": 0.7152190506458282, | |
| "num_tokens": 19341950.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 1.0465524211525916, | |
| "epoch": 0.6503639486382596, | |
| "grad_norm": 0.26998811960220337, | |
| "learning_rate": 6.998168018843236e-06, | |
| "loss": 1.0512, | |
| "mean_token_accuracy": 0.7323187306523323, | |
| "num_tokens": 19386737.0, | |
| "step": 2485 | |
| }, | |
| { | |
| "entropy": 0.9889295142143965, | |
| "epoch": 0.6516725280117772, | |
| "grad_norm": 0.38220036029815674, | |
| "learning_rate": 6.971996859460874e-06, | |
| "loss": 0.9593, | |
| "mean_token_accuracy": 0.7394830010831356, | |
| "num_tokens": 19428001.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "entropy": 1.0677450440824032, | |
| "epoch": 0.6529811073852948, | |
| "grad_norm": 0.4677034914493561, | |
| "learning_rate": 6.945825700078514e-06, | |
| "loss": 0.9954, | |
| "mean_token_accuracy": 0.7268911335617304, | |
| "num_tokens": 19465393.0, | |
| "step": 2495 | |
| }, | |
| { | |
| "entropy": 1.049769724905491, | |
| "epoch": 0.6542896867588125, | |
| "grad_norm": 0.6634142398834229, | |
| "learning_rate": 6.919654540696153e-06, | |
| "loss": 1.039, | |
| "mean_token_accuracy": 0.7285342015326023, | |
| "num_tokens": 19501464.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6542896867588125, | |
| "eval_entropy": 1.035133779525757, | |
| "eval_loss": 1.0362484455108643, | |
| "eval_mean_token_accuracy": 0.7291943707466125, | |
| "eval_num_tokens": 19501464.0, | |
| "eval_runtime": 97.1896, | |
| "eval_samples_per_second": 10.289, | |
| "eval_steps_per_second": 1.286, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 1.0178256921470166, | |
| "epoch": 0.6555982661323301, | |
| "grad_norm": 0.3480639159679413, | |
| "learning_rate": 6.8934833813137926e-06, | |
| "loss": 0.9423, | |
| "mean_token_accuracy": 0.7350625194609165, | |
| "num_tokens": 19541127.0, | |
| "step": 2505 | |
| }, | |
| { | |
| "entropy": 1.1573872964829206, | |
| "epoch": 0.6569068455058478, | |
| "grad_norm": 0.3416019678115845, | |
| "learning_rate": 6.867312221931433e-06, | |
| "loss": 1.1634, | |
| "mean_token_accuracy": 0.7034627720713615, | |
| "num_tokens": 19577837.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "entropy": 1.08112654350698, | |
| "epoch": 0.6582154248793654, | |
| "grad_norm": 0.5420477390289307, | |
| "learning_rate": 6.841141062549072e-06, | |
| "loss": 1.016, | |
| "mean_token_accuracy": 0.7271684132516384, | |
| "num_tokens": 19608622.0, | |
| "step": 2515 | |
| }, | |
| { | |
| "entropy": 0.9906784903258086, | |
| "epoch": 0.6595240042528829, | |
| "grad_norm": 0.46363091468811035, | |
| "learning_rate": 6.81496990316671e-06, | |
| "loss": 0.9497, | |
| "mean_token_accuracy": 0.7543116796761751, | |
| "num_tokens": 19643843.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 1.001466591656208, | |
| "epoch": 0.6608325836264006, | |
| "grad_norm": 0.37773942947387695, | |
| "learning_rate": 6.78879874378435e-06, | |
| "loss": 0.9092, | |
| "mean_token_accuracy": 0.7434275582432747, | |
| "num_tokens": 19681055.0, | |
| "step": 2525 | |
| }, | |
| { | |
| "entropy": 1.1146674454212189, | |
| "epoch": 0.6621411629999182, | |
| "grad_norm": 0.33123865723609924, | |
| "learning_rate": 6.7626275844019895e-06, | |
| "loss": 1.1595, | |
| "mean_token_accuracy": 0.721651129052043, | |
| "num_tokens": 19722916.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "entropy": 1.0137363068759442, | |
| "epoch": 0.6634497423734358, | |
| "grad_norm": 0.2801375091075897, | |
| "learning_rate": 6.736456425019629e-06, | |
| "loss": 1.0089, | |
| "mean_token_accuracy": 0.7334360480308533, | |
| "num_tokens": 19764966.0, | |
| "step": 2535 | |
| }, | |
| { | |
| "entropy": 1.0639188002794981, | |
| "epoch": 0.6647583217469535, | |
| "grad_norm": 0.3247496485710144, | |
| "learning_rate": 6.710285265637269e-06, | |
| "loss": 1.0763, | |
| "mean_token_accuracy": 0.7175609428435564, | |
| "num_tokens": 19804025.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 1.1048081483691932, | |
| "epoch": 0.6660669011204711, | |
| "grad_norm": 0.4086282551288605, | |
| "learning_rate": 6.684114106254907e-06, | |
| "loss": 1.1058, | |
| "mean_token_accuracy": 0.7259872317314148, | |
| "num_tokens": 19843774.0, | |
| "step": 2545 | |
| }, | |
| { | |
| "entropy": 1.1255848862230777, | |
| "epoch": 0.6673754804939888, | |
| "grad_norm": 0.44439879059791565, | |
| "learning_rate": 6.657942946872546e-06, | |
| "loss": 1.0964, | |
| "mean_token_accuracy": 0.7162352904677391, | |
| "num_tokens": 19881825.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 1.1439194109290838, | |
| "epoch": 0.6686840598675063, | |
| "grad_norm": 0.29299938678741455, | |
| "learning_rate": 6.6317717874901865e-06, | |
| "loss": 1.1223, | |
| "mean_token_accuracy": 0.7133913572877646, | |
| "num_tokens": 19921618.0, | |
| "step": 2555 | |
| }, | |
| { | |
| "entropy": 1.029699081927538, | |
| "epoch": 0.6699926392410239, | |
| "grad_norm": 0.40280821919441223, | |
| "learning_rate": 6.605600628107826e-06, | |
| "loss": 0.9621, | |
| "mean_token_accuracy": 0.7391855999827385, | |
| "num_tokens": 19960586.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 1.003632688894868, | |
| "epoch": 0.6713012186145416, | |
| "grad_norm": 0.2862677276134491, | |
| "learning_rate": 6.579429468725466e-06, | |
| "loss": 0.9738, | |
| "mean_token_accuracy": 0.7375729866325855, | |
| "num_tokens": 20000547.0, | |
| "step": 2565 | |
| }, | |
| { | |
| "entropy": 1.1057576406747103, | |
| "epoch": 0.6726097979880592, | |
| "grad_norm": 0.28728872537612915, | |
| "learning_rate": 6.553258309343105e-06, | |
| "loss": 1.0678, | |
| "mean_token_accuracy": 0.7252844780683517, | |
| "num_tokens": 20045934.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "entropy": 1.0278929378837347, | |
| "epoch": 0.6739183773615769, | |
| "grad_norm": 0.3557819724082947, | |
| "learning_rate": 6.527087149960743e-06, | |
| "loss": 0.9804, | |
| "mean_token_accuracy": 0.7362292345613242, | |
| "num_tokens": 20088247.0, | |
| "step": 2575 | |
| }, | |
| { | |
| "entropy": 1.0297599412500857, | |
| "epoch": 0.6752269567350945, | |
| "grad_norm": 0.3730546534061432, | |
| "learning_rate": 6.500915990578383e-06, | |
| "loss": 1.0108, | |
| "mean_token_accuracy": 0.7349158681929111, | |
| "num_tokens": 20130988.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 1.0113672140985728, | |
| "epoch": 0.6765355361086121, | |
| "grad_norm": 0.3825022876262665, | |
| "learning_rate": 6.474744831196023e-06, | |
| "loss": 0.9893, | |
| "mean_token_accuracy": 0.7319887422025204, | |
| "num_tokens": 20169034.0, | |
| "step": 2585 | |
| }, | |
| { | |
| "entropy": 1.1008230350911616, | |
| "epoch": 0.6778441154821297, | |
| "grad_norm": 0.5840580463409424, | |
| "learning_rate": 6.448573671813662e-06, | |
| "loss": 1.0848, | |
| "mean_token_accuracy": 0.716325681656599, | |
| "num_tokens": 20206269.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "entropy": 1.0766204059123994, | |
| "epoch": 0.6791526948556473, | |
| "grad_norm": 0.4373389482498169, | |
| "learning_rate": 6.422402512431302e-06, | |
| "loss": 1.0217, | |
| "mean_token_accuracy": 0.7277089573442936, | |
| "num_tokens": 20241866.0, | |
| "step": 2595 | |
| }, | |
| { | |
| "entropy": 1.0236305307596922, | |
| "epoch": 0.6804612742291649, | |
| "grad_norm": 0.6429861783981323, | |
| "learning_rate": 6.39623135304894e-06, | |
| "loss": 0.9714, | |
| "mean_token_accuracy": 0.7359235771000385, | |
| "num_tokens": 20279405.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.6804612742291649, | |
| "eval_entropy": 1.0339577412605285, | |
| "eval_loss": 1.03507399559021, | |
| "eval_mean_token_accuracy": 0.7294820728302002, | |
| "eval_num_tokens": 20279405.0, | |
| "eval_runtime": 97.2889, | |
| "eval_samples_per_second": 10.279, | |
| "eval_steps_per_second": 1.285, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 1.1064529620110988, | |
| "epoch": 0.6817698536026826, | |
| "grad_norm": 0.6711133718490601, | |
| "learning_rate": 6.3700601936665795e-06, | |
| "loss": 1.0578, | |
| "mean_token_accuracy": 0.7143168658018112, | |
| "num_tokens": 20318358.0, | |
| "step": 2605 | |
| }, | |
| { | |
| "entropy": 1.0551523761823773, | |
| "epoch": 0.6830784329762002, | |
| "grad_norm": 0.6876797080039978, | |
| "learning_rate": 6.3438890342842196e-06, | |
| "loss": 1.0351, | |
| "mean_token_accuracy": 0.7304907165467739, | |
| "num_tokens": 20355919.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "entropy": 1.0216753158718348, | |
| "epoch": 0.6843870123497179, | |
| "grad_norm": 0.43939870595932007, | |
| "learning_rate": 6.317717874901859e-06, | |
| "loss": 0.9788, | |
| "mean_token_accuracy": 0.7326975151896477, | |
| "num_tokens": 20395739.0, | |
| "step": 2615 | |
| }, | |
| { | |
| "entropy": 1.0042755860835313, | |
| "epoch": 0.6856955917232355, | |
| "grad_norm": 0.45771950483322144, | |
| "learning_rate": 6.291546715519498e-06, | |
| "loss": 1.0214, | |
| "mean_token_accuracy": 0.7377393446862698, | |
| "num_tokens": 20439063.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 1.08444495908916, | |
| "epoch": 0.687004171096753, | |
| "grad_norm": 0.328112930059433, | |
| "learning_rate": 6.265375556137138e-06, | |
| "loss": 1.0578, | |
| "mean_token_accuracy": 0.7258397076278925, | |
| "num_tokens": 20480446.0, | |
| "step": 2625 | |
| }, | |
| { | |
| "entropy": 1.053849907964468, | |
| "epoch": 0.6883127504702707, | |
| "grad_norm": 0.4496280550956726, | |
| "learning_rate": 6.2392043967547764e-06, | |
| "loss": 1.0282, | |
| "mean_token_accuracy": 0.7329249262809754, | |
| "num_tokens": 20519257.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "entropy": 1.0985306035727262, | |
| "epoch": 0.6896213298437883, | |
| "grad_norm": 0.43236038088798523, | |
| "learning_rate": 6.213033237372416e-06, | |
| "loss": 1.0809, | |
| "mean_token_accuracy": 0.7229348205029964, | |
| "num_tokens": 20560703.0, | |
| "step": 2635 | |
| }, | |
| { | |
| "entropy": 1.039340978488326, | |
| "epoch": 0.690929909217306, | |
| "grad_norm": 0.592126727104187, | |
| "learning_rate": 6.186862077990056e-06, | |
| "loss": 0.9885, | |
| "mean_token_accuracy": 0.7413885131478309, | |
| "num_tokens": 20602363.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 1.040134682506323, | |
| "epoch": 0.6922384885908236, | |
| "grad_norm": 0.5198757648468018, | |
| "learning_rate": 6.160690918607695e-06, | |
| "loss": 1.0226, | |
| "mean_token_accuracy": 0.7368385374546051, | |
| "num_tokens": 20639437.0, | |
| "step": 2645 | |
| }, | |
| { | |
| "entropy": 1.1080417584627866, | |
| "epoch": 0.6935470679643412, | |
| "grad_norm": 0.2894703149795532, | |
| "learning_rate": 6.134519759225335e-06, | |
| "loss": 1.1345, | |
| "mean_token_accuracy": 0.7155061598867178, | |
| "num_tokens": 20680275.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 1.0198019824922084, | |
| "epoch": 0.6948556473378589, | |
| "grad_norm": 0.3462557792663574, | |
| "learning_rate": 6.108348599842973e-06, | |
| "loss": 1.0143, | |
| "mean_token_accuracy": 0.7402982845902443, | |
| "num_tokens": 20718417.0, | |
| "step": 2655 | |
| }, | |
| { | |
| "entropy": 1.0144409965723753, | |
| "epoch": 0.6961642267113765, | |
| "grad_norm": 0.3350953757762909, | |
| "learning_rate": 6.082177440460613e-06, | |
| "loss": 0.9838, | |
| "mean_token_accuracy": 0.7343977279961109, | |
| "num_tokens": 20762227.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 1.0715901810675859, | |
| "epoch": 0.697472806084894, | |
| "grad_norm": 0.6075988411903381, | |
| "learning_rate": 6.056006281078253e-06, | |
| "loss": 1.0055, | |
| "mean_token_accuracy": 0.7305215656757355, | |
| "num_tokens": 20798122.0, | |
| "step": 2665 | |
| }, | |
| { | |
| "entropy": 1.1007160846143962, | |
| "epoch": 0.6987813854584117, | |
| "grad_norm": 0.4501047134399414, | |
| "learning_rate": 6.029835121695892e-06, | |
| "loss": 1.058, | |
| "mean_token_accuracy": 0.7208323903381825, | |
| "num_tokens": 20834210.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "entropy": 1.0522528853267432, | |
| "epoch": 0.7000899648319293, | |
| "grad_norm": 0.36250752210617065, | |
| "learning_rate": 6.00366396231353e-06, | |
| "loss": 0.9972, | |
| "mean_token_accuracy": 0.7273079171776772, | |
| "num_tokens": 20872418.0, | |
| "step": 2675 | |
| }, | |
| { | |
| "entropy": 1.054316360130906, | |
| "epoch": 0.701398544205447, | |
| "grad_norm": 0.3319310247898102, | |
| "learning_rate": 5.977492802931171e-06, | |
| "loss": 1.0515, | |
| "mean_token_accuracy": 0.7271708060055971, | |
| "num_tokens": 20911504.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 1.0553948923945426, | |
| "epoch": 0.7027071235789646, | |
| "grad_norm": 0.46403029561042786, | |
| "learning_rate": 5.9513216435488095e-06, | |
| "loss": 1.0058, | |
| "mean_token_accuracy": 0.733677176013589, | |
| "num_tokens": 20947438.0, | |
| "step": 2685 | |
| }, | |
| { | |
| "entropy": 1.135214940458536, | |
| "epoch": 0.7040157029524822, | |
| "grad_norm": 0.2997356355190277, | |
| "learning_rate": 5.925150484166449e-06, | |
| "loss": 1.0205, | |
| "mean_token_accuracy": 0.7149875804781913, | |
| "num_tokens": 20984843.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "entropy": 1.067294117808342, | |
| "epoch": 0.7053242823259999, | |
| "grad_norm": 0.7109698057174683, | |
| "learning_rate": 5.898979324784089e-06, | |
| "loss": 1.0424, | |
| "mean_token_accuracy": 0.7298292949795723, | |
| "num_tokens": 21024317.0, | |
| "step": 2695 | |
| }, | |
| { | |
| "entropy": 1.0000967428088188, | |
| "epoch": 0.7066328616995174, | |
| "grad_norm": 0.4540089964866638, | |
| "learning_rate": 5.872808165401728e-06, | |
| "loss": 1.0007, | |
| "mean_token_accuracy": 0.734975878149271, | |
| "num_tokens": 21064116.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.7066328616995174, | |
| "eval_entropy": 1.033733127117157, | |
| "eval_loss": 1.0336334705352783, | |
| "eval_mean_token_accuracy": 0.7298680348396301, | |
| "eval_num_tokens": 21064116.0, | |
| "eval_runtime": 97.2545, | |
| "eval_samples_per_second": 10.282, | |
| "eval_steps_per_second": 1.285, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 1.0902955643832684, | |
| "epoch": 0.7079414410730351, | |
| "grad_norm": 0.4074270725250244, | |
| "learning_rate": 5.846637006019366e-06, | |
| "loss": 1.103, | |
| "mean_token_accuracy": 0.7196738637983799, | |
| "num_tokens": 21106096.0, | |
| "step": 2705 | |
| }, | |
| { | |
| "entropy": 0.9893661700189114, | |
| "epoch": 0.7092500204465527, | |
| "grad_norm": 0.43556877970695496, | |
| "learning_rate": 5.8204658466370065e-06, | |
| "loss": 0.9469, | |
| "mean_token_accuracy": 0.7477334760129452, | |
| "num_tokens": 21143151.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "entropy": 1.0833146207034587, | |
| "epoch": 0.7105585998200703, | |
| "grad_norm": 0.6449909806251526, | |
| "learning_rate": 5.794294687254646e-06, | |
| "loss": 1.0748, | |
| "mean_token_accuracy": 0.7204306960105896, | |
| "num_tokens": 21179840.0, | |
| "step": 2715 | |
| }, | |
| { | |
| "entropy": 1.0693892493844033, | |
| "epoch": 0.711867179193588, | |
| "grad_norm": 0.2570124864578247, | |
| "learning_rate": 5.768123527872285e-06, | |
| "loss": 1.0456, | |
| "mean_token_accuracy": 0.722801336273551, | |
| "num_tokens": 21217194.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 1.0765843842178584, | |
| "epoch": 0.7131757585671056, | |
| "grad_norm": 0.5316287279129028, | |
| "learning_rate": 5.741952368489925e-06, | |
| "loss": 1.0263, | |
| "mean_token_accuracy": 0.7246413186192513, | |
| "num_tokens": 21251605.0, | |
| "step": 2725 | |
| }, | |
| { | |
| "entropy": 1.0620105125010013, | |
| "epoch": 0.7144843379406232, | |
| "grad_norm": 0.3342624306678772, | |
| "learning_rate": 5.715781209107563e-06, | |
| "loss": 1.0065, | |
| "mean_token_accuracy": 0.7301050588488579, | |
| "num_tokens": 21294871.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "entropy": 1.0648996368050576, | |
| "epoch": 0.7157929173141409, | |
| "grad_norm": 0.5223391056060791, | |
| "learning_rate": 5.689610049725203e-06, | |
| "loss": 1.0258, | |
| "mean_token_accuracy": 0.7259559325873852, | |
| "num_tokens": 21326670.0, | |
| "step": 2735 | |
| }, | |
| { | |
| "entropy": 1.0742478474974633, | |
| "epoch": 0.7171014966876584, | |
| "grad_norm": 0.5522735118865967, | |
| "learning_rate": 5.663438890342843e-06, | |
| "loss": 1.0703, | |
| "mean_token_accuracy": 0.7225585378706455, | |
| "num_tokens": 21367325.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 0.9710161112248897, | |
| "epoch": 0.7184100760611761, | |
| "grad_norm": 0.33066603541374207, | |
| "learning_rate": 5.637267730960482e-06, | |
| "loss": 0.9442, | |
| "mean_token_accuracy": 0.7384205959737301, | |
| "num_tokens": 21408334.0, | |
| "step": 2745 | |
| }, | |
| { | |
| "entropy": 1.0664944287389517, | |
| "epoch": 0.7197186554346937, | |
| "grad_norm": 0.4889247417449951, | |
| "learning_rate": 5.611096571578122e-06, | |
| "loss": 1.112, | |
| "mean_token_accuracy": 0.7264700572937727, | |
| "num_tokens": 21449839.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 1.0030995801091194, | |
| "epoch": 0.7210272348082113, | |
| "grad_norm": 0.3732108771800995, | |
| "learning_rate": 5.584925412195761e-06, | |
| "loss": 0.9111, | |
| "mean_token_accuracy": 0.7403148956596851, | |
| "num_tokens": 21487742.0, | |
| "step": 2755 | |
| }, | |
| { | |
| "entropy": 1.0033697545528413, | |
| "epoch": 0.722335814181729, | |
| "grad_norm": 0.43490299582481384, | |
| "learning_rate": 5.5587542528133995e-06, | |
| "loss": 0.9582, | |
| "mean_token_accuracy": 0.7397238194942475, | |
| "num_tokens": 21526844.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 1.1253223221749067, | |
| "epoch": 0.7236443935552466, | |
| "grad_norm": 0.46164193749427795, | |
| "learning_rate": 5.5325830934310396e-06, | |
| "loss": 1.1347, | |
| "mean_token_accuracy": 0.7200637456029654, | |
| "num_tokens": 21568539.0, | |
| "step": 2765 | |
| }, | |
| { | |
| "entropy": 1.0714545018970967, | |
| "epoch": 0.7249529729287643, | |
| "grad_norm": 0.39674556255340576, | |
| "learning_rate": 5.506411934048679e-06, | |
| "loss": 1.0253, | |
| "mean_token_accuracy": 0.7276010327041149, | |
| "num_tokens": 21603326.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "entropy": 1.06046348772943, | |
| "epoch": 0.7262615523022818, | |
| "grad_norm": 0.4999883770942688, | |
| "learning_rate": 5.480240774666318e-06, | |
| "loss": 1.0495, | |
| "mean_token_accuracy": 0.7262998007237911, | |
| "num_tokens": 21642529.0, | |
| "step": 2775 | |
| }, | |
| { | |
| "entropy": 1.0288800846785306, | |
| "epoch": 0.7275701316757994, | |
| "grad_norm": 0.45190781354904175, | |
| "learning_rate": 5.454069615283958e-06, | |
| "loss": 1.0062, | |
| "mean_token_accuracy": 0.7358099550008774, | |
| "num_tokens": 21675950.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 1.064651170000434, | |
| "epoch": 0.7288787110493171, | |
| "grad_norm": 0.6091992259025574, | |
| "learning_rate": 5.4278984559015964e-06, | |
| "loss": 1.0302, | |
| "mean_token_accuracy": 0.7322472549974919, | |
| "num_tokens": 21712389.0, | |
| "step": 2785 | |
| }, | |
| { | |
| "entropy": 1.067555221542716, | |
| "epoch": 0.7301872904228347, | |
| "grad_norm": 0.3587604761123657, | |
| "learning_rate": 5.401727296519236e-06, | |
| "loss": 1.0269, | |
| "mean_token_accuracy": 0.7269019264727831, | |
| "num_tokens": 21750233.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "entropy": 1.0527180466800927, | |
| "epoch": 0.7314958697963523, | |
| "grad_norm": 0.6426623463630676, | |
| "learning_rate": 5.375556137136876e-06, | |
| "loss": 1.045, | |
| "mean_token_accuracy": 0.7300838761031627, | |
| "num_tokens": 21788770.0, | |
| "step": 2795 | |
| }, | |
| { | |
| "entropy": 1.115583229251206, | |
| "epoch": 0.73280444916987, | |
| "grad_norm": 0.5922237634658813, | |
| "learning_rate": 5.349384977754515e-06, | |
| "loss": 1.0523, | |
| "mean_token_accuracy": 0.7175281938165426, | |
| "num_tokens": 21826397.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.73280444916987, | |
| "eval_entropy": 1.0408368062973024, | |
| "eval_loss": 1.0325729846954346, | |
| "eval_mean_token_accuracy": 0.73026052236557, | |
| "eval_num_tokens": 21826397.0, | |
| "eval_runtime": 97.1897, | |
| "eval_samples_per_second": 10.289, | |
| "eval_steps_per_second": 1.286, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 1.1054367668926717, | |
| "epoch": 0.7341130285433876, | |
| "grad_norm": 0.5465598106384277, | |
| "learning_rate": 5.323213818372154e-06, | |
| "loss": 1.08, | |
| "mean_token_accuracy": 0.720421589165926, | |
| "num_tokens": 21863424.0, | |
| "step": 2805 | |
| }, | |
| { | |
| "entropy": 1.1147074215114117, | |
| "epoch": 0.7354216079169053, | |
| "grad_norm": 0.4170718193054199, | |
| "learning_rate": 5.297042658989794e-06, | |
| "loss": 1.0508, | |
| "mean_token_accuracy": 0.7232054278254509, | |
| "num_tokens": 21900462.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "entropy": 1.0979057766497136, | |
| "epoch": 0.7367301872904228, | |
| "grad_norm": 0.4706130027770996, | |
| "learning_rate": 5.270871499607433e-06, | |
| "loss": 1.0168, | |
| "mean_token_accuracy": 0.719353062659502, | |
| "num_tokens": 21933354.0, | |
| "step": 2815 | |
| }, | |
| { | |
| "entropy": 0.9744918283075095, | |
| "epoch": 0.7380387666639404, | |
| "grad_norm": 0.5279991626739502, | |
| "learning_rate": 5.244700340225073e-06, | |
| "loss": 0.9217, | |
| "mean_token_accuracy": 0.7506507318466902, | |
| "num_tokens": 21970100.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 1.0728078037500381, | |
| "epoch": 0.7393473460374581, | |
| "grad_norm": 0.5053285956382751, | |
| "learning_rate": 5.218529180842712e-06, | |
| "loss": 1.0283, | |
| "mean_token_accuracy": 0.7311844080686569, | |
| "num_tokens": 22004093.0, | |
| "step": 2825 | |
| }, | |
| { | |
| "entropy": 1.0765360169112683, | |
| "epoch": 0.7406559254109757, | |
| "grad_norm": 0.36004361510276794, | |
| "learning_rate": 5.192358021460351e-06, | |
| "loss": 1.0092, | |
| "mean_token_accuracy": 0.7210248097777366, | |
| "num_tokens": 22045336.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "entropy": 1.017987198382616, | |
| "epoch": 0.7419645047844934, | |
| "grad_norm": 0.36846044659614563, | |
| "learning_rate": 5.166186862077991e-06, | |
| "loss": 0.9839, | |
| "mean_token_accuracy": 0.7387872941792011, | |
| "num_tokens": 22086534.0, | |
| "step": 2835 | |
| }, | |
| { | |
| "entropy": 1.0489673603326082, | |
| "epoch": 0.743273084158011, | |
| "grad_norm": 0.46179378032684326, | |
| "learning_rate": 5.1400157026956295e-06, | |
| "loss": 0.9415, | |
| "mean_token_accuracy": 0.7353867895901203, | |
| "num_tokens": 22121230.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 1.1243290316313506, | |
| "epoch": 0.7445816635315285, | |
| "grad_norm": 0.5417360663414001, | |
| "learning_rate": 5.113844543313269e-06, | |
| "loss": 1.0578, | |
| "mean_token_accuracy": 0.7204208463430405, | |
| "num_tokens": 22159032.0, | |
| "step": 2845 | |
| }, | |
| { | |
| "entropy": 1.0939917907118797, | |
| "epoch": 0.7458902429050462, | |
| "grad_norm": 0.48897913098335266, | |
| "learning_rate": 5.087673383930909e-06, | |
| "loss": 1.0063, | |
| "mean_token_accuracy": 0.7165301315486431, | |
| "num_tokens": 22196955.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 1.0858067644760012, | |
| "epoch": 0.7471988222785638, | |
| "grad_norm": 0.5564597845077515, | |
| "learning_rate": 5.061502224548548e-06, | |
| "loss": 1.0764, | |
| "mean_token_accuracy": 0.7208572022616864, | |
| "num_tokens": 22234201.0, | |
| "step": 2855 | |
| }, | |
| { | |
| "entropy": 1.0590477578341961, | |
| "epoch": 0.7485074016520814, | |
| "grad_norm": 0.3768922686576843, | |
| "learning_rate": 5.035331065166187e-06, | |
| "loss": 0.9812, | |
| "mean_token_accuracy": 0.7291722763329744, | |
| "num_tokens": 22272779.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 1.1251448437571525, | |
| "epoch": 0.7498159810255991, | |
| "grad_norm": 0.3560837507247925, | |
| "learning_rate": 5.009159905783827e-06, | |
| "loss": 1.0892, | |
| "mean_token_accuracy": 0.7106486916542053, | |
| "num_tokens": 22308851.0, | |
| "step": 2865 | |
| }, | |
| { | |
| "entropy": 1.119718360155821, | |
| "epoch": 0.7511245603991167, | |
| "grad_norm": 0.5380016565322876, | |
| "learning_rate": 4.982988746401466e-06, | |
| "loss": 1.0983, | |
| "mean_token_accuracy": 0.7206918202340603, | |
| "num_tokens": 22343884.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "entropy": 1.0894725158810616, | |
| "epoch": 0.7524331397726344, | |
| "grad_norm": 0.3485453426837921, | |
| "learning_rate": 4.956817587019106e-06, | |
| "loss": 1.0491, | |
| "mean_token_accuracy": 0.7222230531275272, | |
| "num_tokens": 22384017.0, | |
| "step": 2875 | |
| }, | |
| { | |
| "entropy": 1.0093999858945608, | |
| "epoch": 0.753741719146152, | |
| "grad_norm": 0.4127729535102844, | |
| "learning_rate": 4.930646427636745e-06, | |
| "loss": 0.9502, | |
| "mean_token_accuracy": 0.7418296955525875, | |
| "num_tokens": 22418122.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 1.1371208924800158, | |
| "epoch": 0.7550502985196695, | |
| "grad_norm": 0.43648311495780945, | |
| "learning_rate": 4.904475268254384e-06, | |
| "loss": 1.1494, | |
| "mean_token_accuracy": 0.7121193654835224, | |
| "num_tokens": 22459009.0, | |
| "step": 2885 | |
| }, | |
| { | |
| "entropy": 1.0378689534962178, | |
| "epoch": 0.7563588778931872, | |
| "grad_norm": 0.44997143745422363, | |
| "learning_rate": 4.878304108872023e-06, | |
| "loss": 0.9618, | |
| "mean_token_accuracy": 0.7308846026659012, | |
| "num_tokens": 22493139.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "entropy": 1.0220247332006693, | |
| "epoch": 0.7576674572667048, | |
| "grad_norm": 0.561329185962677, | |
| "learning_rate": 4.852132949489663e-06, | |
| "loss": 0.9283, | |
| "mean_token_accuracy": 0.7339823555201292, | |
| "num_tokens": 22529225.0, | |
| "step": 2895 | |
| }, | |
| { | |
| "entropy": 1.0556524885818361, | |
| "epoch": 0.7589760366402225, | |
| "grad_norm": 0.3141125738620758, | |
| "learning_rate": 4.825961790107302e-06, | |
| "loss": 1.0225, | |
| "mean_token_accuracy": 0.732053418457508, | |
| "num_tokens": 22565967.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.7589760366402225, | |
| "eval_entropy": 1.0340708026885985, | |
| "eval_loss": 1.0317820310592651, | |
| "eval_mean_token_accuracy": 0.7304344477653504, | |
| "eval_num_tokens": 22565967.0, | |
| "eval_runtime": 97.2034, | |
| "eval_samples_per_second": 10.288, | |
| "eval_steps_per_second": 1.286, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 1.0628287993371486, | |
| "epoch": 0.7602846160137401, | |
| "grad_norm": 0.5528143048286438, | |
| "learning_rate": 4.799790630724941e-06, | |
| "loss": 1.0052, | |
| "mean_token_accuracy": 0.7236215595155955, | |
| "num_tokens": 22605849.0, | |
| "step": 2905 | |
| }, | |
| { | |
| "entropy": 1.0544053295627236, | |
| "epoch": 0.7615931953872577, | |
| "grad_norm": 0.5025759339332581, | |
| "learning_rate": 4.773619471342581e-06, | |
| "loss": 0.995, | |
| "mean_token_accuracy": 0.7306290067732334, | |
| "num_tokens": 22647439.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "entropy": 1.0855362933129071, | |
| "epoch": 0.7629017747607754, | |
| "grad_norm": 0.5423142910003662, | |
| "learning_rate": 4.74744831196022e-06, | |
| "loss": 1.0644, | |
| "mean_token_accuracy": 0.729085548222065, | |
| "num_tokens": 22686489.0, | |
| "step": 2915 | |
| }, | |
| { | |
| "entropy": 1.1064934462308884, | |
| "epoch": 0.764210354134293, | |
| "grad_norm": 0.324994295835495, | |
| "learning_rate": 4.7212771525778596e-06, | |
| "loss": 1.0572, | |
| "mean_token_accuracy": 0.7155943371355533, | |
| "num_tokens": 22729097.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 1.0514110021293164, | |
| "epoch": 0.7655189335078105, | |
| "grad_norm": 0.48154741525650024, | |
| "learning_rate": 4.695105993195499e-06, | |
| "loss": 0.9801, | |
| "mean_token_accuracy": 0.728981614857912, | |
| "num_tokens": 22767464.0, | |
| "step": 2925 | |
| }, | |
| { | |
| "entropy": 1.0362781658768654, | |
| "epoch": 0.7668275128813282, | |
| "grad_norm": 0.5839325189590454, | |
| "learning_rate": 4.668934833813139e-06, | |
| "loss": 0.9951, | |
| "mean_token_accuracy": 0.7349088959395885, | |
| "num_tokens": 22806605.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "entropy": 1.0683909840881824, | |
| "epoch": 0.7681360922548458, | |
| "grad_norm": 0.37387317419052124, | |
| "learning_rate": 4.642763674430777e-06, | |
| "loss": 1.0933, | |
| "mean_token_accuracy": 0.7265824314206839, | |
| "num_tokens": 22844597.0, | |
| "step": 2935 | |
| }, | |
| { | |
| "entropy": 1.1125381592661143, | |
| "epoch": 0.7694446716283635, | |
| "grad_norm": 0.6262072324752808, | |
| "learning_rate": 4.616592515048417e-06, | |
| "loss": 1.0776, | |
| "mean_token_accuracy": 0.7169565990567207, | |
| "num_tokens": 22883020.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 1.1716872084885837, | |
| "epoch": 0.7707532510018811, | |
| "grad_norm": 0.4011521637439728, | |
| "learning_rate": 4.5904213556660565e-06, | |
| "loss": 1.1654, | |
| "mean_token_accuracy": 0.7077478947117924, | |
| "num_tokens": 22922580.0, | |
| "step": 2945 | |
| }, | |
| { | |
| "entropy": 1.0219284979626537, | |
| "epoch": 0.7720618303753987, | |
| "grad_norm": 0.6057654619216919, | |
| "learning_rate": 4.564250196283696e-06, | |
| "loss": 1.0102, | |
| "mean_token_accuracy": 0.7341908976435662, | |
| "num_tokens": 22961570.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 1.045608853548765, | |
| "epoch": 0.7733704097489164, | |
| "grad_norm": 0.5418355464935303, | |
| "learning_rate": 4.538079036901335e-06, | |
| "loss": 0.9675, | |
| "mean_token_accuracy": 0.731929623708129, | |
| "num_tokens": 22998884.0, | |
| "step": 2955 | |
| }, | |
| { | |
| "entropy": 1.1297316145151854, | |
| "epoch": 0.7746789891224339, | |
| "grad_norm": 0.469908207654953, | |
| "learning_rate": 4.511907877518974e-06, | |
| "loss": 1.1027, | |
| "mean_token_accuracy": 0.7112006828188896, | |
| "num_tokens": 23036097.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 1.0829615991562604, | |
| "epoch": 0.7759875684959516, | |
| "grad_norm": 0.42076605558395386, | |
| "learning_rate": 4.485736718136614e-06, | |
| "loss": 0.9541, | |
| "mean_token_accuracy": 0.7311103120446205, | |
| "num_tokens": 23071198.0, | |
| "step": 2965 | |
| }, | |
| { | |
| "entropy": 1.0218299932777881, | |
| "epoch": 0.7772961478694692, | |
| "grad_norm": 0.6354956030845642, | |
| "learning_rate": 4.459565558754253e-06, | |
| "loss": 0.94, | |
| "mean_token_accuracy": 0.7375240176916122, | |
| "num_tokens": 23105690.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "entropy": 1.0296128673478961, | |
| "epoch": 0.7786047272429868, | |
| "grad_norm": 0.5609083771705627, | |
| "learning_rate": 4.433394399371893e-06, | |
| "loss": 0.9641, | |
| "mean_token_accuracy": 0.7355793751776218, | |
| "num_tokens": 23149341.0, | |
| "step": 2975 | |
| }, | |
| { | |
| "entropy": 1.0164244651794434, | |
| "epoch": 0.7799133066165045, | |
| "grad_norm": 0.4903509318828583, | |
| "learning_rate": 4.407223239989532e-06, | |
| "loss": 0.9678, | |
| "mean_token_accuracy": 0.7360415138304234, | |
| "num_tokens": 23184633.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 1.0966936625540256, | |
| "epoch": 0.7812218859900221, | |
| "grad_norm": 0.39143356680870056, | |
| "learning_rate": 4.381052080607171e-06, | |
| "loss": 1.0806, | |
| "mean_token_accuracy": 0.7219570934772491, | |
| "num_tokens": 23221750.0, | |
| "step": 2985 | |
| }, | |
| { | |
| "entropy": 1.042829731106758, | |
| "epoch": 0.7825304653635398, | |
| "grad_norm": 0.4407028257846832, | |
| "learning_rate": 4.35488092122481e-06, | |
| "loss": 1.0816, | |
| "mean_token_accuracy": 0.7296155303716659, | |
| "num_tokens": 23263629.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "entropy": 1.067452398315072, | |
| "epoch": 0.7838390447370573, | |
| "grad_norm": 0.4685254991054535, | |
| "learning_rate": 4.32870976184245e-06, | |
| "loss": 1.0394, | |
| "mean_token_accuracy": 0.7196490127593279, | |
| "num_tokens": 23299548.0, | |
| "step": 2995 | |
| }, | |
| { | |
| "entropy": 1.0206131491810084, | |
| "epoch": 0.7851476241105749, | |
| "grad_norm": 0.4271470010280609, | |
| "learning_rate": 4.30253860246009e-06, | |
| "loss": 0.9486, | |
| "mean_token_accuracy": 0.7346534051001072, | |
| "num_tokens": 23340920.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.7851476241105749, | |
| "eval_entropy": 1.0305686144828796, | |
| "eval_loss": 1.03084135055542, | |
| "eval_mean_token_accuracy": 0.7306465420722962, | |
| "eval_num_tokens": 23340920.0, | |
| "eval_runtime": 97.1158, | |
| "eval_samples_per_second": 10.297, | |
| "eval_steps_per_second": 1.287, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.0277832087129355, | |
| "epoch": 0.7864562034840926, | |
| "grad_norm": 0.44500380754470825, | |
| "learning_rate": 4.276367443077729e-06, | |
| "loss": 0.987, | |
| "mean_token_accuracy": 0.7308167792856693, | |
| "num_tokens": 23381557.0, | |
| "step": 3005 | |
| }, | |
| { | |
| "entropy": 1.086503054201603, | |
| "epoch": 0.7877647828576102, | |
| "grad_norm": 0.3013031780719757, | |
| "learning_rate": 4.250196283695368e-06, | |
| "loss": 1.1165, | |
| "mean_token_accuracy": 0.721335730701685, | |
| "num_tokens": 23422282.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "entropy": 1.1198953952640296, | |
| "epoch": 0.7890733622311278, | |
| "grad_norm": 0.5127637982368469, | |
| "learning_rate": 4.224025124313007e-06, | |
| "loss": 1.0667, | |
| "mean_token_accuracy": 0.716464214771986, | |
| "num_tokens": 23462818.0, | |
| "step": 3015 | |
| }, | |
| { | |
| "entropy": 1.0935983188450336, | |
| "epoch": 0.7903819416046455, | |
| "grad_norm": 0.7555818557739258, | |
| "learning_rate": 4.1978539649306465e-06, | |
| "loss": 1.0789, | |
| "mean_token_accuracy": 0.7195366825908422, | |
| "num_tokens": 23500331.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 1.092792271077633, | |
| "epoch": 0.7916905209781631, | |
| "grad_norm": 0.37865763902664185, | |
| "learning_rate": 4.171682805548286e-06, | |
| "loss": 1.0283, | |
| "mean_token_accuracy": 0.7241503912955523, | |
| "num_tokens": 23537976.0, | |
| "step": 3025 | |
| }, | |
| { | |
| "entropy": 1.0759663056582212, | |
| "epoch": 0.7929991003516808, | |
| "grad_norm": 0.5228607058525085, | |
| "learning_rate": 4.145511646165926e-06, | |
| "loss": 1.0736, | |
| "mean_token_accuracy": 0.7280153460800648, | |
| "num_tokens": 23581747.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "entropy": 1.1236386895179749, | |
| "epoch": 0.7943076797251983, | |
| "grad_norm": 0.6752633452415466, | |
| "learning_rate": 4.119340486783565e-06, | |
| "loss": 1.0768, | |
| "mean_token_accuracy": 0.7147096544504166, | |
| "num_tokens": 23615855.0, | |
| "step": 3035 | |
| }, | |
| { | |
| "entropy": 1.1543139606714248, | |
| "epoch": 0.7956162590987159, | |
| "grad_norm": 0.402425616979599, | |
| "learning_rate": 4.093169327401204e-06, | |
| "loss": 1.1301, | |
| "mean_token_accuracy": 0.7124893587082625, | |
| "num_tokens": 23653681.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 1.1155839897692204, | |
| "epoch": 0.7969248384722336, | |
| "grad_norm": 0.3531131148338318, | |
| "learning_rate": 4.066998168018843e-06, | |
| "loss": 1.1011, | |
| "mean_token_accuracy": 0.7147599868476391, | |
| "num_tokens": 23691707.0, | |
| "step": 3045 | |
| }, | |
| { | |
| "entropy": 1.0141445618122815, | |
| "epoch": 0.7982334178457512, | |
| "grad_norm": 0.8314666152000427, | |
| "learning_rate": 4.0408270086364835e-06, | |
| "loss": 0.9637, | |
| "mean_token_accuracy": 0.7401646625250577, | |
| "num_tokens": 23731289.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 1.1060345830395817, | |
| "epoch": 0.7995419972192689, | |
| "grad_norm": 0.39150726795196533, | |
| "learning_rate": 4.014655849254122e-06, | |
| "loss": 1.1086, | |
| "mean_token_accuracy": 0.7131407152861357, | |
| "num_tokens": 23769110.0, | |
| "step": 3055 | |
| }, | |
| { | |
| "entropy": 1.1247680716216564, | |
| "epoch": 0.8008505765927865, | |
| "grad_norm": 0.393403023481369, | |
| "learning_rate": 3.988484689871762e-06, | |
| "loss": 1.0878, | |
| "mean_token_accuracy": 0.7203011158853769, | |
| "num_tokens": 23805247.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 1.0936600599437951, | |
| "epoch": 0.802159155966304, | |
| "grad_norm": 0.40403568744659424, | |
| "learning_rate": 3.962313530489401e-06, | |
| "loss": 1.0812, | |
| "mean_token_accuracy": 0.7204918511211872, | |
| "num_tokens": 23846787.0, | |
| "step": 3065 | |
| }, | |
| { | |
| "entropy": 1.1725836582481861, | |
| "epoch": 0.8034677353398217, | |
| "grad_norm": 0.4385111629962921, | |
| "learning_rate": 3.93614237110704e-06, | |
| "loss": 1.1878, | |
| "mean_token_accuracy": 0.7098725575953722, | |
| "num_tokens": 23884749.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "entropy": 1.056993416696787, | |
| "epoch": 0.8047763147133393, | |
| "grad_norm": 0.4194464087486267, | |
| "learning_rate": 3.90997121172468e-06, | |
| "loss": 1.0316, | |
| "mean_token_accuracy": 0.7291632607579231, | |
| "num_tokens": 23925854.0, | |
| "step": 3075 | |
| }, | |
| { | |
| "entropy": 1.122009427472949, | |
| "epoch": 0.8060848940868569, | |
| "grad_norm": 0.34683725237846375, | |
| "learning_rate": 3.883800052342319e-06, | |
| "loss": 1.0402, | |
| "mean_token_accuracy": 0.7219870649278164, | |
| "num_tokens": 23965584.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 0.982495572976768, | |
| "epoch": 0.8073934734603746, | |
| "grad_norm": 0.7075260281562805, | |
| "learning_rate": 3.857628892959959e-06, | |
| "loss": 0.9437, | |
| "mean_token_accuracy": 0.750491987913847, | |
| "num_tokens": 24004600.0, | |
| "step": 3085 | |
| }, | |
| { | |
| "entropy": 1.0386792987585067, | |
| "epoch": 0.8087020528338922, | |
| "grad_norm": 0.3854619562625885, | |
| "learning_rate": 3.831457733577597e-06, | |
| "loss": 0.9687, | |
| "mean_token_accuracy": 0.7319545924663544, | |
| "num_tokens": 24039464.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "entropy": 1.0844634655863046, | |
| "epoch": 0.8100106322074099, | |
| "grad_norm": 0.47892338037490845, | |
| "learning_rate": 3.8052865741952373e-06, | |
| "loss": 1.0465, | |
| "mean_token_accuracy": 0.7266070485115051, | |
| "num_tokens": 24076409.0, | |
| "step": 3095 | |
| }, | |
| { | |
| "entropy": 1.0820760104805232, | |
| "epoch": 0.8113192115809275, | |
| "grad_norm": 0.43651270866394043, | |
| "learning_rate": 3.7791154148128765e-06, | |
| "loss": 1.0007, | |
| "mean_token_accuracy": 0.7236479226499796, | |
| "num_tokens": 24118481.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.8113192115809275, | |
| "eval_entropy": 1.036030520915985, | |
| "eval_loss": 1.028921365737915, | |
| "eval_mean_token_accuracy": 0.7310774421691895, | |
| "eval_num_tokens": 24118481.0, | |
| "eval_runtime": 97.2385, | |
| "eval_samples_per_second": 10.284, | |
| "eval_steps_per_second": 1.285, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 1.1077489994466305, | |
| "epoch": 0.812627790954445, | |
| "grad_norm": 0.42022934556007385, | |
| "learning_rate": 3.7529442554305157e-06, | |
| "loss": 1.0661, | |
| "mean_token_accuracy": 0.7255024470388889, | |
| "num_tokens": 24158004.0, | |
| "step": 3105 | |
| }, | |
| { | |
| "entropy": 1.0772877372801304, | |
| "epoch": 0.8139363703279627, | |
| "grad_norm": 0.3519681990146637, | |
| "learning_rate": 3.7267730960481554e-06, | |
| "loss": 1.0428, | |
| "mean_token_accuracy": 0.7230064436793328, | |
| "num_tokens": 24194320.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "entropy": 1.0549692034721374, | |
| "epoch": 0.8152449497014803, | |
| "grad_norm": 0.5634621381759644, | |
| "learning_rate": 3.7006019366657946e-06, | |
| "loss": 1.0893, | |
| "mean_token_accuracy": 0.7291611786931753, | |
| "num_tokens": 24233260.0, | |
| "step": 3115 | |
| }, | |
| { | |
| "entropy": 0.9838933110237121, | |
| "epoch": 0.816553529074998, | |
| "grad_norm": 0.3864845037460327, | |
| "learning_rate": 3.6744307772834342e-06, | |
| "loss": 0.9414, | |
| "mean_token_accuracy": 0.7454824268817901, | |
| "num_tokens": 24270814.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 1.0704244837164878, | |
| "epoch": 0.8178621084485156, | |
| "grad_norm": 0.5081256628036499, | |
| "learning_rate": 3.648259617901073e-06, | |
| "loss": 1.0257, | |
| "mean_token_accuracy": 0.723455361276865, | |
| "num_tokens": 24309235.0, | |
| "step": 3125 | |
| }, | |
| { | |
| "entropy": 1.022134768590331, | |
| "epoch": 0.8191706878220332, | |
| "grad_norm": 0.42314413189888, | |
| "learning_rate": 3.6220884585187127e-06, | |
| "loss": 0.9948, | |
| "mean_token_accuracy": 0.7342596560716629, | |
| "num_tokens": 24348724.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "entropy": 1.0905879106372596, | |
| "epoch": 0.8204792671955509, | |
| "grad_norm": 0.30757448077201843, | |
| "learning_rate": 3.5959172991363523e-06, | |
| "loss": 1.0555, | |
| "mean_token_accuracy": 0.724842281267047, | |
| "num_tokens": 24388628.0, | |
| "step": 3135 | |
| }, | |
| { | |
| "entropy": 1.0407748512923718, | |
| "epoch": 0.8217878465690684, | |
| "grad_norm": 0.7021998167037964, | |
| "learning_rate": 3.569746139753991e-06, | |
| "loss": 0.9854, | |
| "mean_token_accuracy": 0.7305317636579275, | |
| "num_tokens": 24427341.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 1.1128646079450846, | |
| "epoch": 0.823096425942586, | |
| "grad_norm": 0.3260432779788971, | |
| "learning_rate": 3.5435749803716308e-06, | |
| "loss": 1.0951, | |
| "mean_token_accuracy": 0.712972067296505, | |
| "num_tokens": 24462200.0, | |
| "step": 3145 | |
| }, | |
| { | |
| "entropy": 1.1428508426994086, | |
| "epoch": 0.8244050053161037, | |
| "grad_norm": 0.37607815861701965, | |
| "learning_rate": 3.5174038209892704e-06, | |
| "loss": 1.1074, | |
| "mean_token_accuracy": 0.7139423068612814, | |
| "num_tokens": 24503955.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 1.0314291812479497, | |
| "epoch": 0.8257135846896213, | |
| "grad_norm": 0.5070587992668152, | |
| "learning_rate": 3.491232661606909e-06, | |
| "loss": 1.0294, | |
| "mean_token_accuracy": 0.7355128142982721, | |
| "num_tokens": 24543206.0, | |
| "step": 3155 | |
| }, | |
| { | |
| "entropy": 1.1036609571427107, | |
| "epoch": 0.827022164063139, | |
| "grad_norm": 0.41178247332572937, | |
| "learning_rate": 3.465061502224549e-06, | |
| "loss": 1.0937, | |
| "mean_token_accuracy": 0.7211760215461254, | |
| "num_tokens": 24579806.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 1.1116086110472678, | |
| "epoch": 0.8283307434366566, | |
| "grad_norm": 0.5788869857788086, | |
| "learning_rate": 3.438890342842188e-06, | |
| "loss": 1.0604, | |
| "mean_token_accuracy": 0.721147482842207, | |
| "num_tokens": 24616984.0, | |
| "step": 3165 | |
| }, | |
| { | |
| "entropy": 1.1319230053573848, | |
| "epoch": 0.8296393228101742, | |
| "grad_norm": 0.6010908484458923, | |
| "learning_rate": 3.4127191834598277e-06, | |
| "loss": 1.0807, | |
| "mean_token_accuracy": 0.7168502196669578, | |
| "num_tokens": 24654055.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "entropy": 1.1207389827817678, | |
| "epoch": 0.8309479021836919, | |
| "grad_norm": 0.5787162780761719, | |
| "learning_rate": 3.386548024077467e-06, | |
| "loss": 1.0741, | |
| "mean_token_accuracy": 0.7188828401267529, | |
| "num_tokens": 24696566.0, | |
| "step": 3175 | |
| }, | |
| { | |
| "entropy": 1.0550225611776114, | |
| "epoch": 0.8322564815572094, | |
| "grad_norm": 0.4683392643928528, | |
| "learning_rate": 3.360376864695106e-06, | |
| "loss": 1.0378, | |
| "mean_token_accuracy": 0.7269921116530895, | |
| "num_tokens": 24735302.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 1.064477189257741, | |
| "epoch": 0.8335650609307271, | |
| "grad_norm": 0.43787601590156555, | |
| "learning_rate": 3.3342057053127458e-06, | |
| "loss": 1.029, | |
| "mean_token_accuracy": 0.7256352994590998, | |
| "num_tokens": 24777394.0, | |
| "step": 3185 | |
| }, | |
| { | |
| "entropy": 0.9952436108142138, | |
| "epoch": 0.8348736403042447, | |
| "grad_norm": 0.619921863079071, | |
| "learning_rate": 3.3080345459303846e-06, | |
| "loss": 0.9214, | |
| "mean_token_accuracy": 0.7489484068006277, | |
| "num_tokens": 24820138.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "entropy": 1.0670820290222764, | |
| "epoch": 0.8361822196777623, | |
| "grad_norm": 0.4853121340274811, | |
| "learning_rate": 3.281863386548024e-06, | |
| "loss": 1.0733, | |
| "mean_token_accuracy": 0.7309010468423367, | |
| "num_tokens": 24857394.0, | |
| "step": 3195 | |
| }, | |
| { | |
| "entropy": 1.0793268527835607, | |
| "epoch": 0.83749079905128, | |
| "grad_norm": 0.3238151967525482, | |
| "learning_rate": 3.255692227165664e-06, | |
| "loss": 1.0963, | |
| "mean_token_accuracy": 0.7233440794050694, | |
| "num_tokens": 24894819.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.83749079905128, | |
| "eval_entropy": 1.0310797443389892, | |
| "eval_loss": 1.0281308889389038, | |
| "eval_mean_token_accuracy": 0.7311860795021057, | |
| "eval_num_tokens": 24894819.0, | |
| "eval_runtime": 97.2362, | |
| "eval_samples_per_second": 10.284, | |
| "eval_steps_per_second": 1.286, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 1.0457366950809956, | |
| "epoch": 0.8387993784247976, | |
| "grad_norm": 0.44579482078552246, | |
| "learning_rate": 3.2295210677833035e-06, | |
| "loss": 0.995, | |
| "mean_token_accuracy": 0.7295556962490082, | |
| "num_tokens": 24934467.0, | |
| "step": 3205 | |
| }, | |
| { | |
| "entropy": 1.0695260234177113, | |
| "epoch": 0.8401079577983152, | |
| "grad_norm": 0.4931753873825073, | |
| "learning_rate": 3.2033499084009423e-06, | |
| "loss": 1.0579, | |
| "mean_token_accuracy": 0.7268257789313793, | |
| "num_tokens": 24973181.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "entropy": 1.0737264335155488, | |
| "epoch": 0.8414165371718328, | |
| "grad_norm": 0.542246401309967, | |
| "learning_rate": 3.177178749018582e-06, | |
| "loss": 1.044, | |
| "mean_token_accuracy": 0.7273680947721004, | |
| "num_tokens": 25008234.0, | |
| "step": 3215 | |
| }, | |
| { | |
| "entropy": 1.0925125159323215, | |
| "epoch": 0.8427251165453504, | |
| "grad_norm": 0.7821519374847412, | |
| "learning_rate": 3.151007589636221e-06, | |
| "loss": 1.0926, | |
| "mean_token_accuracy": 0.7236344940960407, | |
| "num_tokens": 25042804.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 1.0435322053730487, | |
| "epoch": 0.8440336959188681, | |
| "grad_norm": 0.5571607351303101, | |
| "learning_rate": 3.1248364302538604e-06, | |
| "loss": 1.0314, | |
| "mean_token_accuracy": 0.7397142663598061, | |
| "num_tokens": 25081713.0, | |
| "step": 3225 | |
| }, | |
| { | |
| "entropy": 1.0707201555371284, | |
| "epoch": 0.8453422752923857, | |
| "grad_norm": 0.46003878116607666, | |
| "learning_rate": 3.0986652708715e-06, | |
| "loss": 1.0396, | |
| "mean_token_accuracy": 0.7285445638000965, | |
| "num_tokens": 25115761.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "entropy": 1.0421096246689558, | |
| "epoch": 0.8466508546659033, | |
| "grad_norm": 0.5213029980659485, | |
| "learning_rate": 3.0724941114891392e-06, | |
| "loss": 0.9743, | |
| "mean_token_accuracy": 0.729071582481265, | |
| "num_tokens": 25158298.0, | |
| "step": 3235 | |
| }, | |
| { | |
| "entropy": 0.9816816847771406, | |
| "epoch": 0.847959434039421, | |
| "grad_norm": 0.2949255704879761, | |
| "learning_rate": 3.0463229521067784e-06, | |
| "loss": 0.8791, | |
| "mean_token_accuracy": 0.7419180549681187, | |
| "num_tokens": 25195031.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 1.1069608356803655, | |
| "epoch": 0.8492680134129386, | |
| "grad_norm": 0.4634745121002197, | |
| "learning_rate": 3.0201517927244177e-06, | |
| "loss": 1.0572, | |
| "mean_token_accuracy": 0.7191666610538959, | |
| "num_tokens": 25233843.0, | |
| "step": 3245 | |
| }, | |
| { | |
| "entropy": 1.0450120318681002, | |
| "epoch": 0.8505765927864563, | |
| "grad_norm": 0.41588935256004333, | |
| "learning_rate": 2.9939806333420573e-06, | |
| "loss": 1.004, | |
| "mean_token_accuracy": 0.7298405677080154, | |
| "num_tokens": 25277199.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 1.0192270964384078, | |
| "epoch": 0.8518851721599738, | |
| "grad_norm": 0.3328169584274292, | |
| "learning_rate": 2.967809473959697e-06, | |
| "loss": 1.0314, | |
| "mean_token_accuracy": 0.7342530839145184, | |
| "num_tokens": 25313963.0, | |
| "step": 3255 | |
| }, | |
| { | |
| "entropy": 1.068826200440526, | |
| "epoch": 0.8531937515334914, | |
| "grad_norm": 0.4146538972854614, | |
| "learning_rate": 2.9416383145773357e-06, | |
| "loss": 1.0426, | |
| "mean_token_accuracy": 0.7297728694975376, | |
| "num_tokens": 25354460.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 1.0722479410469532, | |
| "epoch": 0.8545023309070091, | |
| "grad_norm": 0.41407662630081177, | |
| "learning_rate": 2.9154671551949754e-06, | |
| "loss": 1.038, | |
| "mean_token_accuracy": 0.7312197633087635, | |
| "num_tokens": 25391042.0, | |
| "step": 3265 | |
| }, | |
| { | |
| "entropy": 1.0916271772235633, | |
| "epoch": 0.8558109102805267, | |
| "grad_norm": 0.36934009194374084, | |
| "learning_rate": 2.889295995812615e-06, | |
| "loss": 1.1389, | |
| "mean_token_accuracy": 0.7267607122659683, | |
| "num_tokens": 25435161.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "entropy": 1.051083530113101, | |
| "epoch": 0.8571194896540443, | |
| "grad_norm": 0.35780689120292664, | |
| "learning_rate": 2.863124836430254e-06, | |
| "loss": 1.0192, | |
| "mean_token_accuracy": 0.7258496068418026, | |
| "num_tokens": 25479291.0, | |
| "step": 3275 | |
| }, | |
| { | |
| "entropy": 1.0690735118463635, | |
| "epoch": 0.858428069027562, | |
| "grad_norm": 0.3563995659351349, | |
| "learning_rate": 2.8369536770478935e-06, | |
| "loss": 1.0232, | |
| "mean_token_accuracy": 0.7209200043231249, | |
| "num_tokens": 25518588.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 1.0599579058587552, | |
| "epoch": 0.8597366484010796, | |
| "grad_norm": 0.50145423412323, | |
| "learning_rate": 2.8107825176655327e-06, | |
| "loss": 1.0418, | |
| "mean_token_accuracy": 0.7253329046070576, | |
| "num_tokens": 25555559.0, | |
| "step": 3285 | |
| }, | |
| { | |
| "entropy": 1.0637880939990283, | |
| "epoch": 0.8610452277745972, | |
| "grad_norm": 0.32374799251556396, | |
| "learning_rate": 2.7846113582831723e-06, | |
| "loss": 1.0589, | |
| "mean_token_accuracy": 0.729487194865942, | |
| "num_tokens": 25599141.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "entropy": 1.121987307816744, | |
| "epoch": 0.8623538071481148, | |
| "grad_norm": 0.7996993064880371, | |
| "learning_rate": 2.7584401989008115e-06, | |
| "loss": 1.1568, | |
| "mean_token_accuracy": 0.71376275382936, | |
| "num_tokens": 25635193.0, | |
| "step": 3295 | |
| }, | |
| { | |
| "entropy": 1.075880871899426, | |
| "epoch": 0.8636623865216324, | |
| "grad_norm": 0.3166949450969696, | |
| "learning_rate": 2.7322690395184508e-06, | |
| "loss": 1.0328, | |
| "mean_token_accuracy": 0.7281475432217122, | |
| "num_tokens": 25671967.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.8636623865216324, | |
| "eval_entropy": 1.0315785236358643, | |
| "eval_loss": 1.0274593830108643, | |
| "eval_mean_token_accuracy": 0.7313339967727661, | |
| "eval_num_tokens": 25671967.0, | |
| "eval_runtime": 97.2667, | |
| "eval_samples_per_second": 10.281, | |
| "eval_steps_per_second": 1.285, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 1.0423275344073772, | |
| "epoch": 0.8649709658951501, | |
| "grad_norm": 0.5029602646827698, | |
| "learning_rate": 2.7060978801360904e-06, | |
| "loss": 1.0144, | |
| "mean_token_accuracy": 0.7371830597519875, | |
| "num_tokens": 25714593.0, | |
| "step": 3305 | |
| }, | |
| { | |
| "entropy": 1.019960217177868, | |
| "epoch": 0.8662795452686677, | |
| "grad_norm": 0.3714219629764557, | |
| "learning_rate": 2.679926720753729e-06, | |
| "loss": 0.9591, | |
| "mean_token_accuracy": 0.7337398871779441, | |
| "num_tokens": 25753622.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "entropy": 1.0438139468431473, | |
| "epoch": 0.8675881246421854, | |
| "grad_norm": 0.49829211831092834, | |
| "learning_rate": 2.653755561371369e-06, | |
| "loss": 1.0152, | |
| "mean_token_accuracy": 0.7405216414481401, | |
| "num_tokens": 25789764.0, | |
| "step": 3315 | |
| }, | |
| { | |
| "entropy": 1.0796325359493495, | |
| "epoch": 0.868896704015703, | |
| "grad_norm": 0.9016978144645691, | |
| "learning_rate": 2.6275844019890085e-06, | |
| "loss": 1.0369, | |
| "mean_token_accuracy": 0.7271438419818879, | |
| "num_tokens": 25827139.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 1.1373902840539813, | |
| "epoch": 0.8702052833892205, | |
| "grad_norm": 0.5277310609817505, | |
| "learning_rate": 2.601413242606648e-06, | |
| "loss": 1.1125, | |
| "mean_token_accuracy": 0.7127894099801779, | |
| "num_tokens": 25866714.0, | |
| "step": 3325 | |
| }, | |
| { | |
| "entropy": 1.0553706608712674, | |
| "epoch": 0.8715138627627382, | |
| "grad_norm": 0.39588603377342224, | |
| "learning_rate": 2.575242083224287e-06, | |
| "loss": 1.0219, | |
| "mean_token_accuracy": 0.7270961362868548, | |
| "num_tokens": 25909686.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "entropy": 1.065532573312521, | |
| "epoch": 0.8728224421362558, | |
| "grad_norm": 0.5084222555160522, | |
| "learning_rate": 2.5490709238419266e-06, | |
| "loss": 1.0483, | |
| "mean_token_accuracy": 0.7294468872249127, | |
| "num_tokens": 25950054.0, | |
| "step": 3335 | |
| }, | |
| { | |
| "entropy": 1.0791085483506322, | |
| "epoch": 0.8741310215097734, | |
| "grad_norm": 0.5307091474533081, | |
| "learning_rate": 2.5228997644595658e-06, | |
| "loss": 1.0317, | |
| "mean_token_accuracy": 0.7308267526328563, | |
| "num_tokens": 25988785.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 1.0206924524158238, | |
| "epoch": 0.8754396008832911, | |
| "grad_norm": 0.44742336869239807, | |
| "learning_rate": 2.4967286050772054e-06, | |
| "loss": 0.9997, | |
| "mean_token_accuracy": 0.7365788772702218, | |
| "num_tokens": 26031548.0, | |
| "step": 3345 | |
| }, | |
| { | |
| "entropy": 1.072092866897583, | |
| "epoch": 0.8767481802568087, | |
| "grad_norm": 0.4204552471637726, | |
| "learning_rate": 2.4705574456948446e-06, | |
| "loss": 1.0516, | |
| "mean_token_accuracy": 0.7274888075888157, | |
| "num_tokens": 26073312.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 1.0556320391595364, | |
| "epoch": 0.8780567596303264, | |
| "grad_norm": 0.4932871162891388, | |
| "learning_rate": 2.444386286312484e-06, | |
| "loss": 1.0326, | |
| "mean_token_accuracy": 0.7380503460764885, | |
| "num_tokens": 26114431.0, | |
| "step": 3355 | |
| }, | |
| { | |
| "entropy": 1.091795524954796, | |
| "epoch": 0.879365339003844, | |
| "grad_norm": 0.561880350112915, | |
| "learning_rate": 2.418215126930123e-06, | |
| "loss": 1.0774, | |
| "mean_token_accuracy": 0.7251040559262037, | |
| "num_tokens": 26153886.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 1.07560763284564, | |
| "epoch": 0.8806739183773615, | |
| "grad_norm": 0.48657548427581787, | |
| "learning_rate": 2.3920439675477623e-06, | |
| "loss": 1.0658, | |
| "mean_token_accuracy": 0.7262101206928492, | |
| "num_tokens": 26186389.0, | |
| "step": 3365 | |
| }, | |
| { | |
| "entropy": 1.0190218634903432, | |
| "epoch": 0.8819824977508792, | |
| "grad_norm": 0.7028672099113464, | |
| "learning_rate": 2.365872808165402e-06, | |
| "loss": 0.9896, | |
| "mean_token_accuracy": 0.7349074624478817, | |
| "num_tokens": 26216742.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "entropy": 1.0444237791001796, | |
| "epoch": 0.8832910771243968, | |
| "grad_norm": 0.3169384300708771, | |
| "learning_rate": 2.339701648783041e-06, | |
| "loss": 0.9722, | |
| "mean_token_accuracy": 0.7366019688546658, | |
| "num_tokens": 26252709.0, | |
| "step": 3375 | |
| }, | |
| { | |
| "entropy": 1.0996058486402034, | |
| "epoch": 0.8845996564979145, | |
| "grad_norm": 0.45103803277015686, | |
| "learning_rate": 2.313530489400681e-06, | |
| "loss": 1.0965, | |
| "mean_token_accuracy": 0.7159675225615502, | |
| "num_tokens": 26290405.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 1.120962193235755, | |
| "epoch": 0.8859082358714321, | |
| "grad_norm": 0.4903068244457245, | |
| "learning_rate": 2.28735933001832e-06, | |
| "loss": 1.133, | |
| "mean_token_accuracy": 0.7219177789986133, | |
| "num_tokens": 26328036.0, | |
| "step": 3385 | |
| }, | |
| { | |
| "entropy": 1.015447051078081, | |
| "epoch": 0.8872168152449497, | |
| "grad_norm": 0.5176340341567993, | |
| "learning_rate": 2.2611881706359592e-06, | |
| "loss": 1.0307, | |
| "mean_token_accuracy": 0.7406690739095211, | |
| "num_tokens": 26370299.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "entropy": 1.0935089591890574, | |
| "epoch": 0.8885253946184674, | |
| "grad_norm": 0.29109635949134827, | |
| "learning_rate": 2.235017011253599e-06, | |
| "loss": 1.0479, | |
| "mean_token_accuracy": 0.7220962606370449, | |
| "num_tokens": 26410839.0, | |
| "step": 3395 | |
| }, | |
| { | |
| "entropy": 1.073088315874338, | |
| "epoch": 0.8898339739919849, | |
| "grad_norm": 0.41668152809143066, | |
| "learning_rate": 2.208845851871238e-06, | |
| "loss": 1.0365, | |
| "mean_token_accuracy": 0.7313703790307045, | |
| "num_tokens": 26451902.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.8898339739919849, | |
| "eval_entropy": 1.0298821225166321, | |
| "eval_loss": 1.0266761779785156, | |
| "eval_mean_token_accuracy": 0.7314435024261474, | |
| "eval_num_tokens": 26451902.0, | |
| "eval_runtime": 97.2178, | |
| "eval_samples_per_second": 10.286, | |
| "eval_steps_per_second": 1.286, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 1.1048546012490987, | |
| "epoch": 0.8911425533655025, | |
| "grad_norm": 0.33564722537994385, | |
| "learning_rate": 2.1826746924888777e-06, | |
| "loss": 1.1361, | |
| "mean_token_accuracy": 0.7248132299631834, | |
| "num_tokens": 26494266.0, | |
| "step": 3405 | |
| }, | |
| { | |
| "entropy": 1.0767640516161918, | |
| "epoch": 0.8924511327390202, | |
| "grad_norm": 0.34754645824432373, | |
| "learning_rate": 2.156503533106517e-06, | |
| "loss": 1.0538, | |
| "mean_token_accuracy": 0.7239204317331314, | |
| "num_tokens": 26538961.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "entropy": 1.1368263632059097, | |
| "epoch": 0.8937597121125378, | |
| "grad_norm": 0.42159193754196167, | |
| "learning_rate": 2.130332373724156e-06, | |
| "loss": 1.0626, | |
| "mean_token_accuracy": 0.7145794078707695, | |
| "num_tokens": 26576209.0, | |
| "step": 3415 | |
| }, | |
| { | |
| "entropy": 1.1049470301717519, | |
| "epoch": 0.8950682914860555, | |
| "grad_norm": 0.41963493824005127, | |
| "learning_rate": 2.1041612143417954e-06, | |
| "loss": 1.078, | |
| "mean_token_accuracy": 0.7214818011969328, | |
| "num_tokens": 26615242.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 1.0639567028731107, | |
| "epoch": 0.8963768708595731, | |
| "grad_norm": 0.27538204193115234, | |
| "learning_rate": 2.0779900549594346e-06, | |
| "loss": 1.0215, | |
| "mean_token_accuracy": 0.7226650364696979, | |
| "num_tokens": 26655742.0, | |
| "step": 3425 | |
| }, | |
| { | |
| "entropy": 1.041541761904955, | |
| "epoch": 0.8976854502330907, | |
| "grad_norm": 0.3388216197490692, | |
| "learning_rate": 2.0518188955770743e-06, | |
| "loss": 1.0363, | |
| "mean_token_accuracy": 0.7356565605849028, | |
| "num_tokens": 26693766.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "entropy": 1.0412372374907135, | |
| "epoch": 0.8989940296066083, | |
| "grad_norm": 0.3273700773715973, | |
| "learning_rate": 2.0256477361947135e-06, | |
| "loss": 0.9961, | |
| "mean_token_accuracy": 0.7328124668449163, | |
| "num_tokens": 26731172.0, | |
| "step": 3435 | |
| }, | |
| { | |
| "entropy": 1.0402305848896503, | |
| "epoch": 0.9003026089801259, | |
| "grad_norm": 0.5662808418273926, | |
| "learning_rate": 1.999476576812353e-06, | |
| "loss": 1.0287, | |
| "mean_token_accuracy": 0.7347014851868152, | |
| "num_tokens": 26770030.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 1.0628270111978053, | |
| "epoch": 0.9016111883536436, | |
| "grad_norm": 0.39010319113731384, | |
| "learning_rate": 1.9733054174299923e-06, | |
| "loss": 0.9835, | |
| "mean_token_accuracy": 0.7256053328514099, | |
| "num_tokens": 26811239.0, | |
| "step": 3445 | |
| }, | |
| { | |
| "entropy": 1.0397684559226037, | |
| "epoch": 0.9029197677271612, | |
| "grad_norm": 0.5970498323440552, | |
| "learning_rate": 1.9471342580476316e-06, | |
| "loss": 1.0163, | |
| "mean_token_accuracy": 0.7326931424438954, | |
| "num_tokens": 26850997.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 1.0377406038343906, | |
| "epoch": 0.9042283471006788, | |
| "grad_norm": 0.45579779148101807, | |
| "learning_rate": 1.920963098665271e-06, | |
| "loss": 1.0484, | |
| "mean_token_accuracy": 0.7341817907989026, | |
| "num_tokens": 26889505.0, | |
| "step": 3455 | |
| }, | |
| { | |
| "entropy": 1.020282133296132, | |
| "epoch": 0.9055369264741965, | |
| "grad_norm": 0.40827953815460205, | |
| "learning_rate": 1.8947919392829104e-06, | |
| "loss": 1.0153, | |
| "mean_token_accuracy": 0.737718590721488, | |
| "num_tokens": 26927519.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 1.053854411840439, | |
| "epoch": 0.9068455058477141, | |
| "grad_norm": 0.45387595891952515, | |
| "learning_rate": 1.8686207799005498e-06, | |
| "loss": 0.9982, | |
| "mean_token_accuracy": 0.7261220943182707, | |
| "num_tokens": 26966277.0, | |
| "step": 3465 | |
| }, | |
| { | |
| "entropy": 1.017658719792962, | |
| "epoch": 0.9081540852212318, | |
| "grad_norm": 0.36533382534980774, | |
| "learning_rate": 1.842449620518189e-06, | |
| "loss": 0.9675, | |
| "mean_token_accuracy": 0.7362812142819166, | |
| "num_tokens": 27002691.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "entropy": 1.053343753516674, | |
| "epoch": 0.9094626645947493, | |
| "grad_norm": 0.3158053755760193, | |
| "learning_rate": 1.8162784611358283e-06, | |
| "loss": 1.0161, | |
| "mean_token_accuracy": 0.7329125508666039, | |
| "num_tokens": 27040733.0, | |
| "step": 3475 | |
| }, | |
| { | |
| "entropy": 0.9715707473456859, | |
| "epoch": 0.9107712439682669, | |
| "grad_norm": 0.30097201466560364, | |
| "learning_rate": 1.790107301753468e-06, | |
| "loss": 0.9108, | |
| "mean_token_accuracy": 0.741458211094141, | |
| "num_tokens": 27083118.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 1.1143366053700448, | |
| "epoch": 0.9120798233417846, | |
| "grad_norm": 0.5649247765541077, | |
| "learning_rate": 1.7639361423711071e-06, | |
| "loss": 1.0844, | |
| "mean_token_accuracy": 0.7136627584695816, | |
| "num_tokens": 27118214.0, | |
| "step": 3485 | |
| }, | |
| { | |
| "entropy": 1.053029465302825, | |
| "epoch": 0.9133884027153022, | |
| "grad_norm": 0.41902416944503784, | |
| "learning_rate": 1.7377649829887466e-06, | |
| "loss": 0.9994, | |
| "mean_token_accuracy": 0.7316753648221492, | |
| "num_tokens": 27154246.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "entropy": 1.0596496284008026, | |
| "epoch": 0.9146969820888198, | |
| "grad_norm": 0.4515739977359772, | |
| "learning_rate": 1.7115938236063858e-06, | |
| "loss": 1.0266, | |
| "mean_token_accuracy": 0.7284657262265682, | |
| "num_tokens": 27190566.0, | |
| "step": 3495 | |
| }, | |
| { | |
| "entropy": 1.1401704950258136, | |
| "epoch": 0.9160055614623375, | |
| "grad_norm": 0.5285766124725342, | |
| "learning_rate": 1.6854226642240254e-06, | |
| "loss": 1.1079, | |
| "mean_token_accuracy": 0.7140452340245247, | |
| "num_tokens": 27231234.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9160055614623375, | |
| "eval_entropy": 1.0293893675804138, | |
| "eval_loss": 1.0261290073394775, | |
| "eval_mean_token_accuracy": 0.7316831665039063, | |
| "eval_num_tokens": 27231234.0, | |
| "eval_runtime": 97.2439, | |
| "eval_samples_per_second": 10.283, | |
| "eval_steps_per_second": 1.285, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 1.060255604609847, | |
| "epoch": 0.917314140835855, | |
| "grad_norm": 0.5222223401069641, | |
| "learning_rate": 1.6592515048416647e-06, | |
| "loss": 1.0422, | |
| "mean_token_accuracy": 0.7291194565594197, | |
| "num_tokens": 27274088.0, | |
| "step": 3505 | |
| }, | |
| { | |
| "entropy": 1.0341279707849025, | |
| "epoch": 0.9186227202093727, | |
| "grad_norm": 0.46550068259239197, | |
| "learning_rate": 1.6330803454593039e-06, | |
| "loss": 1.0274, | |
| "mean_token_accuracy": 0.736503117159009, | |
| "num_tokens": 27314182.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "entropy": 1.0762779969722032, | |
| "epoch": 0.9199312995828903, | |
| "grad_norm": 0.4758652150630951, | |
| "learning_rate": 1.6069091860769433e-06, | |
| "loss": 1.0411, | |
| "mean_token_accuracy": 0.7197697997093201, | |
| "num_tokens": 27353165.0, | |
| "step": 3515 | |
| }, | |
| { | |
| "entropy": 1.0701686941087245, | |
| "epoch": 0.9212398789564079, | |
| "grad_norm": 0.49974560737609863, | |
| "learning_rate": 1.5807380266945827e-06, | |
| "loss": 1.0625, | |
| "mean_token_accuracy": 0.7291940327733755, | |
| "num_tokens": 27387526.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 1.083727527782321, | |
| "epoch": 0.9225484583299256, | |
| "grad_norm": 0.5067974328994751, | |
| "learning_rate": 1.5545668673122222e-06, | |
| "loss": 1.0838, | |
| "mean_token_accuracy": 0.719190240651369, | |
| "num_tokens": 27424821.0, | |
| "step": 3525 | |
| }, | |
| { | |
| "entropy": 1.0643510822206736, | |
| "epoch": 0.9238570377034432, | |
| "grad_norm": 0.6037015914916992, | |
| "learning_rate": 1.5283957079298614e-06, | |
| "loss": 1.08, | |
| "mean_token_accuracy": 0.7282540425658226, | |
| "num_tokens": 27462609.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "entropy": 1.0500620037317276, | |
| "epoch": 0.9251656170769609, | |
| "grad_norm": 0.6834056377410889, | |
| "learning_rate": 1.5022245485475006e-06, | |
| "loss": 1.0026, | |
| "mean_token_accuracy": 0.7320256743580102, | |
| "num_tokens": 27500178.0, | |
| "step": 3535 | |
| }, | |
| { | |
| "entropy": 1.0600904449820518, | |
| "epoch": 0.9264741964504785, | |
| "grad_norm": 0.5821903347969055, | |
| "learning_rate": 1.4760533891651402e-06, | |
| "loss": 1.0701, | |
| "mean_token_accuracy": 0.7213537424802781, | |
| "num_tokens": 27539087.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 1.057712410390377, | |
| "epoch": 0.927782775823996, | |
| "grad_norm": 0.28830403089523315, | |
| "learning_rate": 1.4498822297827795e-06, | |
| "loss": 0.9984, | |
| "mean_token_accuracy": 0.7261978283524513, | |
| "num_tokens": 27575481.0, | |
| "step": 3545 | |
| }, | |
| { | |
| "entropy": 1.0745814580470323, | |
| "epoch": 0.9290913551975137, | |
| "grad_norm": 0.5320100784301758, | |
| "learning_rate": 1.4237110704004189e-06, | |
| "loss": 1.0255, | |
| "mean_token_accuracy": 0.7240989219397307, | |
| "num_tokens": 27610121.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 1.1143604163080454, | |
| "epoch": 0.9303999345710313, | |
| "grad_norm": 0.3023637533187866, | |
| "learning_rate": 1.3975399110180581e-06, | |
| "loss": 1.1067, | |
| "mean_token_accuracy": 0.7260128542780876, | |
| "num_tokens": 27648696.0, | |
| "step": 3555 | |
| }, | |
| { | |
| "entropy": 1.0836713276803493, | |
| "epoch": 0.9317085139445489, | |
| "grad_norm": 0.6137750148773193, | |
| "learning_rate": 1.3713687516356975e-06, | |
| "loss": 1.0442, | |
| "mean_token_accuracy": 0.7201635256409645, | |
| "num_tokens": 27682007.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 1.1006150282919407, | |
| "epoch": 0.9330170933180666, | |
| "grad_norm": 0.44908300042152405, | |
| "learning_rate": 1.345197592253337e-06, | |
| "loss": 1.1144, | |
| "mean_token_accuracy": 0.7233285054564476, | |
| "num_tokens": 27719353.0, | |
| "step": 3565 | |
| }, | |
| { | |
| "entropy": 1.0796324241906405, | |
| "epoch": 0.9343256726915842, | |
| "grad_norm": 0.4785999655723572, | |
| "learning_rate": 1.3190264328709762e-06, | |
| "loss": 1.0587, | |
| "mean_token_accuracy": 0.725184840708971, | |
| "num_tokens": 27760487.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "entropy": 1.0421875026077032, | |
| "epoch": 0.9356342520651019, | |
| "grad_norm": 0.5122357606887817, | |
| "learning_rate": 1.2928552734886158e-06, | |
| "loss": 1.025, | |
| "mean_token_accuracy": 0.7302470609545708, | |
| "num_tokens": 27803053.0, | |
| "step": 3575 | |
| }, | |
| { | |
| "entropy": 1.0390013061463832, | |
| "epoch": 0.9369428314386195, | |
| "grad_norm": 0.5996967554092407, | |
| "learning_rate": 1.266684114106255e-06, | |
| "loss": 0.9942, | |
| "mean_token_accuracy": 0.7326175101101399, | |
| "num_tokens": 27841890.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 1.0458267323672772, | |
| "epoch": 0.938251410812137, | |
| "grad_norm": 0.4176723062992096, | |
| "learning_rate": 1.2405129547238943e-06, | |
| "loss": 1.0177, | |
| "mean_token_accuracy": 0.724136520922184, | |
| "num_tokens": 27882510.0, | |
| "step": 3585 | |
| }, | |
| { | |
| "entropy": 1.0069429714232683, | |
| "epoch": 0.9395599901856547, | |
| "grad_norm": 0.40060707926750183, | |
| "learning_rate": 1.2143417953415337e-06, | |
| "loss": 0.9671, | |
| "mean_token_accuracy": 0.7366093017160893, | |
| "num_tokens": 27923388.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "entropy": 1.16991759352386, | |
| "epoch": 0.9408685695591723, | |
| "grad_norm": 0.3461279571056366, | |
| "learning_rate": 1.1881706359591731e-06, | |
| "loss": 1.1462, | |
| "mean_token_accuracy": 0.7035489819943905, | |
| "num_tokens": 27964084.0, | |
| "step": 3595 | |
| }, | |
| { | |
| "entropy": 1.0686412911862135, | |
| "epoch": 0.94217714893269, | |
| "grad_norm": 0.2957308888435364, | |
| "learning_rate": 1.1619994765768126e-06, | |
| "loss": 1.1066, | |
| "mean_token_accuracy": 0.7225402969866991, | |
| "num_tokens": 28001682.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.94217714893269, | |
| "eval_entropy": 1.0279519958496093, | |
| "eval_loss": 1.0255744457244873, | |
| "eval_mean_token_accuracy": 0.7317811703681946, | |
| "eval_num_tokens": 28001682.0, | |
| "eval_runtime": 97.2947, | |
| "eval_samples_per_second": 10.278, | |
| "eval_steps_per_second": 1.285, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 1.0206676226109266, | |
| "epoch": 0.9434857283062076, | |
| "grad_norm": 0.35788989067077637, | |
| "learning_rate": 1.1358283171944518e-06, | |
| "loss": 1.0176, | |
| "mean_token_accuracy": 0.7300234287977219, | |
| "num_tokens": 28042579.0, | |
| "step": 3605 | |
| }, | |
| { | |
| "entropy": 1.1715757973492145, | |
| "epoch": 0.9447943076797252, | |
| "grad_norm": 0.6498376131057739, | |
| "learning_rate": 1.1096571578120912e-06, | |
| "loss": 1.2274, | |
| "mean_token_accuracy": 0.7090075612068176, | |
| "num_tokens": 28077521.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "entropy": 1.0543236445635558, | |
| "epoch": 0.9461028870532429, | |
| "grad_norm": 0.3194473385810852, | |
| "learning_rate": 1.0834859984297304e-06, | |
| "loss": 1.0491, | |
| "mean_token_accuracy": 0.72547093257308, | |
| "num_tokens": 28116840.0, | |
| "step": 3615 | |
| }, | |
| { | |
| "entropy": 1.1037466993555427, | |
| "epoch": 0.9474114664267604, | |
| "grad_norm": 0.6278566718101501, | |
| "learning_rate": 1.0573148390473699e-06, | |
| "loss": 1.0452, | |
| "mean_token_accuracy": 0.7226000647991896, | |
| "num_tokens": 28153551.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 1.0038308002054692, | |
| "epoch": 0.948720045800278, | |
| "grad_norm": 0.48984530568122864, | |
| "learning_rate": 1.0311436796650093e-06, | |
| "loss": 0.9795, | |
| "mean_token_accuracy": 0.7407138921320439, | |
| "num_tokens": 28189324.0, | |
| "step": 3625 | |
| }, | |
| { | |
| "entropy": 1.0743804059922695, | |
| "epoch": 0.9500286251737957, | |
| "grad_norm": 0.35918810963630676, | |
| "learning_rate": 1.0049725202826487e-06, | |
| "loss": 1.1096, | |
| "mean_token_accuracy": 0.7247271824628114, | |
| "num_tokens": 28230753.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "entropy": 1.0661400273442267, | |
| "epoch": 0.9513372045473133, | |
| "grad_norm": 0.45430681109428406, | |
| "learning_rate": 9.78801360900288e-07, | |
| "loss": 1.0431, | |
| "mean_token_accuracy": 0.7261486738920212, | |
| "num_tokens": 28267961.0, | |
| "step": 3635 | |
| }, | |
| { | |
| "entropy": 1.0930729560554027, | |
| "epoch": 0.952645783920831, | |
| "grad_norm": 0.5693202018737793, | |
| "learning_rate": 9.526302015179273e-07, | |
| "loss": 1.0501, | |
| "mean_token_accuracy": 0.7252973057329655, | |
| "num_tokens": 28303967.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 1.124368030577898, | |
| "epoch": 0.9539543632943486, | |
| "grad_norm": 0.5943740010261536, | |
| "learning_rate": 9.264590421355667e-07, | |
| "loss": 1.148, | |
| "mean_token_accuracy": 0.7163769982755184, | |
| "num_tokens": 28339080.0, | |
| "step": 3645 | |
| }, | |
| { | |
| "entropy": 1.0308339811861515, | |
| "epoch": 0.9552629426678662, | |
| "grad_norm": 0.4916401505470276, | |
| "learning_rate": 9.00287882753206e-07, | |
| "loss": 0.9571, | |
| "mean_token_accuracy": 0.7355926677584648, | |
| "num_tokens": 28376833.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 1.0489124123007059, | |
| "epoch": 0.9565715220413838, | |
| "grad_norm": 0.5581033229827881, | |
| "learning_rate": 8.741167233708454e-07, | |
| "loss": 1.0203, | |
| "mean_token_accuracy": 0.7289952971041203, | |
| "num_tokens": 28413990.0, | |
| "step": 3655 | |
| }, | |
| { | |
| "entropy": 1.027711633220315, | |
| "epoch": 0.9578801014149014, | |
| "grad_norm": 0.5208143591880798, | |
| "learning_rate": 8.479455639884849e-07, | |
| "loss": 0.985, | |
| "mean_token_accuracy": 0.7333911582827568, | |
| "num_tokens": 28451250.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 1.0729194439947605, | |
| "epoch": 0.9591886807884191, | |
| "grad_norm": 0.6603464484214783, | |
| "learning_rate": 8.217744046061241e-07, | |
| "loss": 1.1221, | |
| "mean_token_accuracy": 0.7259935919195414, | |
| "num_tokens": 28488287.0, | |
| "step": 3665 | |
| }, | |
| { | |
| "entropy": 1.1133702561259269, | |
| "epoch": 0.9604972601619367, | |
| "grad_norm": 0.45469507575035095, | |
| "learning_rate": 7.956032452237634e-07, | |
| "loss": 1.0952, | |
| "mean_token_accuracy": 0.7223341464996338, | |
| "num_tokens": 28523839.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "entropy": 1.0385285302996636, | |
| "epoch": 0.9618058395354543, | |
| "grad_norm": 0.4402262270450592, | |
| "learning_rate": 7.694320858414028e-07, | |
| "loss": 1.0786, | |
| "mean_token_accuracy": 0.7375961367040873, | |
| "num_tokens": 28561415.0, | |
| "step": 3675 | |
| }, | |
| { | |
| "entropy": 1.0912371952086688, | |
| "epoch": 0.963114418908972, | |
| "grad_norm": 0.4388654828071594, | |
| "learning_rate": 7.432609264590422e-07, | |
| "loss": 1.1011, | |
| "mean_token_accuracy": 0.7208574242889881, | |
| "num_tokens": 28598012.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 1.0840743597596885, | |
| "epoch": 0.9644229982824896, | |
| "grad_norm": 0.5213295221328735, | |
| "learning_rate": 7.170897670766816e-07, | |
| "loss": 1.0878, | |
| "mean_token_accuracy": 0.7260554354637861, | |
| "num_tokens": 28640120.0, | |
| "step": 3685 | |
| }, | |
| { | |
| "entropy": 1.0481572005897761, | |
| "epoch": 0.9657315776560071, | |
| "grad_norm": 0.6161360740661621, | |
| "learning_rate": 6.90918607694321e-07, | |
| "loss": 0.9982, | |
| "mean_token_accuracy": 0.7346723277121783, | |
| "num_tokens": 28680039.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "entropy": 1.081262205913663, | |
| "epoch": 0.9670401570295248, | |
| "grad_norm": 0.5245823264122009, | |
| "learning_rate": 6.647474483119602e-07, | |
| "loss": 1.0661, | |
| "mean_token_accuracy": 0.7203562073409557, | |
| "num_tokens": 28720731.0, | |
| "step": 3695 | |
| }, | |
| { | |
| "entropy": 1.0632508873939515, | |
| "epoch": 0.9683487364030424, | |
| "grad_norm": 0.3406558334827423, | |
| "learning_rate": 6.385762889295996e-07, | |
| "loss": 1.011, | |
| "mean_token_accuracy": 0.7315301813185215, | |
| "num_tokens": 28758379.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.9683487364030424, | |
| "eval_entropy": 1.0293308172225952, | |
| "eval_loss": 1.0250952243804932, | |
| "eval_mean_token_accuracy": 0.7318736510276794, | |
| "eval_num_tokens": 28758379.0, | |
| "eval_runtime": 97.2597, | |
| "eval_samples_per_second": 10.282, | |
| "eval_steps_per_second": 1.285, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 1.0361842691898346, | |
| "epoch": 0.9696573157765601, | |
| "grad_norm": 0.3829176127910614, | |
| "learning_rate": 6.12405129547239e-07, | |
| "loss": 1.0297, | |
| "mean_token_accuracy": 0.738416987657547, | |
| "num_tokens": 28801046.0, | |
| "step": 3705 | |
| }, | |
| { | |
| "entropy": 0.9786388706415892, | |
| "epoch": 0.9709658951500777, | |
| "grad_norm": 0.42764556407928467, | |
| "learning_rate": 5.862339701648783e-07, | |
| "loss": 0.897, | |
| "mean_token_accuracy": 0.7432105515152216, | |
| "num_tokens": 28841846.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "entropy": 1.0343420587480068, | |
| "epoch": 0.9722744745235953, | |
| "grad_norm": 0.3440265357494354, | |
| "learning_rate": 5.600628107825177e-07, | |
| "loss": 1.0067, | |
| "mean_token_accuracy": 0.7312737897038459, | |
| "num_tokens": 28888566.0, | |
| "step": 3715 | |
| }, | |
| { | |
| "entropy": 1.0930624961853028, | |
| "epoch": 0.973583053897113, | |
| "grad_norm": 0.26113247871398926, | |
| "learning_rate": 5.338916514001571e-07, | |
| "loss": 1.1205, | |
| "mean_token_accuracy": 0.7230574164539576, | |
| "num_tokens": 28926022.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 1.0122245840728283, | |
| "epoch": 0.9748916332706306, | |
| "grad_norm": 0.4856005012989044, | |
| "learning_rate": 5.077204920177964e-07, | |
| "loss": 1.0115, | |
| "mean_token_accuracy": 0.7327728264033795, | |
| "num_tokens": 28967341.0, | |
| "step": 3725 | |
| }, | |
| { | |
| "entropy": 1.0293555181473493, | |
| "epoch": 0.9762002126441482, | |
| "grad_norm": 0.4436942934989929, | |
| "learning_rate": 4.815493326354357e-07, | |
| "loss": 1.0318, | |
| "mean_token_accuracy": 0.7331165555864573, | |
| "num_tokens": 29011545.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "entropy": 1.0567047752439975, | |
| "epoch": 0.9775087920176658, | |
| "grad_norm": 0.46222445368766785, | |
| "learning_rate": 4.5537817325307516e-07, | |
| "loss": 1.0437, | |
| "mean_token_accuracy": 0.7290311522781849, | |
| "num_tokens": 29050339.0, | |
| "step": 3735 | |
| }, | |
| { | |
| "entropy": 1.0223099350929261, | |
| "epoch": 0.9788173713911834, | |
| "grad_norm": 0.6509369611740112, | |
| "learning_rate": 4.2920701387071454e-07, | |
| "loss": 1.0271, | |
| "mean_token_accuracy": 0.743386908620596, | |
| "num_tokens": 29090275.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 1.0612529110163451, | |
| "epoch": 0.9801259507647011, | |
| "grad_norm": 0.5703464150428772, | |
| "learning_rate": 4.0303585448835386e-07, | |
| "loss": 1.0539, | |
| "mean_token_accuracy": 0.7298849746584892, | |
| "num_tokens": 29129073.0, | |
| "step": 3745 | |
| }, | |
| { | |
| "entropy": 1.0348493970930577, | |
| "epoch": 0.9814345301382187, | |
| "grad_norm": 0.35511285066604614, | |
| "learning_rate": 3.7686469510599324e-07, | |
| "loss": 1.0283, | |
| "mean_token_accuracy": 0.7344648085534573, | |
| "num_tokens": 29171021.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 1.1379429820924998, | |
| "epoch": 0.9827431095117363, | |
| "grad_norm": 0.3323829174041748, | |
| "learning_rate": 3.506935357236326e-07, | |
| "loss": 1.1105, | |
| "mean_token_accuracy": 0.7117891266942025, | |
| "num_tokens": 29206905.0, | |
| "step": 3755 | |
| }, | |
| { | |
| "entropy": 1.0437415711581708, | |
| "epoch": 0.984051688885254, | |
| "grad_norm": 0.40716320276260376, | |
| "learning_rate": 3.2452237634127194e-07, | |
| "loss": 1.0219, | |
| "mean_token_accuracy": 0.7391361027956009, | |
| "num_tokens": 29247960.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 1.1265955708920956, | |
| "epoch": 0.9853602682587715, | |
| "grad_norm": 0.6306870579719543, | |
| "learning_rate": 2.983512169589113e-07, | |
| "loss": 1.062, | |
| "mean_token_accuracy": 0.7148844588547945, | |
| "num_tokens": 29282043.0, | |
| "step": 3765 | |
| }, | |
| { | |
| "entropy": 1.0719765815883875, | |
| "epoch": 0.9866688476322892, | |
| "grad_norm": 0.35759237408638, | |
| "learning_rate": 2.7218005757655065e-07, | |
| "loss": 1.0682, | |
| "mean_token_accuracy": 0.7296040445566178, | |
| "num_tokens": 29323533.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "entropy": 1.0471963623538614, | |
| "epoch": 0.9879774270058068, | |
| "grad_norm": 0.4440222382545471, | |
| "learning_rate": 2.4600889819419e-07, | |
| "loss": 0.9946, | |
| "mean_token_accuracy": 0.7308863338083029, | |
| "num_tokens": 29364067.0, | |
| "step": 3775 | |
| }, | |
| { | |
| "entropy": 1.0608526386320591, | |
| "epoch": 0.9892860063793244, | |
| "grad_norm": 0.41301101446151733, | |
| "learning_rate": 2.198377388118294e-07, | |
| "loss": 1.0721, | |
| "mean_token_accuracy": 0.730213138461113, | |
| "num_tokens": 29411394.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 1.042181348055601, | |
| "epoch": 0.9905945857528421, | |
| "grad_norm": 0.7013579607009888, | |
| "learning_rate": 1.9366657942946875e-07, | |
| "loss": 0.9976, | |
| "mean_token_accuracy": 0.7371749997138977, | |
| "num_tokens": 29447775.0, | |
| "step": 3785 | |
| }, | |
| { | |
| "entropy": 1.0719102643430234, | |
| "epoch": 0.9919031651263597, | |
| "grad_norm": 0.6451042890548706, | |
| "learning_rate": 1.6749542004710808e-07, | |
| "loss": 1.0273, | |
| "mean_token_accuracy": 0.725009347498417, | |
| "num_tokens": 29482789.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "entropy": 1.0650145791471004, | |
| "epoch": 0.9932117444998774, | |
| "grad_norm": 0.46040284633636475, | |
| "learning_rate": 1.4132426066474745e-07, | |
| "loss": 1.0511, | |
| "mean_token_accuracy": 0.7309038709849119, | |
| "num_tokens": 29522469.0, | |
| "step": 3795 | |
| }, | |
| { | |
| "entropy": 1.099685497954488, | |
| "epoch": 0.994520323873395, | |
| "grad_norm": 0.47909924387931824, | |
| "learning_rate": 1.1515310128238682e-07, | |
| "loss": 1.0294, | |
| "mean_token_accuracy": 0.7283653065562248, | |
| "num_tokens": 29558695.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.994520323873395, | |
| "eval_entropy": 1.027329535484314, | |
| "eval_loss": 1.0250593423843384, | |
| "eval_mean_token_accuracy": 0.7319152827262878, | |
| "eval_num_tokens": 29558695.0, | |
| "eval_runtime": 97.2485, | |
| "eval_samples_per_second": 10.283, | |
| "eval_steps_per_second": 1.285, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 1.0765836983919144, | |
| "epoch": 0.9958289032469125, | |
| "grad_norm": 0.46970200538635254, | |
| "learning_rate": 8.898194190002618e-08, | |
| "loss": 1.0639, | |
| "mean_token_accuracy": 0.7184258241206407, | |
| "num_tokens": 29599289.0, | |
| "step": 3805 | |
| }, | |
| { | |
| "entropy": 1.0330897614359855, | |
| "epoch": 0.9971374826204302, | |
| "grad_norm": 0.34983229637145996, | |
| "learning_rate": 6.281078251766554e-08, | |
| "loss": 1.0025, | |
| "mean_token_accuracy": 0.7342484682798386, | |
| "num_tokens": 29637188.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "entropy": 1.04201779961586, | |
| "epoch": 0.9984460619939478, | |
| "grad_norm": 0.39102649688720703, | |
| "learning_rate": 3.6639623135304896e-08, | |
| "loss": 0.9961, | |
| "mean_token_accuracy": 0.7322810731828213, | |
| "num_tokens": 29671533.0, | |
| "step": 3815 | |
| }, | |
| { | |
| "entropy": 1.1104622691869737, | |
| "epoch": 0.9997546413674654, | |
| "grad_norm": 0.39447513222694397, | |
| "learning_rate": 1.0468463752944255e-08, | |
| "loss": 1.1002, | |
| "mean_token_accuracy": 0.7095972750335932, | |
| "num_tokens": 29712231.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 3821, | |
| "total_flos": 1.297831890260091e+18, | |
| "train_loss": 1.0613669765554028, | |
| "train_runtime": 22265.1627, | |
| "train_samples_per_second": 2.746, | |
| "train_steps_per_second": 0.172 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 3821, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.297831890260091e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |