mistral-7b-sft / trainer_state.json
Jenbenarye's picture
Model save
3719292 verified
{
"best_global_step": 3800,
"best_metric": 1.0250593423843384,
"best_model_checkpoint": "/workspace/adversarial-rlhf/runs/sft-OpenHermes-2.5-Mistral-7B-20251108-1943/checkpoints/checkpoint-3800",
"epoch": 1.0,
"eval_steps": 100,
"global_step": 3821,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1123480509966612,
"epoch": 0.0013085793735176249,
"grad_norm": 0.6217469573020935,
"learning_rate": 1.9979063072494113e-05,
"loss": 1.4467,
"mean_token_accuracy": 0.6513771142810583,
"num_tokens": 38846.0,
"step": 5
},
{
"entropy": 1.1004129022359848,
"epoch": 0.0026171587470352497,
"grad_norm": 0.4485588073730469,
"learning_rate": 1.995289191311175e-05,
"loss": 1.3538,
"mean_token_accuracy": 0.6656047463417053,
"num_tokens": 77083.0,
"step": 10
},
{
"entropy": 1.1241086978465318,
"epoch": 0.003925738120552875,
"grad_norm": 0.4486566483974457,
"learning_rate": 1.9926720753729393e-05,
"loss": 1.2136,
"mean_token_accuracy": 0.6849155448377132,
"num_tokens": 113871.0,
"step": 15
},
{
"entropy": 1.2270274113863706,
"epoch": 0.005234317494070499,
"grad_norm": 0.3619006276130676,
"learning_rate": 1.990054959434703e-05,
"loss": 1.1882,
"mean_token_accuracy": 0.6774788822978735,
"num_tokens": 149409.0,
"step": 20
},
{
"entropy": 1.21558427400887,
"epoch": 0.006542896867588125,
"grad_norm": 0.3003220558166504,
"learning_rate": 1.987437843496467e-05,
"loss": 1.2674,
"mean_token_accuracy": 0.6861417829990387,
"num_tokens": 191753.0,
"step": 25
},
{
"entropy": 1.2555227734148502,
"epoch": 0.00785147624110575,
"grad_norm": 0.28489917516708374,
"learning_rate": 1.9848207275582308e-05,
"loss": 1.1575,
"mean_token_accuracy": 0.6786608777940273,
"num_tokens": 236354.0,
"step": 30
},
{
"entropy": 1.2973661322146655,
"epoch": 0.009160055614623375,
"grad_norm": 0.2774274945259094,
"learning_rate": 1.982203611619995e-05,
"loss": 1.2217,
"mean_token_accuracy": 0.6799626011401415,
"num_tokens": 276048.0,
"step": 35
},
{
"entropy": 1.196922979876399,
"epoch": 0.010468634988140999,
"grad_norm": 0.2974745035171509,
"learning_rate": 1.979586495681759e-05,
"loss": 1.1239,
"mean_token_accuracy": 0.6995673455297947,
"num_tokens": 313936.0,
"step": 40
},
{
"entropy": 1.2235115006566049,
"epoch": 0.011777214361658624,
"grad_norm": 0.2803105115890503,
"learning_rate": 1.9769693797435227e-05,
"loss": 1.1753,
"mean_token_accuracy": 0.6886466249823571,
"num_tokens": 354248.0,
"step": 45
},
{
"entropy": 1.2047650370746852,
"epoch": 0.01308579373517625,
"grad_norm": 0.24354718625545502,
"learning_rate": 1.974352263805287e-05,
"loss": 1.1418,
"mean_token_accuracy": 0.6953188866376877,
"num_tokens": 392556.0,
"step": 50
},
{
"entropy": 1.174419540166855,
"epoch": 0.014394373108693874,
"grad_norm": 0.2286166548728943,
"learning_rate": 1.9717351478670507e-05,
"loss": 1.1289,
"mean_token_accuracy": 0.6958644594997168,
"num_tokens": 429553.0,
"step": 55
},
{
"entropy": 1.2164320405572653,
"epoch": 0.0157029524822115,
"grad_norm": 0.24150270223617554,
"learning_rate": 1.9691180319288145e-05,
"loss": 1.1822,
"mean_token_accuracy": 0.6899411961436271,
"num_tokens": 471479.0,
"step": 60
},
{
"entropy": 1.3109304085373878,
"epoch": 0.017011531855729123,
"grad_norm": 0.2634604275226593,
"learning_rate": 1.9665009159905787e-05,
"loss": 1.2606,
"mean_token_accuracy": 0.6797668907791377,
"num_tokens": 507880.0,
"step": 65
},
{
"entropy": 1.3245886493474246,
"epoch": 0.01832011122924675,
"grad_norm": 0.2704319953918457,
"learning_rate": 1.9638838000523425e-05,
"loss": 1.1799,
"mean_token_accuracy": 0.6835815001279115,
"num_tokens": 545408.0,
"step": 70
},
{
"entropy": 1.2262448083609343,
"epoch": 0.019628690602764374,
"grad_norm": 0.2790036201477051,
"learning_rate": 1.9612666841141064e-05,
"loss": 1.1106,
"mean_token_accuracy": 0.6929917622357606,
"num_tokens": 583634.0,
"step": 75
},
{
"entropy": 1.265508808940649,
"epoch": 0.020937269976281998,
"grad_norm": 0.22567331790924072,
"learning_rate": 1.9586495681758702e-05,
"loss": 1.2072,
"mean_token_accuracy": 0.6882555760443211,
"num_tokens": 619048.0,
"step": 80
},
{
"entropy": 1.2399516824632884,
"epoch": 0.022245849349799625,
"grad_norm": 0.26959890127182007,
"learning_rate": 1.9560324522376344e-05,
"loss": 1.1765,
"mean_token_accuracy": 0.6866359710693359,
"num_tokens": 657325.0,
"step": 85
},
{
"entropy": 1.2313102029263974,
"epoch": 0.02355442872331725,
"grad_norm": 0.31447833776474,
"learning_rate": 1.9534153362993982e-05,
"loss": 1.1615,
"mean_token_accuracy": 0.6929020721465349,
"num_tokens": 692549.0,
"step": 90
},
{
"entropy": 1.29357091486454,
"epoch": 0.024863008096834872,
"grad_norm": 0.27703842520713806,
"learning_rate": 1.950798220361162e-05,
"loss": 1.2535,
"mean_token_accuracy": 0.6816725376993418,
"num_tokens": 730459.0,
"step": 95
},
{
"entropy": 1.1631319765001535,
"epoch": 0.0261715874703525,
"grad_norm": 0.30061009526252747,
"learning_rate": 1.9481811044229262e-05,
"loss": 1.0757,
"mean_token_accuracy": 0.7062662675976753,
"num_tokens": 770154.0,
"step": 100
},
{
"epoch": 0.0261715874703525,
"eval_entropy": 1.138489418029785,
"eval_loss": 1.1535744667053223,
"eval_mean_token_accuracy": 0.7070313329696656,
"eval_num_tokens": 770154.0,
"eval_runtime": 96.9826,
"eval_samples_per_second": 10.311,
"eval_steps_per_second": 1.289,
"step": 100
},
{
"entropy": 1.2952044211328029,
"epoch": 0.027480166843870123,
"grad_norm": 0.3016025125980377,
"learning_rate": 1.94556398848469e-05,
"loss": 1.1798,
"mean_token_accuracy": 0.6826052766293287,
"num_tokens": 806341.0,
"step": 105
},
{
"entropy": 1.2129868753254414,
"epoch": 0.028788746217387747,
"grad_norm": 0.2947443425655365,
"learning_rate": 1.942946872546454e-05,
"loss": 1.1327,
"mean_token_accuracy": 0.6927057210355997,
"num_tokens": 843316.0,
"step": 110
},
{
"entropy": 1.290729960054159,
"epoch": 0.030097325590905374,
"grad_norm": 0.3131401240825653,
"learning_rate": 1.940329756608218e-05,
"loss": 1.1927,
"mean_token_accuracy": 0.6800491612404584,
"num_tokens": 879843.0,
"step": 115
},
{
"entropy": 1.1782143030315637,
"epoch": 0.031405904964423,
"grad_norm": 0.2727121114730835,
"learning_rate": 1.937712640669982e-05,
"loss": 1.0659,
"mean_token_accuracy": 0.700430641323328,
"num_tokens": 920743.0,
"step": 120
},
{
"entropy": 1.1695528313517571,
"epoch": 0.03271448433794062,
"grad_norm": 0.26583895087242126,
"learning_rate": 1.9350955247317458e-05,
"loss": 1.0232,
"mean_token_accuracy": 0.7075442411005497,
"num_tokens": 959096.0,
"step": 125
},
{
"entropy": 1.2109053663909435,
"epoch": 0.034023063711458246,
"grad_norm": 0.3991076946258545,
"learning_rate": 1.93247840879351e-05,
"loss": 1.1003,
"mean_token_accuracy": 0.699517872184515,
"num_tokens": 992352.0,
"step": 130
},
{
"entropy": 1.2005993926897645,
"epoch": 0.035331643084975876,
"grad_norm": 0.22760024666786194,
"learning_rate": 1.9298612928552734e-05,
"loss": 1.0873,
"mean_token_accuracy": 0.7063136156648397,
"num_tokens": 1031994.0,
"step": 135
},
{
"entropy": 1.2139225870370864,
"epoch": 0.0366402224584935,
"grad_norm": 0.2275068610906601,
"learning_rate": 1.9272441769170376e-05,
"loss": 1.2156,
"mean_token_accuracy": 0.6952568493783474,
"num_tokens": 1074569.0,
"step": 140
},
{
"entropy": 1.1324009403586388,
"epoch": 0.037948801832011124,
"grad_norm": 0.29410237073898315,
"learning_rate": 1.9246270609788015e-05,
"loss": 1.0779,
"mean_token_accuracy": 0.7172829706221819,
"num_tokens": 1113952.0,
"step": 145
},
{
"entropy": 1.2293724507093429,
"epoch": 0.03925738120552875,
"grad_norm": 0.37723538279533386,
"learning_rate": 1.9220099450405653e-05,
"loss": 1.146,
"mean_token_accuracy": 0.6888038989156484,
"num_tokens": 1148309.0,
"step": 150
},
{
"entropy": 1.2574020594358444,
"epoch": 0.04056596057904637,
"grad_norm": 0.2657281756401062,
"learning_rate": 1.9193928291023295e-05,
"loss": 1.1935,
"mean_token_accuracy": 0.6899809587746859,
"num_tokens": 1189282.0,
"step": 155
},
{
"entropy": 1.2693338964134455,
"epoch": 0.041874539952563995,
"grad_norm": 0.3656497895717621,
"learning_rate": 1.9167757131640933e-05,
"loss": 1.1972,
"mean_token_accuracy": 0.6868170712143182,
"num_tokens": 1227946.0,
"step": 160
},
{
"entropy": 1.2286424834281207,
"epoch": 0.043183119326081626,
"grad_norm": 0.3276367485523224,
"learning_rate": 1.914158597225857e-05,
"loss": 1.2015,
"mean_token_accuracy": 0.6974392894655466,
"num_tokens": 1263944.0,
"step": 165
},
{
"entropy": 1.1861646961420775,
"epoch": 0.04449169869959925,
"grad_norm": 0.23723381757736206,
"learning_rate": 1.9115414812876213e-05,
"loss": 1.0972,
"mean_token_accuracy": 0.7079249806702137,
"num_tokens": 1304585.0,
"step": 170
},
{
"entropy": 1.208771700784564,
"epoch": 0.045800278073116873,
"grad_norm": 0.26714691519737244,
"learning_rate": 1.908924365349385e-05,
"loss": 1.1027,
"mean_token_accuracy": 0.6939801312983036,
"num_tokens": 1348683.0,
"step": 175
},
{
"entropy": 1.2366201855242251,
"epoch": 0.0471088574466345,
"grad_norm": 0.2907228171825409,
"learning_rate": 1.906307249411149e-05,
"loss": 1.1874,
"mean_token_accuracy": 0.6881870869547129,
"num_tokens": 1383690.0,
"step": 180
},
{
"entropy": 1.1898431174457074,
"epoch": 0.04841743682015212,
"grad_norm": 0.307679146528244,
"learning_rate": 1.9036901334729128e-05,
"loss": 1.0673,
"mean_token_accuracy": 0.6996020458638668,
"num_tokens": 1421508.0,
"step": 185
},
{
"entropy": 1.187743027880788,
"epoch": 0.049726016193669745,
"grad_norm": 0.3183029890060425,
"learning_rate": 1.901073017534677e-05,
"loss": 1.0849,
"mean_token_accuracy": 0.7036747265607118,
"num_tokens": 1462197.0,
"step": 190
},
{
"entropy": 1.2066817820072173,
"epoch": 0.051034595567187375,
"grad_norm": 0.3657572865486145,
"learning_rate": 1.898455901596441e-05,
"loss": 1.1581,
"mean_token_accuracy": 0.6929362419992685,
"num_tokens": 1495983.0,
"step": 195
},
{
"entropy": 1.2126169182360171,
"epoch": 0.052343174940705,
"grad_norm": 0.29915717244148254,
"learning_rate": 1.8958387856582047e-05,
"loss": 1.209,
"mean_token_accuracy": 0.6987330242991447,
"num_tokens": 1533227.0,
"step": 200
},
{
"epoch": 0.052343174940705,
"eval_entropy": 1.1249961452484132,
"eval_loss": 1.1296290159225464,
"eval_mean_token_accuracy": 0.7111005320549011,
"eval_num_tokens": 1533227.0,
"eval_runtime": 96.9069,
"eval_samples_per_second": 10.319,
"eval_steps_per_second": 1.29,
"step": 200
},
{
"entropy": 1.2599914189428092,
"epoch": 0.05365175431422262,
"grad_norm": 0.27243492007255554,
"learning_rate": 1.893221669719969e-05,
"loss": 1.1688,
"mean_token_accuracy": 0.6900306183844804,
"num_tokens": 1571122.0,
"step": 205
},
{
"entropy": 1.182131139561534,
"epoch": 0.05496033368774025,
"grad_norm": 0.3210395276546478,
"learning_rate": 1.8906045537817327e-05,
"loss": 1.0958,
"mean_token_accuracy": 0.7051124095916748,
"num_tokens": 1610597.0,
"step": 210
},
{
"entropy": 1.1748668916523457,
"epoch": 0.05626891306125787,
"grad_norm": 0.289655864238739,
"learning_rate": 1.8879874378434965e-05,
"loss": 1.1017,
"mean_token_accuracy": 0.704343120381236,
"num_tokens": 1647358.0,
"step": 215
},
{
"entropy": 1.2121441032737494,
"epoch": 0.057577492434775494,
"grad_norm": 0.27067476511001587,
"learning_rate": 1.8853703219052607e-05,
"loss": 1.0814,
"mean_token_accuracy": 0.7023848608136177,
"num_tokens": 1683267.0,
"step": 220
},
{
"entropy": 1.2210756927728652,
"epoch": 0.058886071808293125,
"grad_norm": 0.2811789810657501,
"learning_rate": 1.8827532059670245e-05,
"loss": 1.1921,
"mean_token_accuracy": 0.6895555172115564,
"num_tokens": 1720402.0,
"step": 225
},
{
"entropy": 1.2133880224078895,
"epoch": 0.06019465118181075,
"grad_norm": 0.2670990526676178,
"learning_rate": 1.8801360900287884e-05,
"loss": 1.1452,
"mean_token_accuracy": 0.6956219498068095,
"num_tokens": 1760874.0,
"step": 230
},
{
"entropy": 1.1936849866062402,
"epoch": 0.06150323055532837,
"grad_norm": 0.23748992383480072,
"learning_rate": 1.8775189740905526e-05,
"loss": 1.1095,
"mean_token_accuracy": 0.701577215641737,
"num_tokens": 1798253.0,
"step": 235
},
{
"entropy": 1.1502909563481807,
"epoch": 0.062811809928846,
"grad_norm": 0.26068389415740967,
"learning_rate": 1.8749018581523164e-05,
"loss": 1.158,
"mean_token_accuracy": 0.7133214943110943,
"num_tokens": 1833443.0,
"step": 240
},
{
"entropy": 1.3078780982643365,
"epoch": 0.06412038930236362,
"grad_norm": 0.4137086272239685,
"learning_rate": 1.8722847422140802e-05,
"loss": 1.2878,
"mean_token_accuracy": 0.6780555315315724,
"num_tokens": 1871910.0,
"step": 245
},
{
"entropy": 1.2249418403953314,
"epoch": 0.06542896867588124,
"grad_norm": 0.2942444384098053,
"learning_rate": 1.869667626275844e-05,
"loss": 1.1637,
"mean_token_accuracy": 0.6950553271919488,
"num_tokens": 1909524.0,
"step": 250
},
{
"entropy": 1.211066035553813,
"epoch": 0.06673754804939887,
"grad_norm": 0.3243419826030731,
"learning_rate": 1.8670505103376082e-05,
"loss": 1.1516,
"mean_token_accuracy": 0.6988872833549976,
"num_tokens": 1949325.0,
"step": 255
},
{
"entropy": 1.1659034360200167,
"epoch": 0.06804612742291649,
"grad_norm": 0.2724755108356476,
"learning_rate": 1.864433394399372e-05,
"loss": 1.1041,
"mean_token_accuracy": 0.7077850546687842,
"num_tokens": 1992509.0,
"step": 260
},
{
"entropy": 1.1442662578076124,
"epoch": 0.06935470679643412,
"grad_norm": 0.22777320444583893,
"learning_rate": 1.861816278461136e-05,
"loss": 1.0722,
"mean_token_accuracy": 0.7042363248765469,
"num_tokens": 2025452.0,
"step": 265
},
{
"entropy": 1.1255293242633342,
"epoch": 0.07066328616995175,
"grad_norm": 0.30172210931777954,
"learning_rate": 1.8591991625229e-05,
"loss": 1.039,
"mean_token_accuracy": 0.7133398558944464,
"num_tokens": 2066169.0,
"step": 270
},
{
"entropy": 1.1561335507780313,
"epoch": 0.07197186554346938,
"grad_norm": 0.2582629323005676,
"learning_rate": 1.856582046584664e-05,
"loss": 1.1285,
"mean_token_accuracy": 0.6994227156043052,
"num_tokens": 2107302.0,
"step": 275
},
{
"entropy": 1.1572829529643058,
"epoch": 0.073280444916987,
"grad_norm": 0.3104718029499054,
"learning_rate": 1.8539649306464278e-05,
"loss": 1.0956,
"mean_token_accuracy": 0.706654816865921,
"num_tokens": 2143551.0,
"step": 280
},
{
"entropy": 1.1630447298288344,
"epoch": 0.07458902429050462,
"grad_norm": 0.23650068044662476,
"learning_rate": 1.851347814708192e-05,
"loss": 1.1061,
"mean_token_accuracy": 0.7114994272589683,
"num_tokens": 2184046.0,
"step": 285
},
{
"entropy": 1.1605928294360637,
"epoch": 0.07589760366402225,
"grad_norm": 0.2913525104522705,
"learning_rate": 1.8487306987699554e-05,
"loss": 1.1521,
"mean_token_accuracy": 0.7072546981275082,
"num_tokens": 2223868.0,
"step": 290
},
{
"entropy": 1.169267463684082,
"epoch": 0.07720618303753987,
"grad_norm": 0.2545417547225952,
"learning_rate": 1.8461135828317196e-05,
"loss": 1.1464,
"mean_token_accuracy": 0.7014396741986275,
"num_tokens": 2265467.0,
"step": 295
},
{
"entropy": 1.170828291401267,
"epoch": 0.0785147624110575,
"grad_norm": 0.3250999450683594,
"learning_rate": 1.8434964668934835e-05,
"loss": 1.1287,
"mean_token_accuracy": 0.7041407331824303,
"num_tokens": 2304656.0,
"step": 300
},
{
"epoch": 0.0785147624110575,
"eval_entropy": 1.1109629173278808,
"eval_loss": 1.1124978065490723,
"eval_mean_token_accuracy": 0.7139446325302125,
"eval_num_tokens": 2304656.0,
"eval_runtime": 96.7866,
"eval_samples_per_second": 10.332,
"eval_steps_per_second": 1.292,
"step": 300
},
{
"entropy": 1.2286480199545622,
"epoch": 0.07982334178457512,
"grad_norm": 0.2837753891944885,
"learning_rate": 1.8408793509552473e-05,
"loss": 1.1965,
"mean_token_accuracy": 0.6960091460496187,
"num_tokens": 2346109.0,
"step": 305
},
{
"entropy": 1.225469772517681,
"epoch": 0.08113192115809274,
"grad_norm": 0.3345058262348175,
"learning_rate": 1.8382622350170115e-05,
"loss": 1.2391,
"mean_token_accuracy": 0.6988137848675251,
"num_tokens": 2387084.0,
"step": 310
},
{
"entropy": 1.076605834439397,
"epoch": 0.08244050053161037,
"grad_norm": 0.2856815755367279,
"learning_rate": 1.8356451190787753e-05,
"loss": 1.0384,
"mean_token_accuracy": 0.7207759529352188,
"num_tokens": 2428351.0,
"step": 315
},
{
"entropy": 1.1501711413264275,
"epoch": 0.08374907990512799,
"grad_norm": 0.3421330451965332,
"learning_rate": 1.833028003140539e-05,
"loss": 1.1022,
"mean_token_accuracy": 0.7128844160586596,
"num_tokens": 2465278.0,
"step": 320
},
{
"entropy": 1.1183946415781976,
"epoch": 0.08505765927864561,
"grad_norm": 0.26048743724823,
"learning_rate": 1.8304108872023033e-05,
"loss": 1.1056,
"mean_token_accuracy": 0.7164099890738725,
"num_tokens": 2505150.0,
"step": 325
},
{
"entropy": 1.2478729356080294,
"epoch": 0.08636623865216325,
"grad_norm": 0.3070579767227173,
"learning_rate": 1.827793771264067e-05,
"loss": 1.1858,
"mean_token_accuracy": 0.6890709776431322,
"num_tokens": 2545535.0,
"step": 330
},
{
"entropy": 1.1905897859483958,
"epoch": 0.08767481802568088,
"grad_norm": 0.29979345202445984,
"learning_rate": 1.825176655325831e-05,
"loss": 1.1911,
"mean_token_accuracy": 0.6996586941182613,
"num_tokens": 2587914.0,
"step": 335
},
{
"entropy": 1.1719540372490882,
"epoch": 0.0889833973991985,
"grad_norm": 0.3220307528972626,
"learning_rate": 1.822559539387595e-05,
"loss": 1.1214,
"mean_token_accuracy": 0.7045478235930205,
"num_tokens": 2625193.0,
"step": 340
},
{
"entropy": 1.1586624164134265,
"epoch": 0.09029197677271612,
"grad_norm": 0.2599794566631317,
"learning_rate": 1.819942423449359e-05,
"loss": 1.0431,
"mean_token_accuracy": 0.7106049194931984,
"num_tokens": 2665612.0,
"step": 345
},
{
"entropy": 1.1819968853145837,
"epoch": 0.09160055614623375,
"grad_norm": 0.29061323404312134,
"learning_rate": 1.817325307511123e-05,
"loss": 1.1262,
"mean_token_accuracy": 0.7002443138509988,
"num_tokens": 2706946.0,
"step": 350
},
{
"entropy": 1.1458000589162112,
"epoch": 0.09290913551975137,
"grad_norm": 0.347755491733551,
"learning_rate": 1.8147081915728867e-05,
"loss": 1.0993,
"mean_token_accuracy": 0.7002494022250175,
"num_tokens": 2747375.0,
"step": 355
},
{
"entropy": 1.1966037943959236,
"epoch": 0.094217714893269,
"grad_norm": 0.37435945868492126,
"learning_rate": 1.812091075634651e-05,
"loss": 1.1504,
"mean_token_accuracy": 0.7029601544141769,
"num_tokens": 2790158.0,
"step": 360
},
{
"entropy": 1.1675037112087012,
"epoch": 0.09552629426678662,
"grad_norm": 0.32143616676330566,
"learning_rate": 1.8094739596964147e-05,
"loss": 1.0943,
"mean_token_accuracy": 0.7053755812346936,
"num_tokens": 2827819.0,
"step": 365
},
{
"entropy": 1.1707987286150456,
"epoch": 0.09683487364030424,
"grad_norm": 0.35289084911346436,
"learning_rate": 1.8068568437581785e-05,
"loss": 1.1038,
"mean_token_accuracy": 0.7076830513775348,
"num_tokens": 2869307.0,
"step": 370
},
{
"entropy": 1.1168341800570487,
"epoch": 0.09814345301382187,
"grad_norm": 0.3782387673854828,
"learning_rate": 1.8042397278199427e-05,
"loss": 1.0558,
"mean_token_accuracy": 0.7170991696417331,
"num_tokens": 2914107.0,
"step": 375
},
{
"entropy": 1.1111165493726731,
"epoch": 0.09945203238733949,
"grad_norm": 0.3120217025279999,
"learning_rate": 1.8016226118817065e-05,
"loss": 1.0232,
"mean_token_accuracy": 0.7135602042078972,
"num_tokens": 2952137.0,
"step": 380
},
{
"entropy": 1.1221987165510654,
"epoch": 0.10076061176085711,
"grad_norm": 0.3564794659614563,
"learning_rate": 1.7990054959434704e-05,
"loss": 1.0421,
"mean_token_accuracy": 0.712994397431612,
"num_tokens": 2992220.0,
"step": 385
},
{
"entropy": 1.1915016632527113,
"epoch": 0.10206919113437475,
"grad_norm": 0.3401761054992676,
"learning_rate": 1.7963883800052346e-05,
"loss": 1.1218,
"mean_token_accuracy": 0.704281483963132,
"num_tokens": 3032423.0,
"step": 390
},
{
"entropy": 1.1808374393731356,
"epoch": 0.10337777050789237,
"grad_norm": 0.32225707173347473,
"learning_rate": 1.7937712640669984e-05,
"loss": 1.1646,
"mean_token_accuracy": 0.6994639791548252,
"num_tokens": 3068229.0,
"step": 395
},
{
"entropy": 1.1816586170345544,
"epoch": 0.10468634988141,
"grad_norm": 0.29380306601524353,
"learning_rate": 1.7911541481287622e-05,
"loss": 1.1252,
"mean_token_accuracy": 0.7064340703189373,
"num_tokens": 3108153.0,
"step": 400
},
{
"epoch": 0.10468634988141,
"eval_entropy": 1.1162484121322631,
"eval_loss": 1.1014440059661865,
"eval_mean_token_accuracy": 0.7159673566818238,
"eval_num_tokens": 3108153.0,
"eval_runtime": 96.8755,
"eval_samples_per_second": 10.323,
"eval_steps_per_second": 1.29,
"step": 400
},
{
"entropy": 1.1671059928834437,
"epoch": 0.10599492925492762,
"grad_norm": 0.34220021963119507,
"learning_rate": 1.788537032190526e-05,
"loss": 1.0648,
"mean_token_accuracy": 0.7134943757206201,
"num_tokens": 3144634.0,
"step": 405
},
{
"entropy": 1.1742517005652189,
"epoch": 0.10730350862844525,
"grad_norm": 0.3622688949108124,
"learning_rate": 1.7859199162522902e-05,
"loss": 1.1752,
"mean_token_accuracy": 0.7035086948424578,
"num_tokens": 3184006.0,
"step": 410
},
{
"entropy": 1.193211056292057,
"epoch": 0.10861208800196287,
"grad_norm": 0.29637637734413147,
"learning_rate": 1.783302800314054e-05,
"loss": 1.1365,
"mean_token_accuracy": 0.6987239971756936,
"num_tokens": 3224565.0,
"step": 415
},
{
"entropy": 1.251770207285881,
"epoch": 0.1099206673754805,
"grad_norm": 0.2511119544506073,
"learning_rate": 1.780685684375818e-05,
"loss": 1.2718,
"mean_token_accuracy": 0.6891444839537144,
"num_tokens": 3265562.0,
"step": 420
},
{
"entropy": 1.23516104221344,
"epoch": 0.11122924674899812,
"grad_norm": 0.32558682560920715,
"learning_rate": 1.778068568437582e-05,
"loss": 1.2006,
"mean_token_accuracy": 0.6901515744626522,
"num_tokens": 3302848.0,
"step": 425
},
{
"entropy": 1.132848785072565,
"epoch": 0.11253782612251574,
"grad_norm": 0.26255035400390625,
"learning_rate": 1.775451452499346e-05,
"loss": 1.0432,
"mean_token_accuracy": 0.7118600439280272,
"num_tokens": 3342127.0,
"step": 430
},
{
"entropy": 1.180259444192052,
"epoch": 0.11384640549603336,
"grad_norm": 0.3205581307411194,
"learning_rate": 1.7728343365611098e-05,
"loss": 1.1224,
"mean_token_accuracy": 0.7004977215081454,
"num_tokens": 3381062.0,
"step": 435
},
{
"entropy": 1.249288512021303,
"epoch": 0.11515498486955099,
"grad_norm": 0.31854113936424255,
"learning_rate": 1.770217220622874e-05,
"loss": 1.2151,
"mean_token_accuracy": 0.6909528046846389,
"num_tokens": 3418674.0,
"step": 440
},
{
"entropy": 1.1658391878008842,
"epoch": 0.11646356424306861,
"grad_norm": 0.31040823459625244,
"learning_rate": 1.7676001046846374e-05,
"loss": 1.1628,
"mean_token_accuracy": 0.7044171277433634,
"num_tokens": 3459660.0,
"step": 445
},
{
"entropy": 1.1223286792635918,
"epoch": 0.11777214361658625,
"grad_norm": 0.387347936630249,
"learning_rate": 1.7649829887464016e-05,
"loss": 1.0412,
"mean_token_accuracy": 0.716064503788948,
"num_tokens": 3498595.0,
"step": 450
},
{
"entropy": 1.1415202051401139,
"epoch": 0.11908072299010387,
"grad_norm": 0.28736481070518494,
"learning_rate": 1.7623658728081658e-05,
"loss": 1.1526,
"mean_token_accuracy": 0.7126259196549654,
"num_tokens": 3538259.0,
"step": 455
},
{
"entropy": 1.1327776111662389,
"epoch": 0.1203893023636215,
"grad_norm": 0.2839507460594177,
"learning_rate": 1.7597487568699293e-05,
"loss": 1.0416,
"mean_token_accuracy": 0.7203302428126335,
"num_tokens": 3577846.0,
"step": 460
},
{
"entropy": 1.1565908435732126,
"epoch": 0.12169788173713912,
"grad_norm": 0.27812105417251587,
"learning_rate": 1.7571316409316935e-05,
"loss": 1.1039,
"mean_token_accuracy": 0.7012306869029998,
"num_tokens": 3621788.0,
"step": 465
},
{
"entropy": 1.2063092295080424,
"epoch": 0.12300646111065675,
"grad_norm": 0.38487356901168823,
"learning_rate": 1.7545145249934573e-05,
"loss": 1.1509,
"mean_token_accuracy": 0.7023797459900379,
"num_tokens": 3658439.0,
"step": 470
},
{
"entropy": 1.0865493860095738,
"epoch": 0.12431504048417437,
"grad_norm": 0.2500401735305786,
"learning_rate": 1.751897409055221e-05,
"loss": 1.003,
"mean_token_accuracy": 0.7198965962976217,
"num_tokens": 3698868.0,
"step": 475
},
{
"entropy": 1.1908767573535441,
"epoch": 0.125623619857692,
"grad_norm": 0.3962502181529999,
"learning_rate": 1.7492802931169853e-05,
"loss": 1.1297,
"mean_token_accuracy": 0.7053800851106644,
"num_tokens": 3741081.0,
"step": 480
},
{
"entropy": 1.1331688780337572,
"epoch": 0.12693219923120963,
"grad_norm": 0.2883553206920624,
"learning_rate": 1.746663177178749e-05,
"loss": 1.0536,
"mean_token_accuracy": 0.7061506755650043,
"num_tokens": 3780895.0,
"step": 485
},
{
"entropy": 1.097454984486103,
"epoch": 0.12824077860472724,
"grad_norm": 0.3009795844554901,
"learning_rate": 1.744046061240513e-05,
"loss": 1.0696,
"mean_token_accuracy": 0.7190258864313364,
"num_tokens": 3821605.0,
"step": 490
},
{
"entropy": 1.2135265406221152,
"epoch": 0.12954935797824488,
"grad_norm": 0.25389495491981506,
"learning_rate": 1.741428945302277e-05,
"loss": 1.2089,
"mean_token_accuracy": 0.693002799153328,
"num_tokens": 3861658.0,
"step": 495
},
{
"entropy": 1.2169387113302945,
"epoch": 0.1308579373517625,
"grad_norm": 0.2983970642089844,
"learning_rate": 1.738811829364041e-05,
"loss": 1.1668,
"mean_token_accuracy": 0.7012713517993688,
"num_tokens": 3903328.0,
"step": 500
},
{
"epoch": 0.1308579373517625,
"eval_entropy": 1.0979133324623107,
"eval_loss": 1.0918148756027222,
"eval_mean_token_accuracy": 0.7178149290084839,
"eval_num_tokens": 3903328.0,
"eval_runtime": 96.8747,
"eval_samples_per_second": 10.323,
"eval_steps_per_second": 1.29,
"step": 500
},
{
"entropy": 1.1749178305268289,
"epoch": 0.13216651672528013,
"grad_norm": 0.38501033186912537,
"learning_rate": 1.736194713425805e-05,
"loss": 1.1413,
"mean_token_accuracy": 0.707249428331852,
"num_tokens": 3944540.0,
"step": 505
},
{
"entropy": 1.1787702813744545,
"epoch": 0.13347509609879774,
"grad_norm": 0.43281009793281555,
"learning_rate": 1.7335775974875687e-05,
"loss": 1.1217,
"mean_token_accuracy": 0.7034583434462547,
"num_tokens": 3980926.0,
"step": 510
},
{
"entropy": 1.0645542204380036,
"epoch": 0.13478367547231537,
"grad_norm": 0.3150987923145294,
"learning_rate": 1.730960481549333e-05,
"loss": 1.05,
"mean_token_accuracy": 0.7132887698709964,
"num_tokens": 4021891.0,
"step": 515
},
{
"entropy": 1.1106295462697744,
"epoch": 0.13609225484583298,
"grad_norm": 0.3438737094402313,
"learning_rate": 1.7283433656110967e-05,
"loss": 1.0003,
"mean_token_accuracy": 0.7212319687008858,
"num_tokens": 4060166.0,
"step": 520
},
{
"entropy": 1.1168012116104364,
"epoch": 0.13740083421935062,
"grad_norm": 0.3113572895526886,
"learning_rate": 1.7257262496728605e-05,
"loss": 1.0543,
"mean_token_accuracy": 0.7209181990474463,
"num_tokens": 4094716.0,
"step": 525
},
{
"entropy": 1.175467026978731,
"epoch": 0.13870941359286823,
"grad_norm": 0.302837073802948,
"learning_rate": 1.7231091337346247e-05,
"loss": 1.1289,
"mean_token_accuracy": 0.7042849272489548,
"num_tokens": 4141339.0,
"step": 530
},
{
"entropy": 1.0599542949348688,
"epoch": 0.14001799296638587,
"grad_norm": 0.2495729923248291,
"learning_rate": 1.7204920177963885e-05,
"loss": 1.052,
"mean_token_accuracy": 0.7274259474128485,
"num_tokens": 4185050.0,
"step": 535
},
{
"entropy": 1.1991025242954492,
"epoch": 0.1413265723399035,
"grad_norm": 0.4198792576789856,
"learning_rate": 1.7178749018581524e-05,
"loss": 1.1902,
"mean_token_accuracy": 0.6947382442653179,
"num_tokens": 4223794.0,
"step": 540
},
{
"entropy": 1.227722565829754,
"epoch": 0.14263515171342112,
"grad_norm": 0.3961770236492157,
"learning_rate": 1.7152577859199166e-05,
"loss": 1.1986,
"mean_token_accuracy": 0.6865826688706875,
"num_tokens": 4255882.0,
"step": 545
},
{
"entropy": 1.1813674479722978,
"epoch": 0.14394373108693875,
"grad_norm": 0.33829399943351746,
"learning_rate": 1.7126406699816804e-05,
"loss": 1.1159,
"mean_token_accuracy": 0.7054100755602122,
"num_tokens": 4293931.0,
"step": 550
},
{
"entropy": 1.1598187141120433,
"epoch": 0.14525231046045636,
"grad_norm": 0.29768019914627075,
"learning_rate": 1.7100235540434442e-05,
"loss": 1.1149,
"mean_token_accuracy": 0.7081062331795692,
"num_tokens": 4331281.0,
"step": 555
},
{
"entropy": 1.1658945206552744,
"epoch": 0.146560889833974,
"grad_norm": 0.47267428040504456,
"learning_rate": 1.707406438105208e-05,
"loss": 1.126,
"mean_token_accuracy": 0.7110950201749802,
"num_tokens": 4367666.0,
"step": 560
},
{
"entropy": 1.107292691990733,
"epoch": 0.1478694692074916,
"grad_norm": 0.30985575914382935,
"learning_rate": 1.7047893221669722e-05,
"loss": 1.0491,
"mean_token_accuracy": 0.7133530873805285,
"num_tokens": 4404165.0,
"step": 565
},
{
"entropy": 1.0999628335237503,
"epoch": 0.14917804858100925,
"grad_norm": 0.38928133249282837,
"learning_rate": 1.702172206228736e-05,
"loss": 1.0697,
"mean_token_accuracy": 0.7206672951579094,
"num_tokens": 4444145.0,
"step": 570
},
{
"entropy": 1.0466942593455315,
"epoch": 0.15048662795452686,
"grad_norm": 0.3638196289539337,
"learning_rate": 1.6995550902905e-05,
"loss": 0.9879,
"mean_token_accuracy": 0.7333550229668617,
"num_tokens": 4486612.0,
"step": 575
},
{
"entropy": 1.1943504758179189,
"epoch": 0.1517952073280445,
"grad_norm": 0.35094064474105835,
"learning_rate": 1.696937974352264e-05,
"loss": 1.173,
"mean_token_accuracy": 0.7010251984000206,
"num_tokens": 4527971.0,
"step": 580
},
{
"entropy": 1.1672972977161407,
"epoch": 0.1531037867015621,
"grad_norm": 0.3714410662651062,
"learning_rate": 1.694320858414028e-05,
"loss": 1.1864,
"mean_token_accuracy": 0.6985016442835331,
"num_tokens": 4565626.0,
"step": 585
},
{
"entropy": 1.183141415938735,
"epoch": 0.15441236607507974,
"grad_norm": 0.3710881173610687,
"learning_rate": 1.6917037424757918e-05,
"loss": 1.1097,
"mean_token_accuracy": 0.709296465292573,
"num_tokens": 4601255.0,
"step": 590
},
{
"entropy": 1.1613754861056804,
"epoch": 0.15572094544859735,
"grad_norm": 0.4122501015663147,
"learning_rate": 1.689086626537556e-05,
"loss": 1.0919,
"mean_token_accuracy": 0.7126895777881146,
"num_tokens": 4635025.0,
"step": 595
},
{
"entropy": 1.1225487384945154,
"epoch": 0.157029524822115,
"grad_norm": 0.260109007358551,
"learning_rate": 1.6864695105993198e-05,
"loss": 1.074,
"mean_token_accuracy": 0.71206672526896,
"num_tokens": 4675929.0,
"step": 600
},
{
"epoch": 0.157029524822115,
"eval_entropy": 1.078299481868744,
"eval_loss": 1.0839170217514038,
"eval_mean_token_accuracy": 0.7196547708511353,
"eval_num_tokens": 4675929.0,
"eval_runtime": 96.9266,
"eval_samples_per_second": 10.317,
"eval_steps_per_second": 1.29,
"step": 600
},
{
"entropy": 1.214971611648798,
"epoch": 0.15833810419563263,
"grad_norm": 0.4801560044288635,
"learning_rate": 1.6838523946610836e-05,
"loss": 1.1404,
"mean_token_accuracy": 0.6989750042557716,
"num_tokens": 4714005.0,
"step": 605
},
{
"entropy": 1.1845928117632867,
"epoch": 0.15964668356915024,
"grad_norm": 0.30919569730758667,
"learning_rate": 1.6812352787228478e-05,
"loss": 1.2036,
"mean_token_accuracy": 0.698324055224657,
"num_tokens": 4755777.0,
"step": 610
},
{
"entropy": 1.0673103269189597,
"epoch": 0.16095526294266788,
"grad_norm": 0.2788703143596649,
"learning_rate": 1.6786181627846113e-05,
"loss": 0.9835,
"mean_token_accuracy": 0.719642236456275,
"num_tokens": 4795809.0,
"step": 615
},
{
"entropy": 1.1469231896102428,
"epoch": 0.16226384231618549,
"grad_norm": 0.3465251624584198,
"learning_rate": 1.6760010468463755e-05,
"loss": 1.0888,
"mean_token_accuracy": 0.7080798294395209,
"num_tokens": 4837156.0,
"step": 620
},
{
"entropy": 1.1547171102836729,
"epoch": 0.16357242168970312,
"grad_norm": 0.25486519932746887,
"learning_rate": 1.6733839309081393e-05,
"loss": 1.0928,
"mean_token_accuracy": 0.7026465550065041,
"num_tokens": 4877484.0,
"step": 625
},
{
"entropy": 1.1483009189367295,
"epoch": 0.16488100106322073,
"grad_norm": 0.2877664566040039,
"learning_rate": 1.670766814969903e-05,
"loss": 1.0912,
"mean_token_accuracy": 0.7045180670917034,
"num_tokens": 4916567.0,
"step": 630
},
{
"entropy": 1.057371946424246,
"epoch": 0.16618958043673837,
"grad_norm": 0.29588788747787476,
"learning_rate": 1.6681496990316673e-05,
"loss": 1.0147,
"mean_token_accuracy": 0.733672034740448,
"num_tokens": 4955375.0,
"step": 635
},
{
"entropy": 1.105701708048582,
"epoch": 0.16749815981025598,
"grad_norm": 0.305503785610199,
"learning_rate": 1.665532583093431e-05,
"loss": 1.0907,
"mean_token_accuracy": 0.718507794290781,
"num_tokens": 4994274.0,
"step": 640
},
{
"entropy": 1.1482198983430862,
"epoch": 0.16880673918377362,
"grad_norm": 0.34045520424842834,
"learning_rate": 1.662915467155195e-05,
"loss": 1.0737,
"mean_token_accuracy": 0.7120114967226983,
"num_tokens": 5028739.0,
"step": 645
},
{
"entropy": 1.1510255340486766,
"epoch": 0.17011531855729123,
"grad_norm": 0.3617549240589142,
"learning_rate": 1.660298351216959e-05,
"loss": 1.0989,
"mean_token_accuracy": 0.7049180768430233,
"num_tokens": 5067269.0,
"step": 650
},
{
"entropy": 1.1354583825916051,
"epoch": 0.17142389793080887,
"grad_norm": 0.2900081276893616,
"learning_rate": 1.657681235278723e-05,
"loss": 1.07,
"mean_token_accuracy": 0.7162535794079303,
"num_tokens": 5105740.0,
"step": 655
},
{
"entropy": 1.1124658603221178,
"epoch": 0.1727324773043265,
"grad_norm": 0.31473371386528015,
"learning_rate": 1.655064119340487e-05,
"loss": 1.0722,
"mean_token_accuracy": 0.7151258502155542,
"num_tokens": 5144288.0,
"step": 660
},
{
"entropy": 1.1872437849640847,
"epoch": 0.1740410566778441,
"grad_norm": 0.33180883526802063,
"learning_rate": 1.6524470034022507e-05,
"loss": 1.1771,
"mean_token_accuracy": 0.6998372994363308,
"num_tokens": 5182259.0,
"step": 665
},
{
"entropy": 1.140797757729888,
"epoch": 0.17534963605136175,
"grad_norm": 0.3464796245098114,
"learning_rate": 1.649829887464015e-05,
"loss": 1.088,
"mean_token_accuracy": 0.7082854315638543,
"num_tokens": 5221128.0,
"step": 670
},
{
"entropy": 1.1274819798767566,
"epoch": 0.17665821542487936,
"grad_norm": 0.3252648711204529,
"learning_rate": 1.6472127715257787e-05,
"loss": 1.0973,
"mean_token_accuracy": 0.7148232467472553,
"num_tokens": 5256205.0,
"step": 675
},
{
"entropy": 1.1311011631041765,
"epoch": 0.177966794798397,
"grad_norm": 0.6462693214416504,
"learning_rate": 1.6445956555875425e-05,
"loss": 1.0543,
"mean_token_accuracy": 0.7152682617306709,
"num_tokens": 5292320.0,
"step": 680
},
{
"entropy": 1.0446813367307186,
"epoch": 0.1792753741719146,
"grad_norm": 0.28639963269233704,
"learning_rate": 1.6419785396493067e-05,
"loss": 0.9461,
"mean_token_accuracy": 0.7340242311358451,
"num_tokens": 5334639.0,
"step": 685
},
{
"entropy": 1.1492939487099647,
"epoch": 0.18058395354543225,
"grad_norm": 0.34751448035240173,
"learning_rate": 1.6393614237110705e-05,
"loss": 1.0876,
"mean_token_accuracy": 0.71434987783432,
"num_tokens": 5371003.0,
"step": 690
},
{
"entropy": 1.1387122191488743,
"epoch": 0.18189253291894986,
"grad_norm": 0.32928481698036194,
"learning_rate": 1.6367443077728344e-05,
"loss": 1.1427,
"mean_token_accuracy": 0.7110460348427295,
"num_tokens": 5408983.0,
"step": 695
},
{
"entropy": 1.202421073615551,
"epoch": 0.1832011122924675,
"grad_norm": 0.36161890625953674,
"learning_rate": 1.6341271918345986e-05,
"loss": 1.17,
"mean_token_accuracy": 0.6983021099120379,
"num_tokens": 5449270.0,
"step": 700
},
{
"epoch": 0.1832011122924675,
"eval_entropy": 1.0868224625587464,
"eval_loss": 1.078864336013794,
"eval_mean_token_accuracy": 0.7203193211555481,
"eval_num_tokens": 5449270.0,
"eval_runtime": 96.8653,
"eval_samples_per_second": 10.324,
"eval_steps_per_second": 1.29,
"step": 700
},
{
"entropy": 1.139945473894477,
"epoch": 0.1845096916659851,
"grad_norm": 0.34418225288391113,
"learning_rate": 1.6315100758963624e-05,
"loss": 1.0577,
"mean_token_accuracy": 0.7123764835298061,
"num_tokens": 5486900.0,
"step": 705
},
{
"entropy": 1.2072007820010184,
"epoch": 0.18581827103950274,
"grad_norm": 0.2719118595123291,
"learning_rate": 1.6288929599581262e-05,
"loss": 1.165,
"mean_token_accuracy": 0.6938376247882843,
"num_tokens": 5526941.0,
"step": 710
},
{
"entropy": 1.254874274134636,
"epoch": 0.18712685041302035,
"grad_norm": 0.42757540941238403,
"learning_rate": 1.6262758440198904e-05,
"loss": 1.2349,
"mean_token_accuracy": 0.6856517255306244,
"num_tokens": 5563702.0,
"step": 715
},
{
"entropy": 1.0055264480412007,
"epoch": 0.188435429786538,
"grad_norm": 0.3122069537639618,
"learning_rate": 1.6236587280816542e-05,
"loss": 0.9459,
"mean_token_accuracy": 0.7404189445078373,
"num_tokens": 5607678.0,
"step": 720
},
{
"entropy": 1.1009038139134646,
"epoch": 0.18974400916005563,
"grad_norm": 0.31834590435028076,
"learning_rate": 1.621041612143418e-05,
"loss": 1.0656,
"mean_token_accuracy": 0.7191543571650982,
"num_tokens": 5647310.0,
"step": 725
},
{
"entropy": 1.0673759788274766,
"epoch": 0.19105258853357324,
"grad_norm": 0.2675130069255829,
"learning_rate": 1.618424496205182e-05,
"loss": 1.0109,
"mean_token_accuracy": 0.7255190499126911,
"num_tokens": 5685639.0,
"step": 730
},
{
"entropy": 1.1511727664619684,
"epoch": 0.19236116790709087,
"grad_norm": 0.37566685676574707,
"learning_rate": 1.615807380266946e-05,
"loss": 1.0625,
"mean_token_accuracy": 0.7034743718802929,
"num_tokens": 5720880.0,
"step": 735
},
{
"entropy": 1.0888585902750492,
"epoch": 0.19366974728060848,
"grad_norm": 0.25365257263183594,
"learning_rate": 1.61319026432871e-05,
"loss": 1.049,
"mean_token_accuracy": 0.7239378560334444,
"num_tokens": 5762559.0,
"step": 740
},
{
"entropy": 1.1175981510430575,
"epoch": 0.19497832665412612,
"grad_norm": 0.43423643708229065,
"learning_rate": 1.6105731483904738e-05,
"loss": 1.0806,
"mean_token_accuracy": 0.7193359076976776,
"num_tokens": 5803927.0,
"step": 745
},
{
"entropy": 1.103783832490444,
"epoch": 0.19628690602764373,
"grad_norm": 0.38531434535980225,
"learning_rate": 1.607956032452238e-05,
"loss": 1.0503,
"mean_token_accuracy": 0.722858403250575,
"num_tokens": 5845809.0,
"step": 750
},
{
"entropy": 1.1102315738797188,
"epoch": 0.19759548540116137,
"grad_norm": 0.3240512013435364,
"learning_rate": 1.6053389165140018e-05,
"loss": 1.074,
"mean_token_accuracy": 0.7143935695290565,
"num_tokens": 5884671.0,
"step": 755
},
{
"entropy": 1.1394222188740968,
"epoch": 0.19890406477467898,
"grad_norm": 0.35714927315711975,
"learning_rate": 1.6027218005757656e-05,
"loss": 1.0505,
"mean_token_accuracy": 0.7142257861793041,
"num_tokens": 5923161.0,
"step": 760
},
{
"entropy": 1.10749419555068,
"epoch": 0.20021264414819662,
"grad_norm": 0.28041279315948486,
"learning_rate": 1.6001046846375298e-05,
"loss": 1.0247,
"mean_token_accuracy": 0.7158019613474608,
"num_tokens": 5964592.0,
"step": 765
},
{
"entropy": 1.1163969319313765,
"epoch": 0.20152122352171423,
"grad_norm": 0.3620986342430115,
"learning_rate": 1.5974875686992933e-05,
"loss": 1.1243,
"mean_token_accuracy": 0.7199557162821293,
"num_tokens": 6002121.0,
"step": 770
},
{
"entropy": 1.1485939621925354,
"epoch": 0.20282980289523186,
"grad_norm": 0.28997066617012024,
"learning_rate": 1.5948704527610575e-05,
"loss": 1.1103,
"mean_token_accuracy": 0.7058781541883945,
"num_tokens": 6042026.0,
"step": 775
},
{
"entropy": 1.163365462049842,
"epoch": 0.2041383822687495,
"grad_norm": 0.3916630148887634,
"learning_rate": 1.5922533368228213e-05,
"loss": 1.1024,
"mean_token_accuracy": 0.7097311306744813,
"num_tokens": 6081711.0,
"step": 780
},
{
"entropy": 1.1414872355759145,
"epoch": 0.2054469616422671,
"grad_norm": 0.3654806315898895,
"learning_rate": 1.589636220884585e-05,
"loss": 1.1151,
"mean_token_accuracy": 0.7132655665278435,
"num_tokens": 6124022.0,
"step": 785
},
{
"entropy": 1.1237910758703946,
"epoch": 0.20675554101578475,
"grad_norm": 0.30389606952667236,
"learning_rate": 1.5870191049463493e-05,
"loss": 1.071,
"mean_token_accuracy": 0.7095677696168423,
"num_tokens": 6163807.0,
"step": 790
},
{
"entropy": 1.108309706300497,
"epoch": 0.20806412038930236,
"grad_norm": 0.3702613413333893,
"learning_rate": 1.584401989008113e-05,
"loss": 1.0634,
"mean_token_accuracy": 0.7195238072425127,
"num_tokens": 6201204.0,
"step": 795
},
{
"entropy": 1.114043417200446,
"epoch": 0.20937269976282,
"grad_norm": 0.35386136174201965,
"learning_rate": 1.581784873069877e-05,
"loss": 1.0579,
"mean_token_accuracy": 0.7175327345728875,
"num_tokens": 6241249.0,
"step": 800
},
{
"epoch": 0.20937269976282,
"eval_entropy": 1.0886374835968018,
"eval_loss": 1.0736411809921265,
"eval_mean_token_accuracy": 0.7216972298622132,
"eval_num_tokens": 6241249.0,
"eval_runtime": 96.8086,
"eval_samples_per_second": 10.33,
"eval_steps_per_second": 1.291,
"step": 800
},
{
"entropy": 1.1533120274543762,
"epoch": 0.2106812791363376,
"grad_norm": 0.37774205207824707,
"learning_rate": 1.579167757131641e-05,
"loss": 1.1071,
"mean_token_accuracy": 0.7110585495829582,
"num_tokens": 6280255.0,
"step": 805
},
{
"entropy": 1.1299995694309473,
"epoch": 0.21198985850985524,
"grad_norm": 0.3383270502090454,
"learning_rate": 1.576550641193405e-05,
"loss": 1.1167,
"mean_token_accuracy": 0.7196012400090694,
"num_tokens": 6320822.0,
"step": 810
},
{
"entropy": 1.132519034296274,
"epoch": 0.21329843788337285,
"grad_norm": 0.4295906126499176,
"learning_rate": 1.573933525255169e-05,
"loss": 1.0924,
"mean_token_accuracy": 0.7042114794254303,
"num_tokens": 6356129.0,
"step": 815
},
{
"entropy": 1.0832153180614115,
"epoch": 0.2146070172568905,
"grad_norm": 0.3158726394176483,
"learning_rate": 1.571316409316933e-05,
"loss": 1.0307,
"mean_token_accuracy": 0.7189163245260716,
"num_tokens": 6396277.0,
"step": 820
},
{
"entropy": 1.1116347532719373,
"epoch": 0.2159155966304081,
"grad_norm": 0.41325512528419495,
"learning_rate": 1.568699293378697e-05,
"loss": 1.0844,
"mean_token_accuracy": 0.7163894921541214,
"num_tokens": 6437087.0,
"step": 825
},
{
"entropy": 1.143670390546322,
"epoch": 0.21722417600392574,
"grad_norm": 0.4080452620983124,
"learning_rate": 1.5660821774404607e-05,
"loss": 1.1135,
"mean_token_accuracy": 0.7132163152098656,
"num_tokens": 6476092.0,
"step": 830
},
{
"entropy": 1.0523447673767805,
"epoch": 0.21853275537744335,
"grad_norm": 0.4133390784263611,
"learning_rate": 1.5634650615022245e-05,
"loss": 0.9998,
"mean_token_accuracy": 0.7262923561036587,
"num_tokens": 6513582.0,
"step": 835
},
{
"entropy": 1.1258170191198587,
"epoch": 0.219841334750961,
"grad_norm": 0.30965185165405273,
"learning_rate": 1.5608479455639887e-05,
"loss": 1.0456,
"mean_token_accuracy": 0.7124281242489815,
"num_tokens": 6548126.0,
"step": 840
},
{
"entropy": 0.9893411479890346,
"epoch": 0.22114991412447862,
"grad_norm": 0.2973790168762207,
"learning_rate": 1.5582308296257525e-05,
"loss": 0.9799,
"mean_token_accuracy": 0.7382751934230327,
"num_tokens": 6586710.0,
"step": 845
},
{
"entropy": 1.0379876332357525,
"epoch": 0.22245849349799623,
"grad_norm": 0.3356630206108093,
"learning_rate": 1.5556137136875164e-05,
"loss": 0.996,
"mean_token_accuracy": 0.7286129504442215,
"num_tokens": 6619347.0,
"step": 850
},
{
"entropy": 1.1024742640554905,
"epoch": 0.22376707287151387,
"grad_norm": 0.2636205554008484,
"learning_rate": 1.5529965977492806e-05,
"loss": 1.0735,
"mean_token_accuracy": 0.7184439640492201,
"num_tokens": 6659706.0,
"step": 855
},
{
"entropy": 1.0713739044964314,
"epoch": 0.22507565224503148,
"grad_norm": 0.31522178649902344,
"learning_rate": 1.5503794818110444e-05,
"loss": 1.0024,
"mean_token_accuracy": 0.7307705961167812,
"num_tokens": 6697262.0,
"step": 860
},
{
"entropy": 1.104831437766552,
"epoch": 0.22638423161854912,
"grad_norm": 0.4077572822570801,
"learning_rate": 1.5477623658728082e-05,
"loss": 1.0503,
"mean_token_accuracy": 0.714413607865572,
"num_tokens": 6736296.0,
"step": 865
},
{
"entropy": 1.111717356555164,
"epoch": 0.22769281099206673,
"grad_norm": 0.29983147978782654,
"learning_rate": 1.5451452499345724e-05,
"loss": 1.0415,
"mean_token_accuracy": 0.7202403925359249,
"num_tokens": 6771577.0,
"step": 870
},
{
"entropy": 1.0804973103106021,
"epoch": 0.22900139036558437,
"grad_norm": 0.3723820149898529,
"learning_rate": 1.5425281339963362e-05,
"loss": 1.0643,
"mean_token_accuracy": 0.7192892145365477,
"num_tokens": 6809238.0,
"step": 875
},
{
"entropy": 1.1236145053058864,
"epoch": 0.23030996973910198,
"grad_norm": 0.3762100338935852,
"learning_rate": 1.5399110180581e-05,
"loss": 1.1046,
"mean_token_accuracy": 0.7134812079370022,
"num_tokens": 6849361.0,
"step": 880
},
{
"entropy": 1.1618900313973426,
"epoch": 0.23161854911261961,
"grad_norm": 0.30742284655570984,
"learning_rate": 1.537293902119864e-05,
"loss": 1.1275,
"mean_token_accuracy": 0.7040756050497293,
"num_tokens": 6888648.0,
"step": 885
},
{
"entropy": 1.0937749415636062,
"epoch": 0.23292712848613722,
"grad_norm": 0.3493000864982605,
"learning_rate": 1.534676786181628e-05,
"loss": 0.9861,
"mean_token_accuracy": 0.7299770966172219,
"num_tokens": 6930330.0,
"step": 890
},
{
"entropy": 1.1693835414946079,
"epoch": 0.23423570785965486,
"grad_norm": 0.4292133152484894,
"learning_rate": 1.532059670243392e-05,
"loss": 1.1322,
"mean_token_accuracy": 0.7005651116371154,
"num_tokens": 6967178.0,
"step": 895
},
{
"entropy": 1.1351615231484176,
"epoch": 0.2355442872331725,
"grad_norm": 0.375841349363327,
"learning_rate": 1.5294425543051558e-05,
"loss": 1.0663,
"mean_token_accuracy": 0.7153576008975506,
"num_tokens": 7004358.0,
"step": 900
},
{
"epoch": 0.2355442872331725,
"eval_entropy": 1.0517715344429015,
"eval_loss": 1.0706533193588257,
"eval_mean_token_accuracy": 0.7221157221794129,
"eval_num_tokens": 7004358.0,
"eval_runtime": 97.0639,
"eval_samples_per_second": 10.302,
"eval_steps_per_second": 1.288,
"step": 900
},
{
"entropy": 1.0132040306925774,
"epoch": 0.2368528666066901,
"grad_norm": 0.3356113135814667,
"learning_rate": 1.52682543836692e-05,
"loss": 1.0007,
"mean_token_accuracy": 0.7337090380489826,
"num_tokens": 7045486.0,
"step": 905
},
{
"entropy": 1.0505661655217409,
"epoch": 0.23816144598020775,
"grad_norm": 0.3069000244140625,
"learning_rate": 1.5242083224286836e-05,
"loss": 1.0113,
"mean_token_accuracy": 0.7300900310277939,
"num_tokens": 7087578.0,
"step": 910
},
{
"entropy": 1.0948154201731086,
"epoch": 0.23947002535372536,
"grad_norm": 0.26865532994270325,
"learning_rate": 1.5215912064904476e-05,
"loss": 0.996,
"mean_token_accuracy": 0.7241026997566223,
"num_tokens": 7125621.0,
"step": 915
},
{
"entropy": 1.0862870909273625,
"epoch": 0.240778604727243,
"grad_norm": 0.3076420724391937,
"learning_rate": 1.5189740905522116e-05,
"loss": 1.0411,
"mean_token_accuracy": 0.7218753002583981,
"num_tokens": 7163248.0,
"step": 920
},
{
"entropy": 1.0920290663838386,
"epoch": 0.2420871841007606,
"grad_norm": 0.35735777020454407,
"learning_rate": 1.5163569746139755e-05,
"loss": 0.9907,
"mean_token_accuracy": 0.7253928408026695,
"num_tokens": 7198313.0,
"step": 925
},
{
"entropy": 1.0561616513878107,
"epoch": 0.24339576347427824,
"grad_norm": 0.3752039670944214,
"learning_rate": 1.5137398586757395e-05,
"loss": 1.0076,
"mean_token_accuracy": 0.7270361024886369,
"num_tokens": 7236629.0,
"step": 930
},
{
"entropy": 1.192596261948347,
"epoch": 0.24470434284779585,
"grad_norm": 0.3303165137767792,
"learning_rate": 1.5111227427375035e-05,
"loss": 1.1446,
"mean_token_accuracy": 0.7021023813635111,
"num_tokens": 7278903.0,
"step": 935
},
{
"entropy": 1.1245649460703135,
"epoch": 0.2460129222213135,
"grad_norm": 0.43512144684791565,
"learning_rate": 1.5085056267992673e-05,
"loss": 1.0783,
"mean_token_accuracy": 0.7149506479501724,
"num_tokens": 7314092.0,
"step": 940
},
{
"entropy": 1.1568658344447613,
"epoch": 0.2473215015948311,
"grad_norm": 0.2600148320198059,
"learning_rate": 1.5058885108610313e-05,
"loss": 1.1493,
"mean_token_accuracy": 0.7064739305526018,
"num_tokens": 7354852.0,
"step": 945
},
{
"entropy": 1.1390374675393105,
"epoch": 0.24863008096834874,
"grad_norm": 0.4230680465698242,
"learning_rate": 1.5032713949227953e-05,
"loss": 1.051,
"mean_token_accuracy": 0.7147066026926041,
"num_tokens": 7391988.0,
"step": 950
},
{
"entropy": 1.1157769251614809,
"epoch": 0.24993866034186635,
"grad_norm": 0.31027132272720337,
"learning_rate": 1.500654278984559e-05,
"loss": 1.1055,
"mean_token_accuracy": 0.7114299058914184,
"num_tokens": 7430698.0,
"step": 955
},
{
"entropy": 1.1084700468927622,
"epoch": 0.251247239715384,
"grad_norm": 0.3883765935897827,
"learning_rate": 1.498037163046323e-05,
"loss": 1.0371,
"mean_token_accuracy": 0.7152060814201832,
"num_tokens": 7466746.0,
"step": 960
},
{
"entropy": 1.1067252047359943,
"epoch": 0.2525558190889016,
"grad_norm": 0.3586679697036743,
"learning_rate": 1.495420047108087e-05,
"loss": 1.091,
"mean_token_accuracy": 0.7141210325062275,
"num_tokens": 7508827.0,
"step": 965
},
{
"entropy": 1.1788568448275327,
"epoch": 0.25386439846241926,
"grad_norm": 0.28028714656829834,
"learning_rate": 1.4928029311698508e-05,
"loss": 1.1108,
"mean_token_accuracy": 0.7068845115602016,
"num_tokens": 7549412.0,
"step": 970
},
{
"entropy": 1.1229758808389305,
"epoch": 0.25517297783593684,
"grad_norm": 0.25545772910118103,
"learning_rate": 1.4901858152316149e-05,
"loss": 1.0713,
"mean_token_accuracy": 0.7074509769678116,
"num_tokens": 7587152.0,
"step": 975
},
{
"entropy": 1.0951409205794334,
"epoch": 0.2564815572094545,
"grad_norm": 0.4516986608505249,
"learning_rate": 1.4875686992933789e-05,
"loss": 0.9969,
"mean_token_accuracy": 0.723530513048172,
"num_tokens": 7623760.0,
"step": 980
},
{
"entropy": 1.1083605211228131,
"epoch": 0.2577901365829721,
"grad_norm": 0.28635019063949585,
"learning_rate": 1.4849515833551427e-05,
"loss": 1.0367,
"mean_token_accuracy": 0.7231942892074585,
"num_tokens": 7670277.0,
"step": 985
},
{
"entropy": 1.140053043141961,
"epoch": 0.25909871595648976,
"grad_norm": 0.29921141266822815,
"learning_rate": 1.4823344674169067e-05,
"loss": 1.0606,
"mean_token_accuracy": 0.7078870553523302,
"num_tokens": 7711405.0,
"step": 990
},
{
"entropy": 1.1648910887539388,
"epoch": 0.26040729533000734,
"grad_norm": 0.44739824533462524,
"learning_rate": 1.4797173514786707e-05,
"loss": 1.0833,
"mean_token_accuracy": 0.7147607050836087,
"num_tokens": 7745597.0,
"step": 995
},
{
"entropy": 1.1015649508684873,
"epoch": 0.261715874703525,
"grad_norm": 0.3616364598274231,
"learning_rate": 1.4771002355404345e-05,
"loss": 1.102,
"mean_token_accuracy": 0.7119447905570269,
"num_tokens": 7778271.0,
"step": 1000
},
{
"epoch": 0.261715874703525,
"eval_entropy": 1.0673753538131714,
"eval_loss": 1.0669485330581665,
"eval_mean_token_accuracy": 0.7229559454917908,
"eval_num_tokens": 7778271.0,
"eval_runtime": 96.9367,
"eval_samples_per_second": 10.316,
"eval_steps_per_second": 1.29,
"step": 1000
},
{
"entropy": 1.1813726715743542,
"epoch": 0.2630244540770426,
"grad_norm": 0.34745144844055176,
"learning_rate": 1.4744831196021986e-05,
"loss": 1.1606,
"mean_token_accuracy": 0.7018857698887586,
"num_tokens": 7820880.0,
"step": 1005
},
{
"entropy": 1.1542737431824208,
"epoch": 0.26433303345056025,
"grad_norm": 0.37115049362182617,
"learning_rate": 1.4718660036639626e-05,
"loss": 1.102,
"mean_token_accuracy": 0.7048508331179619,
"num_tokens": 7858918.0,
"step": 1010
},
{
"entropy": 1.0566180381923913,
"epoch": 0.26564161282407783,
"grad_norm": 0.3299511671066284,
"learning_rate": 1.4692488877257262e-05,
"loss": 0.9832,
"mean_token_accuracy": 0.7326587349176407,
"num_tokens": 7896026.0,
"step": 1015
},
{
"entropy": 1.131032995507121,
"epoch": 0.26695019219759547,
"grad_norm": 0.3021943271160126,
"learning_rate": 1.4666317717874902e-05,
"loss": 1.1154,
"mean_token_accuracy": 0.7120835833251477,
"num_tokens": 7936716.0,
"step": 1020
},
{
"entropy": 1.1042870879173279,
"epoch": 0.2682587715711131,
"grad_norm": 0.47742941975593567,
"learning_rate": 1.4640146558492542e-05,
"loss": 1.047,
"mean_token_accuracy": 0.7116286411881447,
"num_tokens": 7973759.0,
"step": 1025
},
{
"entropy": 1.1014587018638848,
"epoch": 0.26956735094463075,
"grad_norm": 0.36218902468681335,
"learning_rate": 1.461397539911018e-05,
"loss": 1.0494,
"mean_token_accuracy": 0.7173497840762139,
"num_tokens": 8012730.0,
"step": 1030
},
{
"entropy": 1.1734120152890681,
"epoch": 0.2708759303181484,
"grad_norm": 0.38173866271972656,
"learning_rate": 1.458780423972782e-05,
"loss": 1.0801,
"mean_token_accuracy": 0.7082495357841253,
"num_tokens": 8053786.0,
"step": 1035
},
{
"entropy": 1.102143411897123,
"epoch": 0.27218450969166597,
"grad_norm": 0.32196053862571716,
"learning_rate": 1.4561633080345461e-05,
"loss": 1.0723,
"mean_token_accuracy": 0.7233434859663248,
"num_tokens": 8092132.0,
"step": 1040
},
{
"entropy": 1.0193447671830653,
"epoch": 0.2734930890651836,
"grad_norm": 0.3907255530357361,
"learning_rate": 1.4535461920963101e-05,
"loss": 0.9895,
"mean_token_accuracy": 0.7300974868237973,
"num_tokens": 8131337.0,
"step": 1045
},
{
"entropy": 1.105736630409956,
"epoch": 0.27480166843870124,
"grad_norm": 0.564200758934021,
"learning_rate": 1.450929076158074e-05,
"loss": 1.0985,
"mean_token_accuracy": 0.7164596065878868,
"num_tokens": 8170291.0,
"step": 1050
},
{
"entropy": 1.0954801928251983,
"epoch": 0.2761102478122189,
"grad_norm": 0.3417541980743408,
"learning_rate": 1.448311960219838e-05,
"loss": 1.0295,
"mean_token_accuracy": 0.7208357889205217,
"num_tokens": 8205888.0,
"step": 1055
},
{
"entropy": 1.164321320876479,
"epoch": 0.27741882718573646,
"grad_norm": 0.450543075799942,
"learning_rate": 1.445694844281602e-05,
"loss": 1.0975,
"mean_token_accuracy": 0.7097847059369087,
"num_tokens": 8242915.0,
"step": 1060
},
{
"entropy": 1.1172142535448075,
"epoch": 0.2787274065592541,
"grad_norm": 0.3983106315135956,
"learning_rate": 1.4430777283433656e-05,
"loss": 1.0618,
"mean_token_accuracy": 0.7233660608530045,
"num_tokens": 8280049.0,
"step": 1065
},
{
"entropy": 1.1097337387502193,
"epoch": 0.28003598593277174,
"grad_norm": 0.3336636424064636,
"learning_rate": 1.4404606124051296e-05,
"loss": 1.0713,
"mean_token_accuracy": 0.7141048096120357,
"num_tokens": 8320724.0,
"step": 1070
},
{
"entropy": 1.143090207502246,
"epoch": 0.2813445653062894,
"grad_norm": 0.47186407446861267,
"learning_rate": 1.4378434964668936e-05,
"loss": 1.1169,
"mean_token_accuracy": 0.7122757855802775,
"num_tokens": 8355846.0,
"step": 1075
},
{
"entropy": 1.116348212212324,
"epoch": 0.282653144679807,
"grad_norm": 0.3499598205089569,
"learning_rate": 1.4352263805286575e-05,
"loss": 1.0442,
"mean_token_accuracy": 0.7156722906976938,
"num_tokens": 8394668.0,
"step": 1080
},
{
"entropy": 1.2300225652754306,
"epoch": 0.2839617240533246,
"grad_norm": 0.3478023111820221,
"learning_rate": 1.4326092645904215e-05,
"loss": 1.1722,
"mean_token_accuracy": 0.6905759438872338,
"num_tokens": 8431587.0,
"step": 1085
},
{
"entropy": 1.0534675564616918,
"epoch": 0.28527030342684223,
"grad_norm": 0.38736996054649353,
"learning_rate": 1.4299921486521855e-05,
"loss": 1.0075,
"mean_token_accuracy": 0.7261179834604263,
"num_tokens": 8469819.0,
"step": 1090
},
{
"entropy": 1.0744053564965725,
"epoch": 0.28657888280035987,
"grad_norm": 0.32573196291923523,
"learning_rate": 1.4273750327139493e-05,
"loss": 1.0065,
"mean_token_accuracy": 0.727313157171011,
"num_tokens": 8507289.0,
"step": 1095
},
{
"entropy": 1.1115598980337382,
"epoch": 0.2878874621738775,
"grad_norm": 0.2983573079109192,
"learning_rate": 1.4247579167757133e-05,
"loss": 1.0318,
"mean_token_accuracy": 0.7211863253265619,
"num_tokens": 8541734.0,
"step": 1100
},
{
"epoch": 0.2878874621738775,
"eval_entropy": 1.0663088278770447,
"eval_loss": 1.0640983581542969,
"eval_mean_token_accuracy": 0.7233748874664306,
"eval_num_tokens": 8541734.0,
"eval_runtime": 97.2334,
"eval_samples_per_second": 10.285,
"eval_steps_per_second": 1.286,
"step": 1100
},
{
"entropy": 1.179681045934558,
"epoch": 0.2891960415473951,
"grad_norm": 0.34421974420547485,
"learning_rate": 1.4221408008374773e-05,
"loss": 1.1522,
"mean_token_accuracy": 0.7020174067467451,
"num_tokens": 8579590.0,
"step": 1105
},
{
"entropy": 1.23528895676136,
"epoch": 0.2905046209209127,
"grad_norm": 0.4845016300678253,
"learning_rate": 1.4195236848992412e-05,
"loss": 1.2052,
"mean_token_accuracy": 0.6924681778997183,
"num_tokens": 8616216.0,
"step": 1110
},
{
"entropy": 1.071528448536992,
"epoch": 0.29181320029443036,
"grad_norm": 0.3960973620414734,
"learning_rate": 1.4169065689610052e-05,
"loss": 0.9994,
"mean_token_accuracy": 0.7225042834877968,
"num_tokens": 8654775.0,
"step": 1115
},
{
"entropy": 1.2246194936335086,
"epoch": 0.293121779667948,
"grad_norm": 0.42761850357055664,
"learning_rate": 1.4142894530227692e-05,
"loss": 1.1149,
"mean_token_accuracy": 0.6936006706207991,
"num_tokens": 8690533.0,
"step": 1120
},
{
"entropy": 1.0734559996053576,
"epoch": 0.2944303590414656,
"grad_norm": 0.3908097445964813,
"learning_rate": 1.4116723370845328e-05,
"loss": 1.0524,
"mean_token_accuracy": 0.7204558227211237,
"num_tokens": 8727564.0,
"step": 1125
},
{
"entropy": 1.0594063133001328,
"epoch": 0.2957389384149832,
"grad_norm": 0.28768423199653625,
"learning_rate": 1.4090552211462969e-05,
"loss": 1.0328,
"mean_token_accuracy": 0.7205640614032746,
"num_tokens": 8774139.0,
"step": 1130
},
{
"entropy": 1.0696671523153782,
"epoch": 0.29704751778850086,
"grad_norm": 0.4257560968399048,
"learning_rate": 1.4064381052080609e-05,
"loss": 1.0058,
"mean_token_accuracy": 0.7245784349739551,
"num_tokens": 8810357.0,
"step": 1135
},
{
"entropy": 1.1237424857914449,
"epoch": 0.2983560971620185,
"grad_norm": 0.3298150599002838,
"learning_rate": 1.4038209892698247e-05,
"loss": 1.1134,
"mean_token_accuracy": 0.7139571741223335,
"num_tokens": 8850219.0,
"step": 1140
},
{
"entropy": 1.0590598836541176,
"epoch": 0.29966467653553613,
"grad_norm": 0.36441755294799805,
"learning_rate": 1.4012038733315887e-05,
"loss": 1.0017,
"mean_token_accuracy": 0.72717116586864,
"num_tokens": 8890728.0,
"step": 1145
},
{
"entropy": 1.0748774517327546,
"epoch": 0.3009732559090537,
"grad_norm": 0.5299174189567566,
"learning_rate": 1.3985867573933527e-05,
"loss": 1.0444,
"mean_token_accuracy": 0.7247283719480038,
"num_tokens": 8931167.0,
"step": 1150
},
{
"entropy": 1.2320478869602085,
"epoch": 0.30228183528257135,
"grad_norm": 0.31828537583351135,
"learning_rate": 1.3959696414551165e-05,
"loss": 1.1425,
"mean_token_accuracy": 0.7029214788228273,
"num_tokens": 8971179.0,
"step": 1155
},
{
"entropy": 1.0877997063100338,
"epoch": 0.303590414656089,
"grad_norm": 0.3202095329761505,
"learning_rate": 1.3933525255168806e-05,
"loss": 1.0868,
"mean_token_accuracy": 0.7259886477142572,
"num_tokens": 9012122.0,
"step": 1160
},
{
"entropy": 1.1524120923131704,
"epoch": 0.30489899402960663,
"grad_norm": 0.44416385889053345,
"learning_rate": 1.3907354095786446e-05,
"loss": 1.1781,
"mean_token_accuracy": 0.7089681778103113,
"num_tokens": 9051879.0,
"step": 1165
},
{
"entropy": 1.142632443830371,
"epoch": 0.3062075734031242,
"grad_norm": 0.29014092683792114,
"learning_rate": 1.3881182936404082e-05,
"loss": 1.1183,
"mean_token_accuracy": 0.7098891779780387,
"num_tokens": 9095080.0,
"step": 1170
},
{
"entropy": 1.0738845858722925,
"epoch": 0.30751615277664185,
"grad_norm": 0.5643961429595947,
"learning_rate": 1.3855011777021722e-05,
"loss": 1.0341,
"mean_token_accuracy": 0.7218465633690357,
"num_tokens": 9134553.0,
"step": 1175
},
{
"entropy": 1.0997203588485718,
"epoch": 0.3088247321501595,
"grad_norm": 0.3256031274795532,
"learning_rate": 1.3828840617639362e-05,
"loss": 1.0443,
"mean_token_accuracy": 0.7164226226508618,
"num_tokens": 9171451.0,
"step": 1180
},
{
"entropy": 1.1001708198338747,
"epoch": 0.3101333115236771,
"grad_norm": 0.38000190258026123,
"learning_rate": 1.3802669458257e-05,
"loss": 1.0945,
"mean_token_accuracy": 0.7193384803831577,
"num_tokens": 9212655.0,
"step": 1185
},
{
"entropy": 1.1442682899534702,
"epoch": 0.3114418908971947,
"grad_norm": 0.38140004873275757,
"learning_rate": 1.377649829887464e-05,
"loss": 1.1116,
"mean_token_accuracy": 0.7097606465220452,
"num_tokens": 9251378.0,
"step": 1190
},
{
"entropy": 1.0966169007122517,
"epoch": 0.31275047027071234,
"grad_norm": 0.30173954367637634,
"learning_rate": 1.3750327139492281e-05,
"loss": 1.0569,
"mean_token_accuracy": 0.7154310643672943,
"num_tokens": 9290376.0,
"step": 1195
},
{
"entropy": 1.112706720456481,
"epoch": 0.31405904964423,
"grad_norm": 0.3677958846092224,
"learning_rate": 1.372415598010992e-05,
"loss": 1.1293,
"mean_token_accuracy": 0.7175452932715416,
"num_tokens": 9335450.0,
"step": 1200
},
{
"epoch": 0.31405904964423,
"eval_entropy": 1.0616401386260987,
"eval_loss": 1.0616909265518188,
"eval_mean_token_accuracy": 0.7236422438621521,
"eval_num_tokens": 9335450.0,
"eval_runtime": 97.3061,
"eval_samples_per_second": 10.277,
"eval_steps_per_second": 1.285,
"step": 1200
},
{
"entropy": 1.0197914224117994,
"epoch": 0.3153676290177476,
"grad_norm": 0.303828626871109,
"learning_rate": 1.369798482072756e-05,
"loss": 0.9773,
"mean_token_accuracy": 0.7335654478520155,
"num_tokens": 9376757.0,
"step": 1205
},
{
"entropy": 1.0958842366933823,
"epoch": 0.31667620839126526,
"grad_norm": 0.31567996740341187,
"learning_rate": 1.36718136613452e-05,
"loss": 1.1095,
"mean_token_accuracy": 0.7230709217488765,
"num_tokens": 9412399.0,
"step": 1210
},
{
"entropy": 1.0366449020802975,
"epoch": 0.31798478776478284,
"grad_norm": 0.28812670707702637,
"learning_rate": 1.364564250196284e-05,
"loss": 0.9809,
"mean_token_accuracy": 0.7372792772948742,
"num_tokens": 9459659.0,
"step": 1215
},
{
"entropy": 1.16514153778553,
"epoch": 0.3192933671383005,
"grad_norm": 0.32817167043685913,
"learning_rate": 1.3619471342580476e-05,
"loss": 1.1274,
"mean_token_accuracy": 0.7073730494827032,
"num_tokens": 9501119.0,
"step": 1220
},
{
"entropy": 1.1919928897172212,
"epoch": 0.3206019465118181,
"grad_norm": 0.4222601056098938,
"learning_rate": 1.3593300183198118e-05,
"loss": 1.1383,
"mean_token_accuracy": 0.7020205046981574,
"num_tokens": 9541572.0,
"step": 1225
},
{
"entropy": 1.0665121775120496,
"epoch": 0.32191052588533575,
"grad_norm": 0.37065136432647705,
"learning_rate": 1.3567129023815758e-05,
"loss": 1.0282,
"mean_token_accuracy": 0.7204833775758743,
"num_tokens": 9582686.0,
"step": 1230
},
{
"entropy": 1.1947314314544202,
"epoch": 0.32321910525885333,
"grad_norm": 0.34813007712364197,
"learning_rate": 1.3540957864433395e-05,
"loss": 1.1388,
"mean_token_accuracy": 0.6964055716991424,
"num_tokens": 9623500.0,
"step": 1235
},
{
"entropy": 1.0864916004240512,
"epoch": 0.32452768463237097,
"grad_norm": 0.2981452941894531,
"learning_rate": 1.3514786705051035e-05,
"loss": 1.0402,
"mean_token_accuracy": 0.7253331996500492,
"num_tokens": 9667330.0,
"step": 1240
},
{
"entropy": 1.088518501445651,
"epoch": 0.3258362640058886,
"grad_norm": 0.27413734793663025,
"learning_rate": 1.3488615545668675e-05,
"loss": 1.0039,
"mean_token_accuracy": 0.7201201483607292,
"num_tokens": 9707928.0,
"step": 1245
},
{
"entropy": 1.0232492092996837,
"epoch": 0.32714484337940625,
"grad_norm": 0.3049759864807129,
"learning_rate": 1.3462444386286313e-05,
"loss": 1.0155,
"mean_token_accuracy": 0.731953501701355,
"num_tokens": 9749414.0,
"step": 1250
},
{
"entropy": 1.1411474995315074,
"epoch": 0.3284534227529239,
"grad_norm": 0.30817753076553345,
"learning_rate": 1.3436273226903953e-05,
"loss": 1.1117,
"mean_token_accuracy": 0.7109464205801487,
"num_tokens": 9787037.0,
"step": 1255
},
{
"entropy": 1.0607344008982182,
"epoch": 0.32976200212644147,
"grad_norm": 0.2996397316455841,
"learning_rate": 1.3410102067521593e-05,
"loss": 0.9988,
"mean_token_accuracy": 0.7263243019580841,
"num_tokens": 9823917.0,
"step": 1260
},
{
"entropy": 1.1400364696979524,
"epoch": 0.3310705814999591,
"grad_norm": 0.394529789686203,
"learning_rate": 1.3383930908139232e-05,
"loss": 1.1346,
"mean_token_accuracy": 0.7115379109978676,
"num_tokens": 9863897.0,
"step": 1265
},
{
"entropy": 1.1153028715401887,
"epoch": 0.33237916087347674,
"grad_norm": 0.38152188062667847,
"learning_rate": 1.3357759748756872e-05,
"loss": 1.0826,
"mean_token_accuracy": 0.7159910671412945,
"num_tokens": 9907161.0,
"step": 1270
},
{
"entropy": 1.1335538432002068,
"epoch": 0.3336877402469944,
"grad_norm": 0.3795076012611389,
"learning_rate": 1.3331588589374512e-05,
"loss": 1.062,
"mean_token_accuracy": 0.712515490502119,
"num_tokens": 9941050.0,
"step": 1275
},
{
"entropy": 1.201156435534358,
"epoch": 0.33499631962051196,
"grad_norm": 0.31430941820144653,
"learning_rate": 1.3305417429992148e-05,
"loss": 1.1755,
"mean_token_accuracy": 0.7030155394226313,
"num_tokens": 9982204.0,
"step": 1280
},
{
"entropy": 1.0916207611560822,
"epoch": 0.3363048989940296,
"grad_norm": 0.3562738597393036,
"learning_rate": 1.3279246270609789e-05,
"loss": 1.03,
"mean_token_accuracy": 0.7254784423857927,
"num_tokens": 10023122.0,
"step": 1285
},
{
"entropy": 1.1606000121682882,
"epoch": 0.33761347836754724,
"grad_norm": 0.4184325337409973,
"learning_rate": 1.3253075111227429e-05,
"loss": 1.1373,
"mean_token_accuracy": 0.7046990938484669,
"num_tokens": 10064911.0,
"step": 1290
},
{
"entropy": 1.0977379951626063,
"epoch": 0.3389220577410649,
"grad_norm": 0.3197747766971588,
"learning_rate": 1.3226903951845067e-05,
"loss": 1.0689,
"mean_token_accuracy": 0.7257760524749756,
"num_tokens": 10100407.0,
"step": 1295
},
{
"entropy": 1.074541300162673,
"epoch": 0.34023063711458246,
"grad_norm": 0.40503931045532227,
"learning_rate": 1.3200732792462707e-05,
"loss": 0.967,
"mean_token_accuracy": 0.7171443022787571,
"num_tokens": 10134360.0,
"step": 1300
},
{
"epoch": 0.34023063711458246,
"eval_entropy": 1.04598343706131,
"eval_loss": 1.0586909055709839,
"eval_mean_token_accuracy": 0.7240785593986512,
"eval_num_tokens": 10134360.0,
"eval_runtime": 97.2108,
"eval_samples_per_second": 10.287,
"eval_steps_per_second": 1.286,
"step": 1300
},
{
"entropy": 1.093527865409851,
"epoch": 0.3415392164881001,
"grad_norm": 0.30255845189094543,
"learning_rate": 1.3174561633080347e-05,
"loss": 1.0996,
"mean_token_accuracy": 0.7116521965712309,
"num_tokens": 10174760.0,
"step": 1305
},
{
"entropy": 1.0523213766515256,
"epoch": 0.34284779586161773,
"grad_norm": 0.2793543040752411,
"learning_rate": 1.3148390473697985e-05,
"loss": 0.9752,
"mean_token_accuracy": 0.7246817748993635,
"num_tokens": 10216071.0,
"step": 1310
},
{
"entropy": 1.0620372116565704,
"epoch": 0.34415637523513537,
"grad_norm": 0.31549394130706787,
"learning_rate": 1.3122219314315626e-05,
"loss": 0.992,
"mean_token_accuracy": 0.7261651441454887,
"num_tokens": 10256598.0,
"step": 1315
},
{
"entropy": 1.0895736794918776,
"epoch": 0.345464954608653,
"grad_norm": 0.3750867545604706,
"learning_rate": 1.3096048154933266e-05,
"loss": 1.051,
"mean_token_accuracy": 0.7179807629436255,
"num_tokens": 10294800.0,
"step": 1320
},
{
"entropy": 1.0776633590459823,
"epoch": 0.3467735339821706,
"grad_norm": 0.4244779944419861,
"learning_rate": 1.3069876995550902e-05,
"loss": 1.007,
"mean_token_accuracy": 0.7257129170000554,
"num_tokens": 10334846.0,
"step": 1325
},
{
"entropy": 1.1380624793469907,
"epoch": 0.3480821133556882,
"grad_norm": 0.3876670300960541,
"learning_rate": 1.3043705836168542e-05,
"loss": 1.1296,
"mean_token_accuracy": 0.7143970631062985,
"num_tokens": 10374248.0,
"step": 1330
},
{
"entropy": 1.055607095360756,
"epoch": 0.34939069272920587,
"grad_norm": 0.5531731247901917,
"learning_rate": 1.3017534676786182e-05,
"loss": 0.9648,
"mean_token_accuracy": 0.7317230701446533,
"num_tokens": 10410746.0,
"step": 1335
},
{
"entropy": 1.134307411313057,
"epoch": 0.3506992721027235,
"grad_norm": 0.3122623860836029,
"learning_rate": 1.299136351740382e-05,
"loss": 1.078,
"mean_token_accuracy": 0.7120365314185619,
"num_tokens": 10452938.0,
"step": 1340
},
{
"entropy": 1.0759996108710765,
"epoch": 0.3520078514762411,
"grad_norm": 0.35110828280448914,
"learning_rate": 1.296519235802146e-05,
"loss": 0.9849,
"mean_token_accuracy": 0.7297365363687277,
"num_tokens": 10491903.0,
"step": 1345
},
{
"entropy": 1.1492443475872278,
"epoch": 0.3533164308497587,
"grad_norm": 0.35788699984550476,
"learning_rate": 1.2939021198639101e-05,
"loss": 1.1878,
"mean_token_accuracy": 0.7069006145000458,
"num_tokens": 10527375.0,
"step": 1350
},
{
"entropy": 1.0579143922775984,
"epoch": 0.35462501022327636,
"grad_norm": 0.35787340998649597,
"learning_rate": 1.291285003925674e-05,
"loss": 1.0007,
"mean_token_accuracy": 0.7237559407949448,
"num_tokens": 10568018.0,
"step": 1355
},
{
"entropy": 1.1339404121041299,
"epoch": 0.355933589596794,
"grad_norm": 0.4546626806259155,
"learning_rate": 1.288667887987438e-05,
"loss": 1.1295,
"mean_token_accuracy": 0.7201502379029989,
"num_tokens": 10607681.0,
"step": 1360
},
{
"entropy": 1.0893705368041993,
"epoch": 0.3572421689703116,
"grad_norm": 0.45456886291503906,
"learning_rate": 1.286050772049202e-05,
"loss": 1.0304,
"mean_token_accuracy": 0.7344392716884613,
"num_tokens": 10641980.0,
"step": 1365
},
{
"entropy": 1.0794930804520846,
"epoch": 0.3585507483438292,
"grad_norm": 0.3907780945301056,
"learning_rate": 1.283433656110966e-05,
"loss": 1.0827,
"mean_token_accuracy": 0.7246263407170772,
"num_tokens": 10683159.0,
"step": 1370
},
{
"entropy": 1.0798350811004638,
"epoch": 0.35985932771734686,
"grad_norm": 0.49161893129348755,
"learning_rate": 1.2808165401727298e-05,
"loss": 1.0915,
"mean_token_accuracy": 0.7175786912441253,
"num_tokens": 10719560.0,
"step": 1375
},
{
"entropy": 1.131149498745799,
"epoch": 0.3611679070908645,
"grad_norm": 0.4854622185230255,
"learning_rate": 1.2781994242344938e-05,
"loss": 1.1285,
"mean_token_accuracy": 0.7150968369096518,
"num_tokens": 10759050.0,
"step": 1380
},
{
"entropy": 1.0728477615863086,
"epoch": 0.36247648646438213,
"grad_norm": 0.42540574073791504,
"learning_rate": 1.2755823082962578e-05,
"loss": 1.0358,
"mean_token_accuracy": 0.724468483030796,
"num_tokens": 10800630.0,
"step": 1385
},
{
"entropy": 1.1225707672536374,
"epoch": 0.3637850658378997,
"grad_norm": 0.44094985723495483,
"learning_rate": 1.2729651923580215e-05,
"loss": 1.0344,
"mean_token_accuracy": 0.7132398471236229,
"num_tokens": 10835582.0,
"step": 1390
},
{
"entropy": 1.127060130983591,
"epoch": 0.36509364521141735,
"grad_norm": 0.3640246093273163,
"learning_rate": 1.2703480764197855e-05,
"loss": 1.0752,
"mean_token_accuracy": 0.717224034294486,
"num_tokens": 10873989.0,
"step": 1395
},
{
"entropy": 1.110959093272686,
"epoch": 0.366402224584935,
"grad_norm": 0.3808118402957916,
"learning_rate": 1.2677309604815495e-05,
"loss": 1.0352,
"mean_token_accuracy": 0.7157318696379662,
"num_tokens": 10914540.0,
"step": 1400
},
{
"epoch": 0.366402224584935,
"eval_entropy": 1.057036506652832,
"eval_loss": 1.056433081626892,
"eval_mean_token_accuracy": 0.724943666934967,
"eval_num_tokens": 10914540.0,
"eval_runtime": 97.2457,
"eval_samples_per_second": 10.283,
"eval_steps_per_second": 1.285,
"step": 1400
},
{
"entropy": 1.1179429598152637,
"epoch": 0.3677108039584526,
"grad_norm": 0.3942883610725403,
"learning_rate": 1.2651138445433133e-05,
"loss": 1.0674,
"mean_token_accuracy": 0.7157964497804642,
"num_tokens": 10951332.0,
"step": 1405
},
{
"entropy": 1.144387823715806,
"epoch": 0.3690193833319702,
"grad_norm": 0.35214006900787354,
"learning_rate": 1.2624967286050773e-05,
"loss": 1.1239,
"mean_token_accuracy": 0.7134873129427433,
"num_tokens": 10990644.0,
"step": 1410
},
{
"entropy": 1.057513090223074,
"epoch": 0.37032796270548785,
"grad_norm": 0.3838125169277191,
"learning_rate": 1.2598796126668413e-05,
"loss": 1.0198,
"mean_token_accuracy": 0.7325119793415069,
"num_tokens": 11032177.0,
"step": 1415
},
{
"entropy": 1.088934962451458,
"epoch": 0.3716365420790055,
"grad_norm": 0.26814350485801697,
"learning_rate": 1.2572624967286052e-05,
"loss": 1.0808,
"mean_token_accuracy": 0.7172140318900346,
"num_tokens": 11072735.0,
"step": 1420
},
{
"entropy": 1.1592457950115205,
"epoch": 0.3729451214525231,
"grad_norm": 0.45993542671203613,
"learning_rate": 1.2546453807903692e-05,
"loss": 1.1143,
"mean_token_accuracy": 0.7036133039742708,
"num_tokens": 11111309.0,
"step": 1425
},
{
"entropy": 1.1127744242548943,
"epoch": 0.3742537008260407,
"grad_norm": 0.3038633167743683,
"learning_rate": 1.2520282648521332e-05,
"loss": 1.0659,
"mean_token_accuracy": 0.725579933822155,
"num_tokens": 11150368.0,
"step": 1430
},
{
"entropy": 1.135872596874833,
"epoch": 0.37556228019955834,
"grad_norm": 0.32042160630226135,
"learning_rate": 1.2494111489138968e-05,
"loss": 1.1148,
"mean_token_accuracy": 0.7169803373515606,
"num_tokens": 11186513.0,
"step": 1435
},
{
"entropy": 1.087304012849927,
"epoch": 0.376870859573076,
"grad_norm": 0.3336869180202484,
"learning_rate": 1.2467940329756609e-05,
"loss": 1.0445,
"mean_token_accuracy": 0.7192518580704927,
"num_tokens": 11225174.0,
"step": 1440
},
{
"entropy": 1.0319055780768394,
"epoch": 0.3781794389465936,
"grad_norm": 0.43524104356765747,
"learning_rate": 1.2441769170374249e-05,
"loss": 0.9855,
"mean_token_accuracy": 0.7250500712543726,
"num_tokens": 11261841.0,
"step": 1445
},
{
"entropy": 1.0503356985747814,
"epoch": 0.37948801832011125,
"grad_norm": 0.34029972553253174,
"learning_rate": 1.2415598010991887e-05,
"loss": 0.9427,
"mean_token_accuracy": 0.7291811019182205,
"num_tokens": 11301122.0,
"step": 1450
},
{
"entropy": 1.135501291975379,
"epoch": 0.38079659769362884,
"grad_norm": 0.34890979528427124,
"learning_rate": 1.2389426851609527e-05,
"loss": 1.0706,
"mean_token_accuracy": 0.707598365843296,
"num_tokens": 11341491.0,
"step": 1455
},
{
"entropy": 1.012824462354183,
"epoch": 0.3821051770671465,
"grad_norm": 0.44555002450942993,
"learning_rate": 1.2363255692227167e-05,
"loss": 0.986,
"mean_token_accuracy": 0.7334959991276264,
"num_tokens": 11382277.0,
"step": 1460
},
{
"entropy": 1.084730239585042,
"epoch": 0.3834137564406641,
"grad_norm": 0.34672921895980835,
"learning_rate": 1.2337084532844805e-05,
"loss": 1.0185,
"mean_token_accuracy": 0.7293158903717994,
"num_tokens": 11420312.0,
"step": 1465
},
{
"entropy": 1.0931822203099728,
"epoch": 0.38472233581418175,
"grad_norm": 0.4738580882549286,
"learning_rate": 1.2310913373462446e-05,
"loss": 1.0744,
"mean_token_accuracy": 0.7275880422443152,
"num_tokens": 11458742.0,
"step": 1470
},
{
"entropy": 1.0959295086562633,
"epoch": 0.38603091518769933,
"grad_norm": 0.3458891212940216,
"learning_rate": 1.2284742214080086e-05,
"loss": 1.0425,
"mean_token_accuracy": 0.7294268280267715,
"num_tokens": 11494946.0,
"step": 1475
},
{
"entropy": 1.1744199082255364,
"epoch": 0.38733949456121697,
"grad_norm": 0.3247629404067993,
"learning_rate": 1.2258571054697724e-05,
"loss": 1.1929,
"mean_token_accuracy": 0.6963723137974739,
"num_tokens": 11531053.0,
"step": 1480
},
{
"entropy": 1.022578400745988,
"epoch": 0.3886480739347346,
"grad_norm": 0.37479135394096375,
"learning_rate": 1.2232399895315364e-05,
"loss": 0.9592,
"mean_token_accuracy": 0.7337148085236549,
"num_tokens": 11569356.0,
"step": 1485
},
{
"entropy": 1.0597552750259638,
"epoch": 0.38995665330825224,
"grad_norm": 0.34326809644699097,
"learning_rate": 1.2206228735933004e-05,
"loss": 1.03,
"mean_token_accuracy": 0.7236659746617079,
"num_tokens": 11609142.0,
"step": 1490
},
{
"entropy": 1.038928609713912,
"epoch": 0.3912652326817699,
"grad_norm": 0.33052825927734375,
"learning_rate": 1.218005757655064e-05,
"loss": 1.0297,
"mean_token_accuracy": 0.7271129328757524,
"num_tokens": 11649979.0,
"step": 1495
},
{
"entropy": 1.149324530735612,
"epoch": 0.39257381205528746,
"grad_norm": 0.44736945629119873,
"learning_rate": 1.2153886417168281e-05,
"loss": 1.0566,
"mean_token_accuracy": 0.7146747212857008,
"num_tokens": 11685504.0,
"step": 1500
},
{
"epoch": 0.39257381205528746,
"eval_entropy": 1.061660517692566,
"eval_loss": 1.0545203685760498,
"eval_mean_token_accuracy": 0.7252409801483154,
"eval_num_tokens": 11685504.0,
"eval_runtime": 97.242,
"eval_samples_per_second": 10.284,
"eval_steps_per_second": 1.285,
"step": 1500
},
{
"entropy": 1.1069697327911854,
"epoch": 0.3938823914288051,
"grad_norm": 0.7664462327957153,
"learning_rate": 1.2127715257785921e-05,
"loss": 1.0101,
"mean_token_accuracy": 0.7186468083411455,
"num_tokens": 11720415.0,
"step": 1505
},
{
"entropy": 1.0142726931720971,
"epoch": 0.39519097080232274,
"grad_norm": 0.3830406069755554,
"learning_rate": 1.210154409840356e-05,
"loss": 0.974,
"mean_token_accuracy": 0.736117148399353,
"num_tokens": 11756577.0,
"step": 1510
},
{
"entropy": 1.112984489277005,
"epoch": 0.3964995501758404,
"grad_norm": 0.41094282269477844,
"learning_rate": 1.20753729390212e-05,
"loss": 1.1002,
"mean_token_accuracy": 0.7189074017107486,
"num_tokens": 11794872.0,
"step": 1515
},
{
"entropy": 1.0922920301556587,
"epoch": 0.39780812954935796,
"grad_norm": 0.37885573506355286,
"learning_rate": 1.204920177963884e-05,
"loss": 1.0302,
"mean_token_accuracy": 0.7164245635271073,
"num_tokens": 11833530.0,
"step": 1520
},
{
"entropy": 1.1610772363841533,
"epoch": 0.3991167089228756,
"grad_norm": 0.44092655181884766,
"learning_rate": 1.2023030620256478e-05,
"loss": 1.1124,
"mean_token_accuracy": 0.6998836103826761,
"num_tokens": 11865444.0,
"step": 1525
},
{
"entropy": 1.1576842796057463,
"epoch": 0.40042528829639323,
"grad_norm": 0.44296759366989136,
"learning_rate": 1.1996859460874118e-05,
"loss": 1.1521,
"mean_token_accuracy": 0.7068853817880154,
"num_tokens": 11901975.0,
"step": 1530
},
{
"entropy": 1.0967234898358584,
"epoch": 0.40173386766991087,
"grad_norm": 0.33487918972969055,
"learning_rate": 1.1970688301491758e-05,
"loss": 1.0064,
"mean_token_accuracy": 0.7251658782362937,
"num_tokens": 11939696.0,
"step": 1535
},
{
"entropy": 1.086240155249834,
"epoch": 0.40304244704342845,
"grad_norm": 0.40726640820503235,
"learning_rate": 1.1944517142109398e-05,
"loss": 1.0593,
"mean_token_accuracy": 0.725240446254611,
"num_tokens": 11982185.0,
"step": 1540
},
{
"entropy": 1.1482622426003217,
"epoch": 0.4043510264169461,
"grad_norm": 0.4323379397392273,
"learning_rate": 1.1918345982727035e-05,
"loss": 1.128,
"mean_token_accuracy": 0.7092867191880942,
"num_tokens": 12017859.0,
"step": 1545
},
{
"entropy": 1.2151045732200145,
"epoch": 0.40565960579046373,
"grad_norm": 0.3303086459636688,
"learning_rate": 1.1892174823344675e-05,
"loss": 1.2045,
"mean_token_accuracy": 0.6960071615874768,
"num_tokens": 12056052.0,
"step": 1550
},
{
"entropy": 1.0982510183006524,
"epoch": 0.40696818516398137,
"grad_norm": 0.3505324721336365,
"learning_rate": 1.1866003663962315e-05,
"loss": 1.067,
"mean_token_accuracy": 0.7211838360875845,
"num_tokens": 12093787.0,
"step": 1555
},
{
"entropy": 1.0351288244128227,
"epoch": 0.408276764537499,
"grad_norm": 0.4866773188114166,
"learning_rate": 1.1839832504579953e-05,
"loss": 1.0177,
"mean_token_accuracy": 0.7380380086600781,
"num_tokens": 12135954.0,
"step": 1560
},
{
"entropy": 1.0588519033044577,
"epoch": 0.4095853439110166,
"grad_norm": 0.35260552167892456,
"learning_rate": 1.1813661345197593e-05,
"loss": 0.9948,
"mean_token_accuracy": 0.7311153151094913,
"num_tokens": 12177684.0,
"step": 1565
},
{
"entropy": 1.09910380654037,
"epoch": 0.4108939232845342,
"grad_norm": 0.3076630234718323,
"learning_rate": 1.1787490185815233e-05,
"loss": 1.0572,
"mean_token_accuracy": 0.7178509868681431,
"num_tokens": 12217097.0,
"step": 1570
},
{
"entropy": 1.0710554268211125,
"epoch": 0.41220250265805186,
"grad_norm": 0.6353772878646851,
"learning_rate": 1.1761319026432872e-05,
"loss": 1.0566,
"mean_token_accuracy": 0.7280903398990631,
"num_tokens": 12254118.0,
"step": 1575
},
{
"entropy": 1.0847718585282564,
"epoch": 0.4135110820315695,
"grad_norm": 0.5102863311767578,
"learning_rate": 1.1735147867050512e-05,
"loss": 1.0074,
"mean_token_accuracy": 0.7276991017162799,
"num_tokens": 12290736.0,
"step": 1580
},
{
"entropy": 1.1992057055234908,
"epoch": 0.4148196614050871,
"grad_norm": 0.46092355251312256,
"learning_rate": 1.1708976707668152e-05,
"loss": 1.1854,
"mean_token_accuracy": 0.7027558118104935,
"num_tokens": 12327361.0,
"step": 1585
},
{
"entropy": 1.1261769212782382,
"epoch": 0.4161282407786047,
"grad_norm": 0.2733989357948303,
"learning_rate": 1.168280554828579e-05,
"loss": 1.0773,
"mean_token_accuracy": 0.7125700250267982,
"num_tokens": 12364890.0,
"step": 1590
},
{
"entropy": 1.116732070967555,
"epoch": 0.41743682015212236,
"grad_norm": 0.24640727043151855,
"learning_rate": 1.165663438890343e-05,
"loss": 1.1195,
"mean_token_accuracy": 0.7179068107157945,
"num_tokens": 12408137.0,
"step": 1595
},
{
"entropy": 1.119725486636162,
"epoch": 0.41874539952564,
"grad_norm": 0.38413721323013306,
"learning_rate": 1.163046322952107e-05,
"loss": 1.02,
"mean_token_accuracy": 0.7142953380942345,
"num_tokens": 12448886.0,
"step": 1600
},
{
"epoch": 0.41874539952564,
"eval_entropy": 1.0526617636680604,
"eval_loss": 1.0522558689117432,
"eval_mean_token_accuracy": 0.7257419657707215,
"eval_num_tokens": 12448886.0,
"eval_runtime": 97.091,
"eval_samples_per_second": 10.3,
"eval_steps_per_second": 1.287,
"step": 1600
},
{
"entropy": 1.052151035144925,
"epoch": 0.4200539788991576,
"grad_norm": 0.3078325390815735,
"learning_rate": 1.1604292070138707e-05,
"loss": 1.0539,
"mean_token_accuracy": 0.7285561367869378,
"num_tokens": 12488545.0,
"step": 1605
},
{
"entropy": 1.143933541327715,
"epoch": 0.4213625582726752,
"grad_norm": 0.4508214592933655,
"learning_rate": 1.1578120910756347e-05,
"loss": 1.1383,
"mean_token_accuracy": 0.7101572744548321,
"num_tokens": 12524775.0,
"step": 1610
},
{
"entropy": 1.1741201549768447,
"epoch": 0.42267113764619285,
"grad_norm": 0.4099365770816803,
"learning_rate": 1.1551949751373987e-05,
"loss": 1.1825,
"mean_token_accuracy": 0.705308374390006,
"num_tokens": 12560784.0,
"step": 1615
},
{
"entropy": 1.0800459958612918,
"epoch": 0.4239797170197105,
"grad_norm": 0.3584926128387451,
"learning_rate": 1.1525778591991625e-05,
"loss": 1.0387,
"mean_token_accuracy": 0.7256841942667961,
"num_tokens": 12603010.0,
"step": 1620
},
{
"entropy": 1.1276030663400889,
"epoch": 0.4252882963932281,
"grad_norm": 0.5926274657249451,
"learning_rate": 1.1499607432609266e-05,
"loss": 1.0812,
"mean_token_accuracy": 0.718872818350792,
"num_tokens": 12639637.0,
"step": 1625
},
{
"entropy": 1.0730494752526283,
"epoch": 0.4265968757667457,
"grad_norm": 0.3367457389831543,
"learning_rate": 1.1473436273226906e-05,
"loss": 1.0183,
"mean_token_accuracy": 0.7222284369170666,
"num_tokens": 12685003.0,
"step": 1630
},
{
"entropy": 1.1390976216644049,
"epoch": 0.42790545514026335,
"grad_norm": 0.3196088373661041,
"learning_rate": 1.1447265113844544e-05,
"loss": 1.1352,
"mean_token_accuracy": 0.7116539262235164,
"num_tokens": 12725882.0,
"step": 1635
},
{
"entropy": 1.1160973783582449,
"epoch": 0.429214034513781,
"grad_norm": 0.30303776264190674,
"learning_rate": 1.1421093954462184e-05,
"loss": 1.045,
"mean_token_accuracy": 0.7158118285238743,
"num_tokens": 12769557.0,
"step": 1640
},
{
"entropy": 1.1582866314798594,
"epoch": 0.4305226138872986,
"grad_norm": 0.3513560891151428,
"learning_rate": 1.1394922795079824e-05,
"loss": 1.1353,
"mean_token_accuracy": 0.705193604901433,
"num_tokens": 12810697.0,
"step": 1645
},
{
"entropy": 1.1496058501303197,
"epoch": 0.4318311932608162,
"grad_norm": 0.2923238277435303,
"learning_rate": 1.136875163569746e-05,
"loss": 1.0884,
"mean_token_accuracy": 0.7144812878221274,
"num_tokens": 12851256.0,
"step": 1650
},
{
"entropy": 1.1143131528049708,
"epoch": 0.43313977263433384,
"grad_norm": 0.3065350651741028,
"learning_rate": 1.1342580476315101e-05,
"loss": 1.0523,
"mean_token_accuracy": 0.7173210971057415,
"num_tokens": 12888965.0,
"step": 1655
},
{
"entropy": 1.0787921320647,
"epoch": 0.4344483520078515,
"grad_norm": 0.46374326944351196,
"learning_rate": 1.1316409316932741e-05,
"loss": 1.0793,
"mean_token_accuracy": 0.7249518293887377,
"num_tokens": 12929333.0,
"step": 1660
},
{
"entropy": 1.057104966789484,
"epoch": 0.4357569313813691,
"grad_norm": 0.313374400138855,
"learning_rate": 1.129023815755038e-05,
"loss": 1.0395,
"mean_token_accuracy": 0.7259028911590576,
"num_tokens": 12968877.0,
"step": 1665
},
{
"entropy": 1.1018395122140645,
"epoch": 0.4370655107548867,
"grad_norm": 0.5729189515113831,
"learning_rate": 1.126406699816802e-05,
"loss": 1.0302,
"mean_token_accuracy": 0.7193635780364275,
"num_tokens": 13009336.0,
"step": 1670
},
{
"entropy": 1.090810688957572,
"epoch": 0.43837409012840434,
"grad_norm": 0.34041526913642883,
"learning_rate": 1.123789583878566e-05,
"loss": 1.0004,
"mean_token_accuracy": 0.7279005717486143,
"num_tokens": 13050299.0,
"step": 1675
},
{
"entropy": 0.9902068875730038,
"epoch": 0.439682669501922,
"grad_norm": 0.3354148864746094,
"learning_rate": 1.1211724679403298e-05,
"loss": 0.9542,
"mean_token_accuracy": 0.7408786326646805,
"num_tokens": 13090891.0,
"step": 1680
},
{
"entropy": 0.9622725404798984,
"epoch": 0.4409912488754396,
"grad_norm": 0.36764782667160034,
"learning_rate": 1.1185553520020938e-05,
"loss": 0.9176,
"mean_token_accuracy": 0.7487996619194746,
"num_tokens": 13132101.0,
"step": 1685
},
{
"entropy": 1.045450020954013,
"epoch": 0.44229982824895725,
"grad_norm": 0.37813687324523926,
"learning_rate": 1.1159382360638578e-05,
"loss": 0.9415,
"mean_token_accuracy": 0.7361387588083744,
"num_tokens": 13170933.0,
"step": 1690
},
{
"entropy": 1.0522997014224529,
"epoch": 0.44360840762247483,
"grad_norm": 0.382914274930954,
"learning_rate": 1.1133211201256216e-05,
"loss": 1.0357,
"mean_token_accuracy": 0.7225256580859423,
"num_tokens": 13209472.0,
"step": 1695
},
{
"entropy": 1.0627464193850755,
"epoch": 0.44491698699599247,
"grad_norm": 0.6186394691467285,
"learning_rate": 1.1107040041873856e-05,
"loss": 0.9805,
"mean_token_accuracy": 0.7301313698291778,
"num_tokens": 13243159.0,
"step": 1700
},
{
"epoch": 0.44491698699599247,
"eval_entropy": 1.048878930568695,
"eval_loss": 1.0501657724380493,
"eval_mean_token_accuracy": 0.7263216118812561,
"eval_num_tokens": 13243159.0,
"eval_runtime": 97.3502,
"eval_samples_per_second": 10.272,
"eval_steps_per_second": 1.284,
"step": 1700
},
{
"entropy": 1.0235866460949183,
"epoch": 0.4462255663695101,
"grad_norm": 0.3209091126918793,
"learning_rate": 1.1080868882491496e-05,
"loss": 1.0014,
"mean_token_accuracy": 0.739182323589921,
"num_tokens": 13280837.0,
"step": 1705
},
{
"entropy": 1.0558000404387713,
"epoch": 0.44753414574302774,
"grad_norm": 0.30199235677719116,
"learning_rate": 1.1054697723109137e-05,
"loss": 0.9891,
"mean_token_accuracy": 0.7261330507695675,
"num_tokens": 13322890.0,
"step": 1710
},
{
"entropy": 1.121560701727867,
"epoch": 0.4488427251165453,
"grad_norm": 0.4387257993221283,
"learning_rate": 1.1028526563726773e-05,
"loss": 1.1191,
"mean_token_accuracy": 0.7114489011466503,
"num_tokens": 13360229.0,
"step": 1715
},
{
"entropy": 1.0835831377655267,
"epoch": 0.45015130449006296,
"grad_norm": 0.44212666153907776,
"learning_rate": 1.1002355404344413e-05,
"loss": 1.0534,
"mean_token_accuracy": 0.7208398774266243,
"num_tokens": 13401284.0,
"step": 1720
},
{
"entropy": 1.0752512661740183,
"epoch": 0.4514598838635806,
"grad_norm": 0.304918110370636,
"learning_rate": 1.0976184244962053e-05,
"loss": 1.0347,
"mean_token_accuracy": 0.725695389136672,
"num_tokens": 13444485.0,
"step": 1725
},
{
"entropy": 1.021533490344882,
"epoch": 0.45276846323709824,
"grad_norm": 0.3151404857635498,
"learning_rate": 1.0950013085579692e-05,
"loss": 0.9656,
"mean_token_accuracy": 0.7328935824334621,
"num_tokens": 13486444.0,
"step": 1730
},
{
"entropy": 1.1548058629035949,
"epoch": 0.4540770426106159,
"grad_norm": 0.4337512254714966,
"learning_rate": 1.0923841926197332e-05,
"loss": 1.1178,
"mean_token_accuracy": 0.7139813594520092,
"num_tokens": 13527431.0,
"step": 1735
},
{
"entropy": 1.142104067094624,
"epoch": 0.45538562198413346,
"grad_norm": 0.36979472637176514,
"learning_rate": 1.0897670766814972e-05,
"loss": 1.1315,
"mean_token_accuracy": 0.7120837710797787,
"num_tokens": 13567478.0,
"step": 1740
},
{
"entropy": 1.1160997115075588,
"epoch": 0.4566942013576511,
"grad_norm": 0.3845500946044922,
"learning_rate": 1.087149960743261e-05,
"loss": 1.1022,
"mean_token_accuracy": 0.712579957395792,
"num_tokens": 13605160.0,
"step": 1745
},
{
"entropy": 0.9988198220729828,
"epoch": 0.45800278073116873,
"grad_norm": 0.38837000727653503,
"learning_rate": 1.084532844805025e-05,
"loss": 0.9329,
"mean_token_accuracy": 0.7446095444262028,
"num_tokens": 13644314.0,
"step": 1750
},
{
"entropy": 1.130447569489479,
"epoch": 0.4593113601046864,
"grad_norm": 0.3902071714401245,
"learning_rate": 1.081915728866789e-05,
"loss": 1.0794,
"mean_token_accuracy": 0.7083112396299839,
"num_tokens": 13681672.0,
"step": 1755
},
{
"entropy": 1.0452111046761274,
"epoch": 0.46061993947820395,
"grad_norm": 0.4102812111377716,
"learning_rate": 1.0792986129285527e-05,
"loss": 1.0083,
"mean_token_accuracy": 0.7271155752241611,
"num_tokens": 13723858.0,
"step": 1760
},
{
"entropy": 0.9923233393579721,
"epoch": 0.4619285188517216,
"grad_norm": 0.3408261239528656,
"learning_rate": 1.0766814969903167e-05,
"loss": 0.9552,
"mean_token_accuracy": 0.7382691666483879,
"num_tokens": 13762274.0,
"step": 1765
},
{
"entropy": 1.0694189239293337,
"epoch": 0.46323709822523923,
"grad_norm": 0.33313289284706116,
"learning_rate": 1.0740643810520807e-05,
"loss": 1.0309,
"mean_token_accuracy": 0.728821600228548,
"num_tokens": 13804541.0,
"step": 1770
},
{
"entropy": 1.148886876925826,
"epoch": 0.46454567759875687,
"grad_norm": 0.4039045572280884,
"learning_rate": 1.0714472651138445e-05,
"loss": 1.1428,
"mean_token_accuracy": 0.7021496020257473,
"num_tokens": 13842698.0,
"step": 1775
},
{
"entropy": 1.059429519250989,
"epoch": 0.46585425697227445,
"grad_norm": 0.3325613737106323,
"learning_rate": 1.0688301491756086e-05,
"loss": 1.0277,
"mean_token_accuracy": 0.7256958331912756,
"num_tokens": 13883697.0,
"step": 1780
},
{
"entropy": 1.1187500689178704,
"epoch": 0.4671628363457921,
"grad_norm": 0.3969448208808899,
"learning_rate": 1.0662130332373726e-05,
"loss": 1.0818,
"mean_token_accuracy": 0.7165641494095325,
"num_tokens": 13922509.0,
"step": 1785
},
{
"entropy": 1.1411572575569153,
"epoch": 0.4684714157193097,
"grad_norm": 0.391754150390625,
"learning_rate": 1.0635959172991364e-05,
"loss": 1.1149,
"mean_token_accuracy": 0.7164013959467411,
"num_tokens": 13958198.0,
"step": 1790
},
{
"entropy": 1.0621652530506254,
"epoch": 0.46977999509282736,
"grad_norm": 0.3251922130584717,
"learning_rate": 1.0609788013609004e-05,
"loss": 1.0348,
"mean_token_accuracy": 0.7337204076349735,
"num_tokens": 13997488.0,
"step": 1795
},
{
"entropy": 1.0552761495113372,
"epoch": 0.471088574466345,
"grad_norm": 0.28842347860336304,
"learning_rate": 1.0583616854226644e-05,
"loss": 0.9837,
"mean_token_accuracy": 0.732194972038269,
"num_tokens": 14038479.0,
"step": 1800
},
{
"epoch": 0.471088574466345,
"eval_entropy": 1.056775134563446,
"eval_loss": 1.048877477645874,
"eval_mean_token_accuracy": 0.7264609928131104,
"eval_num_tokens": 14038479.0,
"eval_runtime": 97.5713,
"eval_samples_per_second": 10.249,
"eval_steps_per_second": 1.281,
"step": 1800
},
{
"entropy": 1.1512902732938528,
"epoch": 0.4723971538398626,
"grad_norm": 0.4646497964859009,
"learning_rate": 1.055744569484428e-05,
"loss": 1.1312,
"mean_token_accuracy": 0.7179503016173839,
"num_tokens": 14074174.0,
"step": 1805
},
{
"entropy": 1.0646660778671504,
"epoch": 0.4737057332133802,
"grad_norm": 0.29955199360847473,
"learning_rate": 1.0531274535461921e-05,
"loss": 1.0278,
"mean_token_accuracy": 0.7245239842683077,
"num_tokens": 14115408.0,
"step": 1810
},
{
"entropy": 1.0254539363086224,
"epoch": 0.47501431258689786,
"grad_norm": 0.31829652190208435,
"learning_rate": 1.0505103376079561e-05,
"loss": 0.9724,
"mean_token_accuracy": 0.7322143398225307,
"num_tokens": 14154697.0,
"step": 1815
},
{
"entropy": 1.1258851181715728,
"epoch": 0.4763228919604155,
"grad_norm": 0.45105260610580444,
"learning_rate": 1.04789322166972e-05,
"loss": 1.1154,
"mean_token_accuracy": 0.7129046030342578,
"num_tokens": 14194725.0,
"step": 1820
},
{
"entropy": 1.098579701408744,
"epoch": 0.4776314713339331,
"grad_norm": 0.37159058451652527,
"learning_rate": 1.045276105731484e-05,
"loss": 1.0148,
"mean_token_accuracy": 0.7232114516198636,
"num_tokens": 14236159.0,
"step": 1825
},
{
"entropy": 1.0738941352814435,
"epoch": 0.4789400507074507,
"grad_norm": 0.34484973549842834,
"learning_rate": 1.042658989793248e-05,
"loss": 1.0382,
"mean_token_accuracy": 0.7244183134287596,
"num_tokens": 14275868.0,
"step": 1830
},
{
"entropy": 1.0540620289742946,
"epoch": 0.48024863008096835,
"grad_norm": 0.4253799617290497,
"learning_rate": 1.0400418738550118e-05,
"loss": 0.9825,
"mean_token_accuracy": 0.7318723525851965,
"num_tokens": 14314897.0,
"step": 1835
},
{
"entropy": 1.1317568343132733,
"epoch": 0.481557209454486,
"grad_norm": 0.3978184163570404,
"learning_rate": 1.0374247579167758e-05,
"loss": 1.1196,
"mean_token_accuracy": 0.7122768431901931,
"num_tokens": 14353468.0,
"step": 1840
},
{
"entropy": 0.9921163013204932,
"epoch": 0.4828657888280036,
"grad_norm": 0.6404853463172913,
"learning_rate": 1.0348076419785398e-05,
"loss": 0.9584,
"mean_token_accuracy": 0.7398792453110218,
"num_tokens": 14393542.0,
"step": 1845
},
{
"entropy": 1.0703047849237919,
"epoch": 0.4841743682015212,
"grad_norm": 0.4562324285507202,
"learning_rate": 1.0321905260403036e-05,
"loss": 1.0268,
"mean_token_accuracy": 0.7257931806147099,
"num_tokens": 14438299.0,
"step": 1850
},
{
"entropy": 1.098178230971098,
"epoch": 0.48548294757503885,
"grad_norm": 0.5880526900291443,
"learning_rate": 1.0295734101020676e-05,
"loss": 1.0345,
"mean_token_accuracy": 0.719204118102789,
"num_tokens": 14472012.0,
"step": 1855
},
{
"entropy": 0.9670162990689277,
"epoch": 0.4867915269485565,
"grad_norm": 0.6454201340675354,
"learning_rate": 1.0269562941638316e-05,
"loss": 0.9095,
"mean_token_accuracy": 0.7467180170118809,
"num_tokens": 14508320.0,
"step": 1860
},
{
"entropy": 1.0095211911946536,
"epoch": 0.4881001063220741,
"grad_norm": 0.42344188690185547,
"learning_rate": 1.0243391782255957e-05,
"loss": 0.9624,
"mean_token_accuracy": 0.7334278732538223,
"num_tokens": 14545217.0,
"step": 1865
},
{
"entropy": 1.055738451331854,
"epoch": 0.4894086856955917,
"grad_norm": 0.35296040773391724,
"learning_rate": 1.0217220622873593e-05,
"loss": 0.9997,
"mean_token_accuracy": 0.7282113581895828,
"num_tokens": 14587523.0,
"step": 1870
},
{
"entropy": 1.0467969175428151,
"epoch": 0.49071726506910934,
"grad_norm": 0.3616087734699249,
"learning_rate": 1.0191049463491233e-05,
"loss": 1.0211,
"mean_token_accuracy": 0.7238417502492667,
"num_tokens": 14629386.0,
"step": 1875
},
{
"entropy": 1.0537079151719808,
"epoch": 0.492025844442627,
"grad_norm": 0.3368701934814453,
"learning_rate": 1.0164878304108873e-05,
"loss": 1.0216,
"mean_token_accuracy": 0.7280502840876579,
"num_tokens": 14672059.0,
"step": 1880
},
{
"entropy": 1.1863497417420148,
"epoch": 0.4933344238161446,
"grad_norm": 0.35971367359161377,
"learning_rate": 1.0138707144726512e-05,
"loss": 1.1773,
"mean_token_accuracy": 0.7055241405963898,
"num_tokens": 14713139.0,
"step": 1885
},
{
"entropy": 1.0729142222553492,
"epoch": 0.4946430031896622,
"grad_norm": 0.3640763461589813,
"learning_rate": 1.0112535985344152e-05,
"loss": 1.0327,
"mean_token_accuracy": 0.7191622793674469,
"num_tokens": 14751005.0,
"step": 1890
},
{
"entropy": 1.06036187261343,
"epoch": 0.49595158256317984,
"grad_norm": 0.529114305973053,
"learning_rate": 1.0086364825961792e-05,
"loss": 0.9994,
"mean_token_accuracy": 0.7312576837837697,
"num_tokens": 14791363.0,
"step": 1895
},
{
"entropy": 1.0985333036631346,
"epoch": 0.4972601619366975,
"grad_norm": 0.655299723148346,
"learning_rate": 1.006019366657943e-05,
"loss": 1.0157,
"mean_token_accuracy": 0.715160496532917,
"num_tokens": 14824965.0,
"step": 1900
},
{
"epoch": 0.4972601619366975,
"eval_entropy": 1.049722795009613,
"eval_loss": 1.0476868152618408,
"eval_mean_token_accuracy": 0.7267285461425781,
"eval_num_tokens": 14824965.0,
"eval_runtime": 97.3332,
"eval_samples_per_second": 10.274,
"eval_steps_per_second": 1.284,
"step": 1900
},
{
"entropy": 1.1007526509463788,
"epoch": 0.4985687413102151,
"grad_norm": 0.3946051001548767,
"learning_rate": 1.003402250719707e-05,
"loss": 1.0781,
"mean_token_accuracy": 0.7194148235023021,
"num_tokens": 14861751.0,
"step": 1905
},
{
"entropy": 1.0730507016181945,
"epoch": 0.4998773206837327,
"grad_norm": 0.4027865529060364,
"learning_rate": 1.000785134781471e-05,
"loss": 1.0453,
"mean_token_accuracy": 0.7289649330079555,
"num_tokens": 14899381.0,
"step": 1910
},
{
"entropy": 1.0322015034034848,
"epoch": 0.5011859000572504,
"grad_norm": 0.3154545724391937,
"learning_rate": 9.981680188432349e-06,
"loss": 0.9092,
"mean_token_accuracy": 0.7354275353252888,
"num_tokens": 14937444.0,
"step": 1915
},
{
"entropy": 1.070195358619094,
"epoch": 0.502494479430768,
"grad_norm": 0.6190849542617798,
"learning_rate": 9.955509029049987e-06,
"loss": 0.9885,
"mean_token_accuracy": 0.7289122432470322,
"num_tokens": 14973078.0,
"step": 1920
},
{
"entropy": 1.1444286353886128,
"epoch": 0.5038030588042856,
"grad_norm": 0.3921438455581665,
"learning_rate": 9.929337869667627e-06,
"loss": 1.1402,
"mean_token_accuracy": 0.7071823209524155,
"num_tokens": 15013712.0,
"step": 1925
},
{
"entropy": 1.1046634882688522,
"epoch": 0.5051116381778032,
"grad_norm": 0.35547974705696106,
"learning_rate": 9.903166710285267e-06,
"loss": 1.0808,
"mean_token_accuracy": 0.7203411210328341,
"num_tokens": 15056268.0,
"step": 1930
},
{
"entropy": 1.1063471656292676,
"epoch": 0.5064202175513208,
"grad_norm": 0.4154115915298462,
"learning_rate": 9.876995550902906e-06,
"loss": 1.1305,
"mean_token_accuracy": 0.7238013096153736,
"num_tokens": 15095616.0,
"step": 1935
},
{
"entropy": 1.093080758303404,
"epoch": 0.5077287969248385,
"grad_norm": 0.33288487792015076,
"learning_rate": 9.850824391520546e-06,
"loss": 1.0423,
"mean_token_accuracy": 0.7236416719853878,
"num_tokens": 15135823.0,
"step": 1940
},
{
"entropy": 1.0675853207707404,
"epoch": 0.5090373762983561,
"grad_norm": 0.4353385269641876,
"learning_rate": 9.824653232138186e-06,
"loss": 1.027,
"mean_token_accuracy": 0.721454057097435,
"num_tokens": 15173689.0,
"step": 1945
},
{
"entropy": 0.9674046196043491,
"epoch": 0.5103459556718737,
"grad_norm": 0.6318839192390442,
"learning_rate": 9.798482072755824e-06,
"loss": 0.8977,
"mean_token_accuracy": 0.7534206889569759,
"num_tokens": 15215084.0,
"step": 1950
},
{
"entropy": 1.029712200164795,
"epoch": 0.5116545350453914,
"grad_norm": 0.4194331765174866,
"learning_rate": 9.772310913373462e-06,
"loss": 0.9468,
"mean_token_accuracy": 0.7411890789866448,
"num_tokens": 15255554.0,
"step": 1955
},
{
"entropy": 1.0968198750168086,
"epoch": 0.512963114418909,
"grad_norm": 0.5084384679794312,
"learning_rate": 9.746139753991103e-06,
"loss": 1.1293,
"mean_token_accuracy": 0.7243112053722143,
"num_tokens": 15291594.0,
"step": 1960
},
{
"entropy": 1.1029110010713339,
"epoch": 0.5142716937924265,
"grad_norm": 0.39724844694137573,
"learning_rate": 9.719968594608743e-06,
"loss": 1.1192,
"mean_token_accuracy": 0.7196755088865757,
"num_tokens": 15331102.0,
"step": 1965
},
{
"entropy": 1.088718469440937,
"epoch": 0.5155802731659442,
"grad_norm": 0.29742664098739624,
"learning_rate": 9.693797435226381e-06,
"loss": 1.0498,
"mean_token_accuracy": 0.718688540160656,
"num_tokens": 15370084.0,
"step": 1970
},
{
"entropy": 1.0800972525030375,
"epoch": 0.5168888525394618,
"grad_norm": 0.4087739586830139,
"learning_rate": 9.667626275844021e-06,
"loss": 1.0431,
"mean_token_accuracy": 0.7267526693642139,
"num_tokens": 15407968.0,
"step": 1975
},
{
"entropy": 1.1198014959692955,
"epoch": 0.5181974319129795,
"grad_norm": 0.3880080282688141,
"learning_rate": 9.64145511646166e-06,
"loss": 1.1213,
"mean_token_accuracy": 0.7201451409608126,
"num_tokens": 15445003.0,
"step": 1980
},
{
"entropy": 1.098936778306961,
"epoch": 0.5195060112864971,
"grad_norm": 0.3442750871181488,
"learning_rate": 9.6152839570793e-06,
"loss": 1.0513,
"mean_token_accuracy": 0.7280969373881817,
"num_tokens": 15486123.0,
"step": 1985
},
{
"entropy": 1.106550743058324,
"epoch": 0.5208145906600147,
"grad_norm": 0.34958696365356445,
"learning_rate": 9.58911279769694e-06,
"loss": 1.0701,
"mean_token_accuracy": 0.7197918314486742,
"num_tokens": 15525274.0,
"step": 1990
},
{
"entropy": 1.0225239008665086,
"epoch": 0.5221231700335324,
"grad_norm": 0.2728900909423828,
"learning_rate": 9.562941638314578e-06,
"loss": 0.9536,
"mean_token_accuracy": 0.7333686843514442,
"num_tokens": 15567557.0,
"step": 1995
},
{
"entropy": 1.015058632940054,
"epoch": 0.52343174940705,
"grad_norm": 0.42618778347969055,
"learning_rate": 9.536770478932218e-06,
"loss": 0.9551,
"mean_token_accuracy": 0.7347531169652939,
"num_tokens": 15608023.0,
"step": 2000
},
{
"epoch": 0.52343174940705,
"eval_entropy": 1.0467440810203552,
"eval_loss": 1.0457295179367065,
"eval_mean_token_accuracy": 0.727197292804718,
"eval_num_tokens": 15608023.0,
"eval_runtime": 97.3484,
"eval_samples_per_second": 10.272,
"eval_steps_per_second": 1.284,
"step": 2000
},
{
"entropy": 1.1194863107055426,
"epoch": 0.5247403287805676,
"grad_norm": 0.43802356719970703,
"learning_rate": 9.510599319549856e-06,
"loss": 1.1008,
"mean_token_accuracy": 0.7199479583650827,
"num_tokens": 15646573.0,
"step": 2005
},
{
"entropy": 1.123955625295639,
"epoch": 0.5260489081540852,
"grad_norm": 0.3263470232486725,
"learning_rate": 9.484428160167496e-06,
"loss": 1.0753,
"mean_token_accuracy": 0.7194419991225004,
"num_tokens": 15688301.0,
"step": 2010
},
{
"entropy": 1.075960063934326,
"epoch": 0.5273574875276028,
"grad_norm": 0.43562212586402893,
"learning_rate": 9.458257000785136e-06,
"loss": 1.0737,
"mean_token_accuracy": 0.7178402289748191,
"num_tokens": 15725214.0,
"step": 2015
},
{
"entropy": 1.048216313868761,
"epoch": 0.5286660669011205,
"grad_norm": 0.2702055871486664,
"learning_rate": 9.432085841402775e-06,
"loss": 1.0164,
"mean_token_accuracy": 0.7296169806271792,
"num_tokens": 15764282.0,
"step": 2020
},
{
"entropy": 1.0972533270716667,
"epoch": 0.5299746462746381,
"grad_norm": 0.38811036944389343,
"learning_rate": 9.405914682020413e-06,
"loss": 1.0683,
"mean_token_accuracy": 0.7193491701036692,
"num_tokens": 15807279.0,
"step": 2025
},
{
"entropy": 1.0837439749389888,
"epoch": 0.5312832256481557,
"grad_norm": 0.38090547919273376,
"learning_rate": 9.379743522638053e-06,
"loss": 1.0179,
"mean_token_accuracy": 0.7216418325901032,
"num_tokens": 15849485.0,
"step": 2030
},
{
"entropy": 1.0280620481818914,
"epoch": 0.5325918050216734,
"grad_norm": 0.44307711720466614,
"learning_rate": 9.353572363255693e-06,
"loss": 0.9705,
"mean_token_accuracy": 0.7349147848784924,
"num_tokens": 15888255.0,
"step": 2035
},
{
"entropy": 1.1240373719483614,
"epoch": 0.5339003843951909,
"grad_norm": 0.41446390748023987,
"learning_rate": 9.327401203873332e-06,
"loss": 1.1051,
"mean_token_accuracy": 0.7141484066843986,
"num_tokens": 15929388.0,
"step": 2040
},
{
"entropy": 1.1305542685091496,
"epoch": 0.5352089637687086,
"grad_norm": 0.749841034412384,
"learning_rate": 9.301230044490972e-06,
"loss": 1.126,
"mean_token_accuracy": 0.7178158447146415,
"num_tokens": 15971137.0,
"step": 2045
},
{
"entropy": 1.0323502123355865,
"epoch": 0.5365175431422262,
"grad_norm": 0.48051634430885315,
"learning_rate": 9.27505888510861e-06,
"loss": 0.9978,
"mean_token_accuracy": 0.7359145298600197,
"num_tokens": 16011804.0,
"step": 2050
},
{
"entropy": 1.0280569117516278,
"epoch": 0.5378261225157438,
"grad_norm": 0.2908135950565338,
"learning_rate": 9.24888772572625e-06,
"loss": 0.9785,
"mean_token_accuracy": 0.7374115951359272,
"num_tokens": 16050785.0,
"step": 2055
},
{
"entropy": 1.0569781139492989,
"epoch": 0.5391347018892615,
"grad_norm": 0.35007575154304504,
"learning_rate": 9.22271656634389e-06,
"loss": 0.9877,
"mean_token_accuracy": 0.7339214488863945,
"num_tokens": 16090836.0,
"step": 2060
},
{
"entropy": 1.0800054289400578,
"epoch": 0.5404432812627791,
"grad_norm": 0.32629233598709106,
"learning_rate": 9.196545406961529e-06,
"loss": 1.0375,
"mean_token_accuracy": 0.7278057150542736,
"num_tokens": 16131547.0,
"step": 2065
},
{
"entropy": 1.0801922511309385,
"epoch": 0.5417518606362968,
"grad_norm": 0.43629997968673706,
"learning_rate": 9.170374247579169e-06,
"loss": 1.0448,
"mean_token_accuracy": 0.7214881278574466,
"num_tokens": 16168637.0,
"step": 2070
},
{
"entropy": 1.10114951916039,
"epoch": 0.5430604400098143,
"grad_norm": 0.3292621076107025,
"learning_rate": 9.144203088196809e-06,
"loss": 1.0632,
"mean_token_accuracy": 0.7205091584473848,
"num_tokens": 16210603.0,
"step": 2075
},
{
"entropy": 1.0731607031077146,
"epoch": 0.5443690193833319,
"grad_norm": 0.3450382351875305,
"learning_rate": 9.118031928814447e-06,
"loss": 0.985,
"mean_token_accuracy": 0.7285647377371788,
"num_tokens": 16248842.0,
"step": 2080
},
{
"entropy": 1.110469686985016,
"epoch": 0.5456775987568496,
"grad_norm": 0.5088993310928345,
"learning_rate": 9.091860769432087e-06,
"loss": 1.0761,
"mean_token_accuracy": 0.7160798791795969,
"num_tokens": 16282386.0,
"step": 2085
},
{
"entropy": 1.066830562800169,
"epoch": 0.5469861781303672,
"grad_norm": 0.5230424404144287,
"learning_rate": 9.065689610049726e-06,
"loss": 1.0209,
"mean_token_accuracy": 0.7298058848828077,
"num_tokens": 16315017.0,
"step": 2090
},
{
"entropy": 1.0682125560939313,
"epoch": 0.5482947575038849,
"grad_norm": 0.29241734743118286,
"learning_rate": 9.039518450667366e-06,
"loss": 1.0398,
"mean_token_accuracy": 0.7212686751037836,
"num_tokens": 16356673.0,
"step": 2095
},
{
"entropy": 1.0166700199246406,
"epoch": 0.5496033368774025,
"grad_norm": 0.37928712368011475,
"learning_rate": 9.013347291285006e-06,
"loss": 0.9511,
"mean_token_accuracy": 0.7362009316682816,
"num_tokens": 16397408.0,
"step": 2100
},
{
"epoch": 0.5496033368774025,
"eval_entropy": 1.0475300779342651,
"eval_loss": 1.0435727834701538,
"eval_mean_token_accuracy": 0.7274958577156067,
"eval_num_tokens": 16397408.0,
"eval_runtime": 97.2217,
"eval_samples_per_second": 10.286,
"eval_steps_per_second": 1.286,
"step": 2100
},
{
"entropy": 1.0761947065591813,
"epoch": 0.5509119162509201,
"grad_norm": 0.4101719558238983,
"learning_rate": 8.987176131902644e-06,
"loss": 1.0654,
"mean_token_accuracy": 0.7192930780351162,
"num_tokens": 16434511.0,
"step": 2105
},
{
"entropy": 1.097949853539467,
"epoch": 0.5522204956244378,
"grad_norm": 0.4837743043899536,
"learning_rate": 8.961004972520282e-06,
"loss": 1.0397,
"mean_token_accuracy": 0.7156211558729411,
"num_tokens": 16472214.0,
"step": 2110
},
{
"entropy": 1.0977317329496146,
"epoch": 0.5535290749979553,
"grad_norm": 0.39783498644828796,
"learning_rate": 8.934833813137923e-06,
"loss": 1.1348,
"mean_token_accuracy": 0.71487329415977,
"num_tokens": 16512818.0,
"step": 2115
},
{
"entropy": 1.1164306864142417,
"epoch": 0.5548376543714729,
"grad_norm": 0.5113677978515625,
"learning_rate": 8.908662653755563e-06,
"loss": 1.0433,
"mean_token_accuracy": 0.7190901465713978,
"num_tokens": 16549988.0,
"step": 2120
},
{
"entropy": 1.118570750951767,
"epoch": 0.5561462337449906,
"grad_norm": 0.457736074924469,
"learning_rate": 8.882491494373201e-06,
"loss": 1.0624,
"mean_token_accuracy": 0.7195118874311447,
"num_tokens": 16588026.0,
"step": 2125
},
{
"entropy": 1.0895183108747006,
"epoch": 0.5574548131185082,
"grad_norm": 0.443959504365921,
"learning_rate": 8.856320334990841e-06,
"loss": 1.0314,
"mean_token_accuracy": 0.7270146373659372,
"num_tokens": 16626368.0,
"step": 2130
},
{
"entropy": 1.091917496919632,
"epoch": 0.5587633924920259,
"grad_norm": 0.3934486210346222,
"learning_rate": 8.83014917560848e-06,
"loss": 1.0449,
"mean_token_accuracy": 0.7249642074108124,
"num_tokens": 16658226.0,
"step": 2135
},
{
"entropy": 1.0808244206011295,
"epoch": 0.5600719718655435,
"grad_norm": 0.4523155093193054,
"learning_rate": 8.80397801622612e-06,
"loss": 1.0726,
"mean_token_accuracy": 0.725436444580555,
"num_tokens": 16694803.0,
"step": 2140
},
{
"entropy": 1.0595968872308732,
"epoch": 0.561380551239061,
"grad_norm": 0.6529179215431213,
"learning_rate": 8.77780685684376e-06,
"loss": 1.0853,
"mean_token_accuracy": 0.728100998699665,
"num_tokens": 16731046.0,
"step": 2145
},
{
"entropy": 1.1225625108927488,
"epoch": 0.5626891306125787,
"grad_norm": 0.5509600043296814,
"learning_rate": 8.751635697461398e-06,
"loss": 1.0464,
"mean_token_accuracy": 0.7140719693154096,
"num_tokens": 16770258.0,
"step": 2150
},
{
"entropy": 1.0641490031033753,
"epoch": 0.5639977099860963,
"grad_norm": 0.401279091835022,
"learning_rate": 8.725464538079038e-06,
"loss": 1.006,
"mean_token_accuracy": 0.7247611582279205,
"num_tokens": 16808152.0,
"step": 2155
},
{
"entropy": 1.077583958208561,
"epoch": 0.565306289359614,
"grad_norm": 0.3935731053352356,
"learning_rate": 8.699293378696676e-06,
"loss": 1.0364,
"mean_token_accuracy": 0.7233245447278023,
"num_tokens": 16846703.0,
"step": 2160
},
{
"entropy": 1.1116807896643877,
"epoch": 0.5666148687331316,
"grad_norm": 0.31971287727355957,
"learning_rate": 8.673122219314316e-06,
"loss": 1.1141,
"mean_token_accuracy": 0.7137772016227245,
"num_tokens": 16885531.0,
"step": 2165
},
{
"entropy": 1.1279645822942257,
"epoch": 0.5679234481066492,
"grad_norm": 0.33220037817955017,
"learning_rate": 8.646951059931956e-06,
"loss": 1.1373,
"mean_token_accuracy": 0.7136048063635826,
"num_tokens": 16922160.0,
"step": 2170
},
{
"entropy": 1.0512098886072636,
"epoch": 0.5692320274801669,
"grad_norm": 0.41083812713623047,
"learning_rate": 8.620779900549595e-06,
"loss": 1.0153,
"mean_token_accuracy": 0.7320755422115326,
"num_tokens": 16961964.0,
"step": 2175
},
{
"entropy": 1.0632092259824275,
"epoch": 0.5705406068536845,
"grad_norm": 0.3310851752758026,
"learning_rate": 8.594608741167235e-06,
"loss": 1.0625,
"mean_token_accuracy": 0.7241213608533144,
"num_tokens": 17005710.0,
"step": 2180
},
{
"entropy": 1.0541639674454928,
"epoch": 0.571849186227202,
"grad_norm": 0.5147233009338379,
"learning_rate": 8.568437581784875e-06,
"loss": 1.0132,
"mean_token_accuracy": 0.7322873834520578,
"num_tokens": 17043166.0,
"step": 2185
},
{
"entropy": 1.1003672912716866,
"epoch": 0.5731577656007197,
"grad_norm": 0.6766669154167175,
"learning_rate": 8.542266422402513e-06,
"loss": 1.1054,
"mean_token_accuracy": 0.7199019856750966,
"num_tokens": 17077200.0,
"step": 2190
},
{
"entropy": 1.0057383235543966,
"epoch": 0.5744663449742373,
"grad_norm": 0.41832664608955383,
"learning_rate": 8.516095263020152e-06,
"loss": 0.9413,
"mean_token_accuracy": 0.7480561546981335,
"num_tokens": 17116788.0,
"step": 2195
},
{
"entropy": 1.0743875481188296,
"epoch": 0.575774924347755,
"grad_norm": 0.38505351543426514,
"learning_rate": 8.489924103637792e-06,
"loss": 0.9846,
"mean_token_accuracy": 0.7285970754921436,
"num_tokens": 17156595.0,
"step": 2200
},
{
"epoch": 0.575774924347755,
"eval_entropy": 1.047807973384857,
"eval_loss": 1.0419152975082397,
"eval_mean_token_accuracy": 0.7281912508010864,
"eval_num_tokens": 17156595.0,
"eval_runtime": 97.2111,
"eval_samples_per_second": 10.287,
"eval_steps_per_second": 1.286,
"step": 2200
},
{
"entropy": 1.018094291538,
"epoch": 0.5770835037212726,
"grad_norm": 0.47084370255470276,
"learning_rate": 8.463752944255432e-06,
"loss": 0.9611,
"mean_token_accuracy": 0.7393688425421715,
"num_tokens": 17190391.0,
"step": 2205
},
{
"entropy": 1.0440906029194594,
"epoch": 0.5783920830947902,
"grad_norm": 0.611663818359375,
"learning_rate": 8.43758178487307e-06,
"loss": 1.0335,
"mean_token_accuracy": 0.7285435609519482,
"num_tokens": 17228559.0,
"step": 2210
},
{
"entropy": 1.033133103698492,
"epoch": 0.5797006624683079,
"grad_norm": 0.4824153482913971,
"learning_rate": 8.41141062549071e-06,
"loss": 1.013,
"mean_token_accuracy": 0.7390854120254516,
"num_tokens": 17264262.0,
"step": 2215
},
{
"entropy": 1.0519094690680504,
"epoch": 0.5810092418418255,
"grad_norm": 0.35666635632514954,
"learning_rate": 8.385239466108349e-06,
"loss": 1.0353,
"mean_token_accuracy": 0.7265864860266447,
"num_tokens": 17298474.0,
"step": 2220
},
{
"entropy": 1.1184704467654227,
"epoch": 0.5823178212153431,
"grad_norm": 0.4187191426753998,
"learning_rate": 8.359068306725989e-06,
"loss": 1.0395,
"mean_token_accuracy": 0.7226161871105432,
"num_tokens": 17342249.0,
"step": 2225
},
{
"entropy": 1.109714150428772,
"epoch": 0.5836264005888607,
"grad_norm": 0.424214243888855,
"learning_rate": 8.332897147343629e-06,
"loss": 1.0727,
"mean_token_accuracy": 0.7170622806996108,
"num_tokens": 17381434.0,
"step": 2230
},
{
"entropy": 1.0706456020474433,
"epoch": 0.5849349799623783,
"grad_norm": 0.3002157211303711,
"learning_rate": 8.306725987961267e-06,
"loss": 1.046,
"mean_token_accuracy": 0.7237683780491352,
"num_tokens": 17419527.0,
"step": 2235
},
{
"entropy": 1.0564652543514967,
"epoch": 0.586243559335896,
"grad_norm": 0.3851909041404724,
"learning_rate": 8.280554828578907e-06,
"loss": 0.9903,
"mean_token_accuracy": 0.731862674281001,
"num_tokens": 17456332.0,
"step": 2240
},
{
"entropy": 1.059485612437129,
"epoch": 0.5875521387094136,
"grad_norm": 0.30845922231674194,
"learning_rate": 8.254383669196546e-06,
"loss": 1.0166,
"mean_token_accuracy": 0.7287565600126982,
"num_tokens": 17503670.0,
"step": 2245
},
{
"entropy": 1.0321750465780497,
"epoch": 0.5888607180829312,
"grad_norm": 0.35590362548828125,
"learning_rate": 8.228212509814186e-06,
"loss": 1.0185,
"mean_token_accuracy": 0.7324472777545452,
"num_tokens": 17541709.0,
"step": 2250
},
{
"entropy": 1.029836729541421,
"epoch": 0.5901692974564489,
"grad_norm": 0.5374951362609863,
"learning_rate": 8.202041350431826e-06,
"loss": 0.9625,
"mean_token_accuracy": 0.7331007912755012,
"num_tokens": 17578411.0,
"step": 2255
},
{
"entropy": 1.0388061545789242,
"epoch": 0.5914778768299664,
"grad_norm": 0.26899445056915283,
"learning_rate": 8.175870191049464e-06,
"loss": 0.9839,
"mean_token_accuracy": 0.7304695654660464,
"num_tokens": 17618221.0,
"step": 2260
},
{
"entropy": 1.0612275712192059,
"epoch": 0.5927864562034841,
"grad_norm": 0.47736746072769165,
"learning_rate": 8.149699031667102e-06,
"loss": 1.0262,
"mean_token_accuracy": 0.725922341644764,
"num_tokens": 17656712.0,
"step": 2265
},
{
"entropy": 1.1566721752285958,
"epoch": 0.5940950355770017,
"grad_norm": 0.37901201844215393,
"learning_rate": 8.123527872284743e-06,
"loss": 1.1165,
"mean_token_accuracy": 0.7078841663897038,
"num_tokens": 17692983.0,
"step": 2270
},
{
"entropy": 1.1062884621322155,
"epoch": 0.5954036149505193,
"grad_norm": 0.5285326838493347,
"learning_rate": 8.097356712902383e-06,
"loss": 1.06,
"mean_token_accuracy": 0.7236683152616024,
"num_tokens": 17732335.0,
"step": 2275
},
{
"entropy": 1.093737093731761,
"epoch": 0.596712194324037,
"grad_norm": 0.3490510582923889,
"learning_rate": 8.071185553520021e-06,
"loss": 1.0661,
"mean_token_accuracy": 0.7216609187424183,
"num_tokens": 17767130.0,
"step": 2280
},
{
"entropy": 1.108936246484518,
"epoch": 0.5980207736975546,
"grad_norm": 0.5806106925010681,
"learning_rate": 8.045014394137661e-06,
"loss": 1.1176,
"mean_token_accuracy": 0.7208809830248356,
"num_tokens": 17803730.0,
"step": 2285
},
{
"entropy": 1.168883834220469,
"epoch": 0.5993293530710723,
"grad_norm": 0.36321160197257996,
"learning_rate": 8.0188432347553e-06,
"loss": 1.1264,
"mean_token_accuracy": 0.707911616563797,
"num_tokens": 17844864.0,
"step": 2290
},
{
"entropy": 1.0555749453604222,
"epoch": 0.6006379324445899,
"grad_norm": 0.37642714381217957,
"learning_rate": 7.99267207537294e-06,
"loss": 1.0828,
"mean_token_accuracy": 0.7319136694073677,
"num_tokens": 17888044.0,
"step": 2295
},
{
"entropy": 1.0490516472607851,
"epoch": 0.6019465118181074,
"grad_norm": 1.011374831199646,
"learning_rate": 7.96650091599058e-06,
"loss": 1.0583,
"mean_token_accuracy": 0.7280811902135611,
"num_tokens": 17925845.0,
"step": 2300
},
{
"epoch": 0.6019465118181074,
"eval_entropy": 1.038636552810669,
"eval_loss": 1.0401142835617065,
"eval_mean_token_accuracy": 0.7285269827842712,
"eval_num_tokens": 17925845.0,
"eval_runtime": 97.1825,
"eval_samples_per_second": 10.29,
"eval_steps_per_second": 1.286,
"step": 2300
},
{
"entropy": 1.060953125730157,
"epoch": 0.6032550911916251,
"grad_norm": 0.3888475000858307,
"learning_rate": 7.940329756608218e-06,
"loss": 1.0127,
"mean_token_accuracy": 0.7261522125452757,
"num_tokens": 17967227.0,
"step": 2305
},
{
"entropy": 1.108807889930904,
"epoch": 0.6045636705651427,
"grad_norm": 0.4732019901275635,
"learning_rate": 7.914158597225858e-06,
"loss": 1.048,
"mean_token_accuracy": 0.7257308520376682,
"num_tokens": 18000648.0,
"step": 2310
},
{
"entropy": 1.0645822279155255,
"epoch": 0.6058722499386603,
"grad_norm": 0.5344901084899902,
"learning_rate": 7.887987437843498e-06,
"loss": 1.0637,
"mean_token_accuracy": 0.726478286832571,
"num_tokens": 18041076.0,
"step": 2315
},
{
"entropy": 1.0792375735938549,
"epoch": 0.607180829312178,
"grad_norm": 0.4639453589916229,
"learning_rate": 7.861816278461136e-06,
"loss": 1.0164,
"mean_token_accuracy": 0.7243579313158989,
"num_tokens": 18078141.0,
"step": 2320
},
{
"entropy": 1.1244300540536643,
"epoch": 0.6084894086856956,
"grad_norm": 0.4256502389907837,
"learning_rate": 7.835645119078776e-06,
"loss": 1.0715,
"mean_token_accuracy": 0.721744529902935,
"num_tokens": 18115889.0,
"step": 2325
},
{
"entropy": 1.0580251537263394,
"epoch": 0.6097979880592133,
"grad_norm": 0.3264947235584259,
"learning_rate": 7.809473959696415e-06,
"loss": 1.0605,
"mean_token_accuracy": 0.7277645215392112,
"num_tokens": 18157055.0,
"step": 2330
},
{
"entropy": 1.0746600326150655,
"epoch": 0.6111065674327308,
"grad_norm": 0.2848566472530365,
"learning_rate": 7.783302800314055e-06,
"loss": 1.0606,
"mean_token_accuracy": 0.7236160174012184,
"num_tokens": 18194754.0,
"step": 2335
},
{
"entropy": 1.0729442548006773,
"epoch": 0.6124151468062484,
"grad_norm": 0.34202495217323303,
"learning_rate": 7.757131640931695e-06,
"loss": 1.0274,
"mean_token_accuracy": 0.7167530670762062,
"num_tokens": 18239971.0,
"step": 2340
},
{
"entropy": 1.1377609726041555,
"epoch": 0.6137237261797661,
"grad_norm": 0.5917489528656006,
"learning_rate": 7.730960481549333e-06,
"loss": 1.0862,
"mean_token_accuracy": 0.7097996596246958,
"num_tokens": 18277572.0,
"step": 2345
},
{
"entropy": 1.0688726913183928,
"epoch": 0.6150323055532837,
"grad_norm": 0.3864566683769226,
"learning_rate": 7.704789322166972e-06,
"loss": 0.9895,
"mean_token_accuracy": 0.7335865460336208,
"num_tokens": 18318117.0,
"step": 2350
},
{
"entropy": 1.0338464200496673,
"epoch": 0.6163408849268014,
"grad_norm": 0.3866376280784607,
"learning_rate": 7.678618162784612e-06,
"loss": 0.9989,
"mean_token_accuracy": 0.7306736633181572,
"num_tokens": 18357427.0,
"step": 2355
},
{
"entropy": 1.0776856757700444,
"epoch": 0.617649464300319,
"grad_norm": 0.37204793095588684,
"learning_rate": 7.652447003402252e-06,
"loss": 1.0805,
"mean_token_accuracy": 0.7231950167566538,
"num_tokens": 18398690.0,
"step": 2360
},
{
"entropy": 1.0225880276411772,
"epoch": 0.6189580436738366,
"grad_norm": 0.4354966878890991,
"learning_rate": 7.62627584401989e-06,
"loss": 0.9557,
"mean_token_accuracy": 0.7365483164787292,
"num_tokens": 18436962.0,
"step": 2365
},
{
"entropy": 1.056931370869279,
"epoch": 0.6202666230473542,
"grad_norm": 0.29399538040161133,
"learning_rate": 7.60010468463753e-06,
"loss": 1.0455,
"mean_token_accuracy": 0.7246545068919659,
"num_tokens": 18479590.0,
"step": 2370
},
{
"entropy": 1.0880108322948217,
"epoch": 0.6215752024208718,
"grad_norm": 0.26629939675331116,
"learning_rate": 7.5739335252551695e-06,
"loss": 1.0752,
"mean_token_accuracy": 0.7159001469612122,
"num_tokens": 18520588.0,
"step": 2375
},
{
"entropy": 1.0517158180475235,
"epoch": 0.6228837817943894,
"grad_norm": 0.4959399998188019,
"learning_rate": 7.547762365872809e-06,
"loss": 0.9754,
"mean_token_accuracy": 0.7284245084971189,
"num_tokens": 18556912.0,
"step": 2380
},
{
"entropy": 1.0565275117754935,
"epoch": 0.6241923611679071,
"grad_norm": 0.6061561107635498,
"learning_rate": 7.521591206490449e-06,
"loss": 0.9693,
"mean_token_accuracy": 0.7295171424746514,
"num_tokens": 18591545.0,
"step": 2385
},
{
"entropy": 1.1225751712918282,
"epoch": 0.6255009405414247,
"grad_norm": 0.43570899963378906,
"learning_rate": 7.495420047108087e-06,
"loss": 1.0837,
"mean_token_accuracy": 0.7186510719358921,
"num_tokens": 18632236.0,
"step": 2390
},
{
"entropy": 1.1181074380874634,
"epoch": 0.6268095199149424,
"grad_norm": 0.49738365411758423,
"learning_rate": 7.469248887725726e-06,
"loss": 1.0293,
"mean_token_accuracy": 0.7210543874651194,
"num_tokens": 18671653.0,
"step": 2395
},
{
"entropy": 1.092723872885108,
"epoch": 0.62811809928846,
"grad_norm": 0.5354933142662048,
"learning_rate": 7.4430777283433664e-06,
"loss": 1.0792,
"mean_token_accuracy": 0.7276735674589873,
"num_tokens": 18712711.0,
"step": 2400
},
{
"epoch": 0.62811809928846,
"eval_entropy": 1.0314421949386596,
"eval_loss": 1.0386921167373657,
"eval_mean_token_accuracy": 0.7286318821907043,
"eval_num_tokens": 18712711.0,
"eval_runtime": 97.152,
"eval_samples_per_second": 10.293,
"eval_steps_per_second": 1.287,
"step": 2400
},
{
"entropy": 1.1883727714419365,
"epoch": 0.6294266786619775,
"grad_norm": 0.3528256416320801,
"learning_rate": 7.416906568961006e-06,
"loss": 1.1798,
"mean_token_accuracy": 0.7113151207566262,
"num_tokens": 18752528.0,
"step": 2405
},
{
"entropy": 1.1015507124364377,
"epoch": 0.6307352580354952,
"grad_norm": 0.421294242143631,
"learning_rate": 7.390735409578646e-06,
"loss": 1.058,
"mean_token_accuracy": 0.7168937616050244,
"num_tokens": 18791480.0,
"step": 2410
},
{
"entropy": 1.040089299157262,
"epoch": 0.6320438374090128,
"grad_norm": 0.34841352701187134,
"learning_rate": 7.364564250196284e-06,
"loss": 0.9257,
"mean_token_accuracy": 0.7292000584304332,
"num_tokens": 18827672.0,
"step": 2415
},
{
"entropy": 1.0772018309682607,
"epoch": 0.6333524167825305,
"grad_norm": 0.39158549904823303,
"learning_rate": 7.338393090813923e-06,
"loss": 1.0312,
"mean_token_accuracy": 0.7295682705938816,
"num_tokens": 18867541.0,
"step": 2420
},
{
"entropy": 1.191065814346075,
"epoch": 0.6346609961560481,
"grad_norm": 0.5301610231399536,
"learning_rate": 7.312221931431563e-06,
"loss": 1.214,
"mean_token_accuracy": 0.7043293602764606,
"num_tokens": 18907080.0,
"step": 2425
},
{
"entropy": 1.0250558149069549,
"epoch": 0.6359695755295657,
"grad_norm": 0.46418026089668274,
"learning_rate": 7.286050772049203e-06,
"loss": 0.9982,
"mean_token_accuracy": 0.7344914440065622,
"num_tokens": 18945688.0,
"step": 2430
},
{
"entropy": 1.1069321628659963,
"epoch": 0.6372781549030834,
"grad_norm": 0.45496806502342224,
"learning_rate": 7.259879612666841e-06,
"loss": 1.016,
"mean_token_accuracy": 0.7186556220054626,
"num_tokens": 18982398.0,
"step": 2435
},
{
"entropy": 1.0202362570911645,
"epoch": 0.638586734276601,
"grad_norm": 0.46681129932403564,
"learning_rate": 7.233708453284481e-06,
"loss": 1.0134,
"mean_token_accuracy": 0.7437549475580454,
"num_tokens": 19022263.0,
"step": 2440
},
{
"entropy": 1.026650892943144,
"epoch": 0.6398953136501185,
"grad_norm": 0.2987666428089142,
"learning_rate": 7.20753729390212e-06,
"loss": 0.9835,
"mean_token_accuracy": 0.7353227615356446,
"num_tokens": 19065353.0,
"step": 2445
},
{
"entropy": 1.0794596791267395,
"epoch": 0.6412038930236362,
"grad_norm": 0.6045072674751282,
"learning_rate": 7.1813661345197595e-06,
"loss": 1.0901,
"mean_token_accuracy": 0.7167344830930233,
"num_tokens": 19106128.0,
"step": 2450
},
{
"entropy": 1.1413863252848386,
"epoch": 0.6425124723971538,
"grad_norm": 0.408202588558197,
"learning_rate": 7.1551949751373995e-06,
"loss": 1.07,
"mean_token_accuracy": 0.709665472060442,
"num_tokens": 19144518.0,
"step": 2455
},
{
"entropy": 1.018723826110363,
"epoch": 0.6438210517706715,
"grad_norm": 0.7658793330192566,
"learning_rate": 7.129023815755039e-06,
"loss": 1.0139,
"mean_token_accuracy": 0.7404938109219075,
"num_tokens": 19186225.0,
"step": 2460
},
{
"entropy": 1.035357365384698,
"epoch": 0.6451296311441891,
"grad_norm": 0.27525585889816284,
"learning_rate": 7.102852656372677e-06,
"loss": 0.9998,
"mean_token_accuracy": 0.7318320117890835,
"num_tokens": 19227399.0,
"step": 2465
},
{
"entropy": 0.9564740158617496,
"epoch": 0.6464382105177067,
"grad_norm": 0.4672304093837738,
"learning_rate": 7.076681496990317e-06,
"loss": 0.9195,
"mean_token_accuracy": 0.7519853860139847,
"num_tokens": 19267194.0,
"step": 2470
},
{
"entropy": 1.0981974124908447,
"epoch": 0.6477467898912244,
"grad_norm": 0.5112661719322205,
"learning_rate": 7.050510337607956e-06,
"loss": 1.0416,
"mean_token_accuracy": 0.7168846629559994,
"num_tokens": 19303499.0,
"step": 2475
},
{
"entropy": 1.1099361848086118,
"epoch": 0.6490553692647419,
"grad_norm": 0.6969805359840393,
"learning_rate": 7.0243391782255965e-06,
"loss": 1.0715,
"mean_token_accuracy": 0.7152190506458282,
"num_tokens": 19341950.0,
"step": 2480
},
{
"entropy": 1.0465524211525916,
"epoch": 0.6503639486382596,
"grad_norm": 0.26998811960220337,
"learning_rate": 6.998168018843236e-06,
"loss": 1.0512,
"mean_token_accuracy": 0.7323187306523323,
"num_tokens": 19386737.0,
"step": 2485
},
{
"entropy": 0.9889295142143965,
"epoch": 0.6516725280117772,
"grad_norm": 0.38220036029815674,
"learning_rate": 6.971996859460874e-06,
"loss": 0.9593,
"mean_token_accuracy": 0.7394830010831356,
"num_tokens": 19428001.0,
"step": 2490
},
{
"entropy": 1.0677450440824032,
"epoch": 0.6529811073852948,
"grad_norm": 0.4677034914493561,
"learning_rate": 6.945825700078514e-06,
"loss": 0.9954,
"mean_token_accuracy": 0.7268911335617304,
"num_tokens": 19465393.0,
"step": 2495
},
{
"entropy": 1.049769724905491,
"epoch": 0.6542896867588125,
"grad_norm": 0.6634142398834229,
"learning_rate": 6.919654540696153e-06,
"loss": 1.039,
"mean_token_accuracy": 0.7285342015326023,
"num_tokens": 19501464.0,
"step": 2500
},
{
"epoch": 0.6542896867588125,
"eval_entropy": 1.035133779525757,
"eval_loss": 1.0362484455108643,
"eval_mean_token_accuracy": 0.7291943707466125,
"eval_num_tokens": 19501464.0,
"eval_runtime": 97.1896,
"eval_samples_per_second": 10.289,
"eval_steps_per_second": 1.286,
"step": 2500
},
{
"entropy": 1.0178256921470166,
"epoch": 0.6555982661323301,
"grad_norm": 0.3480639159679413,
"learning_rate": 6.8934833813137926e-06,
"loss": 0.9423,
"mean_token_accuracy": 0.7350625194609165,
"num_tokens": 19541127.0,
"step": 2505
},
{
"entropy": 1.1573872964829206,
"epoch": 0.6569068455058478,
"grad_norm": 0.3416019678115845,
"learning_rate": 6.867312221931433e-06,
"loss": 1.1634,
"mean_token_accuracy": 0.7034627720713615,
"num_tokens": 19577837.0,
"step": 2510
},
{
"entropy": 1.08112654350698,
"epoch": 0.6582154248793654,
"grad_norm": 0.5420477390289307,
"learning_rate": 6.841141062549072e-06,
"loss": 1.016,
"mean_token_accuracy": 0.7271684132516384,
"num_tokens": 19608622.0,
"step": 2515
},
{
"entropy": 0.9906784903258086,
"epoch": 0.6595240042528829,
"grad_norm": 0.46363091468811035,
"learning_rate": 6.81496990316671e-06,
"loss": 0.9497,
"mean_token_accuracy": 0.7543116796761751,
"num_tokens": 19643843.0,
"step": 2520
},
{
"entropy": 1.001466591656208,
"epoch": 0.6608325836264006,
"grad_norm": 0.37773942947387695,
"learning_rate": 6.78879874378435e-06,
"loss": 0.9092,
"mean_token_accuracy": 0.7434275582432747,
"num_tokens": 19681055.0,
"step": 2525
},
{
"entropy": 1.1146674454212189,
"epoch": 0.6621411629999182,
"grad_norm": 0.33123865723609924,
"learning_rate": 6.7626275844019895e-06,
"loss": 1.1595,
"mean_token_accuracy": 0.721651129052043,
"num_tokens": 19722916.0,
"step": 2530
},
{
"entropy": 1.0137363068759442,
"epoch": 0.6634497423734358,
"grad_norm": 0.2801375091075897,
"learning_rate": 6.736456425019629e-06,
"loss": 1.0089,
"mean_token_accuracy": 0.7334360480308533,
"num_tokens": 19764966.0,
"step": 2535
},
{
"entropy": 1.0639188002794981,
"epoch": 0.6647583217469535,
"grad_norm": 0.3247496485710144,
"learning_rate": 6.710285265637269e-06,
"loss": 1.0763,
"mean_token_accuracy": 0.7175609428435564,
"num_tokens": 19804025.0,
"step": 2540
},
{
"entropy": 1.1048081483691932,
"epoch": 0.6660669011204711,
"grad_norm": 0.4086282551288605,
"learning_rate": 6.684114106254907e-06,
"loss": 1.1058,
"mean_token_accuracy": 0.7259872317314148,
"num_tokens": 19843774.0,
"step": 2545
},
{
"entropy": 1.1255848862230777,
"epoch": 0.6673754804939888,
"grad_norm": 0.44439879059791565,
"learning_rate": 6.657942946872546e-06,
"loss": 1.0964,
"mean_token_accuracy": 0.7162352904677391,
"num_tokens": 19881825.0,
"step": 2550
},
{
"entropy": 1.1439194109290838,
"epoch": 0.6686840598675063,
"grad_norm": 0.29299938678741455,
"learning_rate": 6.6317717874901865e-06,
"loss": 1.1223,
"mean_token_accuracy": 0.7133913572877646,
"num_tokens": 19921618.0,
"step": 2555
},
{
"entropy": 1.029699081927538,
"epoch": 0.6699926392410239,
"grad_norm": 0.40280821919441223,
"learning_rate": 6.605600628107826e-06,
"loss": 0.9621,
"mean_token_accuracy": 0.7391855999827385,
"num_tokens": 19960586.0,
"step": 2560
},
{
"entropy": 1.003632688894868,
"epoch": 0.6713012186145416,
"grad_norm": 0.2862677276134491,
"learning_rate": 6.579429468725466e-06,
"loss": 0.9738,
"mean_token_accuracy": 0.7375729866325855,
"num_tokens": 20000547.0,
"step": 2565
},
{
"entropy": 1.1057576406747103,
"epoch": 0.6726097979880592,
"grad_norm": 0.28728872537612915,
"learning_rate": 6.553258309343105e-06,
"loss": 1.0678,
"mean_token_accuracy": 0.7252844780683517,
"num_tokens": 20045934.0,
"step": 2570
},
{
"entropy": 1.0278929378837347,
"epoch": 0.6739183773615769,
"grad_norm": 0.3557819724082947,
"learning_rate": 6.527087149960743e-06,
"loss": 0.9804,
"mean_token_accuracy": 0.7362292345613242,
"num_tokens": 20088247.0,
"step": 2575
},
{
"entropy": 1.0297599412500857,
"epoch": 0.6752269567350945,
"grad_norm": 0.3730546534061432,
"learning_rate": 6.500915990578383e-06,
"loss": 1.0108,
"mean_token_accuracy": 0.7349158681929111,
"num_tokens": 20130988.0,
"step": 2580
},
{
"entropy": 1.0113672140985728,
"epoch": 0.6765355361086121,
"grad_norm": 0.3825022876262665,
"learning_rate": 6.474744831196023e-06,
"loss": 0.9893,
"mean_token_accuracy": 0.7319887422025204,
"num_tokens": 20169034.0,
"step": 2585
},
{
"entropy": 1.1008230350911616,
"epoch": 0.6778441154821297,
"grad_norm": 0.5840580463409424,
"learning_rate": 6.448573671813662e-06,
"loss": 1.0848,
"mean_token_accuracy": 0.716325681656599,
"num_tokens": 20206269.0,
"step": 2590
},
{
"entropy": 1.0766204059123994,
"epoch": 0.6791526948556473,
"grad_norm": 0.4373389482498169,
"learning_rate": 6.422402512431302e-06,
"loss": 1.0217,
"mean_token_accuracy": 0.7277089573442936,
"num_tokens": 20241866.0,
"step": 2595
},
{
"entropy": 1.0236305307596922,
"epoch": 0.6804612742291649,
"grad_norm": 0.6429861783981323,
"learning_rate": 6.39623135304894e-06,
"loss": 0.9714,
"mean_token_accuracy": 0.7359235771000385,
"num_tokens": 20279405.0,
"step": 2600
},
{
"epoch": 0.6804612742291649,
"eval_entropy": 1.0339577412605285,
"eval_loss": 1.03507399559021,
"eval_mean_token_accuracy": 0.7294820728302002,
"eval_num_tokens": 20279405.0,
"eval_runtime": 97.2889,
"eval_samples_per_second": 10.279,
"eval_steps_per_second": 1.285,
"step": 2600
},
{
"entropy": 1.1064529620110988,
"epoch": 0.6817698536026826,
"grad_norm": 0.6711133718490601,
"learning_rate": 6.3700601936665795e-06,
"loss": 1.0578,
"mean_token_accuracy": 0.7143168658018112,
"num_tokens": 20318358.0,
"step": 2605
},
{
"entropy": 1.0551523761823773,
"epoch": 0.6830784329762002,
"grad_norm": 0.6876797080039978,
"learning_rate": 6.3438890342842196e-06,
"loss": 1.0351,
"mean_token_accuracy": 0.7304907165467739,
"num_tokens": 20355919.0,
"step": 2610
},
{
"entropy": 1.0216753158718348,
"epoch": 0.6843870123497179,
"grad_norm": 0.43939870595932007,
"learning_rate": 6.317717874901859e-06,
"loss": 0.9788,
"mean_token_accuracy": 0.7326975151896477,
"num_tokens": 20395739.0,
"step": 2615
},
{
"entropy": 1.0042755860835313,
"epoch": 0.6856955917232355,
"grad_norm": 0.45771950483322144,
"learning_rate": 6.291546715519498e-06,
"loss": 1.0214,
"mean_token_accuracy": 0.7377393446862698,
"num_tokens": 20439063.0,
"step": 2620
},
{
"entropy": 1.08444495908916,
"epoch": 0.687004171096753,
"grad_norm": 0.328112930059433,
"learning_rate": 6.265375556137138e-06,
"loss": 1.0578,
"mean_token_accuracy": 0.7258397076278925,
"num_tokens": 20480446.0,
"step": 2625
},
{
"entropy": 1.053849907964468,
"epoch": 0.6883127504702707,
"grad_norm": 0.4496280550956726,
"learning_rate": 6.2392043967547764e-06,
"loss": 1.0282,
"mean_token_accuracy": 0.7329249262809754,
"num_tokens": 20519257.0,
"step": 2630
},
{
"entropy": 1.0985306035727262,
"epoch": 0.6896213298437883,
"grad_norm": 0.43236038088798523,
"learning_rate": 6.213033237372416e-06,
"loss": 1.0809,
"mean_token_accuracy": 0.7229348205029964,
"num_tokens": 20560703.0,
"step": 2635
},
{
"entropy": 1.039340978488326,
"epoch": 0.690929909217306,
"grad_norm": 0.592126727104187,
"learning_rate": 6.186862077990056e-06,
"loss": 0.9885,
"mean_token_accuracy": 0.7413885131478309,
"num_tokens": 20602363.0,
"step": 2640
},
{
"entropy": 1.040134682506323,
"epoch": 0.6922384885908236,
"grad_norm": 0.5198757648468018,
"learning_rate": 6.160690918607695e-06,
"loss": 1.0226,
"mean_token_accuracy": 0.7368385374546051,
"num_tokens": 20639437.0,
"step": 2645
},
{
"entropy": 1.1080417584627866,
"epoch": 0.6935470679643412,
"grad_norm": 0.2894703149795532,
"learning_rate": 6.134519759225335e-06,
"loss": 1.1345,
"mean_token_accuracy": 0.7155061598867178,
"num_tokens": 20680275.0,
"step": 2650
},
{
"entropy": 1.0198019824922084,
"epoch": 0.6948556473378589,
"grad_norm": 0.3462557792663574,
"learning_rate": 6.108348599842973e-06,
"loss": 1.0143,
"mean_token_accuracy": 0.7402982845902443,
"num_tokens": 20718417.0,
"step": 2655
},
{
"entropy": 1.0144409965723753,
"epoch": 0.6961642267113765,
"grad_norm": 0.3350953757762909,
"learning_rate": 6.082177440460613e-06,
"loss": 0.9838,
"mean_token_accuracy": 0.7343977279961109,
"num_tokens": 20762227.0,
"step": 2660
},
{
"entropy": 1.0715901810675859,
"epoch": 0.697472806084894,
"grad_norm": 0.6075988411903381,
"learning_rate": 6.056006281078253e-06,
"loss": 1.0055,
"mean_token_accuracy": 0.7305215656757355,
"num_tokens": 20798122.0,
"step": 2665
},
{
"entropy": 1.1007160846143962,
"epoch": 0.6987813854584117,
"grad_norm": 0.4501047134399414,
"learning_rate": 6.029835121695892e-06,
"loss": 1.058,
"mean_token_accuracy": 0.7208323903381825,
"num_tokens": 20834210.0,
"step": 2670
},
{
"entropy": 1.0522528853267432,
"epoch": 0.7000899648319293,
"grad_norm": 0.36250752210617065,
"learning_rate": 6.00366396231353e-06,
"loss": 0.9972,
"mean_token_accuracy": 0.7273079171776772,
"num_tokens": 20872418.0,
"step": 2675
},
{
"entropy": 1.054316360130906,
"epoch": 0.701398544205447,
"grad_norm": 0.3319310247898102,
"learning_rate": 5.977492802931171e-06,
"loss": 1.0515,
"mean_token_accuracy": 0.7271708060055971,
"num_tokens": 20911504.0,
"step": 2680
},
{
"entropy": 1.0553948923945426,
"epoch": 0.7027071235789646,
"grad_norm": 0.46403029561042786,
"learning_rate": 5.9513216435488095e-06,
"loss": 1.0058,
"mean_token_accuracy": 0.733677176013589,
"num_tokens": 20947438.0,
"step": 2685
},
{
"entropy": 1.135214940458536,
"epoch": 0.7040157029524822,
"grad_norm": 0.2997356355190277,
"learning_rate": 5.925150484166449e-06,
"loss": 1.0205,
"mean_token_accuracy": 0.7149875804781913,
"num_tokens": 20984843.0,
"step": 2690
},
{
"entropy": 1.067294117808342,
"epoch": 0.7053242823259999,
"grad_norm": 0.7109698057174683,
"learning_rate": 5.898979324784089e-06,
"loss": 1.0424,
"mean_token_accuracy": 0.7298292949795723,
"num_tokens": 21024317.0,
"step": 2695
},
{
"entropy": 1.0000967428088188,
"epoch": 0.7066328616995174,
"grad_norm": 0.4540089964866638,
"learning_rate": 5.872808165401728e-06,
"loss": 1.0007,
"mean_token_accuracy": 0.734975878149271,
"num_tokens": 21064116.0,
"step": 2700
},
{
"epoch": 0.7066328616995174,
"eval_entropy": 1.033733127117157,
"eval_loss": 1.0336334705352783,
"eval_mean_token_accuracy": 0.7298680348396301,
"eval_num_tokens": 21064116.0,
"eval_runtime": 97.2545,
"eval_samples_per_second": 10.282,
"eval_steps_per_second": 1.285,
"step": 2700
},
{
"entropy": 1.0902955643832684,
"epoch": 0.7079414410730351,
"grad_norm": 0.4074270725250244,
"learning_rate": 5.846637006019366e-06,
"loss": 1.103,
"mean_token_accuracy": 0.7196738637983799,
"num_tokens": 21106096.0,
"step": 2705
},
{
"entropy": 0.9893661700189114,
"epoch": 0.7092500204465527,
"grad_norm": 0.43556877970695496,
"learning_rate": 5.8204658466370065e-06,
"loss": 0.9469,
"mean_token_accuracy": 0.7477334760129452,
"num_tokens": 21143151.0,
"step": 2710
},
{
"entropy": 1.0833146207034587,
"epoch": 0.7105585998200703,
"grad_norm": 0.6449909806251526,
"learning_rate": 5.794294687254646e-06,
"loss": 1.0748,
"mean_token_accuracy": 0.7204306960105896,
"num_tokens": 21179840.0,
"step": 2715
},
{
"entropy": 1.0693892493844033,
"epoch": 0.711867179193588,
"grad_norm": 0.2570124864578247,
"learning_rate": 5.768123527872285e-06,
"loss": 1.0456,
"mean_token_accuracy": 0.722801336273551,
"num_tokens": 21217194.0,
"step": 2720
},
{
"entropy": 1.0765843842178584,
"epoch": 0.7131757585671056,
"grad_norm": 0.5316287279129028,
"learning_rate": 5.741952368489925e-06,
"loss": 1.0263,
"mean_token_accuracy": 0.7246413186192513,
"num_tokens": 21251605.0,
"step": 2725
},
{
"entropy": 1.0620105125010013,
"epoch": 0.7144843379406232,
"grad_norm": 0.3342624306678772,
"learning_rate": 5.715781209107563e-06,
"loss": 1.0065,
"mean_token_accuracy": 0.7301050588488579,
"num_tokens": 21294871.0,
"step": 2730
},
{
"entropy": 1.0648996368050576,
"epoch": 0.7157929173141409,
"grad_norm": 0.5223391056060791,
"learning_rate": 5.689610049725203e-06,
"loss": 1.0258,
"mean_token_accuracy": 0.7259559325873852,
"num_tokens": 21326670.0,
"step": 2735
},
{
"entropy": 1.0742478474974633,
"epoch": 0.7171014966876584,
"grad_norm": 0.5522735118865967,
"learning_rate": 5.663438890342843e-06,
"loss": 1.0703,
"mean_token_accuracy": 0.7225585378706455,
"num_tokens": 21367325.0,
"step": 2740
},
{
"entropy": 0.9710161112248897,
"epoch": 0.7184100760611761,
"grad_norm": 0.33066603541374207,
"learning_rate": 5.637267730960482e-06,
"loss": 0.9442,
"mean_token_accuracy": 0.7384205959737301,
"num_tokens": 21408334.0,
"step": 2745
},
{
"entropy": 1.0664944287389517,
"epoch": 0.7197186554346937,
"grad_norm": 0.4889247417449951,
"learning_rate": 5.611096571578122e-06,
"loss": 1.112,
"mean_token_accuracy": 0.7264700572937727,
"num_tokens": 21449839.0,
"step": 2750
},
{
"entropy": 1.0030995801091194,
"epoch": 0.7210272348082113,
"grad_norm": 0.3732108771800995,
"learning_rate": 5.584925412195761e-06,
"loss": 0.9111,
"mean_token_accuracy": 0.7403148956596851,
"num_tokens": 21487742.0,
"step": 2755
},
{
"entropy": 1.0033697545528413,
"epoch": 0.722335814181729,
"grad_norm": 0.43490299582481384,
"learning_rate": 5.5587542528133995e-06,
"loss": 0.9582,
"mean_token_accuracy": 0.7397238194942475,
"num_tokens": 21526844.0,
"step": 2760
},
{
"entropy": 1.1253223221749067,
"epoch": 0.7236443935552466,
"grad_norm": 0.46164193749427795,
"learning_rate": 5.5325830934310396e-06,
"loss": 1.1347,
"mean_token_accuracy": 0.7200637456029654,
"num_tokens": 21568539.0,
"step": 2765
},
{
"entropy": 1.0714545018970967,
"epoch": 0.7249529729287643,
"grad_norm": 0.39674556255340576,
"learning_rate": 5.506411934048679e-06,
"loss": 1.0253,
"mean_token_accuracy": 0.7276010327041149,
"num_tokens": 21603326.0,
"step": 2770
},
{
"entropy": 1.06046348772943,
"epoch": 0.7262615523022818,
"grad_norm": 0.4999883770942688,
"learning_rate": 5.480240774666318e-06,
"loss": 1.0495,
"mean_token_accuracy": 0.7262998007237911,
"num_tokens": 21642529.0,
"step": 2775
},
{
"entropy": 1.0288800846785306,
"epoch": 0.7275701316757994,
"grad_norm": 0.45190781354904175,
"learning_rate": 5.454069615283958e-06,
"loss": 1.0062,
"mean_token_accuracy": 0.7358099550008774,
"num_tokens": 21675950.0,
"step": 2780
},
{
"entropy": 1.064651170000434,
"epoch": 0.7288787110493171,
"grad_norm": 0.6091992259025574,
"learning_rate": 5.4278984559015964e-06,
"loss": 1.0302,
"mean_token_accuracy": 0.7322472549974919,
"num_tokens": 21712389.0,
"step": 2785
},
{
"entropy": 1.067555221542716,
"epoch": 0.7301872904228347,
"grad_norm": 0.3587604761123657,
"learning_rate": 5.401727296519236e-06,
"loss": 1.0269,
"mean_token_accuracy": 0.7269019264727831,
"num_tokens": 21750233.0,
"step": 2790
},
{
"entropy": 1.0527180466800927,
"epoch": 0.7314958697963523,
"grad_norm": 0.6426623463630676,
"learning_rate": 5.375556137136876e-06,
"loss": 1.045,
"mean_token_accuracy": 0.7300838761031627,
"num_tokens": 21788770.0,
"step": 2795
},
{
"entropy": 1.115583229251206,
"epoch": 0.73280444916987,
"grad_norm": 0.5922237634658813,
"learning_rate": 5.349384977754515e-06,
"loss": 1.0523,
"mean_token_accuracy": 0.7175281938165426,
"num_tokens": 21826397.0,
"step": 2800
},
{
"epoch": 0.73280444916987,
"eval_entropy": 1.0408368062973024,
"eval_loss": 1.0325729846954346,
"eval_mean_token_accuracy": 0.73026052236557,
"eval_num_tokens": 21826397.0,
"eval_runtime": 97.1897,
"eval_samples_per_second": 10.289,
"eval_steps_per_second": 1.286,
"step": 2800
},
{
"entropy": 1.1054367668926717,
"epoch": 0.7341130285433876,
"grad_norm": 0.5465598106384277,
"learning_rate": 5.323213818372154e-06,
"loss": 1.08,
"mean_token_accuracy": 0.720421589165926,
"num_tokens": 21863424.0,
"step": 2805
},
{
"entropy": 1.1147074215114117,
"epoch": 0.7354216079169053,
"grad_norm": 0.4170718193054199,
"learning_rate": 5.297042658989794e-06,
"loss": 1.0508,
"mean_token_accuracy": 0.7232054278254509,
"num_tokens": 21900462.0,
"step": 2810
},
{
"entropy": 1.0979057766497136,
"epoch": 0.7367301872904228,
"grad_norm": 0.4706130027770996,
"learning_rate": 5.270871499607433e-06,
"loss": 1.0168,
"mean_token_accuracy": 0.719353062659502,
"num_tokens": 21933354.0,
"step": 2815
},
{
"entropy": 0.9744918283075095,
"epoch": 0.7380387666639404,
"grad_norm": 0.5279991626739502,
"learning_rate": 5.244700340225073e-06,
"loss": 0.9217,
"mean_token_accuracy": 0.7506507318466902,
"num_tokens": 21970100.0,
"step": 2820
},
{
"entropy": 1.0728078037500381,
"epoch": 0.7393473460374581,
"grad_norm": 0.5053285956382751,
"learning_rate": 5.218529180842712e-06,
"loss": 1.0283,
"mean_token_accuracy": 0.7311844080686569,
"num_tokens": 22004093.0,
"step": 2825
},
{
"entropy": 1.0765360169112683,
"epoch": 0.7406559254109757,
"grad_norm": 0.36004361510276794,
"learning_rate": 5.192358021460351e-06,
"loss": 1.0092,
"mean_token_accuracy": 0.7210248097777366,
"num_tokens": 22045336.0,
"step": 2830
},
{
"entropy": 1.017987198382616,
"epoch": 0.7419645047844934,
"grad_norm": 0.36846044659614563,
"learning_rate": 5.166186862077991e-06,
"loss": 0.9839,
"mean_token_accuracy": 0.7387872941792011,
"num_tokens": 22086534.0,
"step": 2835
},
{
"entropy": 1.0489673603326082,
"epoch": 0.743273084158011,
"grad_norm": 0.46179378032684326,
"learning_rate": 5.1400157026956295e-06,
"loss": 0.9415,
"mean_token_accuracy": 0.7353867895901203,
"num_tokens": 22121230.0,
"step": 2840
},
{
"entropy": 1.1243290316313506,
"epoch": 0.7445816635315285,
"grad_norm": 0.5417360663414001,
"learning_rate": 5.113844543313269e-06,
"loss": 1.0578,
"mean_token_accuracy": 0.7204208463430405,
"num_tokens": 22159032.0,
"step": 2845
},
{
"entropy": 1.0939917907118797,
"epoch": 0.7458902429050462,
"grad_norm": 0.48897913098335266,
"learning_rate": 5.087673383930909e-06,
"loss": 1.0063,
"mean_token_accuracy": 0.7165301315486431,
"num_tokens": 22196955.0,
"step": 2850
},
{
"entropy": 1.0858067644760012,
"epoch": 0.7471988222785638,
"grad_norm": 0.5564597845077515,
"learning_rate": 5.061502224548548e-06,
"loss": 1.0764,
"mean_token_accuracy": 0.7208572022616864,
"num_tokens": 22234201.0,
"step": 2855
},
{
"entropy": 1.0590477578341961,
"epoch": 0.7485074016520814,
"grad_norm": 0.3768922686576843,
"learning_rate": 5.035331065166187e-06,
"loss": 0.9812,
"mean_token_accuracy": 0.7291722763329744,
"num_tokens": 22272779.0,
"step": 2860
},
{
"entropy": 1.1251448437571525,
"epoch": 0.7498159810255991,
"grad_norm": 0.3560837507247925,
"learning_rate": 5.009159905783827e-06,
"loss": 1.0892,
"mean_token_accuracy": 0.7106486916542053,
"num_tokens": 22308851.0,
"step": 2865
},
{
"entropy": 1.119718360155821,
"epoch": 0.7511245603991167,
"grad_norm": 0.5380016565322876,
"learning_rate": 4.982988746401466e-06,
"loss": 1.0983,
"mean_token_accuracy": 0.7206918202340603,
"num_tokens": 22343884.0,
"step": 2870
},
{
"entropy": 1.0894725158810616,
"epoch": 0.7524331397726344,
"grad_norm": 0.3485453426837921,
"learning_rate": 4.956817587019106e-06,
"loss": 1.0491,
"mean_token_accuracy": 0.7222230531275272,
"num_tokens": 22384017.0,
"step": 2875
},
{
"entropy": 1.0093999858945608,
"epoch": 0.753741719146152,
"grad_norm": 0.4127729535102844,
"learning_rate": 4.930646427636745e-06,
"loss": 0.9502,
"mean_token_accuracy": 0.7418296955525875,
"num_tokens": 22418122.0,
"step": 2880
},
{
"entropy": 1.1371208924800158,
"epoch": 0.7550502985196695,
"grad_norm": 0.43648311495780945,
"learning_rate": 4.904475268254384e-06,
"loss": 1.1494,
"mean_token_accuracy": 0.7121193654835224,
"num_tokens": 22459009.0,
"step": 2885
},
{
"entropy": 1.0378689534962178,
"epoch": 0.7563588778931872,
"grad_norm": 0.44997143745422363,
"learning_rate": 4.878304108872023e-06,
"loss": 0.9618,
"mean_token_accuracy": 0.7308846026659012,
"num_tokens": 22493139.0,
"step": 2890
},
{
"entropy": 1.0220247332006693,
"epoch": 0.7576674572667048,
"grad_norm": 0.561329185962677,
"learning_rate": 4.852132949489663e-06,
"loss": 0.9283,
"mean_token_accuracy": 0.7339823555201292,
"num_tokens": 22529225.0,
"step": 2895
},
{
"entropy": 1.0556524885818361,
"epoch": 0.7589760366402225,
"grad_norm": 0.3141125738620758,
"learning_rate": 4.825961790107302e-06,
"loss": 1.0225,
"mean_token_accuracy": 0.732053418457508,
"num_tokens": 22565967.0,
"step": 2900
},
{
"epoch": 0.7589760366402225,
"eval_entropy": 1.0340708026885985,
"eval_loss": 1.0317820310592651,
"eval_mean_token_accuracy": 0.7304344477653504,
"eval_num_tokens": 22565967.0,
"eval_runtime": 97.2034,
"eval_samples_per_second": 10.288,
"eval_steps_per_second": 1.286,
"step": 2900
},
{
"entropy": 1.0628287993371486,
"epoch": 0.7602846160137401,
"grad_norm": 0.5528143048286438,
"learning_rate": 4.799790630724941e-06,
"loss": 1.0052,
"mean_token_accuracy": 0.7236215595155955,
"num_tokens": 22605849.0,
"step": 2905
},
{
"entropy": 1.0544053295627236,
"epoch": 0.7615931953872577,
"grad_norm": 0.5025759339332581,
"learning_rate": 4.773619471342581e-06,
"loss": 0.995,
"mean_token_accuracy": 0.7306290067732334,
"num_tokens": 22647439.0,
"step": 2910
},
{
"entropy": 1.0855362933129071,
"epoch": 0.7629017747607754,
"grad_norm": 0.5423142910003662,
"learning_rate": 4.74744831196022e-06,
"loss": 1.0644,
"mean_token_accuracy": 0.729085548222065,
"num_tokens": 22686489.0,
"step": 2915
},
{
"entropy": 1.1064934462308884,
"epoch": 0.764210354134293,
"grad_norm": 0.324994295835495,
"learning_rate": 4.7212771525778596e-06,
"loss": 1.0572,
"mean_token_accuracy": 0.7155943371355533,
"num_tokens": 22729097.0,
"step": 2920
},
{
"entropy": 1.0514110021293164,
"epoch": 0.7655189335078105,
"grad_norm": 0.48154741525650024,
"learning_rate": 4.695105993195499e-06,
"loss": 0.9801,
"mean_token_accuracy": 0.728981614857912,
"num_tokens": 22767464.0,
"step": 2925
},
{
"entropy": 1.0362781658768654,
"epoch": 0.7668275128813282,
"grad_norm": 0.5839325189590454,
"learning_rate": 4.668934833813139e-06,
"loss": 0.9951,
"mean_token_accuracy": 0.7349088959395885,
"num_tokens": 22806605.0,
"step": 2930
},
{
"entropy": 1.0683909840881824,
"epoch": 0.7681360922548458,
"grad_norm": 0.37387317419052124,
"learning_rate": 4.642763674430777e-06,
"loss": 1.0933,
"mean_token_accuracy": 0.7265824314206839,
"num_tokens": 22844597.0,
"step": 2935
},
{
"entropy": 1.1125381592661143,
"epoch": 0.7694446716283635,
"grad_norm": 0.6262072324752808,
"learning_rate": 4.616592515048417e-06,
"loss": 1.0776,
"mean_token_accuracy": 0.7169565990567207,
"num_tokens": 22883020.0,
"step": 2940
},
{
"entropy": 1.1716872084885837,
"epoch": 0.7707532510018811,
"grad_norm": 0.4011521637439728,
"learning_rate": 4.5904213556660565e-06,
"loss": 1.1654,
"mean_token_accuracy": 0.7077478947117924,
"num_tokens": 22922580.0,
"step": 2945
},
{
"entropy": 1.0219284979626537,
"epoch": 0.7720618303753987,
"grad_norm": 0.6057654619216919,
"learning_rate": 4.564250196283696e-06,
"loss": 1.0102,
"mean_token_accuracy": 0.7341908976435662,
"num_tokens": 22961570.0,
"step": 2950
},
{
"entropy": 1.045608853548765,
"epoch": 0.7733704097489164,
"grad_norm": 0.5418355464935303,
"learning_rate": 4.538079036901335e-06,
"loss": 0.9675,
"mean_token_accuracy": 0.731929623708129,
"num_tokens": 22998884.0,
"step": 2955
},
{
"entropy": 1.1297316145151854,
"epoch": 0.7746789891224339,
"grad_norm": 0.469908207654953,
"learning_rate": 4.511907877518974e-06,
"loss": 1.1027,
"mean_token_accuracy": 0.7112006828188896,
"num_tokens": 23036097.0,
"step": 2960
},
{
"entropy": 1.0829615991562604,
"epoch": 0.7759875684959516,
"grad_norm": 0.42076605558395386,
"learning_rate": 4.485736718136614e-06,
"loss": 0.9541,
"mean_token_accuracy": 0.7311103120446205,
"num_tokens": 23071198.0,
"step": 2965
},
{
"entropy": 1.0218299932777881,
"epoch": 0.7772961478694692,
"grad_norm": 0.6354956030845642,
"learning_rate": 4.459565558754253e-06,
"loss": 0.94,
"mean_token_accuracy": 0.7375240176916122,
"num_tokens": 23105690.0,
"step": 2970
},
{
"entropy": 1.0296128673478961,
"epoch": 0.7786047272429868,
"grad_norm": 0.5609083771705627,
"learning_rate": 4.433394399371893e-06,
"loss": 0.9641,
"mean_token_accuracy": 0.7355793751776218,
"num_tokens": 23149341.0,
"step": 2975
},
{
"entropy": 1.0164244651794434,
"epoch": 0.7799133066165045,
"grad_norm": 0.4903509318828583,
"learning_rate": 4.407223239989532e-06,
"loss": 0.9678,
"mean_token_accuracy": 0.7360415138304234,
"num_tokens": 23184633.0,
"step": 2980
},
{
"entropy": 1.0966936625540256,
"epoch": 0.7812218859900221,
"grad_norm": 0.39143356680870056,
"learning_rate": 4.381052080607171e-06,
"loss": 1.0806,
"mean_token_accuracy": 0.7219570934772491,
"num_tokens": 23221750.0,
"step": 2985
},
{
"entropy": 1.042829731106758,
"epoch": 0.7825304653635398,
"grad_norm": 0.4407028257846832,
"learning_rate": 4.35488092122481e-06,
"loss": 1.0816,
"mean_token_accuracy": 0.7296155303716659,
"num_tokens": 23263629.0,
"step": 2990
},
{
"entropy": 1.067452398315072,
"epoch": 0.7838390447370573,
"grad_norm": 0.4685254991054535,
"learning_rate": 4.32870976184245e-06,
"loss": 1.0394,
"mean_token_accuracy": 0.7196490127593279,
"num_tokens": 23299548.0,
"step": 2995
},
{
"entropy": 1.0206131491810084,
"epoch": 0.7851476241105749,
"grad_norm": 0.4271470010280609,
"learning_rate": 4.30253860246009e-06,
"loss": 0.9486,
"mean_token_accuracy": 0.7346534051001072,
"num_tokens": 23340920.0,
"step": 3000
},
{
"epoch": 0.7851476241105749,
"eval_entropy": 1.0305686144828796,
"eval_loss": 1.03084135055542,
"eval_mean_token_accuracy": 0.7306465420722962,
"eval_num_tokens": 23340920.0,
"eval_runtime": 97.1158,
"eval_samples_per_second": 10.297,
"eval_steps_per_second": 1.287,
"step": 3000
},
{
"entropy": 1.0277832087129355,
"epoch": 0.7864562034840926,
"grad_norm": 0.44500380754470825,
"learning_rate": 4.276367443077729e-06,
"loss": 0.987,
"mean_token_accuracy": 0.7308167792856693,
"num_tokens": 23381557.0,
"step": 3005
},
{
"entropy": 1.086503054201603,
"epoch": 0.7877647828576102,
"grad_norm": 0.3013031780719757,
"learning_rate": 4.250196283695368e-06,
"loss": 1.1165,
"mean_token_accuracy": 0.721335730701685,
"num_tokens": 23422282.0,
"step": 3010
},
{
"entropy": 1.1198953952640296,
"epoch": 0.7890733622311278,
"grad_norm": 0.5127637982368469,
"learning_rate": 4.224025124313007e-06,
"loss": 1.0667,
"mean_token_accuracy": 0.716464214771986,
"num_tokens": 23462818.0,
"step": 3015
},
{
"entropy": 1.0935983188450336,
"epoch": 0.7903819416046455,
"grad_norm": 0.7555818557739258,
"learning_rate": 4.1978539649306465e-06,
"loss": 1.0789,
"mean_token_accuracy": 0.7195366825908422,
"num_tokens": 23500331.0,
"step": 3020
},
{
"entropy": 1.092792271077633,
"epoch": 0.7916905209781631,
"grad_norm": 0.37865763902664185,
"learning_rate": 4.171682805548286e-06,
"loss": 1.0283,
"mean_token_accuracy": 0.7241503912955523,
"num_tokens": 23537976.0,
"step": 3025
},
{
"entropy": 1.0759663056582212,
"epoch": 0.7929991003516808,
"grad_norm": 0.5228607058525085,
"learning_rate": 4.145511646165926e-06,
"loss": 1.0736,
"mean_token_accuracy": 0.7280153460800648,
"num_tokens": 23581747.0,
"step": 3030
},
{
"entropy": 1.1236386895179749,
"epoch": 0.7943076797251983,
"grad_norm": 0.6752633452415466,
"learning_rate": 4.119340486783565e-06,
"loss": 1.0768,
"mean_token_accuracy": 0.7147096544504166,
"num_tokens": 23615855.0,
"step": 3035
},
{
"entropy": 1.1543139606714248,
"epoch": 0.7956162590987159,
"grad_norm": 0.402425616979599,
"learning_rate": 4.093169327401204e-06,
"loss": 1.1301,
"mean_token_accuracy": 0.7124893587082625,
"num_tokens": 23653681.0,
"step": 3040
},
{
"entropy": 1.1155839897692204,
"epoch": 0.7969248384722336,
"grad_norm": 0.3531131148338318,
"learning_rate": 4.066998168018843e-06,
"loss": 1.1011,
"mean_token_accuracy": 0.7147599868476391,
"num_tokens": 23691707.0,
"step": 3045
},
{
"entropy": 1.0141445618122815,
"epoch": 0.7982334178457512,
"grad_norm": 0.8314666152000427,
"learning_rate": 4.0408270086364835e-06,
"loss": 0.9637,
"mean_token_accuracy": 0.7401646625250577,
"num_tokens": 23731289.0,
"step": 3050
},
{
"entropy": 1.1060345830395817,
"epoch": 0.7995419972192689,
"grad_norm": 0.39150726795196533,
"learning_rate": 4.014655849254122e-06,
"loss": 1.1086,
"mean_token_accuracy": 0.7131407152861357,
"num_tokens": 23769110.0,
"step": 3055
},
{
"entropy": 1.1247680716216564,
"epoch": 0.8008505765927865,
"grad_norm": 0.393403023481369,
"learning_rate": 3.988484689871762e-06,
"loss": 1.0878,
"mean_token_accuracy": 0.7203011158853769,
"num_tokens": 23805247.0,
"step": 3060
},
{
"entropy": 1.0936600599437951,
"epoch": 0.802159155966304,
"grad_norm": 0.40403568744659424,
"learning_rate": 3.962313530489401e-06,
"loss": 1.0812,
"mean_token_accuracy": 0.7204918511211872,
"num_tokens": 23846787.0,
"step": 3065
},
{
"entropy": 1.1725836582481861,
"epoch": 0.8034677353398217,
"grad_norm": 0.4385111629962921,
"learning_rate": 3.93614237110704e-06,
"loss": 1.1878,
"mean_token_accuracy": 0.7098725575953722,
"num_tokens": 23884749.0,
"step": 3070
},
{
"entropy": 1.056993416696787,
"epoch": 0.8047763147133393,
"grad_norm": 0.4194464087486267,
"learning_rate": 3.90997121172468e-06,
"loss": 1.0316,
"mean_token_accuracy": 0.7291632607579231,
"num_tokens": 23925854.0,
"step": 3075
},
{
"entropy": 1.122009427472949,
"epoch": 0.8060848940868569,
"grad_norm": 0.34683725237846375,
"learning_rate": 3.883800052342319e-06,
"loss": 1.0402,
"mean_token_accuracy": 0.7219870649278164,
"num_tokens": 23965584.0,
"step": 3080
},
{
"entropy": 0.982495572976768,
"epoch": 0.8073934734603746,
"grad_norm": 0.7075260281562805,
"learning_rate": 3.857628892959959e-06,
"loss": 0.9437,
"mean_token_accuracy": 0.750491987913847,
"num_tokens": 24004600.0,
"step": 3085
},
{
"entropy": 1.0386792987585067,
"epoch": 0.8087020528338922,
"grad_norm": 0.3854619562625885,
"learning_rate": 3.831457733577597e-06,
"loss": 0.9687,
"mean_token_accuracy": 0.7319545924663544,
"num_tokens": 24039464.0,
"step": 3090
},
{
"entropy": 1.0844634655863046,
"epoch": 0.8100106322074099,
"grad_norm": 0.47892338037490845,
"learning_rate": 3.8052865741952373e-06,
"loss": 1.0465,
"mean_token_accuracy": 0.7266070485115051,
"num_tokens": 24076409.0,
"step": 3095
},
{
"entropy": 1.0820760104805232,
"epoch": 0.8113192115809275,
"grad_norm": 0.43651270866394043,
"learning_rate": 3.7791154148128765e-06,
"loss": 1.0007,
"mean_token_accuracy": 0.7236479226499796,
"num_tokens": 24118481.0,
"step": 3100
},
{
"epoch": 0.8113192115809275,
"eval_entropy": 1.036030520915985,
"eval_loss": 1.028921365737915,
"eval_mean_token_accuracy": 0.7310774421691895,
"eval_num_tokens": 24118481.0,
"eval_runtime": 97.2385,
"eval_samples_per_second": 10.284,
"eval_steps_per_second": 1.285,
"step": 3100
},
{
"entropy": 1.1077489994466305,
"epoch": 0.812627790954445,
"grad_norm": 0.42022934556007385,
"learning_rate": 3.7529442554305157e-06,
"loss": 1.0661,
"mean_token_accuracy": 0.7255024470388889,
"num_tokens": 24158004.0,
"step": 3105
},
{
"entropy": 1.0772877372801304,
"epoch": 0.8139363703279627,
"grad_norm": 0.3519681990146637,
"learning_rate": 3.7267730960481554e-06,
"loss": 1.0428,
"mean_token_accuracy": 0.7230064436793328,
"num_tokens": 24194320.0,
"step": 3110
},
{
"entropy": 1.0549692034721374,
"epoch": 0.8152449497014803,
"grad_norm": 0.5634621381759644,
"learning_rate": 3.7006019366657946e-06,
"loss": 1.0893,
"mean_token_accuracy": 0.7291611786931753,
"num_tokens": 24233260.0,
"step": 3115
},
{
"entropy": 0.9838933110237121,
"epoch": 0.816553529074998,
"grad_norm": 0.3864845037460327,
"learning_rate": 3.6744307772834342e-06,
"loss": 0.9414,
"mean_token_accuracy": 0.7454824268817901,
"num_tokens": 24270814.0,
"step": 3120
},
{
"entropy": 1.0704244837164878,
"epoch": 0.8178621084485156,
"grad_norm": 0.5081256628036499,
"learning_rate": 3.648259617901073e-06,
"loss": 1.0257,
"mean_token_accuracy": 0.723455361276865,
"num_tokens": 24309235.0,
"step": 3125
},
{
"entropy": 1.022134768590331,
"epoch": 0.8191706878220332,
"grad_norm": 0.42314413189888,
"learning_rate": 3.6220884585187127e-06,
"loss": 0.9948,
"mean_token_accuracy": 0.7342596560716629,
"num_tokens": 24348724.0,
"step": 3130
},
{
"entropy": 1.0905879106372596,
"epoch": 0.8204792671955509,
"grad_norm": 0.30757448077201843,
"learning_rate": 3.5959172991363523e-06,
"loss": 1.0555,
"mean_token_accuracy": 0.724842281267047,
"num_tokens": 24388628.0,
"step": 3135
},
{
"entropy": 1.0407748512923718,
"epoch": 0.8217878465690684,
"grad_norm": 0.7021998167037964,
"learning_rate": 3.569746139753991e-06,
"loss": 0.9854,
"mean_token_accuracy": 0.7305317636579275,
"num_tokens": 24427341.0,
"step": 3140
},
{
"entropy": 1.1128646079450846,
"epoch": 0.823096425942586,
"grad_norm": 0.3260432779788971,
"learning_rate": 3.5435749803716308e-06,
"loss": 1.0951,
"mean_token_accuracy": 0.712972067296505,
"num_tokens": 24462200.0,
"step": 3145
},
{
"entropy": 1.1428508426994086,
"epoch": 0.8244050053161037,
"grad_norm": 0.37607815861701965,
"learning_rate": 3.5174038209892704e-06,
"loss": 1.1074,
"mean_token_accuracy": 0.7139423068612814,
"num_tokens": 24503955.0,
"step": 3150
},
{
"entropy": 1.0314291812479497,
"epoch": 0.8257135846896213,
"grad_norm": 0.5070587992668152,
"learning_rate": 3.491232661606909e-06,
"loss": 1.0294,
"mean_token_accuracy": 0.7355128142982721,
"num_tokens": 24543206.0,
"step": 3155
},
{
"entropy": 1.1036609571427107,
"epoch": 0.827022164063139,
"grad_norm": 0.41178247332572937,
"learning_rate": 3.465061502224549e-06,
"loss": 1.0937,
"mean_token_accuracy": 0.7211760215461254,
"num_tokens": 24579806.0,
"step": 3160
},
{
"entropy": 1.1116086110472678,
"epoch": 0.8283307434366566,
"grad_norm": 0.5788869857788086,
"learning_rate": 3.438890342842188e-06,
"loss": 1.0604,
"mean_token_accuracy": 0.721147482842207,
"num_tokens": 24616984.0,
"step": 3165
},
{
"entropy": 1.1319230053573848,
"epoch": 0.8296393228101742,
"grad_norm": 0.6010908484458923,
"learning_rate": 3.4127191834598277e-06,
"loss": 1.0807,
"mean_token_accuracy": 0.7168502196669578,
"num_tokens": 24654055.0,
"step": 3170
},
{
"entropy": 1.1207389827817678,
"epoch": 0.8309479021836919,
"grad_norm": 0.5787162780761719,
"learning_rate": 3.386548024077467e-06,
"loss": 1.0741,
"mean_token_accuracy": 0.7188828401267529,
"num_tokens": 24696566.0,
"step": 3175
},
{
"entropy": 1.0550225611776114,
"epoch": 0.8322564815572094,
"grad_norm": 0.4683392643928528,
"learning_rate": 3.360376864695106e-06,
"loss": 1.0378,
"mean_token_accuracy": 0.7269921116530895,
"num_tokens": 24735302.0,
"step": 3180
},
{
"entropy": 1.064477189257741,
"epoch": 0.8335650609307271,
"grad_norm": 0.43787601590156555,
"learning_rate": 3.3342057053127458e-06,
"loss": 1.029,
"mean_token_accuracy": 0.7256352994590998,
"num_tokens": 24777394.0,
"step": 3185
},
{
"entropy": 0.9952436108142138,
"epoch": 0.8348736403042447,
"grad_norm": 0.619921863079071,
"learning_rate": 3.3080345459303846e-06,
"loss": 0.9214,
"mean_token_accuracy": 0.7489484068006277,
"num_tokens": 24820138.0,
"step": 3190
},
{
"entropy": 1.0670820290222764,
"epoch": 0.8361822196777623,
"grad_norm": 0.4853121340274811,
"learning_rate": 3.281863386548024e-06,
"loss": 1.0733,
"mean_token_accuracy": 0.7309010468423367,
"num_tokens": 24857394.0,
"step": 3195
},
{
"entropy": 1.0793268527835607,
"epoch": 0.83749079905128,
"grad_norm": 0.3238151967525482,
"learning_rate": 3.255692227165664e-06,
"loss": 1.0963,
"mean_token_accuracy": 0.7233440794050694,
"num_tokens": 24894819.0,
"step": 3200
},
{
"epoch": 0.83749079905128,
"eval_entropy": 1.0310797443389892,
"eval_loss": 1.0281308889389038,
"eval_mean_token_accuracy": 0.7311860795021057,
"eval_num_tokens": 24894819.0,
"eval_runtime": 97.2362,
"eval_samples_per_second": 10.284,
"eval_steps_per_second": 1.286,
"step": 3200
},
{
"entropy": 1.0457366950809956,
"epoch": 0.8387993784247976,
"grad_norm": 0.44579482078552246,
"learning_rate": 3.2295210677833035e-06,
"loss": 0.995,
"mean_token_accuracy": 0.7295556962490082,
"num_tokens": 24934467.0,
"step": 3205
},
{
"entropy": 1.0695260234177113,
"epoch": 0.8401079577983152,
"grad_norm": 0.4931753873825073,
"learning_rate": 3.2033499084009423e-06,
"loss": 1.0579,
"mean_token_accuracy": 0.7268257789313793,
"num_tokens": 24973181.0,
"step": 3210
},
{
"entropy": 1.0737264335155488,
"epoch": 0.8414165371718328,
"grad_norm": 0.542246401309967,
"learning_rate": 3.177178749018582e-06,
"loss": 1.044,
"mean_token_accuracy": 0.7273680947721004,
"num_tokens": 25008234.0,
"step": 3215
},
{
"entropy": 1.0925125159323215,
"epoch": 0.8427251165453504,
"grad_norm": 0.7821519374847412,
"learning_rate": 3.151007589636221e-06,
"loss": 1.0926,
"mean_token_accuracy": 0.7236344940960407,
"num_tokens": 25042804.0,
"step": 3220
},
{
"entropy": 1.0435322053730487,
"epoch": 0.8440336959188681,
"grad_norm": 0.5571607351303101,
"learning_rate": 3.1248364302538604e-06,
"loss": 1.0314,
"mean_token_accuracy": 0.7397142663598061,
"num_tokens": 25081713.0,
"step": 3225
},
{
"entropy": 1.0707201555371284,
"epoch": 0.8453422752923857,
"grad_norm": 0.46003878116607666,
"learning_rate": 3.0986652708715e-06,
"loss": 1.0396,
"mean_token_accuracy": 0.7285445638000965,
"num_tokens": 25115761.0,
"step": 3230
},
{
"entropy": 1.0421096246689558,
"epoch": 0.8466508546659033,
"grad_norm": 0.5213029980659485,
"learning_rate": 3.0724941114891392e-06,
"loss": 0.9743,
"mean_token_accuracy": 0.729071582481265,
"num_tokens": 25158298.0,
"step": 3235
},
{
"entropy": 0.9816816847771406,
"epoch": 0.847959434039421,
"grad_norm": 0.2949255704879761,
"learning_rate": 3.0463229521067784e-06,
"loss": 0.8791,
"mean_token_accuracy": 0.7419180549681187,
"num_tokens": 25195031.0,
"step": 3240
},
{
"entropy": 1.1069608356803655,
"epoch": 0.8492680134129386,
"grad_norm": 0.4634745121002197,
"learning_rate": 3.0201517927244177e-06,
"loss": 1.0572,
"mean_token_accuracy": 0.7191666610538959,
"num_tokens": 25233843.0,
"step": 3245
},
{
"entropy": 1.0450120318681002,
"epoch": 0.8505765927864563,
"grad_norm": 0.41588935256004333,
"learning_rate": 2.9939806333420573e-06,
"loss": 1.004,
"mean_token_accuracy": 0.7298405677080154,
"num_tokens": 25277199.0,
"step": 3250
},
{
"entropy": 1.0192270964384078,
"epoch": 0.8518851721599738,
"grad_norm": 0.3328169584274292,
"learning_rate": 2.967809473959697e-06,
"loss": 1.0314,
"mean_token_accuracy": 0.7342530839145184,
"num_tokens": 25313963.0,
"step": 3255
},
{
"entropy": 1.068826200440526,
"epoch": 0.8531937515334914,
"grad_norm": 0.4146538972854614,
"learning_rate": 2.9416383145773357e-06,
"loss": 1.0426,
"mean_token_accuracy": 0.7297728694975376,
"num_tokens": 25354460.0,
"step": 3260
},
{
"entropy": 1.0722479410469532,
"epoch": 0.8545023309070091,
"grad_norm": 0.41407662630081177,
"learning_rate": 2.9154671551949754e-06,
"loss": 1.038,
"mean_token_accuracy": 0.7312197633087635,
"num_tokens": 25391042.0,
"step": 3265
},
{
"entropy": 1.0916271772235633,
"epoch": 0.8558109102805267,
"grad_norm": 0.36934009194374084,
"learning_rate": 2.889295995812615e-06,
"loss": 1.1389,
"mean_token_accuracy": 0.7267607122659683,
"num_tokens": 25435161.0,
"step": 3270
},
{
"entropy": 1.051083530113101,
"epoch": 0.8571194896540443,
"grad_norm": 0.35780689120292664,
"learning_rate": 2.863124836430254e-06,
"loss": 1.0192,
"mean_token_accuracy": 0.7258496068418026,
"num_tokens": 25479291.0,
"step": 3275
},
{
"entropy": 1.0690735118463635,
"epoch": 0.858428069027562,
"grad_norm": 0.3563995659351349,
"learning_rate": 2.8369536770478935e-06,
"loss": 1.0232,
"mean_token_accuracy": 0.7209200043231249,
"num_tokens": 25518588.0,
"step": 3280
},
{
"entropy": 1.0599579058587552,
"epoch": 0.8597366484010796,
"grad_norm": 0.50145423412323,
"learning_rate": 2.8107825176655327e-06,
"loss": 1.0418,
"mean_token_accuracy": 0.7253329046070576,
"num_tokens": 25555559.0,
"step": 3285
},
{
"entropy": 1.0637880939990283,
"epoch": 0.8610452277745972,
"grad_norm": 0.32374799251556396,
"learning_rate": 2.7846113582831723e-06,
"loss": 1.0589,
"mean_token_accuracy": 0.729487194865942,
"num_tokens": 25599141.0,
"step": 3290
},
{
"entropy": 1.121987307816744,
"epoch": 0.8623538071481148,
"grad_norm": 0.7996993064880371,
"learning_rate": 2.7584401989008115e-06,
"loss": 1.1568,
"mean_token_accuracy": 0.71376275382936,
"num_tokens": 25635193.0,
"step": 3295
},
{
"entropy": 1.075880871899426,
"epoch": 0.8636623865216324,
"grad_norm": 0.3166949450969696,
"learning_rate": 2.7322690395184508e-06,
"loss": 1.0328,
"mean_token_accuracy": 0.7281475432217122,
"num_tokens": 25671967.0,
"step": 3300
},
{
"epoch": 0.8636623865216324,
"eval_entropy": 1.0315785236358643,
"eval_loss": 1.0274593830108643,
"eval_mean_token_accuracy": 0.7313339967727661,
"eval_num_tokens": 25671967.0,
"eval_runtime": 97.2667,
"eval_samples_per_second": 10.281,
"eval_steps_per_second": 1.285,
"step": 3300
},
{
"entropy": 1.0423275344073772,
"epoch": 0.8649709658951501,
"grad_norm": 0.5029602646827698,
"learning_rate": 2.7060978801360904e-06,
"loss": 1.0144,
"mean_token_accuracy": 0.7371830597519875,
"num_tokens": 25714593.0,
"step": 3305
},
{
"entropy": 1.019960217177868,
"epoch": 0.8662795452686677,
"grad_norm": 0.3714219629764557,
"learning_rate": 2.679926720753729e-06,
"loss": 0.9591,
"mean_token_accuracy": 0.7337398871779441,
"num_tokens": 25753622.0,
"step": 3310
},
{
"entropy": 1.0438139468431473,
"epoch": 0.8675881246421854,
"grad_norm": 0.49829211831092834,
"learning_rate": 2.653755561371369e-06,
"loss": 1.0152,
"mean_token_accuracy": 0.7405216414481401,
"num_tokens": 25789764.0,
"step": 3315
},
{
"entropy": 1.0796325359493495,
"epoch": 0.868896704015703,
"grad_norm": 0.9016978144645691,
"learning_rate": 2.6275844019890085e-06,
"loss": 1.0369,
"mean_token_accuracy": 0.7271438419818879,
"num_tokens": 25827139.0,
"step": 3320
},
{
"entropy": 1.1373902840539813,
"epoch": 0.8702052833892205,
"grad_norm": 0.5277310609817505,
"learning_rate": 2.601413242606648e-06,
"loss": 1.1125,
"mean_token_accuracy": 0.7127894099801779,
"num_tokens": 25866714.0,
"step": 3325
},
{
"entropy": 1.0553706608712674,
"epoch": 0.8715138627627382,
"grad_norm": 0.39588603377342224,
"learning_rate": 2.575242083224287e-06,
"loss": 1.0219,
"mean_token_accuracy": 0.7270961362868548,
"num_tokens": 25909686.0,
"step": 3330
},
{
"entropy": 1.065532573312521,
"epoch": 0.8728224421362558,
"grad_norm": 0.5084222555160522,
"learning_rate": 2.5490709238419266e-06,
"loss": 1.0483,
"mean_token_accuracy": 0.7294468872249127,
"num_tokens": 25950054.0,
"step": 3335
},
{
"entropy": 1.0791085483506322,
"epoch": 0.8741310215097734,
"grad_norm": 0.5307091474533081,
"learning_rate": 2.5228997644595658e-06,
"loss": 1.0317,
"mean_token_accuracy": 0.7308267526328563,
"num_tokens": 25988785.0,
"step": 3340
},
{
"entropy": 1.0206924524158238,
"epoch": 0.8754396008832911,
"grad_norm": 0.44742336869239807,
"learning_rate": 2.4967286050772054e-06,
"loss": 0.9997,
"mean_token_accuracy": 0.7365788772702218,
"num_tokens": 26031548.0,
"step": 3345
},
{
"entropy": 1.072092866897583,
"epoch": 0.8767481802568087,
"grad_norm": 0.4204552471637726,
"learning_rate": 2.4705574456948446e-06,
"loss": 1.0516,
"mean_token_accuracy": 0.7274888075888157,
"num_tokens": 26073312.0,
"step": 3350
},
{
"entropy": 1.0556320391595364,
"epoch": 0.8780567596303264,
"grad_norm": 0.4932871162891388,
"learning_rate": 2.444386286312484e-06,
"loss": 1.0326,
"mean_token_accuracy": 0.7380503460764885,
"num_tokens": 26114431.0,
"step": 3355
},
{
"entropy": 1.091795524954796,
"epoch": 0.879365339003844,
"grad_norm": 0.561880350112915,
"learning_rate": 2.418215126930123e-06,
"loss": 1.0774,
"mean_token_accuracy": 0.7251040559262037,
"num_tokens": 26153886.0,
"step": 3360
},
{
"entropy": 1.07560763284564,
"epoch": 0.8806739183773615,
"grad_norm": 0.48657548427581787,
"learning_rate": 2.3920439675477623e-06,
"loss": 1.0658,
"mean_token_accuracy": 0.7262101206928492,
"num_tokens": 26186389.0,
"step": 3365
},
{
"entropy": 1.0190218634903432,
"epoch": 0.8819824977508792,
"grad_norm": 0.7028672099113464,
"learning_rate": 2.365872808165402e-06,
"loss": 0.9896,
"mean_token_accuracy": 0.7349074624478817,
"num_tokens": 26216742.0,
"step": 3370
},
{
"entropy": 1.0444237791001796,
"epoch": 0.8832910771243968,
"grad_norm": 0.3169384300708771,
"learning_rate": 2.339701648783041e-06,
"loss": 0.9722,
"mean_token_accuracy": 0.7366019688546658,
"num_tokens": 26252709.0,
"step": 3375
},
{
"entropy": 1.0996058486402034,
"epoch": 0.8845996564979145,
"grad_norm": 0.45103803277015686,
"learning_rate": 2.313530489400681e-06,
"loss": 1.0965,
"mean_token_accuracy": 0.7159675225615502,
"num_tokens": 26290405.0,
"step": 3380
},
{
"entropy": 1.120962193235755,
"epoch": 0.8859082358714321,
"grad_norm": 0.4903068244457245,
"learning_rate": 2.28735933001832e-06,
"loss": 1.133,
"mean_token_accuracy": 0.7219177789986133,
"num_tokens": 26328036.0,
"step": 3385
},
{
"entropy": 1.015447051078081,
"epoch": 0.8872168152449497,
"grad_norm": 0.5176340341567993,
"learning_rate": 2.2611881706359592e-06,
"loss": 1.0307,
"mean_token_accuracy": 0.7406690739095211,
"num_tokens": 26370299.0,
"step": 3390
},
{
"entropy": 1.0935089591890574,
"epoch": 0.8885253946184674,
"grad_norm": 0.29109635949134827,
"learning_rate": 2.235017011253599e-06,
"loss": 1.0479,
"mean_token_accuracy": 0.7220962606370449,
"num_tokens": 26410839.0,
"step": 3395
},
{
"entropy": 1.073088315874338,
"epoch": 0.8898339739919849,
"grad_norm": 0.41668152809143066,
"learning_rate": 2.208845851871238e-06,
"loss": 1.0365,
"mean_token_accuracy": 0.7313703790307045,
"num_tokens": 26451902.0,
"step": 3400
},
{
"epoch": 0.8898339739919849,
"eval_entropy": 1.0298821225166321,
"eval_loss": 1.0266761779785156,
"eval_mean_token_accuracy": 0.7314435024261474,
"eval_num_tokens": 26451902.0,
"eval_runtime": 97.2178,
"eval_samples_per_second": 10.286,
"eval_steps_per_second": 1.286,
"step": 3400
},
{
"entropy": 1.1048546012490987,
"epoch": 0.8911425533655025,
"grad_norm": 0.33564722537994385,
"learning_rate": 2.1826746924888777e-06,
"loss": 1.1361,
"mean_token_accuracy": 0.7248132299631834,
"num_tokens": 26494266.0,
"step": 3405
},
{
"entropy": 1.0767640516161918,
"epoch": 0.8924511327390202,
"grad_norm": 0.34754645824432373,
"learning_rate": 2.156503533106517e-06,
"loss": 1.0538,
"mean_token_accuracy": 0.7239204317331314,
"num_tokens": 26538961.0,
"step": 3410
},
{
"entropy": 1.1368263632059097,
"epoch": 0.8937597121125378,
"grad_norm": 0.42159193754196167,
"learning_rate": 2.130332373724156e-06,
"loss": 1.0626,
"mean_token_accuracy": 0.7145794078707695,
"num_tokens": 26576209.0,
"step": 3415
},
{
"entropy": 1.1049470301717519,
"epoch": 0.8950682914860555,
"grad_norm": 0.41963493824005127,
"learning_rate": 2.1041612143417954e-06,
"loss": 1.078,
"mean_token_accuracy": 0.7214818011969328,
"num_tokens": 26615242.0,
"step": 3420
},
{
"entropy": 1.0639567028731107,
"epoch": 0.8963768708595731,
"grad_norm": 0.27538204193115234,
"learning_rate": 2.0779900549594346e-06,
"loss": 1.0215,
"mean_token_accuracy": 0.7226650364696979,
"num_tokens": 26655742.0,
"step": 3425
},
{
"entropy": 1.041541761904955,
"epoch": 0.8976854502330907,
"grad_norm": 0.3388216197490692,
"learning_rate": 2.0518188955770743e-06,
"loss": 1.0363,
"mean_token_accuracy": 0.7356565605849028,
"num_tokens": 26693766.0,
"step": 3430
},
{
"entropy": 1.0412372374907135,
"epoch": 0.8989940296066083,
"grad_norm": 0.3273700773715973,
"learning_rate": 2.0256477361947135e-06,
"loss": 0.9961,
"mean_token_accuracy": 0.7328124668449163,
"num_tokens": 26731172.0,
"step": 3435
},
{
"entropy": 1.0402305848896503,
"epoch": 0.9003026089801259,
"grad_norm": 0.5662808418273926,
"learning_rate": 1.999476576812353e-06,
"loss": 1.0287,
"mean_token_accuracy": 0.7347014851868152,
"num_tokens": 26770030.0,
"step": 3440
},
{
"entropy": 1.0628270111978053,
"epoch": 0.9016111883536436,
"grad_norm": 0.39010319113731384,
"learning_rate": 1.9733054174299923e-06,
"loss": 0.9835,
"mean_token_accuracy": 0.7256053328514099,
"num_tokens": 26811239.0,
"step": 3445
},
{
"entropy": 1.0397684559226037,
"epoch": 0.9029197677271612,
"grad_norm": 0.5970498323440552,
"learning_rate": 1.9471342580476316e-06,
"loss": 1.0163,
"mean_token_accuracy": 0.7326931424438954,
"num_tokens": 26850997.0,
"step": 3450
},
{
"entropy": 1.0377406038343906,
"epoch": 0.9042283471006788,
"grad_norm": 0.45579779148101807,
"learning_rate": 1.920963098665271e-06,
"loss": 1.0484,
"mean_token_accuracy": 0.7341817907989026,
"num_tokens": 26889505.0,
"step": 3455
},
{
"entropy": 1.020282133296132,
"epoch": 0.9055369264741965,
"grad_norm": 0.40827953815460205,
"learning_rate": 1.8947919392829104e-06,
"loss": 1.0153,
"mean_token_accuracy": 0.737718590721488,
"num_tokens": 26927519.0,
"step": 3460
},
{
"entropy": 1.053854411840439,
"epoch": 0.9068455058477141,
"grad_norm": 0.45387595891952515,
"learning_rate": 1.8686207799005498e-06,
"loss": 0.9982,
"mean_token_accuracy": 0.7261220943182707,
"num_tokens": 26966277.0,
"step": 3465
},
{
"entropy": 1.017658719792962,
"epoch": 0.9081540852212318,
"grad_norm": 0.36533382534980774,
"learning_rate": 1.842449620518189e-06,
"loss": 0.9675,
"mean_token_accuracy": 0.7362812142819166,
"num_tokens": 27002691.0,
"step": 3470
},
{
"entropy": 1.053343753516674,
"epoch": 0.9094626645947493,
"grad_norm": 0.3158053755760193,
"learning_rate": 1.8162784611358283e-06,
"loss": 1.0161,
"mean_token_accuracy": 0.7329125508666039,
"num_tokens": 27040733.0,
"step": 3475
},
{
"entropy": 0.9715707473456859,
"epoch": 0.9107712439682669,
"grad_norm": 0.30097201466560364,
"learning_rate": 1.790107301753468e-06,
"loss": 0.9108,
"mean_token_accuracy": 0.741458211094141,
"num_tokens": 27083118.0,
"step": 3480
},
{
"entropy": 1.1143366053700448,
"epoch": 0.9120798233417846,
"grad_norm": 0.5649247765541077,
"learning_rate": 1.7639361423711071e-06,
"loss": 1.0844,
"mean_token_accuracy": 0.7136627584695816,
"num_tokens": 27118214.0,
"step": 3485
},
{
"entropy": 1.053029465302825,
"epoch": 0.9133884027153022,
"grad_norm": 0.41902416944503784,
"learning_rate": 1.7377649829887466e-06,
"loss": 0.9994,
"mean_token_accuracy": 0.7316753648221492,
"num_tokens": 27154246.0,
"step": 3490
},
{
"entropy": 1.0596496284008026,
"epoch": 0.9146969820888198,
"grad_norm": 0.4515739977359772,
"learning_rate": 1.7115938236063858e-06,
"loss": 1.0266,
"mean_token_accuracy": 0.7284657262265682,
"num_tokens": 27190566.0,
"step": 3495
},
{
"entropy": 1.1401704950258136,
"epoch": 0.9160055614623375,
"grad_norm": 0.5285766124725342,
"learning_rate": 1.6854226642240254e-06,
"loss": 1.1079,
"mean_token_accuracy": 0.7140452340245247,
"num_tokens": 27231234.0,
"step": 3500
},
{
"epoch": 0.9160055614623375,
"eval_entropy": 1.0293893675804138,
"eval_loss": 1.0261290073394775,
"eval_mean_token_accuracy": 0.7316831665039063,
"eval_num_tokens": 27231234.0,
"eval_runtime": 97.2439,
"eval_samples_per_second": 10.283,
"eval_steps_per_second": 1.285,
"step": 3500
},
{
"entropy": 1.060255604609847,
"epoch": 0.917314140835855,
"grad_norm": 0.5222223401069641,
"learning_rate": 1.6592515048416647e-06,
"loss": 1.0422,
"mean_token_accuracy": 0.7291194565594197,
"num_tokens": 27274088.0,
"step": 3505
},
{
"entropy": 1.0341279707849025,
"epoch": 0.9186227202093727,
"grad_norm": 0.46550068259239197,
"learning_rate": 1.6330803454593039e-06,
"loss": 1.0274,
"mean_token_accuracy": 0.736503117159009,
"num_tokens": 27314182.0,
"step": 3510
},
{
"entropy": 1.0762779969722032,
"epoch": 0.9199312995828903,
"grad_norm": 0.4758652150630951,
"learning_rate": 1.6069091860769433e-06,
"loss": 1.0411,
"mean_token_accuracy": 0.7197697997093201,
"num_tokens": 27353165.0,
"step": 3515
},
{
"entropy": 1.0701686941087245,
"epoch": 0.9212398789564079,
"grad_norm": 0.49974560737609863,
"learning_rate": 1.5807380266945827e-06,
"loss": 1.0625,
"mean_token_accuracy": 0.7291940327733755,
"num_tokens": 27387526.0,
"step": 3520
},
{
"entropy": 1.083727527782321,
"epoch": 0.9225484583299256,
"grad_norm": 0.5067974328994751,
"learning_rate": 1.5545668673122222e-06,
"loss": 1.0838,
"mean_token_accuracy": 0.719190240651369,
"num_tokens": 27424821.0,
"step": 3525
},
{
"entropy": 1.0643510822206736,
"epoch": 0.9238570377034432,
"grad_norm": 0.6037015914916992,
"learning_rate": 1.5283957079298614e-06,
"loss": 1.08,
"mean_token_accuracy": 0.7282540425658226,
"num_tokens": 27462609.0,
"step": 3530
},
{
"entropy": 1.0500620037317276,
"epoch": 0.9251656170769609,
"grad_norm": 0.6834056377410889,
"learning_rate": 1.5022245485475006e-06,
"loss": 1.0026,
"mean_token_accuracy": 0.7320256743580102,
"num_tokens": 27500178.0,
"step": 3535
},
{
"entropy": 1.0600904449820518,
"epoch": 0.9264741964504785,
"grad_norm": 0.5821903347969055,
"learning_rate": 1.4760533891651402e-06,
"loss": 1.0701,
"mean_token_accuracy": 0.7213537424802781,
"num_tokens": 27539087.0,
"step": 3540
},
{
"entropy": 1.057712410390377,
"epoch": 0.927782775823996,
"grad_norm": 0.28830403089523315,
"learning_rate": 1.4498822297827795e-06,
"loss": 0.9984,
"mean_token_accuracy": 0.7261978283524513,
"num_tokens": 27575481.0,
"step": 3545
},
{
"entropy": 1.0745814580470323,
"epoch": 0.9290913551975137,
"grad_norm": 0.5320100784301758,
"learning_rate": 1.4237110704004189e-06,
"loss": 1.0255,
"mean_token_accuracy": 0.7240989219397307,
"num_tokens": 27610121.0,
"step": 3550
},
{
"entropy": 1.1143604163080454,
"epoch": 0.9303999345710313,
"grad_norm": 0.3023637533187866,
"learning_rate": 1.3975399110180581e-06,
"loss": 1.1067,
"mean_token_accuracy": 0.7260128542780876,
"num_tokens": 27648696.0,
"step": 3555
},
{
"entropy": 1.0836713276803493,
"epoch": 0.9317085139445489,
"grad_norm": 0.6137750148773193,
"learning_rate": 1.3713687516356975e-06,
"loss": 1.0442,
"mean_token_accuracy": 0.7201635256409645,
"num_tokens": 27682007.0,
"step": 3560
},
{
"entropy": 1.1006150282919407,
"epoch": 0.9330170933180666,
"grad_norm": 0.44908300042152405,
"learning_rate": 1.345197592253337e-06,
"loss": 1.1144,
"mean_token_accuracy": 0.7233285054564476,
"num_tokens": 27719353.0,
"step": 3565
},
{
"entropy": 1.0796324241906405,
"epoch": 0.9343256726915842,
"grad_norm": 0.4785999655723572,
"learning_rate": 1.3190264328709762e-06,
"loss": 1.0587,
"mean_token_accuracy": 0.725184840708971,
"num_tokens": 27760487.0,
"step": 3570
},
{
"entropy": 1.0421875026077032,
"epoch": 0.9356342520651019,
"grad_norm": 0.5122357606887817,
"learning_rate": 1.2928552734886158e-06,
"loss": 1.025,
"mean_token_accuracy": 0.7302470609545708,
"num_tokens": 27803053.0,
"step": 3575
},
{
"entropy": 1.0390013061463832,
"epoch": 0.9369428314386195,
"grad_norm": 0.5996967554092407,
"learning_rate": 1.266684114106255e-06,
"loss": 0.9942,
"mean_token_accuracy": 0.7326175101101399,
"num_tokens": 27841890.0,
"step": 3580
},
{
"entropy": 1.0458267323672772,
"epoch": 0.938251410812137,
"grad_norm": 0.4176723062992096,
"learning_rate": 1.2405129547238943e-06,
"loss": 1.0177,
"mean_token_accuracy": 0.724136520922184,
"num_tokens": 27882510.0,
"step": 3585
},
{
"entropy": 1.0069429714232683,
"epoch": 0.9395599901856547,
"grad_norm": 0.40060707926750183,
"learning_rate": 1.2143417953415337e-06,
"loss": 0.9671,
"mean_token_accuracy": 0.7366093017160893,
"num_tokens": 27923388.0,
"step": 3590
},
{
"entropy": 1.16991759352386,
"epoch": 0.9408685695591723,
"grad_norm": 0.3461279571056366,
"learning_rate": 1.1881706359591731e-06,
"loss": 1.1462,
"mean_token_accuracy": 0.7035489819943905,
"num_tokens": 27964084.0,
"step": 3595
},
{
"entropy": 1.0686412911862135,
"epoch": 0.94217714893269,
"grad_norm": 0.2957308888435364,
"learning_rate": 1.1619994765768126e-06,
"loss": 1.1066,
"mean_token_accuracy": 0.7225402969866991,
"num_tokens": 28001682.0,
"step": 3600
},
{
"epoch": 0.94217714893269,
"eval_entropy": 1.0279519958496093,
"eval_loss": 1.0255744457244873,
"eval_mean_token_accuracy": 0.7317811703681946,
"eval_num_tokens": 28001682.0,
"eval_runtime": 97.2947,
"eval_samples_per_second": 10.278,
"eval_steps_per_second": 1.285,
"step": 3600
},
{
"entropy": 1.0206676226109266,
"epoch": 0.9434857283062076,
"grad_norm": 0.35788989067077637,
"learning_rate": 1.1358283171944518e-06,
"loss": 1.0176,
"mean_token_accuracy": 0.7300234287977219,
"num_tokens": 28042579.0,
"step": 3605
},
{
"entropy": 1.1715757973492145,
"epoch": 0.9447943076797252,
"grad_norm": 0.6498376131057739,
"learning_rate": 1.1096571578120912e-06,
"loss": 1.2274,
"mean_token_accuracy": 0.7090075612068176,
"num_tokens": 28077521.0,
"step": 3610
},
{
"entropy": 1.0543236445635558,
"epoch": 0.9461028870532429,
"grad_norm": 0.3194473385810852,
"learning_rate": 1.0834859984297304e-06,
"loss": 1.0491,
"mean_token_accuracy": 0.72547093257308,
"num_tokens": 28116840.0,
"step": 3615
},
{
"entropy": 1.1037466993555427,
"epoch": 0.9474114664267604,
"grad_norm": 0.6278566718101501,
"learning_rate": 1.0573148390473699e-06,
"loss": 1.0452,
"mean_token_accuracy": 0.7226000647991896,
"num_tokens": 28153551.0,
"step": 3620
},
{
"entropy": 1.0038308002054692,
"epoch": 0.948720045800278,
"grad_norm": 0.48984530568122864,
"learning_rate": 1.0311436796650093e-06,
"loss": 0.9795,
"mean_token_accuracy": 0.7407138921320439,
"num_tokens": 28189324.0,
"step": 3625
},
{
"entropy": 1.0743804059922695,
"epoch": 0.9500286251737957,
"grad_norm": 0.35918810963630676,
"learning_rate": 1.0049725202826487e-06,
"loss": 1.1096,
"mean_token_accuracy": 0.7247271824628114,
"num_tokens": 28230753.0,
"step": 3630
},
{
"entropy": 1.0661400273442267,
"epoch": 0.9513372045473133,
"grad_norm": 0.45430681109428406,
"learning_rate": 9.78801360900288e-07,
"loss": 1.0431,
"mean_token_accuracy": 0.7261486738920212,
"num_tokens": 28267961.0,
"step": 3635
},
{
"entropy": 1.0930729560554027,
"epoch": 0.952645783920831,
"grad_norm": 0.5693202018737793,
"learning_rate": 9.526302015179273e-07,
"loss": 1.0501,
"mean_token_accuracy": 0.7252973057329655,
"num_tokens": 28303967.0,
"step": 3640
},
{
"entropy": 1.124368030577898,
"epoch": 0.9539543632943486,
"grad_norm": 0.5943740010261536,
"learning_rate": 9.264590421355667e-07,
"loss": 1.148,
"mean_token_accuracy": 0.7163769982755184,
"num_tokens": 28339080.0,
"step": 3645
},
{
"entropy": 1.0308339811861515,
"epoch": 0.9552629426678662,
"grad_norm": 0.4916401505470276,
"learning_rate": 9.00287882753206e-07,
"loss": 0.9571,
"mean_token_accuracy": 0.7355926677584648,
"num_tokens": 28376833.0,
"step": 3650
},
{
"entropy": 1.0489124123007059,
"epoch": 0.9565715220413838,
"grad_norm": 0.5581033229827881,
"learning_rate": 8.741167233708454e-07,
"loss": 1.0203,
"mean_token_accuracy": 0.7289952971041203,
"num_tokens": 28413990.0,
"step": 3655
},
{
"entropy": 1.027711633220315,
"epoch": 0.9578801014149014,
"grad_norm": 0.5208143591880798,
"learning_rate": 8.479455639884849e-07,
"loss": 0.985,
"mean_token_accuracy": 0.7333911582827568,
"num_tokens": 28451250.0,
"step": 3660
},
{
"entropy": 1.0729194439947605,
"epoch": 0.9591886807884191,
"grad_norm": 0.6603464484214783,
"learning_rate": 8.217744046061241e-07,
"loss": 1.1221,
"mean_token_accuracy": 0.7259935919195414,
"num_tokens": 28488287.0,
"step": 3665
},
{
"entropy": 1.1133702561259269,
"epoch": 0.9604972601619367,
"grad_norm": 0.45469507575035095,
"learning_rate": 7.956032452237634e-07,
"loss": 1.0952,
"mean_token_accuracy": 0.7223341464996338,
"num_tokens": 28523839.0,
"step": 3670
},
{
"entropy": 1.0385285302996636,
"epoch": 0.9618058395354543,
"grad_norm": 0.4402262270450592,
"learning_rate": 7.694320858414028e-07,
"loss": 1.0786,
"mean_token_accuracy": 0.7375961367040873,
"num_tokens": 28561415.0,
"step": 3675
},
{
"entropy": 1.0912371952086688,
"epoch": 0.963114418908972,
"grad_norm": 0.4388654828071594,
"learning_rate": 7.432609264590422e-07,
"loss": 1.1011,
"mean_token_accuracy": 0.7208574242889881,
"num_tokens": 28598012.0,
"step": 3680
},
{
"entropy": 1.0840743597596885,
"epoch": 0.9644229982824896,
"grad_norm": 0.5213295221328735,
"learning_rate": 7.170897670766816e-07,
"loss": 1.0878,
"mean_token_accuracy": 0.7260554354637861,
"num_tokens": 28640120.0,
"step": 3685
},
{
"entropy": 1.0481572005897761,
"epoch": 0.9657315776560071,
"grad_norm": 0.6161360740661621,
"learning_rate": 6.90918607694321e-07,
"loss": 0.9982,
"mean_token_accuracy": 0.7346723277121783,
"num_tokens": 28680039.0,
"step": 3690
},
{
"entropy": 1.081262205913663,
"epoch": 0.9670401570295248,
"grad_norm": 0.5245823264122009,
"learning_rate": 6.647474483119602e-07,
"loss": 1.0661,
"mean_token_accuracy": 0.7203562073409557,
"num_tokens": 28720731.0,
"step": 3695
},
{
"entropy": 1.0632508873939515,
"epoch": 0.9683487364030424,
"grad_norm": 0.3406558334827423,
"learning_rate": 6.385762889295996e-07,
"loss": 1.011,
"mean_token_accuracy": 0.7315301813185215,
"num_tokens": 28758379.0,
"step": 3700
},
{
"epoch": 0.9683487364030424,
"eval_entropy": 1.0293308172225952,
"eval_loss": 1.0250952243804932,
"eval_mean_token_accuracy": 0.7318736510276794,
"eval_num_tokens": 28758379.0,
"eval_runtime": 97.2597,
"eval_samples_per_second": 10.282,
"eval_steps_per_second": 1.285,
"step": 3700
},
{
"entropy": 1.0361842691898346,
"epoch": 0.9696573157765601,
"grad_norm": 0.3829176127910614,
"learning_rate": 6.12405129547239e-07,
"loss": 1.0297,
"mean_token_accuracy": 0.738416987657547,
"num_tokens": 28801046.0,
"step": 3705
},
{
"entropy": 0.9786388706415892,
"epoch": 0.9709658951500777,
"grad_norm": 0.42764556407928467,
"learning_rate": 5.862339701648783e-07,
"loss": 0.897,
"mean_token_accuracy": 0.7432105515152216,
"num_tokens": 28841846.0,
"step": 3710
},
{
"entropy": 1.0343420587480068,
"epoch": 0.9722744745235953,
"grad_norm": 0.3440265357494354,
"learning_rate": 5.600628107825177e-07,
"loss": 1.0067,
"mean_token_accuracy": 0.7312737897038459,
"num_tokens": 28888566.0,
"step": 3715
},
{
"entropy": 1.0930624961853028,
"epoch": 0.973583053897113,
"grad_norm": 0.26113247871398926,
"learning_rate": 5.338916514001571e-07,
"loss": 1.1205,
"mean_token_accuracy": 0.7230574164539576,
"num_tokens": 28926022.0,
"step": 3720
},
{
"entropy": 1.0122245840728283,
"epoch": 0.9748916332706306,
"grad_norm": 0.4856005012989044,
"learning_rate": 5.077204920177964e-07,
"loss": 1.0115,
"mean_token_accuracy": 0.7327728264033795,
"num_tokens": 28967341.0,
"step": 3725
},
{
"entropy": 1.0293555181473493,
"epoch": 0.9762002126441482,
"grad_norm": 0.4436942934989929,
"learning_rate": 4.815493326354357e-07,
"loss": 1.0318,
"mean_token_accuracy": 0.7331165555864573,
"num_tokens": 29011545.0,
"step": 3730
},
{
"entropy": 1.0567047752439975,
"epoch": 0.9775087920176658,
"grad_norm": 0.46222445368766785,
"learning_rate": 4.5537817325307516e-07,
"loss": 1.0437,
"mean_token_accuracy": 0.7290311522781849,
"num_tokens": 29050339.0,
"step": 3735
},
{
"entropy": 1.0223099350929261,
"epoch": 0.9788173713911834,
"grad_norm": 0.6509369611740112,
"learning_rate": 4.2920701387071454e-07,
"loss": 1.0271,
"mean_token_accuracy": 0.743386908620596,
"num_tokens": 29090275.0,
"step": 3740
},
{
"entropy": 1.0612529110163451,
"epoch": 0.9801259507647011,
"grad_norm": 0.5703464150428772,
"learning_rate": 4.0303585448835386e-07,
"loss": 1.0539,
"mean_token_accuracy": 0.7298849746584892,
"num_tokens": 29129073.0,
"step": 3745
},
{
"entropy": 1.0348493970930577,
"epoch": 0.9814345301382187,
"grad_norm": 0.35511285066604614,
"learning_rate": 3.7686469510599324e-07,
"loss": 1.0283,
"mean_token_accuracy": 0.7344648085534573,
"num_tokens": 29171021.0,
"step": 3750
},
{
"entropy": 1.1379429820924998,
"epoch": 0.9827431095117363,
"grad_norm": 0.3323829174041748,
"learning_rate": 3.506935357236326e-07,
"loss": 1.1105,
"mean_token_accuracy": 0.7117891266942025,
"num_tokens": 29206905.0,
"step": 3755
},
{
"entropy": 1.0437415711581708,
"epoch": 0.984051688885254,
"grad_norm": 0.40716320276260376,
"learning_rate": 3.2452237634127194e-07,
"loss": 1.0219,
"mean_token_accuracy": 0.7391361027956009,
"num_tokens": 29247960.0,
"step": 3760
},
{
"entropy": 1.1265955708920956,
"epoch": 0.9853602682587715,
"grad_norm": 0.6306870579719543,
"learning_rate": 2.983512169589113e-07,
"loss": 1.062,
"mean_token_accuracy": 0.7148844588547945,
"num_tokens": 29282043.0,
"step": 3765
},
{
"entropy": 1.0719765815883875,
"epoch": 0.9866688476322892,
"grad_norm": 0.35759237408638,
"learning_rate": 2.7218005757655065e-07,
"loss": 1.0682,
"mean_token_accuracy": 0.7296040445566178,
"num_tokens": 29323533.0,
"step": 3770
},
{
"entropy": 1.0471963623538614,
"epoch": 0.9879774270058068,
"grad_norm": 0.4440222382545471,
"learning_rate": 2.4600889819419e-07,
"loss": 0.9946,
"mean_token_accuracy": 0.7308863338083029,
"num_tokens": 29364067.0,
"step": 3775
},
{
"entropy": 1.0608526386320591,
"epoch": 0.9892860063793244,
"grad_norm": 0.41301101446151733,
"learning_rate": 2.198377388118294e-07,
"loss": 1.0721,
"mean_token_accuracy": 0.730213138461113,
"num_tokens": 29411394.0,
"step": 3780
},
{
"entropy": 1.042181348055601,
"epoch": 0.9905945857528421,
"grad_norm": 0.7013579607009888,
"learning_rate": 1.9366657942946875e-07,
"loss": 0.9976,
"mean_token_accuracy": 0.7371749997138977,
"num_tokens": 29447775.0,
"step": 3785
},
{
"entropy": 1.0719102643430234,
"epoch": 0.9919031651263597,
"grad_norm": 0.6451042890548706,
"learning_rate": 1.6749542004710808e-07,
"loss": 1.0273,
"mean_token_accuracy": 0.725009347498417,
"num_tokens": 29482789.0,
"step": 3790
},
{
"entropy": 1.0650145791471004,
"epoch": 0.9932117444998774,
"grad_norm": 0.46040284633636475,
"learning_rate": 1.4132426066474745e-07,
"loss": 1.0511,
"mean_token_accuracy": 0.7309038709849119,
"num_tokens": 29522469.0,
"step": 3795
},
{
"entropy": 1.099685497954488,
"epoch": 0.994520323873395,
"grad_norm": 0.47909924387931824,
"learning_rate": 1.1515310128238682e-07,
"loss": 1.0294,
"mean_token_accuracy": 0.7283653065562248,
"num_tokens": 29558695.0,
"step": 3800
},
{
"epoch": 0.994520323873395,
"eval_entropy": 1.027329535484314,
"eval_loss": 1.0250593423843384,
"eval_mean_token_accuracy": 0.7319152827262878,
"eval_num_tokens": 29558695.0,
"eval_runtime": 97.2485,
"eval_samples_per_second": 10.283,
"eval_steps_per_second": 1.285,
"step": 3800
},
{
"entropy": 1.0765836983919144,
"epoch": 0.9958289032469125,
"grad_norm": 0.46970200538635254,
"learning_rate": 8.898194190002618e-08,
"loss": 1.0639,
"mean_token_accuracy": 0.7184258241206407,
"num_tokens": 29599289.0,
"step": 3805
},
{
"entropy": 1.0330897614359855,
"epoch": 0.9971374826204302,
"grad_norm": 0.34983229637145996,
"learning_rate": 6.281078251766554e-08,
"loss": 1.0025,
"mean_token_accuracy": 0.7342484682798386,
"num_tokens": 29637188.0,
"step": 3810
},
{
"entropy": 1.04201779961586,
"epoch": 0.9984460619939478,
"grad_norm": 0.39102649688720703,
"learning_rate": 3.6639623135304896e-08,
"loss": 0.9961,
"mean_token_accuracy": 0.7322810731828213,
"num_tokens": 29671533.0,
"step": 3815
},
{
"entropy": 1.1104622691869737,
"epoch": 0.9997546413674654,
"grad_norm": 0.39447513222694397,
"learning_rate": 1.0468463752944255e-08,
"loss": 1.1002,
"mean_token_accuracy": 0.7095972750335932,
"num_tokens": 29712231.0,
"step": 3820
},
{
"epoch": 1.0,
"step": 3821,
"total_flos": 1.297831890260091e+18,
"train_loss": 1.0613669765554028,
"train_runtime": 22265.1627,
"train_samples_per_second": 2.746,
"train_steps_per_second": 0.172
}
],
"logging_steps": 5,
"max_steps": 3821,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.297831890260091e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}