foamGPT-oss-20B / trainer_state.json
finalform's picture
Upload folder using huggingface_hub
f4c7f6c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 554,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 2.358862280845642,
"epoch": 0.0036199095022624436,
"grad_norm": 2.292628288269043,
"learning_rate": 0.0,
"loss": 0.7311,
"mean_token_accuracy": 0.8534883409738541,
"num_tokens": 9316.0,
"step": 1
},
{
"entropy": 2.674945294857025,
"epoch": 0.007239819004524887,
"grad_norm": 3.8950836658477783,
"learning_rate": 1.0219999999999999e-05,
"loss": 1.0621,
"mean_token_accuracy": 0.8183160275220871,
"num_tokens": 17707.0,
"step": 2
},
{
"entropy": 2.4915525913238525,
"epoch": 0.01085972850678733,
"grad_norm": 2.792142868041992,
"learning_rate": 2.0439999999999997e-05,
"loss": 0.8448,
"mean_token_accuracy": 0.8489587754011154,
"num_tokens": 26783.0,
"step": 3
},
{
"entropy": 2.525622010231018,
"epoch": 0.014479638009049774,
"grad_norm": 2.7071900367736816,
"learning_rate": 3.0659999999999994e-05,
"loss": 0.8847,
"mean_token_accuracy": 0.8486668318510056,
"num_tokens": 35947.0,
"step": 4
},
{
"entropy": 2.588509976863861,
"epoch": 0.01809954751131222,
"grad_norm": 2.981574773788452,
"learning_rate": 4.0879999999999995e-05,
"loss": 1.0783,
"mean_token_accuracy": 0.8135111033916473,
"num_tokens": 44505.0,
"step": 5
},
{
"entropy": 2.662865400314331,
"epoch": 0.02171945701357466,
"grad_norm": 2.629283905029297,
"learning_rate": 5.1099999999999995e-05,
"loss": 0.9485,
"mean_token_accuracy": 0.8152717798948288,
"num_tokens": 53140.0,
"step": 6
},
{
"entropy": 2.6662243604660034,
"epoch": 0.025339366515837104,
"grad_norm": 2.730058431625366,
"learning_rate": 6.131999999999999e-05,
"loss": 0.6982,
"mean_token_accuracy": 0.8552135527133942,
"num_tokens": 61932.0,
"step": 7
},
{
"entropy": 2.661384105682373,
"epoch": 0.02895927601809955,
"grad_norm": 2.562839984893799,
"learning_rate": 7.154e-05,
"loss": 0.7296,
"mean_token_accuracy": 0.8579540699720383,
"num_tokens": 70973.0,
"step": 8
},
{
"entropy": 2.7889368534088135,
"epoch": 0.03257918552036199,
"grad_norm": 2.8640544414520264,
"learning_rate": 8.175999999999999e-05,
"loss": 0.5965,
"mean_token_accuracy": 0.8638457208871841,
"num_tokens": 79977.0,
"step": 9
},
{
"entropy": 2.811532199382782,
"epoch": 0.03619909502262444,
"grad_norm": 2.6199426651000977,
"learning_rate": 9.197999999999998e-05,
"loss": 0.4819,
"mean_token_accuracy": 0.8786454051733017,
"num_tokens": 88915.0,
"step": 10
},
{
"entropy": 2.941167712211609,
"epoch": 0.039819004524886875,
"grad_norm": 1.2497272491455078,
"learning_rate": 0.00010219999999999999,
"loss": 0.7192,
"mean_token_accuracy": 0.841494083404541,
"num_tokens": 97749.0,
"step": 11
},
{
"entropy": 3.0547962188720703,
"epoch": 0.04343891402714932,
"grad_norm": 1.436136245727539,
"learning_rate": 0.00011241999999999998,
"loss": 0.5908,
"mean_token_accuracy": 0.8657624870538712,
"num_tokens": 106048.0,
"step": 12
},
{
"entropy": 2.9914053082466125,
"epoch": 0.047058823529411764,
"grad_norm": 0.9903654456138611,
"learning_rate": 0.00012263999999999998,
"loss": 0.4008,
"mean_token_accuracy": 0.8985499292612076,
"num_tokens": 115216.0,
"step": 13
},
{
"entropy": 3.1867465376853943,
"epoch": 0.05067873303167421,
"grad_norm": 1.019572377204895,
"learning_rate": 0.00013286,
"loss": 0.5062,
"mean_token_accuracy": 0.8893097043037415,
"num_tokens": 124040.0,
"step": 14
},
{
"entropy": 3.2431325912475586,
"epoch": 0.05429864253393665,
"grad_norm": 1.2394084930419922,
"learning_rate": 0.00014308,
"loss": 0.361,
"mean_token_accuracy": 0.9009967148303986,
"num_tokens": 132447.0,
"step": 15
},
{
"entropy": 3.1858643889427185,
"epoch": 0.0579185520361991,
"grad_norm": 0.9859603643417358,
"learning_rate": 0.00015329999999999999,
"loss": 0.4498,
"mean_token_accuracy": 0.887280747294426,
"num_tokens": 141228.0,
"step": 16
},
{
"entropy": 3.5029141902923584,
"epoch": 0.06153846153846154,
"grad_norm": 1.453957438468933,
"learning_rate": 0.00016351999999999998,
"loss": 0.4949,
"mean_token_accuracy": 0.888081505894661,
"num_tokens": 149789.0,
"step": 17
},
{
"entropy": 3.4572895765304565,
"epoch": 0.06515837104072399,
"grad_norm": 1.390377402305603,
"learning_rate": 0.00017374,
"loss": 0.5449,
"mean_token_accuracy": 0.8745045810937881,
"num_tokens": 157813.0,
"step": 18
},
{
"entropy": 3.3081750869750977,
"epoch": 0.06877828054298643,
"grad_norm": 1.1171791553497314,
"learning_rate": 0.00018395999999999997,
"loss": 0.4786,
"mean_token_accuracy": 0.8893420845270157,
"num_tokens": 166315.0,
"step": 19
},
{
"entropy": 3.3776715993881226,
"epoch": 0.07239819004524888,
"grad_norm": 1.5567998886108398,
"learning_rate": 0.00019418,
"loss": 0.3669,
"mean_token_accuracy": 0.9146632701158524,
"num_tokens": 175207.0,
"step": 20
},
{
"entropy": 3.2677870988845825,
"epoch": 0.0760180995475113,
"grad_norm": 1.7404611110687256,
"learning_rate": 0.00020439999999999998,
"loss": 0.5287,
"mean_token_accuracy": 0.8777483552694321,
"num_tokens": 183833.0,
"step": 21
},
{
"entropy": 3.313201069831848,
"epoch": 0.07963800904977375,
"grad_norm": 1.0836979150772095,
"learning_rate": 0.00021461999999999997,
"loss": 0.3014,
"mean_token_accuracy": 0.9215261936187744,
"num_tokens": 192591.0,
"step": 22
},
{
"entropy": 3.208672881126404,
"epoch": 0.0832579185520362,
"grad_norm": 1.2197301387786865,
"learning_rate": 0.00022483999999999997,
"loss": 0.4401,
"mean_token_accuracy": 0.9031257778406143,
"num_tokens": 201372.0,
"step": 23
},
{
"entropy": 3.1830995082855225,
"epoch": 0.08687782805429864,
"grad_norm": 1.2422229051589966,
"learning_rate": 0.00023506,
"loss": 0.5144,
"mean_token_accuracy": 0.8915928155183792,
"num_tokens": 210348.0,
"step": 24
},
{
"entropy": 3.085207223892212,
"epoch": 0.09049773755656108,
"grad_norm": 0.8987624049186707,
"learning_rate": 0.00024527999999999996,
"loss": 0.3253,
"mean_token_accuracy": 0.9221627116203308,
"num_tokens": 219131.0,
"step": 25
},
{
"entropy": 3.026031017303467,
"epoch": 0.09411764705882353,
"grad_norm": 1.0273475646972656,
"learning_rate": 0.0002555,
"loss": 0.3495,
"mean_token_accuracy": 0.9147634357213974,
"num_tokens": 228292.0,
"step": 26
},
{
"entropy": 3.0420032739639282,
"epoch": 0.09773755656108597,
"grad_norm": 1.0590945482254028,
"learning_rate": 0.00026572,
"loss": 0.4495,
"mean_token_accuracy": 0.9019353687763214,
"num_tokens": 236942.0,
"step": 27
},
{
"entropy": 3.0469263792037964,
"epoch": 0.10135746606334842,
"grad_norm": 0.9584959745407104,
"learning_rate": 0.00027594,
"loss": 0.405,
"mean_token_accuracy": 0.9216890782117844,
"num_tokens": 245543.0,
"step": 28
},
{
"entropy": 2.92683744430542,
"epoch": 0.10497737556561086,
"grad_norm": 0.8826628923416138,
"learning_rate": 0.00028616,
"loss": 0.4004,
"mean_token_accuracy": 0.9173285663127899,
"num_tokens": 254264.0,
"step": 29
},
{
"entropy": 3.0086968541145325,
"epoch": 0.1085972850678733,
"grad_norm": 0.8521863222122192,
"learning_rate": 0.00029637999999999995,
"loss": 0.2876,
"mean_token_accuracy": 0.9335231184959412,
"num_tokens": 263143.0,
"step": 30
},
{
"entropy": 2.9086623191833496,
"epoch": 0.11221719457013575,
"grad_norm": 0.7830919623374939,
"learning_rate": 0.00030659999999999997,
"loss": 0.548,
"mean_token_accuracy": 0.8831343650817871,
"num_tokens": 272055.0,
"step": 31
},
{
"entropy": 2.9730575680732727,
"epoch": 0.1158371040723982,
"grad_norm": 0.7217472195625305,
"learning_rate": 0.00031682,
"loss": 0.3564,
"mean_token_accuracy": 0.9119151830673218,
"num_tokens": 280971.0,
"step": 32
},
{
"entropy": 3.081720530986786,
"epoch": 0.11945701357466064,
"grad_norm": 0.8697704076766968,
"learning_rate": 0.00032703999999999996,
"loss": 0.334,
"mean_token_accuracy": 0.9234935492277145,
"num_tokens": 289449.0,
"step": 33
},
{
"entropy": 3.1043431162834167,
"epoch": 0.12307692307692308,
"grad_norm": 0.7962514758110046,
"learning_rate": 0.00033726,
"loss": 0.1602,
"mean_token_accuracy": 0.9554370939731598,
"num_tokens": 297804.0,
"step": 34
},
{
"entropy": 3.0275490283966064,
"epoch": 0.12669683257918551,
"grad_norm": 0.5887104272842407,
"learning_rate": 0.00034748,
"loss": 0.2254,
"mean_token_accuracy": 0.9491932094097137,
"num_tokens": 306589.0,
"step": 35
},
{
"entropy": 3.099652886390686,
"epoch": 0.13031674208144797,
"grad_norm": 0.894397497177124,
"learning_rate": 0.00035769999999999997,
"loss": 0.6397,
"mean_token_accuracy": 0.8802188038825989,
"num_tokens": 315534.0,
"step": 36
},
{
"entropy": 3.0312134623527527,
"epoch": 0.1339366515837104,
"grad_norm": 0.6374682188034058,
"learning_rate": 0.00036791999999999993,
"loss": 0.2183,
"mean_token_accuracy": 0.9478497952222824,
"num_tokens": 324492.0,
"step": 37
},
{
"entropy": 3.28497713804245,
"epoch": 0.13755656108597286,
"grad_norm": 0.6740968823432922,
"learning_rate": 0.00037813999999999995,
"loss": 0.3619,
"mean_token_accuracy": 0.9288723170757294,
"num_tokens": 333195.0,
"step": 38
},
{
"entropy": 3.1478323340415955,
"epoch": 0.1411764705882353,
"grad_norm": 0.7235494256019592,
"learning_rate": 0.00038836,
"loss": 0.324,
"mean_token_accuracy": 0.9179254025220871,
"num_tokens": 342028.0,
"step": 39
},
{
"entropy": 3.279879152774811,
"epoch": 0.14479638009049775,
"grad_norm": 0.7512595653533936,
"learning_rate": 0.00039858,
"loss": 0.4804,
"mean_token_accuracy": 0.889826312661171,
"num_tokens": 350902.0,
"step": 40
},
{
"entropy": 3.173546612262726,
"epoch": 0.14841628959276018,
"grad_norm": 0.6978861689567566,
"learning_rate": 0.00040879999999999996,
"loss": 0.3442,
"mean_token_accuracy": 0.9205169230699539,
"num_tokens": 359787.0,
"step": 41
},
{
"entropy": 3.2385765314102173,
"epoch": 0.1520361990950226,
"grad_norm": 0.8108944892883301,
"learning_rate": 0.00041901999999999993,
"loss": 0.4223,
"mean_token_accuracy": 0.8979178965091705,
"num_tokens": 368426.0,
"step": 42
},
{
"entropy": 3.146568477153778,
"epoch": 0.15565610859728507,
"grad_norm": 0.5847787261009216,
"learning_rate": 0.00042923999999999995,
"loss": 0.1953,
"mean_token_accuracy": 0.9556037336587906,
"num_tokens": 377349.0,
"step": 43
},
{
"entropy": 3.066233277320862,
"epoch": 0.1592760180995475,
"grad_norm": 0.7887329459190369,
"learning_rate": 0.00043945999999999997,
"loss": 0.6815,
"mean_token_accuracy": 0.8654293268918991,
"num_tokens": 386603.0,
"step": 44
},
{
"entropy": 3.1745981574058533,
"epoch": 0.16289592760180996,
"grad_norm": 0.7280165553092957,
"learning_rate": 0.00044967999999999994,
"loss": 0.1932,
"mean_token_accuracy": 0.9479279220104218,
"num_tokens": 395070.0,
"step": 45
},
{
"entropy": 3.1094446182250977,
"epoch": 0.1665158371040724,
"grad_norm": 0.6453448534011841,
"learning_rate": 0.00045989999999999996,
"loss": 0.2608,
"mean_token_accuracy": 0.9249396026134491,
"num_tokens": 403651.0,
"step": 46
},
{
"entropy": 2.9050925970077515,
"epoch": 0.17013574660633485,
"grad_norm": 0.6689278483390808,
"learning_rate": 0.00047012,
"loss": 0.4489,
"mean_token_accuracy": 0.898686870932579,
"num_tokens": 412898.0,
"step": 47
},
{
"entropy": 3.2239145040512085,
"epoch": 0.17375565610859728,
"grad_norm": 1.0014020204544067,
"learning_rate": 0.00048033999999999994,
"loss": 0.3234,
"mean_token_accuracy": 0.9231891483068466,
"num_tokens": 421420.0,
"step": 48
},
{
"entropy": 3.035899817943573,
"epoch": 0.17737556561085974,
"grad_norm": 0.6415768265724182,
"learning_rate": 0.0004905599999999999,
"loss": 0.2259,
"mean_token_accuracy": 0.9447792917490005,
"num_tokens": 430258.0,
"step": 49
},
{
"entropy": 3.057477653026581,
"epoch": 0.18099547511312217,
"grad_norm": 0.6042271256446838,
"learning_rate": 0.0005007799999999999,
"loss": 0.2228,
"mean_token_accuracy": 0.9473378211259842,
"num_tokens": 439593.0,
"step": 50
},
{
"entropy": 2.8375911116600037,
"epoch": 0.18461538461538463,
"grad_norm": 0.739811897277832,
"learning_rate": 0.000511,
"loss": 0.3623,
"mean_token_accuracy": 0.9050924181938171,
"num_tokens": 449056.0,
"step": 51
},
{
"entropy": 2.9926682114601135,
"epoch": 0.18823529411764706,
"grad_norm": 0.6637321710586548,
"learning_rate": 0.0005109995633102972,
"loss": 0.2924,
"mean_token_accuracy": 0.9397273659706116,
"num_tokens": 457677.0,
"step": 52
},
{
"entropy": 2.7932987809181213,
"epoch": 0.19185520361990951,
"grad_norm": 0.5666584372520447,
"learning_rate": 0.0005109982532428477,
"loss": 0.2055,
"mean_token_accuracy": 0.9385408014059067,
"num_tokens": 466969.0,
"step": 53
},
{
"entropy": 2.765812337398529,
"epoch": 0.19547511312217195,
"grad_norm": 0.7875120639801025,
"learning_rate": 0.0005109960698026271,
"loss": 0.4549,
"mean_token_accuracy": 0.9052814990282059,
"num_tokens": 476285.0,
"step": 54
},
{
"entropy": 2.884207248687744,
"epoch": 0.19909502262443438,
"grad_norm": 0.7538661956787109,
"learning_rate": 0.0005109930129979285,
"loss": 0.3751,
"mean_token_accuracy": 0.9210246652364731,
"num_tokens": 484668.0,
"step": 55
},
{
"entropy": 2.779718518257141,
"epoch": 0.20271493212669683,
"grad_norm": 0.8069296479225159,
"learning_rate": 0.0005109890828403621,
"loss": 0.3664,
"mean_token_accuracy": 0.9219843596220016,
"num_tokens": 493292.0,
"step": 56
},
{
"entropy": 2.841543674468994,
"epoch": 0.20633484162895926,
"grad_norm": 0.5545904636383057,
"learning_rate": 0.0005109842793448548,
"loss": 0.1973,
"mean_token_accuracy": 0.9547395706176758,
"num_tokens": 501973.0,
"step": 57
},
{
"entropy": 2.8180030584335327,
"epoch": 0.20995475113122172,
"grad_norm": 1.015456199645996,
"learning_rate": 0.0005109786025296513,
"loss": 0.6019,
"mean_token_accuracy": 0.88613361120224,
"num_tokens": 510840.0,
"step": 58
},
{
"entropy": 2.7450912594795227,
"epoch": 0.21357466063348415,
"grad_norm": 0.6784740686416626,
"learning_rate": 0.0005109720524163127,
"loss": 0.2868,
"mean_token_accuracy": 0.9295425117015839,
"num_tokens": 519656.0,
"step": 59
},
{
"entropy": 2.822400987148285,
"epoch": 0.2171945701357466,
"grad_norm": 0.8780149817466736,
"learning_rate": 0.000510964629029717,
"loss": 0.4371,
"mean_token_accuracy": 0.9089596569538116,
"num_tokens": 528105.0,
"step": 60
},
{
"entropy": 2.522100865840912,
"epoch": 0.22081447963800904,
"grad_norm": 0.51394122838974,
"learning_rate": 0.0005109563323980594,
"loss": 0.2509,
"mean_token_accuracy": 0.941976860165596,
"num_tokens": 537707.0,
"step": 61
},
{
"entropy": 2.6596657633781433,
"epoch": 0.2244343891402715,
"grad_norm": 0.6359816789627075,
"learning_rate": 0.0005109471625528516,
"loss": 0.3685,
"mean_token_accuracy": 0.9191890209913254,
"num_tokens": 546517.0,
"step": 62
},
{
"entropy": 2.800311803817749,
"epoch": 0.22805429864253393,
"grad_norm": 0.6862941980361938,
"learning_rate": 0.0005109371195289215,
"loss": 0.2457,
"mean_token_accuracy": 0.9330879002809525,
"num_tokens": 555493.0,
"step": 63
},
{
"entropy": 2.7235344648361206,
"epoch": 0.2316742081447964,
"grad_norm": 1.0464682579040527,
"learning_rate": 0.0005109262033644142,
"loss": 0.4417,
"mean_token_accuracy": 0.8957678377628326,
"num_tokens": 564255.0,
"step": 64
},
{
"entropy": 2.6643534302711487,
"epoch": 0.23529411764705882,
"grad_norm": 1.0790019035339355,
"learning_rate": 0.0005109144141007903,
"loss": 0.4947,
"mean_token_accuracy": 0.8889007717370987,
"num_tokens": 573401.0,
"step": 65
},
{
"entropy": 2.760925054550171,
"epoch": 0.23891402714932128,
"grad_norm": 0.7957189679145813,
"learning_rate": 0.0005109017517828273,
"loss": 0.2259,
"mean_token_accuracy": 0.944578230381012,
"num_tokens": 581905.0,
"step": 66
},
{
"entropy": 2.7048792839050293,
"epoch": 0.2425339366515837,
"grad_norm": 0.9530714750289917,
"learning_rate": 0.0005108882164586181,
"loss": 0.3122,
"mean_token_accuracy": 0.9257418513298035,
"num_tokens": 590802.0,
"step": 67
},
{
"entropy": 2.6733291149139404,
"epoch": 0.24615384615384617,
"grad_norm": 0.8295993208885193,
"learning_rate": 0.0005108738081795716,
"loss": 0.3701,
"mean_token_accuracy": 0.898589238524437,
"num_tokens": 599279.0,
"step": 68
},
{
"entropy": 2.5613606572151184,
"epoch": 0.2497737556561086,
"grad_norm": 0.6205935478210449,
"learning_rate": 0.0005108585270004123,
"loss": 0.4372,
"mean_token_accuracy": 0.9116007685661316,
"num_tokens": 608107.0,
"step": 69
},
{
"entropy": 2.458296835422516,
"epoch": 0.25339366515837103,
"grad_norm": 0.7629838585853577,
"learning_rate": 0.0005108423729791799,
"loss": 0.2307,
"mean_token_accuracy": 0.9386163502931595,
"num_tokens": 616881.0,
"step": 70
},
{
"entropy": 2.4176695346832275,
"epoch": 0.25701357466063346,
"grad_norm": 0.902400016784668,
"learning_rate": 0.0005108253461772298,
"loss": 0.2853,
"mean_token_accuracy": 0.9237343072891235,
"num_tokens": 625323.0,
"step": 71
},
{
"entropy": 2.2265281677246094,
"epoch": 0.26063348416289595,
"grad_norm": 0.7744383811950684,
"learning_rate": 0.0005108074466592316,
"loss": 0.2435,
"mean_token_accuracy": 0.9508260935544968,
"num_tokens": 634260.0,
"step": 72
},
{
"entropy": 2.1855952441692352,
"epoch": 0.2642533936651584,
"grad_norm": 0.8615190386772156,
"learning_rate": 0.0005107886744931702,
"loss": 0.3323,
"mean_token_accuracy": 0.9276078194379807,
"num_tokens": 643235.0,
"step": 73
},
{
"entropy": 2.179121494293213,
"epoch": 0.2678733031674208,
"grad_norm": 0.8953279256820679,
"learning_rate": 0.0005107690297503444,
"loss": 0.2384,
"mean_token_accuracy": 0.9425230622291565,
"num_tokens": 652032.0,
"step": 74
},
{
"entropy": 2.1565526127815247,
"epoch": 0.27149321266968324,
"grad_norm": 0.6830486059188843,
"learning_rate": 0.0005107485125053678,
"loss": 0.2759,
"mean_token_accuracy": 0.9360661953687668,
"num_tokens": 660978.0,
"step": 75
},
{
"entropy": 2.0900665521621704,
"epoch": 0.2751131221719457,
"grad_norm": 0.786665141582489,
"learning_rate": 0.0005107271228361672,
"loss": 0.4061,
"mean_token_accuracy": 0.910009115934372,
"num_tokens": 669817.0,
"step": 76
},
{
"entropy": 2.1311859488487244,
"epoch": 0.27873303167420815,
"grad_norm": 0.6399909853935242,
"learning_rate": 0.0005107048608239836,
"loss": 0.272,
"mean_token_accuracy": 0.9424714297056198,
"num_tokens": 678469.0,
"step": 77
},
{
"entropy": 2.059997320175171,
"epoch": 0.2823529411764706,
"grad_norm": 0.8114754557609558,
"learning_rate": 0.0005106817265533706,
"loss": 0.4029,
"mean_token_accuracy": 0.9037660360336304,
"num_tokens": 687261.0,
"step": 78
},
{
"entropy": 1.9725019037723541,
"epoch": 0.285972850678733,
"grad_norm": 0.9420941472053528,
"learning_rate": 0.0005106577201121952,
"loss": 0.535,
"mean_token_accuracy": 0.8996377140283585,
"num_tokens": 695941.0,
"step": 79
},
{
"entropy": 1.9951164424419403,
"epoch": 0.2895927601809955,
"grad_norm": 0.6476142406463623,
"learning_rate": 0.0005106328415916372,
"loss": 0.2242,
"mean_token_accuracy": 0.941379725933075,
"num_tokens": 704643.0,
"step": 80
},
{
"entropy": 1.8962564170360565,
"epoch": 0.29321266968325793,
"grad_norm": 0.5974630117416382,
"learning_rate": 0.0005106070910861881,
"loss": 0.2934,
"mean_token_accuracy": 0.9217697530984879,
"num_tokens": 713605.0,
"step": 81
},
{
"entropy": 1.9781515896320343,
"epoch": 0.29683257918552036,
"grad_norm": 0.8755478262901306,
"learning_rate": 0.0005105804686936518,
"loss": 0.4551,
"mean_token_accuracy": 0.9051328897476196,
"num_tokens": 722385.0,
"step": 82
},
{
"entropy": 1.9892418384552002,
"epoch": 0.3004524886877828,
"grad_norm": 0.6887345314025879,
"learning_rate": 0.0005105529745151433,
"loss": 0.244,
"mean_token_accuracy": 0.9261117279529572,
"num_tokens": 730962.0,
"step": 83
},
{
"entropy": 2.0053181648254395,
"epoch": 0.3040723981900452,
"grad_norm": 0.6930885910987854,
"learning_rate": 0.0005105246086550893,
"loss": 0.3155,
"mean_token_accuracy": 0.9206147193908691,
"num_tokens": 739499.0,
"step": 84
},
{
"entropy": 1.9716475903987885,
"epoch": 0.3076923076923077,
"grad_norm": 0.5049461722373962,
"learning_rate": 0.0005104953712212266,
"loss": 0.2215,
"mean_token_accuracy": 0.9608763605356216,
"num_tokens": 748604.0,
"step": 85
},
{
"entropy": 1.9186978042125702,
"epoch": 0.31131221719457014,
"grad_norm": 0.5756685733795166,
"learning_rate": 0.000510465262324603,
"loss": 0.2658,
"mean_token_accuracy": 0.9372887462377548,
"num_tokens": 757919.0,
"step": 86
},
{
"entropy": 1.9738290905952454,
"epoch": 0.31493212669683257,
"grad_norm": 0.6163789629936218,
"learning_rate": 0.0005104342820795758,
"loss": 0.2472,
"mean_token_accuracy": 0.9430449157953262,
"num_tokens": 766708.0,
"step": 87
},
{
"entropy": 2.1927571892738342,
"epoch": 0.318552036199095,
"grad_norm": 0.7953162789344788,
"learning_rate": 0.0005104024306038119,
"loss": 0.261,
"mean_token_accuracy": 0.9425829648971558,
"num_tokens": 774601.0,
"step": 88
},
{
"entropy": 2.043731451034546,
"epoch": 0.3221719457013575,
"grad_norm": 0.8098088502883911,
"learning_rate": 0.0005103697080182872,
"loss": 0.3126,
"mean_token_accuracy": 0.9158089309930801,
"num_tokens": 783170.0,
"step": 89
},
{
"entropy": 1.9801572561264038,
"epoch": 0.3257918552036199,
"grad_norm": 0.5227240920066833,
"learning_rate": 0.0005103361144472864,
"loss": 0.1291,
"mean_token_accuracy": 0.9666071832180023,
"num_tokens": 791769.0,
"step": 90
},
{
"entropy": 1.9553790986537933,
"epoch": 0.32941176470588235,
"grad_norm": 0.7819464206695557,
"learning_rate": 0.0005103016500184022,
"loss": 0.531,
"mean_token_accuracy": 0.8817111849784851,
"num_tokens": 800824.0,
"step": 91
},
{
"entropy": 1.9291303753852844,
"epoch": 0.3330316742081448,
"grad_norm": 0.7178757190704346,
"learning_rate": 0.0005102663148625347,
"loss": 0.3301,
"mean_token_accuracy": 0.9357631802558899,
"num_tokens": 809347.0,
"step": 92
},
{
"entropy": 1.9846041798591614,
"epoch": 0.33665158371040727,
"grad_norm": 1.316636085510254,
"learning_rate": 0.0005102301091138916,
"loss": 0.4241,
"mean_token_accuracy": 0.8993304669857025,
"num_tokens": 817174.0,
"step": 93
},
{
"entropy": 1.814637303352356,
"epoch": 0.3402714932126697,
"grad_norm": 0.5486414432525635,
"learning_rate": 0.0005101930329099865,
"loss": 0.116,
"mean_token_accuracy": 0.9674727618694305,
"num_tokens": 826177.0,
"step": 94
},
{
"entropy": 1.9128066003322601,
"epoch": 0.3438914027149321,
"grad_norm": 0.620303750038147,
"learning_rate": 0.00051015508639164,
"loss": 0.1833,
"mean_token_accuracy": 0.9569521993398666,
"num_tokens": 835409.0,
"step": 95
},
{
"entropy": 1.7541870176792145,
"epoch": 0.34751131221719456,
"grad_norm": 0.8337438702583313,
"learning_rate": 0.0005101162697029776,
"loss": 0.3327,
"mean_token_accuracy": 0.9193180054426193,
"num_tokens": 844692.0,
"step": 96
},
{
"entropy": 1.8255240619182587,
"epoch": 0.351131221719457,
"grad_norm": 0.877780556678772,
"learning_rate": 0.00051007658299143,
"loss": 0.2106,
"mean_token_accuracy": 0.9527023881673813,
"num_tokens": 853309.0,
"step": 97
},
{
"entropy": 1.8611579239368439,
"epoch": 0.3547511312217195,
"grad_norm": 1.0667716264724731,
"learning_rate": 0.0005100360264077325,
"loss": 0.3196,
"mean_token_accuracy": 0.9195879399776459,
"num_tokens": 861859.0,
"step": 98
},
{
"entropy": 1.821915864944458,
"epoch": 0.3583710407239819,
"grad_norm": 0.8400309681892395,
"learning_rate": 0.0005099946001059241,
"loss": 0.4036,
"mean_token_accuracy": 0.8951036781072617,
"num_tokens": 871060.0,
"step": 99
},
{
"entropy": 1.7648265063762665,
"epoch": 0.36199095022624433,
"grad_norm": 1.1391404867172241,
"learning_rate": 0.0005099523042433472,
"loss": 0.389,
"mean_token_accuracy": 0.901309460401535,
"num_tokens": 880593.0,
"step": 100
},
{
"entropy": 1.8506875336170197,
"epoch": 0.36561085972850677,
"grad_norm": 0.6923297643661499,
"learning_rate": 0.000509909138980647,
"loss": 0.2504,
"mean_token_accuracy": 0.9384842216968536,
"num_tokens": 889739.0,
"step": 101
},
{
"entropy": 1.9311015605926514,
"epoch": 0.36923076923076925,
"grad_norm": 0.9677391052246094,
"learning_rate": 0.0005098651044817704,
"loss": 0.6953,
"mean_token_accuracy": 0.8752655684947968,
"num_tokens": 898992.0,
"step": 102
},
{
"entropy": 1.9590983986854553,
"epoch": 0.3728506787330317,
"grad_norm": 0.6364567279815674,
"learning_rate": 0.0005098202009139663,
"loss": 0.4318,
"mean_token_accuracy": 0.9056479930877686,
"num_tokens": 908225.0,
"step": 103
},
{
"entropy": 1.9455370008945465,
"epoch": 0.3764705882352941,
"grad_norm": 0.6747863292694092,
"learning_rate": 0.0005097744284477839,
"loss": 0.244,
"mean_token_accuracy": 0.9428392052650452,
"num_tokens": 917134.0,
"step": 104
},
{
"entropy": 1.8632825911045074,
"epoch": 0.38009049773755654,
"grad_norm": 0.5705651044845581,
"learning_rate": 0.0005097277872570731,
"loss": 0.2508,
"mean_token_accuracy": 0.9325222969055176,
"num_tokens": 926573.0,
"step": 105
},
{
"entropy": 1.9370323717594147,
"epoch": 0.38371040723981903,
"grad_norm": 0.6298627853393555,
"learning_rate": 0.000509680277518983,
"loss": 0.2481,
"mean_token_accuracy": 0.9281332045793533,
"num_tokens": 935853.0,
"step": 106
},
{
"entropy": 2.0217572450637817,
"epoch": 0.38733031674208146,
"grad_norm": 0.5434353947639465,
"learning_rate": 0.0005096318994139617,
"loss": 0.1809,
"mean_token_accuracy": 0.9592084139585495,
"num_tokens": 944279.0,
"step": 107
},
{
"entropy": 1.9619770646095276,
"epoch": 0.3909502262443439,
"grad_norm": 0.6959638595581055,
"learning_rate": 0.0005095826531257552,
"loss": 0.1376,
"mean_token_accuracy": 0.9608310014009476,
"num_tokens": 953336.0,
"step": 108
},
{
"entropy": 2.12511146068573,
"epoch": 0.3945701357466063,
"grad_norm": 1.0152848958969116,
"learning_rate": 0.0005095325388414074,
"loss": 0.4382,
"mean_token_accuracy": 0.915201798081398,
"num_tokens": 962002.0,
"step": 109
},
{
"entropy": 2.0171878039836884,
"epoch": 0.39819004524886875,
"grad_norm": 0.8337467312812805,
"learning_rate": 0.0005094815567512587,
"loss": 0.2672,
"mean_token_accuracy": 0.9313560128211975,
"num_tokens": 970954.0,
"step": 110
},
{
"entropy": 2.1024146378040314,
"epoch": 0.40180995475113124,
"grad_norm": 0.8214333057403564,
"learning_rate": 0.0005094297070489455,
"loss": 0.3146,
"mean_token_accuracy": 0.9289091974496841,
"num_tokens": 979929.0,
"step": 111
},
{
"entropy": 2.260519325733185,
"epoch": 0.40542986425339367,
"grad_norm": 1.1298810243606567,
"learning_rate": 0.0005093769899313996,
"loss": 0.3055,
"mean_token_accuracy": 0.9213490188121796,
"num_tokens": 988477.0,
"step": 112
},
{
"entropy": 2.2228699326515198,
"epoch": 0.4090497737556561,
"grad_norm": 0.8601953983306885,
"learning_rate": 0.0005093234055988475,
"loss": 0.2738,
"mean_token_accuracy": 0.920888364315033,
"num_tokens": 997091.0,
"step": 113
},
{
"entropy": 2.2165185809135437,
"epoch": 0.41266968325791853,
"grad_norm": 0.6331561803817749,
"learning_rate": 0.0005092689542548091,
"loss": 0.2241,
"mean_token_accuracy": 0.9408514499664307,
"num_tokens": 1005866.0,
"step": 114
},
{
"entropy": 2.324040472507477,
"epoch": 0.416289592760181,
"grad_norm": 0.680496096611023,
"learning_rate": 0.0005092136361060975,
"loss": 0.2454,
"mean_token_accuracy": 0.9433349967002869,
"num_tokens": 1014277.0,
"step": 115
},
{
"entropy": 2.413789749145508,
"epoch": 0.41990950226244345,
"grad_norm": 0.7489557862281799,
"learning_rate": 0.0005091574513628183,
"loss": 0.2856,
"mean_token_accuracy": 0.934124082326889,
"num_tokens": 1023032.0,
"step": 116
},
{
"entropy": 2.4693005681037903,
"epoch": 0.4235294117647059,
"grad_norm": 0.6842612624168396,
"learning_rate": 0.0005091004002383682,
"loss": 0.2778,
"mean_token_accuracy": 0.9386793673038483,
"num_tokens": 1031883.0,
"step": 117
},
{
"entropy": 2.4351969361305237,
"epoch": 0.4271493212669683,
"grad_norm": 0.9150674343109131,
"learning_rate": 0.0005090424829494347,
"loss": 0.3151,
"mean_token_accuracy": 0.9177709072828293,
"num_tokens": 1040985.0,
"step": 118
},
{
"entropy": 2.5141562819480896,
"epoch": 0.4307692307692308,
"grad_norm": 1.0200655460357666,
"learning_rate": 0.000508983699715995,
"loss": 0.5134,
"mean_token_accuracy": 0.8835459351539612,
"num_tokens": 1049949.0,
"step": 119
},
{
"entropy": 2.479240596294403,
"epoch": 0.4343891402714932,
"grad_norm": 0.783278226852417,
"learning_rate": 0.0005089240507613151,
"loss": 0.2745,
"mean_token_accuracy": 0.9389322698116302,
"num_tokens": 1058953.0,
"step": 120
},
{
"entropy": 2.457803785800934,
"epoch": 0.43800904977375565,
"grad_norm": 0.7620834112167358,
"learning_rate": 0.0005088635363119497,
"loss": 0.3394,
"mean_token_accuracy": 0.9145695865154266,
"num_tokens": 1068624.0,
"step": 121
},
{
"entropy": 2.4909247756004333,
"epoch": 0.4416289592760181,
"grad_norm": 0.5868712067604065,
"learning_rate": 0.0005088021565977403,
"loss": 0.1726,
"mean_token_accuracy": 0.9567564129829407,
"num_tokens": 1077686.0,
"step": 122
},
{
"entropy": 2.5540462732315063,
"epoch": 0.4452488687782805,
"grad_norm": 1.1467291116714478,
"learning_rate": 0.0005087399118518148,
"loss": 0.2617,
"mean_token_accuracy": 0.9329706132411957,
"num_tokens": 1086230.0,
"step": 123
},
{
"entropy": 2.377680242061615,
"epoch": 0.448868778280543,
"grad_norm": 0.7021825909614563,
"learning_rate": 0.0005086768023105866,
"loss": 0.4124,
"mean_token_accuracy": 0.9093360006809235,
"num_tokens": 1095867.0,
"step": 124
},
{
"entropy": 2.55239599943161,
"epoch": 0.45248868778280543,
"grad_norm": 0.5947801470756531,
"learning_rate": 0.0005086128282137538,
"loss": 0.2752,
"mean_token_accuracy": 0.9248816668987274,
"num_tokens": 1105003.0,
"step": 125
},
{
"entropy": 2.4695483446121216,
"epoch": 0.45610859728506786,
"grad_norm": 1.345604658126831,
"learning_rate": 0.0005085479898042985,
"loss": 0.2577,
"mean_token_accuracy": 0.9318550229072571,
"num_tokens": 1114162.0,
"step": 126
},
{
"entropy": 2.4898732900619507,
"epoch": 0.4597285067873303,
"grad_norm": 0.8534179329872131,
"learning_rate": 0.0005084822873284848,
"loss": 0.3013,
"mean_token_accuracy": 0.9195661097764969,
"num_tokens": 1123457.0,
"step": 127
},
{
"entropy": 2.5951223969459534,
"epoch": 0.4633484162895928,
"grad_norm": 1.1677368879318237,
"learning_rate": 0.0005084157210358592,
"loss": 0.1612,
"mean_token_accuracy": 0.9599333852529526,
"num_tokens": 1131774.0,
"step": 128
},
{
"entropy": 2.7315847873687744,
"epoch": 0.4669683257918552,
"grad_norm": 0.7633224129676819,
"learning_rate": 0.0005083482911792492,
"loss": 0.2437,
"mean_token_accuracy": 0.9487509876489639,
"num_tokens": 1140301.0,
"step": 129
},
{
"entropy": 2.6348633766174316,
"epoch": 0.47058823529411764,
"grad_norm": 0.7573317885398865,
"learning_rate": 0.0005082799980147617,
"loss": 0.2426,
"mean_token_accuracy": 0.947308748960495,
"num_tokens": 1148929.0,
"step": 130
},
{
"entropy": 2.60002738237381,
"epoch": 0.47420814479638007,
"grad_norm": 1.8195319175720215,
"learning_rate": 0.0005082108418017829,
"loss": 0.1792,
"mean_token_accuracy": 0.9512491375207901,
"num_tokens": 1157682.0,
"step": 131
},
{
"entropy": 2.5319923162460327,
"epoch": 0.47782805429864256,
"grad_norm": 0.6342993378639221,
"learning_rate": 0.0005081408228029771,
"loss": 0.1843,
"mean_token_accuracy": 0.9440758228302002,
"num_tokens": 1166687.0,
"step": 132
},
{
"entropy": 2.5666881799697876,
"epoch": 0.481447963800905,
"grad_norm": 0.8979415893554688,
"learning_rate": 0.0005080699412842852,
"loss": 0.4824,
"mean_token_accuracy": 0.8837443292140961,
"num_tokens": 1175746.0,
"step": 133
},
{
"entropy": 2.6854636669158936,
"epoch": 0.4850678733031674,
"grad_norm": 0.8302125334739685,
"learning_rate": 0.0005079981975149243,
"loss": 0.267,
"mean_token_accuracy": 0.9279022663831711,
"num_tokens": 1184196.0,
"step": 134
},
{
"entropy": 2.564552128314972,
"epoch": 0.48868778280542985,
"grad_norm": 0.6785959005355835,
"learning_rate": 0.0005079255917673863,
"loss": 0.2031,
"mean_token_accuracy": 0.9463823586702347,
"num_tokens": 1192982.0,
"step": 135
},
{
"entropy": 2.673682928085327,
"epoch": 0.49230769230769234,
"grad_norm": 1.4760410785675049,
"learning_rate": 0.0005078521243174371,
"loss": 0.4791,
"mean_token_accuracy": 0.8969505727291107,
"num_tokens": 1201454.0,
"step": 136
},
{
"entropy": 2.6232714653015137,
"epoch": 0.49592760180995477,
"grad_norm": 0.7845668792724609,
"learning_rate": 0.0005077777954441157,
"loss": 0.2472,
"mean_token_accuracy": 0.9404618591070175,
"num_tokens": 1210182.0,
"step": 137
},
{
"entropy": 2.5614060163497925,
"epoch": 0.4995475113122172,
"grad_norm": 0.725419819355011,
"learning_rate": 0.0005077026054297322,
"loss": 0.3643,
"mean_token_accuracy": 0.9193316847085953,
"num_tokens": 1219487.0,
"step": 138
},
{
"entropy": 2.5907246470451355,
"epoch": 0.5031674208144796,
"grad_norm": 0.7741782665252686,
"learning_rate": 0.0005076265545598682,
"loss": 0.276,
"mean_token_accuracy": 0.9447730481624603,
"num_tokens": 1228066.0,
"step": 139
},
{
"entropy": 2.531104028224945,
"epoch": 0.5067873303167421,
"grad_norm": 0.680992603302002,
"learning_rate": 0.0005075496431233745,
"loss": 0.2004,
"mean_token_accuracy": 0.9470729678869247,
"num_tokens": 1236980.0,
"step": 140
},
{
"entropy": 2.590231478214264,
"epoch": 0.5104072398190045,
"grad_norm": 0.8260406255722046,
"learning_rate": 0.0005074718714123704,
"loss": 0.2756,
"mean_token_accuracy": 0.9301882535219193,
"num_tokens": 1245565.0,
"step": 141
},
{
"entropy": 2.4858668446540833,
"epoch": 0.5140271493212669,
"grad_norm": 0.8085922598838806,
"learning_rate": 0.0005073932397222429,
"loss": 0.2314,
"mean_token_accuracy": 0.9449103325605392,
"num_tokens": 1254366.0,
"step": 142
},
{
"entropy": 2.5374304056167603,
"epoch": 0.5176470588235295,
"grad_norm": 0.7858129143714905,
"learning_rate": 0.0005073137483516452,
"loss": 0.1622,
"mean_token_accuracy": 0.9510673582553864,
"num_tokens": 1263197.0,
"step": 143
},
{
"entropy": 2.608425199985504,
"epoch": 0.5212669683257919,
"grad_norm": 1.2698506116867065,
"learning_rate": 0.0005072333976024957,
"loss": 0.1729,
"mean_token_accuracy": 0.9509973376989365,
"num_tokens": 1271725.0,
"step": 144
},
{
"entropy": 2.437038242816925,
"epoch": 0.5248868778280543,
"grad_norm": 1.0788538455963135,
"learning_rate": 0.0005071521877799765,
"loss": 0.3344,
"mean_token_accuracy": 0.9166721999645233,
"num_tokens": 1280963.0,
"step": 145
},
{
"entropy": 2.589951515197754,
"epoch": 0.5285067873303168,
"grad_norm": 0.9228294491767883,
"learning_rate": 0.0005070701191925332,
"loss": 0.3095,
"mean_token_accuracy": 0.9239777624607086,
"num_tokens": 1289683.0,
"step": 146
},
{
"entropy": 2.575794994831085,
"epoch": 0.5321266968325792,
"grad_norm": 1.359767198562622,
"learning_rate": 0.0005069871921518726,
"loss": 0.2447,
"mean_token_accuracy": 0.9374738186597824,
"num_tokens": 1298397.0,
"step": 147
},
{
"entropy": 2.5628358721733093,
"epoch": 0.5357466063348416,
"grad_norm": 0.9870713353157043,
"learning_rate": 0.000506903406972962,
"loss": 0.4824,
"mean_token_accuracy": 0.9027767181396484,
"num_tokens": 1307191.0,
"step": 148
},
{
"entropy": 2.5513240098953247,
"epoch": 0.539366515837104,
"grad_norm": 0.7921387553215027,
"learning_rate": 0.0005068187639740286,
"loss": 0.3278,
"mean_token_accuracy": 0.9161934554576874,
"num_tokens": 1315878.0,
"step": 149
},
{
"entropy": 2.526439070701599,
"epoch": 0.5429864253393665,
"grad_norm": 0.6320391297340393,
"learning_rate": 0.000506733263476557,
"loss": 0.1701,
"mean_token_accuracy": 0.9575318098068237,
"num_tokens": 1324786.0,
"step": 150
},
{
"entropy": 2.4837265014648438,
"epoch": 0.5466063348416289,
"grad_norm": 0.5369354486465454,
"learning_rate": 0.000506646905805289,
"loss": 0.1328,
"mean_token_accuracy": 0.9636050164699554,
"num_tokens": 1333766.0,
"step": 151
},
{
"entropy": 2.5264737010002136,
"epoch": 0.5502262443438914,
"grad_norm": 0.7346852421760559,
"learning_rate": 0.0005065596912882222,
"loss": 0.2012,
"mean_token_accuracy": 0.9448132663965225,
"num_tokens": 1343004.0,
"step": 152
},
{
"entropy": 2.569309651851654,
"epoch": 0.5538461538461539,
"grad_norm": 0.9926508069038391,
"learning_rate": 0.0005064716202566082,
"loss": 0.2831,
"mean_token_accuracy": 0.9332023113965988,
"num_tokens": 1351561.0,
"step": 153
},
{
"entropy": 2.3148274421691895,
"epoch": 0.5574660633484163,
"grad_norm": 0.6301954984664917,
"learning_rate": 0.0005063826930449523,
"loss": 0.3622,
"mean_token_accuracy": 0.9349419325590134,
"num_tokens": 1360997.0,
"step": 154
},
{
"entropy": 2.497675657272339,
"epoch": 0.5610859728506787,
"grad_norm": 0.8846175670623779,
"learning_rate": 0.000506292909991011,
"loss": 0.2314,
"mean_token_accuracy": 0.9468862265348434,
"num_tokens": 1369600.0,
"step": 155
},
{
"entropy": 2.313987612724304,
"epoch": 0.5647058823529412,
"grad_norm": 0.5701894164085388,
"learning_rate": 0.0005062022714357922,
"loss": 0.2154,
"mean_token_accuracy": 0.945093959569931,
"num_tokens": 1379125.0,
"step": 156
},
{
"entropy": 2.4019755125045776,
"epoch": 0.5683257918552036,
"grad_norm": 0.8769335746765137,
"learning_rate": 0.0005061107777235524,
"loss": 0.3565,
"mean_token_accuracy": 0.9133864492177963,
"num_tokens": 1388111.0,
"step": 157
},
{
"entropy": 2.3127577900886536,
"epoch": 0.571945701357466,
"grad_norm": 1.1026453971862793,
"learning_rate": 0.0005060184292017965,
"loss": 0.2897,
"mean_token_accuracy": 0.899736076593399,
"num_tokens": 1397528.0,
"step": 158
},
{
"entropy": 2.2682697772979736,
"epoch": 0.5755656108597285,
"grad_norm": 0.5426591038703918,
"learning_rate": 0.000505925226221276,
"loss": 0.167,
"mean_token_accuracy": 0.9609879851341248,
"num_tokens": 1406809.0,
"step": 159
},
{
"entropy": 2.4639336466789246,
"epoch": 0.579185520361991,
"grad_norm": 0.6552363038063049,
"learning_rate": 0.0005058311691359875,
"loss": 0.2511,
"mean_token_accuracy": 0.9355164766311646,
"num_tokens": 1415498.0,
"step": 160
},
{
"entropy": 2.467900663614273,
"epoch": 0.5828054298642534,
"grad_norm": 0.7168154120445251,
"learning_rate": 0.000505736258303172,
"loss": 0.234,
"mean_token_accuracy": 0.9450509995222092,
"num_tokens": 1424524.0,
"step": 161
},
{
"entropy": 2.3683157563209534,
"epoch": 0.5864253393665159,
"grad_norm": 0.6433501839637756,
"learning_rate": 0.0005056404940833128,
"loss": 0.3441,
"mean_token_accuracy": 0.9261108189821243,
"num_tokens": 1434194.0,
"step": 162
},
{
"entropy": 2.4686295986175537,
"epoch": 0.5900452488687783,
"grad_norm": 0.9615177512168884,
"learning_rate": 0.0005055438768401348,
"loss": 0.1492,
"mean_token_accuracy": 0.966903567314148,
"num_tokens": 1442972.0,
"step": 163
},
{
"entropy": 2.5551892518997192,
"epoch": 0.5936651583710407,
"grad_norm": 0.4957484006881714,
"learning_rate": 0.0005054464069406023,
"loss": 0.1242,
"mean_token_accuracy": 0.969713419675827,
"num_tokens": 1451324.0,
"step": 164
},
{
"entropy": 2.554121434688568,
"epoch": 0.5972850678733032,
"grad_norm": 0.7399498224258423,
"learning_rate": 0.0005053480847549187,
"loss": 0.206,
"mean_token_accuracy": 0.9498797357082367,
"num_tokens": 1459698.0,
"step": 165
},
{
"entropy": 2.5181015729904175,
"epoch": 0.6009049773755656,
"grad_norm": 0.7433251142501831,
"learning_rate": 0.0005052489106565241,
"loss": 0.2883,
"mean_token_accuracy": 0.9419967085123062,
"num_tokens": 1468460.0,
"step": 166
},
{
"entropy": 2.3073930144309998,
"epoch": 0.604524886877828,
"grad_norm": 0.5920398831367493,
"learning_rate": 0.0005051488850220941,
"loss": 0.197,
"mean_token_accuracy": 0.952111005783081,
"num_tokens": 1477579.0,
"step": 167
},
{
"entropy": 2.532376289367676,
"epoch": 0.6081447963800904,
"grad_norm": 0.7033098936080933,
"learning_rate": 0.0005050480082315392,
"loss": 0.2122,
"mean_token_accuracy": 0.9488633275032043,
"num_tokens": 1486307.0,
"step": 168
},
{
"entropy": 2.397290349006653,
"epoch": 0.611764705882353,
"grad_norm": 0.8026869893074036,
"learning_rate": 0.0005049462806680021,
"loss": 0.2541,
"mean_token_accuracy": 0.9427233040332794,
"num_tokens": 1495152.0,
"step": 169
},
{
"entropy": 2.464823842048645,
"epoch": 0.6153846153846154,
"grad_norm": 0.6508225798606873,
"learning_rate": 0.0005048437027178571,
"loss": 0.2639,
"mean_token_accuracy": 0.9391255974769592,
"num_tokens": 1503903.0,
"step": 170
},
{
"entropy": 2.520734131336212,
"epoch": 0.6190045248868778,
"grad_norm": 0.8373616337776184,
"learning_rate": 0.0005047402747707084,
"loss": 0.3078,
"mean_token_accuracy": 0.9302930980920792,
"num_tokens": 1512588.0,
"step": 171
},
{
"entropy": 2.388108015060425,
"epoch": 0.6226244343891403,
"grad_norm": 0.6334089636802673,
"learning_rate": 0.0005046359972193884,
"loss": 0.1372,
"mean_token_accuracy": 0.9666119515895844,
"num_tokens": 1522011.0,
"step": 172
},
{
"entropy": 2.537126660346985,
"epoch": 0.6262443438914027,
"grad_norm": 0.7665116190910339,
"learning_rate": 0.0005045308704599566,
"loss": 0.2603,
"mean_token_accuracy": 0.9350012242794037,
"num_tokens": 1530767.0,
"step": 173
},
{
"entropy": 2.567205488681793,
"epoch": 0.6298642533936651,
"grad_norm": 0.8043875098228455,
"learning_rate": 0.0005044248948916977,
"loss": 0.2497,
"mean_token_accuracy": 0.9400482773780823,
"num_tokens": 1539971.0,
"step": 174
},
{
"entropy": 2.585887610912323,
"epoch": 0.6334841628959276,
"grad_norm": 0.5282150506973267,
"learning_rate": 0.0005043180709171206,
"loss": 0.1126,
"mean_token_accuracy": 0.9680279046297073,
"num_tokens": 1548971.0,
"step": 175
},
{
"entropy": 2.4289392232894897,
"epoch": 0.63710407239819,
"grad_norm": 0.6838382482528687,
"learning_rate": 0.0005042103989419563,
"loss": 0.2076,
"mean_token_accuracy": 0.9468046277761459,
"num_tokens": 1558403.0,
"step": 176
},
{
"entropy": 2.6080575585365295,
"epoch": 0.6407239819004525,
"grad_norm": 0.9058650732040405,
"learning_rate": 0.0005041018793751566,
"loss": 0.1781,
"mean_token_accuracy": 0.9432647377252579,
"num_tokens": 1567209.0,
"step": 177
},
{
"entropy": 2.5212480425834656,
"epoch": 0.644343891402715,
"grad_norm": 0.796381950378418,
"learning_rate": 0.0005039925126288929,
"loss": 0.2286,
"mean_token_accuracy": 0.9305787235498428,
"num_tokens": 1576255.0,
"step": 178
},
{
"entropy": 2.588195264339447,
"epoch": 0.6479638009049774,
"grad_norm": 0.6489388942718506,
"learning_rate": 0.0005038822991185536,
"loss": 0.1717,
"mean_token_accuracy": 0.9572225511074066,
"num_tokens": 1585335.0,
"step": 179
},
{
"entropy": 2.609215259552002,
"epoch": 0.6515837104072398,
"grad_norm": 0.8551130294799805,
"learning_rate": 0.0005037712392627441,
"loss": 0.2358,
"mean_token_accuracy": 0.9529621452093124,
"num_tokens": 1594354.0,
"step": 180
},
{
"entropy": 2.4199504256248474,
"epoch": 0.6552036199095023,
"grad_norm": 0.5775637030601501,
"learning_rate": 0.0005036593334832836,
"loss": 0.2402,
"mean_token_accuracy": 0.9437069743871689,
"num_tokens": 1603750.0,
"step": 181
},
{
"entropy": 2.516424596309662,
"epoch": 0.6588235294117647,
"grad_norm": 0.6967942118644714,
"learning_rate": 0.0005035465822052047,
"loss": 0.1624,
"mean_token_accuracy": 0.9518167823553085,
"num_tokens": 1612474.0,
"step": 182
},
{
"entropy": 2.463354170322418,
"epoch": 0.6624434389140271,
"grad_norm": 0.49672600626945496,
"learning_rate": 0.000503432985856751,
"loss": 0.1654,
"mean_token_accuracy": 0.9564716964960098,
"num_tokens": 1621563.0,
"step": 183
},
{
"entropy": 2.4456416964530945,
"epoch": 0.6660633484162896,
"grad_norm": 0.6207183003425598,
"learning_rate": 0.000503318544869376,
"loss": 0.1918,
"mean_token_accuracy": 0.9476529806852341,
"num_tokens": 1630801.0,
"step": 184
},
{
"entropy": 2.641440451145172,
"epoch": 0.669683257918552,
"grad_norm": 1.220821499824524,
"learning_rate": 0.000503203259677741,
"loss": 0.4019,
"mean_token_accuracy": 0.9172120243310928,
"num_tokens": 1639522.0,
"step": 185
},
{
"entropy": 2.6447275280952454,
"epoch": 0.6733031674208145,
"grad_norm": 0.7546490430831909,
"learning_rate": 0.000503087130719714,
"loss": 0.2484,
"mean_token_accuracy": 0.9387800246477127,
"num_tokens": 1647964.0,
"step": 186
},
{
"entropy": 2.4657886028289795,
"epoch": 0.676923076923077,
"grad_norm": 0.7679230570793152,
"learning_rate": 0.0005029701584363675,
"loss": 0.2659,
"mean_token_accuracy": 0.930300235748291,
"num_tokens": 1657181.0,
"step": 187
},
{
"entropy": 2.37973552942276,
"epoch": 0.6805429864253394,
"grad_norm": 0.7473414540290833,
"learning_rate": 0.0005028523432719772,
"loss": 0.32,
"mean_token_accuracy": 0.9233052879571915,
"num_tokens": 1666477.0,
"step": 188
},
{
"entropy": 2.5238219499588013,
"epoch": 0.6841628959276018,
"grad_norm": 0.5573673248291016,
"learning_rate": 0.0005027336856740201,
"loss": 0.1846,
"mean_token_accuracy": 0.9445535093545914,
"num_tokens": 1675002.0,
"step": 189
},
{
"entropy": 2.456815242767334,
"epoch": 0.6877828054298643,
"grad_norm": 0.47237634658813477,
"learning_rate": 0.0005026141860931728,
"loss": 0.1065,
"mean_token_accuracy": 0.964375838637352,
"num_tokens": 1683623.0,
"step": 190
},
{
"entropy": 2.548456132411957,
"epoch": 0.6914027149321267,
"grad_norm": 0.7699162364006042,
"learning_rate": 0.00050249384498331,
"loss": 0.1985,
"mean_token_accuracy": 0.9438774734735489,
"num_tokens": 1691718.0,
"step": 191
},
{
"entropy": 2.4514941573143005,
"epoch": 0.6950226244343891,
"grad_norm": 1.4113538265228271,
"learning_rate": 0.0005023726628015027,
"loss": 0.4541,
"mean_token_accuracy": 0.9207872897386551,
"num_tokens": 1699824.0,
"step": 192
},
{
"entropy": 2.2560824751853943,
"epoch": 0.6986425339366515,
"grad_norm": 0.6007948517799377,
"learning_rate": 0.0005022506400080161,
"loss": 0.1871,
"mean_token_accuracy": 0.9502484053373337,
"num_tokens": 1708722.0,
"step": 193
},
{
"entropy": 2.1833614110946655,
"epoch": 0.702262443438914,
"grad_norm": 0.7005489468574524,
"learning_rate": 0.0005021277770663082,
"loss": 0.2222,
"mean_token_accuracy": 0.9386974722146988,
"num_tokens": 1717592.0,
"step": 194
},
{
"entropy": 2.2031923830509186,
"epoch": 0.7058823529411765,
"grad_norm": 0.5830584764480591,
"learning_rate": 0.0005020040744430284,
"loss": 0.1106,
"mean_token_accuracy": 0.9719562232494354,
"num_tokens": 1726149.0,
"step": 195
},
{
"entropy": 2.199785351753235,
"epoch": 0.709502262443439,
"grad_norm": 0.7465847134590149,
"learning_rate": 0.0005018795326080149,
"loss": 0.1935,
"mean_token_accuracy": 0.9497270882129669,
"num_tokens": 1734541.0,
"step": 196
},
{
"entropy": 2.1103186309337616,
"epoch": 0.7131221719457014,
"grad_norm": 1.0782264471054077,
"learning_rate": 0.0005017541520342934,
"loss": 0.2895,
"mean_token_accuracy": 0.9274258464574814,
"num_tokens": 1743722.0,
"step": 197
},
{
"entropy": 2.2248528599739075,
"epoch": 0.7167420814479638,
"grad_norm": 0.6409780979156494,
"learning_rate": 0.0005016279331980754,
"loss": 0.1425,
"mean_token_accuracy": 0.96550352871418,
"num_tokens": 1752156.0,
"step": 198
},
{
"entropy": 2.19924658536911,
"epoch": 0.7203619909502262,
"grad_norm": 0.7019934058189392,
"learning_rate": 0.0005015008765787561,
"loss": 0.1969,
"mean_token_accuracy": 0.9429282248020172,
"num_tokens": 1760978.0,
"step": 199
},
{
"entropy": 2.297484815120697,
"epoch": 0.7239819004524887,
"grad_norm": 0.7826490998268127,
"learning_rate": 0.0005013729826589127,
"loss": 0.2399,
"mean_token_accuracy": 0.9416657984256744,
"num_tokens": 1769533.0,
"step": 200
},
{
"entropy": 2.2471498548984528,
"epoch": 0.7276018099547511,
"grad_norm": 0.621566891670227,
"learning_rate": 0.0005012442519243027,
"loss": 0.1876,
"mean_token_accuracy": 0.9460793286561966,
"num_tokens": 1778286.0,
"step": 201
},
{
"entropy": 2.2212815284729004,
"epoch": 0.7312217194570135,
"grad_norm": 0.622283935546875,
"learning_rate": 0.0005011146848638616,
"loss": 0.1617,
"mean_token_accuracy": 0.9482609927654266,
"num_tokens": 1787392.0,
"step": 202
},
{
"entropy": 2.308752655982971,
"epoch": 0.7348416289592761,
"grad_norm": 0.7263973355293274,
"learning_rate": 0.0005009842819697018,
"loss": 0.2043,
"mean_token_accuracy": 0.9378403723239899,
"num_tokens": 1796133.0,
"step": 203
},
{
"entropy": 2.3376497626304626,
"epoch": 0.7384615384615385,
"grad_norm": 0.5493630766868591,
"learning_rate": 0.0005008530437371101,
"loss": 0.1145,
"mean_token_accuracy": 0.970586434006691,
"num_tokens": 1804769.0,
"step": 204
},
{
"entropy": 2.373005509376526,
"epoch": 0.7420814479638009,
"grad_norm": 0.6313483119010925,
"learning_rate": 0.0005007209706645461,
"loss": 0.2183,
"mean_token_accuracy": 0.9472708404064178,
"num_tokens": 1813364.0,
"step": 205
},
{
"entropy": 2.468949854373932,
"epoch": 0.7457013574660634,
"grad_norm": 1.0125588178634644,
"learning_rate": 0.00050058806325364,
"loss": 0.2225,
"mean_token_accuracy": 0.9351322948932648,
"num_tokens": 1822149.0,
"step": 206
},
{
"entropy": 2.2420623898506165,
"epoch": 0.7493212669683258,
"grad_norm": 0.913761556148529,
"learning_rate": 0.0005004543220091911,
"loss": 0.2386,
"mean_token_accuracy": 0.9453927427530289,
"num_tokens": 1831533.0,
"step": 207
},
{
"entropy": 2.2966006994247437,
"epoch": 0.7529411764705882,
"grad_norm": 0.7386876940727234,
"learning_rate": 0.0005003197474391658,
"loss": 0.1768,
"mean_token_accuracy": 0.949826255440712,
"num_tokens": 1840157.0,
"step": 208
},
{
"entropy": 2.306001305580139,
"epoch": 0.7565610859728507,
"grad_norm": 0.8900741338729858,
"learning_rate": 0.0005001843400546955,
"loss": 0.2899,
"mean_token_accuracy": 0.9241485595703125,
"num_tokens": 1848898.0,
"step": 209
},
{
"entropy": 2.117514967918396,
"epoch": 0.7601809954751131,
"grad_norm": 0.644622802734375,
"learning_rate": 0.0005000481003700746,
"loss": 0.2714,
"mean_token_accuracy": 0.9299416691064835,
"num_tokens": 1858330.0,
"step": 210
},
{
"entropy": 2.3768392205238342,
"epoch": 0.7638009049773755,
"grad_norm": 0.9724471569061279,
"learning_rate": 0.0004999110289027587,
"loss": 0.1633,
"mean_token_accuracy": 0.9550061523914337,
"num_tokens": 1866806.0,
"step": 211
},
{
"entropy": 2.090679556131363,
"epoch": 0.7674208144796381,
"grad_norm": 0.5419518351554871,
"learning_rate": 0.0004997731261733628,
"loss": 0.1369,
"mean_token_accuracy": 0.9619670957326889,
"num_tokens": 1875937.0,
"step": 212
},
{
"entropy": 2.099909245967865,
"epoch": 0.7710407239819005,
"grad_norm": 0.6858121752738953,
"learning_rate": 0.0004996343927056592,
"loss": 0.1633,
"mean_token_accuracy": 0.9528832882642746,
"num_tokens": 1885145.0,
"step": 213
},
{
"entropy": 2.130059242248535,
"epoch": 0.7746606334841629,
"grad_norm": 0.7691065073013306,
"learning_rate": 0.000499494829026575,
"loss": 0.348,
"mean_token_accuracy": 0.9162366837263107,
"num_tokens": 1894255.0,
"step": 214
},
{
"entropy": 2.191373586654663,
"epoch": 0.7782805429864253,
"grad_norm": 0.7427324652671814,
"learning_rate": 0.000499354435666191,
"loss": 0.3373,
"mean_token_accuracy": 0.9311849176883698,
"num_tokens": 1902981.0,
"step": 215
},
{
"entropy": 2.1425398886203766,
"epoch": 0.7819004524886878,
"grad_norm": 0.6410383582115173,
"learning_rate": 0.0004992132131577392,
"loss": 0.2079,
"mean_token_accuracy": 0.949742391705513,
"num_tokens": 1912253.0,
"step": 216
},
{
"entropy": 2.1396586298942566,
"epoch": 0.7855203619909502,
"grad_norm": 0.5689850449562073,
"learning_rate": 0.0004990711620376003,
"loss": 0.1999,
"mean_token_accuracy": 0.946034774184227,
"num_tokens": 1921409.0,
"step": 217
},
{
"entropy": 2.2237865328788757,
"epoch": 0.7891402714932126,
"grad_norm": 0.6408923864364624,
"learning_rate": 0.0004989282828453029,
"loss": 0.2452,
"mean_token_accuracy": 0.9510752111673355,
"num_tokens": 1930397.0,
"step": 218
},
{
"entropy": 2.234771251678467,
"epoch": 0.7927601809954751,
"grad_norm": 0.751447856426239,
"learning_rate": 0.0004987845761235203,
"loss": 0.3057,
"mean_token_accuracy": 0.9217256307601929,
"num_tokens": 1939172.0,
"step": 219
},
{
"entropy": 2.2653815746307373,
"epoch": 0.7963800904977375,
"grad_norm": 0.751455545425415,
"learning_rate": 0.0004986400424180688,
"loss": 0.3245,
"mean_token_accuracy": 0.9256318956613541,
"num_tokens": 1947979.0,
"step": 220
},
{
"entropy": 2.3123483061790466,
"epoch": 0.8,
"grad_norm": 0.5939492583274841,
"learning_rate": 0.0004984946822779061,
"loss": 0.2429,
"mean_token_accuracy": 0.9333402067422867,
"num_tokens": 1956814.0,
"step": 221
},
{
"entropy": 2.3289234042167664,
"epoch": 0.8036199095022625,
"grad_norm": 0.5591994524002075,
"learning_rate": 0.0004983484962551284,
"loss": 0.1507,
"mean_token_accuracy": 0.96376833319664,
"num_tokens": 1965641.0,
"step": 222
},
{
"entropy": 2.4314023852348328,
"epoch": 0.8072398190045249,
"grad_norm": 0.5805783271789551,
"learning_rate": 0.0004982014849049687,
"loss": 0.2049,
"mean_token_accuracy": 0.9586948156356812,
"num_tokens": 1974180.0,
"step": 223
},
{
"entropy": 2.3639765977859497,
"epoch": 0.8108597285067873,
"grad_norm": 0.6924490332603455,
"learning_rate": 0.0004980536487857951,
"loss": 0.2137,
"mean_token_accuracy": 0.9441423565149307,
"num_tokens": 1982744.0,
"step": 224
},
{
"entropy": 2.3361759781837463,
"epoch": 0.8144796380090498,
"grad_norm": 0.4579620361328125,
"learning_rate": 0.0004979049884591077,
"loss": 0.1041,
"mean_token_accuracy": 0.9753208309412003,
"num_tokens": 1991583.0,
"step": 225
},
{
"entropy": 2.286989688873291,
"epoch": 0.8180995475113122,
"grad_norm": 0.6489312052726746,
"learning_rate": 0.0004977555044895377,
"loss": 0.2131,
"mean_token_accuracy": 0.9520440250635147,
"num_tokens": 2000193.0,
"step": 226
},
{
"entropy": 2.288672834634781,
"epoch": 0.8217194570135746,
"grad_norm": 0.7738961577415466,
"learning_rate": 0.0004976051974448441,
"loss": 0.325,
"mean_token_accuracy": 0.9060750156641006,
"num_tokens": 2009233.0,
"step": 227
},
{
"entropy": 2.288076102733612,
"epoch": 0.8253393665158371,
"grad_norm": 0.7042292356491089,
"learning_rate": 0.0004974540678959123,
"loss": 0.2206,
"mean_token_accuracy": 0.94980289041996,
"num_tokens": 2018417.0,
"step": 228
},
{
"entropy": 2.217707335948944,
"epoch": 0.8289592760180996,
"grad_norm": 0.6834208369255066,
"learning_rate": 0.0004973021164167515,
"loss": 0.2907,
"mean_token_accuracy": 0.951058641076088,
"num_tokens": 2027822.0,
"step": 229
},
{
"entropy": 2.1610691249370575,
"epoch": 0.832579185520362,
"grad_norm": 0.665044903755188,
"learning_rate": 0.0004971493435844928,
"loss": 0.2387,
"mean_token_accuracy": 0.9506549835205078,
"num_tokens": 2036983.0,
"step": 230
},
{
"entropy": 2.321135401725769,
"epoch": 0.8361990950226245,
"grad_norm": 0.8208273649215698,
"learning_rate": 0.0004969957499793869,
"loss": 0.2399,
"mean_token_accuracy": 0.9435176253318787,
"num_tokens": 2045574.0,
"step": 231
},
{
"entropy": 2.1943611800670624,
"epoch": 0.8398190045248869,
"grad_norm": 0.6293840408325195,
"learning_rate": 0.0004968413361848019,
"loss": 0.1784,
"mean_token_accuracy": 0.9559669345617294,
"num_tokens": 2054336.0,
"step": 232
},
{
"entropy": 2.2722273468971252,
"epoch": 0.8434389140271493,
"grad_norm": 0.6535817980766296,
"learning_rate": 0.0004966861027872211,
"loss": 0.1675,
"mean_token_accuracy": 0.9532535970211029,
"num_tokens": 2063225.0,
"step": 233
},
{
"entropy": 2.3278334736824036,
"epoch": 0.8470588235294118,
"grad_norm": 1.1610206365585327,
"learning_rate": 0.0004965300503762406,
"loss": 0.1588,
"mean_token_accuracy": 0.9641145765781403,
"num_tokens": 2071738.0,
"step": 234
},
{
"entropy": 2.202972888946533,
"epoch": 0.8506787330316742,
"grad_norm": 0.4811885356903076,
"learning_rate": 0.0004963731795445675,
"loss": 0.0813,
"mean_token_accuracy": 0.9766911715269089,
"num_tokens": 2080375.0,
"step": 235
},
{
"entropy": 2.2433705925941467,
"epoch": 0.8542986425339366,
"grad_norm": 0.8113318681716919,
"learning_rate": 0.0004962154908880171,
"loss": 0.2965,
"mean_token_accuracy": 0.9290606826543808,
"num_tokens": 2089522.0,
"step": 236
},
{
"entropy": 2.2168884873390198,
"epoch": 0.857918552036199,
"grad_norm": 0.6128959655761719,
"learning_rate": 0.0004960569850055111,
"loss": 0.1724,
"mean_token_accuracy": 0.9603384286165237,
"num_tokens": 2098162.0,
"step": 237
},
{
"entropy": 2.2738255858421326,
"epoch": 0.8615384615384616,
"grad_norm": 0.8557195663452148,
"learning_rate": 0.0004958976624990749,
"loss": 0.2596,
"mean_token_accuracy": 0.9487071484327316,
"num_tokens": 2106984.0,
"step": 238
},
{
"entropy": 2.2031425833702087,
"epoch": 0.865158371040724,
"grad_norm": 0.6621816158294678,
"learning_rate": 0.0004957375239738359,
"loss": 0.232,
"mean_token_accuracy": 0.9525040090084076,
"num_tokens": 2116040.0,
"step": 239
},
{
"entropy": 2.374737858772278,
"epoch": 0.8687782805429864,
"grad_norm": 0.8481062054634094,
"learning_rate": 0.0004955765700380204,
"loss": 0.2516,
"mean_token_accuracy": 0.9396061599254608,
"num_tokens": 2124862.0,
"step": 240
},
{
"entropy": 2.266704559326172,
"epoch": 0.8723981900452489,
"grad_norm": 0.6284282803535461,
"learning_rate": 0.0004954148013029521,
"loss": 0.3244,
"mean_token_accuracy": 0.9381244331598282,
"num_tokens": 2134018.0,
"step": 241
},
{
"entropy": 2.3935859203338623,
"epoch": 0.8760180995475113,
"grad_norm": 1.1564176082611084,
"learning_rate": 0.0004952522183830493,
"loss": 0.2706,
"mean_token_accuracy": 0.9297053664922714,
"num_tokens": 2142745.0,
"step": 242
},
{
"entropy": 2.281618118286133,
"epoch": 0.8796380090497737,
"grad_norm": 0.5324040055274963,
"learning_rate": 0.0004950888218958225,
"loss": 0.1573,
"mean_token_accuracy": 0.9568462073802948,
"num_tokens": 2151607.0,
"step": 243
},
{
"entropy": 2.230749189853668,
"epoch": 0.8832579185520362,
"grad_norm": 0.680780291557312,
"learning_rate": 0.0004949246124618726,
"loss": 0.1956,
"mean_token_accuracy": 0.9479999989271164,
"num_tokens": 2160904.0,
"step": 244
},
{
"entropy": 2.21382600069046,
"epoch": 0.8868778280542986,
"grad_norm": 0.6321626305580139,
"learning_rate": 0.0004947595907048877,
"loss": 0.2444,
"mean_token_accuracy": 0.9376699328422546,
"num_tokens": 2170021.0,
"step": 245
},
{
"entropy": 2.3659472465515137,
"epoch": 0.890497737556561,
"grad_norm": 0.9778954982757568,
"learning_rate": 0.0004945937572516417,
"loss": 0.3783,
"mean_token_accuracy": 0.9104805737733841,
"num_tokens": 2178995.0,
"step": 246
},
{
"entropy": 2.3233078718185425,
"epoch": 0.8941176470588236,
"grad_norm": 0.53229820728302,
"learning_rate": 0.0004944271127319909,
"loss": 0.0759,
"mean_token_accuracy": 0.9791453778743744,
"num_tokens": 2187823.0,
"step": 247
},
{
"entropy": 2.2469444274902344,
"epoch": 0.897737556561086,
"grad_norm": 0.6367197632789612,
"learning_rate": 0.0004942596577788728,
"loss": 0.2677,
"mean_token_accuracy": 0.9392691254615784,
"num_tokens": 2196923.0,
"step": 248
},
{
"entropy": 2.4508965611457825,
"epoch": 0.9013574660633484,
"grad_norm": 0.6042234897613525,
"learning_rate": 0.0004940913930283024,
"loss": 0.1102,
"mean_token_accuracy": 0.9762090593576431,
"num_tokens": 2205400.0,
"step": 249
},
{
"entropy": 2.365670144557953,
"epoch": 0.9049773755656109,
"grad_norm": 0.6490639448165894,
"learning_rate": 0.0004939223191193707,
"loss": 0.1532,
"mean_token_accuracy": 0.9489114433526993,
"num_tokens": 2214201.0,
"step": 250
},
{
"entropy": 2.4013625383377075,
"epoch": 0.9085972850678733,
"grad_norm": 0.5969854593276978,
"learning_rate": 0.0004937524366942419,
"loss": 0.1273,
"mean_token_accuracy": 0.9682519882917404,
"num_tokens": 2222979.0,
"step": 251
},
{
"entropy": 2.4402357935905457,
"epoch": 0.9122171945701357,
"grad_norm": 0.7559595704078674,
"learning_rate": 0.0004935817463981513,
"loss": 0.1979,
"mean_token_accuracy": 0.9483373910188675,
"num_tokens": 2231169.0,
"step": 252
},
{
"entropy": 2.4673256874084473,
"epoch": 0.9158371040723982,
"grad_norm": 0.8663308620452881,
"learning_rate": 0.0004934102488794023,
"loss": 0.2453,
"mean_token_accuracy": 0.9408974200487137,
"num_tokens": 2240099.0,
"step": 253
},
{
"entropy": 2.426262080669403,
"epoch": 0.9194570135746606,
"grad_norm": 0.7920467257499695,
"learning_rate": 0.0004932379447893643,
"loss": 0.2828,
"mean_token_accuracy": 0.9319239109754562,
"num_tokens": 2249088.0,
"step": 254
},
{
"entropy": 2.5018852949142456,
"epoch": 0.9230769230769231,
"grad_norm": 0.7216617465019226,
"learning_rate": 0.0004930648347824701,
"loss": 0.1647,
"mean_token_accuracy": 0.9551804810762405,
"num_tokens": 2257710.0,
"step": 255
},
{
"entropy": 2.43031644821167,
"epoch": 0.9266968325791856,
"grad_norm": 0.646794319152832,
"learning_rate": 0.0004928909195162138,
"loss": 0.1328,
"mean_token_accuracy": 0.9663553237915039,
"num_tokens": 2266883.0,
"step": 256
},
{
"entropy": 2.5406370759010315,
"epoch": 0.930316742081448,
"grad_norm": 0.5482825040817261,
"learning_rate": 0.0004927161996511474,
"loss": 0.1872,
"mean_token_accuracy": 0.9557004272937775,
"num_tokens": 2275728.0,
"step": 257
},
{
"entropy": 2.636320471763611,
"epoch": 0.9339366515837104,
"grad_norm": 0.7454632520675659,
"learning_rate": 0.0004925406758508797,
"loss": 0.1461,
"mean_token_accuracy": 0.9578974395990372,
"num_tokens": 2284319.0,
"step": 258
},
{
"entropy": 2.6067575812339783,
"epoch": 0.9375565610859729,
"grad_norm": 0.8695769309997559,
"learning_rate": 0.000492364348782072,
"loss": 0.1712,
"mean_token_accuracy": 0.9652896523475647,
"num_tokens": 2293035.0,
"step": 259
},
{
"entropy": 2.5837162137031555,
"epoch": 0.9411764705882353,
"grad_norm": 0.5752995014190674,
"learning_rate": 0.0004921872191144371,
"loss": 0.1398,
"mean_token_accuracy": 0.9553333520889282,
"num_tokens": 2301802.0,
"step": 260
},
{
"entropy": 2.713033616542816,
"epoch": 0.9447963800904977,
"grad_norm": 0.85626620054245,
"learning_rate": 0.0004920092875207363,
"loss": 0.2207,
"mean_token_accuracy": 0.9468346834182739,
"num_tokens": 2309981.0,
"step": 261
},
{
"entropy": 2.400112509727478,
"epoch": 0.9484162895927601,
"grad_norm": 0.6766608953475952,
"learning_rate": 0.0004918305546767764,
"loss": 0.1644,
"mean_token_accuracy": 0.9502440094947815,
"num_tokens": 2319212.0,
"step": 262
},
{
"entropy": 2.503827154636383,
"epoch": 0.9520361990950226,
"grad_norm": 0.789470911026001,
"learning_rate": 0.0004916510212614072,
"loss": 0.2117,
"mean_token_accuracy": 0.9454390555620193,
"num_tokens": 2328234.0,
"step": 263
},
{
"entropy": 2.669040560722351,
"epoch": 0.9556561085972851,
"grad_norm": 0.9579212069511414,
"learning_rate": 0.0004914706879565197,
"loss": 0.2193,
"mean_token_accuracy": 0.9321542829275131,
"num_tokens": 2336543.0,
"step": 264
},
{
"entropy": 2.507073998451233,
"epoch": 0.9592760180995475,
"grad_norm": 0.5315744876861572,
"learning_rate": 0.000491289555447043,
"loss": 0.0851,
"mean_token_accuracy": 0.9771326780319214,
"num_tokens": 2345292.0,
"step": 265
},
{
"entropy": 2.4205283522605896,
"epoch": 0.96289592760181,
"grad_norm": 0.5441373586654663,
"learning_rate": 0.000491107624420941,
"loss": 0.1323,
"mean_token_accuracy": 0.9541790336370468,
"num_tokens": 2354242.0,
"step": 266
},
{
"entropy": 2.3817258477211,
"epoch": 0.9665158371040724,
"grad_norm": 0.5946238040924072,
"learning_rate": 0.0004909248955692111,
"loss": 0.1708,
"mean_token_accuracy": 0.947738841176033,
"num_tokens": 2363183.0,
"step": 267
},
{
"entropy": 2.5073485374450684,
"epoch": 0.9701357466063348,
"grad_norm": 0.6979324817657471,
"learning_rate": 0.0004907413695858812,
"loss": 0.2099,
"mean_token_accuracy": 0.9423733651638031,
"num_tokens": 2371885.0,
"step": 268
},
{
"entropy": 2.5705007910728455,
"epoch": 0.9737556561085973,
"grad_norm": 0.8203943967819214,
"learning_rate": 0.0004905570471680057,
"loss": 0.217,
"mean_token_accuracy": 0.9511639326810837,
"num_tokens": 2380316.0,
"step": 269
},
{
"entropy": 2.2677993774414062,
"epoch": 0.9773755656108597,
"grad_norm": 0.5840432047843933,
"learning_rate": 0.0004903719290156649,
"loss": 0.2364,
"mean_token_accuracy": 0.9407180696725845,
"num_tokens": 2389723.0,
"step": 270
},
{
"entropy": 2.477886915206909,
"epoch": 0.9809954751131221,
"grad_norm": 0.818929135799408,
"learning_rate": 0.0004901860158319612,
"loss": 0.1707,
"mean_token_accuracy": 0.9579566866159439,
"num_tokens": 2398388.0,
"step": 271
},
{
"entropy": 2.549662232398987,
"epoch": 0.9846153846153847,
"grad_norm": 0.7804781198501587,
"learning_rate": 0.0004899993083230166,
"loss": 0.2944,
"mean_token_accuracy": 0.9381812512874603,
"num_tokens": 2406929.0,
"step": 272
},
{
"entropy": 2.4465304017066956,
"epoch": 0.9882352941176471,
"grad_norm": 0.5218799114227295,
"learning_rate": 0.0004898118071979699,
"loss": 0.1661,
"mean_token_accuracy": 0.9500218778848648,
"num_tokens": 2415631.0,
"step": 273
},
{
"entropy": 2.5852283239364624,
"epoch": 0.9918552036199095,
"grad_norm": 0.591163158416748,
"learning_rate": 0.0004896235131689743,
"loss": 0.2005,
"mean_token_accuracy": 0.9455285370349884,
"num_tokens": 2424091.0,
"step": 274
},
{
"entropy": 2.478701651096344,
"epoch": 0.995475113122172,
"grad_norm": 1.0615383386611938,
"learning_rate": 0.0004894344269511945,
"loss": 0.2864,
"mean_token_accuracy": 0.9306265562772751,
"num_tokens": 2432705.0,
"step": 275
},
{
"entropy": 2.600062847137451,
"epoch": 0.9990950226244344,
"grad_norm": 0.7011683583259583,
"learning_rate": 0.0004892445492628043,
"loss": 0.1664,
"mean_token_accuracy": 0.9547821134328842,
"num_tokens": 2440992.0,
"step": 276
},
{
"entropy": 2.3411240577697754,
"epoch": 1.0,
"grad_norm": 0.4944029450416565,
"learning_rate": 0.000489053880824983,
"loss": 0.022,
"mean_token_accuracy": 0.9929078221321106,
"num_tokens": 2441725.0,
"step": 277
},
{
"epoch": 1.0,
"eval_entropy": 2.5467925265552553,
"eval_loss": 0.21274714171886444,
"eval_mean_token_accuracy": 0.9444630068492114,
"eval_num_tokens": 2441725.0,
"eval_runtime": 116.0434,
"eval_samples_per_second": 3.18,
"eval_steps_per_second": 1.06,
"step": 277
},
{
"entropy": 2.609170138835907,
"epoch": 1.0036199095022624,
"grad_norm": 1.0785081386566162,
"learning_rate": 0.0004888624223619136,
"loss": 0.3167,
"mean_token_accuracy": 0.9296800643205643,
"num_tokens": 2450193.0,
"step": 278
},
{
"entropy": 2.497025430202484,
"epoch": 1.0072398190045249,
"grad_norm": 0.5221985578536987,
"learning_rate": 0.0004886701746007801,
"loss": 0.0854,
"mean_token_accuracy": 0.9753399342298508,
"num_tokens": 2459309.0,
"step": 279
},
{
"entropy": 2.5487362146377563,
"epoch": 1.0108597285067873,
"grad_norm": 0.5161958336830139,
"learning_rate": 0.0004884771382717638,
"loss": 0.0819,
"mean_token_accuracy": 0.9748431146144867,
"num_tokens": 2467844.0,
"step": 280
},
{
"entropy": 2.5276209115982056,
"epoch": 1.0144796380090497,
"grad_norm": 0.5731730461120605,
"learning_rate": 0.0004882833141080412,
"loss": 0.1541,
"mean_token_accuracy": 0.9567564427852631,
"num_tokens": 2476894.0,
"step": 281
},
{
"entropy": 2.4442760348320007,
"epoch": 1.0180995475113122,
"grad_norm": 0.7120366096496582,
"learning_rate": 0.0004880887028457813,
"loss": 0.1945,
"mean_token_accuracy": 0.9465379565954208,
"num_tokens": 2485971.0,
"step": 282
},
{
"entropy": 2.4069360494613647,
"epoch": 1.0217194570135746,
"grad_norm": 0.7468647360801697,
"learning_rate": 0.00048789330522414244,
"loss": 0.2345,
"mean_token_accuracy": 0.9446765780448914,
"num_tokens": 2495043.0,
"step": 283
},
{
"entropy": 2.468382716178894,
"epoch": 1.025339366515837,
"grad_norm": 0.666231632232666,
"learning_rate": 0.0004876971219852697,
"loss": 0.1779,
"mean_token_accuracy": 0.9534575343132019,
"num_tokens": 2503672.0,
"step": 284
},
{
"entropy": 2.4362316727638245,
"epoch": 1.0289592760180994,
"grad_norm": 0.8445858955383301,
"learning_rate": 0.000487500153874292,
"loss": 0.1698,
"mean_token_accuracy": 0.953661322593689,
"num_tokens": 2512322.0,
"step": 285
},
{
"entropy": 2.364333391189575,
"epoch": 1.032579185520362,
"grad_norm": 0.4805246591567993,
"learning_rate": 0.0004873024016393193,
"loss": 0.0778,
"mean_token_accuracy": 0.9824571758508682,
"num_tokens": 2520791.0,
"step": 286
},
{
"entropy": 2.223461151123047,
"epoch": 1.0361990950226245,
"grad_norm": 0.648465096950531,
"learning_rate": 0.0004871038660314399,
"loss": 0.2593,
"mean_token_accuracy": 0.9419913589954376,
"num_tokens": 2530082.0,
"step": 287
},
{
"entropy": 2.3313387036323547,
"epoch": 1.039819004524887,
"grad_norm": 0.6912294626235962,
"learning_rate": 0.00048690454780471725,
"loss": 0.1354,
"mean_token_accuracy": 0.9561934620141983,
"num_tokens": 2538728.0,
"step": 288
},
{
"entropy": 2.191806375980377,
"epoch": 1.0434389140271494,
"grad_norm": 0.8620694279670715,
"learning_rate": 0.0004867044477161874,
"loss": 0.1103,
"mean_token_accuracy": 0.968692272901535,
"num_tokens": 2547219.0,
"step": 289
},
{
"entropy": 2.167125165462494,
"epoch": 1.0470588235294118,
"grad_norm": 0.6192149519920349,
"learning_rate": 0.0004865035665258559,
"loss": 0.1288,
"mean_token_accuracy": 0.9643534421920776,
"num_tokens": 2555940.0,
"step": 290
},
{
"entropy": 2.2750985622406006,
"epoch": 1.0506787330316743,
"grad_norm": 1.7459602355957031,
"learning_rate": 0.0004863019049966953,
"loss": 0.393,
"mean_token_accuracy": 0.9146681725978851,
"num_tokens": 2564362.0,
"step": 291
},
{
"entropy": 2.236129105091095,
"epoch": 1.0542986425339367,
"grad_norm": 0.6311184167861938,
"learning_rate": 0.0004860994638946416,
"loss": 0.1536,
"mean_token_accuracy": 0.9636097103357315,
"num_tokens": 2573316.0,
"step": 292
},
{
"entropy": 2.2642418146133423,
"epoch": 1.0579185520361991,
"grad_norm": 0.6023411154747009,
"learning_rate": 0.000485896243988592,
"loss": 0.191,
"mean_token_accuracy": 0.9476015418767929,
"num_tokens": 2581835.0,
"step": 293
},
{
"entropy": 2.3589024543762207,
"epoch": 1.0615384615384615,
"grad_norm": 0.48049232363700867,
"learning_rate": 0.0004856922460504016,
"loss": 0.1017,
"mean_token_accuracy": 0.9713075459003448,
"num_tokens": 2590317.0,
"step": 294
},
{
"entropy": 2.4141315817832947,
"epoch": 1.065158371040724,
"grad_norm": 0.8456616997718811,
"learning_rate": 0.0004854874708548806,
"loss": 0.1422,
"mean_token_accuracy": 0.9622762501239777,
"num_tokens": 2598538.0,
"step": 295
},
{
"entropy": 2.069903999567032,
"epoch": 1.0687782805429864,
"grad_norm": 0.7641116380691528,
"learning_rate": 0.0004852819191797912,
"loss": 0.2185,
"mean_token_accuracy": 0.9464851468801498,
"num_tokens": 2608219.0,
"step": 296
},
{
"entropy": 2.163217008113861,
"epoch": 1.0723981900452488,
"grad_norm": 0.546085000038147,
"learning_rate": 0.0004850755918058449,
"loss": 0.1035,
"mean_token_accuracy": 0.9708487540483475,
"num_tokens": 2617261.0,
"step": 297
},
{
"entropy": 2.2678662836551666,
"epoch": 1.0760180995475113,
"grad_norm": 0.8699386119842529,
"learning_rate": 0.0004848684895166994,
"loss": 0.2384,
"mean_token_accuracy": 0.9486480504274368,
"num_tokens": 2626144.0,
"step": 298
},
{
"entropy": 2.13065105676651,
"epoch": 1.0796380090497737,
"grad_norm": 0.44323107600212097,
"learning_rate": 0.00048466061309895554,
"loss": 0.0818,
"mean_token_accuracy": 0.9722468554973602,
"num_tokens": 2635626.0,
"step": 299
},
{
"entropy": 2.184772551059723,
"epoch": 1.0832579185520361,
"grad_norm": 0.7928256988525391,
"learning_rate": 0.0004844519633421545,
"loss": 0.2378,
"mean_token_accuracy": 0.9477885961532593,
"num_tokens": 2644674.0,
"step": 300
},
{
"entropy": 2.1669145822525024,
"epoch": 1.0868778280542986,
"grad_norm": 0.5570158362388611,
"learning_rate": 0.00048424254103877456,
"loss": 0.1434,
"mean_token_accuracy": 0.9587411731481552,
"num_tokens": 2653658.0,
"step": 301
},
{
"entropy": 2.3057579398155212,
"epoch": 1.090497737556561,
"grad_norm": 0.9084392189979553,
"learning_rate": 0.00048403234698422837,
"loss": 0.3831,
"mean_token_accuracy": 0.8896283358335495,
"num_tokens": 2662350.0,
"step": 302
},
{
"entropy": 2.1741657853126526,
"epoch": 1.0941176470588236,
"grad_norm": 0.6791238784790039,
"learning_rate": 0.0004838213819768597,
"loss": 0.1648,
"mean_token_accuracy": 0.9576362520456314,
"num_tokens": 2671450.0,
"step": 303
},
{
"entropy": 2.089864045381546,
"epoch": 1.097737556561086,
"grad_norm": 0.5696312189102173,
"learning_rate": 0.0004836096468179406,
"loss": 0.1269,
"mean_token_accuracy": 0.9658148884773254,
"num_tokens": 2680581.0,
"step": 304
},
{
"entropy": 2.2657605409622192,
"epoch": 1.1013574660633485,
"grad_norm": 1.605503797531128,
"learning_rate": 0.0004833971423116682,
"loss": 0.1027,
"mean_token_accuracy": 0.9762597978115082,
"num_tokens": 2689001.0,
"step": 305
},
{
"entropy": 2.079287111759186,
"epoch": 1.104977375565611,
"grad_norm": 0.5804780721664429,
"learning_rate": 0.00048318386926516157,
"loss": 0.1137,
"mean_token_accuracy": 0.9633719325065613,
"num_tokens": 2698050.0,
"step": 306
},
{
"entropy": 2.201345145702362,
"epoch": 1.1085972850678734,
"grad_norm": 0.8606241941452026,
"learning_rate": 0.000482969828488459,
"loss": 0.2124,
"mean_token_accuracy": 0.9472681730985641,
"num_tokens": 2706704.0,
"step": 307
},
{
"entropy": 2.095236599445343,
"epoch": 1.1122171945701358,
"grad_norm": 0.7078782320022583,
"learning_rate": 0.0004827550207945147,
"loss": 0.1957,
"mean_token_accuracy": 0.9564679116010666,
"num_tokens": 2715745.0,
"step": 308
},
{
"entropy": 2.186302363872528,
"epoch": 1.1158371040723982,
"grad_norm": 0.7166503667831421,
"learning_rate": 0.0004825394469991956,
"loss": 0.1539,
"mean_token_accuracy": 0.9662427455186844,
"num_tokens": 2724296.0,
"step": 309
},
{
"entropy": 2.052559405565262,
"epoch": 1.1194570135746607,
"grad_norm": 0.6510501503944397,
"learning_rate": 0.00048232310792127846,
"loss": 0.1831,
"mean_token_accuracy": 0.9533994495868683,
"num_tokens": 2733482.0,
"step": 310
},
{
"entropy": 2.093154102563858,
"epoch": 1.123076923076923,
"grad_norm": 0.711121678352356,
"learning_rate": 0.0004821060043824466,
"loss": 0.2315,
"mean_token_accuracy": 0.9381555914878845,
"num_tokens": 2742912.0,
"step": 311
},
{
"entropy": 2.188497006893158,
"epoch": 1.1266968325791855,
"grad_norm": 0.6782490015029907,
"learning_rate": 0.00048188813720728707,
"loss": 0.2,
"mean_token_accuracy": 0.9501812607049942,
"num_tokens": 2751808.0,
"step": 312
},
{
"entropy": 2.0495824217796326,
"epoch": 1.130316742081448,
"grad_norm": 0.7644634246826172,
"learning_rate": 0.00048166950722328697,
"loss": 0.2152,
"mean_token_accuracy": 0.9440928995609283,
"num_tokens": 2761066.0,
"step": 313
},
{
"entropy": 2.1707025468349457,
"epoch": 1.1339366515837104,
"grad_norm": 0.655131459236145,
"learning_rate": 0.00048145011526083106,
"loss": 0.1637,
"mean_token_accuracy": 0.9500558227300644,
"num_tokens": 2769870.0,
"step": 314
},
{
"entropy": 2.1047372221946716,
"epoch": 1.1375565610859728,
"grad_norm": 0.5353516936302185,
"learning_rate": 0.0004812299621531979,
"loss": 0.1705,
"mean_token_accuracy": 0.9455999433994293,
"num_tokens": 2779383.0,
"step": 315
},
{
"entropy": 2.1921610236167908,
"epoch": 1.1411764705882352,
"grad_norm": 0.8998016119003296,
"learning_rate": 0.00048100904873655696,
"loss": 0.3918,
"mean_token_accuracy": 0.9382697492837906,
"num_tokens": 2788386.0,
"step": 316
},
{
"entropy": 2.0850723683834076,
"epoch": 1.1447963800904977,
"grad_norm": 0.867432713508606,
"learning_rate": 0.0004807873758499656,
"loss": 0.2196,
"mean_token_accuracy": 0.9498324394226074,
"num_tokens": 2797496.0,
"step": 317
},
{
"entropy": 2.1980925798416138,
"epoch": 1.14841628959276,
"grad_norm": 0.6076980233192444,
"learning_rate": 0.00048056494433536577,
"loss": 0.1086,
"mean_token_accuracy": 0.9642161130905151,
"num_tokens": 2805836.0,
"step": 318
},
{
"entropy": 2.15611070394516,
"epoch": 1.1520361990950225,
"grad_norm": 0.6276211738586426,
"learning_rate": 0.0004803417550375806,
"loss": 0.1463,
"mean_token_accuracy": 0.9622830748558044,
"num_tokens": 2814404.0,
"step": 319
},
{
"entropy": 2.0017230808734894,
"epoch": 1.155656108597285,
"grad_norm": 0.5840948820114136,
"learning_rate": 0.0004801178088043115,
"loss": 0.1869,
"mean_token_accuracy": 0.9506777077913284,
"num_tokens": 2823786.0,
"step": 320
},
{
"entropy": 2.1539418697357178,
"epoch": 1.1592760180995474,
"grad_norm": 1.074331283569336,
"learning_rate": 0.0004798931064861349,
"loss": 0.2797,
"mean_token_accuracy": 0.9271649420261383,
"num_tokens": 2832374.0,
"step": 321
},
{
"entropy": 1.930726408958435,
"epoch": 1.16289592760181,
"grad_norm": 0.5121958255767822,
"learning_rate": 0.0004796676489364988,
"loss": 0.1579,
"mean_token_accuracy": 0.9582571685314178,
"num_tokens": 2841561.0,
"step": 322
},
{
"entropy": 2.0205810368061066,
"epoch": 1.1665158371040725,
"grad_norm": 0.6360969543457031,
"learning_rate": 0.00047944143701171966,
"loss": 0.1582,
"mean_token_accuracy": 0.9620308429002762,
"num_tokens": 2850171.0,
"step": 323
},
{
"entropy": 1.9655758142471313,
"epoch": 1.170135746606335,
"grad_norm": 0.6647385358810425,
"learning_rate": 0.0004792144715709792,
"loss": 0.1594,
"mean_token_accuracy": 0.954497441649437,
"num_tokens": 2858905.0,
"step": 324
},
{
"entropy": 1.9725223183631897,
"epoch": 1.1737556561085973,
"grad_norm": 0.6429229974746704,
"learning_rate": 0.0004789867534763211,
"loss": 0.1407,
"mean_token_accuracy": 0.9645214527845383,
"num_tokens": 2867533.0,
"step": 325
},
{
"entropy": 1.9473685026168823,
"epoch": 1.1773755656108598,
"grad_norm": 0.811651349067688,
"learning_rate": 0.0004787582835926477,
"loss": 0.1608,
"mean_token_accuracy": 0.9479968994855881,
"num_tokens": 2876286.0,
"step": 326
},
{
"entropy": 1.8863109350204468,
"epoch": 1.1809954751131222,
"grad_norm": 0.5587059855461121,
"learning_rate": 0.00047852906278771686,
"loss": 0.131,
"mean_token_accuracy": 0.9684520065784454,
"num_tokens": 2885667.0,
"step": 327
},
{
"entropy": 1.8288891315460205,
"epoch": 1.1846153846153846,
"grad_norm": 0.8450536131858826,
"learning_rate": 0.0004782990919321383,
"loss": 0.2224,
"mean_token_accuracy": 0.9377491921186447,
"num_tokens": 2894765.0,
"step": 328
},
{
"entropy": 1.9347718358039856,
"epoch": 1.188235294117647,
"grad_norm": 0.7665867209434509,
"learning_rate": 0.0004780683718993705,
"loss": 0.167,
"mean_token_accuracy": 0.9583602845668793,
"num_tokens": 2903551.0,
"step": 329
},
{
"entropy": 1.9097798764705658,
"epoch": 1.1918552036199095,
"grad_norm": 0.7705667018890381,
"learning_rate": 0.00047783690356571784,
"loss": 0.2115,
"mean_token_accuracy": 0.9526428133249283,
"num_tokens": 2912197.0,
"step": 330
},
{
"entropy": 1.9174850285053253,
"epoch": 1.195475113122172,
"grad_norm": 0.5695499181747437,
"learning_rate": 0.00047760468781032634,
"loss": 0.1033,
"mean_token_accuracy": 0.969958484172821,
"num_tokens": 2920579.0,
"step": 331
},
{
"entropy": 1.8578442931175232,
"epoch": 1.1990950226244343,
"grad_norm": 0.7843735814094543,
"learning_rate": 0.000477371725515181,
"loss": 0.1664,
"mean_token_accuracy": 0.9545005410909653,
"num_tokens": 2929352.0,
"step": 332
},
{
"entropy": 1.8509328961372375,
"epoch": 1.2027149321266968,
"grad_norm": 0.5951048135757446,
"learning_rate": 0.0004771380175651026,
"loss": 0.1566,
"mean_token_accuracy": 0.9551403075456619,
"num_tokens": 2938387.0,
"step": 333
},
{
"entropy": 1.8236390948295593,
"epoch": 1.2063348416289592,
"grad_norm": 0.4988223910331726,
"learning_rate": 0.0004769035648477434,
"loss": 0.1242,
"mean_token_accuracy": 0.966319814324379,
"num_tokens": 2947741.0,
"step": 334
},
{
"entropy": 1.9594822525978088,
"epoch": 1.2099547511312216,
"grad_norm": 0.7550755143165588,
"learning_rate": 0.00047666836825358477,
"loss": 0.1591,
"mean_token_accuracy": 0.9666347652673721,
"num_tokens": 2956313.0,
"step": 335
},
{
"entropy": 1.9148444533348083,
"epoch": 1.213574660633484,
"grad_norm": 0.5889077186584473,
"learning_rate": 0.00047643242867593345,
"loss": 0.1343,
"mean_token_accuracy": 0.9611433297395706,
"num_tokens": 2964928.0,
"step": 336
},
{
"entropy": 1.8126957714557648,
"epoch": 1.2171945701357467,
"grad_norm": 0.5447750091552734,
"learning_rate": 0.0004761957470109179,
"loss": 0.1659,
"mean_token_accuracy": 0.9552300125360489,
"num_tokens": 2974160.0,
"step": 337
},
{
"entropy": 1.7981431782245636,
"epoch": 1.2208144796380092,
"grad_norm": 0.5400761365890503,
"learning_rate": 0.0004759583241574854,
"loss": 0.1339,
"mean_token_accuracy": 0.9620136916637421,
"num_tokens": 2982900.0,
"step": 338
},
{
"entropy": 1.8613979518413544,
"epoch": 1.2244343891402716,
"grad_norm": 0.7452914714813232,
"learning_rate": 0.0004757201610173981,
"loss": 0.4,
"mean_token_accuracy": 0.9068266004323959,
"num_tokens": 2991783.0,
"step": 339
},
{
"entropy": 1.8654026687145233,
"epoch": 1.228054298642534,
"grad_norm": 1.7142685651779175,
"learning_rate": 0.00047548125849523,
"loss": 0.3168,
"mean_token_accuracy": 0.9308896362781525,
"num_tokens": 3000530.0,
"step": 340
},
{
"entropy": 1.7702704071998596,
"epoch": 1.2316742081447964,
"grad_norm": 0.6687431931495667,
"learning_rate": 0.0004752416174983633,
"loss": 0.1697,
"mean_token_accuracy": 0.9530515670776367,
"num_tokens": 3009355.0,
"step": 341
},
{
"entropy": 1.735857516527176,
"epoch": 1.2352941176470589,
"grad_norm": 0.6127599477767944,
"learning_rate": 0.00047500123893698507,
"loss": 0.1706,
"mean_token_accuracy": 0.9593266248703003,
"num_tokens": 3018518.0,
"step": 342
},
{
"entropy": 1.7076368927955627,
"epoch": 1.2389140271493213,
"grad_norm": 0.6973987817764282,
"learning_rate": 0.0004747601237240836,
"loss": 0.1615,
"mean_token_accuracy": 0.9539438933134079,
"num_tokens": 3027752.0,
"step": 343
},
{
"entropy": 1.7353227138519287,
"epoch": 1.2425339366515837,
"grad_norm": 0.8406392335891724,
"learning_rate": 0.00047451827277544546,
"loss": 0.2063,
"mean_token_accuracy": 0.9488435834646225,
"num_tokens": 3036383.0,
"step": 344
},
{
"entropy": 1.6597246527671814,
"epoch": 1.2461538461538462,
"grad_norm": 0.5971431732177734,
"learning_rate": 0.00047427568700965107,
"loss": 0.1013,
"mean_token_accuracy": 0.9721864312887192,
"num_tokens": 3045375.0,
"step": 345
},
{
"entropy": 1.7100033462047577,
"epoch": 1.2497737556561086,
"grad_norm": 0.5883470773696899,
"learning_rate": 0.00047403236734807225,
"loss": 0.1164,
"mean_token_accuracy": 0.9664830714464188,
"num_tokens": 3054084.0,
"step": 346
},
{
"entropy": 1.7402609288692474,
"epoch": 1.253393665158371,
"grad_norm": 0.7355862855911255,
"learning_rate": 0.00047378831471486815,
"loss": 0.2007,
"mean_token_accuracy": 0.9560511559247971,
"num_tokens": 3062727.0,
"step": 347
},
{
"entropy": 1.79518261551857,
"epoch": 1.2570135746606335,
"grad_norm": 0.6006518006324768,
"learning_rate": 0.00047354353003698163,
"loss": 0.1085,
"mean_token_accuracy": 0.9598321914672852,
"num_tokens": 3071178.0,
"step": 348
},
{
"entropy": 1.7328391373157501,
"epoch": 1.260633484162896,
"grad_norm": 0.560342013835907,
"learning_rate": 0.0004732980142441362,
"loss": 0.1593,
"mean_token_accuracy": 0.9579409211874008,
"num_tokens": 3079927.0,
"step": 349
},
{
"entropy": 1.7356511652469635,
"epoch": 1.2642533936651583,
"grad_norm": 0.9149975776672363,
"learning_rate": 0.00047305176826883206,
"loss": 0.4064,
"mean_token_accuracy": 0.9265118837356567,
"num_tokens": 3089314.0,
"step": 350
},
{
"entropy": 1.8573569357395172,
"epoch": 1.2678733031674208,
"grad_norm": 0.8300670981407166,
"learning_rate": 0.0004728047930463428,
"loss": 0.195,
"mean_token_accuracy": 0.9453776180744171,
"num_tokens": 3097702.0,
"step": 351
},
{
"entropy": 1.7906217575073242,
"epoch": 1.2714932126696832,
"grad_norm": 0.5668906569480896,
"learning_rate": 0.0004725570895147118,
"loss": 0.1572,
"mean_token_accuracy": 0.962067037820816,
"num_tokens": 3106379.0,
"step": 352
},
{
"entropy": 1.6957395374774933,
"epoch": 1.2751131221719456,
"grad_norm": 0.4048328399658203,
"learning_rate": 0.0004723086586147487,
"loss": 0.0944,
"mean_token_accuracy": 0.9716819673776627,
"num_tokens": 3115622.0,
"step": 353
},
{
"entropy": 1.8158144056797028,
"epoch": 1.278733031674208,
"grad_norm": 0.6396092772483826,
"learning_rate": 0.00047205950129002564,
"loss": 0.1011,
"mean_token_accuracy": 0.9698463827371597,
"num_tokens": 3124016.0,
"step": 354
},
{
"entropy": 1.730194479227066,
"epoch": 1.2823529411764705,
"grad_norm": 0.662876307964325,
"learning_rate": 0.000471809618486874,
"loss": 0.1641,
"mean_token_accuracy": 0.9520179778337479,
"num_tokens": 3132712.0,
"step": 355
},
{
"entropy": 1.6776110529899597,
"epoch": 1.285972850678733,
"grad_norm": 0.868507981300354,
"learning_rate": 0.0004715590111543804,
"loss": 0.3374,
"mean_token_accuracy": 0.9303739666938782,
"num_tokens": 3142103.0,
"step": 356
},
{
"entropy": 1.6501678824424744,
"epoch": 1.2895927601809956,
"grad_norm": 0.5433686971664429,
"learning_rate": 0.0004713076802443834,
"loss": 0.1237,
"mean_token_accuracy": 0.9653612226247787,
"num_tokens": 3151192.0,
"step": 357
},
{
"entropy": 1.6524465382099152,
"epoch": 1.293212669683258,
"grad_norm": 0.6145523190498352,
"learning_rate": 0.00047105562671147,
"loss": 0.1204,
"mean_token_accuracy": 0.9690534323453903,
"num_tokens": 3159839.0,
"step": 358
},
{
"entropy": 1.5339214205741882,
"epoch": 1.2968325791855204,
"grad_norm": 0.500477135181427,
"learning_rate": 0.00047080285151297144,
"loss": 0.1295,
"mean_token_accuracy": 0.9571033865213394,
"num_tokens": 3169047.0,
"step": 359
},
{
"entropy": 1.6765435338020325,
"epoch": 1.3004524886877828,
"grad_norm": 0.6697553396224976,
"learning_rate": 0.00047054935560896026,
"loss": 0.135,
"mean_token_accuracy": 0.9672541171312332,
"num_tokens": 3177062.0,
"step": 360
},
{
"entropy": 1.5932062566280365,
"epoch": 1.3040723981900453,
"grad_norm": 0.706957221031189,
"learning_rate": 0.0004702951399622462,
"loss": 0.1229,
"mean_token_accuracy": 0.9634416699409485,
"num_tokens": 3185829.0,
"step": 361
},
{
"entropy": 1.5623145997524261,
"epoch": 1.3076923076923077,
"grad_norm": 0.6199461221694946,
"learning_rate": 0.00047004020553837275,
"loss": 0.1449,
"mean_token_accuracy": 0.9620065689086914,
"num_tokens": 3194426.0,
"step": 362
},
{
"entropy": 1.5226828753948212,
"epoch": 1.3113122171945701,
"grad_norm": 0.8962509036064148,
"learning_rate": 0.0004697845533056132,
"loss": 0.2207,
"mean_token_accuracy": 0.9403344839811325,
"num_tokens": 3203655.0,
"step": 363
},
{
"entropy": 1.5395641326904297,
"epoch": 1.3149321266968326,
"grad_norm": 0.5993619561195374,
"learning_rate": 0.00046952818423496727,
"loss": 0.1486,
"mean_token_accuracy": 0.9614185988903046,
"num_tokens": 3212069.0,
"step": 364
},
{
"entropy": 1.5738630294799805,
"epoch": 1.318552036199095,
"grad_norm": 0.7393983602523804,
"learning_rate": 0.00046927109930015756,
"loss": 0.1812,
"mean_token_accuracy": 0.9535021334886551,
"num_tokens": 3220482.0,
"step": 365
},
{
"entropy": 1.5462632775306702,
"epoch": 1.3221719457013574,
"grad_norm": 0.7453555464744568,
"learning_rate": 0.0004690132994776253,
"loss": 0.164,
"mean_token_accuracy": 0.9585814625024796,
"num_tokens": 3229505.0,
"step": 366
},
{
"entropy": 1.5241961777210236,
"epoch": 1.3257918552036199,
"grad_norm": 0.7553415298461914,
"learning_rate": 0.00046875478574652713,
"loss": 0.1445,
"mean_token_accuracy": 0.9682841598987579,
"num_tokens": 3238326.0,
"step": 367
},
{
"entropy": 1.5344699025154114,
"epoch": 1.3294117647058823,
"grad_norm": 0.8565949201583862,
"learning_rate": 0.0004684955590887311,
"loss": 0.2521,
"mean_token_accuracy": 0.920401468873024,
"num_tokens": 3247482.0,
"step": 368
},
{
"entropy": 1.5109277665615082,
"epoch": 1.3330316742081447,
"grad_norm": 0.5170580148696899,
"learning_rate": 0.00046823562048881295,
"loss": 0.1393,
"mean_token_accuracy": 0.9584086239337921,
"num_tokens": 3256464.0,
"step": 369
},
{
"entropy": 1.4666939079761505,
"epoch": 1.3366515837104074,
"grad_norm": 0.6995373368263245,
"learning_rate": 0.0004679749709340529,
"loss": 0.1726,
"mean_token_accuracy": 0.9477890431880951,
"num_tokens": 3265853.0,
"step": 370
},
{
"entropy": 1.4208430051803589,
"epoch": 1.3402714932126698,
"grad_norm": 1.1363991498947144,
"learning_rate": 0.000467713611414431,
"loss": 0.196,
"mean_token_accuracy": 0.9495431333780289,
"num_tokens": 3275367.0,
"step": 371
},
{
"entropy": 1.5009459853172302,
"epoch": 1.3438914027149322,
"grad_norm": 0.7883325219154358,
"learning_rate": 0.00046745154292262414,
"loss": 0.2526,
"mean_token_accuracy": 0.9334618002176285,
"num_tokens": 3284772.0,
"step": 372
},
{
"entropy": 1.5485479533672333,
"epoch": 1.3475113122171947,
"grad_norm": 0.6516429781913757,
"learning_rate": 0.00046718876645400156,
"loss": 0.2057,
"mean_token_accuracy": 0.9546459317207336,
"num_tokens": 3293493.0,
"step": 373
},
{
"entropy": 1.6237249970436096,
"epoch": 1.351131221719457,
"grad_norm": 0.8916263580322266,
"learning_rate": 0.00046692528300662213,
"loss": 0.2123,
"mean_token_accuracy": 0.9456845372915268,
"num_tokens": 3302063.0,
"step": 374
},
{
"entropy": 1.561572015285492,
"epoch": 1.3547511312217195,
"grad_norm": 0.7527791857719421,
"learning_rate": 0.00046666109358122935,
"loss": 0.2113,
"mean_token_accuracy": 0.9537477940320969,
"num_tokens": 3311037.0,
"step": 375
},
{
"entropy": 1.5594256818294525,
"epoch": 1.358371040723982,
"grad_norm": 1.25638747215271,
"learning_rate": 0.0004663961991812485,
"loss": 0.1629,
"mean_token_accuracy": 0.9508458077907562,
"num_tokens": 3319635.0,
"step": 376
},
{
"entropy": 1.6909976303577423,
"epoch": 1.3619909502262444,
"grad_norm": 0.7627813220024109,
"learning_rate": 0.00046613060081278194,
"loss": 0.2303,
"mean_token_accuracy": 0.9425801336765289,
"num_tokens": 3328043.0,
"step": 377
},
{
"entropy": 1.6074829697608948,
"epoch": 1.3656108597285068,
"grad_norm": 0.6584346294403076,
"learning_rate": 0.00046586429948460646,
"loss": 0.1815,
"mean_token_accuracy": 0.9536214470863342,
"num_tokens": 3337143.0,
"step": 378
},
{
"entropy": 1.7382183969020844,
"epoch": 1.3692307692307693,
"grad_norm": 1.37154221534729,
"learning_rate": 0.0004655972962081684,
"loss": 0.1849,
"mean_token_accuracy": 0.948440819978714,
"num_tokens": 3346033.0,
"step": 379
},
{
"entropy": 1.7148900926113129,
"epoch": 1.3728506787330317,
"grad_norm": 0.9487980604171753,
"learning_rate": 0.00046532959199758,
"loss": 0.2521,
"mean_token_accuracy": 0.9344504028558731,
"num_tokens": 3354849.0,
"step": 380
},
{
"entropy": 1.7164019346237183,
"epoch": 1.3764705882352941,
"grad_norm": 0.5609025359153748,
"learning_rate": 0.00046506118786961614,
"loss": 0.1425,
"mean_token_accuracy": 0.9571309834718704,
"num_tokens": 3363674.0,
"step": 381
},
{
"entropy": 1.894619107246399,
"epoch": 1.3800904977375565,
"grad_norm": 0.9811336994171143,
"learning_rate": 0.00046479208484370997,
"loss": 0.2522,
"mean_token_accuracy": 0.9424156546592712,
"num_tokens": 3372325.0,
"step": 382
},
{
"entropy": 1.78870290517807,
"epoch": 1.383710407239819,
"grad_norm": 0.5707085132598877,
"learning_rate": 0.00046452228394194893,
"loss": 0.1354,
"mean_token_accuracy": 0.9613165706396103,
"num_tokens": 3381270.0,
"step": 383
},
{
"entropy": 1.803922712802887,
"epoch": 1.3873303167420814,
"grad_norm": 0.5655364394187927,
"learning_rate": 0.0004642517861890713,
"loss": 0.0818,
"mean_token_accuracy": 0.9776160269975662,
"num_tokens": 3390363.0,
"step": 384
},
{
"entropy": 1.8172507882118225,
"epoch": 1.3909502262443438,
"grad_norm": 0.6950513124465942,
"learning_rate": 0.00046398059261246205,
"loss": 0.1145,
"mean_token_accuracy": 0.963288351893425,
"num_tokens": 3399176.0,
"step": 385
},
{
"entropy": 1.9182518422603607,
"epoch": 1.3945701357466063,
"grad_norm": 0.5900619029998779,
"learning_rate": 0.0004637087042421489,
"loss": 0.108,
"mean_token_accuracy": 0.9723307639360428,
"num_tokens": 3407978.0,
"step": 386
},
{
"entropy": 1.8558574616909027,
"epoch": 1.3981900452488687,
"grad_norm": 0.6279832124710083,
"learning_rate": 0.00046343612211079843,
"loss": 0.1471,
"mean_token_accuracy": 0.9603912532329559,
"num_tokens": 3416856.0,
"step": 387
},
{
"entropy": 1.8146779537200928,
"epoch": 1.4018099547511311,
"grad_norm": 0.6171274781227112,
"learning_rate": 0.0004631628472537125,
"loss": 0.1872,
"mean_token_accuracy": 0.9447146654129028,
"num_tokens": 3426044.0,
"step": 388
},
{
"entropy": 1.9342225790023804,
"epoch": 1.4054298642533936,
"grad_norm": 0.9947887659072876,
"learning_rate": 0.00046288888070882374,
"loss": 0.2966,
"mean_token_accuracy": 0.9279204607009888,
"num_tokens": 3435154.0,
"step": 389
},
{
"entropy": 1.9391801953315735,
"epoch": 1.409049773755656,
"grad_norm": 0.7155653834342957,
"learning_rate": 0.000462614223516692,
"loss": 0.1847,
"mean_token_accuracy": 0.9475171864032745,
"num_tokens": 3444563.0,
"step": 390
},
{
"entropy": 2.0716978013515472,
"epoch": 1.4126696832579184,
"grad_norm": 0.8198989629745483,
"learning_rate": 0.0004623388767205004,
"loss": 0.1317,
"mean_token_accuracy": 0.9608721435070038,
"num_tokens": 3453410.0,
"step": 391
},
{
"entropy": 2.1060431599617004,
"epoch": 1.416289592760181,
"grad_norm": 1.025406002998352,
"learning_rate": 0.00046206284136605106,
"loss": 0.2146,
"mean_token_accuracy": 0.9414294511079788,
"num_tokens": 3461958.0,
"step": 392
},
{
"entropy": 2.1459922194480896,
"epoch": 1.4199095022624435,
"grad_norm": 0.9209627509117126,
"learning_rate": 0.00046178611850176146,
"loss": 0.2137,
"mean_token_accuracy": 0.956874743103981,
"num_tokens": 3470547.0,
"step": 393
},
{
"entropy": 2.0233450531959534,
"epoch": 1.423529411764706,
"grad_norm": 0.5777944922447205,
"learning_rate": 0.00046150870917866025,
"loss": 0.122,
"mean_token_accuracy": 0.9672323018312454,
"num_tokens": 3479618.0,
"step": 394
},
{
"entropy": 2.035937190055847,
"epoch": 1.4271493212669684,
"grad_norm": 0.7945542931556702,
"learning_rate": 0.0004612306144503835,
"loss": 0.2879,
"mean_token_accuracy": 0.946587473154068,
"num_tokens": 3488533.0,
"step": 395
},
{
"entropy": 2.155315637588501,
"epoch": 1.4307692307692308,
"grad_norm": 0.6385292410850525,
"learning_rate": 0.00046095183537317035,
"loss": 0.1008,
"mean_token_accuracy": 0.9655124247074127,
"num_tokens": 3496686.0,
"step": 396
},
{
"entropy": 2.186827063560486,
"epoch": 1.4343891402714932,
"grad_norm": 0.4759826958179474,
"learning_rate": 0.0004606723730058593,
"loss": 0.0768,
"mean_token_accuracy": 0.9783597737550735,
"num_tokens": 3504958.0,
"step": 397
},
{
"entropy": 1.974392294883728,
"epoch": 1.4380090497737557,
"grad_norm": 0.6250292062759399,
"learning_rate": 0.00046039222840988406,
"loss": 0.1381,
"mean_token_accuracy": 0.9586146324872971,
"num_tokens": 3513694.0,
"step": 398
},
{
"entropy": 2.045738846063614,
"epoch": 1.441628959276018,
"grad_norm": 0.5517769455909729,
"learning_rate": 0.0004601114026492695,
"loss": 0.1312,
"mean_token_accuracy": 0.9682512134313583,
"num_tokens": 3522395.0,
"step": 399
},
{
"entropy": 2.105030357837677,
"epoch": 1.4452488687782805,
"grad_norm": 0.6748242974281311,
"learning_rate": 0.0004598298967906276,
"loss": 0.1056,
"mean_token_accuracy": 0.9701305478811264,
"num_tokens": 3530838.0,
"step": 400
},
{
"entropy": 2.024325281381607,
"epoch": 1.448868778280543,
"grad_norm": 0.6320233941078186,
"learning_rate": 0.00045954771190315344,
"loss": 0.1129,
"mean_token_accuracy": 0.9633017927408218,
"num_tokens": 3540184.0,
"step": 401
},
{
"entropy": 2.1561593413352966,
"epoch": 1.4524886877828054,
"grad_norm": 0.7380363941192627,
"learning_rate": 0.0004592648490586213,
"loss": 0.1304,
"mean_token_accuracy": 0.9599586874246597,
"num_tokens": 3548727.0,
"step": 402
},
{
"entropy": 2.2986454367637634,
"epoch": 1.4561085972850678,
"grad_norm": 0.669114351272583,
"learning_rate": 0.00045898130933138024,
"loss": 0.1005,
"mean_token_accuracy": 0.9724964797496796,
"num_tokens": 3556780.0,
"step": 403
},
{
"entropy": 2.103136509656906,
"epoch": 1.4597285067873302,
"grad_norm": 0.6677402853965759,
"learning_rate": 0.0004586970937983504,
"loss": 0.1177,
"mean_token_accuracy": 0.9597653448581696,
"num_tokens": 3565427.0,
"step": 404
},
{
"entropy": 2.112696200609207,
"epoch": 1.463348416289593,
"grad_norm": 0.4597342014312744,
"learning_rate": 0.0004584122035390185,
"loss": 0.0695,
"mean_token_accuracy": 0.9763098359107971,
"num_tokens": 3573902.0,
"step": 405
},
{
"entropy": 2.0472628474235535,
"epoch": 1.4669683257918553,
"grad_norm": 0.7842056751251221,
"learning_rate": 0.0004581266396354339,
"loss": 0.1981,
"mean_token_accuracy": 0.9521032422780991,
"num_tokens": 3582913.0,
"step": 406
},
{
"entropy": 2.236558735370636,
"epoch": 1.4705882352941178,
"grad_norm": 0.7634767293930054,
"learning_rate": 0.000457840403172205,
"loss": 0.1956,
"mean_token_accuracy": 0.9602932929992676,
"num_tokens": 3591197.0,
"step": 407
},
{
"entropy": 2.182949125766754,
"epoch": 1.4742081447963802,
"grad_norm": 0.7084661722183228,
"learning_rate": 0.00045755349523649415,
"loss": 0.2463,
"mean_token_accuracy": 0.9392582327127457,
"num_tokens": 3600134.0,
"step": 408
},
{
"entropy": 2.135133147239685,
"epoch": 1.4778280542986426,
"grad_norm": 0.8172940015792847,
"learning_rate": 0.00045726591691801433,
"loss": 0.2375,
"mean_token_accuracy": 0.9458330571651459,
"num_tokens": 3608945.0,
"step": 409
},
{
"entropy": 2.157473146915436,
"epoch": 1.481447963800905,
"grad_norm": 0.6165594458580017,
"learning_rate": 0.0004569776693090246,
"loss": 0.1628,
"mean_token_accuracy": 0.9586529731750488,
"num_tokens": 3617790.0,
"step": 410
},
{
"entropy": 2.15165376663208,
"epoch": 1.4850678733031675,
"grad_norm": 0.6619407534599304,
"learning_rate": 0.0004566887535043263,
"loss": 0.1866,
"mean_token_accuracy": 0.9545126557350159,
"num_tokens": 3626937.0,
"step": 411
},
{
"entropy": 2.271161735057831,
"epoch": 1.48868778280543,
"grad_norm": 0.5861835479736328,
"learning_rate": 0.0004563991706012582,
"loss": 0.1409,
"mean_token_accuracy": 0.9595955163240433,
"num_tokens": 3636025.0,
"step": 412
},
{
"entropy": 2.277799427509308,
"epoch": 1.4923076923076923,
"grad_norm": 0.6464956402778625,
"learning_rate": 0.00045610892169969323,
"loss": 0.0792,
"mean_token_accuracy": 0.9806316941976547,
"num_tokens": 3644746.0,
"step": 413
},
{
"entropy": 2.2143171429634094,
"epoch": 1.4959276018099548,
"grad_norm": 0.7531687021255493,
"learning_rate": 0.00045581800790203366,
"loss": 0.2584,
"mean_token_accuracy": 0.9225966930389404,
"num_tokens": 3654064.0,
"step": 414
},
{
"entropy": 2.231681764125824,
"epoch": 1.4995475113122172,
"grad_norm": 0.6902768015861511,
"learning_rate": 0.00045552643031320726,
"loss": 0.232,
"mean_token_accuracy": 0.9433842301368713,
"num_tokens": 3663130.0,
"step": 415
},
{
"entropy": 2.2672717571258545,
"epoch": 1.5031674208144796,
"grad_norm": 0.5134314894676208,
"learning_rate": 0.00045523419004066273,
"loss": 0.0874,
"mean_token_accuracy": 0.9708191752433777,
"num_tokens": 3671981.0,
"step": 416
},
{
"entropy": 2.3302834033966064,
"epoch": 1.506787330316742,
"grad_norm": 0.885969340801239,
"learning_rate": 0.0004549412881943659,
"loss": 0.0723,
"mean_token_accuracy": 0.9791463166475296,
"num_tokens": 3680525.0,
"step": 417
},
{
"entropy": 2.2693899869918823,
"epoch": 1.5104072398190045,
"grad_norm": 0.7424856424331665,
"learning_rate": 0.00045464772588679547,
"loss": 0.1509,
"mean_token_accuracy": 0.9600907415151596,
"num_tokens": 3689430.0,
"step": 418
},
{
"entropy": 2.4042725563049316,
"epoch": 1.514027149321267,
"grad_norm": 0.8968034982681274,
"learning_rate": 0.0004543535042329382,
"loss": 0.1984,
"mean_token_accuracy": 0.9488537162542343,
"num_tokens": 3697836.0,
"step": 419
},
{
"entropy": 2.2518428564071655,
"epoch": 1.5176470588235293,
"grad_norm": 0.5963534712791443,
"learning_rate": 0.0004540586243502858,
"loss": 0.1214,
"mean_token_accuracy": 0.9711381644010544,
"num_tokens": 3706675.0,
"step": 420
},
{
"entropy": 2.275522291660309,
"epoch": 1.5212669683257918,
"grad_norm": 1.0797090530395508,
"learning_rate": 0.0004537630873588293,
"loss": 0.2508,
"mean_token_accuracy": 0.9247037768363953,
"num_tokens": 3715631.0,
"step": 421
},
{
"entropy": 2.249617278575897,
"epoch": 1.5248868778280542,
"grad_norm": 0.7636313438415527,
"learning_rate": 0.000453466894381056,
"loss": 0.1112,
"mean_token_accuracy": 0.9681926071643829,
"num_tokens": 3724579.0,
"step": 422
},
{
"entropy": 2.280571699142456,
"epoch": 1.5285067873303166,
"grad_norm": 0.9915648698806763,
"learning_rate": 0.00045317004654194464,
"loss": 0.3532,
"mean_token_accuracy": 0.9360047876834869,
"num_tokens": 3733607.0,
"step": 423
},
{
"entropy": 2.241512656211853,
"epoch": 1.532126696832579,
"grad_norm": 0.924977719783783,
"learning_rate": 0.0004528725449689611,
"loss": 0.1997,
"mean_token_accuracy": 0.9475428760051727,
"num_tokens": 3742611.0,
"step": 424
},
{
"entropy": 2.201731503009796,
"epoch": 1.5357466063348415,
"grad_norm": 0.7018861770629883,
"learning_rate": 0.0004525743907920542,
"loss": 0.1683,
"mean_token_accuracy": 0.9465018659830093,
"num_tokens": 3751737.0,
"step": 425
},
{
"entropy": 2.28944593667984,
"epoch": 1.539366515837104,
"grad_norm": 0.5893452763557434,
"learning_rate": 0.00045227558514365166,
"loss": 0.0969,
"mean_token_accuracy": 0.9711766839027405,
"num_tokens": 3761245.0,
"step": 426
},
{
"entropy": 2.3497202396392822,
"epoch": 1.5429864253393664,
"grad_norm": 0.685279130935669,
"learning_rate": 0.0004519761291586551,
"loss": 0.106,
"mean_token_accuracy": 0.9663016647100449,
"num_tokens": 3769854.0,
"step": 427
},
{
"entropy": 2.308362066745758,
"epoch": 1.5466063348416288,
"grad_norm": 0.5116177797317505,
"learning_rate": 0.00045167602397443694,
"loss": 0.1132,
"mean_token_accuracy": 0.9700013697147369,
"num_tokens": 3778996.0,
"step": 428
},
{
"entropy": 2.238637685775757,
"epoch": 1.5502262443438914,
"grad_norm": 0.8374833464622498,
"learning_rate": 0.00045137527073083457,
"loss": 0.2539,
"mean_token_accuracy": 0.9407305717468262,
"num_tokens": 3787835.0,
"step": 429
},
{
"entropy": 2.3406758308410645,
"epoch": 1.5538461538461539,
"grad_norm": 0.5140913724899292,
"learning_rate": 0.0004510738705701473,
"loss": 0.1113,
"mean_token_accuracy": 0.9635641574859619,
"num_tokens": 3796498.0,
"step": 430
},
{
"entropy": 2.2642539143562317,
"epoch": 1.5574660633484163,
"grad_norm": 0.5750702023506165,
"learning_rate": 0.0004507718246371313,
"loss": 0.1127,
"mean_token_accuracy": 0.9660817235708237,
"num_tokens": 3805464.0,
"step": 431
},
{
"entropy": 2.2058264315128326,
"epoch": 1.5610859728506787,
"grad_norm": 0.6448659300804138,
"learning_rate": 0.0004504691340789955,
"loss": 0.0994,
"mean_token_accuracy": 0.96739861369133,
"num_tokens": 3814309.0,
"step": 432
},
{
"entropy": 2.330399215221405,
"epoch": 1.5647058823529412,
"grad_norm": 0.8432528376579285,
"learning_rate": 0.0004501658000453973,
"loss": 0.1999,
"mean_token_accuracy": 0.9510775059461594,
"num_tokens": 3823126.0,
"step": 433
},
{
"entropy": 2.4211326837539673,
"epoch": 1.5683257918552036,
"grad_norm": 0.8101194500923157,
"learning_rate": 0.00044986182368843806,
"loss": 0.144,
"mean_token_accuracy": 0.9656328558921814,
"num_tokens": 3831274.0,
"step": 434
},
{
"entropy": 2.2594956755638123,
"epoch": 1.571945701357466,
"grad_norm": 0.6753663420677185,
"learning_rate": 0.0004495572061626585,
"loss": 0.1433,
"mean_token_accuracy": 0.9572386592626572,
"num_tokens": 3840206.0,
"step": 435
},
{
"entropy": 2.1233682930469513,
"epoch": 1.5755656108597285,
"grad_norm": 0.48616713285446167,
"learning_rate": 0.000449251948625035,
"loss": 0.0934,
"mean_token_accuracy": 0.9740773588418961,
"num_tokens": 3849363.0,
"step": 436
},
{
"entropy": 2.325556695461273,
"epoch": 1.5791855203619911,
"grad_norm": 0.7744045853614807,
"learning_rate": 0.00044894605223497446,
"loss": 0.127,
"mean_token_accuracy": 0.9687052518129349,
"num_tokens": 3857733.0,
"step": 437
},
{
"entropy": 2.266542673110962,
"epoch": 1.5828054298642535,
"grad_norm": 2.373530387878418,
"learning_rate": 0.00044863951815431045,
"loss": 0.2404,
"mean_token_accuracy": 0.9437267184257507,
"num_tokens": 3866374.0,
"step": 438
},
{
"entropy": 2.1757248640060425,
"epoch": 1.586425339366516,
"grad_norm": 0.5588560700416565,
"learning_rate": 0.00044833234754729847,
"loss": 0.142,
"mean_token_accuracy": 0.9601300358772278,
"num_tokens": 3875520.0,
"step": 439
},
{
"entropy": 2.124377518892288,
"epoch": 1.5900452488687784,
"grad_norm": 0.5602438449859619,
"learning_rate": 0.0004480245415806116,
"loss": 0.1556,
"mean_token_accuracy": 0.9561446160078049,
"num_tokens": 3884345.0,
"step": 440
},
{
"entropy": 2.1571075320243835,
"epoch": 1.5936651583710408,
"grad_norm": 0.472598671913147,
"learning_rate": 0.0004477161014233361,
"loss": 0.0848,
"mean_token_accuracy": 0.9742853343486786,
"num_tokens": 3893129.0,
"step": 441
},
{
"entropy": 2.0434057414531708,
"epoch": 1.5972850678733033,
"grad_norm": 0.7104448676109314,
"learning_rate": 0.00044740702824696703,
"loss": 0.1524,
"mean_token_accuracy": 0.9542464315891266,
"num_tokens": 3902120.0,
"step": 442
},
{
"entropy": 2.1118403673171997,
"epoch": 1.6009049773755657,
"grad_norm": 0.6632394194602966,
"learning_rate": 0.0004470973232254037,
"loss": 0.3001,
"mean_token_accuracy": 0.928197592496872,
"num_tokens": 3910974.0,
"step": 443
},
{
"entropy": 2.0292475819587708,
"epoch": 1.6045248868778281,
"grad_norm": 1.050956130027771,
"learning_rate": 0.00044678698753494527,
"loss": 0.2226,
"mean_token_accuracy": 0.9448522627353668,
"num_tokens": 3920005.0,
"step": 444
},
{
"entropy": 1.991033524274826,
"epoch": 1.6081447963800906,
"grad_norm": 0.670244038105011,
"learning_rate": 0.00044647602235428624,
"loss": 0.2158,
"mean_token_accuracy": 0.9551118016242981,
"num_tokens": 3929334.0,
"step": 445
},
{
"entropy": 2.04949289560318,
"epoch": 1.611764705882353,
"grad_norm": 0.6321494579315186,
"learning_rate": 0.00044616442886451197,
"loss": 0.1743,
"mean_token_accuracy": 0.9494802355766296,
"num_tokens": 3938211.0,
"step": 446
},
{
"entropy": 2.1101951897144318,
"epoch": 1.6153846153846154,
"grad_norm": 0.6970012187957764,
"learning_rate": 0.0004458522082490943,
"loss": 0.1228,
"mean_token_accuracy": 0.9624926447868347,
"num_tokens": 3946534.0,
"step": 447
},
{
"entropy": 1.9337081909179688,
"epoch": 1.6190045248868778,
"grad_norm": 0.5971657633781433,
"learning_rate": 0.0004455393616938868,
"loss": 0.1431,
"mean_token_accuracy": 0.9635348320007324,
"num_tokens": 3955694.0,
"step": 448
},
{
"entropy": 1.9635128676891327,
"epoch": 1.6226244343891403,
"grad_norm": 0.8510827422142029,
"learning_rate": 0.00044522589038712074,
"loss": 0.2446,
"mean_token_accuracy": 0.9457641988992691,
"num_tokens": 3964907.0,
"step": 449
},
{
"entropy": 2.0336360335350037,
"epoch": 1.6262443438914027,
"grad_norm": 0.5803818106651306,
"learning_rate": 0.00044491179551939985,
"loss": 0.0872,
"mean_token_accuracy": 0.9734505414962769,
"num_tokens": 3973584.0,
"step": 450
},
{
"entropy": 2.0668878853321075,
"epoch": 1.6298642533936651,
"grad_norm": 0.6990496516227722,
"learning_rate": 0.0004445970782836967,
"loss": 0.1138,
"mean_token_accuracy": 0.9702571034431458,
"num_tokens": 3982632.0,
"step": 451
},
{
"entropy": 2.1481760144233704,
"epoch": 1.6334841628959276,
"grad_norm": 0.6156729459762573,
"learning_rate": 0.00044428173987534733,
"loss": 0.0936,
"mean_token_accuracy": 0.9739355593919754,
"num_tokens": 3991147.0,
"step": 452
},
{
"entropy": 2.0678701996803284,
"epoch": 1.63710407239819,
"grad_norm": 0.5441684126853943,
"learning_rate": 0.0004439657814920472,
"loss": 0.123,
"mean_token_accuracy": 0.9693446308374405,
"num_tokens": 3999990.0,
"step": 453
},
{
"entropy": 1.9867055118083954,
"epoch": 1.6407239819004524,
"grad_norm": 0.9218093156814575,
"learning_rate": 0.00044364920433384656,
"loss": 0.1997,
"mean_token_accuracy": 0.9564195573329926,
"num_tokens": 4009097.0,
"step": 454
},
{
"entropy": 2.145586997270584,
"epoch": 1.6443438914027149,
"grad_norm": 0.77643883228302,
"learning_rate": 0.0004433320096031458,
"loss": 0.1491,
"mean_token_accuracy": 0.9602408111095428,
"num_tokens": 4018059.0,
"step": 455
},
{
"entropy": 2.071108251810074,
"epoch": 1.6479638009049773,
"grad_norm": 0.5267088413238525,
"learning_rate": 0.0004430141985046909,
"loss": 0.0875,
"mean_token_accuracy": 0.9764399826526642,
"num_tokens": 4027089.0,
"step": 456
},
{
"entropy": 2.1659318804740906,
"epoch": 1.6515837104072397,
"grad_norm": 1.0642318725585938,
"learning_rate": 0.000442695772245569,
"loss": 0.2623,
"mean_token_accuracy": 0.9307756721973419,
"num_tokens": 4035719.0,
"step": 457
},
{
"entropy": 2.0232724249362946,
"epoch": 1.6552036199095022,
"grad_norm": 0.6213289499282837,
"learning_rate": 0.0004423767320352035,
"loss": 0.1597,
"mean_token_accuracy": 0.9599647223949432,
"num_tokens": 4045088.0,
"step": 458
},
{
"entropy": 2.047410547733307,
"epoch": 1.6588235294117646,
"grad_norm": 0.6346105933189392,
"learning_rate": 0.0004420570790853498,
"loss": 0.1422,
"mean_token_accuracy": 0.9649711549282074,
"num_tokens": 4054262.0,
"step": 459
},
{
"entropy": 2.0923012793064117,
"epoch": 1.662443438914027,
"grad_norm": 0.46477749943733215,
"learning_rate": 0.0004417368146100907,
"loss": 0.079,
"mean_token_accuracy": 0.9777993708848953,
"num_tokens": 4063107.0,
"step": 460
},
{
"entropy": 2.168913394212723,
"epoch": 1.6660633484162894,
"grad_norm": 0.5164734721183777,
"learning_rate": 0.0004414159398258312,
"loss": 0.0941,
"mean_token_accuracy": 0.9725133627653122,
"num_tokens": 4071656.0,
"step": 461
},
{
"entropy": 2.152670443058014,
"epoch": 1.6696832579185519,
"grad_norm": 0.8985757231712341,
"learning_rate": 0.00044109445595129495,
"loss": 0.2142,
"mean_token_accuracy": 0.9387252777814865,
"num_tokens": 4080023.0,
"step": 462
},
{
"entropy": 2.111784875392914,
"epoch": 1.6733031674208145,
"grad_norm": 0.47521084547042847,
"learning_rate": 0.0004407723642075184,
"loss": 0.0581,
"mean_token_accuracy": 0.9821985810995102,
"num_tokens": 4088469.0,
"step": 463
},
{
"entropy": 1.9784683287143707,
"epoch": 1.676923076923077,
"grad_norm": 0.5552536249160767,
"learning_rate": 0.0004404496658178472,
"loss": 0.1353,
"mean_token_accuracy": 0.9619844257831573,
"num_tokens": 4097737.0,
"step": 464
},
{
"entropy": 2.015674114227295,
"epoch": 1.6805429864253394,
"grad_norm": 0.6078305244445801,
"learning_rate": 0.0004401263620079309,
"loss": 0.1916,
"mean_token_accuracy": 0.9506707191467285,
"num_tokens": 4107156.0,
"step": 465
},
{
"entropy": 2.0832217931747437,
"epoch": 1.6841628959276018,
"grad_norm": 0.6618755459785461,
"learning_rate": 0.0004398024540057186,
"loss": 0.1671,
"mean_token_accuracy": 0.9617152661085129,
"num_tokens": 4116019.0,
"step": 466
},
{
"entropy": 2.0383114516735077,
"epoch": 1.6877828054298643,
"grad_norm": 0.5774693489074707,
"learning_rate": 0.0004394779430414541,
"loss": 0.2647,
"mean_token_accuracy": 0.9387127161026001,
"num_tokens": 4125001.0,
"step": 467
},
{
"entropy": 2.201409190893173,
"epoch": 1.6914027149321267,
"grad_norm": 0.7600311636924744,
"learning_rate": 0.0004391528303476715,
"loss": 0.073,
"mean_token_accuracy": 0.979825034737587,
"num_tokens": 4133467.0,
"step": 468
},
{
"entropy": 2.168666422367096,
"epoch": 1.6950226244343891,
"grad_norm": 0.7801902294158936,
"learning_rate": 0.00043882711715919015,
"loss": 0.2406,
"mean_token_accuracy": 0.9451306313276291,
"num_tokens": 4141765.0,
"step": 469
},
{
"entropy": 2.1429262161254883,
"epoch": 1.6986425339366515,
"grad_norm": 0.5192358493804932,
"learning_rate": 0.0004385008047131104,
"loss": 0.1052,
"mean_token_accuracy": 0.9749262481927872,
"num_tokens": 4150732.0,
"step": 470
},
{
"entropy": 2.1387495696544647,
"epoch": 1.702262443438914,
"grad_norm": 0.6219777464866638,
"learning_rate": 0.0004381738942488083,
"loss": 0.2127,
"mean_token_accuracy": 0.9398418068885803,
"num_tokens": 4159715.0,
"step": 471
},
{
"entropy": 2.1718398332595825,
"epoch": 1.7058823529411766,
"grad_norm": 0.5738123655319214,
"learning_rate": 0.0004378463870079316,
"loss": 0.1703,
"mean_token_accuracy": 0.9520847648382187,
"num_tokens": 4168526.0,
"step": 472
},
{
"entropy": 2.2768235206604004,
"epoch": 1.709502262443439,
"grad_norm": 0.662564754486084,
"learning_rate": 0.00043751828423439456,
"loss": 0.138,
"mean_token_accuracy": 0.9581841826438904,
"num_tokens": 4177189.0,
"step": 473
},
{
"entropy": 2.29143089056015,
"epoch": 1.7131221719457015,
"grad_norm": 0.8638074398040771,
"learning_rate": 0.00043718958717437324,
"loss": 0.1432,
"mean_token_accuracy": 0.9645630270242691,
"num_tokens": 4185367.0,
"step": 474
},
{
"entropy": 2.2810245156288147,
"epoch": 1.716742081447964,
"grad_norm": 0.6139346957206726,
"learning_rate": 0.00043686029707630097,
"loss": 0.173,
"mean_token_accuracy": 0.9592728316783905,
"num_tokens": 4194418.0,
"step": 475
},
{
"entropy": 2.1307725310325623,
"epoch": 1.7203619909502263,
"grad_norm": 0.5192779302597046,
"learning_rate": 0.00043653041519086354,
"loss": 0.1025,
"mean_token_accuracy": 0.970764696598053,
"num_tokens": 4203705.0,
"step": 476
},
{
"entropy": 2.160595118999481,
"epoch": 1.7239819004524888,
"grad_norm": 0.7398526668548584,
"learning_rate": 0.0004361999427709943,
"loss": 0.229,
"mean_token_accuracy": 0.9352773874998093,
"num_tokens": 4212648.0,
"step": 477
},
{
"entropy": 2.1865442991256714,
"epoch": 1.7276018099547512,
"grad_norm": 0.6227203011512756,
"learning_rate": 0.0004358688810718699,
"loss": 0.1118,
"mean_token_accuracy": 0.9689576476812363,
"num_tokens": 4221208.0,
"step": 478
},
{
"entropy": 2.086527943611145,
"epoch": 1.7312217194570136,
"grad_norm": 0.722144603729248,
"learning_rate": 0.00043553723135090447,
"loss": 0.1656,
"mean_token_accuracy": 0.9537550210952759,
"num_tokens": 4230810.0,
"step": 479
},
{
"entropy": 2.068355441093445,
"epoch": 1.734841628959276,
"grad_norm": 0.5781517028808594,
"learning_rate": 0.0004352049948677462,
"loss": 0.1497,
"mean_token_accuracy": 0.9600837379693985,
"num_tokens": 4240394.0,
"step": 480
},
{
"entropy": 2.185140371322632,
"epoch": 1.7384615384615385,
"grad_norm": 0.7261873483657837,
"learning_rate": 0.0004348721728842715,
"loss": 0.1582,
"mean_token_accuracy": 0.9584025889635086,
"num_tokens": 4249205.0,
"step": 481
},
{
"entropy": 2.21835720539093,
"epoch": 1.742081447963801,
"grad_norm": 0.5321667194366455,
"learning_rate": 0.0004345387666645807,
"loss": 0.1344,
"mean_token_accuracy": 0.9659005403518677,
"num_tokens": 4257808.0,
"step": 482
},
{
"entropy": 2.078131854534149,
"epoch": 1.7457013574660634,
"grad_norm": 0.5598498582839966,
"learning_rate": 0.00043420477747499307,
"loss": 0.1347,
"mean_token_accuracy": 0.9678008407354355,
"num_tokens": 4266728.0,
"step": 483
},
{
"entropy": 2.060504525899887,
"epoch": 1.7493212669683258,
"grad_norm": 0.5017166137695312,
"learning_rate": 0.0004338702065840422,
"loss": 0.0722,
"mean_token_accuracy": 0.9762782007455826,
"num_tokens": 4275514.0,
"step": 484
},
{
"entropy": 2.165244698524475,
"epoch": 1.7529411764705882,
"grad_norm": 0.4664002060890198,
"learning_rate": 0.00043353505526247084,
"loss": 0.1206,
"mean_token_accuracy": 0.9696767777204514,
"num_tokens": 4284013.0,
"step": 485
},
{
"entropy": 2.103049159049988,
"epoch": 1.7565610859728507,
"grad_norm": 0.6669000387191772,
"learning_rate": 0.0004331993247832265,
"loss": 0.1052,
"mean_token_accuracy": 0.9665459096431732,
"num_tokens": 4293011.0,
"step": 486
},
{
"entropy": 2.1286613941192627,
"epoch": 1.760180995475113,
"grad_norm": 0.7821269631385803,
"learning_rate": 0.00043286301642145634,
"loss": 0.3669,
"mean_token_accuracy": 0.9062697291374207,
"num_tokens": 4301965.0,
"step": 487
},
{
"entropy": 2.098009169101715,
"epoch": 1.7638009049773755,
"grad_norm": 0.5720731616020203,
"learning_rate": 0.0004325261314545024,
"loss": 0.1324,
"mean_token_accuracy": 0.9650943875312805,
"num_tokens": 4310914.0,
"step": 488
},
{
"entropy": 2.164614498615265,
"epoch": 1.767420814479638,
"grad_norm": 1.0500473976135254,
"learning_rate": 0.0004321886711618967,
"loss": 0.1182,
"mean_token_accuracy": 0.9720661342144012,
"num_tokens": 4319072.0,
"step": 489
},
{
"entropy": 2.2015402913093567,
"epoch": 1.7710407239819004,
"grad_norm": 0.5770253539085388,
"learning_rate": 0.00043185063682535634,
"loss": 0.1226,
"mean_token_accuracy": 0.9615659862756729,
"num_tokens": 4327539.0,
"step": 490
},
{
"entropy": 2.075456440448761,
"epoch": 1.7746606334841628,
"grad_norm": 0.6456925272941589,
"learning_rate": 0.0004315120297287789,
"loss": 0.1123,
"mean_token_accuracy": 0.9628709554672241,
"num_tokens": 4336523.0,
"step": 491
},
{
"entropy": 2.158169150352478,
"epoch": 1.7782805429864252,
"grad_norm": 0.8282069563865662,
"learning_rate": 0.00043117285115823733,
"loss": 0.2146,
"mean_token_accuracy": 0.9413971602916718,
"num_tokens": 4345294.0,
"step": 492
},
{
"entropy": 2.02735897898674,
"epoch": 1.7819004524886877,
"grad_norm": 0.783597469329834,
"learning_rate": 0.000430833102401975,
"loss": 0.1376,
"mean_token_accuracy": 0.964630737900734,
"num_tokens": 4354107.0,
"step": 493
},
{
"entropy": 2.138492166996002,
"epoch": 1.78552036199095,
"grad_norm": 0.6317175030708313,
"learning_rate": 0.000430492784750401,
"loss": 0.1005,
"mean_token_accuracy": 0.9734214246273041,
"num_tokens": 4362560.0,
"step": 494
},
{
"entropy": 2.0253217220306396,
"epoch": 1.7891402714932125,
"grad_norm": 0.5523395538330078,
"learning_rate": 0.000430151899496085,
"loss": 0.1633,
"mean_token_accuracy": 0.9558031558990479,
"num_tokens": 4371698.0,
"step": 495
},
{
"entropy": 2.160472810268402,
"epoch": 1.792760180995475,
"grad_norm": 0.6557935476303101,
"learning_rate": 0.00042981044793375295,
"loss": 0.1154,
"mean_token_accuracy": 0.9722230583429337,
"num_tokens": 4380612.0,
"step": 496
},
{
"entropy": 2.0284159183502197,
"epoch": 1.7963800904977374,
"grad_norm": 0.7357863187789917,
"learning_rate": 0.00042946843136028117,
"loss": 0.1166,
"mean_token_accuracy": 0.9629471153020859,
"num_tokens": 4389521.0,
"step": 497
},
{
"entropy": 2.1544791162014008,
"epoch": 1.8,
"grad_norm": 0.5604898929595947,
"learning_rate": 0.00042912585107469226,
"loss": 0.0834,
"mean_token_accuracy": 0.9783036410808563,
"num_tokens": 4398059.0,
"step": 498
},
{
"entropy": 2.1051094830036163,
"epoch": 1.8036199095022625,
"grad_norm": 0.4598539173603058,
"learning_rate": 0.0004287827083781497,
"loss": 0.0411,
"mean_token_accuracy": 0.9868490546941757,
"num_tokens": 4406453.0,
"step": 499
},
{
"entropy": 2.0219272077083588,
"epoch": 1.807239819004525,
"grad_norm": 0.8164628744125366,
"learning_rate": 0.00042843900457395343,
"loss": 0.1988,
"mean_token_accuracy": 0.9502352625131607,
"num_tokens": 4415440.0,
"step": 500
},
{
"entropy": 1.980013906955719,
"epoch": 1.8108597285067873,
"grad_norm": 0.572798490524292,
"learning_rate": 0.0004280947409675341,
"loss": 0.1148,
"mean_token_accuracy": 0.966580331325531,
"num_tokens": 4424532.0,
"step": 501
},
{
"entropy": 2.0646563172340393,
"epoch": 1.8144796380090498,
"grad_norm": 0.769386351108551,
"learning_rate": 0.00042774991886644875,
"loss": 0.1592,
"mean_token_accuracy": 0.9553463608026505,
"num_tokens": 4432913.0,
"step": 502
},
{
"entropy": 2.040877491235733,
"epoch": 1.8180995475113122,
"grad_norm": 0.7467371821403503,
"learning_rate": 0.0004274045395803758,
"loss": 0.2247,
"mean_token_accuracy": 0.9526964277029037,
"num_tokens": 4441425.0,
"step": 503
},
{
"entropy": 1.9934698939323425,
"epoch": 1.8217194570135746,
"grad_norm": 0.6602952480316162,
"learning_rate": 0.00042705860442110964,
"loss": 0.1681,
"mean_token_accuracy": 0.9594631940126419,
"num_tokens": 4450383.0,
"step": 504
},
{
"entropy": 2.0858289897441864,
"epoch": 1.825339366515837,
"grad_norm": 0.684380829334259,
"learning_rate": 0.0004267121147025562,
"loss": 0.1154,
"mean_token_accuracy": 0.9638111293315887,
"num_tokens": 4458862.0,
"step": 505
},
{
"entropy": 2.0886995792388916,
"epoch": 1.8289592760180997,
"grad_norm": 0.5784837007522583,
"learning_rate": 0.00042636507174072756,
"loss": 0.1026,
"mean_token_accuracy": 0.9676834791898727,
"num_tokens": 4467386.0,
"step": 506
},
{
"entropy": 2.0236063301563263,
"epoch": 1.8325791855203621,
"grad_norm": 0.5101180672645569,
"learning_rate": 0.00042601747685373716,
"loss": 0.1031,
"mean_token_accuracy": 0.9734093993902206,
"num_tokens": 4476054.0,
"step": 507
},
{
"entropy": 1.9801031053066254,
"epoch": 1.8361990950226246,
"grad_norm": 0.6581607460975647,
"learning_rate": 0.00042566933136179455,
"loss": 0.1548,
"mean_token_accuracy": 0.9581006914377213,
"num_tokens": 4484895.0,
"step": 508
},
{
"entropy": 2.0244787633419037,
"epoch": 1.839819004524887,
"grad_norm": 0.8100608587265015,
"learning_rate": 0.0004253206365872008,
"loss": 0.196,
"mean_token_accuracy": 0.9532899260520935,
"num_tokens": 4493737.0,
"step": 509
},
{
"entropy": 1.9108119010925293,
"epoch": 1.8434389140271494,
"grad_norm": 0.4903942048549652,
"learning_rate": 0.00042497139385434314,
"loss": 0.1313,
"mean_token_accuracy": 0.9667337089776993,
"num_tokens": 4502840.0,
"step": 510
},
{
"entropy": 2.009468197822571,
"epoch": 1.8470588235294119,
"grad_norm": 0.6010113954544067,
"learning_rate": 0.0004246216044896897,
"loss": 0.1013,
"mean_token_accuracy": 0.9692314714193344,
"num_tokens": 4511407.0,
"step": 511
},
{
"entropy": 2.0337170362472534,
"epoch": 1.8506787330316743,
"grad_norm": 0.7906802892684937,
"learning_rate": 0.00042427126982178546,
"loss": 0.1682,
"mean_token_accuracy": 0.9550099819898605,
"num_tokens": 4520018.0,
"step": 512
},
{
"entropy": 1.8813888728618622,
"epoch": 1.8542986425339367,
"grad_norm": 0.5353080034255981,
"learning_rate": 0.00042392039118124586,
"loss": 0.1228,
"mean_token_accuracy": 0.9624074995517731,
"num_tokens": 4529270.0,
"step": 513
},
{
"entropy": 2.012698233127594,
"epoch": 1.8579185520361992,
"grad_norm": 0.6713843941688538,
"learning_rate": 0.00042356896990075285,
"loss": 0.2225,
"mean_token_accuracy": 0.9417333751916885,
"num_tokens": 4538008.0,
"step": 514
},
{
"entropy": 1.880586564540863,
"epoch": 1.8615384615384616,
"grad_norm": 0.5821724534034729,
"learning_rate": 0.00042321700731504916,
"loss": 0.1144,
"mean_token_accuracy": 0.9677341282367706,
"num_tokens": 4546950.0,
"step": 515
},
{
"entropy": 2.0066279470920563,
"epoch": 1.865158371040724,
"grad_norm": 0.4095056354999542,
"learning_rate": 0.0004228645047609335,
"loss": 0.0424,
"mean_token_accuracy": 0.9854962974786758,
"num_tokens": 4555452.0,
"step": 516
},
{
"entropy": 2.042815536260605,
"epoch": 1.8687782805429864,
"grad_norm": 0.5398769974708557,
"learning_rate": 0.0004225114635772555,
"loss": 0.1343,
"mean_token_accuracy": 0.9615450948476791,
"num_tokens": 4564386.0,
"step": 517
},
{
"entropy": 2.0948933362960815,
"epoch": 1.8723981900452489,
"grad_norm": 0.6738974452018738,
"learning_rate": 0.0004221578851049107,
"loss": 0.1541,
"mean_token_accuracy": 0.9526563137769699,
"num_tokens": 4573041.0,
"step": 518
},
{
"entropy": 2.102545380592346,
"epoch": 1.8760180995475113,
"grad_norm": 0.7769943475723267,
"learning_rate": 0.00042180377068683504,
"loss": 0.2362,
"mean_token_accuracy": 0.9472651779651642,
"num_tokens": 4581666.0,
"step": 519
},
{
"entropy": 2.087820291519165,
"epoch": 1.8796380090497737,
"grad_norm": 0.5722424983978271,
"learning_rate": 0.0004214491216680004,
"loss": 0.1657,
"mean_token_accuracy": 0.9537082612514496,
"num_tokens": 4590238.0,
"step": 520
},
{
"entropy": 2.0093430876731873,
"epoch": 1.8832579185520362,
"grad_norm": 0.5844932198524475,
"learning_rate": 0.00042109393939540867,
"loss": 0.1485,
"mean_token_accuracy": 0.9624215811491013,
"num_tokens": 4599352.0,
"step": 521
},
{
"entropy": 1.9117147326469421,
"epoch": 1.8868778280542986,
"grad_norm": 0.46085676550865173,
"learning_rate": 0.0004207382252180876,
"loss": 0.0853,
"mean_token_accuracy": 0.9769327491521835,
"num_tokens": 4608571.0,
"step": 522
},
{
"entropy": 2.0205602943897247,
"epoch": 1.890497737556561,
"grad_norm": 0.5571608543395996,
"learning_rate": 0.000420381980487085,
"loss": 0.1517,
"mean_token_accuracy": 0.9646699875593185,
"num_tokens": 4617445.0,
"step": 523
},
{
"entropy": 1.9571953415870667,
"epoch": 1.8941176470588235,
"grad_norm": 0.470630943775177,
"learning_rate": 0.0004200252065554636,
"loss": 0.1005,
"mean_token_accuracy": 0.9750025719404221,
"num_tokens": 4626756.0,
"step": 524
},
{
"entropy": 2.063209116458893,
"epoch": 1.897737556561086,
"grad_norm": 0.6447069644927979,
"learning_rate": 0.00041966790477829637,
"loss": 0.113,
"mean_token_accuracy": 0.9695079624652863,
"num_tokens": 4635378.0,
"step": 525
},
{
"entropy": 1.9232109785079956,
"epoch": 1.9013574660633483,
"grad_norm": 0.5114295482635498,
"learning_rate": 0.000419310076512661,
"loss": 0.1492,
"mean_token_accuracy": 0.9653338938951492,
"num_tokens": 4644769.0,
"step": 526
},
{
"entropy": 2.1691197752952576,
"epoch": 1.9049773755656108,
"grad_norm": 0.7630137205123901,
"learning_rate": 0.00041895172311763476,
"loss": 0.212,
"mean_token_accuracy": 0.9533941894769669,
"num_tokens": 4652857.0,
"step": 527
},
{
"entropy": 2.04753240942955,
"epoch": 1.9085972850678732,
"grad_norm": 0.6423042416572571,
"learning_rate": 0.00041859284595428955,
"loss": 0.1455,
"mean_token_accuracy": 0.956505224108696,
"num_tokens": 4661591.0,
"step": 528
},
{
"entropy": 1.9440338611602783,
"epoch": 1.9122171945701356,
"grad_norm": 0.5011327266693115,
"learning_rate": 0.00041823344638568656,
"loss": 0.1255,
"mean_token_accuracy": 0.965131089091301,
"num_tokens": 4670594.0,
"step": 529
},
{
"entropy": 2.0554805397987366,
"epoch": 1.915837104072398,
"grad_norm": 0.5821590423583984,
"learning_rate": 0.0004178735257768713,
"loss": 0.0486,
"mean_token_accuracy": 0.9875282496213913,
"num_tokens": 4679344.0,
"step": 530
},
{
"entropy": 2.130349576473236,
"epoch": 1.9194570135746605,
"grad_norm": 0.5332052111625671,
"learning_rate": 0.0004175130854948679,
"loss": 0.0915,
"mean_token_accuracy": 0.9737034440040588,
"num_tokens": 4687922.0,
"step": 531
},
{
"entropy": 2.146788775920868,
"epoch": 1.9230769230769231,
"grad_norm": 0.5016877055168152,
"learning_rate": 0.00041715212690867455,
"loss": 0.1281,
"mean_token_accuracy": 0.9681432545185089,
"num_tokens": 4696593.0,
"step": 532
},
{
"entropy": 2.041268438100815,
"epoch": 1.9266968325791856,
"grad_norm": 0.5257729887962341,
"learning_rate": 0.00041679065138925807,
"loss": 0.1272,
"mean_token_accuracy": 0.9649266451597214,
"num_tokens": 4705792.0,
"step": 533
},
{
"entropy": 2.114819645881653,
"epoch": 1.930316742081448,
"grad_norm": 0.7085135579109192,
"learning_rate": 0.0004164286603095484,
"loss": 0.1545,
"mean_token_accuracy": 0.9581228941679001,
"num_tokens": 4714599.0,
"step": 534
},
{
"entropy": 2.022280514240265,
"epoch": 1.9339366515837104,
"grad_norm": 0.5309014320373535,
"learning_rate": 0.00041606615504443387,
"loss": 0.1933,
"mean_token_accuracy": 0.9562340676784515,
"num_tokens": 4724062.0,
"step": 535
},
{
"entropy": 2.0959260165691376,
"epoch": 1.9375565610859729,
"grad_norm": 0.6528061628341675,
"learning_rate": 0.0004157031369707557,
"loss": 0.1306,
"mean_token_accuracy": 0.9612343460321426,
"num_tokens": 4733077.0,
"step": 536
},
{
"entropy": 2.2772948145866394,
"epoch": 1.9411764705882353,
"grad_norm": 0.7351471185684204,
"learning_rate": 0.0004153396074673028,
"loss": 0.1494,
"mean_token_accuracy": 0.9608108699321747,
"num_tokens": 4741201.0,
"step": 537
},
{
"entropy": 2.0935052037239075,
"epoch": 1.9447963800904977,
"grad_norm": 0.5435840487480164,
"learning_rate": 0.0004149755679148065,
"loss": 0.0884,
"mean_token_accuracy": 0.9745689779520035,
"num_tokens": 4750306.0,
"step": 538
},
{
"entropy": 2.2082818746566772,
"epoch": 1.9484162895927601,
"grad_norm": 0.3780331611633301,
"learning_rate": 0.00041461101969593537,
"loss": 0.0739,
"mean_token_accuracy": 0.9777179658412933,
"num_tokens": 4758954.0,
"step": 539
},
{
"entropy": 2.1683040261268616,
"epoch": 1.9520361990950226,
"grad_norm": 0.4637961685657501,
"learning_rate": 0.00041424596419529017,
"loss": 0.0632,
"mean_token_accuracy": 0.9834533184766769,
"num_tokens": 4767615.0,
"step": 540
},
{
"entropy": 2.075555235147476,
"epoch": 1.9556561085972852,
"grad_norm": 0.7603118419647217,
"learning_rate": 0.00041388040279939804,
"loss": 0.2835,
"mean_token_accuracy": 0.9364205300807953,
"num_tokens": 4776714.0,
"step": 541
},
{
"entropy": 2.18926739692688,
"epoch": 1.9592760180995477,
"grad_norm": 0.8895708918571472,
"learning_rate": 0.0004135143368967079,
"loss": 0.2514,
"mean_token_accuracy": 0.9361050724983215,
"num_tokens": 4785402.0,
"step": 542
},
{
"entropy": 2.2387169003486633,
"epoch": 1.96289592760181,
"grad_norm": 0.6013544797897339,
"learning_rate": 0.00041314776787758454,
"loss": 0.1502,
"mean_token_accuracy": 0.9594238847494125,
"num_tokens": 4793928.0,
"step": 543
},
{
"entropy": 2.208383619785309,
"epoch": 1.9665158371040725,
"grad_norm": 0.6934756636619568,
"learning_rate": 0.00041278069713430386,
"loss": 0.1777,
"mean_token_accuracy": 0.9619583487510681,
"num_tokens": 4802612.0,
"step": 544
},
{
"entropy": 2.2621757984161377,
"epoch": 1.970135746606335,
"grad_norm": 0.6920077800750732,
"learning_rate": 0.00041241312606104743,
"loss": 0.1689,
"mean_token_accuracy": 0.9594835937023163,
"num_tokens": 4811332.0,
"step": 545
},
{
"entropy": 2.2654454112052917,
"epoch": 1.9737556561085974,
"grad_norm": 0.6259592771530151,
"learning_rate": 0.000412045056053897,
"loss": 0.142,
"mean_token_accuracy": 0.9648078680038452,
"num_tokens": 4820441.0,
"step": 546
},
{
"entropy": 2.218056857585907,
"epoch": 1.9773755656108598,
"grad_norm": 0.5390617847442627,
"learning_rate": 0.0004116764885108292,
"loss": 0.1737,
"mean_token_accuracy": 0.9595656991004944,
"num_tokens": 4829437.0,
"step": 547
},
{
"entropy": 2.2571592330932617,
"epoch": 1.9809954751131222,
"grad_norm": 0.3656528890132904,
"learning_rate": 0.0004113074248317108,
"loss": 0.0545,
"mean_token_accuracy": 0.9825418293476105,
"num_tokens": 4838118.0,
"step": 548
},
{
"entropy": 2.1890549659729004,
"epoch": 1.9846153846153847,
"grad_norm": 0.5716155767440796,
"learning_rate": 0.00041093786641829247,
"loss": 0.0997,
"mean_token_accuracy": 0.9715700745582581,
"num_tokens": 4847073.0,
"step": 549
},
{
"entropy": 2.2726192474365234,
"epoch": 1.988235294117647,
"grad_norm": 0.4709530770778656,
"learning_rate": 0.0004105678146742042,
"loss": 0.0746,
"mean_token_accuracy": 0.9799739569425583,
"num_tokens": 4855755.0,
"step": 550
},
{
"entropy": 2.2328362464904785,
"epoch": 1.9918552036199095,
"grad_norm": 0.6773779392242432,
"learning_rate": 0.0004101972710049498,
"loss": 0.1418,
"mean_token_accuracy": 0.9629421681165695,
"num_tokens": 4864601.0,
"step": 551
},
{
"entropy": 2.199812740087509,
"epoch": 1.995475113122172,
"grad_norm": 0.717012882232666,
"learning_rate": 0.00040982623681790113,
"loss": 0.2948,
"mean_token_accuracy": 0.9432803690433502,
"num_tokens": 4873630.0,
"step": 552
},
{
"entropy": 2.2102787494659424,
"epoch": 1.9990950226244344,
"grad_norm": 0.6925314664840698,
"learning_rate": 0.00040945471352229346,
"loss": 0.2579,
"mean_token_accuracy": 0.9435124397277832,
"num_tokens": 4882714.0,
"step": 553
},
{
"entropy": 2.3318979740142822,
"epoch": 2.0,
"grad_norm": 2.688188314437866,
"learning_rate": 0.0004090827025292197,
"loss": 0.0283,
"mean_token_accuracy": 0.9918032884597778,
"num_tokens": 4883450.0,
"step": 554
},
{
"epoch": 2.0,
"eval_entropy": 2.2165925522160723,
"eval_loss": 0.16817161440849304,
"eval_mean_token_accuracy": 0.9567220133494555,
"eval_num_tokens": 4883450.0,
"eval_runtime": 116.1556,
"eval_samples_per_second": 3.177,
"eval_steps_per_second": 1.059,
"step": 554
}
],
"logging_steps": 1,
"max_steps": 1662,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.634384518674615e+17,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}