gemma-finetuned-s0 / trainer_state.json
sha000's picture
Upload folder using huggingface_hub
a00487c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 939,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.471254324913025,
"epoch": 0.03194888178913738,
"grad_norm": 8.043270111083984,
"learning_rate": 1.9808306709265177e-05,
"loss": 1.1636,
"mean_token_accuracy": 0.6890625,
"num_tokens": 12480.0,
"step": 10
},
{
"entropy": 0.90457843542099,
"epoch": 0.06389776357827476,
"grad_norm": 10.676708221435547,
"learning_rate": 1.959531416400426e-05,
"loss": 0.3864,
"mean_token_accuracy": 0.8375,
"num_tokens": 24960.0,
"step": 20
},
{
"entropy": 1.027526319026947,
"epoch": 0.09584664536741214,
"grad_norm": 5.594727039337158,
"learning_rate": 1.9382321618743344e-05,
"loss": 0.3759,
"mean_token_accuracy": 0.83671875,
"num_tokens": 37440.0,
"step": 30
},
{
"entropy": 1.192073893547058,
"epoch": 0.12779552715654952,
"grad_norm": 3.208804130554199,
"learning_rate": 1.916932907348243e-05,
"loss": 0.3341,
"mean_token_accuracy": 0.85703125,
"num_tokens": 49920.0,
"step": 40
},
{
"entropy": 1.1430011987686157,
"epoch": 0.1597444089456869,
"grad_norm": 19.184051513671875,
"learning_rate": 1.895633652822151e-05,
"loss": 0.3429,
"mean_token_accuracy": 0.85078125,
"num_tokens": 62400.0,
"step": 50
},
{
"entropy": 1.1257157444953918,
"epoch": 0.19169329073482427,
"grad_norm": 5.893524646759033,
"learning_rate": 1.87433439829606e-05,
"loss": 0.2334,
"mean_token_accuracy": 0.8875,
"num_tokens": 74880.0,
"step": 60
},
{
"entropy": 0.992573595046997,
"epoch": 0.22364217252396165,
"grad_norm": 15.351304054260254,
"learning_rate": 1.8530351437699682e-05,
"loss": 0.1187,
"mean_token_accuracy": 0.96328125,
"num_tokens": 87360.0,
"step": 70
},
{
"entropy": 0.805773138999939,
"epoch": 0.25559105431309903,
"grad_norm": 74.02106475830078,
"learning_rate": 1.8317358892438765e-05,
"loss": 0.1924,
"mean_token_accuracy": 0.93125,
"num_tokens": 99840.0,
"step": 80
},
{
"entropy": 0.8376959323883056,
"epoch": 0.28753993610223644,
"grad_norm": 9.446106910705566,
"learning_rate": 1.8104366347177852e-05,
"loss": 0.0837,
"mean_token_accuracy": 0.96796875,
"num_tokens": 112320.0,
"step": 90
},
{
"entropy": 0.6883749544620514,
"epoch": 0.3194888178913738,
"grad_norm": 29.95865249633789,
"learning_rate": 1.7891373801916932e-05,
"loss": 0.0712,
"mean_token_accuracy": 0.9671875,
"num_tokens": 124800.0,
"step": 100
},
{
"entropy": 0.5861309468746185,
"epoch": 0.3514376996805112,
"grad_norm": 0.981063723564148,
"learning_rate": 1.767838125665602e-05,
"loss": 0.0339,
"mean_token_accuracy": 0.9890625,
"num_tokens": 137280.0,
"step": 110
},
{
"entropy": 0.6167496562004089,
"epoch": 0.38338658146964855,
"grad_norm": 0.3446030020713806,
"learning_rate": 1.7465388711395103e-05,
"loss": 0.019,
"mean_token_accuracy": 0.9953125,
"num_tokens": 149760.0,
"step": 120
},
{
"entropy": 0.6116879105567932,
"epoch": 0.41533546325878595,
"grad_norm": 7.9384846687316895,
"learning_rate": 1.7252396166134186e-05,
"loss": 0.0179,
"mean_token_accuracy": 0.99453125,
"num_tokens": 162240.0,
"step": 130
},
{
"entropy": 0.5835295200347901,
"epoch": 0.4472843450479233,
"grad_norm": 15.288229942321777,
"learning_rate": 1.7039403620873273e-05,
"loss": 0.0144,
"mean_token_accuracy": 0.99375,
"num_tokens": 174720.0,
"step": 140
},
{
"entropy": 0.5895743370056152,
"epoch": 0.4792332268370607,
"grad_norm": 8.906089782714844,
"learning_rate": 1.6826411075612353e-05,
"loss": 0.0277,
"mean_token_accuracy": 0.9953125,
"num_tokens": 187200.0,
"step": 150
},
{
"entropy": 0.6350247144699097,
"epoch": 0.5111821086261981,
"grad_norm": 12.155186653137207,
"learning_rate": 1.661341853035144e-05,
"loss": 0.009,
"mean_token_accuracy": 0.9984375,
"num_tokens": 199680.0,
"step": 160
},
{
"entropy": 0.6250557661056518,
"epoch": 0.5431309904153354,
"grad_norm": 1.7694993019104004,
"learning_rate": 1.6400425985090524e-05,
"loss": 0.0297,
"mean_token_accuracy": 0.9890625,
"num_tokens": 212160.0,
"step": 170
},
{
"entropy": 0.6015866935253144,
"epoch": 0.5750798722044729,
"grad_norm": 24.392311096191406,
"learning_rate": 1.6187433439829607e-05,
"loss": 0.0199,
"mean_token_accuracy": 0.9921875,
"num_tokens": 224640.0,
"step": 180
},
{
"entropy": 0.6133609235286712,
"epoch": 0.6070287539936102,
"grad_norm": 0.03015461377799511,
"learning_rate": 1.5974440894568694e-05,
"loss": 0.0131,
"mean_token_accuracy": 0.996875,
"num_tokens": 237120.0,
"step": 190
},
{
"entropy": 0.6456725597381592,
"epoch": 0.6389776357827476,
"grad_norm": 18.586185455322266,
"learning_rate": 1.5761448349307774e-05,
"loss": 0.0133,
"mean_token_accuracy": 0.99609375,
"num_tokens": 249600.0,
"step": 200
},
{
"entropy": 0.6170299232006073,
"epoch": 0.670926517571885,
"grad_norm": 49.949588775634766,
"learning_rate": 1.554845580404686e-05,
"loss": 0.0231,
"mean_token_accuracy": 0.9921875,
"num_tokens": 262080.0,
"step": 210
},
{
"entropy": 0.5970291554927826,
"epoch": 0.7028753993610224,
"grad_norm": 0.9214933514595032,
"learning_rate": 1.5335463258785944e-05,
"loss": 0.0179,
"mean_token_accuracy": 0.990625,
"num_tokens": 274560.0,
"step": 220
},
{
"entropy": 0.580757600069046,
"epoch": 0.7348242811501597,
"grad_norm": 8.092296600341797,
"learning_rate": 1.5122470713525028e-05,
"loss": 0.0349,
"mean_token_accuracy": 0.98984375,
"num_tokens": 287040.0,
"step": 230
},
{
"entropy": 0.5787507772445679,
"epoch": 0.7667731629392971,
"grad_norm": 10.055787086486816,
"learning_rate": 1.4909478168264111e-05,
"loss": 0.0065,
"mean_token_accuracy": 0.99609375,
"num_tokens": 299520.0,
"step": 240
},
{
"entropy": 0.5758024156093597,
"epoch": 0.7987220447284346,
"grad_norm": 61.38268280029297,
"learning_rate": 1.4696485623003197e-05,
"loss": 0.0424,
"mean_token_accuracy": 0.9875,
"num_tokens": 312000.0,
"step": 250
},
{
"entropy": 0.6395232379436493,
"epoch": 0.8306709265175719,
"grad_norm": 2.0960898399353027,
"learning_rate": 1.4483493077742282e-05,
"loss": 0.1762,
"mean_token_accuracy": 0.95,
"num_tokens": 324480.0,
"step": 260
},
{
"entropy": 0.6194823384284973,
"epoch": 0.8626198083067093,
"grad_norm": 2.2915937900543213,
"learning_rate": 1.4270500532481364e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9984375,
"num_tokens": 336960.0,
"step": 270
},
{
"entropy": 0.553073239326477,
"epoch": 0.8945686900958466,
"grad_norm": 0.6241616606712341,
"learning_rate": 1.4057507987220449e-05,
"loss": 0.0121,
"mean_token_accuracy": 0.99765625,
"num_tokens": 349440.0,
"step": 280
},
{
"entropy": 0.588155323266983,
"epoch": 0.9265175718849841,
"grad_norm": 0.8865500688552856,
"learning_rate": 1.3844515441959532e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.99765625,
"num_tokens": 361920.0,
"step": 290
},
{
"entropy": 0.6138588011264801,
"epoch": 0.9584664536741214,
"grad_norm": 0.16805018484592438,
"learning_rate": 1.3631522896698617e-05,
"loss": 0.001,
"mean_token_accuracy": 0.99921875,
"num_tokens": 374400.0,
"step": 300
},
{
"entropy": 0.6157549917697906,
"epoch": 0.9904153354632588,
"grad_norm": 11.855587005615234,
"learning_rate": 1.3418530351437703e-05,
"loss": 0.0051,
"mean_token_accuracy": 0.9984375,
"num_tokens": 386880.0,
"step": 310
},
{
"epoch": 1.0,
"eval_entropy": 0.5923710940759394,
"eval_loss": 0.01570574752986431,
"eval_mean_token_accuracy": 0.995253164556962,
"eval_num_tokens": 389844.0,
"eval_runtime": 13.2845,
"eval_samples_per_second": 188.189,
"eval_steps_per_second": 5.947,
"step": 313
},
{
"entropy": 0.5902234852313996,
"epoch": 1.0223642172523961,
"grad_norm": 0.026217741891741753,
"learning_rate": 1.3205537806176784e-05,
"loss": 0.0056,
"mean_token_accuracy": 0.9984375,
"num_tokens": 398580.0,
"step": 320
},
{
"entropy": 0.562452882528305,
"epoch": 1.0543130990415335,
"grad_norm": 22.588623046875,
"learning_rate": 1.299254526091587e-05,
"loss": 0.0152,
"mean_token_accuracy": 0.9921875,
"num_tokens": 411060.0,
"step": 330
},
{
"entropy": 0.553607851266861,
"epoch": 1.0862619808306708,
"grad_norm": 2.0158348083496094,
"learning_rate": 1.2779552715654953e-05,
"loss": 0.0202,
"mean_token_accuracy": 0.99453125,
"num_tokens": 423540.0,
"step": 340
},
{
"entropy": 0.602299690246582,
"epoch": 1.1182108626198084,
"grad_norm": 0.03288736939430237,
"learning_rate": 1.2566560170394038e-05,
"loss": 0.0265,
"mean_token_accuracy": 0.9921875,
"num_tokens": 436020.0,
"step": 350
},
{
"entropy": 0.5916118025779724,
"epoch": 1.1501597444089458,
"grad_norm": 0.2714002728462219,
"learning_rate": 1.235356762513312e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.996875,
"num_tokens": 448500.0,
"step": 360
},
{
"entropy": 0.6014433860778808,
"epoch": 1.182108626198083,
"grad_norm": 0.8565823435783386,
"learning_rate": 1.2140575079872205e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.99921875,
"num_tokens": 460980.0,
"step": 370
},
{
"entropy": 0.558870005607605,
"epoch": 1.2140575079872205,
"grad_norm": 0.4954104721546173,
"learning_rate": 1.192758253461129e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.99921875,
"num_tokens": 473460.0,
"step": 380
},
{
"entropy": 0.5449843347072602,
"epoch": 1.2460063897763578,
"grad_norm": 0.0184471495449543,
"learning_rate": 1.1714589989350374e-05,
"loss": 0.0012,
"mean_token_accuracy": 1.0,
"num_tokens": 485940.0,
"step": 390
},
{
"entropy": 0.5302130222320557,
"epoch": 1.2779552715654952,
"grad_norm": 0.5405293107032776,
"learning_rate": 1.1501597444089459e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.99921875,
"num_tokens": 498420.0,
"step": 400
},
{
"entropy": 0.5257469773292541,
"epoch": 1.3099041533546325,
"grad_norm": 14.052752494812012,
"learning_rate": 1.1288604898828541e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9984375,
"num_tokens": 510900.0,
"step": 410
},
{
"entropy": 0.5204551070928574,
"epoch": 1.34185303514377,
"grad_norm": 0.041872043162584305,
"learning_rate": 1.1075612353567626e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9984375,
"num_tokens": 523380.0,
"step": 420
},
{
"entropy": 0.5394512295722962,
"epoch": 1.3738019169329074,
"grad_norm": 0.06288646906614304,
"learning_rate": 1.086261980830671e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 535860.0,
"step": 430
},
{
"entropy": 0.5291238784790039,
"epoch": 1.4057507987220448,
"grad_norm": 0.0030913001392036676,
"learning_rate": 1.0649627263045795e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.99921875,
"num_tokens": 548340.0,
"step": 440
},
{
"entropy": 0.5285849571228027,
"epoch": 1.4376996805111821,
"grad_norm": 1.8844810724258423,
"learning_rate": 1.043663471778488e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 560820.0,
"step": 450
},
{
"entropy": 0.5385882794857025,
"epoch": 1.4696485623003195,
"grad_norm": 0.11690080910921097,
"learning_rate": 1.0223642172523962e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 573300.0,
"step": 460
},
{
"entropy": 0.5441839516162872,
"epoch": 1.5015974440894568,
"grad_norm": 0.0011391988955438137,
"learning_rate": 1.0010649627263047e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.99921875,
"num_tokens": 585780.0,
"step": 470
},
{
"entropy": 0.5367125928401947,
"epoch": 1.5335463258785942,
"grad_norm": 0.595458984375,
"learning_rate": 9.79765708200213e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 598260.0,
"step": 480
},
{
"entropy": 0.5380379557609558,
"epoch": 1.5654952076677318,
"grad_norm": 0.0110127292573452,
"learning_rate": 9.584664536741216e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 610740.0,
"step": 490
},
{
"entropy": 0.5656424820423126,
"epoch": 1.5974440894568689,
"grad_norm": 0.018918879330158234,
"learning_rate": 9.3716719914803e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 623220.0,
"step": 500
},
{
"entropy": 0.5534205734729767,
"epoch": 1.6293929712460065,
"grad_norm": 0.0005970252677798271,
"learning_rate": 9.158679446219383e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 635700.0,
"step": 510
},
{
"entropy": 0.5591952800750732,
"epoch": 1.6613418530351438,
"grad_norm": 0.23496565222740173,
"learning_rate": 8.945686900958466e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 648180.0,
"step": 520
},
{
"entropy": 0.5553164839744568,
"epoch": 1.6932907348242812,
"grad_norm": 0.015620424412190914,
"learning_rate": 8.732694355697551e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 660660.0,
"step": 530
},
{
"entropy": 0.5558278143405915,
"epoch": 1.7252396166134185,
"grad_norm": 0.013437892310321331,
"learning_rate": 8.519701810436637e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 673140.0,
"step": 540
},
{
"entropy": 0.5494430124759674,
"epoch": 1.7571884984025559,
"grad_norm": 0.05179116502404213,
"learning_rate": 8.30670926517572e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 685620.0,
"step": 550
},
{
"entropy": 0.5591476142406464,
"epoch": 1.7891373801916934,
"grad_norm": 0.001572166453115642,
"learning_rate": 8.093716719914804e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 698100.0,
"step": 560
},
{
"entropy": 0.5543432533740997,
"epoch": 1.8210862619808306,
"grad_norm": 0.0029468077700585127,
"learning_rate": 7.880724174653887e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 710580.0,
"step": 570
},
{
"entropy": 0.5561375498771668,
"epoch": 1.8530351437699681,
"grad_norm": 7.772324897814542e-05,
"learning_rate": 7.667731629392972e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 723060.0,
"step": 580
},
{
"entropy": 0.554932814836502,
"epoch": 1.8849840255591053,
"grad_norm": 0.023860394954681396,
"learning_rate": 7.454739084132056e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 735540.0,
"step": 590
},
{
"entropy": 0.5592103660106659,
"epoch": 1.9169329073482428,
"grad_norm": 6.846313772257417e-05,
"learning_rate": 7.241746538871141e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 748020.0,
"step": 600
},
{
"entropy": 0.5566479444503785,
"epoch": 1.9488817891373802,
"grad_norm": 0.00017782168288249522,
"learning_rate": 7.028753993610224e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 760500.0,
"step": 610
},
{
"entropy": 0.560238641500473,
"epoch": 1.9808306709265175,
"grad_norm": 0.0009979789610952139,
"learning_rate": 6.815761448349309e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 772980.0,
"step": 620
},
{
"epoch": 2.0,
"eval_entropy": 0.5581134014491793,
"eval_loss": 6.910775482538156e-06,
"eval_mean_token_accuracy": 1.0,
"eval_num_tokens": 779688.0,
"eval_runtime": 13.4056,
"eval_samples_per_second": 186.489,
"eval_steps_per_second": 5.893,
"step": 626
},
{
"entropy": 0.5542679131031036,
"epoch": 2.012779552715655,
"grad_norm": 0.0001906445249915123,
"learning_rate": 6.602768903088392e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 784680.0,
"step": 630
},
{
"entropy": 0.5601355612277985,
"epoch": 2.0447284345047922,
"grad_norm": 8.866995631251484e-05,
"learning_rate": 6.3897763578274765e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 797160.0,
"step": 640
},
{
"entropy": 0.5562943339347839,
"epoch": 2.07667731629393,
"grad_norm": 4.927597183268517e-05,
"learning_rate": 6.17678381256656e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 809640.0,
"step": 650
},
{
"entropy": 0.5592762529850006,
"epoch": 2.108626198083067,
"grad_norm": 0.0003652777522802353,
"learning_rate": 5.963791267305645e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 822120.0,
"step": 660
},
{
"entropy": 0.560578465461731,
"epoch": 2.1405750798722045,
"grad_norm": 0.0005100357229821384,
"learning_rate": 5.7507987220447296e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 834600.0,
"step": 670
},
{
"entropy": 0.5573894202709198,
"epoch": 2.1725239616613417,
"grad_norm": 0.0007649549515917897,
"learning_rate": 5.537806176783813e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 847080.0,
"step": 680
},
{
"entropy": 0.5623638391494751,
"epoch": 2.2044728434504792,
"grad_norm": 0.007040001451969147,
"learning_rate": 5.324813631522897e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 859560.0,
"step": 690
},
{
"entropy": 0.5608678042888642,
"epoch": 2.236421725239617,
"grad_norm": 0.0008389271097257733,
"learning_rate": 5.111821086261981e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 872040.0,
"step": 700
},
{
"entropy": 0.562554806470871,
"epoch": 2.268370607028754,
"grad_norm": 0.0008370543946512043,
"learning_rate": 4.898828541001065e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 884520.0,
"step": 710
},
{
"entropy": 0.5613407075405121,
"epoch": 2.3003194888178915,
"grad_norm": 3.100551475654356e-05,
"learning_rate": 4.68583599574015e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 897000.0,
"step": 720
},
{
"entropy": 0.5588717699050904,
"epoch": 2.3322683706070286,
"grad_norm": 0.0035649905912578106,
"learning_rate": 4.472843450479233e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 909480.0,
"step": 730
},
{
"entropy": 0.5609096884727478,
"epoch": 2.364217252396166,
"grad_norm": 0.0003579799085855484,
"learning_rate": 4.259850905218318e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 921960.0,
"step": 740
},
{
"entropy": 0.5581632852554321,
"epoch": 2.3961661341853033,
"grad_norm": 0.00018412918143440038,
"learning_rate": 4.046858359957402e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 934440.0,
"step": 750
},
{
"entropy": 0.5592087864875793,
"epoch": 2.428115015974441,
"grad_norm": 0.001302594318985939,
"learning_rate": 3.833865814696486e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 946920.0,
"step": 760
},
{
"entropy": 0.5602552175521851,
"epoch": 2.460063897763578,
"grad_norm": 0.0001967909629456699,
"learning_rate": 3.6208732694355704e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 959400.0,
"step": 770
},
{
"entropy": 0.5587169051170349,
"epoch": 2.4920127795527156,
"grad_norm": 3.6476201785262674e-05,
"learning_rate": 3.4078807241746544e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 971880.0,
"step": 780
},
{
"entropy": 0.5619259059429169,
"epoch": 2.523961661341853,
"grad_norm": 0.00010852525883819908,
"learning_rate": 3.1948881789137383e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 984360.0,
"step": 790
},
{
"entropy": 0.5560723125934601,
"epoch": 2.5559105431309903,
"grad_norm": 7.974612526595592e-05,
"learning_rate": 2.9818956336528226e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 996840.0,
"step": 800
},
{
"entropy": 0.5587869763374329,
"epoch": 2.587859424920128,
"grad_norm": 0.0005656637367792428,
"learning_rate": 2.7689030883919065e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1009320.0,
"step": 810
},
{
"entropy": 0.5632799625396728,
"epoch": 2.619808306709265,
"grad_norm": 6.125601794337854e-05,
"learning_rate": 2.5559105431309904e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1021800.0,
"step": 820
},
{
"entropy": 0.558579832315445,
"epoch": 2.6517571884984026,
"grad_norm": 0.0008585830801166594,
"learning_rate": 2.342917997870075e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1034280.0,
"step": 830
},
{
"entropy": 0.5579914152622223,
"epoch": 2.68370607028754,
"grad_norm": 5.5771433835616335e-05,
"learning_rate": 2.129925452609159e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1046760.0,
"step": 840
},
{
"entropy": 0.5599809646606445,
"epoch": 2.7156549520766773,
"grad_norm": 0.00012791369226761162,
"learning_rate": 1.916932907348243e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1059240.0,
"step": 850
},
{
"entropy": 0.5608228087425232,
"epoch": 2.747603833865815,
"grad_norm": 8.307035022880882e-05,
"learning_rate": 1.7039403620873272e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1071720.0,
"step": 860
},
{
"entropy": 0.5607754468917847,
"epoch": 2.779552715654952,
"grad_norm": 5.250581671134569e-05,
"learning_rate": 1.4909478168264113e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1084200.0,
"step": 870
},
{
"entropy": 0.5697051167488099,
"epoch": 2.8115015974440896,
"grad_norm": 0.0002477150410413742,
"learning_rate": 1.2779552715654952e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1096680.0,
"step": 880
},
{
"entropy": 0.5599372982978821,
"epoch": 2.8434504792332267,
"grad_norm": 9.851283539319411e-05,
"learning_rate": 1.0649627263045796e-06,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1109160.0,
"step": 890
},
{
"entropy": 0.5631425619125366,
"epoch": 2.8753993610223643,
"grad_norm": 5.103146395413205e-05,
"learning_rate": 8.519701810436636e-07,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1121640.0,
"step": 900
},
{
"entropy": 0.5630890011787415,
"epoch": 2.9073482428115014,
"grad_norm": 0.0011606919579207897,
"learning_rate": 6.389776357827476e-07,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1134120.0,
"step": 910
},
{
"entropy": 0.5582900941371918,
"epoch": 2.939297124600639,
"grad_norm": 0.0002894483332056552,
"learning_rate": 4.259850905218318e-07,
"loss": 0.0,
"mean_token_accuracy": 1.0,
"num_tokens": 1146600.0,
"step": 920
},
{
"entropy": 0.5616473734378815,
"epoch": 2.9712460063897765,
"grad_norm": 4.966451888321899e-05,
"learning_rate": 2.129925452609159e-07,
"loss": 0.0008,
"mean_token_accuracy": 0.99921875,
"num_tokens": 1159080.0,
"step": 930
}
],
"logging_steps": 10,
"max_steps": 939,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 704006916867072.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}