NLPSharedTask_QnA / trainer_state.json
vinaybabu's picture
Upload checkpoint-1500
af3bc52 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.29329096908224367,
"eval_steps": 500,
"global_step": 1500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.2905268957838416,
"epoch": 0.0031284370035439325,
"grad_norm": 0.43689030408859253,
"learning_rate": 9.999787808528638e-05,
"loss": 1.7852,
"mean_token_accuracy": 0.6027710330672562,
"num_tokens": 524288.0,
"step": 16
},
{
"entropy": 1.3333816397935152,
"epoch": 0.006256874007087865,
"grad_norm": 0.5030925273895264,
"learning_rate": 9.99909372761763e-05,
"loss": 1.2521,
"mean_token_accuracy": 0.6930539021268487,
"num_tokens": 1048576.0,
"step": 32
},
{
"entropy": 0.8592402129434049,
"epoch": 0.009385311010631798,
"grad_norm": 0.17196309566497803,
"learning_rate": 9.99791688121494e-05,
"loss": 0.8735,
"mean_token_accuracy": 0.789072047919035,
"num_tokens": 1572864.0,
"step": 48
},
{
"entropy": 0.8306971751153469,
"epoch": 0.01251374801417573,
"grad_norm": 0.18978707492351532,
"learning_rate": 9.996257382969333e-05,
"loss": 0.8334,
"mean_token_accuracy": 0.7984222243539989,
"num_tokens": 2096964.0,
"step": 64
},
{
"entropy": 0.7896183831617236,
"epoch": 0.015642185017719662,
"grad_norm": 0.13939695060253143,
"learning_rate": 9.994115393139555e-05,
"loss": 0.7922,
"mean_token_accuracy": 0.8059685812331736,
"num_tokens": 2621252.0,
"step": 80
},
{
"entropy": 0.7537235538475215,
"epoch": 0.018770622021263596,
"grad_norm": 0.14827103912830353,
"learning_rate": 9.991491118578856e-05,
"loss": 0.7564,
"mean_token_accuracy": 0.8114575678482652,
"num_tokens": 3145540.0,
"step": 96
},
{
"entropy": 0.7533294446766376,
"epoch": 0.02189905902480753,
"grad_norm": 0.14420101046562195,
"learning_rate": 9.988384812715006e-05,
"loss": 0.7549,
"mean_token_accuracy": 0.8126546451821923,
"num_tokens": 3669828.0,
"step": 112
},
{
"entropy": 0.735235239379108,
"epoch": 0.02502749602835146,
"grad_norm": 0.15388034284114838,
"learning_rate": 9.984796775525836e-05,
"loss": 0.7378,
"mean_token_accuracy": 0.813876539003104,
"num_tokens": 4194116.0,
"step": 128
},
{
"entropy": 0.745816265232861,
"epoch": 0.028155933031895394,
"grad_norm": 0.16290415823459625,
"learning_rate": 9.980727353510257e-05,
"loss": 0.7469,
"mean_token_accuracy": 0.8129773042164743,
"num_tokens": 4718404.0,
"step": 144
},
{
"entropy": 0.7049960081931204,
"epoch": 0.031284370035439324,
"grad_norm": 0.16859838366508484,
"learning_rate": 9.976176939654804e-05,
"loss": 0.7068,
"mean_token_accuracy": 0.8205063017085195,
"num_tokens": 5242421.0,
"step": 160
},
{
"entropy": 0.7164251236245036,
"epoch": 0.03441280703898326,
"grad_norm": 0.18480005860328674,
"learning_rate": 9.971145973395684e-05,
"loss": 0.7162,
"mean_token_accuracy": 0.8197130090557039,
"num_tokens": 5766709.0,
"step": 176
},
{
"entropy": 0.7345189340412617,
"epoch": 0.03754124404252719,
"grad_norm": 0.20077235996723175,
"learning_rate": 9.965634940576338e-05,
"loss": 0.7373,
"mean_token_accuracy": 0.8166410801932216,
"num_tokens": 6290997.0,
"step": 192
},
{
"entropy": 0.6868870840407908,
"epoch": 0.040669681046071125,
"grad_norm": 0.19620533287525177,
"learning_rate": 9.959644373400523e-05,
"loss": 0.6877,
"mean_token_accuracy": 0.8243126338347793,
"num_tokens": 6814838.0,
"step": 208
},
{
"entropy": 0.7020489743445069,
"epoch": 0.04379811804961506,
"grad_norm": 0.21126843988895416,
"learning_rate": 9.953174850380918e-05,
"loss": 0.7008,
"mean_token_accuracy": 0.8215000284835696,
"num_tokens": 7339126.0,
"step": 224
},
{
"entropy": 0.6887436110991985,
"epoch": 0.046926555053158986,
"grad_norm": 0.20873498916625977,
"learning_rate": 9.946226996283258e-05,
"loss": 0.6878,
"mean_token_accuracy": 0.825064530596137,
"num_tokens": 7863414.0,
"step": 240
},
{
"entropy": 0.6778992149047554,
"epoch": 0.05005499205670292,
"grad_norm": 0.20576342940330505,
"learning_rate": 9.938801482065998e-05,
"loss": 0.6807,
"mean_token_accuracy": 0.8253566385246813,
"num_tokens": 8387702.0,
"step": 256
},
{
"entropy": 0.6773952257353812,
"epoch": 0.05318342906024685,
"grad_norm": 0.2139802724123001,
"learning_rate": 9.930899024815517e-05,
"loss": 0.6734,
"mean_token_accuracy": 0.8275064108893275,
"num_tokens": 8911990.0,
"step": 272
},
{
"entropy": 0.6875678761862218,
"epoch": 0.05631186606379079,
"grad_norm": 0.22833411395549774,
"learning_rate": 9.922520387676868e-05,
"loss": 0.6866,
"mean_token_accuracy": 0.8259179475717247,
"num_tokens": 9436278.0,
"step": 288
},
{
"entropy": 0.6587484122719616,
"epoch": 0.05944030306733472,
"grad_norm": 0.23073996603488922,
"learning_rate": 9.91366637978009e-05,
"loss": 0.6573,
"mean_token_accuracy": 0.8297159126959741,
"num_tokens": 9960272.0,
"step": 304
},
{
"entropy": 0.6715414742939174,
"epoch": 0.06256874007087865,
"grad_norm": 0.2512345612049103,
"learning_rate": 9.904337856162053e-05,
"loss": 0.6645,
"mean_token_accuracy": 0.8285545711405575,
"num_tokens": 10484560.0,
"step": 320
},
{
"entropy": 0.6463351501151919,
"epoch": 0.06569717707442259,
"grad_norm": 0.2298312485218048,
"learning_rate": 9.894535717683902e-05,
"loss": 0.6429,
"mean_token_accuracy": 0.8320847055874765,
"num_tokens": 11008848.0,
"step": 336
},
{
"entropy": 0.6667697406373918,
"epoch": 0.06882561407796652,
"grad_norm": 0.22004173696041107,
"learning_rate": 9.884260910944053e-05,
"loss": 0.6681,
"mean_token_accuracy": 0.8278767997398973,
"num_tokens": 11533136.0,
"step": 352
},
{
"entropy": 0.6179311077576131,
"epoch": 0.07195405108151044,
"grad_norm": 0.22624558210372925,
"learning_rate": 9.873514428186778e-05,
"loss": 0.6173,
"mean_token_accuracy": 0.8369016530923545,
"num_tokens": 12057424.0,
"step": 368
},
{
"entropy": 0.6683201459236443,
"epoch": 0.07508248808505438,
"grad_norm": 0.24455475807189941,
"learning_rate": 9.862297307206392e-05,
"loss": 0.667,
"mean_token_accuracy": 0.828629030380398,
"num_tokens": 12581712.0,
"step": 384
},
{
"entropy": 0.6426783930510283,
"epoch": 0.07821092508859831,
"grad_norm": 0.23066706955432892,
"learning_rate": 9.850610631247019e-05,
"loss": 0.6415,
"mean_token_accuracy": 0.8333791512995958,
"num_tokens": 13106000.0,
"step": 400
},
{
"entropy": 0.6531911985948682,
"epoch": 0.08133936209214225,
"grad_norm": 0.24164645373821259,
"learning_rate": 9.838455528897998e-05,
"loss": 0.6551,
"mean_token_accuracy": 0.8303816900588572,
"num_tokens": 13630288.0,
"step": 416
},
{
"entropy": 0.6194327082484961,
"epoch": 0.08446779909568618,
"grad_norm": 0.23268474638462067,
"learning_rate": 9.82583317398488e-05,
"loss": 0.6175,
"mean_token_accuracy": 0.8367584650404751,
"num_tokens": 14154576.0,
"step": 432
},
{
"entropy": 0.6329398893285543,
"epoch": 0.08759623609923012,
"grad_norm": 0.24408580362796783,
"learning_rate": 9.81274478545608e-05,
"loss": 0.6325,
"mean_token_accuracy": 0.8347576032392681,
"num_tokens": 14678864.0,
"step": 448
},
{
"entropy": 0.6229353230446577,
"epoch": 0.09072467310277404,
"grad_norm": 0.24314868450164795,
"learning_rate": 9.79919162726516e-05,
"loss": 0.6222,
"mean_token_accuracy": 0.8371938765048981,
"num_tokens": 15203026.0,
"step": 464
},
{
"entropy": 0.6139514590613544,
"epoch": 0.09385311010631797,
"grad_norm": 0.25224220752716064,
"learning_rate": 9.785175008248768e-05,
"loss": 0.614,
"mean_token_accuracy": 0.838007087353617,
"num_tokens": 15727314.0,
"step": 480
},
{
"entropy": 0.6109699255321175,
"epoch": 0.09698154710986191,
"grad_norm": 0.24364836513996124,
"learning_rate": 9.770696282000244e-05,
"loss": 0.6106,
"mean_token_accuracy": 0.8386371252126992,
"num_tokens": 16251602.0,
"step": 496
},
{
"entropy": 0.6176430773921311,
"epoch": 0.10010998411340584,
"grad_norm": 0.2621923089027405,
"learning_rate": 9.755756846738902e-05,
"loss": 0.615,
"mean_token_accuracy": 0.8376702214591205,
"num_tokens": 16775637.0,
"step": 512
},
{
"entropy": 0.6160387259442359,
"epoch": 0.10323842111694978,
"grad_norm": 0.2385210245847702,
"learning_rate": 9.740358145174998e-05,
"loss": 0.6184,
"mean_token_accuracy": 0.8375125988386571,
"num_tokens": 17299925.0,
"step": 528
},
{
"entropy": 0.6175491204485297,
"epoch": 0.1063668581204937,
"grad_norm": 0.2511584460735321,
"learning_rate": 9.724501664370418e-05,
"loss": 0.617,
"mean_token_accuracy": 0.8368921047076583,
"num_tokens": 17824213.0,
"step": 544
},
{
"entropy": 0.6241217039059848,
"epoch": 0.10949529512403763,
"grad_norm": 0.25787243247032166,
"learning_rate": 9.708188935595059e-05,
"loss": 0.6254,
"mean_token_accuracy": 0.8372699371539056,
"num_tokens": 18348187.0,
"step": 560
},
{
"entropy": 0.6261336030438542,
"epoch": 0.11262373212758157,
"grad_norm": 0.2410293072462082,
"learning_rate": 9.691421534178966e-05,
"loss": 0.6246,
"mean_token_accuracy": 0.8367050038650632,
"num_tokens": 18872475.0,
"step": 576
},
{
"entropy": 0.6158532982226461,
"epoch": 0.1157521691311255,
"grad_norm": 0.25348979234695435,
"learning_rate": 9.674201079360188e-05,
"loss": 0.6152,
"mean_token_accuracy": 0.8399296645075083,
"num_tokens": 19396763.0,
"step": 592
},
{
"entropy": 0.601322092814371,
"epoch": 0.11888060613466944,
"grad_norm": 0.2700308859348297,
"learning_rate": 9.656529234128418e-05,
"loss": 0.6021,
"mean_token_accuracy": 0.8411092776805162,
"num_tokens": 19920930.0,
"step": 608
},
{
"entropy": 0.6039648232981563,
"epoch": 0.12200904313821337,
"grad_norm": 0.24755984544754028,
"learning_rate": 9.638407705064392e-05,
"loss": 0.6039,
"mean_token_accuracy": 0.8405463420785964,
"num_tokens": 20445218.0,
"step": 624
},
{
"entropy": 0.608759083552286,
"epoch": 0.1251374801417573,
"grad_norm": 0.26475900411605835,
"learning_rate": 9.619838242175083e-05,
"loss": 0.6077,
"mean_token_accuracy": 0.8407429889775813,
"num_tokens": 20969506.0,
"step": 640
},
{
"entropy": 0.5959294943604618,
"epoch": 0.12826591714530122,
"grad_norm": 0.24157044291496277,
"learning_rate": 9.600822638724705e-05,
"loss": 0.5964,
"mean_token_accuracy": 0.8421691716648638,
"num_tokens": 21493794.0,
"step": 656
},
{
"entropy": 0.5929773084353656,
"epoch": 0.13139435414884518,
"grad_norm": 0.2490786910057068,
"learning_rate": 9.581362731061536e-05,
"loss": 0.5924,
"mean_token_accuracy": 0.8422665409743786,
"num_tokens": 22018082.0,
"step": 672
},
{
"entropy": 0.5856014562305063,
"epoch": 0.1345227911523891,
"grad_norm": 0.26487597823143005,
"learning_rate": 9.561460398440577e-05,
"loss": 0.5845,
"mean_token_accuracy": 0.8448229790665209,
"num_tokens": 22542370.0,
"step": 688
},
{
"entropy": 0.5972939203493297,
"epoch": 0.13765122815593303,
"grad_norm": 0.2536788582801819,
"learning_rate": 9.54111756284207e-05,
"loss": 0.5979,
"mean_token_accuracy": 0.8417689246125519,
"num_tokens": 23066518.0,
"step": 704
},
{
"entropy": 0.5793864431325346,
"epoch": 0.14077966515947696,
"grad_norm": 0.2498546540737152,
"learning_rate": 9.520336188785905e-05,
"loss": 0.5797,
"mean_token_accuracy": 0.8449356239289045,
"num_tokens": 23590806.0,
"step": 720
},
{
"entropy": 0.5852908829692751,
"epoch": 0.14390810216302088,
"grad_norm": 0.2533164322376251,
"learning_rate": 9.499118283141887e-05,
"loss": 0.5851,
"mean_token_accuracy": 0.8440230167470872,
"num_tokens": 24115094.0,
"step": 736
},
{
"entropy": 0.5853700931183994,
"epoch": 0.14703653916656484,
"grad_norm": 0.2594294250011444,
"learning_rate": 9.477465894935939e-05,
"loss": 0.5833,
"mean_token_accuracy": 0.8439973699860275,
"num_tokens": 24638793.0,
"step": 752
},
{
"entropy": 0.5820769551210105,
"epoch": 0.15016497617010877,
"grad_norm": 0.26996445655822754,
"learning_rate": 9.455381115152234e-05,
"loss": 0.5813,
"mean_token_accuracy": 0.84427694324404,
"num_tokens": 25163081.0,
"step": 768
},
{
"entropy": 0.5865267035551369,
"epoch": 0.1532934131736527,
"grad_norm": 0.2632192075252533,
"learning_rate": 9.432866076531248e-05,
"loss": 0.5865,
"mean_token_accuracy": 0.843591536860913,
"num_tokens": 25687369.0,
"step": 784
},
{
"entropy": 0.5832636400591582,
"epoch": 0.15642185017719662,
"grad_norm": 0.2748562693595886,
"learning_rate": 9.409922953363824e-05,
"loss": 0.5814,
"mean_token_accuracy": 0.8445194149389863,
"num_tokens": 26211657.0,
"step": 800
},
{
"entropy": 0.5827851279173046,
"epoch": 0.15955028718074055,
"grad_norm": 0.27480757236480713,
"learning_rate": 9.386553961281179e-05,
"loss": 0.5829,
"mean_token_accuracy": 0.8449146216735244,
"num_tokens": 26735945.0,
"step": 816
},
{
"entropy": 0.5730462830979377,
"epoch": 0.1626787241842845,
"grad_norm": 0.26577237248420715,
"learning_rate": 9.362761357040956e-05,
"loss": 0.5748,
"mean_token_accuracy": 0.8454892951995134,
"num_tokens": 27260233.0,
"step": 832
},
{
"entropy": 0.5673604859039187,
"epoch": 0.16580716118782843,
"grad_norm": 0.27280473709106445,
"learning_rate": 9.338547438309269e-05,
"loss": 0.5659,
"mean_token_accuracy": 0.8469823002815247,
"num_tokens": 27784521.0,
"step": 848
},
{
"entropy": 0.5737447079736739,
"epoch": 0.16893559819137235,
"grad_norm": 0.243574321269989,
"learning_rate": 9.313914543438835e-05,
"loss": 0.5735,
"mean_token_accuracy": 0.8462529699318111,
"num_tokens": 28308797.0,
"step": 864
},
{
"entropy": 0.5753675031010062,
"epoch": 0.17206403519491628,
"grad_norm": 0.2583043575286865,
"learning_rate": 9.288865051243142e-05,
"loss": 0.5747,
"mean_token_accuracy": 0.8464267165400088,
"num_tokens": 28833085.0,
"step": 880
},
{
"entropy": 0.5841653808020055,
"epoch": 0.17519247219846024,
"grad_norm": 0.2513315677642822,
"learning_rate": 9.263401380766739e-05,
"loss": 0.5837,
"mean_token_accuracy": 0.8444525892846286,
"num_tokens": 29357373.0,
"step": 896
},
{
"entropy": 0.5843619098886847,
"epoch": 0.17832090920200416,
"grad_norm": 0.2498634308576584,
"learning_rate": 9.237525991051615e-05,
"loss": 0.5848,
"mean_token_accuracy": 0.8453418002463877,
"num_tokens": 29881380.0,
"step": 912
},
{
"entropy": 0.5886711834464222,
"epoch": 0.1814493462055481,
"grad_norm": 0.25676679611206055,
"learning_rate": 9.211241380899739e-05,
"loss": 0.589,
"mean_token_accuracy": 0.8431676919572055,
"num_tokens": 30405668.0,
"step": 928
},
{
"entropy": 0.5667355505283922,
"epoch": 0.18457778320909202,
"grad_norm": 0.2631304860115051,
"learning_rate": 9.184550088631741e-05,
"loss": 0.5636,
"mean_token_accuracy": 0.8479502676054835,
"num_tokens": 30929956.0,
"step": 944
},
{
"entropy": 0.5632404731586576,
"epoch": 0.18770622021263594,
"grad_norm": 0.24389183521270752,
"learning_rate": 9.157454691841789e-05,
"loss": 0.5652,
"mean_token_accuracy": 0.8481321800500154,
"num_tokens": 31453452.0,
"step": 960
},
{
"entropy": 0.5674644499085844,
"epoch": 0.1908346572161799,
"grad_norm": 0.25306281447410583,
"learning_rate": 9.129957807148666e-05,
"loss": 0.5651,
"mean_token_accuracy": 0.8487979606725276,
"num_tokens": 31977740.0,
"step": 976
},
{
"entropy": 0.5516653840895742,
"epoch": 0.19396309421972383,
"grad_norm": 0.26283109188079834,
"learning_rate": 9.102062089943086e-05,
"loss": 0.5535,
"mean_token_accuracy": 0.8500809515826404,
"num_tokens": 32502028.0,
"step": 992
},
{
"entropy": 0.561382147250697,
"epoch": 0.19709153122326775,
"grad_norm": 0.25755682587623596,
"learning_rate": 9.07377023413126e-05,
"loss": 0.5586,
"mean_token_accuracy": 0.8493707147426903,
"num_tokens": 33025959.0,
"step": 1008
},
{
"entropy": 0.5676966737955809,
"epoch": 0.20021996822681168,
"grad_norm": 0.25775137543678284,
"learning_rate": 9.045084971874738e-05,
"loss": 0.5671,
"mean_token_accuracy": 0.847740254830569,
"num_tokens": 33550247.0,
"step": 1024
},
{
"entropy": 0.5602117348462343,
"epoch": 0.2033484052303556,
"grad_norm": 0.25713664293289185,
"learning_rate": 9.016009073326571e-05,
"loss": 0.5619,
"mean_token_accuracy": 0.8491683504544199,
"num_tokens": 34074535.0,
"step": 1040
},
{
"entropy": 0.58018215931952,
"epoch": 0.20647684223389956,
"grad_norm": 0.2585735619068146,
"learning_rate": 8.986545346363792e-05,
"loss": 0.5792,
"mean_token_accuracy": 0.8453449127264321,
"num_tokens": 34598649.0,
"step": 1056
},
{
"entropy": 0.5575782191008329,
"epoch": 0.2096052792374435,
"grad_norm": 0.2617577612400055,
"learning_rate": 8.956696636316255e-05,
"loss": 0.5558,
"mean_token_accuracy": 0.8505983497016132,
"num_tokens": 35122937.0,
"step": 1072
},
{
"entropy": 0.5580868402030319,
"epoch": 0.2127337162409874,
"grad_norm": 0.2838793992996216,
"learning_rate": 8.926465825691865e-05,
"loss": 0.5585,
"mean_token_accuracy": 0.8499568556435406,
"num_tokens": 35647225.0,
"step": 1088
},
{
"entropy": 0.571673326427117,
"epoch": 0.21586215324453134,
"grad_norm": 0.26587942242622375,
"learning_rate": 8.895855833898207e-05,
"loss": 0.5705,
"mean_token_accuracy": 0.8471988807432353,
"num_tokens": 36171427.0,
"step": 1104
},
{
"entropy": 0.568238423904404,
"epoch": 0.21899059024807527,
"grad_norm": 0.24594295024871826,
"learning_rate": 8.864869616960625e-05,
"loss": 0.5682,
"mean_token_accuracy": 0.8479999089613557,
"num_tokens": 36695715.0,
"step": 1120
},
{
"entropy": 0.5731002090033144,
"epoch": 0.22211902725161922,
"grad_norm": 0.25455793738365173,
"learning_rate": 8.833510167236747e-05,
"loss": 0.5732,
"mean_token_accuracy": 0.8478013505227864,
"num_tokens": 37220003.0,
"step": 1136
},
{
"entropy": 0.5481538840103894,
"epoch": 0.22524746425516315,
"grad_norm": 0.26975589990615845,
"learning_rate": 8.801780513127513e-05,
"loss": 0.5475,
"mean_token_accuracy": 0.8506444320082664,
"num_tokens": 37743882.0,
"step": 1152
},
{
"entropy": 0.5527894860133529,
"epoch": 0.22837590125870708,
"grad_norm": 0.2522968053817749,
"learning_rate": 8.769683718784734e-05,
"loss": 0.5516,
"mean_token_accuracy": 0.8505086144432425,
"num_tokens": 38268170.0,
"step": 1168
},
{
"entropy": 0.5540862991474569,
"epoch": 0.231504338262251,
"grad_norm": 0.2609933912754059,
"learning_rate": 8.737222883815164e-05,
"loss": 0.5526,
"mean_token_accuracy": 0.8506460795179009,
"num_tokens": 38792458.0,
"step": 1184
},
{
"entropy": 0.555841225432232,
"epoch": 0.23463277526579493,
"grad_norm": 0.2725919187068939,
"learning_rate": 8.704401142981184e-05,
"loss": 0.5554,
"mean_token_accuracy": 0.8496399251744151,
"num_tokens": 39316746.0,
"step": 1200
},
{
"entropy": 0.5415672848466784,
"epoch": 0.23776121226933888,
"grad_norm": 0.2661166191101074,
"learning_rate": 8.671221665898073e-05,
"loss": 0.5435,
"mean_token_accuracy": 0.8517133295536041,
"num_tokens": 39841034.0,
"step": 1216
},
{
"entropy": 0.5415612279903144,
"epoch": 0.2408896492728828,
"grad_norm": 0.25321218371391296,
"learning_rate": 8.637687656727913e-05,
"loss": 0.5406,
"mean_token_accuracy": 0.8533380702137947,
"num_tokens": 40365322.0,
"step": 1232
},
{
"entropy": 0.5538674369454384,
"epoch": 0.24401808627642674,
"grad_norm": 0.2791917622089386,
"learning_rate": 8.60380235387016e-05,
"loss": 0.5518,
"mean_token_accuracy": 0.8515777760185301,
"num_tokens": 40889610.0,
"step": 1248
},
{
"entropy": 0.5463056627195328,
"epoch": 0.24714652327997066,
"grad_norm": 0.2911370098590851,
"learning_rate": 8.569569029648923e-05,
"loss": 0.5462,
"mean_token_accuracy": 0.8518756083212793,
"num_tokens": 41413898.0,
"step": 1264
},
{
"entropy": 0.5479311102535576,
"epoch": 0.2502749602835146,
"grad_norm": 0.26030269265174866,
"learning_rate": 8.53499098999693e-05,
"loss": 0.5497,
"mean_token_accuracy": 0.8511806610040367,
"num_tokens": 41938186.0,
"step": 1280
},
{
"entropy": 0.5538808973506093,
"epoch": 0.2534033972870585,
"grad_norm": 0.26965585350990295,
"learning_rate": 8.500071574136295e-05,
"loss": 0.5537,
"mean_token_accuracy": 0.8513314896263182,
"num_tokens": 42462474.0,
"step": 1296
},
{
"entropy": 0.5512072397395968,
"epoch": 0.25653183429060245,
"grad_norm": 0.24840131402015686,
"learning_rate": 8.46481415425604e-05,
"loss": 0.5487,
"mean_token_accuracy": 0.8515388667583466,
"num_tokens": 42986398.0,
"step": 1312
},
{
"entropy": 0.5752683402970433,
"epoch": 0.2596602712941464,
"grad_norm": 0.26552262902259827,
"learning_rate": 8.429222135186427e-05,
"loss": 0.5776,
"mean_token_accuracy": 0.8456897586584091,
"num_tokens": 43510686.0,
"step": 1328
},
{
"entropy": 0.5338181289844215,
"epoch": 0.26278870829769035,
"grad_norm": 0.2537406086921692,
"learning_rate": 8.393298954070178e-05,
"loss": 0.5323,
"mean_token_accuracy": 0.8548826249316335,
"num_tokens": 44034974.0,
"step": 1344
},
{
"entropy": 0.5521234918851405,
"epoch": 0.2659171453012343,
"grad_norm": 0.2752557098865509,
"learning_rate": 8.357048080030522e-05,
"loss": 0.5512,
"mean_token_accuracy": 0.851849777624011,
"num_tokens": 44559253.0,
"step": 1360
},
{
"entropy": 0.5413030001800507,
"epoch": 0.2690455823047782,
"grad_norm": 0.2753056585788727,
"learning_rate": 8.320473013836196e-05,
"loss": 0.5387,
"mean_token_accuracy": 0.8531146934255958,
"num_tokens": 45083541.0,
"step": 1376
},
{
"entropy": 0.5636138301342726,
"epoch": 0.27217401930832213,
"grad_norm": 0.2751758396625519,
"learning_rate": 8.283577287563367e-05,
"loss": 0.5662,
"mean_token_accuracy": 0.8482500137761235,
"num_tokens": 45607829.0,
"step": 1392
},
{
"entropy": 0.5462560928426683,
"epoch": 0.27530245631186606,
"grad_norm": 0.2786915898323059,
"learning_rate": 8.246364464254539e-05,
"loss": 0.5458,
"mean_token_accuracy": 0.8512360248714685,
"num_tokens": 46132117.0,
"step": 1408
},
{
"entropy": 0.5498062786646187,
"epoch": 0.27843089331541,
"grad_norm": 0.28462111949920654,
"learning_rate": 8.20883813757447e-05,
"loss": 0.5479,
"mean_token_accuracy": 0.851142474450171,
"num_tokens": 46656405.0,
"step": 1424
},
{
"entropy": 0.561028536176309,
"epoch": 0.2815593303189539,
"grad_norm": 0.28582215309143066,
"learning_rate": 8.171001931463122e-05,
"loss": 0.56,
"mean_token_accuracy": 0.8496361062861979,
"num_tokens": 47180693.0,
"step": 1440
},
{
"entropy": 0.5402022732887417,
"epoch": 0.28468776732249784,
"grad_norm": 0.25975897908210754,
"learning_rate": 8.132859499785707e-05,
"loss": 0.5393,
"mean_token_accuracy": 0.8542964975349605,
"num_tokens": 47704981.0,
"step": 1456
},
{
"entropy": 0.5573246807325631,
"epoch": 0.28781620432604177,
"grad_norm": 0.2716176211833954,
"learning_rate": 8.094414525979822e-05,
"loss": 0.56,
"mean_token_accuracy": 0.8493000832386315,
"num_tokens": 48229269.0,
"step": 1472
},
{
"entropy": 0.5363587085157633,
"epoch": 0.29094464132958575,
"grad_norm": 0.26990601420402527,
"learning_rate": 8.055670722699736e-05,
"loss": 0.5353,
"mean_token_accuracy": 0.8545466028153896,
"num_tokens": 48753557.0,
"step": 1488
}
],
"logging_steps": 16,
"max_steps": 5115,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.1754438561418445e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}