codellama-coding-assistant / trainer_state.json
RajGana's picture
Upload folder using huggingface_hub
ea38030 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.719208870242733,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.3359488248825073,
"epoch": 0.0079912096693637,
"grad_norm": 13.5,
"learning_rate": 0.00018,
"loss": 1.1391,
"mean_token_accuracy": 0.7321531124413013,
"num_tokens": 17863.0,
"step": 10
},
{
"entropy": 0.8153514951467514,
"epoch": 0.0159824193387274,
"grad_norm": 0.357421875,
"learning_rate": 0.00019855072463768116,
"loss": 0.735,
"mean_token_accuracy": 0.806717099994421,
"num_tokens": 35538.0,
"step": 20
},
{
"entropy": 0.7226301684975625,
"epoch": 0.0239736290080911,
"grad_norm": 0.27734375,
"learning_rate": 0.00019694041867954914,
"loss": 0.6906,
"mean_token_accuracy": 0.8121421404182911,
"num_tokens": 52898.0,
"step": 30
},
{
"entropy": 0.7096233293414116,
"epoch": 0.0319648386774548,
"grad_norm": 0.408203125,
"learning_rate": 0.00019533011272141707,
"loss": 0.6794,
"mean_token_accuracy": 0.8093454599380493,
"num_tokens": 71120.0,
"step": 40
},
{
"entropy": 0.6432753155007959,
"epoch": 0.0399560483468185,
"grad_norm": 0.29296875,
"learning_rate": 0.00019371980676328502,
"loss": 0.5778,
"mean_token_accuracy": 0.8227488771080971,
"num_tokens": 89183.0,
"step": 50
},
{
"entropy": 0.6528786711394787,
"epoch": 0.0479472580161822,
"grad_norm": 0.19921875,
"learning_rate": 0.000192109500805153,
"loss": 0.6056,
"mean_token_accuracy": 0.8218209922313691,
"num_tokens": 108013.0,
"step": 60
},
{
"entropy": 0.6535129006952047,
"epoch": 0.055938467685545896,
"grad_norm": 0.2236328125,
"learning_rate": 0.00019049919484702096,
"loss": 0.5954,
"mean_token_accuracy": 0.8255642831325531,
"num_tokens": 126649.0,
"step": 70
},
{
"entropy": 0.6079864472150802,
"epoch": 0.0639296773549096,
"grad_norm": 0.2001953125,
"learning_rate": 0.00018888888888888888,
"loss": 0.5911,
"mean_token_accuracy": 0.8231404431164264,
"num_tokens": 144182.0,
"step": 80
},
{
"entropy": 0.6695162910968065,
"epoch": 0.0719208870242733,
"grad_norm": 0.1962890625,
"learning_rate": 0.00018727858293075687,
"loss": 0.6312,
"mean_token_accuracy": 0.8185268670320511,
"num_tokens": 163634.0,
"step": 90
},
{
"entropy": 0.6382982328534126,
"epoch": 0.079912096693637,
"grad_norm": 0.2109375,
"learning_rate": 0.00018566827697262482,
"loss": 0.5965,
"mean_token_accuracy": 0.8219615206122398,
"num_tokens": 182899.0,
"step": 100
},
{
"entropy": 0.615644377656281,
"epoch": 0.0879033063630007,
"grad_norm": 0.17578125,
"learning_rate": 0.00018405797101449275,
"loss": 0.555,
"mean_token_accuracy": 0.8339135125279427,
"num_tokens": 201540.0,
"step": 110
},
{
"entropy": 0.6227292243391276,
"epoch": 0.0958945160323644,
"grad_norm": 0.2392578125,
"learning_rate": 0.00018244766505636073,
"loss": 0.5958,
"mean_token_accuracy": 0.822028624266386,
"num_tokens": 220163.0,
"step": 120
},
{
"entropy": 0.6280987083911895,
"epoch": 0.1038857257017281,
"grad_norm": 0.2177734375,
"learning_rate": 0.00018083735909822868,
"loss": 0.5827,
"mean_token_accuracy": 0.8300345286726951,
"num_tokens": 237989.0,
"step": 130
},
{
"entropy": 0.6323467470705509,
"epoch": 0.11187693537109179,
"grad_norm": 0.154296875,
"learning_rate": 0.00017922705314009664,
"loss": 0.5986,
"mean_token_accuracy": 0.8197977431118488,
"num_tokens": 256250.0,
"step": 140
},
{
"entropy": 0.6162901744246483,
"epoch": 0.1198681450404555,
"grad_norm": 0.1552734375,
"learning_rate": 0.00017761674718196456,
"loss": 0.5769,
"mean_token_accuracy": 0.8255695670843124,
"num_tokens": 274386.0,
"step": 150
},
{
"entropy": 0.6158512964844703,
"epoch": 0.1278593547098192,
"grad_norm": 0.203125,
"learning_rate": 0.00017600644122383254,
"loss": 0.5713,
"mean_token_accuracy": 0.8270042009651661,
"num_tokens": 292835.0,
"step": 160
},
{
"entropy": 0.6406407546252012,
"epoch": 0.1358505643791829,
"grad_norm": 0.2119140625,
"learning_rate": 0.0001743961352657005,
"loss": 0.6104,
"mean_token_accuracy": 0.8200316116213798,
"num_tokens": 310392.0,
"step": 170
},
{
"entropy": 0.6069310043007136,
"epoch": 0.1438417740485466,
"grad_norm": 0.1640625,
"learning_rate": 0.00017278582930756842,
"loss": 0.5624,
"mean_token_accuracy": 0.8317995510995388,
"num_tokens": 331305.0,
"step": 180
},
{
"entropy": 0.6235411781817675,
"epoch": 0.1518329837179103,
"grad_norm": 0.1640625,
"learning_rate": 0.0001711755233494364,
"loss": 0.5852,
"mean_token_accuracy": 0.8278815999627114,
"num_tokens": 349604.0,
"step": 190
},
{
"entropy": 0.6172796195372939,
"epoch": 0.159824193387274,
"grad_norm": 0.1669921875,
"learning_rate": 0.00016956521739130436,
"loss": 0.5764,
"mean_token_accuracy": 0.8313593462109565,
"num_tokens": 367876.0,
"step": 200
},
{
"entropy": 0.6155283484607935,
"epoch": 0.1678154030566377,
"grad_norm": 0.1865234375,
"learning_rate": 0.00016795491143317231,
"loss": 0.5773,
"mean_token_accuracy": 0.8269082359969616,
"num_tokens": 385573.0,
"step": 210
},
{
"entropy": 0.6083606427535415,
"epoch": 0.1758066127260014,
"grad_norm": 0.154296875,
"learning_rate": 0.00016634460547504027,
"loss": 0.5704,
"mean_token_accuracy": 0.8263973362743855,
"num_tokens": 404851.0,
"step": 220
},
{
"entropy": 0.6131238225847483,
"epoch": 0.1837978223953651,
"grad_norm": 0.20703125,
"learning_rate": 0.00016473429951690822,
"loss": 0.5817,
"mean_token_accuracy": 0.8245558224618434,
"num_tokens": 422809.0,
"step": 230
},
{
"entropy": 0.6223553754389286,
"epoch": 0.1917890320647288,
"grad_norm": 0.234375,
"learning_rate": 0.00016312399355877618,
"loss": 0.5871,
"mean_token_accuracy": 0.8232570059597493,
"num_tokens": 439086.0,
"step": 240
},
{
"entropy": 0.6230555597692728,
"epoch": 0.1997802417340925,
"grad_norm": 0.171875,
"learning_rate": 0.00016151368760064413,
"loss": 0.5751,
"mean_token_accuracy": 0.8289580881595612,
"num_tokens": 457157.0,
"step": 250
},
{
"entropy": 0.5794363841414452,
"epoch": 0.2077714514034562,
"grad_norm": 0.2294921875,
"learning_rate": 0.00015990338164251208,
"loss": 0.5627,
"mean_token_accuracy": 0.8365659207105637,
"num_tokens": 474701.0,
"step": 260
},
{
"entropy": 0.5860258772969246,
"epoch": 0.2157626610728199,
"grad_norm": 0.1484375,
"learning_rate": 0.00015829307568438004,
"loss": 0.5363,
"mean_token_accuracy": 0.8401569269597531,
"num_tokens": 495405.0,
"step": 270
},
{
"entropy": 0.581408916413784,
"epoch": 0.22375387074218359,
"grad_norm": 0.205078125,
"learning_rate": 0.000156682769726248,
"loss": 0.5593,
"mean_token_accuracy": 0.8319723285734654,
"num_tokens": 512629.0,
"step": 280
},
{
"entropy": 0.5774631313979626,
"epoch": 0.2317450804115473,
"grad_norm": 0.171875,
"learning_rate": 0.00015507246376811595,
"loss": 0.5445,
"mean_token_accuracy": 0.8400227598845958,
"num_tokens": 531652.0,
"step": 290
},
{
"entropy": 0.5888700131326914,
"epoch": 0.239736290080911,
"grad_norm": 0.1884765625,
"learning_rate": 0.0001534621578099839,
"loss": 0.5475,
"mean_token_accuracy": 0.8398719631135464,
"num_tokens": 551807.0,
"step": 300
},
{
"entropy": 0.6303706657141447,
"epoch": 0.2477274997502747,
"grad_norm": 0.185546875,
"learning_rate": 0.00015185185185185185,
"loss": 0.5994,
"mean_token_accuracy": 0.8221785329282284,
"num_tokens": 570173.0,
"step": 310
},
{
"entropy": 0.591961058229208,
"epoch": 0.2557187094196384,
"grad_norm": 0.1708984375,
"learning_rate": 0.0001502415458937198,
"loss": 0.5588,
"mean_token_accuracy": 0.8339079335331917,
"num_tokens": 587517.0,
"step": 320
},
{
"entropy": 0.6203833676874637,
"epoch": 0.2637099190890021,
"grad_norm": 0.158203125,
"learning_rate": 0.00014863123993558776,
"loss": 0.5993,
"mean_token_accuracy": 0.8254540674388409,
"num_tokens": 605533.0,
"step": 330
},
{
"entropy": 0.5948904637247324,
"epoch": 0.2717011287583658,
"grad_norm": 0.1689453125,
"learning_rate": 0.00014702093397745574,
"loss": 0.5386,
"mean_token_accuracy": 0.835470549017191,
"num_tokens": 623145.0,
"step": 340
},
{
"entropy": 0.5892129261046648,
"epoch": 0.2796923384277295,
"grad_norm": 0.2041015625,
"learning_rate": 0.00014541062801932367,
"loss": 0.5445,
"mean_token_accuracy": 0.8327487081289291,
"num_tokens": 642429.0,
"step": 350
},
{
"entropy": 0.58230458535254,
"epoch": 0.2876835480970932,
"grad_norm": 0.1748046875,
"learning_rate": 0.00014380032206119162,
"loss": 0.5458,
"mean_token_accuracy": 0.8369288526475429,
"num_tokens": 660595.0,
"step": 360
},
{
"entropy": 0.5953514769673347,
"epoch": 0.2956747577664569,
"grad_norm": 0.1494140625,
"learning_rate": 0.0001421900161030596,
"loss": 0.5564,
"mean_token_accuracy": 0.8314083501696586,
"num_tokens": 680301.0,
"step": 370
},
{
"entropy": 0.6272314839065075,
"epoch": 0.3036659674358206,
"grad_norm": 0.189453125,
"learning_rate": 0.00014057971014492753,
"loss": 0.5879,
"mean_token_accuracy": 0.8269149273633957,
"num_tokens": 698836.0,
"step": 380
},
{
"entropy": 0.5974850662052631,
"epoch": 0.3116571771051843,
"grad_norm": 0.1875,
"learning_rate": 0.0001389694041867955,
"loss": 0.5567,
"mean_token_accuracy": 0.8330555327236653,
"num_tokens": 717301.0,
"step": 390
},
{
"entropy": 0.610439121723175,
"epoch": 0.319648386774548,
"grad_norm": 0.1943359375,
"learning_rate": 0.00013735909822866347,
"loss": 0.5798,
"mean_token_accuracy": 0.8273908801376819,
"num_tokens": 735623.0,
"step": 400
},
{
"entropy": 0.6218720726668835,
"epoch": 0.3276395964439117,
"grad_norm": 0.1689453125,
"learning_rate": 0.00013574879227053142,
"loss": 0.5681,
"mean_token_accuracy": 0.8264160886406898,
"num_tokens": 754095.0,
"step": 410
},
{
"entropy": 0.5961160399019718,
"epoch": 0.3356308061132754,
"grad_norm": 0.130859375,
"learning_rate": 0.00013413848631239935,
"loss": 0.5649,
"mean_token_accuracy": 0.8278753645718098,
"num_tokens": 772932.0,
"step": 420
},
{
"entropy": 0.5970222994685173,
"epoch": 0.3436220157826391,
"grad_norm": 0.1552734375,
"learning_rate": 0.0001325281803542673,
"loss": 0.5717,
"mean_token_accuracy": 0.8289826177060604,
"num_tokens": 791954.0,
"step": 430
},
{
"entropy": 0.5869237255305052,
"epoch": 0.3516132254520028,
"grad_norm": 0.23828125,
"learning_rate": 0.00013091787439613528,
"loss": 0.5424,
"mean_token_accuracy": 0.8353226915001869,
"num_tokens": 810372.0,
"step": 440
},
{
"entropy": 0.6047268303111195,
"epoch": 0.3596044351213665,
"grad_norm": 0.16015625,
"learning_rate": 0.0001293075684380032,
"loss": 0.5816,
"mean_token_accuracy": 0.8292522899806499,
"num_tokens": 828031.0,
"step": 450
},
{
"entropy": 0.6398113902658225,
"epoch": 0.3675956447907302,
"grad_norm": 0.193359375,
"learning_rate": 0.00012769726247987117,
"loss": 0.587,
"mean_token_accuracy": 0.8254243724048138,
"num_tokens": 844965.0,
"step": 460
},
{
"entropy": 0.5699722157791257,
"epoch": 0.3755868544600939,
"grad_norm": 0.150390625,
"learning_rate": 0.00012608695652173915,
"loss": 0.5302,
"mean_token_accuracy": 0.8379433415830135,
"num_tokens": 864068.0,
"step": 470
},
{
"entropy": 0.6004057168960572,
"epoch": 0.3835780641294576,
"grad_norm": 0.1689453125,
"learning_rate": 0.0001244766505636071,
"loss": 0.5735,
"mean_token_accuracy": 0.8319472163915634,
"num_tokens": 882516.0,
"step": 480
},
{
"entropy": 0.612379564717412,
"epoch": 0.3915692737988213,
"grad_norm": 0.17578125,
"learning_rate": 0.00012286634460547503,
"loss": 0.5605,
"mean_token_accuracy": 0.8312513306736946,
"num_tokens": 901332.0,
"step": 490
},
{
"entropy": 0.5999802689999342,
"epoch": 0.399560483468185,
"grad_norm": 0.2236328125,
"learning_rate": 0.00012125603864734301,
"loss": 0.5844,
"mean_token_accuracy": 0.8295043386518955,
"num_tokens": 918902.0,
"step": 500
},
{
"entropy": 0.6438330963253975,
"epoch": 0.4075516931375487,
"grad_norm": 0.181640625,
"learning_rate": 0.00011964573268921095,
"loss": 0.6039,
"mean_token_accuracy": 0.8217731453478336,
"num_tokens": 937381.0,
"step": 510
},
{
"entropy": 0.557589478418231,
"epoch": 0.4155429028069124,
"grad_norm": 0.1748046875,
"learning_rate": 0.0001180354267310789,
"loss": 0.5347,
"mean_token_accuracy": 0.8419624336063862,
"num_tokens": 956408.0,
"step": 520
},
{
"entropy": 0.5831106752157211,
"epoch": 0.4235341124762761,
"grad_norm": 0.15625,
"learning_rate": 0.00011642512077294687,
"loss": 0.5566,
"mean_token_accuracy": 0.8344054028391839,
"num_tokens": 974727.0,
"step": 530
},
{
"entropy": 0.6096597962081433,
"epoch": 0.4315253221456398,
"grad_norm": 0.16015625,
"learning_rate": 0.00011481481481481482,
"loss": 0.5906,
"mean_token_accuracy": 0.8249844819307327,
"num_tokens": 992039.0,
"step": 540
},
{
"entropy": 0.6296380385756493,
"epoch": 0.4395165318150035,
"grad_norm": 0.185546875,
"learning_rate": 0.00011320450885668277,
"loss": 0.5774,
"mean_token_accuracy": 0.8290597923099995,
"num_tokens": 1010746.0,
"step": 550
},
{
"entropy": 0.5989726323634386,
"epoch": 0.44750774148436717,
"grad_norm": 0.1552734375,
"learning_rate": 0.00011159420289855073,
"loss": 0.5668,
"mean_token_accuracy": 0.8317835494875908,
"num_tokens": 1029302.0,
"step": 560
},
{
"entropy": 0.5985535632818937,
"epoch": 0.4554989511537309,
"grad_norm": 0.1533203125,
"learning_rate": 0.00010998389694041869,
"loss": 0.5927,
"mean_token_accuracy": 0.8251331336796284,
"num_tokens": 1047787.0,
"step": 570
},
{
"entropy": 0.5919383157044649,
"epoch": 0.4634901608230946,
"grad_norm": 0.140625,
"learning_rate": 0.00010837359098228663,
"loss": 0.5584,
"mean_token_accuracy": 0.8338681124150753,
"num_tokens": 1067471.0,
"step": 580
},
{
"entropy": 0.5655450899153948,
"epoch": 0.4714813704924583,
"grad_norm": 0.146484375,
"learning_rate": 0.00010676328502415461,
"loss": 0.5343,
"mean_token_accuracy": 0.8383485890924931,
"num_tokens": 1086046.0,
"step": 590
},
{
"entropy": 0.616737426072359,
"epoch": 0.479472580161822,
"grad_norm": 0.173828125,
"learning_rate": 0.00010515297906602255,
"loss": 0.5908,
"mean_token_accuracy": 0.8193590499460697,
"num_tokens": 1103227.0,
"step": 600
},
{
"entropy": 0.5877503883093596,
"epoch": 0.4874637898311857,
"grad_norm": 0.16796875,
"learning_rate": 0.0001035426731078905,
"loss": 0.5505,
"mean_token_accuracy": 0.8355500593781471,
"num_tokens": 1121994.0,
"step": 610
},
{
"entropy": 0.5908785469830036,
"epoch": 0.4954549995005494,
"grad_norm": 0.2255859375,
"learning_rate": 0.00010193236714975847,
"loss": 0.5794,
"mean_token_accuracy": 0.8315194040536881,
"num_tokens": 1139965.0,
"step": 620
},
{
"entropy": 0.6211531057953834,
"epoch": 0.5034462091699131,
"grad_norm": 0.134765625,
"learning_rate": 0.00010032206119162641,
"loss": 0.5664,
"mean_token_accuracy": 0.8258850328624249,
"num_tokens": 1159075.0,
"step": 630
},
{
"entropy": 0.5950863931328059,
"epoch": 0.5114374188392768,
"grad_norm": 0.1630859375,
"learning_rate": 9.871175523349438e-05,
"loss": 0.5497,
"mean_token_accuracy": 0.8346662126481533,
"num_tokens": 1176494.0,
"step": 640
},
{
"entropy": 0.5760080838575959,
"epoch": 0.5194286285086405,
"grad_norm": 0.23828125,
"learning_rate": 9.710144927536232e-05,
"loss": 0.5632,
"mean_token_accuracy": 0.8364547491073608,
"num_tokens": 1195371.0,
"step": 650
},
{
"entropy": 0.5897768154740334,
"epoch": 0.5274198381780042,
"grad_norm": 0.150390625,
"learning_rate": 9.549114331723029e-05,
"loss": 0.5611,
"mean_token_accuracy": 0.8353584706783295,
"num_tokens": 1215474.0,
"step": 660
},
{
"entropy": 0.6259935267269612,
"epoch": 0.5354110478473679,
"grad_norm": 0.19921875,
"learning_rate": 9.388083735909823e-05,
"loss": 0.5834,
"mean_token_accuracy": 0.8251566261053085,
"num_tokens": 1233066.0,
"step": 670
},
{
"entropy": 0.5858545243740082,
"epoch": 0.5434022575167315,
"grad_norm": 0.2080078125,
"learning_rate": 9.227053140096618e-05,
"loss": 0.5709,
"mean_token_accuracy": 0.8273489251732826,
"num_tokens": 1249355.0,
"step": 680
},
{
"entropy": 0.6020776845514775,
"epoch": 0.5513934671860953,
"grad_norm": 0.171875,
"learning_rate": 9.066022544283415e-05,
"loss": 0.5657,
"mean_token_accuracy": 0.8260138787329196,
"num_tokens": 1267301.0,
"step": 690
},
{
"entropy": 0.596510236337781,
"epoch": 0.559384676855459,
"grad_norm": 0.154296875,
"learning_rate": 8.904991948470209e-05,
"loss": 0.5557,
"mean_token_accuracy": 0.834468311816454,
"num_tokens": 1285467.0,
"step": 700
},
{
"entropy": 0.5882966015487909,
"epoch": 0.5673758865248227,
"grad_norm": 0.166015625,
"learning_rate": 8.743961352657006e-05,
"loss": 0.5423,
"mean_token_accuracy": 0.8352835536003113,
"num_tokens": 1304357.0,
"step": 710
},
{
"entropy": 0.6191790480166673,
"epoch": 0.5753670961941864,
"grad_norm": 0.1533203125,
"learning_rate": 8.582930756843801e-05,
"loss": 0.5759,
"mean_token_accuracy": 0.8274984866380691,
"num_tokens": 1321602.0,
"step": 720
},
{
"entropy": 0.6024752855300903,
"epoch": 0.5833583058635501,
"grad_norm": 0.2001953125,
"learning_rate": 8.421900161030597e-05,
"loss": 0.5638,
"mean_token_accuracy": 0.8288030169904232,
"num_tokens": 1340012.0,
"step": 730
},
{
"entropy": 0.5633068412542344,
"epoch": 0.5913495155329138,
"grad_norm": 0.1796875,
"learning_rate": 8.260869565217392e-05,
"loss": 0.5262,
"mean_token_accuracy": 0.8409125037491322,
"num_tokens": 1358549.0,
"step": 740
},
{
"entropy": 0.6121923718601465,
"epoch": 0.5993407252022775,
"grad_norm": 0.1640625,
"learning_rate": 8.099838969404187e-05,
"loss": 0.5782,
"mean_token_accuracy": 0.8247248627245426,
"num_tokens": 1376295.0,
"step": 750
},
{
"entropy": 0.5850671246647835,
"epoch": 0.6073319348716412,
"grad_norm": 0.12255859375,
"learning_rate": 7.938808373590983e-05,
"loss": 0.5481,
"mean_token_accuracy": 0.8378213487565518,
"num_tokens": 1396825.0,
"step": 760
},
{
"entropy": 0.6004991352558136,
"epoch": 0.6153231445410049,
"grad_norm": 0.1484375,
"learning_rate": 7.777777777777778e-05,
"loss": 0.5697,
"mean_token_accuracy": 0.8314851686358452,
"num_tokens": 1415779.0,
"step": 770
},
{
"entropy": 0.615888693742454,
"epoch": 0.6233143542103686,
"grad_norm": 0.1494140625,
"learning_rate": 7.616747181964574e-05,
"loss": 0.586,
"mean_token_accuracy": 0.8290720954537392,
"num_tokens": 1433844.0,
"step": 780
},
{
"entropy": 0.631605738401413,
"epoch": 0.6313055638797322,
"grad_norm": 0.1787109375,
"learning_rate": 7.455716586151369e-05,
"loss": 0.5896,
"mean_token_accuracy": 0.823421498388052,
"num_tokens": 1452173.0,
"step": 790
},
{
"entropy": 0.5806491080671549,
"epoch": 0.639296773549096,
"grad_norm": 0.1767578125,
"learning_rate": 7.294685990338164e-05,
"loss": 0.5541,
"mean_token_accuracy": 0.8376387834548951,
"num_tokens": 1469089.0,
"step": 800
},
{
"entropy": 0.5787392556667328,
"epoch": 0.6472879832184597,
"grad_norm": 0.2734375,
"learning_rate": 7.13365539452496e-05,
"loss": 0.5344,
"mean_token_accuracy": 0.832699004560709,
"num_tokens": 1488541.0,
"step": 810
},
{
"entropy": 0.5935858219861985,
"epoch": 0.6552791928878234,
"grad_norm": 0.1435546875,
"learning_rate": 6.972624798711755e-05,
"loss": 0.549,
"mean_token_accuracy": 0.8358408592641353,
"num_tokens": 1506966.0,
"step": 820
},
{
"entropy": 0.6146302495151759,
"epoch": 0.663270402557187,
"grad_norm": 0.15234375,
"learning_rate": 6.811594202898552e-05,
"loss": 0.5794,
"mean_token_accuracy": 0.8260477609932423,
"num_tokens": 1526428.0,
"step": 830
},
{
"entropy": 0.6146373618394136,
"epoch": 0.6712616122265508,
"grad_norm": 0.169921875,
"learning_rate": 6.650563607085346e-05,
"loss": 0.5917,
"mean_token_accuracy": 0.8253330059349537,
"num_tokens": 1543859.0,
"step": 840
},
{
"entropy": 0.5822377149015665,
"epoch": 0.6792528218959145,
"grad_norm": 0.1962890625,
"learning_rate": 6.489533011272141e-05,
"loss": 0.5561,
"mean_token_accuracy": 0.8367624327540397,
"num_tokens": 1562062.0,
"step": 850
},
{
"entropy": 0.5773209661245347,
"epoch": 0.6872440315652782,
"grad_norm": 0.1650390625,
"learning_rate": 6.328502415458938e-05,
"loss": 0.5144,
"mean_token_accuracy": 0.83621421828866,
"num_tokens": 1580374.0,
"step": 860
},
{
"entropy": 0.5891423657536506,
"epoch": 0.6952352412346419,
"grad_norm": 0.18359375,
"learning_rate": 6.167471819645732e-05,
"loss": 0.5766,
"mean_token_accuracy": 0.8288635179400444,
"num_tokens": 1598383.0,
"step": 870
},
{
"entropy": 0.5979194710031152,
"epoch": 0.7032264509040056,
"grad_norm": 0.15234375,
"learning_rate": 6.006441223832528e-05,
"loss": 0.5452,
"mean_token_accuracy": 0.8346437945961952,
"num_tokens": 1617322.0,
"step": 880
},
{
"entropy": 0.6381862349808216,
"epoch": 0.7112176605733693,
"grad_norm": 0.173828125,
"learning_rate": 5.8454106280193244e-05,
"loss": 0.6008,
"mean_token_accuracy": 0.8242271035909653,
"num_tokens": 1633941.0,
"step": 890
},
{
"entropy": 0.5802713014185429,
"epoch": 0.719208870242733,
"grad_norm": 0.15625,
"learning_rate": 5.684380032206119e-05,
"loss": 0.5462,
"mean_token_accuracy": 0.8342008836567402,
"num_tokens": 1651765.0,
"step": 900
}
],
"logging_steps": 10,
"max_steps": 1252,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.51066844059566e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}