MusaCoder-27B / trainer_state.json
DashLuuu's picture
Add files using upload-large-folder tool
603aea8 verified
raw
history blame
67.9 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.1588377723970944,
"eval_steps": 2000,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00387409200968523,
"grad_norm": 1.598986029624939,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.40103477239608765,
"step": 1,
"token_acc": 0.8705013179702646
},
{
"epoch": 0.00774818401937046,
"grad_norm": 1.988427758216858,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.4314175248146057,
"step": 2,
"token_acc": 0.8610088406262493
},
{
"epoch": 0.01162227602905569,
"grad_norm": 1.6525965929031372,
"learning_rate": 6.000000000000001e-07,
"loss": 0.41751521825790405,
"step": 3,
"token_acc": 0.8659394954574845
},
{
"epoch": 0.01549636803874092,
"grad_norm": 1.3594496250152588,
"learning_rate": 8.000000000000001e-07,
"loss": 0.39516761898994446,
"step": 4,
"token_acc": 0.8712739341656057
},
{
"epoch": 0.01937046004842615,
"grad_norm": 1.4459697008132935,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.41443824768066406,
"step": 5,
"token_acc": 0.8673064711013153
},
{
"epoch": 0.02324455205811138,
"grad_norm": 1.165871024131775,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.3951181471347809,
"step": 6,
"token_acc": 0.8717731277799119
},
{
"epoch": 0.02711864406779661,
"grad_norm": 1.150416374206543,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.40562719106674194,
"step": 7,
"token_acc": 0.8683618627898853
},
{
"epoch": 0.03099273607748184,
"grad_norm": 0.7621377110481262,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.4054454565048218,
"step": 8,
"token_acc": 0.8672108063124587
},
{
"epoch": 0.03486682808716707,
"grad_norm": 0.588590681552887,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.383542001247406,
"step": 9,
"token_acc": 0.8732824386699718
},
{
"epoch": 0.0387409200968523,
"grad_norm": 0.5067570805549622,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.3769374489784241,
"step": 10,
"token_acc": 0.8749500487669789
},
{
"epoch": 0.04261501210653753,
"grad_norm": 0.6109248995780945,
"learning_rate": 2.2e-06,
"loss": 0.3687226176261902,
"step": 11,
"token_acc": 0.8778429629931872
},
{
"epoch": 0.04648910411622276,
"grad_norm": 0.6168301701545715,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.3631238639354706,
"step": 12,
"token_acc": 0.8792909317747671
},
{
"epoch": 0.05036319612590799,
"grad_norm": 0.5205990076065063,
"learning_rate": 2.6e-06,
"loss": 0.37530872225761414,
"step": 13,
"token_acc": 0.8747995859550826
},
{
"epoch": 0.05423728813559322,
"grad_norm": 0.4970836639404297,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.33857205510139465,
"step": 14,
"token_acc": 0.8863650931395268
},
{
"epoch": 0.05811138014527845,
"grad_norm": 0.4103075861930847,
"learning_rate": 3e-06,
"loss": 0.38399845361709595,
"step": 15,
"token_acc": 0.8722473100295478
},
{
"epoch": 0.06198547215496368,
"grad_norm": 0.505113959312439,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.37927311658859253,
"step": 16,
"token_acc": 0.8732506907722828
},
{
"epoch": 0.06585956416464891,
"grad_norm": 0.4578634202480316,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.388744592666626,
"step": 17,
"token_acc": 0.8707925977418891
},
{
"epoch": 0.06973365617433414,
"grad_norm": 0.40881460905075073,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.37862884998321533,
"step": 18,
"token_acc": 0.8738148420049672
},
{
"epoch": 0.07360774818401937,
"grad_norm": 0.3267415165901184,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.3523765206336975,
"step": 19,
"token_acc": 0.8821479488850912
},
{
"epoch": 0.0774818401937046,
"grad_norm": 0.3520510196685791,
"learning_rate": 4.000000000000001e-06,
"loss": 0.37575048208236694,
"step": 20,
"token_acc": 0.8746806805808569
},
{
"epoch": 0.08135593220338982,
"grad_norm": 0.3177695870399475,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.3877210021018982,
"step": 21,
"token_acc": 0.8709381583839385
},
{
"epoch": 0.08523002421307506,
"grad_norm": 0.3101595640182495,
"learning_rate": 4.4e-06,
"loss": 0.35647860169410706,
"step": 22,
"token_acc": 0.8802609194999448
},
{
"epoch": 0.0891041162227603,
"grad_norm": 0.42295873165130615,
"learning_rate": 4.600000000000001e-06,
"loss": 0.34535130858421326,
"step": 23,
"token_acc": 0.8842312960154491
},
{
"epoch": 0.09297820823244551,
"grad_norm": 0.38459983468055725,
"learning_rate": 4.800000000000001e-06,
"loss": 0.3480440676212311,
"step": 24,
"token_acc": 0.8830844934941354
},
{
"epoch": 0.09685230024213075,
"grad_norm": 0.3167020082473755,
"learning_rate": 5e-06,
"loss": 0.3617573082447052,
"step": 25,
"token_acc": 0.8794729562611736
},
{
"epoch": 0.10072639225181598,
"grad_norm": 0.3235217332839966,
"learning_rate": 5.2e-06,
"loss": 0.34485846757888794,
"step": 26,
"token_acc": 0.8849654381719892
},
{
"epoch": 0.10460048426150122,
"grad_norm": 0.33688801527023315,
"learning_rate": 5.400000000000001e-06,
"loss": 0.325369268655777,
"step": 27,
"token_acc": 0.8904570911619978
},
{
"epoch": 0.10847457627118644,
"grad_norm": 0.28384602069854736,
"learning_rate": 5.600000000000001e-06,
"loss": 0.3820268213748932,
"step": 28,
"token_acc": 0.8722670041260794
},
{
"epoch": 0.11234866828087167,
"grad_norm": 0.2726050019264221,
"learning_rate": 5.8e-06,
"loss": 0.34821516275405884,
"step": 29,
"token_acc": 0.8829695430808375
},
{
"epoch": 0.1162227602905569,
"grad_norm": 0.2613418698310852,
"learning_rate": 6e-06,
"loss": 0.3505156636238098,
"step": 30,
"token_acc": 0.8820213661332177
},
{
"epoch": 0.12009685230024213,
"grad_norm": 0.27066054940223694,
"learning_rate": 6.200000000000001e-06,
"loss": 0.3500295877456665,
"step": 31,
"token_acc": 0.8819775128328553
},
{
"epoch": 0.12397094430992736,
"grad_norm": 0.2605418562889099,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.32833147048950195,
"step": 32,
"token_acc": 0.8892601629599358
},
{
"epoch": 0.12784503631961258,
"grad_norm": 0.2576088607311249,
"learning_rate": 6.600000000000001e-06,
"loss": 0.3447936475276947,
"step": 33,
"token_acc": 0.8835445537223737
},
{
"epoch": 0.13171912832929783,
"grad_norm": 0.2707255482673645,
"learning_rate": 6.800000000000001e-06,
"loss": 0.352622389793396,
"step": 34,
"token_acc": 0.8808578896779464
},
{
"epoch": 0.13559322033898305,
"grad_norm": 0.23704984784126282,
"learning_rate": 7e-06,
"loss": 0.34251606464385986,
"step": 35,
"token_acc": 0.8839590527934595
},
{
"epoch": 0.13946731234866827,
"grad_norm": 0.2552218735218048,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.36937713623046875,
"step": 36,
"token_acc": 0.8746555562093041
},
{
"epoch": 0.14334140435835352,
"grad_norm": 0.25926339626312256,
"learning_rate": 7.4e-06,
"loss": 0.37243181467056274,
"step": 37,
"token_acc": 0.8742657147624016
},
{
"epoch": 0.14721549636803874,
"grad_norm": 0.25272250175476074,
"learning_rate": 7.600000000000001e-06,
"loss": 0.3371140956878662,
"step": 38,
"token_acc": 0.8851879286597788
},
{
"epoch": 0.15108958837772396,
"grad_norm": 0.2262120097875595,
"learning_rate": 7.800000000000002e-06,
"loss": 0.32758837938308716,
"step": 39,
"token_acc": 0.8883092864316684
},
{
"epoch": 0.1549636803874092,
"grad_norm": 0.26067835092544556,
"learning_rate": 8.000000000000001e-06,
"loss": 0.32051679491996765,
"step": 40,
"token_acc": 0.8908219532219895
},
{
"epoch": 0.15883777239709443,
"grad_norm": 0.22696885466575623,
"learning_rate": 8.2e-06,
"loss": 0.34018558263778687,
"step": 41,
"token_acc": 0.8843387459744694
},
{
"epoch": 0.16271186440677965,
"grad_norm": 0.2458319216966629,
"learning_rate": 8.400000000000001e-06,
"loss": 0.3157382607460022,
"step": 42,
"token_acc": 0.8923707458363505
},
{
"epoch": 0.1665859564164649,
"grad_norm": 0.23234310746192932,
"learning_rate": 8.6e-06,
"loss": 0.32486584782600403,
"step": 43,
"token_acc": 0.8898832391328527
},
{
"epoch": 0.17046004842615012,
"grad_norm": 0.24149972200393677,
"learning_rate": 8.8e-06,
"loss": 0.3565906286239624,
"step": 44,
"token_acc": 0.8786636478836652
},
{
"epoch": 0.17433414043583534,
"grad_norm": 0.23454472422599792,
"learning_rate": 9e-06,
"loss": 0.34243613481521606,
"step": 45,
"token_acc": 0.8836981353220466
},
{
"epoch": 0.1782082324455206,
"grad_norm": 0.22611235082149506,
"learning_rate": 9.200000000000002e-06,
"loss": 0.3169807493686676,
"step": 46,
"token_acc": 0.8913642111117898
},
{
"epoch": 0.1820823244552058,
"grad_norm": 0.2332201898097992,
"learning_rate": 9.4e-06,
"loss": 0.3335682153701782,
"step": 47,
"token_acc": 0.8853106607331619
},
{
"epoch": 0.18595641646489103,
"grad_norm": 0.26498886942863464,
"learning_rate": 9.600000000000001e-06,
"loss": 0.3396722674369812,
"step": 48,
"token_acc": 0.884526213547764
},
{
"epoch": 0.18983050847457628,
"grad_norm": 0.29751622676849365,
"learning_rate": 9.800000000000001e-06,
"loss": 0.3544740676879883,
"step": 49,
"token_acc": 0.8790830507178009
},
{
"epoch": 0.1937046004842615,
"grad_norm": 0.24125243723392487,
"learning_rate": 1e-05,
"loss": 0.3444434702396393,
"step": 50,
"token_acc": 0.8833375152943368
},
{
"epoch": 0.19757869249394674,
"grad_norm": 0.23450158536434174,
"learning_rate": 9.999953315763929e-06,
"loss": 0.34759777784347534,
"step": 51,
"token_acc": 0.8809645656414854
},
{
"epoch": 0.20145278450363197,
"grad_norm": 0.24415536224842072,
"learning_rate": 9.999813263927483e-06,
"loss": 0.3302762508392334,
"step": 52,
"token_acc": 0.8872595593874244
},
{
"epoch": 0.20532687651331719,
"grad_norm": 0.23792694509029388,
"learning_rate": 9.999579847105947e-06,
"loss": 0.3057291805744171,
"step": 53,
"token_acc": 0.8958096559669801
},
{
"epoch": 0.20920096852300243,
"grad_norm": 0.24918483197689056,
"learning_rate": 9.999253069658074e-06,
"loss": 0.3550814390182495,
"step": 54,
"token_acc": 0.8789014457104403
},
{
"epoch": 0.21307506053268765,
"grad_norm": 0.24681781232357025,
"learning_rate": 9.99883293768601e-06,
"loss": 0.329832524061203,
"step": 55,
"token_acc": 0.8862846605616155
},
{
"epoch": 0.21694915254237288,
"grad_norm": 0.25197944045066833,
"learning_rate": 9.998319459035168e-06,
"loss": 0.3133784532546997,
"step": 56,
"token_acc": 0.8929686000759445
},
{
"epoch": 0.22082324455205812,
"grad_norm": 0.29595333337783813,
"learning_rate": 9.997712643294093e-06,
"loss": 0.3238765597343445,
"step": 57,
"token_acc": 0.8900121095092376
},
{
"epoch": 0.22469733656174334,
"grad_norm": 0.2436024248600006,
"learning_rate": 9.997012501794273e-06,
"loss": 0.3283236622810364,
"step": 58,
"token_acc": 0.887762605178964
},
{
"epoch": 0.22857142857142856,
"grad_norm": 0.23041026294231415,
"learning_rate": 9.996219047609943e-06,
"loss": 0.3104722797870636,
"step": 59,
"token_acc": 0.8931121325749851
},
{
"epoch": 0.2324455205811138,
"grad_norm": 0.237432062625885,
"learning_rate": 9.995332295557818e-06,
"loss": 0.30940210819244385,
"step": 60,
"token_acc": 0.8942070394423697
},
{
"epoch": 0.23631961259079903,
"grad_norm": 0.23901380598545074,
"learning_rate": 9.994352262196839e-06,
"loss": 0.32523292303085327,
"step": 61,
"token_acc": 0.8885503611348168
},
{
"epoch": 0.24019370460048425,
"grad_norm": 0.27438339591026306,
"learning_rate": 9.993278965827844e-06,
"loss": 0.3501031994819641,
"step": 62,
"token_acc": 0.8796217252529412
},
{
"epoch": 0.2440677966101695,
"grad_norm": 0.23662753403186798,
"learning_rate": 9.992112426493247e-06,
"loss": 0.32605987787246704,
"step": 63,
"token_acc": 0.8890396653634925
},
{
"epoch": 0.24794188861985472,
"grad_norm": 0.2232031375169754,
"learning_rate": 9.990852665976648e-06,
"loss": 0.3196948170661926,
"step": 64,
"token_acc": 0.8907072739748918
},
{
"epoch": 0.25181598062953997,
"grad_norm": 0.2665523886680603,
"learning_rate": 9.989499707802424e-06,
"loss": 0.33278700709342957,
"step": 65,
"token_acc": 0.8863953116150797
},
{
"epoch": 0.25569007263922516,
"grad_norm": 0.23870785534381866,
"learning_rate": 9.988053577235306e-06,
"loss": 0.351688951253891,
"step": 66,
"token_acc": 0.879823584223047
},
{
"epoch": 0.2595641646489104,
"grad_norm": 0.24755656719207764,
"learning_rate": 9.986514301279894e-06,
"loss": 0.31553030014038086,
"step": 67,
"token_acc": 0.8921622627267041
},
{
"epoch": 0.26343825665859566,
"grad_norm": 0.23198164999485016,
"learning_rate": 9.984881908680157e-06,
"loss": 0.3355843424797058,
"step": 68,
"token_acc": 0.8848336232927391
},
{
"epoch": 0.26731234866828085,
"grad_norm": 0.2461438924074173,
"learning_rate": 9.983156429918895e-06,
"loss": 0.3341342508792877,
"step": 69,
"token_acc": 0.8856444439357114
},
{
"epoch": 0.2711864406779661,
"grad_norm": 0.22579748928546906,
"learning_rate": 9.981337897217171e-06,
"loss": 0.3188900947570801,
"step": 70,
"token_acc": 0.8906781387812132
},
{
"epoch": 0.27506053268765135,
"grad_norm": 0.24103567004203796,
"learning_rate": 9.979426344533712e-06,
"loss": 0.3240354061126709,
"step": 71,
"token_acc": 0.8889305949367731
},
{
"epoch": 0.27893462469733654,
"grad_norm": 0.23146985471248627,
"learning_rate": 9.977421807564264e-06,
"loss": 0.3256258964538574,
"step": 72,
"token_acc": 0.8886470476040884
},
{
"epoch": 0.2828087167070218,
"grad_norm": 0.2992205023765564,
"learning_rate": 9.97532432374094e-06,
"loss": 0.3162704110145569,
"step": 73,
"token_acc": 0.8912486582241803
},
{
"epoch": 0.28668280871670704,
"grad_norm": 0.2314625382423401,
"learning_rate": 9.973133932231514e-06,
"loss": 0.33748123049736023,
"step": 74,
"token_acc": 0.8834313251000246
},
{
"epoch": 0.29055690072639223,
"grad_norm": 0.23197512328624725,
"learning_rate": 9.970850673938684e-06,
"loss": 0.3105667233467102,
"step": 75,
"token_acc": 0.8935043208256486
},
{
"epoch": 0.2944309927360775,
"grad_norm": 0.2275022268295288,
"learning_rate": 9.96847459149932e-06,
"loss": 0.3327932357788086,
"step": 76,
"token_acc": 0.8861917159302386
},
{
"epoch": 0.2983050847457627,
"grad_norm": 0.2508430778980255,
"learning_rate": 9.966005729283658e-06,
"loss": 0.32548677921295166,
"step": 77,
"token_acc": 0.8882381273480396
},
{
"epoch": 0.3021791767554479,
"grad_norm": 0.5134550333023071,
"learning_rate": 9.963444133394478e-06,
"loss": 0.3120523691177368,
"step": 78,
"token_acc": 0.8919503736696569
},
{
"epoch": 0.30605326876513317,
"grad_norm": 0.21315379440784454,
"learning_rate": 9.960789851666237e-06,
"loss": 0.3215460181236267,
"step": 79,
"token_acc": 0.8896002985397907
},
{
"epoch": 0.3099273607748184,
"grad_norm": 0.23902781307697296,
"learning_rate": 9.958042933664186e-06,
"loss": 0.33162713050842285,
"step": 80,
"token_acc": 0.8866171518838251
},
{
"epoch": 0.3138014527845036,
"grad_norm": 0.24128590524196625,
"learning_rate": 9.955203430683425e-06,
"loss": 0.3268725574016571,
"step": 81,
"token_acc": 0.8882163748841388
},
{
"epoch": 0.31767554479418886,
"grad_norm": 0.24751782417297363,
"learning_rate": 9.952271395747969e-06,
"loss": 0.3100839853286743,
"step": 82,
"token_acc": 0.893085253361785
},
{
"epoch": 0.3215496368038741,
"grad_norm": 0.23644764721393585,
"learning_rate": 9.949246883609743e-06,
"loss": 0.32995104789733887,
"step": 83,
"token_acc": 0.8866222032237766
},
{
"epoch": 0.3254237288135593,
"grad_norm": 0.232451930642128,
"learning_rate": 9.94612995074756e-06,
"loss": 0.31272488832473755,
"step": 84,
"token_acc": 0.8926265473810503
},
{
"epoch": 0.32929782082324455,
"grad_norm": 0.21610639989376068,
"learning_rate": 9.942920655366075e-06,
"loss": 0.302722692489624,
"step": 85,
"token_acc": 0.8952372082627079
},
{
"epoch": 0.3331719128329298,
"grad_norm": 0.24474947154521942,
"learning_rate": 9.939619057394687e-06,
"loss": 0.31238657236099243,
"step": 86,
"token_acc": 0.8932181956136864
},
{
"epoch": 0.337046004842615,
"grad_norm": 0.22313052415847778,
"learning_rate": 9.936225218486428e-06,
"loss": 0.30595749616622925,
"step": 87,
"token_acc": 0.8942476419229949
},
{
"epoch": 0.34092009685230024,
"grad_norm": 0.25018593668937683,
"learning_rate": 9.93273920201681e-06,
"loss": 0.34218600392341614,
"step": 88,
"token_acc": 0.8826220754003523
},
{
"epoch": 0.3447941888619855,
"grad_norm": 0.21603761613368988,
"learning_rate": 9.929161073082636e-06,
"loss": 0.26845768094062805,
"step": 89,
"token_acc": 0.9068716054841073
},
{
"epoch": 0.3486682808716707,
"grad_norm": 0.22996748983860016,
"learning_rate": 9.925490898500796e-06,
"loss": 0.32508569955825806,
"step": 90,
"token_acc": 0.8884358725254423
},
{
"epoch": 0.3525423728813559,
"grad_norm": 0.3635949194431305,
"learning_rate": 9.921728746807008e-06,
"loss": 0.34217730164527893,
"step": 91,
"token_acc": 0.8833008019688547
},
{
"epoch": 0.3564164648910412,
"grad_norm": 0.22128325700759888,
"learning_rate": 9.917874688254542e-06,
"loss": 0.32345396280288696,
"step": 92,
"token_acc": 0.8889643834760571
},
{
"epoch": 0.36029055690072637,
"grad_norm": 0.24601417779922485,
"learning_rate": 9.913928794812909e-06,
"loss": 0.3252776265144348,
"step": 93,
"token_acc": 0.8881070006006884
},
{
"epoch": 0.3641646489104116,
"grad_norm": 0.23473182320594788,
"learning_rate": 9.90989114016652e-06,
"loss": 0.33626118302345276,
"step": 94,
"token_acc": 0.8841867411739727
},
{
"epoch": 0.36803874092009686,
"grad_norm": 0.22333025932312012,
"learning_rate": 9.905761799713302e-06,
"loss": 0.34545931220054626,
"step": 95,
"token_acc": 0.8803537032594166
},
{
"epoch": 0.37191283292978206,
"grad_norm": 0.21172457933425903,
"learning_rate": 9.901540850563295e-06,
"loss": 0.3074107766151428,
"step": 96,
"token_acc": 0.8944196156632918
},
{
"epoch": 0.3757869249394673,
"grad_norm": 0.2134028971195221,
"learning_rate": 9.89722837153722e-06,
"loss": 0.2957490086555481,
"step": 97,
"token_acc": 0.8978778618134635
},
{
"epoch": 0.37966101694915255,
"grad_norm": 0.2610202729701996,
"learning_rate": 9.892824443164987e-06,
"loss": 0.3412560224533081,
"step": 98,
"token_acc": 0.8829380073969748
},
{
"epoch": 0.38353510895883774,
"grad_norm": 0.25488367676734924,
"learning_rate": 9.88832914768421e-06,
"loss": 0.3430347442626953,
"step": 99,
"token_acc": 0.8815459290145207
},
{
"epoch": 0.387409200968523,
"grad_norm": 0.22882606089115143,
"learning_rate": 9.883742569038663e-06,
"loss": 0.33350762724876404,
"step": 100,
"token_acc": 0.8861422500817198
},
{
"epoch": 0.39128329297820824,
"grad_norm": 0.304647833108902,
"learning_rate": 9.879064792876717e-06,
"loss": 0.31420135498046875,
"step": 101,
"token_acc": 0.8915588172822687
},
{
"epoch": 0.3951573849878935,
"grad_norm": 0.22871072590351105,
"learning_rate": 9.874295906549728e-06,
"loss": 0.3116581439971924,
"step": 102,
"token_acc": 0.8917020548921253
},
{
"epoch": 0.3990314769975787,
"grad_norm": 0.2979466915130615,
"learning_rate": 9.869435999110428e-06,
"loss": 0.3145788013935089,
"step": 103,
"token_acc": 0.8916011830301528
},
{
"epoch": 0.40290556900726393,
"grad_norm": 0.20779502391815186,
"learning_rate": 9.864485161311242e-06,
"loss": 0.3070036768913269,
"step": 104,
"token_acc": 0.8938107647266995
},
{
"epoch": 0.4067796610169492,
"grad_norm": 0.2354535311460495,
"learning_rate": 9.859443485602603e-06,
"loss": 0.32298558950424194,
"step": 105,
"token_acc": 0.8882189451059107
},
{
"epoch": 0.41065375302663437,
"grad_norm": 0.22240500152111053,
"learning_rate": 9.85431106613122e-06,
"loss": 0.3104989528656006,
"step": 106,
"token_acc": 0.8923007628162216
},
{
"epoch": 0.4145278450363196,
"grad_norm": 0.21981710195541382,
"learning_rate": 9.849087998738328e-06,
"loss": 0.3237101435661316,
"step": 107,
"token_acc": 0.8879955719309623
},
{
"epoch": 0.41840193704600487,
"grad_norm": 0.2649724781513214,
"learning_rate": 9.84377438095789e-06,
"loss": 0.323306679725647,
"step": 108,
"token_acc": 0.8889382382835521
},
{
"epoch": 0.42227602905569006,
"grad_norm": 0.2193301022052765,
"learning_rate": 9.838370312014783e-06,
"loss": 0.31488102674484253,
"step": 109,
"token_acc": 0.8910646836196473
},
{
"epoch": 0.4261501210653753,
"grad_norm": 0.21842491626739502,
"learning_rate": 9.832875892822937e-06,
"loss": 0.3206183910369873,
"step": 110,
"token_acc": 0.8890832728771944
},
{
"epoch": 0.43002421307506056,
"grad_norm": 0.2456243336200714,
"learning_rate": 9.827291225983458e-06,
"loss": 0.3201240301132202,
"step": 111,
"token_acc": 0.8904148288428204
},
{
"epoch": 0.43389830508474575,
"grad_norm": 0.21340763568878174,
"learning_rate": 9.821616415782708e-06,
"loss": 0.29961007833480835,
"step": 112,
"token_acc": 0.8965660205577574
},
{
"epoch": 0.437772397094431,
"grad_norm": 0.2308902144432068,
"learning_rate": 9.815851568190358e-06,
"loss": 0.3107410669326782,
"step": 113,
"token_acc": 0.8927536025516888
},
{
"epoch": 0.44164648910411625,
"grad_norm": 0.2292374223470688,
"learning_rate": 9.80999679085741e-06,
"loss": 0.3277205228805542,
"step": 114,
"token_acc": 0.886787084498464
},
{
"epoch": 0.44552058111380144,
"grad_norm": 0.21509671211242676,
"learning_rate": 9.80405219311419e-06,
"loss": 0.3161908984184265,
"step": 115,
"token_acc": 0.8916077261448497
},
{
"epoch": 0.4493946731234867,
"grad_norm": 0.20529279112815857,
"learning_rate": 9.798017885968295e-06,
"loss": 0.29131007194519043,
"step": 116,
"token_acc": 0.8990066361086406
},
{
"epoch": 0.45326876513317194,
"grad_norm": 0.24888373911380768,
"learning_rate": 9.791893982102537e-06,
"loss": 0.31967025995254517,
"step": 117,
"token_acc": 0.8899925908756566
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.22014780342578888,
"learning_rate": 9.785680595872824e-06,
"loss": 0.31103435158729553,
"step": 118,
"token_acc": 0.8928936680571538
},
{
"epoch": 0.4610169491525424,
"grad_norm": 0.21783359348773956,
"learning_rate": 9.77937784330603e-06,
"loss": 0.307749480009079,
"step": 119,
"token_acc": 0.8931600584652736
},
{
"epoch": 0.4648910411622276,
"grad_norm": 0.2104286551475525,
"learning_rate": 9.772985842097832e-06,
"loss": 0.31199365854263306,
"step": 120,
"token_acc": 0.8926850259294361
},
{
"epoch": 0.4687651331719128,
"grad_norm": 0.21124128997325897,
"learning_rate": 9.766504711610507e-06,
"loss": 0.3170176148414612,
"step": 121,
"token_acc": 0.8906264477918435
},
{
"epoch": 0.47263922518159807,
"grad_norm": 0.23777632415294647,
"learning_rate": 9.759934572870706e-06,
"loss": 0.3052697777748108,
"step": 122,
"token_acc": 0.894442848003123
},
{
"epoch": 0.4765133171912833,
"grad_norm": 0.2527632713317871,
"learning_rate": 9.753275548567192e-06,
"loss": 0.3045836091041565,
"step": 123,
"token_acc": 0.8951105518605069
},
{
"epoch": 0.4803874092009685,
"grad_norm": 0.20530211925506592,
"learning_rate": 9.74652776304855e-06,
"loss": 0.3366113305091858,
"step": 124,
"token_acc": 0.8836434912892324
},
{
"epoch": 0.48426150121065376,
"grad_norm": 0.26673150062561035,
"learning_rate": 9.739691342320866e-06,
"loss": 0.311764121055603,
"step": 125,
"token_acc": 0.8910826454277961
},
{
"epoch": 0.488135593220339,
"grad_norm": 0.2245185822248459,
"learning_rate": 9.732766414045368e-06,
"loss": 0.31055164337158203,
"step": 126,
"token_acc": 0.8926098098046538
},
{
"epoch": 0.4920096852300242,
"grad_norm": 0.2143883854150772,
"learning_rate": 9.725753107536053e-06,
"loss": 0.33499595522880554,
"step": 127,
"token_acc": 0.8840534260641282
},
{
"epoch": 0.49588377723970944,
"grad_norm": 0.22163285315036774,
"learning_rate": 9.718651553757266e-06,
"loss": 0.31920328736305237,
"step": 128,
"token_acc": 0.8901271163419964
},
{
"epoch": 0.4997578692493947,
"grad_norm": 0.2143898904323578,
"learning_rate": 9.711461885321247e-06,
"loss": 0.3301286995410919,
"step": 129,
"token_acc": 0.8853363916795757
},
{
"epoch": 0.5036319612590799,
"grad_norm": 0.24990734457969666,
"learning_rate": 9.704184236485672e-06,
"loss": 0.3278159201145172,
"step": 130,
"token_acc": 0.8874620923082561
},
{
"epoch": 0.5075060532687651,
"grad_norm": 0.22136539220809937,
"learning_rate": 9.696818743151128e-06,
"loss": 0.3319326937198639,
"step": 131,
"token_acc": 0.885009570455441
},
{
"epoch": 0.5113801452784503,
"grad_norm": 0.2669275999069214,
"learning_rate": 9.68936554285859e-06,
"loss": 0.3023684620857239,
"step": 132,
"token_acc": 0.8951259709956582
},
{
"epoch": 0.5152542372881356,
"grad_norm": 0.21833708882331848,
"learning_rate": 9.68182477478684e-06,
"loss": 0.3089104890823364,
"step": 133,
"token_acc": 0.8930920187299416
},
{
"epoch": 0.5191283292978208,
"grad_norm": 0.21197167038917542,
"learning_rate": 9.67419657974988e-06,
"loss": 0.3144392967224121,
"step": 134,
"token_acc": 0.8910884224709107
},
{
"epoch": 0.5230024213075061,
"grad_norm": 0.21434499323368073,
"learning_rate": 9.66648110019429e-06,
"loss": 0.3246540427207947,
"step": 135,
"token_acc": 0.8876412650671648
},
{
"epoch": 0.5268765133171913,
"grad_norm": 0.20343148708343506,
"learning_rate": 9.658678480196579e-06,
"loss": 0.315585196018219,
"step": 136,
"token_acc": 0.8905443269970013
},
{
"epoch": 0.5307506053268766,
"grad_norm": 0.23613257706165314,
"learning_rate": 9.650788865460487e-06,
"loss": 0.3131225109100342,
"step": 137,
"token_acc": 0.8912192170846405
},
{
"epoch": 0.5346246973365617,
"grad_norm": 0.4212075471878052,
"learning_rate": 9.642812403314272e-06,
"loss": 0.29884475469589233,
"step": 138,
"token_acc": 0.8966553773404051
},
{
"epoch": 0.538498789346247,
"grad_norm": 0.20193685591220856,
"learning_rate": 9.634749242707948e-06,
"loss": 0.26036083698272705,
"step": 139,
"token_acc": 0.9091038865111504
},
{
"epoch": 0.5423728813559322,
"grad_norm": 0.2208104431629181,
"learning_rate": 9.626599534210514e-06,
"loss": 0.33184394240379333,
"step": 140,
"token_acc": 0.8853617134142299
},
{
"epoch": 0.5462469733656174,
"grad_norm": 0.22493727505207062,
"learning_rate": 9.618363430007134e-06,
"loss": 0.31208667159080505,
"step": 141,
"token_acc": 0.8917024215686027
},
{
"epoch": 0.5501210653753027,
"grad_norm": 0.23963193595409393,
"learning_rate": 9.610041083896304e-06,
"loss": 0.33588868379592896,
"step": 142,
"token_acc": 0.883973627021253
},
{
"epoch": 0.553995157384988,
"grad_norm": 0.21784453094005585,
"learning_rate": 9.60163265128697e-06,
"loss": 0.3231375813484192,
"step": 143,
"token_acc": 0.8887875239014834
},
{
"epoch": 0.5578692493946731,
"grad_norm": 0.22835847735404968,
"learning_rate": 9.593138289195634e-06,
"loss": 0.3210199773311615,
"step": 144,
"token_acc": 0.8890582816354493
},
{
"epoch": 0.5617433414043583,
"grad_norm": 0.2136555314064026,
"learning_rate": 9.584558156243418e-06,
"loss": 0.3372665047645569,
"step": 145,
"token_acc": 0.8839793357706921
},
{
"epoch": 0.5656174334140436,
"grad_norm": 0.20598500967025757,
"learning_rate": 9.575892412653102e-06,
"loss": 0.30844664573669434,
"step": 146,
"token_acc": 0.8926156654585412
},
{
"epoch": 0.5694915254237288,
"grad_norm": 0.2522714138031006,
"learning_rate": 9.567141220246136e-06,
"loss": 0.36702272295951843,
"step": 147,
"token_acc": 0.8734296301671142
},
{
"epoch": 0.5733656174334141,
"grad_norm": 0.21975038945674896,
"learning_rate": 9.55830474243961e-06,
"loss": 0.32784411311149597,
"step": 148,
"token_acc": 0.8871756189192851
},
{
"epoch": 0.5772397094430993,
"grad_norm": 0.21233901381492615,
"learning_rate": 9.549383144243213e-06,
"loss": 0.2944122850894928,
"step": 149,
"token_acc": 0.8987453672884691
},
{
"epoch": 0.5811138014527845,
"grad_norm": 0.2199799120426178,
"learning_rate": 9.540376592256142e-06,
"loss": 0.3299463987350464,
"step": 150,
"token_acc": 0.8859144839374592
},
{
"epoch": 0.5849878934624697,
"grad_norm": 0.19698019325733185,
"learning_rate": 9.531285254663997e-06,
"loss": 0.3030051589012146,
"step": 151,
"token_acc": 0.8951707294894029
},
{
"epoch": 0.588861985472155,
"grad_norm": 0.22306668758392334,
"learning_rate": 9.522109301235637e-06,
"loss": 0.29752516746520996,
"step": 152,
"token_acc": 0.8966012679857996
},
{
"epoch": 0.5927360774818402,
"grad_norm": 0.21317337453365326,
"learning_rate": 9.512848903320017e-06,
"loss": 0.3052118122577667,
"step": 153,
"token_acc": 0.8944324633814714
},
{
"epoch": 0.5966101694915255,
"grad_norm": 0.2120915800333023,
"learning_rate": 9.503504233842973e-06,
"loss": 0.29761528968811035,
"step": 154,
"token_acc": 0.8966406260468731
},
{
"epoch": 0.6004842615012107,
"grad_norm": 0.23525090515613556,
"learning_rate": 9.494075467304007e-06,
"loss": 0.3034532070159912,
"step": 155,
"token_acc": 0.8944926637860167
},
{
"epoch": 0.6043583535108958,
"grad_norm": 0.2095353752374649,
"learning_rate": 9.484562779773027e-06,
"loss": 0.2903788089752197,
"step": 156,
"token_acc": 0.8990560027078014
},
{
"epoch": 0.6082324455205811,
"grad_norm": 0.23741677403450012,
"learning_rate": 9.474966348887055e-06,
"loss": 0.31467512249946594,
"step": 157,
"token_acc": 0.8904583329757747
},
{
"epoch": 0.6121065375302663,
"grad_norm": 0.2259555608034134,
"learning_rate": 9.465286353846905e-06,
"loss": 0.3404577374458313,
"step": 158,
"token_acc": 0.8826165622063978
},
{
"epoch": 0.6159806295399516,
"grad_norm": 0.2183879017829895,
"learning_rate": 9.455522975413846e-06,
"loss": 0.2766571640968323,
"step": 159,
"token_acc": 0.9038809421418853
},
{
"epoch": 0.6198547215496368,
"grad_norm": 0.22651784121990204,
"learning_rate": 9.445676395906226e-06,
"loss": 0.29638129472732544,
"step": 160,
"token_acc": 0.8970113168662065
},
{
"epoch": 0.6237288135593221,
"grad_norm": 0.22088395059108734,
"learning_rate": 9.435746799196061e-06,
"loss": 0.3023075759410858,
"step": 161,
"token_acc": 0.8946665593674712
},
{
"epoch": 0.6276029055690072,
"grad_norm": 0.21526560187339783,
"learning_rate": 9.425734370705606e-06,
"loss": 0.28661438822746277,
"step": 162,
"token_acc": 0.9002787847728345
},
{
"epoch": 0.6314769975786925,
"grad_norm": 0.23334769904613495,
"learning_rate": 9.415639297403891e-06,
"loss": 0.31685301661491394,
"step": 163,
"token_acc": 0.890886748080584
},
{
"epoch": 0.6353510895883777,
"grad_norm": 0.200165793299675,
"learning_rate": 9.40546176780323e-06,
"loss": 0.30981898307800293,
"step": 164,
"token_acc": 0.8924871164982372
},
{
"epoch": 0.639225181598063,
"grad_norm": 0.20800836384296417,
"learning_rate": 9.395201971955701e-06,
"loss": 0.3162352740764618,
"step": 165,
"token_acc": 0.8910434805285766
},
{
"epoch": 0.6430992736077482,
"grad_norm": 0.20923736691474915,
"learning_rate": 9.384860101449598e-06,
"loss": 0.32208406925201416,
"step": 166,
"token_acc": 0.8880633815629819
},
{
"epoch": 0.6469733656174335,
"grad_norm": 0.1986808031797409,
"learning_rate": 9.374436349405847e-06,
"loss": 0.28397923707962036,
"step": 167,
"token_acc": 0.9012052212352475
},
{
"epoch": 0.6508474576271186,
"grad_norm": 0.21215273439884186,
"learning_rate": 9.36393091047441e-06,
"loss": 0.3066609799861908,
"step": 168,
"token_acc": 0.894593303584187
},
{
"epoch": 0.6547215496368038,
"grad_norm": 0.20804037153720856,
"learning_rate": 9.353343980830644e-06,
"loss": 0.3097017705440521,
"step": 169,
"token_acc": 0.8926308156125992
},
{
"epoch": 0.6585956416464891,
"grad_norm": 0.20328834652900696,
"learning_rate": 9.342675758171638e-06,
"loss": 0.3010105490684509,
"step": 170,
"token_acc": 0.8950560660129195
},
{
"epoch": 0.6624697336561743,
"grad_norm": 0.2051060050725937,
"learning_rate": 9.331926441712522e-06,
"loss": 0.3019353151321411,
"step": 171,
"token_acc": 0.8949745506999682
},
{
"epoch": 0.6663438256658596,
"grad_norm": 0.24043123424053192,
"learning_rate": 9.32109623218275e-06,
"loss": 0.3116442859172821,
"step": 172,
"token_acc": 0.8915558784861239
},
{
"epoch": 0.6702179176755448,
"grad_norm": 0.21520181000232697,
"learning_rate": 9.310185331822338e-06,
"loss": 0.31186142563819885,
"step": 173,
"token_acc": 0.8917585320277845
},
{
"epoch": 0.67409200968523,
"grad_norm": 0.21344298124313354,
"learning_rate": 9.299193944378112e-06,
"loss": 0.3273160755634308,
"step": 174,
"token_acc": 0.886418268420563
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.20224156975746155,
"learning_rate": 9.28812227509988e-06,
"loss": 0.31608837842941284,
"step": 175,
"token_acc": 0.8894536504933755
},
{
"epoch": 0.6818401937046005,
"grad_norm": 0.2154257595539093,
"learning_rate": 9.27697053073661e-06,
"loss": 0.34367692470550537,
"step": 176,
"token_acc": 0.8811017511710314
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.22003678977489471,
"learning_rate": 9.26573891953257e-06,
"loss": 0.3205263018608093,
"step": 177,
"token_acc": 0.8893210947921869
},
{
"epoch": 0.689588377723971,
"grad_norm": 0.21449677646160126,
"learning_rate": 9.254427651223434e-06,
"loss": 0.28666430711746216,
"step": 178,
"token_acc": 0.9003720788020833
},
{
"epoch": 0.6934624697336562,
"grad_norm": 0.22110596299171448,
"learning_rate": 9.243036937032373e-06,
"loss": 0.3156067728996277,
"step": 179,
"token_acc": 0.8902597783694092
},
{
"epoch": 0.6973365617433414,
"grad_norm": 0.19700580835342407,
"learning_rate": 9.2315669896661e-06,
"loss": 0.28897273540496826,
"step": 180,
"token_acc": 0.8994499889336349
},
{
"epoch": 0.7012106537530266,
"grad_norm": 0.21460606157779694,
"learning_rate": 9.220018023310908e-06,
"loss": 0.31268295645713806,
"step": 181,
"token_acc": 0.8918378520876847
},
{
"epoch": 0.7050847457627119,
"grad_norm": 0.21692436933517456,
"learning_rate": 9.208390253628667e-06,
"loss": 0.28844964504241943,
"step": 182,
"token_acc": 0.8997311485616448
},
{
"epoch": 0.7089588377723971,
"grad_norm": 0.201703280210495,
"learning_rate": 9.196683897752794e-06,
"loss": 0.32861441373825073,
"step": 183,
"token_acc": 0.8854774295445417
},
{
"epoch": 0.7128329297820823,
"grad_norm": 3.976747751235962,
"learning_rate": 9.184899174284201e-06,
"loss": 0.33475255966186523,
"step": 184,
"token_acc": 0.8836819705392365
},
{
"epoch": 0.7167070217917676,
"grad_norm": 0.24247053265571594,
"learning_rate": 9.173036303287215e-06,
"loss": 0.3366454243659973,
"step": 185,
"token_acc": 0.8833432089980459
},
{
"epoch": 0.7205811138014527,
"grad_norm": 0.2282845675945282,
"learning_rate": 9.16109550628546e-06,
"loss": 0.2812536656856537,
"step": 186,
"token_acc": 0.9027706860502607
},
{
"epoch": 0.724455205811138,
"grad_norm": 0.2282128632068634,
"learning_rate": 9.149077006257734e-06,
"loss": 0.3136906027793884,
"step": 187,
"token_acc": 0.8912536222754189
},
{
"epoch": 0.7283292978208232,
"grad_norm": 0.20751290023326874,
"learning_rate": 9.136981027633834e-06,
"loss": 0.29636135697364807,
"step": 188,
"token_acc": 0.8974463288547996
},
{
"epoch": 0.7322033898305085,
"grad_norm": 0.23192144930362701,
"learning_rate": 9.124807796290366e-06,
"loss": 0.3046882152557373,
"step": 189,
"token_acc": 0.8943812414560115
},
{
"epoch": 0.7360774818401937,
"grad_norm": 0.221333310008049,
"learning_rate": 9.112557539546535e-06,
"loss": 0.32960376143455505,
"step": 190,
"token_acc": 0.8860915000599271
},
{
"epoch": 0.739951573849879,
"grad_norm": 0.1981872171163559,
"learning_rate": 9.100230486159893e-06,
"loss": 0.32151421904563904,
"step": 191,
"token_acc": 0.888598638535205
},
{
"epoch": 0.7438256658595641,
"grad_norm": 0.2172573357820511,
"learning_rate": 9.087826866322065e-06,
"loss": 0.3255336880683899,
"step": 192,
"token_acc": 0.8864367509340579
},
{
"epoch": 0.7476997578692494,
"grad_norm": 0.21215571463108063,
"learning_rate": 9.075346911654456e-06,
"loss": 0.30505236983299255,
"step": 193,
"token_acc": 0.8936060377931436
},
{
"epoch": 0.7515738498789346,
"grad_norm": 0.21355277299880981,
"learning_rate": 9.062790855203932e-06,
"loss": 0.3349328637123108,
"step": 194,
"token_acc": 0.8847527625851099
},
{
"epoch": 0.7554479418886199,
"grad_norm": 0.20415301620960236,
"learning_rate": 9.050158931438451e-06,
"loss": 0.3010273873806,
"step": 195,
"token_acc": 0.8946901896914337
},
{
"epoch": 0.7593220338983051,
"grad_norm": 0.2100018560886383,
"learning_rate": 9.037451376242696e-06,
"loss": 0.3295148015022278,
"step": 196,
"token_acc": 0.8861214255925314
},
{
"epoch": 0.7631961259079904,
"grad_norm": 0.21248096227645874,
"learning_rate": 9.024668426913671e-06,
"loss": 0.2901475727558136,
"step": 197,
"token_acc": 0.8984891018269412
},
{
"epoch": 0.7670702179176755,
"grad_norm": 0.20735451579093933,
"learning_rate": 9.011810322156269e-06,
"loss": 0.3123668134212494,
"step": 198,
"token_acc": 0.8911118341790296
},
{
"epoch": 0.7709443099273607,
"grad_norm": 0.2119433879852295,
"learning_rate": 8.998877302078803e-06,
"loss": 0.30766892433166504,
"step": 199,
"token_acc": 0.8930650097673094
},
{
"epoch": 0.774818401937046,
"grad_norm": 0.20151817798614502,
"learning_rate": 8.985869608188545e-06,
"loss": 0.294528067111969,
"step": 200,
"token_acc": 0.8973507748438794
},
{
"epoch": 0.7786924939467312,
"grad_norm": 0.20979715883731842,
"learning_rate": 8.97278748338719e-06,
"loss": 0.3116077184677124,
"step": 201,
"token_acc": 0.8916578293780434
},
{
"epoch": 0.7825665859564165,
"grad_norm": 0.21114560961723328,
"learning_rate": 8.95963117196634e-06,
"loss": 0.31117022037506104,
"step": 202,
"token_acc": 0.8922739117136779
},
{
"epoch": 0.7864406779661017,
"grad_norm": 0.2028111070394516,
"learning_rate": 8.946400919602933e-06,
"loss": 0.2925041913986206,
"step": 203,
"token_acc": 0.8979599612123477
},
{
"epoch": 0.790314769975787,
"grad_norm": 0.19873376190662384,
"learning_rate": 8.933096973354665e-06,
"loss": 0.3335387706756592,
"step": 204,
"token_acc": 0.8845781124549695
},
{
"epoch": 0.7941888619854721,
"grad_norm": 0.20865830779075623,
"learning_rate": 8.919719581655357e-06,
"loss": 0.3048374652862549,
"step": 205,
"token_acc": 0.8941424666394205
},
{
"epoch": 0.7980629539951574,
"grad_norm": 0.21847450733184814,
"learning_rate": 8.906268994310339e-06,
"loss": 0.30148929357528687,
"step": 206,
"token_acc": 0.8948231645494126
},
{
"epoch": 0.8019370460048426,
"grad_norm": 0.23447921872138977,
"learning_rate": 8.892745462491763e-06,
"loss": 0.3076891005039215,
"step": 207,
"token_acc": 0.8940680143003497
},
{
"epoch": 0.8058111380145279,
"grad_norm": 0.2047218531370163,
"learning_rate": 8.879149238733932e-06,
"loss": 0.2903471291065216,
"step": 208,
"token_acc": 0.8996930000967329
},
{
"epoch": 0.8096852300242131,
"grad_norm": 0.3560882806777954,
"learning_rate": 8.865480576928578e-06,
"loss": 0.2734353840351105,
"step": 209,
"token_acc": 0.9038816908230364
},
{
"epoch": 0.8135593220338984,
"grad_norm": 0.22588837146759033,
"learning_rate": 8.851739732320109e-06,
"loss": 0.30820316076278687,
"step": 210,
"token_acc": 0.8928903081404425
},
{
"epoch": 0.8174334140435835,
"grad_norm": 0.19928814470767975,
"learning_rate": 8.83792696150086e-06,
"loss": 0.30705487728118896,
"step": 211,
"token_acc": 0.8931717351449738
},
{
"epoch": 0.8213075060532687,
"grad_norm": 0.23134565353393555,
"learning_rate": 8.824042522406295e-06,
"loss": 0.3144133687019348,
"step": 212,
"token_acc": 0.8904542748607169
},
{
"epoch": 0.825181598062954,
"grad_norm": 0.20952780544757843,
"learning_rate": 8.810086674310184e-06,
"loss": 0.3166520595550537,
"step": 213,
"token_acc": 0.8902617260259249
},
{
"epoch": 0.8290556900726392,
"grad_norm": 0.21133121848106384,
"learning_rate": 8.796059677819773e-06,
"loss": 0.31384018063545227,
"step": 214,
"token_acc": 0.8909493414116798
},
{
"epoch": 0.8329297820823245,
"grad_norm": 0.3206462264060974,
"learning_rate": 8.781961794870903e-06,
"loss": 0.30939990282058716,
"step": 215,
"token_acc": 0.8926290243396312
},
{
"epoch": 0.8368038740920097,
"grad_norm": 0.21380406618118286,
"learning_rate": 8.767793288723137e-06,
"loss": 0.3126541078090668,
"step": 216,
"token_acc": 0.8918149018414423
},
{
"epoch": 0.8406779661016949,
"grad_norm": 0.2241922914981842,
"learning_rate": 8.753554423954828e-06,
"loss": 0.32906076312065125,
"step": 217,
"token_acc": 0.8866828065863777
},
{
"epoch": 0.8445520581113801,
"grad_norm": 0.19776619970798492,
"learning_rate": 8.739245466458187e-06,
"loss": 0.28062158823013306,
"step": 218,
"token_acc": 0.9022684784065322
},
{
"epoch": 0.8484261501210654,
"grad_norm": 0.2141999900341034,
"learning_rate": 8.72486668343431e-06,
"loss": 0.3276277184486389,
"step": 219,
"token_acc": 0.8861141792995992
},
{
"epoch": 0.8523002421307506,
"grad_norm": 0.2332129180431366,
"learning_rate": 8.7104183433882e-06,
"loss": 0.3168509304523468,
"step": 220,
"token_acc": 0.8899989570826125
},
{
"epoch": 0.8561743341404359,
"grad_norm": 0.2141677886247635,
"learning_rate": 8.695900716123744e-06,
"loss": 0.3259914219379425,
"step": 221,
"token_acc": 0.8866733094194235
},
{
"epoch": 0.8600484261501211,
"grad_norm": 0.20929858088493347,
"learning_rate": 8.681314072738678e-06,
"loss": 0.2776751220226288,
"step": 222,
"token_acc": 0.9029569916163804
},
{
"epoch": 0.8639225181598063,
"grad_norm": 0.26802197098731995,
"learning_rate": 8.666658685619523e-06,
"loss": 0.3192378282546997,
"step": 223,
"token_acc": 0.8888524656782731
},
{
"epoch": 0.8677966101694915,
"grad_norm": 0.19303195178508759,
"learning_rate": 8.651934828436497e-06,
"loss": 0.2820873260498047,
"step": 224,
"token_acc": 0.9010663601046539
},
{
"epoch": 0.8716707021791767,
"grad_norm": 0.20784462988376617,
"learning_rate": 8.637142776138415e-06,
"loss": 0.2850268483161926,
"step": 225,
"token_acc": 0.9003609394726915
},
{
"epoch": 0.875544794188862,
"grad_norm": 0.2194257229566574,
"learning_rate": 8.622282804947537e-06,
"loss": 0.31484997272491455,
"step": 226,
"token_acc": 0.8909253202507496
},
{
"epoch": 0.8794188861985472,
"grad_norm": 0.21197804808616638,
"learning_rate": 8.607355192354425e-06,
"loss": 0.3072202801704407,
"step": 227,
"token_acc": 0.8929364556285221
},
{
"epoch": 0.8832929782082325,
"grad_norm": 0.19514977931976318,
"learning_rate": 8.592360217112759e-06,
"loss": 0.31343895196914673,
"step": 228,
"token_acc": 0.8909144611151198
},
{
"epoch": 0.8871670702179176,
"grad_norm": 0.2198445200920105,
"learning_rate": 8.57729815923412e-06,
"loss": 0.31176120042800903,
"step": 229,
"token_acc": 0.8916788161998124
},
{
"epoch": 0.8910411622276029,
"grad_norm": 0.20297633111476898,
"learning_rate": 8.562169299982776e-06,
"loss": 0.30840498208999634,
"step": 230,
"token_acc": 0.8921534903182912
},
{
"epoch": 0.8949152542372881,
"grad_norm": 0.21356205642223358,
"learning_rate": 8.546973921870421e-06,
"loss": 0.3210839629173279,
"step": 231,
"token_acc": 0.8882864775840541
},
{
"epoch": 0.8987893462469734,
"grad_norm": 0.21405935287475586,
"learning_rate": 8.531712308650904e-06,
"loss": 0.3006952702999115,
"step": 232,
"token_acc": 0.8953128142705267
},
{
"epoch": 0.9026634382566586,
"grad_norm": 0.21220295131206512,
"learning_rate": 8.516384745314926e-06,
"loss": 0.33272668719291687,
"step": 233,
"token_acc": 0.8845533899027282
},
{
"epoch": 0.9065375302663439,
"grad_norm": 0.19546008110046387,
"learning_rate": 8.50099151808472e-06,
"loss": 0.26581257581710815,
"step": 234,
"token_acc": 0.9067262813046539
},
{
"epoch": 0.910411622276029,
"grad_norm": 0.2057773917913437,
"learning_rate": 8.485532914408712e-06,
"loss": 0.2936754524707794,
"step": 235,
"token_acc": 0.8980145512690381
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.21968601644039154,
"learning_rate": 8.470009222956138e-06,
"loss": 0.2990136742591858,
"step": 236,
"token_acc": 0.8944779048351311
},
{
"epoch": 0.9181598062953995,
"grad_norm": 0.22149494290351868,
"learning_rate": 8.45442073361167e-06,
"loss": 0.29907599091529846,
"step": 237,
"token_acc": 0.8953804266415489
},
{
"epoch": 0.9220338983050848,
"grad_norm": 0.18807418644428253,
"learning_rate": 8.438767737469995e-06,
"loss": 0.2596169412136078,
"step": 238,
"token_acc": 0.9094668271985952
},
{
"epoch": 0.92590799031477,
"grad_norm": 0.2053857445716858,
"learning_rate": 8.42305052683038e-06,
"loss": 0.320443719625473,
"step": 239,
"token_acc": 0.8882472950063495
},
{
"epoch": 0.9297820823244553,
"grad_norm": 0.19474725425243378,
"learning_rate": 8.407269395191216e-06,
"loss": 0.29054853320121765,
"step": 240,
"token_acc": 0.8986681898213217
},
{
"epoch": 0.9336561743341404,
"grad_norm": 0.22415153682231903,
"learning_rate": 8.391424637244528e-06,
"loss": 0.29720863699913025,
"step": 241,
"token_acc": 0.8967865758573351
},
{
"epoch": 0.9375302663438256,
"grad_norm": 0.20295462012290955,
"learning_rate": 8.375516548870489e-06,
"loss": 0.3213497996330261,
"step": 242,
"token_acc": 0.8888211973402874
},
{
"epoch": 0.9414043583535109,
"grad_norm": 0.235239177942276,
"learning_rate": 8.359545427131876e-06,
"loss": 0.31140708923339844,
"step": 243,
"token_acc": 0.8917541696945803
},
{
"epoch": 0.9452784503631961,
"grad_norm": 0.21419954299926758,
"learning_rate": 8.343511570268541e-06,
"loss": 0.3142154812812805,
"step": 244,
"token_acc": 0.890589961402836
},
{
"epoch": 0.9491525423728814,
"grad_norm": 0.20498663187026978,
"learning_rate": 8.327415277691824e-06,
"loss": 0.3464815020561218,
"step": 245,
"token_acc": 0.8797665540392294
},
{
"epoch": 0.9530266343825666,
"grad_norm": 0.20611073076725006,
"learning_rate": 8.311256849978974e-06,
"loss": 0.31497207283973694,
"step": 246,
"token_acc": 0.889790752866034
},
{
"epoch": 0.9569007263922518,
"grad_norm": 0.21447882056236267,
"learning_rate": 8.295036588867533e-06,
"loss": 0.28588759899139404,
"step": 247,
"token_acc": 0.8993494375908707
},
{
"epoch": 0.960774818401937,
"grad_norm": 0.21430622041225433,
"learning_rate": 8.278754797249702e-06,
"loss": 0.3209206461906433,
"step": 248,
"token_acc": 0.8878057052632179
},
{
"epoch": 0.9646489104116223,
"grad_norm": 0.1971716433763504,
"learning_rate": 8.262411779166681e-06,
"loss": 0.29577910900115967,
"step": 249,
"token_acc": 0.8970768255184925
},
{
"epoch": 0.9685230024213075,
"grad_norm": 0.20728042721748352,
"learning_rate": 8.246007839802997e-06,
"loss": 0.3149109482765198,
"step": 250,
"token_acc": 0.8904120076852685
},
{
"epoch": 0.9723970944309928,
"grad_norm": 0.23157289624214172,
"learning_rate": 8.229543285480797e-06,
"loss": 0.3057391047477722,
"step": 251,
"token_acc": 0.8943966929583815
},
{
"epoch": 0.976271186440678,
"grad_norm": 0.21818409860134125,
"learning_rate": 8.213018423654144e-06,
"loss": 0.3090881109237671,
"step": 252,
"token_acc": 0.8931029437419457
},
{
"epoch": 0.9801452784503631,
"grad_norm": 0.20345434546470642,
"learning_rate": 8.196433562903252e-06,
"loss": 0.2966330051422119,
"step": 253,
"token_acc": 0.8959465166900704
},
{
"epoch": 0.9840193704600484,
"grad_norm": 0.203868567943573,
"learning_rate": 8.179789012928747e-06,
"loss": 0.2893424928188324,
"step": 254,
"token_acc": 0.8989887993032385
},
{
"epoch": 0.9878934624697336,
"grad_norm": 0.20835842192173004,
"learning_rate": 8.163085084545867e-06,
"loss": 0.29561957716941833,
"step": 255,
"token_acc": 0.897130295078995
},
{
"epoch": 0.9917675544794189,
"grad_norm": 0.2602974772453308,
"learning_rate": 8.146322089678668e-06,
"loss": 0.33309951424598694,
"step": 256,
"token_acc": 0.8842519179704944
},
{
"epoch": 0.9956416464891041,
"grad_norm": 0.1993730664253235,
"learning_rate": 8.129500341354192e-06,
"loss": 0.32513946294784546,
"step": 257,
"token_acc": 0.8869922494628838
},
{
"epoch": 0.9995157384987894,
"grad_norm": 0.2033330649137497,
"learning_rate": 8.11262015369663e-06,
"loss": 0.29512181878089905,
"step": 258,
"token_acc": 0.8968425014801387
},
{
"epoch": 1.0,
"grad_norm": 0.6673643589019775,
"learning_rate": 8.095681841921441e-06,
"loss": 0.28728920221328735,
"step": 259,
"token_acc": 0.9003083713758805
},
{
"epoch": 1.0038740920096851,
"grad_norm": 0.32744893431663513,
"learning_rate": 8.07868572232949e-06,
"loss": 0.269972562789917,
"step": 260,
"token_acc": 0.9038492097273063
},
{
"epoch": 1.0077481840193705,
"grad_norm": 0.2596898376941681,
"learning_rate": 8.061632112301122e-06,
"loss": 0.2655790150165558,
"step": 261,
"token_acc": 0.9053338855906853
},
{
"epoch": 1.0116222760290556,
"grad_norm": 0.2612839639186859,
"learning_rate": 8.044521330290235e-06,
"loss": 0.2887282967567444,
"step": 262,
"token_acc": 0.8971828029711167
},
{
"epoch": 1.015496368038741,
"grad_norm": 0.2769652009010315,
"learning_rate": 8.027353695818345e-06,
"loss": 0.26126527786254883,
"step": 263,
"token_acc": 0.9065780969019781
},
{
"epoch": 1.0193704600484261,
"grad_norm": 0.27929142117500305,
"learning_rate": 8.010129529468614e-06,
"loss": 0.27868735790252686,
"step": 264,
"token_acc": 0.9001419249114798
},
{
"epoch": 1.0232445520581113,
"grad_norm": 0.23997750878334045,
"learning_rate": 7.992849152879857e-06,
"loss": 0.2831759750843048,
"step": 265,
"token_acc": 0.899304001670737
},
{
"epoch": 1.0271186440677966,
"grad_norm": 0.25313815474510193,
"learning_rate": 7.97551288874055e-06,
"loss": 0.27934202551841736,
"step": 266,
"token_acc": 0.9004498805562496
},
{
"epoch": 1.0309927360774818,
"grad_norm": 0.23287494480609894,
"learning_rate": 7.95812106078279e-06,
"loss": 0.26112881302833557,
"step": 267,
"token_acc": 0.9065508038300509
},
{
"epoch": 1.0348668280871671,
"grad_norm": 0.22660091519355774,
"learning_rate": 7.940673993776258e-06,
"loss": 0.2504875063896179,
"step": 268,
"token_acc": 0.9097140867981872
},
{
"epoch": 1.0387409200968523,
"grad_norm": 0.2266615480184555,
"learning_rate": 7.923172013522153e-06,
"loss": 0.25760790705680847,
"step": 269,
"token_acc": 0.9073963735109954
},
{
"epoch": 1.0426150121065376,
"grad_norm": 0.22593924403190613,
"learning_rate": 7.905615446847107e-06,
"loss": 0.28686419129371643,
"step": 270,
"token_acc": 0.8976161305002275
},
{
"epoch": 1.0464891041162228,
"grad_norm": 0.2425071895122528,
"learning_rate": 7.888004621597079e-06,
"loss": 0.2573948800563812,
"step": 271,
"token_acc": 0.907380557815819
},
{
"epoch": 1.050363196125908,
"grad_norm": 0.23996935784816742,
"learning_rate": 7.87033986663124e-06,
"loss": 0.2808932065963745,
"step": 272,
"token_acc": 0.8994914728045711
},
{
"epoch": 1.0542372881355933,
"grad_norm": 0.25931164622306824,
"learning_rate": 7.852621511815825e-06,
"loss": 0.26375657320022583,
"step": 273,
"token_acc": 0.9051297163863579
},
{
"epoch": 1.0581113801452784,
"grad_norm": 0.20594951510429382,
"learning_rate": 7.834849888017979e-06,
"loss": 0.23789554834365845,
"step": 274,
"token_acc": 0.9142479611743739
},
{
"epoch": 1.0619854721549637,
"grad_norm": 0.23315519094467163,
"learning_rate": 7.817025327099574e-06,
"loss": 0.24684631824493408,
"step": 275,
"token_acc": 0.9110874200426439
},
{
"epoch": 1.0658595641646489,
"grad_norm": 0.2189839482307434,
"learning_rate": 7.799148161911013e-06,
"loss": 0.2684437334537506,
"step": 276,
"token_acc": 0.9041172254519392
},
{
"epoch": 1.0697336561743342,
"grad_norm": 0.21298226714134216,
"learning_rate": 7.781218726285014e-06,
"loss": 0.2720562815666199,
"step": 277,
"token_acc": 0.9027445373018297
},
{
"epoch": 1.0736077481840194,
"grad_norm": 0.21282611787319183,
"learning_rate": 7.763237355030384e-06,
"loss": 0.2579670548439026,
"step": 278,
"token_acc": 0.9080073119376767
},
{
"epoch": 1.0774818401937045,
"grad_norm": 0.21488887071609497,
"learning_rate": 7.745204383925753e-06,
"loss": 0.2742394804954529,
"step": 279,
"token_acc": 0.9015262545209174
},
{
"epoch": 1.0813559322033899,
"grad_norm": 0.19826629757881165,
"learning_rate": 7.727120149713313e-06,
"loss": 0.23731666803359985,
"step": 280,
"token_acc": 0.9146603883445988
},
{
"epoch": 1.085230024213075,
"grad_norm": 0.20840346813201904,
"learning_rate": 7.708984990092528e-06,
"loss": 0.22673961520195007,
"step": 281,
"token_acc": 0.9184409845576723
},
{
"epoch": 1.0891041162227604,
"grad_norm": 0.21199366450309753,
"learning_rate": 7.690799243713825e-06,
"loss": 0.2788952887058258,
"step": 282,
"token_acc": 0.9002122640890617
},
{
"epoch": 1.0929782082324455,
"grad_norm": 0.23963455855846405,
"learning_rate": 7.672563250172278e-06,
"loss": 0.2703215479850769,
"step": 283,
"token_acc": 0.902904561306835
},
{
"epoch": 1.0968523002421307,
"grad_norm": 0.20739565789699554,
"learning_rate": 7.654277350001255e-06,
"loss": 0.2556743621826172,
"step": 284,
"token_acc": 0.9087778504769448
},
{
"epoch": 1.100726392251816,
"grad_norm": 0.3205340504646301,
"learning_rate": 7.635941884666072e-06,
"loss": 0.2660865783691406,
"step": 285,
"token_acc": 0.9052546447746934
},
{
"epoch": 1.1046004842615011,
"grad_norm": 0.20611628890037537,
"learning_rate": 7.617557196557601e-06,
"loss": 0.2590142488479614,
"step": 286,
"token_acc": 0.9070821077566713
},
{
"epoch": 1.1084745762711865,
"grad_norm": 0.1932753622531891,
"learning_rate": 7.599123628985894e-06,
"loss": 0.2396095246076584,
"step": 287,
"token_acc": 0.9135842317299648
},
{
"epoch": 1.1123486682808716,
"grad_norm": 0.21151748299598694,
"learning_rate": 7.580641526173758e-06,
"loss": 0.2544936537742615,
"step": 288,
"token_acc": 0.9088854539111634
},
{
"epoch": 1.116222760290557,
"grad_norm": 0.1992950737476349,
"learning_rate": 7.5621112332503325e-06,
"loss": 0.2544850707054138,
"step": 289,
"token_acc": 0.9090426161294457
},
{
"epoch": 1.1200968523002421,
"grad_norm": 0.20908565819263458,
"learning_rate": 7.543533096244644e-06,
"loss": 0.2762412428855896,
"step": 290,
"token_acc": 0.9013541447063986
},
{
"epoch": 1.1239709443099273,
"grad_norm": 0.2157965451478958,
"learning_rate": 7.524907462079149e-06,
"loss": 0.25533056259155273,
"step": 291,
"token_acc": 0.9080176353704462
},
{
"epoch": 1.1278450363196126,
"grad_norm": 0.19141145050525665,
"learning_rate": 7.506234678563248e-06,
"loss": 0.2362717241048813,
"step": 292,
"token_acc": 0.9155038610363999
},
{
"epoch": 1.1317191283292978,
"grad_norm": 0.21533732116222382,
"learning_rate": 7.487515094386792e-06,
"loss": 0.23099368810653687,
"step": 293,
"token_acc": 0.9173202498403009
},
{
"epoch": 1.1355932203389831,
"grad_norm": 0.20129309594631195,
"learning_rate": 7.468749059113578e-06,
"loss": 0.26144838333129883,
"step": 294,
"token_acc": 0.9057641431815713
},
{
"epoch": 1.1394673123486683,
"grad_norm": 0.3953739404678345,
"learning_rate": 7.449936923174813e-06,
"loss": 0.2557257413864136,
"step": 295,
"token_acc": 0.9087617787160037
},
{
"epoch": 1.1433414043583534,
"grad_norm": 0.21214410662651062,
"learning_rate": 7.431079037862575e-06,
"loss": 0.27983057498931885,
"step": 296,
"token_acc": 0.8996573827559394
},
{
"epoch": 1.1472154963680388,
"grad_norm": 0.20280665159225464,
"learning_rate": 7.412175755323254e-06,
"loss": 0.2772400677204132,
"step": 297,
"token_acc": 0.9010093723967251
},
{
"epoch": 1.151089588377724,
"grad_norm": 0.21776501834392548,
"learning_rate": 7.39322742855097e-06,
"loss": 0.24517808854579926,
"step": 298,
"token_acc": 0.9120538077359621
},
{
"epoch": 1.1549636803874093,
"grad_norm": 0.21630938351154327,
"learning_rate": 7.374234411380987e-06,
"loss": 0.2736694812774658,
"step": 299,
"token_acc": 0.9020631116999458
},
{
"epoch": 1.1588377723970944,
"grad_norm": 0.19338402152061462,
"learning_rate": 7.355197058483103e-06,
"loss": 0.24092288315296173,
"step": 300,
"token_acc": 0.9133508019967492
}
],
"logging_steps": 1,
"max_steps": 777,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1233780174946304e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}