Mimo-VL-RL_MSR-Align / trainer_state.json
andyc03's picture
Upload folder using huggingface_hub
3b7c8ff verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 30.0,
"global_step": 2982,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000335401643468053,
"grad_norm": 12.375,
"learning_rate": 9.99999722524632e-07,
"loss": 1.274832010269165,
"step": 1,
"token_acc": 0.6722809020839281
},
{
"epoch": 0.001677008217340265,
"grad_norm": 11.5,
"learning_rate": 9.999930631312047e-07,
"loss": 1.330723524093628,
"step": 5,
"token_acc": 0.6504451038575668
},
{
"epoch": 0.00335401643468053,
"grad_norm": 11.4375,
"learning_rate": 9.999722527172996e-07,
"loss": 1.3872632026672362,
"step": 10,
"token_acc": 0.6347948773605383
},
{
"epoch": 0.005031024652020795,
"grad_norm": 10.125,
"learning_rate": 9.999375693357208e-07,
"loss": 1.28599214553833,
"step": 15,
"token_acc": 0.6515521064301553
},
{
"epoch": 0.00670803286936106,
"grad_norm": 10.75,
"learning_rate": 9.99889013948845e-07,
"loss": 1.2960277557373048,
"step": 20,
"token_acc": 0.6534720269890357
},
{
"epoch": 0.008385041086701324,
"grad_norm": 8.6875,
"learning_rate": 9.998265879039611e-07,
"loss": 1.2671630859375,
"step": 25,
"token_acc": 0.655209801582999
},
{
"epoch": 0.01006204930404159,
"grad_norm": 9.625,
"learning_rate": 9.997502929332347e-07,
"loss": 1.261711597442627,
"step": 30,
"token_acc": 0.6588392961193335
},
{
"epoch": 0.011739057521381855,
"grad_norm": 9.125,
"learning_rate": 9.996601311536586e-07,
"loss": 1.2757044792175294,
"step": 35,
"token_acc": 0.6571720381298229
},
{
"epoch": 0.01341606573872212,
"grad_norm": 9.25,
"learning_rate": 9.99556105066994e-07,
"loss": 1.2386951446533203,
"step": 40,
"token_acc": 0.6721679125934445
},
{
"epoch": 0.015093073956062384,
"grad_norm": 8.6875,
"learning_rate": 9.994382175597028e-07,
"loss": 1.2164941787719727,
"step": 45,
"token_acc": 0.6682510594433627
},
{
"epoch": 0.01677008217340265,
"grad_norm": 8.9375,
"learning_rate": 9.993064719028653e-07,
"loss": 1.2428162574768067,
"step": 50,
"token_acc": 0.6631278538812785
},
{
"epoch": 0.018447090390742913,
"grad_norm": 8.3125,
"learning_rate": 9.991608717520907e-07,
"loss": 1.1938260078430176,
"step": 55,
"token_acc": 0.6717246907924874
},
{
"epoch": 0.02012409860808318,
"grad_norm": 7.15625,
"learning_rate": 9.99001421147416e-07,
"loss": 1.22207612991333,
"step": 60,
"token_acc": 0.6641735228122663
},
{
"epoch": 0.021801106825423446,
"grad_norm": 7.5625,
"learning_rate": 9.988281245131927e-07,
"loss": 1.2258419036865233,
"step": 65,
"token_acc": 0.6678375645324391
},
{
"epoch": 0.02347811504276371,
"grad_norm": 7.65625,
"learning_rate": 9.98640986657965e-07,
"loss": 1.2073140144348145,
"step": 70,
"token_acc": 0.6601010387693705
},
{
"epoch": 0.025155123260103975,
"grad_norm": 8.125,
"learning_rate": 9.984400127743356e-07,
"loss": 1.1937344551086426,
"step": 75,
"token_acc": 0.672787979966611
},
{
"epoch": 0.02683213147744424,
"grad_norm": 7.8125,
"learning_rate": 9.982252084388226e-07,
"loss": 1.219521713256836,
"step": 80,
"token_acc": 0.6679824317562684
},
{
"epoch": 0.028509139694784504,
"grad_norm": 7.3125,
"learning_rate": 9.979965796117037e-07,
"loss": 1.1813300132751465,
"step": 85,
"token_acc": 0.6724050059459765
},
{
"epoch": 0.030186147912124768,
"grad_norm": 7.21875,
"learning_rate": 9.977541326368517e-07,
"loss": 1.2407192230224608,
"step": 90,
"token_acc": 0.6603100321731501
},
{
"epoch": 0.03186315612946503,
"grad_norm": 7.40625,
"learning_rate": 9.974978742415584e-07,
"loss": 1.228166675567627,
"step": 95,
"token_acc": 0.6610423922180857
},
{
"epoch": 0.0335401643468053,
"grad_norm": 6.8125,
"learning_rate": 9.97227811536347e-07,
"loss": 1.204833984375,
"step": 100,
"token_acc": 0.6694168490767245
},
{
"epoch": 0.03521717256414556,
"grad_norm": 7.40625,
"learning_rate": 9.969439520147753e-07,
"loss": 1.1833898544311523,
"step": 105,
"token_acc": 0.6732022471910112
},
{
"epoch": 0.036894180781485826,
"grad_norm": 6.90625,
"learning_rate": 9.966463035532288e-07,
"loss": 1.1810292243957519,
"step": 110,
"token_acc": 0.673891419297419
},
{
"epoch": 0.0385711889988261,
"grad_norm": 6.78125,
"learning_rate": 9.963348744107008e-07,
"loss": 1.1781543731689452,
"step": 115,
"token_acc": 0.674726171055698
},
{
"epoch": 0.04024819721616636,
"grad_norm": 7.09375,
"learning_rate": 9.960096732285637e-07,
"loss": 1.1450292587280273,
"step": 120,
"token_acc": 0.681571495988674
},
{
"epoch": 0.04192520543350663,
"grad_norm": 6.96875,
"learning_rate": 9.956707090303289e-07,
"loss": 1.20210599899292,
"step": 125,
"token_acc": 0.6689448241373562
},
{
"epoch": 0.04360221365084689,
"grad_norm": 6.5,
"learning_rate": 9.953179912213974e-07,
"loss": 1.2348913192749023,
"step": 130,
"token_acc": 0.656823718139148
},
{
"epoch": 0.045279221868187156,
"grad_norm": 6.6875,
"learning_rate": 9.949515295887978e-07,
"loss": 1.1797042846679688,
"step": 135,
"token_acc": 0.671729097348932
},
{
"epoch": 0.04695623008552742,
"grad_norm": 7.1875,
"learning_rate": 9.945713343009152e-07,
"loss": 1.1191802978515626,
"step": 140,
"token_acc": 0.6833774573311553
},
{
"epoch": 0.048633238302867685,
"grad_norm": 6.875,
"learning_rate": 9.941774159072088e-07,
"loss": 1.1308756828308106,
"step": 145,
"token_acc": 0.6832326456172347
},
{
"epoch": 0.05031024652020795,
"grad_norm": 6.59375,
"learning_rate": 9.937697853379192e-07,
"loss": 1.2008363723754882,
"step": 150,
"token_acc": 0.665910345250506
},
{
"epoch": 0.051987254737548214,
"grad_norm": 7.78125,
"learning_rate": 9.93348453903766e-07,
"loss": 1.1969077110290527,
"step": 155,
"token_acc": 0.6697475202885482
},
{
"epoch": 0.05366426295488848,
"grad_norm": 6.5625,
"learning_rate": 9.929134332956327e-07,
"loss": 1.1653069496154784,
"step": 160,
"token_acc": 0.6718004091839054
},
{
"epoch": 0.05534127117222874,
"grad_norm": 6.8125,
"learning_rate": 9.924647355842421e-07,
"loss": 1.123036003112793,
"step": 165,
"token_acc": 0.6868122371377542
},
{
"epoch": 0.05701827938956901,
"grad_norm": 6.78125,
"learning_rate": 9.920023732198237e-07,
"loss": 1.1395354270935059,
"step": 170,
"token_acc": 0.6813573228623337
},
{
"epoch": 0.05869528760690927,
"grad_norm": 6.40625,
"learning_rate": 9.915263590317654e-07,
"loss": 1.1792012214660645,
"step": 175,
"token_acc": 0.6715845846896986
},
{
"epoch": 0.060372295824249536,
"grad_norm": 6.34375,
"learning_rate": 9.91036706228259e-07,
"loss": 1.1243346214294434,
"step": 180,
"token_acc": 0.6848594635657103
},
{
"epoch": 0.0620493040415898,
"grad_norm": 6.96875,
"learning_rate": 9.905334283959333e-07,
"loss": 1.143388843536377,
"step": 185,
"token_acc": 0.6788610770756753
},
{
"epoch": 0.06372631225893007,
"grad_norm": 6.21875,
"learning_rate": 9.90016539499478e-07,
"loss": 1.1932525634765625,
"step": 190,
"token_acc": 0.6702515177797052
},
{
"epoch": 0.06540332047627033,
"grad_norm": 6.5,
"learning_rate": 9.894860538812545e-07,
"loss": 1.143165111541748,
"step": 195,
"token_acc": 0.6809917355371901
},
{
"epoch": 0.0670803286936106,
"grad_norm": 6.1875,
"learning_rate": 9.889419862608995e-07,
"loss": 1.1876848220825196,
"step": 200,
"token_acc": 0.6701633023361306
},
{
"epoch": 0.06875733691095086,
"grad_norm": 6.625,
"learning_rate": 9.883843517349157e-07,
"loss": 1.144710636138916,
"step": 205,
"token_acc": 0.6796767632345665
},
{
"epoch": 0.07043434512829112,
"grad_norm": 6.15625,
"learning_rate": 9.878131657762535e-07,
"loss": 1.1471405982971192,
"step": 210,
"token_acc": 0.6775461840900523
},
{
"epoch": 0.07211135334563139,
"grad_norm": 6.8125,
"learning_rate": 9.872284442338807e-07,
"loss": 1.1314339637756348,
"step": 215,
"token_acc": 0.683370644624304
},
{
"epoch": 0.07378836156297165,
"grad_norm": 6.96875,
"learning_rate": 9.86630203332344e-07,
"loss": 1.1659558296203614,
"step": 220,
"token_acc": 0.67846951255023
},
{
"epoch": 0.07546536978031192,
"grad_norm": 6.25,
"learning_rate": 9.860184596713182e-07,
"loss": 1.164224624633789,
"step": 225,
"token_acc": 0.6794126284875184
},
{
"epoch": 0.0771423779976522,
"grad_norm": 6.625,
"learning_rate": 9.853932302251449e-07,
"loss": 1.1158638000488281,
"step": 230,
"token_acc": 0.6871271225332721
},
{
"epoch": 0.07881938621499246,
"grad_norm": 7.0,
"learning_rate": 9.847545323423632e-07,
"loss": 1.1727291107177735,
"step": 235,
"token_acc": 0.6663918982571833
},
{
"epoch": 0.08049639443233272,
"grad_norm": 6.125,
"learning_rate": 9.84102383745226e-07,
"loss": 1.105608081817627,
"step": 240,
"token_acc": 0.6856490759296371
},
{
"epoch": 0.08217340264967299,
"grad_norm": 7.03125,
"learning_rate": 9.834368025292112e-07,
"loss": 1.123321533203125,
"step": 245,
"token_acc": 0.6824586230234418
},
{
"epoch": 0.08385041086701325,
"grad_norm": 5.5625,
"learning_rate": 9.827578071625163e-07,
"loss": 1.1252297401428222,
"step": 250,
"token_acc": 0.6880288131717357
},
{
"epoch": 0.08552741908435352,
"grad_norm": 5.8125,
"learning_rate": 9.82065416485549e-07,
"loss": 1.144364833831787,
"step": 255,
"token_acc": 0.6834830684174154
},
{
"epoch": 0.08720442730169378,
"grad_norm": 6.28125,
"learning_rate": 9.813596497104018e-07,
"loss": 1.1258564949035645,
"step": 260,
"token_acc": 0.6776172457287009
},
{
"epoch": 0.08888143551903405,
"grad_norm": 6.375,
"learning_rate": 9.806405264203213e-07,
"loss": 1.1180498123168945,
"step": 265,
"token_acc": 0.6824193364496637
},
{
"epoch": 0.09055844373637431,
"grad_norm": 6.40625,
"learning_rate": 9.79908066569163e-07,
"loss": 1.1487442016601563,
"step": 270,
"token_acc": 0.6782154722354058
},
{
"epoch": 0.09223545195371458,
"grad_norm": 6.40625,
"learning_rate": 9.79162290480838e-07,
"loss": 1.1168856620788574,
"step": 275,
"token_acc": 0.6839214769806029
},
{
"epoch": 0.09391246017105484,
"grad_norm": 6.78125,
"learning_rate": 9.784032188487506e-07,
"loss": 1.1488576889038087,
"step": 280,
"token_acc": 0.6758741258741259
},
{
"epoch": 0.0955894683883951,
"grad_norm": 6.8125,
"learning_rate": 9.776308727352214e-07,
"loss": 1.105551528930664,
"step": 285,
"token_acc": 0.6866073519082182
},
{
"epoch": 0.09726647660573537,
"grad_norm": 6.3125,
"learning_rate": 9.768452735709054e-07,
"loss": 1.1006428718566894,
"step": 290,
"token_acc": 0.6875408084525435
},
{
"epoch": 0.09894348482307563,
"grad_norm": 6.03125,
"learning_rate": 9.760464431541956e-07,
"loss": 1.1159579277038574,
"step": 295,
"token_acc": 0.6872186685619521
},
{
"epoch": 0.1006204930404159,
"grad_norm": 5.96875,
"learning_rate": 9.752344036506197e-07,
"loss": 1.0885606765747071,
"step": 300,
"token_acc": 0.6948132222520828
},
{
"epoch": 0.10229750125775616,
"grad_norm": 6.25,
"learning_rate": 9.74409177592223e-07,
"loss": 1.0900307655334474,
"step": 305,
"token_acc": 0.6930264477643456
},
{
"epoch": 0.10397450947509643,
"grad_norm": 6.03125,
"learning_rate": 9.735707878769456e-07,
"loss": 1.1054121017456056,
"step": 310,
"token_acc": 0.6892293353527116
},
{
"epoch": 0.10565151769243669,
"grad_norm": 6.21875,
"learning_rate": 9.72719257767985e-07,
"loss": 1.0906902313232423,
"step": 315,
"token_acc": 0.6898504082484057
},
{
"epoch": 0.10732852590977696,
"grad_norm": 6.0,
"learning_rate": 9.71854610893152e-07,
"loss": 1.1318517684936524,
"step": 320,
"token_acc": 0.6846991214013373
},
{
"epoch": 0.10900553412711722,
"grad_norm": 6.1875,
"learning_rate": 9.709768712442142e-07,
"loss": 1.111149215698242,
"step": 325,
"token_acc": 0.6831088755476229
},
{
"epoch": 0.11068254234445749,
"grad_norm": 5.78125,
"learning_rate": 9.700860631762307e-07,
"loss": 1.0418154716491699,
"step": 330,
"token_acc": 0.6975177882787464
},
{
"epoch": 0.11235955056179775,
"grad_norm": 6.90625,
"learning_rate": 9.69182211406876e-07,
"loss": 1.0776394844055175,
"step": 335,
"token_acc": 0.6928902718252803
},
{
"epoch": 0.11403655877913801,
"grad_norm": 6.875,
"learning_rate": 9.68265341015755e-07,
"loss": 1.1534889221191407,
"step": 340,
"token_acc": 0.6755254892486108
},
{
"epoch": 0.11571356699647828,
"grad_norm": 5.96875,
"learning_rate": 9.673354774437062e-07,
"loss": 1.109882926940918,
"step": 345,
"token_acc": 0.684797277474623
},
{
"epoch": 0.11739057521381854,
"grad_norm": 6.15625,
"learning_rate": 9.663926464920956e-07,
"loss": 1.060962200164795,
"step": 350,
"token_acc": 0.6972844314616466
},
{
"epoch": 0.11906758343115881,
"grad_norm": 6.34375,
"learning_rate": 9.65436874322102e-07,
"loss": 1.134366798400879,
"step": 355,
"token_acc": 0.6782759180141691
},
{
"epoch": 0.12074459164849907,
"grad_norm": 6.0,
"learning_rate": 9.6446818745399e-07,
"loss": 1.0987051010131836,
"step": 360,
"token_acc": 0.6879944960440316
},
{
"epoch": 0.12242159986583934,
"grad_norm": 6.3125,
"learning_rate": 9.634866127663737e-07,
"loss": 1.111135196685791,
"step": 365,
"token_acc": 0.6866663123771058
},
{
"epoch": 0.1240986080831796,
"grad_norm": 5.6875,
"learning_rate": 9.624921774954732e-07,
"loss": 1.0923819541931152,
"step": 370,
"token_acc": 0.6919647711210177
},
{
"epoch": 0.12577561630051987,
"grad_norm": 6.625,
"learning_rate": 9.614849092343563e-07,
"loss": 1.0786520004272462,
"step": 375,
"token_acc": 0.6894919168591224
},
{
"epoch": 0.12745262451786013,
"grad_norm": 5.71875,
"learning_rate": 9.60464835932174e-07,
"loss": 1.086344051361084,
"step": 380,
"token_acc": 0.6883529734173747
},
{
"epoch": 0.1291296327352004,
"grad_norm": 6.5625,
"learning_rate": 9.594319858933847e-07,
"loss": 1.1158102989196776,
"step": 385,
"token_acc": 0.6838648035351725
},
{
"epoch": 0.13080664095254066,
"grad_norm": 6.375,
"learning_rate": 9.583863877769696e-07,
"loss": 1.085710620880127,
"step": 390,
"token_acc": 0.6940853163450128
},
{
"epoch": 0.13248364916988092,
"grad_norm": 6.125,
"learning_rate": 9.573280705956364e-07,
"loss": 1.0663482666015625,
"step": 395,
"token_acc": 0.695432995258957
},
{
"epoch": 0.1341606573872212,
"grad_norm": 6.4375,
"learning_rate": 9.562570637150144e-07,
"loss": 1.1113270759582519,
"step": 400,
"token_acc": 0.6843401825205756
},
{
"epoch": 0.13583766560456145,
"grad_norm": 6.0,
"learning_rate": 9.55173396852841e-07,
"loss": 1.0923892974853515,
"step": 405,
"token_acc": 0.6873383560103452
},
{
"epoch": 0.13751467382190172,
"grad_norm": 5.96875,
"learning_rate": 9.540771000781357e-07,
"loss": 1.1199935913085937,
"step": 410,
"token_acc": 0.6863687150837989
},
{
"epoch": 0.13919168203924198,
"grad_norm": 6.65625,
"learning_rate": 9.529682038103653e-07,
"loss": 1.1222180366516112,
"step": 415,
"token_acc": 0.6797772775737472
},
{
"epoch": 0.14086869025658225,
"grad_norm": 5.96875,
"learning_rate": 9.518467388186019e-07,
"loss": 1.1468374252319335,
"step": 420,
"token_acc": 0.6787428571428571
},
{
"epoch": 0.1425456984739225,
"grad_norm": 5.75,
"learning_rate": 9.507127362206675e-07,
"loss": 1.1019716262817383,
"step": 425,
"token_acc": 0.688788173087299
},
{
"epoch": 0.14422270669126278,
"grad_norm": 6.8125,
"learning_rate": 9.495662274822711e-07,
"loss": 1.0751092910766602,
"step": 430,
"token_acc": 0.6973642825690382
},
{
"epoch": 0.14589971490860304,
"grad_norm": 6.28125,
"learning_rate": 9.484072444161354e-07,
"loss": 1.108968734741211,
"step": 435,
"token_acc": 0.6854776790610947
},
{
"epoch": 0.1475767231259433,
"grad_norm": 6.1875,
"learning_rate": 9.472358191811143e-07,
"loss": 1.0823619842529297,
"step": 440,
"token_acc": 0.6868514020755133
},
{
"epoch": 0.14925373134328357,
"grad_norm": 6.3125,
"learning_rate": 9.460519842813003e-07,
"loss": 1.1193353652954101,
"step": 445,
"token_acc": 0.685983750923243
},
{
"epoch": 0.15093073956062383,
"grad_norm": 6.4375,
"learning_rate": 9.448557725651229e-07,
"loss": 1.10097017288208,
"step": 450,
"token_acc": 0.6858677638722213
},
{
"epoch": 0.15260774777796413,
"grad_norm": 6.40625,
"learning_rate": 9.436472172244373e-07,
"loss": 1.0627543449401855,
"step": 455,
"token_acc": 0.6981000522921388
},
{
"epoch": 0.1542847559953044,
"grad_norm": 6.28125,
"learning_rate": 9.424263517936026e-07,
"loss": 1.0785947799682618,
"step": 460,
"token_acc": 0.6919910406198923
},
{
"epoch": 0.15596176421264465,
"grad_norm": 6.21875,
"learning_rate": 9.41193210148552e-07,
"loss": 1.0722208976745606,
"step": 465,
"token_acc": 0.690248354444383
},
{
"epoch": 0.15763877242998492,
"grad_norm": 6.1875,
"learning_rate": 9.399478265058528e-07,
"loss": 1.0784868240356444,
"step": 470,
"token_acc": 0.6923379837983799
},
{
"epoch": 0.15931578064732518,
"grad_norm": 5.5625,
"learning_rate": 9.386902354217565e-07,
"loss": 1.085107421875,
"step": 475,
"token_acc": 0.6940270167634135
},
{
"epoch": 0.16099278886466545,
"grad_norm": 6.15625,
"learning_rate": 9.374204717912408e-07,
"loss": 1.0912228584289552,
"step": 480,
"token_acc": 0.6895368782161235
},
{
"epoch": 0.1626697970820057,
"grad_norm": 6.46875,
"learning_rate": 9.361385708470405e-07,
"loss": 1.0886618614196777,
"step": 485,
"token_acc": 0.6871300524268561
},
{
"epoch": 0.16434680529934598,
"grad_norm": 6.28125,
"learning_rate": 9.3484456815867e-07,
"loss": 1.0357705116271974,
"step": 490,
"token_acc": 0.7033262723993817
},
{
"epoch": 0.16602381351668624,
"grad_norm": 6.5,
"learning_rate": 9.335384996314371e-07,
"loss": 1.0864218711853026,
"step": 495,
"token_acc": 0.6869781488719133
},
{
"epoch": 0.1677008217340265,
"grad_norm": 5.625,
"learning_rate": 9.322204015054454e-07,
"loss": 1.064340591430664,
"step": 500,
"token_acc": 0.6971830985915493
},
{
"epoch": 0.16937782995136677,
"grad_norm": 5.84375,
"learning_rate": 9.308903103545902e-07,
"loss": 1.10338134765625,
"step": 505,
"token_acc": 0.690793249913902
},
{
"epoch": 0.17105483816870704,
"grad_norm": 5.71875,
"learning_rate": 9.295482630855427e-07,
"loss": 1.0584440231323242,
"step": 510,
"token_acc": 0.6965631929046563
},
{
"epoch": 0.1727318463860473,
"grad_norm": 6.40625,
"learning_rate": 9.281942969367262e-07,
"loss": 1.101362133026123,
"step": 515,
"token_acc": 0.6852114339711788
},
{
"epoch": 0.17440885460338756,
"grad_norm": 6.03125,
"learning_rate": 9.268284494772829e-07,
"loss": 1.0686640739440918,
"step": 520,
"token_acc": 0.6929240558534142
},
{
"epoch": 0.17608586282072783,
"grad_norm": 5.875,
"learning_rate": 9.25450758606031e-07,
"loss": 1.068457794189453,
"step": 525,
"token_acc": 0.6926385165326184
},
{
"epoch": 0.1777628710380681,
"grad_norm": 5.78125,
"learning_rate": 9.24061262550414e-07,
"loss": 1.0507415771484374,
"step": 530,
"token_acc": 0.6969854770583811
},
{
"epoch": 0.17943987925540836,
"grad_norm": 5.8125,
"learning_rate": 9.226599998654391e-07,
"loss": 1.0741724967956543,
"step": 535,
"token_acc": 0.6954720153735287
},
{
"epoch": 0.18111688747274862,
"grad_norm": 6.03125,
"learning_rate": 9.212470094326079e-07,
"loss": 1.0903028488159179,
"step": 540,
"token_acc": 0.684026406364611
},
{
"epoch": 0.1827938956900889,
"grad_norm": 6.59375,
"learning_rate": 9.198223304588374e-07,
"loss": 1.0722553253173828,
"step": 545,
"token_acc": 0.691402934400388
},
{
"epoch": 0.18447090390742915,
"grad_norm": 5.625,
"learning_rate": 9.18386002475372e-07,
"loss": 1.0605772972106933,
"step": 550,
"token_acc": 0.6977486740989285
},
{
"epoch": 0.18614791212476942,
"grad_norm": 6.625,
"learning_rate": 9.169380653366869e-07,
"loss": 1.077211570739746,
"step": 555,
"token_acc": 0.6934850863422292
},
{
"epoch": 0.18782492034210968,
"grad_norm": 5.96875,
"learning_rate": 9.154785592193819e-07,
"loss": 1.0502148628234864,
"step": 560,
"token_acc": 0.6995120856934552
},
{
"epoch": 0.18950192855944994,
"grad_norm": 6.46875,
"learning_rate": 9.140075246210665e-07,
"loss": 1.0825450897216797,
"step": 565,
"token_acc": 0.6873532947844507
},
{
"epoch": 0.1911789367767902,
"grad_norm": 6.78125,
"learning_rate": 9.125250023592371e-07,
"loss": 1.0839460372924805,
"step": 570,
"token_acc": 0.6894161870081053
},
{
"epoch": 0.19285594499413047,
"grad_norm": 5.9375,
"learning_rate": 9.11031033570143e-07,
"loss": 1.0788863182067872,
"step": 575,
"token_acc": 0.692129246064623
},
{
"epoch": 0.19453295321147074,
"grad_norm": 6.625,
"learning_rate": 9.095256597076464e-07,
"loss": 1.0854562759399413,
"step": 580,
"token_acc": 0.6882365102336159
},
{
"epoch": 0.196209961428811,
"grad_norm": 5.34375,
"learning_rate": 9.08008922542071e-07,
"loss": 1.0595365524291993,
"step": 585,
"token_acc": 0.6933667083854819
},
{
"epoch": 0.19788696964615127,
"grad_norm": 5.6875,
"learning_rate": 9.064808641590438e-07,
"loss": 1.0630743026733398,
"step": 590,
"token_acc": 0.6950098315353138
},
{
"epoch": 0.19956397786349153,
"grad_norm": 5.875,
"learning_rate": 9.049415269583267e-07,
"loss": 1.0720837593078614,
"step": 595,
"token_acc": 0.6954955895401639
},
{
"epoch": 0.2012409860808318,
"grad_norm": 6.125,
"learning_rate": 9.033909536526405e-07,
"loss": 1.1050517082214355,
"step": 600,
"token_acc": 0.6858130348913759
},
{
"epoch": 0.20291799429817206,
"grad_norm": 6.125,
"learning_rate": 9.018291872664796e-07,
"loss": 1.088867473602295,
"step": 605,
"token_acc": 0.6960822450680745
},
{
"epoch": 0.20459500251551233,
"grad_norm": 6.15625,
"learning_rate": 9.00256271134918e-07,
"loss": 1.059322452545166,
"step": 610,
"token_acc": 0.6967294969417052
},
{
"epoch": 0.2062720107328526,
"grad_norm": 5.875,
"learning_rate": 8.98672248902407e-07,
"loss": 1.0721155166625977,
"step": 615,
"token_acc": 0.6950203894089944
},
{
"epoch": 0.20794901895019285,
"grad_norm": 5.96875,
"learning_rate": 8.970771645215643e-07,
"loss": 1.0299058914184571,
"step": 620,
"token_acc": 0.6998539161703562
},
{
"epoch": 0.20962602716753312,
"grad_norm": 6.0625,
"learning_rate": 8.95471062251954e-07,
"loss": 1.0335143089294434,
"step": 625,
"token_acc": 0.7026568991909031
},
{
"epoch": 0.21130303538487338,
"grad_norm": 6.1875,
"learning_rate": 8.938539866588592e-07,
"loss": 1.0810824394226075,
"step": 630,
"token_acc": 0.6905426180037579
},
{
"epoch": 0.21298004360221365,
"grad_norm": 6.0625,
"learning_rate": 8.922259826120444e-07,
"loss": 1.0932263374328612,
"step": 635,
"token_acc": 0.6908346311357213
},
{
"epoch": 0.2146570518195539,
"grad_norm": 5.53125,
"learning_rate": 8.905870952845118e-07,
"loss": 1.0775763511657714,
"step": 640,
"token_acc": 0.6908716005349977
},
{
"epoch": 0.21633406003689418,
"grad_norm": 6.5,
"learning_rate": 8.889373701512468e-07,
"loss": 1.0795653343200684,
"step": 645,
"token_acc": 0.6852801061327866
},
{
"epoch": 0.21801106825423444,
"grad_norm": 6.25,
"learning_rate": 8.872768529879564e-07,
"loss": 1.021756649017334,
"step": 650,
"token_acc": 0.70451876436048
},
{
"epoch": 0.2196880764715747,
"grad_norm": 5.90625,
"learning_rate": 8.856055898697997e-07,
"loss": 1.1029382705688477,
"step": 655,
"token_acc": 0.6815459687257402
},
{
"epoch": 0.22136508468891497,
"grad_norm": 6.1875,
"learning_rate": 8.839236271701082e-07,
"loss": 1.0495551109313965,
"step": 660,
"token_acc": 0.6949602122015915
},
{
"epoch": 0.22304209290625523,
"grad_norm": 6.8125,
"learning_rate": 8.822310115591007e-07,
"loss": 1.0997918128967286,
"step": 665,
"token_acc": 0.6854550609583215
},
{
"epoch": 0.2247191011235955,
"grad_norm": 5.6875,
"learning_rate": 8.805277900025863e-07,
"loss": 1.0630650520324707,
"step": 670,
"token_acc": 0.6942992874109264
},
{
"epoch": 0.22639610934093576,
"grad_norm": 5.78125,
"learning_rate": 8.788140097606631e-07,
"loss": 1.0519957542419434,
"step": 675,
"token_acc": 0.6950943822764779
},
{
"epoch": 0.22807311755827603,
"grad_norm": 6.0,
"learning_rate": 8.770897183864059e-07,
"loss": 1.1048961639404298,
"step": 680,
"token_acc": 0.6844065077910174
},
{
"epoch": 0.2297501257756163,
"grad_norm": 5.78125,
"learning_rate": 8.753549637245467e-07,
"loss": 1.0654611587524414,
"step": 685,
"token_acc": 0.6934993924665857
},
{
"epoch": 0.23142713399295656,
"grad_norm": 5.84375,
"learning_rate": 8.736097939101476e-07,
"loss": 1.0599603652954102,
"step": 690,
"token_acc": 0.6970522268503685
},
{
"epoch": 0.23310414221029682,
"grad_norm": 6.875,
"learning_rate": 8.718542573672644e-07,
"loss": 1.1132248878479003,
"step": 695,
"token_acc": 0.6831187846989202
},
{
"epoch": 0.2347811504276371,
"grad_norm": 6.40625,
"learning_rate": 8.700884028076041e-07,
"loss": 1.0341250419616699,
"step": 700,
"token_acc": 0.6999466413707239
},
{
"epoch": 0.23645815864497735,
"grad_norm": 5.46875,
"learning_rate": 8.683122792291719e-07,
"loss": 1.0390054702758789,
"step": 705,
"token_acc": 0.6990116801437556
},
{
"epoch": 0.23813516686231762,
"grad_norm": 6.15625,
"learning_rate": 8.66525935914913e-07,
"loss": 1.0598421096801758,
"step": 710,
"token_acc": 0.6982547993019197
},
{
"epoch": 0.23981217507965788,
"grad_norm": 5.625,
"learning_rate": 8.647294224313442e-07,
"loss": 1.0474308967590331,
"step": 715,
"token_acc": 0.6987998238273508
},
{
"epoch": 0.24148918329699814,
"grad_norm": 6.125,
"learning_rate": 8.629227886271786e-07,
"loss": 1.0467673301696778,
"step": 720,
"token_acc": 0.6986263736263736
},
{
"epoch": 0.2431661915143384,
"grad_norm": 6.46875,
"learning_rate": 8.611060846319431e-07,
"loss": 1.1083699226379395,
"step": 725,
"token_acc": 0.6849483810417645
},
{
"epoch": 0.24484319973167867,
"grad_norm": 5.625,
"learning_rate": 8.592793608545863e-07,
"loss": 1.0226441383361817,
"step": 730,
"token_acc": 0.6998483025579327
},
{
"epoch": 0.24652020794901894,
"grad_norm": 6.375,
"learning_rate": 8.574426679820813e-07,
"loss": 1.0406004905700683,
"step": 735,
"token_acc": 0.6985557481224726
},
{
"epoch": 0.2481972161663592,
"grad_norm": 5.96875,
"learning_rate": 8.555960569780176e-07,
"loss": 1.056182861328125,
"step": 740,
"token_acc": 0.6949429037520392
},
{
"epoch": 0.24987422438369947,
"grad_norm": 6.25,
"learning_rate": 8.537395790811885e-07,
"loss": 1.0577526092529297,
"step": 745,
"token_acc": 0.6974981448107707
},
{
"epoch": 0.25155123260103973,
"grad_norm": 5.625,
"learning_rate": 8.518732858041684e-07,
"loss": 1.031444263458252,
"step": 750,
"token_acc": 0.7030166435506241
},
{
"epoch": 0.25322824081838,
"grad_norm": 6.71875,
"learning_rate": 8.499972289318835e-07,
"loss": 1.0735219955444335,
"step": 755,
"token_acc": 0.6897931192136244
},
{
"epoch": 0.25490524903572026,
"grad_norm": 6.53125,
"learning_rate": 8.481114605201754e-07,
"loss": 1.0631572723388671,
"step": 760,
"token_acc": 0.6946527350952674
},
{
"epoch": 0.25658225725306055,
"grad_norm": 6.15625,
"learning_rate": 8.462160328943563e-07,
"loss": 1.0816566467285156,
"step": 765,
"token_acc": 0.6897627597574616
},
{
"epoch": 0.2582592654704008,
"grad_norm": 5.71875,
"learning_rate": 8.443109986477572e-07,
"loss": 1.067441463470459,
"step": 770,
"token_acc": 0.6974284888760474
},
{
"epoch": 0.2599362736877411,
"grad_norm": 6.34375,
"learning_rate": 8.423964106402686e-07,
"loss": 1.0774710655212403,
"step": 775,
"token_acc": 0.6951841988459831
},
{
"epoch": 0.2616132819050813,
"grad_norm": 5.78125,
"learning_rate": 8.404723219968735e-07,
"loss": 1.040436363220215,
"step": 780,
"token_acc": 0.7035983263598327
},
{
"epoch": 0.2632902901224216,
"grad_norm": 6.15625,
"learning_rate": 8.385387861061741e-07,
"loss": 1.0534331321716308,
"step": 785,
"token_acc": 0.69294556654623
},
{
"epoch": 0.26496729833976185,
"grad_norm": 6.34375,
"learning_rate": 8.365958566189093e-07,
"loss": 1.0408141136169433,
"step": 790,
"token_acc": 0.6980718161875072
},
{
"epoch": 0.26664430655710214,
"grad_norm": 6.125,
"learning_rate": 8.346435874464669e-07,
"loss": 1.0549689292907716,
"step": 795,
"token_acc": 0.697240352685217
},
{
"epoch": 0.2683213147744424,
"grad_norm": 6.15625,
"learning_rate": 8.326820327593874e-07,
"loss": 1.0545565605163574,
"step": 800,
"token_acc": 0.6958881578947368
},
{
"epoch": 0.26999832299178267,
"grad_norm": 6.125,
"learning_rate": 8.307112469858608e-07,
"loss": 1.0242762565612793,
"step": 805,
"token_acc": 0.7026535164964189
},
{
"epoch": 0.2716753312091229,
"grad_norm": 5.28125,
"learning_rate": 8.287312848102162e-07,
"loss": 1.0034321784973144,
"step": 810,
"token_acc": 0.7086971121558092
},
{
"epoch": 0.2733523394264632,
"grad_norm": 6.75,
"learning_rate": 8.267422011714052e-07,
"loss": 1.06221284866333,
"step": 815,
"token_acc": 0.6891465953018139
},
{
"epoch": 0.27502934764380343,
"grad_norm": 5.59375,
"learning_rate": 8.247440512614767e-07,
"loss": 1.0829678535461427,
"step": 820,
"token_acc": 0.6906236178681999
},
{
"epoch": 0.2767063558611437,
"grad_norm": 6.0,
"learning_rate": 8.227368905240455e-07,
"loss": 1.0397522926330567,
"step": 825,
"token_acc": 0.6986963959181266
},
{
"epoch": 0.27838336407848396,
"grad_norm": 5.6875,
"learning_rate": 8.207207746527545e-07,
"loss": 1.062535858154297,
"step": 830,
"token_acc": 0.6948282453170226
},
{
"epoch": 0.28006037229582426,
"grad_norm": 6.3125,
"learning_rate": 8.186957595897287e-07,
"loss": 1.0378836631774901,
"step": 835,
"token_acc": 0.7008086253369272
},
{
"epoch": 0.2817373805131645,
"grad_norm": 6.09375,
"learning_rate": 8.166619015240235e-07,
"loss": 1.0530453681945802,
"step": 840,
"token_acc": 0.6956012711155711
},
{
"epoch": 0.2834143887305048,
"grad_norm": 6.4375,
"learning_rate": 8.146192568900649e-07,
"loss": 1.1169618606567382,
"step": 845,
"token_acc": 0.6829694849837172
},
{
"epoch": 0.285091396947845,
"grad_norm": 5.90625,
"learning_rate": 8.125678823660842e-07,
"loss": 1.0521310806274413,
"step": 850,
"token_acc": 0.6975178316690442
},
{
"epoch": 0.2867684051651853,
"grad_norm": 5.75,
"learning_rate": 8.105078348725454e-07,
"loss": 1.0460372924804688,
"step": 855,
"token_acc": 0.6969783842669571
},
{
"epoch": 0.28844541338252555,
"grad_norm": 6.09375,
"learning_rate": 8.084391715705647e-07,
"loss": 1.0443012237548828,
"step": 860,
"token_acc": 0.697248736664795
},
{
"epoch": 0.29012242159986584,
"grad_norm": 5.6875,
"learning_rate": 8.06361949860326e-07,
"loss": 1.0336613655090332,
"step": 865,
"token_acc": 0.6974640522875817
},
{
"epoch": 0.2917994298172061,
"grad_norm": 5.71875,
"learning_rate": 8.042762273794872e-07,
"loss": 1.067410945892334,
"step": 870,
"token_acc": 0.691785183612821
},
{
"epoch": 0.29347643803454637,
"grad_norm": 5.875,
"learning_rate": 8.021820620015812e-07,
"loss": 1.1038573265075684,
"step": 875,
"token_acc": 0.6870427572752398
},
{
"epoch": 0.2951534462518866,
"grad_norm": 5.875,
"learning_rate": 8.000795118344093e-07,
"loss": 1.0380253791809082,
"step": 880,
"token_acc": 0.7000665041010863
},
{
"epoch": 0.2968304544692269,
"grad_norm": 6.28125,
"learning_rate": 7.979686352184306e-07,
"loss": 1.0550785064697266,
"step": 885,
"token_acc": 0.6930515759312321
},
{
"epoch": 0.29850746268656714,
"grad_norm": 6.9375,
"learning_rate": 7.958494907251414e-07,
"loss": 1.074232292175293,
"step": 890,
"token_acc": 0.6983671171171171
},
{
"epoch": 0.30018447090390743,
"grad_norm": 6.28125,
"learning_rate": 7.937221371554512e-07,
"loss": 1.0676633834838867,
"step": 895,
"token_acc": 0.6919191919191919
},
{
"epoch": 0.30186147912124767,
"grad_norm": 6.125,
"learning_rate": 7.915866335380499e-07,
"loss": 1.0565213203430175,
"step": 900,
"token_acc": 0.6963085336675402
},
{
"epoch": 0.30353848733858796,
"grad_norm": 6.5,
"learning_rate": 7.894430391277713e-07,
"loss": 1.0706295013427733,
"step": 905,
"token_acc": 0.689251808318264
},
{
"epoch": 0.30521549555592825,
"grad_norm": 5.59375,
"learning_rate": 7.872914134039484e-07,
"loss": 1.0703039169311523,
"step": 910,
"token_acc": 0.6952719476416159
},
{
"epoch": 0.3068925037732685,
"grad_norm": 6.09375,
"learning_rate": 7.851318160687624e-07,
"loss": 1.015502643585205,
"step": 915,
"token_acc": 0.704557514297506
},
{
"epoch": 0.3085695119906088,
"grad_norm": 6.53125,
"learning_rate": 7.829643070455864e-07,
"loss": 1.0821684837341308,
"step": 920,
"token_acc": 0.6846540956769539
},
{
"epoch": 0.310246520207949,
"grad_norm": 6.25,
"learning_rate": 7.807889464773237e-07,
"loss": 1.041159725189209,
"step": 925,
"token_acc": 0.6964295508144573
},
{
"epoch": 0.3119235284252893,
"grad_norm": 6.3125,
"learning_rate": 7.786057947247375e-07,
"loss": 1.0455968856811524,
"step": 930,
"token_acc": 0.6932714357626265
},
{
"epoch": 0.31360053664262955,
"grad_norm": 5.75,
"learning_rate": 7.764149123647769e-07,
"loss": 1.0698083877563476,
"step": 935,
"token_acc": 0.6874304163883322
},
{
"epoch": 0.31527754485996984,
"grad_norm": 5.90625,
"learning_rate": 7.742163601888958e-07,
"loss": 1.0722060203552246,
"step": 940,
"token_acc": 0.6909824258138865
},
{
"epoch": 0.3169545530773101,
"grad_norm": 6.1875,
"learning_rate": 7.720101992013661e-07,
"loss": 1.0373089790344239,
"step": 945,
"token_acc": 0.6987164794865918
},
{
"epoch": 0.31863156129465037,
"grad_norm": 6.0,
"learning_rate": 7.69796490617585e-07,
"loss": 1.0665985107421876,
"step": 950,
"token_acc": 0.6956569970602917
},
{
"epoch": 0.3203085695119906,
"grad_norm": 6.125,
"learning_rate": 7.675752958623767e-07,
"loss": 1.085744857788086,
"step": 955,
"token_acc": 0.6910532531068998
},
{
"epoch": 0.3219855777293309,
"grad_norm": 6.375,
"learning_rate": 7.653466765682872e-07,
"loss": 1.0077353477478028,
"step": 960,
"token_acc": 0.7070019610104972
},
{
"epoch": 0.32366258594667113,
"grad_norm": 6.03125,
"learning_rate": 7.631106945738754e-07,
"loss": 1.0229363441467285,
"step": 965,
"token_acc": 0.7041835357624832
},
{
"epoch": 0.3253395941640114,
"grad_norm": 6.0,
"learning_rate": 7.60867411921996e-07,
"loss": 1.047335720062256,
"step": 970,
"token_acc": 0.6972093800479029
},
{
"epoch": 0.32701660238135166,
"grad_norm": 6.0,
"learning_rate": 7.586168908580789e-07,
"loss": 1.0534196853637696,
"step": 975,
"token_acc": 0.6946582691859109
},
{
"epoch": 0.32869361059869195,
"grad_norm": 6.78125,
"learning_rate": 7.56359193828401e-07,
"loss": 1.0379526138305664,
"step": 980,
"token_acc": 0.6913040802510924
},
{
"epoch": 0.3303706188160322,
"grad_norm": 5.96875,
"learning_rate": 7.54094383478355e-07,
"loss": 1.0085951805114746,
"step": 985,
"token_acc": 0.7015451501086892
},
{
"epoch": 0.3320476270333725,
"grad_norm": 5.96875,
"learning_rate": 7.5182252265071e-07,
"loss": 1.0620606422424317,
"step": 990,
"token_acc": 0.6930627550457704
},
{
"epoch": 0.3337246352507127,
"grad_norm": 6.5625,
"learning_rate": 7.495436743838677e-07,
"loss": 1.0689016342163087,
"step": 995,
"token_acc": 0.6921185150486006
},
{
"epoch": 0.335401643468053,
"grad_norm": 5.8125,
"learning_rate": 7.472579019101136e-07,
"loss": 1.023653221130371,
"step": 1000,
"token_acc": 0.7040666891968007
},
{
"epoch": 0.33707865168539325,
"grad_norm": 6.5625,
"learning_rate": 7.449652686538632e-07,
"loss": 1.0605965614318849,
"step": 1005,
"token_acc": 0.6947503671071953
},
{
"epoch": 0.33875565990273354,
"grad_norm": 6.28125,
"learning_rate": 7.426658382299005e-07,
"loss": 1.0809215545654296,
"step": 1010,
"token_acc": 0.6894533536228491
},
{
"epoch": 0.3404326681200738,
"grad_norm": 6.0,
"learning_rate": 7.40359674441614e-07,
"loss": 1.0782832145690917,
"step": 1015,
"token_acc": 0.688503381358424
},
{
"epoch": 0.34210967633741407,
"grad_norm": 6.15625,
"learning_rate": 7.380468412792267e-07,
"loss": 1.0309969902038574,
"step": 1020,
"token_acc": 0.699964174826845
},
{
"epoch": 0.3437866845547543,
"grad_norm": 5.875,
"learning_rate": 7.357274029180191e-07,
"loss": 1.0572206497192382,
"step": 1025,
"token_acc": 0.6972423339011925
},
{
"epoch": 0.3454636927720946,
"grad_norm": 6.25,
"learning_rate": 7.334014237165493e-07,
"loss": 1.0772000312805177,
"step": 1030,
"token_acc": 0.6925978312116926
},
{
"epoch": 0.34714070098943484,
"grad_norm": 5.8125,
"learning_rate": 7.310689682148679e-07,
"loss": 1.0255406379699707,
"step": 1035,
"token_acc": 0.7015232377140316
},
{
"epoch": 0.34881770920677513,
"grad_norm": 6.03125,
"learning_rate": 7.287301011327257e-07,
"loss": 1.0338263511657715,
"step": 1040,
"token_acc": 0.6993613405988164
},
{
"epoch": 0.35049471742411537,
"grad_norm": 6.1875,
"learning_rate": 7.263848873677793e-07,
"loss": 1.0368549346923828,
"step": 1045,
"token_acc": 0.7006442705936493
},
{
"epoch": 0.35217172564145566,
"grad_norm": 5.90625,
"learning_rate": 7.240333919937892e-07,
"loss": 1.0463068962097168,
"step": 1050,
"token_acc": 0.6975750577367206
},
{
"epoch": 0.3538487338587959,
"grad_norm": 6.65625,
"learning_rate": 7.216756802588151e-07,
"loss": 1.0312828063964843,
"step": 1055,
"token_acc": 0.7036994812745825
},
{
"epoch": 0.3555257420761362,
"grad_norm": 6.15625,
"learning_rate": 7.193118175834047e-07,
"loss": 1.0586414337158203,
"step": 1060,
"token_acc": 0.6946736263119493
},
{
"epoch": 0.3572027502934764,
"grad_norm": 5.875,
"learning_rate": 7.16941869558779e-07,
"loss": 1.0383371353149413,
"step": 1065,
"token_acc": 0.6991913746630728
},
{
"epoch": 0.3588797585108167,
"grad_norm": 6.3125,
"learning_rate": 7.145659019450121e-07,
"loss": 1.0529624938964843,
"step": 1070,
"token_acc": 0.6956057007125891
},
{
"epoch": 0.36055676672815695,
"grad_norm": 6.28125,
"learning_rate": 7.121839806692062e-07,
"loss": 1.0467044830322265,
"step": 1075,
"token_acc": 0.6972133195719485
},
{
"epoch": 0.36223377494549724,
"grad_norm": 11.0,
"learning_rate": 7.097961718236628e-07,
"loss": 1.0414490699768066,
"step": 1080,
"token_acc": 0.6999028071579669
},
{
"epoch": 0.3639107831628375,
"grad_norm": 6.03125,
"learning_rate": 7.074025416640487e-07,
"loss": 1.0076875686645508,
"step": 1085,
"token_acc": 0.7048360846314811
},
{
"epoch": 0.3655877913801778,
"grad_norm": 6.28125,
"learning_rate": 7.050031566075573e-07,
"loss": 1.0238298416137694,
"step": 1090,
"token_acc": 0.7038746377589353
},
{
"epoch": 0.367264799597518,
"grad_norm": 5.5,
"learning_rate": 7.025980832310658e-07,
"loss": 1.03941650390625,
"step": 1095,
"token_acc": 0.7002581015199312
},
{
"epoch": 0.3689418078148583,
"grad_norm": 6.125,
"learning_rate": 7.001873882692883e-07,
"loss": 1.0348270416259766,
"step": 1100,
"token_acc": 0.6984748930013864
},
{
"epoch": 0.37061881603219854,
"grad_norm": 5.9375,
"learning_rate": 6.977711386129232e-07,
"loss": 1.0247929573059082,
"step": 1105,
"token_acc": 0.7004997620180866
},
{
"epoch": 0.37229582424953883,
"grad_norm": 6.40625,
"learning_rate": 6.953494013067978e-07,
"loss": 1.0276754379272461,
"step": 1110,
"token_acc": 0.6988610216290843
},
{
"epoch": 0.37397283246687907,
"grad_norm": 6.03125,
"learning_rate": 6.929222435480082e-07,
"loss": 1.0176216125488282,
"step": 1115,
"token_acc": 0.7015344002639828
},
{
"epoch": 0.37564984068421936,
"grad_norm": 5.9375,
"learning_rate": 6.904897326840537e-07,
"loss": 1.0373910903930663,
"step": 1120,
"token_acc": 0.7015279241306639
},
{
"epoch": 0.3773268489015596,
"grad_norm": 5.84375,
"learning_rate": 6.880519362109694e-07,
"loss": 1.021230125427246,
"step": 1125,
"token_acc": 0.7006535947712418
},
{
"epoch": 0.3790038571188999,
"grad_norm": 6.125,
"learning_rate": 6.856089217714521e-07,
"loss": 1.0548656463623047,
"step": 1130,
"token_acc": 0.6955671821997356
},
{
"epoch": 0.3806808653362401,
"grad_norm": 5.78125,
"learning_rate": 6.831607571529849e-07,
"loss": 1.0167530059814454,
"step": 1135,
"token_acc": 0.7032388210243192
},
{
"epoch": 0.3823578735535804,
"grad_norm": 5.65625,
"learning_rate": 6.807075102859542e-07,
"loss": 1.0101150512695312,
"step": 1140,
"token_acc": 0.6998482017748715
},
{
"epoch": 0.38403488177092066,
"grad_norm": 5.96875,
"learning_rate": 6.78249249241767e-07,
"loss": 1.026358699798584,
"step": 1145,
"token_acc": 0.7042928742645457
},
{
"epoch": 0.38571188998826095,
"grad_norm": 5.71875,
"learning_rate": 6.757860422309603e-07,
"loss": 1.0511194229125977,
"step": 1150,
"token_acc": 0.6920869208692086
},
{
"epoch": 0.3873888982056012,
"grad_norm": 6.40625,
"learning_rate": 6.733179576013097e-07,
"loss": 1.036262321472168,
"step": 1155,
"token_acc": 0.6977180902520913
},
{
"epoch": 0.3890659064229415,
"grad_norm": 5.65625,
"learning_rate": 6.70845063835932e-07,
"loss": 1.0658405303955079,
"step": 1160,
"token_acc": 0.6915908343383401
},
{
"epoch": 0.3907429146402817,
"grad_norm": 6.46875,
"learning_rate": 6.683674295513858e-07,
"loss": 1.0385713577270508,
"step": 1165,
"token_acc": 0.6958403046212427
},
{
"epoch": 0.392419922857622,
"grad_norm": 5.84375,
"learning_rate": 6.658851234957669e-07,
"loss": 1.002643871307373,
"step": 1170,
"token_acc": 0.7057808455565142
},
{
"epoch": 0.39409693107496224,
"grad_norm": 6.53125,
"learning_rate": 6.633982145468008e-07,
"loss": 1.0385595321655274,
"step": 1175,
"token_acc": 0.7010718539102818
},
{
"epoch": 0.39577393929230253,
"grad_norm": 5.59375,
"learning_rate": 6.609067717099319e-07,
"loss": 1.0319636344909668,
"step": 1180,
"token_acc": 0.7033253524873636
},
{
"epoch": 0.39745094750964277,
"grad_norm": 5.75,
"learning_rate": 6.584108641164086e-07,
"loss": 1.0643960952758789,
"step": 1185,
"token_acc": 0.6947075840943899
},
{
"epoch": 0.39912795572698306,
"grad_norm": 5.90625,
"learning_rate": 6.559105610213648e-07,
"loss": 0.9819327354431152,
"step": 1190,
"token_acc": 0.7112614578786556
},
{
"epoch": 0.4008049639443233,
"grad_norm": 6.03125,
"learning_rate": 6.534059318018988e-07,
"loss": 0.9903202056884766,
"step": 1195,
"token_acc": 0.7077391904323828
},
{
"epoch": 0.4024819721616636,
"grad_norm": 5.96875,
"learning_rate": 6.50897045955147e-07,
"loss": 1.0295546531677247,
"step": 1200,
"token_acc": 0.7006684562345387
},
{
"epoch": 0.4041589803790039,
"grad_norm": 5.75,
"learning_rate": 6.48383973096358e-07,
"loss": 1.0131707191467285,
"step": 1205,
"token_acc": 0.7029982575256865
},
{
"epoch": 0.4058359885963441,
"grad_norm": 6.53125,
"learning_rate": 6.458667829569582e-07,
"loss": 1.034630012512207,
"step": 1210,
"token_acc": 0.697005394010788
},
{
"epoch": 0.4075129968136844,
"grad_norm": 5.9375,
"learning_rate": 6.433455453826186e-07,
"loss": 1.011972713470459,
"step": 1215,
"token_acc": 0.7073240730111353
},
{
"epoch": 0.40919000503102465,
"grad_norm": 7.28125,
"learning_rate": 6.408203303313161e-07,
"loss": 1.0427475929260255,
"step": 1220,
"token_acc": 0.6974992467610727
},
{
"epoch": 0.41086701324836494,
"grad_norm": 5.6875,
"learning_rate": 6.382912078713929e-07,
"loss": 1.030358123779297,
"step": 1225,
"token_acc": 0.7000523560209424
},
{
"epoch": 0.4125440214657052,
"grad_norm": 5.75,
"learning_rate": 6.357582481796113e-07,
"loss": 1.0345232009887695,
"step": 1230,
"token_acc": 0.7003829691371931
},
{
"epoch": 0.41422102968304547,
"grad_norm": 5.6875,
"learning_rate": 6.332215215392079e-07,
"loss": 1.0514300346374512,
"step": 1235,
"token_acc": 0.6924103419516263
},
{
"epoch": 0.4158980379003857,
"grad_norm": 5.90625,
"learning_rate": 6.306810983379418e-07,
"loss": 1.0526814460754395,
"step": 1240,
"token_acc": 0.699214806505889
},
{
"epoch": 0.417575046117726,
"grad_norm": 5.65625,
"learning_rate": 6.281370490661432e-07,
"loss": 1.024818229675293,
"step": 1245,
"token_acc": 0.7030024647098364
},
{
"epoch": 0.41925205433506624,
"grad_norm": 6.3125,
"learning_rate": 6.255894443147557e-07,
"loss": 1.075798797607422,
"step": 1250,
"token_acc": 0.6899942163100058
},
{
"epoch": 0.42092906255240653,
"grad_norm": 5.875,
"learning_rate": 6.230383547733792e-07,
"loss": 1.0811284065246582,
"step": 1255,
"token_acc": 0.6883433394757178
},
{
"epoch": 0.42260607076974677,
"grad_norm": 6.375,
"learning_rate": 6.204838512283071e-07,
"loss": 1.0544434547424317,
"step": 1260,
"token_acc": 0.6923693399736677
},
{
"epoch": 0.42428307898708706,
"grad_norm": 6.21875,
"learning_rate": 6.179260045605637e-07,
"loss": 1.0577333450317383,
"step": 1265,
"token_acc": 0.6935493235789352
},
{
"epoch": 0.4259600872044273,
"grad_norm": 6.71875,
"learning_rate": 6.153648857439352e-07,
"loss": 1.040918731689453,
"step": 1270,
"token_acc": 0.6961138989380324
},
{
"epoch": 0.4276370954217676,
"grad_norm": 6.03125,
"learning_rate": 6.128005658430028e-07,
"loss": 1.0624547958374024,
"step": 1275,
"token_acc": 0.6919805920514638
},
{
"epoch": 0.4293141036391078,
"grad_norm": 6.15625,
"learning_rate": 6.102331160111692e-07,
"loss": 1.0875247955322265,
"step": 1280,
"token_acc": 0.6913573152511205
},
{
"epoch": 0.4309911118564481,
"grad_norm": 5.9375,
"learning_rate": 6.076626074886853e-07,
"loss": 1.016362476348877,
"step": 1285,
"token_acc": 0.7094132926760405
},
{
"epoch": 0.43266812007378835,
"grad_norm": 5.75,
"learning_rate": 6.050891116006718e-07,
"loss": 1.0268930435180663,
"step": 1290,
"token_acc": 0.6988262395787627
},
{
"epoch": 0.43434512829112865,
"grad_norm": 6.0,
"learning_rate": 6.025126997551426e-07,
"loss": 1.0284778594970703,
"step": 1295,
"token_acc": 0.6979727193092229
},
{
"epoch": 0.4360221365084689,
"grad_norm": 5.84375,
"learning_rate": 5.99933443441021e-07,
"loss": 1.0141260147094726,
"step": 1300,
"token_acc": 0.7011908851482881
},
{
"epoch": 0.4376991447258092,
"grad_norm": 6.0625,
"learning_rate": 5.973514142261579e-07,
"loss": 1.0410663604736328,
"step": 1305,
"token_acc": 0.694919168591224
},
{
"epoch": 0.4393761529431494,
"grad_norm": 6.8125,
"learning_rate": 5.947666837553448e-07,
"loss": 1.0731523513793946,
"step": 1310,
"token_acc": 0.6926234606294315
},
{
"epoch": 0.4410531611604897,
"grad_norm": 5.53125,
"learning_rate": 5.921793237483262e-07,
"loss": 1.0419529914855956,
"step": 1315,
"token_acc": 0.6948214091846705
},
{
"epoch": 0.44273016937782994,
"grad_norm": 6.125,
"learning_rate": 5.895894059978095e-07,
"loss": 1.0798610687255858,
"step": 1320,
"token_acc": 0.6885527672739773
},
{
"epoch": 0.44440717759517023,
"grad_norm": 6.03125,
"learning_rate": 5.869970023674735e-07,
"loss": 1.01141300201416,
"step": 1325,
"token_acc": 0.7033395392134048
},
{
"epoch": 0.44608418581251047,
"grad_norm": 6.375,
"learning_rate": 5.844021847899734e-07,
"loss": 1.0301786422729493,
"step": 1330,
"token_acc": 0.6992315482218383
},
{
"epoch": 0.44776119402985076,
"grad_norm": 6.5625,
"learning_rate": 5.818050252649458e-07,
"loss": 1.0755172729492188,
"step": 1335,
"token_acc": 0.690394101734472
},
{
"epoch": 0.449438202247191,
"grad_norm": 6.0625,
"learning_rate": 5.792055958570098e-07,
"loss": 1.0729934692382812,
"step": 1340,
"token_acc": 0.6900311526479751
},
{
"epoch": 0.4511152104645313,
"grad_norm": 6.375,
"learning_rate": 5.766039686937687e-07,
"loss": 1.0664525985717774,
"step": 1345,
"token_acc": 0.6915431560592851
},
{
"epoch": 0.45279221868187153,
"grad_norm": 5.84375,
"learning_rate": 5.740002159638072e-07,
"loss": 1.018766498565674,
"step": 1350,
"token_acc": 0.7020927601809954
},
{
"epoch": 0.4544692268992118,
"grad_norm": 5.9375,
"learning_rate": 5.713944099146901e-07,
"loss": 1.0389814376831055,
"step": 1355,
"token_acc": 0.6960518301124093
},
{
"epoch": 0.45614623511655206,
"grad_norm": 6.15625,
"learning_rate": 5.687866228509558e-07,
"loss": 1.0340880393981933,
"step": 1360,
"token_acc": 0.7008528076198914
},
{
"epoch": 0.45782324333389235,
"grad_norm": 6.09375,
"learning_rate": 5.661769271321113e-07,
"loss": 1.0076488494873046,
"step": 1365,
"token_acc": 0.7054647515271587
},
{
"epoch": 0.4595002515512326,
"grad_norm": 6.09375,
"learning_rate": 5.635653951706234e-07,
"loss": 1.0600255012512207,
"step": 1370,
"token_acc": 0.6946650892907442
},
{
"epoch": 0.4611772597685729,
"grad_norm": 6.4375,
"learning_rate": 5.609520994299108e-07,
"loss": 1.0570174217224122,
"step": 1375,
"token_acc": 0.6946865165819415
},
{
"epoch": 0.4628542679859131,
"grad_norm": 6.125,
"learning_rate": 5.58337112422332e-07,
"loss": 1.0049464225769043,
"step": 1380,
"token_acc": 0.7048176149219137
},
{
"epoch": 0.4645312762032534,
"grad_norm": 6.21875,
"learning_rate": 5.557205067071739e-07,
"loss": 1.0521051406860351,
"step": 1385,
"token_acc": 0.694855340439417
},
{
"epoch": 0.46620828442059364,
"grad_norm": 5.9375,
"learning_rate": 5.531023548886391e-07,
"loss": 1.0039688110351563,
"step": 1390,
"token_acc": 0.708057521113901
},
{
"epoch": 0.46788529263793394,
"grad_norm": 6.375,
"learning_rate": 5.5048272961383e-07,
"loss": 1.0771536827087402,
"step": 1395,
"token_acc": 0.6882698610012431
},
{
"epoch": 0.4695623008552742,
"grad_norm": 5.78125,
"learning_rate": 5.478617035707337e-07,
"loss": 1.0812073707580567,
"step": 1400,
"token_acc": 0.6879506798040403
},
{
"epoch": 0.47123930907261447,
"grad_norm": 5.96875,
"learning_rate": 5.452393494862057e-07,
"loss": 1.0342639923095702,
"step": 1405,
"token_acc": 0.7012715977480101
},
{
"epoch": 0.4729163172899547,
"grad_norm": 5.9375,
"learning_rate": 5.426157401239504e-07,
"loss": 1.036158561706543,
"step": 1410,
"token_acc": 0.6999719495091165
},
{
"epoch": 0.474593325507295,
"grad_norm": 5.78125,
"learning_rate": 5.399909482825038e-07,
"loss": 1.0552077293395996,
"step": 1415,
"token_acc": 0.6963589076723017
},
{
"epoch": 0.47627033372463523,
"grad_norm": 5.75,
"learning_rate": 5.373650467932121e-07,
"loss": 1.005418586730957,
"step": 1420,
"token_acc": 0.709333626325331
},
{
"epoch": 0.4779473419419755,
"grad_norm": 5.96875,
"learning_rate": 5.34738108518212e-07,
"loss": 1.0156227111816407,
"step": 1425,
"token_acc": 0.7042805679169315
},
{
"epoch": 0.47962435015931576,
"grad_norm": 6.84375,
"learning_rate": 5.321102063484079e-07,
"loss": 1.0417983055114746,
"step": 1430,
"token_acc": 0.695285768597136
},
{
"epoch": 0.48130135837665605,
"grad_norm": 5.6875,
"learning_rate": 5.294814132014503e-07,
"loss": 1.0348029136657715,
"step": 1435,
"token_acc": 0.7010616188282873
},
{
"epoch": 0.4829783665939963,
"grad_norm": 6.15625,
"learning_rate": 5.268518020197113e-07,
"loss": 1.020607852935791,
"step": 1440,
"token_acc": 0.7056887635166902
},
{
"epoch": 0.4846553748113366,
"grad_norm": 6.5625,
"learning_rate": 5.242214457682623e-07,
"loss": 0.9982949256896972,
"step": 1445,
"token_acc": 0.708599000516974
},
{
"epoch": 0.4863323830286768,
"grad_norm": 5.59375,
"learning_rate": 5.21590417432848e-07,
"loss": 1.0266719818115235,
"step": 1450,
"token_acc": 0.7015999140985719
},
{
"epoch": 0.4880093912460171,
"grad_norm": 6.125,
"learning_rate": 5.18958790017862e-07,
"loss": 1.0599438667297363,
"step": 1455,
"token_acc": 0.6961610486891385
},
{
"epoch": 0.48968639946335735,
"grad_norm": 6.09375,
"learning_rate": 5.163266365443202e-07,
"loss": 1.0109405517578125,
"step": 1460,
"token_acc": 0.7010133211886599
},
{
"epoch": 0.49136340768069764,
"grad_norm": 5.90625,
"learning_rate": 5.136940300478363e-07,
"loss": 1.052570915222168,
"step": 1465,
"token_acc": 0.6954032957502169
},
{
"epoch": 0.4930404158980379,
"grad_norm": 6.1875,
"learning_rate": 5.110610435765934e-07,
"loss": 1.0087080955505372,
"step": 1470,
"token_acc": 0.7043160955584697
},
{
"epoch": 0.49471742411537817,
"grad_norm": 6.25,
"learning_rate": 5.084277501893186e-07,
"loss": 1.0556130409240723,
"step": 1475,
"token_acc": 0.6968740328071804
},
{
"epoch": 0.4963944323327184,
"grad_norm": 5.9375,
"learning_rate": 5.057942229532544e-07,
"loss": 1.051294708251953,
"step": 1480,
"token_acc": 0.6979548409977538
},
{
"epoch": 0.4980714405500587,
"grad_norm": 6.3125,
"learning_rate": 5.031605349421327e-07,
"loss": 1.0535630226135253,
"step": 1485,
"token_acc": 0.6958853013171699
},
{
"epoch": 0.49974844876739893,
"grad_norm": 16.25,
"learning_rate": 5.00526759234146e-07,
"loss": 1.072017765045166,
"step": 1490,
"token_acc": 0.6943414284005264
},
{
"epoch": 0.5014254569847392,
"grad_norm": 6.125,
"learning_rate": 4.978929689099206e-07,
"loss": 1.0403889656066894,
"step": 1495,
"token_acc": 0.7005302818866871
},
{
"epoch": 0.5031024652020795,
"grad_norm": 6.0,
"learning_rate": 4.952592370504881e-07,
"loss": 1.0573260307312011,
"step": 1500,
"token_acc": 0.6974754631816512
},
{
"epoch": 0.5047794734194198,
"grad_norm": 6.1875,
"learning_rate": 4.926256367352579e-07,
"loss": 1.0464170455932618,
"step": 1505,
"token_acc": 0.6966352336048007
},
{
"epoch": 0.50645648163676,
"grad_norm": 5.90625,
"learning_rate": 4.899922410399895e-07,
"loss": 1.0218128204345702,
"step": 1510,
"token_acc": 0.7029736237889903
},
{
"epoch": 0.5081334898541003,
"grad_norm": 6.90625,
"learning_rate": 4.873591230347641e-07,
"loss": 1.066216278076172,
"step": 1515,
"token_acc": 0.696458297457092
},
{
"epoch": 0.5098104980714405,
"grad_norm": 6.125,
"learning_rate": 4.847263557819587e-07,
"loss": 0.9884692192077636,
"step": 1520,
"token_acc": 0.7126423559185552
},
{
"epoch": 0.5114875062887808,
"grad_norm": 5.59375,
"learning_rate": 4.820940123342173e-07,
"loss": 1.0081668853759767,
"step": 1525,
"token_acc": 0.7077791438470996
},
{
"epoch": 0.5131645145061211,
"grad_norm": 5.96875,
"learning_rate": 4.794621657324241e-07,
"loss": 1.0023490905761718,
"step": 1530,
"token_acc": 0.7051523545706371
},
{
"epoch": 0.5148415227234614,
"grad_norm": 5.65625,
"learning_rate": 4.7683088900367767e-07,
"loss": 1.0025731086730958,
"step": 1535,
"token_acc": 0.7035923141186299
},
{
"epoch": 0.5165185309408016,
"grad_norm": 6.0,
"learning_rate": 4.7420025515926345e-07,
"loss": 1.0486156463623046,
"step": 1540,
"token_acc": 0.6984244776801005
},
{
"epoch": 0.5181955391581419,
"grad_norm": 5.75,
"learning_rate": 4.7157033719262894e-07,
"loss": 1.0556530952453613,
"step": 1545,
"token_acc": 0.6891177739430544
},
{
"epoch": 0.5198725473754822,
"grad_norm": 6.15625,
"learning_rate": 4.6894120807735756e-07,
"loss": 1.0409958839416504,
"step": 1550,
"token_acc": 0.6963211082093789
},
{
"epoch": 0.5215495555928225,
"grad_norm": 6.0625,
"learning_rate": 4.6631294076514426e-07,
"loss": 1.0312573432922363,
"step": 1555,
"token_acc": 0.7006417736289382
},
{
"epoch": 0.5232265638101626,
"grad_norm": 6.09375,
"learning_rate": 4.636856081837709e-07,
"loss": 1.0601593017578126,
"step": 1560,
"token_acc": 0.6912083152880034
},
{
"epoch": 0.5249035720275029,
"grad_norm": 6.25,
"learning_rate": 4.610592832350832e-07,
"loss": 1.0264972686767577,
"step": 1565,
"token_acc": 0.6995477185549894
},
{
"epoch": 0.5265805802448432,
"grad_norm": 6.4375,
"learning_rate": 4.5843403879296757e-07,
"loss": 1.0082528114318847,
"step": 1570,
"token_acc": 0.7059257159058691
},
{
"epoch": 0.5282575884621835,
"grad_norm": 5.8125,
"learning_rate": 4.558099477013288e-07,
"loss": 1.0326814651489258,
"step": 1575,
"token_acc": 0.7001332267519318
},
{
"epoch": 0.5299345966795237,
"grad_norm": 6.21875,
"learning_rate": 4.531870827720695e-07,
"loss": 1.0291913986206054,
"step": 1580,
"token_acc": 0.7006176652254478
},
{
"epoch": 0.531611604896864,
"grad_norm": 6.0,
"learning_rate": 4.5056551678306907e-07,
"loss": 1.098677635192871,
"step": 1585,
"token_acc": 0.6850571099235041
},
{
"epoch": 0.5332886131142043,
"grad_norm": 6.15625,
"learning_rate": 4.4794532247616466e-07,
"loss": 1.047648811340332,
"step": 1590,
"token_acc": 0.6972982177318652
},
{
"epoch": 0.5349656213315446,
"grad_norm": 6.09375,
"learning_rate": 4.4532657255513315e-07,
"loss": 1.051576805114746,
"step": 1595,
"token_acc": 0.6918334234721472
},
{
"epoch": 0.5366426295488848,
"grad_norm": 5.53125,
"learning_rate": 4.4270933968367265e-07,
"loss": 1.045903778076172,
"step": 1600,
"token_acc": 0.7030181086519115
},
{
"epoch": 0.538319637766225,
"grad_norm": 6.40625,
"learning_rate": 4.400936964833874e-07,
"loss": 1.0394445419311524,
"step": 1605,
"token_acc": 0.6985150224473351
},
{
"epoch": 0.5399966459835653,
"grad_norm": 6.34375,
"learning_rate": 4.374797155317721e-07,
"loss": 1.0475746154785157,
"step": 1610,
"token_acc": 0.6994824888674931
},
{
"epoch": 0.5416736542009056,
"grad_norm": 6.0,
"learning_rate": 4.348674693601985e-07,
"loss": 1.03849515914917,
"step": 1615,
"token_acc": 0.7009401404260384
},
{
"epoch": 0.5433506624182458,
"grad_norm": 6.34375,
"learning_rate": 4.322570304519022e-07,
"loss": 1.0555237770080566,
"step": 1620,
"token_acc": 0.6955041993742109
},
{
"epoch": 0.5450276706355861,
"grad_norm": 6.15625,
"learning_rate": 4.296484712399722e-07,
"loss": 1.013861083984375,
"step": 1625,
"token_acc": 0.6988321728164921
},
{
"epoch": 0.5467046788529264,
"grad_norm": 6.25,
"learning_rate": 4.270418641053404e-07,
"loss": 1.030404281616211,
"step": 1630,
"token_acc": 0.7007250418293363
},
{
"epoch": 0.5483816870702667,
"grad_norm": 5.90625,
"learning_rate": 4.2443728137477353e-07,
"loss": 1.0251054763793945,
"step": 1635,
"token_acc": 0.7007932573128408
},
{
"epoch": 0.5500586952876069,
"grad_norm": 6.03125,
"learning_rate": 4.2183479531886644e-07,
"loss": 1.0058277130126954,
"step": 1640,
"token_acc": 0.7088387174252432
},
{
"epoch": 0.5517357035049472,
"grad_norm": 5.90625,
"learning_rate": 4.1923447815003613e-07,
"loss": 1.0381638526916503,
"step": 1645,
"token_acc": 0.6994563662374821
},
{
"epoch": 0.5534127117222875,
"grad_norm": 6.53125,
"learning_rate": 4.16636402020519e-07,
"loss": 0.9976962089538575,
"step": 1650,
"token_acc": 0.7139129469242728
},
{
"epoch": 0.5550897199396277,
"grad_norm": 6.375,
"learning_rate": 4.1404063902036766e-07,
"loss": 1.0349790573120117,
"step": 1655,
"token_acc": 0.6979560097756055
},
{
"epoch": 0.5567667281569679,
"grad_norm": 5.4375,
"learning_rate": 4.114472611754518e-07,
"loss": 0.9997722625732421,
"step": 1660,
"token_acc": 0.7006929187088051
},
{
"epoch": 0.5584437363743082,
"grad_norm": 5.4375,
"learning_rate": 4.0885634044545847e-07,
"loss": 0.9896345138549805,
"step": 1665,
"token_acc": 0.710621704745167
},
{
"epoch": 0.5601207445916485,
"grad_norm": 5.59375,
"learning_rate": 4.062679487218966e-07,
"loss": 1.0382169723510741,
"step": 1670,
"token_acc": 0.6991681815677849
},
{
"epoch": 0.5617977528089888,
"grad_norm": 6.09375,
"learning_rate": 4.0368215782610145e-07,
"loss": 1.021392250061035,
"step": 1675,
"token_acc": 0.704168223301489
},
{
"epoch": 0.563474761026329,
"grad_norm": 5.75,
"learning_rate": 4.010990395072413e-07,
"loss": 1.0188769340515136,
"step": 1680,
"token_acc": 0.7017091454272864
},
{
"epoch": 0.5651517692436693,
"grad_norm": 5.6875,
"learning_rate": 3.98518665440328e-07,
"loss": 1.045937442779541,
"step": 1685,
"token_acc": 0.6954898185425054
},
{
"epoch": 0.5668287774610096,
"grad_norm": 5.875,
"learning_rate": 3.959411072242266e-07,
"loss": 1.0623149871826172,
"step": 1690,
"token_acc": 0.6891484551620196
},
{
"epoch": 0.5685057856783499,
"grad_norm": 5.96875,
"learning_rate": 3.9336643637966984e-07,
"loss": 1.0626046180725097,
"step": 1695,
"token_acc": 0.6948011185299321
},
{
"epoch": 0.57018279389569,
"grad_norm": 6.1875,
"learning_rate": 3.9079472434727324e-07,
"loss": 1.0445612907409667,
"step": 1700,
"token_acc": 0.696189917936694
},
{
"epoch": 0.5718598021130303,
"grad_norm": 6.5,
"learning_rate": 3.882260424855523e-07,
"loss": 1.0410688400268555,
"step": 1705,
"token_acc": 0.698505122893455
},
{
"epoch": 0.5735368103303706,
"grad_norm": 6.75,
"learning_rate": 3.856604620689435e-07,
"loss": 1.0453211784362793,
"step": 1710,
"token_acc": 0.701270692469107
},
{
"epoch": 0.5752138185477109,
"grad_norm": 6.0625,
"learning_rate": 3.8309805428582557e-07,
"loss": 1.041317081451416,
"step": 1715,
"token_acc": 0.695624676594032
},
{
"epoch": 0.5768908267650511,
"grad_norm": 6.15625,
"learning_rate": 3.8053889023654506e-07,
"loss": 1.0477853775024415,
"step": 1720,
"token_acc": 0.700368759975783
},
{
"epoch": 0.5785678349823914,
"grad_norm": 6.21875,
"learning_rate": 3.779830409314427e-07,
"loss": 1.025911235809326,
"step": 1725,
"token_acc": 0.6981572062336481
},
{
"epoch": 0.5802448431997317,
"grad_norm": 6.53125,
"learning_rate": 3.7543057728888387e-07,
"loss": 1.0449981689453125,
"step": 1730,
"token_acc": 0.6980824153406773
},
{
"epoch": 0.581921851417072,
"grad_norm": 6.21875,
"learning_rate": 3.7288157013328986e-07,
"loss": 1.038572120666504,
"step": 1735,
"token_acc": 0.7015234262719172
},
{
"epoch": 0.5835988596344122,
"grad_norm": 5.9375,
"learning_rate": 3.7033609019317367e-07,
"loss": 1.0610506057739257,
"step": 1740,
"token_acc": 0.6917713434106877
},
{
"epoch": 0.5852758678517525,
"grad_norm": 5.96875,
"learning_rate": 3.6779420809917687e-07,
"loss": 0.9886129379272461,
"step": 1745,
"token_acc": 0.7089907067214177
},
{
"epoch": 0.5869528760690927,
"grad_norm": 5.53125,
"learning_rate": 3.6525599438210954e-07,
"loss": 1.0512758255004884,
"step": 1750,
"token_acc": 0.6951606390955759
},
{
"epoch": 0.588629884286433,
"grad_norm": 6.03125,
"learning_rate": 3.6272151947099395e-07,
"loss": 1.054044246673584,
"step": 1755,
"token_acc": 0.6979292209926592
},
{
"epoch": 0.5903068925037732,
"grad_norm": 5.59375,
"learning_rate": 3.6019085369110966e-07,
"loss": 1.051740550994873,
"step": 1760,
"token_acc": 0.6992283605457392
},
{
"epoch": 0.5919839007211135,
"grad_norm": 5.9375,
"learning_rate": 3.576640672620427e-07,
"loss": 1.059780216217041,
"step": 1765,
"token_acc": 0.6915394973070018
},
{
"epoch": 0.5936609089384538,
"grad_norm": 6.375,
"learning_rate": 3.5514123029573674e-07,
"loss": 1.0393645286560058,
"step": 1770,
"token_acc": 0.695433964322732
},
{
"epoch": 0.5953379171557941,
"grad_norm": 5.9375,
"learning_rate": 3.526224127945478e-07,
"loss": 1.0533989906311034,
"step": 1775,
"token_acc": 0.6924574479919902
},
{
"epoch": 0.5970149253731343,
"grad_norm": 5.65625,
"learning_rate": 3.5010768464930234e-07,
"loss": 1.0086621284484862,
"step": 1780,
"token_acc": 0.7095280105866785
},
{
"epoch": 0.5986919335904746,
"grad_norm": 6.625,
"learning_rate": 3.475971156373567e-07,
"loss": 1.0667131423950196,
"step": 1785,
"token_acc": 0.6917589316288095
},
{
"epoch": 0.6003689418078149,
"grad_norm": 5.8125,
"learning_rate": 3.4509077542066254e-07,
"loss": 1.0151333808898926,
"step": 1790,
"token_acc": 0.7043865991805254
},
{
"epoch": 0.6020459500251552,
"grad_norm": 5.5625,
"learning_rate": 3.4258873354383264e-07,
"loss": 1.0384534835815429,
"step": 1795,
"token_acc": 0.6970242507677645
},
{
"epoch": 0.6037229582424953,
"grad_norm": 6.125,
"learning_rate": 3.4009105943221206e-07,
"loss": 1.0263296127319337,
"step": 1800,
"token_acc": 0.7001357689459393
},
{
"epoch": 0.6053999664598356,
"grad_norm": 5.84375,
"learning_rate": 3.3759782238995093e-07,
"loss": 1.049347496032715,
"step": 1805,
"token_acc": 0.6946134277181728
},
{
"epoch": 0.6070769746771759,
"grad_norm": 6.09375,
"learning_rate": 3.3510909159808237e-07,
"loss": 1.0234293937683105,
"step": 1810,
"token_acc": 0.7010961955565977
},
{
"epoch": 0.6087539828945162,
"grad_norm": 6.28125,
"learning_rate": 3.326249361126024e-07,
"loss": 1.032447624206543,
"step": 1815,
"token_acc": 0.7008217918510431
},
{
"epoch": 0.6104309911118565,
"grad_norm": 7.3125,
"learning_rate": 3.301454248625536e-07,
"loss": 1.0159520149230956,
"step": 1820,
"token_acc": 0.7034609635506497
},
{
"epoch": 0.6121079993291967,
"grad_norm": 6.21875,
"learning_rate": 3.276706266481128e-07,
"loss": 1.0407513618469237,
"step": 1825,
"token_acc": 0.7003651425145785
},
{
"epoch": 0.613785007546537,
"grad_norm": 6.1875,
"learning_rate": 3.252006101386819e-07,
"loss": 1.0305519104003906,
"step": 1830,
"token_acc": 0.6989974352996037
},
{
"epoch": 0.6154620157638773,
"grad_norm": 6.3125,
"learning_rate": 3.2273544387098294e-07,
"loss": 1.0071770668029785,
"step": 1835,
"token_acc": 0.7019307211811471
},
{
"epoch": 0.6171390239812176,
"grad_norm": 6.0625,
"learning_rate": 3.2027519624715574e-07,
"loss": 1.041695499420166,
"step": 1840,
"token_acc": 0.6940718062900083
},
{
"epoch": 0.6188160321985577,
"grad_norm": 6.46875,
"learning_rate": 3.1781993553286e-07,
"loss": 1.0573740959167481,
"step": 1845,
"token_acc": 0.6961400118108123
},
{
"epoch": 0.620493040415898,
"grad_norm": 5.875,
"learning_rate": 3.1536972985538164e-07,
"loss": 1.0022772789001464,
"step": 1850,
"token_acc": 0.7091309896739761
},
{
"epoch": 0.6221700486332383,
"grad_norm": 5.8125,
"learning_rate": 3.129246472017416e-07,
"loss": 0.9961446762084961,
"step": 1855,
"token_acc": 0.7071025555924328
},
{
"epoch": 0.6238470568505786,
"grad_norm": 5.75,
"learning_rate": 3.104847554168105e-07,
"loss": 1.0580769538879395,
"step": 1860,
"token_acc": 0.6970332150919059
},
{
"epoch": 0.6255240650679188,
"grad_norm": 6.21875,
"learning_rate": 3.080501222014248e-07,
"loss": 1.074977207183838,
"step": 1865,
"token_acc": 0.6930246189917937
},
{
"epoch": 0.6272010732852591,
"grad_norm": 6.28125,
"learning_rate": 3.056208151105094e-07,
"loss": 1.022191333770752,
"step": 1870,
"token_acc": 0.6983393254579695
},
{
"epoch": 0.6288780815025994,
"grad_norm": 5.5,
"learning_rate": 3.0319690155120235e-07,
"loss": 1.051521396636963,
"step": 1875,
"token_acc": 0.6924612810965409
},
{
"epoch": 0.6305550897199397,
"grad_norm": 5.71875,
"learning_rate": 3.007784487809852e-07,
"loss": 1.0417262077331544,
"step": 1880,
"token_acc": 0.7002479152580573
},
{
"epoch": 0.6322320979372799,
"grad_norm": 6.5,
"learning_rate": 2.9836552390581577e-07,
"loss": 1.0322657585144044,
"step": 1885,
"token_acc": 0.7021973676777947
},
{
"epoch": 0.6339091061546201,
"grad_norm": 6.0625,
"learning_rate": 2.9595819387826747e-07,
"loss": 1.04501953125,
"step": 1890,
"token_acc": 0.6964275668073137
},
{
"epoch": 0.6355861143719604,
"grad_norm": 5.6875,
"learning_rate": 2.935565254956705e-07,
"loss": 1.0145910263061524,
"step": 1895,
"token_acc": 0.7037352406902816
},
{
"epoch": 0.6372631225893007,
"grad_norm": 6.1875,
"learning_rate": 2.911605853982586e-07,
"loss": 1.047500228881836,
"step": 1900,
"token_acc": 0.6984356197352587
},
{
"epoch": 0.6389401308066409,
"grad_norm": 6.28125,
"learning_rate": 2.8877044006732034e-07,
"loss": 1.026676845550537,
"step": 1905,
"token_acc": 0.6990015950847758
},
{
"epoch": 0.6406171390239812,
"grad_norm": 6.25,
"learning_rate": 2.8638615582335376e-07,
"loss": 1.0263890266418456,
"step": 1910,
"token_acc": 0.7036444444444444
},
{
"epoch": 0.6422941472413215,
"grad_norm": 5.875,
"learning_rate": 2.8400779882422676e-07,
"loss": 1.0457491874694824,
"step": 1915,
"token_acc": 0.6944520660703507
},
{
"epoch": 0.6439711554586618,
"grad_norm": 6.125,
"learning_rate": 2.816354350633411e-07,
"loss": 0.9950202941894531,
"step": 1920,
"token_acc": 0.7036231884057971
},
{
"epoch": 0.645648163676002,
"grad_norm": 5.71875,
"learning_rate": 2.792691303678015e-07,
"loss": 1.0635858535766602,
"step": 1925,
"token_acc": 0.6984164085782738
},
{
"epoch": 0.6473251718933423,
"grad_norm": 6.9375,
"learning_rate": 2.7690895039658883e-07,
"loss": 1.074039363861084,
"step": 1930,
"token_acc": 0.6902784014369107
},
{
"epoch": 0.6490021801106826,
"grad_norm": 5.96875,
"learning_rate": 2.745549606387381e-07,
"loss": 0.9852777481079101,
"step": 1935,
"token_acc": 0.7147304804346499
},
{
"epoch": 0.6506791883280229,
"grad_norm": 6.21875,
"learning_rate": 2.7220722641152156e-07,
"loss": 1.0335915565490723,
"step": 1940,
"token_acc": 0.7019733972961186
},
{
"epoch": 0.652356196545363,
"grad_norm": 6.4375,
"learning_rate": 2.6986581285863674e-07,
"loss": 1.0393771171569823,
"step": 1945,
"token_acc": 0.7026867275658248
},
{
"epoch": 0.6540332047627033,
"grad_norm": 5.5625,
"learning_rate": 2.6753078494839796e-07,
"loss": 1.0720300674438477,
"step": 1950,
"token_acc": 0.687844706391262
},
{
"epoch": 0.6557102129800436,
"grad_norm": 5.96875,
"learning_rate": 2.6520220747193423e-07,
"loss": 1.0267016410827636,
"step": 1955,
"token_acc": 0.701956017239474
},
{
"epoch": 0.6573872211973839,
"grad_norm": 5.625,
"learning_rate": 2.62880145041391e-07,
"loss": 1.083882713317871,
"step": 1960,
"token_acc": 0.6892801378344909
},
{
"epoch": 0.6590642294147241,
"grad_norm": 6.03125,
"learning_rate": 2.6056466208813814e-07,
"loss": 1.0755278587341308,
"step": 1965,
"token_acc": 0.6931045645840078
},
{
"epoch": 0.6607412376320644,
"grad_norm": 6.125,
"learning_rate": 2.582558228609817e-07,
"loss": 1.0210668563842773,
"step": 1970,
"token_acc": 0.7000920904532897
},
{
"epoch": 0.6624182458494047,
"grad_norm": 5.96875,
"learning_rate": 2.5595369142438056e-07,
"loss": 1.00156831741333,
"step": 1975,
"token_acc": 0.709395738735592
},
{
"epoch": 0.664095254066745,
"grad_norm": 6.21875,
"learning_rate": 2.5365833165666943e-07,
"loss": 1.0243375778198243,
"step": 1980,
"token_acc": 0.7054905490549055
},
{
"epoch": 0.6657722622840851,
"grad_norm": 6.0,
"learning_rate": 2.5136980724828695e-07,
"loss": 1.0490416526794433,
"step": 1985,
"token_acc": 0.6968990958007434
},
{
"epoch": 0.6674492705014254,
"grad_norm": 5.9375,
"learning_rate": 2.490881817000071e-07,
"loss": 1.0592771530151368,
"step": 1990,
"token_acc": 0.6985099735640471
},
{
"epoch": 0.6691262787187657,
"grad_norm": 6.03125,
"learning_rate": 2.4681351832117814e-07,
"loss": 1.0456266403198242,
"step": 1995,
"token_acc": 0.6956661488187711
},
{
"epoch": 0.670803286936106,
"grad_norm": 5.90625,
"learning_rate": 2.4454588022796556e-07,
"loss": 1.0303558349609374,
"step": 2000,
"token_acc": 0.6989892984542212
},
{
"epoch": 0.6724802951534462,
"grad_norm": 5.5625,
"learning_rate": 2.422853303416015e-07,
"loss": 1.0573721885681153,
"step": 2005,
"token_acc": 0.69331641285956
},
{
"epoch": 0.6741573033707865,
"grad_norm": 6.46875,
"learning_rate": 2.4003193138663754e-07,
"loss": 1.0548274040222168,
"step": 2010,
"token_acc": 0.6974822112753147
},
{
"epoch": 0.6758343115881268,
"grad_norm": 6.21875,
"learning_rate": 2.3778574588920525e-07,
"loss": 1.0316340446472168,
"step": 2015,
"token_acc": 0.6966531008929113
},
{
"epoch": 0.6775113198054671,
"grad_norm": 6.3125,
"learning_rate": 2.3554683617528087e-07,
"loss": 1.0404158592224122,
"step": 2020,
"token_acc": 0.6936044193860699
},
{
"epoch": 0.6791883280228073,
"grad_norm": 6.09375,
"learning_rate": 2.3331526436895643e-07,
"loss": 1.0208246231079101,
"step": 2025,
"token_acc": 0.6978810663021189
},
{
"epoch": 0.6808653362401476,
"grad_norm": 5.9375,
"learning_rate": 2.310910923907149e-07,
"loss": 1.0479655265808105,
"step": 2030,
"token_acc": 0.6983508932661475
},
{
"epoch": 0.6825423444574878,
"grad_norm": 6.03125,
"learning_rate": 2.288743819557134e-07,
"loss": 1.0503520011901855,
"step": 2035,
"token_acc": 0.698051948051948
},
{
"epoch": 0.6842193526748281,
"grad_norm": 6.21875,
"learning_rate": 2.266651945720694e-07,
"loss": 1.0153435707092284,
"step": 2040,
"token_acc": 0.7011975694649283
},
{
"epoch": 0.6858963608921683,
"grad_norm": 5.59375,
"learning_rate": 2.2446359153915523e-07,
"loss": 1.0074621200561524,
"step": 2045,
"token_acc": 0.7071179799738789
},
{
"epoch": 0.6875733691095086,
"grad_norm": 5.96875,
"learning_rate": 2.2226963394589637e-07,
"loss": 1.0132587432861329,
"step": 2050,
"token_acc": 0.7069939843227806
},
{
"epoch": 0.6892503773268489,
"grad_norm": 6.625,
"learning_rate": 2.200833826690766e-07,
"loss": 1.0336087226867676,
"step": 2055,
"token_acc": 0.698604598610248
},
{
"epoch": 0.6909273855441892,
"grad_norm": 6.34375,
"learning_rate": 2.1790489837164877e-07,
"loss": 1.0673924446105958,
"step": 2060,
"token_acc": 0.6897658185093447
},
{
"epoch": 0.6926043937615294,
"grad_norm": 5.8125,
"learning_rate": 2.157342415010523e-07,
"loss": 1.0450064659118652,
"step": 2065,
"token_acc": 0.7005138516532619
},
{
"epoch": 0.6942814019788697,
"grad_norm": 5.53125,
"learning_rate": 2.135714722875346e-07,
"loss": 1.0480844497680664,
"step": 2070,
"token_acc": 0.6980156217014989
},
{
"epoch": 0.69595841019621,
"grad_norm": 6.3125,
"learning_rate": 2.1141665074248067e-07,
"loss": 1.0112756729125976,
"step": 2075,
"token_acc": 0.7010742643624475
},
{
"epoch": 0.6976354184135503,
"grad_norm": 6.34375,
"learning_rate": 2.092698366567478e-07,
"loss": 1.0765247344970703,
"step": 2080,
"token_acc": 0.6889489327208862
},
{
"epoch": 0.6993124266308904,
"grad_norm": 5.5,
"learning_rate": 2.0713108959900689e-07,
"loss": 1.0020230293273926,
"step": 2085,
"token_acc": 0.7083483545346861
},
{
"epoch": 0.7009894348482307,
"grad_norm": 6.125,
"learning_rate": 2.0500046891408857e-07,
"loss": 1.0372941970825196,
"step": 2090,
"token_acc": 0.6972194225410565
},
{
"epoch": 0.702666443065571,
"grad_norm": 5.9375,
"learning_rate": 2.0287803372133756e-07,
"loss": 0.9844575881958008,
"step": 2095,
"token_acc": 0.7124336406817546
},
{
"epoch": 0.7043434512829113,
"grad_norm": 6.15625,
"learning_rate": 2.0076384291297133e-07,
"loss": 1.0537097930908204,
"step": 2100,
"token_acc": 0.6906941049400261
},
{
"epoch": 0.7060204595002515,
"grad_norm": 5.90625,
"learning_rate": 1.9865795515244722e-07,
"loss": 1.0500137329101562,
"step": 2105,
"token_acc": 0.6935847509735602
},
{
"epoch": 0.7076974677175918,
"grad_norm": 5.8125,
"learning_rate": 1.965604288728337e-07,
"loss": 1.040913963317871,
"step": 2110,
"token_acc": 0.697758455197335
},
{
"epoch": 0.7093744759349321,
"grad_norm": 7.125,
"learning_rate": 1.9447132227518893e-07,
"loss": 1.0521238327026368,
"step": 2115,
"token_acc": 0.6945375878853435
},
{
"epoch": 0.7110514841522724,
"grad_norm": 6.28125,
"learning_rate": 1.923906933269463e-07,
"loss": 1.0393115997314453,
"step": 2120,
"token_acc": 0.6945876988219967
},
{
"epoch": 0.7127284923696127,
"grad_norm": 5.3125,
"learning_rate": 1.9031859976030617e-07,
"loss": 1.0050291061401366,
"step": 2125,
"token_acc": 0.7060748959778086
},
{
"epoch": 0.7144055005869528,
"grad_norm": 6.25,
"learning_rate": 1.8825509907063326e-07,
"loss": 1.0308636665344237,
"step": 2130,
"token_acc": 0.6999337602119673
},
{
"epoch": 0.7160825088042931,
"grad_norm": 6.1875,
"learning_rate": 1.862002485148617e-07,
"loss": 1.0438469886779784,
"step": 2135,
"token_acc": 0.6971307120085016
},
{
"epoch": 0.7177595170216334,
"grad_norm": 5.6875,
"learning_rate": 1.8415410510990608e-07,
"loss": 1.0166708946228027,
"step": 2140,
"token_acc": 0.7032047017076957
},
{
"epoch": 0.7194365252389737,
"grad_norm": 6.09375,
"learning_rate": 1.8211672563108023e-07,
"loss": 1.0595402717590332,
"step": 2145,
"token_acc": 0.6895378967825524
},
{
"epoch": 0.7211135334563139,
"grad_norm": 6.125,
"learning_rate": 1.800881666105203e-07,
"loss": 1.0093088150024414,
"step": 2150,
"token_acc": 0.7006111142840825
},
{
"epoch": 0.7227905416736542,
"grad_norm": 6.3125,
"learning_rate": 1.780684843356175e-07,
"loss": 1.0222167015075683,
"step": 2155,
"token_acc": 0.695811209439528
},
{
"epoch": 0.7244675498909945,
"grad_norm": 5.625,
"learning_rate": 1.7605773484745545e-07,
"loss": 1.0139375686645509,
"step": 2160,
"token_acc": 0.6973555868646364
},
{
"epoch": 0.7261445581083348,
"grad_norm": 6.125,
"learning_rate": 1.7405597393925598e-07,
"loss": 1.029660987854004,
"step": 2165,
"token_acc": 0.6958884158637688
},
{
"epoch": 0.727821566325675,
"grad_norm": 5.6875,
"learning_rate": 1.7206325715483e-07,
"loss": 1.035785961151123,
"step": 2170,
"token_acc": 0.700274709217371
},
{
"epoch": 0.7294985745430153,
"grad_norm": 6.5,
"learning_rate": 1.7007963978703693e-07,
"loss": 1.0544404983520508,
"step": 2175,
"token_acc": 0.6898428674129609
},
{
"epoch": 0.7311755827603555,
"grad_norm": 4.90625,
"learning_rate": 1.6810517687625065e-07,
"loss": 1.064098072052002,
"step": 2180,
"token_acc": 0.6946430421717956
},
{
"epoch": 0.7328525909776958,
"grad_norm": 5.9375,
"learning_rate": 1.661399232088318e-07,
"loss": 1.0491707801818848,
"step": 2185,
"token_acc": 0.6936832838120391
},
{
"epoch": 0.734529599195036,
"grad_norm": 6.4375,
"learning_rate": 1.641839333156077e-07,
"loss": 1.0354165077209472,
"step": 2190,
"token_acc": 0.7039037992331822
},
{
"epoch": 0.7362066074123763,
"grad_norm": 6.125,
"learning_rate": 1.6223726147035927e-07,
"loss": 1.011677360534668,
"step": 2195,
"token_acc": 0.7069517493711411
},
{
"epoch": 0.7378836156297166,
"grad_norm": 6.125,
"learning_rate": 1.6029996168831516e-07,
"loss": 1.018637466430664,
"step": 2200,
"token_acc": 0.7055218238057052
},
{
"epoch": 0.7395606238470569,
"grad_norm": 6.375,
"learning_rate": 1.5837208772465326e-07,
"loss": 0.9837164878845215,
"step": 2205,
"token_acc": 0.7087787891746594
},
{
"epoch": 0.7412376320643971,
"grad_norm": 6.0625,
"learning_rate": 1.5645369307300837e-07,
"loss": 0.9903836250305176,
"step": 2210,
"token_acc": 0.7101433174091768
},
{
"epoch": 0.7429146402817374,
"grad_norm": 6.46875,
"learning_rate": 1.5454483096398845e-07,
"loss": 1.002861785888672,
"step": 2215,
"token_acc": 0.7046771644324767
},
{
"epoch": 0.7445916484990777,
"grad_norm": 6.28125,
"learning_rate": 1.5264555436369742e-07,
"loss": 1.0327083587646484,
"step": 2220,
"token_acc": 0.7040962133841391
},
{
"epoch": 0.746268656716418,
"grad_norm": 6.15625,
"learning_rate": 1.5075591597226583e-07,
"loss": 1.026947021484375,
"step": 2225,
"token_acc": 0.7050882658359294
},
{
"epoch": 0.7479456649337581,
"grad_norm": 5.65625,
"learning_rate": 1.488759682223879e-07,
"loss": 1.0323354721069335,
"step": 2230,
"token_acc": 0.6986567495559503
},
{
"epoch": 0.7496226731510984,
"grad_norm": 6.03125,
"learning_rate": 1.4700576327786723e-07,
"loss": 1.0541604042053223,
"step": 2235,
"token_acc": 0.6925109120147025
},
{
"epoch": 0.7512996813684387,
"grad_norm": 6.09375,
"learning_rate": 1.451453530321689e-07,
"loss": 1.0624011039733887,
"step": 2240,
"token_acc": 0.6876182829615186
},
{
"epoch": 0.752976689585779,
"grad_norm": 6.71875,
"learning_rate": 1.4329478910698033e-07,
"loss": 1.0590134620666505,
"step": 2245,
"token_acc": 0.6944770201597141
},
{
"epoch": 0.7546536978031192,
"grad_norm": 5.84375,
"learning_rate": 1.41454122850778e-07,
"loss": 1.033609676361084,
"step": 2250,
"token_acc": 0.701410444916408
},
{
"epoch": 0.7563307060204595,
"grad_norm": 6.3125,
"learning_rate": 1.3962340533740297e-07,
"loss": 1.0673054695129394,
"step": 2255,
"token_acc": 0.6927898114721219
},
{
"epoch": 0.7580077142377998,
"grad_norm": 5.84375,
"learning_rate": 1.3780268736464417e-07,
"loss": 1.0278964996337892,
"step": 2260,
"token_acc": 0.7006198105484739
},
{
"epoch": 0.7596847224551401,
"grad_norm": 6.3125,
"learning_rate": 1.359920194528285e-07,
"loss": 1.0502543449401855,
"step": 2265,
"token_acc": 0.6947629198594227
},
{
"epoch": 0.7613617306724803,
"grad_norm": 5.875,
"learning_rate": 1.341914518434188e-07,
"loss": 1.0808907508850099,
"step": 2270,
"token_acc": 0.6939768693461582
},
{
"epoch": 0.7630387388898205,
"grad_norm": 6.34375,
"learning_rate": 1.3240103449762e-07,
"loss": 1.1043811798095704,
"step": 2275,
"token_acc": 0.6843830610490111
},
{
"epoch": 0.7647157471071608,
"grad_norm": 5.84375,
"learning_rate": 1.30620817094993e-07,
"loss": 1.0409873962402343,
"step": 2280,
"token_acc": 0.6948018528049408
},
{
"epoch": 0.7663927553245011,
"grad_norm": 6.09375,
"learning_rate": 1.288508490320762e-07,
"loss": 1.0083087921142577,
"step": 2285,
"token_acc": 0.7059962385219604
},
{
"epoch": 0.7680697635418413,
"grad_norm": 6.0,
"learning_rate": 1.2709117942101434e-07,
"loss": 1.0333211898803711,
"step": 2290,
"token_acc": 0.7001386642015253
},
{
"epoch": 0.7697467717591816,
"grad_norm": 6.15625,
"learning_rate": 1.2534185708819622e-07,
"loss": 0.9901119232177734,
"step": 2295,
"token_acc": 0.7122798806528988
},
{
"epoch": 0.7714237799765219,
"grad_norm": 6.21875,
"learning_rate": 1.2360293057289988e-07,
"loss": 1.0226807594299316,
"step": 2300,
"token_acc": 0.6997545351134526
},
{
"epoch": 0.7731007881938622,
"grad_norm": 5.78125,
"learning_rate": 1.2187444812594576e-07,
"loss": 1.0424675941467285,
"step": 2305,
"token_acc": 0.6932173913043478
},
{
"epoch": 0.7747777964112024,
"grad_norm": 5.78125,
"learning_rate": 1.2015645770835764e-07,
"loss": 0.9836078643798828,
"step": 2310,
"token_acc": 0.7098943857698722
},
{
"epoch": 0.7764548046285427,
"grad_norm": 6.9375,
"learning_rate": 1.1844900699003174e-07,
"loss": 1.010260009765625,
"step": 2315,
"token_acc": 0.7048983752023502
},
{
"epoch": 0.778131812845883,
"grad_norm": 6.28125,
"learning_rate": 1.1675214334841488e-07,
"loss": 1.0726960182189942,
"step": 2320,
"token_acc": 0.6901127289807422
},
{
"epoch": 0.7798088210632232,
"grad_norm": 6.4375,
"learning_rate": 1.1506591386718861e-07,
"loss": 1.014423942565918,
"step": 2325,
"token_acc": 0.7041590518912333
},
{
"epoch": 0.7814858292805634,
"grad_norm": 6.0625,
"learning_rate": 1.1339036533496355e-07,
"loss": 1.0075566291809082,
"step": 2330,
"token_acc": 0.7022232734153264
},
{
"epoch": 0.7831628374979037,
"grad_norm": 6.375,
"learning_rate": 1.1172554424398123e-07,
"loss": 1.038377857208252,
"step": 2335,
"token_acc": 0.6992858736889087
},
{
"epoch": 0.784839845715244,
"grad_norm": 5.84375,
"learning_rate": 1.1007149678882327e-07,
"loss": 1.0416951179504395,
"step": 2340,
"token_acc": 0.7018931901667138
},
{
"epoch": 0.7865168539325843,
"grad_norm": 6.1875,
"learning_rate": 1.0842826886513074e-07,
"loss": 1.054603385925293,
"step": 2345,
"token_acc": 0.6900633743871817
},
{
"epoch": 0.7881938621499245,
"grad_norm": 5.78125,
"learning_rate": 1.0679590606832945e-07,
"loss": 1.0523086547851563,
"step": 2350,
"token_acc": 0.6997717563059636
},
{
"epoch": 0.7898708703672648,
"grad_norm": 5.1875,
"learning_rate": 1.051744536923656e-07,
"loss": 1.043616485595703,
"step": 2355,
"token_acc": 0.6997246637721063
},
{
"epoch": 0.7915478785846051,
"grad_norm": 6.09375,
"learning_rate": 1.0356395672844864e-07,
"loss": 1.0373201370239258,
"step": 2360,
"token_acc": 0.699506331830865
},
{
"epoch": 0.7932248868019454,
"grad_norm": 5.78125,
"learning_rate": 1.0196445986380336e-07,
"loss": 1.0343366622924806,
"step": 2365,
"token_acc": 0.6987698944790999
},
{
"epoch": 0.7949018950192855,
"grad_norm": 5.9375,
"learning_rate": 1.0037600748042918e-07,
"loss": 1.025636863708496,
"step": 2370,
"token_acc": 0.7044354165507263
},
{
"epoch": 0.7965789032366258,
"grad_norm": 6.09375,
"learning_rate": 9.879864365386908e-08,
"loss": 1.0659350395202636,
"step": 2375,
"token_acc": 0.6936859085148058
},
{
"epoch": 0.7982559114539661,
"grad_norm": 6.09375,
"learning_rate": 9.723241215198691e-08,
"loss": 1.0253265380859375,
"step": 2380,
"token_acc": 0.6990683229813665
},
{
"epoch": 0.7999329196713064,
"grad_norm": 5.96875,
"learning_rate": 9.56773564337523e-08,
"loss": 0.991847038269043,
"step": 2385,
"token_acc": 0.7079682205865171
},
{
"epoch": 0.8016099278886466,
"grad_norm": 5.40625,
"learning_rate": 9.413351964803517e-08,
"loss": 1.0407160758972167,
"step": 2390,
"token_acc": 0.699185631414547
},
{
"epoch": 0.8032869361059869,
"grad_norm": 6.125,
"learning_rate": 9.26009446324083e-08,
"loss": 1.0785944938659668,
"step": 2395,
"token_acc": 0.6866031454482838
},
{
"epoch": 0.8049639443233272,
"grad_norm": 6.5,
"learning_rate": 9.107967391195903e-08,
"loss": 1.062040138244629,
"step": 2400,
"token_acc": 0.6932661267485983
},
{
"epoch": 0.8066409525406675,
"grad_norm": 5.78125,
"learning_rate": 8.956974969810905e-08,
"loss": 1.0173826217651367,
"step": 2405,
"token_acc": 0.7024507192328183
},
{
"epoch": 0.8083179607580078,
"grad_norm": 5.75,
"learning_rate": 8.807121388744288e-08,
"loss": 1.0436044692993165,
"step": 2410,
"token_acc": 0.69497507142457
},
{
"epoch": 0.809994968975348,
"grad_norm": 6.3125,
"learning_rate": 8.658410806054567e-08,
"loss": 1.017934799194336,
"step": 2415,
"token_acc": 0.7072147651006712
},
{
"epoch": 0.8116719771926882,
"grad_norm": 5.96875,
"learning_rate": 8.510847348084943e-08,
"loss": 1.018608283996582,
"step": 2420,
"token_acc": 0.708192842354333
},
{
"epoch": 0.8133489854100285,
"grad_norm": 6.03125,
"learning_rate": 8.364435109348822e-08,
"loss": 0.9961603164672852,
"step": 2425,
"token_acc": 0.7076452599388379
},
{
"epoch": 0.8150259936273688,
"grad_norm": 5.9375,
"learning_rate": 8.219178152416156e-08,
"loss": 1.0228870391845704,
"step": 2430,
"token_acc": 0.7013418833867712
},
{
"epoch": 0.816703001844709,
"grad_norm": 6.09375,
"learning_rate": 8.075080507800747e-08,
"loss": 1.087087059020996,
"step": 2435,
"token_acc": 0.6874622107403947
},
{
"epoch": 0.8183800100620493,
"grad_norm": 5.625,
"learning_rate": 7.932146173848402e-08,
"loss": 1.0141497611999513,
"step": 2440,
"token_acc": 0.7045632042842798
},
{
"epoch": 0.8200570182793896,
"grad_norm": 6.40625,
"learning_rate": 7.790379116626028e-08,
"loss": 1.03942813873291,
"step": 2445,
"token_acc": 0.6949412052915238
},
{
"epoch": 0.8217340264967299,
"grad_norm": 6.5625,
"learning_rate": 7.649783269811521e-08,
"loss": 1.0532546043395996,
"step": 2450,
"token_acc": 0.6938823668220164
},
{
"epoch": 0.8234110347140701,
"grad_norm": 5.90625,
"learning_rate": 7.510362534584636e-08,
"loss": 1.0119309425354004,
"step": 2455,
"token_acc": 0.7017226277372263
},
{
"epoch": 0.8250880429314104,
"grad_norm": 6.4375,
"learning_rate": 7.372120779518787e-08,
"loss": 1.074321174621582,
"step": 2460,
"token_acc": 0.6915051342595134
},
{
"epoch": 0.8267650511487507,
"grad_norm": 5.65625,
"learning_rate": 7.235061840473622e-08,
"loss": 0.995145034790039,
"step": 2465,
"token_acc": 0.7089050765147665
},
{
"epoch": 0.8284420593660909,
"grad_norm": 6.5,
"learning_rate": 7.099189520488664e-08,
"loss": 1.0098725318908692,
"step": 2470,
"token_acc": 0.7019166817824536
},
{
"epoch": 0.8301190675834311,
"grad_norm": 6.8125,
"learning_rate": 6.96450758967772e-08,
"loss": 1.0157196044921875,
"step": 2475,
"token_acc": 0.7020709325396826
},
{
"epoch": 0.8317960758007714,
"grad_norm": 5.71875,
"learning_rate": 6.831019785124337e-08,
"loss": 1.037847900390625,
"step": 2480,
"token_acc": 0.6992954337653956
},
{
"epoch": 0.8334730840181117,
"grad_norm": 5.65625,
"learning_rate": 6.698729810778064e-08,
"loss": 1.0353034019470215,
"step": 2485,
"token_acc": 0.7004344048653345
},
{
"epoch": 0.835150092235452,
"grad_norm": 5.6875,
"learning_rate": 6.567641337351681e-08,
"loss": 1.0612793922424317,
"step": 2490,
"token_acc": 0.6943042912873862
},
{
"epoch": 0.8368271004527922,
"grad_norm": 6.25,
"learning_rate": 6.43775800221934e-08,
"loss": 1.040201473236084,
"step": 2495,
"token_acc": 0.7003977538605521
},
{
"epoch": 0.8385041086701325,
"grad_norm": 6.125,
"learning_rate": 6.309083409315652e-08,
"loss": 1.0146426200866698,
"step": 2500,
"token_acc": 0.7067005321722303
},
{
"epoch": 0.8401811168874728,
"grad_norm": 6.40625,
"learning_rate": 6.181621129035714e-08,
"loss": 1.0435810089111328,
"step": 2505,
"token_acc": 0.6991748352439497
},
{
"epoch": 0.8418581251048131,
"grad_norm": 6.21875,
"learning_rate": 6.055374698135973e-08,
"loss": 1.0399096488952637,
"step": 2510,
"token_acc": 0.6975013881177123
},
{
"epoch": 0.8435351333221532,
"grad_norm": 6.21875,
"learning_rate": 5.930347619636123e-08,
"loss": 1.0619498252868653,
"step": 2515,
"token_acc": 0.6908962597035991
},
{
"epoch": 0.8452121415394935,
"grad_norm": 5.875,
"learning_rate": 5.806543362721944e-08,
"loss": 1.0390033721923828,
"step": 2520,
"token_acc": 0.6962721181927504
},
{
"epoch": 0.8468891497568338,
"grad_norm": 6.53125,
"learning_rate": 5.683965362648974e-08,
"loss": 1.0012418746948242,
"step": 2525,
"token_acc": 0.7041342967349378
},
{
"epoch": 0.8485661579741741,
"grad_norm": 6.21875,
"learning_rate": 5.5626170206472314e-08,
"loss": 1.054752731323242,
"step": 2530,
"token_acc": 0.6903323262839879
},
{
"epoch": 0.8502431661915143,
"grad_norm": 6.21875,
"learning_rate": 5.442501703826802e-08,
"loss": 1.042811965942383,
"step": 2535,
"token_acc": 0.697495183044316
},
{
"epoch": 0.8519201744088546,
"grad_norm": 6.125,
"learning_rate": 5.3236227450844884e-08,
"loss": 1.0447772026062012,
"step": 2540,
"token_acc": 0.6942773672452311
},
{
"epoch": 0.8535971826261949,
"grad_norm": 6.25,
"learning_rate": 5.2059834430112357e-08,
"loss": 1.0507349967956543,
"step": 2545,
"token_acc": 0.6938989408649603
},
{
"epoch": 0.8552741908435352,
"grad_norm": 6.0,
"learning_rate": 5.089587061800643e-08,
"loss": 1.0506488800048828,
"step": 2550,
"token_acc": 0.693307040946526
},
{
"epoch": 0.8569511990608754,
"grad_norm": 5.75,
"learning_rate": 4.974436831158441e-08,
"loss": 1.0228717803955079,
"step": 2555,
"token_acc": 0.702984989441242
},
{
"epoch": 0.8586282072782156,
"grad_norm": 5.5625,
"learning_rate": 4.8605359462127626e-08,
"loss": 1.0637690544128418,
"step": 2560,
"token_acc": 0.6955604883462819
},
{
"epoch": 0.8603052154955559,
"grad_norm": 5.96875,
"learning_rate": 4.747887567425618e-08,
"loss": 1.0389795303344727,
"step": 2565,
"token_acc": 0.7000414716511641
},
{
"epoch": 0.8619822237128962,
"grad_norm": 5.6875,
"learning_rate": 4.636494820505082e-08,
"loss": 1.0297457695007324,
"step": 2570,
"token_acc": 0.7029873942324296
},
{
"epoch": 0.8636592319302364,
"grad_norm": 6.40625,
"learning_rate": 4.526360796318629e-08,
"loss": 0.9971097946166992,
"step": 2575,
"token_acc": 0.7125974658869396
},
{
"epoch": 0.8653362401475767,
"grad_norm": 5.625,
"learning_rate": 4.417488550807386e-08,
"loss": 0.9800214767456055,
"step": 2580,
"token_acc": 0.715742511153601
},
{
"epoch": 0.867013248364917,
"grad_norm": 6.375,
"learning_rate": 4.309881104901264e-08,
"loss": 1.0177087783813477,
"step": 2585,
"token_acc": 0.7067709258930158
},
{
"epoch": 0.8686902565822573,
"grad_norm": 6.0625,
"learning_rate": 4.20354144443521e-08,
"loss": 1.0427945137023926,
"step": 2590,
"token_acc": 0.6946957601561115
},
{
"epoch": 0.8703672647995975,
"grad_norm": 6.5625,
"learning_rate": 4.098472520066293e-08,
"loss": 1.0284164428710938,
"step": 2595,
"token_acc": 0.6990140392240918
},
{
"epoch": 0.8720442730169378,
"grad_norm": 6.59375,
"learning_rate": 3.994677247191908e-08,
"loss": 1.0570375442504882,
"step": 2600,
"token_acc": 0.6961351809624042
},
{
"epoch": 0.8737212812342781,
"grad_norm": 6.21875,
"learning_rate": 3.892158505868798e-08,
"loss": 1.0632140159606933,
"step": 2605,
"token_acc": 0.6917707567964732
},
{
"epoch": 0.8753982894516183,
"grad_norm": 6.15625,
"learning_rate": 3.7909191407332066e-08,
"loss": 1.0276185035705567,
"step": 2610,
"token_acc": 0.7006794751640113
},
{
"epoch": 0.8770752976689585,
"grad_norm": 6.0625,
"learning_rate": 3.690961960921879e-08,
"loss": 1.032079315185547,
"step": 2615,
"token_acc": 0.7047474528506394
},
{
"epoch": 0.8787523058862988,
"grad_norm": 5.875,
"learning_rate": 3.5922897399942144e-08,
"loss": 1.0215091705322266,
"step": 2620,
"token_acc": 0.7023193629018194
},
{
"epoch": 0.8804293141036391,
"grad_norm": 5.96875,
"learning_rate": 3.494905215855187e-08,
"loss": 1.0204978942871095,
"step": 2625,
"token_acc": 0.7062290472585704
},
{
"epoch": 0.8821063223209794,
"grad_norm": 6.34375,
"learning_rate": 3.3988110906794875e-08,
"loss": 1.0444301605224608,
"step": 2630,
"token_acc": 0.7006415252712992
},
{
"epoch": 0.8837833305383196,
"grad_norm": 6.46875,
"learning_rate": 3.304010030836452e-08,
"loss": 1.070432472229004,
"step": 2635,
"token_acc": 0.6982341693847711
},
{
"epoch": 0.8854603387556599,
"grad_norm": 6.75,
"learning_rate": 3.210504666816133e-08,
"loss": 1.0351852416992187,
"step": 2640,
"token_acc": 0.7003612281757977
},
{
"epoch": 0.8871373469730002,
"grad_norm": 6.15625,
"learning_rate": 3.118297593156316e-08,
"loss": 1.0338337898254395,
"step": 2645,
"token_acc": 0.6987579354126414
},
{
"epoch": 0.8888143551903405,
"grad_norm": 5.84375,
"learning_rate": 3.0273913683704745e-08,
"loss": 1.0125846862792969,
"step": 2650,
"token_acc": 0.7042808634489449
},
{
"epoch": 0.8904913634076806,
"grad_norm": 6.15625,
"learning_rate": 2.9377885148768268e-08,
"loss": 1.0237887382507325,
"step": 2655,
"token_acc": 0.7018055399798139
},
{
"epoch": 0.8921683716250209,
"grad_norm": 6.1875,
"learning_rate": 2.849491518928332e-08,
"loss": 1.0422614097595215,
"step": 2660,
"token_acc": 0.6977418455533873
},
{
"epoch": 0.8938453798423612,
"grad_norm": 5.96875,
"learning_rate": 2.7625028305436838e-08,
"loss": 1.0634222030639648,
"step": 2665,
"token_acc": 0.6951148758412625
},
{
"epoch": 0.8955223880597015,
"grad_norm": 6.25,
"learning_rate": 2.67682486343937e-08,
"loss": 1.062954807281494,
"step": 2670,
"token_acc": 0.6916413286673169
},
{
"epoch": 0.8971993962770417,
"grad_norm": 6.28125,
"learning_rate": 2.5924599949626312e-08,
"loss": 1.0950417518615723,
"step": 2675,
"token_acc": 0.684926074888121
},
{
"epoch": 0.898876404494382,
"grad_norm": 6.09375,
"learning_rate": 2.5094105660255883e-08,
"loss": 1.0323663711547852,
"step": 2680,
"token_acc": 0.6994619523443505
},
{
"epoch": 0.9005534127117223,
"grad_norm": 6.40625,
"learning_rate": 2.427678881040196e-08,
"loss": 1.0677906036376954,
"step": 2685,
"token_acc": 0.6956816640289396
},
{
"epoch": 0.9022304209290626,
"grad_norm": 6.28125,
"learning_rate": 2.3472672078543588e-08,
"loss": 1.0511894226074219,
"step": 2690,
"token_acc": 0.6961982540129541
},
{
"epoch": 0.9039074291464028,
"grad_norm": 6.375,
"learning_rate": 2.268177777688973e-08,
"loss": 1.0632619857788086,
"step": 2695,
"token_acc": 0.6953334427485092
},
{
"epoch": 0.9055844373637431,
"grad_norm": 6.34375,
"learning_rate": 2.1904127850760458e-08,
"loss": 1.0489700317382813,
"step": 2700,
"token_acc": 0.6962432915921288
},
{
"epoch": 0.9072614455810833,
"grad_norm": 5.90625,
"learning_rate": 2.11397438779779e-08,
"loss": 1.056574249267578,
"step": 2705,
"token_acc": 0.6962843295638126
},
{
"epoch": 0.9089384537984236,
"grad_norm": 6.0,
"learning_rate": 2.038864706826726e-08,
"loss": 1.018214511871338,
"step": 2710,
"token_acc": 0.7036482909425711
},
{
"epoch": 0.9106154620157639,
"grad_norm": 6.3125,
"learning_rate": 1.9650858262668602e-08,
"loss": 1.0284092903137207,
"step": 2715,
"token_acc": 0.7034700315457413
},
{
"epoch": 0.9122924702331041,
"grad_norm": 5.6875,
"learning_rate": 1.892639793295858e-08,
"loss": 1.033327293395996,
"step": 2720,
"token_acc": 0.7013865593201386
},
{
"epoch": 0.9139694784504444,
"grad_norm": 6.375,
"learning_rate": 1.8215286181082144e-08,
"loss": 1.0109923362731934,
"step": 2725,
"token_acc": 0.7045320429599906
},
{
"epoch": 0.9156464866677847,
"grad_norm": 5.84375,
"learning_rate": 1.751754273859507e-08,
"loss": 0.9927311897277832,
"step": 2730,
"token_acc": 0.7082637489202419
},
{
"epoch": 0.917323494885125,
"grad_norm": 5.65625,
"learning_rate": 1.6833186966116074e-08,
"loss": 1.0338494300842285,
"step": 2735,
"token_acc": 0.7046143482654093
},
{
"epoch": 0.9190005031024652,
"grad_norm": 5.5,
"learning_rate": 1.6162237852790083e-08,
"loss": 1.0125389099121094,
"step": 2740,
"token_acc": 0.7063389391979301
},
{
"epoch": 0.9206775113198055,
"grad_norm": 6.625,
"learning_rate": 1.550471401576092e-08,
"loss": 1.014925193786621,
"step": 2745,
"token_acc": 0.7055999106694211
},
{
"epoch": 0.9223545195371458,
"grad_norm": 6.25,
"learning_rate": 1.4860633699654957e-08,
"loss": 0.9907565116882324,
"step": 2750,
"token_acc": 0.7117064647641235
},
{
"epoch": 0.924031527754486,
"grad_norm": 5.90625,
"learning_rate": 1.4230014776074662e-08,
"loss": 1.0230751037597656,
"step": 2755,
"token_acc": 0.7046689586336919
},
{
"epoch": 0.9257085359718262,
"grad_norm": 5.8125,
"learning_rate": 1.3612874743103186e-08,
"loss": 1.012996292114258,
"step": 2760,
"token_acc": 0.7042463088016482
},
{
"epoch": 0.9273855441891665,
"grad_norm": 6.1875,
"learning_rate": 1.3009230724818132e-08,
"loss": 1.0435994148254395,
"step": 2765,
"token_acc": 0.6978461885430762
},
{
"epoch": 0.9290625524065068,
"grad_norm": 6.59375,
"learning_rate": 1.2419099470816873e-08,
"loss": 1.0455306053161622,
"step": 2770,
"token_acc": 0.6964831804281345
},
{
"epoch": 0.9307395606238471,
"grad_norm": 6.65625,
"learning_rate": 1.1842497355751824e-08,
"loss": 1.0471959114074707,
"step": 2775,
"token_acc": 0.697155162643092
},
{
"epoch": 0.9324165688411873,
"grad_norm": 6.125,
"learning_rate": 1.1279440378875904e-08,
"loss": 1.0397340774536132,
"step": 2780,
"token_acc": 0.6997011952191236
},
{
"epoch": 0.9340935770585276,
"grad_norm": 5.6875,
"learning_rate": 1.0729944163598514e-08,
"loss": 1.0325641632080078,
"step": 2785,
"token_acc": 0.6986331569664903
},
{
"epoch": 0.9357705852758679,
"grad_norm": 6.3125,
"learning_rate": 1.0194023957052268e-08,
"loss": 1.0409153938293456,
"step": 2790,
"token_acc": 0.6998793311498017
},
{
"epoch": 0.9374475934932082,
"grad_norm": 5.59375,
"learning_rate": 9.671694629669768e-09,
"loss": 1.012611484527588,
"step": 2795,
"token_acc": 0.7074814896673666
},
{
"epoch": 0.9391246017105483,
"grad_norm": 6.09375,
"learning_rate": 9.162970674771176e-09,
"loss": 1.0336393356323241,
"step": 2800,
"token_acc": 0.7030631260191008
},
{
"epoch": 0.9408016099278886,
"grad_norm": 6.28125,
"learning_rate": 8.667866208161678e-09,
"loss": 1.0824786186218263,
"step": 2805,
"token_acc": 0.6870084390607957
},
{
"epoch": 0.9424786181452289,
"grad_norm": 5.625,
"learning_rate": 8.186394967740207e-09,
"loss": 1.0423837661743165,
"step": 2810,
"token_acc": 0.6977280590473994
},
{
"epoch": 0.9441556263625692,
"grad_norm": 6.15625,
"learning_rate": 7.718570313118067e-09,
"loss": 1.0381958961486817,
"step": 2815,
"token_acc": 0.6993798542958637
},
{
"epoch": 0.9458326345799094,
"grad_norm": 6.03125,
"learning_rate": 7.2644052252482934e-09,
"loss": 1.0570508003234864,
"step": 2820,
"token_acc": 0.695105855023494
},
{
"epoch": 0.9475096427972497,
"grad_norm": 6.4375,
"learning_rate": 6.823912306065327e-09,
"loss": 1.0608396530151367,
"step": 2825,
"token_acc": 0.6950763644131155
},
{
"epoch": 0.94918665101459,
"grad_norm": 5.90625,
"learning_rate": 6.397103778135571e-09,
"loss": 1.0604951858520508,
"step": 2830,
"token_acc": 0.6971572810866038
},
{
"epoch": 0.9508636592319303,
"grad_norm": 6.25,
"learning_rate": 5.983991484317996e-09,
"loss": 1.0228150367736817,
"step": 2835,
"token_acc": 0.7006679165690181
},
{
"epoch": 0.9525406674492705,
"grad_norm": 6.03125,
"learning_rate": 5.5845868874357385e-09,
"loss": 1.0492300033569335,
"step": 2840,
"token_acc": 0.6925122867880891
},
{
"epoch": 0.9542176756666108,
"grad_norm": 5.84375,
"learning_rate": 5.198901069957961e-09,
"loss": 1.023094081878662,
"step": 2845,
"token_acc": 0.7031940482183812
},
{
"epoch": 0.955894683883951,
"grad_norm": 5.84375,
"learning_rate": 4.826944733692328e-09,
"loss": 1.06231689453125,
"step": 2850,
"token_acc": 0.6977053349517369
},
{
"epoch": 0.9575716921012913,
"grad_norm": 6.0,
"learning_rate": 4.468728199487959e-09,
"loss": 1.0265810012817382,
"step": 2855,
"token_acc": 0.7019859813084112
},
{
"epoch": 0.9592487003186315,
"grad_norm": 6.15625,
"learning_rate": 4.1242614069493255e-09,
"loss": 1.0546887397766114,
"step": 2860,
"token_acc": 0.6936714670950714
},
{
"epoch": 0.9609257085359718,
"grad_norm": 6.21875,
"learning_rate": 3.793553914160253e-09,
"loss": 1.0320470809936524,
"step": 2865,
"token_acc": 0.7007982541779131
},
{
"epoch": 0.9626027167533121,
"grad_norm": 6.09375,
"learning_rate": 3.4766148974185728e-09,
"loss": 1.0567503929138184,
"step": 2870,
"token_acc": 0.6973000742051487
},
{
"epoch": 0.9642797249706524,
"grad_norm": 6.0,
"learning_rate": 3.173453150981831e-09,
"loss": 1.027595043182373,
"step": 2875,
"token_acc": 0.7055168843122591
},
{
"epoch": 0.9659567331879926,
"grad_norm": 6.40625,
"learning_rate": 2.884077086823089e-09,
"loss": 1.0573992729187012,
"step": 2880,
"token_acc": 0.6938729623383924
},
{
"epoch": 0.9676337414053329,
"grad_norm": 6.0,
"learning_rate": 2.608494734397504e-09,
"loss": 0.9933361053466797,
"step": 2885,
"token_acc": 0.7056563259309487
},
{
"epoch": 0.9693107496226732,
"grad_norm": 6.28125,
"learning_rate": 2.3467137404195036e-09,
"loss": 1.061786937713623,
"step": 2890,
"token_acc": 0.6947505226874612
},
{
"epoch": 0.9709877578400135,
"grad_norm": 5.9375,
"learning_rate": 2.098741368650736e-09,
"loss": 1.0342968940734862,
"step": 2895,
"token_acc": 0.6952941849162331
},
{
"epoch": 0.9726647660573536,
"grad_norm": 6.25,
"learning_rate": 1.864584499698507e-09,
"loss": 1.0258048057556153,
"step": 2900,
"token_acc": 0.705042560068958
},
{
"epoch": 0.9743417742746939,
"grad_norm": 6.03125,
"learning_rate": 1.6442496308246567e-09,
"loss": 1.0686445236206055,
"step": 2905,
"token_acc": 0.6945458630567882
},
{
"epoch": 0.9760187824920342,
"grad_norm": 5.875,
"learning_rate": 1.4377428757655353e-09,
"loss": 1.0087648391723634,
"step": 2910,
"token_acc": 0.7034811903425042
},
{
"epoch": 0.9776957907093745,
"grad_norm": 6.21875,
"learning_rate": 1.2450699645621399e-09,
"loss": 1.0127264976501464,
"step": 2915,
"token_acc": 0.7020366250213931
},
{
"epoch": 0.9793727989267147,
"grad_norm": 6.6875,
"learning_rate": 1.0662362434013529e-09,
"loss": 1.0034564971923827,
"step": 2920,
"token_acc": 0.7072002837550249
},
{
"epoch": 0.981049807144055,
"grad_norm": 6.46875,
"learning_rate": 9.012466744673375e-10,
"loss": 1.0292555809020996,
"step": 2925,
"token_acc": 0.6997058823529412
},
{
"epoch": 0.9827268153613953,
"grad_norm": 6.03125,
"learning_rate": 7.50105835804149e-10,
"loss": 1.0281611442565919,
"step": 2930,
"token_acc": 0.700266082329003
},
{
"epoch": 0.9844038235787356,
"grad_norm": 6.0,
"learning_rate": 6.128179211884466e-10,
"loss": 1.0283337593078614,
"step": 2935,
"token_acc": 0.7049981796432101
},
{
"epoch": 0.9860808317960758,
"grad_norm": 5.84375,
"learning_rate": 4.893867400131979e-10,
"loss": 1.0271111488342286,
"step": 2940,
"token_acc": 0.7001903908612387
},
{
"epoch": 0.987757840013416,
"grad_norm": 6.5,
"learning_rate": 3.7981571718204153e-10,
"loss": 0.976413345336914,
"step": 2945,
"token_acc": 0.7121462264150943
},
{
"epoch": 0.9894348482307563,
"grad_norm": 5.90625,
"learning_rate": 2.8410789301425155e-10,
"loss": 1.028111457824707,
"step": 2950,
"token_acc": 0.7010195164579085
},
{
"epoch": 0.9911118564480966,
"grad_norm": 5.9375,
"learning_rate": 2.022659231602497e-10,
"loss": 1.074041748046875,
"step": 2955,
"token_acc": 0.6909885131855686
},
{
"epoch": 0.9927888646654368,
"grad_norm": 6.09375,
"learning_rate": 1.3429207852805324e-10,
"loss": 1.050593662261963,
"step": 2960,
"token_acc": 0.6958233107369342
},
{
"epoch": 0.9944658728827771,
"grad_norm": 6.40625,
"learning_rate": 8.018824522032507e-11,
"loss": 1.0603754043579101,
"step": 2965,
"token_acc": 0.6945898778359512
},
{
"epoch": 0.9961428811001174,
"grad_norm": 6.0625,
"learning_rate": 3.995592448174934e-11,
"loss": 1.0456673622131347,
"step": 2970,
"token_acc": 0.697628927089508
},
{
"epoch": 0.9978198893174577,
"grad_norm": 5.46875,
"learning_rate": 1.359623265767551e-11,
"loss": 1.055088996887207,
"step": 2975,
"token_acc": 0.6971778149708547
},
{
"epoch": 0.9994968975347979,
"grad_norm": 6.1875,
"learning_rate": 1.109901163032223e-12,
"loss": 1.0523943901062012,
"step": 2980,
"token_acc": 0.7027149321266968
}
],
"logging_steps": 5,
"max_steps": 2982,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 80,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.11716299871276e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}