{
"best_metric": 0.65642351,
"best_model_checkpoint": "/m2v_intern/zhangzhicheng03/code/face-llm/ms-swift/Emo-CFG_bs-512_data-ATTR_OPEN_EMO_MIC_500k_CAP_78k_RATIONALE_120k_scratch_3B_lr-2e-5/v2-20250515-154834/checkpoint-2704",
"epoch": 1.9994455738310848,
"eval_steps": 50,
"global_step": 2704,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007392348918868971,
"grad_norm": 7.145988573209412,
"learning_rate": 1.9999993250737395e-05,
"loss": 1.997387409210205,
"memory(GiB)": 30.3,
"step": 1,
"token_acc": 0.5373563218390804,
"train_speed(iter/s)": 0.022542
},
{
"epoch": 0.0036961744594344852,
"grad_norm": 4.16117609244522,
"learning_rate": 1.9999831268890388e-05,
"loss": 1.7683343887329102,
"memory(GiB)": 30.88,
"step": 5,
"token_acc": 0.5186522262334536,
"train_speed(iter/s)": 0.047276
},
{
"epoch": 0.0073923489188689705,
"grad_norm": 1.9968396412286222,
"learning_rate": 1.999932508125559e-05,
"loss": 1.4955159187316895,
"memory(GiB)": 40.49,
"step": 10,
"token_acc": 0.5502183406113537,
"train_speed(iter/s)": 0.057642
},
{
"epoch": 0.011088523378303456,
"grad_norm": 1.6222058869585758,
"learning_rate": 1.9998481454177528e-05,
"loss": 1.4060004234313965,
"memory(GiB)": 40.49,
"step": 15,
"token_acc": 0.6005361930294906,
"train_speed(iter/s)": 0.059714
},
{
"epoch": 0.014784697837737941,
"grad_norm": 1.873832697605311,
"learning_rate": 1.9997300416125426e-05,
"loss": 1.3838209152221679,
"memory(GiB)": 40.49,
"step": 20,
"token_acc": 0.607051282051282,
"train_speed(iter/s)": 0.060073
},
{
"epoch": 0.018480872297172428,
"grad_norm": 1.659686579754198,
"learning_rate": 1.9995782006954852e-05,
"loss": 1.3265121459960938,
"memory(GiB)": 40.49,
"step": 25,
"token_acc": 0.5963340122199593,
"train_speed(iter/s)": 0.062445
},
{
"epoch": 0.02217704675660691,
"grad_norm": 1.8215744972131866,
"learning_rate": 1.9993926277906387e-05,
"loss": 1.3122464179992677,
"memory(GiB)": 54.96,
"step": 30,
"token_acc": 0.5690406976744186,
"train_speed(iter/s)": 0.06221
},
{
"epoch": 0.0258732212160414,
"grad_norm": 1.5674638552693054,
"learning_rate": 1.9991733291603873e-05,
"loss": 1.3101771354675293,
"memory(GiB)": 54.96,
"step": 35,
"token_acc": 0.5990697674418605,
"train_speed(iter/s)": 0.061909
},
{
"epoch": 0.029569395675475882,
"grad_norm": 1.8121545652568625,
"learning_rate": 1.998920312205231e-05,
"loss": 1.2577611923217773,
"memory(GiB)": 54.96,
"step": 40,
"token_acc": 0.6322274881516587,
"train_speed(iter/s)": 0.062755
},
{
"epoch": 0.03326557013491037,
"grad_norm": 1.7654732822991834,
"learning_rate": 1.9986335854635364e-05,
"loss": 1.2739611625671388,
"memory(GiB)": 54.96,
"step": 45,
"token_acc": 0.5852668213457076,
"train_speed(iter/s)": 0.062974
},
{
"epoch": 0.036961744594344856,
"grad_norm": 1.3381865017125096,
"learning_rate": 1.9983131586112474e-05,
"loss": 1.2759986877441407,
"memory(GiB)": 54.96,
"step": 50,
"token_acc": 0.6090799517878666,
"train_speed(iter/s)": 0.0626
},
{
"epoch": 0.036961744594344856,
"eval_loss": 0.8683156967163086,
"eval_runtime": 85.8388,
"eval_samples_per_second": 81.49,
"eval_steps_per_second": 0.641,
"eval_token_acc": 0.5936057826607904,
"step": 50
},
{
"epoch": 0.040657919053779336,
"grad_norm": 1.7881784961595093,
"learning_rate": 1.9979590424615597e-05,
"loss": 1.2275705337524414,
"memory(GiB)": 74.93,
"step": 55,
"token_acc": 0.5959502991256328,
"train_speed(iter/s)": 0.055536
},
{
"epoch": 0.04435409351321382,
"grad_norm": 1.523952154354778,
"learning_rate": 1.997571248964556e-05,
"loss": 1.2908334732055664,
"memory(GiB)": 74.93,
"step": 60,
"token_acc": 0.6143884892086331,
"train_speed(iter/s)": 0.056437
},
{
"epoch": 0.04805026797264831,
"grad_norm": 1.629094517207129,
"learning_rate": 1.9971497912068014e-05,
"loss": 1.262259292602539,
"memory(GiB)": 74.93,
"step": 65,
"token_acc": 0.6196943972835314,
"train_speed(iter/s)": 0.05732
},
{
"epoch": 0.0517464424320828,
"grad_norm": 1.4002823703096854,
"learning_rate": 1.9966946834109026e-05,
"loss": 1.2578742980957032,
"memory(GiB)": 74.93,
"step": 70,
"token_acc": 0.5597722960151803,
"train_speed(iter/s)": 0.057646
},
{
"epoch": 0.05544261689151728,
"grad_norm": 1.3922622186923228,
"learning_rate": 1.9962059409350286e-05,
"loss": 1.2903871536254883,
"memory(GiB)": 74.93,
"step": 75,
"token_acc": 0.5871787786681404,
"train_speed(iter/s)": 0.058082
},
{
"epoch": 0.059138791350951764,
"grad_norm": 1.5357768582962403,
"learning_rate": 1.9956835802723916e-05,
"loss": 1.2582176208496094,
"memory(GiB)": 74.93,
"step": 80,
"token_acc": 0.5863981319322825,
"train_speed(iter/s)": 0.05893
},
{
"epoch": 0.06283496581038625,
"grad_norm": 1.5430442318289654,
"learning_rate": 1.9951276190506903e-05,
"loss": 1.2459497451782227,
"memory(GiB)": 74.93,
"step": 85,
"token_acc": 0.5826538176426983,
"train_speed(iter/s)": 0.059014
},
{
"epoch": 0.06653114026982074,
"grad_norm": 1.5964710010210896,
"learning_rate": 1.9945380760315153e-05,
"loss": 1.2252405166625977,
"memory(GiB)": 74.93,
"step": 90,
"token_acc": 0.6162060301507538,
"train_speed(iter/s)": 0.059178
},
{
"epoch": 0.07022731472925522,
"grad_norm": 1.5295447471837036,
"learning_rate": 1.9939149711097164e-05,
"loss": 1.235156536102295,
"memory(GiB)": 74.93,
"step": 95,
"token_acc": 0.6264632848527847,
"train_speed(iter/s)": 0.059963
},
{
"epoch": 0.07392348918868971,
"grad_norm": 1.3740769902051975,
"learning_rate": 1.9932583253127302e-05,
"loss": 1.2441673278808594,
"memory(GiB)": 74.93,
"step": 100,
"token_acc": 0.6724137931034483,
"train_speed(iter/s)": 0.060048
},
{
"epoch": 0.07392348918868971,
"eval_loss": 0.8143442273139954,
"eval_runtime": 82.7498,
"eval_samples_per_second": 84.532,
"eval_steps_per_second": 0.665,
"eval_token_acc": 0.6014724453258288,
"step": 100
},
{
"epoch": 0.07761966364812418,
"grad_norm": 1.4813264557157313,
"learning_rate": 1.992568160799872e-05,
"loss": 1.2064315795898437,
"memory(GiB)": 74.93,
"step": 105,
"token_acc": 0.6042131350681537,
"train_speed(iter/s)": 0.056686
},
{
"epoch": 0.08131583810755867,
"grad_norm": 1.4543500652550134,
"learning_rate": 1.9918445008615862e-05,
"loss": 1.2109683990478515,
"memory(GiB)": 74.93,
"step": 110,
"token_acc": 0.5906810035842294,
"train_speed(iter/s)": 0.056861
},
{
"epoch": 0.08501201256699316,
"grad_norm": 1.4682139055625663,
"learning_rate": 1.9910873699186618e-05,
"loss": 1.2368173599243164,
"memory(GiB)": 74.93,
"step": 115,
"token_acc": 0.5186114596403179,
"train_speed(iter/s)": 0.057469
},
{
"epoch": 0.08870818702642765,
"grad_norm": 1.3995801521088191,
"learning_rate": 1.990296793521408e-05,
"loss": 1.2045980453491212,
"memory(GiB)": 74.93,
"step": 120,
"token_acc": 0.5862884160756501,
"train_speed(iter/s)": 0.057707
},
{
"epoch": 0.09240436148586213,
"grad_norm": 1.3021560572956736,
"learning_rate": 1.989472798348791e-05,
"loss": 1.2566261291503906,
"memory(GiB)": 74.93,
"step": 125,
"token_acc": 0.5944492254733219,
"train_speed(iter/s)": 0.057808
},
{
"epoch": 0.09610053594529662,
"grad_norm": 1.4712133484369752,
"learning_rate": 1.9886154122075344e-05,
"loss": 1.192431640625,
"memory(GiB)": 74.93,
"step": 130,
"token_acc": 0.5911908646003262,
"train_speed(iter/s)": 0.058231
},
{
"epoch": 0.0997967104047311,
"grad_norm": 1.3130768997154976,
"learning_rate": 1.9877246640311818e-05,
"loss": 1.2078176498413087,
"memory(GiB)": 74.93,
"step": 135,
"token_acc": 0.6265611990008326,
"train_speed(iter/s)": 0.058455
},
{
"epoch": 0.1034928848641656,
"grad_norm": 1.3930149495863555,
"learning_rate": 1.9868005838791185e-05,
"loss": 1.2078091621398925,
"memory(GiB)": 74.93,
"step": 140,
"token_acc": 0.583790628957366,
"train_speed(iter/s)": 0.05851
},
{
"epoch": 0.10718905932360008,
"grad_norm": 1.3209379748185142,
"learning_rate": 1.9858432029355584e-05,
"loss": 1.2318389892578125,
"memory(GiB)": 74.93,
"step": 145,
"token_acc": 0.5777182235834609,
"train_speed(iter/s)": 0.058766
},
{
"epoch": 0.11088523378303455,
"grad_norm": 1.5171021117309256,
"learning_rate": 1.9848525535084916e-05,
"loss": 1.2017921447753905,
"memory(GiB)": 74.93,
"step": 150,
"token_acc": 0.6249167221852099,
"train_speed(iter/s)": 0.059012
},
{
"epoch": 0.11088523378303455,
"eval_loss": 0.8016136884689331,
"eval_runtime": 89.0739,
"eval_samples_per_second": 78.53,
"eval_steps_per_second": 0.617,
"eval_token_acc": 0.6041989394145771,
"step": 150
},
{
"epoch": 0.11458140824246904,
"grad_norm": 1.3505800383266968,
"learning_rate": 1.983828669028593e-05,
"loss": 1.1807826042175293,
"memory(GiB)": 74.93,
"step": 155,
"token_acc": 0.6214278069142674,
"train_speed(iter/s)": 0.056798
},
{
"epoch": 0.11827758270190353,
"grad_norm": 1.3315385057884146,
"learning_rate": 1.9827715840480962e-05,
"loss": 1.1823822021484376,
"memory(GiB)": 74.93,
"step": 160,
"token_acc": 0.6380498145204028,
"train_speed(iter/s)": 0.056862
},
{
"epoch": 0.12197375716133801,
"grad_norm": 1.5704533949100516,
"learning_rate": 1.9816813342396245e-05,
"loss": 1.1738862991333008,
"memory(GiB)": 74.93,
"step": 165,
"token_acc": 0.6022625781482585,
"train_speed(iter/s)": 0.057054
},
{
"epoch": 0.1256699316207725,
"grad_norm": 1.379238427568713,
"learning_rate": 1.980557956394991e-05,
"loss": 1.1857439041137696,
"memory(GiB)": 74.93,
"step": 170,
"token_acc": 0.620583717357911,
"train_speed(iter/s)": 0.05738
},
{
"epoch": 0.129366106080207,
"grad_norm": 1.2583564805641094,
"learning_rate": 1.9794014884239532e-05,
"loss": 1.2060420989990235,
"memory(GiB)": 74.93,
"step": 175,
"token_acc": 0.6253842775581906,
"train_speed(iter/s)": 0.057484
},
{
"epoch": 0.13306228053964148,
"grad_norm": 1.3557017685070272,
"learning_rate": 1.9782119693529358e-05,
"loss": 1.2089680671691894,
"memory(GiB)": 74.93,
"step": 180,
"token_acc": 0.6479481641468683,
"train_speed(iter/s)": 0.057624
},
{
"epoch": 0.13675845499907596,
"grad_norm": 1.29022693500859,
"learning_rate": 1.9769894393237135e-05,
"loss": 1.1686654090881348,
"memory(GiB)": 74.93,
"step": 185,
"token_acc": 0.6546961325966851,
"train_speed(iter/s)": 0.057936
},
{
"epoch": 0.14045462945851045,
"grad_norm": 1.3838281227163587,
"learning_rate": 1.975733939592056e-05,
"loss": 1.2134584426879882,
"memory(GiB)": 74.93,
"step": 190,
"token_acc": 0.6140988372093024,
"train_speed(iter/s)": 0.058017
},
{
"epoch": 0.14415080391794494,
"grad_norm": 1.4298593013480252,
"learning_rate": 1.974445512526336e-05,
"loss": 1.1823249816894532,
"memory(GiB)": 74.93,
"step": 195,
"token_acc": 0.5794074793589121,
"train_speed(iter/s)": 0.058072
},
{
"epoch": 0.14784697837737942,
"grad_norm": 1.319725742779011,
"learning_rate": 1.9731242016060985e-05,
"loss": 1.237997055053711,
"memory(GiB)": 74.93,
"step": 200,
"token_acc": 0.605226480836237,
"train_speed(iter/s)": 0.058321
},
{
"epoch": 0.14784697837737942,
"eval_loss": 0.7794498801231384,
"eval_runtime": 92.8737,
"eval_samples_per_second": 75.317,
"eval_steps_per_second": 0.592,
"eval_token_acc": 0.6082148043319165,
"step": 200
},
{
"epoch": 0.1515431528368139,
"grad_norm": 1.2737166952101895,
"learning_rate": 1.9717700514205963e-05,
"loss": 1.1960806846618652,
"memory(GiB)": 74.93,
"step": 205,
"token_acc": 0.6102411135968183,
"train_speed(iter/s)": 0.056695
},
{
"epoch": 0.15523932729624837,
"grad_norm": 1.392555342325795,
"learning_rate": 1.9703831076672807e-05,
"loss": 1.1904547691345215,
"memory(GiB)": 74.93,
"step": 210,
"token_acc": 0.6277756868648852,
"train_speed(iter/s)": 0.056817
},
{
"epoch": 0.15893550175568286,
"grad_norm": 1.31415101390755,
"learning_rate": 1.9689634171502642e-05,
"loss": 1.1859335899353027,
"memory(GiB)": 74.93,
"step": 215,
"token_acc": 0.6238479262672811,
"train_speed(iter/s)": 0.056878
},
{
"epoch": 0.16263167621511734,
"grad_norm": 1.309289118136438,
"learning_rate": 1.967511027778738e-05,
"loss": 1.1907655715942382,
"memory(GiB)": 74.93,
"step": 220,
"token_acc": 0.6154311649016642,
"train_speed(iter/s)": 0.05712
},
{
"epoch": 0.16632785067455183,
"grad_norm": 1.263699514290592,
"learning_rate": 1.966025988565356e-05,
"loss": 1.1906933784484863,
"memory(GiB)": 74.93,
"step": 225,
"token_acc": 0.6170634920634921,
"train_speed(iter/s)": 0.057279
},
{
"epoch": 0.17002402513398632,
"grad_norm": 1.2660283851596872,
"learning_rate": 1.9645083496245815e-05,
"loss": 1.2014826774597167,
"memory(GiB)": 74.93,
"step": 230,
"token_acc": 0.5935228023793787,
"train_speed(iter/s)": 0.057316
},
{
"epoch": 0.1737201995934208,
"grad_norm": 1.2430321644094078,
"learning_rate": 1.962958162170994e-05,
"loss": 1.189725971221924,
"memory(GiB)": 74.93,
"step": 235,
"token_acc": 0.7096774193548387,
"train_speed(iter/s)": 0.057486
},
{
"epoch": 0.1774163740528553,
"grad_norm": 1.3183460023053968,
"learning_rate": 1.961375478517564e-05,
"loss": 1.1756509780883788,
"memory(GiB)": 74.93,
"step": 240,
"token_acc": 0.6048,
"train_speed(iter/s)": 0.057691
},
{
"epoch": 0.18111254851228978,
"grad_norm": 1.256458444055883,
"learning_rate": 1.9597603520738853e-05,
"loss": 1.1867225646972657,
"memory(GiB)": 74.93,
"step": 245,
"token_acc": 0.6193625977149729,
"train_speed(iter/s)": 0.057716
},
{
"epoch": 0.18480872297172427,
"grad_norm": 1.2002856679979195,
"learning_rate": 1.9581128373443733e-05,
"loss": 1.1792646408081056,
"memory(GiB)": 74.93,
"step": 250,
"token_acc": 0.6049046321525886,
"train_speed(iter/s)": 0.057828
},
{
"epoch": 0.18480872297172427,
"eval_loss": 0.7706022262573242,
"eval_runtime": 87.903,
"eval_samples_per_second": 79.576,
"eval_steps_per_second": 0.626,
"eval_token_acc": 0.6092432997735232,
"step": 250
},
{
"epoch": 0.18850489743115875,
"grad_norm": 1.2231576564807567,
"learning_rate": 1.9564329899264252e-05,
"loss": 1.1703492164611817,
"memory(GiB)": 74.93,
"step": 255,
"token_acc": 0.6188424362408291,
"train_speed(iter/s)": 0.056504
},
{
"epoch": 0.19220107189059324,
"grad_norm": 1.1802132132030512,
"learning_rate": 1.954720866508546e-05,
"loss": 1.17109956741333,
"memory(GiB)": 74.93,
"step": 260,
"token_acc": 0.6199639206253759,
"train_speed(iter/s)": 0.056699
},
{
"epoch": 0.19589724635002773,
"grad_norm": 1.3103486430673341,
"learning_rate": 1.9529765248684308e-05,
"loss": 1.1841205596923827,
"memory(GiB)": 74.93,
"step": 265,
"token_acc": 0.5825649622799665,
"train_speed(iter/s)": 0.056764
},
{
"epoch": 0.1995934208094622,
"grad_norm": 1.3602279829039154,
"learning_rate": 1.951200023871021e-05,
"loss": 1.1760824203491211,
"memory(GiB)": 74.93,
"step": 270,
"token_acc": 0.6165368484122229,
"train_speed(iter/s)": 0.056855
},
{
"epoch": 0.2032895952688967,
"grad_norm": 1.2205622648558432,
"learning_rate": 1.949391423466513e-05,
"loss": 1.1814783096313477,
"memory(GiB)": 74.93,
"step": 275,
"token_acc": 0.6155863619333084,
"train_speed(iter/s)": 0.057043
},
{
"epoch": 0.2069857697283312,
"grad_norm": 1.2638744947898215,
"learning_rate": 1.9475507846883377e-05,
"loss": 1.1977863311767578,
"memory(GiB)": 74.93,
"step": 280,
"token_acc": 0.6115591397849462,
"train_speed(iter/s)": 0.057131
},
{
"epoch": 0.21068194418776567,
"grad_norm": 1.208174719234681,
"learning_rate": 1.9456781696510996e-05,
"loss": 1.1798893928527832,
"memory(GiB)": 74.93,
"step": 285,
"token_acc": 0.6450809464508095,
"train_speed(iter/s)": 0.057208
},
{
"epoch": 0.21437811864720016,
"grad_norm": 1.29989102329814,
"learning_rate": 1.943773641548481e-05,
"loss": 1.1305645942687987,
"memory(GiB)": 74.93,
"step": 290,
"token_acc": 0.6185250219490781,
"train_speed(iter/s)": 0.057373
},
{
"epoch": 0.21807429310663462,
"grad_norm": 1.2869046963327413,
"learning_rate": 1.9418372646511104e-05,
"loss": 1.1689376831054688,
"memory(GiB)": 74.93,
"step": 295,
"token_acc": 0.639083030472463,
"train_speed(iter/s)": 0.057472
},
{
"epoch": 0.2217704675660691,
"grad_norm": 1.3262776988217382,
"learning_rate": 1.939869104304392e-05,
"loss": 1.1520153045654298,
"memory(GiB)": 74.93,
"step": 300,
"token_acc": 0.6394881170018282,
"train_speed(iter/s)": 0.05753
},
{
"epoch": 0.2217704675660691,
"eval_loss": 0.7643480896949768,
"eval_runtime": 83.8503,
"eval_samples_per_second": 83.422,
"eval_steps_per_second": 0.656,
"eval_token_acc": 0.6108027805160715,
"step": 300
},
{
"epoch": 0.2254666420255036,
"grad_norm": 1.2789640840585799,
"learning_rate": 1.937869226926302e-05,
"loss": 1.1876554489135742,
"memory(GiB)": 74.93,
"step": 305,
"token_acc": 0.6270755222281735,
"train_speed(iter/s)": 0.056408
},
{
"epoch": 0.22916281648493808,
"grad_norm": 1.3387432374923267,
"learning_rate": 1.9358377000051457e-05,
"loss": 1.152684211730957,
"memory(GiB)": 74.93,
"step": 310,
"token_acc": 0.5908354547558435,
"train_speed(iter/s)": 0.056571
},
{
"epoch": 0.23285899094437257,
"grad_norm": 1.3158521752921148,
"learning_rate": 1.9337745920972817e-05,
"loss": 1.1474998474121094,
"memory(GiB)": 74.93,
"step": 315,
"token_acc": 0.6453079785035138,
"train_speed(iter/s)": 0.056681
},
{
"epoch": 0.23655516540380706,
"grad_norm": 1.3574666032311622,
"learning_rate": 1.9316799728248074e-05,
"loss": 1.1646709442138672,
"memory(GiB)": 74.93,
"step": 320,
"token_acc": 0.6396255850234009,
"train_speed(iter/s)": 0.056747
},
{
"epoch": 0.24025133986324154,
"grad_norm": 1.5220421984558397,
"learning_rate": 1.9295539128732096e-05,
"loss": 1.1289070129394532,
"memory(GiB)": 74.93,
"step": 325,
"token_acc": 0.6495638789122627,
"train_speed(iter/s)": 0.056887
},
{
"epoch": 0.24394751432267603,
"grad_norm": 1.2325001012228407,
"learning_rate": 1.927396483988979e-05,
"loss": 1.1668661117553711,
"memory(GiB)": 74.93,
"step": 330,
"token_acc": 0.6125099390405513,
"train_speed(iter/s)": 0.05701
},
{
"epoch": 0.24764368878211052,
"grad_norm": 1.3455071899618125,
"learning_rate": 1.92520775897719e-05,
"loss": 1.160017967224121,
"memory(GiB)": 74.93,
"step": 335,
"token_acc": 0.6224098234842671,
"train_speed(iter/s)": 0.057069
},
{
"epoch": 0.251339863241545,
"grad_norm": 1.1193101615271859,
"learning_rate": 1.922987811699042e-05,
"loss": 1.164522933959961,
"memory(GiB)": 74.93,
"step": 340,
"token_acc": 0.6142303969022265,
"train_speed(iter/s)": 0.057185
},
{
"epoch": 0.2550360377009795,
"grad_norm": 1.184835291510033,
"learning_rate": 1.9207367170693688e-05,
"loss": 1.1658490180969239,
"memory(GiB)": 74.93,
"step": 345,
"token_acc": 0.6181616832779624,
"train_speed(iter/s)": 0.057315
},
{
"epoch": 0.258732212160414,
"grad_norm": 1.2033091005460579,
"learning_rate": 1.918454551054109e-05,
"loss": 1.174658966064453,
"memory(GiB)": 74.93,
"step": 350,
"token_acc": 0.6646234676007006,
"train_speed(iter/s)": 0.057368
},
{
"epoch": 0.258732212160414,
"eval_loss": 0.7548633813858032,
"eval_runtime": 84.0949,
"eval_samples_per_second": 83.18,
"eval_steps_per_second": 0.654,
"eval_token_acc": 0.6124072795776128,
"step": 350
},
{
"epoch": 0.26242838661984846,
"grad_norm": 1.1971162317002557,
"learning_rate": 1.916141390667744e-05,
"loss": 1.1562774658203125,
"memory(GiB)": 74.93,
"step": 355,
"token_acc": 0.6173011120615911,
"train_speed(iter/s)": 0.056434
},
{
"epoch": 0.26612456107928295,
"grad_norm": 1.1301068933758331,
"learning_rate": 1.9137973139706973e-05,
"loss": 1.2061149597167968,
"memory(GiB)": 74.93,
"step": 360,
"token_acc": 0.5783767946088485,
"train_speed(iter/s)": 0.056501
},
{
"epoch": 0.26982073553871744,
"grad_norm": 1.2885970736252064,
"learning_rate": 1.9114224000667014e-05,
"loss": 1.1453168869018555,
"memory(GiB)": 74.93,
"step": 365,
"token_acc": 0.6045895851721095,
"train_speed(iter/s)": 0.056637
},
{
"epoch": 0.2735169099981519,
"grad_norm": 1.2008587437465796,
"learning_rate": 1.9090167291001278e-05,
"loss": 1.151451015472412,
"memory(GiB)": 74.93,
"step": 370,
"token_acc": 0.6464088397790055,
"train_speed(iter/s)": 0.056724
},
{
"epoch": 0.2772130844575864,
"grad_norm": 1.2574733188940939,
"learning_rate": 1.9065803822532825e-05,
"loss": 1.143141269683838,
"memory(GiB)": 74.93,
"step": 375,
"token_acc": 0.6279554937413073,
"train_speed(iter/s)": 0.056779
},
{
"epoch": 0.2809092589170209,
"grad_norm": 1.2230232638304774,
"learning_rate": 1.9041134417436674e-05,
"loss": 1.1681084632873535,
"memory(GiB)": 74.93,
"step": 380,
"token_acc": 0.6278735632183908,
"train_speed(iter/s)": 0.0569
},
{
"epoch": 0.2846054333764554,
"grad_norm": 1.308574420114396,
"learning_rate": 1.9016159908212044e-05,
"loss": 1.1313629150390625,
"memory(GiB)": 74.93,
"step": 385,
"token_acc": 0.6380670611439843,
"train_speed(iter/s)": 0.056973
},
{
"epoch": 0.2883016078358899,
"grad_norm": 1.1949255351547317,
"learning_rate": 1.899088113765426e-05,
"loss": 1.1681228637695313,
"memory(GiB)": 74.93,
"step": 390,
"token_acc": 0.6130952380952381,
"train_speed(iter/s)": 0.057013
},
{
"epoch": 0.29199778229532436,
"grad_norm": 1.1669478994026365,
"learning_rate": 1.896529895882633e-05,
"loss": 1.1387041091918946,
"memory(GiB)": 74.93,
"step": 395,
"token_acc": 0.6152671755725191,
"train_speed(iter/s)": 0.05713
},
{
"epoch": 0.29569395675475885,
"grad_norm": 1.197646998798649,
"learning_rate": 1.8939414235030137e-05,
"loss": 1.1374378204345703,
"memory(GiB)": 74.93,
"step": 400,
"token_acc": 0.6037667511771098,
"train_speed(iter/s)": 0.057204
},
{
"epoch": 0.29569395675475885,
"eval_loss": 0.7541109323501587,
"eval_runtime": 86.4511,
"eval_samples_per_second": 80.913,
"eval_steps_per_second": 0.636,
"eval_token_acc": 0.613960988740803,
"step": 400
},
{
"epoch": 0.29939013121419333,
"grad_norm": 1.3110127948907355,
"learning_rate": 1.8913227839777305e-05,
"loss": 1.1630861282348632,
"memory(GiB)": 74.93,
"step": 405,
"token_acc": 0.6250439264378588,
"train_speed(iter/s)": 0.056415
},
{
"epoch": 0.3030863056736278,
"grad_norm": 1.2066723455387354,
"learning_rate": 1.8886740656759755e-05,
"loss": 1.1657712936401368,
"memory(GiB)": 74.93,
"step": 410,
"token_acc": 0.6286093594424162,
"train_speed(iter/s)": 0.056469
},
{
"epoch": 0.3067824801330623,
"grad_norm": 1.214334257623402,
"learning_rate": 1.8859953579819833e-05,
"loss": 1.129319953918457,
"memory(GiB)": 74.93,
"step": 415,
"token_acc": 0.5934997644842205,
"train_speed(iter/s)": 0.056572
},
{
"epoch": 0.31047865459249674,
"grad_norm": 1.260998092054749,
"learning_rate": 1.883286751292018e-05,
"loss": 1.125650119781494,
"memory(GiB)": 74.93,
"step": 420,
"token_acc": 0.6005237125400058,
"train_speed(iter/s)": 0.056666
},
{
"epoch": 0.3141748290519312,
"grad_norm": 1.1445762169453673,
"learning_rate": 1.880548337011323e-05,
"loss": 1.1848130226135254,
"memory(GiB)": 74.93,
"step": 425,
"token_acc": 0.5819639278557114,
"train_speed(iter/s)": 0.05671
},
{
"epoch": 0.3178710035113657,
"grad_norm": 1.2219231261580983,
"learning_rate": 1.8777802075510338e-05,
"loss": 1.1647357940673828,
"memory(GiB)": 74.93,
"step": 430,
"token_acc": 0.6077451592754528,
"train_speed(iter/s)": 0.056776
},
{
"epoch": 0.3215671779708002,
"grad_norm": 1.1956915969147472,
"learning_rate": 1.8749824563250615e-05,
"loss": 1.1394176483154297,
"memory(GiB)": 74.93,
"step": 435,
"token_acc": 0.6606451612903226,
"train_speed(iter/s)": 0.056853
},
{
"epoch": 0.3252633524302347,
"grad_norm": 1.3354423066052745,
"learning_rate": 1.8721551777469397e-05,
"loss": 1.152536964416504,
"memory(GiB)": 74.93,
"step": 440,
"token_acc": 0.5991432068543452,
"train_speed(iter/s)": 0.056906
},
{
"epoch": 0.3289595268896692,
"grad_norm": 1.2562915522841382,
"learning_rate": 1.869298467226639e-05,
"loss": 1.1220308303833009,
"memory(GiB)": 74.93,
"step": 445,
"token_acc": 0.6066666666666667,
"train_speed(iter/s)": 0.056963
},
{
"epoch": 0.33265570134910366,
"grad_norm": 1.359582068477731,
"learning_rate": 1.8664124211673468e-05,
"loss": 1.1504764556884766,
"memory(GiB)": 74.93,
"step": 450,
"token_acc": 0.5973016235993597,
"train_speed(iter/s)": 0.057049
},
{
"epoch": 0.33265570134910366,
"eval_loss": 0.7460736632347107,
"eval_runtime": 88.8045,
"eval_samples_per_second": 78.769,
"eval_steps_per_second": 0.619,
"eval_token_acc": 0.6144388755116506,
"step": 450
},
{
"epoch": 0.33635187580853815,
"grad_norm": 1.218975457419414,
"learning_rate": 1.863497136962213e-05,
"loss": 1.1313959121704102,
"memory(GiB)": 74.93,
"step": 455,
"token_acc": 0.6262968874700718,
"train_speed(iter/s)": 0.056354
},
{
"epoch": 0.34004805026797263,
"grad_norm": 1.4342194151464063,
"learning_rate": 1.8605527129910663e-05,
"loss": 1.1549379348754882,
"memory(GiB)": 74.93,
"step": 460,
"token_acc": 0.6472244569589702,
"train_speed(iter/s)": 0.056414
},
{
"epoch": 0.3437442247274071,
"grad_norm": 1.440358796861357,
"learning_rate": 1.857579248617091e-05,
"loss": 1.129042625427246,
"memory(GiB)": 74.93,
"step": 465,
"token_acc": 0.6356026785714286,
"train_speed(iter/s)": 0.05648
},
{
"epoch": 0.3474403991868416,
"grad_norm": 1.2091541968931232,
"learning_rate": 1.854576844183476e-05,
"loss": 1.1230792999267578,
"memory(GiB)": 74.93,
"step": 470,
"token_acc": 0.6001645413410119,
"train_speed(iter/s)": 0.056566
},
{
"epoch": 0.3511365736462761,
"grad_norm": 1.212497545728028,
"learning_rate": 1.8515456010100274e-05,
"loss": 1.1627266883850098,
"memory(GiB)": 74.93,
"step": 475,
"token_acc": 0.6375609756097561,
"train_speed(iter/s)": 0.056633
},
{
"epoch": 0.3548327481057106,
"grad_norm": 1.257170599310577,
"learning_rate": 1.8484856213897496e-05,
"loss": 1.1552623748779296,
"memory(GiB)": 74.93,
"step": 480,
"token_acc": 0.6367495451788963,
"train_speed(iter/s)": 0.056696
},
{
"epoch": 0.35852892256514507,
"grad_norm": 1.3061990827470522,
"learning_rate": 1.8453970085853953e-05,
"loss": 1.1611719131469727,
"memory(GiB)": 74.93,
"step": 485,
"token_acc": 0.5953002610966057,
"train_speed(iter/s)": 0.056777
},
{
"epoch": 0.36222509702457956,
"grad_norm": 1.2132042758068045,
"learning_rate": 1.842279866825976e-05,
"loss": 1.1605472564697266,
"memory(GiB)": 74.93,
"step": 490,
"token_acc": 0.6365507776761208,
"train_speed(iter/s)": 0.056851
},
{
"epoch": 0.36592127148401404,
"grad_norm": 1.2900699412115835,
"learning_rate": 1.8391343013032505e-05,
"loss": 1.1752688407897949,
"memory(GiB)": 74.93,
"step": 495,
"token_acc": 0.6413404114134041,
"train_speed(iter/s)": 0.056898
},
{
"epoch": 0.36961744594344853,
"grad_norm": 1.115509938975403,
"learning_rate": 1.8359604181681703e-05,
"loss": 1.1677565574645996,
"memory(GiB)": 74.93,
"step": 500,
"token_acc": 0.635439360929557,
"train_speed(iter/s)": 0.056967
},
{
"epoch": 0.36961744594344853,
"eval_loss": 0.7416162490844727,
"eval_runtime": 87.8438,
"eval_samples_per_second": 79.63,
"eval_steps_per_second": 0.626,
"eval_token_acc": 0.615732863603728,
"step": 500
},
{
"epoch": 0.373313620402883,
"grad_norm": 1.1864096727600855,
"learning_rate": 1.8327583245273004e-05,
"loss": 1.120311164855957,
"memory(GiB)": 74.93,
"step": 505,
"token_acc": 0.6247582205029013,
"train_speed(iter/s)": 0.056337
},
{
"epoch": 0.3770097948623175,
"grad_norm": 1.1558943975448084,
"learning_rate": 1.8295281284392036e-05,
"loss": 1.167508888244629,
"memory(GiB)": 74.93,
"step": 510,
"token_acc": 0.5796680497925312,
"train_speed(iter/s)": 0.056408
},
{
"epoch": 0.380705969321752,
"grad_norm": 1.2905670874481943,
"learning_rate": 1.8262699389107933e-05,
"loss": 1.15736083984375,
"memory(GiB)": 74.93,
"step": 515,
"token_acc": 0.6157240272763739,
"train_speed(iter/s)": 0.056454
},
{
"epoch": 0.3844021437811865,
"grad_norm": 1.2748847596057926,
"learning_rate": 1.8229838658936566e-05,
"loss": 1.1492805480957031,
"memory(GiB)": 74.93,
"step": 520,
"token_acc": 0.6105889724310777,
"train_speed(iter/s)": 0.056519
},
{
"epoch": 0.38809831824062097,
"grad_norm": 1.1876718707954161,
"learning_rate": 1.819670020280343e-05,
"loss": 1.1467121124267579,
"memory(GiB)": 74.93,
"step": 525,
"token_acc": 0.6113826815642458,
"train_speed(iter/s)": 0.056588
},
{
"epoch": 0.39179449270005545,
"grad_norm": 1.2841584592867252,
"learning_rate": 1.816328513900622e-05,
"loss": 1.1653972625732423,
"memory(GiB)": 74.93,
"step": 530,
"token_acc": 0.6273197444478248,
"train_speed(iter/s)": 0.056639
},
{
"epoch": 0.39549066715948994,
"grad_norm": 1.243754331563731,
"learning_rate": 1.8129594595177093e-05,
"loss": 1.154591178894043,
"memory(GiB)": 74.93,
"step": 535,
"token_acc": 0.5926477893691009,
"train_speed(iter/s)": 0.056695
},
{
"epoch": 0.3991868416189244,
"grad_norm": 1.3245067788741383,
"learning_rate": 1.809562970824462e-05,
"loss": 1.157964324951172,
"memory(GiB)": 74.93,
"step": 540,
"token_acc": 0.6192792394428477,
"train_speed(iter/s)": 0.056758
},
{
"epoch": 0.4028830160783589,
"grad_norm": 1.3057962329498682,
"learning_rate": 1.806139162439541e-05,
"loss": 1.1371761322021485,
"memory(GiB)": 74.93,
"step": 545,
"token_acc": 0.596340150699677,
"train_speed(iter/s)": 0.056815
},
{
"epoch": 0.4065791905377934,
"grad_norm": 1.25005365154622,
"learning_rate": 1.8026881499035437e-05,
"loss": 1.1124300956726074,
"memory(GiB)": 74.93,
"step": 550,
"token_acc": 0.6204881402543829,
"train_speed(iter/s)": 0.056864
},
{
"epoch": 0.4065791905377934,
"eval_loss": 0.7460726499557495,
"eval_runtime": 88.6273,
"eval_samples_per_second": 78.926,
"eval_steps_per_second": 0.621,
"eval_token_acc": 0.6162869352221019,
"step": 550
},
{
"epoch": 0.4102753649972279,
"grad_norm": 1.1926510177467409,
"learning_rate": 1.7992100496751054e-05,
"loss": 1.1571131706237794,
"memory(GiB)": 74.93,
"step": 555,
"token_acc": 0.6311389759665622,
"train_speed(iter/s)": 0.056299
},
{
"epoch": 0.4139715394566624,
"grad_norm": 1.1989503074947894,
"learning_rate": 1.7957049791269684e-05,
"loss": 1.1516962051391602,
"memory(GiB)": 74.93,
"step": 560,
"token_acc": 0.5952788231269244,
"train_speed(iter/s)": 0.056369
},
{
"epoch": 0.41766771391609686,
"grad_norm": 1.1212233051313498,
"learning_rate": 1.792173056542021e-05,
"loss": 1.1592437744140625,
"memory(GiB)": 74.93,
"step": 565,
"token_acc": 0.5976621417797888,
"train_speed(iter/s)": 0.056413
},
{
"epoch": 0.42136388837553135,
"grad_norm": 1.1553604640842632,
"learning_rate": 1.7886144011093067e-05,
"loss": 1.1524188041687011,
"memory(GiB)": 74.93,
"step": 570,
"token_acc": 0.6424742268041237,
"train_speed(iter/s)": 0.056462
},
{
"epoch": 0.42506006283496583,
"grad_norm": 1.183725532275657,
"learning_rate": 1.7850291329200015e-05,
"loss": 1.1416030883789063,
"memory(GiB)": 74.93,
"step": 575,
"token_acc": 0.6029700196133371,
"train_speed(iter/s)": 0.056533
},
{
"epoch": 0.4287562372944003,
"grad_norm": 1.2480769087109442,
"learning_rate": 1.7814173729633607e-05,
"loss": 1.164370059967041,
"memory(GiB)": 74.93,
"step": 580,
"token_acc": 0.6192486281131279,
"train_speed(iter/s)": 0.056588
},
{
"epoch": 0.43245241175383475,
"grad_norm": 1.3104680757325256,
"learning_rate": 1.7777792431226384e-05,
"loss": 1.119395637512207,
"memory(GiB)": 74.93,
"step": 585,
"token_acc": 0.6305528922978587,
"train_speed(iter/s)": 0.056638
},
{
"epoch": 0.43614858621326924,
"grad_norm": 1.213929814999547,
"learning_rate": 1.7741148661709707e-05,
"loss": 1.1547592163085938,
"memory(GiB)": 74.93,
"step": 590,
"token_acc": 0.6233905579399142,
"train_speed(iter/s)": 0.056711
},
{
"epoch": 0.4398447606727037,
"grad_norm": 1.2155093557171206,
"learning_rate": 1.770424365767236e-05,
"loss": 1.1199445724487305,
"memory(GiB)": 74.93,
"step": 595,
"token_acc": 0.6336528221512248,
"train_speed(iter/s)": 0.056773
},
{
"epoch": 0.4435409351321382,
"grad_norm": 1.3908702173841363,
"learning_rate": 1.7667078664518796e-05,
"loss": 1.157416534423828,
"memory(GiB)": 74.93,
"step": 600,
"token_acc": 0.6181159420289855,
"train_speed(iter/s)": 0.056815
},
{
"epoch": 0.4435409351321382,
"eval_loss": 0.7338850498199463,
"eval_runtime": 85.3003,
"eval_samples_per_second": 82.004,
"eval_steps_per_second": 0.645,
"eval_token_acc": 0.6175324420475716,
"step": 600
},
{
"epoch": 0.4472371095915727,
"grad_norm": 1.022281205691788,
"learning_rate": 1.7629654936427126e-05,
"loss": 1.1211700439453125,
"memory(GiB)": 74.93,
"step": 605,
"token_acc": 0.6267794070427057,
"train_speed(iter/s)": 0.056289
},
{
"epoch": 0.4509332840510072,
"grad_norm": 1.1115715050120814,
"learning_rate": 1.7591973736306774e-05,
"loss": 1.1568084716796876,
"memory(GiB)": 74.93,
"step": 610,
"token_acc": 0.6001278227524499,
"train_speed(iter/s)": 0.056358
},
{
"epoch": 0.4546294585104417,
"grad_norm": 1.2942894072539404,
"learning_rate": 1.755403633575589e-05,
"loss": 1.1330131530761718,
"memory(GiB)": 74.93,
"step": 615,
"token_acc": 0.6048237476808905,
"train_speed(iter/s)": 0.056424
},
{
"epoch": 0.45832563296987616,
"grad_norm": 1.2115375753993367,
"learning_rate": 1.7515844015018416e-05,
"loss": 1.1604066848754884,
"memory(GiB)": 74.93,
"step": 620,
"token_acc": 0.6332541567695962,
"train_speed(iter/s)": 0.05648
},
{
"epoch": 0.46202180742931065,
"grad_norm": 1.1168616761395809,
"learning_rate": 1.7477398062940868e-05,
"loss": 1.1492230415344238,
"memory(GiB)": 74.93,
"step": 625,
"token_acc": 0.6326703343207787,
"train_speed(iter/s)": 0.056541
},
{
"epoch": 0.46571798188874514,
"grad_norm": 1.3080238975825687,
"learning_rate": 1.7438699776928892e-05,
"loss": 1.159599494934082,
"memory(GiB)": 74.93,
"step": 630,
"token_acc": 0.5911352329262777,
"train_speed(iter/s)": 0.056603
},
{
"epoch": 0.4694141563481796,
"grad_norm": 1.270157306289422,
"learning_rate": 1.739975046290343e-05,
"loss": 1.1172502517700196,
"memory(GiB)": 74.93,
"step": 635,
"token_acc": 0.6800878477306003,
"train_speed(iter/s)": 0.05664
},
{
"epoch": 0.4731103308076141,
"grad_norm": 1.1591581275323428,
"learning_rate": 1.7360551435256673e-05,
"loss": 1.1474403381347655,
"memory(GiB)": 74.93,
"step": 640,
"token_acc": 0.6703857188396557,
"train_speed(iter/s)": 0.056691
},
{
"epoch": 0.4768065052670486,
"grad_norm": 1.3849471969434006,
"learning_rate": 1.7321104016807716e-05,
"loss": 1.1200141906738281,
"memory(GiB)": 74.93,
"step": 645,
"token_acc": 0.6204099060631939,
"train_speed(iter/s)": 0.056741
},
{
"epoch": 0.4805026797264831,
"grad_norm": 1.2181008696775872,
"learning_rate": 1.7281409538757886e-05,
"loss": 1.1367115020751952,
"memory(GiB)": 74.93,
"step": 650,
"token_acc": 0.6141581632653061,
"train_speed(iter/s)": 0.056787
},
{
"epoch": 0.4805026797264831,
"eval_loss": 0.7338098287582397,
"eval_runtime": 86.3351,
"eval_samples_per_second": 81.022,
"eval_steps_per_second": 0.637,
"eval_token_acc": 0.618567863384408,
"step": 650
},
{
"epoch": 0.48419885418591757,
"grad_norm": 1.2381127917004506,
"learning_rate": 1.7241469340645856e-05,
"loss": 1.1498327255249023,
"memory(GiB)": 74.93,
"step": 655,
"token_acc": 0.6240238388820386,
"train_speed(iter/s)": 0.056305
},
{
"epoch": 0.48789502864535206,
"grad_norm": 1.3545670040018443,
"learning_rate": 1.720128477030241e-05,
"loss": 1.123112392425537,
"memory(GiB)": 74.93,
"step": 660,
"token_acc": 0.6101917520357236,
"train_speed(iter/s)": 0.05635
},
{
"epoch": 0.49159120310478654,
"grad_norm": 1.2698188744774948,
"learning_rate": 1.716085718380498e-05,
"loss": 1.1386995315551758,
"memory(GiB)": 74.93,
"step": 665,
"token_acc": 0.6005629477993859,
"train_speed(iter/s)": 0.056398
},
{
"epoch": 0.49528737756422103,
"grad_norm": 1.4609798611237281,
"learning_rate": 1.7120187945431874e-05,
"loss": 1.1037940979003906,
"memory(GiB)": 74.93,
"step": 670,
"token_acc": 0.6407727085902178,
"train_speed(iter/s)": 0.056444
},
{
"epoch": 0.4989835520236555,
"grad_norm": 1.1805190661164426,
"learning_rate": 1.707927842761623e-05,
"loss": 1.1232402801513672,
"memory(GiB)": 74.93,
"step": 675,
"token_acc": 0.5811437403400309,
"train_speed(iter/s)": 0.05646
},
{
"epoch": 0.50267972648309,
"grad_norm": 1.1558010845800675,
"learning_rate": 1.7038130010899716e-05,
"loss": 1.1340635299682618,
"memory(GiB)": 74.93,
"step": 680,
"token_acc": 0.6523545706371191,
"train_speed(iter/s)": 0.056504
},
{
"epoch": 0.5063759009425245,
"grad_norm": 1.1790896957784056,
"learning_rate": 1.6996744083885938e-05,
"loss": 1.1378223419189453,
"memory(GiB)": 74.93,
"step": 685,
"token_acc": 0.6573009791400596,
"train_speed(iter/s)": 0.056546
},
{
"epoch": 0.510072075401959,
"grad_norm": 1.2335317128319008,
"learning_rate": 1.695512204319357e-05,
"loss": 1.1394284248352051,
"memory(GiB)": 74.93,
"step": 690,
"token_acc": 0.6082870568133276,
"train_speed(iter/s)": 0.056586
},
{
"epoch": 0.5137682498613935,
"grad_norm": 0.9893255166681467,
"learning_rate": 1.6913265293409235e-05,
"loss": 1.1198680877685547,
"memory(GiB)": 74.93,
"step": 695,
"token_acc": 0.547270955165692,
"train_speed(iter/s)": 0.05664
},
{
"epoch": 0.517464424320828,
"grad_norm": 1.1351076610632471,
"learning_rate": 1.68711752470401e-05,
"loss": 1.1366339683532716,
"memory(GiB)": 74.93,
"step": 700,
"token_acc": 0.6295369211514393,
"train_speed(iter/s)": 0.056675
},
{
"epoch": 0.517464424320828,
"eval_loss": 0.7255228757858276,
"eval_runtime": 89.5144,
"eval_samples_per_second": 78.144,
"eval_steps_per_second": 0.614,
"eval_token_acc": 0.6190699907885594,
"step": 700
},
{
"epoch": 0.5211605987802624,
"grad_norm": 1.0862208515121348,
"learning_rate": 1.682885332446621e-05,
"loss": 1.1369894981384276,
"memory(GiB)": 74.93,
"step": 705,
"token_acc": 0.6288204532248692,
"train_speed(iter/s)": 0.056212
},
{
"epoch": 0.5248567732396969,
"grad_norm": 1.1660653361907225,
"learning_rate": 1.6786300953892563e-05,
"loss": 1.1410274505615234,
"memory(GiB)": 74.93,
"step": 710,
"token_acc": 0.6100605143721634,
"train_speed(iter/s)": 0.056263
},
{
"epoch": 0.5285529476991314,
"grad_norm": 1.0896922974940084,
"learning_rate": 1.674351957130089e-05,
"loss": 1.1174249649047852,
"memory(GiB)": 74.93,
"step": 715,
"token_acc": 0.6420308483290489,
"train_speed(iter/s)": 0.056309
},
{
"epoch": 0.5322491221585659,
"grad_norm": 1.152348085956414,
"learning_rate": 1.6700510620401223e-05,
"loss": 1.1088247299194336,
"memory(GiB)": 74.93,
"step": 720,
"token_acc": 0.6403995560488346,
"train_speed(iter/s)": 0.056355
},
{
"epoch": 0.5359452966180004,
"grad_norm": 1.1236142627513106,
"learning_rate": 1.6657275552583172e-05,
"loss": 1.137843418121338,
"memory(GiB)": 74.93,
"step": 725,
"token_acc": 0.5981665393430099,
"train_speed(iter/s)": 0.056406
},
{
"epoch": 0.5396414710774349,
"grad_norm": 1.0869362324396392,
"learning_rate": 1.6613815826866923e-05,
"loss": 1.1183334350585938,
"memory(GiB)": 74.93,
"step": 730,
"token_acc": 0.6076433121019108,
"train_speed(iter/s)": 0.056454
},
{
"epoch": 0.5433376455368694,
"grad_norm": 1.0408539682832916,
"learning_rate": 1.6570132909854027e-05,
"loss": 1.1498143196105957,
"memory(GiB)": 74.93,
"step": 735,
"token_acc": 0.6524312896405919,
"train_speed(iter/s)": 0.0565
},
{
"epoch": 0.5470338199963038,
"grad_norm": 1.223295875198057,
"learning_rate": 1.6526228275677892e-05,
"loss": 1.091654109954834,
"memory(GiB)": 74.93,
"step": 740,
"token_acc": 0.6982872200263505,
"train_speed(iter/s)": 0.056544
},
{
"epoch": 0.5507299944557383,
"grad_norm": 1.1558442201312176,
"learning_rate": 1.6482103405954056e-05,
"loss": 1.1205904006958007,
"memory(GiB)": 74.93,
"step": 745,
"token_acc": 0.6377204884667571,
"train_speed(iter/s)": 0.056579
},
{
"epoch": 0.5544261689151728,
"grad_norm": 1.2784643735837162,
"learning_rate": 1.6437759789730154e-05,
"loss": 1.1237329483032226,
"memory(GiB)": 74.93,
"step": 750,
"token_acc": 0.6141374837872893,
"train_speed(iter/s)": 0.056631
},
{
"epoch": 0.5544261689151728,
"eval_loss": 0.7271792888641357,
"eval_runtime": 87.6966,
"eval_samples_per_second": 79.764,
"eval_steps_per_second": 0.627,
"eval_token_acc": 0.6196194451434468,
"step": 750
},
{
"epoch": 0.5581223433746073,
"grad_norm": 1.2055849293387977,
"learning_rate": 1.6393198923435707e-05,
"loss": 1.1234511375427245,
"memory(GiB)": 74.93,
"step": 755,
"token_acc": 0.6244901356863398,
"train_speed(iter/s)": 0.056217
},
{
"epoch": 0.5618185178340418,
"grad_norm": 1.1362509527796705,
"learning_rate": 1.63484223108316e-05,
"loss": 1.125691795349121,
"memory(GiB)": 74.93,
"step": 760,
"token_acc": 0.6037473976405274,
"train_speed(iter/s)": 0.05626
},
{
"epoch": 0.5655146922934763,
"grad_norm": 1.123275540757232,
"learning_rate": 1.6303431462959327e-05,
"loss": 1.1341413497924804,
"memory(GiB)": 74.93,
"step": 765,
"token_acc": 0.6085106382978723,
"train_speed(iter/s)": 0.056308
},
{
"epoch": 0.5692108667529108,
"grad_norm": 1.015989051360902,
"learning_rate": 1.6258227898090037e-05,
"loss": 1.1203922271728515,
"memory(GiB)": 74.93,
"step": 770,
"token_acc": 0.601472134595163,
"train_speed(iter/s)": 0.056355
},
{
"epoch": 0.5729070412123453,
"grad_norm": 1.189393051036189,
"learning_rate": 1.6212813141673254e-05,
"loss": 1.1124958038330077,
"memory(GiB)": 74.93,
"step": 775,
"token_acc": 0.6260790549750114,
"train_speed(iter/s)": 0.056399
},
{
"epoch": 0.5766032156717797,
"grad_norm": 1.1850051513280322,
"learning_rate": 1.6167188726285433e-05,
"loss": 1.114617919921875,
"memory(GiB)": 74.93,
"step": 780,
"token_acc": 0.5942992874109264,
"train_speed(iter/s)": 0.056434
},
{
"epoch": 0.5802993901312142,
"grad_norm": 1.0681729567626044,
"learning_rate": 1.6121356191578213e-05,
"loss": 1.1280495643615722,
"memory(GiB)": 74.93,
"step": 785,
"token_acc": 0.705685618729097,
"train_speed(iter/s)": 0.056481
},
{
"epoch": 0.5839955645906487,
"grad_norm": 1.2860183936318812,
"learning_rate": 1.607531708422649e-05,
"loss": 1.1495230674743653,
"memory(GiB)": 74.93,
"step": 790,
"token_acc": 0.5793650793650794,
"train_speed(iter/s)": 0.056516
},
{
"epoch": 0.5876917390500832,
"grad_norm": 1.0862282113312,
"learning_rate": 1.6029072957876196e-05,
"loss": 1.1175559997558593,
"memory(GiB)": 74.93,
"step": 795,
"token_acc": 0.6226415094339622,
"train_speed(iter/s)": 0.056552
},
{
"epoch": 0.5913879135095177,
"grad_norm": 1.1331799452220792,
"learning_rate": 1.5982625373091877e-05,
"loss": 1.0859192848205566,
"memory(GiB)": 74.93,
"step": 800,
"token_acc": 0.597226235192141,
"train_speed(iter/s)": 0.056592
},
{
"epoch": 0.5913879135095177,
"eval_loss": 0.7157755494117737,
"eval_runtime": 88.6481,
"eval_samples_per_second": 78.907,
"eval_steps_per_second": 0.62,
"eval_token_acc": 0.6206202370041347,
"step": 800
},
{
"epoch": 0.5950840879689522,
"grad_norm": 1.108802407981979,
"learning_rate": 1.593597589730404e-05,
"loss": 1.147084617614746,
"memory(GiB)": 74.93,
"step": 805,
"token_acc": 0.6168687401159726,
"train_speed(iter/s)": 0.056208
},
{
"epoch": 0.5987802624283867,
"grad_norm": 0.9423602415844418,
"learning_rate": 1.5889126104756245e-05,
"loss": 1.1448484420776368,
"memory(GiB)": 74.93,
"step": 810,
"token_acc": 0.5890688259109311,
"train_speed(iter/s)": 0.056247
},
{
"epoch": 0.6024764368878212,
"grad_norm": 1.0816637490179923,
"learning_rate": 1.5842077576451988e-05,
"loss": 1.1083642959594726,
"memory(GiB)": 74.93,
"step": 815,
"token_acc": 0.6413487738419619,
"train_speed(iter/s)": 0.056285
},
{
"epoch": 0.6061726113472556,
"grad_norm": 1.135732608334688,
"learning_rate": 1.5794831900101352e-05,
"loss": 1.1130756378173827,
"memory(GiB)": 74.93,
"step": 820,
"token_acc": 0.620497803806735,
"train_speed(iter/s)": 0.056338
},
{
"epoch": 0.6098687858066901,
"grad_norm": 1.0156136928889437,
"learning_rate": 1.5747390670067412e-05,
"loss": 1.1423524856567382,
"memory(GiB)": 74.93,
"step": 825,
"token_acc": 0.6086384564788424,
"train_speed(iter/s)": 0.056378
},
{
"epoch": 0.6135649602661246,
"grad_norm": 1.233089498837372,
"learning_rate": 1.5699755487312446e-05,
"loss": 1.1060791969299317,
"memory(GiB)": 74.93,
"step": 830,
"token_acc": 0.6365546218487395,
"train_speed(iter/s)": 0.056416
},
{
"epoch": 0.6172611347255591,
"grad_norm": 1.1731325122439864,
"learning_rate": 1.56519279593439e-05,
"loss": 1.0863089561462402,
"memory(GiB)": 74.93,
"step": 835,
"token_acc": 0.6160830090791181,
"train_speed(iter/s)": 0.056451
},
{
"epoch": 0.6209573091849935,
"grad_norm": 1.1022360374731142,
"learning_rate": 1.560390970016015e-05,
"loss": 1.1188045501708985,
"memory(GiB)": 74.93,
"step": 840,
"token_acc": 0.5851091817942646,
"train_speed(iter/s)": 0.05649
},
{
"epoch": 0.624653483644428,
"grad_norm": 1.1163862966216507,
"learning_rate": 1.5555702330196024e-05,
"loss": 1.1088319778442384,
"memory(GiB)": 74.93,
"step": 845,
"token_acc": 0.6556741028128031,
"train_speed(iter/s)": 0.056533
},
{
"epoch": 0.6283496581038625,
"grad_norm": 1.1694067702393547,
"learning_rate": 1.5507307476268126e-05,
"loss": 1.1475400924682617,
"memory(GiB)": 74.93,
"step": 850,
"token_acc": 0.6055389221556886,
"train_speed(iter/s)": 0.056569
},
{
"epoch": 0.6283496581038625,
"eval_loss": 0.7119885683059692,
"eval_runtime": 87.1877,
"eval_samples_per_second": 80.229,
"eval_steps_per_second": 0.631,
"eval_token_acc": 0.621244721890677,
"step": 850
},
{
"epoch": 0.6320458325632969,
"grad_norm": 1.1865540340685679,
"learning_rate": 1.5458726771519946e-05,
"loss": 1.135090446472168,
"memory(GiB)": 74.93,
"step": 855,
"token_acc": 0.6295323704676296,
"train_speed(iter/s)": 0.056205
},
{
"epoch": 0.6357420070227314,
"grad_norm": 0.9908463678598523,
"learning_rate": 1.5409961855366718e-05,
"loss": 1.110205078125,
"memory(GiB)": 74.93,
"step": 860,
"token_acc": 0.6002865329512894,
"train_speed(iter/s)": 0.056248
},
{
"epoch": 0.6394381814821659,
"grad_norm": 1.1394579815051238,
"learning_rate": 1.5361014373440125e-05,
"loss": 1.131001091003418,
"memory(GiB)": 74.93,
"step": 865,
"token_acc": 0.6846254927726675,
"train_speed(iter/s)": 0.056284
},
{
"epoch": 0.6431343559416004,
"grad_norm": 1.2277455515675866,
"learning_rate": 1.5311885977532756e-05,
"loss": 1.1217898368835448,
"memory(GiB)": 74.93,
"step": 870,
"token_acc": 0.5979188900747066,
"train_speed(iter/s)": 0.056322
},
{
"epoch": 0.6468305304010349,
"grad_norm": 1.163464153725413,
"learning_rate": 1.5262578325542366e-05,
"loss": 1.096768569946289,
"memory(GiB)": 74.93,
"step": 875,
"token_acc": 0.6008762322015334,
"train_speed(iter/s)": 0.056371
},
{
"epoch": 0.6505267048604694,
"grad_norm": 1.0920480508914876,
"learning_rate": 1.521309308141592e-05,
"loss": 1.1257577896118165,
"memory(GiB)": 74.93,
"step": 880,
"token_acc": 0.6577503429355281,
"train_speed(iter/s)": 0.056412
},
{
"epoch": 0.6542228793199039,
"grad_norm": 1.1338180174479229,
"learning_rate": 1.5163431915093443e-05,
"loss": 1.1262746810913087,
"memory(GiB)": 74.93,
"step": 885,
"token_acc": 0.6306549628629304,
"train_speed(iter/s)": 0.056447
},
{
"epoch": 0.6579190537793383,
"grad_norm": 1.295043254051827,
"learning_rate": 1.511359650245168e-05,
"loss": 1.1621430397033692,
"memory(GiB)": 74.93,
"step": 890,
"token_acc": 0.6065481230595541,
"train_speed(iter/s)": 0.056485
},
{
"epoch": 0.6616152282387728,
"grad_norm": 1.1985531473315896,
"learning_rate": 1.506358852524752e-05,
"loss": 1.1280719757080078,
"memory(GiB)": 74.93,
"step": 895,
"token_acc": 0.6419322709163346,
"train_speed(iter/s)": 0.056523
},
{
"epoch": 0.6653114026982073,
"grad_norm": 1.0909942367098966,
"learning_rate": 1.5013409671061267e-05,
"loss": 1.125238800048828,
"memory(GiB)": 74.93,
"step": 900,
"token_acc": 0.599232245681382,
"train_speed(iter/s)": 0.056559
},
{
"epoch": 0.6653114026982073,
"eval_loss": 0.7135615348815918,
"eval_runtime": 87.1706,
"eval_samples_per_second": 80.245,
"eval_steps_per_second": 0.631,
"eval_token_acc": 0.6218034107725374,
"step": 900
},
{
"epoch": 0.6690075771576418,
"grad_norm": 1.1857146226848603,
"learning_rate": 1.4963061633239665e-05,
"loss": 1.1094846725463867,
"memory(GiB)": 74.93,
"step": 905,
"token_acc": 0.6268454980245374,
"train_speed(iter/s)": 0.056203
},
{
"epoch": 0.6727037516170763,
"grad_norm": 0.9662742881806529,
"learning_rate": 1.4912546110838775e-05,
"loss": 1.1187602996826171,
"memory(GiB)": 74.93,
"step": 910,
"token_acc": 0.6091391268869849,
"train_speed(iter/s)": 0.056241
},
{
"epoch": 0.6763999260765108,
"grad_norm": 1.0584302453369157,
"learning_rate": 1.4861864808566624e-05,
"loss": 1.101078701019287,
"memory(GiB)": 74.93,
"step": 915,
"token_acc": 0.5681592039800994,
"train_speed(iter/s)": 0.056284
},
{
"epoch": 0.6800961005359453,
"grad_norm": 1.1605002634031412,
"learning_rate": 1.4811019436725684e-05,
"loss": 1.146175003051758,
"memory(GiB)": 74.93,
"step": 920,
"token_acc": 0.63498674744415,
"train_speed(iter/s)": 0.056321
},
{
"epoch": 0.6837922749953798,
"grad_norm": 1.0137203677446553,
"learning_rate": 1.4760011711155164e-05,
"loss": 1.1349545478820802,
"memory(GiB)": 74.93,
"step": 925,
"token_acc": 0.6199203187250996,
"train_speed(iter/s)": 0.056361
},
{
"epoch": 0.6874884494548142,
"grad_norm": 1.183534701619676,
"learning_rate": 1.4708843353173084e-05,
"loss": 1.0977567672729491,
"memory(GiB)": 74.93,
"step": 930,
"token_acc": 0.6462346760070052,
"train_speed(iter/s)": 0.056403
},
{
"epoch": 0.6911846239142487,
"grad_norm": 1.1575204207505418,
"learning_rate": 1.4657516089518211e-05,
"loss": 1.1138565063476562,
"memory(GiB)": 74.93,
"step": 935,
"token_acc": 0.6146223888591323,
"train_speed(iter/s)": 0.056436
},
{
"epoch": 0.6948807983736832,
"grad_norm": 1.1418054839263487,
"learning_rate": 1.4606031652291772e-05,
"loss": 1.1173955917358398,
"memory(GiB)": 74.93,
"step": 940,
"token_acc": 0.6329457364341086,
"train_speed(iter/s)": 0.056463
},
{
"epoch": 0.6985769728331177,
"grad_norm": 1.0817591968148002,
"learning_rate": 1.4554391778899016e-05,
"loss": 1.0996898651123046,
"memory(GiB)": 74.93,
"step": 945,
"token_acc": 0.6234177215189873,
"train_speed(iter/s)": 0.056501
},
{
"epoch": 0.7022731472925522,
"grad_norm": 1.072385635877129,
"learning_rate": 1.4502598211990566e-05,
"loss": 1.1042339324951171,
"memory(GiB)": 74.93,
"step": 950,
"token_acc": 0.6252068394925537,
"train_speed(iter/s)": 0.056535
},
{
"epoch": 0.7022731472925522,
"eval_loss": 0.7057685256004333,
"eval_runtime": 86.3988,
"eval_samples_per_second": 80.962,
"eval_steps_per_second": 0.637,
"eval_token_acc": 0.62234940217981,
"step": 950
},
{
"epoch": 0.7059693217519867,
"grad_norm": 0.9961167453619919,
"learning_rate": 1.4450652699403626e-05,
"loss": 1.1219955444335938,
"memory(GiB)": 74.93,
"step": 955,
"token_acc": 0.6370088719898606,
"train_speed(iter/s)": 0.056207
},
{
"epoch": 0.7096654962114212,
"grad_norm": 1.061517038375997,
"learning_rate": 1.4398556994102996e-05,
"loss": 1.1110521316528321,
"memory(GiB)": 74.93,
"step": 960,
"token_acc": 0.592031029619182,
"train_speed(iter/s)": 0.056234
},
{
"epoch": 0.7133616706708557,
"grad_norm": 1.0852009579100936,
"learning_rate": 1.43463128541219e-05,
"loss": 1.096040916442871,
"memory(GiB)": 74.93,
"step": 965,
"token_acc": 0.6075691411935954,
"train_speed(iter/s)": 0.056271
},
{
"epoch": 0.7170578451302901,
"grad_norm": 0.9770309231987666,
"learning_rate": 1.4293922042502688e-05,
"loss": 1.1151371002197266,
"memory(GiB)": 74.93,
"step": 970,
"token_acc": 0.6337025316455697,
"train_speed(iter/s)": 0.056306
},
{
"epoch": 0.7207540195897246,
"grad_norm": 1.1847784978202587,
"learning_rate": 1.4241386327237312e-05,
"loss": 1.1008172035217285,
"memory(GiB)": 74.93,
"step": 975,
"token_acc": 0.6730158730158731,
"train_speed(iter/s)": 0.05634
},
{
"epoch": 0.7244501940491591,
"grad_norm": 1.143052071292951,
"learning_rate": 1.4188707481207677e-05,
"loss": 1.083547878265381,
"memory(GiB)": 74.93,
"step": 980,
"token_acc": 0.6250749850029994,
"train_speed(iter/s)": 0.056381
},
{
"epoch": 0.7281463685085936,
"grad_norm": 1.0778857332369403,
"learning_rate": 1.4135887282125815e-05,
"loss": 1.1583375930786133,
"memory(GiB)": 74.93,
"step": 985,
"token_acc": 0.6521739130434783,
"train_speed(iter/s)": 0.056416
},
{
"epoch": 0.7318425429680281,
"grad_norm": 1.1338338646435362,
"learning_rate": 1.4082927512473884e-05,
"loss": 1.0937719345092773,
"memory(GiB)": 74.93,
"step": 990,
"token_acc": 0.6181945090739879,
"train_speed(iter/s)": 0.056448
},
{
"epoch": 0.7355387174274626,
"grad_norm": 1.085287732158945,
"learning_rate": 1.4029829959444023e-05,
"loss": 1.1042760848999023,
"memory(GiB)": 74.93,
"step": 995,
"token_acc": 0.600328947368421,
"train_speed(iter/s)": 0.056486
},
{
"epoch": 0.7392348918868971,
"grad_norm": 1.0122719878977164,
"learning_rate": 1.3976596414878044e-05,
"loss": 1.1351425170898437,
"memory(GiB)": 74.93,
"step": 1000,
"token_acc": 0.8054474708171206,
"train_speed(iter/s)": 0.056528
},
{
"epoch": 0.7392348918868971,
"eval_loss": 0.7091466784477234,
"eval_runtime": 87.3344,
"eval_samples_per_second": 80.094,
"eval_steps_per_second": 0.63,
"eval_token_acc": 0.622888467691853,
"step": 1000
},
{
"epoch": 0.7429310663463315,
"grad_norm": 1.1226018601296495,
"learning_rate": 1.392322867520695e-05,
"loss": 1.088837242126465,
"memory(GiB)": 74.93,
"step": 1005,
"token_acc": 0.6355053191489362,
"train_speed(iter/s)": 0.056225
},
{
"epoch": 0.746627240805766,
"grad_norm": 1.021565181098161,
"learning_rate": 1.3869728541390333e-05,
"loss": 1.1350063323974608,
"memory(GiB)": 74.93,
"step": 1010,
"token_acc": 0.6212718064153067,
"train_speed(iter/s)": 0.056258
},
{
"epoch": 0.7503234152652005,
"grad_norm": 1.2612224567220394,
"learning_rate": 1.3816097818855575e-05,
"loss": 1.1172313690185547,
"memory(GiB)": 74.93,
"step": 1015,
"token_acc": 0.5992337164750958,
"train_speed(iter/s)": 0.056287
},
{
"epoch": 0.754019589724635,
"grad_norm": 1.1387539267847184,
"learning_rate": 1.3762338317436948e-05,
"loss": 1.1132306098937987,
"memory(GiB)": 74.93,
"step": 1020,
"token_acc": 0.6117302052785923,
"train_speed(iter/s)": 0.056327
},
{
"epoch": 0.7577157641840695,
"grad_norm": 1.358536367466617,
"learning_rate": 1.3708451851314511e-05,
"loss": 1.1005128860473632,
"memory(GiB)": 74.93,
"step": 1025,
"token_acc": 0.6442417331812998,
"train_speed(iter/s)": 0.05636
},
{
"epoch": 0.761411938643504,
"grad_norm": 1.0707791903089035,
"learning_rate": 1.3654440238952913e-05,
"loss": 1.0914304733276368,
"memory(GiB)": 74.93,
"step": 1030,
"token_acc": 0.6064616582327754,
"train_speed(iter/s)": 0.056391
},
{
"epoch": 0.7651081131029385,
"grad_norm": 1.116060507051338,
"learning_rate": 1.3600305303040007e-05,
"loss": 1.1009283065795898,
"memory(GiB)": 74.93,
"step": 1035,
"token_acc": 0.6307870370370371,
"train_speed(iter/s)": 0.056425
},
{
"epoch": 0.768804287562373,
"grad_norm": 1.1278348104888696,
"learning_rate": 1.3546048870425356e-05,
"loss": 1.1028734207153321,
"memory(GiB)": 74.93,
"step": 1040,
"token_acc": 0.5868608195055875,
"train_speed(iter/s)": 0.056459
},
{
"epoch": 0.7725004620218074,
"grad_norm": 1.1153722062693998,
"learning_rate": 1.349167277205858e-05,
"loss": 1.124934768676758,
"memory(GiB)": 74.93,
"step": 1045,
"token_acc": 0.6122199592668024,
"train_speed(iter/s)": 0.056492
},
{
"epoch": 0.7761966364812419,
"grad_norm": 1.164884012561426,
"learning_rate": 1.3437178842927554e-05,
"loss": 1.1385189056396485,
"memory(GiB)": 74.93,
"step": 1050,
"token_acc": 0.6258808456117874,
"train_speed(iter/s)": 0.056526
},
{
"epoch": 0.7761966364812419,
"eval_loss": 0.7029861211776733,
"eval_runtime": 88.4673,
"eval_samples_per_second": 79.069,
"eval_steps_per_second": 0.622,
"eval_token_acc": 0.623123948129662,
"step": 1050
},
{
"epoch": 0.7798928109406764,
"grad_norm": 1.3055581766553261,
"learning_rate": 1.338256892199651e-05,
"loss": 1.1020261764526367,
"memory(GiB)": 74.93,
"step": 1055,
"token_acc": 0.6311363636363636,
"train_speed(iter/s)": 0.056221
},
{
"epoch": 0.7835889854001109,
"grad_norm": 1.0395384668146148,
"learning_rate": 1.3327844852143956e-05,
"loss": 1.148073959350586,
"memory(GiB)": 74.93,
"step": 1060,
"token_acc": 0.604885993485342,
"train_speed(iter/s)": 0.05626
},
{
"epoch": 0.7872851598595454,
"grad_norm": 1.1665752727714136,
"learning_rate": 1.3273008480100495e-05,
"loss": 1.0979449272155761,
"memory(GiB)": 74.93,
"step": 1065,
"token_acc": 0.6049382716049383,
"train_speed(iter/s)": 0.05629
},
{
"epoch": 0.7909813343189799,
"grad_norm": 1.041985717329155,
"learning_rate": 1.3218061656386517e-05,
"loss": 1.1317058563232423,
"memory(GiB)": 74.93,
"step": 1070,
"token_acc": 0.6433460076045627,
"train_speed(iter/s)": 0.056314
},
{
"epoch": 0.7946775087784144,
"grad_norm": 1.0369279649431482,
"learning_rate": 1.316300623524972e-05,
"loss": 1.1089330673217774,
"memory(GiB)": 74.93,
"step": 1075,
"token_acc": 0.6382868937048504,
"train_speed(iter/s)": 0.056354
},
{
"epoch": 0.7983736832378489,
"grad_norm": 1.1949441156399458,
"learning_rate": 1.3107844074602566e-05,
"loss": 1.0892942428588868,
"memory(GiB)": 74.93,
"step": 1080,
"token_acc": 0.6408912188728703,
"train_speed(iter/s)": 0.056386
},
{
"epoch": 0.8020698576972833,
"grad_norm": 1.0363420805429473,
"learning_rate": 1.305257703595957e-05,
"loss": 1.0744206428527832,
"memory(GiB)": 74.93,
"step": 1085,
"token_acc": 0.6147540983606558,
"train_speed(iter/s)": 0.056414
},
{
"epoch": 0.8057660321567178,
"grad_norm": 0.9805753007460783,
"learning_rate": 1.2997206984374486e-05,
"loss": 1.1048744201660157,
"memory(GiB)": 74.93,
"step": 1090,
"token_acc": 0.6329463792150359,
"train_speed(iter/s)": 0.056452
},
{
"epoch": 0.8094622066161523,
"grad_norm": 1.078880274058704,
"learning_rate": 1.2941735788377356e-05,
"loss": 1.0897531509399414,
"memory(GiB)": 74.93,
"step": 1095,
"token_acc": 0.6396155899626268,
"train_speed(iter/s)": 0.056484
},
{
"epoch": 0.8131583810755868,
"grad_norm": 1.083885052316346,
"learning_rate": 1.2886165319911474e-05,
"loss": 1.1432035446166993,
"memory(GiB)": 74.93,
"step": 1100,
"token_acc": 0.5973259929217459,
"train_speed(iter/s)": 0.056505
},
{
"epoch": 0.8131583810755868,
"eval_loss": 0.6945818662643433,
"eval_runtime": 86.4586,
"eval_samples_per_second": 80.906,
"eval_steps_per_second": 0.636,
"eval_token_acc": 0.6239354321874054,
"step": 1100
},
{
"epoch": 0.8168545555350213,
"grad_norm": 1.1507994138444235,
"learning_rate": 1.2830497454270206e-05,
"loss": 1.1136839866638184,
"memory(GiB)": 74.93,
"step": 1105,
"token_acc": 0.6371170793117918,
"train_speed(iter/s)": 0.05622
},
{
"epoch": 0.8205507299944558,
"grad_norm": 1.0133515901515742,
"learning_rate": 1.2774734070033692e-05,
"loss": 1.1166929244995116,
"memory(GiB)": 74.93,
"step": 1110,
"token_acc": 0.6103855721393034,
"train_speed(iter/s)": 0.056253
},
{
"epoch": 0.8242469044538903,
"grad_norm": 1.1857531032231587,
"learning_rate": 1.2718877049005477e-05,
"loss": 1.1120613098144532,
"memory(GiB)": 74.93,
"step": 1115,
"token_acc": 0.6248982912937348,
"train_speed(iter/s)": 0.056279
},
{
"epoch": 0.8279430789133247,
"grad_norm": 1.0147593247560383,
"learning_rate": 1.2662928276148985e-05,
"loss": 1.0828424453735352,
"memory(GiB)": 74.93,
"step": 1120,
"token_acc": 0.6065897858319604,
"train_speed(iter/s)": 0.056309
},
{
"epoch": 0.8316392533727592,
"grad_norm": 1.0535067736037584,
"learning_rate": 1.2606889639523925e-05,
"loss": 1.082409381866455,
"memory(GiB)": 74.93,
"step": 1125,
"token_acc": 0.6383859286083807,
"train_speed(iter/s)": 0.056339
},
{
"epoch": 0.8353354278321937,
"grad_norm": 1.090903289476391,
"learning_rate": 1.255076303022256e-05,
"loss": 1.1306575775146483,
"memory(GiB)": 74.93,
"step": 1130,
"token_acc": 0.6113028472821398,
"train_speed(iter/s)": 0.056373
},
{
"epoch": 0.8390316022916282,
"grad_norm": 1.1602057234017449,
"learning_rate": 1.2494550342305906e-05,
"loss": 1.1157353401184082,
"memory(GiB)": 74.93,
"step": 1135,
"token_acc": 0.629865985960434,
"train_speed(iter/s)": 0.0564
},
{
"epoch": 0.8427277767510627,
"grad_norm": 1.032443656861064,
"learning_rate": 1.2438253472739805e-05,
"loss": 1.0929494857788087,
"memory(GiB)": 74.93,
"step": 1140,
"token_acc": 0.6280344557556774,
"train_speed(iter/s)": 0.056434
},
{
"epoch": 0.8464239512104972,
"grad_norm": 1.122025726444444,
"learning_rate": 1.2381874321330912e-05,
"loss": 1.1178958892822266,
"memory(GiB)": 74.93,
"step": 1145,
"token_acc": 0.6517412935323383,
"train_speed(iter/s)": 0.056468
},
{
"epoch": 0.8501201256699317,
"grad_norm": 1.0829851308141574,
"learning_rate": 1.2325414790662578e-05,
"loss": 1.0894483566284179,
"memory(GiB)": 74.93,
"step": 1150,
"token_acc": 0.6569058077110785,
"train_speed(iter/s)": 0.05649
},
{
"epoch": 0.8501201256699317,
"eval_loss": 0.6932370066642761,
"eval_runtime": 86.0146,
"eval_samples_per_second": 81.323,
"eval_steps_per_second": 0.639,
"eval_token_acc": 0.6245899292866097,
"step": 1150
},
{
"epoch": 0.8538163001293662,
"grad_norm": 1.3861087034460704,
"learning_rate": 1.2268876786030654e-05,
"loss": 1.1001951217651367,
"memory(GiB)": 74.93,
"step": 1155,
"token_acc": 0.630185845691759,
"train_speed(iter/s)": 0.056209
},
{
"epoch": 0.8575124745888006,
"grad_norm": 1.1867682331739955,
"learning_rate": 1.2212262215379199e-05,
"loss": 1.1211355209350586,
"memory(GiB)": 74.93,
"step": 1160,
"token_acc": 0.6551724137931034,
"train_speed(iter/s)": 0.056235
},
{
"epoch": 0.8612086490482351,
"grad_norm": 1.0901861719096644,
"learning_rate": 1.215557298923607e-05,
"loss": 1.0956010818481445,
"memory(GiB)": 74.93,
"step": 1165,
"token_acc": 0.6244993324432577,
"train_speed(iter/s)": 0.056271
},
{
"epoch": 0.8649048235076695,
"grad_norm": 1.0190543071260865,
"learning_rate": 1.2098811020648475e-05,
"loss": 1.1221609115600586,
"memory(GiB)": 74.93,
"step": 1170,
"token_acc": 0.612531328320802,
"train_speed(iter/s)": 0.056297
},
{
"epoch": 0.868600997967104,
"grad_norm": 1.055731899501751,
"learning_rate": 1.2041978225118409e-05,
"loss": 1.0942396163940429,
"memory(GiB)": 74.93,
"step": 1175,
"token_acc": 0.61580547112462,
"train_speed(iter/s)": 0.056324
},
{
"epoch": 0.8722971724265385,
"grad_norm": 1.1595911679468829,
"learning_rate": 1.1985076520537995e-05,
"loss": 1.1030941009521484,
"memory(GiB)": 74.93,
"step": 1180,
"token_acc": 0.6299868478737396,
"train_speed(iter/s)": 0.056356
},
{
"epoch": 0.875993346885973,
"grad_norm": 1.1461146239140465,
"learning_rate": 1.1928107827124786e-05,
"loss": 1.0970783233642578,
"memory(GiB)": 74.93,
"step": 1185,
"token_acc": 0.644696639022261,
"train_speed(iter/s)": 0.056381
},
{
"epoch": 0.8796895213454075,
"grad_norm": 1.0680776701688195,
"learning_rate": 1.1871074067356952e-05,
"loss": 1.079010009765625,
"memory(GiB)": 74.93,
"step": 1190,
"token_acc": 0.6483679525222552,
"train_speed(iter/s)": 0.056408
},
{
"epoch": 0.8833856958048419,
"grad_norm": 1.1205292458140585,
"learning_rate": 1.1813977165908406e-05,
"loss": 1.098078155517578,
"memory(GiB)": 74.93,
"step": 1195,
"token_acc": 0.6183456183456183,
"train_speed(iter/s)": 0.056441
},
{
"epoch": 0.8870818702642764,
"grad_norm": 1.073187725881319,
"learning_rate": 1.1756819049583861e-05,
"loss": 1.1022902488708497,
"memory(GiB)": 74.93,
"step": 1200,
"token_acc": 0.6195414847161572,
"train_speed(iter/s)": 0.056472
},
{
"epoch": 0.8870818702642764,
"eval_loss": 0.6976271271705627,
"eval_runtime": 87.7392,
"eval_samples_per_second": 79.725,
"eval_steps_per_second": 0.627,
"eval_token_acc": 0.6255041474569267,
"step": 1200
},
{
"epoch": 0.8907780447237109,
"grad_norm": 1.0836927609908615,
"learning_rate": 1.1699601647253791e-05,
"loss": 1.0966317176818847,
"memory(GiB)": 74.93,
"step": 1205,
"token_acc": 0.6305779078273592,
"train_speed(iter/s)": 0.056207
},
{
"epoch": 0.8944742191831454,
"grad_norm": 1.1200101176242079,
"learning_rate": 1.1642326889789352e-05,
"loss": 1.1052473068237305,
"memory(GiB)": 74.93,
"step": 1210,
"token_acc": 0.6330027051397655,
"train_speed(iter/s)": 0.05623
},
{
"epoch": 0.8981703936425799,
"grad_norm": 0.8945893498959235,
"learning_rate": 1.158499670999722e-05,
"loss": 1.0987310409545898,
"memory(GiB)": 74.93,
"step": 1215,
"token_acc": 0.6409691629955947,
"train_speed(iter/s)": 0.05626
},
{
"epoch": 0.9018665681020144,
"grad_norm": 1.1729053883136484,
"learning_rate": 1.1527613042554368e-05,
"loss": 1.1048666000366212,
"memory(GiB)": 74.93,
"step": 1220,
"token_acc": 0.6676938880328711,
"train_speed(iter/s)": 0.056294
},
{
"epoch": 0.9055627425614489,
"grad_norm": 1.0443569914858049,
"learning_rate": 1.147017782394277e-05,
"loss": 1.081749439239502,
"memory(GiB)": 74.93,
"step": 1225,
"token_acc": 0.608612895550797,
"train_speed(iter/s)": 0.056319
},
{
"epoch": 0.9092589170208834,
"grad_norm": 1.2005283092061096,
"learning_rate": 1.1412692992384058e-05,
"loss": 1.091093158721924,
"memory(GiB)": 74.93,
"step": 1230,
"token_acc": 0.606317160534028,
"train_speed(iter/s)": 0.056348
},
{
"epoch": 0.9129550914803178,
"grad_norm": 1.0896928360432243,
"learning_rate": 1.1355160487774119e-05,
"loss": 1.1176409721374512,
"memory(GiB)": 74.93,
"step": 1235,
"token_acc": 0.5716694772344013,
"train_speed(iter/s)": 0.056377
},
{
"epoch": 0.9166512659397523,
"grad_norm": 1.09517195359763,
"learning_rate": 1.1297582251617618e-05,
"loss": 1.1004619598388672,
"memory(GiB)": 74.93,
"step": 1240,
"token_acc": 0.6309497935231472,
"train_speed(iter/s)": 0.056401
},
{
"epoch": 0.9203474403991868,
"grad_norm": 1.0558160321968586,
"learning_rate": 1.1239960226962491e-05,
"loss": 1.1076683044433593,
"memory(GiB)": 74.93,
"step": 1245,
"token_acc": 0.624376731301939,
"train_speed(iter/s)": 0.056433
},
{
"epoch": 0.9240436148586213,
"grad_norm": 1.167401656088389,
"learning_rate": 1.1182296358334373e-05,
"loss": 1.0801752090454102,
"memory(GiB)": 74.93,
"step": 1250,
"token_acc": 0.6274625110261688,
"train_speed(iter/s)": 0.056468
},
{
"epoch": 0.9240436148586213,
"eval_loss": 0.6896535158157349,
"eval_runtime": 89.0061,
"eval_samples_per_second": 78.59,
"eval_steps_per_second": 0.618,
"eval_token_acc": 0.6259704910690581,
"step": 1250
},
{
"epoch": 0.9277397893180558,
"grad_norm": 1.2651651199409124,
"learning_rate": 1.1124592591670964e-05,
"loss": 1.0778679847717285,
"memory(GiB)": 74.93,
"step": 1255,
"token_acc": 0.6440798016763074,
"train_speed(iter/s)": 0.056224
},
{
"epoch": 0.9314359637774903,
"grad_norm": 1.0901265302180776,
"learning_rate": 1.1066850874256387e-05,
"loss": 1.0967378616333008,
"memory(GiB)": 74.93,
"step": 1260,
"token_acc": 0.6274731486715659,
"train_speed(iter/s)": 0.056248
},
{
"epoch": 0.9351321382369248,
"grad_norm": 1.0804226410639166,
"learning_rate": 1.1009073154655452e-05,
"loss": 1.0889236450195312,
"memory(GiB)": 74.93,
"step": 1265,
"token_acc": 0.620845921450151,
"train_speed(iter/s)": 0.056269
},
{
"epoch": 0.9388283126963592,
"grad_norm": 1.228390945564267,
"learning_rate": 1.09512613826479e-05,
"loss": 1.1092605590820312,
"memory(GiB)": 74.93,
"step": 1270,
"token_acc": 0.6499229583975347,
"train_speed(iter/s)": 0.056301
},
{
"epoch": 0.9425244871557937,
"grad_norm": 1.179672539170986,
"learning_rate": 1.0893417509162624e-05,
"loss": 1.099574661254883,
"memory(GiB)": 74.93,
"step": 1275,
"token_acc": 0.6232127838519764,
"train_speed(iter/s)": 0.056325
},
{
"epoch": 0.9462206616152282,
"grad_norm": 1.0309784047078987,
"learning_rate": 1.0835543486211815e-05,
"loss": 1.1081634521484376,
"memory(GiB)": 74.93,
"step": 1280,
"token_acc": 0.6257142857142857,
"train_speed(iter/s)": 0.056352
},
{
"epoch": 0.9499168360746627,
"grad_norm": 1.1083199849496777,
"learning_rate": 1.0777641266825094e-05,
"loss": 1.1096603393554687,
"memory(GiB)": 74.93,
"step": 1285,
"token_acc": 0.6357894736842106,
"train_speed(iter/s)": 0.056378
},
{
"epoch": 0.9536130105340972,
"grad_norm": 1.0035577075576465,
"learning_rate": 1.0719712804983604e-05,
"loss": 1.1045263290405274,
"memory(GiB)": 74.93,
"step": 1290,
"token_acc": 0.6397618260006616,
"train_speed(iter/s)": 0.056405
},
{
"epoch": 0.9573091849935317,
"grad_norm": 1.0502142381441943,
"learning_rate": 1.0661760055554083e-05,
"loss": 1.082082462310791,
"memory(GiB)": 74.93,
"step": 1295,
"token_acc": 0.6266829865361077,
"train_speed(iter/s)": 0.056429
},
{
"epoch": 0.9610053594529662,
"grad_norm": 1.2499115770499312,
"learning_rate": 1.0603784974222862e-05,
"loss": 1.098296546936035,
"memory(GiB)": 74.93,
"step": 1300,
"token_acc": 0.6284748309541698,
"train_speed(iter/s)": 0.056459
},
{
"epoch": 0.9610053594529662,
"eval_loss": 0.6888419389724731,
"eval_runtime": 87.7552,
"eval_samples_per_second": 79.71,
"eval_steps_per_second": 0.627,
"eval_token_acc": 0.6259658738055717,
"step": 1300
},
{
"epoch": 0.9647015339124007,
"grad_norm": 1.2558210208759852,
"learning_rate": 1.054578951742991e-05,
"loss": 1.0757410049438476,
"memory(GiB)": 74.93,
"step": 1305,
"token_acc": 0.6296939859059755,
"train_speed(iter/s)": 0.056222
},
{
"epoch": 0.9683977083718351,
"grad_norm": 1.1509712834800971,
"learning_rate": 1.048777564230278e-05,
"loss": 1.1064401626586915,
"memory(GiB)": 74.93,
"step": 1310,
"token_acc": 0.6144927536231884,
"train_speed(iter/s)": 0.056247
},
{
"epoch": 0.9720938828312696,
"grad_norm": 1.1877122033430165,
"learning_rate": 1.0429745306590573e-05,
"loss": 1.0995939254760743,
"memory(GiB)": 74.93,
"step": 1315,
"token_acc": 0.6551246537396122,
"train_speed(iter/s)": 0.056264
},
{
"epoch": 0.9757900572907041,
"grad_norm": 1.0334473323989715,
"learning_rate": 1.0371700468597886e-05,
"loss": 1.0957868576049805,
"memory(GiB)": 74.93,
"step": 1320,
"token_acc": 0.6152882205513784,
"train_speed(iter/s)": 0.056289
},
{
"epoch": 0.9794862317501386,
"grad_norm": 1.0379714843668957,
"learning_rate": 1.0313643087118692e-05,
"loss": 1.0816888809204102,
"memory(GiB)": 74.93,
"step": 1325,
"token_acc": 0.6423645320197044,
"train_speed(iter/s)": 0.056319
},
{
"epoch": 0.9831824062095731,
"grad_norm": 1.0681169313465444,
"learning_rate": 1.0255575121370277e-05,
"loss": 1.0688974380493164,
"memory(GiB)": 74.93,
"step": 1330,
"token_acc": 0.6287527459116427,
"train_speed(iter/s)": 0.056343
},
{
"epoch": 0.9868785806690076,
"grad_norm": 1.1171758504896703,
"learning_rate": 1.0197498530927102e-05,
"loss": 1.099297332763672,
"memory(GiB)": 74.93,
"step": 1335,
"token_acc": 0.6077836745008846,
"train_speed(iter/s)": 0.056367
},
{
"epoch": 0.9905747551284421,
"grad_norm": 1.0576212439483514,
"learning_rate": 1.0139415275654671e-05,
"loss": 1.0867423057556151,
"memory(GiB)": 74.93,
"step": 1340,
"token_acc": 0.6263262599469496,
"train_speed(iter/s)": 0.056396
},
{
"epoch": 0.9942709295878766,
"grad_norm": 1.258815850774044,
"learning_rate": 1.0081327315643406e-05,
"loss": 1.1155497550964355,
"memory(GiB)": 74.93,
"step": 1345,
"token_acc": 0.655549765502866,
"train_speed(iter/s)": 0.056419
},
{
"epoch": 0.997967104047311,
"grad_norm": 1.0659691536136329,
"learning_rate": 1.0023236611142499e-05,
"loss": 1.057703685760498,
"memory(GiB)": 74.93,
"step": 1350,
"token_acc": 0.712,
"train_speed(iter/s)": 0.056446
},
{
"epoch": 0.997967104047311,
"eval_loss": 0.6881307363510132,
"eval_runtime": 86.0221,
"eval_samples_per_second": 81.316,
"eval_steps_per_second": 0.639,
"eval_token_acc": 0.626765814704599,
"step": 1350
},
{
"epoch": 1.0022177046756606,
"grad_norm": 1.3069033033680353,
"learning_rate": 9.965145122493756e-06,
"loss": 1.2448784828186035,
"memory(GiB)": 74.93,
"step": 1355,
"token_acc": 0.6295214105793451,
"train_speed(iter/s)": 0.056171
},
{
"epoch": 1.0059138791350952,
"grad_norm": 0.9882434180756982,
"learning_rate": 9.907054810065446e-06,
"loss": 1.062336540222168,
"memory(GiB)": 74.93,
"step": 1360,
"token_acc": 0.6483717526527625,
"train_speed(iter/s)": 0.056192
},
{
"epoch": 1.0096100535945296,
"grad_norm": 1.2362454534970095,
"learning_rate": 9.848967634186142e-06,
"loss": 1.0906942367553711,
"memory(GiB)": 74.93,
"step": 1365,
"token_acc": 0.6448347722536469,
"train_speed(iter/s)": 0.056213
},
{
"epoch": 1.0133062280539642,
"grad_norm": 1.070334993285048,
"learning_rate": 9.790885555078575e-06,
"loss": 1.0470151901245117,
"memory(GiB)": 74.93,
"step": 1370,
"token_acc": 0.6228728728728729,
"train_speed(iter/s)": 0.056237
},
{
"epoch": 1.0170024025133986,
"grad_norm": 1.0576680139627181,
"learning_rate": 9.732810532793465e-06,
"loss": 1.0586755752563477,
"memory(GiB)": 74.93,
"step": 1375,
"token_acc": 0.6435643564356436,
"train_speed(iter/s)": 0.056266
},
{
"epoch": 1.0206985769728332,
"grad_norm": 1.0167739538945428,
"learning_rate": 9.674744527143419e-06,
"loss": 1.059821891784668,
"memory(GiB)": 74.93,
"step": 1380,
"token_acc": 0.6397306397306397,
"train_speed(iter/s)": 0.056291
},
{
"epoch": 1.0243947514322675,
"grad_norm": 1.1268503654686965,
"learning_rate": 9.61668949763674e-06,
"loss": 1.0377557754516602,
"memory(GiB)": 74.93,
"step": 1385,
"token_acc": 0.6721439749608764,
"train_speed(iter/s)": 0.056311
},
{
"epoch": 1.0280909258917021,
"grad_norm": 0.9931688648143746,
"learning_rate": 9.558647403411334e-06,
"loss": 1.0480243682861328,
"memory(GiB)": 74.93,
"step": 1390,
"token_acc": 0.6135416666666667,
"train_speed(iter/s)": 0.056336
},
{
"epoch": 1.0317871003511365,
"grad_norm": 1.1339232037274705,
"learning_rate": 9.500620203168604e-06,
"loss": 1.0579310417175294,
"memory(GiB)": 74.93,
"step": 1395,
"token_acc": 0.6699186991869919,
"train_speed(iter/s)": 0.056365
},
{
"epoch": 1.0354832748105711,
"grad_norm": 0.9738636619210117,
"learning_rate": 9.442609855107317e-06,
"loss": 1.0384546279907227,
"memory(GiB)": 74.93,
"step": 1400,
"token_acc": 0.6303651505445227,
"train_speed(iter/s)": 0.056383
},
{
"epoch": 1.0354832748105711,
"eval_loss": 0.6841524243354797,
"eval_runtime": 86.5188,
"eval_samples_per_second": 80.849,
"eval_steps_per_second": 0.636,
"eval_token_acc": 0.6267785121791868,
"step": 1400
},
{
"epoch": 1.0391794492700055,
"grad_norm": 1.0076575163805248,
"learning_rate": 9.38461831685756e-06,
"loss": 1.0656241416931151,
"memory(GiB)": 74.93,
"step": 1405,
"token_acc": 0.6295483423818875,
"train_speed(iter/s)": 0.056156
},
{
"epoch": 1.04287562372944,
"grad_norm": 1.0590248393948134,
"learning_rate": 9.326647545414647e-06,
"loss": 1.0602170944213867,
"memory(GiB)": 74.93,
"step": 1410,
"token_acc": 0.7284836065573771,
"train_speed(iter/s)": 0.056177
},
{
"epoch": 1.0465717981888745,
"grad_norm": 1.0411050083571975,
"learning_rate": 9.268699497073102e-06,
"loss": 1.0623086929321288,
"memory(GiB)": 74.93,
"step": 1415,
"token_acc": 0.6079059829059829,
"train_speed(iter/s)": 0.056203
},
{
"epoch": 1.050267972648309,
"grad_norm": 1.0820280991464322,
"learning_rate": 9.21077612736062e-06,
"loss": 1.0742631912231446,
"memory(GiB)": 74.93,
"step": 1420,
"token_acc": 0.6051423324150597,
"train_speed(iter/s)": 0.056231
},
{
"epoch": 1.0539641471077434,
"grad_norm": 1.0150109672389387,
"learning_rate": 9.152879390972085e-06,
"loss": 1.060621452331543,
"memory(GiB)": 74.93,
"step": 1425,
"token_acc": 0.6677704194260485,
"train_speed(iter/s)": 0.056246
},
{
"epoch": 1.057660321567178,
"grad_norm": 1.0625464742964672,
"learning_rate": 9.095011241703623e-06,
"loss": 1.1060840606689453,
"memory(GiB)": 74.93,
"step": 1430,
"token_acc": 0.617154288572143,
"train_speed(iter/s)": 0.056275
},
{
"epoch": 1.0613564960266124,
"grad_norm": 1.080121630294682,
"learning_rate": 9.037173632386635e-06,
"loss": 1.051788902282715,
"memory(GiB)": 74.93,
"step": 1435,
"token_acc": 0.693069306930693,
"train_speed(iter/s)": 0.056295
},
{
"epoch": 1.065052670486047,
"grad_norm": 0.9965862626370368,
"learning_rate": 8.979368514821917e-06,
"loss": 1.0715249061584473,
"memory(GiB)": 74.93,
"step": 1440,
"token_acc": 0.6563587166602242,
"train_speed(iter/s)": 0.05632
},
{
"epoch": 1.0687488449454814,
"grad_norm": 1.0523645368442776,
"learning_rate": 8.921597839713803e-06,
"loss": 1.0732128143310546,
"memory(GiB)": 74.93,
"step": 1445,
"token_acc": 0.6195273149941883,
"train_speed(iter/s)": 0.056345
},
{
"epoch": 1.072445019404916,
"grad_norm": 0.9439502959144558,
"learning_rate": 8.863863556604312e-06,
"loss": 1.0644493103027344,
"memory(GiB)": 74.93,
"step": 1450,
"token_acc": 0.6215469613259669,
"train_speed(iter/s)": 0.056369
},
{
"epoch": 1.072445019404916,
"eval_loss": 0.6834661960601807,
"eval_runtime": 87.5557,
"eval_samples_per_second": 79.892,
"eval_steps_per_second": 0.628,
"eval_token_acc": 0.627048622093144,
"step": 1450
},
{
"epoch": 1.0761411938643504,
"grad_norm": 1.1260430229381853,
"learning_rate": 8.806167613807374e-06,
"loss": 1.0463625907897949,
"memory(GiB)": 74.93,
"step": 1455,
"token_acc": 0.6380742913000977,
"train_speed(iter/s)": 0.05615
},
{
"epoch": 1.079837368323785,
"grad_norm": 1.1262155455903309,
"learning_rate": 8.748511958343076e-06,
"loss": 1.0758758544921876,
"memory(GiB)": 74.93,
"step": 1460,
"token_acc": 0.6353591160220995,
"train_speed(iter/s)": 0.056173
},
{
"epoch": 1.0835335427832193,
"grad_norm": 1.0836611872394941,
"learning_rate": 8.690898535871967e-06,
"loss": 1.0662212371826172,
"memory(GiB)": 74.93,
"step": 1465,
"token_acc": 0.6074675324675325,
"train_speed(iter/s)": 0.0562
},
{
"epoch": 1.087229717242654,
"grad_norm": 1.1980862381018496,
"learning_rate": 8.633329290629385e-06,
"loss": 1.042177963256836,
"memory(GiB)": 74.93,
"step": 1470,
"token_acc": 0.6368200836820084,
"train_speed(iter/s)": 0.056225
},
{
"epoch": 1.0909258917020883,
"grad_norm": 1.1395698139161996,
"learning_rate": 8.575806165359852e-06,
"loss": 1.0712276458740235,
"memory(GiB)": 74.93,
"step": 1475,
"token_acc": 0.6389548693586699,
"train_speed(iter/s)": 0.056249
},
{
"epoch": 1.094622066161523,
"grad_norm": 1.0531458891625334,
"learning_rate": 8.51833110125153e-06,
"loss": 1.0721662521362305,
"memory(GiB)": 74.93,
"step": 1480,
"token_acc": 0.6220368744512731,
"train_speed(iter/s)": 0.056271
},
{
"epoch": 1.0983182406209573,
"grad_norm": 0.952355580471414,
"learning_rate": 8.460906037870677e-06,
"loss": 1.018984603881836,
"memory(GiB)": 74.93,
"step": 1485,
"token_acc": 0.6109256449165402,
"train_speed(iter/s)": 0.056292
},
{
"epoch": 1.1020144150803919,
"grad_norm": 1.0722820285217056,
"learning_rate": 8.403532913096231e-06,
"loss": 1.0254201889038086,
"memory(GiB)": 74.93,
"step": 1490,
"token_acc": 0.6746411483253588,
"train_speed(iter/s)": 0.056313
},
{
"epoch": 1.1057105895398263,
"grad_norm": 1.0574628279248734,
"learning_rate": 8.346213663054388e-06,
"loss": 1.0446287155151368,
"memory(GiB)": 74.93,
"step": 1495,
"token_acc": 0.6608030592734225,
"train_speed(iter/s)": 0.056333
},
{
"epoch": 1.1094067639992609,
"grad_norm": 1.0816482005421177,
"learning_rate": 8.288950222053287e-06,
"loss": 1.0296789169311524,
"memory(GiB)": 74.93,
"step": 1500,
"token_acc": 0.5984496124031008,
"train_speed(iter/s)": 0.056359
},
{
"epoch": 1.1094067639992609,
"eval_loss": 0.6838507056236267,
"eval_runtime": 89.1049,
"eval_samples_per_second": 78.503,
"eval_steps_per_second": 0.617,
"eval_token_acc": 0.6276061566591329,
"step": 1500
},
{
"epoch": 1.1131029384586952,
"grad_norm": 1.0669000768005377,
"learning_rate": 8.231744522517713e-06,
"loss": 1.052156925201416,
"memory(GiB)": 74.93,
"step": 1505,
"token_acc": 0.6264432872990717,
"train_speed(iter/s)": 0.056154
},
{
"epoch": 1.1167991129181298,
"grad_norm": 1.123101456521189,
"learning_rate": 8.174598494923893e-06,
"loss": 1.0532621383666991,
"memory(GiB)": 74.93,
"step": 1510,
"token_acc": 0.6674074074074074,
"train_speed(iter/s)": 0.056174
},
{
"epoch": 1.1204952873775642,
"grad_norm": 0.9597873108803062,
"learning_rate": 8.117514067734365e-06,
"loss": 1.0872188568115235,
"memory(GiB)": 74.93,
"step": 1515,
"token_acc": 0.6229354939233406,
"train_speed(iter/s)": 0.056193
},
{
"epoch": 1.1241914618369988,
"grad_norm": 1.01751822081855,
"learning_rate": 8.060493167332874e-06,
"loss": 1.0647924423217774,
"memory(GiB)": 74.93,
"step": 1520,
"token_acc": 0.6589195979899497,
"train_speed(iter/s)": 0.056222
},
{
"epoch": 1.1278876362964332,
"grad_norm": 1.2668018355322213,
"learning_rate": 8.003537717959378e-06,
"loss": 1.054795265197754,
"memory(GiB)": 74.93,
"step": 1525,
"token_acc": 0.6280428432327166,
"train_speed(iter/s)": 0.056242
},
{
"epoch": 1.1315838107558678,
"grad_norm": 1.0402787270589529,
"learning_rate": 7.946649641645108e-06,
"loss": 1.0737996101379395,
"memory(GiB)": 74.93,
"step": 1530,
"token_acc": 0.6400172860847018,
"train_speed(iter/s)": 0.056265
},
{
"epoch": 1.1352799852153022,
"grad_norm": 1.1860588895073847,
"learning_rate": 7.889830858147718e-06,
"loss": 1.0505868911743164,
"memory(GiB)": 74.93,
"step": 1535,
"token_acc": 0.6243339253996447,
"train_speed(iter/s)": 0.056293
},
{
"epoch": 1.1389761596747365,
"grad_norm": 1.0989591028912902,
"learning_rate": 7.833083284886484e-06,
"loss": 1.0597726821899414,
"memory(GiB)": 74.93,
"step": 1540,
"token_acc": 0.6668341708542713,
"train_speed(iter/s)": 0.056316
},
{
"epoch": 1.1426723341341711,
"grad_norm": 1.1347824812891065,
"learning_rate": 7.7764088368776e-06,
"loss": 1.0500106811523438,
"memory(GiB)": 74.93,
"step": 1545,
"token_acc": 0.6302988186240445,
"train_speed(iter/s)": 0.056337
},
{
"epoch": 1.1463685085936057,
"grad_norm": 1.0564162756732445,
"learning_rate": 7.719809426669576e-06,
"loss": 1.0577827453613282,
"memory(GiB)": 74.93,
"step": 1550,
"token_acc": 0.6201646090534979,
"train_speed(iter/s)": 0.056358
},
{
"epoch": 1.1463685085936057,
"eval_loss": 0.6770405769348145,
"eval_runtime": 87.0148,
"eval_samples_per_second": 80.389,
"eval_steps_per_second": 0.632,
"eval_token_acc": 0.6278058533049218,
"step": 1550
},
{
"epoch": 1.15006468305304,
"grad_norm": 1.1665260843525407,
"learning_rate": 7.663286964278665e-06,
"loss": 1.046430492401123,
"memory(GiB)": 74.93,
"step": 1555,
"token_acc": 0.6295910639909126,
"train_speed(iter/s)": 0.056161
},
{
"epoch": 1.1537608575124745,
"grad_norm": 1.0893384767496972,
"learning_rate": 7.606843357124426e-06,
"loss": 1.0604162216186523,
"memory(GiB)": 74.93,
"step": 1560,
"token_acc": 0.618162506638343,
"train_speed(iter/s)": 0.056181
},
{
"epoch": 1.157457031971909,
"grad_norm": 1.0091311530942315,
"learning_rate": 7.550480509965348e-06,
"loss": 1.0764715194702148,
"memory(GiB)": 74.93,
"step": 1565,
"token_acc": 0.6651108518086347,
"train_speed(iter/s)": 0.056207
},
{
"epoch": 1.1611532064313437,
"grad_norm": 0.9991849558827516,
"learning_rate": 7.494200324834588e-06,
"loss": 1.076918888092041,
"memory(GiB)": 74.93,
"step": 1570,
"token_acc": 0.6519940915805023,
"train_speed(iter/s)": 0.056225
},
{
"epoch": 1.164849380890778,
"grad_norm": 1.1070133372574182,
"learning_rate": 7.43800470097576e-06,
"loss": 1.0360871315002442,
"memory(GiB)": 74.93,
"step": 1575,
"token_acc": 0.6534121440085975,
"train_speed(iter/s)": 0.056247
},
{
"epoch": 1.1685455553502124,
"grad_norm": 0.9616191113258434,
"learning_rate": 7.381895534778852e-06,
"loss": 1.071969223022461,
"memory(GiB)": 74.93,
"step": 1580,
"token_acc": 0.6318518518518519,
"train_speed(iter/s)": 0.05627
},
{
"epoch": 1.172241729809647,
"grad_norm": 0.9588896754114927,
"learning_rate": 7.3258747197162484e-06,
"loss": 1.0856236457824706,
"memory(GiB)": 74.93,
"step": 1585,
"token_acc": 0.6137469586374696,
"train_speed(iter/s)": 0.05629
},
{
"epoch": 1.1759379042690816,
"grad_norm": 1.155114349369357,
"learning_rate": 7.269944146278801e-06,
"loss": 1.054957962036133,
"memory(GiB)": 74.93,
"step": 1590,
"token_acc": 0.6266263237518911,
"train_speed(iter/s)": 0.056314
},
{
"epoch": 1.179634078728516,
"grad_norm": 1.0144629940415562,
"learning_rate": 7.214105701912054e-06,
"loss": 1.0508974075317383,
"memory(GiB)": 74.93,
"step": 1595,
"token_acc": 0.6369260827092152,
"train_speed(iter/s)": 0.056334
},
{
"epoch": 1.1833302531879504,
"grad_norm": 1.1824656228465167,
"learning_rate": 7.1583612709525405e-06,
"loss": 1.0430817604064941,
"memory(GiB)": 74.93,
"step": 1600,
"token_acc": 0.6061151079136691,
"train_speed(iter/s)": 0.056355
},
{
"epoch": 1.1833302531879504,
"eval_loss": 0.674736499786377,
"eval_runtime": 85.716,
"eval_samples_per_second": 81.607,
"eval_steps_per_second": 0.642,
"eval_token_acc": 0.6284499615612815,
"step": 1600
},
{
"epoch": 1.187026427647385,
"grad_norm": 1.1524983234954504,
"learning_rate": 7.102712734564202e-06,
"loss": 1.046616268157959,
"memory(GiB)": 74.93,
"step": 1605,
"token_acc": 0.6345166331770484,
"train_speed(iter/s)": 0.056165
},
{
"epoch": 1.1907226021068193,
"grad_norm": 0.9309819347033588,
"learning_rate": 7.047161970674896e-06,
"loss": 1.0448005676269532,
"memory(GiB)": 74.93,
"step": 1610,
"token_acc": 0.6130097087378641,
"train_speed(iter/s)": 0.056187
},
{
"epoch": 1.194418776566254,
"grad_norm": 1.0772202352983227,
"learning_rate": 6.991710853913025e-06,
"loss": 1.0570079803466796,
"memory(GiB)": 74.93,
"step": 1615,
"token_acc": 0.6610324349017817,
"train_speed(iter/s)": 0.056205
},
{
"epoch": 1.1981149510256883,
"grad_norm": 1.1619152201928238,
"learning_rate": 6.936361255544288e-06,
"loss": 1.044645118713379,
"memory(GiB)": 74.93,
"step": 1620,
"token_acc": 0.6945525291828794,
"train_speed(iter/s)": 0.056227
},
{
"epoch": 1.201811125485123,
"grad_norm": 1.0467564412195258,
"learning_rate": 6.881115043408512e-06,
"loss": 1.045677661895752,
"memory(GiB)": 74.93,
"step": 1625,
"token_acc": 0.648811228874248,
"train_speed(iter/s)": 0.056246
},
{
"epoch": 1.2055072999445573,
"grad_norm": 1.0325120697680106,
"learning_rate": 6.825974081856626e-06,
"loss": 1.0619203567504882,
"memory(GiB)": 74.93,
"step": 1630,
"token_acc": 0.6202729044834308,
"train_speed(iter/s)": 0.056267
},
{
"epoch": 1.209203474403992,
"grad_norm": 0.9412938462579274,
"learning_rate": 6.770940231687767e-06,
"loss": 1.0478931427001954,
"memory(GiB)": 74.93,
"step": 1635,
"token_acc": 0.6356352537199542,
"train_speed(iter/s)": 0.056289
},
{
"epoch": 1.2128996488634263,
"grad_norm": 1.140398149863178,
"learning_rate": 6.716015350086449e-06,
"loss": 1.0618717193603515,
"memory(GiB)": 74.93,
"step": 1640,
"token_acc": 0.6066892464013548,
"train_speed(iter/s)": 0.05631
},
{
"epoch": 1.2165958233228609,
"grad_norm": 1.0930330137960338,
"learning_rate": 6.661201290559918e-06,
"loss": 1.0522537231445312,
"memory(GiB)": 74.93,
"step": 1645,
"token_acc": 0.6371971185330714,
"train_speed(iter/s)": 0.056329
},
{
"epoch": 1.2202919977822952,
"grad_norm": 1.0731043610961355,
"learning_rate": 6.606499902875585e-06,
"loss": 1.0263765335083008,
"memory(GiB)": 74.93,
"step": 1650,
"token_acc": 0.6519023282226007,
"train_speed(iter/s)": 0.056348
},
{
"epoch": 1.2202919977822952,
"eval_loss": 0.6756451725959778,
"eval_runtime": 86.9798,
"eval_samples_per_second": 80.421,
"eval_steps_per_second": 0.632,
"eval_token_acc": 0.6288528178004742,
"step": 1650
},
{
"epoch": 1.2239881722417298,
"grad_norm": 1.131205462756531,
"learning_rate": 6.5519130329986245e-06,
"loss": 1.0687341690063477,
"memory(GiB)": 74.93,
"step": 1655,
"token_acc": 0.6333847797696782,
"train_speed(iter/s)": 0.056155
},
{
"epoch": 1.2276843467011642,
"grad_norm": 1.046052101501651,
"learning_rate": 6.497442523029663e-06,
"loss": 1.0175907135009765,
"memory(GiB)": 74.93,
"step": 1660,
"token_acc": 0.6453744493392071,
"train_speed(iter/s)": 0.056176
},
{
"epoch": 1.2313805211605988,
"grad_norm": 1.0553291483906215,
"learning_rate": 6.443090211142613e-06,
"loss": 1.0627668380737305,
"memory(GiB)": 74.93,
"step": 1665,
"token_acc": 0.6409149762624082,
"train_speed(iter/s)": 0.056196
},
{
"epoch": 1.2350766956200332,
"grad_norm": 0.9606710463766085,
"learning_rate": 6.388857931522657e-06,
"loss": 1.043929672241211,
"memory(GiB)": 74.93,
"step": 1670,
"token_acc": 0.6334586466165414,
"train_speed(iter/s)": 0.056218
},
{
"epoch": 1.2387728700794678,
"grad_norm": 0.9843358834706085,
"learning_rate": 6.334747514304338e-06,
"loss": 1.0336435317993165,
"memory(GiB)": 74.93,
"step": 1675,
"token_acc": 0.6631016042780749,
"train_speed(iter/s)": 0.056238
},
{
"epoch": 1.2424690445389022,
"grad_norm": 1.0297683983640094,
"learning_rate": 6.280760785509802e-06,
"loss": 1.0500383377075195,
"memory(GiB)": 74.93,
"step": 1680,
"token_acc": 0.6349254639488896,
"train_speed(iter/s)": 0.05626
},
{
"epoch": 1.2461652189983368,
"grad_norm": 1.0776782375280287,
"learning_rate": 6.226899566987177e-06,
"loss": 1.0217618942260742,
"memory(GiB)": 74.93,
"step": 1685,
"token_acc": 0.655511811023622,
"train_speed(iter/s)": 0.056281
},
{
"epoch": 1.2498613934577711,
"grad_norm": 1.0846016823921123,
"learning_rate": 6.173165676349103e-06,
"loss": 1.0370861053466798,
"memory(GiB)": 74.93,
"step": 1690,
"token_acc": 0.6801365964712578,
"train_speed(iter/s)": 0.056303
},
{
"epoch": 1.2535575679172057,
"grad_norm": 1.0790787844363594,
"learning_rate": 6.119560926911377e-06,
"loss": 1.0697561264038087,
"memory(GiB)": 74.93,
"step": 1695,
"token_acc": 0.6681639528354857,
"train_speed(iter/s)": 0.056324
},
{
"epoch": 1.2572537423766401,
"grad_norm": 1.106497642833312,
"learning_rate": 6.066087127631761e-06,
"loss": 1.0666908264160155,
"memory(GiB)": 74.93,
"step": 1700,
"token_acc": 0.6533379694019471,
"train_speed(iter/s)": 0.056341
},
{
"epoch": 1.2572537423766401,
"eval_loss": 0.6751002073287964,
"eval_runtime": 88.5942,
"eval_samples_per_second": 78.955,
"eval_steps_per_second": 0.621,
"eval_token_acc": 0.6288689782226767,
"step": 1700
},
{
"epoch": 1.2609499168360747,
"grad_norm": 1.0779984264892808,
"learning_rate": 6.012746083048966e-06,
"loss": 1.0639089584350585,
"memory(GiB)": 34.88,
"step": 1705,
"token_acc": 0.6968838526912181,
"train_speed(iter/s)": 14.788094
},
{
"epoch": 1.264646091295509,
"grad_norm": 1.1027008185153624,
"learning_rate": 5.959539593221711e-06,
"loss": 1.0941818237304688,
"memory(GiB)": 34.88,
"step": 1710,
"token_acc": 0.6294489092996556,
"train_speed(iter/s)": 9.344634
},
{
"epoch": 1.2683422657549437,
"grad_norm": 1.2059692859973439,
"learning_rate": 5.9064694536680135e-06,
"loss": 1.0492952346801758,
"memory(GiB)": 49.4,
"step": 1715,
"token_acc": 0.6576319543509273,
"train_speed(iter/s)": 6.522706
},
{
"epoch": 1.272038440214378,
"grad_norm": 1.0913297173697671,
"learning_rate": 5.853537455304575e-06,
"loss": 1.0665050506591798,
"memory(GiB)": 49.4,
"step": 1720,
"token_acc": 0.6941935483870968,
"train_speed(iter/s)": 4.977275
},
{
"epoch": 1.2757346146738127,
"grad_norm": 1.1326249785449936,
"learning_rate": 5.800745384386364e-06,
"loss": 1.035014533996582,
"memory(GiB)": 49.4,
"step": 1725,
"token_acc": 0.6055200269269606,
"train_speed(iter/s)": 4.1257
},
{
"epoch": 1.279430789133247,
"grad_norm": 1.011492822170868,
"learning_rate": 5.74809502244632e-06,
"loss": 1.040954875946045,
"memory(GiB)": 49.4,
"step": 1730,
"token_acc": 0.6559888579387186,
"train_speed(iter/s)": 3.505361
},
{
"epoch": 1.2831269635926816,
"grad_norm": 0.9143549731190831,
"learning_rate": 5.695588146235241e-06,
"loss": 1.056338119506836,
"memory(GiB)": 49.4,
"step": 1735,
"token_acc": 0.6355591311343524,
"train_speed(iter/s)": 3.006185
},
{
"epoch": 1.286823138052116,
"grad_norm": 1.0541690596505233,
"learning_rate": 5.643226527661825e-06,
"loss": 1.0424397468566895,
"memory(GiB)": 64.42,
"step": 1740,
"token_acc": 0.6127497621313035,
"train_speed(iter/s)": 2.653736
},
{
"epoch": 1.2905193125115506,
"grad_norm": 1.071302718364978,
"learning_rate": 5.591011933732873e-06,
"loss": 1.0049684524536133,
"memory(GiB)": 64.42,
"step": 1745,
"token_acc": 0.6237816764132553,
"train_speed(iter/s)": 2.414167
},
{
"epoch": 1.294215486970985,
"grad_norm": 1.0017860936129825,
"learning_rate": 5.538946126493659e-06,
"loss": 1.048162841796875,
"memory(GiB)": 64.42,
"step": 1750,
"token_acc": 0.6117103235747303,
"train_speed(iter/s)": 2.163836
},
{
"epoch": 1.294215486970985,
"eval_loss": 0.6697070002555847,
"eval_runtime": 85.8145,
"eval_samples_per_second": 81.513,
"eval_steps_per_second": 0.641,
"eval_token_acc": 0.6293895746807739,
"step": 1750
},
{
"epoch": 1.2979116614304196,
"grad_norm": 1.1010002294868126,
"learning_rate": 5.4870308629684675e-06,
"loss": 1.0428232192993163,
"memory(GiB)": 74.63,
"step": 1755,
"token_acc": 0.634660903571061,
"train_speed(iter/s)": 1.752193
},
{
"epoch": 1.301607835889854,
"grad_norm": 1.1351842621603827,
"learning_rate": 5.435267895101303e-06,
"loss": 1.0705801010131837,
"memory(GiB)": 74.63,
"step": 1760,
"token_acc": 0.663578947368421,
"train_speed(iter/s)": 1.629796
},
{
"epoch": 1.3053040103492886,
"grad_norm": 0.9688327106799416,
"learning_rate": 5.383658969696767e-06,
"loss": 1.043651008605957,
"memory(GiB)": 74.63,
"step": 1765,
"token_acc": 0.6663619744058501,
"train_speed(iter/s)": 1.540319
},
{
"epoch": 1.309000184808723,
"grad_norm": 1.0196740986171486,
"learning_rate": 5.3322058283611045e-06,
"loss": 1.066755485534668,
"memory(GiB)": 74.63,
"step": 1770,
"token_acc": 0.6984352773826458,
"train_speed(iter/s)": 1.440515
},
{
"epoch": 1.3126963592681575,
"grad_norm": 0.9324312791356152,
"learning_rate": 5.2809102074434505e-06,
"loss": 1.0861141204833984,
"memory(GiB)": 74.63,
"step": 1775,
"token_acc": 0.6625352112676056,
"train_speed(iter/s)": 1.355437
},
{
"epoch": 1.316392533727592,
"grad_norm": 1.0475529503023757,
"learning_rate": 5.229773837977208e-06,
"loss": 1.0537721633911132,
"memory(GiB)": 74.63,
"step": 1780,
"token_acc": 0.6779266161910309,
"train_speed(iter/s)": 1.294879
},
{
"epoch": 1.3200887081870265,
"grad_norm": 0.9281011767547357,
"learning_rate": 5.178798445621645e-06,
"loss": 1.0430593490600586,
"memory(GiB)": 74.63,
"step": 1785,
"token_acc": 0.6330935251798561,
"train_speed(iter/s)": 1.224208
},
{
"epoch": 1.3237848826464609,
"grad_norm": 1.0483168678654606,
"learning_rate": 5.127985750603671e-06,
"loss": 1.071333885192871,
"memory(GiB)": 74.63,
"step": 1790,
"token_acc": 0.6417910447761194,
"train_speed(iter/s)": 1.162932
},
{
"epoch": 1.3274810571058955,
"grad_norm": 1.097565660571469,
"learning_rate": 5.077337467659768e-06,
"loss": 1.0753141403198243,
"memory(GiB)": 74.63,
"step": 1795,
"token_acc": 0.6051001821493625,
"train_speed(iter/s)": 1.117195
},
{
"epoch": 1.3311772315653299,
"grad_norm": 1.063181582729188,
"learning_rate": 5.026855305978129e-06,
"loss": 1.0764029502868653,
"memory(GiB)": 74.63,
"step": 1800,
"token_acc": 0.6232106339468303,
"train_speed(iter/s)": 1.067656
},
{
"epoch": 1.3311772315653299,
"eval_loss": 0.6690813899040222,
"eval_runtime": 85.9692,
"eval_samples_per_second": 81.366,
"eval_steps_per_second": 0.64,
"eval_token_acc": 0.6297716532342776,
"step": 1800
},
{
"epoch": 1.3348734060247645,
"grad_norm": 1.0429392429603475,
"learning_rate": 4.976540969140984e-06,
"loss": 1.090817928314209,
"memory(GiB)": 74.63,
"step": 1805,
"token_acc": 0.6356786703601108,
"train_speed(iter/s)": 0.961744
},
{
"epoch": 1.3385695804841988,
"grad_norm": 1.0548409670879852,
"learning_rate": 4.926396155067114e-06,
"loss": 1.0316819190979003,
"memory(GiB)": 74.63,
"step": 1810,
"token_acc": 0.6598138091543833,
"train_speed(iter/s)": 0.923472
},
{
"epoch": 1.3422657549436332,
"grad_norm": 1.0297653617411635,
"learning_rate": 4.876422555954543e-06,
"loss": 1.03601131439209,
"memory(GiB)": 74.63,
"step": 1815,
"token_acc": 0.6965428937259923,
"train_speed(iter/s)": 0.894132
},
{
"epoch": 1.3459619294030678,
"grad_norm": 1.1178512794477986,
"learning_rate": 4.826621858223431e-06,
"loss": 1.0318429946899415,
"memory(GiB)": 74.63,
"step": 1820,
"token_acc": 0.6313304721030043,
"train_speed(iter/s)": 0.864578
},
{
"epoch": 1.3496581038625024,
"grad_norm": 1.0401775609610366,
"learning_rate": 4.776995742459184e-06,
"loss": 1.0820954322814942,
"memory(GiB)": 74.63,
"step": 1825,
"token_acc": 0.6357702349869452,
"train_speed(iter/s)": 0.833393
},
{
"epoch": 1.3533542783219368,
"grad_norm": 1.1053520267340973,
"learning_rate": 4.727545883355713e-06,
"loss": 1.0570013046264648,
"memory(GiB)": 74.63,
"step": 1830,
"token_acc": 0.6462998102466793,
"train_speed(iter/s)": 0.80849
},
{
"epoch": 1.3570504527813712,
"grad_norm": 1.0129657782670332,
"learning_rate": 4.678273949658939e-06,
"loss": 1.0589232444763184,
"memory(GiB)": 74.63,
"step": 1835,
"token_acc": 0.6194251734390486,
"train_speed(iter/s)": 0.785859
},
{
"epoch": 1.3607466272408058,
"grad_norm": 0.9863992139542379,
"learning_rate": 4.629181604110464e-06,
"loss": 1.0515235900878905,
"memory(GiB)": 74.63,
"step": 1840,
"token_acc": 0.6229317851959362,
"train_speed(iter/s)": 0.761135
},
{
"epoch": 1.3644428017002403,
"grad_norm": 1.1494795183000623,
"learning_rate": 4.580270503391487e-06,
"loss": 1.0223835945129394,
"memory(GiB)": 74.63,
"step": 1845,
"token_acc": 0.6583261432269197,
"train_speed(iter/s)": 0.739616
},
{
"epoch": 1.3681389761596747,
"grad_norm": 1.14471617138646,
"learning_rate": 4.531542298066861e-06,
"loss": 1.0207533836364746,
"memory(GiB)": 74.63,
"step": 1850,
"token_acc": 0.6551959114139694,
"train_speed(iter/s)": 0.721142
},
{
"epoch": 1.3681389761596747,
"eval_loss": 0.6679942607879639,
"eval_runtime": 93.3503,
"eval_samples_per_second": 74.933,
"eval_steps_per_second": 0.589,
"eval_token_acc": 0.6300302199895188,
"step": 1850
},
{
"epoch": 1.371835150619109,
"grad_norm": 1.1238157971359715,
"learning_rate": 4.482998632529414e-06,
"loss": 1.0442536354064942,
"memory(GiB)": 74.63,
"step": 1855,
"token_acc": 0.6386843397152675,
"train_speed(iter/s)": 0.673362
},
{
"epoch": 1.3755313250785437,
"grad_norm": 0.9044341600768213,
"learning_rate": 4.434641144944464e-06,
"loss": 1.0640903472900392,
"memory(GiB)": 74.63,
"step": 1860,
"token_acc": 0.6587333602258976,
"train_speed(iter/s)": 0.655234
},
{
"epoch": 1.3792274995379783,
"grad_norm": 1.0166299256206919,
"learning_rate": 4.386471467194513e-06,
"loss": 1.0587308883666993,
"memory(GiB)": 74.63,
"step": 1865,
"token_acc": 0.6148590947907772,
"train_speed(iter/s)": 0.63915
},
{
"epoch": 1.3829236739974127,
"grad_norm": 1.2786373427724909,
"learning_rate": 4.338491224824198e-06,
"loss": 1.0438286781311035,
"memory(GiB)": 74.63,
"step": 1870,
"token_acc": 0.6332835077229696,
"train_speed(iter/s)": 0.625873
},
{
"epoch": 1.386619848456847,
"grad_norm": 1.0910902180920756,
"learning_rate": 4.290702036985423e-06,
"loss": 1.0352885246276855,
"memory(GiB)": 74.63,
"step": 1875,
"token_acc": 0.6918429003021148,
"train_speed(iter/s)": 0.610514
},
{
"epoch": 1.3903160229162816,
"grad_norm": 1.0540455114144576,
"learning_rate": 4.243105516382732e-06,
"loss": 1.0169889450073242,
"memory(GiB)": 74.63,
"step": 1880,
"token_acc": 0.6479912544411042,
"train_speed(iter/s)": 0.59628
},
{
"epoch": 1.3940121973757162,
"grad_norm": 1.0796012032362492,
"learning_rate": 4.1957032692188685e-06,
"loss": 1.0289284706115722,
"memory(GiB)": 74.63,
"step": 1885,
"token_acc": 0.6304772536980184,
"train_speed(iter/s)": 0.584845
},
{
"epoch": 1.3977083718351506,
"grad_norm": 0.9497813177866403,
"learning_rate": 4.148496895140586e-06,
"loss": 1.0058039665222167,
"memory(GiB)": 74.63,
"step": 1890,
"token_acc": 0.6662360034453058,
"train_speed(iter/s)": 0.572483
},
{
"epoch": 1.401404546294585,
"grad_norm": 0.9994791403674819,
"learning_rate": 4.101487987184658e-06,
"loss": 1.0271056175231934,
"memory(GiB)": 74.63,
"step": 1895,
"token_acc": 0.7174721189591078,
"train_speed(iter/s)": 0.559822
},
{
"epoch": 1.4051007207540196,
"grad_norm": 0.9675552310253457,
"learning_rate": 4.054678131724128e-06,
"loss": 1.0421775817871093,
"memory(GiB)": 74.63,
"step": 1900,
"token_acc": 0.6403071017274472,
"train_speed(iter/s)": 0.549398
},
{
"epoch": 1.4051007207540196,
"eval_loss": 0.6665124893188477,
"eval_runtime": 92.5325,
"eval_samples_per_second": 75.595,
"eval_steps_per_second": 0.594,
"eval_token_acc": 0.6305912175031224,
"step": 1900
},
{
"epoch": 1.4087968952134542,
"grad_norm": 0.9383388424277262,
"learning_rate": 4.008068908414764e-06,
"loss": 1.0390195846557617,
"memory(GiB)": 74.63,
"step": 1905,
"token_acc": 0.636108220603538,
"train_speed(iter/s)": 0.522161
},
{
"epoch": 1.4124930696728886,
"grad_norm": 1.0404355020365603,
"learning_rate": 3.961661890141756e-06,
"loss": 1.064806842803955,
"memory(GiB)": 74.63,
"step": 1910,
"token_acc": 0.5955159705159705,
"train_speed(iter/s)": 0.512861
},
{
"epoch": 1.416189244132323,
"grad_norm": 1.1641858092814779,
"learning_rate": 3.91545864296665e-06,
"loss": 1.0407491683959962,
"memory(GiB)": 74.63,
"step": 1915,
"token_acc": 0.6579710144927536,
"train_speed(iter/s)": 0.502749
},
{
"epoch": 1.4198854185917575,
"grad_norm": 0.9981716234289997,
"learning_rate": 3.8694607260744745e-06,
"loss": 1.0334474563598632,
"memory(GiB)": 74.63,
"step": 1920,
"token_acc": 0.6448382126348228,
"train_speed(iter/s)": 0.494436
},
{
"epoch": 1.4235815930511921,
"grad_norm": 1.0999406567886463,
"learning_rate": 3.8236696917211365e-06,
"loss": 1.0606246948242188,
"memory(GiB)": 74.63,
"step": 1925,
"token_acc": 0.6300940438871473,
"train_speed(iter/s)": 0.48651
},
{
"epoch": 1.4272777675106265,
"grad_norm": 1.0161660727647654,
"learning_rate": 3.7780870851810515e-06,
"loss": 1.076219654083252,
"memory(GiB)": 74.63,
"step": 1930,
"token_acc": 0.6260296540362438,
"train_speed(iter/s)": 0.477741
},
{
"epoch": 1.430973941970061,
"grad_norm": 0.9703902428924409,
"learning_rate": 3.7327144446949716e-06,
"loss": 1.0812992095947265,
"memory(GiB)": 74.63,
"step": 1935,
"token_acc": 0.630064591896653,
"train_speed(iter/s)": 0.470034
},
{
"epoch": 1.4346701164294955,
"grad_norm": 1.0947535810008933,
"learning_rate": 3.687553301418092e-06,
"loss": 1.0244592666625976,
"memory(GiB)": 74.63,
"step": 1940,
"token_acc": 0.6301992310380986,
"train_speed(iter/s)": 0.463221
},
{
"epoch": 1.43836629088893,
"grad_norm": 1.0200917528662774,
"learning_rate": 3.6426051793683724e-06,
"loss": 1.0360092163085937,
"memory(GiB)": 74.63,
"step": 1945,
"token_acc": 0.6446078431372549,
"train_speed(iter/s)": 0.45531
},
{
"epoch": 1.4420624653483645,
"grad_norm": 0.9670618590123606,
"learning_rate": 3.5978715953751207e-06,
"loss": 1.0297866821289063,
"memory(GiB)": 74.63,
"step": 1950,
"token_acc": 0.6481696687972109,
"train_speed(iter/s)": 0.448099
},
{
"epoch": 1.4420624653483645,
"eval_loss": 0.6662415862083435,
"eval_runtime": 87.5872,
"eval_samples_per_second": 79.863,
"eval_steps_per_second": 0.628,
"eval_token_acc": 0.6309225061582752,
"step": 1950
},
{
"epoch": 1.4457586398077988,
"grad_norm": 0.9880600888670725,
"learning_rate": 3.5533540590277882e-06,
"loss": 1.0223572731018067,
"memory(GiB)": 74.63,
"step": 1955,
"token_acc": 0.6359920144500428,
"train_speed(iter/s)": 0.430514
},
{
"epoch": 1.4494548142672334,
"grad_norm": 0.9593918073057777,
"learning_rate": 3.509054072625031e-06,
"loss": 1.0360115051269532,
"memory(GiB)": 74.63,
"step": 1960,
"token_acc": 0.6581899775617053,
"train_speed(iter/s)": 0.424799
},
{
"epoch": 1.453150988726668,
"grad_norm": 1.0289280788083641,
"learning_rate": 3.4649731311240276e-06,
"loss": 1.0378742218017578,
"memory(GiB)": 74.63,
"step": 1965,
"token_acc": 0.6424075531077892,
"train_speed(iter/s)": 0.418454
},
{
"epoch": 1.4568471631861024,
"grad_norm": 1.053788067984888,
"learning_rate": 3.4211127220900107e-06,
"loss": 1.0713199615478515,
"memory(GiB)": 74.63,
"step": 1970,
"token_acc": 0.632213608957795,
"train_speed(iter/s)": 0.412536
},
{
"epoch": 1.4605433376455368,
"grad_norm": 1.180153902117692,
"learning_rate": 3.377474325646074e-06,
"loss": 1.0560644149780274,
"memory(GiB)": 74.63,
"step": 1975,
"token_acc": 0.641423703142749,
"train_speed(iter/s)": 0.407398
},
{
"epoch": 1.4642395121049714,
"grad_norm": 0.8918348376917337,
"learning_rate": 3.334059414423233e-06,
"loss": 1.055532169342041,
"memory(GiB)": 74.63,
"step": 1980,
"token_acc": 0.668722786647315,
"train_speed(iter/s)": 0.401897
},
{
"epoch": 1.4679356865644058,
"grad_norm": 1.109026709845534,
"learning_rate": 3.2908694535107144e-06,
"loss": 1.027819538116455,
"memory(GiB)": 74.63,
"step": 1985,
"token_acc": 0.661387220098307,
"train_speed(iter/s)": 0.396281
},
{
"epoch": 1.4716318610238404,
"grad_norm": 1.0886246897973584,
"learning_rate": 3.247905900406523e-06,
"loss": 1.0191631317138672,
"memory(GiB)": 74.63,
"step": 1990,
"token_acc": 0.6097883597883598,
"train_speed(iter/s)": 0.391566
},
{
"epoch": 1.4753280354832747,
"grad_norm": 1.0630977460263966,
"learning_rate": 3.2051702049682554e-06,
"loss": 1.042071533203125,
"memory(GiB)": 74.63,
"step": 1995,
"token_acc": 0.6236017897091722,
"train_speed(iter/s)": 0.386682
},
{
"epoch": 1.4790242099427093,
"grad_norm": 1.1953007407214893,
"learning_rate": 3.162663809364178e-06,
"loss": 1.0401007652282714,
"memory(GiB)": 74.63,
"step": 2000,
"token_acc": 0.6173344235486509,
"train_speed(iter/s)": 0.381535
},
{
"epoch": 1.4790242099427093,
"eval_loss": 0.6649311184883118,
"eval_runtime": 83.4819,
"eval_samples_per_second": 83.791,
"eval_steps_per_second": 0.659,
"eval_token_acc": 0.63089018531387,
"step": 2000
},
{
"epoch": 1.4827203844021437,
"grad_norm": 1.0030060203161786,
"learning_rate": 3.120388148024548e-06,
"loss": 1.0528248786926269,
"memory(GiB)": 74.63,
"step": 2005,
"token_acc": 0.6302038823098522,
"train_speed(iter/s)": 0.368939
},
{
"epoch": 1.4864165588615783,
"grad_norm": 1.1306385027348749,
"learning_rate": 3.0783446475932145e-06,
"loss": 1.061046028137207,
"memory(GiB)": 74.63,
"step": 2010,
"token_acc": 0.6473043478260869,
"train_speed(iter/s)": 0.364909
},
{
"epoch": 1.4901127333210127,
"grad_norm": 1.0935000761259253,
"learning_rate": 3.036534726879473e-06,
"loss": 1.0255512237548827,
"memory(GiB)": 74.63,
"step": 2015,
"token_acc": 0.65625,
"train_speed(iter/s)": 0.360903
},
{
"epoch": 1.4938089077804473,
"grad_norm": 1.088331528861988,
"learning_rate": 2.9949597968101883e-06,
"loss": 1.0589797973632813,
"memory(GiB)": 74.63,
"step": 2020,
"token_acc": 0.6325940212150434,
"train_speed(iter/s)": 0.356624
},
{
"epoch": 1.4975050822398817,
"grad_norm": 1.0677052287012947,
"learning_rate": 2.953621260382171e-06,
"loss": 1.0519143104553224,
"memory(GiB)": 74.63,
"step": 2025,
"token_acc": 0.6626557799742158,
"train_speed(iter/s)": 0.352723
},
{
"epoch": 1.5012012566993163,
"grad_norm": 0.9383180241618552,
"learning_rate": 2.9125205126148535e-06,
"loss": 1.031491470336914,
"memory(GiB)": 74.63,
"step": 2030,
"token_acc": 0.6123364485981309,
"train_speed(iter/s)": 0.349069
},
{
"epoch": 1.5048974311587506,
"grad_norm": 1.0487719308291952,
"learning_rate": 2.871658940503188e-06,
"loss": 1.024942398071289,
"memory(GiB)": 74.63,
"step": 2035,
"token_acc": 0.6477366255144033,
"train_speed(iter/s)": 0.345173
},
{
"epoch": 1.5085936056181852,
"grad_norm": 1.0789502013849968,
"learning_rate": 2.831037922970855e-06,
"loss": 1.0276554107666016,
"memory(GiB)": 74.63,
"step": 2040,
"token_acc": 0.6695604991861096,
"train_speed(iter/s)": 0.341604
},
{
"epoch": 1.5122897800776198,
"grad_norm": 1.0851618366990563,
"learning_rate": 2.7906588308237228e-06,
"loss": 1.027616596221924,
"memory(GiB)": 74.63,
"step": 2045,
"token_acc": 0.7097625329815304,
"train_speed(iter/s)": 0.338222
},
{
"epoch": 1.5159859545370542,
"grad_norm": 0.9179924796471817,
"learning_rate": 2.7505230267036032e-06,
"loss": 1.0497385025024415,
"memory(GiB)": 74.63,
"step": 2050,
"token_acc": 0.5937649880095923,
"train_speed(iter/s)": 0.334489
},
{
"epoch": 1.5159859545370542,
"eval_loss": 0.6642535328865051,
"eval_runtime": 85.9904,
"eval_samples_per_second": 81.346,
"eval_steps_per_second": 0.64,
"eval_token_acc": 0.6313195908181098,
"step": 2050
},
{
"epoch": 1.5196821289964886,
"grad_norm": 1.0681296921147372,
"learning_rate": 2.7106318650422447e-06,
"loss": 1.0099181175231933,
"memory(GiB)": 74.63,
"step": 2055,
"token_acc": 0.6372694090953931,
"train_speed(iter/s)": 0.325208
},
{
"epoch": 1.5233783034559232,
"grad_norm": 1.1164983354073834,
"learning_rate": 2.6709866920156434e-06,
"loss": 1.0027360916137695,
"memory(GiB)": 74.63,
"step": 2060,
"token_acc": 0.631484794275492,
"train_speed(iter/s)": 0.321919
},
{
"epoch": 1.5270744779153576,
"grad_norm": 0.9417253538259095,
"learning_rate": 2.6315888454986017e-06,
"loss": 1.0374462127685546,
"memory(GiB)": 74.63,
"step": 2065,
"token_acc": 0.6671586715867158,
"train_speed(iter/s)": 0.319024
},
{
"epoch": 1.530770652374792,
"grad_norm": 1.1095932914113171,
"learning_rate": 2.5924396550195986e-06,
"loss": 1.03175687789917,
"memory(GiB)": 74.63,
"step": 2070,
"token_acc": 0.6316007454959619,
"train_speed(iter/s)": 0.315819
},
{
"epoch": 1.5344668268342265,
"grad_norm": 1.0582702932147185,
"learning_rate": 2.5535404417159002e-06,
"loss": 1.0430908203125,
"memory(GiB)": 74.63,
"step": 2075,
"token_acc": 0.6477673325499412,
"train_speed(iter/s)": 0.312805
},
{
"epoch": 1.5381630012936611,
"grad_norm": 1.0515415830247143,
"learning_rate": 2.514892518288988e-06,
"loss": 1.0108471870422364,
"memory(GiB)": 74.63,
"step": 2080,
"token_acc": 0.6291390728476821,
"train_speed(iter/s)": 0.310115
},
{
"epoch": 1.5418591757530955,
"grad_norm": 1.018793664843126,
"learning_rate": 2.4764971889602705e-06,
"loss": 1.0460142135620116,
"memory(GiB)": 74.63,
"step": 2085,
"token_acc": 0.6321537789427698,
"train_speed(iter/s)": 0.307239
},
{
"epoch": 1.5455553502125299,
"grad_norm": 1.0684231311720556,
"learning_rate": 2.4383557494270483e-06,
"loss": 1.03402099609375,
"memory(GiB)": 74.63,
"step": 2090,
"token_acc": 0.6098130841121495,
"train_speed(iter/s)": 0.304401
},
{
"epoch": 1.5492515246719645,
"grad_norm": 1.1947182692900764,
"learning_rate": 2.400469486818803e-06,
"loss": 1.0426679611206056,
"memory(GiB)": 74.63,
"step": 2095,
"token_acc": 0.6819553409776705,
"train_speed(iter/s)": 0.301883
},
{
"epoch": 1.552947699131399,
"grad_norm": 1.1961503070894741,
"learning_rate": 2.3628396796537588e-06,
"loss": 1.0395529747009278,
"memory(GiB)": 74.63,
"step": 2100,
"token_acc": 0.6641014033499321,
"train_speed(iter/s)": 0.299223
},
{
"epoch": 1.552947699131399,
"eval_loss": 0.6638895273208618,
"eval_runtime": 88.4322,
"eval_samples_per_second": 79.1,
"eval_steps_per_second": 0.622,
"eval_token_acc": 0.6315920093638103,
"step": 2100
},
{
"epoch": 1.5566438735908334,
"grad_norm": 1.0207020501497954,
"learning_rate": 2.325467597795745e-06,
"loss": 1.0622333526611327,
"memory(GiB)": 74.63,
"step": 2105,
"token_acc": 0.638814317673378,
"train_speed(iter/s)": 0.291998
},
{
"epoch": 1.5603400480502678,
"grad_norm": 1.1172734543264464,
"learning_rate": 2.2883545024113263e-06,
"loss": 1.0403221130371094,
"memory(GiB)": 74.63,
"step": 2110,
"token_acc": 0.6622971285892634,
"train_speed(iter/s)": 0.289437
},
{
"epoch": 1.5640362225097024,
"grad_norm": 1.0571335154576122,
"learning_rate": 2.251501645927253e-06,
"loss": 1.0463993072509765,
"memory(GiB)": 74.63,
"step": 2115,
"token_acc": 0.636986301369863,
"train_speed(iter/s)": 0.28714
},
{
"epoch": 1.567732396969137,
"grad_norm": 0.9556270442029375,
"learning_rate": 2.2149102719882044e-06,
"loss": 1.0251903533935547,
"memory(GiB)": 74.63,
"step": 2120,
"token_acc": 0.647495361781076,
"train_speed(iter/s)": 0.284896
},
{
"epoch": 1.5714285714285714,
"grad_norm": 1.041918735454562,
"learning_rate": 2.178581615414802e-06,
"loss": 1.0483660697937012,
"memory(GiB)": 74.63,
"step": 2125,
"token_acc": 0.5842217484008528,
"train_speed(iter/s)": 0.282449
},
{
"epoch": 1.5751247458880058,
"grad_norm": 1.0827410972952385,
"learning_rate": 2.1425169021619518e-06,
"loss": 1.0664111137390138,
"memory(GiB)": 74.63,
"step": 2130,
"token_acc": 0.6472742066720911,
"train_speed(iter/s)": 0.280246
},
{
"epoch": 1.5788209203474404,
"grad_norm": 1.0343519334837508,
"learning_rate": 2.106717349277475e-06,
"loss": 1.0448074340820312,
"memory(GiB)": 74.63,
"step": 2135,
"token_acc": 0.6223404255319149,
"train_speed(iter/s)": 0.278222
},
{
"epoch": 1.582517094806875,
"grad_norm": 0.9536359374215565,
"learning_rate": 2.0711841648610254e-06,
"loss": 1.0621306419372558,
"memory(GiB)": 74.63,
"step": 2140,
"token_acc": 0.6342119419042496,
"train_speed(iter/s)": 0.276006
},
{
"epoch": 1.5862132692663093,
"grad_norm": 1.072455338512947,
"learning_rate": 2.03591854802333e-06,
"loss": 1.0556835174560546,
"memory(GiB)": 74.63,
"step": 2145,
"token_acc": 0.7222898903775883,
"train_speed(iter/s)": 0.27386
},
{
"epoch": 1.5899094437257437,
"grad_norm": 1.0210760479008887,
"learning_rate": 2.0009216888457206e-06,
"loss": 1.0253107070922851,
"memory(GiB)": 74.63,
"step": 2150,
"token_acc": 0.6356216994719155,
"train_speed(iter/s)": 0.271885
},
{
"epoch": 1.5899094437257437,
"eval_loss": 0.6611568927764893,
"eval_runtime": 89.4271,
"eval_samples_per_second": 78.22,
"eval_steps_per_second": 0.615,
"eval_token_acc": 0.6316070154701413,
"step": 2150
},
{
"epoch": 1.5936056181851783,
"grad_norm": 1.062555004377403,
"learning_rate": 1.966194768339974e-06,
"loss": 1.049751091003418,
"memory(GiB)": 74.63,
"step": 2155,
"token_acc": 0.6423422284052106,
"train_speed(iter/s)": 0.266286
},
{
"epoch": 1.597301792644613,
"grad_norm": 0.8629319967495109,
"learning_rate": 1.931738958408457e-06,
"loss": 1.0435371398925781,
"memory(GiB)": 74.63,
"step": 2160,
"token_acc": 0.6290619251992643,
"train_speed(iter/s)": 0.264247
},
{
"epoch": 1.6009979671040473,
"grad_norm": 0.955539932162413,
"learning_rate": 1.8975554218045733e-06,
"loss": 1.0308834075927735,
"memory(GiB)": 74.63,
"step": 2165,
"token_acc": 0.6610537751222162,
"train_speed(iter/s)": 0.262351
},
{
"epoch": 1.6046941415634817,
"grad_norm": 0.9624917837200193,
"learning_rate": 1.8636453120935428e-06,
"loss": 1.0461854934692383,
"memory(GiB)": 74.63,
"step": 2170,
"token_acc": 0.7152838427947599,
"train_speed(iter/s)": 0.260619
},
{
"epoch": 1.6083903160229163,
"grad_norm": 1.1677655720128766,
"learning_rate": 1.8300097736134482e-06,
"loss": 1.0363172531127929,
"memory(GiB)": 74.63,
"step": 2175,
"token_acc": 0.6848798869524259,
"train_speed(iter/s)": 0.258828
},
{
"epoch": 1.6120864904823509,
"grad_norm": 1.060280622494465,
"learning_rate": 1.796649941436638e-06,
"loss": 1.0246556282043457,
"memory(GiB)": 74.63,
"step": 2180,
"token_acc": 0.6469820554649266,
"train_speed(iter/s)": 0.256928
},
{
"epoch": 1.6157826649417852,
"grad_norm": 0.9704555618707196,
"learning_rate": 1.7635669413314082e-06,
"loss": 1.0577556610107421,
"memory(GiB)": 74.63,
"step": 2185,
"token_acc": 0.698159509202454,
"train_speed(iter/s)": 0.255252
},
{
"epoch": 1.6194788394012196,
"grad_norm": 0.9786880620172256,
"learning_rate": 1.7307618897240274e-06,
"loss": 1.0526361465454102,
"memory(GiB)": 74.63,
"step": 2190,
"token_acc": 0.6385869565217391,
"train_speed(iter/s)": 0.253488
},
{
"epoch": 1.6231750138606542,
"grad_norm": 0.9744613679129237,
"learning_rate": 1.6982358936610454e-06,
"loss": 1.075265598297119,
"memory(GiB)": 74.63,
"step": 2195,
"token_acc": 0.6133072407045009,
"train_speed(iter/s)": 0.251735
},
{
"epoch": 1.6268711883200888,
"grad_norm": 1.0120755932892964,
"learning_rate": 1.6659900507719406e-06,
"loss": 1.064041519165039,
"memory(GiB)": 74.63,
"step": 2200,
"token_acc": 0.648406731113498,
"train_speed(iter/s)": 0.250141
},
{
"epoch": 1.6268711883200888,
"eval_loss": 0.6599572896957397,
"eval_runtime": 90.9305,
"eval_samples_per_second": 76.927,
"eval_steps_per_second": 0.605,
"eval_token_acc": 0.6317178297938161,
"step": 2200
},
{
"epoch": 1.6305673627795232,
"grad_norm": 1.0770018414163383,
"learning_rate": 1.6340254492320873e-06,
"loss": 1.0508115768432618,
"memory(GiB)": 74.63,
"step": 2205,
"token_acc": 0.6418808091853472,
"train_speed(iter/s)": 0.245446
},
{
"epoch": 1.6342635372389576,
"grad_norm": 1.0511841490808227,
"learning_rate": 1.6023431677260215e-06,
"loss": 1.0454177856445312,
"memory(GiB)": 74.63,
"step": 2210,
"token_acc": 0.6532779316712835,
"train_speed(iter/s)": 0.243859
},
{
"epoch": 1.6379597116983922,
"grad_norm": 0.9098679876928407,
"learning_rate": 1.570944275411046e-06,
"loss": 1.0668581962585448,
"memory(GiB)": 74.63,
"step": 2215,
"token_acc": 0.6121688741721855,
"train_speed(iter/s)": 0.242235
},
{
"epoch": 1.6416558861578268,
"grad_norm": 1.0127053695015762,
"learning_rate": 1.5398298318811467e-06,
"loss": 1.0175441741943358,
"memory(GiB)": 74.63,
"step": 2220,
"token_acc": 0.6991780821917808,
"train_speed(iter/s)": 0.240782
},
{
"epoch": 1.6453520606172611,
"grad_norm": 1.1031573706590774,
"learning_rate": 1.5090008871312433e-06,
"loss": 1.0165956497192383,
"memory(GiB)": 74.63,
"step": 2225,
"token_acc": 0.6685121107266436,
"train_speed(iter/s)": 0.23932
},
{
"epoch": 1.6490482350766955,
"grad_norm": 1.0502782153731651,
"learning_rate": 1.4784584815217452e-06,
"loss": 1.0456388473510743,
"memory(GiB)": 74.63,
"step": 2230,
"token_acc": 0.6672802577082375,
"train_speed(iter/s)": 0.237824
},
{
"epoch": 1.65274440953613,
"grad_norm": 1.003637672944472,
"learning_rate": 1.448203645743449e-06,
"loss": 1.0287794113159179,
"memory(GiB)": 74.63,
"step": 2235,
"token_acc": 0.6663223140495868,
"train_speed(iter/s)": 0.236377
},
{
"epoch": 1.6564405839955647,
"grad_norm": 1.037599542215698,
"learning_rate": 1.4182374007827605e-06,
"loss": 1.0127573013305664,
"memory(GiB)": 74.63,
"step": 2240,
"token_acc": 0.6325656132833423,
"train_speed(iter/s)": 0.235012
},
{
"epoch": 1.660136758454999,
"grad_norm": 0.9940434532315588,
"learning_rate": 1.3885607578872295e-06,
"loss": 1.0367406845092773,
"memory(GiB)": 74.63,
"step": 2245,
"token_acc": 0.6187350835322196,
"train_speed(iter/s)": 0.233574
},
{
"epoch": 1.6638329329144335,
"grad_norm": 0.9200899712617193,
"learning_rate": 1.3591747185314342e-06,
"loss": 1.0550609588623048,
"memory(GiB)": 74.63,
"step": 2250,
"token_acc": 0.6650768415474297,
"train_speed(iter/s)": 0.232175
},
{
"epoch": 1.6638329329144335,
"eval_loss": 0.6586793661117554,
"eval_runtime": 87.5544,
"eval_samples_per_second": 79.893,
"eval_steps_per_second": 0.628,
"eval_token_acc": 0.6320167976045638,
"step": 2250
},
{
"epoch": 1.667529107373868,
"grad_norm": 1.0426265291319847,
"learning_rate": 1.3300802743831786e-06,
"loss": 1.0567312240600586,
"memory(GiB)": 74.63,
"step": 2255,
"token_acc": 0.6444471182769823,
"train_speed(iter/s)": 0.228232
},
{
"epoch": 1.6712252818333027,
"grad_norm": 1.1036330750940702,
"learning_rate": 1.3012784072700335e-06,
"loss": 1.0163141250610352,
"memory(GiB)": 74.63,
"step": 2260,
"token_acc": 0.6361031518624641,
"train_speed(iter/s)": 0.226993
},
{
"epoch": 1.674921456292737,
"grad_norm": 1.034827646235815,
"learning_rate": 1.272770089146199e-06,
"loss": 1.042106819152832,
"memory(GiB)": 74.63,
"step": 2265,
"token_acc": 0.6615910503418272,
"train_speed(iter/s)": 0.225676
},
{
"epoch": 1.6786176307521714,
"grad_norm": 0.9379338873318531,
"learning_rate": 1.2445562820597035e-06,
"loss": 1.056378173828125,
"memory(GiB)": 74.63,
"step": 2270,
"token_acc": 0.6658767772511849,
"train_speed(iter/s)": 0.22441
},
{
"epoch": 1.682313805211606,
"grad_norm": 1.018955540383726,
"learning_rate": 1.2166379381199423e-06,
"loss": 1.024850082397461,
"memory(GiB)": 74.63,
"step": 2275,
"token_acc": 0.6339022954679223,
"train_speed(iter/s)": 0.223236
},
{
"epoch": 1.6860099796710406,
"grad_norm": 0.9387152975257087,
"learning_rate": 1.1890159994655425e-06,
"loss": 1.0364057540893554,
"memory(GiB)": 74.63,
"step": 2280,
"token_acc": 0.6378887070376432,
"train_speed(iter/s)": 0.221993
},
{
"epoch": 1.689706154130475,
"grad_norm": 0.9517285751951058,
"learning_rate": 1.1616913982325827e-06,
"loss": 1.0173322677612304,
"memory(GiB)": 74.63,
"step": 2285,
"token_acc": 0.63408913213448,
"train_speed(iter/s)": 0.220748
},
{
"epoch": 1.6934023285899094,
"grad_norm": 1.1148106388917103,
"learning_rate": 1.1346650565231165e-06,
"loss": 1.0427886962890625,
"memory(GiB)": 74.63,
"step": 2290,
"token_acc": 0.640251572327044,
"train_speed(iter/s)": 0.219605
},
{
"epoch": 1.697098503049344,
"grad_norm": 1.1256757463038873,
"learning_rate": 1.1079378863740686e-06,
"loss": 1.0264497756958009,
"memory(GiB)": 74.63,
"step": 2295,
"token_acc": 0.6556603773584906,
"train_speed(iter/s)": 0.21844
},
{
"epoch": 1.7007946775087786,
"grad_norm": 1.0466875757106615,
"learning_rate": 1.0815107897264555e-06,
"loss": 1.0546932220458984,
"memory(GiB)": 74.63,
"step": 2300,
"token_acc": 0.6179956896551724,
"train_speed(iter/s)": 0.217293
},
{
"epoch": 1.7007946775087786,
"eval_loss": 0.6585622429847717,
"eval_runtime": 86.3947,
"eval_samples_per_second": 80.966,
"eval_steps_per_second": 0.637,
"eval_token_acc": 0.6323053765724668,
"step": 2300
},
{
"epoch": 1.704490851968213,
"grad_norm": 0.9844022346926911,
"learning_rate": 1.0553846583949424e-06,
"loss": 1.0470151901245117,
"memory(GiB)": 74.63,
"step": 2305,
"token_acc": 0.638003355704698,
"train_speed(iter/s)": 0.213982
},
{
"epoch": 1.7081870264276473,
"grad_norm": 1.002842594215214,
"learning_rate": 1.0295603740377591e-06,
"loss": 1.0518400192260742,
"memory(GiB)": 74.63,
"step": 2310,
"token_acc": 0.6883333333333334,
"train_speed(iter/s)": 0.212941
},
{
"epoch": 1.711883200887082,
"grad_norm": 0.996843923558558,
"learning_rate": 1.0040388081269336e-06,
"loss": 1.028696632385254,
"memory(GiB)": 74.63,
"step": 2315,
"token_acc": 0.6513243595310465,
"train_speed(iter/s)": 0.211922
},
{
"epoch": 1.7155793753465165,
"grad_norm": 1.0304373058907095,
"learning_rate": 9.788208219188932e-07,
"loss": 1.0363618850708007,
"memory(GiB)": 74.63,
"step": 2320,
"token_acc": 0.6015075376884422,
"train_speed(iter/s)": 0.210816
},
{
"epoch": 1.7192755498059509,
"grad_norm": 1.0716438374724575,
"learning_rate": 9.539072664254e-07,
"loss": 1.065016269683838,
"memory(GiB)": 74.63,
"step": 2325,
"token_acc": 0.6122448979591837,
"train_speed(iter/s)": 0.20983
},
{
"epoch": 1.7229717242653853,
"grad_norm": 1.044849619522368,
"learning_rate": 9.292989823848242e-07,
"loss": 1.0461166381835938,
"memory(GiB)": 74.63,
"step": 2330,
"token_acc": 0.6681818181818182,
"train_speed(iter/s)": 0.208847
},
{
"epoch": 1.7266678987248198,
"grad_norm": 0.9749773034536726,
"learning_rate": 9.049968002337805e-07,
"loss": 1.0064781188964844,
"memory(GiB)": 74.63,
"step": 2335,
"token_acc": 0.6454869358669834,
"train_speed(iter/s)": 0.207824
},
{
"epoch": 1.7303640731842544,
"grad_norm": 1.0478755901703891,
"learning_rate": 8.810015400790994e-07,
"loss": 1.0341422080993652,
"memory(GiB)": 74.63,
"step": 2340,
"token_acc": 0.6453608247422681,
"train_speed(iter/s)": 0.20687
},
{
"epoch": 1.7340602476436888,
"grad_norm": 1.161076200769703,
"learning_rate": 8.573140116701573e-07,
"loss": 1.031747531890869,
"memory(GiB)": 74.63,
"step": 2345,
"token_acc": 0.633889077917659,
"train_speed(iter/s)": 0.205935
},
{
"epoch": 1.7377564221031232,
"grad_norm": 1.0465828745420171,
"learning_rate": 8.339350143715452e-07,
"loss": 1.026121234893799,
"memory(GiB)": 74.63,
"step": 2350,
"token_acc": 0.6303341902313625,
"train_speed(iter/s)": 0.204941
},
{
"epoch": 1.7377564221031232,
"eval_loss": 0.6579257845878601,
"eval_runtime": 85.2135,
"eval_samples_per_second": 82.088,
"eval_steps_per_second": 0.645,
"eval_token_acc": 0.6323030679407236,
"step": 2350
},
{
"epoch": 1.7414525965625578,
"grad_norm": 0.9803139837060567,
"learning_rate": 8.108653371360897e-07,
"loss": 1.0249688148498535,
"memory(GiB)": 74.63,
"step": 2355,
"token_acc": 0.6329644032306312,
"train_speed(iter/s)": 0.202068
},
{
"epoch": 1.7451487710219924,
"grad_norm": 1.0773968420983469,
"learning_rate": 7.881057584782448e-07,
"loss": 1.014153003692627,
"memory(GiB)": 74.63,
"step": 2360,
"token_acc": 0.6533575317604355,
"train_speed(iter/s)": 0.201155
},
{
"epoch": 1.7488449454814268,
"grad_norm": 1.0060807449724751,
"learning_rate": 7.656570464477997e-07,
"loss": 1.041685199737549,
"memory(GiB)": 74.63,
"step": 2365,
"token_acc": 0.6260771824653428,
"train_speed(iter/s)": 0.20029
},
{
"epoch": 1.7525411199408611,
"grad_norm": 0.9990872446739557,
"learning_rate": 7.435199586039721e-07,
"loss": 1.025881576538086,
"memory(GiB)": 74.63,
"step": 2370,
"token_acc": 0.6330558125192722,
"train_speed(iter/s)": 0.199385
},
{
"epoch": 1.7562372944002957,
"grad_norm": 1.0713164560199164,
"learning_rate": 7.216952419898393e-07,
"loss": 1.0439919471740722,
"memory(GiB)": 74.63,
"step": 2375,
"token_acc": 0.6618962432915921,
"train_speed(iter/s)": 0.198497
},
{
"epoch": 1.7599334688597303,
"grad_norm": 1.0964714966010252,
"learning_rate": 7.001836331071365e-07,
"loss": 1.0411014556884766,
"memory(GiB)": 74.63,
"step": 2380,
"token_acc": 0.6824512534818942,
"train_speed(iter/s)": 0.197623
},
{
"epoch": 1.7636296433191647,
"grad_norm": 0.9737095253362634,
"learning_rate": 6.789858578913877e-07,
"loss": 1.0455976486206056,
"memory(GiB)": 74.63,
"step": 2385,
"token_acc": 0.6538119252447345,
"train_speed(iter/s)": 0.196798
},
{
"epoch": 1.767325817778599,
"grad_norm": 1.0585968573237603,
"learning_rate": 6.581026316874184e-07,
"loss": 1.0437522888183595,
"memory(GiB)": 74.63,
"step": 2390,
"token_acc": 0.6448377581120944,
"train_speed(iter/s)": 0.195944
},
{
"epoch": 1.7710219922380337,
"grad_norm": 0.9930747477126893,
"learning_rate": 6.375346592252174e-07,
"loss": 1.035786247253418,
"memory(GiB)": 74.63,
"step": 2395,
"token_acc": 0.6269207129686539,
"train_speed(iter/s)": 0.195132
},
{
"epoch": 1.774718166697468,
"grad_norm": 0.9303672570135261,
"learning_rate": 6.17282634596148e-07,
"loss": 1.0481432914733886,
"memory(GiB)": 74.63,
"step": 2400,
"token_acc": 0.6504672897196262,
"train_speed(iter/s)": 0.194323
},
{
"epoch": 1.774718166697468,
"eval_loss": 0.6574872136116028,
"eval_runtime": 88.3048,
"eval_samples_per_second": 79.214,
"eval_steps_per_second": 0.623,
"eval_token_acc": 0.6324069563691687,
"step": 2400
},
{
"epoch": 1.7784143411569024,
"grad_norm": 1.0622942862025062,
"learning_rate": 5.973472412295256e-07,
"loss": 1.019943618774414,
"memory(GiB)": 74.63,
"step": 2405,
"token_acc": 0.631666271628348,
"train_speed(iter/s)": 0.191801
},
{
"epoch": 1.782110515616337,
"grad_norm": 1.0141126472853548,
"learning_rate": 5.777291518695593e-07,
"loss": 1.0454243659973144,
"memory(GiB)": 74.63,
"step": 2410,
"token_acc": 0.6077097505668935,
"train_speed(iter/s)": 0.191007
},
{
"epoch": 1.7858066900757716,
"grad_norm": 1.0733746716133248,
"learning_rate": 5.584290285526473e-07,
"loss": 1.036181640625,
"memory(GiB)": 74.63,
"step": 2415,
"token_acc": 0.671865626874625,
"train_speed(iter/s)": 0.190213
},
{
"epoch": 1.789502864535206,
"grad_norm": 1.011543008247962,
"learning_rate": 5.394475225850338e-07,
"loss": 1.0618670463562012,
"memory(GiB)": 74.63,
"step": 2420,
"token_acc": 0.6783405172413793,
"train_speed(iter/s)": 0.189455
},
{
"epoch": 1.7931990389946404,
"grad_norm": 0.9605401301883022,
"learning_rate": 5.207852745208298e-07,
"loss": 0.9933710098266602,
"memory(GiB)": 74.63,
"step": 2425,
"token_acc": 0.6471641791044777,
"train_speed(iter/s)": 0.188704
},
{
"epoch": 1.796895213454075,
"grad_norm": 1.1008101055992277,
"learning_rate": 5.024429141404019e-07,
"loss": 0.999241828918457,
"memory(GiB)": 74.63,
"step": 2430,
"token_acc": 0.6457304163726182,
"train_speed(iter/s)": 0.187948
},
{
"epoch": 1.8005913879135096,
"grad_norm": 0.935629646034127,
"learning_rate": 4.844210604291155e-07,
"loss": 1.018147087097168,
"memory(GiB)": 74.63,
"step": 2435,
"token_acc": 0.6178369652945924,
"train_speed(iter/s)": 0.187233
},
{
"epoch": 1.804287562372944,
"grad_norm": 0.9808937018928983,
"learning_rate": 4.667203215564431e-07,
"loss": 1.0448846817016602,
"memory(GiB)": 74.63,
"step": 2440,
"token_acc": 0.6323092170465807,
"train_speed(iter/s)": 0.186484
},
{
"epoch": 1.8079837368323783,
"grad_norm": 1.0392559529080805,
"learning_rate": 4.493412948554454e-07,
"loss": 1.0690251350402833,
"memory(GiB)": 74.63,
"step": 2445,
"token_acc": 0.6409416581371545,
"train_speed(iter/s)": 0.185763
},
{
"epoch": 1.811679911291813,
"grad_norm": 1.0661940200914148,
"learning_rate": 4.3228456680261877e-07,
"loss": 1.0110756874084472,
"memory(GiB)": 74.63,
"step": 2450,
"token_acc": 0.649331352154532,
"train_speed(iter/s)": 0.185079
},
{
"epoch": 1.811679911291813,
"eval_loss": 0.6571330428123474,
"eval_runtime": 89.3337,
"eval_samples_per_second": 78.302,
"eval_steps_per_second": 0.616,
"eval_token_acc": 0.6325177706928434,
"step": 2450
},
{
"epoch": 1.8153760857512475,
"grad_norm": 0.9674903681690532,
"learning_rate": 4.155507129980907e-07,
"loss": 1.0614801406860352,
"memory(GiB)": 74.63,
"step": 2455,
"token_acc": 0.6478157805621402,
"train_speed(iter/s)": 0.182858
},
{
"epoch": 1.819072260210682,
"grad_norm": 1.023357784734463,
"learning_rate": 3.991402981462045e-07,
"loss": 1.0087343215942384,
"memory(GiB)": 74.63,
"step": 2460,
"token_acc": 0.6711140760507005,
"train_speed(iter/s)": 0.182142
},
{
"epoch": 1.8227684346701163,
"grad_norm": 1.1735912820708456,
"learning_rate": 3.8305387603646324e-07,
"loss": 1.0243083953857421,
"memory(GiB)": 74.63,
"step": 2465,
"token_acc": 0.6599799398194583,
"train_speed(iter/s)": 0.181445
},
{
"epoch": 1.8264646091295509,
"grad_norm": 1.0193068857696008,
"learning_rate": 3.6729198952483725e-07,
"loss": 1.032374095916748,
"memory(GiB)": 74.63,
"step": 2470,
"token_acc": 0.6700460829493088,
"train_speed(iter/s)": 0.180793
},
{
"epoch": 1.8301607835889855,
"grad_norm": 0.989197160358902,
"learning_rate": 3.5185517051544494e-07,
"loss": 1.053987693786621,
"memory(GiB)": 74.63,
"step": 2475,
"token_acc": 0.6859160781055256,
"train_speed(iter/s)": 0.180141
},
{
"epoch": 1.8338569580484199,
"grad_norm": 1.0596386275791907,
"learning_rate": 3.367439399426087e-07,
"loss": 1.0508078575134276,
"memory(GiB)": 74.63,
"step": 2480,
"token_acc": 0.6111356606274856,
"train_speed(iter/s)": 0.179489
},
{
"epoch": 1.8375531325078542,
"grad_norm": 1.0148900997448214,
"learning_rate": 3.219588077532687e-07,
"loss": 1.0556805610656739,
"memory(GiB)": 74.63,
"step": 2485,
"token_acc": 0.6928414901387875,
"train_speed(iter/s)": 0.178863
},
{
"epoch": 1.8412493069672888,
"grad_norm": 0.9468756707473351,
"learning_rate": 3.075002728897747e-07,
"loss": 1.0154769897460938,
"memory(GiB)": 74.63,
"step": 2490,
"token_acc": 0.6334152334152334,
"train_speed(iter/s)": 0.178234
},
{
"epoch": 1.8449454814267234,
"grad_norm": 0.9178809513706729,
"learning_rate": 2.933688232730536e-07,
"loss": 1.0376591682434082,
"memory(GiB)": 74.63,
"step": 2495,
"token_acc": 0.6742112482853223,
"train_speed(iter/s)": 0.177603
},
{
"epoch": 1.8486416558861578,
"grad_norm": 1.0627891032300194,
"learning_rate": 2.79564935786143e-07,
"loss": 1.0138132095336914,
"memory(GiB)": 74.63,
"step": 2500,
"token_acc": 0.6157316041725401,
"train_speed(iter/s)": 0.176992
},
{
"epoch": 1.8486416558861578,
"eval_loss": 0.6568954586982727,
"eval_runtime": 89.4508,
"eval_samples_per_second": 78.199,
"eval_steps_per_second": 0.615,
"eval_token_acc": 0.632513153429357,
"step": 2500
},
{
"epoch": 1.8523378303455922,
"grad_norm": 1.0872052595724289,
"learning_rate": 2.660890762580903e-07,
"loss": 1.0546483993530273,
"memory(GiB)": 74.63,
"step": 2505,
"token_acc": 0.6424242424242425,
"train_speed(iter/s)": 0.175004
},
{
"epoch": 1.8560340048050268,
"grad_norm": 1.1281209574136644,
"learning_rate": 2.5294169944824254e-07,
"loss": 1.0317713737487793,
"memory(GiB)": 74.63,
"step": 2510,
"token_acc": 0.6293388429752066,
"train_speed(iter/s)": 0.174416
},
{
"epoch": 1.8597301792644614,
"grad_norm": 0.8926816061055212,
"learning_rate": 2.401232490308969e-07,
"loss": 1.048653793334961,
"memory(GiB)": 74.63,
"step": 2515,
"token_acc": 0.6237929702587872,
"train_speed(iter/s)": 0.173811
},
{
"epoch": 1.8634263537238958,
"grad_norm": 1.0912285805001078,
"learning_rate": 2.2763415758032316e-07,
"loss": 1.0199008941650392,
"memory(GiB)": 74.63,
"step": 2520,
"token_acc": 0.632258064516129,
"train_speed(iter/s)": 0.173239
},
{
"epoch": 1.8671225281833301,
"grad_norm": 1.0989085317685814,
"learning_rate": 2.1547484655617513e-07,
"loss": 1.010093879699707,
"memory(GiB)": 74.63,
"step": 2525,
"token_acc": 0.6342616920651603,
"train_speed(iter/s)": 0.172675
},
{
"epoch": 1.8708187026427647,
"grad_norm": 1.0229802711909943,
"learning_rate": 2.0364572628925993e-07,
"loss": 1.0246079444885254,
"memory(GiB)": 74.63,
"step": 2530,
"token_acc": 0.717948717948718,
"train_speed(iter/s)": 0.172113
},
{
"epoch": 1.8745148771021993,
"grad_norm": 1.1101947156669076,
"learning_rate": 1.921471959676957e-07,
"loss": 1.0213122367858887,
"memory(GiB)": 74.63,
"step": 2535,
"token_acc": 0.6377079482439926,
"train_speed(iter/s)": 0.171534
},
{
"epoch": 1.8782110515616337,
"grad_norm": 0.972824509691789,
"learning_rate": 1.809796436234379e-07,
"loss": 1.0392621040344239,
"memory(GiB)": 74.63,
"step": 2540,
"token_acc": 0.6089108910891089,
"train_speed(iter/s)": 0.17099
},
{
"epoch": 1.881907226021068,
"grad_norm": 1.0893138267742302,
"learning_rate": 1.7014344611918753e-07,
"loss": 1.0224065780639648,
"memory(GiB)": 74.63,
"step": 2545,
"token_acc": 0.628198149156233,
"train_speed(iter/s)": 0.170427
},
{
"epoch": 1.8856034004805027,
"grad_norm": 1.1303784675436226,
"learning_rate": 1.5963896913566923e-07,
"loss": 1.0195607185363769,
"memory(GiB)": 74.63,
"step": 2550,
"token_acc": 0.658051689860835,
"train_speed(iter/s)": 0.169871
},
{
"epoch": 1.8856034004805027,
"eval_loss": 0.6567226648330688,
"eval_runtime": 88.1394,
"eval_samples_per_second": 79.363,
"eval_steps_per_second": 0.624,
"eval_token_acc": 0.6325870296451402,
"step": 2550
},
{
"epoch": 1.8892995749399373,
"grad_norm": 1.09017259229996,
"learning_rate": 1.494665671592943e-07,
"loss": 1.0317469596862794,
"memory(GiB)": 74.63,
"step": 2555,
"token_acc": 0.6337277475748854,
"train_speed(iter/s)": 0.168117
},
{
"epoch": 1.8929957493993717,
"grad_norm": 0.9679050962600252,
"learning_rate": 1.3962658347019819e-07,
"loss": 1.0667352676391602,
"memory(GiB)": 74.63,
"step": 2560,
"token_acc": 0.6295540658700087,
"train_speed(iter/s)": 0.167582
},
{
"epoch": 1.896691923858806,
"grad_norm": 0.88490239893554,
"learning_rate": 1.3011935013065303e-07,
"loss": 1.0192485809326173,
"memory(GiB)": 74.63,
"step": 2565,
"token_acc": 0.599483204134367,
"train_speed(iter/s)": 0.16706
},
{
"epoch": 1.9003880983182406,
"grad_norm": 1.0244757899454908,
"learning_rate": 1.2094518797386657e-07,
"loss": 1.0162858963012695,
"memory(GiB)": 74.63,
"step": 2570,
"token_acc": 0.6262672811059908,
"train_speed(iter/s)": 0.166543
},
{
"epoch": 1.9040842727776752,
"grad_norm": 0.9237340665622228,
"learning_rate": 1.121044065931498e-07,
"loss": 1.0645517349243163,
"memory(GiB)": 74.63,
"step": 2575,
"token_acc": 0.6675933280381255,
"train_speed(iter/s)": 0.166012
},
{
"epoch": 1.9077804472371096,
"grad_norm": 0.9745219678731106,
"learning_rate": 1.0359730433147308e-07,
"loss": 1.0265457153320312,
"memory(GiB)": 74.63,
"step": 2580,
"token_acc": 0.6550632911392406,
"train_speed(iter/s)": 0.165515
},
{
"epoch": 1.911476621696544,
"grad_norm": 1.0007256420566137,
"learning_rate": 9.542416827139855e-08,
"loss": 1.0198524475097657,
"memory(GiB)": 74.63,
"step": 2585,
"token_acc": 0.6085481682496607,
"train_speed(iter/s)": 0.164991
},
{
"epoch": 1.9151727961559786,
"grad_norm": 0.9874298790271662,
"learning_rate": 8.758527422538798e-08,
"loss": 1.0276208877563477,
"memory(GiB)": 74.63,
"step": 2590,
"token_acc": 0.6413793103448275,
"train_speed(iter/s)": 0.164496
},
{
"epoch": 1.9188689706154132,
"grad_norm": 0.985598517098827,
"learning_rate": 8.008088672650016e-08,
"loss": 1.0311683654785155,
"memory(GiB)": 74.63,
"step": 2595,
"token_acc": 0.6960919540229885,
"train_speed(iter/s)": 0.164012
},
{
"epoch": 1.9225651450748475,
"grad_norm": 0.8074933176611375,
"learning_rate": 7.291125901946027e-08,
"loss": 1.0470510482788087,
"memory(GiB)": 74.63,
"step": 2600,
"token_acc": 0.6391111111111111,
"train_speed(iter/s)": 0.163535
},
{
"epoch": 1.9225651450748475,
"eval_loss": 0.6566023230552673,
"eval_runtime": 88.9043,
"eval_samples_per_second": 78.68,
"eval_steps_per_second": 0.619,
"eval_token_acc": 0.6326458997545924,
"step": 2600
},
{
"epoch": 1.926261319534282,
"grad_norm": 1.0347205106897759,
"learning_rate": 6.607663305211675e-08,
"loss": 1.0246917724609375,
"memory(GiB)": 74.63,
"step": 2605,
"token_acc": 0.6372442184283812,
"train_speed(iter/s)": 0.161902
},
{
"epoch": 1.9299574939937165,
"grad_norm": 0.9451923481445313,
"learning_rate": 5.957723946727445e-08,
"loss": 1.030987548828125,
"memory(GiB)": 74.63,
"step": 2610,
"token_acc": 0.655980271270037,
"train_speed(iter/s)": 0.161436
},
{
"epoch": 1.9336536684531511,
"grad_norm": 0.989048068560612,
"learning_rate": 5.341329759491087e-08,
"loss": 1.043976402282715,
"memory(GiB)": 74.63,
"step": 2615,
"token_acc": 0.6610073571024335,
"train_speed(iter/s)": 0.160958
},
{
"epoch": 1.9373498429125855,
"grad_norm": 0.9059448258322844,
"learning_rate": 4.758501544477767e-08,
"loss": 1.03828706741333,
"memory(GiB)": 74.63,
"step": 2620,
"token_acc": 0.663670766319773,
"train_speed(iter/s)": 0.160484
},
{
"epoch": 1.9410460173720199,
"grad_norm": 1.0371951958694063,
"learning_rate": 4.209258969937624e-08,
"loss": 1.0256452560424805,
"memory(GiB)": 74.63,
"step": 2625,
"token_acc": 0.6571687019448214,
"train_speed(iter/s)": 0.160045
},
{
"epoch": 1.9447421918314545,
"grad_norm": 0.9579823005570719,
"learning_rate": 3.6936205707325255e-08,
"loss": 1.0316158294677735,
"memory(GiB)": 74.63,
"step": 2630,
"token_acc": 0.6658135283363803,
"train_speed(iter/s)": 0.159594
},
{
"epoch": 1.948438366290889,
"grad_norm": 1.185629004014561,
"learning_rate": 3.2116037477103454e-08,
"loss": 1.0686611175537108,
"memory(GiB)": 74.63,
"step": 2635,
"token_acc": 0.6998087954110899,
"train_speed(iter/s)": 0.159158
},
{
"epoch": 1.9521345407503234,
"grad_norm": 0.9906589709801633,
"learning_rate": 2.763224767117767e-08,
"loss": 0.9920598983764648,
"memory(GiB)": 74.63,
"step": 2640,
"token_acc": 0.6588921282798834,
"train_speed(iter/s)": 0.158729
},
{
"epoch": 1.9558307152097578,
"grad_norm": 0.9014323974805333,
"learning_rate": 2.3484987600512767e-08,
"loss": 1.0331963539123534,
"memory(GiB)": 74.63,
"step": 2645,
"token_acc": 0.6714507370054306,
"train_speed(iter/s)": 0.158272
},
{
"epoch": 1.9595268896691924,
"grad_norm": 0.9766018351933058,
"learning_rate": 1.9674397219469064e-08,
"loss": 1.037597370147705,
"memory(GiB)": 74.63,
"step": 2650,
"token_acc": 0.6561371841155235,
"train_speed(iter/s)": 0.157844
},
{
"epoch": 1.9595268896691924,
"eval_loss": 0.6565667390823364,
"eval_runtime": 88.279,
"eval_samples_per_second": 79.237,
"eval_steps_per_second": 0.623,
"eval_token_acc": 0.6325916469086267,
"step": 2650
},
{
"epoch": 1.963223064128627,
"grad_norm": 1.0614662558544963,
"learning_rate": 1.620060512107391e-08,
"loss": 1.016525936126709,
"memory(GiB)": 74.63,
"step": 2655,
"token_acc": 0.6412867391807452,
"train_speed(iter/s)": 0.156391
},
{
"epoch": 1.9669192385880614,
"grad_norm": 1.234699645190091,
"learning_rate": 1.3063728532686225e-08,
"loss": 1.0382546424865722,
"memory(GiB)": 74.63,
"step": 2660,
"token_acc": 0.628119293974437,
"train_speed(iter/s)": 0.155979
},
{
"epoch": 1.9706154130474958,
"grad_norm": 1.1176674856308213,
"learning_rate": 1.0263873312040818e-08,
"loss": 1.0646825790405274,
"memory(GiB)": 74.63,
"step": 2665,
"token_acc": 0.6521344232515894,
"train_speed(iter/s)": 0.155534
},
{
"epoch": 1.9743115875069304,
"grad_norm": 0.9542666956735151,
"learning_rate": 7.801133943672323e-09,
"loss": 1.047515296936035,
"memory(GiB)": 74.63,
"step": 2670,
"token_acc": 0.632,
"train_speed(iter/s)": 0.15513
},
{
"epoch": 1.978007761966365,
"grad_norm": 0.966385972017561,
"learning_rate": 5.675593535731106e-09,
"loss": 1.0257146835327149,
"memory(GiB)": 74.63,
"step": 2675,
"token_acc": 0.6467647058823529,
"train_speed(iter/s)": 0.15474
},
{
"epoch": 1.9817039364257993,
"grad_norm": 1.0905550872468757,
"learning_rate": 3.887323817173272e-09,
"loss": 1.0138104438781739,
"memory(GiB)": 74.63,
"step": 2680,
"token_acc": 0.6310845431255337,
"train_speed(iter/s)": 0.154324
},
{
"epoch": 1.9854001108852337,
"grad_norm": 1.0126426754906144,
"learning_rate": 2.436385135348163e-09,
"loss": 1.015495491027832,
"memory(GiB)": 74.63,
"step": 2685,
"token_acc": 0.6567026194144838,
"train_speed(iter/s)": 0.153915
},
{
"epoch": 1.9890962853446683,
"grad_norm": 0.8862791092932369,
"learning_rate": 1.3228264539522084e-09,
"loss": 1.049496841430664,
"memory(GiB)": 74.63,
"step": 2690,
"token_acc": 0.6486280487804879,
"train_speed(iter/s)": 0.153518
},
{
"epoch": 1.992792459804103,
"grad_norm": 1.0787160107890392,
"learning_rate": 5.466853513858006e-10,
"loss": 1.0067996978759766,
"memory(GiB)": 74.63,
"step": 2695,
"token_acc": 0.6233766233766234,
"train_speed(iter/s)": 0.153131
},
{
"epoch": 1.9964886342635373,
"grad_norm": 1.058938505423735,
"learning_rate": 1.0798801947764503e-10,
"loss": 1.0397415161132812,
"memory(GiB)": 74.63,
"step": 2700,
"token_acc": 0.6839266450916937,
"train_speed(iter/s)": 0.152739
},
{
"epoch": 1.9964886342635373,
"eval_loss": 0.6565173864364624,
"eval_runtime": 87.3486,
"eval_samples_per_second": 80.081,
"eval_steps_per_second": 0.63,
"eval_token_acc": 0.6325489372213771,
"step": 2700
},
{
"epoch": 1.9994455738310848,
"eval_loss": 0.6564235091209412,
"eval_runtime": 89.612,
"eval_samples_per_second": 78.059,
"eval_steps_per_second": 0.614,
"eval_token_acc": 0.6324912214277963,
"step": 2704
}
],
"logging_steps": 5,
"max_steps": 2704,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.945781552860365e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}