{
"best_global_step": 1485,
"best_metric": 0.3291038,
"best_model_checkpoint": "/raid/shared/mem1/models/Qwen2.5-7B-search-sft-v2/v0-20250511-083818/checkpoint-1485",
"epoch": 0.9994531611492029,
"eval_steps": 50,
"global_step": 1485,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006730324317503049,
"grad_norm": 179.0,
"learning_rate": 9.999988811118231e-05,
"loss": 0.9820185899734497,
"memory(GiB)": 71.61,
"step": 1,
"token_acc": 0.8044692737430168,
"train_speed(iter/s)": 0.218302
},
{
"epoch": 0.003365162158751525,
"grad_norm": 2928.0,
"learning_rate": 9.999720280459576e-05,
"loss": 5.42672061920166,
"memory(GiB)": 73.26,
"step": 5,
"token_acc": 0.3954864964853866,
"train_speed(iter/s)": 0.397179
},
{
"epoch": 0.00673032431750305,
"grad_norm": 35.75,
"learning_rate": 9.99888115313551e-05,
"loss": 1.4307238578796386,
"memory(GiB)": 73.26,
"step": 10,
"token_acc": 0.7211833231146536,
"train_speed(iter/s)": 0.450251
},
{
"epoch": 0.010095486476254575,
"grad_norm": 7.40625,
"learning_rate": 9.997482711915927e-05,
"loss": 1.1120232582092284,
"memory(GiB)": 73.26,
"step": 15,
"token_acc": 0.7599109131403118,
"train_speed(iter/s)": 0.467479
},
{
"epoch": 0.0134606486350061,
"grad_norm": 5.9375,
"learning_rate": 9.99552511326936e-05,
"loss": 0.819661808013916,
"memory(GiB)": 73.26,
"step": 20,
"token_acc": 0.7947040995374063,
"train_speed(iter/s)": 0.482418
},
{
"epoch": 0.016825810793757626,
"grad_norm": 5.4375,
"learning_rate": 9.993008576227247e-05,
"loss": 0.9263886451721192,
"memory(GiB)": 73.26,
"step": 25,
"token_acc": 0.7752791563275434,
"train_speed(iter/s)": 0.490393
},
{
"epoch": 0.02019097295250915,
"grad_norm": 4.1875,
"learning_rate": 9.989933382359422e-05,
"loss": 0.7815323352813721,
"memory(GiB)": 73.26,
"step": 30,
"token_acc": 0.8033576869267838,
"train_speed(iter/s)": 0.499449
},
{
"epoch": 0.023556135111260673,
"grad_norm": 4.0625,
"learning_rate": 9.986299875742613e-05,
"loss": 0.713004732131958,
"memory(GiB)": 73.26,
"step": 35,
"token_acc": 0.8128159139083646,
"train_speed(iter/s)": 0.503737
},
{
"epoch": 0.0269212972700122,
"grad_norm": 9.4375,
"learning_rate": 9.982108462921937e-05,
"loss": 0.7091834068298339,
"memory(GiB)": 73.26,
"step": 40,
"token_acc": 0.82666015625,
"train_speed(iter/s)": 0.502146
},
{
"epoch": 0.030286459428763724,
"grad_norm": 6.1875,
"learning_rate": 9.977359612865423e-05,
"loss": 0.6785173892974854,
"memory(GiB)": 73.26,
"step": 45,
"token_acc": 0.8104975044276284,
"train_speed(iter/s)": 0.505359
},
{
"epoch": 0.03365162158751525,
"grad_norm": 4.65625,
"learning_rate": 9.972053856911534e-05,
"loss": 0.7147142887115479,
"memory(GiB)": 73.26,
"step": 50,
"token_acc": 0.8093519535540562,
"train_speed(iter/s)": 0.508235
},
{
"epoch": 0.03365162158751525,
"eval_loss": 0.6807255148887634,
"eval_runtime": 6.4656,
"eval_samples_per_second": 37.119,
"eval_steps_per_second": 37.119,
"eval_token_acc": 0.8191331923890064,
"step": 50
},
{
"epoch": 0.037016783746266775,
"grad_norm": 3.875,
"learning_rate": 9.966191788709716e-05,
"loss": 0.7501668453216552,
"memory(GiB)": 73.26,
"step": 55,
"token_acc": 0.81598110608148,
"train_speed(iter/s)": 0.312271
},
{
"epoch": 0.0403819459050183,
"grad_norm": 5.15625,
"learning_rate": 9.959774064153977e-05,
"loss": 0.7556098461151123,
"memory(GiB)": 73.26,
"step": 60,
"token_acc": 0.812223746380125,
"train_speed(iter/s)": 0.323476
},
{
"epoch": 0.04374710806376982,
"grad_norm": 3.28125,
"learning_rate": 9.952801401309503e-05,
"loss": 0.6282004833221435,
"memory(GiB)": 73.26,
"step": 65,
"token_acc": 0.8382288469969311,
"train_speed(iter/s)": 0.332657
},
{
"epoch": 0.047112270222521346,
"grad_norm": 3.84375,
"learning_rate": 9.945274580332316e-05,
"loss": 0.6862215042114258,
"memory(GiB)": 73.26,
"step": 70,
"token_acc": 0.8230115830115831,
"train_speed(iter/s)": 0.341959
},
{
"epoch": 0.05047743238127287,
"grad_norm": 4.96875,
"learning_rate": 9.937194443381972e-05,
"loss": 0.8654034614562989,
"memory(GiB)": 73.26,
"step": 75,
"token_acc": 0.787591859807801,
"train_speed(iter/s)": 0.351409
},
{
"epoch": 0.0538425945400244,
"grad_norm": 3.734375,
"learning_rate": 9.928561894527353e-05,
"loss": 0.7333785057067871,
"memory(GiB)": 73.26,
"step": 80,
"token_acc": 0.8105247240284289,
"train_speed(iter/s)": 0.358757
},
{
"epoch": 0.057207756698775925,
"grad_norm": 10.6875,
"learning_rate": 9.919377899645497e-05,
"loss": 0.6422113418579102,
"memory(GiB)": 73.26,
"step": 85,
"token_acc": 0.8280427771176371,
"train_speed(iter/s)": 0.366331
},
{
"epoch": 0.06057291885752745,
"grad_norm": 3.265625,
"learning_rate": 9.909643486313533e-05,
"loss": 0.7867285251617432,
"memory(GiB)": 73.26,
"step": 90,
"token_acc": 0.8057151496824917,
"train_speed(iter/s)": 0.373092
},
{
"epoch": 0.06393808101627897,
"grad_norm": 3.0625,
"learning_rate": 9.899359743693714e-05,
"loss": 0.675608491897583,
"memory(GiB)": 73.26,
"step": 95,
"token_acc": 0.8164676304211188,
"train_speed(iter/s)": 0.379215
},
{
"epoch": 0.0673032431750305,
"grad_norm": 3.390625,
"learning_rate": 9.888527822411543e-05,
"loss": 0.7589282989501953,
"memory(GiB)": 73.26,
"step": 100,
"token_acc": 0.810183048761729,
"train_speed(iter/s)": 0.385325
},
{
"epoch": 0.0673032431750305,
"eval_loss": 0.6772852540016174,
"eval_runtime": 6.3754,
"eval_samples_per_second": 37.645,
"eval_steps_per_second": 37.645,
"eval_token_acc": 0.8261627906976744,
"step": 100
},
{
"epoch": 0.07066840533378202,
"grad_norm": 3.859375,
"learning_rate": 9.877148934427037e-05,
"loss": 0.6617954730987549,
"memory(GiB)": 73.26,
"step": 105,
"token_acc": 0.8273892727345341,
"train_speed(iter/s)": 0.311491
},
{
"epoch": 0.07403356749253355,
"grad_norm": 3.15625,
"learning_rate": 9.865224352899119e-05,
"loss": 0.7310012340545654,
"memory(GiB)": 73.26,
"step": 110,
"token_acc": 0.8089593596059114,
"train_speed(iter/s)": 0.317239
},
{
"epoch": 0.07739872965128507,
"grad_norm": 3.828125,
"learning_rate": 9.85275541204318e-05,
"loss": 0.703582763671875,
"memory(GiB)": 73.26,
"step": 115,
"token_acc": 0.8229212819376753,
"train_speed(iter/s)": 0.32253
},
{
"epoch": 0.0807638918100366,
"grad_norm": 4.1875,
"learning_rate": 9.839743506981782e-05,
"loss": 0.7135389804840088,
"memory(GiB)": 73.26,
"step": 120,
"token_acc": 0.8188603416104493,
"train_speed(iter/s)": 0.327653
},
{
"epoch": 0.08412905396878811,
"grad_norm": 3.5,
"learning_rate": 9.826190093588563e-05,
"loss": 0.6105506420135498,
"memory(GiB)": 73.26,
"step": 125,
"token_acc": 0.8411754713776117,
"train_speed(iter/s)": 0.332914
},
{
"epoch": 0.08749421612753965,
"grad_norm": 3.0625,
"learning_rate": 9.812096688325354e-05,
"loss": 0.7001046657562255,
"memory(GiB)": 73.26,
"step": 130,
"token_acc": 0.8249736406085254,
"train_speed(iter/s)": 0.337969
},
{
"epoch": 0.09085937828629118,
"grad_norm": 2.9375,
"learning_rate": 9.797464868072488e-05,
"loss": 0.6970377922058105,
"memory(GiB)": 73.26,
"step": 135,
"token_acc": 0.8269230769230769,
"train_speed(iter/s)": 0.342842
},
{
"epoch": 0.09422454044504269,
"grad_norm": 3.59375,
"learning_rate": 9.78229626995238e-05,
"loss": 0.6484662055969238,
"memory(GiB)": 73.26,
"step": 140,
"token_acc": 0.8294907944932824,
"train_speed(iter/s)": 0.347497
},
{
"epoch": 0.09758970260379422,
"grad_norm": 2.9375,
"learning_rate": 9.766592591146352e-05,
"loss": 0.6710952281951904,
"memory(GiB)": 73.26,
"step": 145,
"token_acc": 0.8295923041685753,
"train_speed(iter/s)": 0.352009
},
{
"epoch": 0.10095486476254574,
"grad_norm": 3.140625,
"learning_rate": 9.750355588704727e-05,
"loss": 0.6715181350708008,
"memory(GiB)": 73.26,
"step": 150,
"token_acc": 0.8266999559406668,
"train_speed(iter/s)": 0.355725
},
{
"epoch": 0.10095486476254574,
"eval_loss": 0.65446937084198,
"eval_runtime": 6.2927,
"eval_samples_per_second": 38.139,
"eval_steps_per_second": 38.139,
"eval_token_acc": 0.8305496828752643,
"step": 150
},
{
"epoch": 0.10432002692129727,
"grad_norm": 2.859375,
"learning_rate": 9.733587079350252e-05,
"loss": 0.6584550857543945,
"memory(GiB)": 73.26,
"step": 155,
"token_acc": 0.8323802163833076,
"train_speed(iter/s)": 0.310504
},
{
"epoch": 0.1076851890800488,
"grad_norm": 3.109375,
"learning_rate": 9.716288939274819e-05,
"loss": 0.7138989925384521,
"memory(GiB)": 73.26,
"step": 160,
"token_acc": 0.8222054380664653,
"train_speed(iter/s)": 0.314403
},
{
"epoch": 0.11105035123880032,
"grad_norm": 22.25,
"learning_rate": 9.698463103929542e-05,
"loss": 0.6830341339111328,
"memory(GiB)": 73.26,
"step": 165,
"token_acc": 0.8247949233864726,
"train_speed(iter/s)": 0.318566
},
{
"epoch": 0.11441551339755185,
"grad_norm": 2.859375,
"learning_rate": 9.680111567808213e-05,
"loss": 0.6824192047119141,
"memory(GiB)": 73.26,
"step": 170,
"token_acc": 0.836785661818716,
"train_speed(iter/s)": 0.3222
},
{
"epoch": 0.11778067555630337,
"grad_norm": 2.734375,
"learning_rate": 9.661236384224129e-05,
"loss": 0.6676050186157226,
"memory(GiB)": 73.26,
"step": 175,
"token_acc": 0.8297106664747373,
"train_speed(iter/s)": 0.325905
},
{
"epoch": 0.1211458377150549,
"grad_norm": 3.5,
"learning_rate": 9.641839665080363e-05,
"loss": 0.6630958557128906,
"memory(GiB)": 73.26,
"step": 180,
"token_acc": 0.8245426829268293,
"train_speed(iter/s)": 0.329211
},
{
"epoch": 0.12451099987380641,
"grad_norm": 3.140625,
"learning_rate": 9.62192358063346e-05,
"loss": 0.7024449348449707,
"memory(GiB)": 73.26,
"step": 185,
"token_acc": 0.8313099041533546,
"train_speed(iter/s)": 0.332825
},
{
"epoch": 0.12787616203255794,
"grad_norm": 3.953125,
"learning_rate": 9.601490359250615e-05,
"loss": 0.6541357517242432,
"memory(GiB)": 73.26,
"step": 190,
"token_acc": 0.829295154185022,
"train_speed(iter/s)": 0.336139
},
{
"epoch": 0.13124132419130946,
"grad_norm": 3.515625,
"learning_rate": 9.580542287160348e-05,
"loss": 0.6423999786376953,
"memory(GiB)": 73.26,
"step": 195,
"token_acc": 0.8379697413372377,
"train_speed(iter/s)": 0.339594
},
{
"epoch": 0.134606486350061,
"grad_norm": 3.515625,
"learning_rate": 9.559081708196696e-05,
"loss": 0.7582132339477539,
"memory(GiB)": 73.26,
"step": 200,
"token_acc": 0.8089741740008657,
"train_speed(iter/s)": 0.342609
},
{
"epoch": 0.134606486350061,
"eval_loss": 0.6314957737922668,
"eval_runtime": 6.3575,
"eval_samples_per_second": 37.75,
"eval_steps_per_second": 37.75,
"eval_token_acc": 0.8350422832980973,
"step": 200
},
{
"epoch": 0.13797164850881252,
"grad_norm": 3.96875,
"learning_rate": 9.537111023536973e-05,
"loss": 0.6429227352142334,
"memory(GiB)": 73.26,
"step": 205,
"token_acc": 0.8360507956416086,
"train_speed(iter/s)": 0.306958
},
{
"epoch": 0.14133681066756404,
"grad_norm": 3.109375,
"learning_rate": 9.514632691433107e-05,
"loss": 0.6970784187316894,
"memory(GiB)": 73.26,
"step": 210,
"token_acc": 0.8279309788743751,
"train_speed(iter/s)": 0.310052
},
{
"epoch": 0.14470197282631556,
"grad_norm": 2.84375,
"learning_rate": 9.491649226936585e-05,
"loss": 0.7154839515686036,
"memory(GiB)": 73.26,
"step": 215,
"token_acc": 0.8265466495213601,
"train_speed(iter/s)": 0.313148
},
{
"epoch": 0.1480671349850671,
"grad_norm": 3.5,
"learning_rate": 9.468163201617062e-05,
"loss": 0.6052781105041504,
"memory(GiB)": 73.26,
"step": 220,
"token_acc": 0.8488794669897032,
"train_speed(iter/s)": 0.315973
},
{
"epoch": 0.15143229714381862,
"grad_norm": 2.828125,
"learning_rate": 9.444177243274618e-05,
"loss": 0.5735151290893554,
"memory(GiB)": 73.26,
"step": 225,
"token_acc": 0.8521346213773762,
"train_speed(iter/s)": 0.318863
},
{
"epoch": 0.15479745930257013,
"grad_norm": 3.140625,
"learning_rate": 9.419694035645751e-05,
"loss": 0.6527684211730957,
"memory(GiB)": 73.26,
"step": 230,
"token_acc": 0.8357325655790148,
"train_speed(iter/s)": 0.32165
},
{
"epoch": 0.15816262146132168,
"grad_norm": 3.015625,
"learning_rate": 9.394716318103098e-05,
"loss": 0.672496223449707,
"memory(GiB)": 73.26,
"step": 235,
"token_acc": 0.8255225893459204,
"train_speed(iter/s)": 0.324423
},
{
"epoch": 0.1615277836200732,
"grad_norm": 2.5,
"learning_rate": 9.369246885348926e-05,
"loss": 0.5730775356292724,
"memory(GiB)": 73.26,
"step": 240,
"token_acc": 0.8565380231232699,
"train_speed(iter/s)": 0.327159
},
{
"epoch": 0.1648929457788247,
"grad_norm": 2.859375,
"learning_rate": 9.343288587102443e-05,
"loss": 0.6417149543762207,
"memory(GiB)": 73.26,
"step": 245,
"token_acc": 0.8340062808434275,
"train_speed(iter/s)": 0.329689
},
{
"epoch": 0.16825810793757623,
"grad_norm": 2.671875,
"learning_rate": 9.316844327780955e-05,
"loss": 0.6126539707183838,
"memory(GiB)": 73.26,
"step": 250,
"token_acc": 0.8416943761746422,
"train_speed(iter/s)": 0.332312
},
{
"epoch": 0.16825810793757623,
"eval_loss": 0.6005221009254456,
"eval_runtime": 6.383,
"eval_samples_per_second": 37.6,
"eval_steps_per_second": 37.6,
"eval_token_acc": 0.8414904862579281,
"step": 250
},
{
"epoch": 0.17162327009632777,
"grad_norm": 3.640625,
"learning_rate": 9.289917066174886e-05,
"loss": 0.5673539161682128,
"memory(GiB)": 73.78,
"step": 255,
"token_acc": 0.8447506770750358,
"train_speed(iter/s)": 0.305561
},
{
"epoch": 0.1749884322550793,
"grad_norm": 2.9375,
"learning_rate": 9.262509815116732e-05,
"loss": 0.696702241897583,
"memory(GiB)": 73.78,
"step": 260,
"token_acc": 0.8255023183925811,
"train_speed(iter/s)": 0.308253
},
{
"epoch": 0.1783535944138308,
"grad_norm": 3.34375,
"learning_rate": 9.23462564114396e-05,
"loss": 0.6081646919250489,
"memory(GiB)": 73.78,
"step": 265,
"token_acc": 0.83946592144077,
"train_speed(iter/s)": 0.310782
},
{
"epoch": 0.18171875657258235,
"grad_norm": 3.09375,
"learning_rate": 9.206267664155907e-05,
"loss": 0.6581857681274415,
"memory(GiB)": 73.78,
"step": 270,
"token_acc": 0.8402509652509652,
"train_speed(iter/s)": 0.31337
},
{
"epoch": 0.18508391873133387,
"grad_norm": 2.625,
"learning_rate": 9.177439057064683e-05,
"loss": 0.5923350334167481,
"memory(GiB)": 74.6,
"step": 275,
"token_acc": 0.8534911648653285,
"train_speed(iter/s)": 0.315773
},
{
"epoch": 0.18844908089008539,
"grad_norm": 3.640625,
"learning_rate": 9.14814304544018e-05,
"loss": 0.636703634262085,
"memory(GiB)": 74.6,
"step": 280,
"token_acc": 0.8385491895361901,
"train_speed(iter/s)": 0.318259
},
{
"epoch": 0.1918142430488369,
"grad_norm": 3.109375,
"learning_rate": 9.118382907149165e-05,
"loss": 0.6206272125244141,
"memory(GiB)": 74.6,
"step": 285,
"token_acc": 0.8444787644787645,
"train_speed(iter/s)": 0.320478
},
{
"epoch": 0.19517940520758845,
"grad_norm": 2.546875,
"learning_rate": 9.088161971988516e-05,
"loss": 0.6622869491577148,
"memory(GiB)": 74.6,
"step": 290,
"token_acc": 0.8280620155038759,
"train_speed(iter/s)": 0.322612
},
{
"epoch": 0.19854456736633996,
"grad_norm": 2.546875,
"learning_rate": 9.057483621312671e-05,
"loss": 0.5659195899963378,
"memory(GiB)": 74.6,
"step": 295,
"token_acc": 0.8537543198240654,
"train_speed(iter/s)": 0.324748
},
{
"epoch": 0.20190972952509148,
"grad_norm": 2.78125,
"learning_rate": 9.026351287655294e-05,
"loss": 0.576479721069336,
"memory(GiB)": 74.6,
"step": 300,
"token_acc": 0.8472282845918813,
"train_speed(iter/s)": 0.326837
},
{
"epoch": 0.20190972952509148,
"eval_loss": 0.6099406480789185,
"eval_runtime": 6.5229,
"eval_samples_per_second": 36.793,
"eval_steps_per_second": 36.793,
"eval_token_acc": 0.8406448202959831,
"step": 300
},
{
"epoch": 0.20527489168384302,
"grad_norm": 2.828125,
"learning_rate": 8.994768454345206e-05,
"loss": 0.6150260448455811,
"memory(GiB)": 74.6,
"step": 305,
"token_acc": 0.8427843137254902,
"train_speed(iter/s)": 0.30445
},
{
"epoch": 0.20864005384259454,
"grad_norm": 3.140625,
"learning_rate": 8.962738655116658e-05,
"loss": 0.6955391883850097,
"memory(GiB)": 74.6,
"step": 310,
"token_acc": 0.8351194121249235,
"train_speed(iter/s)": 0.306547
},
{
"epoch": 0.21200521600134606,
"grad_norm": 3.375,
"learning_rate": 8.930265473713938e-05,
"loss": 0.5725995063781738,
"memory(GiB)": 74.6,
"step": 315,
"token_acc": 0.854539641943734,
"train_speed(iter/s)": 0.308729
},
{
"epoch": 0.2153703781600976,
"grad_norm": 2.78125,
"learning_rate": 8.897352543490395e-05,
"loss": 0.5337778568267822,
"memory(GiB)": 74.6,
"step": 320,
"token_acc": 0.856396866840731,
"train_speed(iter/s)": 0.310883
},
{
"epoch": 0.21873554031884912,
"grad_norm": 3.0625,
"learning_rate": 8.864003547001915e-05,
"loss": 0.6348609447479248,
"memory(GiB)": 75.82,
"step": 325,
"token_acc": 0.8423275457531675,
"train_speed(iter/s)": 0.312767
},
{
"epoch": 0.22210070247760064,
"grad_norm": 3.09375,
"learning_rate": 8.83022221559489e-05,
"loss": 0.6023256778717041,
"memory(GiB)": 75.82,
"step": 330,
"token_acc": 0.8492975734355045,
"train_speed(iter/s)": 0.314836
},
{
"epoch": 0.22546586463635215,
"grad_norm": 3.171875,
"learning_rate": 8.796012328988716e-05,
"loss": 0.7502017498016358,
"memory(GiB)": 75.82,
"step": 335,
"token_acc": 0.8177310293012773,
"train_speed(iter/s)": 0.316755
},
{
"epoch": 0.2288310267951037,
"grad_norm": 3.125,
"learning_rate": 8.761377714852899e-05,
"loss": 0.5663125038146972,
"memory(GiB)": 75.82,
"step": 340,
"token_acc": 0.8523967726625534,
"train_speed(iter/s)": 0.318692
},
{
"epoch": 0.23219618895385521,
"grad_norm": 2.6875,
"learning_rate": 8.726322248378775e-05,
"loss": 0.6224043846130372,
"memory(GiB)": 75.82,
"step": 345,
"token_acc": 0.8415334471519479,
"train_speed(iter/s)": 0.32067
},
{
"epoch": 0.23556135111260673,
"grad_norm": 3.0625,
"learning_rate": 8.690849851845933e-05,
"loss": 0.6304502010345459,
"memory(GiB)": 75.82,
"step": 350,
"token_acc": 0.8429706005294691,
"train_speed(iter/s)": 0.322382
},
{
"epoch": 0.23556135111260673,
"eval_loss": 0.584740936756134,
"eval_runtime": 6.3513,
"eval_samples_per_second": 37.787,
"eval_steps_per_second": 37.787,
"eval_token_acc": 0.8461945031712473,
"step": 350
},
{
"epoch": 0.23892651327135828,
"grad_norm": 3.65625,
"learning_rate": 8.654964494183358e-05,
"loss": 0.6737657070159913,
"memory(GiB)": 75.82,
"step": 355,
"token_acc": 0.8434381584701626,
"train_speed(iter/s)": 0.303665
},
{
"epoch": 0.2422916754301098,
"grad_norm": 2.765625,
"learning_rate": 8.618670190525352e-05,
"loss": 0.6179668426513671,
"memory(GiB)": 75.82,
"step": 360,
"token_acc": 0.8392533779077866,
"train_speed(iter/s)": 0.305585
},
{
"epoch": 0.2456568375888613,
"grad_norm": 3.15625,
"learning_rate": 8.581971001762286e-05,
"loss": 0.626660680770874,
"memory(GiB)": 75.82,
"step": 365,
"token_acc": 0.8430995837335895,
"train_speed(iter/s)": 0.307436
},
{
"epoch": 0.24902199974761283,
"grad_norm": 3.265625,
"learning_rate": 8.54487103408625e-05,
"loss": 0.546476411819458,
"memory(GiB)": 75.82,
"step": 370,
"token_acc": 0.856482219741668,
"train_speed(iter/s)": 0.309304
},
{
"epoch": 0.25238716190636434,
"grad_norm": 3.09375,
"learning_rate": 8.507374438531607e-05,
"loss": 0.6368942260742188,
"memory(GiB)": 75.82,
"step": 375,
"token_acc": 0.8362845604224914,
"train_speed(iter/s)": 0.311038
},
{
"epoch": 0.2557523240651159,
"grad_norm": 2.921875,
"learning_rate": 8.469485410510545e-05,
"loss": 0.6205560684204101,
"memory(GiB)": 75.82,
"step": 380,
"token_acc": 0.8416890480453596,
"train_speed(iter/s)": 0.312832
},
{
"epoch": 0.25911748622386743,
"grad_norm": 3.1875,
"learning_rate": 8.43120818934367e-05,
"loss": 0.5687759399414063,
"memory(GiB)": 75.82,
"step": 385,
"token_acc": 0.8516474854169951,
"train_speed(iter/s)": 0.314546
},
{
"epoch": 0.2624826483826189,
"grad_norm": 3.375,
"learning_rate": 8.392547057785661e-05,
"loss": 0.6561696529388428,
"memory(GiB)": 75.82,
"step": 390,
"token_acc": 0.8435968137254902,
"train_speed(iter/s)": 0.316291
},
{
"epoch": 0.26584781054137047,
"grad_norm": 2.84375,
"learning_rate": 8.353506341546104e-05,
"loss": 0.6340418815612793,
"memory(GiB)": 75.82,
"step": 395,
"token_acc": 0.8422232182877634,
"train_speed(iter/s)": 0.317983
},
{
"epoch": 0.269212972700122,
"grad_norm": 2.921875,
"learning_rate": 8.314090408805482e-05,
"loss": 0.5887197017669678,
"memory(GiB)": 75.82,
"step": 400,
"token_acc": 0.8538324420677362,
"train_speed(iter/s)": 0.319589
},
{
"epoch": 0.269212972700122,
"eval_loss": 0.5566386580467224,
"eval_runtime": 6.2654,
"eval_samples_per_second": 38.306,
"eval_steps_per_second": 38.306,
"eval_token_acc": 0.8535412262156448,
"step": 400
},
{
"epoch": 0.2725781348588735,
"grad_norm": 3.828125,
"learning_rate": 8.274303669726426e-05,
"loss": 0.5180852890014649,
"memory(GiB)": 75.82,
"step": 405,
"token_acc": 0.8568248957953948,
"train_speed(iter/s)": 0.303898
},
{
"epoch": 0.27594329701762504,
"grad_norm": 3.171875,
"learning_rate": 8.234150575960288e-05,
"loss": 0.6065554618835449,
"memory(GiB)": 75.82,
"step": 410,
"token_acc": 0.8468217054263566,
"train_speed(iter/s)": 0.305556
},
{
"epoch": 0.2793084591763766,
"grad_norm": 2.078125,
"learning_rate": 8.19363562014904e-05,
"loss": 0.563250207901001,
"memory(GiB)": 75.82,
"step": 415,
"token_acc": 0.8505627962085308,
"train_speed(iter/s)": 0.307215
},
{
"epoch": 0.2826736213351281,
"grad_norm": 3.203125,
"learning_rate": 8.152763335422613e-05,
"loss": 0.5627524375915527,
"memory(GiB)": 75.82,
"step": 420,
"token_acc": 0.8593019632284201,
"train_speed(iter/s)": 0.308773
},
{
"epoch": 0.2860387834938796,
"grad_norm": 2.390625,
"learning_rate": 8.111538294891684e-05,
"loss": 0.5277237892150879,
"memory(GiB)": 75.82,
"step": 425,
"token_acc": 0.8635786802030457,
"train_speed(iter/s)": 0.310337
},
{
"epoch": 0.2894039456526311,
"grad_norm": 2.796875,
"learning_rate": 8.06996511113601e-05,
"loss": 0.610354232788086,
"memory(GiB)": 75.82,
"step": 430,
"token_acc": 0.842520795150148,
"train_speed(iter/s)": 0.311764
},
{
"epoch": 0.29276910781138266,
"grad_norm": 4.3125,
"learning_rate": 8.028048435688333e-05,
"loss": 0.5220999717712402,
"memory(GiB)": 75.82,
"step": 435,
"token_acc": 0.866747609652451,
"train_speed(iter/s)": 0.31318
},
{
"epoch": 0.2961342699701342,
"grad_norm": 2.890625,
"learning_rate": 7.985792958513931e-05,
"loss": 0.6387944698333741,
"memory(GiB)": 75.82,
"step": 440,
"token_acc": 0.8430452550325412,
"train_speed(iter/s)": 0.314765
},
{
"epoch": 0.2994994321288857,
"grad_norm": 2.265625,
"learning_rate": 7.943203407485864e-05,
"loss": 0.44078569412231444,
"memory(GiB)": 75.82,
"step": 445,
"token_acc": 0.8829911533967618,
"train_speed(iter/s)": 0.316117
},
{
"epoch": 0.30286459428763723,
"grad_norm": 2.34375,
"learning_rate": 7.900284547855991e-05,
"loss": 0.5065964698791504,
"memory(GiB)": 75.82,
"step": 450,
"token_acc": 0.8605196982397317,
"train_speed(iter/s)": 0.317607
},
{
"epoch": 0.30286459428763723,
"eval_loss": 0.5302485823631287,
"eval_runtime": 6.3503,
"eval_samples_per_second": 37.794,
"eval_steps_per_second": 37.794,
"eval_token_acc": 0.8599894291754757,
"step": 450
},
{
"epoch": 0.3062297564463888,
"grad_norm": 2.0625,
"learning_rate": 7.857041181721787e-05,
"loss": 0.4503211975097656,
"memory(GiB)": 75.82,
"step": 455,
"token_acc": 0.8636938646426312,
"train_speed(iter/s)": 0.303058
},
{
"epoch": 0.30959491860514027,
"grad_norm": 3.359375,
"learning_rate": 7.813478147489052e-05,
"loss": 0.5654148578643798,
"memory(GiB)": 75.82,
"step": 460,
"token_acc": 0.8542982030111704,
"train_speed(iter/s)": 0.304484
},
{
"epoch": 0.3129600807638918,
"grad_norm": 2.546875,
"learning_rate": 7.769600319330552e-05,
"loss": 0.47755861282348633,
"memory(GiB)": 75.82,
"step": 465,
"token_acc": 0.8762720077531901,
"train_speed(iter/s)": 0.305941
},
{
"epoch": 0.31632524292264336,
"grad_norm": 2.609375,
"learning_rate": 7.725412606640658e-05,
"loss": 0.5518892288208008,
"memory(GiB)": 75.82,
"step": 470,
"token_acc": 0.8566623959000641,
"train_speed(iter/s)": 0.307401
},
{
"epoch": 0.31969040508139485,
"grad_norm": 3.265625,
"learning_rate": 7.680919953486048e-05,
"loss": 0.5913249492645264,
"memory(GiB)": 75.82,
"step": 475,
"token_acc": 0.8497417957687823,
"train_speed(iter/s)": 0.308848
},
{
"epoch": 0.3230555672401464,
"grad_norm": 2.65625,
"learning_rate": 7.636127338052512e-05,
"loss": 0.5384829044342041,
"memory(GiB)": 75.82,
"step": 480,
"token_acc": 0.8643092105263158,
"train_speed(iter/s)": 0.31026
},
{
"epoch": 0.32642072939889794,
"grad_norm": 3.265625,
"learning_rate": 7.591039772087977e-05,
"loss": 0.5349913120269776,
"memory(GiB)": 75.82,
"step": 485,
"token_acc": 0.8635131063573366,
"train_speed(iter/s)": 0.311583
},
{
"epoch": 0.3297858915576494,
"grad_norm": 3.046875,
"learning_rate": 7.545662300341736e-05,
"loss": 0.48796830177307127,
"memory(GiB)": 75.82,
"step": 490,
"token_acc": 0.8717186726102031,
"train_speed(iter/s)": 0.312855
},
{
"epoch": 0.33315105371640097,
"grad_norm": 2.421875,
"learning_rate": 7.500000000000001e-05,
"loss": 0.5078158378601074,
"memory(GiB)": 75.82,
"step": 495,
"token_acc": 0.8613126649076517,
"train_speed(iter/s)": 0.314225
},
{
"epoch": 0.33651621587515246,
"grad_norm": 2.328125,
"learning_rate": 7.454057980117841e-05,
"loss": 0.484033203125,
"memory(GiB)": 75.82,
"step": 500,
"token_acc": 0.8767056530214425,
"train_speed(iter/s)": 0.315565
},
{
"epoch": 0.33651621587515246,
"eval_loss": 0.5204777121543884,
"eval_runtime": 6.2841,
"eval_samples_per_second": 38.192,
"eval_steps_per_second": 38.192,
"eval_token_acc": 0.8624207188160676,
"step": 500
},
{
"epoch": 0.339881378033904,
"grad_norm": 3.25,
"learning_rate": 7.407841381047532e-05,
"loss": 0.5047823905944824,
"memory(GiB)": 75.82,
"step": 505,
"token_acc": 0.8646571869925139,
"train_speed(iter/s)": 0.303291
},
{
"epoch": 0.34324654019265555,
"grad_norm": 2.5,
"learning_rate": 7.361355373863414e-05,
"loss": 0.5279562950134278,
"memory(GiB)": 75.82,
"step": 510,
"token_acc": 0.8671359436867576,
"train_speed(iter/s)": 0.304627
},
{
"epoch": 0.34661170235140704,
"grad_norm": 2.28125,
"learning_rate": 7.314605159783314e-05,
"loss": 0.5070261001586914,
"memory(GiB)": 75.82,
"step": 515,
"token_acc": 0.8705286483064261,
"train_speed(iter/s)": 0.305896
},
{
"epoch": 0.3499768645101586,
"grad_norm": 2.671875,
"learning_rate": 7.267595969586589e-05,
"loss": 0.49044408798217776,
"memory(GiB)": 75.82,
"step": 520,
"token_acc": 0.8736138290932811,
"train_speed(iter/s)": 0.307204
},
{
"epoch": 0.3533420266689101,
"grad_norm": 2.46875,
"learning_rate": 7.220333063028872e-05,
"loss": 0.5966588497161865,
"memory(GiB)": 75.82,
"step": 525,
"token_acc": 0.8524286815728604,
"train_speed(iter/s)": 0.30845
},
{
"epoch": 0.3567071888276616,
"grad_norm": 2.796875,
"learning_rate": 7.172821728253562e-05,
"loss": 0.5701375007629395,
"memory(GiB)": 75.82,
"step": 530,
"token_acc": 0.8540377863233573,
"train_speed(iter/s)": 0.309712
},
{
"epoch": 0.36007235098641316,
"grad_norm": 2.796875,
"learning_rate": 7.12506728120015e-05,
"loss": 0.4613838195800781,
"memory(GiB)": 75.82,
"step": 535,
"token_acc": 0.8718804641551423,
"train_speed(iter/s)": 0.310968
},
{
"epoch": 0.3634375131451647,
"grad_norm": 2.59375,
"learning_rate": 7.077075065009433e-05,
"loss": 0.5259300708770752,
"memory(GiB)": 75.82,
"step": 540,
"token_acc": 0.8661874904419636,
"train_speed(iter/s)": 0.312118
},
{
"epoch": 0.3668026753039162,
"grad_norm": 2.671875,
"learning_rate": 7.02885044942567e-05,
"loss": 0.5487593173980713,
"memory(GiB)": 75.82,
"step": 545,
"token_acc": 0.8593700787401575,
"train_speed(iter/s)": 0.313405
},
{
"epoch": 0.37016783746266774,
"grad_norm": 2.40625,
"learning_rate": 6.980398830195785e-05,
"loss": 0.4660326957702637,
"memory(GiB)": 75.82,
"step": 550,
"token_acc": 0.8734921592279855,
"train_speed(iter/s)": 0.31456
},
{
"epoch": 0.37016783746266774,
"eval_loss": 0.49012547731399536,
"eval_runtime": 6.3431,
"eval_samples_per_second": 37.837,
"eval_steps_per_second": 37.837,
"eval_token_acc": 0.8673890063424947,
"step": 550
},
{
"epoch": 0.3735329996214193,
"grad_norm": 2.5625,
"learning_rate": 6.931725628465643e-05,
"loss": 0.5410624027252198,
"memory(GiB)": 75.82,
"step": 555,
"token_acc": 0.8664497667672768,
"train_speed(iter/s)": 0.302648
},
{
"epoch": 0.37689816178017077,
"grad_norm": 2.328125,
"learning_rate": 6.882836290173493e-05,
"loss": 0.5354323387145996,
"memory(GiB)": 75.82,
"step": 560,
"token_acc": 0.8606255012028869,
"train_speed(iter/s)": 0.303869
},
{
"epoch": 0.3802633239389223,
"grad_norm": 2.390625,
"learning_rate": 6.833736285440632e-05,
"loss": 0.4386926174163818,
"memory(GiB)": 75.82,
"step": 565,
"token_acc": 0.8871541196475499,
"train_speed(iter/s)": 0.30496
},
{
"epoch": 0.3836284860976738,
"grad_norm": 1.6796875,
"learning_rate": 6.784431107959359e-05,
"loss": 0.5115750789642334,
"memory(GiB)": 75.82,
"step": 570,
"token_acc": 0.8649976962064199,
"train_speed(iter/s)": 0.306182
},
{
"epoch": 0.38699364825642535,
"grad_norm": 2.4375,
"learning_rate": 6.734926274378312e-05,
"loss": 0.48287324905395507,
"memory(GiB)": 75.82,
"step": 575,
"token_acc": 0.8724030754130542,
"train_speed(iter/s)": 0.307381
},
{
"epoch": 0.3903588104151769,
"grad_norm": 2.703125,
"learning_rate": 6.685227323685209e-05,
"loss": 0.5082109451293946,
"memory(GiB)": 75.82,
"step": 580,
"token_acc": 0.8686852331606217,
"train_speed(iter/s)": 0.30846
},
{
"epoch": 0.3937239725739284,
"grad_norm": 2.296875,
"learning_rate": 6.635339816587109e-05,
"loss": 0.46943073272705077,
"memory(GiB)": 75.82,
"step": 585,
"token_acc": 0.8746086412022542,
"train_speed(iter/s)": 0.309547
},
{
"epoch": 0.3970891347326799,
"grad_norm": 2.265625,
"learning_rate": 6.585269334888234e-05,
"loss": 0.4492472171783447,
"memory(GiB)": 75.82,
"step": 590,
"token_acc": 0.8817360438851243,
"train_speed(iter/s)": 0.31066
},
{
"epoch": 0.40045429689143147,
"grad_norm": 2.703125,
"learning_rate": 6.535021480865439e-05,
"loss": 0.4906127452850342,
"memory(GiB)": 75.82,
"step": 595,
"token_acc": 0.8699199748940845,
"train_speed(iter/s)": 0.311715
},
{
"epoch": 0.40381945905018296,
"grad_norm": 1.9375,
"learning_rate": 6.484601876641375e-05,
"loss": 0.4776750564575195,
"memory(GiB)": 75.82,
"step": 600,
"token_acc": 0.8830860534124629,
"train_speed(iter/s)": 0.312795
},
{
"epoch": 0.40381945905018296,
"eval_loss": 0.4834407567977905,
"eval_runtime": 6.359,
"eval_samples_per_second": 37.742,
"eval_steps_per_second": 37.742,
"eval_token_acc": 0.8705602536997886,
"step": 600
},
{
"epoch": 0.4071846212089345,
"grad_norm": 2.09375,
"learning_rate": 6.434016163555452e-05,
"loss": 0.42650303840637205,
"memory(GiB)": 75.82,
"step": 605,
"token_acc": 0.873349786500568,
"train_speed(iter/s)": 0.302036
},
{
"epoch": 0.41054978336768605,
"grad_norm": 2.375,
"learning_rate": 6.383270001532635e-05,
"loss": 0.47733469009399415,
"memory(GiB)": 75.82,
"step": 610,
"token_acc": 0.8673865361903155,
"train_speed(iter/s)": 0.303179
},
{
"epoch": 0.41391494552643754,
"grad_norm": 2.3125,
"learning_rate": 6.332369068450174e-05,
"loss": 0.4712835788726807,
"memory(GiB)": 75.82,
"step": 615,
"token_acc": 0.8805620608899297,
"train_speed(iter/s)": 0.304281
},
{
"epoch": 0.4172801076851891,
"grad_norm": 2.625,
"learning_rate": 6.281319059502313e-05,
"loss": 0.45713419914245607,
"memory(GiB)": 75.82,
"step": 620,
"token_acc": 0.877295995182174,
"train_speed(iter/s)": 0.305295
},
{
"epoch": 0.42064526984394063,
"grad_norm": 2.453125,
"learning_rate": 6.230125686563068e-05,
"loss": 0.3812277317047119,
"memory(GiB)": 75.82,
"step": 625,
"token_acc": 0.8977045908183633,
"train_speed(iter/s)": 0.306432
},
{
"epoch": 0.4240104320026921,
"grad_norm": 2.125,
"learning_rate": 6.178794677547137e-05,
"loss": 0.48100833892822265,
"memory(GiB)": 75.82,
"step": 630,
"token_acc": 0.8763653633053665,
"train_speed(iter/s)": 0.307582
},
{
"epoch": 0.42737559416144366,
"grad_norm": 2.8125,
"learning_rate": 6.127331775769023e-05,
"loss": 0.42731170654296874,
"memory(GiB)": 75.82,
"step": 635,
"token_acc": 0.8863417762103238,
"train_speed(iter/s)": 0.308599
},
{
"epoch": 0.4307407563201952,
"grad_norm": 2.09375,
"learning_rate": 6.0757427393004195e-05,
"loss": 0.3901322603225708,
"memory(GiB)": 75.82,
"step": 640,
"token_acc": 0.8963465035543065,
"train_speed(iter/s)": 0.309577
},
{
"epoch": 0.4341059184789467,
"grad_norm": 2.28125,
"learning_rate": 6.024033340325954e-05,
"loss": 0.41823792457580566,
"memory(GiB)": 75.82,
"step": 645,
"token_acc": 0.8839628681177977,
"train_speed(iter/s)": 0.310568
},
{
"epoch": 0.43747108063769824,
"grad_norm": 2.625,
"learning_rate": 5.9722093644973546e-05,
"loss": 0.45659918785095216,
"memory(GiB)": 75.82,
"step": 650,
"token_acc": 0.8752002563281,
"train_speed(iter/s)": 0.311578
},
{
"epoch": 0.43747108063769824,
"eval_loss": 0.4680798649787903,
"eval_runtime": 6.359,
"eval_samples_per_second": 37.742,
"eval_steps_per_second": 37.742,
"eval_token_acc": 0.8731501057082452,
"step": 650
},
{
"epoch": 0.44083624279644973,
"grad_norm": 2.34375,
"learning_rate": 5.920276610286102e-05,
"loss": 0.45874710083007814,
"memory(GiB)": 75.82,
"step": 655,
"token_acc": 0.874843798812871,
"train_speed(iter/s)": 0.303426
},
{
"epoch": 0.4442014049552013,
"grad_norm": 1.890625,
"learning_rate": 5.868240888334653e-05,
"loss": 0.40706768035888674,
"memory(GiB)": 75.82,
"step": 660,
"token_acc": 0.8987694831829368,
"train_speed(iter/s)": 0.304386
},
{
"epoch": 0.4475665671139528,
"grad_norm": 2.3125,
"learning_rate": 5.816108020806297e-05,
"loss": 0.4790656566619873,
"memory(GiB)": 75.82,
"step": 665,
"token_acc": 0.8695376820772641,
"train_speed(iter/s)": 0.30531
},
{
"epoch": 0.4509317292727043,
"grad_norm": 2.625,
"learning_rate": 5.763883840733736e-05,
"loss": 0.4840695858001709,
"memory(GiB)": 75.82,
"step": 670,
"token_acc": 0.8794635643884311,
"train_speed(iter/s)": 0.306292
},
{
"epoch": 0.45429689143145585,
"grad_norm": 2.703125,
"learning_rate": 5.7115741913664264e-05,
"loss": 0.4588040351867676,
"memory(GiB)": 75.82,
"step": 675,
"token_acc": 0.8740804106073568,
"train_speed(iter/s)": 0.307251
},
{
"epoch": 0.4576620535902074,
"grad_norm": 2.3125,
"learning_rate": 5.6591849255168015e-05,
"loss": 0.39728033542633057,
"memory(GiB)": 75.82,
"step": 680,
"token_acc": 0.893990116371752,
"train_speed(iter/s)": 0.308265
},
{
"epoch": 0.4610272157489589,
"grad_norm": 2.1875,
"learning_rate": 5.60672190490541e-05,
"loss": 0.422639799118042,
"memory(GiB)": 75.82,
"step": 685,
"token_acc": 0.8848245180425112,
"train_speed(iter/s)": 0.309233
},
{
"epoch": 0.46439237790771043,
"grad_norm": 2.265625,
"learning_rate": 5.5541909995050554e-05,
"loss": 0.39331207275390623,
"memory(GiB)": 75.82,
"step": 690,
"token_acc": 0.8947537301459971,
"train_speed(iter/s)": 0.310207
},
{
"epoch": 0.467757540066462,
"grad_norm": 2.21875,
"learning_rate": 5.501598086884025e-05,
"loss": 0.43639063835144043,
"memory(GiB)": 75.82,
"step": 695,
"token_acc": 0.8884950048340315,
"train_speed(iter/s)": 0.311107
},
{
"epoch": 0.47112270222521346,
"grad_norm": 2.375,
"learning_rate": 5.448949051548459e-05,
"loss": 0.413299560546875,
"memory(GiB)": 75.82,
"step": 700,
"token_acc": 0.8879505353641984,
"train_speed(iter/s)": 0.312013
},
{
"epoch": 0.47112270222521346,
"eval_loss": 0.4406000077724457,
"eval_runtime": 6.3948,
"eval_samples_per_second": 37.531,
"eval_steps_per_second": 37.531,
"eval_token_acc": 0.8806025369978858,
"step": 700
},
{
"epoch": 0.474487864383965,
"grad_norm": 2.421875,
"learning_rate": 5.396249784283942e-05,
"loss": 0.43725104331970216,
"memory(GiB)": 75.82,
"step": 705,
"token_acc": 0.8810175054704595,
"train_speed(iter/s)": 0.304357
},
{
"epoch": 0.47785302654271655,
"grad_norm": 2.34375,
"learning_rate": 5.343506181496405e-05,
"loss": 0.41141476631164553,
"memory(GiB)": 75.82,
"step": 710,
"token_acc": 0.8912708204811844,
"train_speed(iter/s)": 0.305281
},
{
"epoch": 0.48121818870146804,
"grad_norm": 2.609375,
"learning_rate": 5.290724144552379e-05,
"loss": 0.5024977684020996,
"memory(GiB)": 75.82,
"step": 715,
"token_acc": 0.8692831144168381,
"train_speed(iter/s)": 0.306216
},
{
"epoch": 0.4845833508602196,
"grad_norm": 1.8828125,
"learning_rate": 5.2379095791187124e-05,
"loss": 0.37138142585754397,
"memory(GiB)": 75.82,
"step": 720,
"token_acc": 0.8965241069998399,
"train_speed(iter/s)": 0.307078
},
{
"epoch": 0.4879485130189711,
"grad_norm": 2.484375,
"learning_rate": 5.185068394501791e-05,
"loss": 0.46549081802368164,
"memory(GiB)": 75.82,
"step": 725,
"token_acc": 0.8741355463347165,
"train_speed(iter/s)": 0.308012
},
{
"epoch": 0.4913136751777226,
"grad_norm": 2.828125,
"learning_rate": 5.132206502986368e-05,
"loss": 0.5339263916015625,
"memory(GiB)": 75.82,
"step": 730,
"token_acc": 0.8623881049916553,
"train_speed(iter/s)": 0.308868
},
{
"epoch": 0.49467883733647416,
"grad_norm": 2.5625,
"learning_rate": 5.0793298191740404e-05,
"loss": 0.4308777809143066,
"memory(GiB)": 75.82,
"step": 735,
"token_acc": 0.8834385624089364,
"train_speed(iter/s)": 0.309717
},
{
"epoch": 0.49804399949522565,
"grad_norm": 3.0,
"learning_rate": 5.026444259321489e-05,
"loss": 0.3827210903167725,
"memory(GiB)": 75.82,
"step": 740,
"token_acc": 0.8980582524271845,
"train_speed(iter/s)": 0.310525
},
{
"epoch": 0.5014091616539772,
"grad_norm": 2.1875,
"learning_rate": 4.973555740678511e-05,
"loss": 0.4466721534729004,
"memory(GiB)": 75.82,
"step": 745,
"token_acc": 0.8842266462480858,
"train_speed(iter/s)": 0.311445
},
{
"epoch": 0.5047743238127287,
"grad_norm": 2.765625,
"learning_rate": 4.92067018082596e-05,
"loss": 0.5502868175506592,
"memory(GiB)": 75.82,
"step": 750,
"token_acc": 0.8586033117350612,
"train_speed(iter/s)": 0.312324
},
{
"epoch": 0.5047743238127287,
"eval_loss": 0.42037147283554077,
"eval_runtime": 6.3843,
"eval_samples_per_second": 37.592,
"eval_steps_per_second": 37.592,
"eval_token_acc": 0.8845665961945032,
"step": 750
},
{
"epoch": 0.5081394859714803,
"grad_norm": 2.125,
"learning_rate": 4.8677934970136335e-05,
"loss": 0.5509189128875732,
"memory(GiB)": 75.82,
"step": 755,
"token_acc": 0.8773990147783252,
"train_speed(iter/s)": 0.304846
},
{
"epoch": 0.5115046481302318,
"grad_norm": 1.8203125,
"learning_rate": 4.8149316054982095e-05,
"loss": 0.392488431930542,
"memory(GiB)": 75.82,
"step": 760,
"token_acc": 0.8881675052751177,
"train_speed(iter/s)": 0.305699
},
{
"epoch": 0.5148698102889833,
"grad_norm": 2.125,
"learning_rate": 4.762090420881289e-05,
"loss": 0.34769492149353026,
"memory(GiB)": 75.82,
"step": 765,
"token_acc": 0.904643578195372,
"train_speed(iter/s)": 0.306491
},
{
"epoch": 0.5182349724477349,
"grad_norm": 1.9453125,
"learning_rate": 4.709275855447621e-05,
"loss": 0.34389894008636473,
"memory(GiB)": 75.82,
"step": 770,
"token_acc": 0.9075886411038023,
"train_speed(iter/s)": 0.307358
},
{
"epoch": 0.5216001346064864,
"grad_norm": 2.234375,
"learning_rate": 4.6564938185035956e-05,
"loss": 0.3195344924926758,
"memory(GiB)": 75.82,
"step": 775,
"token_acc": 0.9082976621666118,
"train_speed(iter/s)": 0.308159
},
{
"epoch": 0.5249652967652378,
"grad_norm": 2.21875,
"learning_rate": 4.603750215716057e-05,
"loss": 0.38263275623321535,
"memory(GiB)": 75.82,
"step": 780,
"token_acc": 0.8930994539136191,
"train_speed(iter/s)": 0.30896
},
{
"epoch": 0.5283304589239894,
"grad_norm": 2.4375,
"learning_rate": 4.551050948451542e-05,
"loss": 0.4862419605255127,
"memory(GiB)": 75.82,
"step": 785,
"token_acc": 0.8823529411764706,
"train_speed(iter/s)": 0.309802
},
{
"epoch": 0.5316956210827409,
"grad_norm": 2.359375,
"learning_rate": 4.498401913115975e-05,
"loss": 0.46417646408081054,
"memory(GiB)": 75.82,
"step": 790,
"token_acc": 0.8798179059180576,
"train_speed(iter/s)": 0.310657
},
{
"epoch": 0.5350607832414924,
"grad_norm": 2.03125,
"learning_rate": 4.445809000494946e-05,
"loss": 0.5157633304595948,
"memory(GiB)": 75.82,
"step": 795,
"token_acc": 0.8697006636868482,
"train_speed(iter/s)": 0.311416
},
{
"epoch": 0.538425945400244,
"grad_norm": 2.28125,
"learning_rate": 4.393278095094591e-05,
"loss": 0.36940808296203614,
"memory(GiB)": 75.82,
"step": 800,
"token_acc": 0.8981329839502129,
"train_speed(iter/s)": 0.312237
},
{
"epoch": 0.538425945400244,
"eval_loss": 0.3978128135204315,
"eval_runtime": 6.3728,
"eval_samples_per_second": 37.66,
"eval_steps_per_second": 37.66,
"eval_token_acc": 0.8903276955602537,
"step": 800
},
{
"epoch": 0.5417911075589955,
"grad_norm": 1.9453125,
"learning_rate": 4.340815074483199e-05,
"loss": 0.34389219284057615,
"memory(GiB)": 75.82,
"step": 805,
"token_acc": 0.8938571824626718,
"train_speed(iter/s)": 0.305594
},
{
"epoch": 0.545156269717747,
"grad_norm": 2.109375,
"learning_rate": 4.288425808633575e-05,
"loss": 0.31910243034362795,
"memory(GiB)": 75.82,
"step": 810,
"token_acc": 0.9097661188369153,
"train_speed(iter/s)": 0.306364
},
{
"epoch": 0.5485214318764986,
"grad_norm": 2.109375,
"learning_rate": 4.236116159266265e-05,
"loss": 0.3916430950164795,
"memory(GiB)": 75.82,
"step": 815,
"token_acc": 0.8918385922330098,
"train_speed(iter/s)": 0.307181
},
{
"epoch": 0.5518865940352501,
"grad_norm": 2.265625,
"learning_rate": 4.1838919791937034e-05,
"loss": 0.38680903911590575,
"memory(GiB)": 75.82,
"step": 820,
"token_acc": 0.8982239382239382,
"train_speed(iter/s)": 0.307962
},
{
"epoch": 0.5552517561940016,
"grad_norm": 3.09375,
"learning_rate": 4.131759111665349e-05,
"loss": 0.39566311836242674,
"memory(GiB)": 75.82,
"step": 825,
"token_acc": 0.8838128359152703,
"train_speed(iter/s)": 0.308723
},
{
"epoch": 0.5586169183527532,
"grad_norm": 2.609375,
"learning_rate": 4.0797233897138985e-05,
"loss": 0.3857170820236206,
"memory(GiB)": 75.82,
"step": 830,
"token_acc": 0.8954257979114576,
"train_speed(iter/s)": 0.309512
},
{
"epoch": 0.5619820805115047,
"grad_norm": 2.109375,
"learning_rate": 4.027790635502646e-05,
"loss": 0.4497522354125977,
"memory(GiB)": 75.82,
"step": 835,
"token_acc": 0.8830426939266386,
"train_speed(iter/s)": 0.310275
},
{
"epoch": 0.5653472426702562,
"grad_norm": 2.484375,
"learning_rate": 3.9759666596740476e-05,
"loss": 0.36929664611816404,
"memory(GiB)": 75.82,
"step": 840,
"token_acc": 0.9050632911392406,
"train_speed(iter/s)": 0.31104
},
{
"epoch": 0.5687124048290076,
"grad_norm": 2.828125,
"learning_rate": 3.924257260699583e-05,
"loss": 0.4202712535858154,
"memory(GiB)": 75.82,
"step": 845,
"token_acc": 0.8854133418448771,
"train_speed(iter/s)": 0.311816
},
{
"epoch": 0.5720775669877592,
"grad_norm": 1.875,
"learning_rate": 3.8726682242309794e-05,
"loss": 0.3440741777420044,
"memory(GiB)": 75.82,
"step": 850,
"token_acc": 0.9036319612590799,
"train_speed(iter/s)": 0.312528
},
{
"epoch": 0.5720775669877592,
"eval_loss": 0.38969123363494873,
"eval_runtime": 6.3806,
"eval_samples_per_second": 37.614,
"eval_steps_per_second": 37.614,
"eval_token_acc": 0.8914376321353066,
"step": 850
},
{
"epoch": 0.5754427291465107,
"grad_norm": 2.25,
"learning_rate": 3.821205322452863e-05,
"loss": 0.3591428756713867,
"memory(GiB)": 75.82,
"step": 855,
"token_acc": 0.8932054420676646,
"train_speed(iter/s)": 0.30595
},
{
"epoch": 0.5788078913052622,
"grad_norm": 2.109375,
"learning_rate": 3.769874313436933e-05,
"loss": 0.4184281349182129,
"memory(GiB)": 75.82,
"step": 860,
"token_acc": 0.8862865449846551,
"train_speed(iter/s)": 0.306767
},
{
"epoch": 0.5821730534640138,
"grad_norm": 1.890625,
"learning_rate": 3.718680940497687e-05,
"loss": 0.4002545833587646,
"memory(GiB)": 75.82,
"step": 865,
"token_acc": 0.896395693555937,
"train_speed(iter/s)": 0.307516
},
{
"epoch": 0.5855382156227653,
"grad_norm": 2.71875,
"learning_rate": 3.6676309315498256e-05,
"loss": 0.43825640678405764,
"memory(GiB)": 75.82,
"step": 870,
"token_acc": 0.8854260764829871,
"train_speed(iter/s)": 0.308318
},
{
"epoch": 0.5889033777815168,
"grad_norm": 2.109375,
"learning_rate": 3.616729998467365e-05,
"loss": 0.4464766025543213,
"memory(GiB)": 75.82,
"step": 875,
"token_acc": 0.8829486224869695,
"train_speed(iter/s)": 0.309057
},
{
"epoch": 0.5922685399402684,
"grad_norm": 2.328125,
"learning_rate": 3.5659838364445505e-05,
"loss": 0.3885576486587524,
"memory(GiB)": 75.82,
"step": 880,
"token_acc": 0.8979990239141045,
"train_speed(iter/s)": 0.309821
},
{
"epoch": 0.5956337020990199,
"grad_norm": 2.359375,
"learning_rate": 3.515398123358627e-05,
"loss": 0.29079430103302,
"memory(GiB)": 75.82,
"step": 885,
"token_acc": 0.9171314741035856,
"train_speed(iter/s)": 0.310587
},
{
"epoch": 0.5989988642577714,
"grad_norm": 1.8984375,
"learning_rate": 3.464978519134561e-05,
"loss": 0.35250732898712156,
"memory(GiB)": 75.82,
"step": 890,
"token_acc": 0.9011910753229324,
"train_speed(iter/s)": 0.311324
},
{
"epoch": 0.602364026416523,
"grad_norm": 2.796875,
"learning_rate": 3.414730665111766e-05,
"loss": 0.5763841152191163,
"memory(GiB)": 75.82,
"step": 895,
"token_acc": 0.850735294117647,
"train_speed(iter/s)": 0.312078
},
{
"epoch": 0.6057291885752745,
"grad_norm": 1.8125,
"learning_rate": 3.364660183412892e-05,
"loss": 0.3474919319152832,
"memory(GiB)": 75.82,
"step": 900,
"token_acc": 0.9043635170603674,
"train_speed(iter/s)": 0.312842
},
{
"epoch": 0.6057291885752745,
"eval_loss": 0.3748236298561096,
"eval_runtime": 6.318,
"eval_samples_per_second": 37.987,
"eval_steps_per_second": 37.987,
"eval_token_acc": 0.896247357293869,
"step": 900
},
{
"epoch": 0.609094350734026,
"grad_norm": 2.453125,
"learning_rate": 3.314772676314791e-05,
"loss": 0.3573255777359009,
"memory(GiB)": 75.82,
"step": 905,
"token_acc": 0.8959614555995142,
"train_speed(iter/s)": 0.306732
},
{
"epoch": 0.6124595128927776,
"grad_norm": 2.671875,
"learning_rate": 3.2650737256216886e-05,
"loss": 0.35031719207763673,
"memory(GiB)": 75.82,
"step": 910,
"token_acc": 0.901930971567584,
"train_speed(iter/s)": 0.307389
},
{
"epoch": 0.615824675051529,
"grad_norm": 2.359375,
"learning_rate": 3.215568892040641e-05,
"loss": 0.42179441452026367,
"memory(GiB)": 75.82,
"step": 915,
"token_acc": 0.8914620966496835,
"train_speed(iter/s)": 0.308116
},
{
"epoch": 0.6191898372102805,
"grad_norm": 1.9375,
"learning_rate": 3.16626371455937e-05,
"loss": 0.3349519968032837,
"memory(GiB)": 75.82,
"step": 920,
"token_acc": 0.9102564102564102,
"train_speed(iter/s)": 0.3088
},
{
"epoch": 0.6225549993690321,
"grad_norm": 2.265625,
"learning_rate": 3.1171637098265064e-05,
"loss": 0.3286914587020874,
"memory(GiB)": 75.82,
"step": 925,
"token_acc": 0.9120385232744783,
"train_speed(iter/s)": 0.309527
},
{
"epoch": 0.6259201615277836,
"grad_norm": 2.078125,
"learning_rate": 3.0682743715343564e-05,
"loss": 0.32795827388763427,
"memory(GiB)": 75.82,
"step": 930,
"token_acc": 0.9038667278007031,
"train_speed(iter/s)": 0.3102
},
{
"epoch": 0.6292853236865351,
"grad_norm": 1.7890625,
"learning_rate": 3.019601169804216e-05,
"loss": 0.35293123722076414,
"memory(GiB)": 75.82,
"step": 935,
"token_acc": 0.9023519870235198,
"train_speed(iter/s)": 0.310922
},
{
"epoch": 0.6326504858452867,
"grad_norm": 1.9609375,
"learning_rate": 2.9711495505743313e-05,
"loss": 0.2731185436248779,
"memory(GiB)": 75.82,
"step": 940,
"token_acc": 0.923992673992674,
"train_speed(iter/s)": 0.311559
},
{
"epoch": 0.6360156480040382,
"grad_norm": 2.25,
"learning_rate": 2.9229249349905684e-05,
"loss": 0.46314468383789065,
"memory(GiB)": 75.82,
"step": 945,
"token_acc": 0.8777718407694363,
"train_speed(iter/s)": 0.312255
},
{
"epoch": 0.6393808101627897,
"grad_norm": 2.109375,
"learning_rate": 2.8749327187998515e-05,
"loss": 0.3386242151260376,
"memory(GiB)": 75.82,
"step": 950,
"token_acc": 0.906275336468299,
"train_speed(iter/s)": 0.312929
},
{
"epoch": 0.6393808101627897,
"eval_loss": 0.36514538526535034,
"eval_runtime": 6.288,
"eval_samples_per_second": 38.168,
"eval_steps_per_second": 38.168,
"eval_token_acc": 0.9002642706131079,
"step": 950
},
{
"epoch": 0.6427459723215413,
"grad_norm": 2.0625,
"learning_rate": 2.827178271746441e-05,
"loss": 0.3628067970275879,
"memory(GiB)": 75.82,
"step": 955,
"token_acc": 0.9006407689227073,
"train_speed(iter/s)": 0.307148
},
{
"epoch": 0.6461111344802928,
"grad_norm": 2.171875,
"learning_rate": 2.7796669369711294e-05,
"loss": 0.35329625606536863,
"memory(GiB)": 75.82,
"step": 960,
"token_acc": 0.9036697247706422,
"train_speed(iter/s)": 0.307809
},
{
"epoch": 0.6494762966390443,
"grad_norm": 2.03125,
"learning_rate": 2.7324040304134123e-05,
"loss": 0.3087867259979248,
"memory(GiB)": 75.82,
"step": 965,
"token_acc": 0.915299187800431,
"train_speed(iter/s)": 0.30851
},
{
"epoch": 0.6528414587977959,
"grad_norm": 2.25,
"learning_rate": 2.6853948402166878e-05,
"loss": 0.3155955791473389,
"memory(GiB)": 75.82,
"step": 970,
"token_acc": 0.912190414924413,
"train_speed(iter/s)": 0.309237
},
{
"epoch": 0.6562066209565474,
"grad_norm": 2.078125,
"learning_rate": 2.638644626136587e-05,
"loss": 0.31089558601379397,
"memory(GiB)": 75.82,
"step": 975,
"token_acc": 0.9141963109354414,
"train_speed(iter/s)": 0.30992
},
{
"epoch": 0.6595717831152988,
"grad_norm": 2.390625,
"learning_rate": 2.5921586189524694e-05,
"loss": 0.38663172721862793,
"memory(GiB)": 75.82,
"step": 980,
"token_acc": 0.9041765169424744,
"train_speed(iter/s)": 0.310581
},
{
"epoch": 0.6629369452740504,
"grad_norm": 2.140625,
"learning_rate": 2.5459420198821605e-05,
"loss": 0.34139630794525144,
"memory(GiB)": 75.82,
"step": 985,
"token_acc": 0.908329455560726,
"train_speed(iter/s)": 0.311278
},
{
"epoch": 0.6663021074328019,
"grad_norm": 1.953125,
"learning_rate": 2.500000000000001e-05,
"loss": 0.30913376808166504,
"memory(GiB)": 75.82,
"step": 990,
"token_acc": 0.9187433922368222,
"train_speed(iter/s)": 0.311944
},
{
"epoch": 0.6696672695915534,
"grad_norm": 2.53125,
"learning_rate": 2.454337699658267e-05,
"loss": 0.3810436248779297,
"memory(GiB)": 75.82,
"step": 995,
"token_acc": 0.8920236336779911,
"train_speed(iter/s)": 0.312598
},
{
"epoch": 0.6730324317503049,
"grad_norm": 2.546875,
"learning_rate": 2.4089602279120222e-05,
"loss": 0.3346914768218994,
"memory(GiB)": 75.82,
"step": 1000,
"token_acc": 0.9072181670721817,
"train_speed(iter/s)": 0.313245
},
{
"epoch": 0.6730324317503049,
"eval_loss": 0.35263723134994507,
"eval_runtime": 6.3269,
"eval_samples_per_second": 37.933,
"eval_steps_per_second": 37.933,
"eval_token_acc": 0.9024841437632135,
"step": 1000
},
{
"epoch": 0.6763975939090565,
"grad_norm": 2.53125,
"learning_rate": 2.363872661947488e-05,
"loss": 0.3666444063186646,
"memory(GiB)": 75.82,
"step": 1005,
"token_acc": 0.901596274033677,
"train_speed(iter/s)": 0.307343
},
{
"epoch": 0.679762756067808,
"grad_norm": 1.8828125,
"learning_rate": 2.319080046513954e-05,
"loss": 0.2809803247451782,
"memory(GiB)": 75.82,
"step": 1010,
"token_acc": 0.9265139116202946,
"train_speed(iter/s)": 0.308004
},
{
"epoch": 0.6831279182265595,
"grad_norm": 1.734375,
"learning_rate": 2.274587393359342e-05,
"loss": 0.3413016557693481,
"memory(GiB)": 75.82,
"step": 1015,
"token_acc": 0.9119178921568627,
"train_speed(iter/s)": 0.30864
},
{
"epoch": 0.6864930803853111,
"grad_norm": 2.625,
"learning_rate": 2.2303996806694488e-05,
"loss": 0.378632116317749,
"memory(GiB)": 75.82,
"step": 1020,
"token_acc": 0.9010562286424355,
"train_speed(iter/s)": 0.309282
},
{
"epoch": 0.6898582425440626,
"grad_norm": 1.8359375,
"learning_rate": 2.1865218525109495e-05,
"loss": 0.32521207332611085,
"memory(GiB)": 75.82,
"step": 1025,
"token_acc": 0.9113677264547091,
"train_speed(iter/s)": 0.309938
},
{
"epoch": 0.6932234047028141,
"grad_norm": 2.125,
"learning_rate": 2.1429588182782144e-05,
"loss": 0.3491218090057373,
"memory(GiB)": 75.82,
"step": 1030,
"token_acc": 0.9001129578828465,
"train_speed(iter/s)": 0.310579
},
{
"epoch": 0.6965885668615657,
"grad_norm": 1.9921875,
"learning_rate": 2.09971545214401e-05,
"loss": 0.32745966911315916,
"memory(GiB)": 75.82,
"step": 1035,
"token_acc": 0.9061001818482394,
"train_speed(iter/s)": 0.311215
},
{
"epoch": 0.6999537290203172,
"grad_norm": 2.40625,
"learning_rate": 2.0567965925141363e-05,
"loss": 0.4002220153808594,
"memory(GiB)": 75.82,
"step": 1040,
"token_acc": 0.8949858088930936,
"train_speed(iter/s)": 0.311883
},
{
"epoch": 0.7033188911790687,
"grad_norm": 2.234375,
"learning_rate": 2.0142070414860704e-05,
"loss": 0.33180482387542726,
"memory(GiB)": 75.82,
"step": 1045,
"token_acc": 0.9061082552162081,
"train_speed(iter/s)": 0.31252
},
{
"epoch": 0.7066840533378203,
"grad_norm": 2.375,
"learning_rate": 1.9719515643116674e-05,
"loss": 0.2780953884124756,
"memory(GiB)": 75.82,
"step": 1050,
"token_acc": 0.9193521731945133,
"train_speed(iter/s)": 0.313162
},
{
"epoch": 0.7066840533378203,
"eval_loss": 0.3458307981491089,
"eval_runtime": 6.3244,
"eval_samples_per_second": 37.948,
"eval_steps_per_second": 37.948,
"eval_token_acc": 0.9045983086680761,
"step": 1050
},
{
"epoch": 0.7100492154965717,
"grad_norm": 1.8359375,
"learning_rate": 1.9300348888639914e-05,
"loss": 0.2843871355056763,
"memory(GiB)": 75.82,
"step": 1055,
"token_acc": 0.908830434955629,
"train_speed(iter/s)": 0.307659
},
{
"epoch": 0.7134143776553232,
"grad_norm": 2.265625,
"learning_rate": 1.888461705108318e-05,
"loss": 0.31880433559417726,
"memory(GiB)": 75.82,
"step": 1060,
"token_acc": 0.9162755488266465,
"train_speed(iter/s)": 0.308265
},
{
"epoch": 0.7167795398140748,
"grad_norm": 2.53125,
"learning_rate": 1.847236664577389e-05,
"loss": 0.3458749055862427,
"memory(GiB)": 75.82,
"step": 1065,
"token_acc": 0.905449976441024,
"train_speed(iter/s)": 0.308846
},
{
"epoch": 0.7201447019728263,
"grad_norm": 2.296875,
"learning_rate": 1.8063643798509593e-05,
"loss": 0.3110387325286865,
"memory(GiB)": 75.82,
"step": 1070,
"token_acc": 0.9163458691145988,
"train_speed(iter/s)": 0.309494
},
{
"epoch": 0.7235098641315778,
"grad_norm": 2.484375,
"learning_rate": 1.7658494240397126e-05,
"loss": 0.3040132522583008,
"memory(GiB)": 75.82,
"step": 1075,
"token_acc": 0.9117370892018779,
"train_speed(iter/s)": 0.310049
},
{
"epoch": 0.7268750262903294,
"grad_norm": 2.125,
"learning_rate": 1.725696330273575e-05,
"loss": 0.28740246295928956,
"memory(GiB)": 75.82,
"step": 1080,
"token_acc": 0.9206708975521306,
"train_speed(iter/s)": 0.310661
},
{
"epoch": 0.7302401884490809,
"grad_norm": 2.171875,
"learning_rate": 1.68590959119452e-05,
"loss": 0.3025235176086426,
"memory(GiB)": 75.82,
"step": 1085,
"token_acc": 0.912094861660079,
"train_speed(iter/s)": 0.311236
},
{
"epoch": 0.7336053506078324,
"grad_norm": 1.9375,
"learning_rate": 1.646493658453896e-05,
"loss": 0.3215456485748291,
"memory(GiB)": 75.82,
"step": 1090,
"token_acc": 0.914180252230083,
"train_speed(iter/s)": 0.31184
},
{
"epoch": 0.736970512766584,
"grad_norm": 2.609375,
"learning_rate": 1.60745294221434e-05,
"loss": 0.35790162086486815,
"memory(GiB)": 75.82,
"step": 1095,
"token_acc": 0.8988936693300553,
"train_speed(iter/s)": 0.31246
},
{
"epoch": 0.7403356749253355,
"grad_norm": 2.078125,
"learning_rate": 1.5687918106563326e-05,
"loss": 0.3286574840545654,
"memory(GiB)": 75.82,
"step": 1100,
"token_acc": 0.9104522765088489,
"train_speed(iter/s)": 0.313052
},
{
"epoch": 0.7403356749253355,
"eval_loss": 0.34071242809295654,
"eval_runtime": 6.4053,
"eval_samples_per_second": 37.469,
"eval_steps_per_second": 37.469,
"eval_token_acc": 0.9052325581395348,
"step": 1100
},
{
"epoch": 0.743700837084087,
"grad_norm": 2.5625,
"learning_rate": 1.5305145894894547e-05,
"loss": 0.3178743600845337,
"memory(GiB)": 75.82,
"step": 1105,
"token_acc": 0.9053732762719924,
"train_speed(iter/s)": 0.30749
},
{
"epoch": 0.7470659992428386,
"grad_norm": 1.9453125,
"learning_rate": 1.4926255614683932e-05,
"loss": 0.28967604637145994,
"memory(GiB)": 75.82,
"step": 1110,
"token_acc": 0.9175757575757576,
"train_speed(iter/s)": 0.308051
},
{
"epoch": 0.75043116140159,
"grad_norm": 1.921875,
"learning_rate": 1.4551289659137496e-05,
"loss": 0.3481321096420288,
"memory(GiB)": 75.82,
"step": 1115,
"token_acc": 0.9100428367444074,
"train_speed(iter/s)": 0.308681
},
{
"epoch": 0.7537963235603415,
"grad_norm": 2.34375,
"learning_rate": 1.4180289982377137e-05,
"loss": 0.3199401617050171,
"memory(GiB)": 75.82,
"step": 1120,
"token_acc": 0.9106353591160221,
"train_speed(iter/s)": 0.309223
},
{
"epoch": 0.7571614857190931,
"grad_norm": 2.421875,
"learning_rate": 1.3813298094746491e-05,
"loss": 0.2976421356201172,
"memory(GiB)": 75.82,
"step": 1125,
"token_acc": 0.9175590435675517,
"train_speed(iter/s)": 0.309803
},
{
"epoch": 0.7605266478778446,
"grad_norm": 3.046875,
"learning_rate": 1.345035505816642e-05,
"loss": 0.2948709726333618,
"memory(GiB)": 75.82,
"step": 1130,
"token_acc": 0.9145778364116095,
"train_speed(iter/s)": 0.310376
},
{
"epoch": 0.7638918100365961,
"grad_norm": 2.21875,
"learning_rate": 1.3091501481540674e-05,
"loss": 0.3075523853302002,
"memory(GiB)": 75.82,
"step": 1135,
"token_acc": 0.9111648285239462,
"train_speed(iter/s)": 0.310961
},
{
"epoch": 0.7672569721953476,
"grad_norm": 2.390625,
"learning_rate": 1.2736777516212266e-05,
"loss": 0.2970130205154419,
"memory(GiB)": 75.82,
"step": 1140,
"token_acc": 0.914568783498457,
"train_speed(iter/s)": 0.311547
},
{
"epoch": 0.7706221343540992,
"grad_norm": 2.171875,
"learning_rate": 1.238622285147103e-05,
"loss": 0.29731974601745603,
"memory(GiB)": 75.82,
"step": 1145,
"token_acc": 0.9114565731666103,
"train_speed(iter/s)": 0.312152
},
{
"epoch": 0.7739872965128507,
"grad_norm": 2.8125,
"learning_rate": 1.2039876710112847e-05,
"loss": 0.3596015930175781,
"memory(GiB)": 75.82,
"step": 1150,
"token_acc": 0.9056197074672825,
"train_speed(iter/s)": 0.312745
},
{
"epoch": 0.7739872965128507,
"eval_loss": 0.33650800585746765,
"eval_runtime": 6.306,
"eval_samples_per_second": 38.059,
"eval_steps_per_second": 38.059,
"eval_token_acc": 0.9056025369978858,
"step": 1150
},
{
"epoch": 0.7773524586716022,
"grad_norm": 1.8203125,
"learning_rate": 1.1697777844051105e-05,
"loss": 0.280198860168457,
"memory(GiB)": 75.82,
"step": 1155,
"token_acc": 0.9082849646179534,
"train_speed(iter/s)": 0.306862
},
{
"epoch": 0.7807176208303538,
"grad_norm": 3.359375,
"learning_rate": 1.1359964529980849e-05,
"loss": 0.31822154521942136,
"memory(GiB)": 75.82,
"step": 1160,
"token_acc": 0.9165397502284496,
"train_speed(iter/s)": 0.307449
},
{
"epoch": 0.7840827829891053,
"grad_norm": 2.046875,
"learning_rate": 1.1026474565096068e-05,
"loss": 0.3568312883377075,
"memory(GiB)": 75.82,
"step": 1165,
"token_acc": 0.9036163277976098,
"train_speed(iter/s)": 0.30803
},
{
"epoch": 0.7874479451478568,
"grad_norm": 2.984375,
"learning_rate": 1.0697345262860636e-05,
"loss": 0.3734995603561401,
"memory(GiB)": 75.82,
"step": 1170,
"token_acc": 0.9021773935318604,
"train_speed(iter/s)": 0.308568
},
{
"epoch": 0.7908131073066084,
"grad_norm": 2.5,
"learning_rate": 1.037261344883343e-05,
"loss": 0.3525618314743042,
"memory(GiB)": 75.82,
"step": 1175,
"token_acc": 0.9067342073897497,
"train_speed(iter/s)": 0.309122
},
{
"epoch": 0.7941782694653599,
"grad_norm": 1.5859375,
"learning_rate": 1.0052315456547934e-05,
"loss": 0.2623563289642334,
"memory(GiB)": 75.82,
"step": 1180,
"token_acc": 0.9251740139211136,
"train_speed(iter/s)": 0.309652
},
{
"epoch": 0.7975434316241113,
"grad_norm": 2.015625,
"learning_rate": 9.73648712344707e-06,
"loss": 0.27451634407043457,
"memory(GiB)": 75.82,
"step": 1185,
"token_acc": 0.9259454705364996,
"train_speed(iter/s)": 0.310245
},
{
"epoch": 0.8009085937828629,
"grad_norm": 2.1875,
"learning_rate": 9.425163786873292e-06,
"loss": 0.3021913290023804,
"memory(GiB)": 75.82,
"step": 1190,
"token_acc": 0.9174669867947179,
"train_speed(iter/s)": 0.310785
},
{
"epoch": 0.8042737559416144,
"grad_norm": 2.578125,
"learning_rate": 9.118380280114857e-06,
"loss": 0.3496464014053345,
"memory(GiB)": 75.82,
"step": 1195,
"token_acc": 0.9070032573289902,
"train_speed(iter/s)": 0.311393
},
{
"epoch": 0.8076389181003659,
"grad_norm": 1.9765625,
"learning_rate": 8.816170928508365e-06,
"loss": 0.25614950656890867,
"memory(GiB)": 75.82,
"step": 1200,
"token_acc": 0.9283973187081048,
"train_speed(iter/s)": 0.311948
},
{
"epoch": 0.8076389181003659,
"eval_loss": 0.3316649794578552,
"eval_runtime": 6.3957,
"eval_samples_per_second": 37.525,
"eval_steps_per_second": 37.525,
"eval_token_acc": 0.9070824524312896,
"step": 1200
},
{
"epoch": 0.8110040802591175,
"grad_norm": 2.1875,
"learning_rate": 8.5185695455982e-06,
"loss": 0.3539767026901245,
"memory(GiB)": 75.82,
"step": 1205,
"token_acc": 0.9054973715310323,
"train_speed(iter/s)": 0.306783
},
{
"epoch": 0.814369242417869,
"grad_norm": 1.5859375,
"learning_rate": 8.225609429353187e-06,
"loss": 0.24862987995147706,
"memory(GiB)": 76.54,
"step": 1210,
"token_acc": 0.9274929223026109,
"train_speed(iter/s)": 0.307289
},
{
"epoch": 0.8177344045766205,
"grad_norm": 2.875,
"learning_rate": 7.937323358440935e-06,
"loss": 0.30047638416290284,
"memory(GiB)": 76.54,
"step": 1215,
"token_acc": 0.9201830198271479,
"train_speed(iter/s)": 0.307808
},
{
"epoch": 0.8210995667353721,
"grad_norm": 2.21875,
"learning_rate": 7.653743588560386e-06,
"loss": 0.34635608196258544,
"memory(GiB)": 77.35,
"step": 1220,
"token_acc": 0.90292348580221,
"train_speed(iter/s)": 0.308334
},
{
"epoch": 0.8244647288941236,
"grad_norm": 2.15625,
"learning_rate": 7.374901848832683e-06,
"loss": 0.2774034976959229,
"memory(GiB)": 77.35,
"step": 1225,
"token_acc": 0.9153208206023571,
"train_speed(iter/s)": 0.30888
},
{
"epoch": 0.8278298910528751,
"grad_norm": 2.765625,
"learning_rate": 7.100829338251147e-06,
"loss": 0.3617110729217529,
"memory(GiB)": 77.35,
"step": 1230,
"token_acc": 0.901881936625382,
"train_speed(iter/s)": 0.309428
},
{
"epoch": 0.8311950532116267,
"grad_norm": 2.515625,
"learning_rate": 6.831556722190452e-06,
"loss": 0.3457359790802002,
"memory(GiB)": 77.35,
"step": 1235,
"token_acc": 0.9088393543428133,
"train_speed(iter/s)": 0.309927
},
{
"epoch": 0.8345602153703782,
"grad_norm": 2.140625,
"learning_rate": 6.567114128975571e-06,
"loss": 0.2862051486968994,
"memory(GiB)": 77.35,
"step": 1240,
"token_acc": 0.9152869313615174,
"train_speed(iter/s)": 0.310429
},
{
"epoch": 0.8379253775291297,
"grad_norm": 2.09375,
"learning_rate": 6.3075311465107535e-06,
"loss": 0.32160158157348634,
"memory(GiB)": 77.35,
"step": 1245,
"token_acc": 0.9136561235197121,
"train_speed(iter/s)": 0.310932
},
{
"epoch": 0.8412905396878813,
"grad_norm": 2.09375,
"learning_rate": 6.052836818969026e-06,
"loss": 0.3995193958282471,
"memory(GiB)": 78.27,
"step": 1250,
"token_acc": 0.8993502188038721,
"train_speed(iter/s)": 0.311402
},
{
"epoch": 0.8412905396878813,
"eval_loss": 0.33036008477211,
"eval_runtime": 6.3924,
"eval_samples_per_second": 37.544,
"eval_steps_per_second": 37.544,
"eval_token_acc": 0.9071881606765327,
"step": 1250
},
{
"epoch": 0.8446557018466327,
"grad_norm": 1.8046875,
"learning_rate": 5.803059643542491e-06,
"loss": 0.3073962926864624,
"memory(GiB)": 78.27,
"step": 1255,
"token_acc": 0.9106831510540497,
"train_speed(iter/s)": 0.30711
},
{
"epoch": 0.8480208640053842,
"grad_norm": 1.9140625,
"learning_rate": 5.558227567253832e-06,
"loss": 0.2895121812820435,
"memory(GiB)": 78.27,
"step": 1260,
"token_acc": 0.9245490196078432,
"train_speed(iter/s)": 0.307631
},
{
"epoch": 0.8513860261641358,
"grad_norm": 2.484375,
"learning_rate": 5.318367983829392e-06,
"loss": 0.354207706451416,
"memory(GiB)": 78.27,
"step": 1265,
"token_acc": 0.9037365421152628,
"train_speed(iter/s)": 0.308129
},
{
"epoch": 0.8547511883228873,
"grad_norm": 2.734375,
"learning_rate": 5.083507730634152e-06,
"loss": 0.3204244375228882,
"memory(GiB)": 78.27,
"step": 1270,
"token_acc": 0.9147783251231527,
"train_speed(iter/s)": 0.308617
},
{
"epoch": 0.8581163504816388,
"grad_norm": 2.328125,
"learning_rate": 4.853673085668947e-06,
"loss": 0.29280381202697753,
"memory(GiB)": 78.27,
"step": 1275,
"token_acc": 0.9152905198776758,
"train_speed(iter/s)": 0.309122
},
{
"epoch": 0.8614815126403904,
"grad_norm": 1.7578125,
"learning_rate": 4.6288897646302785e-06,
"loss": 0.29522113800048827,
"memory(GiB)": 78.27,
"step": 1280,
"token_acc": 0.9161471321695761,
"train_speed(iter/s)": 0.309632
},
{
"epoch": 0.8648466747991419,
"grad_norm": 2.265625,
"learning_rate": 4.4091829180330505e-06,
"loss": 0.36226553916931153,
"memory(GiB)": 78.27,
"step": 1285,
"token_acc": 0.9011563440519563,
"train_speed(iter/s)": 0.310141
},
{
"epoch": 0.8682118369578934,
"grad_norm": 2.5,
"learning_rate": 4.19457712839652e-06,
"loss": 0.31820580959320066,
"memory(GiB)": 78.27,
"step": 1290,
"token_acc": 0.9157977883096367,
"train_speed(iter/s)": 0.310634
},
{
"epoch": 0.8715769991166449,
"grad_norm": 1.875,
"learning_rate": 3.9850964074938375e-06,
"loss": 0.2922437906265259,
"memory(GiB)": 78.27,
"step": 1295,
"token_acc": 0.9246329526916802,
"train_speed(iter/s)": 0.311131
},
{
"epoch": 0.8749421612753965,
"grad_norm": 2.421875,
"learning_rate": 3.780764193665398e-06,
"loss": 0.2996021509170532,
"memory(GiB)": 78.27,
"step": 1300,
"token_acc": 0.912474373127267,
"train_speed(iter/s)": 0.311627
},
{
"epoch": 0.8749421612753965,
"eval_loss": 0.3295805752277374,
"eval_runtime": 6.303,
"eval_samples_per_second": 38.077,
"eval_steps_per_second": 38.077,
"eval_token_acc": 0.9073467230443975,
"step": 1300
},
{
"epoch": 0.878307323434148,
"grad_norm": 2.25,
"learning_rate": 3.581603349196372e-06,
"loss": 0.31300716400146483,
"memory(GiB)": 78.27,
"step": 1305,
"token_acc": 0.9083045669166369,
"train_speed(iter/s)": 0.307342
},
{
"epoch": 0.8816724855928995,
"grad_norm": 1.8671875,
"learning_rate": 3.3876361577587113e-06,
"loss": 0.2798715114593506,
"memory(GiB)": 78.27,
"step": 1310,
"token_acc": 0.9237044145873321,
"train_speed(iter/s)": 0.307815
},
{
"epoch": 0.8850376477516511,
"grad_norm": 2.921875,
"learning_rate": 3.1988843219178777e-06,
"loss": 0.3821584701538086,
"memory(GiB)": 78.27,
"step": 1315,
"token_acc": 0.9002973861324151,
"train_speed(iter/s)": 0.3083
},
{
"epoch": 0.8884028099104025,
"grad_norm": 2.296875,
"learning_rate": 3.0153689607045845e-06,
"loss": 0.28262839317321775,
"memory(GiB)": 78.27,
"step": 1320,
"token_acc": 0.921765601217656,
"train_speed(iter/s)": 0.308778
},
{
"epoch": 0.891767972069154,
"grad_norm": 2.421875,
"learning_rate": 2.8371106072518195e-06,
"loss": 0.3282342433929443,
"memory(GiB)": 78.27,
"step": 1325,
"token_acc": 0.9107642467972317,
"train_speed(iter/s)": 0.309285
},
{
"epoch": 0.8951331342279056,
"grad_norm": 2.390625,
"learning_rate": 2.664129206497479e-06,
"loss": 0.3914219617843628,
"memory(GiB)": 78.27,
"step": 1330,
"token_acc": 0.8952534427190155,
"train_speed(iter/s)": 0.309779
},
{
"epoch": 0.8984982963866571,
"grad_norm": 1.71875,
"learning_rate": 2.496444112952734e-06,
"loss": 0.315179443359375,
"memory(GiB)": 78.27,
"step": 1335,
"token_acc": 0.9142040038131554,
"train_speed(iter/s)": 0.310272
},
{
"epoch": 0.9018634585454086,
"grad_norm": 2.265625,
"learning_rate": 2.334074088536492e-06,
"loss": 0.36266069412231444,
"memory(GiB)": 78.27,
"step": 1340,
"token_acc": 0.9021089077746302,
"train_speed(iter/s)": 0.310752
},
{
"epoch": 0.9052286207041602,
"grad_norm": 2.5625,
"learning_rate": 2.1770373004762035e-06,
"loss": 0.4528657913208008,
"memory(GiB)": 78.27,
"step": 1345,
"token_acc": 0.8829328404189772,
"train_speed(iter/s)": 0.31123
},
{
"epoch": 0.9085937828629117,
"grad_norm": 2.390625,
"learning_rate": 2.0253513192751373e-06,
"loss": 0.3175913095474243,
"memory(GiB)": 78.27,
"step": 1350,
"token_acc": 0.9110584518167456,
"train_speed(iter/s)": 0.311745
},
{
"epoch": 0.9085937828629117,
"eval_loss": 0.32911160588264465,
"eval_runtime": 6.3063,
"eval_samples_per_second": 38.057,
"eval_steps_per_second": 38.057,
"eval_token_acc": 0.9075052854122622,
"step": 1350
},
{
"epoch": 0.9119589450216632,
"grad_norm": 2.875,
"learning_rate": 1.879033116746476e-06,
"loss": 0.33309078216552734,
"memory(GiB)": 78.27,
"step": 1355,
"token_acc": 0.9085878012402857,
"train_speed(iter/s)": 0.30713
},
{
"epoch": 0.9153241071804148,
"grad_norm": 1.8046875,
"learning_rate": 1.738099064114368e-06,
"loss": 0.27440056800842283,
"memory(GiB)": 78.27,
"step": 1360,
"token_acc": 0.9196319018404908,
"train_speed(iter/s)": 0.307612
},
{
"epoch": 0.9186892693391663,
"grad_norm": 2.1875,
"learning_rate": 1.6025649301821876e-06,
"loss": 0.3164578199386597,
"memory(GiB)": 78.27,
"step": 1365,
"token_acc": 0.912046908315565,
"train_speed(iter/s)": 0.308062
},
{
"epoch": 0.9220544314979178,
"grad_norm": 2.359375,
"learning_rate": 1.4724458795681962e-06,
"loss": 0.35011940002441405,
"memory(GiB)": 78.27,
"step": 1370,
"token_acc": 0.9021322288694585,
"train_speed(iter/s)": 0.30853
},
{
"epoch": 0.9254195936566694,
"grad_norm": 1.78125,
"learning_rate": 1.3477564710088098e-06,
"loss": 0.26722261905670164,
"memory(GiB)": 78.27,
"step": 1375,
"token_acc": 0.929877564000636,
"train_speed(iter/s)": 0.309022
},
{
"epoch": 0.9287847558154209,
"grad_norm": 2.09375,
"learning_rate": 1.2285106557296477e-06,
"loss": 0.2892286777496338,
"memory(GiB)": 78.27,
"step": 1380,
"token_acc": 0.9129415442325727,
"train_speed(iter/s)": 0.309492
},
{
"epoch": 0.9321499179741723,
"grad_norm": 6.71875,
"learning_rate": 1.1147217758845751e-06,
"loss": 0.32126703262329104,
"memory(GiB)": 78.27,
"step": 1385,
"token_acc": 0.9072528883183568,
"train_speed(iter/s)": 0.309961
},
{
"epoch": 0.935515080132924,
"grad_norm": 2.46875,
"learning_rate": 1.0064025630628582e-06,
"loss": 0.29770004749298096,
"memory(GiB)": 78.27,
"step": 1390,
"token_acc": 0.9165990588998865,
"train_speed(iter/s)": 0.310404
},
{
"epoch": 0.9388802422916754,
"grad_norm": 2.0,
"learning_rate": 9.035651368646648e-07,
"loss": 0.32735161781311034,
"memory(GiB)": 78.27,
"step": 1395,
"token_acc": 0.910242711036483,
"train_speed(iter/s)": 0.310858
},
{
"epoch": 0.9422454044504269,
"grad_norm": 1.7578125,
"learning_rate": 8.062210035450379e-07,
"loss": 0.3620461463928223,
"memory(GiB)": 78.27,
"step": 1400,
"token_acc": 0.9046025104602511,
"train_speed(iter/s)": 0.311345
},
{
"epoch": 0.9422454044504269,
"eval_loss": 0.3291800916194916,
"eval_runtime": 6.3197,
"eval_samples_per_second": 37.976,
"eval_steps_per_second": 37.976,
"eval_token_acc": 0.90776955602537,
"step": 1400
},
{
"epoch": 0.9456105666091785,
"grad_norm": 1.7734375,
"learning_rate": 7.143810547264762e-07,
"loss": 0.3065107583999634,
"memory(GiB)": 78.27,
"step": 1405,
"token_acc": 0.9103534858174114,
"train_speed(iter/s)": 0.307325
},
{
"epoch": 0.94897572876793,
"grad_norm": 1.59375,
"learning_rate": 6.280555661802856e-07,
"loss": 0.260418701171875,
"memory(GiB)": 78.27,
"step": 1410,
"token_acc": 0.9281273692191054,
"train_speed(iter/s)": 0.307789
},
{
"epoch": 0.9523408909266815,
"grad_norm": 1.53125,
"learning_rate": 5.472541966768551e-07,
"loss": 0.23598823547363282,
"memory(GiB)": 78.27,
"step": 1415,
"token_acc": 0.9302064991195774,
"train_speed(iter/s)": 0.308273
},
{
"epoch": 0.9557060530854331,
"grad_norm": 1.8125,
"learning_rate": 4.7198598690496585e-07,
"loss": 0.2811413288116455,
"memory(GiB)": 78.27,
"step": 1420,
"token_acc": 0.9181454836131095,
"train_speed(iter/s)": 0.308702
},
{
"epoch": 0.9590712152441846,
"grad_norm": 2.28125,
"learning_rate": 4.02259358460233e-07,
"loss": 0.23685033321380616,
"memory(GiB)": 78.27,
"step": 1425,
"token_acc": 0.9318723201524536,
"train_speed(iter/s)": 0.30913
},
{
"epoch": 0.9624363774029361,
"grad_norm": 2.6875,
"learning_rate": 3.380821129028489e-07,
"loss": 0.28534321784973143,
"memory(GiB)": 78.27,
"step": 1430,
"token_acc": 0.9202698558724318,
"train_speed(iter/s)": 0.309604
},
{
"epoch": 0.9658015395616877,
"grad_norm": 1.765625,
"learning_rate": 2.794614308846644e-07,
"loss": 0.30334537029266356,
"memory(GiB)": 78.27,
"step": 1435,
"token_acc": 0.9193307439498059,
"train_speed(iter/s)": 0.310037
},
{
"epoch": 0.9691667017204392,
"grad_norm": 2.796875,
"learning_rate": 2.2640387134577058e-07,
"loss": 0.30445032119750975,
"memory(GiB)": 78.27,
"step": 1440,
"token_acc": 0.9128602730490477,
"train_speed(iter/s)": 0.310511
},
{
"epoch": 0.9725318638791907,
"grad_norm": 2.265625,
"learning_rate": 1.789153707806357e-07,
"loss": 0.3070082187652588,
"memory(GiB)": 78.27,
"step": 1445,
"token_acc": 0.9169588779088301,
"train_speed(iter/s)": 0.310939
},
{
"epoch": 0.9758970260379422,
"grad_norm": 2.1875,
"learning_rate": 1.3700124257388092e-07,
"loss": 0.2871255874633789,
"memory(GiB)": 78.27,
"step": 1450,
"token_acc": 0.9249602543720191,
"train_speed(iter/s)": 0.311389
},
{
"epoch": 0.9758970260379422,
"eval_loss": 0.32921192049980164,
"eval_runtime": 6.3471,
"eval_samples_per_second": 37.813,
"eval_steps_per_second": 37.813,
"eval_token_acc": 0.9072938689217759,
"step": 1450
},
{
"epoch": 0.9792621881966938,
"grad_norm": 2.1875,
"learning_rate": 1.0066617640578368e-07,
"loss": 0.32105896472930906,
"memory(GiB)": 78.27,
"step": 1455,
"token_acc": 0.9072366364488903,
"train_speed(iter/s)": 0.307431
},
{
"epoch": 0.9826273503554452,
"grad_norm": 2.625,
"learning_rate": 6.991423772753636e-08,
"loss": 0.3220836639404297,
"memory(GiB)": 78.27,
"step": 1460,
"token_acc": 0.9141094834232845,
"train_speed(iter/s)": 0.307867
},
{
"epoch": 0.9859925125141967,
"grad_norm": 2.609375,
"learning_rate": 4.474886730641004e-08,
"loss": 0.330595874786377,
"memory(GiB)": 78.27,
"step": 1465,
"token_acc": 0.9087763447625039,
"train_speed(iter/s)": 0.308304
},
{
"epoch": 0.9893576746729483,
"grad_norm": 1.8515625,
"learning_rate": 2.5172880840745873e-08,
"loss": 0.3304997444152832,
"memory(GiB)": 78.27,
"step": 1470,
"token_acc": 0.9131480090157776,
"train_speed(iter/s)": 0.308715
},
{
"epoch": 0.9927228368316998,
"grad_norm": 2.5625,
"learning_rate": 1.1188468644907079e-08,
"loss": 0.29615800380706786,
"memory(GiB)": 78.27,
"step": 1475,
"token_acc": 0.919311727363849,
"train_speed(iter/s)": 0.309186
},
{
"epoch": 0.9960879989904513,
"grad_norm": 2.515625,
"learning_rate": 2.797195404247166e-09,
"loss": 0.37999179363250735,
"memory(GiB)": 78.27,
"step": 1480,
"token_acc": 0.8994573890839451,
"train_speed(iter/s)": 0.309628
},
{
"epoch": 0.9994531611492029,
"grad_norm": 2.453125,
"learning_rate": 0.0,
"loss": 0.29352550506591796,
"memory(GiB)": 78.27,
"step": 1485,
"token_acc": 0.9119153858866303,
"train_speed(iter/s)": 0.310084
},
{
"epoch": 0.9994531611492029,
"eval_loss": 0.3291037976741791,
"eval_runtime": 6.3842,
"eval_samples_per_second": 37.593,
"eval_steps_per_second": 37.593,
"eval_token_acc": 0.9075581395348837,
"step": 1485
}
],
"logging_steps": 5,
"max_steps": 1485,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.771720449253366e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}