TBAC-VLR1-7B-SFT / trainer_state.json
zywoou
add model
3439e22
{
"best_metric": 0.39331543,
"best_model_checkpoint": "/group/40174/Zywoou/mm_math_reasoning/ms-swift-exp2/oly_output/SFT_text13k_geomm13k_test_mimi_e5/v0-20250615-110647/checkpoint-600",
"epoch": 4.977667493796526,
"eval_steps": 50,
"global_step": 1005,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004962779156327543,
"grad_norm": 4.768475281197631,
"learning_rate": 3.921568627450981e-07,
"loss": 0.829961359500885,
"memory(GiB)": 34.54,
"step": 1,
"token_acc": 0.7476113479347347,
"train_speed(iter/s)": 0.01403
},
{
"epoch": 0.02481389578163772,
"grad_norm": 4.398648559000095,
"learning_rate": 1.96078431372549e-06,
"loss": 0.8178610801696777,
"memory(GiB)": 82.86,
"step": 5,
"token_acc": 0.7709430756159729,
"train_speed(iter/s)": 0.016462
},
{
"epoch": 0.04962779156327544,
"grad_norm": 1.8736392022689452,
"learning_rate": 3.92156862745098e-06,
"loss": 0.7623313903808594,
"memory(GiB)": 82.86,
"step": 10,
"token_acc": 0.7958252706986702,
"train_speed(iter/s)": 0.016536
},
{
"epoch": 0.07444168734491315,
"grad_norm": 2.045825816277766,
"learning_rate": 5.882352941176471e-06,
"loss": 0.7174508571624756,
"memory(GiB)": 82.86,
"step": 15,
"token_acc": 0.7836124503600792,
"train_speed(iter/s)": 0.017279
},
{
"epoch": 0.09925558312655088,
"grad_norm": 1.4620891759847205,
"learning_rate": 7.84313725490196e-06,
"loss": 0.6413409233093261,
"memory(GiB)": 82.86,
"step": 20,
"token_acc": 0.7984759662668143,
"train_speed(iter/s)": 0.017064
},
{
"epoch": 0.12406947890818859,
"grad_norm": 1.0676208784737973,
"learning_rate": 9.803921568627451e-06,
"loss": 0.6111278533935547,
"memory(GiB)": 82.86,
"step": 25,
"token_acc": 0.8271531241409104,
"train_speed(iter/s)": 0.016967
},
{
"epoch": 0.1488833746898263,
"grad_norm": 0.7618181414678857,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.5910769462585449,
"memory(GiB)": 82.86,
"step": 30,
"token_acc": 0.8199785748096231,
"train_speed(iter/s)": 0.016793
},
{
"epoch": 0.17369727047146402,
"grad_norm": 0.6486866379137493,
"learning_rate": 1.3725490196078432e-05,
"loss": 0.563320541381836,
"memory(GiB)": 82.86,
"step": 35,
"token_acc": 0.8135446844258112,
"train_speed(iter/s)": 0.016772
},
{
"epoch": 0.19851116625310175,
"grad_norm": 0.6289022048785994,
"learning_rate": 1.568627450980392e-05,
"loss": 0.5803719520568847,
"memory(GiB)": 82.86,
"step": 40,
"token_acc": 0.8021277151893771,
"train_speed(iter/s)": 0.016692
},
{
"epoch": 0.22332506203473945,
"grad_norm": 0.6554404813269014,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.5443972110748291,
"memory(GiB)": 82.86,
"step": 45,
"token_acc": 0.8180514241473452,
"train_speed(iter/s)": 0.01667
},
{
"epoch": 0.24813895781637718,
"grad_norm": 0.7064848224001448,
"learning_rate": 1.9607843137254903e-05,
"loss": 0.5299727439880371,
"memory(GiB)": 82.86,
"step": 50,
"token_acc": 0.8285904060313818,
"train_speed(iter/s)": 0.016801
},
{
"epoch": 0.24813895781637718,
"eval_loss": 0.48841050267219543,
"eval_runtime": 36.0679,
"eval_samples_per_second": 7.181,
"eval_steps_per_second": 0.471,
"eval_token_acc": 0.8262438649949989,
"step": 50
},
{
"epoch": 0.2729528535980149,
"grad_norm": 0.5611469832371672,
"learning_rate": 1.9999132465602526e-05,
"loss": 0.5245039939880372,
"memory(GiB)": 82.86,
"step": 55,
"token_acc": 0.8300031537213912,
"train_speed(iter/s)": 0.016271
},
{
"epoch": 0.2977667493796526,
"grad_norm": 0.6906532228678482,
"learning_rate": 1.9995608365087945e-05,
"loss": 0.5370721340179443,
"memory(GiB)": 88.64,
"step": 60,
"token_acc": 0.8404785794642585,
"train_speed(iter/s)": 0.016239
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.8317526593094506,
"learning_rate": 1.998937443221316e-05,
"loss": 0.5141131401062011,
"memory(GiB)": 88.64,
"step": 65,
"token_acc": 0.8288326753680118,
"train_speed(iter/s)": 0.016216
},
{
"epoch": 0.34739454094292804,
"grad_norm": 0.6085530190810974,
"learning_rate": 1.9980432357011672e-05,
"loss": 0.5197068214416504,
"memory(GiB)": 88.64,
"step": 70,
"token_acc": 0.8228907383370238,
"train_speed(iter/s)": 0.016263
},
{
"epoch": 0.37220843672456577,
"grad_norm": 0.5289893647507578,
"learning_rate": 1.9968784563700586e-05,
"loss": 0.5076879978179931,
"memory(GiB)": 88.64,
"step": 75,
"token_acc": 0.8441243900160267,
"train_speed(iter/s)": 0.016308
},
{
"epoch": 0.3970223325062035,
"grad_norm": 0.6566505741518432,
"learning_rate": 1.9954434210023388e-05,
"loss": 0.5057409286499024,
"memory(GiB)": 88.64,
"step": 80,
"token_acc": 0.8405308008648832,
"train_speed(iter/s)": 0.016444
},
{
"epoch": 0.4218362282878412,
"grad_norm": 0.6488893883989255,
"learning_rate": 1.9937385186393888e-05,
"loss": 0.5170788764953613,
"memory(GiB)": 88.64,
"step": 85,
"token_acc": 0.8368028094412318,
"train_speed(iter/s)": 0.016432
},
{
"epoch": 0.4466501240694789,
"grad_norm": 0.61918836791994,
"learning_rate": 1.9917642114841505e-05,
"loss": 0.4992537498474121,
"memory(GiB)": 88.64,
"step": 90,
"token_acc": 0.8348116071872977,
"train_speed(iter/s)": 0.016433
},
{
"epoch": 0.47146401985111663,
"grad_norm": 0.5951609226226164,
"learning_rate": 1.9895210347758233e-05,
"loss": 0.5035615921020508,
"memory(GiB)": 88.64,
"step": 95,
"token_acc": 0.8535644197481864,
"train_speed(iter/s)": 0.016464
},
{
"epoch": 0.49627791563275436,
"grad_norm": 0.5482278802904141,
"learning_rate": 1.9870095966447592e-05,
"loss": 0.5053007125854492,
"memory(GiB)": 88.64,
"step": 100,
"token_acc": 0.8401285676802426,
"train_speed(iter/s)": 0.01646
},
{
"epoch": 0.49627791563275436,
"eval_loss": 0.4579505920410156,
"eval_runtime": 36.3741,
"eval_samples_per_second": 7.12,
"eval_steps_per_second": 0.467,
"eval_token_acc": 0.8344002233025517,
"step": 100
},
{
"epoch": 0.5210918114143921,
"grad_norm": 0.6730121532635986,
"learning_rate": 1.984230577947597e-05,
"loss": 0.5037405967712403,
"memory(GiB)": 88.64,
"step": 105,
"token_acc": 0.842359144244386,
"train_speed(iter/s)": 0.016306
},
{
"epoch": 0.5459057071960298,
"grad_norm": 0.5976214012072565,
"learning_rate": 1.9811847320826818e-05,
"loss": 0.5020250797271728,
"memory(GiB)": 88.64,
"step": 110,
"token_acc": 0.8231459777567204,
"train_speed(iter/s)": 0.016276
},
{
"epoch": 0.5707196029776674,
"grad_norm": 0.549343197081887,
"learning_rate": 1.977872884785815e-05,
"loss": 0.4945159912109375,
"memory(GiB)": 88.64,
"step": 115,
"token_acc": 0.8355786045950673,
"train_speed(iter/s)": 0.016303
},
{
"epoch": 0.5955334987593052,
"grad_norm": 0.6276703676499341,
"learning_rate": 1.9742959339063977e-05,
"loss": 0.47115507125854494,
"memory(GiB)": 88.64,
"step": 120,
"token_acc": 0.8467319196172923,
"train_speed(iter/s)": 0.016405
},
{
"epoch": 0.6203473945409429,
"grad_norm": 0.48930503328156055,
"learning_rate": 1.9704548491640195e-05,
"loss": 0.4859424591064453,
"memory(GiB)": 88.64,
"step": 125,
"token_acc": 0.8370880609513801,
"train_speed(iter/s)": 0.016377
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.5157224450118135,
"learning_rate": 1.966350671885566e-05,
"loss": 0.4896842956542969,
"memory(GiB)": 88.64,
"step": 130,
"token_acc": 0.8307861188942663,
"train_speed(iter/s)": 0.016357
},
{
"epoch": 0.6699751861042184,
"grad_norm": 0.6061582190169718,
"learning_rate": 1.961984514722914e-05,
"loss": 0.47826318740844725,
"memory(GiB)": 88.64,
"step": 135,
"token_acc": 0.851224802030969,
"train_speed(iter/s)": 0.016369
},
{
"epoch": 0.6947890818858561,
"grad_norm": 0.5516185961170281,
"learning_rate": 1.957357561351287e-05,
"loss": 0.471895694732666,
"memory(GiB)": 88.64,
"step": 140,
"token_acc": 0.8424185959845344,
"train_speed(iter/s)": 0.016408
},
{
"epoch": 0.7196029776674938,
"grad_norm": 0.4947960995048164,
"learning_rate": 1.9524710661483594e-05,
"loss": 0.47608461380004885,
"memory(GiB)": 88.64,
"step": 145,
"token_acc": 0.8376492687508386,
"train_speed(iter/s)": 0.016435
},
{
"epoch": 0.7444168734491315,
"grad_norm": 0.5952017722562365,
"learning_rate": 1.9473263538541916e-05,
"loss": 0.4909799575805664,
"memory(GiB)": 88.64,
"step": 150,
"token_acc": 0.8504602706501667,
"train_speed(iter/s)": 0.016418
},
{
"epoch": 0.7444168734491315,
"eval_loss": 0.4375256896018982,
"eval_runtime": 37.0115,
"eval_samples_per_second": 6.998,
"eval_steps_per_second": 0.459,
"eval_token_acc": 0.8389802516805843,
"step": 150
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.5564783950854367,
"learning_rate": 1.94192481921209e-05,
"loss": 0.4702787399291992,
"memory(GiB)": 88.64,
"step": 155,
"token_acc": 0.8495802700870744,
"train_speed(iter/s)": 0.01631
},
{
"epoch": 0.794044665012407,
"grad_norm": 0.51266416113498,
"learning_rate": 1.936267926590488e-05,
"loss": 0.4766042709350586,
"memory(GiB)": 88.64,
"step": 160,
"token_acc": 0.85023222889173,
"train_speed(iter/s)": 0.016298
},
{
"epoch": 0.8188585607940446,
"grad_norm": 0.4756301510519784,
"learning_rate": 1.9303572095859545e-05,
"loss": 0.4743985652923584,
"memory(GiB)": 88.64,
"step": 165,
"token_acc": 0.8259336826336148,
"train_speed(iter/s)": 0.016268
},
{
"epoch": 0.8436724565756824,
"grad_norm": 0.530391667374793,
"learning_rate": 1.92419427060743e-05,
"loss": 0.47100305557250977,
"memory(GiB)": 88.64,
"step": 170,
"token_acc": 0.8465938389259181,
"train_speed(iter/s)": 0.016284
},
{
"epoch": 0.8684863523573201,
"grad_norm": 0.5711613629006742,
"learning_rate": 1.91778078044181e-05,
"loss": 0.46791276931762693,
"memory(GiB)": 88.64,
"step": 175,
"token_acc": 0.848610394510885,
"train_speed(iter/s)": 0.016279
},
{
"epoch": 0.8933002481389578,
"grad_norm": 0.5215383014934748,
"learning_rate": 1.9111184778009934e-05,
"loss": 0.46720128059387206,
"memory(GiB)": 88.64,
"step": 180,
"token_acc": 0.8429917728410017,
"train_speed(iter/s)": 0.016337
},
{
"epoch": 0.9181141439205955,
"grad_norm": 0.5571619722684491,
"learning_rate": 1.9042091688505104e-05,
"loss": 0.46036605834960936,
"memory(GiB)": 88.64,
"step": 185,
"token_acc": 0.8357287838060233,
"train_speed(iter/s)": 0.016345
},
{
"epoch": 0.9429280397022333,
"grad_norm": 0.59307440115033,
"learning_rate": 1.89705472671987e-05,
"loss": 0.4740591049194336,
"memory(GiB)": 88.64,
"step": 190,
"token_acc": 0.8355302886053912,
"train_speed(iter/s)": 0.016341
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.5582963618116457,
"learning_rate": 1.8896570909947477e-05,
"loss": 0.46262359619140625,
"memory(GiB)": 88.64,
"step": 195,
"token_acc": 0.8320320418277263,
"train_speed(iter/s)": 0.016366
},
{
"epoch": 0.9925558312655087,
"grad_norm": 0.5657776208171481,
"learning_rate": 1.88201826719116e-05,
"loss": 0.462737512588501,
"memory(GiB)": 88.64,
"step": 200,
"token_acc": 0.8392010935601458,
"train_speed(iter/s)": 0.01638
},
{
"epoch": 0.9925558312655087,
"eval_loss": 0.4247380495071411,
"eval_runtime": 36.803,
"eval_samples_per_second": 7.037,
"eval_steps_per_second": 0.462,
"eval_token_acc": 0.8428554813798237,
"step": 200
},
{
"epoch": 1.0148883374689825,
"grad_norm": 0.5518905639056666,
"learning_rate": 1.874140326211766e-05,
"loss": 0.41510915756225586,
"memory(GiB)": 88.64,
"step": 205,
"token_acc": 0.8667589172375147,
"train_speed(iter/s)": 0.016368
},
{
"epoch": 1.0397022332506203,
"grad_norm": 0.586511183543945,
"learning_rate": 1.866025403784439e-05,
"loss": 0.4149285316467285,
"memory(GiB)": 88.64,
"step": 210,
"token_acc": 0.8553997710922171,
"train_speed(iter/s)": 0.016413
},
{
"epoch": 1.064516129032258,
"grad_norm": 0.7352442626023928,
"learning_rate": 1.8576756998832667e-05,
"loss": 0.4105654716491699,
"memory(GiB)": 88.64,
"step": 215,
"token_acc": 0.855252843136141,
"train_speed(iter/s)": 0.016406
},
{
"epoch": 1.0893300248138957,
"grad_norm": 0.6822260227811386,
"learning_rate": 1.849093478132133e-05,
"loss": 0.4040283203125,
"memory(GiB)": 88.64,
"step": 220,
"token_acc": 0.8477043011659889,
"train_speed(iter/s)": 0.016437
},
{
"epoch": 1.1141439205955335,
"grad_norm": 0.5903650939016252,
"learning_rate": 1.8402810651910444e-05,
"loss": 0.3956918716430664,
"memory(GiB)": 88.64,
"step": 225,
"token_acc": 0.8605695208768427,
"train_speed(iter/s)": 0.016458
},
{
"epoch": 1.1389578163771712,
"grad_norm": 0.4978998007240176,
"learning_rate": 1.8312408501253674e-05,
"loss": 0.4057618141174316,
"memory(GiB)": 88.64,
"step": 230,
"token_acc": 0.8609171684050843,
"train_speed(iter/s)": 0.016455
},
{
"epoch": 1.163771712158809,
"grad_norm": 0.4872739462988421,
"learning_rate": 1.8219752837581466e-05,
"loss": 0.40581340789794923,
"memory(GiB)": 88.64,
"step": 235,
"token_acc": 0.8521656572270421,
"train_speed(iter/s)": 0.016453
},
{
"epoch": 1.1885856079404467,
"grad_norm": 0.5234851506683952,
"learning_rate": 1.8124868780056814e-05,
"loss": 0.3941540479660034,
"memory(GiB)": 88.64,
"step": 240,
"token_acc": 0.8785278129421401,
"train_speed(iter/s)": 0.016419
},
{
"epoch": 1.2133995037220844,
"grad_norm": 0.4141490126306291,
"learning_rate": 1.8027782051965408e-05,
"loss": 0.3891263008117676,
"memory(GiB)": 88.64,
"step": 245,
"token_acc": 0.8737247809520465,
"train_speed(iter/s)": 0.016435
},
{
"epoch": 1.2382133995037221,
"grad_norm": 0.5328099303792803,
"learning_rate": 1.7928518973741967e-05,
"loss": 0.4076822280883789,
"memory(GiB)": 88.64,
"step": 250,
"token_acc": 0.8484884195259031,
"train_speed(iter/s)": 0.016452
},
{
"epoch": 1.2382133995037221,
"eval_loss": 0.42026251554489136,
"eval_runtime": 36.9837,
"eval_samples_per_second": 7.003,
"eval_steps_per_second": 0.46,
"eval_token_acc": 0.8458235444627945,
"step": 250
},
{
"epoch": 1.2630272952853598,
"grad_norm": 0.6838326859978137,
"learning_rate": 1.782710645583473e-05,
"loss": 0.4152885913848877,
"memory(GiB)": 88.64,
"step": 255,
"token_acc": 0.8661420845706872,
"train_speed(iter/s)": 0.016363
},
{
"epoch": 1.2878411910669976,
"grad_norm": 0.5427598093088785,
"learning_rate": 1.7723571991409986e-05,
"loss": 0.40700349807739256,
"memory(GiB)": 88.64,
"step": 260,
"token_acc": 0.8587367250591306,
"train_speed(iter/s)": 0.016379
},
{
"epoch": 1.3126550868486353,
"grad_norm": 0.5151142246135575,
"learning_rate": 1.761794364889855e-05,
"loss": 0.40430006980895994,
"memory(GiB)": 88.64,
"step": 265,
"token_acc": 0.8575991930060525,
"train_speed(iter/s)": 0.016383
},
{
"epoch": 1.337468982630273,
"grad_norm": 0.5015356534355152,
"learning_rate": 1.751025006438643e-05,
"loss": 0.40410513877868653,
"memory(GiB)": 88.64,
"step": 270,
"token_acc": 0.8492741510067344,
"train_speed(iter/s)": 0.016388
},
{
"epoch": 1.3622828784119108,
"grad_norm": 0.5321569853483857,
"learning_rate": 1.7400520433851457e-05,
"loss": 0.40181665420532225,
"memory(GiB)": 88.64,
"step": 275,
"token_acc": 0.863210783537583,
"train_speed(iter/s)": 0.016384
},
{
"epoch": 1.3870967741935485,
"grad_norm": 0.5040061479558677,
"learning_rate": 1.728878450524822e-05,
"loss": 0.39484846591949463,
"memory(GiB)": 88.64,
"step": 280,
"token_acc": 0.8496746362506398,
"train_speed(iter/s)": 0.016389
},
{
"epoch": 1.4119106699751862,
"grad_norm": 0.49441565611811733,
"learning_rate": 1.717507257044331e-05,
"loss": 0.38428053855895994,
"memory(GiB)": 89.94,
"step": 285,
"token_acc": 0.8691993353349355,
"train_speed(iter/s)": 0.01639
},
{
"epoch": 1.436724565756824,
"grad_norm": 0.4427481805329584,
"learning_rate": 1.7059415457003144e-05,
"loss": 0.38771657943725585,
"memory(GiB)": 89.94,
"step": 290,
"token_acc": 0.8611781405251951,
"train_speed(iter/s)": 0.016398
},
{
"epoch": 1.4615384615384617,
"grad_norm": 0.4946161550465835,
"learning_rate": 1.694184451983651e-05,
"loss": 0.39400653839111327,
"memory(GiB)": 89.94,
"step": 295,
"token_acc": 0.8555248745014329,
"train_speed(iter/s)": 0.016388
},
{
"epoch": 1.4863523573200992,
"grad_norm": 0.4970277193994229,
"learning_rate": 1.682239163269422e-05,
"loss": 0.3775279998779297,
"memory(GiB)": 89.94,
"step": 300,
"token_acc": 0.870510592163004,
"train_speed(iter/s)": 0.016417
},
{
"epoch": 1.4863523573200992,
"eval_loss": 0.4109738767147064,
"eval_runtime": 37.1778,
"eval_samples_per_second": 6.967,
"eval_steps_per_second": 0.457,
"eval_token_acc": 0.8480263310925542,
"step": 300
},
{
"epoch": 1.5111662531017371,
"grad_norm": 0.5261002463932816,
"learning_rate": 1.6701089179528032e-05,
"loss": 0.3945833683013916,
"memory(GiB)": 89.94,
"step": 305,
"token_acc": 0.8619344282927444,
"train_speed(iter/s)": 0.016379
},
{
"epoch": 1.5359801488833746,
"grad_norm": 0.4575728138284327,
"learning_rate": 1.6577970045711293e-05,
"loss": 0.392360258102417,
"memory(GiB)": 89.94,
"step": 310,
"token_acc": 0.8602967068297687,
"train_speed(iter/s)": 0.016403
},
{
"epoch": 1.5607940446650124,
"grad_norm": 0.40177603172705895,
"learning_rate": 1.6453067609123656e-05,
"loss": 0.3860903739929199,
"memory(GiB)": 89.94,
"step": 315,
"token_acc": 0.872881901939308,
"train_speed(iter/s)": 0.016402
},
{
"epoch": 1.58560794044665,
"grad_norm": 0.4454839273958068,
"learning_rate": 1.6326415731102226e-05,
"loss": 0.4006852149963379,
"memory(GiB)": 89.94,
"step": 320,
"token_acc": 0.8653790030740496,
"train_speed(iter/s)": 0.01639
},
{
"epoch": 1.6104218362282878,
"grad_norm": 0.44655786918995494,
"learning_rate": 1.619804874726171e-05,
"loss": 0.39271857738494875,
"memory(GiB)": 89.94,
"step": 325,
"token_acc": 0.8654047918520109,
"train_speed(iter/s)": 0.0164
},
{
"epoch": 1.6352357320099256,
"grad_norm": 0.4797627971829061,
"learning_rate": 1.6068001458185934e-05,
"loss": 0.37825469970703124,
"memory(GiB)": 89.94,
"step": 330,
"token_acc": 0.8561078435708074,
"train_speed(iter/s)": 0.016385
},
{
"epoch": 1.6600496277915633,
"grad_norm": 0.4861294532147035,
"learning_rate": 1.5936309119993333e-05,
"loss": 0.3874125242233276,
"memory(GiB)": 89.94,
"step": 335,
"token_acc": 0.8552511758857783,
"train_speed(iter/s)": 0.016383
},
{
"epoch": 1.684863523573201,
"grad_norm": 0.41946071511052324,
"learning_rate": 1.5803007434778915e-05,
"loss": 0.38948085308074953,
"memory(GiB)": 89.94,
"step": 340,
"token_acc": 0.8796996530315503,
"train_speed(iter/s)": 0.016394
},
{
"epoch": 1.7096774193548387,
"grad_norm": 0.48162634486442507,
"learning_rate": 1.566813254093538e-05,
"loss": 0.38796045780181887,
"memory(GiB)": 89.94,
"step": 345,
"token_acc": 0.8590043182007245,
"train_speed(iter/s)": 0.016394
},
{
"epoch": 1.7344913151364765,
"grad_norm": 0.4580728622127146,
"learning_rate": 1.553172100335588e-05,
"loss": 0.38542957305908204,
"memory(GiB)": 89.94,
"step": 350,
"token_acc": 0.8623940061939602,
"train_speed(iter/s)": 0.016386
},
{
"epoch": 1.7344913151364765,
"eval_loss": 0.40481674671173096,
"eval_runtime": 37.3868,
"eval_samples_per_second": 6.928,
"eval_steps_per_second": 0.455,
"eval_token_acc": 0.8498720662464236,
"step": 350
},
{
"epoch": 1.759305210918114,
"grad_norm": 0.4101110019379735,
"learning_rate": 1.5393809803521213e-05,
"loss": 0.3963811159133911,
"memory(GiB)": 89.94,
"step": 355,
"token_acc": 0.8648625816625366,
"train_speed(iter/s)": 0.016344
},
{
"epoch": 1.7841191066997517,
"grad_norm": 0.4729658162675547,
"learning_rate": 1.5254436329474062e-05,
"loss": 0.38416252136230467,
"memory(GiB)": 89.94,
"step": 360,
"token_acc": 0.8674258253238613,
"train_speed(iter/s)": 0.01635
},
{
"epoch": 1.8089330024813894,
"grad_norm": 0.48983508519169017,
"learning_rate": 1.5113638365682996e-05,
"loss": 0.3992438316345215,
"memory(GiB)": 89.94,
"step": 365,
"token_acc": 0.8770906339598599,
"train_speed(iter/s)": 0.016361
},
{
"epoch": 1.8337468982630272,
"grad_norm": 0.5401851259187025,
"learning_rate": 1.4971454082799029e-05,
"loss": 0.38352556228637696,
"memory(GiB)": 89.94,
"step": 370,
"token_acc": 0.860992567369929,
"train_speed(iter/s)": 0.016358
},
{
"epoch": 1.858560794044665,
"grad_norm": 0.4230668354018143,
"learning_rate": 1.482792202730745e-05,
"loss": 0.3897742748260498,
"memory(GiB)": 89.94,
"step": 375,
"token_acc": 0.871132879925645,
"train_speed(iter/s)": 0.016351
},
{
"epoch": 1.8833746898263026,
"grad_norm": 0.4019049115904182,
"learning_rate": 1.4683081111077807e-05,
"loss": 0.39033985137939453,
"memory(GiB)": 89.94,
"step": 380,
"token_acc": 0.8773624177836983,
"train_speed(iter/s)": 0.016354
},
{
"epoch": 1.9081885856079404,
"grad_norm": 0.5428075344626317,
"learning_rate": 1.4536970600814789e-05,
"loss": 0.3880493640899658,
"memory(GiB)": 89.94,
"step": 385,
"token_acc": 0.859438589168319,
"train_speed(iter/s)": 0.016358
},
{
"epoch": 1.933002481389578,
"grad_norm": 0.4827683055924454,
"learning_rate": 1.4389630107412942e-05,
"loss": 0.38936262130737304,
"memory(GiB)": 89.94,
"step": 390,
"token_acc": 0.8606305598661481,
"train_speed(iter/s)": 0.016355
},
{
"epoch": 1.9578163771712158,
"grad_norm": 0.4573581809973034,
"learning_rate": 1.424109957521806e-05,
"loss": 0.3780574560165405,
"memory(GiB)": 89.94,
"step": 395,
"token_acc": 0.8642330574236937,
"train_speed(iter/s)": 0.01638
},
{
"epoch": 1.9826302729528535,
"grad_norm": 0.4605796202723116,
"learning_rate": 1.4091419271198197e-05,
"loss": 0.3744480848312378,
"memory(GiB)": 89.94,
"step": 400,
"token_acc": 0.8632874334053888,
"train_speed(iter/s)": 0.016393
},
{
"epoch": 1.9826302729528535,
"eval_loss": 0.40109848976135254,
"eval_runtime": 36.9238,
"eval_samples_per_second": 7.014,
"eval_steps_per_second": 0.46,
"eval_token_acc": 0.8518154962666604,
"step": 400
},
{
"epoch": 2.0049627791563274,
"grad_norm": 0.561513288720162,
"learning_rate": 1.394062977402717e-05,
"loss": 0.36142630577087403,
"memory(GiB)": 89.94,
"step": 405,
"token_acc": 0.8730374487048237,
"train_speed(iter/s)": 0.016389
},
{
"epoch": 2.029776674937965,
"grad_norm": 0.5236938495980928,
"learning_rate": 1.378877196308361e-05,
"loss": 0.30889334678649905,
"memory(GiB)": 89.94,
"step": 410,
"token_acc": 0.8946576830463284,
"train_speed(iter/s)": 0.016398
},
{
"epoch": 2.054590570719603,
"grad_norm": 0.45558075636252165,
"learning_rate": 1.3635887007368467e-05,
"loss": 0.3037071228027344,
"memory(GiB)": 89.94,
"step": 415,
"token_acc": 0.8943636184386575,
"train_speed(iter/s)": 0.016406
},
{
"epoch": 2.0794044665012406,
"grad_norm": 0.5039448394161661,
"learning_rate": 1.348201635434399e-05,
"loss": 0.3107598781585693,
"memory(GiB)": 89.94,
"step": 420,
"token_acc": 0.8958745781247054,
"train_speed(iter/s)": 0.016399
},
{
"epoch": 2.1042183622828783,
"grad_norm": 0.4915435678643858,
"learning_rate": 1.3327201718697232e-05,
"loss": 0.3129460334777832,
"memory(GiB)": 89.94,
"step": 425,
"token_acc": 0.8708129581052927,
"train_speed(iter/s)": 0.016409
},
{
"epoch": 2.129032258064516,
"grad_norm": 0.4526399583483296,
"learning_rate": 1.31714850710311e-05,
"loss": 0.3166365146636963,
"memory(GiB)": 89.94,
"step": 430,
"token_acc": 0.89759354279269,
"train_speed(iter/s)": 0.016395
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.47166304759252065,
"learning_rate": 1.3014908626486032e-05,
"loss": 0.30022444725036623,
"memory(GiB)": 89.94,
"step": 435,
"token_acc": 0.8798537849342958,
"train_speed(iter/s)": 0.016395
},
{
"epoch": 2.1786600496277915,
"grad_norm": 0.4123439151346761,
"learning_rate": 1.2857514833295369e-05,
"loss": 0.3049207925796509,
"memory(GiB)": 89.94,
"step": 440,
"token_acc": 0.8795848668205352,
"train_speed(iter/s)": 0.016409
},
{
"epoch": 2.203473945409429,
"grad_norm": 0.405269067838883,
"learning_rate": 1.2699346361277538e-05,
"loss": 0.3032404661178589,
"memory(GiB)": 89.94,
"step": 445,
"token_acc": 0.893778727363035,
"train_speed(iter/s)": 0.016409
},
{
"epoch": 2.228287841191067,
"grad_norm": 0.37939542721602953,
"learning_rate": 1.2540446090268193e-05,
"loss": 0.3014317512512207,
"memory(GiB)": 89.94,
"step": 450,
"token_acc": 0.8831544347304245,
"train_speed(iter/s)": 0.016419
},
{
"epoch": 2.228287841191067,
"eval_loss": 0.4072587788105011,
"eval_runtime": 37.3946,
"eval_samples_per_second": 6.926,
"eval_steps_per_second": 0.455,
"eval_token_acc": 0.8520934614221581,
"step": 450
},
{
"epoch": 2.2531017369727047,
"grad_norm": 0.3861335333797535,
"learning_rate": 1.2380857098495355e-05,
"loss": 0.30447826385498045,
"memory(GiB)": 89.94,
"step": 455,
"token_acc": 0.881784783123963,
"train_speed(iter/s)": 0.016377
},
{
"epoch": 2.2779156327543424,
"grad_norm": 0.40103897362549834,
"learning_rate": 1.2220622650900833e-05,
"loss": 0.306304407119751,
"memory(GiB)": 89.94,
"step": 460,
"token_acc": 0.9054849560829752,
"train_speed(iter/s)": 0.016371
},
{
"epoch": 2.30272952853598,
"grad_norm": 0.41467304098935237,
"learning_rate": 1.2059786187410984e-05,
"loss": 0.31237101554870605,
"memory(GiB)": 89.94,
"step": 465,
"token_acc": 0.8715191597554331,
"train_speed(iter/s)": 0.016373
},
{
"epoch": 2.327543424317618,
"grad_norm": 0.40212608233202607,
"learning_rate": 1.1898391311160067e-05,
"loss": 0.30408382415771484,
"memory(GiB)": 89.94,
"step": 470,
"token_acc": 0.8885567438564482,
"train_speed(iter/s)": 0.01638
},
{
"epoch": 2.3523573200992556,
"grad_norm": 0.4506239257919707,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.2938546180725098,
"memory(GiB)": 89.94,
"step": 475,
"token_acc": 0.9089126511337576,
"train_speed(iter/s)": 0.016382
},
{
"epoch": 2.3771712158808933,
"grad_norm": 0.42538484493612555,
"learning_rate": 1.1574101477984966e-05,
"loss": 0.3105756759643555,
"memory(GiB)": 89.94,
"step": 480,
"token_acc": 0.8806113552657332,
"train_speed(iter/s)": 0.016388
},
{
"epoch": 2.401985111662531,
"grad_norm": 0.4226912481964332,
"learning_rate": 1.1411294436778562e-05,
"loss": 0.3021634578704834,
"memory(GiB)": 89.94,
"step": 485,
"token_acc": 0.8833968731418979,
"train_speed(iter/s)": 0.01639
},
{
"epoch": 2.4267990074441688,
"grad_norm": 0.449080842036963,
"learning_rate": 1.124810479041248e-05,
"loss": 0.3032838344573975,
"memory(GiB)": 89.94,
"step": 490,
"token_acc": 0.877714128906116,
"train_speed(iter/s)": 0.016386
},
{
"epoch": 2.4516129032258065,
"grad_norm": 0.4577775380023549,
"learning_rate": 1.1084576779974257e-05,
"loss": 0.3055537223815918,
"memory(GiB)": 89.94,
"step": 495,
"token_acc": 0.8952398880779006,
"train_speed(iter/s)": 0.016386
},
{
"epoch": 2.4764267990074442,
"grad_norm": 0.4327610654637386,
"learning_rate": 1.092075473828269e-05,
"loss": 0.3270174741744995,
"memory(GiB)": 89.94,
"step": 500,
"token_acc": 0.8941933336227983,
"train_speed(iter/s)": 0.016386
},
{
"epoch": 2.4764267990074442,
"eval_loss": 0.3992994427680969,
"eval_runtime": 37.0915,
"eval_samples_per_second": 6.983,
"eval_steps_per_second": 0.458,
"eval_token_acc": 0.8543858016794212,
"step": 500
},
{
"epoch": 2.501240694789082,
"grad_norm": 0.43447637201251166,
"learning_rate": 1.0756683077869133e-05,
"loss": 0.30214927196502683,
"memory(GiB)": 89.94,
"step": 505,
"token_acc": 0.8810097805974094,
"train_speed(iter/s)": 0.016353
},
{
"epoch": 2.5260545905707197,
"grad_norm": 0.4282670862928906,
"learning_rate": 1.0592406278937143e-05,
"loss": 0.29954004287719727,
"memory(GiB)": 89.94,
"step": 510,
"token_acc": 0.8962592483454521,
"train_speed(iter/s)": 0.016353
},
{
"epoch": 2.5508684863523574,
"grad_norm": 0.4520012444271277,
"learning_rate": 1.0427968877303809e-05,
"loss": 0.29749062061309817,
"memory(GiB)": 89.94,
"step": 515,
"token_acc": 0.88993587445068,
"train_speed(iter/s)": 0.016352
},
{
"epoch": 2.575682382133995,
"grad_norm": 0.4094260181661943,
"learning_rate": 1.0263415452325967e-05,
"loss": 0.30545458793640134,
"memory(GiB)": 89.94,
"step": 520,
"token_acc": 0.8991213678952933,
"train_speed(iter/s)": 0.01636
},
{
"epoch": 2.600496277915633,
"grad_norm": 0.4362439165953098,
"learning_rate": 1.0098790614814658e-05,
"loss": 0.29534034729003905,
"memory(GiB)": 89.94,
"step": 525,
"token_acc": 0.8916807528895793,
"train_speed(iter/s)": 0.016369
},
{
"epoch": 2.6253101736972706,
"grad_norm": 0.4946976315555759,
"learning_rate": 9.934138994941023e-06,
"loss": 0.3051294803619385,
"memory(GiB)": 89.94,
"step": 530,
"token_acc": 0.8991352720121762,
"train_speed(iter/s)": 0.016377
},
{
"epoch": 2.6501240694789083,
"grad_norm": 0.4359207003478648,
"learning_rate": 9.769505230136962e-06,
"loss": 0.2859165191650391,
"memory(GiB)": 89.94,
"step": 535,
"token_acc": 0.8830987088713036,
"train_speed(iter/s)": 0.016394
},
{
"epoch": 2.674937965260546,
"grad_norm": 0.45484329583426325,
"learning_rate": 9.604933952993822e-06,
"loss": 0.2968073606491089,
"memory(GiB)": 89.94,
"step": 540,
"token_acc": 0.893879447175874,
"train_speed(iter/s)": 0.016401
},
{
"epoch": 2.699751861042184,
"grad_norm": 0.39016064380810367,
"learning_rate": 9.440469779162407e-06,
"loss": 0.30095710754394533,
"memory(GiB)": 89.94,
"step": 545,
"token_acc": 0.8819626291391867,
"train_speed(iter/s)": 0.016402
},
{
"epoch": 2.7245657568238215,
"grad_norm": 0.3894440736965737,
"learning_rate": 9.276157295257566e-06,
"loss": 0.297087574005127,
"memory(GiB)": 89.94,
"step": 550,
"token_acc": 0.8939326285376584,
"train_speed(iter/s)": 0.016414
},
{
"epoch": 2.7245657568238215,
"eval_loss": 0.39560389518737793,
"eval_runtime": 36.8591,
"eval_samples_per_second": 7.027,
"eval_steps_per_second": 0.461,
"eval_token_acc": 0.856389709474076,
"step": 550
},
{
"epoch": 2.749379652605459,
"grad_norm": 0.398345793637711,
"learning_rate": 9.112041046770653e-06,
"loss": 0.3055715084075928,
"memory(GiB)": 89.94,
"step": 555,
"token_acc": 0.8971510298173304,
"train_speed(iter/s)": 0.016389
},
{
"epoch": 2.774193548387097,
"grad_norm": 0.4307995028287168,
"learning_rate": 8.948165525993162e-06,
"loss": 0.30808732509613035,
"memory(GiB)": 89.94,
"step": 560,
"token_acc": 0.888341487335637,
"train_speed(iter/s)": 0.016392
},
{
"epoch": 2.7990074441687343,
"grad_norm": 0.42720829430324325,
"learning_rate": 8.784575159954748e-06,
"loss": 0.29772372245788575,
"memory(GiB)": 89.94,
"step": 565,
"token_acc": 0.8964727272727273,
"train_speed(iter/s)": 0.016385
},
{
"epoch": 2.8238213399503724,
"grad_norm": 0.378943802971695,
"learning_rate": 8.621314298378958e-06,
"loss": 0.2994475126266479,
"memory(GiB)": 89.94,
"step": 570,
"token_acc": 0.8991428363722879,
"train_speed(iter/s)": 0.016389
},
{
"epoch": 2.8486352357320097,
"grad_norm": 0.39340559598269975,
"learning_rate": 8.458427201659926e-06,
"loss": 0.3069624662399292,
"memory(GiB)": 89.94,
"step": 575,
"token_acc": 0.8998604899265521,
"train_speed(iter/s)": 0.016397
},
{
"epoch": 2.873449131513648,
"grad_norm": 0.4314024894376894,
"learning_rate": 8.295958028863285e-06,
"loss": 0.30196504592895507,
"memory(GiB)": 89.94,
"step": 580,
"token_acc": 0.9014831273211464,
"train_speed(iter/s)": 0.016408
},
{
"epoch": 2.898263027295285,
"grad_norm": 0.4237139579995691,
"learning_rate": 8.133950825754511e-06,
"loss": 0.2988776683807373,
"memory(GiB)": 89.94,
"step": 585,
"token_acc": 0.9028119489350919,
"train_speed(iter/s)": 0.016424
},
{
"epoch": 2.9230769230769234,
"grad_norm": 0.4130611959829037,
"learning_rate": 7.972449512858062e-06,
"loss": 0.30088846683502196,
"memory(GiB)": 89.94,
"step": 590,
"token_acc": 0.8994833915566345,
"train_speed(iter/s)": 0.01642
},
{
"epoch": 2.9478908188585606,
"grad_norm": 0.4169476470359364,
"learning_rate": 7.81149787355039e-06,
"loss": 0.3129019021987915,
"memory(GiB)": 89.94,
"step": 595,
"token_acc": 0.8943572216882053,
"train_speed(iter/s)": 0.016416
},
{
"epoch": 2.9727047146401984,
"grad_norm": 0.4840772402114463,
"learning_rate": 7.651139542190164e-06,
"loss": 0.27456250190734866,
"memory(GiB)": 89.94,
"step": 600,
"token_acc": 0.896454707029423,
"train_speed(iter/s)": 0.016428
},
{
"epoch": 2.9727047146401984,
"eval_loss": 0.3933154344558716,
"eval_runtime": 37.5718,
"eval_samples_per_second": 6.893,
"eval_steps_per_second": 0.452,
"eval_token_acc": 0.8575376241538927,
"step": 600
},
{
"epoch": 2.997518610421836,
"grad_norm": 0.3681469729562029,
"learning_rate": 7.491417992288927e-06,
"loss": 0.29853529930114747,
"memory(GiB)": 89.94,
"step": 605,
"token_acc": 0.8887139445589062,
"train_speed(iter/s)": 0.016389
},
{
"epoch": 3.0198511166253104,
"grad_norm": 0.46754553369454177,
"learning_rate": 7.332376524725298e-06,
"loss": 0.25056142807006837,
"memory(GiB)": 89.94,
"step": 610,
"token_acc": 0.9005600640073151,
"train_speed(iter/s)": 0.016407
},
{
"epoch": 3.044665012406948,
"grad_norm": 0.3905145343550401,
"learning_rate": 7.174058256006012e-06,
"loss": 0.24309511184692384,
"memory(GiB)": 89.94,
"step": 615,
"token_acc": 0.9053923110803498,
"train_speed(iter/s)": 0.016402
},
{
"epoch": 3.069478908188586,
"grad_norm": 0.41901102410789404,
"learning_rate": 7.016506106576942e-06,
"loss": 0.23708744049072267,
"memory(GiB)": 89.94,
"step": 620,
"token_acc": 0.921517517259178,
"train_speed(iter/s)": 0.016398
},
{
"epoch": 3.094292803970223,
"grad_norm": 0.36157357718683,
"learning_rate": 6.859762789187259e-06,
"loss": 0.23210906982421875,
"memory(GiB)": 89.94,
"step": 625,
"token_acc": 0.9225797451873896,
"train_speed(iter/s)": 0.016406
},
{
"epoch": 3.119106699751861,
"grad_norm": 0.3805761890187129,
"learning_rate": 6.703870797309922e-06,
"loss": 0.2322997808456421,
"memory(GiB)": 89.94,
"step": 630,
"token_acc": 0.9135864765989655,
"train_speed(iter/s)": 0.016415
},
{
"epoch": 3.1439205955334986,
"grad_norm": 0.3788702415059945,
"learning_rate": 6.548872393621578e-06,
"loss": 0.22191863059997557,
"memory(GiB)": 89.94,
"step": 635,
"token_acc": 0.9246411397786463,
"train_speed(iter/s)": 0.016413
},
{
"epoch": 3.1687344913151363,
"grad_norm": 0.4063569428162902,
"learning_rate": 6.3948095985450755e-06,
"loss": 0.24599046707153321,
"memory(GiB)": 89.94,
"step": 640,
"token_acc": 0.9072380405759489,
"train_speed(iter/s)": 0.016415
},
{
"epoch": 3.193548387096774,
"grad_norm": 0.3557259507590387,
"learning_rate": 6.241724178857621e-06,
"loss": 0.23447060585021973,
"memory(GiB)": 89.94,
"step": 645,
"token_acc": 0.9291399599260514,
"train_speed(iter/s)": 0.016413
},
{
"epoch": 3.2183622828784118,
"grad_norm": 0.3295522367686504,
"learning_rate": 6.089657636367698e-06,
"loss": 0.23479061126708983,
"memory(GiB)": 89.94,
"step": 650,
"token_acc": 0.9199351455699978,
"train_speed(iter/s)": 0.016409
},
{
"epoch": 3.2183622828784118,
"eval_loss": 0.4014388620853424,
"eval_runtime": 37.5517,
"eval_samples_per_second": 6.897,
"eval_steps_per_second": 0.453,
"eval_token_acc": 0.858085413226024,
"step": 650
},
{
"epoch": 3.2431761786600495,
"grad_norm": 0.4240123332842028,
"learning_rate": 5.938651196663865e-06,
"loss": 0.22697579860687256,
"memory(GiB)": 89.94,
"step": 655,
"token_acc": 0.8996346346571273,
"train_speed(iter/s)": 0.016378
},
{
"epoch": 3.267990074441687,
"grad_norm": 0.35633031030131934,
"learning_rate": 5.788745797938372e-06,
"loss": 0.2304630994796753,
"memory(GiB)": 89.94,
"step": 660,
"token_acc": 0.9186445328577308,
"train_speed(iter/s)": 0.01638
},
{
"epoch": 3.292803970223325,
"grad_norm": 0.38300755718973123,
"learning_rate": 5.6399820798887266e-06,
"loss": 0.23805148601531984,
"memory(GiB)": 89.94,
"step": 665,
"token_acc": 0.9132490030897615,
"train_speed(iter/s)": 0.016388
},
{
"epoch": 3.3176178660049627,
"grad_norm": 0.3833393689853116,
"learning_rate": 5.492400372700195e-06,
"loss": 0.23390157222747804,
"memory(GiB)": 89.94,
"step": 670,
"token_acc": 0.9085175452156439,
"train_speed(iter/s)": 0.016384
},
{
"epoch": 3.3424317617866004,
"grad_norm": 0.3525024053865293,
"learning_rate": 5.346040686112189e-06,
"loss": 0.23395137786865233,
"memory(GiB)": 89.94,
"step": 675,
"token_acc": 0.9463714867825449,
"train_speed(iter/s)": 0.016389
},
{
"epoch": 3.367245657568238,
"grad_norm": 0.3487934368974318,
"learning_rate": 5.200942698571527e-06,
"loss": 0.22507448196411134,
"memory(GiB)": 89.94,
"step": 680,
"token_acc": 0.9121472535129486,
"train_speed(iter/s)": 0.016392
},
{
"epoch": 3.392059553349876,
"grad_norm": 0.35248625950301066,
"learning_rate": 5.0571457464755226e-06,
"loss": 0.23436686992645264,
"memory(GiB)": 89.94,
"step": 685,
"token_acc": 0.9249912229851648,
"train_speed(iter/s)": 0.016389
},
{
"epoch": 3.4168734491315136,
"grad_norm": 0.38081790230168544,
"learning_rate": 4.914688813507798e-06,
"loss": 0.2353280544281006,
"memory(GiB)": 89.94,
"step": 690,
"token_acc": 0.9169562569412102,
"train_speed(iter/s)": 0.016393
},
{
"epoch": 3.4416873449131513,
"grad_norm": 0.3810939058594302,
"learning_rate": 4.773610520069706e-06,
"loss": 0.23074874877929688,
"memory(GiB)": 89.94,
"step": 695,
"token_acc": 0.896226352801347,
"train_speed(iter/s)": 0.0164
},
{
"epoch": 3.466501240694789,
"grad_norm": 0.3874910212223461,
"learning_rate": 4.633949112810271e-06,
"loss": 0.22984590530395507,
"memory(GiB)": 89.94,
"step": 700,
"token_acc": 0.9299721620785648,
"train_speed(iter/s)": 0.016397
},
{
"epoch": 3.466501240694789,
"eval_loss": 0.3995462656021118,
"eval_runtime": 37.598,
"eval_samples_per_second": 6.889,
"eval_steps_per_second": 0.452,
"eval_token_acc": 0.8598125188993045,
"step": 700
},
{
"epoch": 3.4913151364764268,
"grad_norm": 0.3616597597470475,
"learning_rate": 4.495742454257418e-06,
"loss": 0.2223682403564453,
"memory(GiB)": 89.94,
"step": 705,
"token_acc": 0.9105715421148296,
"train_speed(iter/s)": 0.016371
},
{
"epoch": 3.5161290322580645,
"grad_norm": 0.3988361808995699,
"learning_rate": 4.359028012553362e-06,
"loss": 0.2163018226623535,
"memory(GiB)": 89.94,
"step": 710,
"token_acc": 0.9318655704692665,
"train_speed(iter/s)": 0.01638
},
{
"epoch": 3.5409429280397022,
"grad_norm": 0.3932524566015415,
"learning_rate": 4.223842851296907e-06,
"loss": 0.23104467391967773,
"memory(GiB)": 89.94,
"step": 715,
"token_acc": 0.9287522767981982,
"train_speed(iter/s)": 0.016381
},
{
"epoch": 3.56575682382134,
"grad_norm": 0.367974151309137,
"learning_rate": 4.090223619495419e-06,
"loss": 0.23323664665222169,
"memory(GiB)": 89.94,
"step": 720,
"token_acc": 0.9070000777236852,
"train_speed(iter/s)": 0.016385
},
{
"epoch": 3.5905707196029777,
"grad_norm": 0.39001981721162915,
"learning_rate": 3.9582065416291926e-06,
"loss": 0.22505450248718262,
"memory(GiB)": 89.94,
"step": 725,
"token_acc": 0.9127140748875999,
"train_speed(iter/s)": 0.016399
},
{
"epoch": 3.6153846153846154,
"grad_norm": 0.3767818229832471,
"learning_rate": 3.827827407830917e-06,
"loss": 0.2194854736328125,
"memory(GiB)": 89.94,
"step": 730,
"token_acc": 0.9305206381130868,
"train_speed(iter/s)": 0.0164
},
{
"epoch": 3.640198511166253,
"grad_norm": 0.41209090580965746,
"learning_rate": 3.6991215641828903e-06,
"loss": 0.217703914642334,
"memory(GiB)": 89.94,
"step": 735,
"token_acc": 0.918243838028169,
"train_speed(iter/s)": 0.016398
},
{
"epoch": 3.665012406947891,
"grad_norm": 0.37397331270090406,
"learning_rate": 3.5721239031346067e-06,
"loss": 0.2251272201538086,
"memory(GiB)": 89.94,
"step": 740,
"token_acc": 0.9240038816389786,
"train_speed(iter/s)": 0.016395
},
{
"epoch": 3.6898263027295286,
"grad_norm": 0.40120593317876174,
"learning_rate": 3.4468688540433425e-06,
"loss": 0.22675325870513915,
"memory(GiB)": 89.94,
"step": 745,
"token_acc": 0.9157191822608735,
"train_speed(iter/s)": 0.016404
},
{
"epoch": 3.7146401985111663,
"grad_norm": 0.36315220537470405,
"learning_rate": 3.323390373840276e-06,
"loss": 0.23883156776428222,
"memory(GiB)": 89.94,
"step": 750,
"token_acc": 0.9140917431192661,
"train_speed(iter/s)": 0.016405
},
{
"epoch": 3.7146401985111663,
"eval_loss": 0.3998472988605499,
"eval_runtime": 37.7998,
"eval_samples_per_second": 6.852,
"eval_steps_per_second": 0.45,
"eval_token_acc": 0.8604498615989393,
"step": 750
},
{
"epoch": 3.739454094292804,
"grad_norm": 0.35204470112010783,
"learning_rate": 3.2017219378246734e-06,
"loss": 0.22259049415588378,
"memory(GiB)": 89.94,
"step": 755,
"token_acc": 0.9101186207181223,
"train_speed(iter/s)": 0.016387
},
{
"epoch": 3.764267990074442,
"grad_norm": 0.3243960514310002,
"learning_rate": 3.0818965305886794e-06,
"loss": 0.22415781021118164,
"memory(GiB)": 89.94,
"step": 760,
"token_acc": 0.9288042256686896,
"train_speed(iter/s)": 0.016394
},
{
"epoch": 3.7890818858560795,
"grad_norm": 0.4054124995924202,
"learning_rate": 2.963946637075107e-06,
"loss": 0.2139434337615967,
"memory(GiB)": 89.94,
"step": 765,
"token_acc": 0.897647245531552,
"train_speed(iter/s)": 0.016399
},
{
"epoch": 3.8138957816377173,
"grad_norm": 0.4068548613060828,
"learning_rate": 2.847904233770692e-06,
"loss": 0.23969681262969972,
"memory(GiB)": 89.94,
"step": 770,
"token_acc": 0.9088209109362202,
"train_speed(iter/s)": 0.016384
},
{
"epoch": 3.838709677419355,
"grad_norm": 0.37626609353926294,
"learning_rate": 2.7338007800372024e-06,
"loss": 0.2259690284729004,
"memory(GiB)": 89.94,
"step": 775,
"token_acc": 0.9220249520153551,
"train_speed(iter/s)": 0.016383
},
{
"epoch": 3.8635235732009927,
"grad_norm": 0.3294821061527916,
"learning_rate": 2.6216672095827267e-06,
"loss": 0.2296532154083252,
"memory(GiB)": 89.94,
"step": 780,
"token_acc": 0.9264005360302354,
"train_speed(iter/s)": 0.016383
},
{
"epoch": 3.8883374689826304,
"grad_norm": 0.4192434658643637,
"learning_rate": 2.5115339220754796e-06,
"loss": 0.21465823650360108,
"memory(GiB)": 89.94,
"step": 785,
"token_acc": 0.9244031530683692,
"train_speed(iter/s)": 0.016387
},
{
"epoch": 3.9131513647642677,
"grad_norm": 0.3555370222229151,
"learning_rate": 2.403430774902373e-06,
"loss": 0.23048720359802247,
"memory(GiB)": 89.94,
"step": 790,
"token_acc": 0.9265224313413317,
"train_speed(iter/s)": 0.016385
},
{
"epoch": 3.937965260545906,
"grad_norm": 0.32165359038918095,
"learning_rate": 2.2973870750746253e-06,
"loss": 0.21675100326538085,
"memory(GiB)": 89.94,
"step": 795,
"token_acc": 0.9277675867070517,
"train_speed(iter/s)": 0.016387
},
{
"epoch": 3.962779156327543,
"grad_norm": 0.3691842237508398,
"learning_rate": 2.193431571282548e-06,
"loss": 0.22982077598571776,
"memory(GiB)": 89.94,
"step": 800,
"token_acc": 0.9162534196640608,
"train_speed(iter/s)": 0.016398
},
{
"epoch": 3.962779156327543,
"eval_loss": 0.3970131278038025,
"eval_runtime": 37.5975,
"eval_samples_per_second": 6.889,
"eval_steps_per_second": 0.452,
"eval_token_acc": 0.8609941615687005,
"step": 800
},
{
"epoch": 3.9875930521091814,
"grad_norm": 0.3711396772864637,
"learning_rate": 2.09159244610172e-06,
"loss": 0.22216348648071288,
"memory(GiB)": 89.94,
"step": 805,
"token_acc": 0.8936650190172618,
"train_speed(iter/s)": 0.016386
},
{
"epoch": 4.009925558312655,
"grad_norm": 0.48689819654773586,
"learning_rate": 1.991897308352624e-06,
"loss": 0.21132183074951172,
"memory(GiB)": 89.94,
"step": 810,
"token_acc": 0.9407312130437598,
"train_speed(iter/s)": 0.016398
},
{
"epoch": 4.034739454094293,
"grad_norm": 0.5032884501009073,
"learning_rate": 1.8943731856158299e-06,
"loss": 0.18714178800582887,
"memory(GiB)": 89.94,
"step": 815,
"token_acc": 0.9151910921344552,
"train_speed(iter/s)": 0.016407
},
{
"epoch": 4.05955334987593,
"grad_norm": 0.47787650705591495,
"learning_rate": 1.799046516904751e-06,
"loss": 0.19512782096862794,
"memory(GiB)": 89.94,
"step": 820,
"token_acc": 0.9267424518609353,
"train_speed(iter/s)": 0.016407
},
{
"epoch": 4.084367245657568,
"grad_norm": 0.3998650942092245,
"learning_rate": 1.7059431454979825e-06,
"loss": 0.19887795448303222,
"memory(GiB)": 89.94,
"step": 825,
"token_acc": 0.9344387354439184,
"train_speed(iter/s)": 0.016408
},
{
"epoch": 4.109181141439206,
"grad_norm": 0.3511648376611919,
"learning_rate": 1.615088311933114e-06,
"loss": 0.20051450729370118,
"memory(GiB)": 89.94,
"step": 830,
"token_acc": 0.94342090168636,
"train_speed(iter/s)": 0.016408
},
{
"epoch": 4.133995037220844,
"grad_norm": 0.36198509668580564,
"learning_rate": 1.5265066471639701e-06,
"loss": 0.19646989107131957,
"memory(GiB)": 89.94,
"step": 835,
"token_acc": 0.9299070545334905,
"train_speed(iter/s)": 0.016403
},
{
"epoch": 4.158808933002481,
"grad_norm": 0.31845761934810685,
"learning_rate": 1.4402221658830963e-06,
"loss": 0.1856994390487671,
"memory(GiB)": 89.94,
"step": 840,
"token_acc": 0.9413985177001335,
"train_speed(iter/s)": 0.016403
},
{
"epoch": 4.183622828784119,
"grad_norm": 0.31837636190529284,
"learning_rate": 1.3562582600113295e-06,
"loss": 0.18745067119598388,
"memory(GiB)": 89.94,
"step": 845,
"token_acc": 0.917115642208662,
"train_speed(iter/s)": 0.016401
},
{
"epoch": 4.208436724565757,
"grad_norm": 0.36629490437638673,
"learning_rate": 1.274637692356181e-06,
"loss": 0.19683722257614136,
"memory(GiB)": 89.94,
"step": 850,
"token_acc": 0.9203514759298465,
"train_speed(iter/s)": 0.0164
},
{
"epoch": 4.208436724565757,
"eval_loss": 0.4071538746356964,
"eval_runtime": 37.7446,
"eval_samples_per_second": 6.862,
"eval_steps_per_second": 0.45,
"eval_token_acc": 0.860109092600777,
"step": 850
},
{
"epoch": 4.233250620347395,
"grad_norm": 0.35352317433834707,
"learning_rate": 1.1953825904408033e-06,
"loss": 0.1799285888671875,
"memory(GiB)": 89.94,
"step": 855,
"token_acc": 0.9219630589493604,
"train_speed(iter/s)": 0.016381
},
{
"epoch": 4.258064516129032,
"grad_norm": 0.3360379010356317,
"learning_rate": 1.118514440505155e-06,
"loss": 0.19137413501739503,
"memory(GiB)": 89.94,
"step": 860,
"token_acc": 0.9249040837868416,
"train_speed(iter/s)": 0.016385
},
{
"epoch": 4.28287841191067,
"grad_norm": 0.3435835544317236,
"learning_rate": 1.0440540816810395e-06,
"loss": 0.1967417359352112,
"memory(GiB)": 89.94,
"step": 865,
"token_acc": 0.9308696646383973,
"train_speed(iter/s)": 0.016379
},
{
"epoch": 4.3076923076923075,
"grad_norm": 0.32859421884913725,
"learning_rate": 9.720217003425648e-07,
"loss": 0.1809452772140503,
"memory(GiB)": 89.94,
"step": 870,
"token_acc": 0.9282937629449756,
"train_speed(iter/s)": 0.016382
},
{
"epoch": 4.332506203473946,
"grad_norm": 0.3395321554617945,
"learning_rate": 9.024368246335735e-07,
"loss": 0.18605422973632812,
"memory(GiB)": 89.94,
"step": 875,
"token_acc": 0.9546436861343081,
"train_speed(iter/s)": 0.016384
},
{
"epoch": 4.357320099255583,
"grad_norm": 0.3393236511573925,
"learning_rate": 8.353183191735115e-07,
"loss": 0.1946401596069336,
"memory(GiB)": 89.94,
"step": 880,
"token_acc": 0.9236204495723095,
"train_speed(iter/s)": 0.016384
},
{
"epoch": 4.382133995037221,
"grad_norm": 0.34078095822336996,
"learning_rate": 7.706843799431985e-07,
"loss": 0.18827946186065675,
"memory(GiB)": 89.94,
"step": 885,
"token_acc": 0.9302919345345024,
"train_speed(iter/s)": 0.016386
},
{
"epoch": 4.406947890818858,
"grad_norm": 0.353572914930731,
"learning_rate": 7.085525293518469e-07,
"loss": 0.1933911681175232,
"memory(GiB)": 89.94,
"step": 890,
"token_acc": 0.9303721907052539,
"train_speed(iter/s)": 0.016395
},
{
"epoch": 4.431761786600497,
"grad_norm": 0.34090308953100995,
"learning_rate": 6.489396114866942e-07,
"loss": 0.18675589561462402,
"memory(GiB)": 89.94,
"step": 895,
"token_acc": 0.9425964870708806,
"train_speed(iter/s)": 0.016397
},
{
"epoch": 4.456575682382134,
"grad_norm": 0.34608919518252224,
"learning_rate": 5.918617875465449e-07,
"loss": 0.19207412004470825,
"memory(GiB)": 89.94,
"step": 900,
"token_acc": 0.9400631757501285,
"train_speed(iter/s)": 0.016399
},
{
"epoch": 4.456575682382134,
"eval_loss": 0.40813976526260376,
"eval_runtime": 37.5655,
"eval_samples_per_second": 6.895,
"eval_steps_per_second": 0.453,
"eval_token_acc": 0.8603184387429927,
"step": 900
},
{
"epoch": 4.481389578163772,
"grad_norm": 0.35859843949116793,
"learning_rate": 5.373345314604206e-07,
"loss": 0.2071406364440918,
"memory(GiB)": 89.94,
"step": 905,
"token_acc": 0.9223047638884068,
"train_speed(iter/s)": 0.016383
},
{
"epoch": 4.506203473945409,
"grad_norm": 0.3089222557610059,
"learning_rate": 4.853726256925407e-07,
"loss": 0.1951405882835388,
"memory(GiB)": 89.94,
"step": 910,
"token_acc": 0.9288247402227017,
"train_speed(iter/s)": 0.016382
},
{
"epoch": 4.5310173697270475,
"grad_norm": 0.3365251848175135,
"learning_rate": 4.359901572347758e-07,
"loss": 0.19402856826782228,
"memory(GiB)": 89.94,
"step": 915,
"token_acc": 0.9069553201289728,
"train_speed(iter/s)": 0.016381
},
{
"epoch": 4.555831265508685,
"grad_norm": 0.352511343998469,
"learning_rate": 3.892005137876209e-07,
"loss": 0.18840408325195312,
"memory(GiB)": 89.94,
"step": 920,
"token_acc": 0.9290194762860235,
"train_speed(iter/s)": 0.016381
},
{
"epoch": 4.580645161290323,
"grad_norm": 0.31455756384327627,
"learning_rate": 3.450163801307582e-07,
"loss": 0.1860198974609375,
"memory(GiB)": 89.94,
"step": 925,
"token_acc": 0.9461412451458935,
"train_speed(iter/s)": 0.016385
},
{
"epoch": 4.60545905707196,
"grad_norm": 0.31757641861016306,
"learning_rate": 3.034497346841958e-07,
"loss": 0.1895312786102295,
"memory(GiB)": 89.94,
"step": 930,
"token_acc": 0.918751311402793,
"train_speed(iter/s)": 0.016385
},
{
"epoch": 4.630272952853598,
"grad_norm": 0.34601426382495787,
"learning_rate": 2.6451184626087646e-07,
"loss": 0.19062964916229247,
"memory(GiB)": 89.94,
"step": 935,
"token_acc": 0.9377439769272455,
"train_speed(iter/s)": 0.016388
},
{
"epoch": 4.655086848635236,
"grad_norm": 0.3686772430396372,
"learning_rate": 2.2821327101168578e-07,
"loss": 0.18338959217071532,
"memory(GiB)": 89.94,
"step": 940,
"token_acc": 0.9286465593172508,
"train_speed(iter/s)": 0.016389
},
{
"epoch": 4.679900744416873,
"grad_norm": 0.3329148442481642,
"learning_rate": 1.9456384956365149e-07,
"loss": 0.17848238945007325,
"memory(GiB)": 89.94,
"step": 945,
"token_acc": 0.9242079340262307,
"train_speed(iter/s)": 0.016388
},
{
"epoch": 4.704714640198511,
"grad_norm": 0.33580830230246933,
"learning_rate": 1.6357270435212736e-07,
"loss": 0.19694331884384156,
"memory(GiB)": 89.94,
"step": 950,
"token_acc": 0.9275961363852546,
"train_speed(iter/s)": 0.01639
},
{
"epoch": 4.704714640198511,
"eval_loss": 0.4078960418701172,
"eval_runtime": 37.2595,
"eval_samples_per_second": 6.951,
"eval_steps_per_second": 0.456,
"eval_token_acc": 0.8605452303970599,
"step": 950
},
{
"epoch": 4.729528535980149,
"grad_norm": 0.3214090891564049,
"learning_rate": 1.3524823714768375e-07,
"loss": 0.19557987451553344,
"memory(GiB)": 89.94,
"step": 955,
"token_acc": 0.917459338194055,
"train_speed(iter/s)": 0.016374
},
{
"epoch": 4.754342431761787,
"grad_norm": 0.3510644774133595,
"learning_rate": 1.0959812677835968e-07,
"loss": 0.18159072399139403,
"memory(GiB)": 89.94,
"step": 960,
"token_acc": 0.9338328114497434,
"train_speed(iter/s)": 0.016382
},
{
"epoch": 4.779156327543424,
"grad_norm": 0.31852083171219153,
"learning_rate": 8.662932704792793e-08,
"loss": 0.18122289180755616,
"memory(GiB)": 89.94,
"step": 965,
"token_acc": 0.9309320132692916,
"train_speed(iter/s)": 0.016382
},
{
"epoch": 4.803970223325062,
"grad_norm": 0.318877312082489,
"learning_rate": 6.63480648506909e-08,
"loss": 0.19023412466049194,
"memory(GiB)": 89.94,
"step": 970,
"token_acc": 0.9162366937555632,
"train_speed(iter/s)": 0.016379
},
{
"epoch": 4.8287841191067,
"grad_norm": 0.34651828579056343,
"learning_rate": 4.8759838483358745e-08,
"loss": 0.18698248863220215,
"memory(GiB)": 89.94,
"step": 975,
"token_acc": 0.9286984711087747,
"train_speed(iter/s)": 0.016384
},
{
"epoch": 4.8535980148883375,
"grad_norm": 0.3062976957364449,
"learning_rate": 3.386941615445283e-08,
"loss": 0.18977559804916383,
"memory(GiB)": 89.94,
"step": 980,
"token_acc": 0.9407679833647572,
"train_speed(iter/s)": 0.016386
},
{
"epoch": 4.878411910669975,
"grad_norm": 0.29457610025451847,
"learning_rate": 2.1680834691628627e-08,
"loss": 0.19420017004013063,
"memory(GiB)": 89.94,
"step": 985,
"token_acc": 0.9039991194174251,
"train_speed(iter/s)": 0.016388
},
{
"epoch": 4.903225806451613,
"grad_norm": 0.3316519420007468,
"learning_rate": 1.2197398447283404e-08,
"loss": 0.1931779146194458,
"memory(GiB)": 89.94,
"step": 990,
"token_acc": 0.9029619269298459,
"train_speed(iter/s)": 0.016386
},
{
"epoch": 4.92803970223325,
"grad_norm": 0.34572266929696366,
"learning_rate": 5.421678402741659e-09,
"loss": 0.1826627492904663,
"memory(GiB)": 89.94,
"step": 995,
"token_acc": 0.9375434960795533,
"train_speed(iter/s)": 0.016383
},
{
"epoch": 4.9528535980148884,
"grad_norm": 0.3466792398716006,
"learning_rate": 1.3555114712526796e-09,
"loss": 0.18890198469161987,
"memory(GiB)": 89.94,
"step": 1000,
"token_acc": 0.9408911997667525,
"train_speed(iter/s)": 0.016393
},
{
"epoch": 4.9528535980148884,
"eval_loss": 0.40783175826072693,
"eval_runtime": 37.8642,
"eval_samples_per_second": 6.84,
"eval_steps_per_second": 0.449,
"eval_token_acc": 0.8604533507013096,
"step": 1000
},
{
"epoch": 4.977667493796526,
"grad_norm": 0.36020979675081666,
"learning_rate": 0.0,
"loss": 0.1956263303756714,
"memory(GiB)": 89.94,
"step": 1005,
"token_acc": 0.9227749975468551,
"train_speed(iter/s)": 0.016386
},
{
"epoch": 4.977667493796526,
"eval_loss": 0.40774813294410706,
"eval_runtime": 37.2531,
"eval_samples_per_second": 6.952,
"eval_steps_per_second": 0.456,
"eval_token_acc": 0.8606568816729083,
"step": 1005
}
],
"logging_steps": 5,
"max_steps": 1005,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2269146859438080.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}