{
"best_metric": 0.58375472,
"best_model_checkpoint": "/export/home2/zli/kc/mm_rag/Qwen2.5-32B-Instruct_lora/checkpoint-1026",
"epoch": 0.9997563946406821,
"eval_steps": 100,
"global_step": 1026,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00097442143727162,
"grad_norm": 0.8529725074768066,
"learning_rate": 1.9230769230769234e-06,
"loss": 0.9744532108306885,
"memory(GiB)": 255.29,
"step": 1,
"token_acc": 0.7235609103078983,
"train_speed(iter/s)": 0.048136
},
{
"epoch": 0.0048721071863581,
"grad_norm": 0.9313930869102478,
"learning_rate": 9.615384615384616e-06,
"loss": 0.7735831141471863,
"memory(GiB)": 307.13,
"step": 5,
"token_acc": 0.7883002659030477,
"train_speed(iter/s)": 0.052895
},
{
"epoch": 0.0097442143727162,
"grad_norm": 0.5147601962089539,
"learning_rate": 1.923076923076923e-05,
"loss": 0.8932640075683593,
"memory(GiB)": 357.61,
"step": 10,
"token_acc": 0.7670103092783506,
"train_speed(iter/s)": 0.055392
},
{
"epoch": 0.014616321559074299,
"grad_norm": 0.3254891037940979,
"learning_rate": 2.8846153846153845e-05,
"loss": 0.7951784610748291,
"memory(GiB)": 357.63,
"step": 15,
"token_acc": 0.7827709279688514,
"train_speed(iter/s)": 0.058176
},
{
"epoch": 0.0194884287454324,
"grad_norm": 0.5581763982772827,
"learning_rate": 3.846153846153846e-05,
"loss": 0.956171703338623,
"memory(GiB)": 357.63,
"step": 20,
"token_acc": 0.7419072615923009,
"train_speed(iter/s)": 0.05832
},
{
"epoch": 0.024360535931790498,
"grad_norm": 0.3584084212779999,
"learning_rate": 4.8076923076923084e-05,
"loss": 0.8373254776000977,
"memory(GiB)": 357.64,
"step": 25,
"token_acc": 0.7718067293892521,
"train_speed(iter/s)": 0.057013
},
{
"epoch": 0.029232643118148598,
"grad_norm": 0.43336302042007446,
"learning_rate": 5.769230769230769e-05,
"loss": 0.7370581150054931,
"memory(GiB)": 357.64,
"step": 30,
"token_acc": 0.7800821355236139,
"train_speed(iter/s)": 0.057318
},
{
"epoch": 0.0341047503045067,
"grad_norm": 0.6522459983825684,
"learning_rate": 6.730769230769232e-05,
"loss": 0.8316493988037109,
"memory(GiB)": 357.64,
"step": 35,
"token_acc": 0.7533095188064719,
"train_speed(iter/s)": 0.0583
},
{
"epoch": 0.0389768574908648,
"grad_norm": 0.2728117108345032,
"learning_rate": 7.692307692307693e-05,
"loss": 0.7739034652709961,
"memory(GiB)": 357.64,
"step": 40,
"token_acc": 0.7733619763694952,
"train_speed(iter/s)": 0.058058
},
{
"epoch": 0.0438489646772229,
"grad_norm": 0.5565893054008484,
"learning_rate": 8.653846153846155e-05,
"loss": 0.664579200744629,
"memory(GiB)": 357.64,
"step": 45,
"token_acc": 0.7940859054608245,
"train_speed(iter/s)": 0.058044
},
{
"epoch": 0.048721071863580996,
"grad_norm": 0.8710922002792358,
"learning_rate": 9.615384615384617e-05,
"loss": 0.6327468872070312,
"memory(GiB)": 378.13,
"step": 50,
"token_acc": 0.8048907388137357,
"train_speed(iter/s)": 0.057666
},
{
"epoch": 0.0535931790499391,
"grad_norm": 0.5048324465751648,
"learning_rate": 9.999765921804365e-05,
"loss": 0.7563381671905518,
"memory(GiB)": 378.13,
"step": 55,
"token_acc": 0.7769541778975741,
"train_speed(iter/s)": 0.056465
},
{
"epoch": 0.058465286236297195,
"grad_norm": 0.4348597824573517,
"learning_rate": 9.998335523311734e-05,
"loss": 0.6969810009002686,
"memory(GiB)": 378.13,
"step": 60,
"token_acc": 0.7881806108897742,
"train_speed(iter/s)": 0.056264
},
{
"epoch": 0.06333739342265529,
"grad_norm": 0.3995300233364105,
"learning_rate": 9.995605141340247e-05,
"loss": 0.6338334083557129,
"memory(GiB)": 378.13,
"step": 65,
"token_acc": 0.8006442376521117,
"train_speed(iter/s)": 0.056356
},
{
"epoch": 0.0682095006090134,
"grad_norm": 0.25958451628685,
"learning_rate": 9.991575486016592e-05,
"loss": 0.7143070697784424,
"memory(GiB)": 378.13,
"step": 70,
"token_acc": 0.7797544920832592,
"train_speed(iter/s)": 0.056342
},
{
"epoch": 0.0730816077953715,
"grad_norm": 0.3476051688194275,
"learning_rate": 9.986247605386727e-05,
"loss": 0.6742859840393066,
"memory(GiB)": 378.13,
"step": 75,
"token_acc": 0.7907263751763046,
"train_speed(iter/s)": 0.056661
},
{
"epoch": 0.0779537149817296,
"grad_norm": 0.3681824505329132,
"learning_rate": 9.979622885143301e-05,
"loss": 0.5565629959106445,
"memory(GiB)": 378.13,
"step": 80,
"token_acc": 0.8259753593429158,
"train_speed(iter/s)": 0.056802
},
{
"epoch": 0.0828258221680877,
"grad_norm": 0.4626450836658478,
"learning_rate": 9.97170304826526e-05,
"loss": 0.6965714931488037,
"memory(GiB)": 378.13,
"step": 85,
"token_acc": 0.7883802169516455,
"train_speed(iter/s)": 0.057068
},
{
"epoch": 0.0876979293544458,
"grad_norm": 0.6139679551124573,
"learning_rate": 9.962490154569727e-05,
"loss": 0.670227336883545,
"memory(GiB)": 378.13,
"step": 90,
"token_acc": 0.7868181818181819,
"train_speed(iter/s)": 0.056985
},
{
"epoch": 0.0925700365408039,
"grad_norm": 0.3063270151615143,
"learning_rate": 9.95198660017628e-05,
"loss": 0.6900132179260254,
"memory(GiB)": 378.13,
"step": 95,
"token_acc": 0.7969001610305958,
"train_speed(iter/s)": 0.056931
},
{
"epoch": 0.09744214372716199,
"grad_norm": 0.6139111518859863,
"learning_rate": 9.940195116883755e-05,
"loss": 0.6138424396514892,
"memory(GiB)": 378.13,
"step": 100,
"token_acc": 0.8159609120521173,
"train_speed(iter/s)": 0.056746
},
{
"epoch": 0.09744214372716199,
"eval_loss": 0.7148731350898743,
"eval_runtime": 6.1133,
"eval_samples_per_second": 0.654,
"eval_steps_per_second": 0.654,
"step": 100
},
{
"epoch": 0.1023142509135201,
"grad_norm": 0.2960892617702484,
"learning_rate": 9.927118771459763e-05,
"loss": 0.7610847473144531,
"memory(GiB)": 378.13,
"step": 105,
"token_acc": 0.767438173747622,
"train_speed(iter/s)": 0.055274
},
{
"epoch": 0.1071863580998782,
"grad_norm": 0.35161978006362915,
"learning_rate": 9.91276096484306e-05,
"loss": 0.553467845916748,
"memory(GiB)": 378.13,
"step": 110,
"token_acc": 0.8268870335661953,
"train_speed(iter/s)": 0.055276
},
{
"epoch": 0.1120584652862363,
"grad_norm": 0.412120521068573,
"learning_rate": 9.897125431259033e-05,
"loss": 0.634190034866333,
"memory(GiB)": 378.13,
"step": 115,
"token_acc": 0.8068276436303081,
"train_speed(iter/s)": 0.055265
},
{
"epoch": 0.11693057247259439,
"grad_norm": 0.458766907453537,
"learning_rate": 9.880216237248481e-05,
"loss": 0.5070098400115967,
"memory(GiB)": 383.55,
"step": 120,
"token_acc": 0.8488372093023255,
"train_speed(iter/s)": 0.055088
},
{
"epoch": 0.1218026796589525,
"grad_norm": 0.26979780197143555,
"learning_rate": 9.862037780609986e-05,
"loss": 0.6117064476013183,
"memory(GiB)": 383.55,
"step": 125,
"token_acc": 0.8204016064257028,
"train_speed(iter/s)": 0.055201
},
{
"epoch": 0.12667478684531058,
"grad_norm": 0.33145567774772644,
"learning_rate": 9.842594789256103e-05,
"loss": 0.6480350017547607,
"memory(GiB)": 383.55,
"step": 130,
"token_acc": 0.7939520333680917,
"train_speed(iter/s)": 0.055341
},
{
"epoch": 0.1315468940316687,
"grad_norm": 0.29335150122642517,
"learning_rate": 9.821892319983726e-05,
"loss": 0.6102027893066406,
"memory(GiB)": 383.55,
"step": 135,
"token_acc": 0.8075084115459537,
"train_speed(iter/s)": 0.055381
},
{
"epoch": 0.1364190012180268,
"grad_norm": 0.34363335371017456,
"learning_rate": 9.799935757158891e-05,
"loss": 0.6652801513671875,
"memory(GiB)": 383.55,
"step": 140,
"token_acc": 0.7967960995995125,
"train_speed(iter/s)": 0.055397
},
{
"epoch": 0.14129110840438489,
"grad_norm": 1.3419567346572876,
"learning_rate": 9.776730811316394e-05,
"loss": 0.6267284393310547,
"memory(GiB)": 383.55,
"step": 145,
"token_acc": 0.8041536400178652,
"train_speed(iter/s)": 0.055716
},
{
"epoch": 0.146163215590743,
"grad_norm": 0.32478609681129456,
"learning_rate": 9.752283517674575e-05,
"loss": 0.5990486145019531,
"memory(GiB)": 383.55,
"step": 150,
"token_acc": 0.80878414568827,
"train_speed(iter/s)": 0.055975
},
{
"epoch": 0.1510353227771011,
"grad_norm": 0.33393779397010803,
"learning_rate": 9.72660023456566e-05,
"loss": 0.6312044143676758,
"memory(GiB)": 383.55,
"step": 155,
"token_acc": 0.8099249502220861,
"train_speed(iter/s)": 0.055469
},
{
"epoch": 0.1559074299634592,
"grad_norm": 0.39661872386932373,
"learning_rate": 9.699687641782067e-05,
"loss": 0.727474308013916,
"memory(GiB)": 383.55,
"step": 160,
"token_acc": 0.7668711656441718,
"train_speed(iter/s)": 0.055534
},
{
"epoch": 0.1607795371498173,
"grad_norm": 0.3174588978290558,
"learning_rate": 9.671552738839099e-05,
"loss": 0.7284453868865967,
"memory(GiB)": 383.55,
"step": 165,
"token_acc": 0.7788385043754972,
"train_speed(iter/s)": 0.055621
},
{
"epoch": 0.1656516443361754,
"grad_norm": 0.3508811891078949,
"learning_rate": 9.642202843154491e-05,
"loss": 0.6260187149047851,
"memory(GiB)": 383.55,
"step": 170,
"token_acc": 0.8036006546644845,
"train_speed(iter/s)": 0.055841
},
{
"epoch": 0.1705237515225335,
"grad_norm": 0.35121622681617737,
"learning_rate": 9.611645588145272e-05,
"loss": 0.6979084968566894,
"memory(GiB)": 383.55,
"step": 175,
"token_acc": 0.784981684981685,
"train_speed(iter/s)": 0.055902
},
{
"epoch": 0.1753958587088916,
"grad_norm": 0.41958293318748474,
"learning_rate": 9.579888921242439e-05,
"loss": 0.6360678195953369,
"memory(GiB)": 383.55,
"step": 180,
"token_acc": 0.8010867455850961,
"train_speed(iter/s)": 0.05597
},
{
"epoch": 0.18026796589524968,
"grad_norm": 0.4557216763496399,
"learning_rate": 9.546941101823963e-05,
"loss": 0.7268210411071777,
"memory(GiB)": 383.55,
"step": 185,
"token_acc": 0.780511811023622,
"train_speed(iter/s)": 0.056032
},
{
"epoch": 0.1851400730816078,
"grad_norm": 0.44287464022636414,
"learning_rate": 9.512810699066667e-05,
"loss": 0.6450634479522706,
"memory(GiB)": 383.55,
"step": 190,
"token_acc": 0.8003896563939072,
"train_speed(iter/s)": 0.056127
},
{
"epoch": 0.1900121802679659,
"grad_norm": 0.2944161593914032,
"learning_rate": 9.477506589717518e-05,
"loss": 0.5534649848937988,
"memory(GiB)": 383.55,
"step": 195,
"token_acc": 0.8173982442138866,
"train_speed(iter/s)": 0.056116
},
{
"epoch": 0.19488428745432398,
"grad_norm": 0.3743175268173218,
"learning_rate": 9.441037955784944e-05,
"loss": 0.7282295227050781,
"memory(GiB)": 383.55,
"step": 200,
"token_acc": 0.7695568400770713,
"train_speed(iter/s)": 0.05619
},
{
"epoch": 0.19488428745432398,
"eval_loss": 0.6866188049316406,
"eval_runtime": 6.0764,
"eval_samples_per_second": 0.658,
"eval_steps_per_second": 0.658,
"step": 200
},
{
"epoch": 0.1997563946406821,
"grad_norm": 0.397987425327301,
"learning_rate": 9.403414282150738e-05,
"loss": 0.6911158561706543,
"memory(GiB)": 383.55,
"step": 205,
"token_acc": 0.7807744462859726,
"train_speed(iter/s)": 0.055478
},
{
"epoch": 0.2046285018270402,
"grad_norm": 0.852430522441864,
"learning_rate": 9.364645354103206e-05,
"loss": 0.7203257560729981,
"memory(GiB)": 383.55,
"step": 210,
"token_acc": 0.7839163822525598,
"train_speed(iter/s)": 0.055538
},
{
"epoch": 0.2095006090133983,
"grad_norm": 0.36566832661628723,
"learning_rate": 9.324741254792171e-05,
"loss": 0.5751584529876709,
"memory(GiB)": 383.55,
"step": 215,
"token_acc": 0.8120177310786406,
"train_speed(iter/s)": 0.055665
},
{
"epoch": 0.2143727161997564,
"grad_norm": 0.5105158090591431,
"learning_rate": 9.28371236260652e-05,
"loss": 0.5958977699279785,
"memory(GiB)": 383.55,
"step": 220,
"token_acc": 0.8228462471832206,
"train_speed(iter/s)": 0.055807
},
{
"epoch": 0.2192448233861145,
"grad_norm": 0.48028162121772766,
"learning_rate": 9.241569348474954e-05,
"loss": 0.7106984615325928,
"memory(GiB)": 383.55,
"step": 225,
"token_acc": 0.7828740844087897,
"train_speed(iter/s)": 0.055858
},
{
"epoch": 0.2241169305724726,
"grad_norm": 0.32592424750328064,
"learning_rate": 9.198323173090663e-05,
"loss": 0.5898131847381591,
"memory(GiB)": 383.55,
"step": 230,
"token_acc": 0.8189450340567084,
"train_speed(iter/s)": 0.055995
},
{
"epoch": 0.2289890377588307,
"grad_norm": 0.44794151186943054,
"learning_rate": 9.153985084060623e-05,
"loss": 0.6042355060577392,
"memory(GiB)": 383.55,
"step": 235,
"token_acc": 0.810122224134963,
"train_speed(iter/s)": 0.056057
},
{
"epoch": 0.23386114494518878,
"grad_norm": 0.3204025328159332,
"learning_rate": 9.108566612980298e-05,
"loss": 0.5558523654937744,
"memory(GiB)": 383.55,
"step": 240,
"token_acc": 0.8260869565217391,
"train_speed(iter/s)": 0.056072
},
{
"epoch": 0.2387332521315469,
"grad_norm": 0.31540507078170776,
"learning_rate": 9.062079572434448e-05,
"loss": 0.6237210273742676,
"memory(GiB)": 383.55,
"step": 245,
"token_acc": 0.8021445866482186,
"train_speed(iter/s)": 0.056086
},
{
"epoch": 0.243605359317905,
"grad_norm": 0.619088888168335,
"learning_rate": 9.014536052924883e-05,
"loss": 0.664583158493042,
"memory(GiB)": 383.55,
"step": 250,
"token_acc": 0.793002915451895,
"train_speed(iter/s)": 0.056116
},
{
"epoch": 0.24847746650426308,
"grad_norm": 0.6715230345726013,
"learning_rate": 8.965948419725922e-05,
"loss": 0.5711063861846923,
"memory(GiB)": 383.55,
"step": 255,
"token_acc": 0.8185620394343757,
"train_speed(iter/s)": 0.055175
},
{
"epoch": 0.25334957369062117,
"grad_norm": 0.4514237642288208,
"learning_rate": 8.916329309668397e-05,
"loss": 0.721324348449707,
"memory(GiB)": 383.55,
"step": 260,
"token_acc": 0.7792865828942035,
"train_speed(iter/s)": 0.055266
},
{
"epoch": 0.2582216808769793,
"grad_norm": 0.5026947855949402,
"learning_rate": 8.865691627853013e-05,
"loss": 0.6661148548126221,
"memory(GiB)": 383.55,
"step": 265,
"token_acc": 0.7951268025857782,
"train_speed(iter/s)": 0.05533
},
{
"epoch": 0.2630937880633374,
"grad_norm": 0.3138331174850464,
"learning_rate": 8.814048544293965e-05,
"loss": 0.6717385292053223,
"memory(GiB)": 383.55,
"step": 270,
"token_acc": 0.7904462355022607,
"train_speed(iter/s)": 0.055296
},
{
"epoch": 0.2679658952496955,
"grad_norm": 0.3270625174045563,
"learning_rate": 8.76141349049362e-05,
"loss": 0.6027359008789063,
"memory(GiB)": 383.55,
"step": 275,
"token_acc": 0.8082101806239738,
"train_speed(iter/s)": 0.05525
},
{
"epoch": 0.2728380024360536,
"grad_norm": 0.4341810941696167,
"learning_rate": 8.707800155949217e-05,
"loss": 0.6553579330444336,
"memory(GiB)": 383.55,
"step": 280,
"token_acc": 0.797032640949555,
"train_speed(iter/s)": 0.055271
},
{
"epoch": 0.2777101096224117,
"grad_norm": 0.37805306911468506,
"learning_rate": 8.653222484592458e-05,
"loss": 0.6515018463134765,
"memory(GiB)": 383.55,
"step": 285,
"token_acc": 0.794751477233229,
"train_speed(iter/s)": 0.05524
},
{
"epoch": 0.28258221680876977,
"grad_norm": 0.38902854919433594,
"learning_rate": 8.597694671162921e-05,
"loss": 0.592349624633789,
"memory(GiB)": 383.55,
"step": 290,
"token_acc": 0.815828677839851,
"train_speed(iter/s)": 0.05519
},
{
"epoch": 0.2874543239951279,
"grad_norm": 0.3007030487060547,
"learning_rate": 8.541231157516247e-05,
"loss": 0.6616343021392822,
"memory(GiB)": 383.55,
"step": 295,
"token_acc": 0.7961879284400601,
"train_speed(iter/s)": 0.05519
},
{
"epoch": 0.292326431181486,
"grad_norm": 0.43431806564331055,
"learning_rate": 8.483846628868055e-05,
"loss": 0.6408910751342773,
"memory(GiB)": 383.55,
"step": 300,
"token_acc": 0.7999295526593871,
"train_speed(iter/s)": 0.055286
},
{
"epoch": 0.292326431181486,
"eval_loss": 0.6525390148162842,
"eval_runtime": 6.2823,
"eval_samples_per_second": 0.637,
"eval_steps_per_second": 0.637,
"step": 300
},
{
"epoch": 0.2971985383678441,
"grad_norm": 0.4932222068309784,
"learning_rate": 8.425556009974566e-05,
"loss": 0.6335715770721435,
"memory(GiB)": 383.55,
"step": 305,
"token_acc": 0.8036400066789113,
"train_speed(iter/s)": 0.054583
},
{
"epoch": 0.3020706455542022,
"grad_norm": 0.27842646837234497,
"learning_rate": 8.366374461250916e-05,
"loss": 0.570946216583252,
"memory(GiB)": 383.55,
"step": 310,
"token_acc": 0.8238350381555447,
"train_speed(iter/s)": 0.054659
},
{
"epoch": 0.30694275274056027,
"grad_norm": 0.7104659080505371,
"learning_rate": 8.306317374828194e-05,
"loss": 0.566010570526123,
"memory(GiB)": 383.55,
"step": 315,
"token_acc": 0.8189669219488349,
"train_speed(iter/s)": 0.054662
},
{
"epoch": 0.3118148599269184,
"grad_norm": 0.8684744834899902,
"learning_rate": 8.245400370550198e-05,
"loss": 0.677960729598999,
"memory(GiB)": 383.55,
"step": 320,
"token_acc": 0.7772163527790538,
"train_speed(iter/s)": 0.054794
},
{
"epoch": 0.3166869671132765,
"grad_norm": 0.3846539258956909,
"learning_rate": 8.183639291910987e-05,
"loss": 0.5622167587280273,
"memory(GiB)": 383.55,
"step": 325,
"token_acc": 0.830480089318943,
"train_speed(iter/s)": 0.054821
},
{
"epoch": 0.3215590742996346,
"grad_norm": 0.34010785818099976,
"learning_rate": 8.121050201934235e-05,
"loss": 0.5877705574035644,
"memory(GiB)": 383.55,
"step": 330,
"token_acc": 0.8287964389659305,
"train_speed(iter/s)": 0.054899
},
{
"epoch": 0.3264311814859927,
"grad_norm": 0.3751339912414551,
"learning_rate": 8.057649378995526e-05,
"loss": 0.5179604053497314,
"memory(GiB)": 383.55,
"step": 335,
"token_acc": 0.8402439024390244,
"train_speed(iter/s)": 0.054839
},
{
"epoch": 0.3313032886723508,
"grad_norm": 0.3137739896774292,
"learning_rate": 7.993453312588607e-05,
"loss": 0.5339327335357666,
"memory(GiB)": 383.55,
"step": 340,
"token_acc": 0.8365357839042049,
"train_speed(iter/s)": 0.054764
},
{
"epoch": 0.33617539585870887,
"grad_norm": 0.5746834874153137,
"learning_rate": 7.928478699036755e-05,
"loss": 0.6346034049987793,
"memory(GiB)": 383.55,
"step": 345,
"token_acc": 0.7972016183412003,
"train_speed(iter/s)": 0.054713
},
{
"epoch": 0.341047503045067,
"grad_norm": 0.3580325245857239,
"learning_rate": 7.862742437150336e-05,
"loss": 0.6608481884002686,
"memory(GiB)": 383.55,
"step": 350,
"token_acc": 0.7929736511919699,
"train_speed(iter/s)": 0.054762
},
{
"epoch": 0.3459196102314251,
"grad_norm": 0.4622519612312317,
"learning_rate": 7.796261623831713e-05,
"loss": 0.562419080734253,
"memory(GiB)": 383.55,
"step": 355,
"token_acc": 0.8190336211647988,
"train_speed(iter/s)": 0.054396
},
{
"epoch": 0.3507917174177832,
"grad_norm": 0.5616739392280579,
"learning_rate": 7.729053549628622e-05,
"loss": 0.5495719909667969,
"memory(GiB)": 383.55,
"step": 360,
"token_acc": 0.8339377743844245,
"train_speed(iter/s)": 0.054442
},
{
"epoch": 0.3556638246041413,
"grad_norm": 0.7364129424095154,
"learning_rate": 7.661135694237198e-05,
"loss": 0.4548810958862305,
"memory(GiB)": 387.42,
"step": 365,
"token_acc": 0.8370827285921626,
"train_speed(iter/s)": 0.054383
},
{
"epoch": 0.36053593179049936,
"grad_norm": 0.44831952452659607,
"learning_rate": 7.592525721955786e-05,
"loss": 0.5882142066955567,
"memory(GiB)": 387.42,
"step": 370,
"token_acc": 0.8161894662424886,
"train_speed(iter/s)": 0.054337
},
{
"epoch": 0.3654080389768575,
"grad_norm": 0.37750759720802307,
"learning_rate": 7.523241477090763e-05,
"loss": 0.6884512901306152,
"memory(GiB)": 387.42,
"step": 375,
"token_acc": 0.7952127659574468,
"train_speed(iter/s)": 0.054385
},
{
"epoch": 0.3702801461632156,
"grad_norm": 0.5074845552444458,
"learning_rate": 7.45330097931553e-05,
"loss": 0.5458427906036377,
"memory(GiB)": 387.42,
"step": 380,
"token_acc": 0.8217197924388436,
"train_speed(iter/s)": 0.054354
},
{
"epoch": 0.3751522533495737,
"grad_norm": 0.6083484292030334,
"learning_rate": 7.382722418983892e-05,
"loss": 0.5680232048034668,
"memory(GiB)": 387.42,
"step": 385,
"token_acc": 0.8248374239563667,
"train_speed(iter/s)": 0.054329
},
{
"epoch": 0.3800243605359318,
"grad_norm": 0.39138278365135193,
"learning_rate": 7.311524152399054e-05,
"loss": 0.7077183246612548,
"memory(GiB)": 387.42,
"step": 390,
"token_acc": 0.7912014292094686,
"train_speed(iter/s)": 0.054329
},
{
"epoch": 0.3848964677222899,
"grad_norm": 0.4244479238986969,
"learning_rate": 7.239724697039457e-05,
"loss": 0.6999778270721435,
"memory(GiB)": 387.42,
"step": 395,
"token_acc": 0.7828650029475339,
"train_speed(iter/s)": 0.054413
},
{
"epoch": 0.38976857490864797,
"grad_norm": 0.3658107817173004,
"learning_rate": 7.167342726742685e-05,
"loss": 0.5321448802947998,
"memory(GiB)": 387.42,
"step": 400,
"token_acc": 0.8257604205782951,
"train_speed(iter/s)": 0.054414
},
{
"epoch": 0.38976857490864797,
"eval_loss": 0.647614598274231,
"eval_runtime": 6.1299,
"eval_samples_per_second": 0.653,
"eval_steps_per_second": 0.653,
"step": 400
},
{
"epoch": 0.3946406820950061,
"grad_norm": 0.4579378068447113,
"learning_rate": 7.094397066848716e-05,
"loss": 0.6339591979980469,
"memory(GiB)": 387.42,
"step": 405,
"token_acc": 0.7953757225433526,
"train_speed(iter/s)": 0.054198
},
{
"epoch": 0.3995127892813642,
"grad_norm": 0.41108816862106323,
"learning_rate": 7.020906689303766e-05,
"loss": 0.6498037338256836,
"memory(GiB)": 387.42,
"step": 410,
"token_acc": 0.8013311819281969,
"train_speed(iter/s)": 0.054274
},
{
"epoch": 0.4043848964677223,
"grad_norm": 0.3730790615081787,
"learning_rate": 6.946890707726004e-05,
"loss": 0.6224189281463623,
"memory(GiB)": 387.42,
"step": 415,
"token_acc": 0.8109767441860465,
"train_speed(iter/s)": 0.054342
},
{
"epoch": 0.4092570036540804,
"grad_norm": 0.41862693428993225,
"learning_rate": 6.872368372434416e-05,
"loss": 0.6285569190979003,
"memory(GiB)": 387.42,
"step": 420,
"token_acc": 0.793915399041467,
"train_speed(iter/s)": 0.054335
},
{
"epoch": 0.41412911084043846,
"grad_norm": 0.4861293435096741,
"learning_rate": 6.797359065442117e-05,
"loss": 0.5771468162536622,
"memory(GiB)": 387.42,
"step": 425,
"token_acc": 0.8196579720158922,
"train_speed(iter/s)": 0.054339
},
{
"epoch": 0.4190012180267966,
"grad_norm": 0.30941805243492126,
"learning_rate": 6.721882295415425e-05,
"loss": 0.5844586372375489,
"memory(GiB)": 387.42,
"step": 430,
"token_acc": 0.814694173000362,
"train_speed(iter/s)": 0.05432
},
{
"epoch": 0.4238733252131547,
"grad_norm": 0.3820112645626068,
"learning_rate": 6.645957692599969e-05,
"loss": 0.5823289394378662,
"memory(GiB)": 387.42,
"step": 435,
"token_acc": 0.8027565654684299,
"train_speed(iter/s)": 0.054382
},
{
"epoch": 0.4287454323995128,
"grad_norm": 0.3910198509693146,
"learning_rate": 6.569605003715201e-05,
"loss": 0.561509084701538,
"memory(GiB)": 387.42,
"step": 440,
"token_acc": 0.8264751552795031,
"train_speed(iter/s)": 0.054462
},
{
"epoch": 0.4336175395858709,
"grad_norm": 0.3805302381515503,
"learning_rate": 6.492844086818599e-05,
"loss": 0.558375883102417,
"memory(GiB)": 387.42,
"step": 445,
"token_acc": 0.8262056414922657,
"train_speed(iter/s)": 0.05444
},
{
"epoch": 0.438489646772229,
"grad_norm": 0.6036235690116882,
"learning_rate": 6.41569490614092e-05,
"loss": 0.6268420696258545,
"memory(GiB)": 387.42,
"step": 450,
"token_acc": 0.8061224489795918,
"train_speed(iter/s)": 0.054446
},
{
"epoch": 0.44336175395858707,
"grad_norm": 0.4275857210159302,
"learning_rate": 6.338177526893836e-05,
"loss": 0.5441042423248291,
"memory(GiB)": 387.42,
"step": 455,
"token_acc": 0.8360881542699724,
"train_speed(iter/s)": 0.05418
},
{
"epoch": 0.4482338611449452,
"grad_norm": 0.4830683469772339,
"learning_rate": 6.260312110051312e-05,
"loss": 0.606513261795044,
"memory(GiB)": 387.42,
"step": 460,
"token_acc": 0.8049238864875023,
"train_speed(iter/s)": 0.054224
},
{
"epoch": 0.4531059683313033,
"grad_norm": 0.35629284381866455,
"learning_rate": 6.182118907106068e-05,
"loss": 0.538546371459961,
"memory(GiB)": 387.42,
"step": 465,
"token_acc": 0.8373831775700935,
"train_speed(iter/s)": 0.054204
},
{
"epoch": 0.4579780755176614,
"grad_norm": 0.46749940514564514,
"learning_rate": 6.103618254802511e-05,
"loss": 0.5923898696899415,
"memory(GiB)": 387.42,
"step": 470,
"token_acc": 0.8042936553574851,
"train_speed(iter/s)": 0.054261
},
{
"epoch": 0.4628501827040195,
"grad_norm": 0.6278035044670105,
"learning_rate": 6.024830569847477e-05,
"loss": 0.5971939086914062,
"memory(GiB)": 387.42,
"step": 475,
"token_acc": 0.8176121372031663,
"train_speed(iter/s)": 0.054245
},
{
"epoch": 0.46772228989037756,
"grad_norm": 0.3572694957256317,
"learning_rate": 5.945776343600207e-05,
"loss": 0.5843085765838623,
"memory(GiB)": 387.42,
"step": 480,
"token_acc": 0.8212882953652789,
"train_speed(iter/s)": 0.054246
},
{
"epoch": 0.4725943970767357,
"grad_norm": 0.5189170241355896,
"learning_rate": 5.866476136742862e-05,
"loss": 0.5234210968017579,
"memory(GiB)": 387.42,
"step": 485,
"token_acc": 0.8463819691577699,
"train_speed(iter/s)": 0.05426
},
{
"epoch": 0.4774665042630938,
"grad_norm": 0.41832658648490906,
"learning_rate": 5.7869505739330546e-05,
"loss": 0.6695927619934082,
"memory(GiB)": 387.42,
"step": 490,
"token_acc": 0.7924812030075188,
"train_speed(iter/s)": 0.05433
},
{
"epoch": 0.4823386114494519,
"grad_norm": 4.011805534362793,
"learning_rate": 5.7072203384397064e-05,
"loss": 0.5814547538757324,
"memory(GiB)": 387.42,
"step": 495,
"token_acc": 0.8110627719080175,
"train_speed(iter/s)": 0.054376
},
{
"epoch": 0.48721071863581,
"grad_norm": 0.31671130657196045,
"learning_rate": 5.627306166763684e-05,
"loss": 0.5855265617370605,
"memory(GiB)": 387.42,
"step": 500,
"token_acc": 0.8094142629623076,
"train_speed(iter/s)": 0.054362
},
{
"epoch": 0.48721071863581,
"eval_loss": 0.6302051544189453,
"eval_runtime": 6.1545,
"eval_samples_per_second": 0.65,
"eval_steps_per_second": 0.65,
"step": 500
},
{
"epoch": 0.4920828258221681,
"grad_norm": 0.3875284194946289,
"learning_rate": 5.5472288432445774e-05,
"loss": 0.59937744140625,
"memory(GiB)": 387.42,
"step": 505,
"token_acc": 0.7988918837975442,
"train_speed(iter/s)": 0.05424
},
{
"epoch": 0.49695493300852617,
"grad_norm": 0.4411413371562958,
"learning_rate": 5.467009194655045e-05,
"loss": 0.5820174217224121,
"memory(GiB)": 387.42,
"step": 510,
"token_acc": 0.8234812510234157,
"train_speed(iter/s)": 0.054197
},
{
"epoch": 0.5018270401948843,
"grad_norm": 0.5111451148986816,
"learning_rate": 5.386668084784112e-05,
"loss": 0.5154130935668946,
"memory(GiB)": 387.42,
"step": 515,
"token_acc": 0.8397686998694274,
"train_speed(iter/s)": 0.05426
},
{
"epoch": 0.5066991473812423,
"grad_norm": 0.29832109808921814,
"learning_rate": 5.306226409010855e-05,
"loss": 0.5672587394714356,
"memory(GiB)": 387.42,
"step": 520,
"token_acc": 0.8263521756811713,
"train_speed(iter/s)": 0.054274
},
{
"epoch": 0.5115712545676004,
"grad_norm": 0.42139527201652527,
"learning_rate": 5.22570508886986e-05,
"loss": 0.5327470302581787,
"memory(GiB)": 387.42,
"step": 525,
"token_acc": 0.8310478199718706,
"train_speed(iter/s)": 0.054332
},
{
"epoch": 0.5164433617539586,
"grad_norm": 0.34750285744667053,
"learning_rate": 5.145125066609877e-05,
"loss": 0.61210618019104,
"memory(GiB)": 387.42,
"step": 530,
"token_acc": 0.8104413702239789,
"train_speed(iter/s)": 0.054325
},
{
"epoch": 0.5213154689403167,
"grad_norm": 0.5557289123535156,
"learning_rate": 5.0645072997471e-05,
"loss": 0.5486731052398681,
"memory(GiB)": 387.42,
"step": 535,
"token_acc": 0.8223992502343018,
"train_speed(iter/s)": 0.054295
},
{
"epoch": 0.5261875761266748,
"grad_norm": 1.370209813117981,
"learning_rate": 4.983872755614461e-05,
"loss": 0.6499679565429688,
"memory(GiB)": 387.42,
"step": 540,
"token_acc": 0.7975866095757104,
"train_speed(iter/s)": 0.054348
},
{
"epoch": 0.5310596833130329,
"grad_norm": 0.4371365010738373,
"learning_rate": 4.9032424059083774e-05,
"loss": 0.43409147262573244,
"memory(GiB)": 387.42,
"step": 545,
"token_acc": 0.8684942391736193,
"train_speed(iter/s)": 0.054321
},
{
"epoch": 0.535931790499391,
"grad_norm": 0.4735865890979767,
"learning_rate": 4.8226372212343726e-05,
"loss": 0.5776564598083496,
"memory(GiB)": 387.42,
"step": 550,
"token_acc": 0.8255653883972468,
"train_speed(iter/s)": 0.054368
},
{
"epoch": 0.5408038976857491,
"grad_norm": 0.6005700826644897,
"learning_rate": 4.742078165652958e-05,
"loss": 0.5744057178497315,
"memory(GiB)": 387.42,
"step": 555,
"token_acc": 0.8105436573311368,
"train_speed(iter/s)": 0.054325
},
{
"epoch": 0.5456760048721072,
"grad_norm": 0.4128513038158417,
"learning_rate": 4.661586191227247e-05,
"loss": 0.5321125030517578,
"memory(GiB)": 387.42,
"step": 560,
"token_acc": 0.8245080500894454,
"train_speed(iter/s)": 0.054305
},
{
"epoch": 0.5505481120584653,
"grad_norm": 0.4688722491264343,
"learning_rate": 4.581182232573658e-05,
"loss": 0.5235236167907715,
"memory(GiB)": 387.42,
"step": 565,
"token_acc": 0.8205183122724352,
"train_speed(iter/s)": 0.054352
},
{
"epoch": 0.5554202192448234,
"grad_norm": 0.4604549705982208,
"learning_rate": 4.500887201417187e-05,
"loss": 0.6571295261383057,
"memory(GiB)": 387.42,
"step": 570,
"token_acc": 0.8019607843137255,
"train_speed(iter/s)": 0.054361
},
{
"epoch": 0.5602923264311814,
"grad_norm": 0.48336780071258545,
"learning_rate": 4.4207219811526056e-05,
"loss": 0.5963138580322266,
"memory(GiB)": 387.42,
"step": 575,
"token_acc": 0.8077416987708678,
"train_speed(iter/s)": 0.054409
},
{
"epoch": 0.5651644336175395,
"grad_norm": 0.5700681805610657,
"learning_rate": 4.3407074214130446e-05,
"loss": 0.6309503555297852,
"memory(GiB)": 387.42,
"step": 580,
"token_acc": 0.7960770454143842,
"train_speed(iter/s)": 0.054412
},
{
"epoch": 0.5700365408038977,
"grad_norm": 0.40493443608283997,
"learning_rate": 4.2608643326473496e-05,
"loss": 0.5265829563140869,
"memory(GiB)": 387.42,
"step": 585,
"token_acc": 0.8364477970169724,
"train_speed(iter/s)": 0.054419
},
{
"epoch": 0.5749086479902558,
"grad_norm": 0.42441654205322266,
"learning_rate": 4.181213480707637e-05,
"loss": 0.5463868618011475,
"memory(GiB)": 387.42,
"step": 590,
"token_acc": 0.8250831178426302,
"train_speed(iter/s)": 0.054415
},
{
"epoch": 0.5797807551766139,
"grad_norm": 0.5273870825767517,
"learning_rate": 4.1017755814484374e-05,
"loss": 0.6219929218292236,
"memory(GiB)": 387.42,
"step": 595,
"token_acc": 0.8101965601965602,
"train_speed(iter/s)": 0.054492
},
{
"epoch": 0.584652862362972,
"grad_norm": 0.5027340650558472,
"learning_rate": 4.0225712953388494e-05,
"loss": 0.47921223640441896,
"memory(GiB)": 387.42,
"step": 600,
"token_acc": 0.8507462686567164,
"train_speed(iter/s)": 0.054456
},
{
"epoch": 0.584652862362972,
"eval_loss": 0.5931864976882935,
"eval_runtime": 6.2202,
"eval_samples_per_second": 0.643,
"eval_steps_per_second": 0.643,
"step": 600
},
{
"epoch": 0.5895249695493301,
"grad_norm": 0.7974056005477905,
"learning_rate": 3.943621222089102e-05,
"loss": 0.5052922248840332,
"memory(GiB)": 387.42,
"step": 605,
"token_acc": 0.8312937062937062,
"train_speed(iter/s)": 0.054258
},
{
"epoch": 0.5943970767356882,
"grad_norm": 0.38420093059539795,
"learning_rate": 3.864945895292908e-05,
"loss": 0.5411774635314941,
"memory(GiB)": 387.42,
"step": 610,
"token_acc": 0.8309124767225325,
"train_speed(iter/s)": 0.054201
},
{
"epoch": 0.5992691839220463,
"grad_norm": 0.9411633014678955,
"learning_rate": 3.786565777087022e-05,
"loss": 0.6929959297180176,
"memory(GiB)": 387.42,
"step": 615,
"token_acc": 0.7847842261904762,
"train_speed(iter/s)": 0.05425
},
{
"epoch": 0.6041412911084044,
"grad_norm": 0.35226595401763916,
"learning_rate": 3.708501252829386e-05,
"loss": 0.5966301918029785,
"memory(GiB)": 387.42,
"step": 620,
"token_acc": 0.8161076443057722,
"train_speed(iter/s)": 0.054233
},
{
"epoch": 0.6090133982947625,
"grad_norm": 0.4208815097808838,
"learning_rate": 3.6307726257972255e-05,
"loss": 0.5394818782806396,
"memory(GiB)": 387.42,
"step": 625,
"token_acc": 0.8257628294036061,
"train_speed(iter/s)": 0.054209
},
{
"epoch": 0.6138855054811205,
"grad_norm": 0.445925772190094,
"learning_rate": 3.553400111906523e-05,
"loss": 0.6164620399475098,
"memory(GiB)": 387.42,
"step": 630,
"token_acc": 0.8090881366270204,
"train_speed(iter/s)": 0.054222
},
{
"epoch": 0.6187576126674786,
"grad_norm": 0.5922476649284363,
"learning_rate": 3.476403834454183e-05,
"loss": 0.5115623474121094,
"memory(GiB)": 387.42,
"step": 635,
"token_acc": 0.8346325167037862,
"train_speed(iter/s)": 0.054244
},
{
"epoch": 0.6236297198538368,
"grad_norm": 0.5026776790618896,
"learning_rate": 3.399803818884311e-05,
"loss": 0.5328683853149414,
"memory(GiB)": 387.42,
"step": 640,
"token_acc": 0.8462420173571311,
"train_speed(iter/s)": 0.054264
},
{
"epoch": 0.6285018270401949,
"grad_norm": 0.45468801259994507,
"learning_rate": 3.323619987579914e-05,
"loss": 0.6177504062652588,
"memory(GiB)": 387.42,
"step": 645,
"token_acc": 0.80891932520461,
"train_speed(iter/s)": 0.054261
},
{
"epoch": 0.633373934226553,
"grad_norm": 0.6319808959960938,
"learning_rate": 3.247872154681439e-05,
"loss": 0.5958673000335694,
"memory(GiB)": 387.42,
"step": 650,
"token_acc": 0.8096597145993414,
"train_speed(iter/s)": 0.054221
},
{
"epoch": 0.6382460414129111,
"grad_norm": 0.4812871217727661,
"learning_rate": 3.172580020933442e-05,
"loss": 0.5768674850463867,
"memory(GiB)": 387.42,
"step": 655,
"token_acc": 0.8165027102991367,
"train_speed(iter/s)": 0.054185
},
{
"epoch": 0.6431181485992692,
"grad_norm": 0.9395345449447632,
"learning_rate": 3.097763168560741e-05,
"loss": 0.674397611618042,
"memory(GiB)": 387.42,
"step": 660,
"token_acc": 0.7806563039723662,
"train_speed(iter/s)": 0.054211
},
{
"epoch": 0.6479902557856273,
"grad_norm": 0.5097836852073669,
"learning_rate": 3.0234410561754257e-05,
"loss": 0.5154216766357422,
"memory(GiB)": 387.42,
"step": 665,
"token_acc": 0.8327868852459016,
"train_speed(iter/s)": 0.054197
},
{
"epoch": 0.6528623629719854,
"grad_norm": 0.3545515239238739,
"learning_rate": 2.949633013715982e-05,
"loss": 0.5994223117828369,
"memory(GiB)": 387.42,
"step": 670,
"token_acc": 0.8076275080410477,
"train_speed(iter/s)": 0.054247
},
{
"epoch": 0.6577344701583435,
"grad_norm": 0.9892140030860901,
"learning_rate": 2.8763582374199126e-05,
"loss": 0.5891304969787597,
"memory(GiB)": 387.42,
"step": 675,
"token_acc": 0.8036573628488932,
"train_speed(iter/s)": 0.054243
},
{
"epoch": 0.6626065773447016,
"grad_norm": 0.5605654716491699,
"learning_rate": 2.8036357848311012e-05,
"loss": 0.5478427410125732,
"memory(GiB)": 387.42,
"step": 680,
"token_acc": 0.8287547623821937,
"train_speed(iter/s)": 0.054281
},
{
"epoch": 0.6674786845310596,
"grad_norm": 0.4100501239299774,
"learning_rate": 2.7314845698432805e-05,
"loss": 0.6083401203155517,
"memory(GiB)": 387.42,
"step": 685,
"token_acc": 0.7989271180170181,
"train_speed(iter/s)": 0.054288
},
{
"epoch": 0.6723507917174177,
"grad_norm": 0.4639231562614441,
"learning_rate": 2.659923357780828e-05,
"loss": 0.5717390060424805,
"memory(GiB)": 387.42,
"step": 690,
"token_acc": 0.8201791448369106,
"train_speed(iter/s)": 0.054301
},
{
"epoch": 0.6772228989037758,
"grad_norm": 0.30558013916015625,
"learning_rate": 2.5889707605182347e-05,
"loss": 0.4964598178863525,
"memory(GiB)": 387.42,
"step": 695,
"token_acc": 0.8518634024637455,
"train_speed(iter/s)": 0.054314
},
{
"epoch": 0.682095006090134,
"grad_norm": 0.490887314081192,
"learning_rate": 2.518645231639457e-05,
"loss": 0.6779924392700195,
"memory(GiB)": 387.42,
"step": 700,
"token_acc": 0.7798953662182362,
"train_speed(iter/s)": 0.054375
},
{
"epoch": 0.682095006090134,
"eval_loss": 0.587890625,
"eval_runtime": 6.016,
"eval_samples_per_second": 0.665,
"eval_steps_per_second": 0.665,
"step": 700
},
{
"epoch": 0.6869671132764921,
"grad_norm": 0.9540379047393799,
"learning_rate": 2.4489650616384507e-05,
"loss": 0.5919107437133789,
"memory(GiB)": 387.42,
"step": 705,
"token_acc": 0.8063427800269906,
"train_speed(iter/s)": 0.054286
},
{
"epoch": 0.6918392204628502,
"grad_norm": 0.4385371208190918,
"learning_rate": 2.3799483731621237e-05,
"loss": 0.5554671287536621,
"memory(GiB)": 387.42,
"step": 710,
"token_acc": 0.8227891742802965,
"train_speed(iter/s)": 0.054309
},
{
"epoch": 0.6967113276492083,
"grad_norm": 0.37225764989852905,
"learning_rate": 2.311613116296929e-05,
"loss": 0.5223379611968995,
"memory(GiB)": 387.42,
"step": 715,
"token_acc": 0.8422697368421053,
"train_speed(iter/s)": 0.054303
},
{
"epoch": 0.7015834348355664,
"grad_norm": 0.6227976083755493,
"learning_rate": 2.2439770639003627e-05,
"loss": 0.5609029769897461,
"memory(GiB)": 387.42,
"step": 720,
"token_acc": 0.8244803695150116,
"train_speed(iter/s)": 0.054309
},
{
"epoch": 0.7064555420219245,
"grad_norm": 0.4218509793281555,
"learning_rate": 2.177057806978522e-05,
"loss": 0.5789398193359375,
"memory(GiB)": 387.42,
"step": 725,
"token_acc": 0.8195275590551181,
"train_speed(iter/s)": 0.054317
},
{
"epoch": 0.7113276492082826,
"grad_norm": 0.5081908106803894,
"learning_rate": 2.110872750110996e-05,
"loss": 0.49318413734436034,
"memory(GiB)": 387.42,
"step": 730,
"token_acc": 0.8306063522617901,
"train_speed(iter/s)": 0.05436
},
{
"epoch": 0.7161997563946407,
"grad_norm": 0.6738778352737427,
"learning_rate": 2.045439106924217e-05,
"loss": 0.55146803855896,
"memory(GiB)": 387.42,
"step": 735,
"token_acc": 0.8200392927308447,
"train_speed(iter/s)": 0.054367
},
{
"epoch": 0.7210718635809987,
"grad_norm": 0.43147921562194824,
"learning_rate": 1.980773895614481e-05,
"loss": 0.574643898010254,
"memory(GiB)": 387.42,
"step": 740,
"token_acc": 0.8172221384406575,
"train_speed(iter/s)": 0.054386
},
{
"epoch": 0.7259439707673568,
"grad_norm": 0.5750350952148438,
"learning_rate": 1.9168939345218095e-05,
"loss": 0.5682173728942871,
"memory(GiB)": 387.42,
"step": 745,
"token_acc": 0.8214421252371916,
"train_speed(iter/s)": 0.054395
},
{
"epoch": 0.730816077953715,
"grad_norm": 0.461907297372818,
"learning_rate": 1.8538158377557702e-05,
"loss": 0.5272111415863037,
"memory(GiB)": 387.42,
"step": 750,
"token_acc": 0.8257032542746828,
"train_speed(iter/s)": 0.054421
},
{
"epoch": 0.7356881851400731,
"grad_norm": 0.794235348701477,
"learning_rate": 1.791556010874434e-05,
"loss": 0.6292970180511475,
"memory(GiB)": 387.42,
"step": 755,
"token_acc": 0.810012836970475,
"train_speed(iter/s)": 0.054353
},
{
"epoch": 0.7405602923264312,
"grad_norm": 0.6189777851104736,
"learning_rate": 1.7301306466175533e-05,
"loss": 0.5557656288146973,
"memory(GiB)": 387.42,
"step": 760,
"token_acc": 0.8259242957746479,
"train_speed(iter/s)": 0.054349
},
{
"epoch": 0.7454323995127893,
"grad_norm": 0.4845249056816101,
"learning_rate": 1.6695557206951144e-05,
"loss": 0.49696760177612304,
"memory(GiB)": 389.68,
"step": 765,
"token_acc": 0.8422638261243813,
"train_speed(iter/s)": 0.054323
},
{
"epoch": 0.7503045066991474,
"grad_norm": 0.4710843563079834,
"learning_rate": 1.6098469876323093e-05,
"loss": 0.47034273147583006,
"memory(GiB)": 389.68,
"step": 770,
"token_acc": 0.8487571701720842,
"train_speed(iter/s)": 0.05434
},
{
"epoch": 0.7551766138855055,
"grad_norm": 0.45380252599716187,
"learning_rate": 1.551019976672058e-05,
"loss": 0.5777853488922119,
"memory(GiB)": 389.68,
"step": 775,
"token_acc": 0.8110020910406949,
"train_speed(iter/s)": 0.054377
},
{
"epoch": 0.7600487210718636,
"grad_norm": 0.5304797291755676,
"learning_rate": 1.4930899877361015e-05,
"loss": 0.5180749416351318,
"memory(GiB)": 389.68,
"step": 780,
"token_acc": 0.8334659769200159,
"train_speed(iter/s)": 0.05443
},
{
"epoch": 0.7649208282582217,
"grad_norm": 0.447553426027298,
"learning_rate": 1.4360720874457607e-05,
"loss": 0.5336573123931885,
"memory(GiB)": 389.68,
"step": 785,
"token_acc": 0.8346641615782058,
"train_speed(iter/s)": 0.054438
},
{
"epoch": 0.7697929354445798,
"grad_norm": 0.5468970537185669,
"learning_rate": 1.3799811052033467e-05,
"loss": 0.6092133522033691,
"memory(GiB)": 389.68,
"step": 790,
"token_acc": 0.7997620261771206,
"train_speed(iter/s)": 0.054456
},
{
"epoch": 0.7746650426309378,
"grad_norm": 0.6424246430397034,
"learning_rate": 1.3248316293352946e-05,
"loss": 0.6084504127502441,
"memory(GiB)": 389.68,
"step": 795,
"token_acc": 0.8091853471842537,
"train_speed(iter/s)": 0.05451
},
{
"epoch": 0.7795371498172959,
"grad_norm": 0.5339289903640747,
"learning_rate": 1.2706380032979691e-05,
"loss": 0.535353136062622,
"memory(GiB)": 389.68,
"step": 800,
"token_acc": 0.8231229847996315,
"train_speed(iter/s)": 0.054509
},
{
"epoch": 0.7795371498172959,
"eval_loss": 0.587626039981842,
"eval_runtime": 6.1485,
"eval_samples_per_second": 0.651,
"eval_steps_per_second": 0.651,
"step": 800
},
{
"epoch": 0.784409257003654,
"grad_norm": 0.47259068489074707,
"learning_rate": 1.2174143219471878e-05,
"loss": 0.6263217449188232,
"memory(GiB)": 389.68,
"step": 805,
"token_acc": 0.7991557070953077,
"train_speed(iter/s)": 0.054434
},
{
"epoch": 0.7892813641900122,
"grad_norm": 0.5547453761100769,
"learning_rate": 1.1651744278723687e-05,
"loss": 0.5090929985046386,
"memory(GiB)": 389.68,
"step": 810,
"token_acc": 0.8354404976921533,
"train_speed(iter/s)": 0.054448
},
{
"epoch": 0.7941534713763703,
"grad_norm": 0.4848991930484772,
"learning_rate": 1.1139319077963178e-05,
"loss": 0.5273432254791259,
"memory(GiB)": 389.68,
"step": 815,
"token_acc": 0.8295368261199696,
"train_speed(iter/s)": 0.054475
},
{
"epoch": 0.7990255785627284,
"grad_norm": 0.5590830445289612,
"learning_rate": 1.0637000890415388e-05,
"loss": 0.6279808044433594,
"memory(GiB)": 389.68,
"step": 820,
"token_acc": 0.8061934585942937,
"train_speed(iter/s)": 0.054494
},
{
"epoch": 0.8038976857490865,
"grad_norm": 1.119874358177185,
"learning_rate": 1.0144920360640303e-05,
"loss": 0.6255881309509277,
"memory(GiB)": 389.68,
"step": 825,
"token_acc": 0.8063498323802012,
"train_speed(iter/s)": 0.0545
},
{
"epoch": 0.8087697929354446,
"grad_norm": 0.4502837359905243,
"learning_rate": 9.663205470554276e-06,
"loss": 0.5530724048614502,
"memory(GiB)": 389.68,
"step": 830,
"token_acc": 0.8286991062562066,
"train_speed(iter/s)": 0.054498
},
{
"epoch": 0.8136419001218027,
"grad_norm": 0.47327640652656555,
"learning_rate": 9.19198150614417e-06,
"loss": 0.6426435470581054,
"memory(GiB)": 389.68,
"step": 835,
"token_acc": 0.7995495495495496,
"train_speed(iter/s)": 0.054482
},
{
"epoch": 0.8185140073081608,
"grad_norm": 0.45425912737846375,
"learning_rate": 8.73137102488249e-06,
"loss": 0.5113016128540039,
"memory(GiB)": 389.68,
"step": 840,
"token_acc": 0.8368200836820083,
"train_speed(iter/s)": 0.054528
},
{
"epoch": 0.8233861144945189,
"grad_norm": 0.5594798922538757,
"learning_rate": 8.28149382385231e-06,
"loss": 0.5977861881256104,
"memory(GiB)": 389.68,
"step": 845,
"token_acc": 0.8159670164917541,
"train_speed(iter/s)": 0.054545
},
{
"epoch": 0.8282582216808769,
"grad_norm": 0.38594865798950195,
"learning_rate": 7.842466908590006e-06,
"loss": 0.5546538829803467,
"memory(GiB)": 389.68,
"step": 850,
"token_acc": 0.8362763915547025,
"train_speed(iter/s)": 0.05454
},
{
"epoch": 0.833130328867235,
"grad_norm": 0.6128694415092468,
"learning_rate": 7.414404462654051e-06,
"loss": 0.5578857898712158,
"memory(GiB)": 389.68,
"step": 855,
"token_acc": 0.8173973075595443,
"train_speed(iter/s)": 0.054466
},
{
"epoch": 0.8380024360535931,
"grad_norm": 0.5973862409591675,
"learning_rate": 6.997417817927865e-06,
"loss": 0.6116644382476807,
"memory(GiB)": 389.68,
"step": 860,
"token_acc": 0.8100558659217877,
"train_speed(iter/s)": 0.054467
},
{
"epoch": 0.8428745432399513,
"grad_norm": 0.5695779323577881,
"learning_rate": 6.591615425664144e-06,
"loss": 0.6063879013061524,
"memory(GiB)": 389.68,
"step": 865,
"token_acc": 0.8113871180479226,
"train_speed(iter/s)": 0.054502
},
{
"epoch": 0.8477466504263094,
"grad_norm": 0.37414440512657166,
"learning_rate": 6.197102828278611e-06,
"loss": 0.5134734153747559,
"memory(GiB)": 389.68,
"step": 870,
"token_acc": 0.8304152076038019,
"train_speed(iter/s)": 0.054524
},
{
"epoch": 0.8526187576126675,
"grad_norm": 0.8222331404685974,
"learning_rate": 5.813982631900122e-06,
"loss": 0.5653984069824218,
"memory(GiB)": 389.68,
"step": 875,
"token_acc": 0.8229976496112819,
"train_speed(iter/s)": 0.054534
},
{
"epoch": 0.8574908647990256,
"grad_norm": 0.3609310984611511,
"learning_rate": 5.442354479684558e-06,
"loss": 0.49175424575805665,
"memory(GiB)": 389.68,
"step": 880,
"token_acc": 0.8409646976581615,
"train_speed(iter/s)": 0.054533
},
{
"epoch": 0.8623629719853837,
"grad_norm": 0.6293960213661194,
"learning_rate": 5.082315025899315e-06,
"loss": 0.604953384399414,
"memory(GiB)": 389.68,
"step": 885,
"token_acc": 0.8073544433094995,
"train_speed(iter/s)": 0.05455
},
{
"epoch": 0.8672350791717418,
"grad_norm": 0.4242098331451416,
"learning_rate": 4.733957910785114e-06,
"loss": 0.4986411571502686,
"memory(GiB)": 389.68,
"step": 890,
"token_acc": 0.8444040036396724,
"train_speed(iter/s)": 0.054562
},
{
"epoch": 0.8721071863580999,
"grad_norm": 0.5025205612182617,
"learning_rate": 4.397373736201782e-06,
"loss": 0.5355000495910645,
"memory(GiB)": 389.68,
"step": 895,
"token_acc": 0.8340460526315789,
"train_speed(iter/s)": 0.054564
},
{
"epoch": 0.876979293544458,
"grad_norm": 0.42587506771087646,
"learning_rate": 4.072650042064174e-06,
"loss": 0.6113440513610839,
"memory(GiB)": 389.68,
"step": 900,
"token_acc": 0.8042306924765515,
"train_speed(iter/s)": 0.054571
},
{
"epoch": 0.876979293544458,
"eval_loss": 0.5867875814437866,
"eval_runtime": 6.1618,
"eval_samples_per_second": 0.649,
"eval_steps_per_second": 0.649,
"step": 900
},
{
"epoch": 0.881851400730816,
"grad_norm": 0.6062163710594177,
"learning_rate": 3.759871283574562e-06,
"loss": 0.5853659629821777,
"memory(GiB)": 389.68,
"step": 905,
"token_acc": 0.8163235076284995,
"train_speed(iter/s)": 0.054495
},
{
"epoch": 0.8867235079171741,
"grad_norm": 0.5810290575027466,
"learning_rate": 3.4591188092571893e-06,
"loss": 0.5189132213592529,
"memory(GiB)": 389.68,
"step": 910,
"token_acc": 0.848421052631579,
"train_speed(iter/s)": 0.054517
},
{
"epoch": 0.8915956151035322,
"grad_norm": 0.5703849196434021,
"learning_rate": 3.1704708398009486e-06,
"loss": 0.5976828575134278,
"memory(GiB)": 389.68,
"step": 915,
"token_acc": 0.808837066584842,
"train_speed(iter/s)": 0.05451
},
{
"epoch": 0.8964677222898904,
"grad_norm": 0.5777165293693542,
"learning_rate": 2.894002447715399e-06,
"loss": 0.5165195465087891,
"memory(GiB)": 389.68,
"step": 920,
"token_acc": 0.8424015009380863,
"train_speed(iter/s)": 0.054567
},
{
"epoch": 0.9013398294762485,
"grad_norm": 0.48375067114830017,
"learning_rate": 2.6297855378057623e-06,
"loss": 0.46347522735595703,
"memory(GiB)": 389.68,
"step": 925,
"token_acc": 0.8408729585200173,
"train_speed(iter/s)": 0.054561
},
{
"epoch": 0.9062119366626066,
"grad_norm": 0.4930781126022339,
"learning_rate": 2.3778888284716193e-06,
"loss": 0.6031323909759522,
"memory(GiB)": 389.68,
"step": 930,
"token_acc": 0.8058429701765064,
"train_speed(iter/s)": 0.054553
},
{
"epoch": 0.9110840438489647,
"grad_norm": 0.42932575941085815,
"learning_rate": 2.138377833834404e-06,
"loss": 0.5199082851409912,
"memory(GiB)": 389.68,
"step": 935,
"token_acc": 0.837616269903831,
"train_speed(iter/s)": 0.054552
},
{
"epoch": 0.9159561510353228,
"grad_norm": 0.6615188717842102,
"learning_rate": 1.9113148466983254e-06,
"loss": 0.6138844013214111,
"memory(GiB)": 389.68,
"step": 940,
"token_acc": 0.8027118644067797,
"train_speed(iter/s)": 0.054582
},
{
"epoch": 0.9208282582216809,
"grad_norm": 0.41028302907943726,
"learning_rate": 1.696758922348979e-06,
"loss": 0.5526364803314209,
"memory(GiB)": 389.68,
"step": 945,
"token_acc": 0.8190247252747253,
"train_speed(iter/s)": 0.054578
},
{
"epoch": 0.925700365408039,
"grad_norm": 0.48014047741889954,
"learning_rate": 1.4947658631941309e-06,
"loss": 0.49515771865844727,
"memory(GiB)": 389.68,
"step": 950,
"token_acc": 0.832800851970181,
"train_speed(iter/s)": 0.054557
},
{
"epoch": 0.9305724725943971,
"grad_norm": 0.6173512935638428,
"learning_rate": 1.3053882042503796e-06,
"loss": 0.5243947505950928,
"memory(GiB)": 389.68,
"step": 955,
"token_acc": 0.8282737560625112,
"train_speed(iter/s)": 0.054472
},
{
"epoch": 0.9354445797807551,
"grad_norm": 0.6899262070655823,
"learning_rate": 1.1286751994797284e-06,
"loss": 0.636317253112793,
"memory(GiB)": 389.68,
"step": 960,
"token_acc": 0.8041509433962264,
"train_speed(iter/s)": 0.05449
},
{
"epoch": 0.9403166869671132,
"grad_norm": 0.538864016532898,
"learning_rate": 9.646728089794167e-07,
"loss": 0.5281119823455811,
"memory(GiB)": 389.68,
"step": 965,
"token_acc": 0.828132906054984,
"train_speed(iter/s)": 0.054472
},
{
"epoch": 0.9451887941534713,
"grad_norm": 0.7353665828704834,
"learning_rate": 8.134236870284861e-07,
"loss": 0.6087577819824219,
"memory(GiB)": 389.68,
"step": 970,
"token_acc": 0.8098674274207082,
"train_speed(iter/s)": 0.054485
},
{
"epoch": 0.9500609013398295,
"grad_norm": 0.7473301887512207,
"learning_rate": 6.749671709941008e-07,
"loss": 0.6141918182373047,
"memory(GiB)": 389.68,
"step": 975,
"token_acc": 0.8016149752248118,
"train_speed(iter/s)": 0.054518
},
{
"epoch": 0.9549330085261876,
"grad_norm": 0.6487853527069092,
"learning_rate": 5.493392711005796e-07,
"loss": 0.5959615707397461,
"memory(GiB)": 389.68,
"step": 980,
"token_acc": 0.8156642881413524,
"train_speed(iter/s)": 0.054561
},
{
"epoch": 0.9598051157125457,
"grad_norm": 0.678453803062439,
"learning_rate": 4.365726610637222e-07,
"loss": 0.5411821842193604,
"memory(GiB)": 389.68,
"step": 985,
"token_acc": 0.8313556274721323,
"train_speed(iter/s)": 0.054544
},
{
"epoch": 0.9646772228989038,
"grad_norm": 0.5119591355323792,
"learning_rate": 3.366966695929119e-07,
"loss": 0.49676513671875,
"memory(GiB)": 389.68,
"step": 990,
"token_acc": 0.8351805505899178,
"train_speed(iter/s)": 0.054553
},
{
"epoch": 0.9695493300852619,
"grad_norm": 0.6289726495742798,
"learning_rate": 2.4973727276323965e-07,
"loss": 0.60072922706604,
"memory(GiB)": 389.68,
"step": 995,
"token_acc": 0.8124610591900312,
"train_speed(iter/s)": 0.054575
},
{
"epoch": 0.97442143727162,
"grad_norm": 0.5490319132804871,
"learning_rate": 1.7571708725953596e-07,
"loss": 0.5364939212799072,
"memory(GiB)": 389.68,
"step": 1000,
"token_acc": 0.8235892221657346,
"train_speed(iter/s)": 0.054556
},
{
"epoch": 0.97442143727162,
"eval_loss": 0.5838146805763245,
"eval_runtime": 6.1207,
"eval_samples_per_second": 0.654,
"eval_steps_per_second": 0.654,
"step": 1000
},
{
"epoch": 0.9792935444579781,
"grad_norm": 0.4941563010215759,
"learning_rate": 1.1465536449415393e-07,
"loss": 0.5735920906066895,
"memory(GiB)": 389.68,
"step": 1005,
"token_acc": 0.8156670746634027,
"train_speed(iter/s)": 0.054474
},
{
"epoch": 0.9841656516443362,
"grad_norm": 0.5679388046264648,
"learning_rate": 6.656798560001343e-08,
"loss": 0.5337845325469971,
"memory(GiB)": 389.68,
"step": 1010,
"token_acc": 0.8183527641970666,
"train_speed(iter/s)": 0.054492
},
{
"epoch": 0.9890377588306942,
"grad_norm": 0.43481603264808655,
"learning_rate": 3.146745730015499e-08,
"loss": 0.5338433265686036,
"memory(GiB)": 389.68,
"step": 1015,
"token_acc": 0.8283907544701264,
"train_speed(iter/s)": 0.054525
},
{
"epoch": 0.9939098660170523,
"grad_norm": 0.44339102506637573,
"learning_rate": 9.362908654986235e-09,
"loss": 0.5187356472015381,
"memory(GiB)": 389.68,
"step": 1020,
"token_acc": 0.8316270566727605,
"train_speed(iter/s)": 0.054538
},
{
"epoch": 0.9987819732034104,
"grad_norm": 0.7172895669937134,
"learning_rate": 2.6008868793114817e-10,
"loss": 0.5243105888366699,
"memory(GiB)": 389.68,
"step": 1025,
"token_acc": 0.8462152666879591,
"train_speed(iter/s)": 0.054564
},
{
"epoch": 0.9997563946406821,
"eval_loss": 0.5837547183036804,
"eval_runtime": 6.0694,
"eval_samples_per_second": 0.659,
"eval_steps_per_second": 0.659,
"step": 1026
}
],
"logging_steps": 5,
"max_steps": 1026,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.095035636732416e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}