zheminh's picture
Add files using upload-large-folder tool
1ec53ee verified
{
"best_global_step": 100,
"best_metric": 0.39695656,
"best_model_checkpoint": "/home/ubuntu/output/v31-20250504-001829/checkpoint-100",
"epoch": 6.451612903225806,
"eval_steps": 50,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010752688172043012,
"grad_norm": 2.8397373471487466,
"learning_rate": 3.0303030303030305e-07,
"loss": 0.6789064407348633,
"memory(GiB)": 33.21,
"step": 1,
"token_acc": 0.7741811175337187,
"train_speed(iter/s)": 0.077627
},
{
"epoch": 0.053763440860215055,
"grad_norm": 3.004365830179039,
"learning_rate": 1.5151515151515152e-06,
"loss": 0.6730813980102539,
"memory(GiB)": 35.76,
"step": 5,
"token_acc": 0.8227406519132235,
"train_speed(iter/s)": 0.190932
},
{
"epoch": 0.10752688172043011,
"grad_norm": 2.515744462969929,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.6481359958648681,
"memory(GiB)": 35.76,
"step": 10,
"token_acc": 0.8160835048515143,
"train_speed(iter/s)": 0.231618
},
{
"epoch": 0.16129032258064516,
"grad_norm": 1.748868361489095,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.5695308208465576,
"memory(GiB)": 35.76,
"step": 15,
"token_acc": 0.8397823007530009,
"train_speed(iter/s)": 0.244031
},
{
"epoch": 0.21505376344086022,
"grad_norm": 1.0225907480921308,
"learning_rate": 6.060606060606061e-06,
"loss": 0.5106754302978516,
"memory(GiB)": 35.76,
"step": 20,
"token_acc": 0.8501878777977455,
"train_speed(iter/s)": 0.250238
},
{
"epoch": 0.26881720430107525,
"grad_norm": 1.2639964303465994,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.4408127307891846,
"memory(GiB)": 35.76,
"step": 25,
"token_acc": 0.8509183536667113,
"train_speed(iter/s)": 0.254089
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.8267338618549237,
"learning_rate": 9.090909090909091e-06,
"loss": 0.42465009689331057,
"memory(GiB)": 35.76,
"step": 30,
"token_acc": 0.8319305277221108,
"train_speed(iter/s)": 0.253029
},
{
"epoch": 0.3763440860215054,
"grad_norm": 0.7590056670512465,
"learning_rate": 9.999741584205621e-06,
"loss": 0.40682473182678225,
"memory(GiB)": 35.76,
"step": 35,
"token_acc": 0.8784954317130087,
"train_speed(iter/s)": 0.256755
},
{
"epoch": 0.43010752688172044,
"grad_norm": 0.7473548691186508,
"learning_rate": 9.99683471327489e-06,
"loss": 0.39803519248962405,
"memory(GiB)": 35.76,
"step": 40,
"token_acc": 0.8626253418413856,
"train_speed(iter/s)": 0.259978
},
{
"epoch": 0.4838709677419355,
"grad_norm": 0.7312727330719088,
"learning_rate": 9.99069983579947e-06,
"loss": 0.3946224689483643,
"memory(GiB)": 35.76,
"step": 45,
"token_acc": 0.8563264614993862,
"train_speed(iter/s)": 0.264063
},
{
"epoch": 0.5376344086021505,
"grad_norm": 0.680023295861087,
"learning_rate": 9.981340914973221e-06,
"loss": 0.3800630807876587,
"memory(GiB)": 35.76,
"step": 50,
"token_acc": 0.8677728496752732,
"train_speed(iter/s)": 0.265313
},
{
"epoch": 0.5376344086021505,
"eval_loss": 0.4121361970901489,
"eval_runtime": 1.1144,
"eval_samples_per_second": 14.357,
"eval_steps_per_second": 1.795,
"eval_token_acc": 0.862480083511895,
"step": 50
},
{
"epoch": 0.5913978494623656,
"grad_norm": 0.6125116226090357,
"learning_rate": 9.968763996755115e-06,
"loss": 0.3797069787979126,
"memory(GiB)": 35.76,
"step": 55,
"token_acc": 0.8705602222369204,
"train_speed(iter/s)": 0.212247
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.6413274106626393,
"learning_rate": 9.952977205963496e-06,
"loss": 0.37576003074645997,
"memory(GiB)": 36.2,
"step": 60,
"token_acc": 0.8649204294525242,
"train_speed(iter/s)": 0.216167
},
{
"epoch": 0.6989247311827957,
"grad_norm": 0.6501908922939531,
"learning_rate": 9.93399074102735e-06,
"loss": 0.373861026763916,
"memory(GiB)": 36.2,
"step": 65,
"token_acc": 0.8693455722115829,
"train_speed(iter/s)": 0.21941
},
{
"epoch": 0.7526881720430108,
"grad_norm": 0.7004218187745562,
"learning_rate": 9.911816867398026e-06,
"loss": 0.3793942928314209,
"memory(GiB)": 36.2,
"step": 70,
"token_acc": 0.8730909406456092,
"train_speed(iter/s)": 0.222179
},
{
"epoch": 0.8064516129032258,
"grad_norm": 0.8166238670210874,
"learning_rate": 9.886469909625624e-06,
"loss": 0.3867968559265137,
"memory(GiB)": 36.2,
"step": 75,
"token_acc": 0.848757324712063,
"train_speed(iter/s)": 0.226416
},
{
"epoch": 0.8602150537634409,
"grad_norm": 0.6267634407532141,
"learning_rate": 9.857966242105194e-06,
"loss": 0.3607918739318848,
"memory(GiB)": 36.2,
"step": 80,
"token_acc": 0.8719132441966814,
"train_speed(iter/s)": 0.230221
},
{
"epoch": 0.9139784946236559,
"grad_norm": 0.7896216311488772,
"learning_rate": 9.8263242784987e-06,
"loss": 0.3733763933181763,
"memory(GiB)": 36.2,
"step": 85,
"token_acc": 0.86721273110227,
"train_speed(iter/s)": 0.232953
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.6455395783876484,
"learning_rate": 9.791564459839609e-06,
"loss": 0.36534771919250486,
"memory(GiB)": 36.2,
"step": 90,
"token_acc": 0.8692492781520693,
"train_speed(iter/s)": 0.235041
},
{
"epoch": 1.021505376344086,
"grad_norm": 0.5393287418168153,
"learning_rate": 9.753709241327773e-06,
"loss": 0.3413947343826294,
"memory(GiB)": 36.2,
"step": 95,
"token_acc": 0.8809226932668329,
"train_speed(iter/s)": 0.238594
},
{
"epoch": 1.075268817204301,
"grad_norm": 0.5578267353989265,
"learning_rate": 9.712783077823144e-06,
"loss": 0.2999130725860596,
"memory(GiB)": 36.2,
"step": 100,
"token_acc": 0.876852251474608,
"train_speed(iter/s)": 0.240385
},
{
"epoch": 1.075268817204301,
"eval_loss": 0.39695656299591064,
"eval_runtime": 0.9623,
"eval_samples_per_second": 16.626,
"eval_steps_per_second": 2.078,
"eval_token_acc": 0.8658590187352343,
"step": 100
},
{
"epoch": 1.129032258064516,
"grad_norm": 0.6232434629738186,
"learning_rate": 9.66881240804768e-06,
"loss": 0.2978543758392334,
"memory(GiB)": 36.2,
"step": 105,
"token_acc": 0.8903211301382695,
"train_speed(iter/s)": 0.216685
},
{
"epoch": 1.1827956989247312,
"grad_norm": 0.563444312933744,
"learning_rate": 9.62182563750565e-06,
"loss": 0.2764800786972046,
"memory(GiB)": 36.2,
"step": 110,
"token_acc": 0.8837461046416271,
"train_speed(iter/s)": 0.218683
},
{
"epoch": 1.2365591397849462,
"grad_norm": 0.634129779250207,
"learning_rate": 9.571853120133406e-06,
"loss": 0.2966769695281982,
"memory(GiB)": 36.2,
"step": 115,
"token_acc": 0.8817908276295341,
"train_speed(iter/s)": 0.220159
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.5952665579415249,
"learning_rate": 9.51892713869041e-06,
"loss": 0.2910531759262085,
"memory(GiB)": 36.2,
"step": 120,
"token_acc": 0.8890962995497969,
"train_speed(iter/s)": 0.221075
},
{
"epoch": 1.3440860215053765,
"grad_norm": 0.5889028303932383,
"learning_rate": 9.463081883904251e-06,
"loss": 0.2941020727157593,
"memory(GiB)": 36.2,
"step": 125,
"token_acc": 0.9022072348252606,
"train_speed(iter/s)": 0.222909
},
{
"epoch": 1.3978494623655915,
"grad_norm": 0.6117493527778533,
"learning_rate": 9.404353432383078e-06,
"loss": 0.299320125579834,
"memory(GiB)": 36.2,
"step": 130,
"token_acc": 0.9017176863081016,
"train_speed(iter/s)": 0.223869
},
{
"epoch": 1.4516129032258065,
"grad_norm": 0.6709733335774591,
"learning_rate": 9.342779723309746e-06,
"loss": 0.2946903228759766,
"memory(GiB)": 36.2,
"step": 135,
"token_acc": 0.8963614673426782,
"train_speed(iter/s)": 0.22548
},
{
"epoch": 1.5053763440860215,
"grad_norm": 0.5231401147766865,
"learning_rate": 9.278400533932703e-06,
"loss": 0.27523131370544435,
"memory(GiB)": 36.2,
"step": 140,
"token_acc": 0.9091817273635455,
"train_speed(iter/s)": 0.22685
},
{
"epoch": 1.5591397849462365,
"grad_norm": 0.6109308073405598,
"learning_rate": 9.211257453869495e-06,
"loss": 0.28516521453857424,
"memory(GiB)": 36.2,
"step": 145,
"token_acc": 0.9022033404140114,
"train_speed(iter/s)": 0.22851
},
{
"epoch": 1.6129032258064515,
"grad_norm": 0.61463883436549,
"learning_rate": 9.141393858239435e-06,
"loss": 0.28318946361541747,
"memory(GiB)": 36.2,
"step": 150,
"token_acc": 0.9105783567448795,
"train_speed(iter/s)": 0.22975
},
{
"epoch": 1.6129032258064515,
"eval_loss": 0.4003598093986511,
"eval_runtime": 0.9657,
"eval_samples_per_second": 16.569,
"eval_steps_per_second": 2.071,
"eval_token_acc": 0.8660787868798417,
"step": 150
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.6132153447244343,
"learning_rate": 9.068854879642833e-06,
"loss": 0.2889599084854126,
"memory(GiB)": 36.32,
"step": 155,
"token_acc": 0.8814800662336009,
"train_speed(iter/s)": 0.208717
},
{
"epoch": 1.7204301075268817,
"grad_norm": 0.5971251557661211,
"learning_rate": 8.99368737900487e-06,
"loss": 0.2876766204833984,
"memory(GiB)": 36.32,
"step": 160,
"token_acc": 0.8966378835201175,
"train_speed(iter/s)": 0.209931
},
{
"epoch": 1.7741935483870968,
"grad_norm": 0.5748854455080764,
"learning_rate": 8.91593991530297e-06,
"loss": 0.3036654472351074,
"memory(GiB)": 36.32,
"step": 165,
"token_acc": 0.9018055115616092,
"train_speed(iter/s)": 0.211255
},
{
"epoch": 1.827956989247312,
"grad_norm": 0.5561505552546223,
"learning_rate": 8.835662714197182e-06,
"loss": 0.3028982639312744,
"memory(GiB)": 36.32,
"step": 170,
"token_acc": 0.9018227009113504,
"train_speed(iter/s)": 0.212718
},
{
"epoch": 1.881720430107527,
"grad_norm": 0.7317876505668975,
"learning_rate": 8.752907635583911e-06,
"loss": 0.29505395889282227,
"memory(GiB)": 36.32,
"step": 175,
"token_acc": 0.8887829072872949,
"train_speed(iter/s)": 0.214568
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.5680265544473229,
"learning_rate": 8.667728140093876e-06,
"loss": 0.29898526668548586,
"memory(GiB)": 36.32,
"step": 180,
"token_acc": 0.8902275769745649,
"train_speed(iter/s)": 0.216215
},
{
"epoch": 1.989247311827957,
"grad_norm": 0.6258148139715933,
"learning_rate": 8.580179254555997e-06,
"loss": 0.2970327615737915,
"memory(GiB)": 36.32,
"step": 185,
"token_acc": 0.8959904359021519,
"train_speed(iter/s)": 0.217886
},
{
"epoch": 2.043010752688172,
"grad_norm": 0.6156380554712465,
"learning_rate": 8.490317536449497e-06,
"loss": 0.22337541580200196,
"memory(GiB)": 36.32,
"step": 190,
"token_acc": 0.9188696893614926,
"train_speed(iter/s)": 0.218922
},
{
"epoch": 2.096774193548387,
"grad_norm": 0.5388492710872453,
"learning_rate": 8.398201037367202e-06,
"loss": 0.20124404430389403,
"memory(GiB)": 36.32,
"step": 195,
"token_acc": 0.926163422957213,
"train_speed(iter/s)": 0.21976
},
{
"epoch": 2.150537634408602,
"grad_norm": 0.5983173894045656,
"learning_rate": 8.303889265513599e-06,
"loss": 0.20379652976989746,
"memory(GiB)": 36.32,
"step": 200,
"token_acc": 0.9316990932701508,
"train_speed(iter/s)": 0.220712
},
{
"epoch": 2.150537634408602,
"eval_loss": 0.42072147130966187,
"eval_runtime": 0.982,
"eval_samples_per_second": 16.293,
"eval_steps_per_second": 2.037,
"eval_token_acc": 0.8620405472226801,
"step": 200
},
{
"epoch": 2.204301075268817,
"grad_norm": 0.6470473264794586,
"learning_rate": 8.20744314726193e-06,
"loss": 0.20558562278747558,
"memory(GiB)": 36.32,
"step": 205,
"token_acc": 0.9130314104639867,
"train_speed(iter/s)": 0.179558
},
{
"epoch": 2.258064516129032,
"grad_norm": 0.5880896656650852,
"learning_rate": 8.108924987795137e-06,
"loss": 0.18437005281448365,
"memory(GiB)": 36.32,
"step": 210,
"token_acc": 0.9324857899075801,
"train_speed(iter/s)": 0.181077
},
{
"epoch": 2.3118279569892475,
"grad_norm": 0.5477307460414933,
"learning_rate": 8.008398430856064e-06,
"loss": 0.18631315231323242,
"memory(GiB)": 36.32,
"step": 215,
"token_acc": 0.9362203632666266,
"train_speed(iter/s)": 0.182362
},
{
"epoch": 2.3655913978494625,
"grad_norm": 0.5227844014342666,
"learning_rate": 7.905928417632947e-06,
"loss": 0.20151617527008056,
"memory(GiB)": 36.32,
"step": 220,
"token_acc": 0.9247515563976645,
"train_speed(iter/s)": 0.183659
},
{
"epoch": 2.4193548387096775,
"grad_norm": 0.5262556350378147,
"learning_rate": 7.801581144806752e-06,
"loss": 0.1893543004989624,
"memory(GiB)": 36.32,
"step": 225,
"token_acc": 0.916718566189191,
"train_speed(iter/s)": 0.184894
},
{
"epoch": 2.4731182795698925,
"grad_norm": 0.5484675051032469,
"learning_rate": 7.695424021787412e-06,
"loss": 0.1866333603858948,
"memory(GiB)": 36.32,
"step": 230,
"token_acc": 0.924685033919424,
"train_speed(iter/s)": 0.186222
},
{
"epoch": 2.5268817204301075,
"grad_norm": 0.5628522228195585,
"learning_rate": 7.587525627166691e-06,
"loss": 0.19393479824066162,
"memory(GiB)": 36.32,
"step": 235,
"token_acc": 0.9374396347352709,
"train_speed(iter/s)": 0.187657
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.5335622599157593,
"learning_rate": 7.477955664415678e-06,
"loss": 0.19508060216903686,
"memory(GiB)": 36.32,
"step": 240,
"token_acc": 0.9316161484757817,
"train_speed(iter/s)": 0.189112
},
{
"epoch": 2.6344086021505375,
"grad_norm": 0.5245516097527467,
"learning_rate": 7.36678491685565e-06,
"loss": 0.19449775218963622,
"memory(GiB)": 36.32,
"step": 245,
"token_acc": 0.9377052300956551,
"train_speed(iter/s)": 0.19048
},
{
"epoch": 2.688172043010753,
"grad_norm": 0.5779503566004872,
"learning_rate": 7.254085201931305e-06,
"loss": 0.2031865119934082,
"memory(GiB)": 36.32,
"step": 250,
"token_acc": 0.921832884097035,
"train_speed(iter/s)": 0.191753
},
{
"epoch": 2.688172043010753,
"eval_loss": 0.43173694610595703,
"eval_runtime": 0.9856,
"eval_samples_per_second": 16.233,
"eval_steps_per_second": 2.029,
"eval_token_acc": 0.8627273226745783,
"step": 250
},
{
"epoch": 2.741935483870968,
"grad_norm": 0.5128503859951468,
"learning_rate": 7.139929324815965e-06,
"loss": 0.19230486154556276,
"memory(GiB)": 36.78,
"step": 255,
"token_acc": 0.9164345403899722,
"train_speed(iter/s)": 0.164733
},
{
"epoch": 2.795698924731183,
"grad_norm": 0.5561171013285224,
"learning_rate": 7.024391031378686e-06,
"loss": 0.1845786452293396,
"memory(GiB)": 36.78,
"step": 260,
"token_acc": 0.9280777134317205,
"train_speed(iter/s)": 0.166048
},
{
"epoch": 2.849462365591398,
"grad_norm": 0.55054564008372,
"learning_rate": 6.907544960543659e-06,
"loss": 0.18752856254577638,
"memory(GiB)": 36.78,
"step": 265,
"token_acc": 0.928450923562746,
"train_speed(iter/s)": 0.167181
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.5241738277865058,
"learning_rate": 6.7894665960727105e-06,
"loss": 0.19395242929458617,
"memory(GiB)": 36.78,
"step": 270,
"token_acc": 0.9294530630732646,
"train_speed(iter/s)": 0.168566
},
{
"epoch": 2.956989247311828,
"grad_norm": 0.5219243104191204,
"learning_rate": 6.670232217802011e-06,
"loss": 0.1912919521331787,
"memory(GiB)": 36.78,
"step": 275,
"token_acc": 0.9288971457524067,
"train_speed(iter/s)": 0.169806
},
{
"epoch": 3.010752688172043,
"grad_norm": 0.5873962294936487,
"learning_rate": 6.549918852364517e-06,
"loss": 0.18026410341262816,
"memory(GiB)": 36.78,
"step": 280,
"token_acc": 0.9238556338028169,
"train_speed(iter/s)": 0.170794
},
{
"epoch": 3.064516129032258,
"grad_norm": 0.5295922567078138,
"learning_rate": 6.42860422342998e-06,
"loss": 0.12432655096054077,
"memory(GiB)": 36.78,
"step": 285,
"token_acc": 0.9551729045111712,
"train_speed(iter/s)": 0.171865
},
{
"epoch": 3.118279569892473,
"grad_norm": 0.6075093437048568,
"learning_rate": 6.306366701494649e-06,
"loss": 0.12841488122940065,
"memory(GiB)": 36.78,
"step": 290,
"token_acc": 0.9396288908126011,
"train_speed(iter/s)": 0.172844
},
{
"epoch": 3.172043010752688,
"grad_norm": 0.5509357998666382,
"learning_rate": 6.183285253253135e-06,
"loss": 0.11821137666702271,
"memory(GiB)": 36.78,
"step": 295,
"token_acc": 0.9528064255501164,
"train_speed(iter/s)": 0.173786
},
{
"epoch": 3.225806451612903,
"grad_norm": 0.5108788614443498,
"learning_rate": 6.0594393905851065e-06,
"loss": 0.11771461963653565,
"memory(GiB)": 36.78,
"step": 300,
"token_acc": 0.9583916241707658,
"train_speed(iter/s)": 0.175078
},
{
"epoch": 3.225806451612903,
"eval_loss": 0.46923384070396423,
"eval_runtime": 0.9709,
"eval_samples_per_second": 16.48,
"eval_steps_per_second": 2.06,
"eval_token_acc": 0.8575078292401517,
"step": 300
},
{
"epoch": 3.279569892473118,
"grad_norm": 0.532375440211573,
"learning_rate": 5.934909119189806e-06,
"loss": 0.11486297845840454,
"memory(GiB)": 36.78,
"step": 305,
"token_acc": 0.9351425942962281,
"train_speed(iter/s)": 0.156217
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.4986724035910756,
"learning_rate": 5.809774886901538e-06,
"loss": 0.12363936901092529,
"memory(GiB)": 36.78,
"step": 310,
"token_acc": 0.9564072783584979,
"train_speed(iter/s)": 0.15723
},
{
"epoch": 3.3870967741935485,
"grad_norm": 0.5044548969667993,
"learning_rate": 5.684117531719552e-06,
"loss": 0.12493133544921875,
"memory(GiB)": 36.78,
"step": 315,
"token_acc": 0.9532926933974414,
"train_speed(iter/s)": 0.158289
},
{
"epoch": 3.4408602150537635,
"grad_norm": 0.6359389048574728,
"learning_rate": 5.558018229585856e-06,
"loss": 0.10656380653381348,
"memory(GiB)": 36.78,
"step": 320,
"token_acc": 0.9684046407116981,
"train_speed(iter/s)": 0.159481
},
{
"epoch": 3.4946236559139785,
"grad_norm": 0.5211147608340247,
"learning_rate": 5.431558441944731e-06,
"loss": 0.11835185289382935,
"memory(GiB)": 36.78,
"step": 325,
"token_acc": 0.9631512587952983,
"train_speed(iter/s)": 0.160534
},
{
"epoch": 3.5483870967741935,
"grad_norm": 0.5235745875213004,
"learning_rate": 5.304819863117796e-06,
"loss": 0.11486140489578248,
"memory(GiB)": 36.78,
"step": 330,
"token_acc": 0.9587179487179487,
"train_speed(iter/s)": 0.161595
},
{
"epoch": 3.6021505376344085,
"grad_norm": 0.5370422937987905,
"learning_rate": 5.177884367528637e-06,
"loss": 0.12060900926589965,
"memory(GiB)": 36.78,
"step": 335,
"token_acc": 0.9555168079153319,
"train_speed(iter/s)": 0.162556
},
{
"epoch": 3.6559139784946235,
"grad_norm": 0.5133061100021457,
"learning_rate": 5.0508339568111e-06,
"loss": 0.114243483543396,
"memory(GiB)": 36.78,
"step": 340,
"token_acc": 0.9510202454965726,
"train_speed(iter/s)": 0.163581
},
{
"epoch": 3.709677419354839,
"grad_norm": 0.5231222499336802,
"learning_rate": 4.923750706835371e-06,
"loss": 0.12267729043960571,
"memory(GiB)": 36.78,
"step": 345,
"token_acc": 0.9553353973168215,
"train_speed(iter/s)": 0.164482
},
{
"epoch": 3.763440860215054,
"grad_norm": 0.5089747349843902,
"learning_rate": 4.7967167146861446e-06,
"loss": 0.11303888559341431,
"memory(GiB)": 36.78,
"step": 350,
"token_acc": 0.9600354845863828,
"train_speed(iter/s)": 0.165357
},
{
"epoch": 3.763440860215054,
"eval_loss": 0.4767088294029236,
"eval_runtime": 0.9917,
"eval_samples_per_second": 16.134,
"eval_steps_per_second": 2.017,
"eval_token_acc": 0.8583869018185813,
"step": 350
},
{
"epoch": 3.817204301075269,
"grad_norm": 0.5352999649215254,
"learning_rate": 4.669814045627046e-06,
"loss": 0.11718583106994629,
"memory(GiB)": 36.78,
"step": 355,
"token_acc": 0.9347400235070241,
"train_speed(iter/s)": 0.150406
},
{
"epoch": 3.870967741935484,
"grad_norm": 0.573204238275855,
"learning_rate": 4.5431246800856455e-06,
"loss": 0.11329195499420167,
"memory(GiB)": 36.78,
"step": 360,
"token_acc": 0.9657640565712314,
"train_speed(iter/s)": 0.151423
},
{
"epoch": 3.924731182795699,
"grad_norm": 0.5454577647386208,
"learning_rate": 4.416730460693239e-06,
"loss": 0.11979327201843262,
"memory(GiB)": 36.78,
"step": 365,
"token_acc": 0.9548889088782945,
"train_speed(iter/s)": 0.15224
},
{
"epoch": 3.978494623655914,
"grad_norm": 0.4866198331616375,
"learning_rate": 4.290713039413684e-06,
"loss": 0.11887497901916504,
"memory(GiB)": 36.78,
"step": 370,
"token_acc": 0.9653520499108734,
"train_speed(iter/s)": 0.153223
},
{
"epoch": 4.032258064516129,
"grad_norm": 0.45037416589175444,
"learning_rate": 4.165153824795391e-06,
"loss": 0.09095752239227295,
"memory(GiB)": 36.78,
"step": 375,
"token_acc": 0.9678588797029046,
"train_speed(iter/s)": 0.154122
},
{
"epoch": 4.086021505376344,
"grad_norm": 0.485523239824137,
"learning_rate": 4.040133929380551e-06,
"loss": 0.077480149269104,
"memory(GiB)": 36.78,
"step": 380,
"token_acc": 0.975248480169835,
"train_speed(iter/s)": 0.154937
},
{
"epoch": 4.139784946236559,
"grad_norm": 0.42587539911230965,
"learning_rate": 3.915734117305624e-06,
"loss": 0.06480391025543213,
"memory(GiB)": 36.78,
"step": 385,
"token_acc": 0.9769721842225262,
"train_speed(iter/s)": 0.155951
},
{
"epoch": 4.193548387096774,
"grad_norm": 0.5141471588233666,
"learning_rate": 3.7920347521268514e-06,
"loss": 0.07736325263977051,
"memory(GiB)": 36.78,
"step": 390,
"token_acc": 0.9737575974258134,
"train_speed(iter/s)": 0.156692
},
{
"epoch": 4.247311827956989,
"grad_norm": 0.4741421109444635,
"learning_rate": 3.6691157449045915e-06,
"loss": 0.06898297071456909,
"memory(GiB)": 36.78,
"step": 395,
"token_acc": 0.9794690999585234,
"train_speed(iter/s)": 0.15764
},
{
"epoch": 4.301075268817204,
"grad_norm": 0.5032918290160332,
"learning_rate": 3.5470565025799515e-06,
"loss": 0.06421754360198975,
"memory(GiB)": 36.78,
"step": 400,
"token_acc": 0.9791707701398463,
"train_speed(iter/s)": 0.158589
},
{
"epoch": 4.301075268817204,
"eval_loss": 0.5137488842010498,
"eval_runtime": 0.9946,
"eval_samples_per_second": 16.086,
"eval_steps_per_second": 2.011,
"eval_token_acc": 0.8553925608483051,
"step": 400
},
{
"epoch": 4.354838709677419,
"grad_norm": 0.5064324935719208,
"learning_rate": 3.425935876677077e-06,
"loss": 0.06645252704620361,
"memory(GiB)": 36.78,
"step": 405,
"token_acc": 0.9447127229723071,
"train_speed(iter/s)": 0.146452
},
{
"epoch": 4.408602150537634,
"grad_norm": 0.4960758348047365,
"learning_rate": 3.305832112364268e-06,
"loss": 0.07083821892738343,
"memory(GiB)": 36.78,
"step": 410,
"token_acc": 0.9750937850485362,
"train_speed(iter/s)": 0.147329
},
{
"epoch": 4.462365591397849,
"grad_norm": 0.5091487033531963,
"learning_rate": 3.1868227979067985e-06,
"loss": 0.0703616976737976,
"memory(GiB)": 36.78,
"step": 415,
"token_acc": 0.9779349923316677,
"train_speed(iter/s)": 0.148154
},
{
"epoch": 4.516129032258064,
"grad_norm": 0.5085525658110539,
"learning_rate": 3.068984814544087e-06,
"loss": 0.07059448957443237,
"memory(GiB)": 36.78,
"step": 420,
"token_acc": 0.9705357142857143,
"train_speed(iter/s)": 0.148895
},
{
"epoch": 4.56989247311828,
"grad_norm": 0.5064686156678317,
"learning_rate": 2.9523942868236414e-06,
"loss": 0.07201706171035767,
"memory(GiB)": 36.78,
"step": 425,
"token_acc": 0.9695118947938728,
"train_speed(iter/s)": 0.149603
},
{
"epoch": 4.623655913978495,
"grad_norm": 0.5330742370752339,
"learning_rate": 2.8371265334238103e-06,
"loss": 0.06676008701324462,
"memory(GiB)": 36.78,
"step": 430,
"token_acc": 0.9743828804830403,
"train_speed(iter/s)": 0.150406
},
{
"epoch": 4.67741935483871,
"grad_norm": 0.48023829750195657,
"learning_rate": 2.7232560184971437e-06,
"loss": 0.07017686367034912,
"memory(GiB)": 36.78,
"step": 435,
"token_acc": 0.9749133183765042,
"train_speed(iter/s)": 0.1513
},
{
"epoch": 4.731182795698925,
"grad_norm": 0.538651125017694,
"learning_rate": 2.610856303565793e-06,
"loss": 0.0628254771232605,
"memory(GiB)": 36.78,
"step": 440,
"token_acc": 0.9788421297372704,
"train_speed(iter/s)": 0.15211
},
{
"epoch": 4.78494623655914,
"grad_norm": 0.538521707591603,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.06735045909881592,
"memory(GiB)": 36.78,
"step": 445,
"token_acc": 0.9842489435267,
"train_speed(iter/s)": 0.152875
},
{
"epoch": 4.838709677419355,
"grad_norm": 0.44415093447519377,
"learning_rate": 2.390758722110418e-06,
"loss": 0.0701654613018036,
"memory(GiB)": 36.78,
"step": 450,
"token_acc": 0.9781991349556556,
"train_speed(iter/s)": 0.153703
},
{
"epoch": 4.838709677419355,
"eval_loss": 0.5138384103775024,
"eval_runtime": 0.987,
"eval_samples_per_second": 16.21,
"eval_steps_per_second": 2.026,
"eval_token_acc": 0.8553650898302291,
"step": 450
},
{
"epoch": 4.89247311827957,
"grad_norm": 0.44909839508545846,
"learning_rate": 2.283203040884524e-06,
"loss": 0.07383356690406799,
"memory(GiB)": 36.78,
"step": 455,
"token_acc": 0.9474127346079635,
"train_speed(iter/s)": 0.143427
},
{
"epoch": 4.946236559139785,
"grad_norm": 0.5540561379082162,
"learning_rate": 2.1774024383970372e-06,
"loss": 0.06765682101249695,
"memory(GiB)": 36.78,
"step": 460,
"token_acc": 0.9771299435028249,
"train_speed(iter/s)": 0.144189
},
{
"epoch": 5.0,
"grad_norm": 0.4921194250215734,
"learning_rate": 2.0734252629237892e-06,
"loss": 0.062316888570785524,
"memory(GiB)": 36.78,
"step": 465,
"token_acc": 0.9770206022187005,
"train_speed(iter/s)": 0.144983
},
{
"epoch": 5.053763440860215,
"grad_norm": 0.3608411695035307,
"learning_rate": 1.971338684788034e-06,
"loss": 0.04745644629001618,
"memory(GiB)": 36.78,
"step": 470,
"token_acc": 0.9881954568969572,
"train_speed(iter/s)": 0.145689
},
{
"epoch": 5.10752688172043,
"grad_norm": 0.3455316573497327,
"learning_rate": 1.8712086529677214e-06,
"loss": 0.04306984841823578,
"memory(GiB)": 36.78,
"step": 475,
"token_acc": 0.9849460329483053,
"train_speed(iter/s)": 0.146458
},
{
"epoch": 5.161290322580645,
"grad_norm": 0.4335927828524962,
"learning_rate": 1.773099852491796e-06,
"loss": 0.03937138915061951,
"memory(GiB)": 36.78,
"step": 480,
"token_acc": 0.9869653767820774,
"train_speed(iter/s)": 0.147225
},
{
"epoch": 5.21505376344086,
"grad_norm": 0.4489096871443703,
"learning_rate": 1.6770756626529866e-06,
"loss": 0.04089862108230591,
"memory(GiB)": 36.78,
"step": 485,
"token_acc": 0.9865810708394632,
"train_speed(iter/s)": 0.147936
},
{
"epoch": 5.268817204301075,
"grad_norm": 0.422729324631405,
"learning_rate": 1.583198116064144e-06,
"loss": 0.046530479192733766,
"memory(GiB)": 36.78,
"step": 490,
"token_acc": 0.9874636404604858,
"train_speed(iter/s)": 0.148672
},
{
"epoch": 5.32258064516129,
"grad_norm": 0.41974244882967215,
"learning_rate": 1.491527858584535e-06,
"loss": 0.037504765391349795,
"memory(GiB)": 36.78,
"step": 495,
"token_acc": 0.9868473694738947,
"train_speed(iter/s)": 0.149349
},
{
"epoch": 5.376344086021505,
"grad_norm": 0.4026767578452857,
"learning_rate": 1.4021241101419863e-06,
"loss": 0.05219945907592773,
"memory(GiB)": 36.78,
"step": 500,
"token_acc": 0.9789432382945331,
"train_speed(iter/s)": 0.149951
},
{
"epoch": 5.376344086021505,
"eval_loss": 0.546004593372345,
"eval_runtime": 0.9994,
"eval_samples_per_second": 16.009,
"eval_steps_per_second": 2.001,
"eval_token_acc": 0.8528926982033954,
"step": 500
},
{
"epoch": 5.43010752688172,
"grad_norm": 0.5030854207740384,
"learning_rate": 1.3150446264762134e-06,
"loss": 0.03957706689834595,
"memory(GiB)": 36.78,
"step": 505,
"token_acc": 0.9562320657013754,
"train_speed(iter/s)": 0.141178
},
{
"epoch": 5.483870967741936,
"grad_norm": 0.4024434594737618,
"learning_rate": 1.2303456618280141e-06,
"loss": 0.04660770297050476,
"memory(GiB)": 36.78,
"step": 510,
"token_acc": 0.9861975389510129,
"train_speed(iter/s)": 0.141854
},
{
"epoch": 5.53763440860215,
"grad_norm": 0.43599167036494046,
"learning_rate": 1.1480819325984489e-06,
"loss": 0.03592199087142944,
"memory(GiB)": 36.78,
"step": 515,
"token_acc": 0.9862939862939863,
"train_speed(iter/s)": 0.142521
},
{
"epoch": 5.591397849462366,
"grad_norm": 0.4252431424010526,
"learning_rate": 1.0683065820014865e-06,
"loss": 0.03672348260879517,
"memory(GiB)": 36.78,
"step": 520,
"token_acc": 0.9892253485165705,
"train_speed(iter/s)": 0.143282
},
{
"epoch": 5.645161290322581,
"grad_norm": 0.4382310526426677,
"learning_rate": 9.91071145732948e-07,
"loss": 0.039648061990737914,
"memory(GiB)": 36.78,
"step": 525,
"token_acc": 0.989158604008293,
"train_speed(iter/s)": 0.143942
},
{
"epoch": 5.698924731182796,
"grad_norm": 0.4062766877899902,
"learning_rate": 9.164255186779048e-07,
"loss": 0.04137682020664215,
"memory(GiB)": 36.78,
"step": 530,
"token_acc": 0.9825452344117845,
"train_speed(iter/s)": 0.144611
},
{
"epoch": 5.752688172043011,
"grad_norm": 0.5525974041179196,
"learning_rate": 8.444179226780824e-07,
"loss": 0.042464354634284975,
"memory(GiB)": 36.78,
"step": 535,
"token_acc": 0.9898713517665131,
"train_speed(iter/s)": 0.145237
},
{
"epoch": 5.806451612903226,
"grad_norm": 0.3962239310547138,
"learning_rate": 7.750948753800508e-07,
"loss": 0.04482833445072174,
"memory(GiB)": 36.78,
"step": 540,
"token_acc": 0.9841143059992901,
"train_speed(iter/s)": 0.145873
},
{
"epoch": 5.860215053763441,
"grad_norm": 0.34337978747525016,
"learning_rate": 7.085011601843439e-07,
"loss": 0.03781391978263855,
"memory(GiB)": 36.78,
"step": 545,
"token_acc": 0.9846997121337592,
"train_speed(iter/s)": 0.146496
},
{
"epoch": 5.913978494623656,
"grad_norm": 0.5223725509030901,
"learning_rate": 6.4467979731493e-07,
"loss": 0.04320046007633209,
"memory(GiB)": 36.78,
"step": 550,
"token_acc": 0.9883423491424498,
"train_speed(iter/s)": 0.147061
},
{
"epoch": 5.913978494623656,
"eval_loss": 0.5462328791618347,
"eval_runtime": 0.9977,
"eval_samples_per_second": 16.036,
"eval_steps_per_second": 2.005,
"eval_token_acc": 0.8527828141310917,
"step": 550
},
{
"epoch": 5.967741935483871,
"grad_norm": 0.8558960053211487,
"learning_rate": 5.836720160276971e-07,
"loss": 0.04672636985778809,
"memory(GiB)": 36.78,
"step": 555,
"token_acc": 0.954482036972445,
"train_speed(iter/s)": 0.139302
},
{
"epoch": 6.021505376344086,
"grad_norm": 0.3090930847462213,
"learning_rate": 5.255172279759357e-07,
"loss": 0.0372942328453064,
"memory(GiB)": 36.78,
"step": 560,
"token_acc": 0.9877791188895595,
"train_speed(iter/s)": 0.139895
},
{
"epoch": 6.075268817204301,
"grad_norm": 0.26836514463502115,
"learning_rate": 4.7025300175000675e-07,
"loss": 0.027360618114471436,
"memory(GiB)": 36.78,
"step": 565,
"token_acc": 0.9910387804501211,
"train_speed(iter/s)": 0.140608
},
{
"epoch": 6.129032258064516,
"grad_norm": 0.30829251356715115,
"learning_rate": 4.179150386076425e-07,
"loss": 0.03349734842777252,
"memory(GiB)": 36.78,
"step": 570,
"token_acc": 0.9864537788881949,
"train_speed(iter/s)": 0.141208
},
{
"epoch": 6.182795698924731,
"grad_norm": 0.28151847827830856,
"learning_rate": 3.685371494105683e-07,
"loss": 0.030065619945526124,
"memory(GiB)": 36.78,
"step": 575,
"token_acc": 0.9859920709835757,
"train_speed(iter/s)": 0.141799
},
{
"epoch": 6.236559139784946,
"grad_norm": 0.3537445591075822,
"learning_rate": 3.221512327823406e-07,
"loss": 0.03237749636173248,
"memory(GiB)": 36.78,
"step": 580,
"token_acc": 0.9896695119282118,
"train_speed(iter/s)": 0.142428
},
{
"epoch": 6.290322580645161,
"grad_norm": 0.34310476361019154,
"learning_rate": 2.787872545015069e-07,
"loss": 0.03409869074821472,
"memory(GiB)": 36.78,
"step": 585,
"token_acc": 0.9929165383430859,
"train_speed(iter/s)": 0.142999
},
{
"epoch": 6.344086021505376,
"grad_norm": 0.342886198800162,
"learning_rate": 2.3847322814340654e-07,
"loss": 0.03197720050811768,
"memory(GiB)": 36.78,
"step": 590,
"token_acc": 0.9923538469707428,
"train_speed(iter/s)": 0.143587
},
{
"epoch": 6.397849462365591,
"grad_norm": 0.33182194556674444,
"learning_rate": 2.0123519698311e-07,
"loss": 0.028625327348709106,
"memory(GiB)": 36.78,
"step": 595,
"token_acc": 0.9889153496427798,
"train_speed(iter/s)": 0.144228
},
{
"epoch": 6.451612903225806,
"grad_norm": 0.4946310408661817,
"learning_rate": 1.6709721717120042e-07,
"loss": 0.028473174571990965,
"memory(GiB)": 36.78,
"step": 600,
"token_acc": 0.9835623061117602,
"train_speed(iter/s)": 0.144841
},
{
"epoch": 6.451612903225806,
"eval_loss": 0.567303478717804,
"eval_runtime": 1.0088,
"eval_samples_per_second": 15.86,
"eval_steps_per_second": 1.983,
"eval_token_acc": 0.8509971979561562,
"step": 600
}
],
"logging_steps": 5,
"max_steps": 651,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 52529112133632.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}