| { | |
| "best_metric": 0.58375472, | |
| "best_model_checkpoint": "/export/home2/zli/kc/mm_rag/Qwen2.5-32B-Instruct_lora/checkpoint-1026", | |
| "epoch": 0.9997563946406821, | |
| "eval_steps": 100, | |
| "global_step": 1026, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00097442143727162, | |
| "grad_norm": 0.8529725074768066, | |
| "learning_rate": 1.9230769230769234e-06, | |
| "loss": 0.9744532108306885, | |
| "memory(GiB)": 255.29, | |
| "step": 1, | |
| "token_acc": 0.7235609103078983, | |
| "train_speed(iter/s)": 0.048136 | |
| }, | |
| { | |
| "epoch": 0.0048721071863581, | |
| "grad_norm": 0.9313930869102478, | |
| "learning_rate": 9.615384615384616e-06, | |
| "loss": 0.7735831141471863, | |
| "memory(GiB)": 307.13, | |
| "step": 5, | |
| "token_acc": 0.7883002659030477, | |
| "train_speed(iter/s)": 0.052895 | |
| }, | |
| { | |
| "epoch": 0.0097442143727162, | |
| "grad_norm": 0.5147601962089539, | |
| "learning_rate": 1.923076923076923e-05, | |
| "loss": 0.8932640075683593, | |
| "memory(GiB)": 357.61, | |
| "step": 10, | |
| "token_acc": 0.7670103092783506, | |
| "train_speed(iter/s)": 0.055392 | |
| }, | |
| { | |
| "epoch": 0.014616321559074299, | |
| "grad_norm": 0.3254891037940979, | |
| "learning_rate": 2.8846153846153845e-05, | |
| "loss": 0.7951784610748291, | |
| "memory(GiB)": 357.63, | |
| "step": 15, | |
| "token_acc": 0.7827709279688514, | |
| "train_speed(iter/s)": 0.058176 | |
| }, | |
| { | |
| "epoch": 0.0194884287454324, | |
| "grad_norm": 0.5581763982772827, | |
| "learning_rate": 3.846153846153846e-05, | |
| "loss": 0.956171703338623, | |
| "memory(GiB)": 357.63, | |
| "step": 20, | |
| "token_acc": 0.7419072615923009, | |
| "train_speed(iter/s)": 0.05832 | |
| }, | |
| { | |
| "epoch": 0.024360535931790498, | |
| "grad_norm": 0.3584084212779999, | |
| "learning_rate": 4.8076923076923084e-05, | |
| "loss": 0.8373254776000977, | |
| "memory(GiB)": 357.64, | |
| "step": 25, | |
| "token_acc": 0.7718067293892521, | |
| "train_speed(iter/s)": 0.057013 | |
| }, | |
| { | |
| "epoch": 0.029232643118148598, | |
| "grad_norm": 0.43336302042007446, | |
| "learning_rate": 5.769230769230769e-05, | |
| "loss": 0.7370581150054931, | |
| "memory(GiB)": 357.64, | |
| "step": 30, | |
| "token_acc": 0.7800821355236139, | |
| "train_speed(iter/s)": 0.057318 | |
| }, | |
| { | |
| "epoch": 0.0341047503045067, | |
| "grad_norm": 0.6522459983825684, | |
| "learning_rate": 6.730769230769232e-05, | |
| "loss": 0.8316493988037109, | |
| "memory(GiB)": 357.64, | |
| "step": 35, | |
| "token_acc": 0.7533095188064719, | |
| "train_speed(iter/s)": 0.0583 | |
| }, | |
| { | |
| "epoch": 0.0389768574908648, | |
| "grad_norm": 0.2728117108345032, | |
| "learning_rate": 7.692307692307693e-05, | |
| "loss": 0.7739034652709961, | |
| "memory(GiB)": 357.64, | |
| "step": 40, | |
| "token_acc": 0.7733619763694952, | |
| "train_speed(iter/s)": 0.058058 | |
| }, | |
| { | |
| "epoch": 0.0438489646772229, | |
| "grad_norm": 0.5565893054008484, | |
| "learning_rate": 8.653846153846155e-05, | |
| "loss": 0.664579200744629, | |
| "memory(GiB)": 357.64, | |
| "step": 45, | |
| "token_acc": 0.7940859054608245, | |
| "train_speed(iter/s)": 0.058044 | |
| }, | |
| { | |
| "epoch": 0.048721071863580996, | |
| "grad_norm": 0.8710922002792358, | |
| "learning_rate": 9.615384615384617e-05, | |
| "loss": 0.6327468872070312, | |
| "memory(GiB)": 378.13, | |
| "step": 50, | |
| "token_acc": 0.8048907388137357, | |
| "train_speed(iter/s)": 0.057666 | |
| }, | |
| { | |
| "epoch": 0.0535931790499391, | |
| "grad_norm": 0.5048324465751648, | |
| "learning_rate": 9.999765921804365e-05, | |
| "loss": 0.7563381671905518, | |
| "memory(GiB)": 378.13, | |
| "step": 55, | |
| "token_acc": 0.7769541778975741, | |
| "train_speed(iter/s)": 0.056465 | |
| }, | |
| { | |
| "epoch": 0.058465286236297195, | |
| "grad_norm": 0.4348597824573517, | |
| "learning_rate": 9.998335523311734e-05, | |
| "loss": 0.6969810009002686, | |
| "memory(GiB)": 378.13, | |
| "step": 60, | |
| "token_acc": 0.7881806108897742, | |
| "train_speed(iter/s)": 0.056264 | |
| }, | |
| { | |
| "epoch": 0.06333739342265529, | |
| "grad_norm": 0.3995300233364105, | |
| "learning_rate": 9.995605141340247e-05, | |
| "loss": 0.6338334083557129, | |
| "memory(GiB)": 378.13, | |
| "step": 65, | |
| "token_acc": 0.8006442376521117, | |
| "train_speed(iter/s)": 0.056356 | |
| }, | |
| { | |
| "epoch": 0.0682095006090134, | |
| "grad_norm": 0.25958451628685, | |
| "learning_rate": 9.991575486016592e-05, | |
| "loss": 0.7143070697784424, | |
| "memory(GiB)": 378.13, | |
| "step": 70, | |
| "token_acc": 0.7797544920832592, | |
| "train_speed(iter/s)": 0.056342 | |
| }, | |
| { | |
| "epoch": 0.0730816077953715, | |
| "grad_norm": 0.3476051688194275, | |
| "learning_rate": 9.986247605386727e-05, | |
| "loss": 0.6742859840393066, | |
| "memory(GiB)": 378.13, | |
| "step": 75, | |
| "token_acc": 0.7907263751763046, | |
| "train_speed(iter/s)": 0.056661 | |
| }, | |
| { | |
| "epoch": 0.0779537149817296, | |
| "grad_norm": 0.3681824505329132, | |
| "learning_rate": 9.979622885143301e-05, | |
| "loss": 0.5565629959106445, | |
| "memory(GiB)": 378.13, | |
| "step": 80, | |
| "token_acc": 0.8259753593429158, | |
| "train_speed(iter/s)": 0.056802 | |
| }, | |
| { | |
| "epoch": 0.0828258221680877, | |
| "grad_norm": 0.4626450836658478, | |
| "learning_rate": 9.97170304826526e-05, | |
| "loss": 0.6965714931488037, | |
| "memory(GiB)": 378.13, | |
| "step": 85, | |
| "token_acc": 0.7883802169516455, | |
| "train_speed(iter/s)": 0.057068 | |
| }, | |
| { | |
| "epoch": 0.0876979293544458, | |
| "grad_norm": 0.6139679551124573, | |
| "learning_rate": 9.962490154569727e-05, | |
| "loss": 0.670227336883545, | |
| "memory(GiB)": 378.13, | |
| "step": 90, | |
| "token_acc": 0.7868181818181819, | |
| "train_speed(iter/s)": 0.056985 | |
| }, | |
| { | |
| "epoch": 0.0925700365408039, | |
| "grad_norm": 0.3063270151615143, | |
| "learning_rate": 9.95198660017628e-05, | |
| "loss": 0.6900132179260254, | |
| "memory(GiB)": 378.13, | |
| "step": 95, | |
| "token_acc": 0.7969001610305958, | |
| "train_speed(iter/s)": 0.056931 | |
| }, | |
| { | |
| "epoch": 0.09744214372716199, | |
| "grad_norm": 0.6139111518859863, | |
| "learning_rate": 9.940195116883755e-05, | |
| "loss": 0.6138424396514892, | |
| "memory(GiB)": 378.13, | |
| "step": 100, | |
| "token_acc": 0.8159609120521173, | |
| "train_speed(iter/s)": 0.056746 | |
| }, | |
| { | |
| "epoch": 0.09744214372716199, | |
| "eval_loss": 0.7148731350898743, | |
| "eval_runtime": 6.1133, | |
| "eval_samples_per_second": 0.654, | |
| "eval_steps_per_second": 0.654, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1023142509135201, | |
| "grad_norm": 0.2960892617702484, | |
| "learning_rate": 9.927118771459763e-05, | |
| "loss": 0.7610847473144531, | |
| "memory(GiB)": 378.13, | |
| "step": 105, | |
| "token_acc": 0.767438173747622, | |
| "train_speed(iter/s)": 0.055274 | |
| }, | |
| { | |
| "epoch": 0.1071863580998782, | |
| "grad_norm": 0.35161978006362915, | |
| "learning_rate": 9.91276096484306e-05, | |
| "loss": 0.553467845916748, | |
| "memory(GiB)": 378.13, | |
| "step": 110, | |
| "token_acc": 0.8268870335661953, | |
| "train_speed(iter/s)": 0.055276 | |
| }, | |
| { | |
| "epoch": 0.1120584652862363, | |
| "grad_norm": 0.412120521068573, | |
| "learning_rate": 9.897125431259033e-05, | |
| "loss": 0.634190034866333, | |
| "memory(GiB)": 378.13, | |
| "step": 115, | |
| "token_acc": 0.8068276436303081, | |
| "train_speed(iter/s)": 0.055265 | |
| }, | |
| { | |
| "epoch": 0.11693057247259439, | |
| "grad_norm": 0.458766907453537, | |
| "learning_rate": 9.880216237248481e-05, | |
| "loss": 0.5070098400115967, | |
| "memory(GiB)": 383.55, | |
| "step": 120, | |
| "token_acc": 0.8488372093023255, | |
| "train_speed(iter/s)": 0.055088 | |
| }, | |
| { | |
| "epoch": 0.1218026796589525, | |
| "grad_norm": 0.26979780197143555, | |
| "learning_rate": 9.862037780609986e-05, | |
| "loss": 0.6117064476013183, | |
| "memory(GiB)": 383.55, | |
| "step": 125, | |
| "token_acc": 0.8204016064257028, | |
| "train_speed(iter/s)": 0.055201 | |
| }, | |
| { | |
| "epoch": 0.12667478684531058, | |
| "grad_norm": 0.33145567774772644, | |
| "learning_rate": 9.842594789256103e-05, | |
| "loss": 0.6480350017547607, | |
| "memory(GiB)": 383.55, | |
| "step": 130, | |
| "token_acc": 0.7939520333680917, | |
| "train_speed(iter/s)": 0.055341 | |
| }, | |
| { | |
| "epoch": 0.1315468940316687, | |
| "grad_norm": 0.29335150122642517, | |
| "learning_rate": 9.821892319983726e-05, | |
| "loss": 0.6102027893066406, | |
| "memory(GiB)": 383.55, | |
| "step": 135, | |
| "token_acc": 0.8075084115459537, | |
| "train_speed(iter/s)": 0.055381 | |
| }, | |
| { | |
| "epoch": 0.1364190012180268, | |
| "grad_norm": 0.34363335371017456, | |
| "learning_rate": 9.799935757158891e-05, | |
| "loss": 0.6652801513671875, | |
| "memory(GiB)": 383.55, | |
| "step": 140, | |
| "token_acc": 0.7967960995995125, | |
| "train_speed(iter/s)": 0.055397 | |
| }, | |
| { | |
| "epoch": 0.14129110840438489, | |
| "grad_norm": 1.3419567346572876, | |
| "learning_rate": 9.776730811316394e-05, | |
| "loss": 0.6267284393310547, | |
| "memory(GiB)": 383.55, | |
| "step": 145, | |
| "token_acc": 0.8041536400178652, | |
| "train_speed(iter/s)": 0.055716 | |
| }, | |
| { | |
| "epoch": 0.146163215590743, | |
| "grad_norm": 0.32478609681129456, | |
| "learning_rate": 9.752283517674575e-05, | |
| "loss": 0.5990486145019531, | |
| "memory(GiB)": 383.55, | |
| "step": 150, | |
| "token_acc": 0.80878414568827, | |
| "train_speed(iter/s)": 0.055975 | |
| }, | |
| { | |
| "epoch": 0.1510353227771011, | |
| "grad_norm": 0.33393779397010803, | |
| "learning_rate": 9.72660023456566e-05, | |
| "loss": 0.6312044143676758, | |
| "memory(GiB)": 383.55, | |
| "step": 155, | |
| "token_acc": 0.8099249502220861, | |
| "train_speed(iter/s)": 0.055469 | |
| }, | |
| { | |
| "epoch": 0.1559074299634592, | |
| "grad_norm": 0.39661872386932373, | |
| "learning_rate": 9.699687641782067e-05, | |
| "loss": 0.727474308013916, | |
| "memory(GiB)": 383.55, | |
| "step": 160, | |
| "token_acc": 0.7668711656441718, | |
| "train_speed(iter/s)": 0.055534 | |
| }, | |
| { | |
| "epoch": 0.1607795371498173, | |
| "grad_norm": 0.3174588978290558, | |
| "learning_rate": 9.671552738839099e-05, | |
| "loss": 0.7284453868865967, | |
| "memory(GiB)": 383.55, | |
| "step": 165, | |
| "token_acc": 0.7788385043754972, | |
| "train_speed(iter/s)": 0.055621 | |
| }, | |
| { | |
| "epoch": 0.1656516443361754, | |
| "grad_norm": 0.3508811891078949, | |
| "learning_rate": 9.642202843154491e-05, | |
| "loss": 0.6260187149047851, | |
| "memory(GiB)": 383.55, | |
| "step": 170, | |
| "token_acc": 0.8036006546644845, | |
| "train_speed(iter/s)": 0.055841 | |
| }, | |
| { | |
| "epoch": 0.1705237515225335, | |
| "grad_norm": 0.35121622681617737, | |
| "learning_rate": 9.611645588145272e-05, | |
| "loss": 0.6979084968566894, | |
| "memory(GiB)": 383.55, | |
| "step": 175, | |
| "token_acc": 0.784981684981685, | |
| "train_speed(iter/s)": 0.055902 | |
| }, | |
| { | |
| "epoch": 0.1753958587088916, | |
| "grad_norm": 0.41958293318748474, | |
| "learning_rate": 9.579888921242439e-05, | |
| "loss": 0.6360678195953369, | |
| "memory(GiB)": 383.55, | |
| "step": 180, | |
| "token_acc": 0.8010867455850961, | |
| "train_speed(iter/s)": 0.05597 | |
| }, | |
| { | |
| "epoch": 0.18026796589524968, | |
| "grad_norm": 0.4557216763496399, | |
| "learning_rate": 9.546941101823963e-05, | |
| "loss": 0.7268210411071777, | |
| "memory(GiB)": 383.55, | |
| "step": 185, | |
| "token_acc": 0.780511811023622, | |
| "train_speed(iter/s)": 0.056032 | |
| }, | |
| { | |
| "epoch": 0.1851400730816078, | |
| "grad_norm": 0.44287464022636414, | |
| "learning_rate": 9.512810699066667e-05, | |
| "loss": 0.6450634479522706, | |
| "memory(GiB)": 383.55, | |
| "step": 190, | |
| "token_acc": 0.8003896563939072, | |
| "train_speed(iter/s)": 0.056127 | |
| }, | |
| { | |
| "epoch": 0.1900121802679659, | |
| "grad_norm": 0.2944161593914032, | |
| "learning_rate": 9.477506589717518e-05, | |
| "loss": 0.5534649848937988, | |
| "memory(GiB)": 383.55, | |
| "step": 195, | |
| "token_acc": 0.8173982442138866, | |
| "train_speed(iter/s)": 0.056116 | |
| }, | |
| { | |
| "epoch": 0.19488428745432398, | |
| "grad_norm": 0.3743175268173218, | |
| "learning_rate": 9.441037955784944e-05, | |
| "loss": 0.7282295227050781, | |
| "memory(GiB)": 383.55, | |
| "step": 200, | |
| "token_acc": 0.7695568400770713, | |
| "train_speed(iter/s)": 0.05619 | |
| }, | |
| { | |
| "epoch": 0.19488428745432398, | |
| "eval_loss": 0.6866188049316406, | |
| "eval_runtime": 6.0764, | |
| "eval_samples_per_second": 0.658, | |
| "eval_steps_per_second": 0.658, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1997563946406821, | |
| "grad_norm": 0.397987425327301, | |
| "learning_rate": 9.403414282150738e-05, | |
| "loss": 0.6911158561706543, | |
| "memory(GiB)": 383.55, | |
| "step": 205, | |
| "token_acc": 0.7807744462859726, | |
| "train_speed(iter/s)": 0.055478 | |
| }, | |
| { | |
| "epoch": 0.2046285018270402, | |
| "grad_norm": 0.852430522441864, | |
| "learning_rate": 9.364645354103206e-05, | |
| "loss": 0.7203257560729981, | |
| "memory(GiB)": 383.55, | |
| "step": 210, | |
| "token_acc": 0.7839163822525598, | |
| "train_speed(iter/s)": 0.055538 | |
| }, | |
| { | |
| "epoch": 0.2095006090133983, | |
| "grad_norm": 0.36566832661628723, | |
| "learning_rate": 9.324741254792171e-05, | |
| "loss": 0.5751584529876709, | |
| "memory(GiB)": 383.55, | |
| "step": 215, | |
| "token_acc": 0.8120177310786406, | |
| "train_speed(iter/s)": 0.055665 | |
| }, | |
| { | |
| "epoch": 0.2143727161997564, | |
| "grad_norm": 0.5105158090591431, | |
| "learning_rate": 9.28371236260652e-05, | |
| "loss": 0.5958977699279785, | |
| "memory(GiB)": 383.55, | |
| "step": 220, | |
| "token_acc": 0.8228462471832206, | |
| "train_speed(iter/s)": 0.055807 | |
| }, | |
| { | |
| "epoch": 0.2192448233861145, | |
| "grad_norm": 0.48028162121772766, | |
| "learning_rate": 9.241569348474954e-05, | |
| "loss": 0.7106984615325928, | |
| "memory(GiB)": 383.55, | |
| "step": 225, | |
| "token_acc": 0.7828740844087897, | |
| "train_speed(iter/s)": 0.055858 | |
| }, | |
| { | |
| "epoch": 0.2241169305724726, | |
| "grad_norm": 0.32592424750328064, | |
| "learning_rate": 9.198323173090663e-05, | |
| "loss": 0.5898131847381591, | |
| "memory(GiB)": 383.55, | |
| "step": 230, | |
| "token_acc": 0.8189450340567084, | |
| "train_speed(iter/s)": 0.055995 | |
| }, | |
| { | |
| "epoch": 0.2289890377588307, | |
| "grad_norm": 0.44794151186943054, | |
| "learning_rate": 9.153985084060623e-05, | |
| "loss": 0.6042355060577392, | |
| "memory(GiB)": 383.55, | |
| "step": 235, | |
| "token_acc": 0.810122224134963, | |
| "train_speed(iter/s)": 0.056057 | |
| }, | |
| { | |
| "epoch": 0.23386114494518878, | |
| "grad_norm": 0.3204025328159332, | |
| "learning_rate": 9.108566612980298e-05, | |
| "loss": 0.5558523654937744, | |
| "memory(GiB)": 383.55, | |
| "step": 240, | |
| "token_acc": 0.8260869565217391, | |
| "train_speed(iter/s)": 0.056072 | |
| }, | |
| { | |
| "epoch": 0.2387332521315469, | |
| "grad_norm": 0.31540507078170776, | |
| "learning_rate": 9.062079572434448e-05, | |
| "loss": 0.6237210273742676, | |
| "memory(GiB)": 383.55, | |
| "step": 245, | |
| "token_acc": 0.8021445866482186, | |
| "train_speed(iter/s)": 0.056086 | |
| }, | |
| { | |
| "epoch": 0.243605359317905, | |
| "grad_norm": 0.619088888168335, | |
| "learning_rate": 9.014536052924883e-05, | |
| "loss": 0.664583158493042, | |
| "memory(GiB)": 383.55, | |
| "step": 250, | |
| "token_acc": 0.793002915451895, | |
| "train_speed(iter/s)": 0.056116 | |
| }, | |
| { | |
| "epoch": 0.24847746650426308, | |
| "grad_norm": 0.6715230345726013, | |
| "learning_rate": 8.965948419725922e-05, | |
| "loss": 0.5711063861846923, | |
| "memory(GiB)": 383.55, | |
| "step": 255, | |
| "token_acc": 0.8185620394343757, | |
| "train_speed(iter/s)": 0.055175 | |
| }, | |
| { | |
| "epoch": 0.25334957369062117, | |
| "grad_norm": 0.4514237642288208, | |
| "learning_rate": 8.916329309668397e-05, | |
| "loss": 0.721324348449707, | |
| "memory(GiB)": 383.55, | |
| "step": 260, | |
| "token_acc": 0.7792865828942035, | |
| "train_speed(iter/s)": 0.055266 | |
| }, | |
| { | |
| "epoch": 0.2582216808769793, | |
| "grad_norm": 0.5026947855949402, | |
| "learning_rate": 8.865691627853013e-05, | |
| "loss": 0.6661148548126221, | |
| "memory(GiB)": 383.55, | |
| "step": 265, | |
| "token_acc": 0.7951268025857782, | |
| "train_speed(iter/s)": 0.05533 | |
| }, | |
| { | |
| "epoch": 0.2630937880633374, | |
| "grad_norm": 0.3138331174850464, | |
| "learning_rate": 8.814048544293965e-05, | |
| "loss": 0.6717385292053223, | |
| "memory(GiB)": 383.55, | |
| "step": 270, | |
| "token_acc": 0.7904462355022607, | |
| "train_speed(iter/s)": 0.055296 | |
| }, | |
| { | |
| "epoch": 0.2679658952496955, | |
| "grad_norm": 0.3270625174045563, | |
| "learning_rate": 8.76141349049362e-05, | |
| "loss": 0.6027359008789063, | |
| "memory(GiB)": 383.55, | |
| "step": 275, | |
| "token_acc": 0.8082101806239738, | |
| "train_speed(iter/s)": 0.05525 | |
| }, | |
| { | |
| "epoch": 0.2728380024360536, | |
| "grad_norm": 0.4341810941696167, | |
| "learning_rate": 8.707800155949217e-05, | |
| "loss": 0.6553579330444336, | |
| "memory(GiB)": 383.55, | |
| "step": 280, | |
| "token_acc": 0.797032640949555, | |
| "train_speed(iter/s)": 0.055271 | |
| }, | |
| { | |
| "epoch": 0.2777101096224117, | |
| "grad_norm": 0.37805306911468506, | |
| "learning_rate": 8.653222484592458e-05, | |
| "loss": 0.6515018463134765, | |
| "memory(GiB)": 383.55, | |
| "step": 285, | |
| "token_acc": 0.794751477233229, | |
| "train_speed(iter/s)": 0.05524 | |
| }, | |
| { | |
| "epoch": 0.28258221680876977, | |
| "grad_norm": 0.38902854919433594, | |
| "learning_rate": 8.597694671162921e-05, | |
| "loss": 0.592349624633789, | |
| "memory(GiB)": 383.55, | |
| "step": 290, | |
| "token_acc": 0.815828677839851, | |
| "train_speed(iter/s)": 0.05519 | |
| }, | |
| { | |
| "epoch": 0.2874543239951279, | |
| "grad_norm": 0.3007030487060547, | |
| "learning_rate": 8.541231157516247e-05, | |
| "loss": 0.6616343021392822, | |
| "memory(GiB)": 383.55, | |
| "step": 295, | |
| "token_acc": 0.7961879284400601, | |
| "train_speed(iter/s)": 0.05519 | |
| }, | |
| { | |
| "epoch": 0.292326431181486, | |
| "grad_norm": 0.43431806564331055, | |
| "learning_rate": 8.483846628868055e-05, | |
| "loss": 0.6408910751342773, | |
| "memory(GiB)": 383.55, | |
| "step": 300, | |
| "token_acc": 0.7999295526593871, | |
| "train_speed(iter/s)": 0.055286 | |
| }, | |
| { | |
| "epoch": 0.292326431181486, | |
| "eval_loss": 0.6525390148162842, | |
| "eval_runtime": 6.2823, | |
| "eval_samples_per_second": 0.637, | |
| "eval_steps_per_second": 0.637, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.2971985383678441, | |
| "grad_norm": 0.4932222068309784, | |
| "learning_rate": 8.425556009974566e-05, | |
| "loss": 0.6335715770721435, | |
| "memory(GiB)": 383.55, | |
| "step": 305, | |
| "token_acc": 0.8036400066789113, | |
| "train_speed(iter/s)": 0.054583 | |
| }, | |
| { | |
| "epoch": 0.3020706455542022, | |
| "grad_norm": 0.27842646837234497, | |
| "learning_rate": 8.366374461250916e-05, | |
| "loss": 0.570946216583252, | |
| "memory(GiB)": 383.55, | |
| "step": 310, | |
| "token_acc": 0.8238350381555447, | |
| "train_speed(iter/s)": 0.054659 | |
| }, | |
| { | |
| "epoch": 0.30694275274056027, | |
| "grad_norm": 0.7104659080505371, | |
| "learning_rate": 8.306317374828194e-05, | |
| "loss": 0.566010570526123, | |
| "memory(GiB)": 383.55, | |
| "step": 315, | |
| "token_acc": 0.8189669219488349, | |
| "train_speed(iter/s)": 0.054662 | |
| }, | |
| { | |
| "epoch": 0.3118148599269184, | |
| "grad_norm": 0.8684744834899902, | |
| "learning_rate": 8.245400370550198e-05, | |
| "loss": 0.677960729598999, | |
| "memory(GiB)": 383.55, | |
| "step": 320, | |
| "token_acc": 0.7772163527790538, | |
| "train_speed(iter/s)": 0.054794 | |
| }, | |
| { | |
| "epoch": 0.3166869671132765, | |
| "grad_norm": 0.3846539258956909, | |
| "learning_rate": 8.183639291910987e-05, | |
| "loss": 0.5622167587280273, | |
| "memory(GiB)": 383.55, | |
| "step": 325, | |
| "token_acc": 0.830480089318943, | |
| "train_speed(iter/s)": 0.054821 | |
| }, | |
| { | |
| "epoch": 0.3215590742996346, | |
| "grad_norm": 0.34010785818099976, | |
| "learning_rate": 8.121050201934235e-05, | |
| "loss": 0.5877705574035644, | |
| "memory(GiB)": 383.55, | |
| "step": 330, | |
| "token_acc": 0.8287964389659305, | |
| "train_speed(iter/s)": 0.054899 | |
| }, | |
| { | |
| "epoch": 0.3264311814859927, | |
| "grad_norm": 0.3751339912414551, | |
| "learning_rate": 8.057649378995526e-05, | |
| "loss": 0.5179604053497314, | |
| "memory(GiB)": 383.55, | |
| "step": 335, | |
| "token_acc": 0.8402439024390244, | |
| "train_speed(iter/s)": 0.054839 | |
| }, | |
| { | |
| "epoch": 0.3313032886723508, | |
| "grad_norm": 0.3137739896774292, | |
| "learning_rate": 7.993453312588607e-05, | |
| "loss": 0.5339327335357666, | |
| "memory(GiB)": 383.55, | |
| "step": 340, | |
| "token_acc": 0.8365357839042049, | |
| "train_speed(iter/s)": 0.054764 | |
| }, | |
| { | |
| "epoch": 0.33617539585870887, | |
| "grad_norm": 0.5746834874153137, | |
| "learning_rate": 7.928478699036755e-05, | |
| "loss": 0.6346034049987793, | |
| "memory(GiB)": 383.55, | |
| "step": 345, | |
| "token_acc": 0.7972016183412003, | |
| "train_speed(iter/s)": 0.054713 | |
| }, | |
| { | |
| "epoch": 0.341047503045067, | |
| "grad_norm": 0.3580325245857239, | |
| "learning_rate": 7.862742437150336e-05, | |
| "loss": 0.6608481884002686, | |
| "memory(GiB)": 383.55, | |
| "step": 350, | |
| "token_acc": 0.7929736511919699, | |
| "train_speed(iter/s)": 0.054762 | |
| }, | |
| { | |
| "epoch": 0.3459196102314251, | |
| "grad_norm": 0.4622519612312317, | |
| "learning_rate": 7.796261623831713e-05, | |
| "loss": 0.562419080734253, | |
| "memory(GiB)": 383.55, | |
| "step": 355, | |
| "token_acc": 0.8190336211647988, | |
| "train_speed(iter/s)": 0.054396 | |
| }, | |
| { | |
| "epoch": 0.3507917174177832, | |
| "grad_norm": 0.5616739392280579, | |
| "learning_rate": 7.729053549628622e-05, | |
| "loss": 0.5495719909667969, | |
| "memory(GiB)": 383.55, | |
| "step": 360, | |
| "token_acc": 0.8339377743844245, | |
| "train_speed(iter/s)": 0.054442 | |
| }, | |
| { | |
| "epoch": 0.3556638246041413, | |
| "grad_norm": 0.7364129424095154, | |
| "learning_rate": 7.661135694237198e-05, | |
| "loss": 0.4548810958862305, | |
| "memory(GiB)": 387.42, | |
| "step": 365, | |
| "token_acc": 0.8370827285921626, | |
| "train_speed(iter/s)": 0.054383 | |
| }, | |
| { | |
| "epoch": 0.36053593179049936, | |
| "grad_norm": 0.44831952452659607, | |
| "learning_rate": 7.592525721955786e-05, | |
| "loss": 0.5882142066955567, | |
| "memory(GiB)": 387.42, | |
| "step": 370, | |
| "token_acc": 0.8161894662424886, | |
| "train_speed(iter/s)": 0.054337 | |
| }, | |
| { | |
| "epoch": 0.3654080389768575, | |
| "grad_norm": 0.37750759720802307, | |
| "learning_rate": 7.523241477090763e-05, | |
| "loss": 0.6884512901306152, | |
| "memory(GiB)": 387.42, | |
| "step": 375, | |
| "token_acc": 0.7952127659574468, | |
| "train_speed(iter/s)": 0.054385 | |
| }, | |
| { | |
| "epoch": 0.3702801461632156, | |
| "grad_norm": 0.5074845552444458, | |
| "learning_rate": 7.45330097931553e-05, | |
| "loss": 0.5458427906036377, | |
| "memory(GiB)": 387.42, | |
| "step": 380, | |
| "token_acc": 0.8217197924388436, | |
| "train_speed(iter/s)": 0.054354 | |
| }, | |
| { | |
| "epoch": 0.3751522533495737, | |
| "grad_norm": 0.6083484292030334, | |
| "learning_rate": 7.382722418983892e-05, | |
| "loss": 0.5680232048034668, | |
| "memory(GiB)": 387.42, | |
| "step": 385, | |
| "token_acc": 0.8248374239563667, | |
| "train_speed(iter/s)": 0.054329 | |
| }, | |
| { | |
| "epoch": 0.3800243605359318, | |
| "grad_norm": 0.39138278365135193, | |
| "learning_rate": 7.311524152399054e-05, | |
| "loss": 0.7077183246612548, | |
| "memory(GiB)": 387.42, | |
| "step": 390, | |
| "token_acc": 0.7912014292094686, | |
| "train_speed(iter/s)": 0.054329 | |
| }, | |
| { | |
| "epoch": 0.3848964677222899, | |
| "grad_norm": 0.4244479238986969, | |
| "learning_rate": 7.239724697039457e-05, | |
| "loss": 0.6999778270721435, | |
| "memory(GiB)": 387.42, | |
| "step": 395, | |
| "token_acc": 0.7828650029475339, | |
| "train_speed(iter/s)": 0.054413 | |
| }, | |
| { | |
| "epoch": 0.38976857490864797, | |
| "grad_norm": 0.3658107817173004, | |
| "learning_rate": 7.167342726742685e-05, | |
| "loss": 0.5321448802947998, | |
| "memory(GiB)": 387.42, | |
| "step": 400, | |
| "token_acc": 0.8257604205782951, | |
| "train_speed(iter/s)": 0.054414 | |
| }, | |
| { | |
| "epoch": 0.38976857490864797, | |
| "eval_loss": 0.647614598274231, | |
| "eval_runtime": 6.1299, | |
| "eval_samples_per_second": 0.653, | |
| "eval_steps_per_second": 0.653, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.3946406820950061, | |
| "grad_norm": 0.4579378068447113, | |
| "learning_rate": 7.094397066848716e-05, | |
| "loss": 0.6339591979980469, | |
| "memory(GiB)": 387.42, | |
| "step": 405, | |
| "token_acc": 0.7953757225433526, | |
| "train_speed(iter/s)": 0.054198 | |
| }, | |
| { | |
| "epoch": 0.3995127892813642, | |
| "grad_norm": 0.41108816862106323, | |
| "learning_rate": 7.020906689303766e-05, | |
| "loss": 0.6498037338256836, | |
| "memory(GiB)": 387.42, | |
| "step": 410, | |
| "token_acc": 0.8013311819281969, | |
| "train_speed(iter/s)": 0.054274 | |
| }, | |
| { | |
| "epoch": 0.4043848964677223, | |
| "grad_norm": 0.3730790615081787, | |
| "learning_rate": 6.946890707726004e-05, | |
| "loss": 0.6224189281463623, | |
| "memory(GiB)": 387.42, | |
| "step": 415, | |
| "token_acc": 0.8109767441860465, | |
| "train_speed(iter/s)": 0.054342 | |
| }, | |
| { | |
| "epoch": 0.4092570036540804, | |
| "grad_norm": 0.41862693428993225, | |
| "learning_rate": 6.872368372434416e-05, | |
| "loss": 0.6285569190979003, | |
| "memory(GiB)": 387.42, | |
| "step": 420, | |
| "token_acc": 0.793915399041467, | |
| "train_speed(iter/s)": 0.054335 | |
| }, | |
| { | |
| "epoch": 0.41412911084043846, | |
| "grad_norm": 0.4861293435096741, | |
| "learning_rate": 6.797359065442117e-05, | |
| "loss": 0.5771468162536622, | |
| "memory(GiB)": 387.42, | |
| "step": 425, | |
| "token_acc": 0.8196579720158922, | |
| "train_speed(iter/s)": 0.054339 | |
| }, | |
| { | |
| "epoch": 0.4190012180267966, | |
| "grad_norm": 0.30941805243492126, | |
| "learning_rate": 6.721882295415425e-05, | |
| "loss": 0.5844586372375489, | |
| "memory(GiB)": 387.42, | |
| "step": 430, | |
| "token_acc": 0.814694173000362, | |
| "train_speed(iter/s)": 0.05432 | |
| }, | |
| { | |
| "epoch": 0.4238733252131547, | |
| "grad_norm": 0.3820112645626068, | |
| "learning_rate": 6.645957692599969e-05, | |
| "loss": 0.5823289394378662, | |
| "memory(GiB)": 387.42, | |
| "step": 435, | |
| "token_acc": 0.8027565654684299, | |
| "train_speed(iter/s)": 0.054382 | |
| }, | |
| { | |
| "epoch": 0.4287454323995128, | |
| "grad_norm": 0.3910198509693146, | |
| "learning_rate": 6.569605003715201e-05, | |
| "loss": 0.561509084701538, | |
| "memory(GiB)": 387.42, | |
| "step": 440, | |
| "token_acc": 0.8264751552795031, | |
| "train_speed(iter/s)": 0.054462 | |
| }, | |
| { | |
| "epoch": 0.4336175395858709, | |
| "grad_norm": 0.3805302381515503, | |
| "learning_rate": 6.492844086818599e-05, | |
| "loss": 0.558375883102417, | |
| "memory(GiB)": 387.42, | |
| "step": 445, | |
| "token_acc": 0.8262056414922657, | |
| "train_speed(iter/s)": 0.05444 | |
| }, | |
| { | |
| "epoch": 0.438489646772229, | |
| "grad_norm": 0.6036235690116882, | |
| "learning_rate": 6.41569490614092e-05, | |
| "loss": 0.6268420696258545, | |
| "memory(GiB)": 387.42, | |
| "step": 450, | |
| "token_acc": 0.8061224489795918, | |
| "train_speed(iter/s)": 0.054446 | |
| }, | |
| { | |
| "epoch": 0.44336175395858707, | |
| "grad_norm": 0.4275857210159302, | |
| "learning_rate": 6.338177526893836e-05, | |
| "loss": 0.5441042423248291, | |
| "memory(GiB)": 387.42, | |
| "step": 455, | |
| "token_acc": 0.8360881542699724, | |
| "train_speed(iter/s)": 0.05418 | |
| }, | |
| { | |
| "epoch": 0.4482338611449452, | |
| "grad_norm": 0.4830683469772339, | |
| "learning_rate": 6.260312110051312e-05, | |
| "loss": 0.606513261795044, | |
| "memory(GiB)": 387.42, | |
| "step": 460, | |
| "token_acc": 0.8049238864875023, | |
| "train_speed(iter/s)": 0.054224 | |
| }, | |
| { | |
| "epoch": 0.4531059683313033, | |
| "grad_norm": 0.35629284381866455, | |
| "learning_rate": 6.182118907106068e-05, | |
| "loss": 0.538546371459961, | |
| "memory(GiB)": 387.42, | |
| "step": 465, | |
| "token_acc": 0.8373831775700935, | |
| "train_speed(iter/s)": 0.054204 | |
| }, | |
| { | |
| "epoch": 0.4579780755176614, | |
| "grad_norm": 0.46749940514564514, | |
| "learning_rate": 6.103618254802511e-05, | |
| "loss": 0.5923898696899415, | |
| "memory(GiB)": 387.42, | |
| "step": 470, | |
| "token_acc": 0.8042936553574851, | |
| "train_speed(iter/s)": 0.054261 | |
| }, | |
| { | |
| "epoch": 0.4628501827040195, | |
| "grad_norm": 0.6278035044670105, | |
| "learning_rate": 6.024830569847477e-05, | |
| "loss": 0.5971939086914062, | |
| "memory(GiB)": 387.42, | |
| "step": 475, | |
| "token_acc": 0.8176121372031663, | |
| "train_speed(iter/s)": 0.054245 | |
| }, | |
| { | |
| "epoch": 0.46772228989037756, | |
| "grad_norm": 0.3572694957256317, | |
| "learning_rate": 5.945776343600207e-05, | |
| "loss": 0.5843085765838623, | |
| "memory(GiB)": 387.42, | |
| "step": 480, | |
| "token_acc": 0.8212882953652789, | |
| "train_speed(iter/s)": 0.054246 | |
| }, | |
| { | |
| "epoch": 0.4725943970767357, | |
| "grad_norm": 0.5189170241355896, | |
| "learning_rate": 5.866476136742862e-05, | |
| "loss": 0.5234210968017579, | |
| "memory(GiB)": 387.42, | |
| "step": 485, | |
| "token_acc": 0.8463819691577699, | |
| "train_speed(iter/s)": 0.05426 | |
| }, | |
| { | |
| "epoch": 0.4774665042630938, | |
| "grad_norm": 0.41832658648490906, | |
| "learning_rate": 5.7869505739330546e-05, | |
| "loss": 0.6695927619934082, | |
| "memory(GiB)": 387.42, | |
| "step": 490, | |
| "token_acc": 0.7924812030075188, | |
| "train_speed(iter/s)": 0.05433 | |
| }, | |
| { | |
| "epoch": 0.4823386114494519, | |
| "grad_norm": 4.011805534362793, | |
| "learning_rate": 5.7072203384397064e-05, | |
| "loss": 0.5814547538757324, | |
| "memory(GiB)": 387.42, | |
| "step": 495, | |
| "token_acc": 0.8110627719080175, | |
| "train_speed(iter/s)": 0.054376 | |
| }, | |
| { | |
| "epoch": 0.48721071863581, | |
| "grad_norm": 0.31671130657196045, | |
| "learning_rate": 5.627306166763684e-05, | |
| "loss": 0.5855265617370605, | |
| "memory(GiB)": 387.42, | |
| "step": 500, | |
| "token_acc": 0.8094142629623076, | |
| "train_speed(iter/s)": 0.054362 | |
| }, | |
| { | |
| "epoch": 0.48721071863581, | |
| "eval_loss": 0.6302051544189453, | |
| "eval_runtime": 6.1545, | |
| "eval_samples_per_second": 0.65, | |
| "eval_steps_per_second": 0.65, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4920828258221681, | |
| "grad_norm": 0.3875284194946289, | |
| "learning_rate": 5.5472288432445774e-05, | |
| "loss": 0.59937744140625, | |
| "memory(GiB)": 387.42, | |
| "step": 505, | |
| "token_acc": 0.7988918837975442, | |
| "train_speed(iter/s)": 0.05424 | |
| }, | |
| { | |
| "epoch": 0.49695493300852617, | |
| "grad_norm": 0.4411413371562958, | |
| "learning_rate": 5.467009194655045e-05, | |
| "loss": 0.5820174217224121, | |
| "memory(GiB)": 387.42, | |
| "step": 510, | |
| "token_acc": 0.8234812510234157, | |
| "train_speed(iter/s)": 0.054197 | |
| }, | |
| { | |
| "epoch": 0.5018270401948843, | |
| "grad_norm": 0.5111451148986816, | |
| "learning_rate": 5.386668084784112e-05, | |
| "loss": 0.5154130935668946, | |
| "memory(GiB)": 387.42, | |
| "step": 515, | |
| "token_acc": 0.8397686998694274, | |
| "train_speed(iter/s)": 0.05426 | |
| }, | |
| { | |
| "epoch": 0.5066991473812423, | |
| "grad_norm": 0.29832109808921814, | |
| "learning_rate": 5.306226409010855e-05, | |
| "loss": 0.5672587394714356, | |
| "memory(GiB)": 387.42, | |
| "step": 520, | |
| "token_acc": 0.8263521756811713, | |
| "train_speed(iter/s)": 0.054274 | |
| }, | |
| { | |
| "epoch": 0.5115712545676004, | |
| "grad_norm": 0.42139527201652527, | |
| "learning_rate": 5.22570508886986e-05, | |
| "loss": 0.5327470302581787, | |
| "memory(GiB)": 387.42, | |
| "step": 525, | |
| "token_acc": 0.8310478199718706, | |
| "train_speed(iter/s)": 0.054332 | |
| }, | |
| { | |
| "epoch": 0.5164433617539586, | |
| "grad_norm": 0.34750285744667053, | |
| "learning_rate": 5.145125066609877e-05, | |
| "loss": 0.61210618019104, | |
| "memory(GiB)": 387.42, | |
| "step": 530, | |
| "token_acc": 0.8104413702239789, | |
| "train_speed(iter/s)": 0.054325 | |
| }, | |
| { | |
| "epoch": 0.5213154689403167, | |
| "grad_norm": 0.5557289123535156, | |
| "learning_rate": 5.0645072997471e-05, | |
| "loss": 0.5486731052398681, | |
| "memory(GiB)": 387.42, | |
| "step": 535, | |
| "token_acc": 0.8223992502343018, | |
| "train_speed(iter/s)": 0.054295 | |
| }, | |
| { | |
| "epoch": 0.5261875761266748, | |
| "grad_norm": 1.370209813117981, | |
| "learning_rate": 4.983872755614461e-05, | |
| "loss": 0.6499679565429688, | |
| "memory(GiB)": 387.42, | |
| "step": 540, | |
| "token_acc": 0.7975866095757104, | |
| "train_speed(iter/s)": 0.054348 | |
| }, | |
| { | |
| "epoch": 0.5310596833130329, | |
| "grad_norm": 0.4371365010738373, | |
| "learning_rate": 4.9032424059083774e-05, | |
| "loss": 0.43409147262573244, | |
| "memory(GiB)": 387.42, | |
| "step": 545, | |
| "token_acc": 0.8684942391736193, | |
| "train_speed(iter/s)": 0.054321 | |
| }, | |
| { | |
| "epoch": 0.535931790499391, | |
| "grad_norm": 0.4735865890979767, | |
| "learning_rate": 4.8226372212343726e-05, | |
| "loss": 0.5776564598083496, | |
| "memory(GiB)": 387.42, | |
| "step": 550, | |
| "token_acc": 0.8255653883972468, | |
| "train_speed(iter/s)": 0.054368 | |
| }, | |
| { | |
| "epoch": 0.5408038976857491, | |
| "grad_norm": 0.6005700826644897, | |
| "learning_rate": 4.742078165652958e-05, | |
| "loss": 0.5744057178497315, | |
| "memory(GiB)": 387.42, | |
| "step": 555, | |
| "token_acc": 0.8105436573311368, | |
| "train_speed(iter/s)": 0.054325 | |
| }, | |
| { | |
| "epoch": 0.5456760048721072, | |
| "grad_norm": 0.4128513038158417, | |
| "learning_rate": 4.661586191227247e-05, | |
| "loss": 0.5321125030517578, | |
| "memory(GiB)": 387.42, | |
| "step": 560, | |
| "token_acc": 0.8245080500894454, | |
| "train_speed(iter/s)": 0.054305 | |
| }, | |
| { | |
| "epoch": 0.5505481120584653, | |
| "grad_norm": 0.4688722491264343, | |
| "learning_rate": 4.581182232573658e-05, | |
| "loss": 0.5235236167907715, | |
| "memory(GiB)": 387.42, | |
| "step": 565, | |
| "token_acc": 0.8205183122724352, | |
| "train_speed(iter/s)": 0.054352 | |
| }, | |
| { | |
| "epoch": 0.5554202192448234, | |
| "grad_norm": 0.4604549705982208, | |
| "learning_rate": 4.500887201417187e-05, | |
| "loss": 0.6571295261383057, | |
| "memory(GiB)": 387.42, | |
| "step": 570, | |
| "token_acc": 0.8019607843137255, | |
| "train_speed(iter/s)": 0.054361 | |
| }, | |
| { | |
| "epoch": 0.5602923264311814, | |
| "grad_norm": 0.48336780071258545, | |
| "learning_rate": 4.4207219811526056e-05, | |
| "loss": 0.5963138580322266, | |
| "memory(GiB)": 387.42, | |
| "step": 575, | |
| "token_acc": 0.8077416987708678, | |
| "train_speed(iter/s)": 0.054409 | |
| }, | |
| { | |
| "epoch": 0.5651644336175395, | |
| "grad_norm": 0.5700681805610657, | |
| "learning_rate": 4.3407074214130446e-05, | |
| "loss": 0.6309503555297852, | |
| "memory(GiB)": 387.42, | |
| "step": 580, | |
| "token_acc": 0.7960770454143842, | |
| "train_speed(iter/s)": 0.054412 | |
| }, | |
| { | |
| "epoch": 0.5700365408038977, | |
| "grad_norm": 0.40493443608283997, | |
| "learning_rate": 4.2608643326473496e-05, | |
| "loss": 0.5265829563140869, | |
| "memory(GiB)": 387.42, | |
| "step": 585, | |
| "token_acc": 0.8364477970169724, | |
| "train_speed(iter/s)": 0.054419 | |
| }, | |
| { | |
| "epoch": 0.5749086479902558, | |
| "grad_norm": 0.42441654205322266, | |
| "learning_rate": 4.181213480707637e-05, | |
| "loss": 0.5463868618011475, | |
| "memory(GiB)": 387.42, | |
| "step": 590, | |
| "token_acc": 0.8250831178426302, | |
| "train_speed(iter/s)": 0.054415 | |
| }, | |
| { | |
| "epoch": 0.5797807551766139, | |
| "grad_norm": 0.5273870825767517, | |
| "learning_rate": 4.1017755814484374e-05, | |
| "loss": 0.6219929218292236, | |
| "memory(GiB)": 387.42, | |
| "step": 595, | |
| "token_acc": 0.8101965601965602, | |
| "train_speed(iter/s)": 0.054492 | |
| }, | |
| { | |
| "epoch": 0.584652862362972, | |
| "grad_norm": 0.5027340650558472, | |
| "learning_rate": 4.0225712953388494e-05, | |
| "loss": 0.47921223640441896, | |
| "memory(GiB)": 387.42, | |
| "step": 600, | |
| "token_acc": 0.8507462686567164, | |
| "train_speed(iter/s)": 0.054456 | |
| }, | |
| { | |
| "epoch": 0.584652862362972, | |
| "eval_loss": 0.5931864976882935, | |
| "eval_runtime": 6.2202, | |
| "eval_samples_per_second": 0.643, | |
| "eval_steps_per_second": 0.643, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5895249695493301, | |
| "grad_norm": 0.7974056005477905, | |
| "learning_rate": 3.943621222089102e-05, | |
| "loss": 0.5052922248840332, | |
| "memory(GiB)": 387.42, | |
| "step": 605, | |
| "token_acc": 0.8312937062937062, | |
| "train_speed(iter/s)": 0.054258 | |
| }, | |
| { | |
| "epoch": 0.5943970767356882, | |
| "grad_norm": 0.38420093059539795, | |
| "learning_rate": 3.864945895292908e-05, | |
| "loss": 0.5411774635314941, | |
| "memory(GiB)": 387.42, | |
| "step": 610, | |
| "token_acc": 0.8309124767225325, | |
| "train_speed(iter/s)": 0.054201 | |
| }, | |
| { | |
| "epoch": 0.5992691839220463, | |
| "grad_norm": 0.9411633014678955, | |
| "learning_rate": 3.786565777087022e-05, | |
| "loss": 0.6929959297180176, | |
| "memory(GiB)": 387.42, | |
| "step": 615, | |
| "token_acc": 0.7847842261904762, | |
| "train_speed(iter/s)": 0.05425 | |
| }, | |
| { | |
| "epoch": 0.6041412911084044, | |
| "grad_norm": 0.35226595401763916, | |
| "learning_rate": 3.708501252829386e-05, | |
| "loss": 0.5966301918029785, | |
| "memory(GiB)": 387.42, | |
| "step": 620, | |
| "token_acc": 0.8161076443057722, | |
| "train_speed(iter/s)": 0.054233 | |
| }, | |
| { | |
| "epoch": 0.6090133982947625, | |
| "grad_norm": 0.4208815097808838, | |
| "learning_rate": 3.6307726257972255e-05, | |
| "loss": 0.5394818782806396, | |
| "memory(GiB)": 387.42, | |
| "step": 625, | |
| "token_acc": 0.8257628294036061, | |
| "train_speed(iter/s)": 0.054209 | |
| }, | |
| { | |
| "epoch": 0.6138855054811205, | |
| "grad_norm": 0.445925772190094, | |
| "learning_rate": 3.553400111906523e-05, | |
| "loss": 0.6164620399475098, | |
| "memory(GiB)": 387.42, | |
| "step": 630, | |
| "token_acc": 0.8090881366270204, | |
| "train_speed(iter/s)": 0.054222 | |
| }, | |
| { | |
| "epoch": 0.6187576126674786, | |
| "grad_norm": 0.5922476649284363, | |
| "learning_rate": 3.476403834454183e-05, | |
| "loss": 0.5115623474121094, | |
| "memory(GiB)": 387.42, | |
| "step": 635, | |
| "token_acc": 0.8346325167037862, | |
| "train_speed(iter/s)": 0.054244 | |
| }, | |
| { | |
| "epoch": 0.6236297198538368, | |
| "grad_norm": 0.5026776790618896, | |
| "learning_rate": 3.399803818884311e-05, | |
| "loss": 0.5328683853149414, | |
| "memory(GiB)": 387.42, | |
| "step": 640, | |
| "token_acc": 0.8462420173571311, | |
| "train_speed(iter/s)": 0.054264 | |
| }, | |
| { | |
| "epoch": 0.6285018270401949, | |
| "grad_norm": 0.45468801259994507, | |
| "learning_rate": 3.323619987579914e-05, | |
| "loss": 0.6177504062652588, | |
| "memory(GiB)": 387.42, | |
| "step": 645, | |
| "token_acc": 0.80891932520461, | |
| "train_speed(iter/s)": 0.054261 | |
| }, | |
| { | |
| "epoch": 0.633373934226553, | |
| "grad_norm": 0.6319808959960938, | |
| "learning_rate": 3.247872154681439e-05, | |
| "loss": 0.5958673000335694, | |
| "memory(GiB)": 387.42, | |
| "step": 650, | |
| "token_acc": 0.8096597145993414, | |
| "train_speed(iter/s)": 0.054221 | |
| }, | |
| { | |
| "epoch": 0.6382460414129111, | |
| "grad_norm": 0.4812871217727661, | |
| "learning_rate": 3.172580020933442e-05, | |
| "loss": 0.5768674850463867, | |
| "memory(GiB)": 387.42, | |
| "step": 655, | |
| "token_acc": 0.8165027102991367, | |
| "train_speed(iter/s)": 0.054185 | |
| }, | |
| { | |
| "epoch": 0.6431181485992692, | |
| "grad_norm": 0.9395345449447632, | |
| "learning_rate": 3.097763168560741e-05, | |
| "loss": 0.674397611618042, | |
| "memory(GiB)": 387.42, | |
| "step": 660, | |
| "token_acc": 0.7806563039723662, | |
| "train_speed(iter/s)": 0.054211 | |
| }, | |
| { | |
| "epoch": 0.6479902557856273, | |
| "grad_norm": 0.5097836852073669, | |
| "learning_rate": 3.0234410561754257e-05, | |
| "loss": 0.5154216766357422, | |
| "memory(GiB)": 387.42, | |
| "step": 665, | |
| "token_acc": 0.8327868852459016, | |
| "train_speed(iter/s)": 0.054197 | |
| }, | |
| { | |
| "epoch": 0.6528623629719854, | |
| "grad_norm": 0.3545515239238739, | |
| "learning_rate": 2.949633013715982e-05, | |
| "loss": 0.5994223117828369, | |
| "memory(GiB)": 387.42, | |
| "step": 670, | |
| "token_acc": 0.8076275080410477, | |
| "train_speed(iter/s)": 0.054247 | |
| }, | |
| { | |
| "epoch": 0.6577344701583435, | |
| "grad_norm": 0.9892140030860901, | |
| "learning_rate": 2.8763582374199126e-05, | |
| "loss": 0.5891304969787597, | |
| "memory(GiB)": 387.42, | |
| "step": 675, | |
| "token_acc": 0.8036573628488932, | |
| "train_speed(iter/s)": 0.054243 | |
| }, | |
| { | |
| "epoch": 0.6626065773447016, | |
| "grad_norm": 0.5605654716491699, | |
| "learning_rate": 2.8036357848311012e-05, | |
| "loss": 0.5478427410125732, | |
| "memory(GiB)": 387.42, | |
| "step": 680, | |
| "token_acc": 0.8287547623821937, | |
| "train_speed(iter/s)": 0.054281 | |
| }, | |
| { | |
| "epoch": 0.6674786845310596, | |
| "grad_norm": 0.4100501239299774, | |
| "learning_rate": 2.7314845698432805e-05, | |
| "loss": 0.6083401203155517, | |
| "memory(GiB)": 387.42, | |
| "step": 685, | |
| "token_acc": 0.7989271180170181, | |
| "train_speed(iter/s)": 0.054288 | |
| }, | |
| { | |
| "epoch": 0.6723507917174177, | |
| "grad_norm": 0.4639231562614441, | |
| "learning_rate": 2.659923357780828e-05, | |
| "loss": 0.5717390060424805, | |
| "memory(GiB)": 387.42, | |
| "step": 690, | |
| "token_acc": 0.8201791448369106, | |
| "train_speed(iter/s)": 0.054301 | |
| }, | |
| { | |
| "epoch": 0.6772228989037758, | |
| "grad_norm": 0.30558013916015625, | |
| "learning_rate": 2.5889707605182347e-05, | |
| "loss": 0.4964598178863525, | |
| "memory(GiB)": 387.42, | |
| "step": 695, | |
| "token_acc": 0.8518634024637455, | |
| "train_speed(iter/s)": 0.054314 | |
| }, | |
| { | |
| "epoch": 0.682095006090134, | |
| "grad_norm": 0.490887314081192, | |
| "learning_rate": 2.518645231639457e-05, | |
| "loss": 0.6779924392700195, | |
| "memory(GiB)": 387.42, | |
| "step": 700, | |
| "token_acc": 0.7798953662182362, | |
| "train_speed(iter/s)": 0.054375 | |
| }, | |
| { | |
| "epoch": 0.682095006090134, | |
| "eval_loss": 0.587890625, | |
| "eval_runtime": 6.016, | |
| "eval_samples_per_second": 0.665, | |
| "eval_steps_per_second": 0.665, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6869671132764921, | |
| "grad_norm": 0.9540379047393799, | |
| "learning_rate": 2.4489650616384507e-05, | |
| "loss": 0.5919107437133789, | |
| "memory(GiB)": 387.42, | |
| "step": 705, | |
| "token_acc": 0.8063427800269906, | |
| "train_speed(iter/s)": 0.054286 | |
| }, | |
| { | |
| "epoch": 0.6918392204628502, | |
| "grad_norm": 0.4385371208190918, | |
| "learning_rate": 2.3799483731621237e-05, | |
| "loss": 0.5554671287536621, | |
| "memory(GiB)": 387.42, | |
| "step": 710, | |
| "token_acc": 0.8227891742802965, | |
| "train_speed(iter/s)": 0.054309 | |
| }, | |
| { | |
| "epoch": 0.6967113276492083, | |
| "grad_norm": 0.37225764989852905, | |
| "learning_rate": 2.311613116296929e-05, | |
| "loss": 0.5223379611968995, | |
| "memory(GiB)": 387.42, | |
| "step": 715, | |
| "token_acc": 0.8422697368421053, | |
| "train_speed(iter/s)": 0.054303 | |
| }, | |
| { | |
| "epoch": 0.7015834348355664, | |
| "grad_norm": 0.6227976083755493, | |
| "learning_rate": 2.2439770639003627e-05, | |
| "loss": 0.5609029769897461, | |
| "memory(GiB)": 387.42, | |
| "step": 720, | |
| "token_acc": 0.8244803695150116, | |
| "train_speed(iter/s)": 0.054309 | |
| }, | |
| { | |
| "epoch": 0.7064555420219245, | |
| "grad_norm": 0.4218509793281555, | |
| "learning_rate": 2.177057806978522e-05, | |
| "loss": 0.5789398193359375, | |
| "memory(GiB)": 387.42, | |
| "step": 725, | |
| "token_acc": 0.8195275590551181, | |
| "train_speed(iter/s)": 0.054317 | |
| }, | |
| { | |
| "epoch": 0.7113276492082826, | |
| "grad_norm": 0.5081908106803894, | |
| "learning_rate": 2.110872750110996e-05, | |
| "loss": 0.49318413734436034, | |
| "memory(GiB)": 387.42, | |
| "step": 730, | |
| "token_acc": 0.8306063522617901, | |
| "train_speed(iter/s)": 0.05436 | |
| }, | |
| { | |
| "epoch": 0.7161997563946407, | |
| "grad_norm": 0.6738778352737427, | |
| "learning_rate": 2.045439106924217e-05, | |
| "loss": 0.55146803855896, | |
| "memory(GiB)": 387.42, | |
| "step": 735, | |
| "token_acc": 0.8200392927308447, | |
| "train_speed(iter/s)": 0.054367 | |
| }, | |
| { | |
| "epoch": 0.7210718635809987, | |
| "grad_norm": 0.43147921562194824, | |
| "learning_rate": 1.980773895614481e-05, | |
| "loss": 0.574643898010254, | |
| "memory(GiB)": 387.42, | |
| "step": 740, | |
| "token_acc": 0.8172221384406575, | |
| "train_speed(iter/s)": 0.054386 | |
| }, | |
| { | |
| "epoch": 0.7259439707673568, | |
| "grad_norm": 0.5750350952148438, | |
| "learning_rate": 1.9168939345218095e-05, | |
| "loss": 0.5682173728942871, | |
| "memory(GiB)": 387.42, | |
| "step": 745, | |
| "token_acc": 0.8214421252371916, | |
| "train_speed(iter/s)": 0.054395 | |
| }, | |
| { | |
| "epoch": 0.730816077953715, | |
| "grad_norm": 0.461907297372818, | |
| "learning_rate": 1.8538158377557702e-05, | |
| "loss": 0.5272111415863037, | |
| "memory(GiB)": 387.42, | |
| "step": 750, | |
| "token_acc": 0.8257032542746828, | |
| "train_speed(iter/s)": 0.054421 | |
| }, | |
| { | |
| "epoch": 0.7356881851400731, | |
| "grad_norm": 0.794235348701477, | |
| "learning_rate": 1.791556010874434e-05, | |
| "loss": 0.6292970180511475, | |
| "memory(GiB)": 387.42, | |
| "step": 755, | |
| "token_acc": 0.810012836970475, | |
| "train_speed(iter/s)": 0.054353 | |
| }, | |
| { | |
| "epoch": 0.7405602923264312, | |
| "grad_norm": 0.6189777851104736, | |
| "learning_rate": 1.7301306466175533e-05, | |
| "loss": 0.5557656288146973, | |
| "memory(GiB)": 387.42, | |
| "step": 760, | |
| "token_acc": 0.8259242957746479, | |
| "train_speed(iter/s)": 0.054349 | |
| }, | |
| { | |
| "epoch": 0.7454323995127893, | |
| "grad_norm": 0.4845249056816101, | |
| "learning_rate": 1.6695557206951144e-05, | |
| "loss": 0.49696760177612304, | |
| "memory(GiB)": 389.68, | |
| "step": 765, | |
| "token_acc": 0.8422638261243813, | |
| "train_speed(iter/s)": 0.054323 | |
| }, | |
| { | |
| "epoch": 0.7503045066991474, | |
| "grad_norm": 0.4710843563079834, | |
| "learning_rate": 1.6098469876323093e-05, | |
| "loss": 0.47034273147583006, | |
| "memory(GiB)": 389.68, | |
| "step": 770, | |
| "token_acc": 0.8487571701720842, | |
| "train_speed(iter/s)": 0.05434 | |
| }, | |
| { | |
| "epoch": 0.7551766138855055, | |
| "grad_norm": 0.45380252599716187, | |
| "learning_rate": 1.551019976672058e-05, | |
| "loss": 0.5777853488922119, | |
| "memory(GiB)": 389.68, | |
| "step": 775, | |
| "token_acc": 0.8110020910406949, | |
| "train_speed(iter/s)": 0.054377 | |
| }, | |
| { | |
| "epoch": 0.7600487210718636, | |
| "grad_norm": 0.5304797291755676, | |
| "learning_rate": 1.4930899877361015e-05, | |
| "loss": 0.5180749416351318, | |
| "memory(GiB)": 389.68, | |
| "step": 780, | |
| "token_acc": 0.8334659769200159, | |
| "train_speed(iter/s)": 0.05443 | |
| }, | |
| { | |
| "epoch": 0.7649208282582217, | |
| "grad_norm": 0.447553426027298, | |
| "learning_rate": 1.4360720874457607e-05, | |
| "loss": 0.5336573123931885, | |
| "memory(GiB)": 389.68, | |
| "step": 785, | |
| "token_acc": 0.8346641615782058, | |
| "train_speed(iter/s)": 0.054438 | |
| }, | |
| { | |
| "epoch": 0.7697929354445798, | |
| "grad_norm": 0.5468970537185669, | |
| "learning_rate": 1.3799811052033467e-05, | |
| "loss": 0.6092133522033691, | |
| "memory(GiB)": 389.68, | |
| "step": 790, | |
| "token_acc": 0.7997620261771206, | |
| "train_speed(iter/s)": 0.054456 | |
| }, | |
| { | |
| "epoch": 0.7746650426309378, | |
| "grad_norm": 0.6424246430397034, | |
| "learning_rate": 1.3248316293352946e-05, | |
| "loss": 0.6084504127502441, | |
| "memory(GiB)": 389.68, | |
| "step": 795, | |
| "token_acc": 0.8091853471842537, | |
| "train_speed(iter/s)": 0.05451 | |
| }, | |
| { | |
| "epoch": 0.7795371498172959, | |
| "grad_norm": 0.5339289903640747, | |
| "learning_rate": 1.2706380032979691e-05, | |
| "loss": 0.535353136062622, | |
| "memory(GiB)": 389.68, | |
| "step": 800, | |
| "token_acc": 0.8231229847996315, | |
| "train_speed(iter/s)": 0.054509 | |
| }, | |
| { | |
| "epoch": 0.7795371498172959, | |
| "eval_loss": 0.587626039981842, | |
| "eval_runtime": 6.1485, | |
| "eval_samples_per_second": 0.651, | |
| "eval_steps_per_second": 0.651, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.784409257003654, | |
| "grad_norm": 0.47259068489074707, | |
| "learning_rate": 1.2174143219471878e-05, | |
| "loss": 0.6263217449188232, | |
| "memory(GiB)": 389.68, | |
| "step": 805, | |
| "token_acc": 0.7991557070953077, | |
| "train_speed(iter/s)": 0.054434 | |
| }, | |
| { | |
| "epoch": 0.7892813641900122, | |
| "grad_norm": 0.5547453761100769, | |
| "learning_rate": 1.1651744278723687e-05, | |
| "loss": 0.5090929985046386, | |
| "memory(GiB)": 389.68, | |
| "step": 810, | |
| "token_acc": 0.8354404976921533, | |
| "train_speed(iter/s)": 0.054448 | |
| }, | |
| { | |
| "epoch": 0.7941534713763703, | |
| "grad_norm": 0.4848991930484772, | |
| "learning_rate": 1.1139319077963178e-05, | |
| "loss": 0.5273432254791259, | |
| "memory(GiB)": 389.68, | |
| "step": 815, | |
| "token_acc": 0.8295368261199696, | |
| "train_speed(iter/s)": 0.054475 | |
| }, | |
| { | |
| "epoch": 0.7990255785627284, | |
| "grad_norm": 0.5590830445289612, | |
| "learning_rate": 1.0637000890415388e-05, | |
| "loss": 0.6279808044433594, | |
| "memory(GiB)": 389.68, | |
| "step": 820, | |
| "token_acc": 0.8061934585942937, | |
| "train_speed(iter/s)": 0.054494 | |
| }, | |
| { | |
| "epoch": 0.8038976857490865, | |
| "grad_norm": 1.119874358177185, | |
| "learning_rate": 1.0144920360640303e-05, | |
| "loss": 0.6255881309509277, | |
| "memory(GiB)": 389.68, | |
| "step": 825, | |
| "token_acc": 0.8063498323802012, | |
| "train_speed(iter/s)": 0.0545 | |
| }, | |
| { | |
| "epoch": 0.8087697929354446, | |
| "grad_norm": 0.4502837359905243, | |
| "learning_rate": 9.663205470554276e-06, | |
| "loss": 0.5530724048614502, | |
| "memory(GiB)": 389.68, | |
| "step": 830, | |
| "token_acc": 0.8286991062562066, | |
| "train_speed(iter/s)": 0.054498 | |
| }, | |
| { | |
| "epoch": 0.8136419001218027, | |
| "grad_norm": 0.47327640652656555, | |
| "learning_rate": 9.19198150614417e-06, | |
| "loss": 0.6426435470581054, | |
| "memory(GiB)": 389.68, | |
| "step": 835, | |
| "token_acc": 0.7995495495495496, | |
| "train_speed(iter/s)": 0.054482 | |
| }, | |
| { | |
| "epoch": 0.8185140073081608, | |
| "grad_norm": 0.45425912737846375, | |
| "learning_rate": 8.73137102488249e-06, | |
| "loss": 0.5113016128540039, | |
| "memory(GiB)": 389.68, | |
| "step": 840, | |
| "token_acc": 0.8368200836820083, | |
| "train_speed(iter/s)": 0.054528 | |
| }, | |
| { | |
| "epoch": 0.8233861144945189, | |
| "grad_norm": 0.5594798922538757, | |
| "learning_rate": 8.28149382385231e-06, | |
| "loss": 0.5977861881256104, | |
| "memory(GiB)": 389.68, | |
| "step": 845, | |
| "token_acc": 0.8159670164917541, | |
| "train_speed(iter/s)": 0.054545 | |
| }, | |
| { | |
| "epoch": 0.8282582216808769, | |
| "grad_norm": 0.38594865798950195, | |
| "learning_rate": 7.842466908590006e-06, | |
| "loss": 0.5546538829803467, | |
| "memory(GiB)": 389.68, | |
| "step": 850, | |
| "token_acc": 0.8362763915547025, | |
| "train_speed(iter/s)": 0.05454 | |
| }, | |
| { | |
| "epoch": 0.833130328867235, | |
| "grad_norm": 0.6128694415092468, | |
| "learning_rate": 7.414404462654051e-06, | |
| "loss": 0.5578857898712158, | |
| "memory(GiB)": 389.68, | |
| "step": 855, | |
| "token_acc": 0.8173973075595443, | |
| "train_speed(iter/s)": 0.054466 | |
| }, | |
| { | |
| "epoch": 0.8380024360535931, | |
| "grad_norm": 0.5973862409591675, | |
| "learning_rate": 6.997417817927865e-06, | |
| "loss": 0.6116644382476807, | |
| "memory(GiB)": 389.68, | |
| "step": 860, | |
| "token_acc": 0.8100558659217877, | |
| "train_speed(iter/s)": 0.054467 | |
| }, | |
| { | |
| "epoch": 0.8428745432399513, | |
| "grad_norm": 0.5695779323577881, | |
| "learning_rate": 6.591615425664144e-06, | |
| "loss": 0.6063879013061524, | |
| "memory(GiB)": 389.68, | |
| "step": 865, | |
| "token_acc": 0.8113871180479226, | |
| "train_speed(iter/s)": 0.054502 | |
| }, | |
| { | |
| "epoch": 0.8477466504263094, | |
| "grad_norm": 0.37414440512657166, | |
| "learning_rate": 6.197102828278611e-06, | |
| "loss": 0.5134734153747559, | |
| "memory(GiB)": 389.68, | |
| "step": 870, | |
| "token_acc": 0.8304152076038019, | |
| "train_speed(iter/s)": 0.054524 | |
| }, | |
| { | |
| "epoch": 0.8526187576126675, | |
| "grad_norm": 0.8222331404685974, | |
| "learning_rate": 5.813982631900122e-06, | |
| "loss": 0.5653984069824218, | |
| "memory(GiB)": 389.68, | |
| "step": 875, | |
| "token_acc": 0.8229976496112819, | |
| "train_speed(iter/s)": 0.054534 | |
| }, | |
| { | |
| "epoch": 0.8574908647990256, | |
| "grad_norm": 0.3609310984611511, | |
| "learning_rate": 5.442354479684558e-06, | |
| "loss": 0.49175424575805665, | |
| "memory(GiB)": 389.68, | |
| "step": 880, | |
| "token_acc": 0.8409646976581615, | |
| "train_speed(iter/s)": 0.054533 | |
| }, | |
| { | |
| "epoch": 0.8623629719853837, | |
| "grad_norm": 0.6293960213661194, | |
| "learning_rate": 5.082315025899315e-06, | |
| "loss": 0.604953384399414, | |
| "memory(GiB)": 389.68, | |
| "step": 885, | |
| "token_acc": 0.8073544433094995, | |
| "train_speed(iter/s)": 0.05455 | |
| }, | |
| { | |
| "epoch": 0.8672350791717418, | |
| "grad_norm": 0.4242098331451416, | |
| "learning_rate": 4.733957910785114e-06, | |
| "loss": 0.4986411571502686, | |
| "memory(GiB)": 389.68, | |
| "step": 890, | |
| "token_acc": 0.8444040036396724, | |
| "train_speed(iter/s)": 0.054562 | |
| }, | |
| { | |
| "epoch": 0.8721071863580999, | |
| "grad_norm": 0.5025205612182617, | |
| "learning_rate": 4.397373736201782e-06, | |
| "loss": 0.5355000495910645, | |
| "memory(GiB)": 389.68, | |
| "step": 895, | |
| "token_acc": 0.8340460526315789, | |
| "train_speed(iter/s)": 0.054564 | |
| }, | |
| { | |
| "epoch": 0.876979293544458, | |
| "grad_norm": 0.42587506771087646, | |
| "learning_rate": 4.072650042064174e-06, | |
| "loss": 0.6113440513610839, | |
| "memory(GiB)": 389.68, | |
| "step": 900, | |
| "token_acc": 0.8042306924765515, | |
| "train_speed(iter/s)": 0.054571 | |
| }, | |
| { | |
| "epoch": 0.876979293544458, | |
| "eval_loss": 0.5867875814437866, | |
| "eval_runtime": 6.1618, | |
| "eval_samples_per_second": 0.649, | |
| "eval_steps_per_second": 0.649, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.881851400730816, | |
| "grad_norm": 0.6062163710594177, | |
| "learning_rate": 3.759871283574562e-06, | |
| "loss": 0.5853659629821777, | |
| "memory(GiB)": 389.68, | |
| "step": 905, | |
| "token_acc": 0.8163235076284995, | |
| "train_speed(iter/s)": 0.054495 | |
| }, | |
| { | |
| "epoch": 0.8867235079171741, | |
| "grad_norm": 0.5810290575027466, | |
| "learning_rate": 3.4591188092571893e-06, | |
| "loss": 0.5189132213592529, | |
| "memory(GiB)": 389.68, | |
| "step": 910, | |
| "token_acc": 0.848421052631579, | |
| "train_speed(iter/s)": 0.054517 | |
| }, | |
| { | |
| "epoch": 0.8915956151035322, | |
| "grad_norm": 0.5703849196434021, | |
| "learning_rate": 3.1704708398009486e-06, | |
| "loss": 0.5976828575134278, | |
| "memory(GiB)": 389.68, | |
| "step": 915, | |
| "token_acc": 0.808837066584842, | |
| "train_speed(iter/s)": 0.05451 | |
| }, | |
| { | |
| "epoch": 0.8964677222898904, | |
| "grad_norm": 0.5777165293693542, | |
| "learning_rate": 2.894002447715399e-06, | |
| "loss": 0.5165195465087891, | |
| "memory(GiB)": 389.68, | |
| "step": 920, | |
| "token_acc": 0.8424015009380863, | |
| "train_speed(iter/s)": 0.054567 | |
| }, | |
| { | |
| "epoch": 0.9013398294762485, | |
| "grad_norm": 0.48375067114830017, | |
| "learning_rate": 2.6297855378057623e-06, | |
| "loss": 0.46347522735595703, | |
| "memory(GiB)": 389.68, | |
| "step": 925, | |
| "token_acc": 0.8408729585200173, | |
| "train_speed(iter/s)": 0.054561 | |
| }, | |
| { | |
| "epoch": 0.9062119366626066, | |
| "grad_norm": 0.4930781126022339, | |
| "learning_rate": 2.3778888284716193e-06, | |
| "loss": 0.6031323909759522, | |
| "memory(GiB)": 389.68, | |
| "step": 930, | |
| "token_acc": 0.8058429701765064, | |
| "train_speed(iter/s)": 0.054553 | |
| }, | |
| { | |
| "epoch": 0.9110840438489647, | |
| "grad_norm": 0.42932575941085815, | |
| "learning_rate": 2.138377833834404e-06, | |
| "loss": 0.5199082851409912, | |
| "memory(GiB)": 389.68, | |
| "step": 935, | |
| "token_acc": 0.837616269903831, | |
| "train_speed(iter/s)": 0.054552 | |
| }, | |
| { | |
| "epoch": 0.9159561510353228, | |
| "grad_norm": 0.6615188717842102, | |
| "learning_rate": 1.9113148466983254e-06, | |
| "loss": 0.6138844013214111, | |
| "memory(GiB)": 389.68, | |
| "step": 940, | |
| "token_acc": 0.8027118644067797, | |
| "train_speed(iter/s)": 0.054582 | |
| }, | |
| { | |
| "epoch": 0.9208282582216809, | |
| "grad_norm": 0.41028302907943726, | |
| "learning_rate": 1.696758922348979e-06, | |
| "loss": 0.5526364803314209, | |
| "memory(GiB)": 389.68, | |
| "step": 945, | |
| "token_acc": 0.8190247252747253, | |
| "train_speed(iter/s)": 0.054578 | |
| }, | |
| { | |
| "epoch": 0.925700365408039, | |
| "grad_norm": 0.48014047741889954, | |
| "learning_rate": 1.4947658631941309e-06, | |
| "loss": 0.49515771865844727, | |
| "memory(GiB)": 389.68, | |
| "step": 950, | |
| "token_acc": 0.832800851970181, | |
| "train_speed(iter/s)": 0.054557 | |
| }, | |
| { | |
| "epoch": 0.9305724725943971, | |
| "grad_norm": 0.6173512935638428, | |
| "learning_rate": 1.3053882042503796e-06, | |
| "loss": 0.5243947505950928, | |
| "memory(GiB)": 389.68, | |
| "step": 955, | |
| "token_acc": 0.8282737560625112, | |
| "train_speed(iter/s)": 0.054472 | |
| }, | |
| { | |
| "epoch": 0.9354445797807551, | |
| "grad_norm": 0.6899262070655823, | |
| "learning_rate": 1.1286751994797284e-06, | |
| "loss": 0.636317253112793, | |
| "memory(GiB)": 389.68, | |
| "step": 960, | |
| "token_acc": 0.8041509433962264, | |
| "train_speed(iter/s)": 0.05449 | |
| }, | |
| { | |
| "epoch": 0.9403166869671132, | |
| "grad_norm": 0.538864016532898, | |
| "learning_rate": 9.646728089794167e-07, | |
| "loss": 0.5281119823455811, | |
| "memory(GiB)": 389.68, | |
| "step": 965, | |
| "token_acc": 0.828132906054984, | |
| "train_speed(iter/s)": 0.054472 | |
| }, | |
| { | |
| "epoch": 0.9451887941534713, | |
| "grad_norm": 0.7353665828704834, | |
| "learning_rate": 8.134236870284861e-07, | |
| "loss": 0.6087577819824219, | |
| "memory(GiB)": 389.68, | |
| "step": 970, | |
| "token_acc": 0.8098674274207082, | |
| "train_speed(iter/s)": 0.054485 | |
| }, | |
| { | |
| "epoch": 0.9500609013398295, | |
| "grad_norm": 0.7473301887512207, | |
| "learning_rate": 6.749671709941008e-07, | |
| "loss": 0.6141918182373047, | |
| "memory(GiB)": 389.68, | |
| "step": 975, | |
| "token_acc": 0.8016149752248118, | |
| "train_speed(iter/s)": 0.054518 | |
| }, | |
| { | |
| "epoch": 0.9549330085261876, | |
| "grad_norm": 0.6487853527069092, | |
| "learning_rate": 5.493392711005796e-07, | |
| "loss": 0.5959615707397461, | |
| "memory(GiB)": 389.68, | |
| "step": 980, | |
| "token_acc": 0.8156642881413524, | |
| "train_speed(iter/s)": 0.054561 | |
| }, | |
| { | |
| "epoch": 0.9598051157125457, | |
| "grad_norm": 0.678453803062439, | |
| "learning_rate": 4.365726610637222e-07, | |
| "loss": 0.5411821842193604, | |
| "memory(GiB)": 389.68, | |
| "step": 985, | |
| "token_acc": 0.8313556274721323, | |
| "train_speed(iter/s)": 0.054544 | |
| }, | |
| { | |
| "epoch": 0.9646772228989038, | |
| "grad_norm": 0.5119591355323792, | |
| "learning_rate": 3.366966695929119e-07, | |
| "loss": 0.49676513671875, | |
| "memory(GiB)": 389.68, | |
| "step": 990, | |
| "token_acc": 0.8351805505899178, | |
| "train_speed(iter/s)": 0.054553 | |
| }, | |
| { | |
| "epoch": 0.9695493300852619, | |
| "grad_norm": 0.6289726495742798, | |
| "learning_rate": 2.4973727276323965e-07, | |
| "loss": 0.60072922706604, | |
| "memory(GiB)": 389.68, | |
| "step": 995, | |
| "token_acc": 0.8124610591900312, | |
| "train_speed(iter/s)": 0.054575 | |
| }, | |
| { | |
| "epoch": 0.97442143727162, | |
| "grad_norm": 0.5490319132804871, | |
| "learning_rate": 1.7571708725953596e-07, | |
| "loss": 0.5364939212799072, | |
| "memory(GiB)": 389.68, | |
| "step": 1000, | |
| "token_acc": 0.8235892221657346, | |
| "train_speed(iter/s)": 0.054556 | |
| }, | |
| { | |
| "epoch": 0.97442143727162, | |
| "eval_loss": 0.5838146805763245, | |
| "eval_runtime": 6.1207, | |
| "eval_samples_per_second": 0.654, | |
| "eval_steps_per_second": 0.654, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.9792935444579781, | |
| "grad_norm": 0.4941563010215759, | |
| "learning_rate": 1.1465536449415393e-07, | |
| "loss": 0.5735920906066895, | |
| "memory(GiB)": 389.68, | |
| "step": 1005, | |
| "token_acc": 0.8156670746634027, | |
| "train_speed(iter/s)": 0.054474 | |
| }, | |
| { | |
| "epoch": 0.9841656516443362, | |
| "grad_norm": 0.5679388046264648, | |
| "learning_rate": 6.656798560001343e-08, | |
| "loss": 0.5337845325469971, | |
| "memory(GiB)": 389.68, | |
| "step": 1010, | |
| "token_acc": 0.8183527641970666, | |
| "train_speed(iter/s)": 0.054492 | |
| }, | |
| { | |
| "epoch": 0.9890377588306942, | |
| "grad_norm": 0.43481603264808655, | |
| "learning_rate": 3.146745730015499e-08, | |
| "loss": 0.5338433265686036, | |
| "memory(GiB)": 389.68, | |
| "step": 1015, | |
| "token_acc": 0.8283907544701264, | |
| "train_speed(iter/s)": 0.054525 | |
| }, | |
| { | |
| "epoch": 0.9939098660170523, | |
| "grad_norm": 0.44339102506637573, | |
| "learning_rate": 9.362908654986235e-09, | |
| "loss": 0.5187356472015381, | |
| "memory(GiB)": 389.68, | |
| "step": 1020, | |
| "token_acc": 0.8316270566727605, | |
| "train_speed(iter/s)": 0.054538 | |
| }, | |
| { | |
| "epoch": 0.9987819732034104, | |
| "grad_norm": 0.7172895669937134, | |
| "learning_rate": 2.6008868793114817e-10, | |
| "loss": 0.5243105888366699, | |
| "memory(GiB)": 389.68, | |
| "step": 1025, | |
| "token_acc": 0.8462152666879591, | |
| "train_speed(iter/s)": 0.054564 | |
| }, | |
| { | |
| "epoch": 0.9997563946406821, | |
| "eval_loss": 0.5837547183036804, | |
| "eval_runtime": 6.0694, | |
| "eval_samples_per_second": 0.659, | |
| "eval_steps_per_second": 0.659, | |
| "step": 1026 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1026, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.095035636732416e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |