{ "best_metric": 0.58375472, "best_model_checkpoint": "/export/home2/zli/kc/mm_rag/Qwen2.5-32B-Instruct_lora/checkpoint-1026", "epoch": 0.9997563946406821, "eval_steps": 100, "global_step": 1026, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00097442143727162, "grad_norm": 0.8529725074768066, "learning_rate": 1.9230769230769234e-06, "loss": 0.9744532108306885, "memory(GiB)": 255.29, "step": 1, "token_acc": 0.7235609103078983, "train_speed(iter/s)": 0.048136 }, { "epoch": 0.0048721071863581, "grad_norm": 0.9313930869102478, "learning_rate": 9.615384615384616e-06, "loss": 0.7735831141471863, "memory(GiB)": 307.13, "step": 5, "token_acc": 0.7883002659030477, "train_speed(iter/s)": 0.052895 }, { "epoch": 0.0097442143727162, "grad_norm": 0.5147601962089539, "learning_rate": 1.923076923076923e-05, "loss": 0.8932640075683593, "memory(GiB)": 357.61, "step": 10, "token_acc": 0.7670103092783506, "train_speed(iter/s)": 0.055392 }, { "epoch": 0.014616321559074299, "grad_norm": 0.3254891037940979, "learning_rate": 2.8846153846153845e-05, "loss": 0.7951784610748291, "memory(GiB)": 357.63, "step": 15, "token_acc": 0.7827709279688514, "train_speed(iter/s)": 0.058176 }, { "epoch": 0.0194884287454324, "grad_norm": 0.5581763982772827, "learning_rate": 3.846153846153846e-05, "loss": 0.956171703338623, "memory(GiB)": 357.63, "step": 20, "token_acc": 0.7419072615923009, "train_speed(iter/s)": 0.05832 }, { "epoch": 0.024360535931790498, "grad_norm": 0.3584084212779999, "learning_rate": 4.8076923076923084e-05, "loss": 0.8373254776000977, "memory(GiB)": 357.64, "step": 25, "token_acc": 0.7718067293892521, "train_speed(iter/s)": 0.057013 }, { "epoch": 0.029232643118148598, "grad_norm": 0.43336302042007446, "learning_rate": 5.769230769230769e-05, "loss": 0.7370581150054931, "memory(GiB)": 357.64, "step": 30, "token_acc": 0.7800821355236139, "train_speed(iter/s)": 0.057318 }, { "epoch": 0.0341047503045067, "grad_norm": 0.6522459983825684, "learning_rate": 6.730769230769232e-05, "loss": 0.8316493988037109, "memory(GiB)": 357.64, "step": 35, "token_acc": 0.7533095188064719, "train_speed(iter/s)": 0.0583 }, { "epoch": 0.0389768574908648, "grad_norm": 0.2728117108345032, "learning_rate": 7.692307692307693e-05, "loss": 0.7739034652709961, "memory(GiB)": 357.64, "step": 40, "token_acc": 0.7733619763694952, "train_speed(iter/s)": 0.058058 }, { "epoch": 0.0438489646772229, "grad_norm": 0.5565893054008484, "learning_rate": 8.653846153846155e-05, "loss": 0.664579200744629, "memory(GiB)": 357.64, "step": 45, "token_acc": 0.7940859054608245, "train_speed(iter/s)": 0.058044 }, { "epoch": 0.048721071863580996, "grad_norm": 0.8710922002792358, "learning_rate": 9.615384615384617e-05, "loss": 0.6327468872070312, "memory(GiB)": 378.13, "step": 50, "token_acc": 0.8048907388137357, "train_speed(iter/s)": 0.057666 }, { "epoch": 0.0535931790499391, "grad_norm": 0.5048324465751648, "learning_rate": 9.999765921804365e-05, "loss": 0.7563381671905518, "memory(GiB)": 378.13, "step": 55, "token_acc": 0.7769541778975741, "train_speed(iter/s)": 0.056465 }, { "epoch": 0.058465286236297195, "grad_norm": 0.4348597824573517, "learning_rate": 9.998335523311734e-05, "loss": 0.6969810009002686, "memory(GiB)": 378.13, "step": 60, "token_acc": 0.7881806108897742, "train_speed(iter/s)": 0.056264 }, { "epoch": 0.06333739342265529, "grad_norm": 0.3995300233364105, "learning_rate": 9.995605141340247e-05, "loss": 0.6338334083557129, "memory(GiB)": 378.13, "step": 65, "token_acc": 0.8006442376521117, "train_speed(iter/s)": 0.056356 }, { "epoch": 0.0682095006090134, "grad_norm": 0.25958451628685, "learning_rate": 9.991575486016592e-05, "loss": 0.7143070697784424, "memory(GiB)": 378.13, "step": 70, "token_acc": 0.7797544920832592, "train_speed(iter/s)": 0.056342 }, { "epoch": 0.0730816077953715, "grad_norm": 0.3476051688194275, "learning_rate": 9.986247605386727e-05, "loss": 0.6742859840393066, "memory(GiB)": 378.13, "step": 75, "token_acc": 0.7907263751763046, "train_speed(iter/s)": 0.056661 }, { "epoch": 0.0779537149817296, "grad_norm": 0.3681824505329132, "learning_rate": 9.979622885143301e-05, "loss": 0.5565629959106445, "memory(GiB)": 378.13, "step": 80, "token_acc": 0.8259753593429158, "train_speed(iter/s)": 0.056802 }, { "epoch": 0.0828258221680877, "grad_norm": 0.4626450836658478, "learning_rate": 9.97170304826526e-05, "loss": 0.6965714931488037, "memory(GiB)": 378.13, "step": 85, "token_acc": 0.7883802169516455, "train_speed(iter/s)": 0.057068 }, { "epoch": 0.0876979293544458, "grad_norm": 0.6139679551124573, "learning_rate": 9.962490154569727e-05, "loss": 0.670227336883545, "memory(GiB)": 378.13, "step": 90, "token_acc": 0.7868181818181819, "train_speed(iter/s)": 0.056985 }, { "epoch": 0.0925700365408039, "grad_norm": 0.3063270151615143, "learning_rate": 9.95198660017628e-05, "loss": 0.6900132179260254, "memory(GiB)": 378.13, "step": 95, "token_acc": 0.7969001610305958, "train_speed(iter/s)": 0.056931 }, { "epoch": 0.09744214372716199, "grad_norm": 0.6139111518859863, "learning_rate": 9.940195116883755e-05, "loss": 0.6138424396514892, "memory(GiB)": 378.13, "step": 100, "token_acc": 0.8159609120521173, "train_speed(iter/s)": 0.056746 }, { "epoch": 0.09744214372716199, "eval_loss": 0.7148731350898743, "eval_runtime": 6.1133, "eval_samples_per_second": 0.654, "eval_steps_per_second": 0.654, "step": 100 }, { "epoch": 0.1023142509135201, "grad_norm": 0.2960892617702484, "learning_rate": 9.927118771459763e-05, "loss": 0.7610847473144531, "memory(GiB)": 378.13, "step": 105, "token_acc": 0.767438173747622, "train_speed(iter/s)": 0.055274 }, { "epoch": 0.1071863580998782, "grad_norm": 0.35161978006362915, "learning_rate": 9.91276096484306e-05, "loss": 0.553467845916748, "memory(GiB)": 378.13, "step": 110, "token_acc": 0.8268870335661953, "train_speed(iter/s)": 0.055276 }, { "epoch": 0.1120584652862363, "grad_norm": 0.412120521068573, "learning_rate": 9.897125431259033e-05, "loss": 0.634190034866333, "memory(GiB)": 378.13, "step": 115, "token_acc": 0.8068276436303081, "train_speed(iter/s)": 0.055265 }, { "epoch": 0.11693057247259439, "grad_norm": 0.458766907453537, "learning_rate": 9.880216237248481e-05, "loss": 0.5070098400115967, "memory(GiB)": 383.55, "step": 120, "token_acc": 0.8488372093023255, "train_speed(iter/s)": 0.055088 }, { "epoch": 0.1218026796589525, "grad_norm": 0.26979780197143555, "learning_rate": 9.862037780609986e-05, "loss": 0.6117064476013183, "memory(GiB)": 383.55, "step": 125, "token_acc": 0.8204016064257028, "train_speed(iter/s)": 0.055201 }, { "epoch": 0.12667478684531058, "grad_norm": 0.33145567774772644, "learning_rate": 9.842594789256103e-05, "loss": 0.6480350017547607, "memory(GiB)": 383.55, "step": 130, "token_acc": 0.7939520333680917, "train_speed(iter/s)": 0.055341 }, { "epoch": 0.1315468940316687, "grad_norm": 0.29335150122642517, "learning_rate": 9.821892319983726e-05, "loss": 0.6102027893066406, "memory(GiB)": 383.55, "step": 135, "token_acc": 0.8075084115459537, "train_speed(iter/s)": 0.055381 }, { "epoch": 0.1364190012180268, "grad_norm": 0.34363335371017456, "learning_rate": 9.799935757158891e-05, "loss": 0.6652801513671875, "memory(GiB)": 383.55, "step": 140, "token_acc": 0.7967960995995125, "train_speed(iter/s)": 0.055397 }, { "epoch": 0.14129110840438489, "grad_norm": 1.3419567346572876, "learning_rate": 9.776730811316394e-05, "loss": 0.6267284393310547, "memory(GiB)": 383.55, "step": 145, "token_acc": 0.8041536400178652, "train_speed(iter/s)": 0.055716 }, { "epoch": 0.146163215590743, "grad_norm": 0.32478609681129456, "learning_rate": 9.752283517674575e-05, "loss": 0.5990486145019531, "memory(GiB)": 383.55, "step": 150, "token_acc": 0.80878414568827, "train_speed(iter/s)": 0.055975 }, { "epoch": 0.1510353227771011, "grad_norm": 0.33393779397010803, "learning_rate": 9.72660023456566e-05, "loss": 0.6312044143676758, "memory(GiB)": 383.55, "step": 155, "token_acc": 0.8099249502220861, "train_speed(iter/s)": 0.055469 }, { "epoch": 0.1559074299634592, "grad_norm": 0.39661872386932373, "learning_rate": 9.699687641782067e-05, "loss": 0.727474308013916, "memory(GiB)": 383.55, "step": 160, "token_acc": 0.7668711656441718, "train_speed(iter/s)": 0.055534 }, { "epoch": 0.1607795371498173, "grad_norm": 0.3174588978290558, "learning_rate": 9.671552738839099e-05, "loss": 0.7284453868865967, "memory(GiB)": 383.55, "step": 165, "token_acc": 0.7788385043754972, "train_speed(iter/s)": 0.055621 }, { "epoch": 0.1656516443361754, "grad_norm": 0.3508811891078949, "learning_rate": 9.642202843154491e-05, "loss": 0.6260187149047851, "memory(GiB)": 383.55, "step": 170, "token_acc": 0.8036006546644845, "train_speed(iter/s)": 0.055841 }, { "epoch": 0.1705237515225335, "grad_norm": 0.35121622681617737, "learning_rate": 9.611645588145272e-05, "loss": 0.6979084968566894, "memory(GiB)": 383.55, "step": 175, "token_acc": 0.784981684981685, "train_speed(iter/s)": 0.055902 }, { "epoch": 0.1753958587088916, "grad_norm": 0.41958293318748474, "learning_rate": 9.579888921242439e-05, "loss": 0.6360678195953369, "memory(GiB)": 383.55, "step": 180, "token_acc": 0.8010867455850961, "train_speed(iter/s)": 0.05597 }, { "epoch": 0.18026796589524968, "grad_norm": 0.4557216763496399, "learning_rate": 9.546941101823963e-05, "loss": 0.7268210411071777, "memory(GiB)": 383.55, "step": 185, "token_acc": 0.780511811023622, "train_speed(iter/s)": 0.056032 }, { "epoch": 0.1851400730816078, "grad_norm": 0.44287464022636414, "learning_rate": 9.512810699066667e-05, "loss": 0.6450634479522706, "memory(GiB)": 383.55, "step": 190, "token_acc": 0.8003896563939072, "train_speed(iter/s)": 0.056127 }, { "epoch": 0.1900121802679659, "grad_norm": 0.2944161593914032, "learning_rate": 9.477506589717518e-05, "loss": 0.5534649848937988, "memory(GiB)": 383.55, "step": 195, "token_acc": 0.8173982442138866, "train_speed(iter/s)": 0.056116 }, { "epoch": 0.19488428745432398, "grad_norm": 0.3743175268173218, "learning_rate": 9.441037955784944e-05, "loss": 0.7282295227050781, "memory(GiB)": 383.55, "step": 200, "token_acc": 0.7695568400770713, "train_speed(iter/s)": 0.05619 }, { "epoch": 0.19488428745432398, "eval_loss": 0.6866188049316406, "eval_runtime": 6.0764, "eval_samples_per_second": 0.658, "eval_steps_per_second": 0.658, "step": 200 }, { "epoch": 0.1997563946406821, "grad_norm": 0.397987425327301, "learning_rate": 9.403414282150738e-05, "loss": 0.6911158561706543, "memory(GiB)": 383.55, "step": 205, "token_acc": 0.7807744462859726, "train_speed(iter/s)": 0.055478 }, { "epoch": 0.2046285018270402, "grad_norm": 0.852430522441864, "learning_rate": 9.364645354103206e-05, "loss": 0.7203257560729981, "memory(GiB)": 383.55, "step": 210, "token_acc": 0.7839163822525598, "train_speed(iter/s)": 0.055538 }, { "epoch": 0.2095006090133983, "grad_norm": 0.36566832661628723, "learning_rate": 9.324741254792171e-05, "loss": 0.5751584529876709, "memory(GiB)": 383.55, "step": 215, "token_acc": 0.8120177310786406, "train_speed(iter/s)": 0.055665 }, { "epoch": 0.2143727161997564, "grad_norm": 0.5105158090591431, "learning_rate": 9.28371236260652e-05, "loss": 0.5958977699279785, "memory(GiB)": 383.55, "step": 220, "token_acc": 0.8228462471832206, "train_speed(iter/s)": 0.055807 }, { "epoch": 0.2192448233861145, "grad_norm": 0.48028162121772766, "learning_rate": 9.241569348474954e-05, "loss": 0.7106984615325928, "memory(GiB)": 383.55, "step": 225, "token_acc": 0.7828740844087897, "train_speed(iter/s)": 0.055858 }, { "epoch": 0.2241169305724726, "grad_norm": 0.32592424750328064, "learning_rate": 9.198323173090663e-05, "loss": 0.5898131847381591, "memory(GiB)": 383.55, "step": 230, "token_acc": 0.8189450340567084, "train_speed(iter/s)": 0.055995 }, { "epoch": 0.2289890377588307, "grad_norm": 0.44794151186943054, "learning_rate": 9.153985084060623e-05, "loss": 0.6042355060577392, "memory(GiB)": 383.55, "step": 235, "token_acc": 0.810122224134963, "train_speed(iter/s)": 0.056057 }, { "epoch": 0.23386114494518878, "grad_norm": 0.3204025328159332, "learning_rate": 9.108566612980298e-05, "loss": 0.5558523654937744, "memory(GiB)": 383.55, "step": 240, "token_acc": 0.8260869565217391, "train_speed(iter/s)": 0.056072 }, { "epoch": 0.2387332521315469, "grad_norm": 0.31540507078170776, "learning_rate": 9.062079572434448e-05, "loss": 0.6237210273742676, "memory(GiB)": 383.55, "step": 245, "token_acc": 0.8021445866482186, "train_speed(iter/s)": 0.056086 }, { "epoch": 0.243605359317905, "grad_norm": 0.619088888168335, "learning_rate": 9.014536052924883e-05, "loss": 0.664583158493042, "memory(GiB)": 383.55, "step": 250, "token_acc": 0.793002915451895, "train_speed(iter/s)": 0.056116 }, { "epoch": 0.24847746650426308, "grad_norm": 0.6715230345726013, "learning_rate": 8.965948419725922e-05, "loss": 0.5711063861846923, "memory(GiB)": 383.55, "step": 255, "token_acc": 0.8185620394343757, "train_speed(iter/s)": 0.055175 }, { "epoch": 0.25334957369062117, "grad_norm": 0.4514237642288208, "learning_rate": 8.916329309668397e-05, "loss": 0.721324348449707, "memory(GiB)": 383.55, "step": 260, "token_acc": 0.7792865828942035, "train_speed(iter/s)": 0.055266 }, { "epoch": 0.2582216808769793, "grad_norm": 0.5026947855949402, "learning_rate": 8.865691627853013e-05, "loss": 0.6661148548126221, "memory(GiB)": 383.55, "step": 265, "token_acc": 0.7951268025857782, "train_speed(iter/s)": 0.05533 }, { "epoch": 0.2630937880633374, "grad_norm": 0.3138331174850464, "learning_rate": 8.814048544293965e-05, "loss": 0.6717385292053223, "memory(GiB)": 383.55, "step": 270, "token_acc": 0.7904462355022607, "train_speed(iter/s)": 0.055296 }, { "epoch": 0.2679658952496955, "grad_norm": 0.3270625174045563, "learning_rate": 8.76141349049362e-05, "loss": 0.6027359008789063, "memory(GiB)": 383.55, "step": 275, "token_acc": 0.8082101806239738, "train_speed(iter/s)": 0.05525 }, { "epoch": 0.2728380024360536, "grad_norm": 0.4341810941696167, "learning_rate": 8.707800155949217e-05, "loss": 0.6553579330444336, "memory(GiB)": 383.55, "step": 280, "token_acc": 0.797032640949555, "train_speed(iter/s)": 0.055271 }, { "epoch": 0.2777101096224117, "grad_norm": 0.37805306911468506, "learning_rate": 8.653222484592458e-05, "loss": 0.6515018463134765, "memory(GiB)": 383.55, "step": 285, "token_acc": 0.794751477233229, "train_speed(iter/s)": 0.05524 }, { "epoch": 0.28258221680876977, "grad_norm": 0.38902854919433594, "learning_rate": 8.597694671162921e-05, "loss": 0.592349624633789, "memory(GiB)": 383.55, "step": 290, "token_acc": 0.815828677839851, "train_speed(iter/s)": 0.05519 }, { "epoch": 0.2874543239951279, "grad_norm": 0.3007030487060547, "learning_rate": 8.541231157516247e-05, "loss": 0.6616343021392822, "memory(GiB)": 383.55, "step": 295, "token_acc": 0.7961879284400601, "train_speed(iter/s)": 0.05519 }, { "epoch": 0.292326431181486, "grad_norm": 0.43431806564331055, "learning_rate": 8.483846628868055e-05, "loss": 0.6408910751342773, "memory(GiB)": 383.55, "step": 300, "token_acc": 0.7999295526593871, "train_speed(iter/s)": 0.055286 }, { "epoch": 0.292326431181486, "eval_loss": 0.6525390148162842, "eval_runtime": 6.2823, "eval_samples_per_second": 0.637, "eval_steps_per_second": 0.637, "step": 300 }, { "epoch": 0.2971985383678441, "grad_norm": 0.4932222068309784, "learning_rate": 8.425556009974566e-05, "loss": 0.6335715770721435, "memory(GiB)": 383.55, "step": 305, "token_acc": 0.8036400066789113, "train_speed(iter/s)": 0.054583 }, { "epoch": 0.3020706455542022, "grad_norm": 0.27842646837234497, "learning_rate": 8.366374461250916e-05, "loss": 0.570946216583252, "memory(GiB)": 383.55, "step": 310, "token_acc": 0.8238350381555447, "train_speed(iter/s)": 0.054659 }, { "epoch": 0.30694275274056027, "grad_norm": 0.7104659080505371, "learning_rate": 8.306317374828194e-05, "loss": 0.566010570526123, "memory(GiB)": 383.55, "step": 315, "token_acc": 0.8189669219488349, "train_speed(iter/s)": 0.054662 }, { "epoch": 0.3118148599269184, "grad_norm": 0.8684744834899902, "learning_rate": 8.245400370550198e-05, "loss": 0.677960729598999, "memory(GiB)": 383.55, "step": 320, "token_acc": 0.7772163527790538, "train_speed(iter/s)": 0.054794 }, { "epoch": 0.3166869671132765, "grad_norm": 0.3846539258956909, "learning_rate": 8.183639291910987e-05, "loss": 0.5622167587280273, "memory(GiB)": 383.55, "step": 325, "token_acc": 0.830480089318943, "train_speed(iter/s)": 0.054821 }, { "epoch": 0.3215590742996346, "grad_norm": 0.34010785818099976, "learning_rate": 8.121050201934235e-05, "loss": 0.5877705574035644, "memory(GiB)": 383.55, "step": 330, "token_acc": 0.8287964389659305, "train_speed(iter/s)": 0.054899 }, { "epoch": 0.3264311814859927, "grad_norm": 0.3751339912414551, "learning_rate": 8.057649378995526e-05, "loss": 0.5179604053497314, "memory(GiB)": 383.55, "step": 335, "token_acc": 0.8402439024390244, "train_speed(iter/s)": 0.054839 }, { "epoch": 0.3313032886723508, "grad_norm": 0.3137739896774292, "learning_rate": 7.993453312588607e-05, "loss": 0.5339327335357666, "memory(GiB)": 383.55, "step": 340, "token_acc": 0.8365357839042049, "train_speed(iter/s)": 0.054764 }, { "epoch": 0.33617539585870887, "grad_norm": 0.5746834874153137, "learning_rate": 7.928478699036755e-05, "loss": 0.6346034049987793, "memory(GiB)": 383.55, "step": 345, "token_acc": 0.7972016183412003, "train_speed(iter/s)": 0.054713 }, { "epoch": 0.341047503045067, "grad_norm": 0.3580325245857239, "learning_rate": 7.862742437150336e-05, "loss": 0.6608481884002686, "memory(GiB)": 383.55, "step": 350, "token_acc": 0.7929736511919699, "train_speed(iter/s)": 0.054762 }, { "epoch": 0.3459196102314251, "grad_norm": 0.4622519612312317, "learning_rate": 7.796261623831713e-05, "loss": 0.562419080734253, "memory(GiB)": 383.55, "step": 355, "token_acc": 0.8190336211647988, "train_speed(iter/s)": 0.054396 }, { "epoch": 0.3507917174177832, "grad_norm": 0.5616739392280579, "learning_rate": 7.729053549628622e-05, "loss": 0.5495719909667969, "memory(GiB)": 383.55, "step": 360, "token_acc": 0.8339377743844245, "train_speed(iter/s)": 0.054442 }, { "epoch": 0.3556638246041413, "grad_norm": 0.7364129424095154, "learning_rate": 7.661135694237198e-05, "loss": 0.4548810958862305, "memory(GiB)": 387.42, "step": 365, "token_acc": 0.8370827285921626, "train_speed(iter/s)": 0.054383 }, { "epoch": 0.36053593179049936, "grad_norm": 0.44831952452659607, "learning_rate": 7.592525721955786e-05, "loss": 0.5882142066955567, "memory(GiB)": 387.42, "step": 370, "token_acc": 0.8161894662424886, "train_speed(iter/s)": 0.054337 }, { "epoch": 0.3654080389768575, "grad_norm": 0.37750759720802307, "learning_rate": 7.523241477090763e-05, "loss": 0.6884512901306152, "memory(GiB)": 387.42, "step": 375, "token_acc": 0.7952127659574468, "train_speed(iter/s)": 0.054385 }, { "epoch": 0.3702801461632156, "grad_norm": 0.5074845552444458, "learning_rate": 7.45330097931553e-05, "loss": 0.5458427906036377, "memory(GiB)": 387.42, "step": 380, "token_acc": 0.8217197924388436, "train_speed(iter/s)": 0.054354 }, { "epoch": 0.3751522533495737, "grad_norm": 0.6083484292030334, "learning_rate": 7.382722418983892e-05, "loss": 0.5680232048034668, "memory(GiB)": 387.42, "step": 385, "token_acc": 0.8248374239563667, "train_speed(iter/s)": 0.054329 }, { "epoch": 0.3800243605359318, "grad_norm": 0.39138278365135193, "learning_rate": 7.311524152399054e-05, "loss": 0.7077183246612548, "memory(GiB)": 387.42, "step": 390, "token_acc": 0.7912014292094686, "train_speed(iter/s)": 0.054329 }, { "epoch": 0.3848964677222899, "grad_norm": 0.4244479238986969, "learning_rate": 7.239724697039457e-05, "loss": 0.6999778270721435, "memory(GiB)": 387.42, "step": 395, "token_acc": 0.7828650029475339, "train_speed(iter/s)": 0.054413 }, { "epoch": 0.38976857490864797, "grad_norm": 0.3658107817173004, "learning_rate": 7.167342726742685e-05, "loss": 0.5321448802947998, "memory(GiB)": 387.42, "step": 400, "token_acc": 0.8257604205782951, "train_speed(iter/s)": 0.054414 }, { "epoch": 0.38976857490864797, "eval_loss": 0.647614598274231, "eval_runtime": 6.1299, "eval_samples_per_second": 0.653, "eval_steps_per_second": 0.653, "step": 400 }, { "epoch": 0.3946406820950061, "grad_norm": 0.4579378068447113, "learning_rate": 7.094397066848716e-05, "loss": 0.6339591979980469, "memory(GiB)": 387.42, "step": 405, "token_acc": 0.7953757225433526, "train_speed(iter/s)": 0.054198 }, { "epoch": 0.3995127892813642, "grad_norm": 0.41108816862106323, "learning_rate": 7.020906689303766e-05, "loss": 0.6498037338256836, "memory(GiB)": 387.42, "step": 410, "token_acc": 0.8013311819281969, "train_speed(iter/s)": 0.054274 }, { "epoch": 0.4043848964677223, "grad_norm": 0.3730790615081787, "learning_rate": 6.946890707726004e-05, "loss": 0.6224189281463623, "memory(GiB)": 387.42, "step": 415, "token_acc": 0.8109767441860465, "train_speed(iter/s)": 0.054342 }, { "epoch": 0.4092570036540804, "grad_norm": 0.41862693428993225, "learning_rate": 6.872368372434416e-05, "loss": 0.6285569190979003, "memory(GiB)": 387.42, "step": 420, "token_acc": 0.793915399041467, "train_speed(iter/s)": 0.054335 }, { "epoch": 0.41412911084043846, "grad_norm": 0.4861293435096741, "learning_rate": 6.797359065442117e-05, "loss": 0.5771468162536622, "memory(GiB)": 387.42, "step": 425, "token_acc": 0.8196579720158922, "train_speed(iter/s)": 0.054339 }, { "epoch": 0.4190012180267966, "grad_norm": 0.30941805243492126, "learning_rate": 6.721882295415425e-05, "loss": 0.5844586372375489, "memory(GiB)": 387.42, "step": 430, "token_acc": 0.814694173000362, "train_speed(iter/s)": 0.05432 }, { "epoch": 0.4238733252131547, "grad_norm": 0.3820112645626068, "learning_rate": 6.645957692599969e-05, "loss": 0.5823289394378662, "memory(GiB)": 387.42, "step": 435, "token_acc": 0.8027565654684299, "train_speed(iter/s)": 0.054382 }, { "epoch": 0.4287454323995128, "grad_norm": 0.3910198509693146, "learning_rate": 6.569605003715201e-05, "loss": 0.561509084701538, "memory(GiB)": 387.42, "step": 440, "token_acc": 0.8264751552795031, "train_speed(iter/s)": 0.054462 }, { "epoch": 0.4336175395858709, "grad_norm": 0.3805302381515503, "learning_rate": 6.492844086818599e-05, "loss": 0.558375883102417, "memory(GiB)": 387.42, "step": 445, "token_acc": 0.8262056414922657, "train_speed(iter/s)": 0.05444 }, { "epoch": 0.438489646772229, "grad_norm": 0.6036235690116882, "learning_rate": 6.41569490614092e-05, "loss": 0.6268420696258545, "memory(GiB)": 387.42, "step": 450, "token_acc": 0.8061224489795918, "train_speed(iter/s)": 0.054446 }, { "epoch": 0.44336175395858707, "grad_norm": 0.4275857210159302, "learning_rate": 6.338177526893836e-05, "loss": 0.5441042423248291, "memory(GiB)": 387.42, "step": 455, "token_acc": 0.8360881542699724, "train_speed(iter/s)": 0.05418 }, { "epoch": 0.4482338611449452, "grad_norm": 0.4830683469772339, "learning_rate": 6.260312110051312e-05, "loss": 0.606513261795044, "memory(GiB)": 387.42, "step": 460, "token_acc": 0.8049238864875023, "train_speed(iter/s)": 0.054224 }, { "epoch": 0.4531059683313033, "grad_norm": 0.35629284381866455, "learning_rate": 6.182118907106068e-05, "loss": 0.538546371459961, "memory(GiB)": 387.42, "step": 465, "token_acc": 0.8373831775700935, "train_speed(iter/s)": 0.054204 }, { "epoch": 0.4579780755176614, "grad_norm": 0.46749940514564514, "learning_rate": 6.103618254802511e-05, "loss": 0.5923898696899415, "memory(GiB)": 387.42, "step": 470, "token_acc": 0.8042936553574851, "train_speed(iter/s)": 0.054261 }, { "epoch": 0.4628501827040195, "grad_norm": 0.6278035044670105, "learning_rate": 6.024830569847477e-05, "loss": 0.5971939086914062, "memory(GiB)": 387.42, "step": 475, "token_acc": 0.8176121372031663, "train_speed(iter/s)": 0.054245 }, { "epoch": 0.46772228989037756, "grad_norm": 0.3572694957256317, "learning_rate": 5.945776343600207e-05, "loss": 0.5843085765838623, "memory(GiB)": 387.42, "step": 480, "token_acc": 0.8212882953652789, "train_speed(iter/s)": 0.054246 }, { "epoch": 0.4725943970767357, "grad_norm": 0.5189170241355896, "learning_rate": 5.866476136742862e-05, "loss": 0.5234210968017579, "memory(GiB)": 387.42, "step": 485, "token_acc": 0.8463819691577699, "train_speed(iter/s)": 0.05426 }, { "epoch": 0.4774665042630938, "grad_norm": 0.41832658648490906, "learning_rate": 5.7869505739330546e-05, "loss": 0.6695927619934082, "memory(GiB)": 387.42, "step": 490, "token_acc": 0.7924812030075188, "train_speed(iter/s)": 0.05433 }, { "epoch": 0.4823386114494519, "grad_norm": 4.011805534362793, "learning_rate": 5.7072203384397064e-05, "loss": 0.5814547538757324, "memory(GiB)": 387.42, "step": 495, "token_acc": 0.8110627719080175, "train_speed(iter/s)": 0.054376 }, { "epoch": 0.48721071863581, "grad_norm": 0.31671130657196045, "learning_rate": 5.627306166763684e-05, "loss": 0.5855265617370605, "memory(GiB)": 387.42, "step": 500, "token_acc": 0.8094142629623076, "train_speed(iter/s)": 0.054362 }, { "epoch": 0.48721071863581, "eval_loss": 0.6302051544189453, "eval_runtime": 6.1545, "eval_samples_per_second": 0.65, "eval_steps_per_second": 0.65, "step": 500 }, { "epoch": 0.4920828258221681, "grad_norm": 0.3875284194946289, "learning_rate": 5.5472288432445774e-05, "loss": 0.59937744140625, "memory(GiB)": 387.42, "step": 505, "token_acc": 0.7988918837975442, "train_speed(iter/s)": 0.05424 }, { "epoch": 0.49695493300852617, "grad_norm": 0.4411413371562958, "learning_rate": 5.467009194655045e-05, "loss": 0.5820174217224121, "memory(GiB)": 387.42, "step": 510, "token_acc": 0.8234812510234157, "train_speed(iter/s)": 0.054197 }, { "epoch": 0.5018270401948843, "grad_norm": 0.5111451148986816, "learning_rate": 5.386668084784112e-05, "loss": 0.5154130935668946, "memory(GiB)": 387.42, "step": 515, "token_acc": 0.8397686998694274, "train_speed(iter/s)": 0.05426 }, { "epoch": 0.5066991473812423, "grad_norm": 0.29832109808921814, "learning_rate": 5.306226409010855e-05, "loss": 0.5672587394714356, "memory(GiB)": 387.42, "step": 520, "token_acc": 0.8263521756811713, "train_speed(iter/s)": 0.054274 }, { "epoch": 0.5115712545676004, "grad_norm": 0.42139527201652527, "learning_rate": 5.22570508886986e-05, "loss": 0.5327470302581787, "memory(GiB)": 387.42, "step": 525, "token_acc": 0.8310478199718706, "train_speed(iter/s)": 0.054332 }, { "epoch": 0.5164433617539586, "grad_norm": 0.34750285744667053, "learning_rate": 5.145125066609877e-05, "loss": 0.61210618019104, "memory(GiB)": 387.42, "step": 530, "token_acc": 0.8104413702239789, "train_speed(iter/s)": 0.054325 }, { "epoch": 0.5213154689403167, "grad_norm": 0.5557289123535156, "learning_rate": 5.0645072997471e-05, "loss": 0.5486731052398681, "memory(GiB)": 387.42, "step": 535, "token_acc": 0.8223992502343018, "train_speed(iter/s)": 0.054295 }, { "epoch": 0.5261875761266748, "grad_norm": 1.370209813117981, "learning_rate": 4.983872755614461e-05, "loss": 0.6499679565429688, "memory(GiB)": 387.42, "step": 540, "token_acc": 0.7975866095757104, "train_speed(iter/s)": 0.054348 }, { "epoch": 0.5310596833130329, "grad_norm": 0.4371365010738373, "learning_rate": 4.9032424059083774e-05, "loss": 0.43409147262573244, "memory(GiB)": 387.42, "step": 545, "token_acc": 0.8684942391736193, "train_speed(iter/s)": 0.054321 }, { "epoch": 0.535931790499391, "grad_norm": 0.4735865890979767, "learning_rate": 4.8226372212343726e-05, "loss": 0.5776564598083496, "memory(GiB)": 387.42, "step": 550, "token_acc": 0.8255653883972468, "train_speed(iter/s)": 0.054368 }, { "epoch": 0.5408038976857491, "grad_norm": 0.6005700826644897, "learning_rate": 4.742078165652958e-05, "loss": 0.5744057178497315, "memory(GiB)": 387.42, "step": 555, "token_acc": 0.8105436573311368, "train_speed(iter/s)": 0.054325 }, { "epoch": 0.5456760048721072, "grad_norm": 0.4128513038158417, "learning_rate": 4.661586191227247e-05, "loss": 0.5321125030517578, "memory(GiB)": 387.42, "step": 560, "token_acc": 0.8245080500894454, "train_speed(iter/s)": 0.054305 }, { "epoch": 0.5505481120584653, "grad_norm": 0.4688722491264343, "learning_rate": 4.581182232573658e-05, "loss": 0.5235236167907715, "memory(GiB)": 387.42, "step": 565, "token_acc": 0.8205183122724352, "train_speed(iter/s)": 0.054352 }, { "epoch": 0.5554202192448234, "grad_norm": 0.4604549705982208, "learning_rate": 4.500887201417187e-05, "loss": 0.6571295261383057, "memory(GiB)": 387.42, "step": 570, "token_acc": 0.8019607843137255, "train_speed(iter/s)": 0.054361 }, { "epoch": 0.5602923264311814, "grad_norm": 0.48336780071258545, "learning_rate": 4.4207219811526056e-05, "loss": 0.5963138580322266, "memory(GiB)": 387.42, "step": 575, "token_acc": 0.8077416987708678, "train_speed(iter/s)": 0.054409 }, { "epoch": 0.5651644336175395, "grad_norm": 0.5700681805610657, "learning_rate": 4.3407074214130446e-05, "loss": 0.6309503555297852, "memory(GiB)": 387.42, "step": 580, "token_acc": 0.7960770454143842, "train_speed(iter/s)": 0.054412 }, { "epoch": 0.5700365408038977, "grad_norm": 0.40493443608283997, "learning_rate": 4.2608643326473496e-05, "loss": 0.5265829563140869, "memory(GiB)": 387.42, "step": 585, "token_acc": 0.8364477970169724, "train_speed(iter/s)": 0.054419 }, { "epoch": 0.5749086479902558, "grad_norm": 0.42441654205322266, "learning_rate": 4.181213480707637e-05, "loss": 0.5463868618011475, "memory(GiB)": 387.42, "step": 590, "token_acc": 0.8250831178426302, "train_speed(iter/s)": 0.054415 }, { "epoch": 0.5797807551766139, "grad_norm": 0.5273870825767517, "learning_rate": 4.1017755814484374e-05, "loss": 0.6219929218292236, "memory(GiB)": 387.42, "step": 595, "token_acc": 0.8101965601965602, "train_speed(iter/s)": 0.054492 }, { "epoch": 0.584652862362972, "grad_norm": 0.5027340650558472, "learning_rate": 4.0225712953388494e-05, "loss": 0.47921223640441896, "memory(GiB)": 387.42, "step": 600, "token_acc": 0.8507462686567164, "train_speed(iter/s)": 0.054456 }, { "epoch": 0.584652862362972, "eval_loss": 0.5931864976882935, "eval_runtime": 6.2202, "eval_samples_per_second": 0.643, "eval_steps_per_second": 0.643, "step": 600 }, { "epoch": 0.5895249695493301, "grad_norm": 0.7974056005477905, "learning_rate": 3.943621222089102e-05, "loss": 0.5052922248840332, "memory(GiB)": 387.42, "step": 605, "token_acc": 0.8312937062937062, "train_speed(iter/s)": 0.054258 }, { "epoch": 0.5943970767356882, "grad_norm": 0.38420093059539795, "learning_rate": 3.864945895292908e-05, "loss": 0.5411774635314941, "memory(GiB)": 387.42, "step": 610, "token_acc": 0.8309124767225325, "train_speed(iter/s)": 0.054201 }, { "epoch": 0.5992691839220463, "grad_norm": 0.9411633014678955, "learning_rate": 3.786565777087022e-05, "loss": 0.6929959297180176, "memory(GiB)": 387.42, "step": 615, "token_acc": 0.7847842261904762, "train_speed(iter/s)": 0.05425 }, { "epoch": 0.6041412911084044, "grad_norm": 0.35226595401763916, "learning_rate": 3.708501252829386e-05, "loss": 0.5966301918029785, "memory(GiB)": 387.42, "step": 620, "token_acc": 0.8161076443057722, "train_speed(iter/s)": 0.054233 }, { "epoch": 0.6090133982947625, "grad_norm": 0.4208815097808838, "learning_rate": 3.6307726257972255e-05, "loss": 0.5394818782806396, "memory(GiB)": 387.42, "step": 625, "token_acc": 0.8257628294036061, "train_speed(iter/s)": 0.054209 }, { "epoch": 0.6138855054811205, "grad_norm": 0.445925772190094, "learning_rate": 3.553400111906523e-05, "loss": 0.6164620399475098, "memory(GiB)": 387.42, "step": 630, "token_acc": 0.8090881366270204, "train_speed(iter/s)": 0.054222 }, { "epoch": 0.6187576126674786, "grad_norm": 0.5922476649284363, "learning_rate": 3.476403834454183e-05, "loss": 0.5115623474121094, "memory(GiB)": 387.42, "step": 635, "token_acc": 0.8346325167037862, "train_speed(iter/s)": 0.054244 }, { "epoch": 0.6236297198538368, "grad_norm": 0.5026776790618896, "learning_rate": 3.399803818884311e-05, "loss": 0.5328683853149414, "memory(GiB)": 387.42, "step": 640, "token_acc": 0.8462420173571311, "train_speed(iter/s)": 0.054264 }, { "epoch": 0.6285018270401949, "grad_norm": 0.45468801259994507, "learning_rate": 3.323619987579914e-05, "loss": 0.6177504062652588, "memory(GiB)": 387.42, "step": 645, "token_acc": 0.80891932520461, "train_speed(iter/s)": 0.054261 }, { "epoch": 0.633373934226553, "grad_norm": 0.6319808959960938, "learning_rate": 3.247872154681439e-05, "loss": 0.5958673000335694, "memory(GiB)": 387.42, "step": 650, "token_acc": 0.8096597145993414, "train_speed(iter/s)": 0.054221 }, { "epoch": 0.6382460414129111, "grad_norm": 0.4812871217727661, "learning_rate": 3.172580020933442e-05, "loss": 0.5768674850463867, "memory(GiB)": 387.42, "step": 655, "token_acc": 0.8165027102991367, "train_speed(iter/s)": 0.054185 }, { "epoch": 0.6431181485992692, "grad_norm": 0.9395345449447632, "learning_rate": 3.097763168560741e-05, "loss": 0.674397611618042, "memory(GiB)": 387.42, "step": 660, "token_acc": 0.7806563039723662, "train_speed(iter/s)": 0.054211 }, { "epoch": 0.6479902557856273, "grad_norm": 0.5097836852073669, "learning_rate": 3.0234410561754257e-05, "loss": 0.5154216766357422, "memory(GiB)": 387.42, "step": 665, "token_acc": 0.8327868852459016, "train_speed(iter/s)": 0.054197 }, { "epoch": 0.6528623629719854, "grad_norm": 0.3545515239238739, "learning_rate": 2.949633013715982e-05, "loss": 0.5994223117828369, "memory(GiB)": 387.42, "step": 670, "token_acc": 0.8076275080410477, "train_speed(iter/s)": 0.054247 }, { "epoch": 0.6577344701583435, "grad_norm": 0.9892140030860901, "learning_rate": 2.8763582374199126e-05, "loss": 0.5891304969787597, "memory(GiB)": 387.42, "step": 675, "token_acc": 0.8036573628488932, "train_speed(iter/s)": 0.054243 }, { "epoch": 0.6626065773447016, "grad_norm": 0.5605654716491699, "learning_rate": 2.8036357848311012e-05, "loss": 0.5478427410125732, "memory(GiB)": 387.42, "step": 680, "token_acc": 0.8287547623821937, "train_speed(iter/s)": 0.054281 }, { "epoch": 0.6674786845310596, "grad_norm": 0.4100501239299774, "learning_rate": 2.7314845698432805e-05, "loss": 0.6083401203155517, "memory(GiB)": 387.42, "step": 685, "token_acc": 0.7989271180170181, "train_speed(iter/s)": 0.054288 }, { "epoch": 0.6723507917174177, "grad_norm": 0.4639231562614441, "learning_rate": 2.659923357780828e-05, "loss": 0.5717390060424805, "memory(GiB)": 387.42, "step": 690, "token_acc": 0.8201791448369106, "train_speed(iter/s)": 0.054301 }, { "epoch": 0.6772228989037758, "grad_norm": 0.30558013916015625, "learning_rate": 2.5889707605182347e-05, "loss": 0.4964598178863525, "memory(GiB)": 387.42, "step": 695, "token_acc": 0.8518634024637455, "train_speed(iter/s)": 0.054314 }, { "epoch": 0.682095006090134, "grad_norm": 0.490887314081192, "learning_rate": 2.518645231639457e-05, "loss": 0.6779924392700195, "memory(GiB)": 387.42, "step": 700, "token_acc": 0.7798953662182362, "train_speed(iter/s)": 0.054375 }, { "epoch": 0.682095006090134, "eval_loss": 0.587890625, "eval_runtime": 6.016, "eval_samples_per_second": 0.665, "eval_steps_per_second": 0.665, "step": 700 }, { "epoch": 0.6869671132764921, "grad_norm": 0.9540379047393799, "learning_rate": 2.4489650616384507e-05, "loss": 0.5919107437133789, "memory(GiB)": 387.42, "step": 705, "token_acc": 0.8063427800269906, "train_speed(iter/s)": 0.054286 }, { "epoch": 0.6918392204628502, "grad_norm": 0.4385371208190918, "learning_rate": 2.3799483731621237e-05, "loss": 0.5554671287536621, "memory(GiB)": 387.42, "step": 710, "token_acc": 0.8227891742802965, "train_speed(iter/s)": 0.054309 }, { "epoch": 0.6967113276492083, "grad_norm": 0.37225764989852905, "learning_rate": 2.311613116296929e-05, "loss": 0.5223379611968995, "memory(GiB)": 387.42, "step": 715, "token_acc": 0.8422697368421053, "train_speed(iter/s)": 0.054303 }, { "epoch": 0.7015834348355664, "grad_norm": 0.6227976083755493, "learning_rate": 2.2439770639003627e-05, "loss": 0.5609029769897461, "memory(GiB)": 387.42, "step": 720, "token_acc": 0.8244803695150116, "train_speed(iter/s)": 0.054309 }, { "epoch": 0.7064555420219245, "grad_norm": 0.4218509793281555, "learning_rate": 2.177057806978522e-05, "loss": 0.5789398193359375, "memory(GiB)": 387.42, "step": 725, "token_acc": 0.8195275590551181, "train_speed(iter/s)": 0.054317 }, { "epoch": 0.7113276492082826, "grad_norm": 0.5081908106803894, "learning_rate": 2.110872750110996e-05, "loss": 0.49318413734436034, "memory(GiB)": 387.42, "step": 730, "token_acc": 0.8306063522617901, "train_speed(iter/s)": 0.05436 }, { "epoch": 0.7161997563946407, "grad_norm": 0.6738778352737427, "learning_rate": 2.045439106924217e-05, "loss": 0.55146803855896, "memory(GiB)": 387.42, "step": 735, "token_acc": 0.8200392927308447, "train_speed(iter/s)": 0.054367 }, { "epoch": 0.7210718635809987, "grad_norm": 0.43147921562194824, "learning_rate": 1.980773895614481e-05, "loss": 0.574643898010254, "memory(GiB)": 387.42, "step": 740, "token_acc": 0.8172221384406575, "train_speed(iter/s)": 0.054386 }, { "epoch": 0.7259439707673568, "grad_norm": 0.5750350952148438, "learning_rate": 1.9168939345218095e-05, "loss": 0.5682173728942871, "memory(GiB)": 387.42, "step": 745, "token_acc": 0.8214421252371916, "train_speed(iter/s)": 0.054395 }, { "epoch": 0.730816077953715, "grad_norm": 0.461907297372818, "learning_rate": 1.8538158377557702e-05, "loss": 0.5272111415863037, "memory(GiB)": 387.42, "step": 750, "token_acc": 0.8257032542746828, "train_speed(iter/s)": 0.054421 }, { "epoch": 0.7356881851400731, "grad_norm": 0.794235348701477, "learning_rate": 1.791556010874434e-05, "loss": 0.6292970180511475, "memory(GiB)": 387.42, "step": 755, "token_acc": 0.810012836970475, "train_speed(iter/s)": 0.054353 }, { "epoch": 0.7405602923264312, "grad_norm": 0.6189777851104736, "learning_rate": 1.7301306466175533e-05, "loss": 0.5557656288146973, "memory(GiB)": 387.42, "step": 760, "token_acc": 0.8259242957746479, "train_speed(iter/s)": 0.054349 }, { "epoch": 0.7454323995127893, "grad_norm": 0.4845249056816101, "learning_rate": 1.6695557206951144e-05, "loss": 0.49696760177612304, "memory(GiB)": 389.68, "step": 765, "token_acc": 0.8422638261243813, "train_speed(iter/s)": 0.054323 }, { "epoch": 0.7503045066991474, "grad_norm": 0.4710843563079834, "learning_rate": 1.6098469876323093e-05, "loss": 0.47034273147583006, "memory(GiB)": 389.68, "step": 770, "token_acc": 0.8487571701720842, "train_speed(iter/s)": 0.05434 }, { "epoch": 0.7551766138855055, "grad_norm": 0.45380252599716187, "learning_rate": 1.551019976672058e-05, "loss": 0.5777853488922119, "memory(GiB)": 389.68, "step": 775, "token_acc": 0.8110020910406949, "train_speed(iter/s)": 0.054377 }, { "epoch": 0.7600487210718636, "grad_norm": 0.5304797291755676, "learning_rate": 1.4930899877361015e-05, "loss": 0.5180749416351318, "memory(GiB)": 389.68, "step": 780, "token_acc": 0.8334659769200159, "train_speed(iter/s)": 0.05443 }, { "epoch": 0.7649208282582217, "grad_norm": 0.447553426027298, "learning_rate": 1.4360720874457607e-05, "loss": 0.5336573123931885, "memory(GiB)": 389.68, "step": 785, "token_acc": 0.8346641615782058, "train_speed(iter/s)": 0.054438 }, { "epoch": 0.7697929354445798, "grad_norm": 0.5468970537185669, "learning_rate": 1.3799811052033467e-05, "loss": 0.6092133522033691, "memory(GiB)": 389.68, "step": 790, "token_acc": 0.7997620261771206, "train_speed(iter/s)": 0.054456 }, { "epoch": 0.7746650426309378, "grad_norm": 0.6424246430397034, "learning_rate": 1.3248316293352946e-05, "loss": 0.6084504127502441, "memory(GiB)": 389.68, "step": 795, "token_acc": 0.8091853471842537, "train_speed(iter/s)": 0.05451 }, { "epoch": 0.7795371498172959, "grad_norm": 0.5339289903640747, "learning_rate": 1.2706380032979691e-05, "loss": 0.535353136062622, "memory(GiB)": 389.68, "step": 800, "token_acc": 0.8231229847996315, "train_speed(iter/s)": 0.054509 }, { "epoch": 0.7795371498172959, "eval_loss": 0.587626039981842, "eval_runtime": 6.1485, "eval_samples_per_second": 0.651, "eval_steps_per_second": 0.651, "step": 800 }, { "epoch": 0.784409257003654, "grad_norm": 0.47259068489074707, "learning_rate": 1.2174143219471878e-05, "loss": 0.6263217449188232, "memory(GiB)": 389.68, "step": 805, "token_acc": 0.7991557070953077, "train_speed(iter/s)": 0.054434 }, { "epoch": 0.7892813641900122, "grad_norm": 0.5547453761100769, "learning_rate": 1.1651744278723687e-05, "loss": 0.5090929985046386, "memory(GiB)": 389.68, "step": 810, "token_acc": 0.8354404976921533, "train_speed(iter/s)": 0.054448 }, { "epoch": 0.7941534713763703, "grad_norm": 0.4848991930484772, "learning_rate": 1.1139319077963178e-05, "loss": 0.5273432254791259, "memory(GiB)": 389.68, "step": 815, "token_acc": 0.8295368261199696, "train_speed(iter/s)": 0.054475 }, { "epoch": 0.7990255785627284, "grad_norm": 0.5590830445289612, "learning_rate": 1.0637000890415388e-05, "loss": 0.6279808044433594, "memory(GiB)": 389.68, "step": 820, "token_acc": 0.8061934585942937, "train_speed(iter/s)": 0.054494 }, { "epoch": 0.8038976857490865, "grad_norm": 1.119874358177185, "learning_rate": 1.0144920360640303e-05, "loss": 0.6255881309509277, "memory(GiB)": 389.68, "step": 825, "token_acc": 0.8063498323802012, "train_speed(iter/s)": 0.0545 }, { "epoch": 0.8087697929354446, "grad_norm": 0.4502837359905243, "learning_rate": 9.663205470554276e-06, "loss": 0.5530724048614502, "memory(GiB)": 389.68, "step": 830, "token_acc": 0.8286991062562066, "train_speed(iter/s)": 0.054498 }, { "epoch": 0.8136419001218027, "grad_norm": 0.47327640652656555, "learning_rate": 9.19198150614417e-06, "loss": 0.6426435470581054, "memory(GiB)": 389.68, "step": 835, "token_acc": 0.7995495495495496, "train_speed(iter/s)": 0.054482 }, { "epoch": 0.8185140073081608, "grad_norm": 0.45425912737846375, "learning_rate": 8.73137102488249e-06, "loss": 0.5113016128540039, "memory(GiB)": 389.68, "step": 840, "token_acc": 0.8368200836820083, "train_speed(iter/s)": 0.054528 }, { "epoch": 0.8233861144945189, "grad_norm": 0.5594798922538757, "learning_rate": 8.28149382385231e-06, "loss": 0.5977861881256104, "memory(GiB)": 389.68, "step": 845, "token_acc": 0.8159670164917541, "train_speed(iter/s)": 0.054545 }, { "epoch": 0.8282582216808769, "grad_norm": 0.38594865798950195, "learning_rate": 7.842466908590006e-06, "loss": 0.5546538829803467, "memory(GiB)": 389.68, "step": 850, "token_acc": 0.8362763915547025, "train_speed(iter/s)": 0.05454 }, { "epoch": 0.833130328867235, "grad_norm": 0.6128694415092468, "learning_rate": 7.414404462654051e-06, "loss": 0.5578857898712158, "memory(GiB)": 389.68, "step": 855, "token_acc": 0.8173973075595443, "train_speed(iter/s)": 0.054466 }, { "epoch": 0.8380024360535931, "grad_norm": 0.5973862409591675, "learning_rate": 6.997417817927865e-06, "loss": 0.6116644382476807, "memory(GiB)": 389.68, "step": 860, "token_acc": 0.8100558659217877, "train_speed(iter/s)": 0.054467 }, { "epoch": 0.8428745432399513, "grad_norm": 0.5695779323577881, "learning_rate": 6.591615425664144e-06, "loss": 0.6063879013061524, "memory(GiB)": 389.68, "step": 865, "token_acc": 0.8113871180479226, "train_speed(iter/s)": 0.054502 }, { "epoch": 0.8477466504263094, "grad_norm": 0.37414440512657166, "learning_rate": 6.197102828278611e-06, "loss": 0.5134734153747559, "memory(GiB)": 389.68, "step": 870, "token_acc": 0.8304152076038019, "train_speed(iter/s)": 0.054524 }, { "epoch": 0.8526187576126675, "grad_norm": 0.8222331404685974, "learning_rate": 5.813982631900122e-06, "loss": 0.5653984069824218, "memory(GiB)": 389.68, "step": 875, "token_acc": 0.8229976496112819, "train_speed(iter/s)": 0.054534 }, { "epoch": 0.8574908647990256, "grad_norm": 0.3609310984611511, "learning_rate": 5.442354479684558e-06, "loss": 0.49175424575805665, "memory(GiB)": 389.68, "step": 880, "token_acc": 0.8409646976581615, "train_speed(iter/s)": 0.054533 }, { "epoch": 0.8623629719853837, "grad_norm": 0.6293960213661194, "learning_rate": 5.082315025899315e-06, "loss": 0.604953384399414, "memory(GiB)": 389.68, "step": 885, "token_acc": 0.8073544433094995, "train_speed(iter/s)": 0.05455 }, { "epoch": 0.8672350791717418, "grad_norm": 0.4242098331451416, "learning_rate": 4.733957910785114e-06, "loss": 0.4986411571502686, "memory(GiB)": 389.68, "step": 890, "token_acc": 0.8444040036396724, "train_speed(iter/s)": 0.054562 }, { "epoch": 0.8721071863580999, "grad_norm": 0.5025205612182617, "learning_rate": 4.397373736201782e-06, "loss": 0.5355000495910645, "memory(GiB)": 389.68, "step": 895, "token_acc": 0.8340460526315789, "train_speed(iter/s)": 0.054564 }, { "epoch": 0.876979293544458, "grad_norm": 0.42587506771087646, "learning_rate": 4.072650042064174e-06, "loss": 0.6113440513610839, "memory(GiB)": 389.68, "step": 900, "token_acc": 0.8042306924765515, "train_speed(iter/s)": 0.054571 }, { "epoch": 0.876979293544458, "eval_loss": 0.5867875814437866, "eval_runtime": 6.1618, "eval_samples_per_second": 0.649, "eval_steps_per_second": 0.649, "step": 900 }, { "epoch": 0.881851400730816, "grad_norm": 0.6062163710594177, "learning_rate": 3.759871283574562e-06, "loss": 0.5853659629821777, "memory(GiB)": 389.68, "step": 905, "token_acc": 0.8163235076284995, "train_speed(iter/s)": 0.054495 }, { "epoch": 0.8867235079171741, "grad_norm": 0.5810290575027466, "learning_rate": 3.4591188092571893e-06, "loss": 0.5189132213592529, "memory(GiB)": 389.68, "step": 910, "token_acc": 0.848421052631579, "train_speed(iter/s)": 0.054517 }, { "epoch": 0.8915956151035322, "grad_norm": 0.5703849196434021, "learning_rate": 3.1704708398009486e-06, "loss": 0.5976828575134278, "memory(GiB)": 389.68, "step": 915, "token_acc": 0.808837066584842, "train_speed(iter/s)": 0.05451 }, { "epoch": 0.8964677222898904, "grad_norm": 0.5777165293693542, "learning_rate": 2.894002447715399e-06, "loss": 0.5165195465087891, "memory(GiB)": 389.68, "step": 920, "token_acc": 0.8424015009380863, "train_speed(iter/s)": 0.054567 }, { "epoch": 0.9013398294762485, "grad_norm": 0.48375067114830017, "learning_rate": 2.6297855378057623e-06, "loss": 0.46347522735595703, "memory(GiB)": 389.68, "step": 925, "token_acc": 0.8408729585200173, "train_speed(iter/s)": 0.054561 }, { "epoch": 0.9062119366626066, "grad_norm": 0.4930781126022339, "learning_rate": 2.3778888284716193e-06, "loss": 0.6031323909759522, "memory(GiB)": 389.68, "step": 930, "token_acc": 0.8058429701765064, "train_speed(iter/s)": 0.054553 }, { "epoch": 0.9110840438489647, "grad_norm": 0.42932575941085815, "learning_rate": 2.138377833834404e-06, "loss": 0.5199082851409912, "memory(GiB)": 389.68, "step": 935, "token_acc": 0.837616269903831, "train_speed(iter/s)": 0.054552 }, { "epoch": 0.9159561510353228, "grad_norm": 0.6615188717842102, "learning_rate": 1.9113148466983254e-06, "loss": 0.6138844013214111, "memory(GiB)": 389.68, "step": 940, "token_acc": 0.8027118644067797, "train_speed(iter/s)": 0.054582 }, { "epoch": 0.9208282582216809, "grad_norm": 0.41028302907943726, "learning_rate": 1.696758922348979e-06, "loss": 0.5526364803314209, "memory(GiB)": 389.68, "step": 945, "token_acc": 0.8190247252747253, "train_speed(iter/s)": 0.054578 }, { "epoch": 0.925700365408039, "grad_norm": 0.48014047741889954, "learning_rate": 1.4947658631941309e-06, "loss": 0.49515771865844727, "memory(GiB)": 389.68, "step": 950, "token_acc": 0.832800851970181, "train_speed(iter/s)": 0.054557 }, { "epoch": 0.9305724725943971, "grad_norm": 0.6173512935638428, "learning_rate": 1.3053882042503796e-06, "loss": 0.5243947505950928, "memory(GiB)": 389.68, "step": 955, "token_acc": 0.8282737560625112, "train_speed(iter/s)": 0.054472 }, { "epoch": 0.9354445797807551, "grad_norm": 0.6899262070655823, "learning_rate": 1.1286751994797284e-06, "loss": 0.636317253112793, "memory(GiB)": 389.68, "step": 960, "token_acc": 0.8041509433962264, "train_speed(iter/s)": 0.05449 }, { "epoch": 0.9403166869671132, "grad_norm": 0.538864016532898, "learning_rate": 9.646728089794167e-07, "loss": 0.5281119823455811, "memory(GiB)": 389.68, "step": 965, "token_acc": 0.828132906054984, "train_speed(iter/s)": 0.054472 }, { "epoch": 0.9451887941534713, "grad_norm": 0.7353665828704834, "learning_rate": 8.134236870284861e-07, "loss": 0.6087577819824219, "memory(GiB)": 389.68, "step": 970, "token_acc": 0.8098674274207082, "train_speed(iter/s)": 0.054485 }, { "epoch": 0.9500609013398295, "grad_norm": 0.7473301887512207, "learning_rate": 6.749671709941008e-07, "loss": 0.6141918182373047, "memory(GiB)": 389.68, "step": 975, "token_acc": 0.8016149752248118, "train_speed(iter/s)": 0.054518 }, { "epoch": 0.9549330085261876, "grad_norm": 0.6487853527069092, "learning_rate": 5.493392711005796e-07, "loss": 0.5959615707397461, "memory(GiB)": 389.68, "step": 980, "token_acc": 0.8156642881413524, "train_speed(iter/s)": 0.054561 }, { "epoch": 0.9598051157125457, "grad_norm": 0.678453803062439, "learning_rate": 4.365726610637222e-07, "loss": 0.5411821842193604, "memory(GiB)": 389.68, "step": 985, "token_acc": 0.8313556274721323, "train_speed(iter/s)": 0.054544 }, { "epoch": 0.9646772228989038, "grad_norm": 0.5119591355323792, "learning_rate": 3.366966695929119e-07, "loss": 0.49676513671875, "memory(GiB)": 389.68, "step": 990, "token_acc": 0.8351805505899178, "train_speed(iter/s)": 0.054553 }, { "epoch": 0.9695493300852619, "grad_norm": 0.6289726495742798, "learning_rate": 2.4973727276323965e-07, "loss": 0.60072922706604, "memory(GiB)": 389.68, "step": 995, "token_acc": 0.8124610591900312, "train_speed(iter/s)": 0.054575 }, { "epoch": 0.97442143727162, "grad_norm": 0.5490319132804871, "learning_rate": 1.7571708725953596e-07, "loss": 0.5364939212799072, "memory(GiB)": 389.68, "step": 1000, "token_acc": 0.8235892221657346, "train_speed(iter/s)": 0.054556 }, { "epoch": 0.97442143727162, "eval_loss": 0.5838146805763245, "eval_runtime": 6.1207, "eval_samples_per_second": 0.654, "eval_steps_per_second": 0.654, "step": 1000 }, { "epoch": 0.9792935444579781, "grad_norm": 0.4941563010215759, "learning_rate": 1.1465536449415393e-07, "loss": 0.5735920906066895, "memory(GiB)": 389.68, "step": 1005, "token_acc": 0.8156670746634027, "train_speed(iter/s)": 0.054474 }, { "epoch": 0.9841656516443362, "grad_norm": 0.5679388046264648, "learning_rate": 6.656798560001343e-08, "loss": 0.5337845325469971, "memory(GiB)": 389.68, "step": 1010, "token_acc": 0.8183527641970666, "train_speed(iter/s)": 0.054492 }, { "epoch": 0.9890377588306942, "grad_norm": 0.43481603264808655, "learning_rate": 3.146745730015499e-08, "loss": 0.5338433265686036, "memory(GiB)": 389.68, "step": 1015, "token_acc": 0.8283907544701264, "train_speed(iter/s)": 0.054525 }, { "epoch": 0.9939098660170523, "grad_norm": 0.44339102506637573, "learning_rate": 9.362908654986235e-09, "loss": 0.5187356472015381, "memory(GiB)": 389.68, "step": 1020, "token_acc": 0.8316270566727605, "train_speed(iter/s)": 0.054538 }, { "epoch": 0.9987819732034104, "grad_norm": 0.7172895669937134, "learning_rate": 2.6008868793114817e-10, "loss": 0.5243105888366699, "memory(GiB)": 389.68, "step": 1025, "token_acc": 0.8462152666879591, "train_speed(iter/s)": 0.054564 }, { "epoch": 0.9997563946406821, "eval_loss": 0.5837547183036804, "eval_runtime": 6.0694, "eval_samples_per_second": 0.659, "eval_steps_per_second": 0.659, "step": 1026 } ], "logging_steps": 5, "max_steps": 1026, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.095035636732416e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }