| { |
| "best_global_step": 100, |
| "best_metric": 0.39695656, |
| "best_model_checkpoint": "/home/ubuntu/output/v31-20250504-001829/checkpoint-100", |
| "epoch": 5.376344086021505, |
| "eval_steps": 50, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.010752688172043012, |
| "grad_norm": 2.8397373471487466, |
| "learning_rate": 3.0303030303030305e-07, |
| "loss": 0.6789064407348633, |
| "memory(GiB)": 33.21, |
| "step": 1, |
| "token_acc": 0.7741811175337187, |
| "train_speed(iter/s)": 0.077627 |
| }, |
| { |
| "epoch": 0.053763440860215055, |
| "grad_norm": 3.004365830179039, |
| "learning_rate": 1.5151515151515152e-06, |
| "loss": 0.6730813980102539, |
| "memory(GiB)": 35.76, |
| "step": 5, |
| "token_acc": 0.8227406519132235, |
| "train_speed(iter/s)": 0.190932 |
| }, |
| { |
| "epoch": 0.10752688172043011, |
| "grad_norm": 2.515744462969929, |
| "learning_rate": 3.0303030303030305e-06, |
| "loss": 0.6481359958648681, |
| "memory(GiB)": 35.76, |
| "step": 10, |
| "token_acc": 0.8160835048515143, |
| "train_speed(iter/s)": 0.231618 |
| }, |
| { |
| "epoch": 0.16129032258064516, |
| "grad_norm": 1.748868361489095, |
| "learning_rate": 4.5454545454545455e-06, |
| "loss": 0.5695308208465576, |
| "memory(GiB)": 35.76, |
| "step": 15, |
| "token_acc": 0.8397823007530009, |
| "train_speed(iter/s)": 0.244031 |
| }, |
| { |
| "epoch": 0.21505376344086022, |
| "grad_norm": 1.0225907480921308, |
| "learning_rate": 6.060606060606061e-06, |
| "loss": 0.5106754302978516, |
| "memory(GiB)": 35.76, |
| "step": 20, |
| "token_acc": 0.8501878777977455, |
| "train_speed(iter/s)": 0.250238 |
| }, |
| { |
| "epoch": 0.26881720430107525, |
| "grad_norm": 1.2639964303465994, |
| "learning_rate": 7.5757575757575764e-06, |
| "loss": 0.4408127307891846, |
| "memory(GiB)": 35.76, |
| "step": 25, |
| "token_acc": 0.8509183536667113, |
| "train_speed(iter/s)": 0.254089 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "grad_norm": 0.8267338618549237, |
| "learning_rate": 9.090909090909091e-06, |
| "loss": 0.42465009689331057, |
| "memory(GiB)": 35.76, |
| "step": 30, |
| "token_acc": 0.8319305277221108, |
| "train_speed(iter/s)": 0.253029 |
| }, |
| { |
| "epoch": 0.3763440860215054, |
| "grad_norm": 0.7590056670512465, |
| "learning_rate": 9.999741584205621e-06, |
| "loss": 0.40682473182678225, |
| "memory(GiB)": 35.76, |
| "step": 35, |
| "token_acc": 0.8784954317130087, |
| "train_speed(iter/s)": 0.256755 |
| }, |
| { |
| "epoch": 0.43010752688172044, |
| "grad_norm": 0.7473548691186508, |
| "learning_rate": 9.99683471327489e-06, |
| "loss": 0.39803519248962405, |
| "memory(GiB)": 35.76, |
| "step": 40, |
| "token_acc": 0.8626253418413856, |
| "train_speed(iter/s)": 0.259978 |
| }, |
| { |
| "epoch": 0.4838709677419355, |
| "grad_norm": 0.7312727330719088, |
| "learning_rate": 9.99069983579947e-06, |
| "loss": 0.3946224689483643, |
| "memory(GiB)": 35.76, |
| "step": 45, |
| "token_acc": 0.8563264614993862, |
| "train_speed(iter/s)": 0.264063 |
| }, |
| { |
| "epoch": 0.5376344086021505, |
| "grad_norm": 0.680023295861087, |
| "learning_rate": 9.981340914973221e-06, |
| "loss": 0.3800630807876587, |
| "memory(GiB)": 35.76, |
| "step": 50, |
| "token_acc": 0.8677728496752732, |
| "train_speed(iter/s)": 0.265313 |
| }, |
| { |
| "epoch": 0.5376344086021505, |
| "eval_loss": 0.4121361970901489, |
| "eval_runtime": 1.1144, |
| "eval_samples_per_second": 14.357, |
| "eval_steps_per_second": 1.795, |
| "eval_token_acc": 0.862480083511895, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.5913978494623656, |
| "grad_norm": 0.6125116226090357, |
| "learning_rate": 9.968763996755115e-06, |
| "loss": 0.3797069787979126, |
| "memory(GiB)": 35.76, |
| "step": 55, |
| "token_acc": 0.8705602222369204, |
| "train_speed(iter/s)": 0.212247 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 0.6413274106626393, |
| "learning_rate": 9.952977205963496e-06, |
| "loss": 0.37576003074645997, |
| "memory(GiB)": 36.2, |
| "step": 60, |
| "token_acc": 0.8649204294525242, |
| "train_speed(iter/s)": 0.216167 |
| }, |
| { |
| "epoch": 0.6989247311827957, |
| "grad_norm": 0.6501908922939531, |
| "learning_rate": 9.93399074102735e-06, |
| "loss": 0.373861026763916, |
| "memory(GiB)": 36.2, |
| "step": 65, |
| "token_acc": 0.8693455722115829, |
| "train_speed(iter/s)": 0.21941 |
| }, |
| { |
| "epoch": 0.7526881720430108, |
| "grad_norm": 0.7004218187745562, |
| "learning_rate": 9.911816867398026e-06, |
| "loss": 0.3793942928314209, |
| "memory(GiB)": 36.2, |
| "step": 70, |
| "token_acc": 0.8730909406456092, |
| "train_speed(iter/s)": 0.222179 |
| }, |
| { |
| "epoch": 0.8064516129032258, |
| "grad_norm": 0.8166238670210874, |
| "learning_rate": 9.886469909625624e-06, |
| "loss": 0.3867968559265137, |
| "memory(GiB)": 36.2, |
| "step": 75, |
| "token_acc": 0.848757324712063, |
| "train_speed(iter/s)": 0.226416 |
| }, |
| { |
| "epoch": 0.8602150537634409, |
| "grad_norm": 0.6267634407532141, |
| "learning_rate": 9.857966242105194e-06, |
| "loss": 0.3607918739318848, |
| "memory(GiB)": 36.2, |
| "step": 80, |
| "token_acc": 0.8719132441966814, |
| "train_speed(iter/s)": 0.230221 |
| }, |
| { |
| "epoch": 0.9139784946236559, |
| "grad_norm": 0.7896216311488772, |
| "learning_rate": 9.8263242784987e-06, |
| "loss": 0.3733763933181763, |
| "memory(GiB)": 36.2, |
| "step": 85, |
| "token_acc": 0.86721273110227, |
| "train_speed(iter/s)": 0.232953 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "grad_norm": 0.6455395783876484, |
| "learning_rate": 9.791564459839609e-06, |
| "loss": 0.36534771919250486, |
| "memory(GiB)": 36.2, |
| "step": 90, |
| "token_acc": 0.8692492781520693, |
| "train_speed(iter/s)": 0.235041 |
| }, |
| { |
| "epoch": 1.021505376344086, |
| "grad_norm": 0.5393287418168153, |
| "learning_rate": 9.753709241327773e-06, |
| "loss": 0.3413947343826294, |
| "memory(GiB)": 36.2, |
| "step": 95, |
| "token_acc": 0.8809226932668329, |
| "train_speed(iter/s)": 0.238594 |
| }, |
| { |
| "epoch": 1.075268817204301, |
| "grad_norm": 0.5578267353989265, |
| "learning_rate": 9.712783077823144e-06, |
| "loss": 0.2999130725860596, |
| "memory(GiB)": 36.2, |
| "step": 100, |
| "token_acc": 0.876852251474608, |
| "train_speed(iter/s)": 0.240385 |
| }, |
| { |
| "epoch": 1.075268817204301, |
| "eval_loss": 0.39695656299591064, |
| "eval_runtime": 0.9623, |
| "eval_samples_per_second": 16.626, |
| "eval_steps_per_second": 2.078, |
| "eval_token_acc": 0.8658590187352343, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.129032258064516, |
| "grad_norm": 0.6232434629738186, |
| "learning_rate": 9.66881240804768e-06, |
| "loss": 0.2978543758392334, |
| "memory(GiB)": 36.2, |
| "step": 105, |
| "token_acc": 0.8903211301382695, |
| "train_speed(iter/s)": 0.216685 |
| }, |
| { |
| "epoch": 1.1827956989247312, |
| "grad_norm": 0.563444312933744, |
| "learning_rate": 9.62182563750565e-06, |
| "loss": 0.2764800786972046, |
| "memory(GiB)": 36.2, |
| "step": 110, |
| "token_acc": 0.8837461046416271, |
| "train_speed(iter/s)": 0.218683 |
| }, |
| { |
| "epoch": 1.2365591397849462, |
| "grad_norm": 0.634129779250207, |
| "learning_rate": 9.571853120133406e-06, |
| "loss": 0.2966769695281982, |
| "memory(GiB)": 36.2, |
| "step": 115, |
| "token_acc": 0.8817908276295341, |
| "train_speed(iter/s)": 0.220159 |
| }, |
| { |
| "epoch": 1.2903225806451613, |
| "grad_norm": 0.5952665579415249, |
| "learning_rate": 9.51892713869041e-06, |
| "loss": 0.2910531759262085, |
| "memory(GiB)": 36.2, |
| "step": 120, |
| "token_acc": 0.8890962995497969, |
| "train_speed(iter/s)": 0.221075 |
| }, |
| { |
| "epoch": 1.3440860215053765, |
| "grad_norm": 0.5889028303932383, |
| "learning_rate": 9.463081883904251e-06, |
| "loss": 0.2941020727157593, |
| "memory(GiB)": 36.2, |
| "step": 125, |
| "token_acc": 0.9022072348252606, |
| "train_speed(iter/s)": 0.222909 |
| }, |
| { |
| "epoch": 1.3978494623655915, |
| "grad_norm": 0.6117493527778533, |
| "learning_rate": 9.404353432383078e-06, |
| "loss": 0.299320125579834, |
| "memory(GiB)": 36.2, |
| "step": 130, |
| "token_acc": 0.9017176863081016, |
| "train_speed(iter/s)": 0.223869 |
| }, |
| { |
| "epoch": 1.4516129032258065, |
| "grad_norm": 0.6709733335774591, |
| "learning_rate": 9.342779723309746e-06, |
| "loss": 0.2946903228759766, |
| "memory(GiB)": 36.2, |
| "step": 135, |
| "token_acc": 0.8963614673426782, |
| "train_speed(iter/s)": 0.22548 |
| }, |
| { |
| "epoch": 1.5053763440860215, |
| "grad_norm": 0.5231401147766865, |
| "learning_rate": 9.278400533932703e-06, |
| "loss": 0.27523131370544435, |
| "memory(GiB)": 36.2, |
| "step": 140, |
| "token_acc": 0.9091817273635455, |
| "train_speed(iter/s)": 0.22685 |
| }, |
| { |
| "epoch": 1.5591397849462365, |
| "grad_norm": 0.6109308073405598, |
| "learning_rate": 9.211257453869495e-06, |
| "loss": 0.28516521453857424, |
| "memory(GiB)": 36.2, |
| "step": 145, |
| "token_acc": 0.9022033404140114, |
| "train_speed(iter/s)": 0.22851 |
| }, |
| { |
| "epoch": 1.6129032258064515, |
| "grad_norm": 0.61463883436549, |
| "learning_rate": 9.141393858239435e-06, |
| "loss": 0.28318946361541747, |
| "memory(GiB)": 36.2, |
| "step": 150, |
| "token_acc": 0.9105783567448795, |
| "train_speed(iter/s)": 0.22975 |
| }, |
| { |
| "epoch": 1.6129032258064515, |
| "eval_loss": 0.4003598093986511, |
| "eval_runtime": 0.9657, |
| "eval_samples_per_second": 16.569, |
| "eval_steps_per_second": 2.071, |
| "eval_token_acc": 0.8660787868798417, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.6132153447244343, |
| "learning_rate": 9.068854879642833e-06, |
| "loss": 0.2889599084854126, |
| "memory(GiB)": 36.32, |
| "step": 155, |
| "token_acc": 0.8814800662336009, |
| "train_speed(iter/s)": 0.208717 |
| }, |
| { |
| "epoch": 1.7204301075268817, |
| "grad_norm": 0.5971251557661211, |
| "learning_rate": 8.99368737900487e-06, |
| "loss": 0.2876766204833984, |
| "memory(GiB)": 36.32, |
| "step": 160, |
| "token_acc": 0.8966378835201175, |
| "train_speed(iter/s)": 0.209931 |
| }, |
| { |
| "epoch": 1.7741935483870968, |
| "grad_norm": 0.5748854455080764, |
| "learning_rate": 8.91593991530297e-06, |
| "loss": 0.3036654472351074, |
| "memory(GiB)": 36.32, |
| "step": 165, |
| "token_acc": 0.9018055115616092, |
| "train_speed(iter/s)": 0.211255 |
| }, |
| { |
| "epoch": 1.827956989247312, |
| "grad_norm": 0.5561505552546223, |
| "learning_rate": 8.835662714197182e-06, |
| "loss": 0.3028982639312744, |
| "memory(GiB)": 36.32, |
| "step": 170, |
| "token_acc": 0.9018227009113504, |
| "train_speed(iter/s)": 0.212718 |
| }, |
| { |
| "epoch": 1.881720430107527, |
| "grad_norm": 0.7317876505668975, |
| "learning_rate": 8.752907635583911e-06, |
| "loss": 0.29505395889282227, |
| "memory(GiB)": 36.32, |
| "step": 175, |
| "token_acc": 0.8887829072872949, |
| "train_speed(iter/s)": 0.214568 |
| }, |
| { |
| "epoch": 1.935483870967742, |
| "grad_norm": 0.5680265544473229, |
| "learning_rate": 8.667728140093876e-06, |
| "loss": 0.29898526668548586, |
| "memory(GiB)": 36.32, |
| "step": 180, |
| "token_acc": 0.8902275769745649, |
| "train_speed(iter/s)": 0.216215 |
| }, |
| { |
| "epoch": 1.989247311827957, |
| "grad_norm": 0.6258148139715933, |
| "learning_rate": 8.580179254555997e-06, |
| "loss": 0.2970327615737915, |
| "memory(GiB)": 36.32, |
| "step": 185, |
| "token_acc": 0.8959904359021519, |
| "train_speed(iter/s)": 0.217886 |
| }, |
| { |
| "epoch": 2.043010752688172, |
| "grad_norm": 0.6156380554712465, |
| "learning_rate": 8.490317536449497e-06, |
| "loss": 0.22337541580200196, |
| "memory(GiB)": 36.32, |
| "step": 190, |
| "token_acc": 0.9188696893614926, |
| "train_speed(iter/s)": 0.218922 |
| }, |
| { |
| "epoch": 2.096774193548387, |
| "grad_norm": 0.5388492710872453, |
| "learning_rate": 8.398201037367202e-06, |
| "loss": 0.20124404430389403, |
| "memory(GiB)": 36.32, |
| "step": 195, |
| "token_acc": 0.926163422957213, |
| "train_speed(iter/s)": 0.21976 |
| }, |
| { |
| "epoch": 2.150537634408602, |
| "grad_norm": 0.5983173894045656, |
| "learning_rate": 8.303889265513599e-06, |
| "loss": 0.20379652976989746, |
| "memory(GiB)": 36.32, |
| "step": 200, |
| "token_acc": 0.9316990932701508, |
| "train_speed(iter/s)": 0.220712 |
| }, |
| { |
| "epoch": 2.150537634408602, |
| "eval_loss": 0.42072147130966187, |
| "eval_runtime": 0.982, |
| "eval_samples_per_second": 16.293, |
| "eval_steps_per_second": 2.037, |
| "eval_token_acc": 0.8620405472226801, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.204301075268817, |
| "grad_norm": 0.6470473264794586, |
| "learning_rate": 8.20744314726193e-06, |
| "loss": 0.20558562278747558, |
| "memory(GiB)": 36.32, |
| "step": 205, |
| "token_acc": 0.9130314104639867, |
| "train_speed(iter/s)": 0.179558 |
| }, |
| { |
| "epoch": 2.258064516129032, |
| "grad_norm": 0.5880896656650852, |
| "learning_rate": 8.108924987795137e-06, |
| "loss": 0.18437005281448365, |
| "memory(GiB)": 36.32, |
| "step": 210, |
| "token_acc": 0.9324857899075801, |
| "train_speed(iter/s)": 0.181077 |
| }, |
| { |
| "epoch": 2.3118279569892475, |
| "grad_norm": 0.5477307460414933, |
| "learning_rate": 8.008398430856064e-06, |
| "loss": 0.18631315231323242, |
| "memory(GiB)": 36.32, |
| "step": 215, |
| "token_acc": 0.9362203632666266, |
| "train_speed(iter/s)": 0.182362 |
| }, |
| { |
| "epoch": 2.3655913978494625, |
| "grad_norm": 0.5227844014342666, |
| "learning_rate": 7.905928417632947e-06, |
| "loss": 0.20151617527008056, |
| "memory(GiB)": 36.32, |
| "step": 220, |
| "token_acc": 0.9247515563976645, |
| "train_speed(iter/s)": 0.183659 |
| }, |
| { |
| "epoch": 2.4193548387096775, |
| "grad_norm": 0.5262556350378147, |
| "learning_rate": 7.801581144806752e-06, |
| "loss": 0.1893543004989624, |
| "memory(GiB)": 36.32, |
| "step": 225, |
| "token_acc": 0.916718566189191, |
| "train_speed(iter/s)": 0.184894 |
| }, |
| { |
| "epoch": 2.4731182795698925, |
| "grad_norm": 0.5484675051032469, |
| "learning_rate": 7.695424021787412e-06, |
| "loss": 0.1866333603858948, |
| "memory(GiB)": 36.32, |
| "step": 230, |
| "token_acc": 0.924685033919424, |
| "train_speed(iter/s)": 0.186222 |
| }, |
| { |
| "epoch": 2.5268817204301075, |
| "grad_norm": 0.5628522228195585, |
| "learning_rate": 7.587525627166691e-06, |
| "loss": 0.19393479824066162, |
| "memory(GiB)": 36.32, |
| "step": 235, |
| "token_acc": 0.9374396347352709, |
| "train_speed(iter/s)": 0.187657 |
| }, |
| { |
| "epoch": 2.5806451612903225, |
| "grad_norm": 0.5335622599157593, |
| "learning_rate": 7.477955664415678e-06, |
| "loss": 0.19508060216903686, |
| "memory(GiB)": 36.32, |
| "step": 240, |
| "token_acc": 0.9316161484757817, |
| "train_speed(iter/s)": 0.189112 |
| }, |
| { |
| "epoch": 2.6344086021505375, |
| "grad_norm": 0.5245516097527467, |
| "learning_rate": 7.36678491685565e-06, |
| "loss": 0.19449775218963622, |
| "memory(GiB)": 36.32, |
| "step": 245, |
| "token_acc": 0.9377052300956551, |
| "train_speed(iter/s)": 0.19048 |
| }, |
| { |
| "epoch": 2.688172043010753, |
| "grad_norm": 0.5779503566004872, |
| "learning_rate": 7.254085201931305e-06, |
| "loss": 0.2031865119934082, |
| "memory(GiB)": 36.32, |
| "step": 250, |
| "token_acc": 0.921832884097035, |
| "train_speed(iter/s)": 0.191753 |
| }, |
| { |
| "epoch": 2.688172043010753, |
| "eval_loss": 0.43173694610595703, |
| "eval_runtime": 0.9856, |
| "eval_samples_per_second": 16.233, |
| "eval_steps_per_second": 2.029, |
| "eval_token_acc": 0.8627273226745783, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.741935483870968, |
| "grad_norm": 0.5128503859951468, |
| "learning_rate": 7.139929324815965e-06, |
| "loss": 0.19230486154556276, |
| "memory(GiB)": 36.78, |
| "step": 255, |
| "token_acc": 0.9164345403899722, |
| "train_speed(iter/s)": 0.164733 |
| }, |
| { |
| "epoch": 2.795698924731183, |
| "grad_norm": 0.5561171013285224, |
| "learning_rate": 7.024391031378686e-06, |
| "loss": 0.1845786452293396, |
| "memory(GiB)": 36.78, |
| "step": 260, |
| "token_acc": 0.9280777134317205, |
| "train_speed(iter/s)": 0.166048 |
| }, |
| { |
| "epoch": 2.849462365591398, |
| "grad_norm": 0.55054564008372, |
| "learning_rate": 6.907544960543659e-06, |
| "loss": 0.18752856254577638, |
| "memory(GiB)": 36.78, |
| "step": 265, |
| "token_acc": 0.928450923562746, |
| "train_speed(iter/s)": 0.167181 |
| }, |
| { |
| "epoch": 2.903225806451613, |
| "grad_norm": 0.5241738277865058, |
| "learning_rate": 6.7894665960727105e-06, |
| "loss": 0.19395242929458617, |
| "memory(GiB)": 36.78, |
| "step": 270, |
| "token_acc": 0.9294530630732646, |
| "train_speed(iter/s)": 0.168566 |
| }, |
| { |
| "epoch": 2.956989247311828, |
| "grad_norm": 0.5219243104191204, |
| "learning_rate": 6.670232217802011e-06, |
| "loss": 0.1912919521331787, |
| "memory(GiB)": 36.78, |
| "step": 275, |
| "token_acc": 0.9288971457524067, |
| "train_speed(iter/s)": 0.169806 |
| }, |
| { |
| "epoch": 3.010752688172043, |
| "grad_norm": 0.5873962294936487, |
| "learning_rate": 6.549918852364517e-06, |
| "loss": 0.18026410341262816, |
| "memory(GiB)": 36.78, |
| "step": 280, |
| "token_acc": 0.9238556338028169, |
| "train_speed(iter/s)": 0.170794 |
| }, |
| { |
| "epoch": 3.064516129032258, |
| "grad_norm": 0.5295922567078138, |
| "learning_rate": 6.42860422342998e-06, |
| "loss": 0.12432655096054077, |
| "memory(GiB)": 36.78, |
| "step": 285, |
| "token_acc": 0.9551729045111712, |
| "train_speed(iter/s)": 0.171865 |
| }, |
| { |
| "epoch": 3.118279569892473, |
| "grad_norm": 0.6075093437048568, |
| "learning_rate": 6.306366701494649e-06, |
| "loss": 0.12841488122940065, |
| "memory(GiB)": 36.78, |
| "step": 290, |
| "token_acc": 0.9396288908126011, |
| "train_speed(iter/s)": 0.172844 |
| }, |
| { |
| "epoch": 3.172043010752688, |
| "grad_norm": 0.5509357998666382, |
| "learning_rate": 6.183285253253135e-06, |
| "loss": 0.11821137666702271, |
| "memory(GiB)": 36.78, |
| "step": 295, |
| "token_acc": 0.9528064255501164, |
| "train_speed(iter/s)": 0.173786 |
| }, |
| { |
| "epoch": 3.225806451612903, |
| "grad_norm": 0.5108788614443498, |
| "learning_rate": 6.0594393905851065e-06, |
| "loss": 0.11771461963653565, |
| "memory(GiB)": 36.78, |
| "step": 300, |
| "token_acc": 0.9583916241707658, |
| "train_speed(iter/s)": 0.175078 |
| }, |
| { |
| "epoch": 3.225806451612903, |
| "eval_loss": 0.46923384070396423, |
| "eval_runtime": 0.9709, |
| "eval_samples_per_second": 16.48, |
| "eval_steps_per_second": 2.06, |
| "eval_token_acc": 0.8575078292401517, |
| "step": 300 |
| }, |
| { |
| "epoch": 3.279569892473118, |
| "grad_norm": 0.532375440211573, |
| "learning_rate": 5.934909119189806e-06, |
| "loss": 0.11486297845840454, |
| "memory(GiB)": 36.78, |
| "step": 305, |
| "token_acc": 0.9351425942962281, |
| "train_speed(iter/s)": 0.156217 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.4986724035910756, |
| "learning_rate": 5.809774886901538e-06, |
| "loss": 0.12363936901092529, |
| "memory(GiB)": 36.78, |
| "step": 310, |
| "token_acc": 0.9564072783584979, |
| "train_speed(iter/s)": 0.15723 |
| }, |
| { |
| "epoch": 3.3870967741935485, |
| "grad_norm": 0.5044548969667993, |
| "learning_rate": 5.684117531719552e-06, |
| "loss": 0.12493133544921875, |
| "memory(GiB)": 36.78, |
| "step": 315, |
| "token_acc": 0.9532926933974414, |
| "train_speed(iter/s)": 0.158289 |
| }, |
| { |
| "epoch": 3.4408602150537635, |
| "grad_norm": 0.6359389048574728, |
| "learning_rate": 5.558018229585856e-06, |
| "loss": 0.10656380653381348, |
| "memory(GiB)": 36.78, |
| "step": 320, |
| "token_acc": 0.9684046407116981, |
| "train_speed(iter/s)": 0.159481 |
| }, |
| { |
| "epoch": 3.4946236559139785, |
| "grad_norm": 0.5211147608340247, |
| "learning_rate": 5.431558441944731e-06, |
| "loss": 0.11835185289382935, |
| "memory(GiB)": 36.78, |
| "step": 325, |
| "token_acc": 0.9631512587952983, |
| "train_speed(iter/s)": 0.160534 |
| }, |
| { |
| "epoch": 3.5483870967741935, |
| "grad_norm": 0.5235745875213004, |
| "learning_rate": 5.304819863117796e-06, |
| "loss": 0.11486140489578248, |
| "memory(GiB)": 36.78, |
| "step": 330, |
| "token_acc": 0.9587179487179487, |
| "train_speed(iter/s)": 0.161595 |
| }, |
| { |
| "epoch": 3.6021505376344085, |
| "grad_norm": 0.5370422937987905, |
| "learning_rate": 5.177884367528637e-06, |
| "loss": 0.12060900926589965, |
| "memory(GiB)": 36.78, |
| "step": 335, |
| "token_acc": 0.9555168079153319, |
| "train_speed(iter/s)": 0.162556 |
| }, |
| { |
| "epoch": 3.6559139784946235, |
| "grad_norm": 0.5133061100021457, |
| "learning_rate": 5.0508339568111e-06, |
| "loss": 0.114243483543396, |
| "memory(GiB)": 36.78, |
| "step": 340, |
| "token_acc": 0.9510202454965726, |
| "train_speed(iter/s)": 0.163581 |
| }, |
| { |
| "epoch": 3.709677419354839, |
| "grad_norm": 0.5231222499336802, |
| "learning_rate": 4.923750706835371e-06, |
| "loss": 0.12267729043960571, |
| "memory(GiB)": 36.78, |
| "step": 345, |
| "token_acc": 0.9553353973168215, |
| "train_speed(iter/s)": 0.164482 |
| }, |
| { |
| "epoch": 3.763440860215054, |
| "grad_norm": 0.5089747349843902, |
| "learning_rate": 4.7967167146861446e-06, |
| "loss": 0.11303888559341431, |
| "memory(GiB)": 36.78, |
| "step": 350, |
| "token_acc": 0.9600354845863828, |
| "train_speed(iter/s)": 0.165357 |
| }, |
| { |
| "epoch": 3.763440860215054, |
| "eval_loss": 0.4767088294029236, |
| "eval_runtime": 0.9917, |
| "eval_samples_per_second": 16.134, |
| "eval_steps_per_second": 2.017, |
| "eval_token_acc": 0.8583869018185813, |
| "step": 350 |
| }, |
| { |
| "epoch": 3.817204301075269, |
| "grad_norm": 0.5352999649215254, |
| "learning_rate": 4.669814045627046e-06, |
| "loss": 0.11718583106994629, |
| "memory(GiB)": 36.78, |
| "step": 355, |
| "token_acc": 0.9347400235070241, |
| "train_speed(iter/s)": 0.150406 |
| }, |
| { |
| "epoch": 3.870967741935484, |
| "grad_norm": 0.573204238275855, |
| "learning_rate": 4.5431246800856455e-06, |
| "loss": 0.11329195499420167, |
| "memory(GiB)": 36.78, |
| "step": 360, |
| "token_acc": 0.9657640565712314, |
| "train_speed(iter/s)": 0.151423 |
| }, |
| { |
| "epoch": 3.924731182795699, |
| "grad_norm": 0.5454577647386208, |
| "learning_rate": 4.416730460693239e-06, |
| "loss": 0.11979327201843262, |
| "memory(GiB)": 36.78, |
| "step": 365, |
| "token_acc": 0.9548889088782945, |
| "train_speed(iter/s)": 0.15224 |
| }, |
| { |
| "epoch": 3.978494623655914, |
| "grad_norm": 0.4866198331616375, |
| "learning_rate": 4.290713039413684e-06, |
| "loss": 0.11887497901916504, |
| "memory(GiB)": 36.78, |
| "step": 370, |
| "token_acc": 0.9653520499108734, |
| "train_speed(iter/s)": 0.153223 |
| }, |
| { |
| "epoch": 4.032258064516129, |
| "grad_norm": 0.45037416589175444, |
| "learning_rate": 4.165153824795391e-06, |
| "loss": 0.09095752239227295, |
| "memory(GiB)": 36.78, |
| "step": 375, |
| "token_acc": 0.9678588797029046, |
| "train_speed(iter/s)": 0.154122 |
| }, |
| { |
| "epoch": 4.086021505376344, |
| "grad_norm": 0.485523239824137, |
| "learning_rate": 4.040133929380551e-06, |
| "loss": 0.077480149269104, |
| "memory(GiB)": 36.78, |
| "step": 380, |
| "token_acc": 0.975248480169835, |
| "train_speed(iter/s)": 0.154937 |
| }, |
| { |
| "epoch": 4.139784946236559, |
| "grad_norm": 0.42587539911230965, |
| "learning_rate": 3.915734117305624e-06, |
| "loss": 0.06480391025543213, |
| "memory(GiB)": 36.78, |
| "step": 385, |
| "token_acc": 0.9769721842225262, |
| "train_speed(iter/s)": 0.155951 |
| }, |
| { |
| "epoch": 4.193548387096774, |
| "grad_norm": 0.5141471588233666, |
| "learning_rate": 3.7920347521268514e-06, |
| "loss": 0.07736325263977051, |
| "memory(GiB)": 36.78, |
| "step": 390, |
| "token_acc": 0.9737575974258134, |
| "train_speed(iter/s)": 0.156692 |
| }, |
| { |
| "epoch": 4.247311827956989, |
| "grad_norm": 0.4741421109444635, |
| "learning_rate": 3.6691157449045915e-06, |
| "loss": 0.06898297071456909, |
| "memory(GiB)": 36.78, |
| "step": 395, |
| "token_acc": 0.9794690999585234, |
| "train_speed(iter/s)": 0.15764 |
| }, |
| { |
| "epoch": 4.301075268817204, |
| "grad_norm": 0.5032918290160332, |
| "learning_rate": 3.5470565025799515e-06, |
| "loss": 0.06421754360198975, |
| "memory(GiB)": 36.78, |
| "step": 400, |
| "token_acc": 0.9791707701398463, |
| "train_speed(iter/s)": 0.158589 |
| }, |
| { |
| "epoch": 4.301075268817204, |
| "eval_loss": 0.5137488842010498, |
| "eval_runtime": 0.9946, |
| "eval_samples_per_second": 16.086, |
| "eval_steps_per_second": 2.011, |
| "eval_token_acc": 0.8553925608483051, |
| "step": 400 |
| }, |
| { |
| "epoch": 4.354838709677419, |
| "grad_norm": 0.5064324935719208, |
| "learning_rate": 3.425935876677077e-06, |
| "loss": 0.06645252704620361, |
| "memory(GiB)": 36.78, |
| "step": 405, |
| "token_acc": 0.9447127229723071, |
| "train_speed(iter/s)": 0.146452 |
| }, |
| { |
| "epoch": 4.408602150537634, |
| "grad_norm": 0.4960758348047365, |
| "learning_rate": 3.305832112364268e-06, |
| "loss": 0.07083821892738343, |
| "memory(GiB)": 36.78, |
| "step": 410, |
| "token_acc": 0.9750937850485362, |
| "train_speed(iter/s)": 0.147329 |
| }, |
| { |
| "epoch": 4.462365591397849, |
| "grad_norm": 0.5091487033531963, |
| "learning_rate": 3.1868227979067985e-06, |
| "loss": 0.0703616976737976, |
| "memory(GiB)": 36.78, |
| "step": 415, |
| "token_acc": 0.9779349923316677, |
| "train_speed(iter/s)": 0.148154 |
| }, |
| { |
| "epoch": 4.516129032258064, |
| "grad_norm": 0.5085525658110539, |
| "learning_rate": 3.068984814544087e-06, |
| "loss": 0.07059448957443237, |
| "memory(GiB)": 36.78, |
| "step": 420, |
| "token_acc": 0.9705357142857143, |
| "train_speed(iter/s)": 0.148895 |
| }, |
| { |
| "epoch": 4.56989247311828, |
| "grad_norm": 0.5064686156678317, |
| "learning_rate": 2.9523942868236414e-06, |
| "loss": 0.07201706171035767, |
| "memory(GiB)": 36.78, |
| "step": 425, |
| "token_acc": 0.9695118947938728, |
| "train_speed(iter/s)": 0.149603 |
| }, |
| { |
| "epoch": 4.623655913978495, |
| "grad_norm": 0.5330742370752339, |
| "learning_rate": 2.8371265334238103e-06, |
| "loss": 0.06676008701324462, |
| "memory(GiB)": 36.78, |
| "step": 430, |
| "token_acc": 0.9743828804830403, |
| "train_speed(iter/s)": 0.150406 |
| }, |
| { |
| "epoch": 4.67741935483871, |
| "grad_norm": 0.48023829750195657, |
| "learning_rate": 2.7232560184971437e-06, |
| "loss": 0.07017686367034912, |
| "memory(GiB)": 36.78, |
| "step": 435, |
| "token_acc": 0.9749133183765042, |
| "train_speed(iter/s)": 0.1513 |
| }, |
| { |
| "epoch": 4.731182795698925, |
| "grad_norm": 0.538651125017694, |
| "learning_rate": 2.610856303565793e-06, |
| "loss": 0.0628254771232605, |
| "memory(GiB)": 36.78, |
| "step": 440, |
| "token_acc": 0.9788421297372704, |
| "train_speed(iter/s)": 0.15211 |
| }, |
| { |
| "epoch": 4.78494623655914, |
| "grad_norm": 0.538521707591603, |
| "learning_rate": 2.5000000000000015e-06, |
| "loss": 0.06735045909881592, |
| "memory(GiB)": 36.78, |
| "step": 445, |
| "token_acc": 0.9842489435267, |
| "train_speed(iter/s)": 0.152875 |
| }, |
| { |
| "epoch": 4.838709677419355, |
| "grad_norm": 0.44415093447519377, |
| "learning_rate": 2.390758722110418e-06, |
| "loss": 0.0701654613018036, |
| "memory(GiB)": 36.78, |
| "step": 450, |
| "token_acc": 0.9781991349556556, |
| "train_speed(iter/s)": 0.153703 |
| }, |
| { |
| "epoch": 4.838709677419355, |
| "eval_loss": 0.5138384103775024, |
| "eval_runtime": 0.987, |
| "eval_samples_per_second": 16.21, |
| "eval_steps_per_second": 2.026, |
| "eval_token_acc": 0.8553650898302291, |
| "step": 450 |
| }, |
| { |
| "epoch": 4.89247311827957, |
| "grad_norm": 0.44909839508545846, |
| "learning_rate": 2.283203040884524e-06, |
| "loss": 0.07383356690406799, |
| "memory(GiB)": 36.78, |
| "step": 455, |
| "token_acc": 0.9474127346079635, |
| "train_speed(iter/s)": 0.143427 |
| }, |
| { |
| "epoch": 4.946236559139785, |
| "grad_norm": 0.5540561379082162, |
| "learning_rate": 2.1774024383970372e-06, |
| "loss": 0.06765682101249695, |
| "memory(GiB)": 36.78, |
| "step": 460, |
| "token_acc": 0.9771299435028249, |
| "train_speed(iter/s)": 0.144189 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.4921194250215734, |
| "learning_rate": 2.0734252629237892e-06, |
| "loss": 0.062316888570785524, |
| "memory(GiB)": 36.78, |
| "step": 465, |
| "token_acc": 0.9770206022187005, |
| "train_speed(iter/s)": 0.144983 |
| }, |
| { |
| "epoch": 5.053763440860215, |
| "grad_norm": 0.3608411695035307, |
| "learning_rate": 1.971338684788034e-06, |
| "loss": 0.04745644629001618, |
| "memory(GiB)": 36.78, |
| "step": 470, |
| "token_acc": 0.9881954568969572, |
| "train_speed(iter/s)": 0.145689 |
| }, |
| { |
| "epoch": 5.10752688172043, |
| "grad_norm": 0.3455316573497327, |
| "learning_rate": 1.8712086529677214e-06, |
| "loss": 0.04306984841823578, |
| "memory(GiB)": 36.78, |
| "step": 475, |
| "token_acc": 0.9849460329483053, |
| "train_speed(iter/s)": 0.146458 |
| }, |
| { |
| "epoch": 5.161290322580645, |
| "grad_norm": 0.4335927828524962, |
| "learning_rate": 1.773099852491796e-06, |
| "loss": 0.03937138915061951, |
| "memory(GiB)": 36.78, |
| "step": 480, |
| "token_acc": 0.9869653767820774, |
| "train_speed(iter/s)": 0.147225 |
| }, |
| { |
| "epoch": 5.21505376344086, |
| "grad_norm": 0.4489096871443703, |
| "learning_rate": 1.6770756626529866e-06, |
| "loss": 0.04089862108230591, |
| "memory(GiB)": 36.78, |
| "step": 485, |
| "token_acc": 0.9865810708394632, |
| "train_speed(iter/s)": 0.147936 |
| }, |
| { |
| "epoch": 5.268817204301075, |
| "grad_norm": 0.422729324631405, |
| "learning_rate": 1.583198116064144e-06, |
| "loss": 0.046530479192733766, |
| "memory(GiB)": 36.78, |
| "step": 490, |
| "token_acc": 0.9874636404604858, |
| "train_speed(iter/s)": 0.148672 |
| }, |
| { |
| "epoch": 5.32258064516129, |
| "grad_norm": 0.41974244882967215, |
| "learning_rate": 1.491527858584535e-06, |
| "loss": 0.037504765391349795, |
| "memory(GiB)": 36.78, |
| "step": 495, |
| "token_acc": 0.9868473694738947, |
| "train_speed(iter/s)": 0.149349 |
| }, |
| { |
| "epoch": 5.376344086021505, |
| "grad_norm": 0.4026767578452857, |
| "learning_rate": 1.4021241101419863e-06, |
| "loss": 0.05219945907592773, |
| "memory(GiB)": 36.78, |
| "step": 500, |
| "token_acc": 0.9789432382945331, |
| "train_speed(iter/s)": 0.149951 |
| }, |
| { |
| "epoch": 5.376344086021505, |
| "eval_loss": 0.546004593372345, |
| "eval_runtime": 0.9994, |
| "eval_samples_per_second": 16.009, |
| "eval_steps_per_second": 2.001, |
| "eval_token_acc": 0.8528926982033954, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 651, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 7, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 43800612777984.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|