| {"loss": 1.56186473, "grad_norm": 39.59119464, "learning_rate": 1e-07, "token_acc": 0.69884491, "epoch": 0.00508906, "global_step/max_steps": "1/197", "percentage": "0.51%", "elapsed_time": "6s", "remaining_time": "22m 30s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.145114} | |
| {"loss": 1.32598543, "grad_norm": 38.84098093, "learning_rate": 2e-07, "token_acc": 0.74625623, "epoch": 0.01017812, "global_step/max_steps": "2/197", "percentage": "1.02%", "elapsed_time": "10s", "remaining_time": "17m 38s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.184269} | |
| {"loss": 1.4535327, "grad_norm": 35.5826905, "learning_rate": 3e-07, "token_acc": 0.69366199, "epoch": 0.01526718, "global_step/max_steps": "3/197", "percentage": "1.52%", "elapsed_time": "14s", "remaining_time": "15m 51s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.203788} | |
| {"loss": 1.35229957, "grad_norm": 37.43614132, "learning_rate": 4e-07, "token_acc": 0.73873121, "epoch": 0.02035623, "global_step/max_steps": "4/197", "percentage": "2.03%", "elapsed_time": "19s", "remaining_time": "15m 21s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.209397} | |
| {"loss": 1.3101356, "grad_norm": 38.96601942, "learning_rate": 5e-07, "token_acc": 0.73830295, "epoch": 0.02544529, "global_step/max_steps": "5/197", "percentage": "2.54%", "elapsed_time": "23s", "remaining_time": "14m 59s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.213456} | |
| {"loss": 1.31299543, "grad_norm": 36.69056841, "learning_rate": 6e-07, "token_acc": 0.74478328, "epoch": 0.03053435, "global_step/max_steps": "6/197", "percentage": "3.05%", "elapsed_time": "28s", "remaining_time": "15m 2s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.211716} | |
| {"loss": 1.22759962, "grad_norm": 36.54255095, "learning_rate": 7e-07, "token_acc": 0.73860914, "epoch": 0.03562341, "global_step/max_steps": "7/197", "percentage": "3.55%", "elapsed_time": "32s", "remaining_time": "14m 34s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.217362} | |
| {"loss": 1.21913123, "grad_norm": 36.46527005, "learning_rate": 8e-07, "token_acc": 0.72345483, "epoch": 0.04071247, "global_step/max_steps": "8/197", "percentage": "4.06%", "elapsed_time": "35s", "remaining_time": "14m 7s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.222907} | |
| {"loss": 1.18052411, "grad_norm": 31.38061783, "learning_rate": 9e-07, "token_acc": 0.74702382, "epoch": 0.04580153, "global_step/max_steps": "9/197", "percentage": "4.57%", "elapsed_time": "39s", "remaining_time": "13m 53s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.225676} | |
| {"loss": 1.12805116, "grad_norm": 30.29716818, "learning_rate": 1e-06, "token_acc": 0.75018758, "epoch": 0.05089059, "global_step/max_steps": "10/197", "percentage": "5.08%", "elapsed_time": "43s", "remaining_time": "13m 40s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.227832} | |
| {"loss": 1.01861393, "grad_norm": 24.72899767, "learning_rate": 1e-06, "token_acc": 0.7525692, "epoch": 0.05597964, "global_step/max_steps": "11/197", "percentage": "5.58%", "elapsed_time": "47s", "remaining_time": "13m 24s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.23133} | |
| {"loss": 1.03555191, "grad_norm": 24.02701249, "learning_rate": 1e-06, "token_acc": 0.73913044, "epoch": 0.0610687, "global_step/max_steps": "12/197", "percentage": "6.09%", "elapsed_time": "51s", "remaining_time": "13m 9s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.234206} | |
| {"loss": 1.08291483, "grad_norm": 21.85423574, "learning_rate": 1e-06, "token_acc": 0.72102648, "epoch": 0.06615776, "global_step/max_steps": "13/197", "percentage": "6.60%", "elapsed_time": "55s", "remaining_time": "13m 8s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.233263} | |
| {"loss": 0.95646894, "grad_norm": 18.44215847, "learning_rate": 1e-06, "token_acc": 0.73561543, "epoch": 0.07124682, "global_step/max_steps": "14/197", "percentage": "7.11%", "elapsed_time": "1m 0s", "remaining_time": "13m 5s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.233067} | |
| {"loss": 0.7082113, "grad_norm": 13.16483021, "learning_rate": 1e-06, "token_acc": 0.79611653, "epoch": 0.07633588, "global_step/max_steps": "15/197", "percentage": "7.61%", "elapsed_time": "1m 4s", "remaining_time": "13m 1s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.232908} | |
| {"loss": 0.89850032, "grad_norm": 14.23889889, "learning_rate": 1e-06, "token_acc": 0.75193197, "epoch": 0.08142494, "global_step/max_steps": "16/197", "percentage": "8.12%", "elapsed_time": "1m 9s", "remaining_time": "13m 1s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.231553} | |
| {"loss": 0.79899234, "grad_norm": 13.6752955, "learning_rate": 1e-06, "token_acc": 0.77467108, "epoch": 0.08651399, "global_step/max_steps": "17/197", "percentage": "8.63%", "elapsed_time": "1m 13s", "remaining_time": "12m 56s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.231796} | |
| {"loss": 0.72529602, "grad_norm": 10.70365521, "learning_rate": 1e-06, "token_acc": 0.80322129, "epoch": 0.09160305, "global_step/max_steps": "18/197", "percentage": "9.14%", "elapsed_time": "1m 17s", "remaining_time": "12m 49s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.232486} | |
| {"loss": 0.78419602, "grad_norm": 11.52388929, "learning_rate": 9.9e-07, "token_acc": 0.77392513, "epoch": 0.09669211, "global_step/max_steps": "19/197", "percentage": "9.64%", "elapsed_time": "1m 21s", "remaining_time": "12m 44s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.232904} | |
| {"loss": 0.65780902, "grad_norm": 10.48927551, "learning_rate": 9.9e-07, "token_acc": 0.80866963, "epoch": 0.10178117, "global_step/max_steps": "20/197", "percentage": "10.15%", "elapsed_time": "1m 25s", "remaining_time": "12m 40s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.232592} | |
| {"loss": 0.78328353, "grad_norm": 11.0767977, "learning_rate": 9.9e-07, "token_acc": 0.78807473, "epoch": 0.10687023, "global_step/max_steps": "21/197", "percentage": "10.66%", "elapsed_time": "1m 30s", "remaining_time": "12m 38s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.231932} | |
| {"loss": 0.70036429, "grad_norm": 10.46619032, "learning_rate": 9.9e-07, "token_acc": 0.78648233, "epoch": 0.11195929, "global_step/max_steps": "22/197", "percentage": "11.17%", "elapsed_time": "1m 34s", "remaining_time": "12m 31s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.232738} | |
| {"loss": 0.72342205, "grad_norm": 10.24050251, "learning_rate": 9.9e-07, "token_acc": 0.77769786, "epoch": 0.11704835, "global_step/max_steps": "23/197", "percentage": "11.68%", "elapsed_time": "1m 38s", "remaining_time": "12m 27s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.232715} | |
| {"loss": 0.80352741, "grad_norm": 10.76392762, "learning_rate": 9.9e-07, "token_acc": 0.76238966, "epoch": 0.1221374, "global_step/max_steps": "24/197", "percentage": "12.18%", "elapsed_time": "1m 42s", "remaining_time": "12m 20s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.233504} | |
| {"loss": 0.58958274, "grad_norm": 8.65309953, "learning_rate": 9.8e-07, "token_acc": 0.82078314, "epoch": 0.12722646, "global_step/max_steps": "25/197", "percentage": "12.69%", "elapsed_time": "1m 46s", "remaining_time": "12m 12s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.234694} | |
| {"loss": 0.66865003, "grad_norm": 9.51587858, "learning_rate": 9.8e-07, "token_acc": 0.79378319, "epoch": 0.13231552, "global_step/max_steps": "26/197", "percentage": "13.20%", "elapsed_time": "1m 51s", "remaining_time": "12m 10s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.233945} | |
| {"loss": 0.62936723, "grad_norm": 9.79993375, "learning_rate": 9.8e-07, "token_acc": 0.80546075, "epoch": 0.13740458, "global_step/max_steps": "27/197", "percentage": "13.71%", "elapsed_time": "1m 55s", "remaining_time": "12m 4s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.234592} | |
| {"loss": 0.6071704, "grad_norm": 9.42719151, "learning_rate": 9.8e-07, "token_acc": 0.82948625, "epoch": 0.14249364, "global_step/max_steps": "28/197", "percentage": "14.21%", "elapsed_time": "1m 58s", "remaining_time": "11m 56s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.235716} | |
| {"loss": 0.67437315, "grad_norm": 9.37922152, "learning_rate": 9.7e-07, "token_acc": 0.78918058, "epoch": 0.1475827, "global_step/max_steps": "29/197", "percentage": "14.72%", "elapsed_time": "2m 2s", "remaining_time": "11m 50s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.236374} | |
| {"loss": 0.71380234, "grad_norm": 8.93068867, "learning_rate": 9.7e-07, "token_acc": 0.79018492, "epoch": 0.15267176, "global_step/max_steps": "30/197", "percentage": "15.23%", "elapsed_time": "2m 6s", "remaining_time": "11m 44s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.237004} | |
| {"loss": 0.75912178, "grad_norm": 9.88176903, "learning_rate": 9.7e-07, "token_acc": 0.7719298, "epoch": 0.15776081, "global_step/max_steps": "31/197", "percentage": "15.74%", "elapsed_time": "2m 11s", "remaining_time": "11m 42s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.236422} | |
| {"loss": 0.64310205, "grad_norm": 9.16211553, "learning_rate": 9.7e-07, "token_acc": 0.80618399, "epoch": 0.16284987, "global_step/max_steps": "32/197", "percentage": "16.24%", "elapsed_time": "2m 14s", "remaining_time": "11m 35s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.237405} | |
| {"loss": 0.49775958, "grad_norm": 7.92908135, "learning_rate": 9.6e-07, "token_acc": 0.83740461, "epoch": 0.16793893, "global_step/max_steps": "33/197", "percentage": "16.75%", "elapsed_time": "2m 18s", "remaining_time": "11m 30s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.237438} | |
| {"loss": 0.52797121, "grad_norm": 8.31814317, "learning_rate": 9.6e-07, "token_acc": 0.82934129, "epoch": 0.17302799, "global_step/max_steps": "34/197", "percentage": "17.26%", "elapsed_time": "2m 24s", "remaining_time": "11m 32s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.235363} | |
| {"loss": 0.71943057, "grad_norm": 9.36161649, "learning_rate": 9.6e-07, "token_acc": 0.7912088, "epoch": 0.17811705, "global_step/max_steps": "35/197", "percentage": "17.77%", "elapsed_time": "2m 28s", "remaining_time": "11m 27s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.23565} | |
| {"loss": 0.73942018, "grad_norm": 10.728333, "learning_rate": 9.5e-07, "token_acc": 0.77711862, "epoch": 0.18320611, "global_step/max_steps": "36/197", "percentage": "18.27%", "elapsed_time": "2m 32s", "remaining_time": "11m 20s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.236466} | |
| {"loss": 0.58502871, "grad_norm": 8.06930709, "learning_rate": 9.5e-07, "token_acc": 0.81266665, "epoch": 0.18829517, "global_step/max_steps": "37/197", "percentage": "18.78%", "elapsed_time": "2m 36s", "remaining_time": "11m 15s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.237021} | |
| {"loss": 0.49855861, "grad_norm": 7.64531998, "learning_rate": 9.5e-07, "token_acc": 0.84546924, "epoch": 0.19338422, "global_step/max_steps": "38/197", "percentage": "19.29%", "elapsed_time": "2m 39s", "remaining_time": "11m 8s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.23777} | |
| {"loss": 0.59507209, "grad_norm": 7.46162063, "learning_rate": 9.4e-07, "token_acc": 0.82557279, "epoch": 0.19847328, "global_step/max_steps": "39/197", "percentage": "19.80%", "elapsed_time": "2m 43s", "remaining_time": "11m 3s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.238056} | |
| {"loss": 0.58693254, "grad_norm": 8.52796275, "learning_rate": 9.4e-07, "token_acc": 0.8185358, "epoch": 0.20356234, "global_step/max_steps": "40/197", "percentage": "20.30%", "elapsed_time": "2m 47s", "remaining_time": "10m 57s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.238653} | |
| {"loss": 0.57638621, "grad_norm": 8.49632825, "learning_rate": 9.3e-07, "token_acc": 0.82546037, "epoch": 0.2086514, "global_step/max_steps": "41/197", "percentage": "20.81%", "elapsed_time": "2m 51s", "remaining_time": "10m 53s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.238741} | |
| {"loss": 0.46660316, "grad_norm": 7.78411326, "learning_rate": 9.3e-07, "token_acc": 0.85930407, "epoch": 0.21374046, "global_step/max_steps": "42/197", "percentage": "21.32%", "elapsed_time": "2m 55s", "remaining_time": "10m 48s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.238843} | |
| {"loss": 0.5544827, "grad_norm": 8.12826297, "learning_rate": 9.3e-07, "token_acc": 0.82527882, "epoch": 0.21882952, "global_step/max_steps": "43/197", "percentage": "21.83%", "elapsed_time": "2m 59s", "remaining_time": "10m 43s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239454} | |
| {"loss": 0.45063287, "grad_norm": 6.59396832, "learning_rate": 9.2e-07, "token_acc": 0.8608644, "epoch": 0.22391858, "global_step/max_steps": "44/197", "percentage": "22.34%", "elapsed_time": "3m 4s", "remaining_time": "10m 42s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.238251} | |
| {"loss": 0.50231916, "grad_norm": 7.612875, "learning_rate": 9.2e-07, "token_acc": 0.83709091, "epoch": 0.22900763, "global_step/max_steps": "45/197", "percentage": "22.84%", "elapsed_time": "3m 8s", "remaining_time": "10m 37s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.238439} | |
| {"loss": 0.55189824, "grad_norm": 8.60476196, "learning_rate": 9.1e-07, "token_acc": 0.82421339, "epoch": 0.23409669, "global_step/max_steps": "46/197", "percentage": "23.35%", "elapsed_time": "3m 12s", "remaining_time": "10m 31s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239011} | |
| {"loss": 0.463889, "grad_norm": 6.58007246, "learning_rate": 9.1e-07, "token_acc": 0.85774946, "epoch": 0.23918575, "global_step/max_steps": "47/197", "percentage": "23.86%", "elapsed_time": "3m 16s", "remaining_time": "10m 26s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.2393} | |
| {"loss": 0.52507579, "grad_norm": 7.54505215, "learning_rate": 9e-07, "token_acc": 0.84202898, "epoch": 0.24427481, "global_step/max_steps": "48/197", "percentage": "24.37%", "elapsed_time": "3m 20s", "remaining_time": "10m 21s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239897} | |
| {"loss": 0.40262166, "grad_norm": 8.04490864, "learning_rate": 9e-07, "token_acc": 0.86885244, "epoch": 0.24936387, "global_step/max_steps": "49/197", "percentage": "24.87%", "elapsed_time": "3m 24s", "remaining_time": "10m 16s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.240054} | |
| {"loss": 0.56976736, "grad_norm": 7.53800764, "learning_rate": 8.9e-07, "token_acc": 0.81918007, "epoch": 0.25445293, "global_step/max_steps": "50/197", "percentage": "25.38%", "elapsed_time": "3m 28s", "remaining_time": "10m 14s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239405} | |
| {"loss": 0.53400099, "grad_norm": 7.94339553, "learning_rate": 8.9e-07, "token_acc": 0.82478005, "epoch": 0.25954198, "global_step/max_steps": "51/197", "percentage": "25.89%", "elapsed_time": "3m 33s", "remaining_time": "10m 10s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239112} | |
| {"loss": 0.43659717, "grad_norm": 7.31271639, "learning_rate": 8.8e-07, "token_acc": 0.85156846, "epoch": 0.26463104, "global_step/max_steps": "52/197", "percentage": "26.40%", "elapsed_time": "3m 37s", "remaining_time": "10m 7s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.238862} | |
| {"loss": 0.59125614, "grad_norm": 7.35061975, "learning_rate": 8.8e-07, "token_acc": 0.82305795, "epoch": 0.2697201, "global_step/max_steps": "53/197", "percentage": "26.90%", "elapsed_time": "3m 41s", "remaining_time": "10m 2s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239138} | |
| {"loss": 0.6188494, "grad_norm": 8.21182282, "learning_rate": 8.7e-07, "token_acc": 0.80929488, "epoch": 0.27480916, "global_step/max_steps": "54/197", "percentage": "27.41%", "elapsed_time": "3m 45s", "remaining_time": "9m 56s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239596} | |
| {"loss": 0.54387861, "grad_norm": 8.041679, "learning_rate": 8.6e-07, "token_acc": 0.82428116, "epoch": 0.27989822, "global_step/max_steps": "55/197", "percentage": "27.92%", "elapsed_time": "3m 49s", "remaining_time": "9m 52s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239602} | |
| {"loss": 0.66257948, "grad_norm": 8.05850094, "learning_rate": 8.6e-07, "token_acc": 0.79469746, "epoch": 0.28498728, "global_step/max_steps": "56/197", "percentage": "28.43%", "elapsed_time": "3m 55s", "remaining_time": "9m 53s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.237517} | |
| {"loss": 0.47140813, "grad_norm": 7.78915749, "learning_rate": 8.5e-07, "token_acc": 0.85053378, "epoch": 0.29007634, "global_step/max_steps": "57/197", "percentage": "28.93%", "elapsed_time": "3m 59s", "remaining_time": "9m 48s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.237915} | |
| {"loss": 0.69341075, "grad_norm": 9.05278764, "learning_rate": 8.5e-07, "token_acc": 0.78848642, "epoch": 0.29516539, "global_step/max_steps": "58/197", "percentage": "29.44%", "elapsed_time": "4m 3s", "remaining_time": "9m 43s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.238341} | |
| {"loss": 0.44947916, "grad_norm": 7.3690936, "learning_rate": 8.4e-07, "token_acc": 0.8565951, "epoch": 0.30025445, "global_step/max_steps": "59/197", "percentage": "29.95%", "elapsed_time": "4m 7s", "remaining_time": "9m 37s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.238856} | |
| {"loss": 0.55836773, "grad_norm": 7.37621212, "learning_rate": 8.3e-07, "token_acc": 0.82913166, "epoch": 0.30534351, "global_step/max_steps": "60/197", "percentage": "30.46%", "elapsed_time": "4m 11s", "remaining_time": "9m 33s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.23893} | |
| {"loss": 0.51242304, "grad_norm": 7.39215762, "learning_rate": 8.3e-07, "token_acc": 0.83225805, "epoch": 0.31043257, "global_step/max_steps": "61/197", "percentage": "30.96%", "elapsed_time": "4m 15s", "remaining_time": "9m 28s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239201} | |
| {"loss": 0.47241044, "grad_norm": 7.10270206, "learning_rate": 8.2e-07, "token_acc": 0.8399123, "epoch": 0.31552163, "global_step/max_steps": "62/197", "percentage": "31.47%", "elapsed_time": "4m 19s", "remaining_time": "9m 24s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239244} | |
| {"loss": 0.50952649, "grad_norm": 7.79568269, "learning_rate": 8.1e-07, "token_acc": 0.83558178, "epoch": 0.32061069, "global_step/max_steps": "63/197", "percentage": "31.98%", "elapsed_time": "4m 23s", "remaining_time": "9m 20s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239023} | |
| {"loss": 0.5506556, "grad_norm": 6.93642144, "learning_rate": 8.1e-07, "token_acc": 0.82535213, "epoch": 0.32569975, "global_step/max_steps": "64/197", "percentage": "32.49%", "elapsed_time": "4m 27s", "remaining_time": "9m 15s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239426} | |
| {"loss": 0.48943406, "grad_norm": 7.44007876, "learning_rate": 8e-07, "token_acc": 0.83781707, "epoch": 0.3307888, "global_step/max_steps": "65/197", "percentage": "32.99%", "elapsed_time": "4m 31s", "remaining_time": "9m 10s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.23969} | |
| {"loss": 0.44389236, "grad_norm": 6.86283233, "learning_rate": 7.9e-07, "token_acc": 0.85901165, "epoch": 0.33587786, "global_step/max_steps": "66/197", "percentage": "33.50%", "elapsed_time": "4m 35s", "remaining_time": "9m 7s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239357} | |
| {"loss": 0.53672677, "grad_norm": 7.50600339, "learning_rate": 7.9e-07, "token_acc": 0.82863671, "epoch": 0.34096692, "global_step/max_steps": "67/197", "percentage": "34.01%", "elapsed_time": "4m 39s", "remaining_time": "9m 2s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239598} | |
| {"loss": 0.49839738, "grad_norm": 8.09979607, "learning_rate": 7.8e-07, "token_acc": 0.83254719, "epoch": 0.34605598, "global_step/max_steps": "68/197", "percentage": "34.52%", "elapsed_time": "4m 43s", "remaining_time": "8m 57s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239852} | |
| {"loss": 0.68678546, "grad_norm": 7.81295691, "learning_rate": 7.7e-07, "token_acc": 0.78335452, "epoch": 0.35114504, "global_step/max_steps": "69/197", "percentage": "35.03%", "elapsed_time": "4m 48s", "remaining_time": "8m 54s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239391} | |
| {"loss": 0.45675319, "grad_norm": 6.71881763, "learning_rate": 7.7e-07, "token_acc": 0.85410768, "epoch": 0.3562341, "global_step/max_steps": "70/197", "percentage": "35.53%", "elapsed_time": "4m 51s", "remaining_time": "8m 49s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239824} | |
| {"loss": 0.41313371, "grad_norm": 6.50179619, "learning_rate": 7.6e-07, "token_acc": 0.86549705, "epoch": 0.36132316, "global_step/max_steps": "71/197", "percentage": "36.04%", "elapsed_time": "4m 55s", "remaining_time": "8m 45s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239984} | |
| {"loss": 0.61304581, "grad_norm": 8.78807293, "learning_rate": 7.5e-07, "token_acc": 0.81342757, "epoch": 0.36641221, "global_step/max_steps": "72/197", "percentage": "36.55%", "elapsed_time": "4m 59s", "remaining_time": "8m 40s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.240305} | |
| {"loss": 0.57236028, "grad_norm": 8.05481964, "learning_rate": 7.5e-07, "token_acc": 0.79859483, "epoch": 0.37150127, "global_step/max_steps": "73/197", "percentage": "37.06%", "elapsed_time": "5m 4s", "remaining_time": "8m 36s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.240076} | |
| {"loss": 0.47571295, "grad_norm": 7.24279553, "learning_rate": 7.4e-07, "token_acc": 0.84695512, "epoch": 0.37659033, "global_step/max_steps": "74/197", "percentage": "37.56%", "elapsed_time": "5m 7s", "remaining_time": "8m 31s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.240273} | |
| {"loss": 0.66273093, "grad_norm": 8.30879508, "learning_rate": 7.3e-07, "token_acc": 0.79322511, "epoch": 0.38167939, "global_step/max_steps": "75/197", "percentage": "38.07%", "elapsed_time": "5m 12s", "remaining_time": "8m 28s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239966} | |
| {"loss": 0.59378934, "grad_norm": 7.5903885, "learning_rate": 7.2e-07, "token_acc": 0.80882353, "epoch": 0.38676845, "global_step/max_steps": "76/197", "percentage": "38.58%", "elapsed_time": "5m 17s", "remaining_time": "8m 25s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239488} | |
| {"loss": 0.60118461, "grad_norm": 7.68385111, "learning_rate": 7.2e-07, "token_acc": 0.81849551, "epoch": 0.39185751, "global_step/max_steps": "77/197", "percentage": "39.09%", "elapsed_time": "5m 21s", "remaining_time": "8m 21s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239261} | |
| {"loss": 0.46121573, "grad_norm": 8.13003391, "learning_rate": 7.1e-07, "token_acc": 0.84978539, "epoch": 0.39694656, "global_step/max_steps": "78/197", "percentage": "39.59%", "elapsed_time": "5m 25s", "remaining_time": "8m 16s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239494} | |
| {"loss": 0.48830536, "grad_norm": 7.09559087, "learning_rate": 7e-07, "token_acc": 0.84571892, "epoch": 0.40203562, "global_step/max_steps": "79/197", "percentage": "40.10%", "elapsed_time": "5m 29s", "remaining_time": "8m 12s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239498} | |
| {"loss": 0.5752387, "grad_norm": 7.05887219, "learning_rate": 6.9e-07, "token_acc": 0.82199311, "epoch": 0.40712468, "global_step/max_steps": "80/197", "percentage": "40.61%", "elapsed_time": "5m 33s", "remaining_time": "8m 8s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.239715} | |
| {"loss": 0.53466225, "grad_norm": 8.69021068, "learning_rate": 6.8e-07, "token_acc": 0.83110118, "epoch": 0.41221374, "global_step/max_steps": "81/197", "percentage": "41.12%", "elapsed_time": "5m 37s", "remaining_time": "8m 3s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.240066} | |
| {"loss": 0.5009892, "grad_norm": 7.07128025, "learning_rate": 6.8e-07, "token_acc": 0.8460452, "epoch": 0.4173028, "global_step/max_steps": "82/197", "percentage": "41.62%", "elapsed_time": "5m 41s", "remaining_time": "7m 58s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.240414} | |
| {"loss": 0.56954145, "grad_norm": 7.66691989, "learning_rate": 6.7e-07, "token_acc": 0.8237952, "epoch": 0.42239186, "global_step/max_steps": "83/197", "percentage": "42.13%", "elapsed_time": "5m 44s", "remaining_time": "7m 53s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.240784} | |
| {"loss": 0.62128776, "grad_norm": 7.74692643, "learning_rate": 6.6e-07, "token_acc": 0.80148548, "epoch": 0.42748092, "global_step/max_steps": "84/197", "percentage": "42.64%", "elapsed_time": "5m 48s", "remaining_time": "7m 49s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.240802} | |
| {"loss": 0.52760661, "grad_norm": 7.38717803, "learning_rate": 6.5e-07, "token_acc": 0.83399522, "epoch": 0.43256997, "global_step/max_steps": "85/197", "percentage": "43.15%", "elapsed_time": "5m 52s", "remaining_time": "7m 44s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.240914} | |
| {"loss": 0.47682995, "grad_norm": 7.50630442, "learning_rate": 6.4e-07, "token_acc": 0.84940237, "epoch": 0.43765903, "global_step/max_steps": "86/197", "percentage": "43.65%", "elapsed_time": "5m 56s", "remaining_time": "7m 40s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.240935} | |
| {"loss": 0.57418931, "grad_norm": 7.01164545, "learning_rate": 6.4e-07, "token_acc": 0.82426471, "epoch": 0.44274809, "global_step/max_steps": "87/197", "percentage": "44.16%", "elapsed_time": "6m 0s", "remaining_time": "7m 35s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.241244} | |
| {"loss": 0.52362263, "grad_norm": 7.21888774, "learning_rate": 6.3e-07, "token_acc": 0.83516484, "epoch": 0.44783715, "global_step/max_steps": "88/197", "percentage": "44.67%", "elapsed_time": "6m 4s", "remaining_time": "7m 31s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.241543} | |
| {"loss": 0.61442173, "grad_norm": 7.30966496, "learning_rate": 6.2e-07, "token_acc": 0.80310518, "epoch": 0.45292621, "global_step/max_steps": "89/197", "percentage": "45.18%", "elapsed_time": "6m 8s", "remaining_time": "7m 26s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.241843} | |
| {"loss": 0.52015805, "grad_norm": 7.97431154, "learning_rate": 6.1e-07, "token_acc": 0.82348114, "epoch": 0.45801527, "global_step/max_steps": "90/197", "percentage": "45.69%", "elapsed_time": "6m 11s", "remaining_time": "7m 21s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242092} | |
| {"loss": 0.49261019, "grad_norm": 6.83350186, "learning_rate": 6e-07, "token_acc": 0.83924693, "epoch": 0.46310433, "global_step/max_steps": "91/197", "percentage": "46.19%", "elapsed_time": "6m 15s", "remaining_time": "7m 17s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242281} | |
| {"loss": 0.57088453, "grad_norm": 6.94790397, "learning_rate": 6e-07, "token_acc": 0.8121345, "epoch": 0.46819338, "global_step/max_steps": "92/197", "percentage": "46.70%", "elapsed_time": "6m 19s", "remaining_time": "7m 13s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242486} | |
| {"loss": 0.62811571, "grad_norm": 7.54898677, "learning_rate": 5.9e-07, "token_acc": 0.78829479, "epoch": 0.47328244, "global_step/max_steps": "93/197", "percentage": "47.21%", "elapsed_time": "6m 23s", "remaining_time": "7m 8s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242601} | |
| {"loss": 0.51668239, "grad_norm": 6.57021887, "learning_rate": 5.8e-07, "token_acc": 0.83604336, "epoch": 0.4783715, "global_step/max_steps": "94/197", "percentage": "47.72%", "elapsed_time": "6m 27s", "remaining_time": "7m 4s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242875} | |
| {"loss": 0.49430782, "grad_norm": 6.87910239, "learning_rate": 5.7e-07, "token_acc": 0.84044117, "epoch": 0.48346056, "global_step/max_steps": "95/197", "percentage": "48.22%", "elapsed_time": "6m 31s", "remaining_time": "7m 0s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242844} | |
| {"loss": 0.53970045, "grad_norm": 6.90003909, "learning_rate": 5.6e-07, "token_acc": 0.83358663, "epoch": 0.48854962, "global_step/max_steps": "96/197", "percentage": "48.73%", "elapsed_time": "6m 35s", "remaining_time": "6m 56s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242752} | |
| {"loss": 0.54623258, "grad_norm": 7.31177505, "learning_rate": 5.5e-07, "token_acc": 0.82109374, "epoch": 0.49363868, "global_step/max_steps": "97/197", "percentage": "49.24%", "elapsed_time": "6m 39s", "remaining_time": "6m 51s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242991} | |
| {"loss": 0.42079484, "grad_norm": 6.23400876, "learning_rate": 5.5e-07, "token_acc": 0.86011904, "epoch": 0.49872774, "global_step/max_steps": "98/197", "percentage": "49.75%", "elapsed_time": "6m 43s", "remaining_time": "6m 47s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242853} | |
| {"loss": 0.48717853, "grad_norm": 6.9063541, "learning_rate": 5.4e-07, "token_acc": 0.84262294, "epoch": 0.50381679, "global_step/max_steps": "99/197", "percentage": "50.25%", "elapsed_time": "6m 47s", "remaining_time": "6m 43s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243017} | |
| {"loss": 0.44850805, "grad_norm": 7.41969498, "learning_rate": 5.3e-07, "token_acc": 0.86020648, "epoch": 0.50890585, "global_step/max_steps": "100/197", "percentage": "50.76%", "elapsed_time": "6m 51s", "remaining_time": "6m 38s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243262} | |
| {"loss": 0.75324798, "grad_norm": 7.58024466, "learning_rate": 5.2e-07, "token_acc": 0.76174933, "epoch": 0.51399491, "global_step/max_steps": "101/197", "percentage": "51.27%", "elapsed_time": "6m 54s", "remaining_time": "6m 34s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243485} | |
| {"loss": 0.54602611, "grad_norm": 6.78586997, "learning_rate": 5.1e-07, "token_acc": 0.8271237, "epoch": 0.51908397, "global_step/max_steps": "102/197", "percentage": "51.78%", "elapsed_time": "6m 59s", "remaining_time": "6m 30s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243318} | |
| {"loss": 0.46133786, "grad_norm": 6.37700456, "learning_rate": 5e-07, "token_acc": 0.85125303, "epoch": 0.52417303, "global_step/max_steps": "103/197", "percentage": "52.28%", "elapsed_time": "7m 3s", "remaining_time": "6m 26s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243011} | |
| {"loss": 0.44735336, "grad_norm": 6.80201496, "learning_rate": 5e-07, "token_acc": 0.85795861, "epoch": 0.52926209, "global_step/max_steps": "104/197", "percentage": "52.79%", "elapsed_time": "7m 8s", "remaining_time": "6m 22s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242863} | |
| {"loss": 0.67596936, "grad_norm": 7.41810854, "learning_rate": 4.9e-07, "token_acc": 0.78414094, "epoch": 0.53435115, "global_step/max_steps": "105/197", "percentage": "53.30%", "elapsed_time": "7m 12s", "remaining_time": "6m 19s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242725} | |
| {"loss": 0.58565164, "grad_norm": 7.48968572, "learning_rate": 4.8e-07, "token_acc": 0.81172162, "epoch": 0.5394402, "global_step/max_steps": "106/197", "percentage": "53.81%", "elapsed_time": "7m 16s", "remaining_time": "6m 15s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242588} | |
| {"loss": 0.42829567, "grad_norm": 6.32886918, "learning_rate": 4.7e-07, "token_acc": 0.87125504, "epoch": 0.54452926, "global_step/max_steps": "107/197", "percentage": "54.31%", "elapsed_time": "7m 21s", "remaining_time": "6m 11s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242467} | |
| {"loss": 0.45178571, "grad_norm": 6.88650126, "learning_rate": 4.6e-07, "token_acc": 0.8500371, "epoch": 0.54961832, "global_step/max_steps": "108/197", "percentage": "54.82%", "elapsed_time": "7m 25s", "remaining_time": "6m 7s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242473} | |
| {"loss": 0.54950231, "grad_norm": 6.8077163, "learning_rate": 4.5e-07, "token_acc": 0.8218978, "epoch": 0.55470738, "global_step/max_steps": "109/197", "percentage": "55.33%", "elapsed_time": "7m 29s", "remaining_time": "6m 3s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242226} | |
| {"loss": 0.54568899, "grad_norm": 7.37972845, "learning_rate": 4.5e-07, "token_acc": 0.82914573, "epoch": 0.55979644, "global_step/max_steps": "110/197", "percentage": "55.84%", "elapsed_time": "7m 33s", "remaining_time": "5m 59s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242327} | |
| {"loss": 0.52254128, "grad_norm": 7.61592417, "learning_rate": 4.4e-07, "token_acc": 0.84045804, "epoch": 0.5648855, "global_step/max_steps": "111/197", "percentage": "56.35%", "elapsed_time": "7m 37s", "remaining_time": "5m 54s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242554} | |
| {"loss": 0.47077897, "grad_norm": 7.4223468, "learning_rate": 4.3e-07, "token_acc": 0.84137368, "epoch": 0.56997455, "global_step/max_steps": "112/197", "percentage": "56.85%", "elapsed_time": "7m 41s", "remaining_time": "5m 50s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242746} | |
| {"loss": 0.5297612, "grad_norm": 7.12394515, "learning_rate": 4.2e-07, "token_acc": 0.83923304, "epoch": 0.57506361, "global_step/max_steps": "113/197", "percentage": "57.36%", "elapsed_time": "7m 45s", "remaining_time": "5m 46s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242754} | |
| {"loss": 0.57527423, "grad_norm": 7.49584846, "learning_rate": 4.1e-07, "token_acc": 0.82128674, "epoch": 0.58015267, "global_step/max_steps": "114/197", "percentage": "57.87%", "elapsed_time": "7m 49s", "remaining_time": "5m 41s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242858} | |
| {"loss": 0.53880215, "grad_norm": 7.32766967, "learning_rate": 4e-07, "token_acc": 0.82852691, "epoch": 0.58524173, "global_step/max_steps": "115/197", "percentage": "58.38%", "elapsed_time": "7m 53s", "remaining_time": "5m 37s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.242962} | |
| {"loss": 0.67893004, "grad_norm": 9.61861334, "learning_rate": 4e-07, "token_acc": 0.78473282, "epoch": 0.59033079, "global_step/max_steps": "116/197", "percentage": "58.88%", "elapsed_time": "7m 57s", "remaining_time": "5m 33s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243181} | |
| {"loss": 0.54717386, "grad_norm": 7.17795262, "learning_rate": 3.9e-07, "token_acc": 0.83360928, "epoch": 0.59541985, "global_step/max_steps": "117/197", "percentage": "59.39%", "elapsed_time": "8m 0s", "remaining_time": "5m 28s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243253} | |
| {"loss": 0.50989783, "grad_norm": 8.05294557, "learning_rate": 3.8e-07, "token_acc": 0.82882881, "epoch": 0.60050891, "global_step/max_steps": "118/197", "percentage": "59.90%", "elapsed_time": "8m 5s", "remaining_time": "5m 24s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243185} | |
| {"loss": 0.34250611, "grad_norm": 6.0464214, "learning_rate": 3.7e-07, "token_acc": 0.87890327, "epoch": 0.60559796, "global_step/max_steps": "119/197", "percentage": "60.41%", "elapsed_time": "8m 9s", "remaining_time": "5m 20s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243305} | |
| {"loss": 0.46307933, "grad_norm": 6.57378637, "learning_rate": 3.6e-07, "token_acc": 0.86627042, "epoch": 0.61068702, "global_step/max_steps": "120/197", "percentage": "60.91%", "elapsed_time": "8m 13s", "remaining_time": "5m 16s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243263} | |
| {"loss": 0.45755184, "grad_norm": 8.39716076, "learning_rate": 3.6e-07, "token_acc": 0.84983498, "epoch": 0.61577608, "global_step/max_steps": "121/197", "percentage": "61.42%", "elapsed_time": "8m 17s", "remaining_time": "5m 12s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24323} | |
| {"loss": 0.4360773, "grad_norm": 6.4054163, "learning_rate": 3.5e-07, "token_acc": 0.85838151, "epoch": 0.62086514, "global_step/max_steps": "122/197", "percentage": "61.93%", "elapsed_time": "8m 21s", "remaining_time": "5m 8s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243441} | |
| {"loss": 0.59345126, "grad_norm": 6.54393617, "learning_rate": 3.4e-07, "token_acc": 0.81446111, "epoch": 0.6259542, "global_step/max_steps": "123/197", "percentage": "62.44%", "elapsed_time": "8m 25s", "remaining_time": "5m 3s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243447} | |
| {"loss": 0.51387501, "grad_norm": 6.57920888, "learning_rate": 3.3e-07, "token_acc": 0.83321196, "epoch": 0.63104326, "global_step/max_steps": "124/197", "percentage": "62.94%", "elapsed_time": "8m 28s", "remaining_time": "4m 59s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243622} | |
| {"loss": 0.51637888, "grad_norm": 6.53132468, "learning_rate": 3.2e-07, "token_acc": 0.83906364, "epoch": 0.63613232, "global_step/max_steps": "125/197", "percentage": "63.45%", "elapsed_time": "8m 32s", "remaining_time": "4m 55s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24383} | |
| {"loss": 0.54957986, "grad_norm": 7.24353519, "learning_rate": 3.2e-07, "token_acc": 0.83049536, "epoch": 0.64122137, "global_step/max_steps": "126/197", "percentage": "63.96%", "elapsed_time": "8m 37s", "remaining_time": "4m 51s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243608} | |
| {"loss": 0.63409388, "grad_norm": 8.07354679, "learning_rate": 3.1e-07, "token_acc": 0.79635257, "epoch": 0.64631043, "global_step/max_steps": "127/197", "percentage": "64.47%", "elapsed_time": "8m 41s", "remaining_time": "4m 47s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243643} | |
| {"loss": 0.5798468, "grad_norm": 6.90645549, "learning_rate": 3e-07, "token_acc": 0.81764704, "epoch": 0.65139949, "global_step/max_steps": "128/197", "percentage": "64.97%", "elapsed_time": "8m 44s", "remaining_time": "4m 42s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24389} | |
| {"loss": 0.58569705, "grad_norm": 7.16163026, "learning_rate": 2.9e-07, "token_acc": 0.81284261, "epoch": 0.65648855, "global_step/max_steps": "129/197", "percentage": "65.48%", "elapsed_time": "8m 48s", "remaining_time": "4m 38s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24407} | |
| {"loss": 0.5146997, "grad_norm": 7.24870757, "learning_rate": 2.8e-07, "token_acc": 0.83619213, "epoch": 0.66157761, "global_step/max_steps": "130/197", "percentage": "65.99%", "elapsed_time": "8m 52s", "remaining_time": "4m 34s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244273} | |
| {"loss": 0.52068889, "grad_norm": 7.05546712, "learning_rate": 2.8e-07, "token_acc": 0.83408749, "epoch": 0.66666667, "global_step/max_steps": "131/197", "percentage": "66.50%", "elapsed_time": "8m 56s", "remaining_time": "4m 30s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244329} | |
| {"loss": 0.61376828, "grad_norm": 6.85333394, "learning_rate": 2.7e-07, "token_acc": 0.81409812, "epoch": 0.67175573, "global_step/max_steps": "132/197", "percentage": "67.01%", "elapsed_time": "9m 0s", "remaining_time": "4m 25s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244395} | |
| {"loss": 0.47430393, "grad_norm": 6.08416387, "learning_rate": 2.6e-07, "token_acc": 0.84733385, "epoch": 0.67684478, "global_step/max_steps": "133/197", "percentage": "67.51%", "elapsed_time": "9m 5s", "remaining_time": "4m 22s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243949} | |
| {"loss": 0.42840767, "grad_norm": 6.2043626, "learning_rate": 2.5e-07, "token_acc": 0.85558778, "epoch": 0.68193384, "global_step/max_steps": "134/197", "percentage": "68.02%", "elapsed_time": "9m 8s", "remaining_time": "4m 18s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24409} | |
| {"loss": 0.55414164, "grad_norm": 6.47501131, "learning_rate": 2.5e-07, "token_acc": 0.82836789, "epoch": 0.6870229, "global_step/max_steps": "135/197", "percentage": "68.53%", "elapsed_time": "9m 12s", "remaining_time": "4m 13s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244218} | |
| {"loss": 0.39828593, "grad_norm": 6.29896089, "learning_rate": 2.4e-07, "token_acc": 0.86433792, "epoch": 0.69211196, "global_step/max_steps": "136/197", "percentage": "69.04%", "elapsed_time": "9m 16s", "remaining_time": "4m 9s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244396} | |
| {"loss": 0.52512676, "grad_norm": 6.71873406, "learning_rate": 2.3e-07, "token_acc": 0.82071096, "epoch": 0.69720102, "global_step/max_steps": "137/197", "percentage": "69.54%", "elapsed_time": "9m 20s", "remaining_time": "4m 5s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244508} | |
| {"loss": 0.48356572, "grad_norm": 6.49569449, "learning_rate": 2.3e-07, "token_acc": 0.84665698, "epoch": 0.70229008, "global_step/max_steps": "138/197", "percentage": "70.05%", "elapsed_time": "9m 23s", "remaining_time": "4m 1s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244687} | |
| {"loss": 0.57873267, "grad_norm": 6.41568111, "learning_rate": 2.2e-07, "token_acc": 0.81084573, "epoch": 0.70737913, "global_step/max_steps": "139/197", "percentage": "70.56%", "elapsed_time": "9m 28s", "remaining_time": "3m 57s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244573} | |
| {"loss": 0.39413959, "grad_norm": 6.2334649, "learning_rate": 2.1e-07, "token_acc": 0.87471527, "epoch": 0.71246819, "global_step/max_steps": "140/197", "percentage": "71.07%", "elapsed_time": "9m 32s", "remaining_time": "3m 53s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244499} | |
| {"loss": 0.46607459, "grad_norm": 6.40814045, "learning_rate": 2.1e-07, "token_acc": 0.85933149, "epoch": 0.71755725, "global_step/max_steps": "141/197", "percentage": "71.57%", "elapsed_time": "9m 37s", "remaining_time": "3m 49s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244282} | |
| {"loss": 0.47638828, "grad_norm": 6.28760019, "learning_rate": 2e-07, "token_acc": 0.84839433, "epoch": 0.72264631, "global_step/max_steps": "142/197", "percentage": "72.08%", "elapsed_time": "9m 40s", "remaining_time": "3m 44s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244452} | |
| {"loss": 0.59133214, "grad_norm": 6.81863549, "learning_rate": 1.9e-07, "token_acc": 0.82158273, "epoch": 0.72773537, "global_step/max_steps": "143/197", "percentage": "72.59%", "elapsed_time": "9m 44s", "remaining_time": "3m 40s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244611} | |
| {"loss": 0.43834746, "grad_norm": 6.84149097, "learning_rate": 1.9e-07, "token_acc": 0.85504884, "epoch": 0.73282443, "global_step/max_steps": "144/197", "percentage": "73.10%", "elapsed_time": "9m 48s", "remaining_time": "3m 36s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244639} | |
| {"loss": 0.57960486, "grad_norm": 6.94281742, "learning_rate": 1.8e-07, "token_acc": 0.81653929, "epoch": 0.73791349, "global_step/max_steps": "145/197", "percentage": "73.60%", "elapsed_time": "9m 52s", "remaining_time": "3m 32s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244633} | |
| {"loss": 0.67013347, "grad_norm": 6.7613004, "learning_rate": 1.7e-07, "token_acc": 0.78359622, "epoch": 0.74300254, "global_step/max_steps": "146/197", "percentage": "74.11%", "elapsed_time": "9m 56s", "remaining_time": "3m 28s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244591} | |
| {"loss": 0.40048811, "grad_norm": 7.03417155, "learning_rate": 1.7e-07, "token_acc": 0.85413152, "epoch": 0.7480916, "global_step/max_steps": "147/197", "percentage": "74.62%", "elapsed_time": "10m 1s", "remaining_time": "3m 24s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244496} | |
| {"loss": 0.55576533, "grad_norm": 7.27841043, "learning_rate": 1.6e-07, "token_acc": 0.82662767, "epoch": 0.75318066, "global_step/max_steps": "148/197", "percentage": "75.13%", "elapsed_time": "10m 5s", "remaining_time": "3m 20s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244418} | |
| {"loss": 0.48110685, "grad_norm": 7.39271505, "learning_rate": 1.5e-07, "token_acc": 0.84879726, "epoch": 0.75826972, "global_step/max_steps": "149/197", "percentage": "75.63%", "elapsed_time": "10m 9s", "remaining_time": "3m 16s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244372} | |
| {"loss": 0.51099169, "grad_norm": 6.82061016, "learning_rate": 1.5e-07, "token_acc": 0.83235723, "epoch": 0.76335878, "global_step/max_steps": "150/197", "percentage": "76.14%", "elapsed_time": "10m 13s", "remaining_time": "3m 12s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244526} | |
| {"loss": 0.52887177, "grad_norm": 6.75300603, "learning_rate": 1.4e-07, "token_acc": 0.83091438, "epoch": 0.76844784, "global_step/max_steps": "151/197", "percentage": "76.65%", "elapsed_time": "10m 18s", "remaining_time": "3m 8s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244248} | |
| {"loss": 0.50057006, "grad_norm": 7.2945453, "learning_rate": 1.4e-07, "token_acc": 0.83188635, "epoch": 0.7735369, "global_step/max_steps": "152/197", "percentage": "77.16%", "elapsed_time": "10m 23s", "remaining_time": "3m 4s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24389} | |
| {"loss": 0.35479891, "grad_norm": 7.88559088, "learning_rate": 1.3e-07, "token_acc": 0.88328075, "epoch": 0.77862595, "global_step/max_steps": "153/197", "percentage": "77.66%", "elapsed_time": "10m 26s", "remaining_time": "3m 0s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244052} | |
| {"loss": 0.50404, "grad_norm": 6.72471234, "learning_rate": 1.2e-07, "token_acc": 0.8365922, "epoch": 0.78371501, "global_step/max_steps": "154/197", "percentage": "78.17%", "elapsed_time": "10m 31s", "remaining_time": "2m 56s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243737} | |
| {"loss": 0.50706899, "grad_norm": 6.45571072, "learning_rate": 1.2e-07, "token_acc": 0.82493186, "epoch": 0.78880407, "global_step/max_steps": "155/197", "percentage": "78.68%", "elapsed_time": "10m 35s", "remaining_time": "2m 52s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243763} | |
| {"loss": 0.52005351, "grad_norm": 7.06880016, "learning_rate": 1.1e-07, "token_acc": 0.83655274, "epoch": 0.79389313, "global_step/max_steps": "156/197", "percentage": "79.19%", "elapsed_time": "10m 40s", "remaining_time": "2m 48s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243717} | |
| {"loss": 0.40978032, "grad_norm": 6.85008129, "learning_rate": 1.1e-07, "token_acc": 0.86201161, "epoch": 0.79898219, "global_step/max_steps": "157/197", "percentage": "79.70%", "elapsed_time": "10m 44s", "remaining_time": "2m 44s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243747} | |
| {"loss": 0.54613638, "grad_norm": 7.60922457, "learning_rate": 1e-07, "token_acc": 0.82715076, "epoch": 0.80407125, "global_step/max_steps": "158/197", "percentage": "80.20%", "elapsed_time": "10m 48s", "remaining_time": "2m 40s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243666} | |
| {"loss": 0.47203174, "grad_norm": 6.3126241, "learning_rate": 1e-07, "token_acc": 0.8570348, "epoch": 0.80916031, "global_step/max_steps": "159/197", "percentage": "80.71%", "elapsed_time": "10m 52s", "remaining_time": "2m 35s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243596} | |
| {"loss": 0.50006372, "grad_norm": 6.96486743, "learning_rate": 9e-08, "token_acc": 0.83797288, "epoch": 0.81424936, "global_step/max_steps": "160/197", "percentage": "81.22%", "elapsed_time": "10m 56s", "remaining_time": "2m 31s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24363} | |
| {"loss": 0.40088487, "grad_norm": 6.45175114, "learning_rate": 9e-08, "token_acc": 0.86931819, "epoch": 0.81933842, "global_step/max_steps": "161/197", "percentage": "81.73%", "elapsed_time": "11m 0s", "remaining_time": "2m 27s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243648} | |
| {"loss": 0.52439237, "grad_norm": 7.1501308, "learning_rate": 8e-08, "token_acc": 0.83977902, "epoch": 0.82442748, "global_step/max_steps": "162/197", "percentage": "82.23%", "elapsed_time": "11m 4s", "remaining_time": "2m 23s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243661} | |
| {"loss": 0.69401765, "grad_norm": 6.74886782, "learning_rate": 8e-08, "token_acc": 0.79217744, "epoch": 0.82951654, "global_step/max_steps": "163/197", "percentage": "82.74%", "elapsed_time": "11m 9s", "remaining_time": "2m 19s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243466} | |
| {"loss": 0.52588379, "grad_norm": 7.15071543, "learning_rate": 7e-08, "token_acc": 0.83565891, "epoch": 0.8346056, "global_step/max_steps": "164/197", "percentage": "83.25%", "elapsed_time": "11m 13s", "remaining_time": "2m 15s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24342} | |
| {"loss": 0.4019556, "grad_norm": 6.09615029, "learning_rate": 7e-08, "token_acc": 0.8778317, "epoch": 0.83969466, "global_step/max_steps": "165/197", "percentage": "83.76%", "elapsed_time": "11m 17s", "remaining_time": "2m 11s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243464} | |
| {"loss": 0.51561689, "grad_norm": 7.24255851, "learning_rate": 7e-08, "token_acc": 0.8408916, "epoch": 0.84478372, "global_step/max_steps": "166/197", "percentage": "84.26%", "elapsed_time": "11m 21s", "remaining_time": "2m 7s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243472} | |
| {"loss": 0.45759654, "grad_norm": 6.89571244, "learning_rate": 6e-08, "token_acc": 0.85201794, "epoch": 0.84987277, "global_step/max_steps": "167/197", "percentage": "84.77%", "elapsed_time": "11m 25s", "remaining_time": "2m 3s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243521} | |
| {"loss": 0.50834095, "grad_norm": 6.89282576, "learning_rate": 6e-08, "token_acc": 0.83758193, "epoch": 0.85496183, "global_step/max_steps": "168/197", "percentage": "85.28%", "elapsed_time": "11m 30s", "remaining_time": "1m 59s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243452} | |
| {"loss": 0.44572502, "grad_norm": 6.76626351, "learning_rate": 5e-08, "token_acc": 0.85768074, "epoch": 0.86005089, "global_step/max_steps": "169/197", "percentage": "85.79%", "elapsed_time": "11m 34s", "remaining_time": "1m 55s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243438} | |
| {"loss": 0.4539907, "grad_norm": 6.00128019, "learning_rate": 5e-08, "token_acc": 0.86046511, "epoch": 0.86513995, "global_step/max_steps": "170/197", "percentage": "86.29%", "elapsed_time": "11m 37s", "remaining_time": "1m 50s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243602} | |
| {"loss": 0.46429244, "grad_norm": 7.14211423, "learning_rate": 5e-08, "token_acc": 0.84323043, "epoch": 0.87022901, "global_step/max_steps": "171/197", "percentage": "86.80%", "elapsed_time": "11m 41s", "remaining_time": "1m 46s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243678} | |
| {"loss": 0.38561523, "grad_norm": 5.60280583, "learning_rate": 4e-08, "token_acc": 0.88394719, "epoch": 0.87531807, "global_step/max_steps": "172/197", "percentage": "87.31%", "elapsed_time": "11m 45s", "remaining_time": "1m 42s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243712} | |
| {"loss": 0.46086422, "grad_norm": 6.33931144, "learning_rate": 4e-08, "token_acc": 0.84214503, "epoch": 0.88040712, "global_step/max_steps": "173/197", "percentage": "87.82%", "elapsed_time": "11m 50s", "remaining_time": "1m 38s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243442} | |
| {"loss": 0.45966834, "grad_norm": 6.54509415, "learning_rate": 4e-08, "token_acc": 0.85967743, "epoch": 0.88549618, "global_step/max_steps": "174/197", "percentage": "88.32%", "elapsed_time": "11m 54s", "remaining_time": "1m 34s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243587} | |
| {"loss": 0.429304, "grad_norm": 6.43417319, "learning_rate": 3e-08, "token_acc": 0.8674897, "epoch": 0.89058524, "global_step/max_steps": "175/197", "percentage": "88.83%", "elapsed_time": "11m 58s", "remaining_time": "1m 30s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243568} | |
| {"loss": 0.47356996, "grad_norm": 6.74307223, "learning_rate": 3e-08, "token_acc": 0.84972024, "epoch": 0.8956743, "global_step/max_steps": "176/197", "percentage": "89.34%", "elapsed_time": "12m 2s", "remaining_time": "1m 26s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24373} | |
| {"loss": 0.4741419, "grad_norm": 6.38713943, "learning_rate": 3e-08, "token_acc": 0.8517192, "epoch": 0.90076336, "global_step/max_steps": "177/197", "percentage": "89.85%", "elapsed_time": "12m 6s", "remaining_time": "1m 22s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243761} | |
| {"loss": 0.48497796, "grad_norm": 6.61100064, "learning_rate": 3e-08, "token_acc": 0.83670169, "epoch": 0.90585242, "global_step/max_steps": "178/197", "percentage": "90.36%", "elapsed_time": "12m 9s", "remaining_time": "1m 17s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243874} | |
| {"loss": 0.42370287, "grad_norm": 6.17486043, "learning_rate": 2e-08, "token_acc": 0.87003613, "epoch": 0.91094148, "global_step/max_steps": "179/197", "percentage": "90.86%", "elapsed_time": "12m 13s", "remaining_time": "1m 13s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243921} | |
| {"loss": 0.48196888, "grad_norm": 6.92904695, "learning_rate": 2e-08, "token_acc": 0.8519398, "epoch": 0.91603053, "global_step/max_steps": "180/197", "percentage": "91.37%", "elapsed_time": "12m 17s", "remaining_time": "1m 9s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243915} | |
| {"loss": 0.48997873, "grad_norm": 6.37320599, "learning_rate": 2e-08, "token_acc": 0.86093187, "epoch": 0.92111959, "global_step/max_steps": "181/197", "percentage": "91.88%", "elapsed_time": "12m 21s", "remaining_time": "1m 5s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244065} | |
| {"loss": 0.52174777, "grad_norm": 6.57050019, "learning_rate": 2e-08, "token_acc": 0.83417088, "epoch": 0.92620865, "global_step/max_steps": "182/197", "percentage": "92.39%", "elapsed_time": "12m 25s", "remaining_time": "1m 1s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244206} | |
| {"loss": 0.4388586, "grad_norm": 7.40764781, "learning_rate": 1e-08, "token_acc": 0.84475642, "epoch": 0.93129771, "global_step/max_steps": "183/197", "percentage": "92.89%", "elapsed_time": "12m 29s", "remaining_time": "57s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24428} | |
| {"loss": 0.44787735, "grad_norm": 6.4602726, "learning_rate": 1e-08, "token_acc": 0.85474008, "epoch": 0.93638677, "global_step/max_steps": "184/197", "percentage": "93.40%", "elapsed_time": "12m 33s", "remaining_time": "53s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244353} | |
| {"loss": 0.45180649, "grad_norm": 6.93100538, "learning_rate": 1e-08, "token_acc": 0.85307348, "epoch": 0.94147583, "global_step/max_steps": "185/197", "percentage": "93.91%", "elapsed_time": "12m 36s", "remaining_time": "49s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244479} | |
| {"loss": 0.52098626, "grad_norm": 6.76788342, "learning_rate": 1e-08, "token_acc": 0.8448416, "epoch": 0.94656489, "global_step/max_steps": "186/197", "percentage": "94.42%", "elapsed_time": "12m 40s", "remaining_time": "44s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24462} | |
| {"loss": 0.3923229, "grad_norm": 6.4985915, "learning_rate": 1e-08, "token_acc": 0.86656672, "epoch": 0.95165394, "global_step/max_steps": "187/197", "percentage": "94.92%", "elapsed_time": "12m 44s", "remaining_time": "40s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244733} | |
| {"loss": 0.47490713, "grad_norm": 6.72339756, "learning_rate": 1e-08, "token_acc": 0.84798098, "epoch": 0.956743, "global_step/max_steps": "188/197", "percentage": "95.43%", "elapsed_time": "12m 48s", "remaining_time": "36s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244724} | |
| {"loss": 0.63166517, "grad_norm": 6.47914329, "learning_rate": 0.0, "token_acc": 0.80115092, "epoch": 0.96183206, "global_step/max_steps": "189/197", "percentage": "95.94%", "elapsed_time": "12m 52s", "remaining_time": "32s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244704} | |
| {"loss": 0.549137, "grad_norm": 7.40403591, "learning_rate": 0.0, "token_acc": 0.82612056, "epoch": 0.96692112, "global_step/max_steps": "190/197", "percentage": "96.45%", "elapsed_time": "12m 56s", "remaining_time": "28s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244675} | |
| {"loss": 0.65245885, "grad_norm": 7.11699182, "learning_rate": 0.0, "token_acc": 0.79383427, "epoch": 0.97201018, "global_step/max_steps": "191/197", "percentage": "96.95%", "elapsed_time": "13m 1s", "remaining_time": "24s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24431} | |
| {"loss": 0.45887458, "grad_norm": 6.79487138, "learning_rate": 0.0, "token_acc": 0.85173249, "epoch": 0.97709924, "global_step/max_steps": "192/197", "percentage": "97.46%", "elapsed_time": "13m 5s", "remaining_time": "20s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244432} | |
| {"loss": 0.48686248, "grad_norm": 6.45045175, "learning_rate": 0.0, "token_acc": 0.84332132, "epoch": 0.9821883, "global_step/max_steps": "193/197", "percentage": "97.97%", "elapsed_time": "13m 9s", "remaining_time": "16s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244571} | |
| {"loss": 0.47770101, "grad_norm": 6.8483167, "learning_rate": 0.0, "token_acc": 0.84163988, "epoch": 0.98727735, "global_step/max_steps": "194/197", "percentage": "98.48%", "elapsed_time": "13m 13s", "remaining_time": "12s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.24456} | |
| {"loss": 0.43710124, "grad_norm": 6.31795327, "learning_rate": 0.0, "token_acc": 0.85625434, "epoch": 0.99236641, "global_step/max_steps": "195/197", "percentage": "98.98%", "elapsed_time": "13m 17s", "remaining_time": "8s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244574} | |
| {"loss": 0.46438512, "grad_norm": 6.41996748, "learning_rate": 0.0, "token_acc": 0.85284811, "epoch": 0.99745547, "global_step/max_steps": "196/197", "percentage": "99.49%", "elapsed_time": "13m 21s", "remaining_time": "4s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244417} | |
| {"loss": 0.54358232, "grad_norm": 9.59049701, "learning_rate": 0.0, "token_acc": 0.82326281, "epoch": 1.0, "global_step/max_steps": "197/197", "percentage": "100.00%", "elapsed_time": "13m 24s", "remaining_time": "0s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.244982} | |
| {"eval_loss": 0.32756793, "eval_runtime": 3.5597, "eval_samples_per_second": 0.843, "eval_steps_per_second": 0.281, "eval_token_acc": 0.91451615, "epoch": 1.0, "global_step/max_steps": "197/197", "percentage": "100.00%", "elapsed_time": "13m 27s", "remaining_time": "0s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.243873} | |
| {"eval_loss": 0.32756793, "eval_runtime": 2.8528, "eval_samples_per_second": 1.052, "eval_steps_per_second": 0.351, "eval_token_acc": 0.91451615, "epoch": 1.0, "global_step/max_steps": "197/197", "percentage": "100.00%", "elapsed_time": "17m 7s", "remaining_time": "0s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.191673} | |
| {"train_runtime": 1261.0874, "train_samples_per_second": 2.496, "train_steps_per_second": 0.156, "total_flos": 18220985847808.0, "train_loss": 0.5864294, "epoch": 1.0, "global_step/max_steps": "197/197", "percentage": "100.00%", "elapsed_time": "21m 0s", "remaining_time": "0s", "memory(GiB)": 26.11, "train_speed(iter/s)": 0.156228} | |
| {"train_dataset": "2904.456163±705.736381, min=867.000000, max=6847.000000, size=3148", "val_dataset": "2996.333333±505.896783, min=2281.000000, max=3365.000000, size=3", "model_parameter_info": "Qwen2ForCausalLM: 7615.6165M Params (7615.6165M Trainable [100.0000%]), 0.0001M Buffers.", "last_model_checkpoint": "/group/40143/hongzhuyi/ms-swift/output/v0-20250818-222949/checkpoint-197", "best_model_checkpoint": "/group/40143/hongzhuyi/ms-swift/output/v0-20250818-222949/checkpoint-197", "best_metric": 0.32756793, "global_step": 197, "log_history": [{"loss": 1.5618647336959839, "grad_norm": 39.59119464455215, "learning_rate": 1e-07, "token_acc": 0.6988449096679688, "epoch": 0.005089058524173028, "step": 1}, {"loss": 1.3259854316711426, "grad_norm": 38.840980931352185, "learning_rate": 2e-07, "token_acc": 0.7462562322616577, "epoch": 0.010178117048346057, "step": 2}, {"loss": 1.4535326957702637, "grad_norm": 35.58269049892918, "learning_rate": 3e-07, "token_acc": 0.6936619877815247, "epoch": 0.015267175572519083, "step": 3}, {"loss": 1.3522995710372925, "grad_norm": 37.43614131716039, "learning_rate": 4e-07, "token_acc": 0.7387312054634094, "epoch": 0.020356234096692113, "step": 4}, {"loss": 1.3101356029510498, "grad_norm": 38.966019422115906, "learning_rate": 5e-07, "token_acc": 0.7383029460906982, "epoch": 0.02544529262086514, "step": 5}, {"loss": 1.312995433807373, "grad_norm": 36.69056841432856, "learning_rate": 6e-07, "token_acc": 0.7447832822799683, "epoch": 0.030534351145038167, "step": 6}, {"loss": 1.2275996208190918, "grad_norm": 36.54255094918598, "learning_rate": 7e-07, "token_acc": 0.7386091351509094, "epoch": 0.035623409669211195, "step": 7}, {"loss": 1.2191312313079834, "grad_norm": 36.46527005025605, "learning_rate": 8e-07, "token_acc": 0.7234548330307007, "epoch": 0.04071246819338423, "step": 8}, {"loss": 1.1805241107940674, "grad_norm": 31.38061782576404, "learning_rate": 9e-07, "token_acc": 0.7470238208770752, "epoch": 0.04580152671755725, "step": 9}, {"loss": 1.1280511617660522, "grad_norm": 30.297168179272013, "learning_rate": 1e-06, "token_acc": 0.7501875758171082, "epoch": 0.05089058524173028, "step": 10}, {"loss": 1.0186139345169067, "grad_norm": 24.72899766553136, "learning_rate": 9.999294419895387e-07, "token_acc": 0.7525691986083984, "epoch": 0.05597964376590331, "step": 11}, {"loss": 1.035551905632019, "grad_norm": 24.027012489059675, "learning_rate": 9.997177878718868e-07, "token_acc": 0.739130437374115, "epoch": 0.061068702290076333, "step": 12}, {"loss": 1.0829148292541504, "grad_norm": 21.85423573595879, "learning_rate": 9.993650973826175e-07, "token_acc": 0.7210264801979065, "epoch": 0.06615776081424936, "step": 13}, {"loss": 0.956468939781189, "grad_norm": 18.44215846855443, "learning_rate": 9.98871470062288e-07, "token_acc": 0.7356154322624207, "epoch": 0.07124681933842239, "step": 14}, {"loss": 0.7082113027572632, "grad_norm": 13.164830213554048, "learning_rate": 9.982370452283449e-07, "token_acc": 0.7961165308952332, "epoch": 0.07633587786259542, "step": 15}, {"loss": 0.8985003232955933, "grad_norm": 14.238898889820645, "learning_rate": 9.974620019358044e-07, "token_acc": 0.7519319653511047, "epoch": 0.08142493638676845, "step": 16}, {"loss": 0.7989923357963562, "grad_norm": 13.67529550123635, "learning_rate": 9.965465589267174e-07, "token_acc": 0.7746710777282715, "epoch": 0.08651399491094147, "step": 17}, {"loss": 0.7252960205078125, "grad_norm": 10.703655214207734, "learning_rate": 9.954909745684338e-07, "token_acc": 0.8032212853431702, "epoch": 0.0916030534351145, "step": 18}, {"loss": 0.7841960191726685, "grad_norm": 11.523889293639645, "learning_rate": 9.94295546780682e-07, "token_acc": 0.7739251255989075, "epoch": 0.09669211195928754, "step": 19}, {"loss": 0.6578090190887451, "grad_norm": 10.489275511207074, "learning_rate": 9.929606129514875e-07, "token_acc": 0.8086696267127991, "epoch": 0.10178117048346055, "step": 20}, {"loss": 0.783283531665802, "grad_norm": 11.076797697308274, "learning_rate": 9.91486549841951e-07, "token_acc": 0.7880747318267822, "epoch": 0.10687022900763359, "step": 21}, {"loss": 0.7003642916679382, "grad_norm": 10.466190317489326, "learning_rate": 9.898737734799133e-07, "token_acc": 0.7864823341369629, "epoch": 0.11195928753180662, "step": 22}, {"loss": 0.7234220504760742, "grad_norm": 10.240502510630266, "learning_rate": 9.881227390425402e-07, "token_acc": 0.7776978611946106, "epoch": 0.11704834605597965, "step": 23}, {"loss": 0.8035274147987366, "grad_norm": 10.763927624859033, "learning_rate": 9.862339407278563e-07, "token_acc": 0.7623896598815918, "epoch": 0.12213740458015267, "step": 24}, {"loss": 0.5895827412605286, "grad_norm": 8.653099525894065, "learning_rate": 9.842079116152668e-07, "token_acc": 0.8207831382751465, "epoch": 0.1272264631043257, "step": 25}, {"loss": 0.6686500310897827, "grad_norm": 9.515878581058445, "learning_rate": 9.820452235151048e-07, "token_acc": 0.7937831878662109, "epoch": 0.13231552162849872, "step": 26}, {"loss": 0.6293672323226929, "grad_norm": 9.799933748108716, "learning_rate": 9.797464868072486e-07, "token_acc": 0.8054607510566711, "epoch": 0.13740458015267176, "step": 27}, {"loss": 0.6071704030036926, "grad_norm": 9.427191512484255, "learning_rate": 9.773123502688532e-07, "token_acc": 0.8294862508773804, "epoch": 0.14249363867684478, "step": 28}, {"loss": 0.6743731498718262, "grad_norm": 9.37922151687679, "learning_rate": 9.747435008912436e-07, "token_acc": 0.7891805768013, "epoch": 0.1475826972010178, "step": 29}, {"loss": 0.7138023376464844, "grad_norm": 8.930688669632072, "learning_rate": 9.72040663686025e-07, "token_acc": 0.7901849150657654, "epoch": 0.15267175572519084, "step": 30}, {"loss": 0.7591217756271362, "grad_norm": 9.881769025162873, "learning_rate": 9.692046014804608e-07, "token_acc": 0.7719298005104065, "epoch": 0.15776081424936386, "step": 31}, {"loss": 0.6431020498275757, "grad_norm": 9.162115530110038, "learning_rate": 9.66236114702178e-07, "token_acc": 0.8061839938163757, "epoch": 0.1628498727735369, "step": 32}, {"loss": 0.4977595806121826, "grad_norm": 7.929081347058714, "learning_rate": 9.631360411532608e-07, "token_acc": 0.8374046087265015, "epoch": 0.16793893129770993, "step": 33}, {"loss": 0.5279712080955505, "grad_norm": 8.318143171263882, "learning_rate": 9.599052557737972e-07, "token_acc": 0.8293412923812866, "epoch": 0.17302798982188294, "step": 34}, {"loss": 0.7194305658340454, "grad_norm": 9.361616489488647, "learning_rate": 9.565446703949415e-07, "token_acc": 0.791208803653717, "epoch": 0.178117048346056, "step": 35}, {"loss": 0.7394201755523682, "grad_norm": 10.728332996265143, "learning_rate": 9.53055233481567e-07, "token_acc": 0.7771186232566833, "epoch": 0.183206106870229, "step": 36}, {"loss": 0.5850287079811096, "grad_norm": 8.069307090882255, "learning_rate": 9.494379298645787e-07, "token_acc": 0.812666654586792, "epoch": 0.18829516539440203, "step": 37}, {"loss": 0.4985586106777191, "grad_norm": 7.645319984809178, "learning_rate": 9.456937804629623e-07, "token_acc": 0.8454692363739014, "epoch": 0.19338422391857507, "step": 38}, {"loss": 0.5950720906257629, "grad_norm": 7.461620630685697, "learning_rate": 9.418238419956483e-07, "token_acc": 0.8255727887153625, "epoch": 0.1984732824427481, "step": 39}, {"loss": 0.5869325399398804, "grad_norm": 8.527962752078853, "learning_rate": 9.378292066832723e-07, "token_acc": 0.8185358047485352, "epoch": 0.2035623409669211, "step": 40}, {"loss": 0.5763862133026123, "grad_norm": 8.496328249097074, "learning_rate": 9.337110019399148e-07, "token_acc": 0.8254603743553162, "epoch": 0.20865139949109415, "step": 41}, {"loss": 0.46660315990448, "grad_norm": 7.7841132583684205, "learning_rate": 9.294703900549094e-07, "token_acc": 0.8593040704727173, "epoch": 0.21374045801526717, "step": 42}, {"loss": 0.5544826984405518, "grad_norm": 8.128262972440295, "learning_rate": 9.251085678648071e-07, "token_acc": 0.8252788186073303, "epoch": 0.21882951653944022, "step": 43}, {"loss": 0.45063287019729614, "grad_norm": 6.59396831761356, "learning_rate": 9.206267664155906e-07, "token_acc": 0.8608644008636475, "epoch": 0.22391857506361323, "step": 44}, {"loss": 0.5023191571235657, "grad_norm": 7.6128749952424135, "learning_rate": 9.16026250615234e-07, "token_acc": 0.8370909094810486, "epoch": 0.22900763358778625, "step": 45}, {"loss": 0.5518982410430908, "grad_norm": 8.604761961587872, "learning_rate": 9.113083188767055e-07, "token_acc": 0.8242133855819702, "epoch": 0.2340966921119593, "step": 46}, {"loss": 0.4638890027999878, "grad_norm": 6.580072457034871, "learning_rate": 9.064743027515127e-07, "token_acc": 0.8577494621276855, "epoch": 0.23918575063613232, "step": 47}, {"loss": 0.5250757932662964, "grad_norm": 7.5450521470602885, "learning_rate": 9.015255665538971e-07, "token_acc": 0.8420289754867554, "epoch": 0.24427480916030533, "step": 48}, {"loss": 0.40262165665626526, "grad_norm": 8.044908642807194, "learning_rate": 8.964635069757801e-07, "token_acc": 0.868852436542511, "epoch": 0.24936386768447838, "step": 49}, {"loss": 0.5697673559188843, "grad_norm": 7.538007637149986, "learning_rate": 8.912895526925724e-07, "token_acc": 0.8191800713539124, "epoch": 0.2544529262086514, "step": 50}, {"loss": 0.5340009927749634, "grad_norm": 7.943395533009222, "learning_rate": 8.860051639599559e-07, "token_acc": 0.8247800469398499, "epoch": 0.2595419847328244, "step": 51}, {"loss": 0.43659716844558716, "grad_norm": 7.312716386718961, "learning_rate": 8.806118322017524e-07, "token_acc": 0.8515684604644775, "epoch": 0.26463104325699743, "step": 52}, {"loss": 0.5912561416625977, "grad_norm": 7.350619751080193, "learning_rate": 8.751110795889965e-07, "token_acc": 0.8230579495429993, "epoch": 0.2697201017811705, "step": 53}, {"loss": 0.6188493967056274, "grad_norm": 8.211822816297223, "learning_rate": 8.695044586103295e-07, "token_acc": 0.8092948794364929, "epoch": 0.2748091603053435, "step": 54}, {"loss": 0.5438786149024963, "grad_norm": 8.041679003906003, "learning_rate": 8.637935516338384e-07, "token_acc": 0.8242811560630798, "epoch": 0.27989821882951654, "step": 55}, {"loss": 0.6625794768333435, "grad_norm": 8.058500943867548, "learning_rate": 8.579799704604596e-07, "token_acc": 0.7946974635124207, "epoch": 0.28498727735368956, "step": 56}, {"loss": 0.4714081287384033, "grad_norm": 7.789157487379679, "learning_rate": 8.520653558690784e-07, "token_acc": 0.8505337834358215, "epoch": 0.2900763358778626, "step": 57}, {"loss": 0.6934107542037964, "grad_norm": 9.052787635360971, "learning_rate": 8.460513771534475e-07, "token_acc": 0.7884864211082458, "epoch": 0.2951653944020356, "step": 58}, {"loss": 0.4494791626930237, "grad_norm": 7.369093603412542, "learning_rate": 8.399397316510595e-07, "token_acc": 0.8565950989723206, "epoch": 0.30025445292620867, "step": 59}, {"loss": 0.5583677291870117, "grad_norm": 7.3762121220819115, "learning_rate": 8.337321442641035e-07, "token_acc": 0.8291316628456116, "epoch": 0.3053435114503817, "step": 60}, {"loss": 0.512423038482666, "grad_norm": 7.392157621374455, "learning_rate": 8.274303669726426e-07, "token_acc": 0.8322580456733704, "epoch": 0.3104325699745547, "step": 61}, {"loss": 0.4724104404449463, "grad_norm": 7.102702056910771, "learning_rate": 8.210361783401491e-07, "token_acc": 0.8399122953414917, "epoch": 0.3155216284987277, "step": 62}, {"loss": 0.5095264911651611, "grad_norm": 7.795682688711404, "learning_rate": 8.145513830115366e-07, "token_acc": 0.8355817794799805, "epoch": 0.32061068702290074, "step": 63}, {"loss": 0.5506556034088135, "grad_norm": 6.936421435554609, "learning_rate": 8.079778112038318e-07, "token_acc": 0.825352132320404, "epoch": 0.3256997455470738, "step": 64}, {"loss": 0.48943406343460083, "grad_norm": 7.440078763607836, "learning_rate": 8.013173181896282e-07, "token_acc": 0.8378170728683472, "epoch": 0.33078880407124683, "step": 65}, {"loss": 0.44389235973358154, "grad_norm": 6.862832326044178, "learning_rate": 7.945717837734687e-07, "token_acc": 0.8590116500854492, "epoch": 0.33587786259541985, "step": 66}, {"loss": 0.5367267727851868, "grad_norm": 7.506003385387902, "learning_rate": 7.877431117613051e-07, "token_acc": 0.8286367058753967, "epoch": 0.34096692111959287, "step": 67}, {"loss": 0.4983973801136017, "grad_norm": 8.099796074984626, "learning_rate": 7.808332294231823e-07, "token_acc": 0.8325471878051758, "epoch": 0.3460559796437659, "step": 68}, {"loss": 0.6867854595184326, "grad_norm": 7.812956905553791, "learning_rate": 7.738440869493017e-07, "token_acc": 0.7833545207977295, "epoch": 0.3511450381679389, "step": 69}, {"loss": 0.4567531943321228, "grad_norm": 6.718817631268139, "learning_rate": 7.667776568996142e-07, "token_acc": 0.854107677936554, "epoch": 0.356234096692112, "step": 70}, {"loss": 0.4131337106227875, "grad_norm": 6.501796191199332, "learning_rate": 7.596359336471014e-07, "token_acc": 0.8654970526695251, "epoch": 0.361323155216285, "step": 71}, {"loss": 0.6130458116531372, "grad_norm": 8.788072928188402, "learning_rate": 7.524209328148994e-07, "token_acc": 0.8134275674819946, "epoch": 0.366412213740458, "step": 72}, {"loss": 0.5723602771759033, "grad_norm": 8.054819642399062, "learning_rate": 7.451346907074244e-07, "token_acc": 0.7985948324203491, "epoch": 0.37150127226463103, "step": 73}, {"loss": 0.47571295499801636, "grad_norm": 7.242795533985142, "learning_rate": 7.377792637356643e-07, "token_acc": 0.8469551205635071, "epoch": 0.37659033078880405, "step": 74}, {"loss": 0.6627309322357178, "grad_norm": 8.308795084553635, "learning_rate": 7.303567278367917e-07, "token_acc": 0.793225109577179, "epoch": 0.3816793893129771, "step": 75}, {"loss": 0.5937893390655518, "grad_norm": 7.59038850429703, "learning_rate": 7.228691778882692e-07, "token_acc": 0.8088235259056091, "epoch": 0.38676844783715014, "step": 76}, {"loss": 0.601184606552124, "grad_norm": 7.683851106547246, "learning_rate": 7.15318727116607e-07, "token_acc": 0.818495512008667, "epoch": 0.39185750636132316, "step": 77}, {"loss": 0.4612157344818115, "grad_norm": 8.130033906261136, "learning_rate": 7.077075065009433e-07, "token_acc": 0.8497853875160217, "epoch": 0.3969465648854962, "step": 78}, {"loss": 0.48830536007881165, "grad_norm": 7.095590871692628, "learning_rate": 7.000376641716132e-07, "token_acc": 0.8457189202308655, "epoch": 0.4020356234096692, "step": 79}, {"loss": 0.5752387046813965, "grad_norm": 7.058872191401623, "learning_rate": 6.923113648038783e-07, "token_acc": 0.8219931125640869, "epoch": 0.4071246819338422, "step": 80}, {"loss": 0.5346622467041016, "grad_norm": 8.690210681310855, "learning_rate": 6.84530789006985e-07, "token_acc": 0.8311011791229248, "epoch": 0.4122137404580153, "step": 81}, {"loss": 0.5009891986846924, "grad_norm": 7.071280252066903, "learning_rate": 6.76698132708727e-07, "token_acc": 0.846045196056366, "epoch": 0.4173027989821883, "step": 82}, {"loss": 0.5695414543151855, "grad_norm": 7.666919888145399, "learning_rate": 6.688156065356844e-07, "token_acc": 0.8237951993942261, "epoch": 0.4223918575063613, "step": 83}, {"loss": 0.6212877631187439, "grad_norm": 7.746926433795022, "learning_rate": 6.60885435189314e-07, "token_acc": 0.8014854788780212, "epoch": 0.42748091603053434, "step": 84}, {"loss": 0.5276066064834595, "grad_norm": 7.387178025589133, "learning_rate": 6.529098568180671e-07, "token_acc": 0.8339952230453491, "epoch": 0.43256997455470736, "step": 85}, {"loss": 0.4768299460411072, "grad_norm": 7.506304422823719, "learning_rate": 6.448911223857123e-07, "token_acc": 0.8494023680686951, "epoch": 0.43765903307888043, "step": 86}, {"loss": 0.574189305305481, "grad_norm": 7.011645447853534, "learning_rate": 6.368314950360415e-07, "token_acc": 0.8242647051811218, "epoch": 0.44274809160305345, "step": 87}, {"loss": 0.5236226320266724, "grad_norm": 7.218887744556149, "learning_rate": 6.287332494541379e-07, "token_acc": 0.8351648449897766, "epoch": 0.44783715012722647, "step": 88}, {"loss": 0.6144217252731323, "grad_norm": 7.30966496133741, "learning_rate": 6.205986712243875e-07, "token_acc": 0.8031051754951477, "epoch": 0.4529262086513995, "step": 89}, {"loss": 0.520158052444458, "grad_norm": 7.974311536016522, "learning_rate": 6.124300561854138e-07, "token_acc": 0.8234811425209045, "epoch": 0.4580152671755725, "step": 90}, {"loss": 0.4926101863384247, "grad_norm": 6.833501864688494, "learning_rate": 6.042297097821183e-07, "token_acc": 0.839246928691864, "epoch": 0.4631043256997455, "step": 91}, {"loss": 0.5708845257759094, "grad_norm": 6.947903974695247, "learning_rate": 5.9599994641501e-07, "token_acc": 0.8121345043182373, "epoch": 0.4681933842239186, "step": 92}, {"loss": 0.628115713596344, "grad_norm": 7.548986770798889, "learning_rate": 5.877430887870081e-07, "token_acc": 0.788294792175293, "epoch": 0.4732824427480916, "step": 93}, {"loss": 0.5166823863983154, "grad_norm": 6.570218872839558, "learning_rate": 5.794614672478999e-07, "token_acc": 0.8360433578491211, "epoch": 0.47837150127226463, "step": 94}, {"loss": 0.49430781602859497, "grad_norm": 6.879102389364668, "learning_rate": 5.711574191366427e-07, "token_acc": 0.8404411673545837, "epoch": 0.48346055979643765, "step": 95}, {"loss": 0.539700448513031, "grad_norm": 6.900039087620259, "learning_rate": 5.628332881216898e-07, "token_acc": 0.8335866332054138, "epoch": 0.48854961832061067, "step": 96}, {"loss": 0.5462325811386108, "grad_norm": 7.311775050492225, "learning_rate": 5.544914235395346e-07, "token_acc": 0.821093738079071, "epoch": 0.49363867684478374, "step": 97}, {"loss": 0.42079484462738037, "grad_norm": 6.2340087632083145, "learning_rate": 5.46134179731651e-07, "token_acc": 0.8601190447807312, "epoch": 0.49872773536895676, "step": 98}, {"loss": 0.4871785342693329, "grad_norm": 6.906354100365807, "learning_rate": 5.377639153800228e-07, "token_acc": 0.8426229357719421, "epoch": 0.5038167938931297, "step": 99}, {"loss": 0.44850805401802063, "grad_norm": 7.419694975770268, "learning_rate": 5.29382992841449e-07, "token_acc": 0.8602064847946167, "epoch": 0.5089058524173028, "step": 100}, {"loss": 0.7532479763031006, "grad_norm": 7.58024465843975, "learning_rate": 5.209937774808097e-07, "token_acc": 0.7617493271827698, "epoch": 0.5139949109414759, "step": 101}, {"loss": 0.5460261106491089, "grad_norm": 6.785869972847848, "learning_rate": 5.125986370034862e-07, "token_acc": 0.8271237015724182, "epoch": 0.5190839694656488, "step": 102}, {"loss": 0.4613378643989563, "grad_norm": 6.377004561413436, "learning_rate": 5.041999407871167e-07, "token_acc": 0.8512530326843262, "epoch": 0.5241730279898219, "step": 103}, {"loss": 0.4473533630371094, "grad_norm": 6.80201496287452, "learning_rate": 4.958000592128833e-07, "token_acc": 0.8579586148262024, "epoch": 0.5292620865139949, "step": 104}, {"loss": 0.6759693622589111, "grad_norm": 7.418108541295539, "learning_rate": 4.874013629965138e-07, "token_acc": 0.784140944480896, "epoch": 0.5343511450381679, "step": 105}, {"loss": 0.5856516361236572, "grad_norm": 7.489685718682303, "learning_rate": 4.790062225191901e-07, "token_acc": 0.8117216229438782, "epoch": 0.539440203562341, "step": 106}, {"loss": 0.42829567193984985, "grad_norm": 6.328869183566899, "learning_rate": 4.706170071585512e-07, "token_acc": 0.8712550401687622, "epoch": 0.544529262086514, "step": 107}, {"loss": 0.45178571343421936, "grad_norm": 6.886501255634082, "learning_rate": 4.622360846199772e-07, "token_acc": 0.8500370979309082, "epoch": 0.549618320610687, "step": 108}, {"loss": 0.5495023131370544, "grad_norm": 6.8077162970553005, "learning_rate": 4.5386582026834904e-07, "token_acc": 0.8218978047370911, "epoch": 0.55470737913486, "step": 109}, {"loss": 0.5456889867782593, "grad_norm": 7.379728449003184, "learning_rate": 4.4550857646046526e-07, "token_acc": 0.8291457295417786, "epoch": 0.5597964376590331, "step": 110}, {"loss": 0.5225412845611572, "grad_norm": 7.615924172233695, "learning_rate": 4.3716671187831003e-07, "token_acc": 0.8404580354690552, "epoch": 0.5648854961832062, "step": 111}, {"loss": 0.4707789719104767, "grad_norm": 7.422346795235618, "learning_rate": 4.2884258086335745e-07, "token_acc": 0.8413736820220947, "epoch": 0.5699745547073791, "step": 112}, {"loss": 0.5297611951828003, "grad_norm": 7.123945145135712, "learning_rate": 4.205385327521001e-07, "token_acc": 0.8392330408096313, "epoch": 0.5750636132315522, "step": 113}, {"loss": 0.5752742290496826, "grad_norm": 7.4958484597821995, "learning_rate": 4.1225691121299197e-07, "token_acc": 0.8212867379188538, "epoch": 0.5801526717557252, "step": 114}, {"loss": 0.5388021469116211, "grad_norm": 7.327669668772754, "learning_rate": 4.0400005358498996e-07, "token_acc": 0.8285269141197205, "epoch": 0.5852417302798982, "step": 115}, {"loss": 0.6789300441741943, "grad_norm": 9.618613337873144, "learning_rate": 3.957702902178816e-07, "token_acc": 0.7847328186035156, "epoch": 0.5903307888040712, "step": 116}, {"loss": 0.5471738576889038, "grad_norm": 7.177952623547619, "learning_rate": 3.875699438145862e-07, "token_acc": 0.8336092829704285, "epoch": 0.5954198473282443, "step": 117}, {"loss": 0.5098978281021118, "grad_norm": 8.052945572812362, "learning_rate": 3.794013287756125e-07, "token_acc": 0.8288288116455078, "epoch": 0.6005089058524173, "step": 118}, {"loss": 0.3425061106681824, "grad_norm": 6.046421400580426, "learning_rate": 3.7126675054586216e-07, "token_acc": 0.8789032697677612, "epoch": 0.6055979643765903, "step": 119}, {"loss": 0.4630793333053589, "grad_norm": 6.5737863737615765, "learning_rate": 3.6316850496395855e-07, "token_acc": 0.8662704229354858, "epoch": 0.6106870229007634, "step": 120}, {"loss": 0.45755183696746826, "grad_norm": 8.397160760098581, "learning_rate": 3.551088776142876e-07, "token_acc": 0.8498349785804749, "epoch": 0.6157760814249363, "step": 121}, {"loss": 0.4360772967338562, "grad_norm": 6.4054163015674135, "learning_rate": 3.470901431819329e-07, "token_acc": 0.8583815097808838, "epoch": 0.6208651399491094, "step": 122}, {"loss": 0.5934512615203857, "grad_norm": 6.54393616683031, "learning_rate": 3.391145648106861e-07, "token_acc": 0.8144611120223999, "epoch": 0.6259541984732825, "step": 123}, {"loss": 0.5138750076293945, "grad_norm": 6.579208878415561, "learning_rate": 3.3118439346431565e-07, "token_acc": 0.8332119584083557, "epoch": 0.6310432569974554, "step": 124}, {"loss": 0.5163788795471191, "grad_norm": 6.531324677437054, "learning_rate": 3.2330186729127307e-07, "token_acc": 0.8390636444091797, "epoch": 0.6361323155216285, "step": 125}, {"loss": 0.5495798587799072, "grad_norm": 7.24353518688214, "learning_rate": 3.1546921099301505e-07, "token_acc": 0.8304953575134277, "epoch": 0.6412213740458015, "step": 126}, {"loss": 0.6340938806533813, "grad_norm": 8.073546794978538, "learning_rate": 3.0768863519612163e-07, "token_acc": 0.7963525652885437, "epoch": 0.6463104325699746, "step": 127}, {"loss": 0.5798467993736267, "grad_norm": 6.906455485991713, "learning_rate": 2.9996233582838683e-07, "token_acc": 0.8176470398902893, "epoch": 0.6513994910941476, "step": 128}, {"loss": 0.5856970548629761, "grad_norm": 7.1616302611374945, "learning_rate": 2.922924934990568e-07, "token_acc": 0.812842607498169, "epoch": 0.6564885496183206, "step": 129}, {"loss": 0.5146996974945068, "grad_norm": 7.248707569801376, "learning_rate": 2.8468127288339304e-07, "token_acc": 0.8361921310424805, "epoch": 0.6615776081424937, "step": 130}, {"loss": 0.5206888914108276, "grad_norm": 7.055467124955925, "learning_rate": 2.771308221117309e-07, "token_acc": 0.8340874910354614, "epoch": 0.6666666666666666, "step": 131}, {"loss": 0.6137682795524597, "grad_norm": 6.853333937334987, "learning_rate": 2.6964327216320814e-07, "token_acc": 0.8140981197357178, "epoch": 0.6717557251908397, "step": 132}, {"loss": 0.4743039309978485, "grad_norm": 6.084163869064432, "learning_rate": 2.6222073626433585e-07, "token_acc": 0.8473338484764099, "epoch": 0.6768447837150128, "step": 133}, {"loss": 0.4284076690673828, "grad_norm": 6.204362600314127, "learning_rate": 2.548653092925757e-07, "token_acc": 0.8555877804756165, "epoch": 0.6819338422391857, "step": 134}, {"loss": 0.554141640663147, "grad_norm": 6.475011309803279, "learning_rate": 2.475790671851007e-07, "token_acc": 0.8283678889274597, "epoch": 0.6870229007633588, "step": 135}, {"loss": 0.3982859253883362, "grad_norm": 6.298960887105995, "learning_rate": 2.403640663528986e-07, "token_acc": 0.8643379211425781, "epoch": 0.6921119592875318, "step": 136}, {"loss": 0.5251267552375793, "grad_norm": 6.718734055041829, "learning_rate": 2.3322234310038587e-07, "token_acc": 0.8207109570503235, "epoch": 0.6972010178117048, "step": 137}, {"loss": 0.48356571793556213, "grad_norm": 6.4956944903697345, "learning_rate": 2.2615591305069842e-07, "token_acc": 0.8466569781303406, "epoch": 0.7022900763358778, "step": 138}, {"loss": 0.5787326693534851, "grad_norm": 6.415681112067578, "learning_rate": 2.1916677057681782e-07, "token_acc": 0.8108457326889038, "epoch": 0.7073791348600509, "step": 139}, {"loss": 0.3941395878791809, "grad_norm": 6.233464900375372, "learning_rate": 2.1225688823869493e-07, "token_acc": 0.874715268611908, "epoch": 0.712468193384224, "step": 140}, {"loss": 0.4660745859146118, "grad_norm": 6.4081404514839475, "learning_rate": 2.0542821622653128e-07, "token_acc": 0.859331488609314, "epoch": 0.7175572519083969, "step": 141}, {"loss": 0.47638827562332153, "grad_norm": 6.287600191293197, "learning_rate": 1.9868268181037184e-07, "token_acc": 0.8483943343162537, "epoch": 0.72264631043257, "step": 142}, {"loss": 0.5913321375846863, "grad_norm": 6.81863548950789, "learning_rate": 1.920221887961682e-07, "token_acc": 0.8215827345848083, "epoch": 0.727735368956743, "step": 143}, {"loss": 0.4383474588394165, "grad_norm": 6.841490974043838, "learning_rate": 1.8544861698846349e-07, "token_acc": 0.8550488352775574, "epoch": 0.732824427480916, "step": 144}, {"loss": 0.5796048641204834, "grad_norm": 6.942817420503061, "learning_rate": 1.7896382165985092e-07, "token_acc": 0.8165392875671387, "epoch": 0.7379134860050891, "step": 145}, {"loss": 0.6701334714889526, "grad_norm": 6.7613004032540465, "learning_rate": 1.725696330273575e-07, "token_acc": 0.7835962176322937, "epoch": 0.7430025445292621, "step": 146}, {"loss": 0.40048810839653015, "grad_norm": 7.034171547644415, "learning_rate": 1.6626785573589663e-07, "token_acc": 0.8541315197944641, "epoch": 0.7480916030534351, "step": 147}, {"loss": 0.5557653307914734, "grad_norm": 7.278410426269352, "learning_rate": 1.6006026834894066e-07, "token_acc": 0.8266276717185974, "epoch": 0.7531806615776081, "step": 148}, {"loss": 0.48110684752464294, "grad_norm": 7.392715047702279, "learning_rate": 1.5394862284655263e-07, "token_acc": 0.8487972617149353, "epoch": 0.7582697201017812, "step": 149}, {"loss": 0.5109916925430298, "grad_norm": 6.820610160916925, "learning_rate": 1.479346441309216e-07, "token_acc": 0.8323572278022766, "epoch": 0.7633587786259542, "step": 150}, {"loss": 0.5288717746734619, "grad_norm": 6.75300602800803, "learning_rate": 1.420200295395404e-07, "token_acc": 0.8309143781661987, "epoch": 0.7684478371501272, "step": 151}, {"loss": 0.5005700588226318, "grad_norm": 7.294545297758498, "learning_rate": 1.3620644836616168e-07, "token_acc": 0.831886351108551, "epoch": 0.7735368956743003, "step": 152}, {"loss": 0.35479891300201416, "grad_norm": 7.8855908810954904, "learning_rate": 1.304955413896705e-07, "token_acc": 0.8832807540893555, "epoch": 0.7786259541984732, "step": 153}, {"loss": 0.504040002822876, "grad_norm": 6.724712339824582, "learning_rate": 1.2488892041100363e-07, "token_acc": 0.8365921974182129, "epoch": 0.7837150127226463, "step": 154}, {"loss": 0.5070689916610718, "grad_norm": 6.455710720242099, "learning_rate": 1.193881677982475e-07, "token_acc": 0.8249318599700928, "epoch": 0.7888040712468194, "step": 155}, {"loss": 0.520053505897522, "grad_norm": 7.068800156140281, "learning_rate": 1.1399483604004401e-07, "token_acc": 0.8365527391433716, "epoch": 0.7938931297709924, "step": 156}, {"loss": 0.4097803235054016, "grad_norm": 6.850081288418033, "learning_rate": 1.0871044730742752e-07, "token_acc": 0.8620116114616394, "epoch": 0.7989821882951654, "step": 157}, {"loss": 0.5461363792419434, "grad_norm": 7.60922457079162, "learning_rate": 1.0353649302421979e-07, "token_acc": 0.8271507620811462, "epoch": 0.8040712468193384, "step": 158}, {"loss": 0.47203174233436584, "grad_norm": 6.31262409570355, "learning_rate": 9.847443344610296e-08, "token_acc": 0.8570348024368286, "epoch": 0.8091603053435115, "step": 159}, {"loss": 0.5000637173652649, "grad_norm": 6.964867427745047, "learning_rate": 9.352569724848713e-08, "token_acc": 0.83797287940979, "epoch": 0.8142493638676844, "step": 160}, {"loss": 0.40088486671447754, "grad_norm": 6.451751143662267, "learning_rate": 8.86916811232944e-08, "token_acc": 0.8693181872367859, "epoch": 0.8193384223918575, "step": 161}, {"loss": 0.5243923664093018, "grad_norm": 7.150130800522952, "learning_rate": 8.397374938476592e-08, "token_acc": 0.8397790193557739, "epoch": 0.8244274809160306, "step": 162}, {"loss": 0.6940176486968994, "grad_norm": 6.748867820996097, "learning_rate": 7.937323358440934e-08, "token_acc": 0.7921774387359619, "epoch": 0.8295165394402035, "step": 163}, {"loss": 0.5258837938308716, "grad_norm": 7.150715431546907, "learning_rate": 7.4891432135193e-08, "token_acc": 0.8356589078903198, "epoch": 0.8346055979643766, "step": 164}, {"loss": 0.40195560455322266, "grad_norm": 6.0961502935870255, "learning_rate": 7.052960994509054e-08, "token_acc": 0.8778316974639893, "epoch": 0.8396946564885496, "step": 165}, {"loss": 0.5156168937683105, "grad_norm": 7.242558509557879, "learning_rate": 6.628899806008514e-08, "token_acc": 0.8408915996551514, "epoch": 0.8447837150127226, "step": 166}, {"loss": 0.4575965404510498, "grad_norm": 6.8957124413510025, "learning_rate": 6.217079331672776e-08, "token_acc": 0.8520179390907288, "epoch": 0.8498727735368957, "step": 167}, {"loss": 0.5083409547805786, "grad_norm": 6.892825756883802, "learning_rate": 5.817615800435166e-08, "token_acc": 0.8375819325447083, "epoch": 0.8549618320610687, "step": 168}, {"loss": 0.4457250237464905, "grad_norm": 6.766263512755089, "learning_rate": 5.4306219537037845e-08, "token_acc": 0.8576807379722595, "epoch": 0.8600508905852418, "step": 169}, {"loss": 0.4539906978607178, "grad_norm": 6.00128018588962, "learning_rate": 5.05620701354213e-08, "token_acc": 0.8604651093482971, "epoch": 0.8651399491094147, "step": 170}, {"loss": 0.46429243683815, "grad_norm": 7.1421142340529, "learning_rate": 4.6944766518432934e-08, "token_acc": 0.8432304263114929, "epoch": 0.8702290076335878, "step": 171}, {"loss": 0.3856152296066284, "grad_norm": 5.6028058281665425, "learning_rate": 4.3455329605058435e-08, "token_acc": 0.8839471936225891, "epoch": 0.8753180661577609, "step": 172}, {"loss": 0.46086421608924866, "grad_norm": 6.33931144273652, "learning_rate": 4.0094744226202684e-08, "token_acc": 0.8421450257301331, "epoch": 0.8804071246819338, "step": 173}, {"loss": 0.4596683382987976, "grad_norm": 6.545094147756003, "learning_rate": 3.686395884673921e-08, "token_acc": 0.8596774339675903, "epoch": 0.8854961832061069, "step": 174}, {"loss": 0.42930400371551514, "grad_norm": 6.434173190803067, "learning_rate": 3.376388529782215e-08, "token_acc": 0.8674896955490112, "epoch": 0.8905852417302799, "step": 175}, {"loss": 0.47356995940208435, "grad_norm": 6.743072227511501, "learning_rate": 3.0795398519539106e-08, "token_acc": 0.8497202396392822, "epoch": 0.8956743002544529, "step": 176}, {"loss": 0.4741418957710266, "grad_norm": 6.3871394256293845, "learning_rate": 2.7959336313974845e-08, "token_acc": 0.8517192006111145, "epoch": 0.9007633587786259, "step": 177}, {"loss": 0.48497796058654785, "grad_norm": 6.611000642419291, "learning_rate": 2.525649910875627e-08, "token_acc": 0.8367016911506653, "epoch": 0.905852417302799, "step": 178}, {"loss": 0.4237028658390045, "grad_norm": 6.174860429472809, "learning_rate": 2.268764973114684e-08, "token_acc": 0.8700361251831055, "epoch": 0.910941475826972, "step": 179}, {"loss": 0.48196887969970703, "grad_norm": 6.929046947049144, "learning_rate": 2.025351319275137e-08, "token_acc": 0.8519397974014282, "epoch": 0.916030534351145, "step": 180}, {"loss": 0.48997873067855835, "grad_norm": 6.373205986334354, "learning_rate": 1.7954776484895183e-08, "token_acc": 0.8609318733215332, "epoch": 0.9211195928753181, "step": 181}, {"loss": 0.5217477679252625, "grad_norm": 6.570500193502052, "learning_rate": 1.5792088384733173e-08, "token_acc": 0.8341708779335022, "epoch": 0.926208651399491, "step": 182}, {"loss": 0.43885859847068787, "grad_norm": 7.407647806021225, "learning_rate": 1.3766059272143637e-08, "token_acc": 0.8447564244270325, "epoch": 0.9312977099236641, "step": 183}, {"loss": 0.44787734746932983, "grad_norm": 6.460272598428203, "learning_rate": 1.1877260957459833e-08, "token_acc": 0.8547400832176208, "epoch": 0.9363867684478372, "step": 184}, {"loss": 0.4518064856529236, "grad_norm": 6.9310053806080125, "learning_rate": 1.0126226520086822e-08, "token_acc": 0.8530734777450562, "epoch": 0.9414758269720102, "step": 185}, {"loss": 0.5209862589836121, "grad_norm": 6.767883421032736, "learning_rate": 8.513450158049106e-09, "token_acc": 0.8448415994644165, "epoch": 0.9465648854961832, "step": 186}, {"loss": 0.3923228979110718, "grad_norm": 6.498591495564704, "learning_rate": 7.03938704851248e-09, "token_acc": 0.8665667176246643, "epoch": 0.9516539440203562, "step": 187}, {"loss": 0.47490713000297546, "grad_norm": 6.723397561934564, "learning_rate": 5.704453219318117e-09, "token_acc": 0.8479809761047363, "epoch": 0.9567430025445293, "step": 188}, {"loss": 0.6316651701927185, "grad_norm": 6.47914328663187, "learning_rate": 4.5090254315662824e-09, "token_acc": 0.801150918006897, "epoch": 0.9618320610687023, "step": 189}, {"loss": 0.5491369962692261, "grad_norm": 7.404035910696133, "learning_rate": 3.453441073282548e-09, "token_acc": 0.8261205554008484, "epoch": 0.9669211195928753, "step": 190}, {"loss": 0.6524588465690613, "grad_norm": 7.1169918222728885, "learning_rate": 2.537998064195579e-09, "token_acc": 0.7938342690467834, "epoch": 0.9720101781170484, "step": 191}, {"loss": 0.45887458324432373, "grad_norm": 6.794871377840053, "learning_rate": 1.7629547716550008e-09, "token_acc": 0.8517324924468994, "epoch": 0.9770992366412213, "step": 192}, {"loss": 0.4868624806404114, "grad_norm": 6.450451747589884, "learning_rate": 1.1285299377118972e-09, "token_acc": 0.8433213233947754, "epoch": 0.9821882951653944, "step": 193}, {"loss": 0.47770100831985474, "grad_norm": 6.848316700094573, "learning_rate": 6.349026173824712e-10, "token_acc": 0.8416398763656616, "epoch": 0.9872773536895675, "step": 194}, {"loss": 0.43710124492645264, "grad_norm": 6.317953268328354, "learning_rate": 2.8221212811324613e-10, "token_acc": 0.8562543392181396, "epoch": 0.9923664122137404, "step": 195}, {"loss": 0.46438512206077576, "grad_norm": 6.419967480198675, "learning_rate": 7.05580104611303e-11, "token_acc": 0.8528481125831604, "epoch": 0.9974554707379135, "step": 196}, {"loss": 0.5435823202133179, "grad_norm": 9.590497005590018, "learning_rate": 0.0, "token_acc": 0.8232628107070923, "epoch": 1.0, "step": 197}, {"eval_loss": 0.3275679349899292, "eval_runtime": 3.5597, "eval_samples_per_second": 0.843, "eval_steps_per_second": 0.281, "eval_token_acc": 0.9145161509513855, "epoch": 1.0, "step": 197}, {"eval_loss": 0.3275679349899292, "eval_runtime": 2.8528, "eval_samples_per_second": 1.052, "eval_steps_per_second": 0.351, "eval_token_acc": 0.9145161509513855, "epoch": 1.0, "step": 197}, {"train_runtime": 1261.0874, "train_samples_per_second": 2.496, "train_steps_per_second": 0.156, "total_flos": 18220985847808.0, "train_loss": 0.5864294044257421, "epoch": 1.0, "step": 197}], "memory": 26.109375} | |