| {"loss": 1.89079952, "token_acc": 0.62420382, "grad_norm": 63.27988052, "learning_rate": 5e-08, "memory(GiB)": 66.36, "train_speed(iter/s)": 0.017598, "epoch": 0.00261952, "global_step/max_steps": "1/382", "percentage": "0.26%", "elapsed_time": "48s", "remaining_time": "5h 11m 5s"} |
| {"loss": 1.94063377, "token_acc": 0.65533981, "grad_norm": 63.2440567, "learning_rate": 1e-07, "memory(GiB)": 75.54, "train_speed(iter/s)": 0.019794, "epoch": 0.00523903, "global_step/max_steps": "2/382", "percentage": "0.52%", "elapsed_time": "1m 33s", "remaining_time": "4h 55m 8s"} |
| {"loss": 1.95058537, "token_acc": 0.62440191, "grad_norm": 66.90901947, "learning_rate": 1.5e-07, "memory(GiB)": 80.63, "train_speed(iter/s)": 0.020661, "epoch": 0.00785855, "global_step/max_steps": "3/382", "percentage": "0.79%", "elapsed_time": "2m 17s", "remaining_time": "4h 49m 13s"} |
| {"loss": 2.05707073, "token_acc": 0.60840708, "grad_norm": 64.376297, "learning_rate": 2e-07, "memory(GiB)": 80.63, "train_speed(iter/s)": 0.021127, "epoch": 0.01047806, "global_step/max_steps": "4/382", "percentage": "1.05%", "elapsed_time": "3m 1s", "remaining_time": "4h 45m 51s"} |
| {"loss": 2.02341604, "token_acc": 0.62039046, "grad_norm": 65.68408203, "learning_rate": 2.5e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.021392, "epoch": 0.01309758, "global_step/max_steps": "5/382", "percentage": "1.31%", "elapsed_time": "3m 45s", "remaining_time": "4h 43m 52s"} |
| {"loss": 1.91747224, "token_acc": 0.58851675, "grad_norm": 64.10668182, "learning_rate": 3e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.021578, "epoch": 0.01571709, "global_step/max_steps": "6/382", "percentage": "1.57%", "elapsed_time": "4m 30s", "remaining_time": "4h 42m 13s"} |
| {"loss": 1.8793149, "token_acc": 0.64636542, "grad_norm": 59.3901062, "learning_rate": 3.5e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.02172, "epoch": 0.01833661, "global_step/max_steps": "7/382", "percentage": "1.83%", "elapsed_time": "5m 14s", "remaining_time": "4h 40m 45s"} |
| {"loss": 1.93486214, "token_acc": 0.61036036, "grad_norm": 65.04813385, "learning_rate": 4e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.021821, "epoch": 0.02095612, "global_step/max_steps": "8/382", "percentage": "2.09%", "elapsed_time": "5m 58s", "remaining_time": "4h 39m 33s"} |
| {"loss": 1.91066146, "token_acc": 0.63031915, "grad_norm": 60.7717514, "learning_rate": 4.5e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.021909, "epoch": 0.02357564, "global_step/max_steps": "9/382", "percentage": "2.36%", "elapsed_time": "6m 42s", "remaining_time": "4h 38m 20s"} |
| {"loss": 1.8817265, "token_acc": 0.66818182, "grad_norm": 61.84583282, "learning_rate": 5e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.021989, "epoch": 0.02619515, "global_step/max_steps": "10/382", "percentage": "2.62%", "elapsed_time": "7m 26s", "remaining_time": "4h 37m 5s"} |
| {"loss": 1.8080368, "token_acc": 0.63840399, "grad_norm": 53.12112045, "learning_rate": 5.5e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022041, "epoch": 0.02881467, "global_step/max_steps": "11/382", "percentage": "2.88%", "elapsed_time": "8m 11s", "remaining_time": "4h 36m 7s"} |
| {"loss": 1.7626909, "token_acc": 0.62076749, "grad_norm": 55.34021759, "learning_rate": 6e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022085, "epoch": 0.03143418, "global_step/max_steps": "12/382", "percentage": "3.14%", "elapsed_time": "8m 55s", "remaining_time": "4h 35m 12s"} |
| {"loss": 1.75792098, "token_acc": 0.6612529, "grad_norm": 51.83931351, "learning_rate": 6.5e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022128, "epoch": 0.0340537, "global_step/max_steps": "13/382", "percentage": "3.40%", "elapsed_time": "9m 39s", "remaining_time": "4h 34m 13s"} |
| {"loss": 1.53016615, "token_acc": 0.67674419, "grad_norm": 36.43444443, "learning_rate": 7e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022161, "epoch": 0.03667322, "global_step/max_steps": "14/382", "percentage": "3.66%", "elapsed_time": "10m 23s", "remaining_time": "4h 33m 19s"} |
| {"loss": 1.41359329, "token_acc": 0.69868996, "grad_norm": 34.469944, "learning_rate": 7.5e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022192, "epoch": 0.03929273, "global_step/max_steps": "15/382", "percentage": "3.93%", "elapsed_time": "11m 8s", "remaining_time": "4h 32m 25s"} |
| {"loss": 1.3387754, "token_acc": 0.67628866, "grad_norm": 31.91835022, "learning_rate": 8e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022214, "epoch": 0.04191225, "global_step/max_steps": "16/382", "percentage": "4.19%", "elapsed_time": "11m 52s", "remaining_time": "4h 31m 36s"} |
| {"loss": 1.38612688, "token_acc": 0.68553459, "grad_norm": 32.01660156, "learning_rate": 8.5e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022242, "epoch": 0.04453176, "global_step/max_steps": "17/382", "percentage": "4.45%", "elapsed_time": "12m 36s", "remaining_time": "4h 30m 42s"} |
| {"loss": 1.25918829, "token_acc": 0.70097087, "grad_norm": 30.60653305, "learning_rate": 9e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022266, "epoch": 0.04715128, "global_step/max_steps": "18/382", "percentage": "4.71%", "elapsed_time": "13m 20s", "remaining_time": "4h 29m 49s"} |
| {"loss": 1.06507492, "token_acc": 0.70020121, "grad_norm": 19.74515724, "learning_rate": 9.5e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022284, "epoch": 0.04977079, "global_step/max_steps": "19/382", "percentage": "4.97%", "elapsed_time": "14m 4s", "remaining_time": "4h 28m 59s"} |
| {"loss": 0.94790208, "token_acc": 0.70967742, "grad_norm": 17.4563446, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022301, "epoch": 0.05239031, "global_step/max_steps": "20/382", "percentage": "5.24%", "elapsed_time": "14m 48s", "remaining_time": "4h 28m 10s"} |
| {"loss": 0.97835648, "token_acc": 0.71701721, "grad_norm": 15.00493336, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022318, "epoch": 0.05500982, "global_step/max_steps": "21/382", "percentage": "5.50%", "elapsed_time": "15m 33s", "remaining_time": "4h 27m 20s"} |
| {"loss": 0.97605169, "token_acc": 0.73, "grad_norm": 14.92234993, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022332, "epoch": 0.05762934, "global_step/max_steps": "22/382", "percentage": "5.76%", "elapsed_time": "16m 17s", "remaining_time": "4h 26m 32s"} |
| {"loss": 0.89981449, "token_acc": 0.75550122, "grad_norm": 13.87676048, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022342, "epoch": 0.06024885, "global_step/max_steps": "23/382", "percentage": "6.02%", "elapsed_time": "17m 1s", "remaining_time": "4h 25m 45s"} |
| {"loss": 0.85592413, "token_acc": 0.74487472, "grad_norm": 11.73654366, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022352, "epoch": 0.06286837, "global_step/max_steps": "24/382", "percentage": "6.28%", "elapsed_time": "17m 45s", "remaining_time": "4h 24m 59s"} |
| {"loss": 0.87852091, "token_acc": 0.77598152, "grad_norm": 10.46651173, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022364, "epoch": 0.06548788, "global_step/max_steps": "25/382", "percentage": "6.54%", "elapsed_time": "18m 30s", "remaining_time": "4h 24m 11s"} |
| {"loss": 0.84338558, "token_acc": 0.76842105, "grad_norm": 9.2196064, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022375, "epoch": 0.0681074, "global_step/max_steps": "26/382", "percentage": "6.81%", "elapsed_time": "19m 14s", "remaining_time": "4h 23m 23s"} |
| {"loss": 0.83075631, "token_acc": 0.76041667, "grad_norm": 5.72198439, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022383, "epoch": 0.07072692, "global_step/max_steps": "27/382", "percentage": "7.07%", "elapsed_time": "19m 58s", "remaining_time": "4h 22m 37s"} |
| {"loss": 0.73186272, "token_acc": 0.81092437, "grad_norm": 5.88393927, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022393, "epoch": 0.07334643, "global_step/max_steps": "28/382", "percentage": "7.33%", "elapsed_time": "20m 42s", "remaining_time": "4h 21m 49s"} |
| {"loss": 0.74590898, "token_acc": 0.81798715, "grad_norm": 4.77961111, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022404, "epoch": 0.07596595, "global_step/max_steps": "29/382", "percentage": "7.59%", "elapsed_time": "21m 26s", "remaining_time": "4h 21m 1s"} |
| {"loss": 0.71116292, "token_acc": 0.77882353, "grad_norm": 6.08825731, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02241, "epoch": 0.07858546, "global_step/max_steps": "30/382", "percentage": "7.85%", "elapsed_time": "22m 10s", "remaining_time": "4h 20m 15s"} |
| {"loss": 0.78373051, "token_acc": 0.76327434, "grad_norm": 5.21471548, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022416, "epoch": 0.08120498, "global_step/max_steps": "31/382", "percentage": "8.12%", "elapsed_time": "22m 55s", "remaining_time": "4h 19m 29s"} |
| {"loss": 0.72540426, "token_acc": 0.76659039, "grad_norm": 4.57634354, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022422, "epoch": 0.08382449, "global_step/max_steps": "32/382", "percentage": "8.38%", "elapsed_time": "23m 39s", "remaining_time": "4h 18m 44s"} |
| {"loss": 0.62134641, "token_acc": 0.78420039, "grad_norm": 3.63199067, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022427, "epoch": 0.08644401, "global_step/max_steps": "33/382", "percentage": "8.64%", "elapsed_time": "24m 23s", "remaining_time": "4h 17m 58s"} |
| {"loss": 0.7393645, "token_acc": 0.76104418, "grad_norm": 3.26421452, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022432, "epoch": 0.08906352, "global_step/max_steps": "34/382", "percentage": "8.90%", "elapsed_time": "25m 7s", "remaining_time": "4h 17m 13s"} |
| {"loss": 0.72759438, "token_acc": 0.77111111, "grad_norm": 4.26885796, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022435, "epoch": 0.09168304, "global_step/max_steps": "35/382", "percentage": "9.16%", "elapsed_time": "25m 52s", "remaining_time": "4h 16m 29s"} |
| {"loss": 0.70990753, "token_acc": 0.77829099, "grad_norm": 3.63333511, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022441, "epoch": 0.09430255, "global_step/max_steps": "36/382", "percentage": "9.42%", "elapsed_time": "26m 36s", "remaining_time": "4h 15m 43s"} |
| {"loss": 0.65245998, "token_acc": 0.8009828, "grad_norm": 4.2254858, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022446, "epoch": 0.09692207, "global_step/max_steps": "37/382", "percentage": "9.69%", "elapsed_time": "27m 20s", "remaining_time": "4h 14m 57s"} |
| {"loss": 0.69660842, "token_acc": 0.76371308, "grad_norm": 3.55075979, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02245, "epoch": 0.09954158, "global_step/max_steps": "38/382", "percentage": "9.95%", "elapsed_time": "28m 4s", "remaining_time": "4h 14m 11s"} |
| {"loss": 0.60551822, "token_acc": 0.78165939, "grad_norm": 3.71743274, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022457, "epoch": 0.1021611, "global_step/max_steps": "39/382", "percentage": "10.21%", "elapsed_time": "28m 48s", "remaining_time": "4h 13m 24s"} |
| {"loss": 0.66080427, "token_acc": 0.75983437, "grad_norm": 4.52653313, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02246, "epoch": 0.10478062, "global_step/max_steps": "40/382", "percentage": "10.47%", "elapsed_time": "29m 33s", "remaining_time": "4h 12m 40s"} |
| {"loss": 0.65186536, "token_acc": 0.79147982, "grad_norm": 3.64908457, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022465, "epoch": 0.10740013, "global_step/max_steps": "41/382", "percentage": "10.73%", "elapsed_time": "30m 17s", "remaining_time": "4h 11m 54s"} |
| {"loss": 0.68240446, "token_acc": 0.77661795, "grad_norm": 3.86726975, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022469, "epoch": 0.11001965, "global_step/max_steps": "42/382", "percentage": "10.99%", "elapsed_time": "31m 1s", "remaining_time": "4h 11m 8s"} |
| {"loss": 0.61152977, "token_acc": 0.82034632, "grad_norm": 3.15071702, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022473, "epoch": 0.11263916, "global_step/max_steps": "43/382", "percentage": "11.26%", "elapsed_time": "31m 45s", "remaining_time": "4h 10m 22s"} |
| {"loss": 0.62651813, "token_acc": 0.81221719, "grad_norm": 3.61004853, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022476, "epoch": 0.11525868, "global_step/max_steps": "44/382", "percentage": "11.52%", "elapsed_time": "32m 29s", "remaining_time": "4h 9m 38s"} |
| {"loss": 0.59762698, "token_acc": 0.81432361, "grad_norm": 3.53385091, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02248, "epoch": 0.11787819, "global_step/max_steps": "45/382", "percentage": "11.78%", "elapsed_time": "33m 13s", "remaining_time": "4h 8m 52s"} |
| {"loss": 0.66770619, "token_acc": 0.80697674, "grad_norm": 3.72001886, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022483, "epoch": 0.12049771, "global_step/max_steps": "46/382", "percentage": "12.04%", "elapsed_time": "33m 58s", "remaining_time": "4h 8m 7s"} |
| {"loss": 0.65925568, "token_acc": 0.7987013, "grad_norm": 4.36251593, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022486, "epoch": 0.12311722, "global_step/max_steps": "47/382", "percentage": "12.30%", "elapsed_time": "34m 42s", "remaining_time": "4h 7m 22s"} |
| {"loss": 0.60573405, "token_acc": 0.82045455, "grad_norm": 3.52368331, "learning_rate": 9.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022487, "epoch": 0.12573674, "global_step/max_steps": "48/382", "percentage": "12.57%", "elapsed_time": "35m 26s", "remaining_time": "4h 6m 38s"} |
| {"loss": 0.64376664, "token_acc": 0.80042918, "grad_norm": 3.17806268, "learning_rate": 9.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02249, "epoch": 0.12835625, "global_step/max_steps": "49/382", "percentage": "12.83%", "elapsed_time": "36m 10s", "remaining_time": "4h 5m 53s"} |
| {"loss": 0.599733, "token_acc": 0.80561555, "grad_norm": 3.21537805, "learning_rate": 9.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022492, "epoch": 0.13097577, "global_step/max_steps": "50/382", "percentage": "13.09%", "elapsed_time": "36m 55s", "remaining_time": "4h 5m 8s"} |
| {"loss": 0.62038302, "token_acc": 0.80803571, "grad_norm": 3.20868015, "learning_rate": 9.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022313, "epoch": 0.13359528, "global_step/max_steps": "51/382", "percentage": "13.35%", "elapsed_time": "37m 57s", "remaining_time": "4h 6m 23s"} |
| {"loss": 0.58062661, "token_acc": 0.82366589, "grad_norm": 2.97014928, "learning_rate": 9.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022317, "epoch": 0.1362148, "global_step/max_steps": "52/382", "percentage": "13.61%", "elapsed_time": "38m 42s", "remaining_time": "4h 5m 37s"} |
| {"loss": 0.59990096, "token_acc": 0.80522088, "grad_norm": 2.60128856, "learning_rate": 9.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022323, "epoch": 0.13883432, "global_step/max_steps": "53/382", "percentage": "13.87%", "elapsed_time": "39m 26s", "remaining_time": "4h 4m 49s"} |
| {"loss": 0.60569948, "token_acc": 0.81377551, "grad_norm": 3.06451631, "learning_rate": 9.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022325, "epoch": 0.14145383, "global_step/max_steps": "54/382", "percentage": "14.14%", "elapsed_time": "40m 10s", "remaining_time": "4h 4m 4s"} |
| {"loss": 0.57674718, "token_acc": 0.77991453, "grad_norm": 3.12936854, "learning_rate": 9.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02233, "epoch": 0.14407335, "global_step/max_steps": "55/382", "percentage": "14.40%", "elapsed_time": "40m 55s", "remaining_time": "4h 3m 17s"} |
| {"loss": 0.61199594, "token_acc": 0.77217742, "grad_norm": 3.52171731, "learning_rate": 9.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022335, "epoch": 0.14669286, "global_step/max_steps": "56/382", "percentage": "14.66%", "elapsed_time": "41m 39s", "remaining_time": "4h 2m 30s"} |
| {"loss": 0.57319468, "token_acc": 0.78378378, "grad_norm": 3.37167382, "learning_rate": 9.7e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02234, "epoch": 0.14931238, "global_step/max_steps": "57/382", "percentage": "14.92%", "elapsed_time": "42m 23s", "remaining_time": "4h 1m 43s"} |
| {"loss": 0.54970801, "token_acc": 0.80694143, "grad_norm": 3.98172116, "learning_rate": 9.7e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022344, "epoch": 0.15193189, "global_step/max_steps": "58/382", "percentage": "15.18%", "elapsed_time": "43m 7s", "remaining_time": "4h 0m 56s"} |
| {"loss": 0.61220706, "token_acc": 0.78043912, "grad_norm": 3.11273694, "learning_rate": 9.7e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022349, "epoch": 0.15455141, "global_step/max_steps": "59/382", "percentage": "15.45%", "elapsed_time": "43m 52s", "remaining_time": "4h 0m 9s"} |
| {"loss": 0.61768484, "token_acc": 0.81620553, "grad_norm": 3.27498055, "learning_rate": 9.7e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022354, "epoch": 0.15717092, "global_step/max_steps": "60/382", "percentage": "15.71%", "elapsed_time": "44m 36s", "remaining_time": "3h 59m 22s"} |
| {"loss": 0.59785318, "token_acc": 0.76493256, "grad_norm": 2.65820551, "learning_rate": 9.7e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02236, "epoch": 0.15979044, "global_step/max_steps": "61/382", "percentage": "15.97%", "elapsed_time": "45m 20s", "remaining_time": "3h 58m 35s"} |
| {"loss": 0.59577692, "token_acc": 0.79741379, "grad_norm": 2.86553001, "learning_rate": 9.7e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022365, "epoch": 0.16240995, "global_step/max_steps": "62/382", "percentage": "16.23%", "elapsed_time": "46m 4s", "remaining_time": "3h 57m 47s"} |
| {"loss": 0.65320361, "token_acc": 0.76808905, "grad_norm": 3.04645538, "learning_rate": 9.7e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022367, "epoch": 0.16502947, "global_step/max_steps": "63/382", "percentage": "16.49%", "elapsed_time": "46m 48s", "remaining_time": "3h 57m 2s"} |
| {"loss": 0.59086221, "token_acc": 0.8091954, "grad_norm": 2.94969845, "learning_rate": 9.6e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02237, "epoch": 0.16764898, "global_step/max_steps": "64/382", "percentage": "16.75%", "elapsed_time": "47m 33s", "remaining_time": "3h 56m 16s"} |
| {"loss": 0.57665777, "token_acc": 0.81798715, "grad_norm": 2.82841063, "learning_rate": 9.6e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022375, "epoch": 0.1702685, "global_step/max_steps": "65/382", "percentage": "17.02%", "elapsed_time": "48m 17s", "remaining_time": "3h 55m 29s"} |
| {"loss": 0.59075731, "token_acc": 0.81263158, "grad_norm": 2.81398058, "learning_rate": 9.6e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022378, "epoch": 0.17288802, "global_step/max_steps": "66/382", "percentage": "17.28%", "elapsed_time": "49m 1s", "remaining_time": "3h 54m 43s"} |
| {"loss": 0.64419186, "token_acc": 0.7745098, "grad_norm": 3.11548734, "learning_rate": 9.6e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022383, "epoch": 0.17550753, "global_step/max_steps": "67/382", "percentage": "17.54%", "elapsed_time": "49m 45s", "remaining_time": "3h 53m 56s"} |
| {"loss": 0.52819848, "token_acc": 0.82891566, "grad_norm": 3.01923728, "learning_rate": 9.6e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022384, "epoch": 0.17812705, "global_step/max_steps": "68/382", "percentage": "17.80%", "elapsed_time": "50m 30s", "remaining_time": "3h 53m 11s"} |
| {"loss": 0.57159936, "token_acc": 0.80080483, "grad_norm": 2.69560075, "learning_rate": 9.6e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022387, "epoch": 0.18074656, "global_step/max_steps": "69/382", "percentage": "18.06%", "elapsed_time": "51m 14s", "remaining_time": "3h 52m 25s"} |
| {"loss": 0.57501435, "token_acc": 0.82590234, "grad_norm": 2.46505594, "learning_rate": 9.5e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02239, "epoch": 0.18336608, "global_step/max_steps": "70/382", "percentage": "18.32%", "elapsed_time": "51m 58s", "remaining_time": "3h 51m 39s"} |
| {"loss": 0.54292703, "token_acc": 0.80631579, "grad_norm": 2.79004002, "learning_rate": 9.5e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022393, "epoch": 0.18598559, "global_step/max_steps": "71/382", "percentage": "18.59%", "elapsed_time": "52m 42s", "remaining_time": "3h 50m 53s"} |
| {"loss": 0.62836897, "token_acc": 0.79275654, "grad_norm": 5.67881441, "learning_rate": 9.5e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022396, "epoch": 0.18860511, "global_step/max_steps": "72/382", "percentage": "18.85%", "elapsed_time": "53m 27s", "remaining_time": "3h 50m 8s"} |
| {"loss": 0.54798818, "token_acc": 0.8125, "grad_norm": 2.67142057, "learning_rate": 9.5e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022399, "epoch": 0.19122462, "global_step/max_steps": "73/382", "percentage": "19.11%", "elapsed_time": "54m 11s", "remaining_time": "3h 49m 21s"} |
| {"loss": 0.53348339, "token_acc": 0.86904762, "grad_norm": 2.73582673, "learning_rate": 9.5e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022403, "epoch": 0.19384414, "global_step/max_steps": "74/382", "percentage": "19.37%", "elapsed_time": "54m 55s", "remaining_time": "3h 48m 35s"} |
| {"loss": 0.57408327, "token_acc": 0.80972516, "grad_norm": 2.85904121, "learning_rate": 9.4e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022406, "epoch": 0.19646365, "global_step/max_steps": "75/382", "percentage": "19.63%", "elapsed_time": "55m 39s", "remaining_time": "3h 47m 49s"} |
| {"loss": 0.58941638, "token_acc": 0.81410256, "grad_norm": 3.15846062, "learning_rate": 9.4e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022407, "epoch": 0.19908317, "global_step/max_steps": "76/382", "percentage": "19.90%", "elapsed_time": "56m 23s", "remaining_time": "3h 47m 4s"} |
| {"loss": 0.5662564, "token_acc": 0.7962963, "grad_norm": 2.71515369, "learning_rate": 9.4e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022409, "epoch": 0.20170269, "global_step/max_steps": "77/382", "percentage": "20.16%", "elapsed_time": "57m 8s", "remaining_time": "3h 46m 19s"} |
| {"loss": 0.6219126, "token_acc": 0.78807947, "grad_norm": 3.22065401, "learning_rate": 9.4e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022412, "epoch": 0.2043222, "global_step/max_steps": "78/382", "percentage": "20.42%", "elapsed_time": "57m 52s", "remaining_time": "3h 45m 33s"} |
| {"loss": 0.63431448, "token_acc": 0.81904762, "grad_norm": 3.06673241, "learning_rate": 9.4e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022414, "epoch": 0.20694172, "global_step/max_steps": "79/382", "percentage": "20.68%", "elapsed_time": "58m 36s", "remaining_time": "3h 44m 48s"} |
| {"loss": 0.54292887, "token_acc": 0.80257511, "grad_norm": 2.91285133, "learning_rate": 9.3e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022416, "epoch": 0.20956123, "global_step/max_steps": "80/382", "percentage": "20.94%", "elapsed_time": "59m 21s", "remaining_time": "3h 44m 3s"} |
| {"loss": 0.58918417, "token_acc": 0.79555556, "grad_norm": 3.09997725, "learning_rate": 9.3e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022419, "epoch": 0.21218075, "global_step/max_steps": "81/382", "percentage": "21.20%", "elapsed_time": "1h 0m 5s", "remaining_time": "3h 43m 16s"} |
| {"loss": 0.54062927, "token_acc": 0.80659341, "grad_norm": 2.48728752, "learning_rate": 9.3e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022421, "epoch": 0.21480026, "global_step/max_steps": "82/382", "percentage": "21.47%", "elapsed_time": "1h 0m 49s", "remaining_time": "3h 42m 31s"} |
| {"loss": 0.56648374, "token_acc": 0.80125523, "grad_norm": 3.31763268, "learning_rate": 9.3e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022424, "epoch": 0.21741978, "global_step/max_steps": "83/382", "percentage": "21.73%", "elapsed_time": "1h 1m 33s", "remaining_time": "3h 41m 45s"} |
| {"loss": 0.55309772, "token_acc": 0.83484163, "grad_norm": 2.73038173, "learning_rate": 9.2e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022425, "epoch": 0.22003929, "global_step/max_steps": "84/382", "percentage": "21.99%", "elapsed_time": "1h 2m 17s", "remaining_time": "3h 41m 0s"} |
| {"loss": 0.57023978, "token_acc": 0.8012685, "grad_norm": 3.15801311, "learning_rate": 9.2e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022428, "epoch": 0.22265881, "global_step/max_steps": "85/382", "percentage": "22.25%", "elapsed_time": "1h 3m 2s", "remaining_time": "3h 40m 14s"} |
| {"loss": 0.51269996, "token_acc": 0.84221748, "grad_norm": 2.52478576, "learning_rate": 9.2e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02243, "epoch": 0.22527832, "global_step/max_steps": "86/382", "percentage": "22.51%", "elapsed_time": "1h 3m 46s", "remaining_time": "3h 39m 29s"} |
| {"loss": 0.54462188, "token_acc": 0.82819383, "grad_norm": 2.58417821, "learning_rate": 9.2e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022432, "epoch": 0.22789784, "global_step/max_steps": "87/382", "percentage": "22.77%", "elapsed_time": "1h 4m 30s", "remaining_time": "3h 38m 44s"} |
| {"loss": 0.58474964, "token_acc": 0.79802956, "grad_norm": 2.99677539, "learning_rate": 9.2e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022434, "epoch": 0.23051735, "global_step/max_steps": "88/382", "percentage": "23.04%", "elapsed_time": "1h 5m 14s", "remaining_time": "3h 37m 58s"} |
| {"loss": 0.55316675, "token_acc": 0.81778742, "grad_norm": 3.07680249, "learning_rate": 9.1e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022436, "epoch": 0.23313687, "global_step/max_steps": "89/382", "percentage": "23.30%", "elapsed_time": "1h 5m 58s", "remaining_time": "3h 37m 13s"} |
| {"loss": 0.55461621, "token_acc": 0.80120482, "grad_norm": 2.64685225, "learning_rate": 9.1e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022439, "epoch": 0.23575639, "global_step/max_steps": "90/382", "percentage": "23.56%", "elapsed_time": "1h 6m 43s", "remaining_time": "3h 36m 27s"} |
| {"loss": 0.50809103, "token_acc": 0.81875, "grad_norm": 2.47200179, "learning_rate": 9.1e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02244, "epoch": 0.2383759, "global_step/max_steps": "91/382", "percentage": "23.82%", "elapsed_time": "1h 7m 27s", "remaining_time": "3h 35m 42s"} |
| {"loss": 0.58417046, "token_acc": 0.81422018, "grad_norm": 2.9243257, "learning_rate": 9.1e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022442, "epoch": 0.24099542, "global_step/max_steps": "92/382", "percentage": "24.08%", "elapsed_time": "1h 8m 11s", "remaining_time": "3h 34m 57s"} |
| {"loss": 0.56323284, "token_acc": 0.8173516, "grad_norm": 2.69686198, "learning_rate": 9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022444, "epoch": 0.24361493, "global_step/max_steps": "93/382", "percentage": "24.35%", "elapsed_time": "1h 8m 55s", "remaining_time": "3h 34m 11s"} |
| {"loss": 0.57045102, "token_acc": 0.81512605, "grad_norm": 2.80572057, "learning_rate": 9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022445, "epoch": 0.24623445, "global_step/max_steps": "94/382", "percentage": "24.61%", "elapsed_time": "1h 9m 40s", "remaining_time": "3h 33m 27s"} |
| {"loss": 0.5538668, "token_acc": 0.81796117, "grad_norm": 3.27861238, "learning_rate": 9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022448, "epoch": 0.24885396, "global_step/max_steps": "95/382", "percentage": "24.87%", "elapsed_time": "1h 10m 24s", "remaining_time": "3h 32m 41s"} |
| {"loss": 0.53795999, "token_acc": 0.82572614, "grad_norm": 2.47431779, "learning_rate": 9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02245, "epoch": 0.25147348, "global_step/max_steps": "96/382", "percentage": "25.13%", "elapsed_time": "1h 11m 8s", "remaining_time": "3h 31m 56s"} |
| {"loss": 0.56026649, "token_acc": 0.78969957, "grad_norm": 2.45989275, "learning_rate": 8.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022452, "epoch": 0.25409299, "global_step/max_steps": "97/382", "percentage": "25.39%", "elapsed_time": "1h 11m 52s", "remaining_time": "3h 31m 11s"} |
| {"loss": 0.53506076, "token_acc": 0.82150538, "grad_norm": 2.28632545, "learning_rate": 8.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022454, "epoch": 0.25671251, "global_step/max_steps": "98/382", "percentage": "25.65%", "elapsed_time": "1h 12m 36s", "remaining_time": "3h 30m 25s"} |
| {"loss": 0.55808818, "token_acc": 0.85193133, "grad_norm": 2.95643806, "learning_rate": 8.9e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022456, "epoch": 0.25933202, "global_step/max_steps": "99/382", "percentage": "25.92%", "elapsed_time": "1h 13m 20s", "remaining_time": "3h 29m 40s"} |
| {"loss": 0.51788855, "token_acc": 0.81655481, "grad_norm": 2.50421, "learning_rate": 8.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022457, "epoch": 0.26195154, "global_step/max_steps": "100/382", "percentage": "26.18%", "elapsed_time": "1h 14m 5s", "remaining_time": "3h 28m 55s"} |
| {"loss": 0.53758514, "token_acc": 0.81333333, "grad_norm": 2.50001502, "learning_rate": 8.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022356, "epoch": 0.26457105, "global_step/max_steps": "101/382", "percentage": "26.44%", "elapsed_time": "1h 15m 9s", "remaining_time": "3h 29m 7s"} |
| {"loss": 0.584602, "token_acc": 0.81156317, "grad_norm": 2.84934139, "learning_rate": 8.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022359, "epoch": 0.26719057, "global_step/max_steps": "102/382", "percentage": "26.70%", "elapsed_time": "1h 15m 54s", "remaining_time": "3h 28m 21s"} |
| {"loss": 0.58011544, "token_acc": 0.83649289, "grad_norm": 3.06900954, "learning_rate": 8.8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02236, "epoch": 0.26981009, "global_step/max_steps": "103/382", "percentage": "26.96%", "elapsed_time": "1h 16m 38s", "remaining_time": "3h 27m 36s"} |
| {"loss": 0.54638553, "token_acc": 0.84269663, "grad_norm": 2.55726433, "learning_rate": 8.7e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022363, "epoch": 0.2724296, "global_step/max_steps": "104/382", "percentage": "27.23%", "elapsed_time": "1h 17m 22s", "remaining_time": "3h 26m 50s"} |
| {"loss": 0.57277572, "token_acc": 0.80042017, "grad_norm": 2.82760882, "learning_rate": 8.7e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022365, "epoch": 0.27504912, "global_step/max_steps": "105/382", "percentage": "27.49%", "elapsed_time": "1h 18m 7s", "remaining_time": "3h 26m 4s"} |
| {"loss": 0.56662297, "token_acc": 0.81762295, "grad_norm": 2.8642962, "learning_rate": 8.7e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022367, "epoch": 0.27766863, "global_step/max_steps": "106/382", "percentage": "27.75%", "elapsed_time": "1h 18m 51s", "remaining_time": "3h 25m 18s"} |
| {"loss": 0.54071724, "token_acc": 0.84178499, "grad_norm": 3.29882622, "learning_rate": 8.6e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02237, "epoch": 0.28028815, "global_step/max_steps": "107/382", "percentage": "28.01%", "elapsed_time": "1h 19m 35s", "remaining_time": "3h 24m 33s"} |
| {"loss": 0.55981302, "token_acc": 0.82352941, "grad_norm": 2.72793126, "learning_rate": 8.6e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022373, "epoch": 0.28290766, "global_step/max_steps": "108/382", "percentage": "28.27%", "elapsed_time": "1h 20m 19s", "remaining_time": "3h 23m 47s"} |
| {"loss": 0.53673041, "token_acc": 0.81474104, "grad_norm": 3.23735809, "learning_rate": 8.6e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022373, "epoch": 0.28552718, "global_step/max_steps": "109/382", "percentage": "28.53%", "elapsed_time": "1h 21m 4s", "remaining_time": "3h 23m 2s"} |
| {"loss": 0.513282, "token_acc": 0.84711779, "grad_norm": 2.66610456, "learning_rate": 8.6e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022376, "epoch": 0.28814669, "global_step/max_steps": "110/382", "percentage": "28.80%", "elapsed_time": "1h 21m 48s", "remaining_time": "3h 22m 16s"} |
| {"loss": 0.5320757, "token_acc": 0.82828283, "grad_norm": 2.53704047, "learning_rate": 8.5e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022378, "epoch": 0.29076621, "global_step/max_steps": "111/382", "percentage": "29.06%", "elapsed_time": "1h 22m 32s", "remaining_time": "3h 21m 30s"} |
| {"loss": 0.5458982, "token_acc": 0.81595092, "grad_norm": 2.57676959, "learning_rate": 8.5e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02238, "epoch": 0.29338572, "global_step/max_steps": "112/382", "percentage": "29.32%", "elapsed_time": "1h 23m 16s", "remaining_time": "3h 20m 45s"} |
| {"loss": 0.51977658, "token_acc": 0.81374723, "grad_norm": 2.7759788, "learning_rate": 8.5e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022382, "epoch": 0.29600524, "global_step/max_steps": "113/382", "percentage": "29.58%", "elapsed_time": "1h 24m 0s", "remaining_time": "3h 19m 59s"} |
| {"loss": 0.55347836, "token_acc": 0.81385281, "grad_norm": 2.89089918, "learning_rate": 8.4e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022384, "epoch": 0.29862475, "global_step/max_steps": "114/382", "percentage": "29.84%", "elapsed_time": "1h 24m 45s", "remaining_time": "3h 19m 14s"} |
| {"loss": 0.55322838, "token_acc": 0.81967213, "grad_norm": 2.58021998, "learning_rate": 8.4e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022386, "epoch": 0.30124427, "global_step/max_steps": "115/382", "percentage": "30.10%", "elapsed_time": "1h 25m 29s", "remaining_time": "3h 18m 29s"} |
| {"loss": 0.58144367, "token_acc": 0.78469751, "grad_norm": 2.74242997, "learning_rate": 8.4e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022387, "epoch": 0.30386379, "global_step/max_steps": "116/382", "percentage": "30.37%", "elapsed_time": "1h 26m 13s", "remaining_time": "3h 17m 43s"} |
| {"loss": 0.55434275, "token_acc": 0.84174312, "grad_norm": 3.10054278, "learning_rate": 8.3e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022389, "epoch": 0.3064833, "global_step/max_steps": "117/382", "percentage": "30.63%", "elapsed_time": "1h 26m 57s", "remaining_time": "3h 16m 58s"} |
| {"loss": 0.55493283, "token_acc": 0.79411765, "grad_norm": 2.91019678, "learning_rate": 8.3e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022391, "epoch": 0.30910282, "global_step/max_steps": "118/382", "percentage": "30.89%", "elapsed_time": "1h 27m 42s", "remaining_time": "3h 16m 13s"} |
| {"loss": 0.54549479, "token_acc": 0.78431373, "grad_norm": 3.06090236, "learning_rate": 8.3e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022392, "epoch": 0.31172233, "global_step/max_steps": "119/382", "percentage": "31.15%", "elapsed_time": "1h 28m 26s", "remaining_time": "3h 15m 27s"} |
| {"loss": 0.54473174, "token_acc": 0.80513919, "grad_norm": 2.69655848, "learning_rate": 8.2e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022394, "epoch": 0.31434185, "global_step/max_steps": "120/382", "percentage": "31.41%", "elapsed_time": "1h 29m 10s", "remaining_time": "3h 14m 42s"} |
| {"loss": 0.52929747, "token_acc": 0.80078125, "grad_norm": 2.54633093, "learning_rate": 8.2e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022397, "epoch": 0.31696136, "global_step/max_steps": "121/382", "percentage": "31.68%", "elapsed_time": "1h 29m 54s", "remaining_time": "3h 13m 56s"} |
| {"loss": 0.57147837, "token_acc": 0.81065089, "grad_norm": 2.43160534, "learning_rate": 8.2e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022398, "epoch": 0.31958088, "global_step/max_steps": "122/382", "percentage": "31.94%", "elapsed_time": "1h 30m 39s", "remaining_time": "3h 13m 11s"} |
| {"loss": 0.52648687, "token_acc": 0.82444444, "grad_norm": 3.25904512, "learning_rate": 8.1e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.0224, "epoch": 0.32220039, "global_step/max_steps": "123/382", "percentage": "32.20%", "elapsed_time": "1h 31m 23s", "remaining_time": "3h 12m 25s"} |
| {"loss": 0.56884331, "token_acc": 0.77317554, "grad_norm": 2.78394985, "learning_rate": 8.1e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022403, "epoch": 0.32481991, "global_step/max_steps": "124/382", "percentage": "32.46%", "elapsed_time": "1h 32m 7s", "remaining_time": "3h 11m 40s"} |
| {"loss": 0.53383148, "token_acc": 0.8515625, "grad_norm": 2.60660028, "learning_rate": 8.1e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022405, "epoch": 0.32743942, "global_step/max_steps": "125/382", "percentage": "32.72%", "elapsed_time": "1h 32m 51s", "remaining_time": "3h 10m 54s"} |
| {"loss": 0.55269098, "token_acc": 0.78793774, "grad_norm": 2.74498105, "learning_rate": 8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022407, "epoch": 0.33005894, "global_step/max_steps": "126/382", "percentage": "32.98%", "elapsed_time": "1h 33m 35s", "remaining_time": "3h 10m 8s"} |
| {"loss": 0.57169771, "token_acc": 0.80449438, "grad_norm": 2.77639174, "learning_rate": 8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02241, "epoch": 0.33267845, "global_step/max_steps": "127/382", "percentage": "33.25%", "elapsed_time": "1h 34m 19s", "remaining_time": "3h 9m 23s"} |
| {"loss": 0.52535373, "token_acc": 0.83027523, "grad_norm": 2.79327416, "learning_rate": 8e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022412, "epoch": 0.33529797, "global_step/max_steps": "128/382", "percentage": "33.51%", "elapsed_time": "1h 35m 3s", "remaining_time": "3h 8m 37s"} |
| {"loss": 0.54805946, "token_acc": 0.82366589, "grad_norm": 2.862463, "learning_rate": 7.9e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022412, "epoch": 0.33791749, "global_step/max_steps": "129/382", "percentage": "33.77%", "elapsed_time": "1h 35m 47s", "remaining_time": "3h 7m 53s"} |
| {"loss": 0.52405667, "token_acc": 0.79680365, "grad_norm": 2.48460793, "learning_rate": 7.9e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022414, "epoch": 0.340537, "global_step/max_steps": "130/382", "percentage": "34.03%", "elapsed_time": "1h 36m 32s", "remaining_time": "3h 7m 7s"} |
| {"loss": 0.56306177, "token_acc": 0.81341719, "grad_norm": 2.54159451, "learning_rate": 7.9e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022415, "epoch": 0.34315652, "global_step/max_steps": "131/382", "percentage": "34.29%", "elapsed_time": "1h 37m 16s", "remaining_time": "3h 6m 22s"} |
| {"loss": 0.51485419, "token_acc": 0.83333333, "grad_norm": 2.78697515, "learning_rate": 7.8e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022417, "epoch": 0.34577603, "global_step/max_steps": "132/382", "percentage": "34.55%", "elapsed_time": "1h 38m 0s", "remaining_time": "3h 5m 37s"} |
| {"loss": 0.54357243, "token_acc": 0.8434238, "grad_norm": 2.9632628, "learning_rate": 7.8e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022419, "epoch": 0.34839555, "global_step/max_steps": "133/382", "percentage": "34.82%", "elapsed_time": "1h 38m 44s", "remaining_time": "3h 4m 51s"} |
| {"loss": 0.48579407, "token_acc": 0.88095238, "grad_norm": 2.49095058, "learning_rate": 7.7e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.02242, "epoch": 0.35101506, "global_step/max_steps": "134/382", "percentage": "35.08%", "elapsed_time": "1h 39m 28s", "remaining_time": "3h 4m 6s"} |
| {"loss": 0.52659452, "token_acc": 0.79885057, "grad_norm": 2.55596328, "learning_rate": 7.7e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022422, "epoch": 0.35363458, "global_step/max_steps": "135/382", "percentage": "35.34%", "elapsed_time": "1h 40m 13s", "remaining_time": "3h 3m 21s"} |
| {"loss": 0.52635705, "token_acc": 0.78611632, "grad_norm": 2.50859308, "learning_rate": 7.7e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022424, "epoch": 0.35625409, "global_step/max_steps": "136/382", "percentage": "35.60%", "elapsed_time": "1h 40m 57s", "remaining_time": "3h 2m 36s"} |
| {"loss": 0.52039886, "token_acc": 0.81687243, "grad_norm": 2.16778994, "learning_rate": 7.6e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022426, "epoch": 0.35887361, "global_step/max_steps": "137/382", "percentage": "35.86%", "elapsed_time": "1h 41m 41s", "remaining_time": "3h 1m 51s"} |
| {"loss": 0.56878233, "token_acc": 0.8325, "grad_norm": 2.98542762, "learning_rate": 7.6e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022427, "epoch": 0.36149312, "global_step/max_steps": "138/382", "percentage": "36.13%", "elapsed_time": "1h 42m 25s", "remaining_time": "3h 1m 5s"} |
| {"loss": 0.52613407, "token_acc": 0.82459677, "grad_norm": 2.5845294, "learning_rate": 7.6e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022429, "epoch": 0.36411264, "global_step/max_steps": "139/382", "percentage": "36.39%", "elapsed_time": "1h 43m 9s", "remaining_time": "3h 0m 20s"} |
| {"loss": 0.51853687, "token_acc": 0.78598485, "grad_norm": 2.47631788, "learning_rate": 7.5e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.02243, "epoch": 0.36673215, "global_step/max_steps": "140/382", "percentage": "36.65%", "elapsed_time": "1h 43m 53s", "remaining_time": "2h 59m 35s"} |
| {"loss": 0.49533948, "token_acc": 0.85572139, "grad_norm": 2.77369237, "learning_rate": 7.5e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022431, "epoch": 0.36935167, "global_step/max_steps": "141/382", "percentage": "36.91%", "elapsed_time": "1h 44m 37s", "remaining_time": "2h 58m 50s"} |
| {"loss": 0.49524927, "token_acc": 0.8, "grad_norm": 2.41416574, "learning_rate": 7.4e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022432, "epoch": 0.37197119, "global_step/max_steps": "142/382", "percentage": "37.17%", "elapsed_time": "1h 45m 22s", "remaining_time": "2h 58m 5s"} |
| {"loss": 0.56055558, "token_acc": 0.79822616, "grad_norm": 2.49759245, "learning_rate": 7.4e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022434, "epoch": 0.3745907, "global_step/max_steps": "143/382", "percentage": "37.43%", "elapsed_time": "1h 46m 6s", "remaining_time": "2h 57m 20s"} |
| {"loss": 0.53179586, "token_acc": 0.80511811, "grad_norm": 2.02871633, "learning_rate": 7.4e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022435, "epoch": 0.37721022, "global_step/max_steps": "144/382", "percentage": "37.70%", "elapsed_time": "1h 46m 50s", "remaining_time": "2h 56m 35s"} |
| {"loss": 0.4882631, "token_acc": 0.83095723, "grad_norm": 2.7108078, "learning_rate": 7.3e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022436, "epoch": 0.37982973, "global_step/max_steps": "145/382", "percentage": "37.96%", "elapsed_time": "1h 47m 35s", "remaining_time": "2h 55m 50s"} |
| {"loss": 0.53704327, "token_acc": 0.80578512, "grad_norm": 2.56037617, "learning_rate": 7.3e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022437, "epoch": 0.38244925, "global_step/max_steps": "146/382", "percentage": "38.22%", "elapsed_time": "1h 48m 19s", "remaining_time": "2h 55m 5s"} |
| {"loss": 0.53114104, "token_acc": 0.84299517, "grad_norm": 2.42046642, "learning_rate": 7.3e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022439, "epoch": 0.38506876, "global_step/max_steps": "147/382", "percentage": "38.48%", "elapsed_time": "1h 49m 3s", "remaining_time": "2h 54m 20s"} |
| {"loss": 0.50710648, "token_acc": 0.81538462, "grad_norm": 2.6499052, "learning_rate": 7.2e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.02244, "epoch": 0.38768828, "global_step/max_steps": "148/382", "percentage": "38.74%", "elapsed_time": "1h 49m 47s", "remaining_time": "2h 53m 35s"} |
| {"loss": 0.5461446, "token_acc": 0.81124498, "grad_norm": 2.7632947, "learning_rate": 7.2e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022441, "epoch": 0.39030779, "global_step/max_steps": "149/382", "percentage": "39.01%", "elapsed_time": "1h 50m 31s", "remaining_time": "2h 52m 50s"} |
| {"loss": 0.49172658, "token_acc": 0.82844244, "grad_norm": 2.63244271, "learning_rate": 7.1e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022443, "epoch": 0.39292731, "global_step/max_steps": "150/382", "percentage": "39.27%", "elapsed_time": "1h 51m 15s", "remaining_time": "2h 52m 5s"} |
| {"loss": 0.52759278, "token_acc": 0.82618026, "grad_norm": 2.82195163, "learning_rate": 7.1e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022379, "epoch": 0.39554682, "global_step/max_steps": "151/382", "percentage": "39.53%", "elapsed_time": "1h 52m 19s", "remaining_time": "2h 51m 50s"} |
| {"loss": 0.49508414, "token_acc": 0.82315789, "grad_norm": 2.38450098, "learning_rate": 7.1e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022381, "epoch": 0.39816634, "global_step/max_steps": "152/382", "percentage": "39.79%", "elapsed_time": "1h 53m 3s", "remaining_time": "2h 51m 4s"} |
| {"loss": 0.5475089, "token_acc": 0.83660131, "grad_norm": 2.9017446, "learning_rate": 7e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022382, "epoch": 0.40078585, "global_step/max_steps": "153/382", "percentage": "40.05%", "elapsed_time": "1h 53m 48s", "remaining_time": "2h 50m 19s"} |
| {"loss": 0.49471945, "token_acc": 0.86732187, "grad_norm": 2.78416133, "learning_rate": 7e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022383, "epoch": 0.40340537, "global_step/max_steps": "154/382", "percentage": "40.31%", "elapsed_time": "1h 54m 32s", "remaining_time": "2h 49m 34s"} |
| {"loss": 0.51877034, "token_acc": 0.81799163, "grad_norm": 3.03494954, "learning_rate": 6.9e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022384, "epoch": 0.40602489, "global_step/max_steps": "155/382", "percentage": "40.58%", "elapsed_time": "1h 55m 16s", "remaining_time": "2h 48m 49s"} |
| {"loss": 0.52270067, "token_acc": 0.80578512, "grad_norm": 2.52233911, "learning_rate": 6.9e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022385, "epoch": 0.4086444, "global_step/max_steps": "156/382", "percentage": "40.84%", "elapsed_time": "1h 56m 1s", "remaining_time": "2h 48m 4s"} |
| {"loss": 0.51957548, "token_acc": 0.81923077, "grad_norm": 2.73326564, "learning_rate": 6.9e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022386, "epoch": 0.41126392, "global_step/max_steps": "157/382", "percentage": "41.10%", "elapsed_time": "1h 56m 45s", "remaining_time": "2h 47m 19s"} |
| {"loss": 0.48856601, "token_acc": 0.81034483, "grad_norm": 2.67405033, "learning_rate": 6.8e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022388, "epoch": 0.41388343, "global_step/max_steps": "158/382", "percentage": "41.36%", "elapsed_time": "1h 57m 29s", "remaining_time": "2h 46m 34s"} |
| {"loss": 0.53309989, "token_acc": 0.8062954, "grad_norm": 2.57290864, "learning_rate": 6.8e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.02239, "epoch": 0.41650295, "global_step/max_steps": "159/382", "percentage": "41.62%", "elapsed_time": "1h 58m 13s", "remaining_time": "2h 45m 48s"} |
| {"loss": 0.46343875, "token_acc": 0.83406114, "grad_norm": 2.55110002, "learning_rate": 6.7e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022391, "epoch": 0.41912246, "global_step/max_steps": "160/382", "percentage": "41.88%", "elapsed_time": "1h 58m 57s", "remaining_time": "2h 45m 3s"} |
| {"loss": 0.48153919, "token_acc": 0.8215103, "grad_norm": 2.29955912, "learning_rate": 6.7e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022393, "epoch": 0.42174198, "global_step/max_steps": "161/382", "percentage": "42.15%", "elapsed_time": "1h 59m 41s", "remaining_time": "2h 44m 18s"} |
| {"loss": 0.527336, "token_acc": 0.818, "grad_norm": 2.67446971, "learning_rate": 6.7e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022394, "epoch": 0.42436149, "global_step/max_steps": "162/382", "percentage": "42.41%", "elapsed_time": "2h 0m 26s", "remaining_time": "2h 43m 33s"} |
| {"loss": 0.47238955, "token_acc": 0.85286783, "grad_norm": 2.8775475, "learning_rate": 6.6e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022395, "epoch": 0.42698101, "global_step/max_steps": "163/382", "percentage": "42.67%", "elapsed_time": "2h 1m 10s", "remaining_time": "2h 42m 48s"} |
| {"loss": 0.51551741, "token_acc": 0.83703704, "grad_norm": 3.4531374, "learning_rate": 6.6e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022397, "epoch": 0.42960052, "global_step/max_steps": "164/382", "percentage": "42.93%", "elapsed_time": "2h 1m 54s", "remaining_time": "2h 42m 3s"} |
| {"loss": 0.54353023, "token_acc": 0.81023454, "grad_norm": 2.82989383, "learning_rate": 6.5e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022398, "epoch": 0.43222004, "global_step/max_steps": "165/382", "percentage": "43.19%", "elapsed_time": "2h 2m 38s", "remaining_time": "2h 41m 18s"} |
| {"loss": 0.54591262, "token_acc": 0.82352941, "grad_norm": 2.85191512, "learning_rate": 6.5e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022399, "epoch": 0.43483955, "global_step/max_steps": "166/382", "percentage": "43.46%", "elapsed_time": "2h 3m 23s", "remaining_time": "2h 40m 33s"} |
| {"loss": 0.50945723, "token_acc": 0.8377193, "grad_norm": 2.76378155, "learning_rate": 6.5e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.0224, "epoch": 0.43745907, "global_step/max_steps": "167/382", "percentage": "43.72%", "elapsed_time": "2h 4m 7s", "remaining_time": "2h 39m 48s"} |
| {"loss": 0.54632461, "token_acc": 0.8558952, "grad_norm": 2.6083324, "learning_rate": 6.4e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022401, "epoch": 0.44007859, "global_step/max_steps": "168/382", "percentage": "43.98%", "elapsed_time": "2h 4m 51s", "remaining_time": "2h 39m 3s"} |
| {"loss": 0.51947129, "token_acc": 0.81380753, "grad_norm": 2.77406454, "learning_rate": 6.4e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022402, "epoch": 0.4426981, "global_step/max_steps": "169/382", "percentage": "44.24%", "elapsed_time": "2h 5m 36s", "remaining_time": "2h 38m 18s"} |
| {"loss": 0.51768351, "token_acc": 0.81419624, "grad_norm": 2.21139479, "learning_rate": 6.3e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022403, "epoch": 0.44531762, "global_step/max_steps": "170/382", "percentage": "44.50%", "elapsed_time": "2h 6m 20s", "remaining_time": "2h 37m 33s"} |
| {"loss": 0.48924193, "token_acc": 0.79541109, "grad_norm": 2.71219563, "learning_rate": 6.3e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022404, "epoch": 0.44793713, "global_step/max_steps": "171/382", "percentage": "44.76%", "elapsed_time": "2h 7m 4s", "remaining_time": "2h 36m 48s"} |
| {"loss": 0.50808412, "token_acc": 0.81649485, "grad_norm": 2.42337561, "learning_rate": 6.2e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022405, "epoch": 0.45055665, "global_step/max_steps": "172/382", "percentage": "45.03%", "elapsed_time": "2h 7m 48s", "remaining_time": "2h 36m 3s"} |
| {"loss": 0.5036391, "token_acc": 0.81956522, "grad_norm": 2.55144739, "learning_rate": 6.2e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022407, "epoch": 0.45317616, "global_step/max_steps": "173/382", "percentage": "45.29%", "elapsed_time": "2h 8m 33s", "remaining_time": "2h 35m 18s"} |
| {"loss": 0.53655308, "token_acc": 0.80044346, "grad_norm": 2.69291735, "learning_rate": 6.2e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022408, "epoch": 0.45579568, "global_step/max_steps": "174/382", "percentage": "45.55%", "elapsed_time": "2h 9m 17s", "remaining_time": "2h 34m 32s"} |
| {"loss": 0.50436962, "token_acc": 0.84101382, "grad_norm": 2.59588885, "learning_rate": 6.1e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022409, "epoch": 0.45841519, "global_step/max_steps": "175/382", "percentage": "45.81%", "elapsed_time": "2h 10m 1s", "remaining_time": "2h 33m 48s"} |
| {"loss": 0.53702331, "token_acc": 0.81417625, "grad_norm": 2.6926198, "learning_rate": 6.1e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.02241, "epoch": 0.46103471, "global_step/max_steps": "176/382", "percentage": "46.07%", "elapsed_time": "2h 10m 45s", "remaining_time": "2h 33m 3s"} |
| {"loss": 0.53248501, "token_acc": 0.85596708, "grad_norm": 2.41775131, "learning_rate": 6e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022411, "epoch": 0.46365422, "global_step/max_steps": "177/382", "percentage": "46.34%", "elapsed_time": "2h 11m 29s", "remaining_time": "2h 32m 18s"} |
| {"loss": 0.50666475, "token_acc": 0.81026786, "grad_norm": 2.33523512, "learning_rate": 6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022413, "epoch": 0.46627374, "global_step/max_steps": "178/382", "percentage": "46.60%", "elapsed_time": "2h 12m 14s", "remaining_time": "2h 31m 33s"} |
| {"loss": 0.56043899, "token_acc": 0.79303279, "grad_norm": 2.75262332, "learning_rate": 5.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022413, "epoch": 0.46889325, "global_step/max_steps": "179/382", "percentage": "46.86%", "elapsed_time": "2h 12m 58s", "remaining_time": "2h 30m 48s"} |
| {"loss": 0.53187525, "token_acc": 0.7806841, "grad_norm": 3.07034922, "learning_rate": 5.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022415, "epoch": 0.47151277, "global_step/max_steps": "180/382", "percentage": "47.12%", "elapsed_time": "2h 13m 42s", "remaining_time": "2h 30m 3s"} |
| {"loss": 0.52598518, "token_acc": 0.8266129, "grad_norm": 3.64936066, "learning_rate": 5.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022416, "epoch": 0.47413229, "global_step/max_steps": "181/382", "percentage": "47.38%", "elapsed_time": "2h 14m 26s", "remaining_time": "2h 29m 18s"} |
| {"loss": 0.53214073, "token_acc": 0.8212766, "grad_norm": 2.21829605, "learning_rate": 5.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022417, "epoch": 0.4767518, "global_step/max_steps": "182/382", "percentage": "47.64%", "elapsed_time": "2h 15m 11s", "remaining_time": "2h 28m 33s"} |
| {"loss": 0.49147552, "token_acc": 0.83469388, "grad_norm": 2.34784102, "learning_rate": 5.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022418, "epoch": 0.47937132, "global_step/max_steps": "183/382", "percentage": "47.91%", "elapsed_time": "2h 15m 55s", "remaining_time": "2h 27m 48s"} |
| {"loss": 0.53238106, "token_acc": 0.82051282, "grad_norm": 2.511379, "learning_rate": 5.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022419, "epoch": 0.48199083, "global_step/max_steps": "184/382", "percentage": "48.17%", "elapsed_time": "2h 16m 39s", "remaining_time": "2h 27m 3s"} |
| {"loss": 0.56500477, "token_acc": 0.82150101, "grad_norm": 2.98427033, "learning_rate": 5.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02242, "epoch": 0.48461035, "global_step/max_steps": "185/382", "percentage": "48.43%", "elapsed_time": "2h 17m 23s", "remaining_time": "2h 26m 18s"} |
| {"loss": 0.53457993, "token_acc": 0.7938343, "grad_norm": 2.43210125, "learning_rate": 5.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022421, "epoch": 0.48722986, "global_step/max_steps": "186/382", "percentage": "48.69%", "elapsed_time": "2h 18m 7s", "remaining_time": "2h 25m 33s"} |
| {"loss": 0.4752824, "token_acc": 0.83222958, "grad_norm": 2.2675736, "learning_rate": 5.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022422, "epoch": 0.48984938, "global_step/max_steps": "187/382", "percentage": "48.95%", "elapsed_time": "2h 18m 52s", "remaining_time": "2h 24m 48s"} |
| {"loss": 0.48174724, "token_acc": 0.80761905, "grad_norm": 2.72362661, "learning_rate": 5.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022423, "epoch": 0.49246889, "global_step/max_steps": "188/382", "percentage": "49.21%", "elapsed_time": "2h 19m 36s", "remaining_time": "2h 24m 3s"} |
| {"loss": 0.53375477, "token_acc": 0.81400438, "grad_norm": 2.6596787, "learning_rate": 5.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022424, "epoch": 0.49508841, "global_step/max_steps": "189/382", "percentage": "49.48%", "elapsed_time": "2h 20m 20s", "remaining_time": "2h 23m 18s"} |
| {"loss": 0.52485222, "token_acc": 0.82959641, "grad_norm": 2.37396288, "learning_rate": 5.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022425, "epoch": 0.49770792, "global_step/max_steps": "190/382", "percentage": "49.74%", "elapsed_time": "2h 21m 4s", "remaining_time": "2h 22m 33s"} |
| {"loss": 0.51352477, "token_acc": 0.84878049, "grad_norm": 2.49276638, "learning_rate": 5.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022426, "epoch": 0.50032744, "global_step/max_steps": "191/382", "percentage": "50.00%", "elapsed_time": "2h 21m 48s", "remaining_time": "2h 21m 48s"} |
| {"loss": 0.49876159, "token_acc": 0.81726908, "grad_norm": 2.74990988, "learning_rate": 5.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022427, "epoch": 0.50294695, "global_step/max_steps": "192/382", "percentage": "50.26%", "elapsed_time": "2h 22m 33s", "remaining_time": "2h 21m 4s"} |
| {"loss": 0.47379774, "token_acc": 0.82815735, "grad_norm": 2.98273993, "learning_rate": 5.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022428, "epoch": 0.50556647, "global_step/max_steps": "193/382", "percentage": "50.52%", "elapsed_time": "2h 23m 17s", "remaining_time": "2h 20m 19s"} |
| {"loss": 0.48123741, "token_acc": 0.83265306, "grad_norm": 2.53498387, "learning_rate": 5.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022429, "epoch": 0.50818599, "global_step/max_steps": "194/382", "percentage": "50.79%", "elapsed_time": "2h 24m 1s", "remaining_time": "2h 19m 34s"} |
| {"loss": 0.52715617, "token_acc": 0.81312127, "grad_norm": 2.60506988, "learning_rate": 5.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02243, "epoch": 0.5108055, "global_step/max_steps": "195/382", "percentage": "51.05%", "elapsed_time": "2h 24m 45s", "remaining_time": "2h 18m 49s"} |
| {"loss": 0.54259253, "token_acc": 0.83259912, "grad_norm": 2.84257627, "learning_rate": 5.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022431, "epoch": 0.51342502, "global_step/max_steps": "196/382", "percentage": "51.31%", "elapsed_time": "2h 25m 29s", "remaining_time": "2h 18m 4s"} |
| {"loss": 0.54637933, "token_acc": 0.81162325, "grad_norm": 2.70492864, "learning_rate": 5.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022433, "epoch": 0.51604453, "global_step/max_steps": "197/382", "percentage": "51.57%", "elapsed_time": "2h 26m 13s", "remaining_time": "2h 17m 19s"} |
| {"loss": 0.49658859, "token_acc": 0.85324948, "grad_norm": 2.72348547, "learning_rate": 5.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022434, "epoch": 0.51866405, "global_step/max_steps": "198/382", "percentage": "51.83%", "elapsed_time": "2h 26m 58s", "remaining_time": "2h 16m 34s"} |
| {"loss": 0.52023447, "token_acc": 0.84868421, "grad_norm": 3.04039073, "learning_rate": 5.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022434, "epoch": 0.52128356, "global_step/max_steps": "199/382", "percentage": "52.09%", "elapsed_time": "2h 27m 42s", "remaining_time": "2h 15m 49s"} |
| {"loss": 0.52931917, "token_acc": 0.78269618, "grad_norm": 2.84322786, "learning_rate": 5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022436, "epoch": 0.52390308, "global_step/max_steps": "200/382", "percentage": "52.36%", "elapsed_time": "2h 28m 26s", "remaining_time": "2h 15m 4s"} |
| {"loss": 0.50149959, "token_acc": 0.80842105, "grad_norm": 2.38139582, "learning_rate": 5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022391, "epoch": 0.52652259, "global_step/max_steps": "201/382", "percentage": "52.62%", "elapsed_time": "2h 29m 28s", "remaining_time": "2h 14m 36s"} |
| {"loss": 0.57289803, "token_acc": 0.7804878, "grad_norm": 2.76520133, "learning_rate": 5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022391, "epoch": 0.52914211, "global_step/max_steps": "202/382", "percentage": "52.88%", "elapsed_time": "2h 30m 13s", "remaining_time": "2h 13m 51s"} |
| {"loss": 0.51116443, "token_acc": 0.8380744, "grad_norm": 2.90713072, "learning_rate": 4.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022393, "epoch": 0.53176162, "global_step/max_steps": "203/382", "percentage": "53.14%", "elapsed_time": "2h 30m 57s", "remaining_time": "2h 13m 6s"} |
| {"loss": 0.50021529, "token_acc": 0.80508475, "grad_norm": 2.45720553, "learning_rate": 4.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022394, "epoch": 0.53438114, "global_step/max_steps": "204/382", "percentage": "53.40%", "elapsed_time": "2h 31m 41s", "remaining_time": "2h 12m 21s"} |
| {"loss": 0.50174642, "token_acc": 0.8233945, "grad_norm": 2.62117815, "learning_rate": 4.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022395, "epoch": 0.53700065, "global_step/max_steps": "205/382", "percentage": "53.66%", "elapsed_time": "2h 32m 26s", "remaining_time": "2h 11m 36s"} |
| {"loss": 0.45794088, "token_acc": 0.82403433, "grad_norm": 2.28542829, "learning_rate": 4.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022396, "epoch": 0.53962017, "global_step/max_steps": "206/382", "percentage": "53.93%", "elapsed_time": "2h 33m 10s", "remaining_time": "2h 10m 51s"} |
| {"loss": 0.49056792, "token_acc": 0.83258929, "grad_norm": 2.48000002, "learning_rate": 4.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022398, "epoch": 0.54223969, "global_step/max_steps": "207/382", "percentage": "54.19%", "elapsed_time": "2h 33m 54s", "remaining_time": "2h 10m 6s"} |
| {"loss": 0.52487731, "token_acc": 0.81278539, "grad_norm": 2.82355165, "learning_rate": 4.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022399, "epoch": 0.5448592, "global_step/max_steps": "208/382", "percentage": "54.45%", "elapsed_time": "2h 34m 38s", "remaining_time": "2h 9m 21s"} |
| {"loss": 0.53685921, "token_acc": 0.87223587, "grad_norm": 2.60529661, "learning_rate": 4.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.0224, "epoch": 0.54747872, "global_step/max_steps": "209/382", "percentage": "54.71%", "elapsed_time": "2h 35m 22s", "remaining_time": "2h 8m 36s"} |
| {"loss": 0.49091154, "token_acc": 0.80909091, "grad_norm": 2.4320972, "learning_rate": 4.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022401, "epoch": 0.55009823, "global_step/max_steps": "210/382", "percentage": "54.97%", "elapsed_time": "2h 36m 6s", "remaining_time": "2h 7m 51s"} |
| {"loss": 0.50956798, "token_acc": 0.81767956, "grad_norm": 2.56773639, "learning_rate": 4.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022402, "epoch": 0.55271775, "global_step/max_steps": "211/382", "percentage": "55.24%", "elapsed_time": "2h 36m 50s", "remaining_time": "2h 7m 6s"} |
| {"loss": 0.50908512, "token_acc": 0.8045738, "grad_norm": 2.58115029, "learning_rate": 4.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022403, "epoch": 0.55533726, "global_step/max_steps": "212/382", "percentage": "55.50%", "elapsed_time": "2h 37m 35s", "remaining_time": "2h 6m 21s"} |
| {"loss": 0.49449897, "token_acc": 0.82758621, "grad_norm": 2.55412769, "learning_rate": 4.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022404, "epoch": 0.55795678, "global_step/max_steps": "213/382", "percentage": "55.76%", "elapsed_time": "2h 38m 19s", "remaining_time": "2h 5m 36s"} |
| {"loss": 0.52230346, "token_acc": 0.812749, "grad_norm": 2.5495348, "learning_rate": 4.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022405, "epoch": 0.56057629, "global_step/max_steps": "214/382", "percentage": "56.02%", "elapsed_time": "2h 39m 3s", "remaining_time": "2h 4m 52s"} |
| {"loss": 0.48673448, "token_acc": 0.80425532, "grad_norm": 2.7251575, "learning_rate": 4.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022406, "epoch": 0.56319581, "global_step/max_steps": "215/382", "percentage": "56.28%", "elapsed_time": "2h 39m 47s", "remaining_time": "2h 4m 7s"} |
| {"loss": 0.49226636, "token_acc": 0.82211538, "grad_norm": 2.68823886, "learning_rate": 4.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022406, "epoch": 0.56581532, "global_step/max_steps": "216/382", "percentage": "56.54%", "elapsed_time": "2h 40m 32s", "remaining_time": "2h 3m 22s"} |
| {"loss": 0.54549497, "token_acc": 0.8254717, "grad_norm": 2.74458098, "learning_rate": 4.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022407, "epoch": 0.56843484, "global_step/max_steps": "217/382", "percentage": "56.81%", "elapsed_time": "2h 41m 16s", "remaining_time": "2h 2m 37s"} |
| {"loss": 0.54418159, "token_acc": 0.81990521, "grad_norm": 3.21369839, "learning_rate": 4.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022409, "epoch": 0.57105435, "global_step/max_steps": "218/382", "percentage": "57.07%", "elapsed_time": "2h 42m 0s", "remaining_time": "2h 1m 52s"} |
| {"loss": 0.49353039, "token_acc": 0.85333333, "grad_norm": 2.59322667, "learning_rate": 4.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02241, "epoch": 0.57367387, "global_step/max_steps": "219/382", "percentage": "57.33%", "elapsed_time": "2h 42m 44s", "remaining_time": "2h 1m 7s"} |
| {"loss": 0.49543607, "token_acc": 0.81047382, "grad_norm": 2.44461513, "learning_rate": 4.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022411, "epoch": 0.57629339, "global_step/max_steps": "220/382", "percentage": "57.59%", "elapsed_time": "2h 43m 28s", "remaining_time": "2h 0m 22s"} |
| {"loss": 0.45800793, "token_acc": 0.82526316, "grad_norm": 2.39580822, "learning_rate": 4.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022412, "epoch": 0.5789129, "global_step/max_steps": "221/382", "percentage": "57.85%", "elapsed_time": "2h 44m 13s", "remaining_time": "1h 59m 38s"} |
| {"loss": 0.53381026, "token_acc": 0.81198347, "grad_norm": 2.54051995, "learning_rate": 4.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022413, "epoch": 0.58153242, "global_step/max_steps": "222/382", "percentage": "58.12%", "elapsed_time": "2h 44m 57s", "remaining_time": "1h 58m 53s"} |
| {"loss": 0.5074054, "token_acc": 0.79955947, "grad_norm": 2.42853284, "learning_rate": 4.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022414, "epoch": 0.58415193, "global_step/max_steps": "223/382", "percentage": "58.38%", "elapsed_time": "2h 45m 41s", "remaining_time": "1h 58m 8s"} |
| {"loss": 0.55419117, "token_acc": 0.83062645, "grad_norm": 3.53357792, "learning_rate": 4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022415, "epoch": 0.58677145, "global_step/max_steps": "224/382", "percentage": "58.64%", "elapsed_time": "2h 46m 25s", "remaining_time": "1h 57m 23s"} |
| {"loss": 0.46316904, "token_acc": 0.83682008, "grad_norm": 2.45293427, "learning_rate": 4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022415, "epoch": 0.58939096, "global_step/max_steps": "225/382", "percentage": "58.90%", "elapsed_time": "2h 47m 9s", "remaining_time": "1h 56m 38s"} |
| {"loss": 0.53508967, "token_acc": 0.84210526, "grad_norm": 2.62295222, "learning_rate": 3.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022416, "epoch": 0.59201048, "global_step/max_steps": "226/382", "percentage": "59.16%", "elapsed_time": "2h 47m 54s", "remaining_time": "1h 55m 53s"} |
| {"loss": 0.49788809, "token_acc": 0.82954545, "grad_norm": 2.87762332, "learning_rate": 3.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022417, "epoch": 0.59462999, "global_step/max_steps": "227/382", "percentage": "59.42%", "elapsed_time": "2h 48m 38s", "remaining_time": "1h 55m 9s"} |
| {"loss": 0.47531447, "token_acc": 0.8524173, "grad_norm": 2.69508195, "learning_rate": 3.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022418, "epoch": 0.59724951, "global_step/max_steps": "228/382", "percentage": "59.69%", "elapsed_time": "2h 49m 22s", "remaining_time": "1h 54m 24s"} |
| {"loss": 0.52965879, "token_acc": 0.80519481, "grad_norm": 2.6119206, "learning_rate": 3.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022418, "epoch": 0.59986902, "global_step/max_steps": "229/382", "percentage": "59.95%", "elapsed_time": "2h 50m 7s", "remaining_time": "1h 53m 39s"} |
| {"loss": 0.51018512, "token_acc": 0.82236842, "grad_norm": 2.78572464, "learning_rate": 3.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022419, "epoch": 0.60248854, "global_step/max_steps": "230/382", "percentage": "60.21%", "elapsed_time": "2h 50m 51s", "remaining_time": "1h 52m 54s"} |
| {"loss": 0.47091985, "token_acc": 0.8128655, "grad_norm": 2.36248946, "learning_rate": 3.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02242, "epoch": 0.60510806, "global_step/max_steps": "231/382", "percentage": "60.47%", "elapsed_time": "2h 51m 35s", "remaining_time": "1h 52m 9s"} |
| {"loss": 0.49558592, "token_acc": 0.84221748, "grad_norm": 2.6870563, "learning_rate": 3.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022421, "epoch": 0.60772757, "global_step/max_steps": "232/382", "percentage": "60.73%", "elapsed_time": "2h 52m 19s", "remaining_time": "1h 51m 25s"} |
| {"loss": 0.48437756, "token_acc": 0.81967213, "grad_norm": 2.46211338, "learning_rate": 3.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022422, "epoch": 0.61034709, "global_step/max_steps": "233/382", "percentage": "60.99%", "elapsed_time": "2h 53m 3s", "remaining_time": "1h 50m 40s"} |
| {"loss": 0.4818233, "token_acc": 0.8254717, "grad_norm": 3.0773921, "learning_rate": 3.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022423, "epoch": 0.6129666, "global_step/max_steps": "234/382", "percentage": "61.26%", "elapsed_time": "2h 53m 47s", "remaining_time": "1h 49m 55s"} |
| {"loss": 0.53113687, "token_acc": 0.84953704, "grad_norm": 2.68407202, "learning_rate": 3.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022424, "epoch": 0.61558612, "global_step/max_steps": "235/382", "percentage": "61.52%", "elapsed_time": "2h 54m 32s", "remaining_time": "1h 49m 10s"} |
| {"loss": 0.49189091, "token_acc": 0.84449761, "grad_norm": 2.44615984, "learning_rate": 3.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022424, "epoch": 0.61820563, "global_step/max_steps": "236/382", "percentage": "61.78%", "elapsed_time": "2h 55m 16s", "remaining_time": "1h 48m 25s"} |
| {"loss": 0.46242249, "token_acc": 0.86363636, "grad_norm": 2.42844248, "learning_rate": 3.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022425, "epoch": 0.62082515, "global_step/max_steps": "237/382", "percentage": "62.04%", "elapsed_time": "2h 56m 0s", "remaining_time": "1h 47m 41s"} |
| {"loss": 0.53156078, "token_acc": 0.82905983, "grad_norm": 2.66220117, "learning_rate": 3.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022426, "epoch": 0.62344466, "global_step/max_steps": "238/382", "percentage": "62.30%", "elapsed_time": "2h 56m 44s", "remaining_time": "1h 46m 56s"} |
| {"loss": 0.55279171, "token_acc": 0.80290456, "grad_norm": 2.61038232, "learning_rate": 3.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022427, "epoch": 0.62606418, "global_step/max_steps": "239/382", "percentage": "62.57%", "elapsed_time": "2h 57m 28s", "remaining_time": "1h 46m 11s"} |
| {"loss": 0.48870692, "token_acc": 0.82826087, "grad_norm": 2.41831994, "learning_rate": 3.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022428, "epoch": 0.62868369, "global_step/max_steps": "240/382", "percentage": "62.83%", "elapsed_time": "2h 58m 13s", "remaining_time": "1h 45m 26s"} |
| {"loss": 0.46961686, "token_acc": 0.82102908, "grad_norm": 2.78620481, "learning_rate": 3.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022428, "epoch": 0.63130321, "global_step/max_steps": "241/382", "percentage": "63.09%", "elapsed_time": "2h 58m 57s", "remaining_time": "1h 44m 42s"} |
| {"loss": 0.51153982, "token_acc": 0.82417582, "grad_norm": 2.72946906, "learning_rate": 3.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022429, "epoch": 0.63392272, "global_step/max_steps": "242/382", "percentage": "63.35%", "elapsed_time": "2h 59m 41s", "remaining_time": "1h 43m 57s"} |
| {"loss": 0.50912637, "token_acc": 0.81306306, "grad_norm": 2.7466619, "learning_rate": 3.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02243, "epoch": 0.63654224, "global_step/max_steps": "243/382", "percentage": "63.61%", "elapsed_time": "3h 0m 26s", "remaining_time": "1h 43m 12s"} |
| {"loss": 0.4814063, "token_acc": 0.84651163, "grad_norm": 2.29395461, "learning_rate": 3.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02243, "epoch": 0.63916176, "global_step/max_steps": "244/382", "percentage": "63.87%", "elapsed_time": "3h 1m 10s", "remaining_time": "1h 42m 28s"} |
| {"loss": 0.48852107, "token_acc": 0.82978723, "grad_norm": 2.71862125, "learning_rate": 3.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022431, "epoch": 0.64178127, "global_step/max_steps": "245/382", "percentage": "64.14%", "elapsed_time": "3h 1m 54s", "remaining_time": "1h 41m 43s"} |
| {"loss": 0.57106948, "token_acc": 0.79268293, "grad_norm": 9.9169302, "learning_rate": 3.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022431, "epoch": 0.64440079, "global_step/max_steps": "246/382", "percentage": "64.40%", "elapsed_time": "3h 2m 38s", "remaining_time": "1h 40m 58s"} |
| {"loss": 0.52299559, "token_acc": 0.82869379, "grad_norm": 2.85000706, "learning_rate": 3.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022432, "epoch": 0.6470203, "global_step/max_steps": "247/382", "percentage": "64.66%", "elapsed_time": "3h 3m 23s", "remaining_time": "1h 40m 13s"} |
| {"loss": 0.52206075, "token_acc": 0.77628635, "grad_norm": 2.68530512, "learning_rate": 3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022433, "epoch": 0.64963982, "global_step/max_steps": "248/382", "percentage": "64.92%", "elapsed_time": "3h 4m 7s", "remaining_time": "1h 39m 29s"} |
| {"loss": 0.53021812, "token_acc": 0.80087527, "grad_norm": 3.06006098, "learning_rate": 3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022433, "epoch": 0.65225933, "global_step/max_steps": "249/382", "percentage": "65.18%", "elapsed_time": "3h 4m 51s", "remaining_time": "1h 38m 44s"} |
| {"loss": 0.47941399, "token_acc": 0.80174292, "grad_norm": 2.51002908, "learning_rate": 2.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022434, "epoch": 0.65487885, "global_step/max_steps": "250/382", "percentage": "65.45%", "elapsed_time": "3h 5m 36s", "remaining_time": "1h 37m 59s"} |
| {"loss": 0.50820911, "token_acc": 0.84146341, "grad_norm": 2.50101304, "learning_rate": 2.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022395, "epoch": 0.65749836, "global_step/max_steps": "251/382", "percentage": "65.71%", "elapsed_time": "3h 6m 39s", "remaining_time": "1h 37m 25s"} |
| {"loss": 0.46485996, "token_acc": 0.83544304, "grad_norm": 2.6000011, "learning_rate": 2.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022396, "epoch": 0.66011788, "global_step/max_steps": "252/382", "percentage": "65.97%", "elapsed_time": "3h 7m 24s", "remaining_time": "1h 36m 40s"} |
| {"loss": 0.49137983, "token_acc": 0.8261851, "grad_norm": 2.58280015, "learning_rate": 2.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022397, "epoch": 0.66273739, "global_step/max_steps": "253/382", "percentage": "66.23%", "elapsed_time": "3h 8m 8s", "remaining_time": "1h 35m 55s"} |
| {"loss": 0.53226566, "token_acc": 0.78378378, "grad_norm": 2.64956212, "learning_rate": 2.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022398, "epoch": 0.66535691, "global_step/max_steps": "254/382", "percentage": "66.49%", "elapsed_time": "3h 8m 52s", "remaining_time": "1h 35m 10s"} |
| {"loss": 0.53264558, "token_acc": 0.80549683, "grad_norm": 2.43987751, "learning_rate": 2.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022399, "epoch": 0.66797642, "global_step/max_steps": "255/382", "percentage": "66.75%", "elapsed_time": "3h 9m 36s", "remaining_time": "1h 34m 26s"} |
| {"loss": 0.48848391, "token_acc": 0.79754601, "grad_norm": 2.47274494, "learning_rate": 2.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.0224, "epoch": 0.67059594, "global_step/max_steps": "256/382", "percentage": "67.02%", "elapsed_time": "3h 10m 20s", "remaining_time": "1h 33m 41s"} |
| {"loss": 0.48551306, "token_acc": 0.818, "grad_norm": 2.72192407, "learning_rate": 2.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022401, "epoch": 0.67321546, "global_step/max_steps": "257/382", "percentage": "67.28%", "elapsed_time": "3h 11m 5s", "remaining_time": "1h 32m 56s"} |
| {"loss": 0.53804684, "token_acc": 0.81372549, "grad_norm": 2.85644126, "learning_rate": 2.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022401, "epoch": 0.67583497, "global_step/max_steps": "258/382", "percentage": "67.54%", "elapsed_time": "3h 11m 49s", "remaining_time": "1h 32m 11s"} |
| {"loss": 0.52442718, "token_acc": 0.83898305, "grad_norm": 2.68102241, "learning_rate": 2.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022402, "epoch": 0.67845449, "global_step/max_steps": "259/382", "percentage": "67.80%", "elapsed_time": "3h 12m 33s", "remaining_time": "1h 31m 26s"} |
| {"loss": 0.44407117, "token_acc": 0.79782609, "grad_norm": 2.53717995, "learning_rate": 2.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022403, "epoch": 0.681074, "global_step/max_steps": "260/382", "percentage": "68.06%", "elapsed_time": "3h 13m 17s", "remaining_time": "1h 30m 41s"} |
| {"loss": 0.51437664, "token_acc": 0.82286996, "grad_norm": 2.33952022, "learning_rate": 2.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022404, "epoch": 0.68369352, "global_step/max_steps": "261/382", "percentage": "68.32%", "elapsed_time": "3h 14m 1s", "remaining_time": "1h 29m 57s"} |
| {"loss": 0.43805832, "token_acc": 0.8304721, "grad_norm": 2.75318861, "learning_rate": 2.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022405, "epoch": 0.68631303, "global_step/max_steps": "262/382", "percentage": "68.59%", "elapsed_time": "3h 14m 46s", "remaining_time": "1h 29m 12s"} |
| {"loss": 0.4871158, "token_acc": 0.8110883, "grad_norm": 2.66617346, "learning_rate": 2.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022406, "epoch": 0.68893255, "global_step/max_steps": "263/382", "percentage": "68.85%", "elapsed_time": "3h 15m 30s", "remaining_time": "1h 28m 27s"} |
| {"loss": 0.50007355, "token_acc": 0.83524027, "grad_norm": 2.59673572, "learning_rate": 2.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022407, "epoch": 0.69155206, "global_step/max_steps": "264/382", "percentage": "69.11%", "elapsed_time": "3h 16m 14s", "remaining_time": "1h 27m 42s"} |
| {"loss": 0.5367918, "token_acc": 0.81944444, "grad_norm": 2.60177636, "learning_rate": 2.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022407, "epoch": 0.69417158, "global_step/max_steps": "265/382", "percentage": "69.37%", "elapsed_time": "3h 16m 58s", "remaining_time": "1h 26m 58s"} |
| {"loss": 0.53543031, "token_acc": 0.80538302, "grad_norm": 2.67493725, "learning_rate": 2.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022408, "epoch": 0.69679109, "global_step/max_steps": "266/382", "percentage": "69.63%", "elapsed_time": "3h 17m 42s", "remaining_time": "1h 26m 13s"} |
| {"loss": 0.46658239, "token_acc": 0.83196721, "grad_norm": 2.56142879, "learning_rate": 2.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022409, "epoch": 0.69941061, "global_step/max_steps": "267/382", "percentage": "69.90%", "elapsed_time": "3h 18m 26s", "remaining_time": "1h 25m 28s"} |
| {"loss": 0.51969707, "token_acc": 0.81782946, "grad_norm": 2.26208115, "learning_rate": 2.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022409, "epoch": 0.70203012, "global_step/max_steps": "268/382", "percentage": "70.16%", "elapsed_time": "3h 19m 11s", "remaining_time": "1h 24m 43s"} |
| {"loss": 0.5194627, "token_acc": 0.79310345, "grad_norm": 4.23315954, "learning_rate": 2.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02241, "epoch": 0.70464964, "global_step/max_steps": "269/382", "percentage": "70.42%", "elapsed_time": "3h 19m 55s", "remaining_time": "1h 23m 59s"} |
| {"loss": 0.5166223, "token_acc": 0.81755196, "grad_norm": 2.91183615, "learning_rate": 2.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022411, "epoch": 0.70726916, "global_step/max_steps": "270/382", "percentage": "70.68%", "elapsed_time": "3h 20m 39s", "remaining_time": "1h 23m 14s"} |
| {"loss": 0.53052491, "token_acc": 0.80626223, "grad_norm": 2.69587111, "learning_rate": 2.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022411, "epoch": 0.70988867, "global_step/max_steps": "271/382", "percentage": "70.94%", "elapsed_time": "3h 21m 24s", "remaining_time": "1h 22m 29s"} |
| {"loss": 0.50876284, "token_acc": 0.80842105, "grad_norm": 2.69760776, "learning_rate": 2.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022412, "epoch": 0.71250819, "global_step/max_steps": "272/382", "percentage": "71.20%", "elapsed_time": "3h 22m 8s", "remaining_time": "1h 21m 44s"} |
| {"loss": 0.52076101, "token_acc": 0.84510251, "grad_norm": 2.41476989, "learning_rate": 2.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022413, "epoch": 0.7151277, "global_step/max_steps": "273/382", "percentage": "71.47%", "elapsed_time": "3h 22m 52s", "remaining_time": "1h 21m 0s"} |
| {"loss": 0.48224354, "token_acc": 0.82978723, "grad_norm": 2.84710622, "learning_rate": 2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022414, "epoch": 0.71774722, "global_step/max_steps": "274/382", "percentage": "71.73%", "elapsed_time": "3h 23m 36s", "remaining_time": "1h 20m 15s"} |
| {"loss": 0.50363386, "token_acc": 0.83971292, "grad_norm": 2.71087527, "learning_rate": 2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022414, "epoch": 0.72036673, "global_step/max_steps": "275/382", "percentage": "71.99%", "elapsed_time": "3h 24m 21s", "remaining_time": "1h 19m 30s"} |
| {"loss": 0.44907174, "token_acc": 0.85650224, "grad_norm": 3.22561288, "learning_rate": 2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022415, "epoch": 0.72298625, "global_step/max_steps": "276/382", "percentage": "72.25%", "elapsed_time": "3h 25m 5s", "remaining_time": "1h 18m 46s"} |
| {"loss": 0.49216294, "token_acc": 0.83443709, "grad_norm": 2.34182072, "learning_rate": 1.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022416, "epoch": 0.72560576, "global_step/max_steps": "277/382", "percentage": "72.51%", "elapsed_time": "3h 25m 49s", "remaining_time": "1h 18m 1s"} |
| {"loss": 0.52063382, "token_acc": 0.82666667, "grad_norm": 2.22454214, "learning_rate": 1.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022417, "epoch": 0.72822528, "global_step/max_steps": "278/382", "percentage": "72.77%", "elapsed_time": "3h 26m 33s", "remaining_time": "1h 17m 16s"} |
| {"loss": 0.50330299, "token_acc": 0.84516129, "grad_norm": 2.62523866, "learning_rate": 1.9e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022417, "epoch": 0.73084479, "global_step/max_steps": "279/382", "percentage": "73.04%", "elapsed_time": "3h 27m 18s", "remaining_time": "1h 16m 31s"} |
| {"loss": 0.47721252, "token_acc": 0.81818182, "grad_norm": 3.46505189, "learning_rate": 1.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022418, "epoch": 0.73346431, "global_step/max_steps": "280/382", "percentage": "73.30%", "elapsed_time": "3h 28m 2s", "remaining_time": "1h 15m 47s"} |
| {"loss": 0.53198409, "token_acc": 0.83924843, "grad_norm": 2.58057904, "learning_rate": 1.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022419, "epoch": 0.73608382, "global_step/max_steps": "281/382", "percentage": "73.56%", "elapsed_time": "3h 28m 46s", "remaining_time": "1h 15m 2s"} |
| {"loss": 0.5247941, "token_acc": 0.808, "grad_norm": 2.27877045, "learning_rate": 1.8e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022419, "epoch": 0.73870334, "global_step/max_steps": "282/382", "percentage": "73.82%", "elapsed_time": "3h 29m 30s", "remaining_time": "1h 14m 17s"} |
| {"loss": 0.50651246, "token_acc": 0.85217391, "grad_norm": 2.35515666, "learning_rate": 1.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02242, "epoch": 0.74132286, "global_step/max_steps": "283/382", "percentage": "74.08%", "elapsed_time": "3h 30m 14s", "remaining_time": "1h 13m 32s"} |
| {"loss": 0.51326668, "token_acc": 0.81967213, "grad_norm": 2.47311163, "learning_rate": 1.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022421, "epoch": 0.74394237, "global_step/max_steps": "284/382", "percentage": "74.35%", "elapsed_time": "3h 30m 59s", "remaining_time": "1h 12m 48s"} |
| {"loss": 0.51927936, "token_acc": 0.79867257, "grad_norm": 2.68407464, "learning_rate": 1.7e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022421, "epoch": 0.74656189, "global_step/max_steps": "285/382", "percentage": "74.61%", "elapsed_time": "3h 31m 43s", "remaining_time": "1h 12m 3s"} |
| {"loss": 0.49618238, "token_acc": 0.81489842, "grad_norm": 2.45383286, "learning_rate": 1.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022422, "epoch": 0.7491814, "global_step/max_steps": "286/382", "percentage": "74.87%", "elapsed_time": "3h 32m 27s", "remaining_time": "1h 11m 18s"} |
| {"loss": 0.49362797, "token_acc": 0.8534279, "grad_norm": 2.26142335, "learning_rate": 1.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022422, "epoch": 0.75180092, "global_step/max_steps": "287/382", "percentage": "75.13%", "elapsed_time": "3h 33m 11s", "remaining_time": "1h 10m 34s"} |
| {"loss": 0.49022061, "token_acc": 0.81140351, "grad_norm": 2.60141063, "learning_rate": 1.6e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022423, "epoch": 0.75442043, "global_step/max_steps": "288/382", "percentage": "75.39%", "elapsed_time": "3h 33m 56s", "remaining_time": "1h 9m 49s"} |
| {"loss": 0.51620996, "token_acc": 0.82993197, "grad_norm": 2.60625219, "learning_rate": 1.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022423, "epoch": 0.75703995, "global_step/max_steps": "289/382", "percentage": "75.65%", "elapsed_time": "3h 34m 40s", "remaining_time": "1h 9m 4s"} |
| {"loss": 0.51137459, "token_acc": 0.84119107, "grad_norm": 2.60016298, "learning_rate": 1.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022424, "epoch": 0.75965946, "global_step/max_steps": "290/382", "percentage": "75.92%", "elapsed_time": "3h 35m 24s", "remaining_time": "1h 8m 20s"} |
| {"loss": 0.49486947, "token_acc": 0.82629108, "grad_norm": 2.57735276, "learning_rate": 1.5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022424, "epoch": 0.76227898, "global_step/max_steps": "291/382", "percentage": "76.18%", "elapsed_time": "3h 36m 9s", "remaining_time": "1h 7m 35s"} |
| {"loss": 0.50529313, "token_acc": 0.82532751, "grad_norm": 2.46743131, "learning_rate": 1.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022425, "epoch": 0.76489849, "global_step/max_steps": "292/382", "percentage": "76.44%", "elapsed_time": "3h 36m 53s", "remaining_time": "1h 6m 50s"} |
| {"loss": 0.52136004, "token_acc": 0.80085653, "grad_norm": 2.79872966, "learning_rate": 1.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022426, "epoch": 0.76751801, "global_step/max_steps": "293/382", "percentage": "76.70%", "elapsed_time": "3h 37m 37s", "remaining_time": "1h 6m 6s"} |
| {"loss": 0.54597229, "token_acc": 0.81466395, "grad_norm": 2.39943099, "learning_rate": 1.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022426, "epoch": 0.77013752, "global_step/max_steps": "294/382", "percentage": "76.96%", "elapsed_time": "3h 38m 21s", "remaining_time": "1h 5m 21s"} |
| {"loss": 0.51290905, "token_acc": 0.81882353, "grad_norm": 2.72690916, "learning_rate": 1.4e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022427, "epoch": 0.77275704, "global_step/max_steps": "295/382", "percentage": "77.23%", "elapsed_time": "3h 39m 5s", "remaining_time": "1h 4m 36s"} |
| {"loss": 0.47813663, "token_acc": 0.86363636, "grad_norm": 2.36479402, "learning_rate": 1.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022428, "epoch": 0.77537656, "global_step/max_steps": "296/382", "percentage": "77.49%", "elapsed_time": "3h 39m 49s", "remaining_time": "1h 3m 52s"} |
| {"loss": 0.50160491, "token_acc": 0.79640719, "grad_norm": 1.9940778, "learning_rate": 1.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022429, "epoch": 0.77799607, "global_step/max_steps": "297/382", "percentage": "77.75%", "elapsed_time": "3h 40m 34s", "remaining_time": "1h 3m 7s"} |
| {"loss": 0.55133939, "token_acc": 0.80090498, "grad_norm": 2.57687211, "learning_rate": 1.3e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022429, "epoch": 0.78061559, "global_step/max_steps": "298/382", "percentage": "78.01%", "elapsed_time": "3h 41m 18s", "remaining_time": "1h 2m 22s"} |
| {"loss": 0.48956871, "token_acc": 0.7956778, "grad_norm": 2.56108522, "learning_rate": 1.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02243, "epoch": 0.7832351, "global_step/max_steps": "299/382", "percentage": "78.27%", "elapsed_time": "3h 42m 2s", "remaining_time": "1h 1m 38s"} |
| {"loss": 0.54290378, "token_acc": 0.83578947, "grad_norm": 2.6277113, "learning_rate": 1.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022431, "epoch": 0.78585462, "global_step/max_steps": "300/382", "percentage": "78.53%", "elapsed_time": "3h 42m 46s", "remaining_time": "1h 0m 53s"} |
| {"loss": 0.49910355, "token_acc": 0.80119284, "grad_norm": 2.74415398, "learning_rate": 1.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022398, "epoch": 0.78847413, "global_step/max_steps": "301/382", "percentage": "78.80%", "elapsed_time": "3h 43m 50s", "remaining_time": "1h 0m 14s"} |
| {"loss": 0.49603665, "token_acc": 0.86896552, "grad_norm": 2.73191929, "learning_rate": 1.2e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022398, "epoch": 0.79109365, "global_step/max_steps": "302/382", "percentage": "79.06%", "elapsed_time": "3h 44m 35s", "remaining_time": "59m 29s"} |
| {"loss": 0.49729991, "token_acc": 0.84766585, "grad_norm": 2.39375973, "learning_rate": 1.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022399, "epoch": 0.79371316, "global_step/max_steps": "303/382", "percentage": "79.32%", "elapsed_time": "3h 45m 19s", "remaining_time": "58m 44s"} |
| {"loss": 0.50356328, "token_acc": 0.80698152, "grad_norm": 2.73662424, "learning_rate": 1.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.0224, "epoch": 0.79633268, "global_step/max_steps": "304/382", "percentage": "79.58%", "elapsed_time": "3h 46m 3s", "remaining_time": "58m 0s"} |
| {"loss": 0.50889683, "token_acc": 0.85344828, "grad_norm": 2.53246546, "learning_rate": 1.1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022401, "epoch": 0.79895219, "global_step/max_steps": "305/382", "percentage": "79.84%", "elapsed_time": "3h 46m 47s", "remaining_time": "57m 15s"} |
| {"loss": 0.47774571, "token_acc": 0.79035639, "grad_norm": 2.75037813, "learning_rate": 1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022401, "epoch": 0.80157171, "global_step/max_steps": "306/382", "percentage": "80.10%", "elapsed_time": "3h 47m 32s", "remaining_time": "56m 30s"} |
| {"loss": 0.4808327, "token_acc": 0.83224401, "grad_norm": 2.48030877, "learning_rate": 1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022402, "epoch": 0.80419122, "global_step/max_steps": "307/382", "percentage": "80.37%", "elapsed_time": "3h 48m 16s", "remaining_time": "55m 46s"} |
| {"loss": 0.46578383, "token_acc": 0.86757991, "grad_norm": 2.69799662, "learning_rate": 1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022403, "epoch": 0.80681074, "global_step/max_steps": "308/382", "percentage": "80.63%", "elapsed_time": "3h 49m 0s", "remaining_time": "55m 1s"} |
| {"loss": 0.52523816, "token_acc": 0.82452431, "grad_norm": 2.73827147, "learning_rate": 1e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022403, "epoch": 0.80943026, "global_step/max_steps": "309/382", "percentage": "80.89%", "elapsed_time": "3h 49m 44s", "remaining_time": "54m 16s"} |
| {"loss": 0.5307312, "token_acc": 0.80217391, "grad_norm": 2.19081426, "learning_rate": 9e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022404, "epoch": 0.81204977, "global_step/max_steps": "310/382", "percentage": "81.15%", "elapsed_time": "3h 50m 28s", "remaining_time": "53m 31s"} |
| {"loss": 0.4815802, "token_acc": 0.80952381, "grad_norm": 2.48326087, "learning_rate": 9e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022405, "epoch": 0.81466929, "global_step/max_steps": "311/382", "percentage": "81.41%", "elapsed_time": "3h 51m 13s", "remaining_time": "52m 47s"} |
| {"loss": 0.52469128, "token_acc": 0.81891348, "grad_norm": 2.34951735, "learning_rate": 9e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022406, "epoch": 0.8172888, "global_step/max_steps": "312/382", "percentage": "81.68%", "elapsed_time": "3h 51m 57s", "remaining_time": "52m 2s"} |
| {"loss": 0.4958902, "token_acc": 0.81422018, "grad_norm": 2.82174635, "learning_rate": 9e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022406, "epoch": 0.81990832, "global_step/max_steps": "313/382", "percentage": "81.94%", "elapsed_time": "3h 52m 41s", "remaining_time": "51m 17s"} |
| {"loss": 0.53449541, "token_acc": 0.83482143, "grad_norm": 2.67067432, "learning_rate": 8e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022407, "epoch": 0.82252783, "global_step/max_steps": "314/382", "percentage": "82.20%", "elapsed_time": "3h 53m 25s", "remaining_time": "50m 33s"} |
| {"loss": 0.50882912, "token_acc": 0.83538084, "grad_norm": 2.40520668, "learning_rate": 8e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022407, "epoch": 0.82514735, "global_step/max_steps": "315/382", "percentage": "82.46%", "elapsed_time": "3h 54m 10s", "remaining_time": "49m 48s"} |
| {"loss": 0.52352041, "token_acc": 0.82532751, "grad_norm": 2.43502712, "learning_rate": 8e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022408, "epoch": 0.82776686, "global_step/max_steps": "316/382", "percentage": "82.72%", "elapsed_time": "3h 54m 54s", "remaining_time": "49m 3s"} |
| {"loss": 0.55709004, "token_acc": 0.80042463, "grad_norm": 2.5967803, "learning_rate": 8e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022409, "epoch": 0.83038638, "global_step/max_steps": "317/382", "percentage": "82.98%", "elapsed_time": "3h 55m 38s", "remaining_time": "48m 19s"} |
| {"loss": 0.52302444, "token_acc": 0.84578313, "grad_norm": 2.48452401, "learning_rate": 8e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022409, "epoch": 0.83300589, "global_step/max_steps": "318/382", "percentage": "83.25%", "elapsed_time": "3h 56m 22s", "remaining_time": "47m 34s"} |
| {"loss": 0.46841729, "token_acc": 0.84682713, "grad_norm": 2.25732088, "learning_rate": 7e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02241, "epoch": 0.83562541, "global_step/max_steps": "319/382", "percentage": "83.51%", "elapsed_time": "3h 57m 6s", "remaining_time": "46m 49s"} |
| {"loss": 0.51713204, "token_acc": 0.81425486, "grad_norm": 2.80643606, "learning_rate": 7e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022411, "epoch": 0.83824492, "global_step/max_steps": "320/382", "percentage": "83.77%", "elapsed_time": "3h 57m 51s", "remaining_time": "46m 5s"} |
| {"loss": 0.5110063, "token_acc": 0.8326087, "grad_norm": 2.41785693, "learning_rate": 7e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022412, "epoch": 0.84086444, "global_step/max_steps": "321/382", "percentage": "84.03%", "elapsed_time": "3h 58m 35s", "remaining_time": "45m 20s"} |
| {"loss": 0.54232895, "token_acc": 0.80555556, "grad_norm": 2.49596977, "learning_rate": 7e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022412, "epoch": 0.84348396, "global_step/max_steps": "322/382", "percentage": "84.29%", "elapsed_time": "3h 59m 19s", "remaining_time": "44m 35s"} |
| {"loss": 0.49026245, "token_acc": 0.85745614, "grad_norm": 2.96252728, "learning_rate": 6e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022412, "epoch": 0.84610347, "global_step/max_steps": "323/382", "percentage": "84.55%", "elapsed_time": "4h 0m 3s", "remaining_time": "43m 51s"} |
| {"loss": 0.53639245, "token_acc": 0.79795918, "grad_norm": 2.49937296, "learning_rate": 6e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022413, "epoch": 0.84872299, "global_step/max_steps": "324/382", "percentage": "84.82%", "elapsed_time": "4h 0m 47s", "remaining_time": "43m 6s"} |
| {"loss": 0.52216214, "token_acc": 0.80522088, "grad_norm": 2.77321339, "learning_rate": 6e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022414, "epoch": 0.8513425, "global_step/max_steps": "325/382", "percentage": "85.08%", "elapsed_time": "4h 1m 32s", "remaining_time": "42m 21s"} |
| {"loss": 0.48864049, "token_acc": 0.81873727, "grad_norm": 2.40346599, "learning_rate": 6e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022414, "epoch": 0.85396202, "global_step/max_steps": "326/382", "percentage": "85.34%", "elapsed_time": "4h 2m 16s", "remaining_time": "41m 37s"} |
| {"loss": 0.52119392, "token_acc": 0.79174853, "grad_norm": 2.32100582, "learning_rate": 6e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022415, "epoch": 0.85658153, "global_step/max_steps": "327/382", "percentage": "85.60%", "elapsed_time": "4h 3m 0s", "remaining_time": "40m 52s"} |
| {"loss": 0.52729017, "token_acc": 0.85327314, "grad_norm": 2.26873779, "learning_rate": 5e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022416, "epoch": 0.85920105, "global_step/max_steps": "328/382", "percentage": "85.86%", "elapsed_time": "4h 3m 44s", "remaining_time": "40m 7s"} |
| {"loss": 0.48712325, "token_acc": 0.84787018, "grad_norm": 2.48419213, "learning_rate": 5e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022416, "epoch": 0.86182056, "global_step/max_steps": "329/382", "percentage": "86.13%", "elapsed_time": "4h 4m 28s", "remaining_time": "39m 23s"} |
| {"loss": 0.48572382, "token_acc": 0.82478632, "grad_norm": 2.29721284, "learning_rate": 5e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022417, "epoch": 0.86444008, "global_step/max_steps": "330/382", "percentage": "86.39%", "elapsed_time": "4h 5m 13s", "remaining_time": "38m 38s"} |
| {"loss": 0.48416871, "token_acc": 0.8377193, "grad_norm": 2.38312817, "learning_rate": 5e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022417, "epoch": 0.86705959, "global_step/max_steps": "331/382", "percentage": "86.65%", "elapsed_time": "4h 5m 57s", "remaining_time": "37m 53s"} |
| {"loss": 0.53322816, "token_acc": 0.80532787, "grad_norm": 2.75555444, "learning_rate": 5e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022418, "epoch": 0.86967911, "global_step/max_steps": "332/382", "percentage": "86.91%", "elapsed_time": "4h 6m 41s", "remaining_time": "37m 9s"} |
| {"loss": 0.50208074, "token_acc": 0.82067511, "grad_norm": 2.55903435, "learning_rate": 4e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022418, "epoch": 0.87229862, "global_step/max_steps": "333/382", "percentage": "87.17%", "elapsed_time": "4h 7m 26s", "remaining_time": "36m 24s"} |
| {"loss": 0.4761245, "token_acc": 0.80392157, "grad_norm": 2.36446524, "learning_rate": 4e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022419, "epoch": 0.87491814, "global_step/max_steps": "334/382", "percentage": "87.43%", "elapsed_time": "4h 8m 10s", "remaining_time": "35m 39s"} |
| {"loss": 0.54205018, "token_acc": 0.79379157, "grad_norm": 2.47903013, "learning_rate": 4e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022419, "epoch": 0.87753766, "global_step/max_steps": "335/382", "percentage": "87.70%", "elapsed_time": "4h 8m 54s", "remaining_time": "34m 55s"} |
| {"loss": 0.53201473, "token_acc": 0.8089172, "grad_norm": 2.66940379, "learning_rate": 4e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022419, "epoch": 0.88015717, "global_step/max_steps": "336/382", "percentage": "87.96%", "elapsed_time": "4h 9m 39s", "remaining_time": "34m 10s"} |
| {"loss": 0.54966015, "token_acc": 0.82840237, "grad_norm": 2.39184332, "learning_rate": 4e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.02242, "epoch": 0.88277669, "global_step/max_steps": "337/382", "percentage": "88.22%", "elapsed_time": "4h 10m 23s", "remaining_time": "33m 26s"} |
| {"loss": 0.5212338, "token_acc": 0.8247191, "grad_norm": 2.96336603, "learning_rate": 4e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022421, "epoch": 0.8853962, "global_step/max_steps": "338/382", "percentage": "88.48%", "elapsed_time": "4h 11m 7s", "remaining_time": "32m 41s"} |
| {"loss": 0.51319742, "token_acc": 0.82881002, "grad_norm": 2.72804213, "learning_rate": 3e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022421, "epoch": 0.88801572, "global_step/max_steps": "339/382", "percentage": "88.74%", "elapsed_time": "4h 11m 51s", "remaining_time": "31m 56s"} |
| {"loss": 0.51108897, "token_acc": 0.81744422, "grad_norm": 2.48715901, "learning_rate": 3e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022422, "epoch": 0.89063523, "global_step/max_steps": "340/382", "percentage": "89.01%", "elapsed_time": "4h 12m 35s", "remaining_time": "31m 12s"} |
| {"loss": 0.53462887, "token_acc": 0.80504587, "grad_norm": 2.75841522, "learning_rate": 3e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022422, "epoch": 0.89325475, "global_step/max_steps": "341/382", "percentage": "89.27%", "elapsed_time": "4h 13m 20s", "remaining_time": "30m 27s"} |
| {"loss": 0.47114405, "token_acc": 0.80777538, "grad_norm": 2.18638015, "learning_rate": 3e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022423, "epoch": 0.89587426, "global_step/max_steps": "342/382", "percentage": "89.53%", "elapsed_time": "4h 14m 4s", "remaining_time": "29m 42s"} |
| {"loss": 0.49620247, "token_acc": 0.84777518, "grad_norm": 2.44373512, "learning_rate": 3e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022423, "epoch": 0.89849378, "global_step/max_steps": "343/382", "percentage": "89.79%", "elapsed_time": "4h 14m 48s", "remaining_time": "28m 58s"} |
| {"loss": 0.51603442, "token_acc": 0.8525641, "grad_norm": 2.42920709, "learning_rate": 3e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022424, "epoch": 0.90111329, "global_step/max_steps": "344/382", "percentage": "90.05%", "elapsed_time": "4h 15m 32s", "remaining_time": "28m 13s"} |
| {"loss": 0.52078712, "token_acc": 0.82188841, "grad_norm": 2.35882974, "learning_rate": 3e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022424, "epoch": 0.90373281, "global_step/max_steps": "345/382", "percentage": "90.31%", "elapsed_time": "4h 16m 17s", "remaining_time": "27m 29s"} |
| {"loss": 0.54026228, "token_acc": 0.80544747, "grad_norm": 2.89594555, "learning_rate": 2e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022425, "epoch": 0.90635232, "global_step/max_steps": "346/382", "percentage": "90.58%", "elapsed_time": "4h 17m 1s", "remaining_time": "26m 44s"} |
| {"loss": 0.50974214, "token_acc": 0.77189409, "grad_norm": 2.36402082, "learning_rate": 2e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022425, "epoch": 0.90897184, "global_step/max_steps": "347/382", "percentage": "90.84%", "elapsed_time": "4h 17m 46s", "remaining_time": "25m 59s"} |
| {"loss": 0.55470902, "token_acc": 0.8212766, "grad_norm": 2.52900362, "learning_rate": 2e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022425, "epoch": 0.91159136, "global_step/max_steps": "348/382", "percentage": "91.10%", "elapsed_time": "4h 18m 30s", "remaining_time": "25m 15s"} |
| {"loss": 0.51533997, "token_acc": 0.80306346, "grad_norm": 2.37846899, "learning_rate": 2e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022426, "epoch": 0.91421087, "global_step/max_steps": "349/382", "percentage": "91.36%", "elapsed_time": "4h 19m 14s", "remaining_time": "24m 30s"} |
| {"loss": 0.47448784, "token_acc": 0.8677686, "grad_norm": 2.31588745, "learning_rate": 2e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022427, "epoch": 0.91683039, "global_step/max_steps": "350/382", "percentage": "91.62%", "elapsed_time": "4h 19m 58s", "remaining_time": "23m 46s"} |
| {"loss": 0.47689721, "token_acc": 0.85205479, "grad_norm": 2.32343364, "learning_rate": 2e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022398, "epoch": 0.9194499, "global_step/max_steps": "351/382", "percentage": "91.88%", "elapsed_time": "4h 21m 3s", "remaining_time": "23m 3s"} |
| {"loss": 0.49429876, "token_acc": 0.80606061, "grad_norm": 2.5884788, "learning_rate": 2e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022398, "epoch": 0.92206942, "global_step/max_steps": "352/382", "percentage": "92.15%", "elapsed_time": "4h 21m 47s", "remaining_time": "22m 18s"} |
| {"loss": 0.53715676, "token_acc": 0.79875519, "grad_norm": 2.44041348, "learning_rate": 2e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022399, "epoch": 0.92468893, "global_step/max_steps": "353/382", "percentage": "92.41%", "elapsed_time": "4h 22m 31s", "remaining_time": "21m 34s"} |
| {"loss": 0.5101127, "token_acc": 0.83125, "grad_norm": 2.532974, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.0224, "epoch": 0.92730845, "global_step/max_steps": "354/382", "percentage": "92.67%", "elapsed_time": "4h 23m 15s", "remaining_time": "20m 49s"} |
| {"loss": 0.48404869, "token_acc": 0.82713348, "grad_norm": 2.31485009, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.0224, "epoch": 0.92992796, "global_step/max_steps": "355/382", "percentage": "92.93%", "elapsed_time": "4h 24m 0s", "remaining_time": "20m 4s"} |
| {"loss": 0.50825667, "token_acc": 0.81290323, "grad_norm": 2.82953334, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022401, "epoch": 0.93254748, "global_step/max_steps": "356/382", "percentage": "93.19%", "elapsed_time": "4h 24m 44s", "remaining_time": "19m 20s"} |
| {"loss": 0.54859322, "token_acc": 0.84340045, "grad_norm": 2.71464229, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022402, "epoch": 0.93516699, "global_step/max_steps": "357/382", "percentage": "93.46%", "elapsed_time": "4h 25m 28s", "remaining_time": "18m 35s"} |
| {"loss": 0.49649796, "token_acc": 0.8545082, "grad_norm": 2.03234982, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022402, "epoch": 0.93778651, "global_step/max_steps": "358/382", "percentage": "93.72%", "elapsed_time": "4h 26m 12s", "remaining_time": "17m 50s"} |
| {"loss": 0.50043595, "token_acc": 0.79613734, "grad_norm": 2.50991654, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022403, "epoch": 0.94040602, "global_step/max_steps": "359/382", "percentage": "93.98%", "elapsed_time": "4h 26m 56s", "remaining_time": "17m 6s"} |
| {"loss": 0.52862531, "token_acc": 0.8045977, "grad_norm": 2.64091945, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022404, "epoch": 0.94302554, "global_step/max_steps": "360/382", "percentage": "94.24%", "elapsed_time": "4h 27m 41s", "remaining_time": "16m 21s"} |
| {"loss": 0.49931329, "token_acc": 0.85682819, "grad_norm": 2.39248824, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022404, "epoch": 0.94564506, "global_step/max_steps": "361/382", "percentage": "94.50%", "elapsed_time": "4h 28m 25s", "remaining_time": "15m 36s"} |
| {"loss": 0.49103531, "token_acc": 0.82304527, "grad_norm": 2.53784513, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022405, "epoch": 0.94826457, "global_step/max_steps": "362/382", "percentage": "94.76%", "elapsed_time": "4h 29m 9s", "remaining_time": "14m 52s"} |
| {"loss": 0.56352901, "token_acc": 0.80618557, "grad_norm": 2.66073084, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022406, "epoch": 0.95088409, "global_step/max_steps": "363/382", "percentage": "95.03%", "elapsed_time": "4h 29m 53s", "remaining_time": "14m 7s"} |
| {"loss": 0.51507562, "token_acc": 0.81395349, "grad_norm": 2.35630298, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022406, "epoch": 0.9535036, "global_step/max_steps": "364/382", "percentage": "95.29%", "elapsed_time": "4h 30m 37s", "remaining_time": "13m 22s"} |
| {"loss": 0.48375154, "token_acc": 0.80753968, "grad_norm": 2.55094218, "learning_rate": 1e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022407, "epoch": 0.95612312, "global_step/max_steps": "365/382", "percentage": "95.55%", "elapsed_time": "4h 31m 21s", "remaining_time": "12m 38s"} |
| {"loss": 0.46853399, "token_acc": 0.83823529, "grad_norm": 2.1849525, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022408, "epoch": 0.95874263, "global_step/max_steps": "366/382", "percentage": "95.81%", "elapsed_time": "4h 32m 5s", "remaining_time": "11m 53s"} |
| {"loss": 0.52192491, "token_acc": 0.81937173, "grad_norm": 3.56556511, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022408, "epoch": 0.96136215, "global_step/max_steps": "367/382", "percentage": "96.07%", "elapsed_time": "4h 32m 50s", "remaining_time": "11m 9s"} |
| {"loss": 0.47167692, "token_acc": 0.83403361, "grad_norm": 2.94072413, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022409, "epoch": 0.96398166, "global_step/max_steps": "368/382", "percentage": "96.34%", "elapsed_time": "4h 33m 34s", "remaining_time": "10m 24s"} |
| {"loss": 0.46407065, "token_acc": 0.81449893, "grad_norm": 2.05138278, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022409, "epoch": 0.96660118, "global_step/max_steps": "369/382", "percentage": "96.60%", "elapsed_time": "4h 34m 18s", "remaining_time": "9m 39s"} |
| {"loss": 0.5055849, "token_acc": 0.82886598, "grad_norm": 2.4933784, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.02241, "epoch": 0.96922069, "global_step/max_steps": "370/382", "percentage": "96.86%", "elapsed_time": "4h 35m 2s", "remaining_time": "8m 55s"} |
| {"loss": 0.51736391, "token_acc": 0.79777778, "grad_norm": 2.90468764, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022411, "epoch": 0.97184021, "global_step/max_steps": "371/382", "percentage": "97.12%", "elapsed_time": "4h 35m 46s", "remaining_time": "8m 10s"} |
| {"loss": 0.51728517, "token_acc": 0.82191781, "grad_norm": 2.25023341, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022411, "epoch": 0.97445972, "global_step/max_steps": "372/382", "percentage": "97.38%", "elapsed_time": "4h 36m 31s", "remaining_time": "7m 26s"} |
| {"loss": 0.46636567, "token_acc": 0.81489362, "grad_norm": 2.28610539, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022411, "epoch": 0.97707924, "global_step/max_steps": "373/382", "percentage": "97.64%", "elapsed_time": "4h 37m 15s", "remaining_time": "6m 41s"} |
| {"loss": 0.53415972, "token_acc": 0.81858407, "grad_norm": 2.35870314, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022412, "epoch": 0.97969876, "global_step/max_steps": "374/382", "percentage": "97.91%", "elapsed_time": "4h 37m 59s", "remaining_time": "5m 56s"} |
| {"loss": 0.50586045, "token_acc": 0.84251969, "grad_norm": 2.33148217, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022413, "epoch": 0.98231827, "global_step/max_steps": "375/382", "percentage": "98.17%", "elapsed_time": "4h 38m 43s", "remaining_time": "5m 12s"} |
| {"loss": 0.49814487, "token_acc": 0.83069977, "grad_norm": 2.65465045, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022413, "epoch": 0.98493779, "global_step/max_steps": "376/382", "percentage": "98.43%", "elapsed_time": "4h 39m 27s", "remaining_time": "4m 27s"} |
| {"loss": 0.4845404, "token_acc": 0.81763527, "grad_norm": 2.27329588, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022414, "epoch": 0.9875573, "global_step/max_steps": "377/382", "percentage": "98.69%", "elapsed_time": "4h 40m 11s", "remaining_time": "3m 42s"} |
| {"loss": 0.51448095, "token_acc": 0.80396476, "grad_norm": 2.75910759, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022415, "epoch": 0.99017682, "global_step/max_steps": "378/382", "percentage": "98.95%", "elapsed_time": "4h 40m 56s", "remaining_time": "2m 58s"} |
| {"loss": 0.55216378, "token_acc": 0.78858351, "grad_norm": 2.63409185, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022415, "epoch": 0.99279633, "global_step/max_steps": "379/382", "percentage": "99.21%", "elapsed_time": "4h 41m 40s", "remaining_time": "2m 13s"} |
| {"loss": 0.53165829, "token_acc": 0.8062201, "grad_norm": 2.53039455, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022416, "epoch": 0.99541585, "global_step/max_steps": "380/382", "percentage": "99.48%", "elapsed_time": "4h 42m 24s", "remaining_time": "1m 29s"} |
| {"loss": 0.52184331, "token_acc": 0.81488934, "grad_norm": 2.37092257, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022416, "epoch": 0.99803536, "global_step/max_steps": "381/382", "percentage": "99.74%", "elapsed_time": "4h 43m 8s", "remaining_time": "44s"} |
| {"loss": 0.49973977, "token_acc": 0.82371795, "grad_norm": 3.36275077, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022436, "epoch": 1.0, "global_step/max_steps": "382/382", "percentage": "100.00%", "elapsed_time": "4h 43m 38s", "remaining_time": "0s"} |
| {"eval_loss": 0.47624764, "eval_token_acc": 0.83046996, "eval_runtime": 139.0135, "eval_samples_per_second": 2.662, "eval_steps_per_second": 0.115, "epoch": 1.0, "global_step/max_steps": "382/382", "percentage": "100.00%", "elapsed_time": "4h 45m 57s", "remaining_time": "0s"} |
| {"train_runtime": 17179.794, "train_samples_per_second": 2.133, "train_steps_per_second": 0.022, "total_flos": 7.42031915510399e+18, "train_loss": 0.600046, "epoch": 1.0, "global_step/max_steps": "382/382", "percentage": "100.00%", "elapsed_time": "4h 46m 17s", "remaining_time": "0s"} |
| {"model_parameter_info": "Qwen2_5_VLForConditionalGeneration: 8292.1667M Params (7615.6165M Trainable [91.8411%]), 0.0019M Buffers.", "last_model_checkpoint": "/workspace/checkpoint/gui_exp/sft_amex_nav_complete_only/v0-20260416_140817/checkpoint-382", "best_model_checkpoint": "/workspace/checkpoint/gui_exp/sft_amex_nav_complete_only/v0-20260416_140817/checkpoint-382", "best_metric": 0.47624764, "global_step": 382, "log_history": [{"loss": 1.8907995223999023, "token_acc": 0.6242038216560509, "grad_norm": 63.27988052368164, "learning_rate": 5e-08, "memory(GiB)": 66.36, "train_speed(iter/s)": 0.017598, "epoch": 0.0026195153896529143, "step": 1}, {"loss": 1.940633773803711, "token_acc": 0.6553398058252428, "grad_norm": 63.244056701660156, "learning_rate": 1e-07, "memory(GiB)": 75.54, "train_speed(iter/s)": 0.019794, "epoch": 0.005239030779305829, "step": 2}, {"loss": 1.9505853652954102, "token_acc": 0.6244019138755981, "grad_norm": 66.90901947021484, "learning_rate": 1.5e-07, "memory(GiB)": 80.63, "train_speed(iter/s)": 0.020661, "epoch": 0.007858546168958742, "step": 3}, {"loss": 2.057070732116699, "token_acc": 0.6084070796460177, "grad_norm": 64.37629699707031, "learning_rate": 2e-07, "memory(GiB)": 80.63, "train_speed(iter/s)": 0.021127, "epoch": 0.010478061558611657, "step": 4}, {"loss": 2.023416042327881, "token_acc": 0.6203904555314533, "grad_norm": 65.68408203125, "learning_rate": 2.5e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.021392, "epoch": 0.01309757694826457, "step": 5}, {"loss": 1.917472243309021, "token_acc": 0.5885167464114832, "grad_norm": 64.10668182373047, "learning_rate": 3e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.021578, "epoch": 0.015717092337917484, "step": 6}, {"loss": 1.87931489944458, "token_acc": 0.6463654223968566, "grad_norm": 59.390106201171875, "learning_rate": 3.5e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.02172, "epoch": 0.0183366077275704, "step": 7}, {"loss": 1.9348621368408203, "token_acc": 0.6103603603603603, "grad_norm": 65.04813385009766, "learning_rate": 4e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.021821, "epoch": 0.020956123117223315, "step": 8}, {"loss": 1.9106614589691162, "token_acc": 0.6303191489361702, "grad_norm": 60.771751403808594, "learning_rate": 4.5e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.021909, "epoch": 0.023575638506876228, "step": 9}, {"loss": 1.8817265033721924, "token_acc": 0.6681818181818182, "grad_norm": 61.84583282470703, "learning_rate": 5e-07, "memory(GiB)": 81.03, "train_speed(iter/s)": 0.021989, "epoch": 0.02619515389652914, "step": 10}, {"loss": 1.8080368041992188, "token_acc": 0.6384039900249376, "grad_norm": 53.12112045288086, "learning_rate": 5.5e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022041, "epoch": 0.028814669286182055, "step": 11}, {"loss": 1.7626909017562866, "token_acc": 0.6207674943566591, "grad_norm": 55.34021759033203, "learning_rate": 6e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022085, "epoch": 0.03143418467583497, "step": 12}, {"loss": 1.7579209804534912, "token_acc": 0.6612529002320185, "grad_norm": 51.83931350708008, "learning_rate": 6.5e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022128, "epoch": 0.034053700065487885, "step": 13}, {"loss": 1.5301661491394043, "token_acc": 0.6767441860465117, "grad_norm": 36.434444427490234, "learning_rate": 7e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022161, "epoch": 0.0366732154551408, "step": 14}, {"loss": 1.4135932922363281, "token_acc": 0.6986899563318777, "grad_norm": 34.46994400024414, "learning_rate": 7.5e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022192, "epoch": 0.03929273084479371, "step": 15}, {"loss": 1.338775396347046, "token_acc": 0.6762886597938145, "grad_norm": 31.918350219726562, "learning_rate": 8e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022214, "epoch": 0.04191224623444663, "step": 16}, {"loss": 1.3861268758773804, "token_acc": 0.6855345911949685, "grad_norm": 32.0166015625, "learning_rate": 8.499999999999999e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022242, "epoch": 0.04453176162409954, "step": 17}, {"loss": 1.2591882944107056, "token_acc": 0.7009708737864078, "grad_norm": 30.60653305053711, "learning_rate": 9e-07, "memory(GiB)": 81.18, "train_speed(iter/s)": 0.022266, "epoch": 0.047151277013752456, "step": 18}, {"loss": 1.0650749206542969, "token_acc": 0.7002012072434608, "grad_norm": 19.74515724182129, "learning_rate": 9.499999999999999e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022284, "epoch": 0.04977079240340537, "step": 19}, {"loss": 0.9479020833969116, "token_acc": 0.7096774193548387, "grad_norm": 17.456344604492188, "learning_rate": 1e-06, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022301, "epoch": 0.05239030779305828, "step": 20}, {"loss": 0.9783564805984497, "token_acc": 0.7170172084130019, "grad_norm": 15.00493335723877, "learning_rate": 9.99981171319448e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022318, "epoch": 0.0550098231827112, "step": 21}, {"loss": 0.9760516881942749, "token_acc": 0.73, "grad_norm": 14.92234992980957, "learning_rate": 9.999246866958693e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022332, "epoch": 0.05762933857236411, "step": 22}, {"loss": 0.8998144865036011, "token_acc": 0.7555012224938875, "grad_norm": 13.876760482788086, "learning_rate": 9.99830550383387e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022342, "epoch": 0.06024885396201703, "step": 23}, {"loss": 0.855924129486084, "token_acc": 0.744874715261959, "grad_norm": 11.736543655395508, "learning_rate": 9.996987694718518e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022352, "epoch": 0.06286836935166994, "step": 24}, {"loss": 0.8785209059715271, "token_acc": 0.7759815242494227, "grad_norm": 10.466511726379395, "learning_rate": 9.995293538863063e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022364, "epoch": 0.06548788474132286, "step": 25}, {"loss": 0.8433855772018433, "token_acc": 0.7684210526315789, "grad_norm": 9.219606399536133, "learning_rate": 9.993223163862385e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022375, "epoch": 0.06810740013097577, "step": 26}, {"loss": 0.8307563066482544, "token_acc": 0.7604166666666666, "grad_norm": 5.721984386444092, "learning_rate": 9.990776725646197e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022383, "epoch": 0.07072691552062868, "step": 27}, {"loss": 0.7318627238273621, "token_acc": 0.8109243697478992, "grad_norm": 5.883939266204834, "learning_rate": 9.987954408467319e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022393, "epoch": 0.0733464309102816, "step": 28}, {"loss": 0.7459089756011963, "token_acc": 0.8179871520342612, "grad_norm": 4.779611110687256, "learning_rate": 9.98475642488778e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022404, "epoch": 0.07596594629993451, "step": 29}, {"loss": 0.7111629247665405, "token_acc": 0.7788235294117647, "grad_norm": 6.088257312774658, "learning_rate": 9.981183015762831e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02241, "epoch": 0.07858546168958742, "step": 30}, {"loss": 0.7837305068969727, "token_acc": 0.7632743362831859, "grad_norm": 5.214715480804443, "learning_rate": 9.977234450222783e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022416, "epoch": 0.08120497707924033, "step": 31}, {"loss": 0.7254042625427246, "token_acc": 0.7665903890160183, "grad_norm": 4.576343536376953, "learning_rate": 9.972911025652754e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022422, "epoch": 0.08382449246889326, "step": 32}, {"loss": 0.6213464140892029, "token_acc": 0.7842003853564548, "grad_norm": 3.631990671157837, "learning_rate": 9.968213067670264e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022427, "epoch": 0.08644400785854617, "step": 33}, {"loss": 0.739364504814148, "token_acc": 0.7610441767068273, "grad_norm": 3.264214515686035, "learning_rate": 9.963140930100713e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022432, "epoch": 0.08906352324819908, "step": 34}, {"loss": 0.7275943756103516, "token_acc": 0.7711111111111111, "grad_norm": 4.268857955932617, "learning_rate": 9.957694994950737e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022435, "epoch": 0.091683038637852, "step": 35}, {"loss": 0.7099075317382812, "token_acc": 0.7782909930715936, "grad_norm": 3.6333351135253906, "learning_rate": 9.951875672379423e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022441, "epoch": 0.09430255402750491, "step": 36}, {"loss": 0.652459979057312, "token_acc": 0.800982800982801, "grad_norm": 4.225485801696777, "learning_rate": 9.945683400667438e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022446, "epoch": 0.09692206941715782, "step": 37}, {"loss": 0.6966084241867065, "token_acc": 0.7637130801687764, "grad_norm": 3.550759792327881, "learning_rate": 9.939118646184005e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02245, "epoch": 0.09954158480681075, "step": 38}, {"loss": 0.6055182218551636, "token_acc": 0.7816593886462883, "grad_norm": 3.717432737350464, "learning_rate": 9.932181903351783e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022457, "epoch": 0.10216110019646366, "step": 39}, {"loss": 0.660804271697998, "token_acc": 0.7598343685300207, "grad_norm": 4.526533126831055, "learning_rate": 9.924873694609634e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02246, "epoch": 0.10478061558611657, "step": 40}, {"loss": 0.6518653631210327, "token_acc": 0.7914798206278026, "grad_norm": 3.6490845680236816, "learning_rate": 9.917194570373268e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022465, "epoch": 0.10740013097576949, "step": 41}, {"loss": 0.6824044585227966, "token_acc": 0.7766179540709812, "grad_norm": 3.86726975440979, "learning_rate": 9.909145108993792e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022469, "epoch": 0.1100196463654224, "step": 42}, {"loss": 0.6115297675132751, "token_acc": 0.8203463203463204, "grad_norm": 3.15071702003479, "learning_rate": 9.900725916714155e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022473, "epoch": 0.11263916175507531, "step": 43}, {"loss": 0.6265181303024292, "token_acc": 0.8122171945701357, "grad_norm": 3.610048532485962, "learning_rate": 9.891937627623485e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022476, "epoch": 0.11525867714472822, "step": 44}, {"loss": 0.5976269841194153, "token_acc": 0.8143236074270557, "grad_norm": 3.533850908279419, "learning_rate": 9.882780903609335e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02248, "epoch": 0.11787819253438114, "step": 45}, {"loss": 0.6677061915397644, "token_acc": 0.8069767441860465, "grad_norm": 3.7200188636779785, "learning_rate": 9.873256434307828e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022483, "epoch": 0.12049770792403405, "step": 46}, {"loss": 0.6592556834220886, "token_acc": 0.7987012987012987, "grad_norm": 4.362515926361084, "learning_rate": 9.863364937051724e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022486, "epoch": 0.12311722331368696, "step": 47}, {"loss": 0.6057340502738953, "token_acc": 0.8204545454545454, "grad_norm": 3.5236833095550537, "learning_rate": 9.85310715681639e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022487, "epoch": 0.12573673870333987, "step": 48}, {"loss": 0.6437666416168213, "token_acc": 0.8004291845493562, "grad_norm": 3.178062677383423, "learning_rate": 9.842483866163698e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02249, "epoch": 0.12835625409299278, "step": 49}, {"loss": 0.5997329950332642, "token_acc": 0.8056155507559395, "grad_norm": 3.2153780460357666, "learning_rate": 9.831495865183832e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022492, "epoch": 0.13097576948264572, "step": 50}, {"loss": 0.6203830242156982, "token_acc": 0.8080357142857143, "grad_norm": 3.2086801528930664, "learning_rate": 9.820143981435028e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022313, "epoch": 0.13359528487229863, "step": 51}, {"loss": 0.5806266069412231, "token_acc": 0.8236658932714617, "grad_norm": 2.970149278640747, "learning_rate": 9.808429069881266e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022317, "epoch": 0.13621480026195154, "step": 52}, {"loss": 0.5999009609222412, "token_acc": 0.8052208835341366, "grad_norm": 2.6012885570526123, "learning_rate": 9.79635201282785e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022323, "epoch": 0.13883431565160445, "step": 53}, {"loss": 0.6056994795799255, "token_acc": 0.8137755102040817, "grad_norm": 3.064516305923462, "learning_rate": 9.783913719854976e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022325, "epoch": 0.14145383104125736, "step": 54}, {"loss": 0.5767471790313721, "token_acc": 0.7799145299145299, "grad_norm": 3.129368543624878, "learning_rate": 9.771115127749227e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02233, "epoch": 0.14407334643091027, "step": 55}, {"loss": 0.6119959354400635, "token_acc": 0.7721774193548387, "grad_norm": 3.5217173099517822, "learning_rate": 9.75795720043301e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022335, "epoch": 0.1466928618205632, "step": 56}, {"loss": 0.573194682598114, "token_acc": 0.7837837837837838, "grad_norm": 3.371673822402954, "learning_rate": 9.744440928891966e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02234, "epoch": 0.14931237721021612, "step": 57}, {"loss": 0.5497080087661743, "token_acc": 0.806941431670282, "grad_norm": 3.9817211627960205, "learning_rate": 9.730567331100333e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022344, "epoch": 0.15193189259986903, "step": 58}, {"loss": 0.6122070550918579, "token_acc": 0.780439121756487, "grad_norm": 3.112736940383911, "learning_rate": 9.716337451944274e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022349, "epoch": 0.15455140798952194, "step": 59}, {"loss": 0.6176848411560059, "token_acc": 0.8162055335968379, "grad_norm": 3.2749805450439453, "learning_rate": 9.701752363143183e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022354, "epoch": 0.15717092337917485, "step": 60}, {"loss": 0.5978531837463379, "token_acc": 0.7649325626204239, "grad_norm": 2.658205509185791, "learning_rate": 9.686813163168972e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02236, "epoch": 0.15979043876882776, "step": 61}, {"loss": 0.5957769155502319, "token_acc": 0.7974137931034483, "grad_norm": 2.865530014038086, "learning_rate": 9.671520977163339e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022365, "epoch": 0.16240995415848067, "step": 62}, {"loss": 0.6532036066055298, "token_acc": 0.7680890538033395, "grad_norm": 3.0464553833007812, "learning_rate": 9.655876956853024e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022367, "epoch": 0.1650294695481336, "step": 63}, {"loss": 0.5908622145652771, "token_acc": 0.8091954022988506, "grad_norm": 2.9496984481811523, "learning_rate": 9.63988228046307e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02237, "epoch": 0.16764898493778652, "step": 64}, {"loss": 0.576657772064209, "token_acc": 0.8179871520342612, "grad_norm": 2.8284106254577637, "learning_rate": 9.623538152628087e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022375, "epoch": 0.17026850032743943, "step": 65}, {"loss": 0.5907573103904724, "token_acc": 0.8126315789473684, "grad_norm": 2.8139805793762207, "learning_rate": 9.606845804301523e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022378, "epoch": 0.17288801571709234, "step": 66}, {"loss": 0.6441918611526489, "token_acc": 0.7745098039215687, "grad_norm": 3.1154873371124268, "learning_rate": 9.589806492662954e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022383, "epoch": 0.17550753110674525, "step": 67}, {"loss": 0.5281984806060791, "token_acc": 0.8289156626506025, "grad_norm": 3.0192372798919678, "learning_rate": 9.572421501023401e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022384, "epoch": 0.17812704649639816, "step": 68}, {"loss": 0.5715993642807007, "token_acc": 0.8008048289738431, "grad_norm": 2.695600748062134, "learning_rate": 9.554692138728683e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022387, "epoch": 0.1807465618860511, "step": 69}, {"loss": 0.5750143527984619, "token_acc": 0.8259023354564756, "grad_norm": 2.4650559425354004, "learning_rate": 9.536619741060799e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02239, "epoch": 0.183366077275704, "step": 70}, {"loss": 0.5429270267486572, "token_acc": 0.8063157894736842, "grad_norm": 2.7900400161743164, "learning_rate": 9.518205669137351e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022393, "epoch": 0.1859855926653569, "step": 71}, {"loss": 0.6283689737319946, "token_acc": 0.7927565392354124, "grad_norm": 5.67881441116333, "learning_rate": 9.499451309809057e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022396, "epoch": 0.18860510805500982, "step": 72}, {"loss": 0.5479881763458252, "token_acc": 0.8125, "grad_norm": 2.6714205741882324, "learning_rate": 9.480358075555277e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022399, "epoch": 0.19122462344466273, "step": 73}, {"loss": 0.5334833860397339, "token_acc": 0.8690476190476191, "grad_norm": 2.7358267307281494, "learning_rate": 9.460927404377646e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022403, "epoch": 0.19384413883431564, "step": 74}, {"loss": 0.5740832686424255, "token_acc": 0.8097251585623678, "grad_norm": 2.859041213989258, "learning_rate": 9.441160759691766e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022406, "epoch": 0.19646365422396855, "step": 75}, {"loss": 0.5894163846969604, "token_acc": 0.8141025641025641, "grad_norm": 3.1584606170654297, "learning_rate": 9.42105963021699e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022407, "epoch": 0.1990831696136215, "step": 76}, {"loss": 0.5662564039230347, "token_acc": 0.7962962962962963, "grad_norm": 2.715153694152832, "learning_rate": 9.400625529864301e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022409, "epoch": 0.2017026850032744, "step": 77}, {"loss": 0.6219125986099243, "token_acc": 0.7880794701986755, "grad_norm": 3.220654010772705, "learning_rate": 9.37985999762229e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022412, "epoch": 0.2043222003929273, "step": 78}, {"loss": 0.6343144774436951, "token_acc": 0.819047619047619, "grad_norm": 3.066732406616211, "learning_rate": 9.358764597441249e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022414, "epoch": 0.20694171578258022, "step": 79}, {"loss": 0.5429288744926453, "token_acc": 0.8025751072961373, "grad_norm": 2.912851333618164, "learning_rate": 9.337340918115384e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022416, "epoch": 0.20956123117223313, "step": 80}, {"loss": 0.5891841650009155, "token_acc": 0.7955555555555556, "grad_norm": 3.0999772548675537, "learning_rate": 9.315590573163152e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022419, "epoch": 0.21218074656188604, "step": 81}, {"loss": 0.5406292676925659, "token_acc": 0.8065934065934066, "grad_norm": 2.4872875213623047, "learning_rate": 9.293515200705739e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022421, "epoch": 0.21480026195153898, "step": 82}, {"loss": 0.566483736038208, "token_acc": 0.801255230125523, "grad_norm": 3.3176326751708984, "learning_rate": 9.271116463343691e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022424, "epoch": 0.2174197773411919, "step": 83}, {"loss": 0.5530977249145508, "token_acc": 0.834841628959276, "grad_norm": 2.730381727218628, "learning_rate": 9.248396048031689e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022425, "epoch": 0.2200392927308448, "step": 84}, {"loss": 0.570239782333374, "token_acc": 0.8012684989429175, "grad_norm": 3.158013105392456, "learning_rate": 9.225355665951502e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022428, "epoch": 0.2226588081204977, "step": 85}, {"loss": 0.5126999616622925, "token_acc": 0.8422174840085288, "grad_norm": 2.5247857570648193, "learning_rate": 9.201997052383106e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02243, "epoch": 0.22527832351015062, "step": 86}, {"loss": 0.5446218848228455, "token_acc": 0.8281938325991189, "grad_norm": 2.5841782093048096, "learning_rate": 9.178321966573992e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022432, "epoch": 0.22789783889980353, "step": 87}, {"loss": 0.5847496390342712, "token_acc": 0.7980295566502463, "grad_norm": 2.9967753887176514, "learning_rate": 9.154332191606671e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022434, "epoch": 0.23051735428945644, "step": 88}, {"loss": 0.5531667470932007, "token_acc": 0.8177874186550976, "grad_norm": 3.0768024921417236, "learning_rate": 9.130029534264381e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022436, "epoch": 0.23313686967910938, "step": 89}, {"loss": 0.5546162128448486, "token_acc": 0.8012048192771084, "grad_norm": 2.6468522548675537, "learning_rate": 9.105415824895007e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022439, "epoch": 0.2357563850687623, "step": 90}, {"loss": 0.5080910325050354, "token_acc": 0.81875, "grad_norm": 2.472001791000366, "learning_rate": 9.080492917273237e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02244, "epoch": 0.2383759004584152, "step": 91}, {"loss": 0.5841704607009888, "token_acc": 0.8142201834862385, "grad_norm": 2.924325704574585, "learning_rate": 9.05526268846093e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022442, "epoch": 0.2409954158480681, "step": 92}, {"loss": 0.5632328391075134, "token_acc": 0.817351598173516, "grad_norm": 2.696861982345581, "learning_rate": 9.029727038665763e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022444, "epoch": 0.24361493123772102, "step": 93}, {"loss": 0.570451021194458, "token_acc": 0.8151260504201681, "grad_norm": 2.805720567703247, "learning_rate": 9.003887891098106e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022445, "epoch": 0.24623444662737393, "step": 94}, {"loss": 0.5538668036460876, "token_acc": 0.8179611650485437, "grad_norm": 3.2786123752593994, "learning_rate": 8.977747191826182e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022448, "epoch": 0.24885396201702684, "step": 95}, {"loss": 0.5379599928855896, "token_acc": 0.8257261410788381, "grad_norm": 2.474317789077759, "learning_rate": 8.951306909629492e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02245, "epoch": 0.25147347740667975, "step": 96}, {"loss": 0.5602664947509766, "token_acc": 0.7896995708154506, "grad_norm": 2.459892749786377, "learning_rate": 8.924569035850545e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022452, "epoch": 0.2540929927963327, "step": 97}, {"loss": 0.5350607633590698, "token_acc": 0.821505376344086, "grad_norm": 2.286325454711914, "learning_rate": 8.897535584244879e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022454, "epoch": 0.25671250818598557, "step": 98}, {"loss": 0.5580881834030151, "token_acc": 0.851931330472103, "grad_norm": 2.9564380645751953, "learning_rate": 8.870208590829394e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022456, "epoch": 0.2593320235756385, "step": 99}, {"loss": 0.5178885459899902, "token_acc": 0.8165548098434005, "grad_norm": 2.5042099952697754, "learning_rate": 8.842590113729e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022457, "epoch": 0.26195153896529144, "step": 100}, {"loss": 0.5375851392745972, "token_acc": 0.8133333333333334, "grad_norm": 2.5000150203704834, "learning_rate": 8.814682233021635e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022356, "epoch": 0.2645710543549443, "step": 101}, {"loss": 0.5846019983291626, "token_acc": 0.8115631691648822, "grad_norm": 2.84934139251709, "learning_rate": 8.786487050581581e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022359, "epoch": 0.26719056974459726, "step": 102}, {"loss": 0.5801154375076294, "token_acc": 0.8364928909952607, "grad_norm": 3.06900954246521, "learning_rate": 8.758006689921168e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02236, "epoch": 0.26981008513425014, "step": 103}, {"loss": 0.5463855266571045, "token_acc": 0.8426966292134831, "grad_norm": 2.5572643280029297, "learning_rate": 8.72924329603085e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022363, "epoch": 0.2724296005239031, "step": 104}, {"loss": 0.5727757215499878, "token_acc": 0.8004201680672269, "grad_norm": 2.827608823776245, "learning_rate": 8.700199035217646e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022365, "epoch": 0.275049115913556, "step": 105}, {"loss": 0.5666229724884033, "token_acc": 0.8176229508196722, "grad_norm": 2.8642961978912354, "learning_rate": 8.67087609494199e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022367, "epoch": 0.2776686313032089, "step": 106}, {"loss": 0.5407172441482544, "token_acc": 0.8417849898580122, "grad_norm": 3.298826217651367, "learning_rate": 8.641276683652987e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02237, "epoch": 0.28028814669286184, "step": 107}, {"loss": 0.5598130226135254, "token_acc": 0.8235294117647058, "grad_norm": 2.727931261062622, "learning_rate": 8.611403030622074e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022373, "epoch": 0.2829076620825147, "step": 108}, {"loss": 0.5367304086685181, "token_acc": 0.8147410358565738, "grad_norm": 3.2373580932617188, "learning_rate": 8.581257385775126e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022373, "epoch": 0.28552717747216766, "step": 109}, {"loss": 0.5132820010185242, "token_acc": 0.8471177944862155, "grad_norm": 2.666104555130005, "learning_rate": 8.550842019523018e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022376, "epoch": 0.28814669286182054, "step": 110}, {"loss": 0.5320757031440735, "token_acc": 0.8282828282828283, "grad_norm": 2.5370404720306396, "learning_rate": 8.520159222590604e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022378, "epoch": 0.2907662082514735, "step": 111}, {"loss": 0.5458981990814209, "token_acc": 0.8159509202453987, "grad_norm": 2.5767695903778076, "learning_rate": 8.489211305844215e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02238, "epoch": 0.2933857236411264, "step": 112}, {"loss": 0.5197765827178955, "token_acc": 0.8137472283813747, "grad_norm": 2.7759788036346436, "learning_rate": 8.458000600117603e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022382, "epoch": 0.2960052390307793, "step": 113}, {"loss": 0.5534783601760864, "token_acc": 0.8138528138528138, "grad_norm": 2.890899181365967, "learning_rate": 8.4265294560364e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022384, "epoch": 0.29862475442043224, "step": 114}, {"loss": 0.5532283782958984, "token_acc": 0.819672131147541, "grad_norm": 2.5802199840545654, "learning_rate": 8.394800243841077e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022386, "epoch": 0.3012442698100851, "step": 115}, {"loss": 0.5814436674118042, "token_acc": 0.7846975088967971, "grad_norm": 2.7424299716949463, "learning_rate": 8.36281535320844e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022387, "epoch": 0.30386378519973806, "step": 116}, {"loss": 0.5543427467346191, "token_acc": 0.841743119266055, "grad_norm": 3.1005427837371826, "learning_rate": 8.33057719307164e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022389, "epoch": 0.30648330058939094, "step": 117}, {"loss": 0.5549328327178955, "token_acc": 0.7941176470588235, "grad_norm": 2.9101967811584473, "learning_rate": 8.298088191438752e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022391, "epoch": 0.3091028159790439, "step": 118}, {"loss": 0.545494794845581, "token_acc": 0.7843137254901961, "grad_norm": 3.0609023571014404, "learning_rate": 8.265350795209912e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022392, "epoch": 0.3117223313686968, "step": 119}, {"loss": 0.5447317361831665, "token_acc": 0.8051391862955032, "grad_norm": 2.6965584754943848, "learning_rate": 8.232367469993017e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022394, "epoch": 0.3143418467583497, "step": 120}, {"loss": 0.5292974710464478, "token_acc": 0.80078125, "grad_norm": 2.5463309288024902, "learning_rate": 8.199140699918048e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022397, "epoch": 0.31696136214800263, "step": 121}, {"loss": 0.5714783668518066, "token_acc": 0.8106508875739645, "grad_norm": 2.431605339050293, "learning_rate": 8.165672987449961e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022398, "epoch": 0.3195808775376555, "step": 122}, {"loss": 0.526486873626709, "token_acc": 0.8244444444444444, "grad_norm": 3.259045124053955, "learning_rate": 8.131966853200225e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.0224, "epoch": 0.32220039292730845, "step": 123}, {"loss": 0.5688433051109314, "token_acc": 0.7731755424063116, "grad_norm": 2.783949851989746, "learning_rate": 8.098024835736976e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022403, "epoch": 0.32481990831696134, "step": 124}, {"loss": 0.5338314771652222, "token_acc": 0.8515625, "grad_norm": 2.606600284576416, "learning_rate": 8.06384949139383e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022405, "epoch": 0.3274394237066143, "step": 125}, {"loss": 0.5526909828186035, "token_acc": 0.7879377431906615, "grad_norm": 2.744981050491333, "learning_rate": 8.029443394077355e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022407, "epoch": 0.3300589390962672, "step": 126}, {"loss": 0.5716977119445801, "token_acc": 0.8044943820224719, "grad_norm": 2.7763917446136475, "learning_rate": 7.994809135073211e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.02241, "epoch": 0.3326784544859201, "step": 127}, {"loss": 0.525353729724884, "token_acc": 0.8302752293577982, "grad_norm": 2.793274164199829, "learning_rate": 7.959949322850993e-07, "memory(GiB)": 81.4, "train_speed(iter/s)": 0.022412, "epoch": 0.33529796987557303, "step": 128}, {"loss": 0.5480594635009766, "token_acc": 0.8236658932714617, "grad_norm": 2.8624629974365234, "learning_rate": 7.924866582867777e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022412, "epoch": 0.3379174852652259, "step": 129}, {"loss": 0.5240566730499268, "token_acc": 0.7968036529680366, "grad_norm": 2.4846079349517822, "learning_rate": 7.889563557370377e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022414, "epoch": 0.34053700065487885, "step": 130}, {"loss": 0.5630617737770081, "token_acc": 0.8134171907756813, "grad_norm": 2.5415945053100586, "learning_rate": 7.854042905196353e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022415, "epoch": 0.34315651604453173, "step": 131}, {"loss": 0.5148541927337646, "token_acc": 0.8333333333333334, "grad_norm": 2.786975145339966, "learning_rate": 7.818307301573755e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022417, "epoch": 0.34577603143418467, "step": 132}, {"loss": 0.5435724258422852, "token_acc": 0.8434237995824635, "grad_norm": 2.9632627964019775, "learning_rate": 7.782359437919643e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022419, "epoch": 0.3483955468238376, "step": 133}, {"loss": 0.4857940673828125, "token_acc": 0.8809523809523809, "grad_norm": 2.490950584411621, "learning_rate": 7.746202021637383e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.02242, "epoch": 0.3510150622134905, "step": 134}, {"loss": 0.5265945196151733, "token_acc": 0.7988505747126436, "grad_norm": 2.5559632778167725, "learning_rate": 7.709837775912736e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022422, "epoch": 0.35363457760314343, "step": 135}, {"loss": 0.5263570547103882, "token_acc": 0.7861163227016885, "grad_norm": 2.5085930824279785, "learning_rate": 7.673269439508769e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022424, "epoch": 0.3562540929927963, "step": 136}, {"loss": 0.5203988552093506, "token_acc": 0.8168724279835391, "grad_norm": 2.167789936065674, "learning_rate": 7.636499766559581e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022426, "epoch": 0.35887360838244925, "step": 137}, {"loss": 0.5687823295593262, "token_acc": 0.8325, "grad_norm": 2.9854276180267334, "learning_rate": 7.599531526362873e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022427, "epoch": 0.3614931237721022, "step": 138}, {"loss": 0.5261340737342834, "token_acc": 0.8245967741935484, "grad_norm": 2.584529399871826, "learning_rate": 7.562367503171385e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022429, "epoch": 0.36411263916175507, "step": 139}, {"loss": 0.5185368657112122, "token_acc": 0.7859848484848485, "grad_norm": 2.476317882537842, "learning_rate": 7.525010495983201e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.02243, "epoch": 0.366732154551408, "step": 140}, {"loss": 0.4953394830226898, "token_acc": 0.8557213930348259, "grad_norm": 2.7736923694610596, "learning_rate": 7.487463318330944e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022431, "epoch": 0.3693516699410609, "step": 141}, {"loss": 0.49524927139282227, "token_acc": 0.8, "grad_norm": 2.414165735244751, "learning_rate": 7.449728798069863e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022432, "epoch": 0.3719711853307138, "step": 142}, {"loss": 0.5605555772781372, "token_acc": 0.7982261640798226, "grad_norm": 2.4975924491882324, "learning_rate": 7.411809777164872e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022434, "epoch": 0.3745907007203667, "step": 143}, {"loss": 0.531795859336853, "token_acc": 0.8051181102362205, "grad_norm": 2.0287163257598877, "learning_rate": 7.373709111476497e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022435, "epoch": 0.37721021611001965, "step": 144}, {"loss": 0.4882631003856659, "token_acc": 0.8309572301425662, "grad_norm": 2.7108078002929688, "learning_rate": 7.335429670545788e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022436, "epoch": 0.3798297314996726, "step": 145}, {"loss": 0.5370432734489441, "token_acc": 0.8057851239669421, "grad_norm": 2.5603761672973633, "learning_rate": 7.296974337378207e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022437, "epoch": 0.38244924688932547, "step": 146}, {"loss": 0.5311410427093506, "token_acc": 0.8429951690821256, "grad_norm": 2.420466423034668, "learning_rate": 7.258346008226489e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022439, "epoch": 0.3850687622789784, "step": 147}, {"loss": 0.5071064829826355, "token_acc": 0.8153846153846154, "grad_norm": 2.649905204772949, "learning_rate": 7.219547592372511e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.02244, "epoch": 0.3876882776686313, "step": 148}, {"loss": 0.5461446046829224, "token_acc": 0.8112449799196787, "grad_norm": 2.7632946968078613, "learning_rate": 7.180582011908187e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022441, "epoch": 0.3903077930582842, "step": 149}, {"loss": 0.4917265772819519, "token_acc": 0.8284424379232506, "grad_norm": 2.6324427127838135, "learning_rate": 7.141452201515385e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022443, "epoch": 0.3929273084479371, "step": 150}, {"loss": 0.5275927782058716, "token_acc": 0.8261802575107297, "grad_norm": 2.8219516277313232, "learning_rate": 7.102161108244906e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022379, "epoch": 0.39554682383759004, "step": 151}, {"loss": 0.49508413672447205, "token_acc": 0.8231578947368421, "grad_norm": 2.3845009803771973, "learning_rate": 7.062711691294524e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022381, "epoch": 0.398166339227243, "step": 152}, {"loss": 0.5475088953971863, "token_acc": 0.8366013071895425, "grad_norm": 2.9017446041107178, "learning_rate": 7.023106921786117e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022382, "epoch": 0.40078585461689586, "step": 153}, {"loss": 0.4947194457054138, "token_acc": 0.8673218673218673, "grad_norm": 2.784161329269409, "learning_rate": 6.983349782541901e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022383, "epoch": 0.4034053700065488, "step": 154}, {"loss": 0.5187703371047974, "token_acc": 0.8179916317991632, "grad_norm": 3.034949541091919, "learning_rate": 6.943443267859769e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022384, "epoch": 0.4060248853962017, "step": 155}, {"loss": 0.5227006673812866, "token_acc": 0.8057851239669421, "grad_norm": 2.522339105606079, "learning_rate": 6.903390383287794e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022385, "epoch": 0.4086444007858546, "step": 156}, {"loss": 0.5195754766464233, "token_acc": 0.8192307692307692, "grad_norm": 2.7332656383514404, "learning_rate": 6.863194145397848e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022386, "epoch": 0.4112639161755075, "step": 157}, {"loss": 0.48856601119041443, "token_acc": 0.8103448275862069, "grad_norm": 2.6740503311157227, "learning_rate": 6.822857581558422e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022388, "epoch": 0.41388343156516044, "step": 158}, {"loss": 0.533099889755249, "token_acc": 0.8062953995157385, "grad_norm": 2.572908639907837, "learning_rate": 6.782383729706617e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.02239, "epoch": 0.4165029469548134, "step": 159}, {"loss": 0.4634387493133545, "token_acc": 0.834061135371179, "grad_norm": 2.551100015640259, "learning_rate": 6.741775638119344e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022391, "epoch": 0.41912246234446626, "step": 160}, {"loss": 0.48153918981552124, "token_acc": 0.8215102974828375, "grad_norm": 2.2995591163635254, "learning_rate": 6.701036365183737e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022393, "epoch": 0.4217419777341192, "step": 161}, {"loss": 0.5273360013961792, "token_acc": 0.818, "grad_norm": 2.6744697093963623, "learning_rate": 6.660168979166819e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022394, "epoch": 0.4243614931237721, "step": 162}, {"loss": 0.4723895490169525, "token_acc": 0.8528678304239401, "grad_norm": 2.8775475025177, "learning_rate": 6.619176557984419e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022395, "epoch": 0.426981008513425, "step": 163}, {"loss": 0.5155174136161804, "token_acc": 0.837037037037037, "grad_norm": 3.4531373977661133, "learning_rate": 6.578062188969349e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022397, "epoch": 0.42960052390307796, "step": 164}, {"loss": 0.5435302257537842, "token_acc": 0.8102345415778252, "grad_norm": 2.8298938274383545, "learning_rate": 6.53682896863889e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022398, "epoch": 0.43222003929273084, "step": 165}, {"loss": 0.5459126234054565, "token_acc": 0.8235294117647058, "grad_norm": 2.851915121078491, "learning_rate": 6.495480002461577e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022399, "epoch": 0.4348395546823838, "step": 166}, {"loss": 0.5094572305679321, "token_acc": 0.8377192982456141, "grad_norm": 2.7637815475463867, "learning_rate": 6.454018404623311e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.0224, "epoch": 0.43745907007203666, "step": 167}, {"loss": 0.546324610710144, "token_acc": 0.8558951965065502, "grad_norm": 2.608332395553589, "learning_rate": 6.412447297792818e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022401, "epoch": 0.4400785854616896, "step": 168}, {"loss": 0.519471287727356, "token_acc": 0.8138075313807531, "grad_norm": 2.774064540863037, "learning_rate": 6.370769812886458e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022402, "epoch": 0.4426981008513425, "step": 169}, {"loss": 0.5176835060119629, "token_acc": 0.81419624217119, "grad_norm": 2.211394786834717, "learning_rate": 6.32898908883243e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022403, "epoch": 0.4453176162409954, "step": 170}, {"loss": 0.48924192786216736, "token_acc": 0.7954110898661568, "grad_norm": 2.712195634841919, "learning_rate": 6.287108272334359e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022404, "epoch": 0.44793713163064836, "step": 171}, {"loss": 0.5080841183662415, "token_acc": 0.8164948453608247, "grad_norm": 2.4233756065368652, "learning_rate": 6.245130517634306e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022405, "epoch": 0.45055664702030124, "step": 172}, {"loss": 0.5036391019821167, "token_acc": 0.8195652173913044, "grad_norm": 2.5514473915100098, "learning_rate": 6.203058986275206e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022407, "epoch": 0.4531761624099542, "step": 173}, {"loss": 0.5365530848503113, "token_acc": 0.8004434589800443, "grad_norm": 2.6929173469543457, "learning_rate": 6.160896846862753e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022408, "epoch": 0.45579567779960706, "step": 174}, {"loss": 0.5043696165084839, "token_acc": 0.8410138248847926, "grad_norm": 2.59588885307312, "learning_rate": 6.118647274826769e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022409, "epoch": 0.45841519318926, "step": 175}, {"loss": 0.5370233058929443, "token_acc": 0.814176245210728, "grad_norm": 2.692619800567627, "learning_rate": 6.076313452182032e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.02241, "epoch": 0.4610347085789129, "step": 176}, {"loss": 0.5324850082397461, "token_acc": 0.8559670781893004, "grad_norm": 2.4177513122558594, "learning_rate": 6.033898567288637e-07, "memory(GiB)": 81.71, "train_speed(iter/s)": 0.022411, "epoch": 0.4636542239685658, "step": 177}, {"loss": 0.5066647529602051, "token_acc": 0.8102678571428571, "grad_norm": 2.335235118865967, "learning_rate": 5.991405814611855e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022413, "epoch": 0.46627373935821875, "step": 178}, {"loss": 0.5604389905929565, "token_acc": 0.7930327868852459, "grad_norm": 2.7526233196258545, "learning_rate": 5.94883839448155e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022413, "epoch": 0.46889325474787164, "step": 179}, {"loss": 0.5318752527236938, "token_acc": 0.7806841046277666, "grad_norm": 3.0703492164611816, "learning_rate": 5.906199512851144e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022415, "epoch": 0.4715127701375246, "step": 180}, {"loss": 0.5259851813316345, "token_acc": 0.8266129032258065, "grad_norm": 3.6493606567382812, "learning_rate": 5.863492381056163e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022416, "epoch": 0.47413228552717746, "step": 181}, {"loss": 0.5321407318115234, "token_acc": 0.8212765957446808, "grad_norm": 2.2182960510253906, "learning_rate": 5.820720215572374e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022417, "epoch": 0.4767518009168304, "step": 182}, {"loss": 0.49147552251815796, "token_acc": 0.8346938775510204, "grad_norm": 2.3478410243988037, "learning_rate": 5.777886237773541e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022418, "epoch": 0.4793713163064833, "step": 183}, {"loss": 0.5323810577392578, "token_acc": 0.8205128205128205, "grad_norm": 2.5113790035247803, "learning_rate": 5.7349936736888e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022419, "epoch": 0.4819908316961362, "step": 184}, {"loss": 0.5650047659873962, "token_acc": 0.821501014198783, "grad_norm": 2.9842703342437744, "learning_rate": 5.692045753759701e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02242, "epoch": 0.48461034708578915, "step": 185}, {"loss": 0.5345799326896667, "token_acc": 0.7938342967244701, "grad_norm": 2.432101249694824, "learning_rate": 5.649045712596903e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022421, "epoch": 0.48722986247544203, "step": 186}, {"loss": 0.4752824008464813, "token_acc": 0.8322295805739515, "grad_norm": 2.267573595046997, "learning_rate": 5.60599678873656e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022422, "epoch": 0.48984937786509497, "step": 187}, {"loss": 0.48174723982810974, "token_acc": 0.8076190476190476, "grad_norm": 2.7236266136169434, "learning_rate": 5.562902224396415e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022423, "epoch": 0.49246889325474785, "step": 188}, {"loss": 0.5337547659873962, "token_acc": 0.8140043763676149, "grad_norm": 2.6596786975860596, "learning_rate": 5.519765265231608e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022424, "epoch": 0.4950884086444008, "step": 189}, {"loss": 0.5248522162437439, "token_acc": 0.8295964125560538, "grad_norm": 2.373962879180908, "learning_rate": 5.476589160090237e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022425, "epoch": 0.4977079240340537, "step": 190}, {"loss": 0.5135247707366943, "token_acc": 0.848780487804878, "grad_norm": 2.4927663803100586, "learning_rate": 5.433377160768668e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022426, "epoch": 0.5003274394237066, "step": 191}, {"loss": 0.4987615942955017, "token_acc": 0.8172690763052208, "grad_norm": 2.7499098777770996, "learning_rate": 5.390132521766625e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022427, "epoch": 0.5029469548133595, "step": 192}, {"loss": 0.4737977385520935, "token_acc": 0.8281573498964804, "grad_norm": 2.9827399253845215, "learning_rate": 5.346858500042079e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022428, "epoch": 0.5055664702030125, "step": 193}, {"loss": 0.48123741149902344, "token_acc": 0.8326530612244898, "grad_norm": 2.5349838733673096, "learning_rate": 5.303558354765959e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022429, "epoch": 0.5081859855926654, "step": 194}, {"loss": 0.5271561741828918, "token_acc": 0.8131212723658051, "grad_norm": 2.605069875717163, "learning_rate": 5.260235347076676e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02243, "epoch": 0.5108055009823183, "step": 195}, {"loss": 0.5425925254821777, "token_acc": 0.8325991189427313, "grad_norm": 2.842576265335083, "learning_rate": 5.216892739834519e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022431, "epoch": 0.5134250163719711, "step": 196}, {"loss": 0.5463793277740479, "token_acc": 0.811623246492986, "grad_norm": 2.7049286365509033, "learning_rate": 5.17353379737591e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022433, "epoch": 0.5160445317616241, "step": 197}, {"loss": 0.4965885877609253, "token_acc": 0.8532494758909853, "grad_norm": 2.7234854698181152, "learning_rate": 5.13016178526756e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022434, "epoch": 0.518664047151277, "step": 198}, {"loss": 0.5202344655990601, "token_acc": 0.8486842105263158, "grad_norm": 3.040390729904175, "learning_rate": 5.08677997006051e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022434, "epoch": 0.5212835625409299, "step": 199}, {"loss": 0.529319167137146, "token_acc": 0.7826961770623743, "grad_norm": 2.8432278633117676, "learning_rate": 5.043391619044122e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022436, "epoch": 0.5239030779305829, "step": 200}, {"loss": 0.501499593257904, "token_acc": 0.8084210526315789, "grad_norm": 2.3813958168029785, "learning_rate": 5e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022391, "epoch": 0.5265225933202358, "step": 201}, {"loss": 0.5728980302810669, "token_acc": 0.7804878048780488, "grad_norm": 2.7652013301849365, "learning_rate": 4.956608380955877e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022391, "epoch": 0.5291421087098886, "step": 202}, {"loss": 0.5111644268035889, "token_acc": 0.838074398249453, "grad_norm": 2.907130718231201, "learning_rate": 4.913220029939491e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022393, "epoch": 0.5317616240995415, "step": 203}, {"loss": 0.5002152919769287, "token_acc": 0.8050847457627118, "grad_norm": 2.4572055339813232, "learning_rate": 4.86983821473244e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022394, "epoch": 0.5343811394891945, "step": 204}, {"loss": 0.501746416091919, "token_acc": 0.823394495412844, "grad_norm": 2.621178150177002, "learning_rate": 4.82646620262409e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022395, "epoch": 0.5370006548788474, "step": 205}, {"loss": 0.45794087648391724, "token_acc": 0.8240343347639485, "grad_norm": 2.285428285598755, "learning_rate": 4.783107260165483e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022396, "epoch": 0.5396201702685003, "step": 206}, {"loss": 0.4905679225921631, "token_acc": 0.8325892857142857, "grad_norm": 2.4800000190734863, "learning_rate": 4.739764652923326e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022398, "epoch": 0.5422396856581533, "step": 207}, {"loss": 0.5248773097991943, "token_acc": 0.8127853881278538, "grad_norm": 2.823551654815674, "learning_rate": 4.696441645234041e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022399, "epoch": 0.5448592010478062, "step": 208}, {"loss": 0.5368592143058777, "token_acc": 0.8722358722358723, "grad_norm": 2.6052966117858887, "learning_rate": 4.6531414999579194e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.0224, "epoch": 0.547478716437459, "step": 209}, {"loss": 0.4909115433692932, "token_acc": 0.8090909090909091, "grad_norm": 2.4320971965789795, "learning_rate": 4.609867478233376e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022401, "epoch": 0.550098231827112, "step": 210}, {"loss": 0.5095679759979248, "token_acc": 0.8176795580110497, "grad_norm": 2.5677363872528076, "learning_rate": 4.566622839231331e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022402, "epoch": 0.5527177472167649, "step": 211}, {"loss": 0.5090851187705994, "token_acc": 0.8045738045738046, "grad_norm": 2.5811502933502197, "learning_rate": 4.523410839909763e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022403, "epoch": 0.5553372626064178, "step": 212}, {"loss": 0.49449896812438965, "token_acc": 0.8275862068965517, "grad_norm": 2.5541276931762695, "learning_rate": 4.480234734768392e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022404, "epoch": 0.5579567779960707, "step": 213}, {"loss": 0.5223034620285034, "token_acc": 0.8127490039840638, "grad_norm": 2.549534797668457, "learning_rate": 4.4370977756035865e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022405, "epoch": 0.5605762933857237, "step": 214}, {"loss": 0.4867344796657562, "token_acc": 0.8042553191489362, "grad_norm": 2.7251574993133545, "learning_rate": 4.39400321126344e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022406, "epoch": 0.5631958087753766, "step": 215}, {"loss": 0.49226635694503784, "token_acc": 0.8221153846153846, "grad_norm": 2.6882388591766357, "learning_rate": 4.350954287403099e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022406, "epoch": 0.5658153241650294, "step": 216}, {"loss": 0.5454949736595154, "token_acc": 0.8254716981132075, "grad_norm": 2.7445809841156006, "learning_rate": 4.307954246240299e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022407, "epoch": 0.5684348395546824, "step": 217}, {"loss": 0.5441815853118896, "token_acc": 0.8199052132701422, "grad_norm": 3.213698387145996, "learning_rate": 4.2650063263111983e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022409, "epoch": 0.5710543549443353, "step": 218}, {"loss": 0.49353039264678955, "token_acc": 0.8533333333333334, "grad_norm": 2.593226671218872, "learning_rate": 4.2221137622264593e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02241, "epoch": 0.5736738703339882, "step": 219}, {"loss": 0.49543607234954834, "token_acc": 0.8104738154613467, "grad_norm": 2.444615125656128, "learning_rate": 4.179279784427624e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022411, "epoch": 0.5762933857236411, "step": 220}, {"loss": 0.45800793170928955, "token_acc": 0.8252631578947368, "grad_norm": 2.395808219909668, "learning_rate": 4.1365076189438363e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022412, "epoch": 0.5789129011132941, "step": 221}, {"loss": 0.5338102579116821, "token_acc": 0.8119834710743802, "grad_norm": 2.540519952774048, "learning_rate": 4.0938004871488564e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022413, "epoch": 0.581532416502947, "step": 222}, {"loss": 0.5074054002761841, "token_acc": 0.7995594713656388, "grad_norm": 2.428532838821411, "learning_rate": 4.0511616055184515e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022414, "epoch": 0.5841519318925998, "step": 223}, {"loss": 0.5541911721229553, "token_acc": 0.8306264501160093, "grad_norm": 3.5335779190063477, "learning_rate": 4.0085941853881456e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022415, "epoch": 0.5867714472822528, "step": 224}, {"loss": 0.46316903829574585, "token_acc": 0.8368200836820083, "grad_norm": 2.4529342651367188, "learning_rate": 3.966101432711363e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022415, "epoch": 0.5893909626719057, "step": 225}, {"loss": 0.5350896716117859, "token_acc": 0.8421052631578947, "grad_norm": 2.6229522228240967, "learning_rate": 3.923686547817968e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022416, "epoch": 0.5920104780615586, "step": 226}, {"loss": 0.49788808822631836, "token_acc": 0.8295454545454546, "grad_norm": 2.8776233196258545, "learning_rate": 3.88135272517323e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022417, "epoch": 0.5946299934512115, "step": 227}, {"loss": 0.4753144681453705, "token_acc": 0.8524173027989822, "grad_norm": 2.695081949234009, "learning_rate": 3.839103153137247e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022418, "epoch": 0.5972495088408645, "step": 228}, {"loss": 0.5296587944030762, "token_acc": 0.8051948051948052, "grad_norm": 2.6119205951690674, "learning_rate": 3.796941013724795e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022418, "epoch": 0.5998690242305174, "step": 229}, {"loss": 0.5101851224899292, "token_acc": 0.8223684210526315, "grad_norm": 2.785724639892578, "learning_rate": 3.754869482365694e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022419, "epoch": 0.6024885396201702, "step": 230}, {"loss": 0.4709198474884033, "token_acc": 0.8128654970760234, "grad_norm": 2.3624894618988037, "learning_rate": 3.71289172766564e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02242, "epoch": 0.6051080550098232, "step": 231}, {"loss": 0.49558591842651367, "token_acc": 0.8422174840085288, "grad_norm": 2.687056303024292, "learning_rate": 3.671010911167571e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022421, "epoch": 0.6077275703994761, "step": 232}, {"loss": 0.48437756299972534, "token_acc": 0.819672131147541, "grad_norm": 2.462113380432129, "learning_rate": 3.629230187113542e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022422, "epoch": 0.610347085789129, "step": 233}, {"loss": 0.48182329535484314, "token_acc": 0.8254716981132075, "grad_norm": 3.077392101287842, "learning_rate": 3.5875527022071806e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022423, "epoch": 0.6129666011787819, "step": 234}, {"loss": 0.5311368703842163, "token_acc": 0.8495370370370371, "grad_norm": 2.6840720176696777, "learning_rate": 3.5459815953766883e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022424, "epoch": 0.6155861165684349, "step": 235}, {"loss": 0.49189090728759766, "token_acc": 0.8444976076555024, "grad_norm": 2.446159839630127, "learning_rate": 3.504519997538422e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022424, "epoch": 0.6182056319580878, "step": 236}, {"loss": 0.4624224901199341, "token_acc": 0.8636363636363636, "grad_norm": 2.4284424781799316, "learning_rate": 3.463171031361111e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022425, "epoch": 0.6208251473477406, "step": 237}, {"loss": 0.5315607786178589, "token_acc": 0.8290598290598291, "grad_norm": 2.662201166152954, "learning_rate": 3.421937811030652e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022426, "epoch": 0.6234446627373936, "step": 238}, {"loss": 0.5527917146682739, "token_acc": 0.8029045643153527, "grad_norm": 2.610382318496704, "learning_rate": 3.3808234420155816e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022427, "epoch": 0.6260641781270465, "step": 239}, {"loss": 0.48870691657066345, "token_acc": 0.8282608695652174, "grad_norm": 2.4183199405670166, "learning_rate": 3.3398310208331803e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022428, "epoch": 0.6286836935166994, "step": 240}, {"loss": 0.4696168601512909, "token_acc": 0.8210290827740492, "grad_norm": 2.7862048149108887, "learning_rate": 3.298963634816263e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022428, "epoch": 0.6313032089063523, "step": 241}, {"loss": 0.5115398168563843, "token_acc": 0.8241758241758241, "grad_norm": 2.729469060897827, "learning_rate": 3.258224361880657e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022429, "epoch": 0.6339227242960053, "step": 242}, {"loss": 0.5091263651847839, "token_acc": 0.8130630630630631, "grad_norm": 2.746661901473999, "learning_rate": 3.217616270293381e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02243, "epoch": 0.6365422396856582, "step": 243}, {"loss": 0.4814063012599945, "token_acc": 0.8465116279069768, "grad_norm": 2.293954610824585, "learning_rate": 3.177142418441578e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02243, "epoch": 0.639161755075311, "step": 244}, {"loss": 0.4885210692882538, "token_acc": 0.8297872340425532, "grad_norm": 2.718621253967285, "learning_rate": 3.136805854602152e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022431, "epoch": 0.641781270464964, "step": 245}, {"loss": 0.5710694789886475, "token_acc": 0.7926829268292683, "grad_norm": 9.916930198669434, "learning_rate": 3.096609616712207e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022431, "epoch": 0.6444007858546169, "step": 246}, {"loss": 0.5229955911636353, "token_acc": 0.828693790149893, "grad_norm": 2.8500070571899414, "learning_rate": 3.05655673214023e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022432, "epoch": 0.6470203012442698, "step": 247}, {"loss": 0.522060751914978, "token_acc": 0.7762863534675615, "grad_norm": 2.685305118560791, "learning_rate": 3.016650217458101e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022433, "epoch": 0.6496398166339227, "step": 248}, {"loss": 0.5302181243896484, "token_acc": 0.8008752735229759, "grad_norm": 3.060060977935791, "learning_rate": 2.976893078213882e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022433, "epoch": 0.6522593320235757, "step": 249}, {"loss": 0.4794139862060547, "token_acc": 0.8017429193899782, "grad_norm": 2.5100290775299072, "learning_rate": 2.9372883087054744e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022434, "epoch": 0.6548788474132285, "step": 250}, {"loss": 0.5082091093063354, "token_acc": 0.8414634146341463, "grad_norm": 2.5010130405426025, "learning_rate": 2.897838891755093e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022395, "epoch": 0.6574983628028814, "step": 251}, {"loss": 0.4648599624633789, "token_acc": 0.8354430379746836, "grad_norm": 2.600001096725464, "learning_rate": 2.858547798484613e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022396, "epoch": 0.6601178781925344, "step": 252}, {"loss": 0.49137982726097107, "token_acc": 0.8261851015801355, "grad_norm": 2.5828001499176025, "learning_rate": 2.8194179880918133e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022397, "epoch": 0.6627373935821873, "step": 253}, {"loss": 0.5322656631469727, "token_acc": 0.7837837837837838, "grad_norm": 2.649562120437622, "learning_rate": 2.7804524076274896e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022398, "epoch": 0.6653569089718402, "step": 254}, {"loss": 0.532645583152771, "token_acc": 0.8054968287526427, "grad_norm": 2.439877510070801, "learning_rate": 2.7416539917735124e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022399, "epoch": 0.6679764243614931, "step": 255}, {"loss": 0.48848390579223633, "token_acc": 0.7975460122699386, "grad_norm": 2.472744941711426, "learning_rate": 2.7030256626217927e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.0224, "epoch": 0.6705959397511461, "step": 256}, {"loss": 0.4855130612850189, "token_acc": 0.818, "grad_norm": 2.721924066543579, "learning_rate": 2.664570329454211e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022401, "epoch": 0.673215455140799, "step": 257}, {"loss": 0.5380468368530273, "token_acc": 0.8137254901960784, "grad_norm": 2.8564412593841553, "learning_rate": 2.626290888523504e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022401, "epoch": 0.6758349705304518, "step": 258}, {"loss": 0.5244271755218506, "token_acc": 0.8389830508474576, "grad_norm": 2.6810224056243896, "learning_rate": 2.588190222835127e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022402, "epoch": 0.6784544859201048, "step": 259}, {"loss": 0.4440711736679077, "token_acc": 0.7978260869565217, "grad_norm": 2.537179946899414, "learning_rate": 2.5502712019301357e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022403, "epoch": 0.6810740013097577, "step": 260}, {"loss": 0.5143766403198242, "token_acc": 0.8228699551569507, "grad_norm": 2.339520215988159, "learning_rate": 2.512536681669055e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022404, "epoch": 0.6836935166994106, "step": 261}, {"loss": 0.4380583167076111, "token_acc": 0.8304721030042919, "grad_norm": 2.7531886100769043, "learning_rate": 2.4749895040167977e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022405, "epoch": 0.6863130320890635, "step": 262}, {"loss": 0.4871158003807068, "token_acc": 0.811088295687885, "grad_norm": 2.6661734580993652, "learning_rate": 2.437632496828615e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022406, "epoch": 0.6889325474787165, "step": 263}, {"loss": 0.5000735521316528, "token_acc": 0.8352402745995423, "grad_norm": 2.596735715866089, "learning_rate": 2.400468473637127e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022407, "epoch": 0.6915520628683693, "step": 264}, {"loss": 0.5367918014526367, "token_acc": 0.8194444444444444, "grad_norm": 2.601776361465454, "learning_rate": 2.3635002334404197e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022407, "epoch": 0.6941715782580222, "step": 265}, {"loss": 0.5354303121566772, "token_acc": 0.8053830227743272, "grad_norm": 2.6749372482299805, "learning_rate": 2.3267305604912296e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022408, "epoch": 0.6967910936476752, "step": 266}, {"loss": 0.46658238768577576, "token_acc": 0.8319672131147541, "grad_norm": 2.5614287853240967, "learning_rate": 2.2901622240872638e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022409, "epoch": 0.6994106090373281, "step": 267}, {"loss": 0.5196970701217651, "token_acc": 0.8178294573643411, "grad_norm": 2.2620811462402344, "learning_rate": 2.2537979783626165e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022409, "epoch": 0.702030124426981, "step": 268}, {"loss": 0.5194627046585083, "token_acc": 0.7931034482758621, "grad_norm": 4.23315954208374, "learning_rate": 2.2176405620803573e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02241, "epoch": 0.704649639816634, "step": 269}, {"loss": 0.5166223049163818, "token_acc": 0.8175519630484989, "grad_norm": 2.9118361473083496, "learning_rate": 2.181692698426245e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022411, "epoch": 0.7072691552062869, "step": 270}, {"loss": 0.5305249094963074, "token_acc": 0.8062622309197651, "grad_norm": 2.695871114730835, "learning_rate": 2.1459570948036483e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022411, "epoch": 0.7098886705959397, "step": 271}, {"loss": 0.5087628364562988, "token_acc": 0.8084210526315789, "grad_norm": 2.6976077556610107, "learning_rate": 2.1104364426296233e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022412, "epoch": 0.7125081859855926, "step": 272}, {"loss": 0.5207610130310059, "token_acc": 0.8451025056947609, "grad_norm": 2.4147698879241943, "learning_rate": 2.0751334171322226e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022413, "epoch": 0.7151277013752456, "step": 273}, {"loss": 0.48224353790283203, "token_acc": 0.8297872340425532, "grad_norm": 2.8471062183380127, "learning_rate": 2.0400506771490077e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022414, "epoch": 0.7177472167648985, "step": 274}, {"loss": 0.5036338567733765, "token_acc": 0.8397129186602871, "grad_norm": 2.7108752727508545, "learning_rate": 2.0051908649267896e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022414, "epoch": 0.7203667321545514, "step": 275}, {"loss": 0.4490717351436615, "token_acc": 0.8565022421524664, "grad_norm": 3.2256128787994385, "learning_rate": 1.9705566059226447e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022415, "epoch": 0.7229862475442044, "step": 276}, {"loss": 0.49216294288635254, "token_acc": 0.8344370860927153, "grad_norm": 2.34182071685791, "learning_rate": 1.9361505086061687e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022416, "epoch": 0.7256057629338573, "step": 277}, {"loss": 0.5206338167190552, "token_acc": 0.8266666666666667, "grad_norm": 2.2245421409606934, "learning_rate": 1.901975164263025e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022417, "epoch": 0.7282252783235101, "step": 278}, {"loss": 0.5033029913902283, "token_acc": 0.8451612903225807, "grad_norm": 2.6252386569976807, "learning_rate": 1.8680331467997752e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022417, "epoch": 0.730844793713163, "step": 279}, {"loss": 0.477212518453598, "token_acc": 0.8181818181818182, "grad_norm": 3.4650518894195557, "learning_rate": 1.8343270125500377e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022418, "epoch": 0.733464309102816, "step": 280}, {"loss": 0.5319840908050537, "token_acc": 0.8392484342379958, "grad_norm": 2.5805790424346924, "learning_rate": 1.8008593000819518e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022419, "epoch": 0.7360838244924689, "step": 281}, {"loss": 0.5247941017150879, "token_acc": 0.808, "grad_norm": 2.2787704467773438, "learning_rate": 1.7676325300069822e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022419, "epoch": 0.7387033398821218, "step": 282}, {"loss": 0.506512463092804, "token_acc": 0.8521739130434782, "grad_norm": 2.355156660079956, "learning_rate": 1.7346492047900896e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02242, "epoch": 0.7413228552717748, "step": 283}, {"loss": 0.5132666826248169, "token_acc": 0.819672131147541, "grad_norm": 2.473111629486084, "learning_rate": 1.7019118085612472e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022421, "epoch": 0.7439423706614277, "step": 284}, {"loss": 0.5192793607711792, "token_acc": 0.7986725663716814, "grad_norm": 2.684074640274048, "learning_rate": 1.669422806928361e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022421, "epoch": 0.7465618860510805, "step": 285}, {"loss": 0.496182382106781, "token_acc": 0.8148984198645598, "grad_norm": 2.4538328647613525, "learning_rate": 1.6371846467915602e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022422, "epoch": 0.7491814014407334, "step": 286}, {"loss": 0.49362796545028687, "token_acc": 0.8534278959810875, "grad_norm": 2.261423349380493, "learning_rate": 1.6051997561589243e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022422, "epoch": 0.7518009168303864, "step": 287}, {"loss": 0.4902206063270569, "token_acc": 0.8114035087719298, "grad_norm": 2.6014106273651123, "learning_rate": 1.5734705439636014e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022423, "epoch": 0.7544204322200393, "step": 288}, {"loss": 0.5162099599838257, "token_acc": 0.8299319727891157, "grad_norm": 2.6062521934509277, "learning_rate": 1.5419993998823965e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022423, "epoch": 0.7570399476096922, "step": 289}, {"loss": 0.5113745927810669, "token_acc": 0.8411910669975186, "grad_norm": 2.600162982940674, "learning_rate": 1.510788694155785e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022424, "epoch": 0.7596594629993452, "step": 290}, {"loss": 0.4948694705963135, "token_acc": 0.8262910798122066, "grad_norm": 2.57735276222229, "learning_rate": 1.4798407774093953e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022424, "epoch": 0.762278978388998, "step": 291}, {"loss": 0.5052931308746338, "token_acc": 0.8253275109170306, "grad_norm": 2.4674313068389893, "learning_rate": 1.4491579804769815e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022425, "epoch": 0.7648984937786509, "step": 292}, {"loss": 0.5213600397109985, "token_acc": 0.8008565310492506, "grad_norm": 2.798729658126831, "learning_rate": 1.418742614224872e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022426, "epoch": 0.7675180091683038, "step": 293}, {"loss": 0.5459722876548767, "token_acc": 0.814663951120163, "grad_norm": 2.399430990219116, "learning_rate": 1.3885969693779276e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022426, "epoch": 0.7701375245579568, "step": 294}, {"loss": 0.5129090547561646, "token_acc": 0.8188235294117647, "grad_norm": 2.7269091606140137, "learning_rate": 1.3587233163470124e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022427, "epoch": 0.7727570399476097, "step": 295}, {"loss": 0.4781366288661957, "token_acc": 0.8636363636363636, "grad_norm": 2.3647940158843994, "learning_rate": 1.3291239050580083e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022428, "epoch": 0.7753765553372626, "step": 296}, {"loss": 0.5016049146652222, "token_acc": 0.7964071856287425, "grad_norm": 1.9940778017044067, "learning_rate": 1.2998009647823544e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022429, "epoch": 0.7779960707269156, "step": 297}, {"loss": 0.5513393878936768, "token_acc": 0.8009049773755657, "grad_norm": 2.5768721103668213, "learning_rate": 1.2707567039691502e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022429, "epoch": 0.7806155861165684, "step": 298}, {"loss": 0.48956871032714844, "token_acc": 0.7956777996070727, "grad_norm": 2.5610852241516113, "learning_rate": 1.2419933100788323e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02243, "epoch": 0.7832351015062213, "step": 299}, {"loss": 0.5429037809371948, "token_acc": 0.8357894736842105, "grad_norm": 2.627711296081543, "learning_rate": 1.2135129494184187e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022431, "epoch": 0.7858546168958742, "step": 300}, {"loss": 0.4991035461425781, "token_acc": 0.8011928429423459, "grad_norm": 2.7441539764404297, "learning_rate": 1.1853177669783643e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022398, "epoch": 0.7884741322855272, "step": 301}, {"loss": 0.4960366487503052, "token_acc": 0.8689655172413793, "grad_norm": 2.731919288635254, "learning_rate": 1.1574098862709992e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022398, "epoch": 0.7910936476751801, "step": 302}, {"loss": 0.4972999095916748, "token_acc": 0.8476658476658476, "grad_norm": 2.3937597274780273, "learning_rate": 1.1297914091706084e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022399, "epoch": 0.793713163064833, "step": 303}, {"loss": 0.5035632848739624, "token_acc": 0.8069815195071869, "grad_norm": 2.736624240875244, "learning_rate": 1.1024644157551205e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.0224, "epoch": 0.796332678454486, "step": 304}, {"loss": 0.5088968276977539, "token_acc": 0.853448275862069, "grad_norm": 2.5324654579162598, "learning_rate": 1.0754309641494542e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022401, "epoch": 0.7989521938441388, "step": 305}, {"loss": 0.4777457118034363, "token_acc": 0.790356394129979, "grad_norm": 2.750378131866455, "learning_rate": 1.0486930903705094e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022401, "epoch": 0.8015717092337917, "step": 306}, {"loss": 0.48083269596099854, "token_acc": 0.8322440087145969, "grad_norm": 2.480308771133423, "learning_rate": 1.0222528081738186e-07, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022402, "epoch": 0.8041912246234446, "step": 307}, {"loss": 0.46578383445739746, "token_acc": 0.867579908675799, "grad_norm": 2.6979966163635254, "learning_rate": 9.961121089018931e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022403, "epoch": 0.8068107400130976, "step": 308}, {"loss": 0.5252381563186646, "token_acc": 0.8245243128964059, "grad_norm": 2.738271474838257, "learning_rate": 9.702729613342359e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022403, "epoch": 0.8094302554027505, "step": 309}, {"loss": 0.530731201171875, "token_acc": 0.8021739130434783, "grad_norm": 2.190814256668091, "learning_rate": 9.4473731153907e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022404, "epoch": 0.8120497707924034, "step": 310}, {"loss": 0.4815801978111267, "token_acc": 0.8095238095238095, "grad_norm": 2.4832608699798584, "learning_rate": 9.195070827267632e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022405, "epoch": 0.8146692861820564, "step": 311}, {"loss": 0.5246912837028503, "token_acc": 0.8189134808853119, "grad_norm": 2.349517345428467, "learning_rate": 8.945841751049916e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022406, "epoch": 0.8172888015717092, "step": 312}, {"loss": 0.49589020013809204, "token_acc": 0.8142201834862385, "grad_norm": 2.821746349334717, "learning_rate": 8.699704657356194e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022406, "epoch": 0.8199083169613621, "step": 313}, {"loss": 0.5344954133033752, "token_acc": 0.8348214285714286, "grad_norm": 2.6706743240356445, "learning_rate": 8.456678083933289e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022407, "epoch": 0.822527832351015, "step": 314}, {"loss": 0.5088291168212891, "token_acc": 0.8353808353808354, "grad_norm": 2.4052066802978516, "learning_rate": 8.216780334260087e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022407, "epoch": 0.825147347740668, "step": 315}, {"loss": 0.5235204100608826, "token_acc": 0.8253275109170306, "grad_norm": 2.4350271224975586, "learning_rate": 7.980029476168942e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022408, "epoch": 0.8277668631303209, "step": 316}, {"loss": 0.5570900440216064, "token_acc": 0.8004246284501062, "grad_norm": 2.596780300140381, "learning_rate": 7.746443340484982e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022409, "epoch": 0.8303863785199738, "step": 317}, {"loss": 0.5230244398117065, "token_acc": 0.8457831325301205, "grad_norm": 2.4845240116119385, "learning_rate": 7.516039519683104e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022409, "epoch": 0.8330058939096268, "step": 318}, {"loss": 0.46841728687286377, "token_acc": 0.8468271334792122, "grad_norm": 2.2573208808898926, "learning_rate": 7.2888353665631e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.02241, "epoch": 0.8356254092992796, "step": 319}, {"loss": 0.517132043838501, "token_acc": 0.8142548596112311, "grad_norm": 2.806436061859131, "learning_rate": 7.064847992942613e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022411, "epoch": 0.8382449246889325, "step": 320}, {"loss": 0.5110062956809998, "token_acc": 0.8326086956521739, "grad_norm": 2.4178569316864014, "learning_rate": 6.844094268368484e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022412, "epoch": 0.8408644400785854, "step": 321}, {"loss": 0.542328953742981, "token_acc": 0.8055555555555556, "grad_norm": 2.495969772338867, "learning_rate": 6.626590818846162e-08, "memory(GiB)": 81.73, "train_speed(iter/s)": 0.022412, "epoch": 0.8434839554682384, "step": 322}, {"loss": 0.4902624487876892, "token_acc": 0.8574561403508771, "grad_norm": 2.962527275085449, "learning_rate": 6.412354025587507e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022412, "epoch": 0.8461034708578913, "step": 323}, {"loss": 0.5363924503326416, "token_acc": 0.7979591836734694, "grad_norm": 2.499372959136963, "learning_rate": 6.201400023777103e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022413, "epoch": 0.8487229862475442, "step": 324}, {"loss": 0.522162139415741, "token_acc": 0.8052208835341366, "grad_norm": 2.7732133865356445, "learning_rate": 5.99374470135699e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022414, "epoch": 0.8513425016371972, "step": 325}, {"loss": 0.4886404871940613, "token_acc": 0.8187372708757638, "grad_norm": 2.403465986251831, "learning_rate": 5.789403697830103e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022414, "epoch": 0.85396201702685, "step": 326}, {"loss": 0.5211939215660095, "token_acc": 0.7917485265225933, "grad_norm": 2.3210058212280273, "learning_rate": 5.588392403082337e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022415, "epoch": 0.8565815324165029, "step": 327}, {"loss": 0.5272901654243469, "token_acc": 0.8532731376975169, "grad_norm": 2.26873779296875, "learning_rate": 5.39072595622353e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022416, "epoch": 0.8592010478061559, "step": 328}, {"loss": 0.4871232509613037, "token_acc": 0.847870182555781, "grad_norm": 2.484192132949829, "learning_rate": 5.196419244447231e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022416, "epoch": 0.8618205631958088, "step": 329}, {"loss": 0.4857238233089447, "token_acc": 0.8247863247863247, "grad_norm": 2.297212839126587, "learning_rate": 5.005486901909428e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022417, "epoch": 0.8644400785854617, "step": 330}, {"loss": 0.4841687083244324, "token_acc": 0.8377192982456141, "grad_norm": 2.3831281661987305, "learning_rate": 4.817943308626487e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022417, "epoch": 0.8670595939751146, "step": 331}, {"loss": 0.5332281589508057, "token_acc": 0.805327868852459, "grad_norm": 2.755554437637329, "learning_rate": 4.6338025893920164e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022418, "epoch": 0.8696791093647676, "step": 332}, {"loss": 0.5020807385444641, "token_acc": 0.820675105485232, "grad_norm": 2.5590343475341797, "learning_rate": 4.453078612713157e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022418, "epoch": 0.8722986247544204, "step": 333}, {"loss": 0.47612449526786804, "token_acc": 0.803921568627451, "grad_norm": 2.3644652366638184, "learning_rate": 4.275784989765985e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022419, "epoch": 0.8749181401440733, "step": 334}, {"loss": 0.5420501828193665, "token_acc": 0.7937915742793792, "grad_norm": 2.479030132293701, "learning_rate": 4.101935073370466e-08, "memory(GiB)": 82.67, "train_speed(iter/s)": 0.022419, "epoch": 0.8775376555337263, "step": 335}, {"loss": 0.5320147275924683, "token_acc": 0.8089171974522293, "grad_norm": 2.6694037914276123, "learning_rate": 3.93154195698478e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022419, "epoch": 0.8801571709233792, "step": 336}, {"loss": 0.5496601462364197, "token_acc": 0.8284023668639053, "grad_norm": 2.391843318939209, "learning_rate": 3.764618473719128e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.02242, "epoch": 0.8827766863130321, "step": 337}, {"loss": 0.5212337970733643, "token_acc": 0.8247191011235955, "grad_norm": 2.9633660316467285, "learning_rate": 3.601177195369304e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022421, "epoch": 0.885396201702685, "step": 338}, {"loss": 0.5131974220275879, "token_acc": 0.8288100208768268, "grad_norm": 2.7280421257019043, "learning_rate": 3.44123043146976e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022421, "epoch": 0.888015717092338, "step": 339}, {"loss": 0.5110889673233032, "token_acc": 0.8174442190669371, "grad_norm": 2.487159013748169, "learning_rate": 3.284790228366602e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022422, "epoch": 0.8906352324819908, "step": 340}, {"loss": 0.5346288681030273, "token_acc": 0.805045871559633, "grad_norm": 2.7584152221679688, "learning_rate": 3.1318683683102756e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022422, "epoch": 0.8932547478716437, "step": 341}, {"loss": 0.47114405035972595, "token_acc": 0.8077753779697624, "grad_norm": 2.18638014793396, "learning_rate": 2.9824763685681765e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022423, "epoch": 0.8958742632612967, "step": 342}, {"loss": 0.4962024688720703, "token_acc": 0.8477751756440282, "grad_norm": 2.443735122680664, "learning_rate": 2.8366254805572642e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022423, "epoch": 0.8984937786509496, "step": 343}, {"loss": 0.5160344243049622, "token_acc": 0.8525641025641025, "grad_norm": 2.4292070865631104, "learning_rate": 2.694326688996662e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022424, "epoch": 0.9011132940406025, "step": 344}, {"loss": 0.5207871198654175, "token_acc": 0.8218884120171673, "grad_norm": 2.3588297367095947, "learning_rate": 2.5555907110803355e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022424, "epoch": 0.9037328094302554, "step": 345}, {"loss": 0.5402622818946838, "token_acc": 0.8054474708171206, "grad_norm": 2.8959455490112305, "learning_rate": 2.420427995669899e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022425, "epoch": 0.9063523248199084, "step": 346}, {"loss": 0.5097421407699585, "token_acc": 0.7718940936863544, "grad_norm": 2.364020824432373, "learning_rate": 2.2888487225077356e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022425, "epoch": 0.9089718402095612, "step": 347}, {"loss": 0.5547090172767639, "token_acc": 0.8212765957446808, "grad_norm": 2.529003620147705, "learning_rate": 2.160862801450236e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022425, "epoch": 0.9115913555992141, "step": 348}, {"loss": 0.5153399705886841, "token_acc": 0.8030634573304157, "grad_norm": 2.3784689903259277, "learning_rate": 2.0364798717215082e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022426, "epoch": 0.9142108709888671, "step": 349}, {"loss": 0.474487841129303, "token_acc": 0.8677685950413223, "grad_norm": 2.315887451171875, "learning_rate": 1.9157093011873348e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022427, "epoch": 0.91683038637852, "step": 350}, {"loss": 0.4768972098827362, "token_acc": 0.852054794520548, "grad_norm": 2.3234336376190186, "learning_rate": 1.7985601856496947e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022398, "epoch": 0.9194499017681729, "step": 351}, {"loss": 0.4942987561225891, "token_acc": 0.806060606060606, "grad_norm": 2.5884788036346436, "learning_rate": 1.6850413481616867e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022398, "epoch": 0.9220694171578258, "step": 352}, {"loss": 0.5371567606925964, "token_acc": 0.7987551867219918, "grad_norm": 2.440413475036621, "learning_rate": 1.5751613383630126e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022399, "epoch": 0.9246889325474787, "step": 353}, {"loss": 0.5101127028465271, "token_acc": 0.83125, "grad_norm": 2.5329740047454834, "learning_rate": 1.4689284318360918e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.0224, "epoch": 0.9273084479371316, "step": 354}, {"loss": 0.4840486943721771, "token_acc": 0.8271334792122538, "grad_norm": 2.314850091934204, "learning_rate": 1.3663506294827653e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.0224, "epoch": 0.9299279633267845, "step": 355}, {"loss": 0.5082566738128662, "token_acc": 0.8129032258064516, "grad_norm": 2.829533338546753, "learning_rate": 1.2674356569217282e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022401, "epoch": 0.9325474787164375, "step": 356}, {"loss": 0.5485932230949402, "token_acc": 0.843400447427293, "grad_norm": 2.714642286300659, "learning_rate": 1.1721909639066496e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022402, "epoch": 0.9351669941060904, "step": 357}, {"loss": 0.4964979588985443, "token_acc": 0.8545081967213115, "grad_norm": 2.0323498249053955, "learning_rate": 1.080623723765134e-08, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022402, "epoch": 0.9377865094957433, "step": 358}, {"loss": 0.5004359483718872, "token_acc": 0.796137339055794, "grad_norm": 2.5099165439605713, "learning_rate": 9.927408328584409e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022403, "epoch": 0.9404060248853962, "step": 359}, {"loss": 0.5286253094673157, "token_acc": 0.8045977011494253, "grad_norm": 2.6409194469451904, "learning_rate": 9.085489100620735e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022404, "epoch": 0.9430255402750491, "step": 360}, {"loss": 0.4993132948875427, "token_acc": 0.8568281938325991, "grad_norm": 2.3924882411956787, "learning_rate": 8.280542962673165e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022404, "epoch": 0.945645055664702, "step": 361}, {"loss": 0.4910353124141693, "token_acc": 0.823045267489712, "grad_norm": 2.5378451347351074, "learning_rate": 7.512630539036502e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022405, "epoch": 0.9482645710543549, "step": 362}, {"loss": 0.5635290145874023, "token_acc": 0.8061855670103093, "grad_norm": 2.6607308387756348, "learning_rate": 6.781809664821558e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022406, "epoch": 0.9508840864440079, "step": 363}, {"loss": 0.5150756239891052, "token_acc": 0.813953488372093, "grad_norm": 2.3563029766082764, "learning_rate": 6.0881353815994126e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022406, "epoch": 0.9535036018336608, "step": 364}, {"loss": 0.4837515354156494, "token_acc": 0.8075396825396826, "grad_norm": 2.5509421825408936, "learning_rate": 5.431659933256172e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022407, "epoch": 0.9561231172233137, "step": 365}, {"loss": 0.468533992767334, "token_acc": 0.8382352941176471, "grad_norm": 2.1849524974823, "learning_rate": 4.812432762057672e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022408, "epoch": 0.9587426326129665, "step": 366}, {"loss": 0.5219249129295349, "token_acc": 0.819371727748691, "grad_norm": 3.5655651092529297, "learning_rate": 4.230500504926404e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022408, "epoch": 0.9613621480026195, "step": 367}, {"loss": 0.47167691588401794, "token_acc": 0.8340336134453782, "grad_norm": 2.9407241344451904, "learning_rate": 3.6859069899286554e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022409, "epoch": 0.9639816633922724, "step": 368}, {"loss": 0.4640706479549408, "token_acc": 0.814498933901919, "grad_norm": 2.0513827800750732, "learning_rate": 3.1786932329736506e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022409, "epoch": 0.9666011787819253, "step": 369}, {"loss": 0.5055848956108093, "token_acc": 0.8288659793814434, "grad_norm": 2.4933784008026123, "learning_rate": 2.7088974347246885e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.02241, "epoch": 0.9692206941715783, "step": 370}, {"loss": 0.5173639059066772, "token_acc": 0.7977777777777778, "grad_norm": 2.9046876430511475, "learning_rate": 2.2765549777217786e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022411, "epoch": 0.9718402095612312, "step": 371}, {"loss": 0.517285168170929, "token_acc": 0.821917808219178, "grad_norm": 2.2502334117889404, "learning_rate": 1.8816984237169375e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022411, "epoch": 0.9744597249508841, "step": 372}, {"loss": 0.46636566519737244, "token_acc": 0.8148936170212766, "grad_norm": 2.2861053943634033, "learning_rate": 1.5243575112218744e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022411, "epoch": 0.977079240340537, "step": 373}, {"loss": 0.5341597199440002, "token_acc": 0.8185840707964602, "grad_norm": 2.358703136444092, "learning_rate": 1.2045591532681143e-09, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022412, "epoch": 0.9796987557301899, "step": 374}, {"loss": 0.505860447883606, "token_acc": 0.84251968503937, "grad_norm": 2.331482172012329, "learning_rate": 9.223274353802324e-10, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022413, "epoch": 0.9823182711198428, "step": 375}, {"loss": 0.49814486503601074, "token_acc": 0.8306997742663657, "grad_norm": 2.6546504497528076, "learning_rate": 6.77683613761526e-10, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022413, "epoch": 0.9849377865094957, "step": 376}, {"loss": 0.4845404028892517, "token_acc": 0.8176352705410822, "grad_norm": 2.2732958793640137, "learning_rate": 4.706461136935736e-10, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022414, "epoch": 0.9875573018991487, "step": 377}, {"loss": 0.5144809484481812, "token_acc": 0.8039647577092511, "grad_norm": 2.7591075897216797, "learning_rate": 3.01230528148122e-10, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022415, "epoch": 0.9901768172888016, "step": 378}, {"loss": 0.5521637797355652, "token_acc": 0.7885835095137421, "grad_norm": 2.634091854095459, "learning_rate": 1.6944961661297108e-10, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022415, "epoch": 0.9927963326784545, "step": 379}, {"loss": 0.5316582918167114, "token_acc": 0.80622009569378, "grad_norm": 2.5303945541381836, "learning_rate": 7.53133041307974e-11, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022416, "epoch": 0.9954158480681073, "step": 380}, {"loss": 0.5218433141708374, "token_acc": 0.8148893360160966, "grad_norm": 2.370922565460205, "learning_rate": 1.8828680551918885e-11, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022416, "epoch": 0.9980353634577603, "step": 381}, {"loss": 0.49973976612091064, "token_acc": 0.8237179487179487, "grad_norm": 3.362750768661499, "learning_rate": 0.0, "memory(GiB)": 83.0, "train_speed(iter/s)": 0.022436, "epoch": 1.0, "step": 382}, {"eval_loss": 0.476247638463974, "eval_token_acc": 0.8304699572766112, "eval_runtime": 139.0135, "eval_samples_per_second": 2.662, "eval_steps_per_second": 0.115, "epoch": 1.0, "step": 382}, {"train_runtime": 17179.794, "train_samples_per_second": 2.133, "train_steps_per_second": 0.022, "total_flos": 7.42031915510399e+18, "train_loss": 0.600046001725796, "epoch": 1.0, "step": 382}], "memory": 83.0} |
|
|