| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9997672540080229, | |
| "eval_steps": 1000, | |
| "global_step": 2282, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0021905505127257294, | |
| "grad_norm": 18.828125, | |
| "learning_rate": 4.366812227074236e-07, | |
| "loss": 23.3068, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.004381101025451459, | |
| "grad_norm": 26.671875, | |
| "learning_rate": 8.733624454148472e-07, | |
| "loss": 23.6989, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.006571651538177188, | |
| "grad_norm": 15.6796875, | |
| "learning_rate": 1.3100436681222709e-06, | |
| "loss": 22.5467, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.008762202050902917, | |
| "grad_norm": 23.453125, | |
| "learning_rate": 1.7467248908296944e-06, | |
| "loss": 23.4159, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.010952752563628647, | |
| "grad_norm": 27.640625, | |
| "learning_rate": 2.183406113537118e-06, | |
| "loss": 23.3828, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.013143303076354376, | |
| "grad_norm": 13.21875, | |
| "learning_rate": 2.6200873362445417e-06, | |
| "loss": 23.1888, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.015333853589080106, | |
| "grad_norm": 14.28125, | |
| "learning_rate": 3.0567685589519653e-06, | |
| "loss": 23.2673, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.017524404101805835, | |
| "grad_norm": 16.484375, | |
| "learning_rate": 3.493449781659389e-06, | |
| "loss": 23.3523, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.019714954614531564, | |
| "grad_norm": 40.625, | |
| "learning_rate": 3.930131004366812e-06, | |
| "loss": 23.8635, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.021905505127257294, | |
| "grad_norm": 17.96875, | |
| "learning_rate": 4.366812227074236e-06, | |
| "loss": 23.2009, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.024096055639983023, | |
| "grad_norm": 24.34375, | |
| "learning_rate": 4.80349344978166e-06, | |
| "loss": 24.8502, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.026286606152708752, | |
| "grad_norm": 56.125, | |
| "learning_rate": 5.2401746724890834e-06, | |
| "loss": 23.8839, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.028477156665434482, | |
| "grad_norm": 24.03125, | |
| "learning_rate": 5.676855895196507e-06, | |
| "loss": 24.2237, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.03066770717816021, | |
| "grad_norm": 32.96875, | |
| "learning_rate": 6.1135371179039305e-06, | |
| "loss": 24.3763, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.03285825769088594, | |
| "grad_norm": 38.40625, | |
| "learning_rate": 6.550218340611354e-06, | |
| "loss": 24.7635, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.03504880820361167, | |
| "grad_norm": 27.96875, | |
| "learning_rate": 6.986899563318778e-06, | |
| "loss": 24.4172, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0372393587163374, | |
| "grad_norm": 41.75, | |
| "learning_rate": 7.423580786026201e-06, | |
| "loss": 24.4402, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.03942990922906313, | |
| "grad_norm": 22.140625, | |
| "learning_rate": 7.860262008733624e-06, | |
| "loss": 24.2919, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.04162045974178886, | |
| "grad_norm": 26.375, | |
| "learning_rate": 8.296943231441049e-06, | |
| "loss": 25.0967, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.04381101025451459, | |
| "grad_norm": 69.6875, | |
| "learning_rate": 8.733624454148473e-06, | |
| "loss": 24.6318, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04600156076724032, | |
| "grad_norm": 21.0, | |
| "learning_rate": 9.170305676855896e-06, | |
| "loss": 25.4068, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.048192111279966046, | |
| "grad_norm": 33.34375, | |
| "learning_rate": 9.60698689956332e-06, | |
| "loss": 24.4346, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.050382661792691776, | |
| "grad_norm": 60.25, | |
| "learning_rate": 1.0043668122270742e-05, | |
| "loss": 25.8622, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.052573212305417505, | |
| "grad_norm": 47.0625, | |
| "learning_rate": 1.0480349344978167e-05, | |
| "loss": 26.2608, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.054763762818143234, | |
| "grad_norm": 65.125, | |
| "learning_rate": 1.0917030567685592e-05, | |
| "loss": 26.0549, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.056954313330868964, | |
| "grad_norm": 40.46875, | |
| "learning_rate": 1.1353711790393014e-05, | |
| "loss": 26.1449, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.05914486384359469, | |
| "grad_norm": 25.875, | |
| "learning_rate": 1.179039301310044e-05, | |
| "loss": 25.1006, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.06133541435632042, | |
| "grad_norm": 52.65625, | |
| "learning_rate": 1.2227074235807861e-05, | |
| "loss": 25.8101, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.06352596486904616, | |
| "grad_norm": 29.21875, | |
| "learning_rate": 1.2663755458515286e-05, | |
| "loss": 25.9608, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.06571651538177188, | |
| "grad_norm": 87.125, | |
| "learning_rate": 1.3100436681222708e-05, | |
| "loss": 26.1509, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.06790706589449762, | |
| "grad_norm": 52.0625, | |
| "learning_rate": 1.3537117903930132e-05, | |
| "loss": 25.8696, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.07009761640722334, | |
| "grad_norm": 43.3125, | |
| "learning_rate": 1.3973799126637555e-05, | |
| "loss": 26.6963, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.07228816691994908, | |
| "grad_norm": 199.75, | |
| "learning_rate": 1.4410480349344979e-05, | |
| "loss": 26.2112, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.0744787174326748, | |
| "grad_norm": 47.5625, | |
| "learning_rate": 1.4847161572052402e-05, | |
| "loss": 26.319, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.07666926794540053, | |
| "grad_norm": 122.0625, | |
| "learning_rate": 1.5283842794759826e-05, | |
| "loss": 27.5255, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.07885981845812626, | |
| "grad_norm": 43.40625, | |
| "learning_rate": 1.5720524017467248e-05, | |
| "loss": 26.8478, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.081050368970852, | |
| "grad_norm": 151.625, | |
| "learning_rate": 1.6157205240174673e-05, | |
| "loss": 26.8845, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.08324091948357772, | |
| "grad_norm": 83.75, | |
| "learning_rate": 1.6593886462882098e-05, | |
| "loss": 26.692, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.08543146999630345, | |
| "grad_norm": 79.5, | |
| "learning_rate": 1.703056768558952e-05, | |
| "loss": 26.7475, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.08762202050902917, | |
| "grad_norm": 70.3125, | |
| "learning_rate": 1.7467248908296945e-05, | |
| "loss": 26.1916, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08981257102175491, | |
| "grad_norm": 70.4375, | |
| "learning_rate": 1.7903930131004367e-05, | |
| "loss": 25.9641, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.09200312153448063, | |
| "grad_norm": 53.53125, | |
| "learning_rate": 1.8340611353711792e-05, | |
| "loss": 26.1586, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.09419367204720637, | |
| "grad_norm": 189.0, | |
| "learning_rate": 1.8777292576419214e-05, | |
| "loss": 26.4408, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.09638422255993209, | |
| "grad_norm": 73.0, | |
| "learning_rate": 1.921397379912664e-05, | |
| "loss": 27.2165, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.09857477307265783, | |
| "grad_norm": 56.59375, | |
| "learning_rate": 1.965065502183406e-05, | |
| "loss": 27.3976, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.10076532358538355, | |
| "grad_norm": 135.75, | |
| "learning_rate": 1.9990258158792012e-05, | |
| "loss": 27.3135, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.10295587409810929, | |
| "grad_norm": 92.125, | |
| "learning_rate": 1.994154895275207e-05, | |
| "loss": 26.7273, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.10514642461083501, | |
| "grad_norm": 142.875, | |
| "learning_rate": 1.989283974671213e-05, | |
| "loss": 27.3197, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.10733697512356075, | |
| "grad_norm": 122.0625, | |
| "learning_rate": 1.984413054067219e-05, | |
| "loss": 27.6129, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.10952752563628647, | |
| "grad_norm": 91.9375, | |
| "learning_rate": 1.979542133463225e-05, | |
| "loss": 26.8124, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1117180761490122, | |
| "grad_norm": 69.375, | |
| "learning_rate": 1.9746712128592305e-05, | |
| "loss": 27.1558, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.11390862666173793, | |
| "grad_norm": 133.125, | |
| "learning_rate": 1.9698002922552364e-05, | |
| "loss": 27.3605, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.11609917717446366, | |
| "grad_norm": 212.75, | |
| "learning_rate": 1.9649293716512423e-05, | |
| "loss": 27.3959, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.11828972768718939, | |
| "grad_norm": 64.1875, | |
| "learning_rate": 1.960058451047248e-05, | |
| "loss": 27.9033, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.12048027819991512, | |
| "grad_norm": 166.375, | |
| "learning_rate": 1.955187530443254e-05, | |
| "loss": 25.922, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.12267082871264084, | |
| "grad_norm": 54.59375, | |
| "learning_rate": 1.9503166098392598e-05, | |
| "loss": 26.778, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.12486137922536658, | |
| "grad_norm": 69.8125, | |
| "learning_rate": 1.9454456892352657e-05, | |
| "loss": 26.1686, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.12705192973809232, | |
| "grad_norm": 120.0625, | |
| "learning_rate": 1.9405747686312716e-05, | |
| "loss": 27.1558, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.12924248025081803, | |
| "grad_norm": 75.75, | |
| "learning_rate": 1.9357038480272775e-05, | |
| "loss": 26.8669, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.13143303076354376, | |
| "grad_norm": 110.25, | |
| "learning_rate": 1.930832927423283e-05, | |
| "loss": 27.4517, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.1336235812762695, | |
| "grad_norm": 92.875, | |
| "learning_rate": 1.925962006819289e-05, | |
| "loss": 27.628, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.13581413178899523, | |
| "grad_norm": 98.375, | |
| "learning_rate": 1.921091086215295e-05, | |
| "loss": 28.8423, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.13800468230172094, | |
| "grad_norm": 112.875, | |
| "learning_rate": 1.9162201656113005e-05, | |
| "loss": 27.2378, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.14019523281444668, | |
| "grad_norm": 144.5, | |
| "learning_rate": 1.9113492450073065e-05, | |
| "loss": 27.3709, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.14238578332717242, | |
| "grad_norm": 115.625, | |
| "learning_rate": 1.9064783244033124e-05, | |
| "loss": 27.6172, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.14457633383989815, | |
| "grad_norm": 66.625, | |
| "learning_rate": 1.9016074037993183e-05, | |
| "loss": 26.9181, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.14676688435262386, | |
| "grad_norm": 116.3125, | |
| "learning_rate": 1.8967364831953242e-05, | |
| "loss": 26.983, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.1489574348653496, | |
| "grad_norm": 75.5625, | |
| "learning_rate": 1.8918655625913298e-05, | |
| "loss": 27.2569, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.15114798537807533, | |
| "grad_norm": 138.125, | |
| "learning_rate": 1.8869946419873357e-05, | |
| "loss": 27.3038, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.15333853589080107, | |
| "grad_norm": 111.1875, | |
| "learning_rate": 1.8821237213833417e-05, | |
| "loss": 27.6782, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.15552908640352678, | |
| "grad_norm": 266.5, | |
| "learning_rate": 1.8772528007793472e-05, | |
| "loss": 27.2354, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.15771963691625251, | |
| "grad_norm": 134.875, | |
| "learning_rate": 1.872381880175353e-05, | |
| "loss": 27.7307, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.15991018742897825, | |
| "grad_norm": 116.375, | |
| "learning_rate": 1.867510959571359e-05, | |
| "loss": 27.2934, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.162100737941704, | |
| "grad_norm": 93.5, | |
| "learning_rate": 1.862640038967365e-05, | |
| "loss": 26.9397, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.1642912884544297, | |
| "grad_norm": 96.1875, | |
| "learning_rate": 1.857769118363371e-05, | |
| "loss": 26.796, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.16648183896715543, | |
| "grad_norm": 108.1875, | |
| "learning_rate": 1.852898197759377e-05, | |
| "loss": 26.8906, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.16867238947988117, | |
| "grad_norm": 161.25, | |
| "learning_rate": 1.8480272771553824e-05, | |
| "loss": 27.8457, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.1708629399926069, | |
| "grad_norm": 103.875, | |
| "learning_rate": 1.8431563565513884e-05, | |
| "loss": 27.5416, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.1730534905053326, | |
| "grad_norm": 97.875, | |
| "learning_rate": 1.8382854359473943e-05, | |
| "loss": 27.5614, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.17524404101805835, | |
| "grad_norm": 76.6875, | |
| "learning_rate": 1.8334145153434e-05, | |
| "loss": 26.4402, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.17743459153078409, | |
| "grad_norm": 105.5, | |
| "learning_rate": 1.8285435947394058e-05, | |
| "loss": 26.8441, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.17962514204350982, | |
| "grad_norm": 123.875, | |
| "learning_rate": 1.8236726741354117e-05, | |
| "loss": 27.1768, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.18181569255623553, | |
| "grad_norm": 263.5, | |
| "learning_rate": 1.8188017535314176e-05, | |
| "loss": 26.4176, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.18400624306896127, | |
| "grad_norm": 108.5, | |
| "learning_rate": 1.8139308329274236e-05, | |
| "loss": 26.831, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.186196793581687, | |
| "grad_norm": 216.25, | |
| "learning_rate": 1.8090599123234295e-05, | |
| "loss": 27.7374, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.18838734409441274, | |
| "grad_norm": 83.0, | |
| "learning_rate": 1.804188991719435e-05, | |
| "loss": 26.2929, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.19057789460713845, | |
| "grad_norm": 129.25, | |
| "learning_rate": 1.799318071115441e-05, | |
| "loss": 27.1493, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.19276844511986418, | |
| "grad_norm": 120.3125, | |
| "learning_rate": 1.794447150511447e-05, | |
| "loss": 27.135, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.19495899563258992, | |
| "grad_norm": 116.1875, | |
| "learning_rate": 1.7895762299074525e-05, | |
| "loss": 27.5562, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.19714954614531566, | |
| "grad_norm": 127.0, | |
| "learning_rate": 1.7847053093034584e-05, | |
| "loss": 27.7403, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.19934009665804137, | |
| "grad_norm": 175.625, | |
| "learning_rate": 1.7798343886994643e-05, | |
| "loss": 27.837, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.2015306471707671, | |
| "grad_norm": 118.1875, | |
| "learning_rate": 1.7749634680954703e-05, | |
| "loss": 27.3186, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.20372119768349284, | |
| "grad_norm": 86.4375, | |
| "learning_rate": 1.7700925474914762e-05, | |
| "loss": 26.8235, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.20591174819621857, | |
| "grad_norm": 178.5, | |
| "learning_rate": 1.7652216268874818e-05, | |
| "loss": 27.0347, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.20810229870894428, | |
| "grad_norm": 107.0, | |
| "learning_rate": 1.7603507062834877e-05, | |
| "loss": 27.8433, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.21029284922167002, | |
| "grad_norm": 295.5, | |
| "learning_rate": 1.7554797856794936e-05, | |
| "loss": 27.151, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.21248339973439576, | |
| "grad_norm": 117.6875, | |
| "learning_rate": 1.7506088650754992e-05, | |
| "loss": 28.0214, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.2146739502471215, | |
| "grad_norm": 159.375, | |
| "learning_rate": 1.745737944471505e-05, | |
| "loss": 27.2854, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2168645007598472, | |
| "grad_norm": 542.0, | |
| "learning_rate": 1.740867023867511e-05, | |
| "loss": 28.374, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.21905505127257294, | |
| "grad_norm": 107.75, | |
| "learning_rate": 1.735996103263517e-05, | |
| "loss": 27.1016, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.22124560178529867, | |
| "grad_norm": 258.5, | |
| "learning_rate": 1.731125182659523e-05, | |
| "loss": 27.078, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.2234361522980244, | |
| "grad_norm": 110.6875, | |
| "learning_rate": 1.7262542620555288e-05, | |
| "loss": 27.0512, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.22562670281075012, | |
| "grad_norm": 116.8125, | |
| "learning_rate": 1.7213833414515344e-05, | |
| "loss": 27.1887, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.22781725332347585, | |
| "grad_norm": 113.875, | |
| "learning_rate": 1.7165124208475403e-05, | |
| "loss": 26.8509, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.2300078038362016, | |
| "grad_norm": 160.625, | |
| "learning_rate": 1.7116415002435462e-05, | |
| "loss": 26.0831, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.23219835434892733, | |
| "grad_norm": 83.875, | |
| "learning_rate": 1.7067705796395518e-05, | |
| "loss": 26.9917, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.23438890486165304, | |
| "grad_norm": 249.0, | |
| "learning_rate": 1.7018996590355577e-05, | |
| "loss": 26.9605, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.23657945537437877, | |
| "grad_norm": 91.8125, | |
| "learning_rate": 1.6970287384315637e-05, | |
| "loss": 25.9063, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.2387700058871045, | |
| "grad_norm": 88.5625, | |
| "learning_rate": 1.6921578178275696e-05, | |
| "loss": 26.5997, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.24096055639983024, | |
| "grad_norm": 78.25, | |
| "learning_rate": 1.6872868972235755e-05, | |
| "loss": 26.8746, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.24315110691255595, | |
| "grad_norm": 217.0, | |
| "learning_rate": 1.6824159766195814e-05, | |
| "loss": 27.2071, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.2453416574252817, | |
| "grad_norm": 110.6875, | |
| "learning_rate": 1.677545056015587e-05, | |
| "loss": 27.6879, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.24753220793800743, | |
| "grad_norm": 180.25, | |
| "learning_rate": 1.672674135411593e-05, | |
| "loss": 26.5545, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.24972275845073316, | |
| "grad_norm": 116.5, | |
| "learning_rate": 1.667803214807599e-05, | |
| "loss": 26.3239, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.25191330896345887, | |
| "grad_norm": 220.0, | |
| "learning_rate": 1.6629322942036044e-05, | |
| "loss": 27.3587, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.25410385947618463, | |
| "grad_norm": 140.0, | |
| "learning_rate": 1.6580613735996104e-05, | |
| "loss": 27.5635, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.25629440998891034, | |
| "grad_norm": 95.375, | |
| "learning_rate": 1.6531904529956163e-05, | |
| "loss": 27.7093, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.25848496050163605, | |
| "grad_norm": 108.6875, | |
| "learning_rate": 1.6483195323916222e-05, | |
| "loss": 26.7236, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.2606755110143618, | |
| "grad_norm": 103.6875, | |
| "learning_rate": 1.643448611787628e-05, | |
| "loss": 26.3303, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.2628660615270875, | |
| "grad_norm": 117.5, | |
| "learning_rate": 1.6385776911836337e-05, | |
| "loss": 27.3181, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.26505661203981323, | |
| "grad_norm": 116.5, | |
| "learning_rate": 1.6337067705796396e-05, | |
| "loss": 26.9053, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.267247162552539, | |
| "grad_norm": 78.0, | |
| "learning_rate": 1.6288358499756456e-05, | |
| "loss": 26.7388, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.2694377130652647, | |
| "grad_norm": 138.5, | |
| "learning_rate": 1.623964929371651e-05, | |
| "loss": 26.8026, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.27162826357799047, | |
| "grad_norm": 223.875, | |
| "learning_rate": 1.619094008767657e-05, | |
| "loss": 27.1799, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2738188140907162, | |
| "grad_norm": 112.5625, | |
| "learning_rate": 1.614223088163663e-05, | |
| "loss": 26.5119, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.2760093646034419, | |
| "grad_norm": 96.1875, | |
| "learning_rate": 1.609352167559669e-05, | |
| "loss": 26.9968, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.27819991511616765, | |
| "grad_norm": 81.125, | |
| "learning_rate": 1.604481246955675e-05, | |
| "loss": 27.0376, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.28039046562889336, | |
| "grad_norm": 115.875, | |
| "learning_rate": 1.5996103263516808e-05, | |
| "loss": 26.5923, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.28258101614161907, | |
| "grad_norm": 114.875, | |
| "learning_rate": 1.5947394057476863e-05, | |
| "loss": 26.4362, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.28477156665434483, | |
| "grad_norm": 245.125, | |
| "learning_rate": 1.5898684851436923e-05, | |
| "loss": 26.8871, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.28696211716707054, | |
| "grad_norm": 71.875, | |
| "learning_rate": 1.5849975645396982e-05, | |
| "loss": 27.8365, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.2891526676797963, | |
| "grad_norm": 97.1875, | |
| "learning_rate": 1.5801266439357038e-05, | |
| "loss": 25.91, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.291343218192522, | |
| "grad_norm": 100.625, | |
| "learning_rate": 1.5752557233317097e-05, | |
| "loss": 26.5731, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.2935337687052477, | |
| "grad_norm": 251.0, | |
| "learning_rate": 1.5703848027277156e-05, | |
| "loss": 26.581, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.2957243192179735, | |
| "grad_norm": 102.625, | |
| "learning_rate": 1.5655138821237215e-05, | |
| "loss": 26.9665, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.2979148697306992, | |
| "grad_norm": 88.3125, | |
| "learning_rate": 1.5606429615197275e-05, | |
| "loss": 26.7198, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.3001054202434249, | |
| "grad_norm": 117.9375, | |
| "learning_rate": 1.5557720409157334e-05, | |
| "loss": 26.493, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.30229597075615067, | |
| "grad_norm": 98.125, | |
| "learning_rate": 1.550901120311739e-05, | |
| "loss": 26.7671, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.3044865212688764, | |
| "grad_norm": 96.5625, | |
| "learning_rate": 1.546030199707745e-05, | |
| "loss": 27.1227, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.30667707178160214, | |
| "grad_norm": 108.125, | |
| "learning_rate": 1.5411592791037505e-05, | |
| "loss": 25.9982, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.30886762229432785, | |
| "grad_norm": 228.625, | |
| "learning_rate": 1.5362883584997564e-05, | |
| "loss": 26.5684, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.31105817280705356, | |
| "grad_norm": 86.75, | |
| "learning_rate": 1.5314174378957623e-05, | |
| "loss": 26.5311, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.3132487233197793, | |
| "grad_norm": 125.0625, | |
| "learning_rate": 1.5265465172917682e-05, | |
| "loss": 26.0567, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.31543927383250503, | |
| "grad_norm": 102.625, | |
| "learning_rate": 1.5216755966877742e-05, | |
| "loss": 27.0244, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.31762982434523074, | |
| "grad_norm": 76.0, | |
| "learning_rate": 1.51680467608378e-05, | |
| "loss": 27.4083, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.3198203748579565, | |
| "grad_norm": 115.5625, | |
| "learning_rate": 1.5119337554797857e-05, | |
| "loss": 26.4623, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.3220109253706822, | |
| "grad_norm": 74.3125, | |
| "learning_rate": 1.5070628348757916e-05, | |
| "loss": 27.2791, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.324201475883408, | |
| "grad_norm": 118.125, | |
| "learning_rate": 1.5021919142717975e-05, | |
| "loss": 26.8857, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.3263920263961337, | |
| "grad_norm": 122.875, | |
| "learning_rate": 1.4973209936678033e-05, | |
| "loss": 27.0058, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.3285825769088594, | |
| "grad_norm": 150.25, | |
| "learning_rate": 1.4924500730638092e-05, | |
| "loss": 26.6868, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.33077312742158516, | |
| "grad_norm": 83.875, | |
| "learning_rate": 1.4875791524598151e-05, | |
| "loss": 26.3589, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.33296367793431086, | |
| "grad_norm": 94.5, | |
| "learning_rate": 1.4827082318558209e-05, | |
| "loss": 26.6773, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.3351542284470366, | |
| "grad_norm": 125.25, | |
| "learning_rate": 1.4778373112518268e-05, | |
| "loss": 26.6099, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.33734477895976234, | |
| "grad_norm": 77.3125, | |
| "learning_rate": 1.4729663906478327e-05, | |
| "loss": 26.3338, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.33953532947248805, | |
| "grad_norm": 113.0625, | |
| "learning_rate": 1.4680954700438383e-05, | |
| "loss": 26.2556, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.3417258799852138, | |
| "grad_norm": 158.625, | |
| "learning_rate": 1.4632245494398442e-05, | |
| "loss": 26.6553, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.3439164304979395, | |
| "grad_norm": 99.25, | |
| "learning_rate": 1.4583536288358501e-05, | |
| "loss": 25.8313, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.3461069810106652, | |
| "grad_norm": 96.9375, | |
| "learning_rate": 1.4534827082318559e-05, | |
| "loss": 26.7807, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.348297531523391, | |
| "grad_norm": 111.3125, | |
| "learning_rate": 1.4486117876278618e-05, | |
| "loss": 27.0631, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.3504880820361167, | |
| "grad_norm": 97.5, | |
| "learning_rate": 1.4437408670238677e-05, | |
| "loss": 26.5211, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.3526786325488424, | |
| "grad_norm": 251.625, | |
| "learning_rate": 1.4388699464198735e-05, | |
| "loss": 26.6557, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.35486918306156817, | |
| "grad_norm": 235.25, | |
| "learning_rate": 1.4339990258158794e-05, | |
| "loss": 26.6773, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.3570597335742939, | |
| "grad_norm": 141.75, | |
| "learning_rate": 1.429128105211885e-05, | |
| "loss": 26.6795, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.35925028408701964, | |
| "grad_norm": 174.875, | |
| "learning_rate": 1.4242571846078909e-05, | |
| "loss": 26.6367, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.36144083459974535, | |
| "grad_norm": 133.75, | |
| "learning_rate": 1.4193862640038968e-05, | |
| "loss": 26.7941, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.36363138511247106, | |
| "grad_norm": 72.4375, | |
| "learning_rate": 1.4145153433999026e-05, | |
| "loss": 26.3353, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.3658219356251968, | |
| "grad_norm": 235.125, | |
| "learning_rate": 1.4096444227959085e-05, | |
| "loss": 26.7903, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.36801248613792253, | |
| "grad_norm": 176.0, | |
| "learning_rate": 1.4047735021919144e-05, | |
| "loss": 27.144, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.37020303665064824, | |
| "grad_norm": 129.625, | |
| "learning_rate": 1.3999025815879202e-05, | |
| "loss": 27.1957, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.372393587163374, | |
| "grad_norm": 222.5, | |
| "learning_rate": 1.3950316609839261e-05, | |
| "loss": 26.9646, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.3745841376760997, | |
| "grad_norm": 183.125, | |
| "learning_rate": 1.390160740379932e-05, | |
| "loss": 26.983, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.3767746881888255, | |
| "grad_norm": 118.875, | |
| "learning_rate": 1.3852898197759376e-05, | |
| "loss": 27.2643, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.3789652387015512, | |
| "grad_norm": 93.25, | |
| "learning_rate": 1.3804188991719435e-05, | |
| "loss": 26.2762, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.3811557892142769, | |
| "grad_norm": 129.5, | |
| "learning_rate": 1.3755479785679495e-05, | |
| "loss": 26.2637, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.38334633972700266, | |
| "grad_norm": 77.8125, | |
| "learning_rate": 1.3706770579639552e-05, | |
| "loss": 26.0782, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.38553689023972837, | |
| "grad_norm": 73.25, | |
| "learning_rate": 1.3658061373599611e-05, | |
| "loss": 25.8536, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.3877274407524541, | |
| "grad_norm": 104.4375, | |
| "learning_rate": 1.360935216755967e-05, | |
| "loss": 26.7719, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.38991799126517984, | |
| "grad_norm": 129.375, | |
| "learning_rate": 1.3560642961519728e-05, | |
| "loss": 26.5563, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.39210854177790555, | |
| "grad_norm": 117.5625, | |
| "learning_rate": 1.3511933755479787e-05, | |
| "loss": 26.1647, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.3942990922906313, | |
| "grad_norm": 103.9375, | |
| "learning_rate": 1.3463224549439847e-05, | |
| "loss": 26.663, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.396489642803357, | |
| "grad_norm": 126.8125, | |
| "learning_rate": 1.3414515343399902e-05, | |
| "loss": 26.0344, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.39868019331608273, | |
| "grad_norm": 98.8125, | |
| "learning_rate": 1.3365806137359962e-05, | |
| "loss": 25.2949, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.4008707438288085, | |
| "grad_norm": 188.25, | |
| "learning_rate": 1.331709693132002e-05, | |
| "loss": 25.8863, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.4030612943415342, | |
| "grad_norm": 92.0625, | |
| "learning_rate": 1.3268387725280078e-05, | |
| "loss": 26.487, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.4052518448542599, | |
| "grad_norm": 82.9375, | |
| "learning_rate": 1.3219678519240138e-05, | |
| "loss": 25.7554, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.4074423953669857, | |
| "grad_norm": 67.3125, | |
| "learning_rate": 1.3170969313200197e-05, | |
| "loss": 26.1106, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.4096329458797114, | |
| "grad_norm": 100.8125, | |
| "learning_rate": 1.3122260107160254e-05, | |
| "loss": 26.4613, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.41182349639243715, | |
| "grad_norm": 98.4375, | |
| "learning_rate": 1.3073550901120314e-05, | |
| "loss": 26.3597, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.41401404690516286, | |
| "grad_norm": 88.4375, | |
| "learning_rate": 1.302484169508037e-05, | |
| "loss": 26.033, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.41620459741788857, | |
| "grad_norm": 140.25, | |
| "learning_rate": 1.2976132489040429e-05, | |
| "loss": 25.6496, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.41839514793061433, | |
| "grad_norm": 67.25, | |
| "learning_rate": 1.2927423283000488e-05, | |
| "loss": 25.2236, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.42058569844334004, | |
| "grad_norm": 253.5, | |
| "learning_rate": 1.2878714076960545e-05, | |
| "loss": 26.5954, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.42277624895606575, | |
| "grad_norm": 112.3125, | |
| "learning_rate": 1.2830004870920605e-05, | |
| "loss": 26.4357, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.4249667994687915, | |
| "grad_norm": 100.9375, | |
| "learning_rate": 1.2781295664880664e-05, | |
| "loss": 25.9741, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.4271573499815172, | |
| "grad_norm": 167.875, | |
| "learning_rate": 1.2732586458840721e-05, | |
| "loss": 26.1234, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.429347900494243, | |
| "grad_norm": 147.125, | |
| "learning_rate": 1.268387725280078e-05, | |
| "loss": 25.8006, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.4315384510069687, | |
| "grad_norm": 130.625, | |
| "learning_rate": 1.263516804676084e-05, | |
| "loss": 26.9726, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.4337290015196944, | |
| "grad_norm": 70.5, | |
| "learning_rate": 1.2586458840720897e-05, | |
| "loss": 25.7618, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.43591955203242017, | |
| "grad_norm": 71.375, | |
| "learning_rate": 1.2537749634680957e-05, | |
| "loss": 25.2961, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.4381101025451459, | |
| "grad_norm": 112.0, | |
| "learning_rate": 1.2489040428641016e-05, | |
| "loss": 25.6715, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4381101025451459, | |
| "eval_loss": null, | |
| "eval_runtime": 246.5162, | |
| "eval_samples_per_second": 998.04, | |
| "eval_steps_per_second": 31.191, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4403006530578716, | |
| "grad_norm": 158.5, | |
| "learning_rate": 1.2440331222601072e-05, | |
| "loss": 25.968, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.44249120357059735, | |
| "grad_norm": 66.1875, | |
| "learning_rate": 1.2391622016561131e-05, | |
| "loss": 25.9174, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.44468175408332306, | |
| "grad_norm": 68.75, | |
| "learning_rate": 1.234291281052119e-05, | |
| "loss": 25.4954, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.4468723045960488, | |
| "grad_norm": 130.75, | |
| "learning_rate": 1.2294203604481248e-05, | |
| "loss": 25.5188, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.44906285510877453, | |
| "grad_norm": 112.3125, | |
| "learning_rate": 1.2245494398441307e-05, | |
| "loss": 25.8739, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.45125340562150024, | |
| "grad_norm": 117.0, | |
| "learning_rate": 1.2196785192401366e-05, | |
| "loss": 25.0384, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.453443956134226, | |
| "grad_norm": 77.375, | |
| "learning_rate": 1.2148075986361424e-05, | |
| "loss": 25.6608, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.4556345066469517, | |
| "grad_norm": 60.34375, | |
| "learning_rate": 1.2099366780321483e-05, | |
| "loss": 25.7742, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.4578250571596774, | |
| "grad_norm": 78.5625, | |
| "learning_rate": 1.2050657574281542e-05, | |
| "loss": 25.5131, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.4600156076724032, | |
| "grad_norm": 115.1875, | |
| "learning_rate": 1.2001948368241598e-05, | |
| "loss": 25.5009, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.4622061581851289, | |
| "grad_norm": 118.4375, | |
| "learning_rate": 1.1953239162201657e-05, | |
| "loss": 25.7924, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.46439670869785465, | |
| "grad_norm": 68.375, | |
| "learning_rate": 1.1904529956161715e-05, | |
| "loss": 25.9603, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.46658725921058036, | |
| "grad_norm": 121.375, | |
| "learning_rate": 1.1855820750121774e-05, | |
| "loss": 25.6717, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.46877780972330607, | |
| "grad_norm": 83.6875, | |
| "learning_rate": 1.1807111544081833e-05, | |
| "loss": 25.6563, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.47096836023603184, | |
| "grad_norm": 64.5625, | |
| "learning_rate": 1.175840233804189e-05, | |
| "loss": 26.0566, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.47315891074875754, | |
| "grad_norm": 81.6875, | |
| "learning_rate": 1.170969313200195e-05, | |
| "loss": 25.7885, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.47534946126148325, | |
| "grad_norm": 100.1875, | |
| "learning_rate": 1.1660983925962009e-05, | |
| "loss": 25.5735, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.477540011774209, | |
| "grad_norm": 116.0, | |
| "learning_rate": 1.1612274719922065e-05, | |
| "loss": 25.7891, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.4797305622869347, | |
| "grad_norm": 70.125, | |
| "learning_rate": 1.1563565513882124e-05, | |
| "loss": 25.6365, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.4819211127996605, | |
| "grad_norm": 52.96875, | |
| "learning_rate": 1.1514856307842183e-05, | |
| "loss": 25.5813, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.4841116633123862, | |
| "grad_norm": 150.125, | |
| "learning_rate": 1.1466147101802241e-05, | |
| "loss": 26.0147, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.4863022138251119, | |
| "grad_norm": 125.0625, | |
| "learning_rate": 1.14174378957623e-05, | |
| "loss": 26.1209, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.48849276433783767, | |
| "grad_norm": 89.875, | |
| "learning_rate": 1.136872868972236e-05, | |
| "loss": 25.7375, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.4906833148505634, | |
| "grad_norm": 125.5625, | |
| "learning_rate": 1.1320019483682417e-05, | |
| "loss": 25.246, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.4928738653632891, | |
| "grad_norm": 96.625, | |
| "learning_rate": 1.1271310277642476e-05, | |
| "loss": 25.7476, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.49506441587601485, | |
| "grad_norm": 148.375, | |
| "learning_rate": 1.1222601071602535e-05, | |
| "loss": 25.0299, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.49725496638874056, | |
| "grad_norm": 131.25, | |
| "learning_rate": 1.1173891865562591e-05, | |
| "loss": 25.7002, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.4994455169014663, | |
| "grad_norm": 99.0, | |
| "learning_rate": 1.112518265952265e-05, | |
| "loss": 25.3308, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.501636067414192, | |
| "grad_norm": 52.90625, | |
| "learning_rate": 1.107647345348271e-05, | |
| "loss": 25.7107, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.5038266179269177, | |
| "grad_norm": 92.375, | |
| "learning_rate": 1.1027764247442767e-05, | |
| "loss": 25.6609, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.5060171684396435, | |
| "grad_norm": 59.46875, | |
| "learning_rate": 1.0979055041402826e-05, | |
| "loss": 25.5677, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.5082077189523693, | |
| "grad_norm": 96.375, | |
| "learning_rate": 1.0930345835362886e-05, | |
| "loss": 25.3615, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.5103982694650949, | |
| "grad_norm": 75.0, | |
| "learning_rate": 1.0881636629322943e-05, | |
| "loss": 25.4644, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.5125888199778207, | |
| "grad_norm": 98.5, | |
| "learning_rate": 1.0832927423283002e-05, | |
| "loss": 25.6878, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.5147793704905465, | |
| "grad_norm": 76.1875, | |
| "learning_rate": 1.0784218217243058e-05, | |
| "loss": 25.8905, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.5169699210032721, | |
| "grad_norm": 52.4375, | |
| "learning_rate": 1.0735509011203117e-05, | |
| "loss": 25.7829, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.5191604715159979, | |
| "grad_norm": 105.625, | |
| "learning_rate": 1.0686799805163177e-05, | |
| "loss": 25.244, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.5213510220287236, | |
| "grad_norm": 59.84375, | |
| "learning_rate": 1.0638090599123234e-05, | |
| "loss": 25.1215, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.5235415725414493, | |
| "grad_norm": 54.03125, | |
| "learning_rate": 1.0589381393083293e-05, | |
| "loss": 25.3532, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.525732123054175, | |
| "grad_norm": 134.0, | |
| "learning_rate": 1.0540672187043353e-05, | |
| "loss": 24.9972, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5279226735669008, | |
| "grad_norm": 77.5625, | |
| "learning_rate": 1.049196298100341e-05, | |
| "loss": 24.5776, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.5301132240796265, | |
| "grad_norm": 163.25, | |
| "learning_rate": 1.044325377496347e-05, | |
| "loss": 25.7603, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.5323037745923522, | |
| "grad_norm": 77.4375, | |
| "learning_rate": 1.0394544568923529e-05, | |
| "loss": 24.9012, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.534494325105078, | |
| "grad_norm": 101.25, | |
| "learning_rate": 1.0345835362883584e-05, | |
| "loss": 25.5309, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.5366848756178036, | |
| "grad_norm": 127.625, | |
| "learning_rate": 1.0297126156843644e-05, | |
| "loss": 25.7696, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.5388754261305294, | |
| "grad_norm": 79.75, | |
| "learning_rate": 1.0248416950803703e-05, | |
| "loss": 25.5109, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.5410659766432552, | |
| "grad_norm": 151.0, | |
| "learning_rate": 1.019970774476376e-05, | |
| "loss": 25.7191, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.5432565271559809, | |
| "grad_norm": 54.0, | |
| "learning_rate": 1.015099853872382e-05, | |
| "loss": 25.1186, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.5454470776687066, | |
| "grad_norm": 54.3125, | |
| "learning_rate": 1.0102289332683879e-05, | |
| "loss": 24.919, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.5476376281814324, | |
| "grad_norm": 106.8125, | |
| "learning_rate": 1.0053580126643936e-05, | |
| "loss": 25.1853, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.5498281786941581, | |
| "grad_norm": 97.75, | |
| "learning_rate": 1.0004870920603996e-05, | |
| "loss": 24.8172, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.5520187292068838, | |
| "grad_norm": 67.8125, | |
| "learning_rate": 9.956161714564053e-06, | |
| "loss": 24.8634, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.5542092797196095, | |
| "grad_norm": 169.125, | |
| "learning_rate": 9.90745250852411e-06, | |
| "loss": 25.2829, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.5563998302323353, | |
| "grad_norm": 116.375, | |
| "learning_rate": 9.85874330248417e-06, | |
| "loss": 25.702, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.558590380745061, | |
| "grad_norm": 229.125, | |
| "learning_rate": 9.810034096444229e-06, | |
| "loss": 24.9604, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.5607809312577867, | |
| "grad_norm": 51.5625, | |
| "learning_rate": 9.761324890404287e-06, | |
| "loss": 24.766, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.5629714817705125, | |
| "grad_norm": 88.5625, | |
| "learning_rate": 9.712615684364346e-06, | |
| "loss": 24.9662, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.5651620322832381, | |
| "grad_norm": 92.375, | |
| "learning_rate": 9.663906478324403e-06, | |
| "loss": 25.0194, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.5673525827959639, | |
| "grad_norm": 83.8125, | |
| "learning_rate": 9.615197272284463e-06, | |
| "loss": 25.4854, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.5695431333086897, | |
| "grad_norm": 76.75, | |
| "learning_rate": 9.566488066244522e-06, | |
| "loss": 24.8307, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.5717336838214153, | |
| "grad_norm": 92.75, | |
| "learning_rate": 9.51777886020458e-06, | |
| "loss": 24.5626, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.5739242343341411, | |
| "grad_norm": 98.6875, | |
| "learning_rate": 9.469069654164637e-06, | |
| "loss": 24.9844, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.5761147848468668, | |
| "grad_norm": 63.28125, | |
| "learning_rate": 9.420360448124696e-06, | |
| "loss": 25.0084, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.5783053353595926, | |
| "grad_norm": 272.75, | |
| "learning_rate": 9.371651242084755e-06, | |
| "loss": 25.2756, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.5804958858723183, | |
| "grad_norm": 214.125, | |
| "learning_rate": 9.322942036044813e-06, | |
| "loss": 25.1908, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.582686436385044, | |
| "grad_norm": 93.875, | |
| "learning_rate": 9.27423283000487e-06, | |
| "loss": 24.5392, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.5848769868977698, | |
| "grad_norm": 119.5625, | |
| "learning_rate": 9.22552362396493e-06, | |
| "loss": 25.1336, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.5870675374104954, | |
| "grad_norm": 104.4375, | |
| "learning_rate": 9.176814417924989e-06, | |
| "loss": 24.3678, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.5892580879232212, | |
| "grad_norm": 77.0625, | |
| "learning_rate": 9.128105211885046e-06, | |
| "loss": 25.1012, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.591448638435947, | |
| "grad_norm": 196.375, | |
| "learning_rate": 9.079396005845106e-06, | |
| "loss": 24.9593, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.5936391889486726, | |
| "grad_norm": 71.4375, | |
| "learning_rate": 9.030686799805163e-06, | |
| "loss": 24.8451, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.5958297394613984, | |
| "grad_norm": 389.5, | |
| "learning_rate": 8.981977593765222e-06, | |
| "loss": 25.182, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.5980202899741242, | |
| "grad_norm": 56.03125, | |
| "learning_rate": 8.933268387725282e-06, | |
| "loss": 24.3735, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.6002108404868498, | |
| "grad_norm": 55.53125, | |
| "learning_rate": 8.884559181685339e-06, | |
| "loss": 25.3762, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.6024013909995756, | |
| "grad_norm": 115.5, | |
| "learning_rate": 8.835849975645398e-06, | |
| "loss": 23.874, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.6045919415123013, | |
| "grad_norm": 76.0, | |
| "learning_rate": 8.787140769605456e-06, | |
| "loss": 25.0251, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.606782492025027, | |
| "grad_norm": 152.625, | |
| "learning_rate": 8.738431563565515e-06, | |
| "loss": 25.099, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.6089730425377528, | |
| "grad_norm": 81.625, | |
| "learning_rate": 8.689722357525573e-06, | |
| "loss": 24.6503, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.6111635930504785, | |
| "grad_norm": 99.9375, | |
| "learning_rate": 8.641013151485632e-06, | |
| "loss": 25.0892, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.6133541435632043, | |
| "grad_norm": 67.5625, | |
| "learning_rate": 8.592303945445691e-06, | |
| "loss": 25.1785, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.6155446940759299, | |
| "grad_norm": 133.125, | |
| "learning_rate": 8.543594739405749e-06, | |
| "loss": 25.2308, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.6177352445886557, | |
| "grad_norm": 127.1875, | |
| "learning_rate": 8.494885533365806e-06, | |
| "loss": 24.7525, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.6199257951013815, | |
| "grad_norm": 88.4375, | |
| "learning_rate": 8.446176327325865e-06, | |
| "loss": 25.3764, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.6221163456141071, | |
| "grad_norm": 68.1875, | |
| "learning_rate": 8.397467121285925e-06, | |
| "loss": 25.6512, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.6243068961268329, | |
| "grad_norm": 105.25, | |
| "learning_rate": 8.348757915245982e-06, | |
| "loss": 24.5618, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.6264974466395586, | |
| "grad_norm": 62.96875, | |
| "learning_rate": 8.300048709206041e-06, | |
| "loss": 25.5549, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.6286879971522843, | |
| "grad_norm": 112.75, | |
| "learning_rate": 8.251339503166099e-06, | |
| "loss": 25.7603, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.6308785476650101, | |
| "grad_norm": 146.75, | |
| "learning_rate": 8.202630297126158e-06, | |
| "loss": 24.9323, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.6330690981777358, | |
| "grad_norm": 72.8125, | |
| "learning_rate": 8.153921091086216e-06, | |
| "loss": 25.1751, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.6352596486904615, | |
| "grad_norm": 131.75, | |
| "learning_rate": 8.105211885046275e-06, | |
| "loss": 24.7063, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.6374501992031872, | |
| "grad_norm": 81.5, | |
| "learning_rate": 8.056502679006332e-06, | |
| "loss": 25.0378, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.639640749715913, | |
| "grad_norm": 74.0625, | |
| "learning_rate": 8.007793472966392e-06, | |
| "loss": 24.7719, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.6418313002286387, | |
| "grad_norm": 65.0, | |
| "learning_rate": 7.95908426692645e-06, | |
| "loss": 24.439, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.6440218507413644, | |
| "grad_norm": 66.8125, | |
| "learning_rate": 7.910375060886508e-06, | |
| "loss": 24.7476, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.6462124012540902, | |
| "grad_norm": 38.3125, | |
| "learning_rate": 7.861665854846566e-06, | |
| "loss": 25.3374, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.648402951766816, | |
| "grad_norm": 55.5625, | |
| "learning_rate": 7.812956648806625e-06, | |
| "loss": 24.3505, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.6505935022795416, | |
| "grad_norm": 86.25, | |
| "learning_rate": 7.764247442766684e-06, | |
| "loss": 24.8686, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.6527840527922674, | |
| "grad_norm": 78.4375, | |
| "learning_rate": 7.715538236726742e-06, | |
| "loss": 24.9231, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.6549746033049931, | |
| "grad_norm": 110.9375, | |
| "learning_rate": 7.666829030686801e-06, | |
| "loss": 25.0029, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.6571651538177188, | |
| "grad_norm": 43.84375, | |
| "learning_rate": 7.6181198246468595e-06, | |
| "loss": 24.4183, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6593557043304445, | |
| "grad_norm": 50.46875, | |
| "learning_rate": 7.569410618606917e-06, | |
| "loss": 24.3607, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.6615462548431703, | |
| "grad_norm": 220.125, | |
| "learning_rate": 7.520701412566975e-06, | |
| "loss": 24.7162, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.663736805355896, | |
| "grad_norm": 89.625, | |
| "learning_rate": 7.471992206527035e-06, | |
| "loss": 24.8443, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.6659273558686217, | |
| "grad_norm": 45.96875, | |
| "learning_rate": 7.423283000487093e-06, | |
| "loss": 24.0884, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.6681179063813475, | |
| "grad_norm": 41.125, | |
| "learning_rate": 7.3745737944471505e-06, | |
| "loss": 24.1827, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.6703084568940731, | |
| "grad_norm": 46.0625, | |
| "learning_rate": 7.32586458840721e-06, | |
| "loss": 24.6457, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.6724990074067989, | |
| "grad_norm": 49.96875, | |
| "learning_rate": 7.277155382367268e-06, | |
| "loss": 24.2197, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.6746895579195247, | |
| "grad_norm": 55.4375, | |
| "learning_rate": 7.2284461763273265e-06, | |
| "loss": 24.5083, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.6768801084322503, | |
| "grad_norm": 147.375, | |
| "learning_rate": 7.179736970287386e-06, | |
| "loss": 24.5295, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.6790706589449761, | |
| "grad_norm": 66.8125, | |
| "learning_rate": 7.131027764247443e-06, | |
| "loss": 24.3484, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.6812612094577019, | |
| "grad_norm": 99.125, | |
| "learning_rate": 7.082318558207502e-06, | |
| "loss": 24.5569, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.6834517599704276, | |
| "grad_norm": 55.34375, | |
| "learning_rate": 7.03360935216756e-06, | |
| "loss": 24.0178, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.6856423104831533, | |
| "grad_norm": 89.8125, | |
| "learning_rate": 6.984900146127619e-06, | |
| "loss": 23.5264, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.687832860995879, | |
| "grad_norm": 248.5, | |
| "learning_rate": 6.936190940087677e-06, | |
| "loss": 25.177, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.6900234115086048, | |
| "grad_norm": 97.75, | |
| "learning_rate": 6.887481734047735e-06, | |
| "loss": 24.3799, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.6922139620213305, | |
| "grad_norm": 101.0, | |
| "learning_rate": 6.838772528007794e-06, | |
| "loss": 24.7024, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.6944045125340562, | |
| "grad_norm": 94.1875, | |
| "learning_rate": 6.790063321967853e-06, | |
| "loss": 24.3573, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.696595063046782, | |
| "grad_norm": 71.0625, | |
| "learning_rate": 6.74135411592791e-06, | |
| "loss": 24.3899, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.6987856135595076, | |
| "grad_norm": 62.90625, | |
| "learning_rate": 6.6926449098879695e-06, | |
| "loss": 24.4977, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.7009761640722334, | |
| "grad_norm": 42.25, | |
| "learning_rate": 6.643935703848028e-06, | |
| "loss": 24.0119, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7031667145849592, | |
| "grad_norm": 72.75, | |
| "learning_rate": 6.595226497808086e-06, | |
| "loss": 24.1055, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.7053572650976848, | |
| "grad_norm": 50.1875, | |
| "learning_rate": 6.5465172917681454e-06, | |
| "loss": 24.6823, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.7075478156104106, | |
| "grad_norm": 76.75, | |
| "learning_rate": 6.497808085728203e-06, | |
| "loss": 24.3863, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.7097383661231363, | |
| "grad_norm": 41.8125, | |
| "learning_rate": 6.449098879688261e-06, | |
| "loss": 24.5434, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.711928916635862, | |
| "grad_norm": 49.71875, | |
| "learning_rate": 6.40038967364832e-06, | |
| "loss": 23.9173, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.7141194671485878, | |
| "grad_norm": 303.75, | |
| "learning_rate": 6.351680467608379e-06, | |
| "loss": 23.8009, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.7163100176613135, | |
| "grad_norm": 107.0, | |
| "learning_rate": 6.3029712615684365e-06, | |
| "loss": 24.348, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.7185005681740393, | |
| "grad_norm": 57.5625, | |
| "learning_rate": 6.254262055528495e-06, | |
| "loss": 24.7125, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.7206911186867649, | |
| "grad_norm": 83.125, | |
| "learning_rate": 6.205552849488554e-06, | |
| "loss": 24.2265, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.7228816691994907, | |
| "grad_norm": 42.28125, | |
| "learning_rate": 6.1568436434486125e-06, | |
| "loss": 24.2224, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.7250722197122165, | |
| "grad_norm": 64.125, | |
| "learning_rate": 6.10813443740867e-06, | |
| "loss": 24.8299, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.7272627702249421, | |
| "grad_norm": 105.125, | |
| "learning_rate": 6.059425231368729e-06, | |
| "loss": 23.6778, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.7294533207376679, | |
| "grad_norm": 113.6875, | |
| "learning_rate": 6.010716025328788e-06, | |
| "loss": 24.2767, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.7316438712503937, | |
| "grad_norm": 67.9375, | |
| "learning_rate": 5.962006819288846e-06, | |
| "loss": 23.9942, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.7338344217631193, | |
| "grad_norm": 54.03125, | |
| "learning_rate": 5.913297613248905e-06, | |
| "loss": 23.7793, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.7360249722758451, | |
| "grad_norm": 63.625, | |
| "learning_rate": 5.8645884072089636e-06, | |
| "loss": 23.582, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.7382155227885708, | |
| "grad_norm": 58.375, | |
| "learning_rate": 5.815879201169021e-06, | |
| "loss": 24.222, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.7404060733012965, | |
| "grad_norm": 68.8125, | |
| "learning_rate": 5.7671699951290795e-06, | |
| "loss": 23.639, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.7425966238140222, | |
| "grad_norm": 32.53125, | |
| "learning_rate": 5.718460789089139e-06, | |
| "loss": 23.6367, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.744787174326748, | |
| "grad_norm": 62.53125, | |
| "learning_rate": 5.669751583049197e-06, | |
| "loss": 24.6188, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.7469777248394737, | |
| "grad_norm": 108.75, | |
| "learning_rate": 5.621042377009255e-06, | |
| "loss": 24.1849, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.7491682753521994, | |
| "grad_norm": 126.0, | |
| "learning_rate": 5.572333170969314e-06, | |
| "loss": 24.3651, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.7513588258649252, | |
| "grad_norm": 50.84375, | |
| "learning_rate": 5.523623964929372e-06, | |
| "loss": 24.2613, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.753549376377651, | |
| "grad_norm": 74.5625, | |
| "learning_rate": 5.474914758889431e-06, | |
| "loss": 23.998, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.7557399268903766, | |
| "grad_norm": 86.125, | |
| "learning_rate": 5.42620555284949e-06, | |
| "loss": 23.9456, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.7579304774031024, | |
| "grad_norm": 61.09375, | |
| "learning_rate": 5.377496346809547e-06, | |
| "loss": 23.3448, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.7601210279158281, | |
| "grad_norm": 65.125, | |
| "learning_rate": 5.328787140769606e-06, | |
| "loss": 24.1856, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.7623115784285538, | |
| "grad_norm": 54.375, | |
| "learning_rate": 5.280077934729664e-06, | |
| "loss": 23.7022, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.7645021289412796, | |
| "grad_norm": 73.8125, | |
| "learning_rate": 5.231368728689723e-06, | |
| "loss": 24.0991, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.7666926794540053, | |
| "grad_norm": 41.6875, | |
| "learning_rate": 5.182659522649781e-06, | |
| "loss": 23.627, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.768883229966731, | |
| "grad_norm": 95.5625, | |
| "learning_rate": 5.133950316609839e-06, | |
| "loss": 24.0103, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.7710737804794567, | |
| "grad_norm": 58.09375, | |
| "learning_rate": 5.0852411105698985e-06, | |
| "loss": 24.4259, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.7732643309921825, | |
| "grad_norm": 51.1875, | |
| "learning_rate": 5.036531904529957e-06, | |
| "loss": 24.3509, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.7754548815049082, | |
| "grad_norm": 26.265625, | |
| "learning_rate": 4.987822698490015e-06, | |
| "loss": 24.2639, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.7776454320176339, | |
| "grad_norm": 37.03125, | |
| "learning_rate": 4.939113492450074e-06, | |
| "loss": 23.9353, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.7798359825303597, | |
| "grad_norm": 71.6875, | |
| "learning_rate": 4.890404286410132e-06, | |
| "loss": 23.7745, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.7820265330430853, | |
| "grad_norm": 53.875, | |
| "learning_rate": 4.84169508037019e-06, | |
| "loss": 24.0737, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.7842170835558111, | |
| "grad_norm": 62.15625, | |
| "learning_rate": 4.792985874330249e-06, | |
| "loss": 24.3623, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.7864076340685369, | |
| "grad_norm": 46.3125, | |
| "learning_rate": 4.744276668290307e-06, | |
| "loss": 24.1168, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.7885981845812626, | |
| "grad_norm": 49.90625, | |
| "learning_rate": 4.6955674622503655e-06, | |
| "loss": 23.9208, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.7907887350939883, | |
| "grad_norm": 70.0625, | |
| "learning_rate": 4.646858256210424e-06, | |
| "loss": 23.9657, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.792979285606714, | |
| "grad_norm": 273.5, | |
| "learning_rate": 4.598149050170483e-06, | |
| "loss": 23.4883, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.7951698361194398, | |
| "grad_norm": 36.46875, | |
| "learning_rate": 4.549439844130541e-06, | |
| "loss": 24.1732, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.7973603866321655, | |
| "grad_norm": 98.1875, | |
| "learning_rate": 4.5007306380906e-06, | |
| "loss": 24.5668, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.7995509371448912, | |
| "grad_norm": 79.9375, | |
| "learning_rate": 4.452021432050657e-06, | |
| "loss": 23.3709, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.801741487657617, | |
| "grad_norm": 51.5, | |
| "learning_rate": 4.403312226010717e-06, | |
| "loss": 23.4818, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.8039320381703426, | |
| "grad_norm": 72.25, | |
| "learning_rate": 4.354603019970775e-06, | |
| "loss": 23.678, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.8061225886830684, | |
| "grad_norm": 56.21875, | |
| "learning_rate": 4.305893813930833e-06, | |
| "loss": 23.3332, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.8083131391957942, | |
| "grad_norm": 179.25, | |
| "learning_rate": 4.257184607890892e-06, | |
| "loss": 23.4972, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.8105036897085198, | |
| "grad_norm": 105.875, | |
| "learning_rate": 4.20847540185095e-06, | |
| "loss": 23.0243, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.8126942402212456, | |
| "grad_norm": 49.53125, | |
| "learning_rate": 4.1597661958110085e-06, | |
| "loss": 23.926, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.8148847907339714, | |
| "grad_norm": 42.1875, | |
| "learning_rate": 4.111056989771067e-06, | |
| "loss": 24.0595, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.817075341246697, | |
| "grad_norm": 38.0625, | |
| "learning_rate": 4.062347783731125e-06, | |
| "loss": 23.3749, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.8192658917594228, | |
| "grad_norm": 46.21875, | |
| "learning_rate": 4.0136385776911845e-06, | |
| "loss": 23.5141, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.8214564422721485, | |
| "grad_norm": 64.6875, | |
| "learning_rate": 3.964929371651242e-06, | |
| "loss": 22.9993, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.8236469927848743, | |
| "grad_norm": 104.1875, | |
| "learning_rate": 3.916220165611301e-06, | |
| "loss": 23.1683, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.8258375432976, | |
| "grad_norm": 58.84375, | |
| "learning_rate": 3.86751095957136e-06, | |
| "loss": 23.006, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.8280280938103257, | |
| "grad_norm": 47.625, | |
| "learning_rate": 3.818801753531418e-06, | |
| "loss": 23.3635, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.8302186443230515, | |
| "grad_norm": 37.03125, | |
| "learning_rate": 3.7700925474914763e-06, | |
| "loss": 23.9395, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.8324091948357771, | |
| "grad_norm": 40.0625, | |
| "learning_rate": 3.7213833414515347e-06, | |
| "loss": 23.6212, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.8345997453485029, | |
| "grad_norm": 144.875, | |
| "learning_rate": 3.672674135411593e-06, | |
| "loss": 23.753, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.8367902958612287, | |
| "grad_norm": 34.53125, | |
| "learning_rate": 3.6239649293716515e-06, | |
| "loss": 22.3497, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.8389808463739543, | |
| "grad_norm": 44.71875, | |
| "learning_rate": 3.57525572333171e-06, | |
| "loss": 22.9936, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.8411713968866801, | |
| "grad_norm": 42.5625, | |
| "learning_rate": 3.5265465172917687e-06, | |
| "loss": 23.6379, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.8433619473994058, | |
| "grad_norm": 77.8125, | |
| "learning_rate": 3.477837311251827e-06, | |
| "loss": 23.9546, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.8455524979121315, | |
| "grad_norm": 149.375, | |
| "learning_rate": 3.4291281052118854e-06, | |
| "loss": 23.1372, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.8477430484248573, | |
| "grad_norm": 105.0625, | |
| "learning_rate": 3.3804188991719438e-06, | |
| "loss": 23.1796, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.849933598937583, | |
| "grad_norm": 27.484375, | |
| "learning_rate": 3.331709693132002e-06, | |
| "loss": 23.3445, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.8521241494503087, | |
| "grad_norm": 50.78125, | |
| "learning_rate": 3.2830004870920605e-06, | |
| "loss": 23.4366, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.8543146999630344, | |
| "grad_norm": 33.125, | |
| "learning_rate": 3.2342912810521193e-06, | |
| "loss": 23.522, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.8565052504757602, | |
| "grad_norm": 36.71875, | |
| "learning_rate": 3.1855820750121773e-06, | |
| "loss": 23.6913, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.858695800988486, | |
| "grad_norm": 48.59375, | |
| "learning_rate": 3.136872868972236e-06, | |
| "loss": 23.2535, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.8608863515012116, | |
| "grad_norm": 90.1875, | |
| "learning_rate": 3.088163662932294e-06, | |
| "loss": 23.2659, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.8630769020139374, | |
| "grad_norm": 49.5625, | |
| "learning_rate": 3.039454456892353e-06, | |
| "loss": 22.8549, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.8652674525266632, | |
| "grad_norm": 134.625, | |
| "learning_rate": 2.9907452508524117e-06, | |
| "loss": 23.9329, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.8674580030393888, | |
| "grad_norm": 43.875, | |
| "learning_rate": 2.9420360448124696e-06, | |
| "loss": 22.7826, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.8696485535521146, | |
| "grad_norm": 31.65625, | |
| "learning_rate": 2.8933268387725284e-06, | |
| "loss": 23.0787, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.8718391040648403, | |
| "grad_norm": 56.9375, | |
| "learning_rate": 2.8446176327325868e-06, | |
| "loss": 23.0323, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.874029654577566, | |
| "grad_norm": 39.78125, | |
| "learning_rate": 2.795908426692645e-06, | |
| "loss": 23.3068, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.8762202050902917, | |
| "grad_norm": 48.09375, | |
| "learning_rate": 2.7471992206527035e-06, | |
| "loss": 23.4063, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8762202050902917, | |
| "eval_loss": null, | |
| "eval_runtime": 243.6077, | |
| "eval_samples_per_second": 1009.956, | |
| "eval_steps_per_second": 31.563, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8784107556030175, | |
| "grad_norm": 45.9375, | |
| "learning_rate": 2.698490014612762e-06, | |
| "loss": 23.181, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.8806013061157432, | |
| "grad_norm": 40.0625, | |
| "learning_rate": 2.6497808085728203e-06, | |
| "loss": 23.8107, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.8827918566284689, | |
| "grad_norm": 62.84375, | |
| "learning_rate": 2.601071602532879e-06, | |
| "loss": 23.4075, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.8849824071411947, | |
| "grad_norm": 46.90625, | |
| "learning_rate": 2.5523623964929375e-06, | |
| "loss": 22.9923, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.8871729576539203, | |
| "grad_norm": 60.84375, | |
| "learning_rate": 2.503653190452996e-06, | |
| "loss": 23.0107, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.8893635081666461, | |
| "grad_norm": 49.84375, | |
| "learning_rate": 2.4549439844130542e-06, | |
| "loss": 23.4251, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.8915540586793719, | |
| "grad_norm": 51.75, | |
| "learning_rate": 2.4062347783731126e-06, | |
| "loss": 23.4416, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.8937446091920976, | |
| "grad_norm": 41.5, | |
| "learning_rate": 2.357525572333171e-06, | |
| "loss": 23.0098, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.8959351597048233, | |
| "grad_norm": 32.0625, | |
| "learning_rate": 2.3088163662932294e-06, | |
| "loss": 23.1968, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.8981257102175491, | |
| "grad_norm": 53.25, | |
| "learning_rate": 2.260107160253288e-06, | |
| "loss": 22.7051, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.9003162607302748, | |
| "grad_norm": 31.796875, | |
| "learning_rate": 2.2113979542133465e-06, | |
| "loss": 23.8739, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.9025068112430005, | |
| "grad_norm": 27.796875, | |
| "learning_rate": 2.162688748173405e-06, | |
| "loss": 22.8889, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.9046973617557262, | |
| "grad_norm": 31.0, | |
| "learning_rate": 2.1139795421334633e-06, | |
| "loss": 22.6707, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.906887912268452, | |
| "grad_norm": 60.9375, | |
| "learning_rate": 2.065270336093522e-06, | |
| "loss": 23.7109, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.9090784627811777, | |
| "grad_norm": 29.1875, | |
| "learning_rate": 2.0165611300535805e-06, | |
| "loss": 22.7048, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.9112690132939034, | |
| "grad_norm": 38.5625, | |
| "learning_rate": 1.967851924013639e-06, | |
| "loss": 22.6618, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.9134595638066292, | |
| "grad_norm": 54.4375, | |
| "learning_rate": 1.9191427179736972e-06, | |
| "loss": 22.6925, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.9156501143193548, | |
| "grad_norm": 45.40625, | |
| "learning_rate": 1.8704335119337556e-06, | |
| "loss": 22.5756, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.9178406648320806, | |
| "grad_norm": 45.625, | |
| "learning_rate": 1.8217243058938142e-06, | |
| "loss": 22.4134, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.9200312153448064, | |
| "grad_norm": 60.59375, | |
| "learning_rate": 1.7730150998538726e-06, | |
| "loss": 23.2034, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.922221765857532, | |
| "grad_norm": 31.5, | |
| "learning_rate": 1.724305893813931e-06, | |
| "loss": 22.6316, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.9244123163702578, | |
| "grad_norm": 52.9375, | |
| "learning_rate": 1.6755966877739893e-06, | |
| "loss": 22.7477, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.9266028668829835, | |
| "grad_norm": 53.875, | |
| "learning_rate": 1.626887481734048e-06, | |
| "loss": 23.2602, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.9287934173957093, | |
| "grad_norm": 30.921875, | |
| "learning_rate": 1.5781782756941063e-06, | |
| "loss": 22.416, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.930983967908435, | |
| "grad_norm": 35.15625, | |
| "learning_rate": 1.5294690696541647e-06, | |
| "loss": 23.0874, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.9331745184211607, | |
| "grad_norm": 52.78125, | |
| "learning_rate": 1.480759863614223e-06, | |
| "loss": 22.3363, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.9353650689338865, | |
| "grad_norm": 39.8125, | |
| "learning_rate": 1.4320506575742814e-06, | |
| "loss": 22.8326, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.9375556194466121, | |
| "grad_norm": 24.90625, | |
| "learning_rate": 1.3833414515343402e-06, | |
| "loss": 22.8988, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.9397461699593379, | |
| "grad_norm": 30.078125, | |
| "learning_rate": 1.3346322454943986e-06, | |
| "loss": 22.8919, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.9419367204720637, | |
| "grad_norm": 139.25, | |
| "learning_rate": 1.285923039454457e-06, | |
| "loss": 22.809, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.9441272709847893, | |
| "grad_norm": 36.34375, | |
| "learning_rate": 1.2372138334145156e-06, | |
| "loss": 22.6055, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.9463178214975151, | |
| "grad_norm": 27.328125, | |
| "learning_rate": 1.188504627374574e-06, | |
| "loss": 22.1049, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.9485083720102409, | |
| "grad_norm": 37.9375, | |
| "learning_rate": 1.1397954213346323e-06, | |
| "loss": 22.528, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.9506989225229665, | |
| "grad_norm": 45.53125, | |
| "learning_rate": 1.0910862152946907e-06, | |
| "loss": 22.8135, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.9528894730356923, | |
| "grad_norm": 32.40625, | |
| "learning_rate": 1.0423770092547493e-06, | |
| "loss": 22.6803, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.955080023548418, | |
| "grad_norm": 34.875, | |
| "learning_rate": 9.936678032148077e-07, | |
| "loss": 22.2686, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.9572705740611437, | |
| "grad_norm": 34.65625, | |
| "learning_rate": 9.44958597174866e-07, | |
| "loss": 22.7458, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.9594611245738695, | |
| "grad_norm": 34.90625, | |
| "learning_rate": 8.962493911349246e-07, | |
| "loss": 22.261, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.9616516750865952, | |
| "grad_norm": 48.6875, | |
| "learning_rate": 8.47540185094983e-07, | |
| "loss": 23.0004, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.963842225599321, | |
| "grad_norm": 37.3125, | |
| "learning_rate": 7.988309790550415e-07, | |
| "loss": 22.7078, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.9660327761120466, | |
| "grad_norm": 28.53125, | |
| "learning_rate": 7.501217730150999e-07, | |
| "loss": 22.2032, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.9682233266247724, | |
| "grad_norm": 30.078125, | |
| "learning_rate": 7.014125669751585e-07, | |
| "loss": 22.2181, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.9704138771374982, | |
| "grad_norm": 42.03125, | |
| "learning_rate": 6.527033609352168e-07, | |
| "loss": 21.9983, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.9726044276502238, | |
| "grad_norm": 34.40625, | |
| "learning_rate": 6.039941548952752e-07, | |
| "loss": 22.8827, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.9747949781629496, | |
| "grad_norm": 95.125, | |
| "learning_rate": 5.552849488553337e-07, | |
| "loss": 22.6381, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.9769855286756753, | |
| "grad_norm": 41.5625, | |
| "learning_rate": 5.065757428153922e-07, | |
| "loss": 22.6659, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.979176079188401, | |
| "grad_norm": 44.28125, | |
| "learning_rate": 4.578665367754506e-07, | |
| "loss": 21.9627, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.9813666297011268, | |
| "grad_norm": 32.375, | |
| "learning_rate": 4.091573307355091e-07, | |
| "loss": 22.8064, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.9835571802138525, | |
| "grad_norm": 42.65625, | |
| "learning_rate": 3.6044812469556747e-07, | |
| "loss": 22.0932, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.9857477307265782, | |
| "grad_norm": 49.0625, | |
| "learning_rate": 3.1173891865562595e-07, | |
| "loss": 22.53, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.9879382812393039, | |
| "grad_norm": 38.125, | |
| "learning_rate": 2.630297126156844e-07, | |
| "loss": 21.6858, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.9901288317520297, | |
| "grad_norm": 37.78125, | |
| "learning_rate": 2.1432050657574284e-07, | |
| "loss": 22.6313, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.9923193822647554, | |
| "grad_norm": 24.296875, | |
| "learning_rate": 1.6561130053580127e-07, | |
| "loss": 22.1555, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.9945099327774811, | |
| "grad_norm": 22.265625, | |
| "learning_rate": 1.1690209449585972e-07, | |
| "loss": 22.26, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.9967004832902069, | |
| "grad_norm": 24.921875, | |
| "learning_rate": 6.819288845591817e-08, | |
| "loss": 21.7191, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.9988910338029326, | |
| "grad_norm": 56.75, | |
| "learning_rate": 1.9483682415976622e-08, | |
| "loss": 21.334, | |
| "step": 2280 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2282, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.9772879918220706e+19, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |