| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 4358, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 9.394675251276627, | |
| "learning_rate": 4.587155963302753e-08, | |
| "loss": 1.0722, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 11.510146873139346, | |
| "learning_rate": 2.2935779816513764e-07, | |
| "loss": 1.1568, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 8.09186869433803, | |
| "learning_rate": 4.587155963302753e-07, | |
| "loss": 1.1267, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 5.001305949141049, | |
| "learning_rate": 6.880733944954129e-07, | |
| "loss": 1.0408, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 5.089979244080159, | |
| "learning_rate": 9.174311926605506e-07, | |
| "loss": 1.0286, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 3.857643916857598, | |
| "learning_rate": 1.1467889908256882e-06, | |
| "loss": 1.0247, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 3.6352518195110446, | |
| "learning_rate": 1.3761467889908258e-06, | |
| "loss": 0.9997, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 3.498581149423037, | |
| "learning_rate": 1.6055045871559635e-06, | |
| "loss": 0.9847, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 3.337414380712645, | |
| "learning_rate": 1.8348623853211011e-06, | |
| "loss": 0.9918, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 3.5774352168297394, | |
| "learning_rate": 2.064220183486239e-06, | |
| "loss": 1.0183, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 3.472559365553104, | |
| "learning_rate": 2.2935779816513764e-06, | |
| "loss": 1.015, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 3.33817794356789, | |
| "learning_rate": 2.522935779816514e-06, | |
| "loss": 0.9892, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 6.135442418177604, | |
| "learning_rate": 2.7522935779816517e-06, | |
| "loss": 0.9965, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 3.858279898663446, | |
| "learning_rate": 2.981651376146789e-06, | |
| "loss": 0.9898, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 3.435351371137228, | |
| "learning_rate": 3.211009174311927e-06, | |
| "loss": 0.9854, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 3.7508665634033758, | |
| "learning_rate": 3.4403669724770644e-06, | |
| "loss": 1.0167, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 3.3955349867095177, | |
| "learning_rate": 3.6697247706422022e-06, | |
| "loss": 0.9613, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 3.240473789973621, | |
| "learning_rate": 3.89908256880734e-06, | |
| "loss": 0.9584, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 3.585344808953774, | |
| "learning_rate": 4.128440366972478e-06, | |
| "loss": 0.9908, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 3.362297510865859, | |
| "learning_rate": 4.357798165137615e-06, | |
| "loss": 0.9994, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 3.3222849745943717, | |
| "learning_rate": 4.587155963302753e-06, | |
| "loss": 1.0184, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 3.322973143553916, | |
| "learning_rate": 4.816513761467891e-06, | |
| "loss": 0.9319, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.676944381124791, | |
| "learning_rate": 5.045871559633028e-06, | |
| "loss": 0.9762, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.4468638326854797, | |
| "learning_rate": 5.275229357798165e-06, | |
| "loss": 0.9759, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.315867018218443, | |
| "learning_rate": 5.504587155963303e-06, | |
| "loss": 0.9617, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.486244791929344, | |
| "learning_rate": 5.733944954128441e-06, | |
| "loss": 1.0092, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.5300892522492577, | |
| "learning_rate": 5.963302752293578e-06, | |
| "loss": 0.9802, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.1417248587005067, | |
| "learning_rate": 6.192660550458715e-06, | |
| "loss": 0.9852, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.180858225250927, | |
| "learning_rate": 6.422018348623854e-06, | |
| "loss": 0.9823, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.3289726314603283, | |
| "learning_rate": 6.651376146788992e-06, | |
| "loss": 0.9894, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.2711775527420084, | |
| "learning_rate": 6.880733944954129e-06, | |
| "loss": 1.0085, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.224762131634632, | |
| "learning_rate": 7.110091743119267e-06, | |
| "loss": 0.9885, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.2576863695830527, | |
| "learning_rate": 7.3394495412844045e-06, | |
| "loss": 0.9887, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.108725515279286, | |
| "learning_rate": 7.568807339449542e-06, | |
| "loss": 0.9546, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.3107498026119355, | |
| "learning_rate": 7.79816513761468e-06, | |
| "loss": 0.9938, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.3240424189638356, | |
| "learning_rate": 8.027522935779817e-06, | |
| "loss": 1.024, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.154260812846157, | |
| "learning_rate": 8.256880733944956e-06, | |
| "loss": 1.0029, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.441212795928307, | |
| "learning_rate": 8.486238532110093e-06, | |
| "loss": 0.9668, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.6266522820185063, | |
| "learning_rate": 8.71559633027523e-06, | |
| "loss": 0.9973, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.350159055683761, | |
| "learning_rate": 8.944954128440367e-06, | |
| "loss": 1.0421, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.205900107365007, | |
| "learning_rate": 9.174311926605506e-06, | |
| "loss": 0.9982, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.2252222521575464, | |
| "learning_rate": 9.403669724770643e-06, | |
| "loss": 1.0121, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.3039077242433996, | |
| "learning_rate": 9.633027522935781e-06, | |
| "loss": 1.0222, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.196932072104769, | |
| "learning_rate": 9.862385321100918e-06, | |
| "loss": 1.0575, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 4.286375011174814, | |
| "learning_rate": 1.0091743119266055e-05, | |
| "loss": 0.9753, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.0473780635111942, | |
| "learning_rate": 1.0321100917431192e-05, | |
| "loss": 1.0052, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.926738004897812, | |
| "learning_rate": 1.055045871559633e-05, | |
| "loss": 1.0091, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.9780839869679707, | |
| "learning_rate": 1.077981651376147e-05, | |
| "loss": 1.0237, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.371486237167096, | |
| "learning_rate": 1.1009174311926607e-05, | |
| "loss": 1.0224, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.3486037926379106, | |
| "learning_rate": 1.1238532110091744e-05, | |
| "loss": 1.0563, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.231176251781338, | |
| "learning_rate": 1.1467889908256882e-05, | |
| "loss": 1.0541, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.335545232558565, | |
| "learning_rate": 1.169724770642202e-05, | |
| "loss": 1.0375, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.1805921107957467, | |
| "learning_rate": 1.1926605504587156e-05, | |
| "loss": 1.033, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.0269986709638537, | |
| "learning_rate": 1.2155963302752293e-05, | |
| "loss": 1.0143, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.1030715735729024, | |
| "learning_rate": 1.238532110091743e-05, | |
| "loss": 1.0232, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.1577072382180664, | |
| "learning_rate": 1.261467889908257e-05, | |
| "loss": 1.0391, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.032906066233452, | |
| "learning_rate": 1.2844036697247708e-05, | |
| "loss": 1.0034, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.426516468568716, | |
| "learning_rate": 1.3073394495412845e-05, | |
| "loss": 1.0713, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.115183010494265, | |
| "learning_rate": 1.3302752293577984e-05, | |
| "loss": 1.036, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.2288498137146586, | |
| "learning_rate": 1.353211009174312e-05, | |
| "loss": 1.0215, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.223471739538807, | |
| "learning_rate": 1.3761467889908258e-05, | |
| "loss": 1.0256, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.2891011086195214, | |
| "learning_rate": 1.3990825688073395e-05, | |
| "loss": 1.0366, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.0537956353875324, | |
| "learning_rate": 1.4220183486238533e-05, | |
| "loss": 1.0817, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.100613029348784, | |
| "learning_rate": 1.4449541284403672e-05, | |
| "loss": 1.0531, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.127100337039988, | |
| "learning_rate": 1.4678899082568809e-05, | |
| "loss": 1.0594, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.2040550452600325, | |
| "learning_rate": 1.4908256880733946e-05, | |
| "loss": 1.0814, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.164126270067494, | |
| "learning_rate": 1.5137614678899085e-05, | |
| "loss": 1.0609, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.8307375736866796, | |
| "learning_rate": 1.536697247706422e-05, | |
| "loss": 1.0418, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.0304190806703972, | |
| "learning_rate": 1.559633027522936e-05, | |
| "loss": 1.0655, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.1653216968924633, | |
| "learning_rate": 1.5825688073394497e-05, | |
| "loss": 1.037, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.058091371029834, | |
| "learning_rate": 1.6055045871559634e-05, | |
| "loss": 1.0899, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.020116940253991, | |
| "learning_rate": 1.628440366972477e-05, | |
| "loss": 1.0358, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.003561000700209, | |
| "learning_rate": 1.6513761467889912e-05, | |
| "loss": 1.0367, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.030349207340203, | |
| "learning_rate": 1.674311926605505e-05, | |
| "loss": 1.0779, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.972268792440487, | |
| "learning_rate": 1.6972477064220186e-05, | |
| "loss": 1.0587, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.0024168971293586, | |
| "learning_rate": 1.7201834862385323e-05, | |
| "loss": 1.0621, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.204045198122664, | |
| "learning_rate": 1.743119266055046e-05, | |
| "loss": 1.0539, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 2.967217430578547, | |
| "learning_rate": 1.7660550458715597e-05, | |
| "loss": 1.0734, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 2.9810040743388173, | |
| "learning_rate": 1.7889908256880734e-05, | |
| "loss": 1.08, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 2.9561283294791445, | |
| "learning_rate": 1.811926605504587e-05, | |
| "loss": 1.0549, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.103685050292982, | |
| "learning_rate": 1.834862385321101e-05, | |
| "loss": 1.0536, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 2.966374643255888, | |
| "learning_rate": 1.8577981651376148e-05, | |
| "loss": 1.0493, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 2.961623318533173, | |
| "learning_rate": 1.8807339449541285e-05, | |
| "loss": 1.1001, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.213995630508863, | |
| "learning_rate": 1.9036697247706422e-05, | |
| "loss": 1.0964, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.058722713545753, | |
| "learning_rate": 1.9266055045871563e-05, | |
| "loss": 1.0958, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.100037959558587, | |
| "learning_rate": 1.94954128440367e-05, | |
| "loss": 1.0735, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.1066528399698305, | |
| "learning_rate": 1.9724770642201837e-05, | |
| "loss": 1.0932, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 2.962622864501778, | |
| "learning_rate": 1.9954128440366974e-05, | |
| "loss": 1.0906, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.0108264145191432, | |
| "learning_rate": 1.9999948669655127e-05, | |
| "loss": 1.0644, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 2.833061974778976, | |
| "learning_rate": 1.9999740141032216e-05, | |
| "loss": 1.0696, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 2.9158581052830965, | |
| "learning_rate": 1.999937120932709e-05, | |
| "loss": 1.1006, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 2.856147725205616, | |
| "learning_rate": 1.9998841880457682e-05, | |
| "loss": 1.0769, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 2.9755007034045593, | |
| "learning_rate": 1.9998152162914807e-05, | |
| "loss": 1.1161, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.645560434344824, | |
| "learning_rate": 1.9997302067762044e-05, | |
| "loss": 1.1022, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.122685192865999, | |
| "learning_rate": 1.9996291608635527e-05, | |
| "loss": 1.0537, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 2.937474072999667, | |
| "learning_rate": 1.999512080174375e-05, | |
| "loss": 1.0876, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.3759125922583513, | |
| "learning_rate": 1.9993789665867316e-05, | |
| "loss": 1.1046, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.214821660194427, | |
| "learning_rate": 1.9992298222358603e-05, | |
| "loss": 1.1342, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.6555429390099374, | |
| "learning_rate": 1.9990646495141445e-05, | |
| "loss": 1.1175, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 2.9606668287180455, | |
| "learning_rate": 1.9988834510710747e-05, | |
| "loss": 1.0842, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.1350054453428213, | |
| "learning_rate": 1.998686229813205e-05, | |
| "loss": 1.0979, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 2.7934482490231054, | |
| "learning_rate": 1.9984729889041077e-05, | |
| "loss": 1.0637, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 2.91038630187397, | |
| "learning_rate": 1.9982437317643218e-05, | |
| "loss": 1.1089, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.4360032792740673, | |
| "learning_rate": 1.9979984620712972e-05, | |
| "loss": 1.1245, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.073630199634191, | |
| "learning_rate": 1.9977371837593382e-05, | |
| "loss": 1.0963, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.244084086033738, | |
| "learning_rate": 1.9974599010195384e-05, | |
| "loss": 1.1517, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.036785127574316, | |
| "learning_rate": 1.997166618299714e-05, | |
| "loss": 1.1162, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.5966815313979446, | |
| "learning_rate": 1.9968573403043325e-05, | |
| "loss": 1.0828, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 2.85584309172754, | |
| "learning_rate": 1.9965320719944366e-05, | |
| "loss": 1.1187, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.210724272586593, | |
| "learning_rate": 1.9961908185875662e-05, | |
| "loss": 1.1095, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.0107803370726685, | |
| "learning_rate": 1.995833585557674e-05, | |
| "loss": 1.0474, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.084146667029137, | |
| "learning_rate": 1.9954603786350353e-05, | |
| "loss": 1.1063, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.2688781509444476, | |
| "learning_rate": 1.9950712038061617e-05, | |
| "loss": 1.1266, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 680.7081090329712, | |
| "learning_rate": 1.994666067313698e-05, | |
| "loss": 1.1471, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 149.93179306713003, | |
| "learning_rate": 1.994244975656328e-05, | |
| "loss": 1.7807, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 220.01504858608797, | |
| "learning_rate": 1.9938079355886674e-05, | |
| "loss": 6.4289, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 496.48020483148116, | |
| "learning_rate": 1.993354954121155e-05, | |
| "loss": 12.59, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 100.33483837207477, | |
| "learning_rate": 1.992886038519943e-05, | |
| "loss": 10.3831, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 34.991765615273025, | |
| "learning_rate": 1.9924011963067765e-05, | |
| "loss": 8.1883, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 45.90912397238394, | |
| "learning_rate": 1.9919004352588768e-05, | |
| "loss": 7.508, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 25.835640875802444, | |
| "learning_rate": 1.9913837634088143e-05, | |
| "loss": 7.4129, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 15.174156610898672, | |
| "learning_rate": 1.99085118904438e-05, | |
| "loss": 7.3342, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 17.635001034280123, | |
| "learning_rate": 1.9903027207084525e-05, | |
| "loss": 7.2874, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 9.893720942330273, | |
| "learning_rate": 1.989738367198862e-05, | |
| "loss": 7.2536, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 9.867615007061273, | |
| "learning_rate": 1.9891581375682472e-05, | |
| "loss": 7.1948, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 9.030991653289398, | |
| "learning_rate": 1.9885620411239134e-05, | |
| "loss": 7.2219, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 7.379829275629753, | |
| "learning_rate": 1.9879500874276788e-05, | |
| "loss": 7.2081, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 6.130413517671043, | |
| "learning_rate": 1.9873222862957243e-05, | |
| "loss": 7.241, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 7.032182637604816, | |
| "learning_rate": 1.9866786477984357e-05, | |
| "loss": 7.2104, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.450500360030072, | |
| "learning_rate": 1.9860191822602415e-05, | |
| "loss": 7.2306, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 6.241894562599629, | |
| "learning_rate": 1.985343900259446e-05, | |
| "loss": 7.2092, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 7.704992268267875, | |
| "learning_rate": 1.9846528126280632e-05, | |
| "loss": 7.2195, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.892577300152109, | |
| "learning_rate": 1.983945930451639e-05, | |
| "loss": 7.2134, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 7.162244013604885, | |
| "learning_rate": 1.9832232650690765e-05, | |
| "loss": 7.2153, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.49392312570169, | |
| "learning_rate": 1.982484828072452e-05, | |
| "loss": 7.2018, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.954680533596231, | |
| "learning_rate": 1.981730631306831e-05, | |
| "loss": 7.1981, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 7.245712488666381, | |
| "learning_rate": 1.9809606868700755e-05, | |
| "loss": 7.2166, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 6.280016322704388, | |
| "learning_rate": 1.9801750071126536e-05, | |
| "loss": 7.2043, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 6.1226575129071215, | |
| "learning_rate": 1.9793736046374375e-05, | |
| "loss": 7.1994, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.1738890947124965, | |
| "learning_rate": 1.9785564922995042e-05, | |
| "loss": 7.197, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 7.070513738096005, | |
| "learning_rate": 1.977723683205928e-05, | |
| "loss": 7.1694, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 7.1998596365209995, | |
| "learning_rate": 1.9768751907155707e-05, | |
| "loss": 7.2087, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 6.8756525556203885, | |
| "learning_rate": 1.9760110284388667e-05, | |
| "loss": 7.2004, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.673754116753309, | |
| "learning_rate": 1.9751312102376062e-05, | |
| "loss": 7.1969, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.928999080043428, | |
| "learning_rate": 1.9742357502247104e-05, | |
| "loss": 7.1754, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 7.534058043728272, | |
| "learning_rate": 1.9733246627640072e-05, | |
| "loss": 7.2245, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 6.419671206121361, | |
| "learning_rate": 1.9723979624700004e-05, | |
| "loss": 7.1981, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.014238279563543, | |
| "learning_rate": 1.9714556642076347e-05, | |
| "loss": 7.2059, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.4286747899069745, | |
| "learning_rate": 1.970497783092057e-05, | |
| "loss": 7.1769, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.105148382009604, | |
| "learning_rate": 1.969524334488375e-05, | |
| "loss": 7.2066, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.826988284774489, | |
| "learning_rate": 1.9685353340114104e-05, | |
| "loss": 7.1971, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.244080325535858, | |
| "learning_rate": 1.9675307975254478e-05, | |
| "loss": 7.2065, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 7.248352747427355, | |
| "learning_rate": 1.9665107411439805e-05, | |
| "loss": 7.1707, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.693767897081214, | |
| "learning_rate": 1.965475181229453e-05, | |
| "loss": 7.1989, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.256405796849654, | |
| "learning_rate": 1.9644241343929966e-05, | |
| "loss": 7.2026, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.230559774612038, | |
| "learning_rate": 1.963357617494165e-05, | |
| "loss": 7.1968, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.299356891163277, | |
| "learning_rate": 1.9622756476406625e-05, | |
| "loss": 7.2201, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.771781395899692, | |
| "learning_rate": 1.9611782421880702e-05, | |
| "loss": 7.2188, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 4.975609755551546, | |
| "learning_rate": 1.9600654187395666e-05, | |
| "loss": 7.2074, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.486489059003917, | |
| "learning_rate": 1.958937195145647e-05, | |
| "loss": 7.223, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.4870554264978235, | |
| "learning_rate": 1.9577935895038363e-05, | |
| "loss": 7.2093, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.297769552074883, | |
| "learning_rate": 1.9566346201583974e-05, | |
| "loss": 7.1872, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 4.767621827384491, | |
| "learning_rate": 1.9554603057000397e-05, | |
| "loss": 7.1857, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.953451938027194, | |
| "learning_rate": 1.954270664965618e-05, | |
| "loss": 7.1737, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.758676615210085, | |
| "learning_rate": 1.953065717037832e-05, | |
| "loss": 7.1809, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.385168274540292, | |
| "learning_rate": 1.951845481244921e-05, | |
| "loss": 7.1792, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 4.254446787862434, | |
| "learning_rate": 1.9506099771603515e-05, | |
| "loss": 7.2077, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.197281648875432, | |
| "learning_rate": 1.9493592246025047e-05, | |
| "loss": 7.2155, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.78819455170524, | |
| "learning_rate": 1.9480932436343584e-05, | |
| "loss": 7.1863, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.163370463039743, | |
| "learning_rate": 1.9468120545631647e-05, | |
| "loss": 7.2101, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.7662949673961315, | |
| "learning_rate": 1.945515677940127e-05, | |
| "loss": 7.1567, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.75746195424063, | |
| "learning_rate": 1.944204134560064e-05, | |
| "loss": 7.1651, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.382060329721597, | |
| "learning_rate": 1.9428774454610845e-05, | |
| "loss": 7.1916, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 4.893754566211905, | |
| "learning_rate": 1.941535631924242e-05, | |
| "loss": 7.2095, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.477578724305367, | |
| "learning_rate": 1.9401787154731993e-05, | |
| "loss": 7.2044, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.61002124085074, | |
| "learning_rate": 1.9388067178738807e-05, | |
| "loss": 7.195, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.116708741280613, | |
| "learning_rate": 1.9374196611341212e-05, | |
| "loss": 7.1967, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.753967686244243, | |
| "learning_rate": 1.936017567503317e-05, | |
| "loss": 7.199, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 7.364972728350276, | |
| "learning_rate": 1.934600459472067e-05, | |
| "loss": 7.1762, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.603911277491834, | |
| "learning_rate": 1.933168359771811e-05, | |
| "loss": 7.2118, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 7.012396533406363, | |
| "learning_rate": 1.931721291374467e-05, | |
| "loss": 7.2058, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 7.895351473028401, | |
| "learning_rate": 1.9302592774920606e-05, | |
| "loss": 7.1931, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.280257845408824, | |
| "learning_rate": 1.9287823415763552e-05, | |
| "loss": 7.1738, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.876634320902484, | |
| "learning_rate": 1.9272905073184734e-05, | |
| "loss": 7.192, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 4.854212629080888, | |
| "learning_rate": 1.9257837986485187e-05, | |
| "loss": 7.1925, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.092400379079062, | |
| "learning_rate": 1.92426223973519e-05, | |
| "loss": 7.1856, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.428211058950048, | |
| "learning_rate": 1.922725854985396e-05, | |
| "loss": 7.1597, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 4.794758754464533, | |
| "learning_rate": 1.921174669043862e-05, | |
| "loss": 7.2268, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.101883671966147, | |
| "learning_rate": 1.9196087067927348e-05, | |
| "loss": 7.1848, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.317894374914432, | |
| "learning_rate": 1.918027993351185e-05, | |
| "loss": 7.1811, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.305336773894683, | |
| "learning_rate": 1.916432554075002e-05, | |
| "loss": 7.1873, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.6840416735309915, | |
| "learning_rate": 1.9148224145561876e-05, | |
| "loss": 7.1889, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.867312525781805, | |
| "learning_rate": 1.913197600622549e-05, | |
| "loss": 7.2023, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.758609581127356, | |
| "learning_rate": 1.9115581383372782e-05, | |
| "loss": 7.1905, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.244788780284041, | |
| "learning_rate": 1.9099040539985395e-05, | |
| "loss": 7.1896, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 7.35187418176669, | |
| "learning_rate": 1.9082353741390453e-05, | |
| "loss": 7.1811, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.6595340281862825, | |
| "learning_rate": 1.90655212552563e-05, | |
| "loss": 7.1919, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.892032669535677, | |
| "learning_rate": 1.904854335158822e-05, | |
| "loss": 7.1865, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.7552292559003035, | |
| "learning_rate": 1.9031420302724093e-05, | |
| "loss": 7.1996, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.674540158335838, | |
| "learning_rate": 1.901415238333005e-05, | |
| "loss": 7.1851, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.803373360265408, | |
| "learning_rate": 1.8996739870396027e-05, | |
| "loss": 7.2195, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.740149041137212, | |
| "learning_rate": 1.897918304323136e-05, | |
| "loss": 7.186, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.394971774083842, | |
| "learning_rate": 1.896148218346028e-05, | |
| "loss": 7.2, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.8368244052167375, | |
| "learning_rate": 1.8943637575017428e-05, | |
| "loss": 7.1863, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.795222702764058, | |
| "learning_rate": 1.8925649504143244e-05, | |
| "loss": 7.194, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 6.091441424838663, | |
| "learning_rate": 1.890751825937944e-05, | |
| "loss": 7.1919, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.2139746246710965, | |
| "learning_rate": 1.888924413156432e-05, | |
| "loss": 7.1813, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.924868386178008, | |
| "learning_rate": 1.8870827413828148e-05, | |
| "loss": 7.1969, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 4.75305228923696, | |
| "learning_rate": 1.885226840158843e-05, | |
| "loss": 7.2101, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.751123883354145, | |
| "learning_rate": 1.8833567392545177e-05, | |
| "loss": 7.1988, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 7.371173831840808, | |
| "learning_rate": 1.8814724686676133e-05, | |
| "loss": 7.2179, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 6.00599017571554, | |
| "learning_rate": 1.879574058623196e-05, | |
| "loss": 7.1914, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.991137258758085, | |
| "learning_rate": 1.8776615395731398e-05, | |
| "loss": 7.183, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.718123489352958, | |
| "learning_rate": 1.875734942195637e-05, | |
| "loss": 7.1905, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 4.487539169972883, | |
| "learning_rate": 1.8737942973947062e-05, | |
| "loss": 7.1581, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 4.825603371326703, | |
| "learning_rate": 1.8718396362996968e-05, | |
| "loss": 7.1935, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 4.813620283639029, | |
| "learning_rate": 1.8698709902647903e-05, | |
| "loss": 7.1977, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 8.758806033943968, | |
| "learning_rate": 1.8678883908684964e-05, | |
| "loss": 7.1901, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.36268133923744, | |
| "learning_rate": 1.865891869913147e-05, | |
| "loss": 7.1914, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.610339067780085, | |
| "learning_rate": 1.863881459424386e-05, | |
| "loss": 7.1798, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.469361658862883, | |
| "learning_rate": 1.8618571916506548e-05, | |
| "loss": 7.1721, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.07301012439838, | |
| "learning_rate": 1.8598190990626764e-05, | |
| "loss": 7.2065, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.39877570039683, | |
| "learning_rate": 1.8577672143529337e-05, | |
| "loss": 7.1823, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.823362939728546, | |
| "learning_rate": 1.8557015704351453e-05, | |
| "loss": 7.1601, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.353964897246578, | |
| "learning_rate": 1.853622200443737e-05, | |
| "loss": 7.1801, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 4.4888019416686795, | |
| "learning_rate": 1.8515291377333114e-05, | |
| "loss": 7.1615, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 4.737996647818345, | |
| "learning_rate": 1.849422415878112e-05, | |
| "loss": 7.1752, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.655355199762672, | |
| "learning_rate": 1.8473020686714847e-05, | |
| "loss": 7.1897, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 4.905574751971008, | |
| "learning_rate": 1.8451681301253363e-05, | |
| "loss": 7.1759, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.093954229069838, | |
| "learning_rate": 1.8430206344695875e-05, | |
| "loss": 7.1841, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 4.659167952013244, | |
| "learning_rate": 1.840859616151627e-05, | |
| "loss": 7.1793, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 4.779633769093793, | |
| "learning_rate": 1.8386851098357538e-05, | |
| "loss": 7.1827, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.011930861735435, | |
| "learning_rate": 1.8364971504026273e-05, | |
| "loss": 7.1792, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.881425426906034, | |
| "learning_rate": 1.834295772948703e-05, | |
| "loss": 7.1934, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 4.491821561313667, | |
| "learning_rate": 1.8320810127856706e-05, | |
| "loss": 7.1638, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 4.4905503941670535, | |
| "learning_rate": 1.8298529054398896e-05, | |
| "loss": 7.1787, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.456686168415449, | |
| "learning_rate": 1.827611486651817e-05, | |
| "loss": 7.1807, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 4.7472408032814695, | |
| "learning_rate": 1.8253567923754353e-05, | |
| "loss": 7.2154, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.260242429793549, | |
| "learning_rate": 1.8230888587776758e-05, | |
| "loss": 7.2009, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 4.459555242885236, | |
| "learning_rate": 1.8208077222378376e-05, | |
| "loss": 7.1827, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.311364125445347, | |
| "learning_rate": 1.8185134193470043e-05, | |
| "loss": 7.1902, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 8.45135390718489, | |
| "learning_rate": 1.8162059869074586e-05, | |
| "loss": 7.1864, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.379082505010177, | |
| "learning_rate": 1.8138854619320893e-05, | |
| "loss": 7.2273, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.710277796266043, | |
| "learning_rate": 1.8115518816437997e-05, | |
| "loss": 7.1802, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.500870680883128, | |
| "learning_rate": 1.8092052834749094e-05, | |
| "loss": 7.1981, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.202612921478623, | |
| "learning_rate": 1.8068457050665547e-05, | |
| "loss": 7.2037, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.334951680536002, | |
| "learning_rate": 1.804473184268084e-05, | |
| "loss": 7.2078, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.668688696015915, | |
| "learning_rate": 1.8020877591364508e-05, | |
| "loss": 7.1816, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.76363061015334, | |
| "learning_rate": 1.799689467935604e-05, | |
| "loss": 7.1904, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.299305529851326, | |
| "learning_rate": 1.797278349135874e-05, | |
| "loss": 7.2004, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 6.0714518763544225, | |
| "learning_rate": 1.7948544414133534e-05, | |
| "loss": 7.2004, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.397050722956672, | |
| "learning_rate": 1.7924177836492802e-05, | |
| "loss": 7.1913, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 7.384985978864621, | |
| "learning_rate": 1.7899684149294118e-05, | |
| "loss": 7.2051, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 6.435771900748507, | |
| "learning_rate": 1.7875063745433978e-05, | |
| "loss": 7.1817, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.075431695444233, | |
| "learning_rate": 1.7850317019841514e-05, | |
| "loss": 7.2229, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.750020994304407, | |
| "learning_rate": 1.7825444369472147e-05, | |
| "loss": 7.2127, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.765962718023732, | |
| "learning_rate": 1.7800446193301225e-05, | |
| "loss": 7.2135, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.801689882588788, | |
| "learning_rate": 1.7775322892317618e-05, | |
| "loss": 7.2023, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.012853900353026, | |
| "learning_rate": 1.7750074869517285e-05, | |
| "loss": 7.1841, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.146195314914873, | |
| "learning_rate": 1.7724702529896824e-05, | |
| "loss": 7.2267, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.3192085523839205, | |
| "learning_rate": 1.7699206280446955e-05, | |
| "loss": 7.1775, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.5101183654984816, | |
| "learning_rate": 1.767358653014601e-05, | |
| "loss": 7.2029, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 6.5468845839854914, | |
| "learning_rate": 1.7647843689953352e-05, | |
| "loss": 7.1753, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.353192953649322, | |
| "learning_rate": 1.762197817280281e-05, | |
| "loss": 7.1881, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.6727420241772, | |
| "learning_rate": 1.759599039359603e-05, | |
| "loss": 7.1746, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 6.204254264607091, | |
| "learning_rate": 1.756988076919583e-05, | |
| "loss": 7.1543, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.416954900150789, | |
| "learning_rate": 1.754364971841952e-05, | |
| "loss": 7.2003, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.866999572748804, | |
| "learning_rate": 1.7517297662032174e-05, | |
| "loss": 7.1931, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.7422281580185714, | |
| "learning_rate": 1.749082502273988e-05, | |
| "loss": 7.1866, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.574328843512533, | |
| "learning_rate": 1.746423222518297e-05, | |
| "loss": 7.209, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.825095531858083, | |
| "learning_rate": 1.7437519695929194e-05, | |
| "loss": 7.2021, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.918401678159191, | |
| "learning_rate": 1.741068786346689e-05, | |
| "loss": 7.1856, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.7129421004109515, | |
| "learning_rate": 1.738373715819811e-05, | |
| "loss": 7.1646, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 6.2682617034576635, | |
| "learning_rate": 1.7356668012431705e-05, | |
| "loss": 7.1869, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 6.142810873086463, | |
| "learning_rate": 1.7329480860376392e-05, | |
| "loss": 7.1795, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.7006273967413215, | |
| "learning_rate": 1.7302176138133814e-05, | |
| "loss": 7.211, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.497329345480043, | |
| "learning_rate": 1.7274754283691507e-05, | |
| "loss": 7.1711, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.806714944962353, | |
| "learning_rate": 1.72472157369159e-05, | |
| "loss": 7.1923, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 6.801596277714087, | |
| "learning_rate": 1.7219560939545246e-05, | |
| "loss": 7.1905, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.996882387174238, | |
| "learning_rate": 1.719179033518255e-05, | |
| "loss": 7.1942, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.829570844242962, | |
| "learning_rate": 1.7163904369288443e-05, | |
| "loss": 7.1832, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.477705999486753, | |
| "learning_rate": 1.7135903489174034e-05, | |
| "loss": 7.1766, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.267188678316321, | |
| "learning_rate": 1.710778814399374e-05, | |
| "loss": 7.1899, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.064274909871023, | |
| "learning_rate": 1.7079558784738092e-05, | |
| "loss": 7.2137, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.290438730448353, | |
| "learning_rate": 1.705121586422647e-05, | |
| "loss": 7.201, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.517582652147351, | |
| "learning_rate": 1.702275983709987e-05, | |
| "loss": 7.178, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.324522216215293, | |
| "learning_rate": 1.699419115981361e-05, | |
| "loss": 7.1811, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.4511667927982215, | |
| "learning_rate": 1.6965510290629973e-05, | |
| "loss": 7.1675, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.273917433416757, | |
| "learning_rate": 1.69367176896109e-05, | |
| "loss": 7.2079, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 4.543337661243557, | |
| "learning_rate": 1.6907813818610597e-05, | |
| "loss": 7.1508, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.433592856571139, | |
| "learning_rate": 1.6878799141268107e-05, | |
| "loss": 7.1795, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.031774153730769, | |
| "learning_rate": 1.6849674122999878e-05, | |
| "loss": 7.1793, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.455052489494696, | |
| "learning_rate": 1.682043923099234e-05, | |
| "loss": 7.1835, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 4.523617138804165, | |
| "learning_rate": 1.679109493419435e-05, | |
| "loss": 7.1809, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.187074166481253, | |
| "learning_rate": 1.6761641703309702e-05, | |
| "loss": 7.151, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.86249092476398, | |
| "learning_rate": 1.673208001078958e-05, | |
| "loss": 7.193, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.567170673390032, | |
| "learning_rate": 1.6702410330824962e-05, | |
| "loss": 7.179, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.073442019585416, | |
| "learning_rate": 1.6672633139339028e-05, | |
| "loss": 7.1656, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 3.9925808755541996, | |
| "learning_rate": 1.6642748913979515e-05, | |
| "loss": 7.18, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 4.80371655505946, | |
| "learning_rate": 1.6612758134111072e-05, | |
| "loss": 7.1768, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 4.733455824267269, | |
| "learning_rate": 1.6582661280807553e-05, | |
| "loss": 7.2038, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 3.906745836511784, | |
| "learning_rate": 1.65524588368443e-05, | |
| "loss": 7.1664, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.163199284772482, | |
| "learning_rate": 1.652215128669042e-05, | |
| "loss": 7.2011, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 3.9325541368096313, | |
| "learning_rate": 1.649173911650099e-05, | |
| "loss": 7.1661, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.541114208005493, | |
| "learning_rate": 1.646122281410927e-05, | |
| "loss": 7.1731, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.645120765156564, | |
| "learning_rate": 1.6430602869018867e-05, | |
| "loss": 7.1854, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.396492917895077, | |
| "learning_rate": 1.6399879772395915e-05, | |
| "loss": 7.1975, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 6.111332313811058, | |
| "learning_rate": 1.636905401706116e-05, | |
| "loss": 7.1962, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.5879994028450355, | |
| "learning_rate": 1.633812609748206e-05, | |
| "loss": 7.1896, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.777276796655454, | |
| "learning_rate": 1.630709650976487e-05, | |
| "loss": 7.196, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.754696932989834, | |
| "learning_rate": 1.6275965751646682e-05, | |
| "loss": 7.1952, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.820867978838945, | |
| "learning_rate": 1.6244734322487415e-05, | |
| "loss": 7.1951, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.5062148240565385, | |
| "learning_rate": 1.6213402723261852e-05, | |
| "loss": 7.1925, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.9221473358752, | |
| "learning_rate": 1.618197145655155e-05, | |
| "loss": 7.1882, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.248482149727314, | |
| "learning_rate": 1.6150441026536827e-05, | |
| "loss": 7.163, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.521139746786196, | |
| "learning_rate": 1.6118811938988632e-05, | |
| "loss": 7.1897, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.793529660386469, | |
| "learning_rate": 1.6087084701260468e-05, | |
| "loss": 7.1675, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.630271784366099, | |
| "learning_rate": 1.605525982228023e-05, | |
| "loss": 7.171, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.653150385236314, | |
| "learning_rate": 1.6023337812542048e-05, | |
| "loss": 7.1867, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 6.004405747433293, | |
| "learning_rate": 1.5991319184098107e-05, | |
| "loss": 7.1813, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.924373425919494, | |
| "learning_rate": 1.5959204450550427e-05, | |
| "loss": 7.1775, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 7.753697903529501, | |
| "learning_rate": 1.5926994127042615e-05, | |
| "loss": 7.1672, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 8.078702081068387, | |
| "learning_rate": 1.5894688730251613e-05, | |
| "loss": 7.1701, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 9.526882240137281, | |
| "learning_rate": 1.586228877837941e-05, | |
| "loss": 7.1323, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 37.28886157765147, | |
| "learning_rate": 1.5829794791144723e-05, | |
| "loss": 7.1004, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 23.093005264330223, | |
| "learning_rate": 1.5797207289774668e-05, | |
| "loss": 7.1948, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 25.898784884168748, | |
| "learning_rate": 1.57645267969964e-05, | |
| "loss": 7.1653, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 16.78438950960542, | |
| "learning_rate": 1.5731753837028714e-05, | |
| "loss": 7.1468, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 10.923555549438724, | |
| "learning_rate": 1.569888893557365e-05, | |
| "loss": 7.0813, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 11.108288539909235, | |
| "learning_rate": 1.5665932619808058e-05, | |
| "loss": 7.0424, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 15.199836700972632, | |
| "learning_rate": 1.5632885418375136e-05, | |
| "loss": 6.9435, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 10.04303401418099, | |
| "learning_rate": 1.5599747861375957e-05, | |
| "loss": 6.9432, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 6.925107402391229, | |
| "learning_rate": 1.556652048036096e-05, | |
| "loss": 6.8624, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 13.70186301929785, | |
| "learning_rate": 1.553320380832143e-05, | |
| "loss": 6.8157, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 15.620537966762095, | |
| "learning_rate": 1.549979837968094e-05, | |
| "loss": 6.7753, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 30.677693169182618, | |
| "learning_rate": 1.5466304730286795e-05, | |
| "loss": 6.794, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 7.848469368296769, | |
| "learning_rate": 1.5432723397401406e-05, | |
| "loss": 6.7671, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 21.469195766575073, | |
| "learning_rate": 1.5399054919693704e-05, | |
| "loss": 6.7119, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 24.46255165124564, | |
| "learning_rate": 1.5365299837230483e-05, | |
| "loss": 6.6899, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 23.20384615490851, | |
| "learning_rate": 1.5331458691467742e-05, | |
| "loss": 6.6424, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 18.350112389930576, | |
| "learning_rate": 1.5297532025241993e-05, | |
| "loss": 6.6069, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 35.95084330385222, | |
| "learning_rate": 1.5263520382761563e-05, | |
| "loss": 6.5677, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 32.90819956258818, | |
| "learning_rate": 1.5229424309597853e-05, | |
| "loss": 6.5251, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 54.76562189780166, | |
| "learning_rate": 1.5195244352676606e-05, | |
| "loss": 6.4826, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 12.591984595179603, | |
| "learning_rate": 1.5160981060269107e-05, | |
| "loss": 6.5287, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 10.351716266476027, | |
| "learning_rate": 1.5126634981983412e-05, | |
| "loss": 6.4656, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 12.622397404252, | |
| "learning_rate": 1.5092206668755518e-05, | |
| "loss": 6.3774, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 23.45116611899055, | |
| "learning_rate": 1.5057696672840529e-05, | |
| "loss": 6.4034, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 40.24642870474456, | |
| "learning_rate": 1.5023105547803807e-05, | |
| "loss": 6.3587, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 42.78142739794163, | |
| "learning_rate": 1.4988433848512074e-05, | |
| "loss": 6.3162, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 33.07779044777228, | |
| "learning_rate": 1.4953682131124527e-05, | |
| "loss": 6.2552, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 16.884418478781473, | |
| "learning_rate": 1.491885095308391e-05, | |
| "loss": 6.1878, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 26.06314374849514, | |
| "learning_rate": 1.4883940873107572e-05, | |
| "loss": 6.2067, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 11.772139032290678, | |
| "learning_rate": 1.4848952451178508e-05, | |
| "loss": 6.1506, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 7.890512493835399, | |
| "learning_rate": 1.4813886248536376e-05, | |
| "loss": 6.1331, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 12.62470607783592, | |
| "learning_rate": 1.4778742827668484e-05, | |
| "loss": 6.1142, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 36.700960091806486, | |
| "learning_rate": 1.4743522752300793e-05, | |
| "loss": 6.0802, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 14.397456689103558, | |
| "learning_rate": 1.4708226587388845e-05, | |
| "loss": 6.0312, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 33.258017170458196, | |
| "learning_rate": 1.467285489910872e-05, | |
| "loss": 6.0318, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 22.65861713891252, | |
| "learning_rate": 1.4637408254847936e-05, | |
| "loss": 6.0082, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 27.453970567083232, | |
| "learning_rate": 1.4601887223196374e-05, | |
| "loss": 5.9184, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 22.483790124784434, | |
| "learning_rate": 1.4566292373937133e-05, | |
| "loss": 5.9385, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 76.714301112878, | |
| "learning_rate": 1.4530624278037406e-05, | |
| "loss": 5.8839, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 60.99442830394419, | |
| "learning_rate": 1.449488350763931e-05, | |
| "loss": 5.9291, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 43.48487974907191, | |
| "learning_rate": 1.4459070636050721e-05, | |
| "loss": 5.9295, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 8.849205696409507, | |
| "learning_rate": 1.4423186237736063e-05, | |
| "loss": 5.8609, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 46.120560612475195, | |
| "learning_rate": 1.4387230888307098e-05, | |
| "loss": 5.8535, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 42.42359692143847, | |
| "learning_rate": 1.4351205164513708e-05, | |
| "loss": 5.8279, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 33.64892053133189, | |
| "learning_rate": 1.4315109644234619e-05, | |
| "loss": 5.8832, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 44.342036592354745, | |
| "learning_rate": 1.427894490646815e-05, | |
| "loss": 5.7869, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 23.531884493857213, | |
| "learning_rate": 1.4242711531322912e-05, | |
| "loss": 5.8184, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 24.495321259837898, | |
| "learning_rate": 1.420641010000852e-05, | |
| "loss": 5.7591, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 101.90422975423697, | |
| "learning_rate": 1.4170041194826247e-05, | |
| "loss": 5.8044, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 63.98708014495446, | |
| "learning_rate": 1.4133605399159706e-05, | |
| "loss": 5.9446, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 29.38341129380048, | |
| "learning_rate": 1.4097103297465471e-05, | |
| "loss": 5.9626, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 16.457857993310515, | |
| "learning_rate": 1.4060535475263725e-05, | |
| "loss": 5.8796, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 12.75715712434224, | |
| "learning_rate": 1.402390251912885e-05, | |
| "loss": 5.8067, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 10.553879277739714, | |
| "learning_rate": 1.398720501668002e-05, | |
| "loss": 5.791, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 23.985007630134017, | |
| "learning_rate": 1.395044355657178e-05, | |
| "loss": 5.736, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 20.71153720384459, | |
| "learning_rate": 1.391361872848461e-05, | |
| "loss": 5.7062, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 33.58186355970371, | |
| "learning_rate": 1.387673112311545e-05, | |
| "loss": 5.7455, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 24.602274943269077, | |
| "learning_rate": 1.3839781332168236e-05, | |
| "loss": 5.6321, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 18.305365670645493, | |
| "learning_rate": 1.3802769948344406e-05, | |
| "loss": 5.6455, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 17.656269054544428, | |
| "learning_rate": 1.3765697565333387e-05, | |
| "loss": 5.6137, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 33.06252808092646, | |
| "learning_rate": 1.3728564777803089e-05, | |
| "loss": 5.6283, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 7.31153267089378, | |
| "learning_rate": 1.369137218139034e-05, | |
| "loss": 5.6687, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 43.46939760510257, | |
| "learning_rate": 1.3654120372691361e-05, | |
| "loss": 5.6522, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 40.352268702600746, | |
| "learning_rate": 1.3616809949252168e-05, | |
| "loss": 5.6521, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 14.07491035131935, | |
| "learning_rate": 1.3579441509559007e-05, | |
| "loss": 5.6476, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 13.1869662531745, | |
| "learning_rate": 1.3542015653028742e-05, | |
| "loss": 5.5999, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 12.602728660576666, | |
| "learning_rate": 1.350453297999925e-05, | |
| "loss": 5.5798, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 47.72655669632253, | |
| "learning_rate": 1.3466994091719782e-05, | |
| "loss": 5.6063, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 44.8093903764745, | |
| "learning_rate": 1.3429399590341325e-05, | |
| "loss": 5.604, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 18.97308595224727, | |
| "learning_rate": 1.3391750078906939e-05, | |
| "loss": 5.5722, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 85.6251743171489, | |
| "learning_rate": 1.3354046161342087e-05, | |
| "loss": 5.5877, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 30.512861408284476, | |
| "learning_rate": 1.3316288442444943e-05, | |
| "loss": 5.5643, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 12.905340157899301, | |
| "learning_rate": 1.327847752787669e-05, | |
| "loss": 5.5623, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 60.35647636456591, | |
| "learning_rate": 1.324061402415182e-05, | |
| "loss": 5.5357, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 28.424225727617344, | |
| "learning_rate": 1.3202698538628376e-05, | |
| "loss": 5.5233, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 153.36892036409608, | |
| "learning_rate": 1.3164731679498249e-05, | |
| "loss": 5.4883, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 15.941356320454116, | |
| "learning_rate": 1.3126714055777378e-05, | |
| "loss": 5.551, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 53.360743928106146, | |
| "learning_rate": 1.3088646277296018e-05, | |
| "loss": 5.5101, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 22.283754442776264, | |
| "learning_rate": 1.3050528954688932e-05, | |
| "loss": 5.4968, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 15.309834032348661, | |
| "learning_rate": 1.3012362699385616e-05, | |
| "loss": 5.4641, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 48.765379913872955, | |
| "learning_rate": 1.2974148123600477e-05, | |
| "loss": 5.4745, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 85.68051399317197, | |
| "learning_rate": 1.2935885840323015e-05, | |
| "loss": 5.532, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 33.710633120635386, | |
| "learning_rate": 1.2897576463307999e-05, | |
| "loss": 5.4799, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 34.47592415932075, | |
| "learning_rate": 1.285922060706561e-05, | |
| "loss": 5.482, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 14.767073605394202, | |
| "learning_rate": 1.2820818886851599e-05, | |
| "loss": 5.4112, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 12.482712560989532, | |
| "learning_rate": 1.2782371918657393e-05, | |
| "loss": 5.3771, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 41.50415361625991, | |
| "learning_rate": 1.2743880319200241e-05, | |
| "loss": 5.3874, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 31.642237047280826, | |
| "learning_rate": 1.270534470591331e-05, | |
| "loss": 5.3966, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 69.19319134724441, | |
| "learning_rate": 1.2666765696935773e-05, | |
| "loss": 5.3924, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 32.008395804279004, | |
| "learning_rate": 1.2628143911102905e-05, | |
| "loss": 5.4084, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 50.15983811581157, | |
| "learning_rate": 1.2589479967936163e-05, | |
| "loss": 5.382, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 13.619109989883537, | |
| "learning_rate": 1.2550774487633218e-05, | |
| "loss": 5.3693, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 84.80172491530355, | |
| "learning_rate": 1.2512028091058044e-05, | |
| "loss": 5.3354, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 116.07832106775594, | |
| "learning_rate": 1.2473241399730931e-05, | |
| "loss": 5.3473, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 26.694652075068255, | |
| "learning_rate": 1.2434415035818535e-05, | |
| "loss": 5.345, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 54.00503230741141, | |
| "learning_rate": 1.239554962212388e-05, | |
| "loss": 5.3973, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 10.543680083461279, | |
| "learning_rate": 1.2356645782076384e-05, | |
| "loss": 5.3688, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 65.51859427381903, | |
| "learning_rate": 1.2317704139721847e-05, | |
| "loss": 5.3773, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 29.71675462869479, | |
| "learning_rate": 1.2278725319712449e-05, | |
| "loss": 5.2786, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 33.01336130546269, | |
| "learning_rate": 1.2239709947296722e-05, | |
| "loss": 5.311, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 29.973987092234548, | |
| "learning_rate": 1.2200658648309531e-05, | |
| "loss": 5.2992, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 48.926488754680314, | |
| "learning_rate": 1.2161572049162027e-05, | |
| "loss": 5.2774, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 8.5731820792718, | |
| "learning_rate": 1.2122450776831593e-05, | |
| "loss": 5.2921, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 54.271928916848765, | |
| "learning_rate": 1.208329545885181e-05, | |
| "loss": 5.2721, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 58.51752529939886, | |
| "learning_rate": 1.2044106723302364e-05, | |
| "loss": 5.3084, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 33.27476309879864, | |
| "learning_rate": 1.200488519879899e-05, | |
| "loss": 5.2501, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 25.846871549849688, | |
| "learning_rate": 1.1965631514483376e-05, | |
| "loss": 5.273, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 29.71630100350262, | |
| "learning_rate": 1.1926346300013078e-05, | |
| "loss": 5.1903, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 48.29209358595899, | |
| "learning_rate": 1.1887030185551427e-05, | |
| "loss": 5.202, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 57.498341779085, | |
| "learning_rate": 1.18476838017574e-05, | |
| "loss": 5.2558, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 37.88134720461833, | |
| "learning_rate": 1.1808307779775518e-05, | |
| "loss": 5.2759, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 21.238832228632518, | |
| "learning_rate": 1.176890275122573e-05, | |
| "loss": 5.2207, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 58.74754679184001, | |
| "learning_rate": 1.1729469348193263e-05, | |
| "loss": 5.1915, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 85.34069836046139, | |
| "learning_rate": 1.1690008203218493e-05, | |
| "loss": 5.2966, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 35.44463556250631, | |
| "learning_rate": 1.1650519949286797e-05, | |
| "loss": 5.2205, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 29.508279045032964, | |
| "learning_rate": 1.1611005219818392e-05, | |
| "loss": 5.2509, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 19.983013642914806, | |
| "learning_rate": 1.1571464648658201e-05, | |
| "loss": 5.2294, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 51.50574440943992, | |
| "learning_rate": 1.1531898870065645e-05, | |
| "loss": 5.1938, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 59.492851827921314, | |
| "learning_rate": 1.1492308518704507e-05, | |
| "loss": 5.1673, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 40.117703874194646, | |
| "learning_rate": 1.145269422963272e-05, | |
| "loss": 5.1442, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 43.459311512165996, | |
| "learning_rate": 1.1413056638292215e-05, | |
| "loss": 5.1993, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 82.49562635086012, | |
| "learning_rate": 1.1373396380498683e-05, | |
| "loss": 5.1647, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 49.800451164925974, | |
| "learning_rate": 1.1333714092431423e-05, | |
| "loss": 5.194, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 25.30211289206568, | |
| "learning_rate": 1.1294010410623107e-05, | |
| "loss": 5.1499, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 77.40197466561355, | |
| "learning_rate": 1.1254285971949574e-05, | |
| "loss": 5.1234, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 25.94865795704941, | |
| "learning_rate": 1.1214541413619628e-05, | |
| "loss": 5.1313, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 42.470163548722276, | |
| "learning_rate": 1.1174777373164797e-05, | |
| "loss": 5.0979, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 52.3446908357727, | |
| "learning_rate": 1.1134994488429128e-05, | |
| "loss": 5.1355, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 40.38483541097707, | |
| "learning_rate": 1.109519339755893e-05, | |
| "loss": 5.1091, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 73.05590392589481, | |
| "learning_rate": 1.1055374738992561e-05, | |
| "loss": 5.094, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 14.70864089128146, | |
| "learning_rate": 1.1015539151450172e-05, | |
| "loss": 5.1089, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 126.77678907405712, | |
| "learning_rate": 1.0975687273923474e-05, | |
| "loss": 5.1169, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 116.95168890571357, | |
| "learning_rate": 1.0935819745665477e-05, | |
| "loss": 5.137, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 16.051304830755644, | |
| "learning_rate": 1.0895937206180243e-05, | |
| "loss": 5.0797, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 22.43120059083249, | |
| "learning_rate": 1.0856040295212614e-05, | |
| "loss": 5.0401, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 39.29902824176953, | |
| "learning_rate": 1.0816129652737976e-05, | |
| "loss": 5.0754, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 48.77985418941213, | |
| "learning_rate": 1.077620591895197e-05, | |
| "loss": 5.0088, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 28.967042464927275, | |
| "learning_rate": 1.0736269734260232e-05, | |
| "loss": 5.0327, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 35.80838537119951, | |
| "learning_rate": 1.069632173926812e-05, | |
| "loss": 4.949, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 25.37744948872279, | |
| "learning_rate": 1.0656362574770442e-05, | |
| "loss": 5.0487, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 27.443743147851325, | |
| "learning_rate": 1.0616392881741166e-05, | |
| "loss": 5.0757, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 95.45635298424027, | |
| "learning_rate": 1.0576413301323148e-05, | |
| "loss": 5.0677, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 47.6117313918869, | |
| "learning_rate": 1.0536424474817848e-05, | |
| "loss": 4.9705, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 39.12748920114918, | |
| "learning_rate": 1.0496427043675032e-05, | |
| "loss": 5.0286, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 73.58917778375972, | |
| "learning_rate": 1.0456421649482502e-05, | |
| "loss": 4.9928, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 78.45734276993822, | |
| "learning_rate": 1.041640893395578e-05, | |
| "loss": 5.0972, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 25.26009599076755, | |
| "learning_rate": 1.0376389538927841e-05, | |
| "loss": 5.0298, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 70.6590336000904, | |
| "learning_rate": 1.0336364106338793e-05, | |
| "loss": 4.9628, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 107.78270188957804, | |
| "learning_rate": 1.0296333278225599e-05, | |
| "loss": 5.0169, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 52.33879582194398, | |
| "learning_rate": 1.0256297696711764e-05, | |
| "loss": 5.0315, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 16.249102954138092, | |
| "learning_rate": 1.0216258003997044e-05, | |
| "loss": 4.9982, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 20.332719936580876, | |
| "learning_rate": 1.0176214842347143e-05, | |
| "loss": 4.9946, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 37.984031001896334, | |
| "learning_rate": 1.0136168854083401e-05, | |
| "loss": 4.9295, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 53.098834473437336, | |
| "learning_rate": 1.0096120681572513e-05, | |
| "loss": 4.9064, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 54.783283517303545, | |
| "learning_rate": 1.0056070967216199e-05, | |
| "loss": 4.9895, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 37.5165014648596, | |
| "learning_rate": 1.0016020353440916e-05, | |
| "loss": 4.9422, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 108.68042109667304, | |
| "learning_rate": 9.975969482687547e-06, | |
| "loss": 4.9495, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 123.58611812164843, | |
| "learning_rate": 9.935918997401104e-06, | |
| "loss": 4.9624, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 76.39873130451743, | |
| "learning_rate": 9.8958695400204e-06, | |
| "loss": 4.9523, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 61.8471682011305, | |
| "learning_rate": 9.855821752967779e-06, | |
| "loss": 4.9636, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 59.995751706401286, | |
| "learning_rate": 9.815776278638772e-06, | |
| "loss": 4.9458, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 16.402048254533458, | |
| "learning_rate": 9.775733759391833e-06, | |
| "loss": 4.9456, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 28.336679722259976, | |
| "learning_rate": 9.735694837537993e-06, | |
| "loss": 4.9485, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 34.684944838819, | |
| "learning_rate": 9.695660155330598e-06, | |
| "loss": 4.8956, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 55.40359426382184, | |
| "learning_rate": 9.655630354954974e-06, | |
| "loss": 4.9379, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 56.22243606993078, | |
| "learning_rate": 9.615606078518143e-06, | |
| "loss": 4.8888, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 25.444922627514334, | |
| "learning_rate": 9.57558796803852e-06, | |
| "loss": 4.9219, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 27.49053795893979, | |
| "learning_rate": 9.535576665435606e-06, | |
| "loss": 4.9364, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 23.530923406419333, | |
| "learning_rate": 9.495572812519718e-06, | |
| "loss": 4.8681, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 49.62532394537909, | |
| "learning_rate": 9.455577050981648e-06, | |
| "loss": 4.8465, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 38.36145744939352, | |
| "learning_rate": 9.41559002238242e-06, | |
| "loss": 4.8363, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 60.0717352423416, | |
| "learning_rate": 9.375612368142962e-06, | |
| "loss": 4.8311, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 80.43091159408323, | |
| "learning_rate": 9.33564472953383e-06, | |
| "loss": 4.856, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 157.04490281080777, | |
| "learning_rate": 9.295687747664935e-06, | |
| "loss": 4.9268, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 40.77389952062912, | |
| "learning_rate": 9.255742063475228e-06, | |
| "loss": 4.8845, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 50.41517786447708, | |
| "learning_rate": 9.215808317722453e-06, | |
| "loss": 4.8417, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 43.470119721373855, | |
| "learning_rate": 9.175887150972841e-06, | |
| "loss": 4.8295, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 38.52488378294851, | |
| "learning_rate": 9.135979203590852e-06, | |
| "loss": 4.7927, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 50.05829822932659, | |
| "learning_rate": 9.096085115728902e-06, | |
| "loss": 4.7938, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 32.417062147957665, | |
| "learning_rate": 9.056205527317082e-06, | |
| "loss": 4.7832, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 43.17389049870212, | |
| "learning_rate": 9.016341078052908e-06, | |
| "loss": 4.8322, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 26.175168734109757, | |
| "learning_rate": 8.976492407391046e-06, | |
| "loss": 4.7375, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 54.56821168706554, | |
| "learning_rate": 8.93666015453307e-06, | |
| "loss": 4.777, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 55.92901066668165, | |
| "learning_rate": 8.89684495841719e-06, | |
| "loss": 4.8629, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 60.84437729594054, | |
| "learning_rate": 8.857047457708023e-06, | |
| "loss": 4.7472, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 66.07551312053982, | |
| "learning_rate": 8.817268290786343e-06, | |
| "loss": 4.8064, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 70.80552970949772, | |
| "learning_rate": 8.777508095738818e-06, | |
| "loss": 4.7755, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 40.034281163404245, | |
| "learning_rate": 8.737767510347816e-06, | |
| "loss": 4.7675, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 43.61238525728124, | |
| "learning_rate": 8.698047172081129e-06, | |
| "loss": 4.7917, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 70.59672678835062, | |
| "learning_rate": 8.658347718081791e-06, | |
| "loss": 4.7439, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 66.1516485301477, | |
| "learning_rate": 8.618669785157825e-06, | |
| "loss": 4.7205, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 51.425818625655715, | |
| "learning_rate": 8.579014009772045e-06, | |
| "loss": 4.765, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 59.5563139018077, | |
| "learning_rate": 8.539381028031838e-06, | |
| "loss": 4.7086, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 32.02533818205619, | |
| "learning_rate": 8.499771475678968e-06, | |
| "loss": 4.7159, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 28.169693520409528, | |
| "learning_rate": 8.46018598807938e-06, | |
| "loss": 4.781, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 33.43326529222529, | |
| "learning_rate": 8.420625200212985e-06, | |
| "loss": 4.7727, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 15.602721631920888, | |
| "learning_rate": 8.381089746663517e-06, | |
| "loss": 4.7277, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 75.75678646235137, | |
| "learning_rate": 8.341580261608305e-06, | |
| "loss": 4.7178, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 105.35921413917552, | |
| "learning_rate": 8.302097378808147e-06, | |
| "loss": 4.7169, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 66.6503863002048, | |
| "learning_rate": 8.262641731597097e-06, | |
| "loss": 4.7065, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 63.36937965279217, | |
| "learning_rate": 8.223213952872353e-06, | |
| "loss": 4.7571, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 42.26449627514292, | |
| "learning_rate": 8.183814675084074e-06, | |
| "loss": 4.7193, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 51.922201070153356, | |
| "learning_rate": 8.144444530225237e-06, | |
| "loss": 4.645, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 49.62760310535778, | |
| "learning_rate": 8.105104149821515e-06, | |
| "loss": 4.6761, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 26.063474264685297, | |
| "learning_rate": 8.065794164921128e-06, | |
| "loss": 4.7211, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 37.10041174063637, | |
| "learning_rate": 8.026515206084744e-06, | |
| "loss": 4.62, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 49.537074028126945, | |
| "learning_rate": 7.987267903375331e-06, | |
| "loss": 4.6471, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 51.18992061136639, | |
| "learning_rate": 7.948052886348091e-06, | |
| "loss": 4.7218, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 32.615492742378834, | |
| "learning_rate": 7.90887078404033e-06, | |
| "loss": 4.6906, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 31.099865231660658, | |
| "learning_rate": 7.869722224961372e-06, | |
| "loss": 4.6481, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 56.24729430957337, | |
| "learning_rate": 7.830607837082494e-06, | |
| "loss": 4.5412, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 53.552077180701694, | |
| "learning_rate": 7.791528247826832e-06, | |
| "loss": 4.6727, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 22.552847832781552, | |
| "learning_rate": 7.75248408405934e-06, | |
| "loss": 4.6075, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 25.173048725283913, | |
| "learning_rate": 7.71347597207671e-06, | |
| "loss": 4.6629, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 23.941386790396614, | |
| "learning_rate": 7.674504537597336e-06, | |
| "loss": 4.6419, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 97.73934134607612, | |
| "learning_rate": 7.635570405751297e-06, | |
| "loss": 4.686, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 25.939426037429264, | |
| "learning_rate": 7.596674201070282e-06, | |
| "loss": 4.6312, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 60.83860372254808, | |
| "learning_rate": 7.557816547477627e-06, | |
| "loss": 4.6386, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 32.30676478489584, | |
| "learning_rate": 7.518998068278266e-06, | |
| "loss": 4.613, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 25.044495875697613, | |
| "learning_rate": 7.480219386148751e-06, | |
| "loss": 4.5508, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 43.24371720695532, | |
| "learning_rate": 7.441481123127257e-06, | |
| "loss": 4.5489, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 12.562426692181319, | |
| "learning_rate": 7.402783900603612e-06, | |
| "loss": 4.6438, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 60.56989492512174, | |
| "learning_rate": 7.364128339309326e-06, | |
| "loss": 4.532, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 26.419914483143693, | |
| "learning_rate": 7.325515059307622e-06, | |
| "loss": 4.5474, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 64.0140334222756, | |
| "learning_rate": 7.286944679983521e-06, | |
| "loss": 4.5868, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 47.227122182136696, | |
| "learning_rate": 7.248417820033857e-06, | |
| "loss": 4.4863, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 57.003929679910804, | |
| "learning_rate": 7.209935097457412e-06, | |
| "loss": 4.5547, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 51.97090726817012, | |
| "learning_rate": 7.171497129544946e-06, | |
| "loss": 4.5544, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 87.12591293798738, | |
| "learning_rate": 7.133104532869342e-06, | |
| "loss": 4.4572, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 31.837006106829726, | |
| "learning_rate": 7.094757923275688e-06, | |
| "loss": 4.4516, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 34.74652280757694, | |
| "learning_rate": 7.056457915871399e-06, | |
| "loss": 4.4672, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 51.35076516856966, | |
| "learning_rate": 7.018205125016369e-06, | |
| "loss": 4.479, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 63.95419645820714, | |
| "learning_rate": 6.980000164313093e-06, | |
| "loss": 4.5476, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 64.70406060026058, | |
| "learning_rate": 6.9418436465968485e-06, | |
| "loss": 4.5368, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 33.66827494802027, | |
| "learning_rate": 6.903736183925835e-06, | |
| "loss": 4.5201, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 52.74134921214354, | |
| "learning_rate": 6.865678387571394e-06, | |
| "loss": 4.4905, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 56.22271055622349, | |
| "learning_rate": 6.82767086800817e-06, | |
| "loss": 4.4965, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 16.41040693265605, | |
| "learning_rate": 6.789714234904332e-06, | |
| "loss": 4.4832, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 60.85653173977498, | |
| "learning_rate": 6.751809097111799e-06, | |
| "loss": 4.3844, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 32.72687745774018, | |
| "learning_rate": 6.71395606265646e-06, | |
| "loss": 4.494, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 24.316547206805122, | |
| "learning_rate": 6.676155738728438e-06, | |
| "loss": 4.4608, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 14.434036241184234, | |
| "learning_rate": 6.638408731672332e-06, | |
| "loss": 4.4666, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 57.148441922309786, | |
| "learning_rate": 6.600715646977503e-06, | |
| "loss": 4.4279, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 27.612312611508564, | |
| "learning_rate": 6.5630770892683656e-06, | |
| "loss": 4.3871, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 46.055770557265205, | |
| "learning_rate": 6.525493662294669e-06, | |
| "loss": 4.3828, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 29.944780931656958, | |
| "learning_rate": 6.487965968921834e-06, | |
| "loss": 4.3734, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 65.19612839352436, | |
| "learning_rate": 6.450494611121274e-06, | |
| "loss": 4.3356, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 29.427807906606667, | |
| "learning_rate": 6.413080189960734e-06, | |
| "loss": 4.4448, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 34.62611381334959, | |
| "learning_rate": 6.375723305594658e-06, | |
| "loss": 4.3736, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 40.05866733756267, | |
| "learning_rate": 6.338424557254556e-06, | |
| "loss": 4.3007, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 29.52996151229796, | |
| "learning_rate": 6.301184543239398e-06, | |
| "loss": 4.3379, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 53.268001034947524, | |
| "learning_rate": 6.264003860906003e-06, | |
| "loss": 4.3931, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 54.62261873319705, | |
| "learning_rate": 6.2268831066594846e-06, | |
| "loss": 4.3074, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 126.40837022827374, | |
| "learning_rate": 6.189822875943644e-06, | |
| "loss": 4.3585, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 38.42244306123947, | |
| "learning_rate": 6.152823763231463e-06, | |
| "loss": 4.4187, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 99.40712122912547, | |
| "learning_rate": 6.115886362015525e-06, | |
| "loss": 4.3485, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 29.73588763253472, | |
| "learning_rate": 6.079011264798534e-06, | |
| "loss": 4.4134, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 44.79201001634174, | |
| "learning_rate": 6.042199063083787e-06, | |
| "loss": 4.3128, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 16.491851726212843, | |
| "learning_rate": 6.005450347365687e-06, | |
| "loss": 4.2906, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 54.87856940808512, | |
| "learning_rate": 5.96876570712028e-06, | |
| "loss": 4.2281, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 79.43830358158179, | |
| "learning_rate": 5.932145730795793e-06, | |
| "loss": 4.3322, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 10.817241852028406, | |
| "learning_rate": 5.895591005803198e-06, | |
| "loss": 4.2711, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 35.67244995828527, | |
| "learning_rate": 5.859102118506787e-06, | |
| "loss": 4.2798, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 37.49555978702204, | |
| "learning_rate": 5.822679654214771e-06, | |
| "loss": 4.3644, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 34.7133878312333, | |
| "learning_rate": 5.786324197169887e-06, | |
| "loss": 4.3002, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 44.151270816410126, | |
| "learning_rate": 5.7500363305400185e-06, | |
| "loss": 4.3286, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 17.03079214584477, | |
| "learning_rate": 5.713816636408871e-06, | |
| "loss": 4.2349, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 24.552884846518282, | |
| "learning_rate": 5.677665695766581e-06, | |
| "loss": 4.2901, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 33.95441883738904, | |
| "learning_rate": 5.641584088500461e-06, | |
| "loss": 4.2871, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 25.835754131711642, | |
| "learning_rate": 5.605572393385645e-06, | |
| "loss": 4.265, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 25.26568170761081, | |
| "learning_rate": 5.569631188075842e-06, | |
| "loss": 4.2861, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 76.32391957126073, | |
| "learning_rate": 5.5337610490940375e-06, | |
| "loss": 4.2465, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 28.611274776347827, | |
| "learning_rate": 5.497962551823266e-06, | |
| "loss": 4.2638, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 51.74402041961238, | |
| "learning_rate": 5.46223627049739e-06, | |
| "loss": 4.2331, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 31.717225193208684, | |
| "learning_rate": 5.426582778191858e-06, | |
| "loss": 4.3613, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 93.29031808462936, | |
| "learning_rate": 5.3910026468145384e-06, | |
| "loss": 4.2825, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 45.06093242733675, | |
| "learning_rate": 5.355496447096533e-06, | |
| "loss": 4.1915, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 143.69932721172492, | |
| "learning_rate": 5.320064748583031e-06, | |
| "loss": 4.2229, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 43.33436292395085, | |
| "learning_rate": 5.284708119624173e-06, | |
| "loss": 4.1983, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 34.00278112862677, | |
| "learning_rate": 5.249427127365918e-06, | |
| "loss": 4.24, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 47.614893220448685, | |
| "learning_rate": 5.2142223377409616e-06, | |
| "loss": 4.2645, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 35.06663560378835, | |
| "learning_rate": 5.179094315459652e-06, | |
| "loss": 4.2547, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 20.809033630860146, | |
| "learning_rate": 5.144043624000944e-06, | |
| "loss": 4.2138, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 57.39876741653422, | |
| "learning_rate": 5.109070825603338e-06, | |
| "loss": 4.213, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 26.21823422312812, | |
| "learning_rate": 5.074176481255873e-06, | |
| "loss": 4.1925, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 39.3403157676951, | |
| "learning_rate": 5.039361150689141e-06, | |
| "loss": 4.2599, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 39.47336093394705, | |
| "learning_rate": 5.00462539236628e-06, | |
| "loss": 4.1208, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 52.22125643489011, | |
| "learning_rate": 4.969969763474047e-06, | |
| "loss": 4.1573, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 54.28036221168733, | |
| "learning_rate": 4.935394819913849e-06, | |
| "loss": 4.1955, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 34.034655711045716, | |
| "learning_rate": 4.900901116292854e-06, | |
| "loss": 4.1996, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 26.78872189890714, | |
| "learning_rate": 4.866489205915072e-06, | |
| "loss": 4.1856, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 16.312287518234115, | |
| "learning_rate": 4.8321596407725044e-06, | |
| "loss": 4.1166, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 75.08013865287577, | |
| "learning_rate": 4.7979129715362625e-06, | |
| "loss": 4.0856, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 12.006364091554866, | |
| "learning_rate": 4.7637497475477465e-06, | |
| "loss": 4.1962, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 60.3078722361271, | |
| "learning_rate": 4.72967051680985e-06, | |
| "loss": 4.1743, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 71.3931741313261, | |
| "learning_rate": 4.695675825978133e-06, | |
| "loss": 4.2264, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 39.88478916067746, | |
| "learning_rate": 4.661766220352098e-06, | |
| "loss": 4.1791, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 35.51853711087642, | |
| "learning_rate": 4.627942243866387e-06, | |
| "loss": 4.2068, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 22.525777126158957, | |
| "learning_rate": 4.594204439082122e-06, | |
| "loss": 4.1823, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 27.12535016689027, | |
| "learning_rate": 4.560553347178144e-06, | |
| "loss": 4.1541, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 30.924051240195272, | |
| "learning_rate": 4.526989507942374e-06, | |
| "loss": 4.1083, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 36.007531222594395, | |
| "learning_rate": 4.493513459763126e-06, | |
| "loss": 4.1531, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 43.057060831713464, | |
| "learning_rate": 4.460125739620479e-06, | |
| "loss": 4.0741, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 55.48363364948151, | |
| "learning_rate": 4.426826883077681e-06, | |
| "loss": 4.1667, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 35.8318271641625, | |
| "learning_rate": 4.393617424272527e-06, | |
| "loss": 4.1549, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 23.77098245342959, | |
| "learning_rate": 4.360497895908826e-06, | |
| "loss": 4.1396, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 47.72018152839063, | |
| "learning_rate": 4.3274688292478105e-06, | |
| "loss": 4.0997, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 62.64419565990156, | |
| "learning_rate": 4.294530754099666e-06, | |
| "loss": 4.1044, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 115.91048946848494, | |
| "learning_rate": 4.261684198815004e-06, | |
| "loss": 4.0457, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 51.14718657604795, | |
| "learning_rate": 4.228929690276381e-06, | |
| "loss": 4.0961, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 43.71547478412355, | |
| "learning_rate": 4.196267753889864e-06, | |
| "loss": 4.1202, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 24.62288935078393, | |
| "learning_rate": 4.163698913576592e-06, | |
| "loss": 4.1129, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 20.18023214978946, | |
| "learning_rate": 4.131223691764384e-06, | |
| "loss": 4.0219, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 18.01338344676861, | |
| "learning_rate": 4.098842609379339e-06, | |
| "loss": 4.1014, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 27.60045755810515, | |
| "learning_rate": 4.066556185837494e-06, | |
| "loss": 4.1146, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 34.42048003123422, | |
| "learning_rate": 4.0343649390365e-06, | |
| "loss": 4.0762, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 20.689902728976875, | |
| "learning_rate": 4.002269385347289e-06, | |
| "loss": 4.0448, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 18.015958502412772, | |
| "learning_rate": 3.970270039605818e-06, | |
| "loss": 4.0524, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 61.6572445957151, | |
| "learning_rate": 3.9383674151047936e-06, | |
| "loss": 4.0754, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 58.461465621421034, | |
| "learning_rate": 3.906562023585442e-06, | |
| "loss": 4.051, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 31.812316184769323, | |
| "learning_rate": 3.8748543752293e-06, | |
| "loss": 4.0391, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 62.678768499001514, | |
| "learning_rate": 3.843244978650045e-06, | |
| "loss": 4.0376, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 28.498015835842963, | |
| "learning_rate": 3.8117343408853124e-06, | |
| "loss": 4.1165, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 35.579180059381116, | |
| "learning_rate": 3.780322967388577e-06, | |
| "loss": 4.0979, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 43.80592325623231, | |
| "learning_rate": 3.7490113620210487e-06, | |
| "loss": 3.9952, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 69.85816894896105, | |
| "learning_rate": 3.7178000270435765e-06, | |
| "loss": 3.9794, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 83.09539466736378, | |
| "learning_rate": 3.686689463108608e-06, | |
| "loss": 4.0066, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 29.653561320118907, | |
| "learning_rate": 3.6556801692521426e-06, | |
| "loss": 4.0893, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 44.601159546521934, | |
| "learning_rate": 3.6247726428857344e-06, | |
| "loss": 3.9974, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 32.63133900722214, | |
| "learning_rate": 3.593967379788522e-06, | |
| "loss": 4.0271, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 26.804136313740308, | |
| "learning_rate": 3.563264874099258e-06, | |
| "loss": 4.0592, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 57.97164352032171, | |
| "learning_rate": 3.532665618308395e-06, | |
| "loss": 3.9575, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 30.365309058990356, | |
| "learning_rate": 3.5021701032501777e-06, | |
| "loss": 3.943, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 19.20476555535661, | |
| "learning_rate": 3.4717788180947855e-06, | |
| "loss": 4.0183, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 26.969291231079545, | |
| "learning_rate": 3.441492250340461e-06, | |
| "loss": 3.943, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 53.27848011595771, | |
| "learning_rate": 3.4113108858057175e-06, | |
| "loss": 3.9395, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 23.697016529967343, | |
| "learning_rate": 3.3812352086215216e-06, | |
| "loss": 3.9381, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 23.821110733096624, | |
| "learning_rate": 3.3512657012235396e-06, | |
| "loss": 3.9144, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 14.6960350856719, | |
| "learning_rate": 3.3214028443444034e-06, | |
| "loss": 3.9815, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 38.22586864203478, | |
| "learning_rate": 3.2916471170059895e-06, | |
| "loss": 4.0093, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 51.93090441245013, | |
| "learning_rate": 3.261998996511736e-06, | |
| "loss": 3.971, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 21.215271536556212, | |
| "learning_rate": 3.232458958438992e-06, | |
| "loss": 3.9256, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 27.686900367908216, | |
| "learning_rate": 3.203027476631386e-06, | |
| "loss": 3.9097, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 22.1101543095489, | |
| "learning_rate": 3.1737050231912324e-06, | |
| "loss": 4.0827, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 21.295283181859492, | |
| "learning_rate": 3.1444920684719394e-06, | |
| "loss": 3.896, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 21.99467485644529, | |
| "learning_rate": 3.115389081070481e-06, | |
| "loss": 3.9685, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 46.127703111002745, | |
| "learning_rate": 3.086396527819876e-06, | |
| "loss": 3.9347, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 65.73981490894823, | |
| "learning_rate": 3.057514873781703e-06, | |
| "loss": 3.992, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 47.02561208426134, | |
| "learning_rate": 3.028744582238633e-06, | |
| "loss": 3.9291, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 37.63324176122822, | |
| "learning_rate": 3.0000861146869963e-06, | |
| "loss": 3.9341, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 35.919928715936734, | |
| "learning_rate": 2.9715399308294003e-06, | |
| "loss": 3.9403, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 26.76480814686508, | |
| "learning_rate": 2.9431064885673245e-06, | |
| "loss": 3.9465, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 29.416416160949314, | |
| "learning_rate": 2.914786243993808e-06, | |
| "loss": 3.8873, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 37.14000936405318, | |
| "learning_rate": 2.8865796513860933e-06, | |
| "loss": 3.8889, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 29.815072807879385, | |
| "learning_rate": 2.858487163198389e-06, | |
| "loss": 3.9574, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 62.26541335752987, | |
| "learning_rate": 2.8305092300545668e-06, | |
| "loss": 3.9163, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 58.06457655612948, | |
| "learning_rate": 2.8026463007409665e-06, | |
| "loss": 3.8697, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 45.73491570077404, | |
| "learning_rate": 2.7748988221991722e-06, | |
| "loss": 3.9373, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 36.275458403222174, | |
| "learning_rate": 2.747267239518857e-06, | |
| "loss": 3.9232, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 22.988083070741016, | |
| "learning_rate": 2.719751995930645e-06, | |
| "loss": 3.9188, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 22.974384854653206, | |
| "learning_rate": 2.6923535327989925e-06, | |
| "loss": 3.8638, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 45.882590739178596, | |
| "learning_rate": 2.6650722896151126e-06, | |
| "loss": 3.8769, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 40.954221331076866, | |
| "learning_rate": 2.637908703989924e-06, | |
| "loss": 3.9264, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 26.599677518965485, | |
| "learning_rate": 2.610863211647038e-06, | |
| "loss": 3.9088, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 35.47565296693497, | |
| "learning_rate": 2.5839362464157635e-06, | |
| "loss": 3.8627, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 41.40869117005486, | |
| "learning_rate": 2.5571282402241435e-06, | |
| "loss": 3.9094, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 68.17036804468498, | |
| "learning_rate": 2.5304396230920346e-06, | |
| "loss": 3.8402, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 83.47999334447974, | |
| "learning_rate": 2.5038708231242047e-06, | |
| "loss": 3.9403, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 77.05079977066599, | |
| "learning_rate": 2.477422266503473e-06, | |
| "loss": 3.9137, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 51.46036104014942, | |
| "learning_rate": 2.4510943774838624e-06, | |
| "loss": 3.8816, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 27.50749097944802, | |
| "learning_rate": 2.424887578383799e-06, | |
| "loss": 3.84, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 41.66172111681471, | |
| "learning_rate": 2.398802289579347e-06, | |
| "loss": 3.7918, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 80.68457553134964, | |
| "learning_rate": 2.3728389294974472e-06, | |
| "loss": 3.8675, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 33.59208488462572, | |
| "learning_rate": 2.346997914609226e-06, | |
| "loss": 3.8922, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 64.96350685792753, | |
| "learning_rate": 2.3212796594232947e-06, | |
| "loss": 3.9088, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 20.84613398845108, | |
| "learning_rate": 2.2956845764791126e-06, | |
| "loss": 3.8694, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 79.71883116991208, | |
| "learning_rate": 2.2702130763403674e-06, | |
| "loss": 3.8558, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 16.048059898233294, | |
| "learning_rate": 2.2448655675883936e-06, | |
| "loss": 3.8667, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 28.03725607393679, | |
| "learning_rate": 2.2196424568156073e-06, | |
| "loss": 3.8559, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 18.840441075178965, | |
| "learning_rate": 2.1945441486189913e-06, | |
| "loss": 3.7797, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 40.18702021213058, | |
| "learning_rate": 2.1695710455936115e-06, | |
| "loss": 3.8923, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 21.072274094013498, | |
| "learning_rate": 2.144723548326142e-06, | |
| "loss": 3.8318, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 34.134477250167194, | |
| "learning_rate": 2.1200020553884603e-06, | |
| "loss": 3.8564, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 27.2459014612492, | |
| "learning_rate": 2.095406963331236e-06, | |
| "loss": 3.8176, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 31.566520170408914, | |
| "learning_rate": 2.0709386666775732e-06, | |
| "loss": 3.8081, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 26.095568886047694, | |
| "learning_rate": 2.0465975579166984e-06, | |
| "loss": 3.8181, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 38.14381147775237, | |
| "learning_rate": 2.0223840274976413e-06, | |
| "loss": 3.8871, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 21.22373392273956, | |
| "learning_rate": 1.998298463822986e-06, | |
| "loss": 3.8263, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 12.56697575734541, | |
| "learning_rate": 1.9743412532426355e-06, | |
| "loss": 3.7559, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 29.10671316471521, | |
| "learning_rate": 1.950512780047622e-06, | |
| "loss": 3.8685, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 32.741627262783176, | |
| "learning_rate": 1.9268134264639273e-06, | |
| "loss": 3.7997, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 30.45945628820104, | |
| "learning_rate": 1.9032435726463716e-06, | |
| "loss": 3.8634, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 22.91093812019858, | |
| "learning_rate": 1.879803596672497e-06, | |
| "loss": 3.8075, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 47.862363303838954, | |
| "learning_rate": 1.8564938745365102e-06, | |
| "loss": 3.7731, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 33.53396034332934, | |
| "learning_rate": 1.8333147801432616e-06, | |
| "loss": 3.8076, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 42.040944658368346, | |
| "learning_rate": 1.8102666853022277e-06, | |
| "loss": 3.8322, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 21.193540791343914, | |
| "learning_rate": 1.7873499597215604e-06, | |
| "loss": 3.8067, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 44.81510993536675, | |
| "learning_rate": 1.7645649710021528e-06, | |
| "loss": 3.8462, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 29.535086551021763, | |
| "learning_rate": 1.7419120846317462e-06, | |
| "loss": 3.8056, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 25.498349063798265, | |
| "learning_rate": 1.7193916639790665e-06, | |
| "loss": 3.7899, | |
| "step": 3615 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 51.21765240200761, | |
| "learning_rate": 1.697004070287982e-06, | |
| "loss": 3.8017, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 19.225579683734967, | |
| "learning_rate": 1.6747496626717318e-06, | |
| "loss": 3.7372, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 12.71969214880765, | |
| "learning_rate": 1.6526287981071477e-06, | |
| "loss": 3.737, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 44.04789051079506, | |
| "learning_rate": 1.6306418314289408e-06, | |
| "loss": 3.7432, | |
| "step": 3635 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 22.156761731139095, | |
| "learning_rate": 1.6087891153239932e-06, | |
| "loss": 3.7768, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 15.43891391835237, | |
| "learning_rate": 1.5870710003257162e-06, | |
| "loss": 3.7451, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 31.42896775673814, | |
| "learning_rate": 1.5654878348084246e-06, | |
| "loss": 3.7385, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 27.228741759625965, | |
| "learning_rate": 1.5440399649817384e-06, | |
| "loss": 3.7595, | |
| "step": 3655 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 71.63638200049408, | |
| "learning_rate": 1.5227277348850466e-06, | |
| "loss": 3.7062, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 26.887275059592724, | |
| "learning_rate": 1.5015514863819625e-06, | |
| "loss": 3.8185, | |
| "step": 3665 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 19.83325501228405, | |
| "learning_rate": 1.4805115591548746e-06, | |
| "loss": 3.8578, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 34.539575677278755, | |
| "learning_rate": 1.4596082906994658e-06, | |
| "loss": 3.8065, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 33.170185299027224, | |
| "learning_rate": 1.4388420163193217e-06, | |
| "loss": 3.7483, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 27.730066097249708, | |
| "learning_rate": 1.4182130691205399e-06, | |
| "loss": 3.7441, | |
| "step": 3685 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 33.489727448755154, | |
| "learning_rate": 1.3977217800063847e-06, | |
| "loss": 3.798, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 48.01255191546678, | |
| "learning_rate": 1.3773684776719987e-06, | |
| "loss": 3.7754, | |
| "step": 3695 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 41.97717842787009, | |
| "learning_rate": 1.3571534885991044e-06, | |
| "loss": 3.7466, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 36.296648212146444, | |
| "learning_rate": 1.337077137050784e-06, | |
| "loss": 3.7657, | |
| "step": 3705 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 41.91557775464321, | |
| "learning_rate": 1.3171397450662716e-06, | |
| "loss": 3.7902, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 73.28373291496773, | |
| "learning_rate": 1.297341632455793e-06, | |
| "loss": 3.7137, | |
| "step": 3715 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 27.703907254747342, | |
| "learning_rate": 1.2776831167954252e-06, | |
| "loss": 3.7574, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 32.47665767602999, | |
| "learning_rate": 1.258164513422019e-06, | |
| "loss": 3.6842, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 30.127478496239906, | |
| "learning_rate": 1.2387861354281194e-06, | |
| "loss": 3.7497, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 30.31251538683249, | |
| "learning_rate": 1.2195482936569603e-06, | |
| "loss": 3.7801, | |
| "step": 3735 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 32.52496481302236, | |
| "learning_rate": 1.2004512966974746e-06, | |
| "loss": 3.7157, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 14.156403859014825, | |
| "learning_rate": 1.1814954508793397e-06, | |
| "loss": 3.839, | |
| "step": 3745 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 37.50877570394944, | |
| "learning_rate": 1.162681060268065e-06, | |
| "loss": 3.6964, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 19.32986922764744, | |
| "learning_rate": 1.1440084266601148e-06, | |
| "loss": 3.7188, | |
| "step": 3755 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 24.332267876030233, | |
| "learning_rate": 1.1254778495780749e-06, | |
| "loss": 3.7324, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 34.29097555764843, | |
| "learning_rate": 1.1070896262658381e-06, | |
| "loss": 3.7136, | |
| "step": 3765 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 20.828700764112394, | |
| "learning_rate": 1.0888440516838373e-06, | |
| "loss": 3.7861, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 16.25551955958299, | |
| "learning_rate": 1.0707414185043163e-06, | |
| "loss": 3.7257, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 17.428505907748793, | |
| "learning_rate": 1.0527820171066372e-06, | |
| "loss": 3.7063, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 16.776980287582877, | |
| "learning_rate": 1.0349661355726215e-06, | |
| "loss": 3.7172, | |
| "step": 3785 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 22.39618908121105, | |
| "learning_rate": 1.0172940596819258e-06, | |
| "loss": 3.7102, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 29.720064640396235, | |
| "learning_rate": 9.997660729074587e-07, | |
| "loss": 3.7362, | |
| "step": 3795 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 12.610115583045804, | |
| "learning_rate": 9.823824564108408e-07, | |
| "loss": 3.7097, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 15.909574598713629, | |
| "learning_rate": 9.651434890378797e-07, | |
| "loss": 3.6483, | |
| "step": 3805 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 12.590177297776139, | |
| "learning_rate": 9.480494473141189e-07, | |
| "loss": 3.755, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 34.813242896296885, | |
| "learning_rate": 9.311006054403726e-07, | |
| "loss": 3.7565, | |
| "step": 3815 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 25.00551994408005, | |
| "learning_rate": 9.142972352883595e-07, | |
| "loss": 3.7124, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 27.98697623414369, | |
| "learning_rate": 8.976396063963156e-07, | |
| "loss": 3.7042, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 17.034734259958352, | |
| "learning_rate": 8.811279859646915e-07, | |
| "loss": 3.7073, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 13.422751386569267, | |
| "learning_rate": 8.647626388518471e-07, | |
| "loss": 3.7712, | |
| "step": 3835 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 24.8158518349583, | |
| "learning_rate": 8.485438275698154e-07, | |
| "loss": 3.7182, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 18.715838846810584, | |
| "learning_rate": 8.324718122800912e-07, | |
| "loss": 3.6951, | |
| "step": 3845 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 13.452940566527365, | |
| "learning_rate": 8.165468507894514e-07, | |
| "loss": 3.6549, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 13.545934881206449, | |
| "learning_rate": 8.007691985458277e-07, | |
| "loss": 3.6982, | |
| "step": 3855 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 14.27044438801209, | |
| "learning_rate": 7.851391086341953e-07, | |
| "loss": 3.7319, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 26.361556662611267, | |
| "learning_rate": 7.696568317725339e-07, | |
| "loss": 3.6546, | |
| "step": 3865 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 20.180688580230548, | |
| "learning_rate": 7.543226163077899e-07, | |
| "loss": 3.6958, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 19.613411785549815, | |
| "learning_rate": 7.391367082118961e-07, | |
| "loss": 3.7838, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 11.201677788887183, | |
| "learning_rate": 7.240993510778304e-07, | |
| "loss": 3.7625, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 18.496564500858582, | |
| "learning_rate": 7.092107861157004e-07, | |
| "loss": 3.6805, | |
| "step": 3885 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 13.038218490522087, | |
| "learning_rate": 6.944712521488884e-07, | |
| "loss": 3.7393, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 27.280200290755396, | |
| "learning_rate": 6.798809856102028e-07, | |
| "loss": 3.7157, | |
| "step": 3895 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 15.2881947610183, | |
| "learning_rate": 6.654402205380961e-07, | |
| "loss": 3.6811, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 11.770606575689413, | |
| "learning_rate": 6.511491885729149e-07, | |
| "loss": 3.7428, | |
| "step": 3905 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 22.301488201013317, | |
| "learning_rate": 6.370081189531707e-07, | |
| "loss": 3.6475, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 21.077284580886506, | |
| "learning_rate": 6.230172385118738e-07, | |
| "loss": 3.6893, | |
| "step": 3915 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 15.076688760938024, | |
| "learning_rate": 6.091767716728924e-07, | |
| "loss": 3.5956, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 19.018811518390564, | |
| "learning_rate": 5.954869404473473e-07, | |
| "loss": 3.691, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 20.79504311040266, | |
| "learning_rate": 5.819479644300563e-07, | |
| "loss": 3.6939, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 14.766741254863161, | |
| "learning_rate": 5.685600607960129e-07, | |
| "loss": 3.5967, | |
| "step": 3935 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 21.241474366469944, | |
| "learning_rate": 5.553234442969014e-07, | |
| "loss": 3.6332, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 16.355235705781315, | |
| "learning_rate": 5.422383272576426e-07, | |
| "loss": 3.7295, | |
| "step": 3945 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 16.264682212634607, | |
| "learning_rate": 5.293049195730038e-07, | |
| "loss": 3.6247, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 12.47936237691352, | |
| "learning_rate": 5.165234287042198e-07, | |
| "loss": 3.6133, | |
| "step": 3955 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 13.306179294534777, | |
| "learning_rate": 5.038940596756747e-07, | |
| "loss": 3.6881, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 16.391206536288802, | |
| "learning_rate": 4.914170150716024e-07, | |
| "loss": 3.6579, | |
| "step": 3965 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 14.242791211418306, | |
| "learning_rate": 4.790924950328435e-07, | |
| "loss": 3.631, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 24.849350152016854, | |
| "learning_rate": 4.6692069725363887e-07, | |
| "loss": 3.6937, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 21.64209756625074, | |
| "learning_rate": 4.5490181697844916e-07, | |
| "loss": 3.6635, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 11.723108661682744, | |
| "learning_rate": 4.4303604699882594e-07, | |
| "loss": 3.6442, | |
| "step": 3985 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 23.715955779604574, | |
| "learning_rate": 4.313235776503244e-07, | |
| "loss": 3.7092, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 26.33500590884361, | |
| "learning_rate": 4.197645968094466e-07, | |
| "loss": 3.7199, | |
| "step": 3995 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 15.97634043977573, | |
| "learning_rate": 4.08359289890623e-07, | |
| "loss": 3.7013, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 16.249998954911213, | |
| "learning_rate": 3.971078398432482e-07, | |
| "loss": 3.692, | |
| "step": 4005 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 12.650307490766737, | |
| "learning_rate": 3.860104271487397e-07, | |
| "loss": 3.7514, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 20.944524374009152, | |
| "learning_rate": 3.750672298176405e-07, | |
| "loss": 3.6776, | |
| "step": 4015 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 31.837250069023384, | |
| "learning_rate": 3.6427842338677353e-07, | |
| "loss": 3.6802, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 35.16277225180415, | |
| "learning_rate": 3.5364418091641374e-07, | |
| "loss": 3.6035, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 35.67667244362796, | |
| "learning_rate": 3.4316467298752264e-07, | |
| "loss": 3.6372, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 17.219392618044115, | |
| "learning_rate": 3.328400676990029e-07, | |
| "loss": 3.6292, | |
| "step": 4035 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 10.04557723669283, | |
| "learning_rate": 3.226705306650113e-07, | |
| "loss": 3.72, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 21.846859098930196, | |
| "learning_rate": 3.1265622501229554e-07, | |
| "loss": 3.6557, | |
| "step": 4045 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 17.605374506200285, | |
| "learning_rate": 3.027973113775795e-07, | |
| "loss": 3.6747, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 25.49080172625827, | |
| "learning_rate": 2.9309394790498547e-07, | |
| "loss": 3.7104, | |
| "step": 4055 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 12.882615183890971, | |
| "learning_rate": 2.835462902434971e-07, | |
| "loss": 3.674, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 20.504280922780172, | |
| "learning_rate": 2.741544915444694e-07, | |
| "loss": 3.6457, | |
| "step": 4065 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 16.681593532660717, | |
| "learning_rate": 2.649187024591604e-07, | |
| "loss": 3.6835, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 12.650054676447523, | |
| "learning_rate": 2.5583907113632456e-07, | |
| "loss": 3.647, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 17.534906906242455, | |
| "learning_rate": 2.4691574321983216e-07, | |
| "loss": 3.6579, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 19.926506010778407, | |
| "learning_rate": 2.3814886184633012e-07, | |
| "loss": 3.6499, | |
| "step": 4085 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 12.234267069451622, | |
| "learning_rate": 2.2953856764295623e-07, | |
| "loss": 3.6078, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 8.223939533474807, | |
| "learning_rate": 2.210849987250685e-07, | |
| "loss": 3.6654, | |
| "step": 4095 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 18.599130278136133, | |
| "learning_rate": 2.1278829069404483e-07, | |
| "loss": 3.6817, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 16.196978860217815, | |
| "learning_rate": 2.0464857663509473e-07, | |
| "loss": 3.6475, | |
| "step": 4105 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 13.396466803933027, | |
| "learning_rate": 1.9666598711513663e-07, | |
| "loss": 3.6074, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 14.768338009628959, | |
| "learning_rate": 1.8884065018069165e-07, | |
| "loss": 3.6512, | |
| "step": 4115 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 21.524152342417754, | |
| "learning_rate": 1.811726913558387e-07, | |
| "loss": 3.7483, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 18.22167319217679, | |
| "learning_rate": 1.736622336401983e-07, | |
| "loss": 3.7415, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 19.595031034548562, | |
| "learning_rate": 1.663093975069552e-07, | |
| "loss": 3.6581, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 14.772246875655348, | |
| "learning_rate": 1.5911430090093437e-07, | |
| "loss": 3.6186, | |
| "step": 4135 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 14.004789266507018, | |
| "learning_rate": 1.5207705923670158e-07, | |
| "loss": 3.6816, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 17.056919214526435, | |
| "learning_rate": 1.451977853967146e-07, | |
| "loss": 3.6623, | |
| "step": 4145 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 11.302137776127884, | |
| "learning_rate": 1.3847658972951482e-07, | |
| "loss": 3.5906, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 12.07905744766456, | |
| "learning_rate": 1.319135800479543e-07, | |
| "loss": 3.5944, | |
| "step": 4155 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 18.674654546847137, | |
| "learning_rate": 1.2550886162746468e-07, | |
| "loss": 3.6017, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 11.839458481793278, | |
| "learning_rate": 1.192625372043754e-07, | |
| "loss": 3.6178, | |
| "step": 4165 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 19.786389992269886, | |
| "learning_rate": 1.1317470697425837e-07, | |
| "loss": 3.6542, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 11.174068584947278, | |
| "learning_rate": 1.072454685903257e-07, | |
| "loss": 3.733, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 24.21761073466553, | |
| "learning_rate": 1.0147491716185675e-07, | |
| "loss": 3.6381, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 19.459674614347303, | |
| "learning_rate": 9.586314525268369e-08, | |
| "loss": 3.6084, | |
| "step": 4185 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 15.59530798472988, | |
| "learning_rate": 9.041024287969491e-08, | |
| "loss": 3.6231, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 30.42366766942627, | |
| "learning_rate": 8.511629751139949e-08, | |
| "loss": 3.6688, | |
| "step": 4195 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 9.11994003002298, | |
| "learning_rate": 7.99813940665195e-08, | |
| "loss": 3.681, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 29.254431985701988, | |
| "learning_rate": 7.50056149126277e-08, | |
| "loss": 3.6489, | |
| "step": 4205 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 8.244989458828204, | |
| "learning_rate": 7.018903986483083e-08, | |
| "loss": 3.6852, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 23.642383946399335, | |
| "learning_rate": 6.553174618448399e-08, | |
| "loss": 3.6476, | |
| "step": 4215 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 11.497305087171618, | |
| "learning_rate": 6.103380857795604e-08, | |
| "loss": 3.6077, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 11.260541601085492, | |
| "learning_rate": 5.6695299195425045e-08, | |
| "loss": 3.6514, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 15.021990993208474, | |
| "learning_rate": 5.251628762972916e-08, | |
| "loss": 3.6486, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 11.79501214076045, | |
| "learning_rate": 4.84968409152442e-08, | |
| "loss": 3.6583, | |
| "step": 4235 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 11.469889869893892, | |
| "learning_rate": 4.4637023526807875e-08, | |
| "loss": 3.6266, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 10.951279521137277, | |
| "learning_rate": 4.0936897378691664e-08, | |
| "loss": 3.6709, | |
| "step": 4245 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 16.923113614818572, | |
| "learning_rate": 3.739652182360054e-08, | |
| "loss": 3.6802, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 12.114560682787932, | |
| "learning_rate": 3.401595365172483e-08, | |
| "loss": 3.6402, | |
| "step": 4255 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 9.182946295232345, | |
| "learning_rate": 3.079524708983095e-08, | |
| "loss": 3.6225, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 10.451056436364329, | |
| "learning_rate": 2.773445380038653e-08, | |
| "loss": 3.6414, | |
| "step": 4265 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 8.236622614247617, | |
| "learning_rate": 2.483362288073443e-08, | |
| "loss": 3.6163, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 14.14954738204664, | |
| "learning_rate": 2.2092800862305587e-08, | |
| "loss": 3.6195, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 21.05844392360743, | |
| "learning_rate": 1.9512031709874037e-08, | |
| "loss": 3.6474, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 9.31164701024037, | |
| "learning_rate": 1.7091356820848616e-08, | |
| "loss": 3.6775, | |
| "step": 4285 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 10.110842718868811, | |
| "learning_rate": 1.4830815024606815e-08, | |
| "loss": 3.618, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 21.53619047566387, | |
| "learning_rate": 1.2730442581879721e-08, | |
| "loss": 3.6245, | |
| "step": 4295 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 13.23611241300099, | |
| "learning_rate": 1.0790273184164701e-08, | |
| "loss": 3.6271, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 15.48506813893137, | |
| "learning_rate": 9.010337953185843e-09, | |
| "loss": 3.6317, | |
| "step": 4305 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 12.562935111145112, | |
| "learning_rate": 7.390665440393241e-09, | |
| "loss": 3.6198, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 12.689542859801007, | |
| "learning_rate": 5.931281626508911e-09, | |
| "loss": 3.6293, | |
| "step": 4315 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 13.307479835934826, | |
| "learning_rate": 4.632209921107133e-09, | |
| "loss": 3.6791, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 15.251068214534937, | |
| "learning_rate": 3.493471162241413e-09, | |
| "loss": 3.6444, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 12.13951542897477, | |
| "learning_rate": 2.5150836161058624e-09, | |
| "loss": 3.5564, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 9.08622318333974, | |
| "learning_rate": 1.6970629767465441e-09, | |
| "loss": 3.5891, | |
| "step": 4335 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 11.684988146759082, | |
| "learning_rate": 1.03942236580723e-09, | |
| "loss": 3.6092, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 17.508480063342134, | |
| "learning_rate": 5.421723323195682e-10, | |
| "loss": 3.591, | |
| "step": 4345 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 19.286758978873294, | |
| "learning_rate": 2.053208525365502e-10, | |
| "loss": 3.6626, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 11.364851389553667, | |
| "learning_rate": 2.8873329798173588e-11, | |
| "loss": 3.614, | |
| "step": 4355 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 3.6477067470550537, | |
| "eval_runtime": 315.4083, | |
| "eval_samples_per_second": 48.924, | |
| "eval_steps_per_second": 0.767, | |
| "step": 4358 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 4358, | |
| "total_flos": 456238269726720.0, | |
| "train_loss": 4.517249699085335, | |
| "train_runtime": 13676.9113, | |
| "train_samples_per_second": 10.194, | |
| "train_steps_per_second": 0.319 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 4358, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "total_flos": 456238269726720.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |