diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,38606 @@ +{ + "best_global_step": 50000, + "best_metric": 0.4009660835826073, + "best_model_checkpoint": "./whisper-translate-bn-or\\checkpoint-50000", + "epoch": 5.508839585315771, + "eval_steps": 10000, + "global_step": 55000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010016527269995493, + "grad_norm": 1.0699678659439087, + "learning_rate": 0.0, + "loss": 3.4658, + "step": 1 + }, + { + "epoch": 0.0010016527269995493, + "grad_norm": 0.946382999420166, + "learning_rate": 4.5e-06, + "loss": 3.1873, + "step": 10 + }, + { + "epoch": 0.0020033054539990987, + "grad_norm": 1.0159204006195068, + "learning_rate": 9.5e-06, + "loss": 3.3418, + "step": 20 + }, + { + "epoch": 0.0030049581809986478, + "grad_norm": 0.963729202747345, + "learning_rate": 1.45e-05, + "loss": 3.2141, + "step": 30 + }, + { + "epoch": 0.004006610907998197, + "grad_norm": 1.0494544506072998, + "learning_rate": 1.9500000000000003e-05, + "loss": 3.2094, + "step": 40 + }, + { + "epoch": 0.005008263634997746, + "grad_norm": 1.1693147420883179, + "learning_rate": 2.45e-05, + "loss": 3.1064, + "step": 50 + }, + { + "epoch": 0.0060099163619972956, + "grad_norm": 1.0876061916351318, + "learning_rate": 2.95e-05, + "loss": 2.8635, + "step": 60 + }, + { + "epoch": 0.007011569088996845, + "grad_norm": 1.2751100063323975, + "learning_rate": 3.45e-05, + "loss": 2.7471, + "step": 70 + }, + { + "epoch": 0.008013221815996395, + "grad_norm": 1.3546741008758545, + "learning_rate": 3.9500000000000005e-05, + "loss": 2.6155, + "step": 80 + }, + { + "epoch": 0.009014874542995942, + "grad_norm": 1.6993346214294434, + "learning_rate": 4.4500000000000004e-05, + "loss": 2.5273, + "step": 90 + }, + { + "epoch": 0.010016527269995492, + "grad_norm": 1.595080852508545, + "learning_rate": 4.9500000000000004e-05, + "loss": 2.413, + "step": 100 + }, + { + "epoch": 0.011018179996995042, + "grad_norm": 1.627835750579834, + "learning_rate": 4.9999995973874405e-05, + "loss": 2.1941, + "step": 110 + }, + { + "epoch": 0.012019832723994591, + "grad_norm": 1.5041255950927734, + "learning_rate": 4.999998205640485e-05, + "loss": 2.1507, + "step": 120 + }, + { + "epoch": 0.01302148545099414, + "grad_norm": 1.970567226409912, + "learning_rate": 4.999995819789164e-05, + "loss": 2.0218, + "step": 130 + }, + { + "epoch": 0.01402313817799369, + "grad_norm": 1.9568697214126587, + "learning_rate": 4.999992439834424e-05, + "loss": 1.6866, + "step": 140 + }, + { + "epoch": 0.015024790904993238, + "grad_norm": 2.1707658767700195, + "learning_rate": 4.99998806577761e-05, + "loss": 1.6941, + "step": 150 + }, + { + "epoch": 0.01602644363199279, + "grad_norm": 2.4342153072357178, + "learning_rate": 4.999982697620461e-05, + "loss": 1.7162, + "step": 160 + }, + { + "epoch": 0.01702809635899234, + "grad_norm": 2.1355204582214355, + "learning_rate": 4.9999763353651117e-05, + "loss": 1.6787, + "step": 170 + }, + { + "epoch": 0.018029749085991885, + "grad_norm": 2.1012508869171143, + "learning_rate": 4.999968979014093e-05, + "loss": 1.5662, + "step": 180 + }, + { + "epoch": 0.019031401812991434, + "grad_norm": 1.8109233379364014, + "learning_rate": 4.999960628570328e-05, + "loss": 1.5796, + "step": 190 + }, + { + "epoch": 0.020033054539990984, + "grad_norm": 2.47389554977417, + "learning_rate": 4.9999512840371386e-05, + "loss": 1.6256, + "step": 200 + }, + { + "epoch": 0.021034707266990534, + "grad_norm": 1.9195002317428589, + "learning_rate": 4.999940945418241e-05, + "loss": 1.5094, + "step": 210 + }, + { + "epoch": 0.022036359993990083, + "grad_norm": 2.2418041229248047, + "learning_rate": 4.9999296127177453e-05, + "loss": 1.5252, + "step": 220 + }, + { + "epoch": 0.023038012720989633, + "grad_norm": 2.5955166816711426, + "learning_rate": 4.999917285940158e-05, + "loss": 1.5686, + "step": 230 + }, + { + "epoch": 0.024039665447989182, + "grad_norm": 1.936550498008728, + "learning_rate": 4.999903965090381e-05, + "loss": 1.5285, + "step": 240 + }, + { + "epoch": 0.025041318174988732, + "grad_norm": 1.894722819328308, + "learning_rate": 4.9998896501737116e-05, + "loss": 1.5413, + "step": 250 + }, + { + "epoch": 0.02604297090198828, + "grad_norm": 2.192305326461792, + "learning_rate": 4.99987434119584e-05, + "loss": 1.5218, + "step": 260 + }, + { + "epoch": 0.02704462362898783, + "grad_norm": 2.490818500518799, + "learning_rate": 4.999858038162857e-05, + "loss": 1.4895, + "step": 270 + }, + { + "epoch": 0.02804627635598738, + "grad_norm": 2.2005741596221924, + "learning_rate": 4.999840741081243e-05, + "loss": 1.481, + "step": 280 + }, + { + "epoch": 0.02904792908298693, + "grad_norm": 2.2879180908203125, + "learning_rate": 4.999822449957876e-05, + "loss": 1.3541, + "step": 290 + }, + { + "epoch": 0.030049581809986476, + "grad_norm": 1.9570573568344116, + "learning_rate": 4.999803164800031e-05, + "loss": 1.4484, + "step": 300 + }, + { + "epoch": 0.031051234536986026, + "grad_norm": 2.203392744064331, + "learning_rate": 4.999782885615375e-05, + "loss": 1.4385, + "step": 310 + }, + { + "epoch": 0.03205288726398558, + "grad_norm": 2.16286563873291, + "learning_rate": 4.999761612411972e-05, + "loss": 1.4522, + "step": 320 + }, + { + "epoch": 0.03305453999098513, + "grad_norm": 2.594715118408203, + "learning_rate": 4.999739345198282e-05, + "loss": 1.4169, + "step": 330 + }, + { + "epoch": 0.03405619271798468, + "grad_norm": 2.793673515319824, + "learning_rate": 4.999716083983159e-05, + "loss": 1.525, + "step": 340 + }, + { + "epoch": 0.03505784544498423, + "grad_norm": 2.3903636932373047, + "learning_rate": 4.999691828775853e-05, + "loss": 1.4475, + "step": 350 + }, + { + "epoch": 0.03605949817198377, + "grad_norm": 2.7492942810058594, + "learning_rate": 4.999666579586008e-05, + "loss": 1.397, + "step": 360 + }, + { + "epoch": 0.03706115089898332, + "grad_norm": 2.458517551422119, + "learning_rate": 4.9996403364236645e-05, + "loss": 1.3987, + "step": 370 + }, + { + "epoch": 0.03806280362598287, + "grad_norm": 2.7841598987579346, + "learning_rate": 4.999613099299258e-05, + "loss": 1.3781, + "step": 380 + }, + { + "epoch": 0.03906445635298242, + "grad_norm": 2.5349690914154053, + "learning_rate": 4.9995848682236194e-05, + "loss": 1.3281, + "step": 390 + }, + { + "epoch": 0.04006610907998197, + "grad_norm": 2.858383893966675, + "learning_rate": 4.999555643207974e-05, + "loss": 1.3173, + "step": 400 + }, + { + "epoch": 0.04106776180698152, + "grad_norm": 2.612161874771118, + "learning_rate": 4.999525424263943e-05, + "loss": 1.4123, + "step": 410 + }, + { + "epoch": 0.04206941453398107, + "grad_norm": 2.9477760791778564, + "learning_rate": 4.9994942114035435e-05, + "loss": 1.3716, + "step": 420 + }, + { + "epoch": 0.04307106726098062, + "grad_norm": 2.3722777366638184, + "learning_rate": 4.999462004639186e-05, + "loss": 1.3067, + "step": 430 + }, + { + "epoch": 0.044072719987980166, + "grad_norm": 2.594200611114502, + "learning_rate": 4.999428803983678e-05, + "loss": 1.3702, + "step": 440 + }, + { + "epoch": 0.045074372714979716, + "grad_norm": 2.670445203781128, + "learning_rate": 4.99939460945022e-05, + "loss": 1.3728, + "step": 450 + }, + { + "epoch": 0.046076025441979265, + "grad_norm": 2.574885606765747, + "learning_rate": 4.9993594210524115e-05, + "loss": 1.351, + "step": 460 + }, + { + "epoch": 0.047077678168978815, + "grad_norm": 2.1980414390563965, + "learning_rate": 4.999323238804243e-05, + "loss": 1.425, + "step": 470 + }, + { + "epoch": 0.048079330895978364, + "grad_norm": 2.689714193344116, + "learning_rate": 4.9992860627201034e-05, + "loss": 1.3985, + "step": 480 + }, + { + "epoch": 0.049080983622977914, + "grad_norm": 2.989649772644043, + "learning_rate": 4.999247892814775e-05, + "loss": 1.3723, + "step": 490 + }, + { + "epoch": 0.050082636349977463, + "grad_norm": 2.6088790893554688, + "learning_rate": 4.999208729103435e-05, + "loss": 1.2535, + "step": 500 + }, + { + "epoch": 0.05108428907697701, + "grad_norm": 2.829699754714966, + "learning_rate": 4.999168571601658e-05, + "loss": 1.2934, + "step": 510 + }, + { + "epoch": 0.05208594180397656, + "grad_norm": 1.9769593477249146, + "learning_rate": 4.999127420325411e-05, + "loss": 1.1993, + "step": 520 + }, + { + "epoch": 0.05308759453097611, + "grad_norm": 2.4946117401123047, + "learning_rate": 4.9990852752910576e-05, + "loss": 1.2418, + "step": 530 + }, + { + "epoch": 0.05408924725797566, + "grad_norm": 2.7048685550689697, + "learning_rate": 4.999042136515358e-05, + "loss": 1.2653, + "step": 540 + }, + { + "epoch": 0.05509089998497521, + "grad_norm": 2.877206802368164, + "learning_rate": 4.998998004015464e-05, + "loss": 1.2766, + "step": 550 + }, + { + "epoch": 0.05609255271197476, + "grad_norm": 2.246899366378784, + "learning_rate": 4.998952877808925e-05, + "loss": 1.3626, + "step": 560 + }, + { + "epoch": 0.05709420543897431, + "grad_norm": 2.2098586559295654, + "learning_rate": 4.998906757913686e-05, + "loss": 1.2818, + "step": 570 + }, + { + "epoch": 0.05809585816597386, + "grad_norm": 3.2610485553741455, + "learning_rate": 4.998859644348085e-05, + "loss": 1.2254, + "step": 580 + }, + { + "epoch": 0.05909751089297341, + "grad_norm": 3.1614911556243896, + "learning_rate": 4.998811537130857e-05, + "loss": 1.2474, + "step": 590 + }, + { + "epoch": 0.06009916361997295, + "grad_norm": 2.5031402111053467, + "learning_rate": 4.9987624362811324e-05, + "loss": 1.1746, + "step": 600 + }, + { + "epoch": 0.0611008163469725, + "grad_norm": 2.7153284549713135, + "learning_rate": 4.9987123418184344e-05, + "loss": 1.285, + "step": 610 + }, + { + "epoch": 0.06210246907397205, + "grad_norm": 2.6688461303710938, + "learning_rate": 4.998661253762683e-05, + "loss": 1.2516, + "step": 620 + }, + { + "epoch": 0.06310412180097161, + "grad_norm": 2.5687406063079834, + "learning_rate": 4.998609172134192e-05, + "loss": 1.1905, + "step": 630 + }, + { + "epoch": 0.06410577452797116, + "grad_norm": 2.876481533050537, + "learning_rate": 4.9985560969536736e-05, + "loss": 1.192, + "step": 640 + }, + { + "epoch": 0.0651074272549707, + "grad_norm": 3.5918831825256348, + "learning_rate": 4.9985020282422304e-05, + "loss": 1.3036, + "step": 650 + }, + { + "epoch": 0.06610907998197026, + "grad_norm": 2.464890718460083, + "learning_rate": 4.998446966021364e-05, + "loss": 1.1751, + "step": 660 + }, + { + "epoch": 0.0671107327089698, + "grad_norm": 2.5518927574157715, + "learning_rate": 4.998390910312969e-05, + "loss": 1.2133, + "step": 670 + }, + { + "epoch": 0.06811238543596936, + "grad_norm": 2.709120512008667, + "learning_rate": 4.998333861139335e-05, + "loss": 1.2233, + "step": 680 + }, + { + "epoch": 0.0691140381629689, + "grad_norm": 2.6371023654937744, + "learning_rate": 4.998275818523147e-05, + "loss": 1.1879, + "step": 690 + }, + { + "epoch": 0.07011569088996845, + "grad_norm": 2.7895636558532715, + "learning_rate": 4.998216782487486e-05, + "loss": 1.1956, + "step": 700 + }, + { + "epoch": 0.071117343616968, + "grad_norm": 2.9015183448791504, + "learning_rate": 4.998156753055826e-05, + "loss": 1.1716, + "step": 710 + }, + { + "epoch": 0.07211899634396754, + "grad_norm": 3.0008327960968018, + "learning_rate": 4.9980957302520396e-05, + "loss": 1.2089, + "step": 720 + }, + { + "epoch": 0.07312064907096709, + "grad_norm": 2.6571829319000244, + "learning_rate": 4.9980337141003895e-05, + "loss": 1.1451, + "step": 730 + }, + { + "epoch": 0.07412230179796664, + "grad_norm": 2.932692766189575, + "learning_rate": 4.9979707046255376e-05, + "loss": 1.1582, + "step": 740 + }, + { + "epoch": 0.07512395452496619, + "grad_norm": 2.8995957374572754, + "learning_rate": 4.9979067018525383e-05, + "loss": 1.1619, + "step": 750 + }, + { + "epoch": 0.07612560725196574, + "grad_norm": 2.709153890609741, + "learning_rate": 4.9978417058068414e-05, + "loss": 1.202, + "step": 760 + }, + { + "epoch": 0.07712725997896529, + "grad_norm": 2.4261250495910645, + "learning_rate": 4.997775716514293e-05, + "loss": 1.2184, + "step": 770 + }, + { + "epoch": 0.07812891270596484, + "grad_norm": 2.8053817749023438, + "learning_rate": 4.997708734001133e-05, + "loss": 1.2656, + "step": 780 + }, + { + "epoch": 0.07913056543296439, + "grad_norm": 2.360339879989624, + "learning_rate": 4.997640758293996e-05, + "loss": 1.1717, + "step": 790 + }, + { + "epoch": 0.08013221815996394, + "grad_norm": 2.579951524734497, + "learning_rate": 4.9975717894199126e-05, + "loss": 1.2521, + "step": 800 + }, + { + "epoch": 0.08113387088696349, + "grad_norm": 2.9729485511779785, + "learning_rate": 4.997501827406307e-05, + "loss": 1.1597, + "step": 810 + }, + { + "epoch": 0.08213552361396304, + "grad_norm": 3.1533076763153076, + "learning_rate": 4.997430872280999e-05, + "loss": 1.1519, + "step": 820 + }, + { + "epoch": 0.08313717634096258, + "grad_norm": 4.105657577514648, + "learning_rate": 4.997358924072205e-05, + "loss": 1.1856, + "step": 830 + }, + { + "epoch": 0.08413882906796213, + "grad_norm": 2.565356731414795, + "learning_rate": 4.997285982808533e-05, + "loss": 1.232, + "step": 840 + }, + { + "epoch": 0.08514048179496168, + "grad_norm": 3.074532985687256, + "learning_rate": 4.9972120485189874e-05, + "loss": 1.1723, + "step": 850 + }, + { + "epoch": 0.08614213452196123, + "grad_norm": 2.80393123626709, + "learning_rate": 4.997137121232969e-05, + "loss": 1.2643, + "step": 860 + }, + { + "epoch": 0.08714378724896078, + "grad_norm": 2.783311128616333, + "learning_rate": 4.99706120098027e-05, + "loss": 1.1941, + "step": 870 + }, + { + "epoch": 0.08814543997596033, + "grad_norm": 2.6996560096740723, + "learning_rate": 4.9969842877910814e-05, + "loss": 1.18, + "step": 880 + }, + { + "epoch": 0.08914709270295988, + "grad_norm": 3.0572733879089355, + "learning_rate": 4.996906381695986e-05, + "loss": 1.1936, + "step": 890 + }, + { + "epoch": 0.09014874542995943, + "grad_norm": 2.5547292232513428, + "learning_rate": 4.996827482725963e-05, + "loss": 1.2188, + "step": 900 + }, + { + "epoch": 0.09115039815695898, + "grad_norm": 2.564838171005249, + "learning_rate": 4.996747590912386e-05, + "loss": 1.1149, + "step": 910 + }, + { + "epoch": 0.09215205088395853, + "grad_norm": 2.6100144386291504, + "learning_rate": 4.9966667062870235e-05, + "loss": 1.1286, + "step": 920 + }, + { + "epoch": 0.09315370361095808, + "grad_norm": 2.742168664932251, + "learning_rate": 4.996584828882038e-05, + "loss": 1.1574, + "step": 930 + }, + { + "epoch": 0.09415535633795763, + "grad_norm": 2.3155078887939453, + "learning_rate": 4.9965019587299874e-05, + "loss": 1.0794, + "step": 940 + }, + { + "epoch": 0.09515700906495718, + "grad_norm": 3.218953847885132, + "learning_rate": 4.996418095863824e-05, + "loss": 1.1358, + "step": 950 + }, + { + "epoch": 0.09615866179195673, + "grad_norm": 2.9602880477905273, + "learning_rate": 4.996333240316897e-05, + "loss": 1.1506, + "step": 960 + }, + { + "epoch": 0.09716031451895628, + "grad_norm": 2.4367001056671143, + "learning_rate": 4.996247392122948e-05, + "loss": 1.1119, + "step": 970 + }, + { + "epoch": 0.09816196724595583, + "grad_norm": 2.4974889755249023, + "learning_rate": 4.996160551316112e-05, + "loss": 1.1244, + "step": 980 + }, + { + "epoch": 0.09916361997295538, + "grad_norm": 3.0747549533843994, + "learning_rate": 4.9960727179309216e-05, + "loss": 1.1237, + "step": 990 + }, + { + "epoch": 0.10016527269995493, + "grad_norm": 3.293057441711426, + "learning_rate": 4.9959838920023036e-05, + "loss": 1.1125, + "step": 1000 + }, + { + "epoch": 0.10116692542695448, + "grad_norm": 3.2793030738830566, + "learning_rate": 4.995894073565579e-05, + "loss": 1.1349, + "step": 1010 + }, + { + "epoch": 0.10216857815395403, + "grad_norm": 2.5710489749908447, + "learning_rate": 4.995803262656463e-05, + "loss": 1.1671, + "step": 1020 + }, + { + "epoch": 0.10317023088095358, + "grad_norm": 2.81728458404541, + "learning_rate": 4.995711459311065e-05, + "loss": 1.098, + "step": 1030 + }, + { + "epoch": 0.10417188360795313, + "grad_norm": 2.4683609008789062, + "learning_rate": 4.995618663565892e-05, + "loss": 1.0697, + "step": 1040 + }, + { + "epoch": 0.10517353633495267, + "grad_norm": 2.7064619064331055, + "learning_rate": 4.995524875457841e-05, + "loss": 1.1379, + "step": 1050 + }, + { + "epoch": 0.10617518906195222, + "grad_norm": 2.810743570327759, + "learning_rate": 4.995430095024207e-05, + "loss": 1.1032, + "step": 1060 + }, + { + "epoch": 0.10717684178895177, + "grad_norm": 2.8417282104492188, + "learning_rate": 4.9953343223026796e-05, + "loss": 1.1375, + "step": 1070 + }, + { + "epoch": 0.10817849451595132, + "grad_norm": 2.6498005390167236, + "learning_rate": 4.9952375573313415e-05, + "loss": 1.0805, + "step": 1080 + }, + { + "epoch": 0.10918014724295087, + "grad_norm": 2.90468692779541, + "learning_rate": 4.9951398001486704e-05, + "loss": 1.1367, + "step": 1090 + }, + { + "epoch": 0.11018179996995042, + "grad_norm": 2.9180171489715576, + "learning_rate": 4.995041050793538e-05, + "loss": 1.0368, + "step": 1100 + }, + { + "epoch": 0.11118345269694997, + "grad_norm": 2.695122480392456, + "learning_rate": 4.994941309305212e-05, + "loss": 1.1418, + "step": 1110 + }, + { + "epoch": 0.11218510542394952, + "grad_norm": 2.9390275478363037, + "learning_rate": 4.994840575723354e-05, + "loss": 1.1555, + "step": 1120 + }, + { + "epoch": 0.11318675815094907, + "grad_norm": 3.146312713623047, + "learning_rate": 4.99473885008802e-05, + "loss": 1.105, + "step": 1130 + }, + { + "epoch": 0.11418841087794862, + "grad_norm": 2.8160715103149414, + "learning_rate": 4.99463613243966e-05, + "loss": 1.0991, + "step": 1140 + }, + { + "epoch": 0.11519006360494817, + "grad_norm": 2.578988790512085, + "learning_rate": 4.994532422819117e-05, + "loss": 1.0957, + "step": 1150 + }, + { + "epoch": 0.11619171633194772, + "grad_norm": 2.783162832260132, + "learning_rate": 4.994427721267634e-05, + "loss": 1.0774, + "step": 1160 + }, + { + "epoch": 0.11719336905894727, + "grad_norm": 2.6223232746124268, + "learning_rate": 4.9943220278268424e-05, + "loss": 1.1457, + "step": 1170 + }, + { + "epoch": 0.11819502178594682, + "grad_norm": 3.61311411857605, + "learning_rate": 4.994215342538771e-05, + "loss": 1.1417, + "step": 1180 + }, + { + "epoch": 0.11919667451294637, + "grad_norm": 2.6800315380096436, + "learning_rate": 4.994107665445841e-05, + "loss": 1.1383, + "step": 1190 + }, + { + "epoch": 0.1201983272399459, + "grad_norm": 2.0521180629730225, + "learning_rate": 4.993998996590872e-05, + "loss": 0.992, + "step": 1200 + }, + { + "epoch": 0.12119997996694545, + "grad_norm": 3.1667087078094482, + "learning_rate": 4.9938893360170726e-05, + "loss": 1.0216, + "step": 1210 + }, + { + "epoch": 0.122201632693945, + "grad_norm": 3.2581300735473633, + "learning_rate": 4.99377868376805e-05, + "loss": 1.0763, + "step": 1220 + }, + { + "epoch": 0.12320328542094455, + "grad_norm": 2.8601009845733643, + "learning_rate": 4.993667039887804e-05, + "loss": 1.1149, + "step": 1230 + }, + { + "epoch": 0.1242049381479441, + "grad_norm": 2.900925636291504, + "learning_rate": 4.99355440442073e-05, + "loss": 1.0522, + "step": 1240 + }, + { + "epoch": 0.12520659087494365, + "grad_norm": 2.9990346431732178, + "learning_rate": 4.993440777411613e-05, + "loss": 1.0632, + "step": 1250 + }, + { + "epoch": 0.12620824360194322, + "grad_norm": 2.717073917388916, + "learning_rate": 4.99332615890564e-05, + "loss": 1.0612, + "step": 1260 + }, + { + "epoch": 0.12720989632894275, + "grad_norm": 3.6020913124084473, + "learning_rate": 4.993210548948385e-05, + "loss": 0.9953, + "step": 1270 + }, + { + "epoch": 0.12821154905594231, + "grad_norm": 2.9735605716705322, + "learning_rate": 4.993093947585822e-05, + "loss": 1.1802, + "step": 1280 + }, + { + "epoch": 0.12921320178294185, + "grad_norm": 2.907518148422241, + "learning_rate": 4.9929763548643145e-05, + "loss": 1.1365, + "step": 1290 + }, + { + "epoch": 0.1302148545099414, + "grad_norm": 2.708824396133423, + "learning_rate": 4.9928577708306235e-05, + "loss": 1.0376, + "step": 1300 + }, + { + "epoch": 0.13121650723694095, + "grad_norm": 3.0437746047973633, + "learning_rate": 4.992738195531902e-05, + "loss": 1.1522, + "step": 1310 + }, + { + "epoch": 0.1322181599639405, + "grad_norm": 3.0373198986053467, + "learning_rate": 4.9926176290157e-05, + "loss": 1.0515, + "step": 1320 + }, + { + "epoch": 0.13321981269094005, + "grad_norm": 2.457960367202759, + "learning_rate": 4.9924960713299575e-05, + "loss": 1.0844, + "step": 1330 + }, + { + "epoch": 0.1342214654179396, + "grad_norm": 3.0508358478546143, + "learning_rate": 4.9923735225230127e-05, + "loss": 1.0746, + "step": 1340 + }, + { + "epoch": 0.13522311814493915, + "grad_norm": 2.9356350898742676, + "learning_rate": 4.992249982643595e-05, + "loss": 1.1505, + "step": 1350 + }, + { + "epoch": 0.1362247708719387, + "grad_norm": 3.3241400718688965, + "learning_rate": 4.99212545174083e-05, + "loss": 1.1369, + "step": 1360 + }, + { + "epoch": 0.13722642359893825, + "grad_norm": 3.232041120529175, + "learning_rate": 4.9919999298642364e-05, + "loss": 0.9972, + "step": 1370 + }, + { + "epoch": 0.1382280763259378, + "grad_norm": 2.7437665462493896, + "learning_rate": 4.991873417063726e-05, + "loss": 1.0838, + "step": 1380 + }, + { + "epoch": 0.13922972905293735, + "grad_norm": 2.6859841346740723, + "learning_rate": 4.991745913389606e-05, + "loss": 1.0474, + "step": 1390 + }, + { + "epoch": 0.1402313817799369, + "grad_norm": 2.495213508605957, + "learning_rate": 4.9916174188925776e-05, + "loss": 1.0574, + "step": 1400 + }, + { + "epoch": 0.14123303450693644, + "grad_norm": 2.9001944065093994, + "learning_rate": 4.991487933623736e-05, + "loss": 0.9758, + "step": 1410 + }, + { + "epoch": 0.142234687233936, + "grad_norm": 2.9604761600494385, + "learning_rate": 4.991357457634569e-05, + "loss": 1.0665, + "step": 1420 + }, + { + "epoch": 0.14323633996093554, + "grad_norm": 2.8431453704833984, + "learning_rate": 4.9912259909769595e-05, + "loss": 1.1466, + "step": 1430 + }, + { + "epoch": 0.14423799268793508, + "grad_norm": 3.013521909713745, + "learning_rate": 4.991093533703184e-05, + "loss": 1.0455, + "step": 1440 + }, + { + "epoch": 0.14523964541493464, + "grad_norm": 3.180718421936035, + "learning_rate": 4.9909600858659146e-05, + "loss": 1.1061, + "step": 1450 + }, + { + "epoch": 0.14624129814193418, + "grad_norm": 2.9873087406158447, + "learning_rate": 4.9908256475182133e-05, + "loss": 0.9961, + "step": 1460 + }, + { + "epoch": 0.14724295086893374, + "grad_norm": 2.9176833629608154, + "learning_rate": 4.990690218713541e-05, + "loss": 1.0316, + "step": 1470 + }, + { + "epoch": 0.14824460359593328, + "grad_norm": 2.420295476913452, + "learning_rate": 4.990553799505748e-05, + "loss": 1.03, + "step": 1480 + }, + { + "epoch": 0.14924625632293284, + "grad_norm": 2.4549472332000732, + "learning_rate": 4.990416389949081e-05, + "loss": 1.0582, + "step": 1490 + }, + { + "epoch": 0.15024790904993238, + "grad_norm": 3.2612345218658447, + "learning_rate": 4.99027799009818e-05, + "loss": 1.0485, + "step": 1500 + }, + { + "epoch": 0.15124956177693194, + "grad_norm": 2.7688887119293213, + "learning_rate": 4.9901386000080785e-05, + "loss": 1.0013, + "step": 1510 + }, + { + "epoch": 0.15225121450393148, + "grad_norm": 2.7964026927948, + "learning_rate": 4.989998219734203e-05, + "loss": 0.9652, + "step": 1520 + }, + { + "epoch": 0.15325286723093104, + "grad_norm": 2.564159393310547, + "learning_rate": 4.989856849332376e-05, + "loss": 0.9539, + "step": 1530 + }, + { + "epoch": 0.15425451995793057, + "grad_norm": 2.9798643589019775, + "learning_rate": 4.989714488858812e-05, + "loss": 1.0204, + "step": 1540 + }, + { + "epoch": 0.15525617268493014, + "grad_norm": 3.4313089847564697, + "learning_rate": 4.989571138370118e-05, + "loss": 0.9848, + "step": 1550 + }, + { + "epoch": 0.15625782541192967, + "grad_norm": 3.4639475345611572, + "learning_rate": 4.989426797923299e-05, + "loss": 1.0276, + "step": 1560 + }, + { + "epoch": 0.15725947813892924, + "grad_norm": 2.630809783935547, + "learning_rate": 4.989281467575748e-05, + "loss": 1.0028, + "step": 1570 + }, + { + "epoch": 0.15826113086592877, + "grad_norm": 3.15830397605896, + "learning_rate": 4.989135147385255e-05, + "loss": 0.9953, + "step": 1580 + }, + { + "epoch": 0.15926278359292834, + "grad_norm": 2.3013908863067627, + "learning_rate": 4.9889878374100054e-05, + "loss": 0.9445, + "step": 1590 + }, + { + "epoch": 0.16026443631992787, + "grad_norm": 2.9990906715393066, + "learning_rate": 4.9888395377085734e-05, + "loss": 1.0629, + "step": 1600 + }, + { + "epoch": 0.16126608904692744, + "grad_norm": 2.9761180877685547, + "learning_rate": 4.98869024833993e-05, + "loss": 1.0075, + "step": 1610 + }, + { + "epoch": 0.16226774177392697, + "grad_norm": 2.7043395042419434, + "learning_rate": 4.9885399693634385e-05, + "loss": 1.0862, + "step": 1620 + }, + { + "epoch": 0.16326939450092653, + "grad_norm": 2.7392871379852295, + "learning_rate": 4.988388700838856e-05, + "loss": 1.0467, + "step": 1630 + }, + { + "epoch": 0.16427104722792607, + "grad_norm": 2.6225688457489014, + "learning_rate": 4.988236442826335e-05, + "loss": 1.0083, + "step": 1640 + }, + { + "epoch": 0.16527269995492563, + "grad_norm": 2.732118606567383, + "learning_rate": 4.988083195386418e-05, + "loss": 1.0408, + "step": 1650 + }, + { + "epoch": 0.16627435268192517, + "grad_norm": 3.136059284210205, + "learning_rate": 4.987928958580043e-05, + "loss": 1.0722, + "step": 1660 + }, + { + "epoch": 0.16727600540892473, + "grad_norm": 3.0781443119049072, + "learning_rate": 4.987773732468541e-05, + "loss": 1.0305, + "step": 1670 + }, + { + "epoch": 0.16827765813592427, + "grad_norm": 2.469252824783325, + "learning_rate": 4.9876175171136366e-05, + "loss": 1.0357, + "step": 1680 + }, + { + "epoch": 0.16927931086292383, + "grad_norm": 2.5065886974334717, + "learning_rate": 4.9874603125774465e-05, + "loss": 0.9225, + "step": 1690 + }, + { + "epoch": 0.17028096358992337, + "grad_norm": 2.7271981239318848, + "learning_rate": 4.987302118922484e-05, + "loss": 0.9579, + "step": 1700 + }, + { + "epoch": 0.17128261631692293, + "grad_norm": 3.1236214637756348, + "learning_rate": 4.9871429362116517e-05, + "loss": 0.9902, + "step": 1710 + }, + { + "epoch": 0.17228426904392247, + "grad_norm": 2.5821967124938965, + "learning_rate": 4.986982764508248e-05, + "loss": 0.9573, + "step": 1720 + }, + { + "epoch": 0.17328592177092203, + "grad_norm": 2.606112480163574, + "learning_rate": 4.986821603875964e-05, + "loss": 1.0657, + "step": 1730 + }, + { + "epoch": 0.17428757449792157, + "grad_norm": 2.6190571784973145, + "learning_rate": 4.986659454378885e-05, + "loss": 1.0047, + "step": 1740 + }, + { + "epoch": 0.17528922722492113, + "grad_norm": 2.8123345375061035, + "learning_rate": 4.986496316081486e-05, + "loss": 0.994, + "step": 1750 + }, + { + "epoch": 0.17629087995192066, + "grad_norm": 2.665658712387085, + "learning_rate": 4.9863321890486386e-05, + "loss": 0.956, + "step": 1760 + }, + { + "epoch": 0.17729253267892023, + "grad_norm": 2.5470478534698486, + "learning_rate": 4.986167073345608e-05, + "loss": 1.0184, + "step": 1770 + }, + { + "epoch": 0.17829418540591976, + "grad_norm": 3.3314783573150635, + "learning_rate": 4.986000969038049e-05, + "loss": 0.9555, + "step": 1780 + }, + { + "epoch": 0.17929583813291933, + "grad_norm": 3.4173240661621094, + "learning_rate": 4.985833876192013e-05, + "loss": 0.9164, + "step": 1790 + }, + { + "epoch": 0.18029749085991886, + "grad_norm": 3.0542538166046143, + "learning_rate": 4.985665794873944e-05, + "loss": 0.992, + "step": 1800 + }, + { + "epoch": 0.18129914358691843, + "grad_norm": 3.4689574241638184, + "learning_rate": 4.9854967251506755e-05, + "loss": 1.075, + "step": 1810 + }, + { + "epoch": 0.18230079631391796, + "grad_norm": 2.872692346572876, + "learning_rate": 4.98532666708944e-05, + "loss": 1.0528, + "step": 1820 + }, + { + "epoch": 0.18330244904091753, + "grad_norm": 2.7932639122009277, + "learning_rate": 4.9851556207578565e-05, + "loss": 0.9843, + "step": 1830 + }, + { + "epoch": 0.18430410176791706, + "grad_norm": 2.615530014038086, + "learning_rate": 4.9849835862239426e-05, + "loss": 0.8914, + "step": 1840 + }, + { + "epoch": 0.18530575449491662, + "grad_norm": 2.508965253829956, + "learning_rate": 4.984810563556106e-05, + "loss": 1.0039, + "step": 1850 + }, + { + "epoch": 0.18630740722191616, + "grad_norm": 2.9795517921447754, + "learning_rate": 4.9846365528231466e-05, + "loss": 0.9747, + "step": 1860 + }, + { + "epoch": 0.18730905994891572, + "grad_norm": 3.470461845397949, + "learning_rate": 4.9844615540942584e-05, + "loss": 0.9412, + "step": 1870 + }, + { + "epoch": 0.18831071267591526, + "grad_norm": 3.2644646167755127, + "learning_rate": 4.9842855674390306e-05, + "loss": 0.9621, + "step": 1880 + }, + { + "epoch": 0.18931236540291482, + "grad_norm": 2.799272298812866, + "learning_rate": 4.9841085929274404e-05, + "loss": 1.0094, + "step": 1890 + }, + { + "epoch": 0.19031401812991436, + "grad_norm": 2.402174949645996, + "learning_rate": 4.983930630629862e-05, + "loss": 0.9504, + "step": 1900 + }, + { + "epoch": 0.1913156708569139, + "grad_norm": 3.4706664085388184, + "learning_rate": 4.983751680617059e-05, + "loss": 0.9323, + "step": 1910 + }, + { + "epoch": 0.19231732358391346, + "grad_norm": 3.1237800121307373, + "learning_rate": 4.9835717429601905e-05, + "loss": 0.9726, + "step": 1920 + }, + { + "epoch": 0.193318976310913, + "grad_norm": 2.986846446990967, + "learning_rate": 4.983390817730808e-05, + "loss": 0.9088, + "step": 1930 + }, + { + "epoch": 0.19432062903791256, + "grad_norm": 2.4418344497680664, + "learning_rate": 4.983208905000854e-05, + "loss": 1.0038, + "step": 1940 + }, + { + "epoch": 0.1953222817649121, + "grad_norm": 3.0399439334869385, + "learning_rate": 4.9830260048426636e-05, + "loss": 1.0384, + "step": 1950 + }, + { + "epoch": 0.19632393449191166, + "grad_norm": 2.5777769088745117, + "learning_rate": 4.9828421173289676e-05, + "loss": 0.9796, + "step": 1960 + }, + { + "epoch": 0.1973255872189112, + "grad_norm": 3.4346256256103516, + "learning_rate": 4.982657242532886e-05, + "loss": 0.99, + "step": 1970 + }, + { + "epoch": 0.19832723994591075, + "grad_norm": 2.746309518814087, + "learning_rate": 4.982471380527934e-05, + "loss": 0.9916, + "step": 1980 + }, + { + "epoch": 0.1993288926729103, + "grad_norm": 2.7143187522888184, + "learning_rate": 4.982284531388017e-05, + "loss": 0.9572, + "step": 1990 + }, + { + "epoch": 0.20033054539990985, + "grad_norm": 2.336798667907715, + "learning_rate": 4.9820966951874345e-05, + "loss": 0.9508, + "step": 2000 + }, + { + "epoch": 0.2013321981269094, + "grad_norm": 2.9657061100006104, + "learning_rate": 4.981907872000878e-05, + "loss": 1.0059, + "step": 2010 + }, + { + "epoch": 0.20233385085390895, + "grad_norm": 2.5726466178894043, + "learning_rate": 4.9817180619034324e-05, + "loss": 0.9964, + "step": 2020 + }, + { + "epoch": 0.2033355035809085, + "grad_norm": 2.542393207550049, + "learning_rate": 4.9815272649705733e-05, + "loss": 0.9314, + "step": 2030 + }, + { + "epoch": 0.20433715630790805, + "grad_norm": 2.635509729385376, + "learning_rate": 4.98133548127817e-05, + "loss": 0.9909, + "step": 2040 + }, + { + "epoch": 0.2053388090349076, + "grad_norm": 2.7643823623657227, + "learning_rate": 4.981142710902482e-05, + "loss": 0.9538, + "step": 2050 + }, + { + "epoch": 0.20634046176190715, + "grad_norm": 2.9941859245300293, + "learning_rate": 4.980948953920166e-05, + "loss": 1.0004, + "step": 2060 + }, + { + "epoch": 0.2073421144889067, + "grad_norm": 2.7872562408447266, + "learning_rate": 4.980754210408266e-05, + "loss": 0.9699, + "step": 2070 + }, + { + "epoch": 0.20834376721590625, + "grad_norm": 4.475282192230225, + "learning_rate": 4.98055848044422e-05, + "loss": 1.0183, + "step": 2080 + }, + { + "epoch": 0.20934541994290579, + "grad_norm": 2.9876139163970947, + "learning_rate": 4.980361764105859e-05, + "loss": 0.9116, + "step": 2090 + }, + { + "epoch": 0.21034707266990535, + "grad_norm": 3.074251890182495, + "learning_rate": 4.980164061471405e-05, + "loss": 0.9749, + "step": 2100 + }, + { + "epoch": 0.21134872539690489, + "grad_norm": 3.2771337032318115, + "learning_rate": 4.979965372619475e-05, + "loss": 0.9717, + "step": 2110 + }, + { + "epoch": 0.21235037812390445, + "grad_norm": 3.6136999130249023, + "learning_rate": 4.9797656976290736e-05, + "loss": 0.95, + "step": 2120 + }, + { + "epoch": 0.21335203085090398, + "grad_norm": 3.1407272815704346, + "learning_rate": 4.979565036579601e-05, + "loss": 0.9831, + "step": 2130 + }, + { + "epoch": 0.21435368357790355, + "grad_norm": 2.501585006713867, + "learning_rate": 4.979363389550849e-05, + "loss": 0.9725, + "step": 2140 + }, + { + "epoch": 0.21535533630490308, + "grad_norm": 2.292156219482422, + "learning_rate": 4.979160756623e-05, + "loss": 0.8937, + "step": 2150 + }, + { + "epoch": 0.21635698903190265, + "grad_norm": 2.627570629119873, + "learning_rate": 4.978957137876629e-05, + "loss": 0.9211, + "step": 2160 + }, + { + "epoch": 0.21735864175890218, + "grad_norm": 3.1033174991607666, + "learning_rate": 4.978752533392705e-05, + "loss": 0.9802, + "step": 2170 + }, + { + "epoch": 0.21836029448590175, + "grad_norm": 2.952439069747925, + "learning_rate": 4.978546943252586e-05, + "loss": 0.8931, + "step": 2180 + }, + { + "epoch": 0.21936194721290128, + "grad_norm": 2.534576416015625, + "learning_rate": 4.978340367538023e-05, + "loss": 1.0088, + "step": 2190 + }, + { + "epoch": 0.22036359993990084, + "grad_norm": 2.862668752670288, + "learning_rate": 4.9781328063311614e-05, + "loss": 0.9563, + "step": 2200 + }, + { + "epoch": 0.22136525266690038, + "grad_norm": 2.849451780319214, + "learning_rate": 4.977924259714534e-05, + "loss": 0.986, + "step": 2210 + }, + { + "epoch": 0.22236690539389994, + "grad_norm": 2.9716901779174805, + "learning_rate": 4.977714727771069e-05, + "loss": 0.8527, + "step": 2220 + }, + { + "epoch": 0.22336855812089948, + "grad_norm": 2.634556770324707, + "learning_rate": 4.977504210584084e-05, + "loss": 0.9426, + "step": 2230 + }, + { + "epoch": 0.22437021084789904, + "grad_norm": 2.3726024627685547, + "learning_rate": 4.97729270823729e-05, + "loss": 0.9265, + "step": 2240 + }, + { + "epoch": 0.22537186357489858, + "grad_norm": 3.005441665649414, + "learning_rate": 4.977080220814789e-05, + "loss": 0.919, + "step": 2250 + }, + { + "epoch": 0.22637351630189814, + "grad_norm": 3.0273005962371826, + "learning_rate": 4.976866748401075e-05, + "loss": 0.9681, + "step": 2260 + }, + { + "epoch": 0.22737516902889768, + "grad_norm": 2.832097291946411, + "learning_rate": 4.976652291081035e-05, + "loss": 0.9547, + "step": 2270 + }, + { + "epoch": 0.22837682175589724, + "grad_norm": 2.40688157081604, + "learning_rate": 4.976436848939944e-05, + "loss": 0.949, + "step": 2280 + }, + { + "epoch": 0.22937847448289678, + "grad_norm": 2.177398681640625, + "learning_rate": 4.976220422063473e-05, + "loss": 0.9046, + "step": 2290 + }, + { + "epoch": 0.23038012720989634, + "grad_norm": 2.506649971008301, + "learning_rate": 4.97600301053768e-05, + "loss": 0.9084, + "step": 2300 + }, + { + "epoch": 0.23138177993689588, + "grad_norm": 2.7216312885284424, + "learning_rate": 4.975784614449019e-05, + "loss": 0.8898, + "step": 2310 + }, + { + "epoch": 0.23238343266389544, + "grad_norm": 2.622244358062744, + "learning_rate": 4.975565233884333e-05, + "loss": 0.9973, + "step": 2320 + }, + { + "epoch": 0.23338508539089498, + "grad_norm": 2.5111606121063232, + "learning_rate": 4.9753448689308557e-05, + "loss": 0.9258, + "step": 2330 + }, + { + "epoch": 0.23438673811789454, + "grad_norm": 3.298383951187134, + "learning_rate": 4.975123519676215e-05, + "loss": 0.9868, + "step": 2340 + }, + { + "epoch": 0.23538839084489407, + "grad_norm": 3.6510188579559326, + "learning_rate": 4.974901186208428e-05, + "loss": 0.9603, + "step": 2350 + }, + { + "epoch": 0.23639004357189364, + "grad_norm": 2.3483967781066895, + "learning_rate": 4.9746778686159045e-05, + "loss": 0.8984, + "step": 2360 + }, + { + "epoch": 0.23739169629889317, + "grad_norm": 2.734997510910034, + "learning_rate": 4.9744535669874435e-05, + "loss": 0.9412, + "step": 2370 + }, + { + "epoch": 0.23839334902589274, + "grad_norm": 2.5782012939453125, + "learning_rate": 4.974228281412239e-05, + "loss": 1.002, + "step": 2380 + }, + { + "epoch": 0.23939500175289227, + "grad_norm": 2.8419687747955322, + "learning_rate": 4.974002011979871e-05, + "loss": 0.9378, + "step": 2390 + }, + { + "epoch": 0.2403966544798918, + "grad_norm": 2.9137983322143555, + "learning_rate": 4.973774758780316e-05, + "loss": 0.9227, + "step": 2400 + }, + { + "epoch": 0.24139830720689137, + "grad_norm": 2.580446481704712, + "learning_rate": 4.9735465219039396e-05, + "loss": 0.9542, + "step": 2410 + }, + { + "epoch": 0.2423999599338909, + "grad_norm": 3.0914855003356934, + "learning_rate": 4.973317301441497e-05, + "loss": 0.9186, + "step": 2420 + }, + { + "epoch": 0.24340161266089047, + "grad_norm": 2.856234073638916, + "learning_rate": 4.973087097484136e-05, + "loss": 0.9466, + "step": 2430 + }, + { + "epoch": 0.24440326538789, + "grad_norm": 3.1362180709838867, + "learning_rate": 4.972855910123397e-05, + "loss": 0.8701, + "step": 2440 + }, + { + "epoch": 0.24540491811488957, + "grad_norm": 2.7053310871124268, + "learning_rate": 4.972623739451208e-05, + "loss": 0.9137, + "step": 2450 + }, + { + "epoch": 0.2464065708418891, + "grad_norm": 2.5193698406219482, + "learning_rate": 4.9723905855598904e-05, + "loss": 0.8837, + "step": 2460 + }, + { + "epoch": 0.24740822356888867, + "grad_norm": 2.841562509536743, + "learning_rate": 4.972156448542156e-05, + "loss": 0.9857, + "step": 2470 + }, + { + "epoch": 0.2484098762958882, + "grad_norm": 2.9049248695373535, + "learning_rate": 4.9719213284911084e-05, + "loss": 0.947, + "step": 2480 + }, + { + "epoch": 0.24941152902288777, + "grad_norm": 2.958031415939331, + "learning_rate": 4.971685225500239e-05, + "loss": 0.9208, + "step": 2490 + }, + { + "epoch": 0.2504131817498873, + "grad_norm": 2.562629222869873, + "learning_rate": 4.9714481396634346e-05, + "loss": 0.8968, + "step": 2500 + }, + { + "epoch": 0.25141483447688684, + "grad_norm": 2.949819803237915, + "learning_rate": 4.97121007107497e-05, + "loss": 0.856, + "step": 2510 + }, + { + "epoch": 0.25241648720388643, + "grad_norm": 3.606531858444214, + "learning_rate": 4.97097101982951e-05, + "loss": 0.9165, + "step": 2520 + }, + { + "epoch": 0.25341813993088597, + "grad_norm": 3.1925840377807617, + "learning_rate": 4.970730986022113e-05, + "loss": 0.9481, + "step": 2530 + }, + { + "epoch": 0.2544197926578855, + "grad_norm": 3.001221179962158, + "learning_rate": 4.9704899697482256e-05, + "loss": 0.9386, + "step": 2540 + }, + { + "epoch": 0.25542144538488504, + "grad_norm": 3.6826331615448, + "learning_rate": 4.9702479711036864e-05, + "loss": 0.8899, + "step": 2550 + }, + { + "epoch": 0.25642309811188463, + "grad_norm": 3.200007200241089, + "learning_rate": 4.970004990184724e-05, + "loss": 0.8934, + "step": 2560 + }, + { + "epoch": 0.25742475083888416, + "grad_norm": 2.6780619621276855, + "learning_rate": 4.969761027087957e-05, + "loss": 0.8705, + "step": 2570 + }, + { + "epoch": 0.2584264035658837, + "grad_norm": 2.7149860858917236, + "learning_rate": 4.969516081910397e-05, + "loss": 0.9574, + "step": 2580 + }, + { + "epoch": 0.25942805629288324, + "grad_norm": 2.5190060138702393, + "learning_rate": 4.969270154749444e-05, + "loss": 0.9411, + "step": 2590 + }, + { + "epoch": 0.2604297090198828, + "grad_norm": 2.75197696685791, + "learning_rate": 4.9690232457028887e-05, + "loss": 0.9248, + "step": 2600 + }, + { + "epoch": 0.26143136174688236, + "grad_norm": 2.462170362472534, + "learning_rate": 4.968775354868912e-05, + "loss": 0.9209, + "step": 2610 + }, + { + "epoch": 0.2624330144738819, + "grad_norm": 2.7664783000946045, + "learning_rate": 4.9685264823460866e-05, + "loss": 0.8915, + "step": 2620 + }, + { + "epoch": 0.26343466720088143, + "grad_norm": 2.3150320053100586, + "learning_rate": 4.968276628233374e-05, + "loss": 0.9826, + "step": 2630 + }, + { + "epoch": 0.264436319927881, + "grad_norm": 2.6080338954925537, + "learning_rate": 4.9680257926301274e-05, + "loss": 0.8408, + "step": 2640 + }, + { + "epoch": 0.26543797265488056, + "grad_norm": 3.5915043354034424, + "learning_rate": 4.967773975636088e-05, + "loss": 0.844, + "step": 2650 + }, + { + "epoch": 0.2664396253818801, + "grad_norm": 2.8301284313201904, + "learning_rate": 4.967521177351392e-05, + "loss": 0.9263, + "step": 2660 + }, + { + "epoch": 0.26744127810887963, + "grad_norm": 2.7187178134918213, + "learning_rate": 4.9672673978765594e-05, + "loss": 0.8942, + "step": 2670 + }, + { + "epoch": 0.2684429308358792, + "grad_norm": 2.7909884452819824, + "learning_rate": 4.9670126373125056e-05, + "loss": 0.9328, + "step": 2680 + }, + { + "epoch": 0.26944458356287876, + "grad_norm": 2.38917875289917, + "learning_rate": 4.9667568957605324e-05, + "loss": 0.9005, + "step": 2690 + }, + { + "epoch": 0.2704462362898783, + "grad_norm": 2.494300127029419, + "learning_rate": 4.966500173322335e-05, + "loss": 0.8757, + "step": 2700 + }, + { + "epoch": 0.27144788901687783, + "grad_norm": 2.545513868331909, + "learning_rate": 4.966242470099997e-05, + "loss": 0.9322, + "step": 2710 + }, + { + "epoch": 0.2724495417438774, + "grad_norm": 2.4247043132781982, + "learning_rate": 4.96598378619599e-05, + "loss": 0.9301, + "step": 2720 + }, + { + "epoch": 0.27345119447087696, + "grad_norm": 2.355764865875244, + "learning_rate": 4.96572412171318e-05, + "loss": 0.8569, + "step": 2730 + }, + { + "epoch": 0.2744528471978765, + "grad_norm": 3.5294244289398193, + "learning_rate": 4.965463476754819e-05, + "loss": 0.9178, + "step": 2740 + }, + { + "epoch": 0.27545449992487603, + "grad_norm": 3.509269952774048, + "learning_rate": 4.9652018514245514e-05, + "loss": 0.861, + "step": 2750 + }, + { + "epoch": 0.2764561526518756, + "grad_norm": 2.9568450450897217, + "learning_rate": 4.96493924582641e-05, + "loss": 0.8833, + "step": 2760 + }, + { + "epoch": 0.27745780537887516, + "grad_norm": 2.9352688789367676, + "learning_rate": 4.964675660064817e-05, + "loss": 0.8888, + "step": 2770 + }, + { + "epoch": 0.2784594581058747, + "grad_norm": 2.9790830612182617, + "learning_rate": 4.9644110942445864e-05, + "loss": 0.9142, + "step": 2780 + }, + { + "epoch": 0.2794611108328742, + "grad_norm": 2.514970064163208, + "learning_rate": 4.9641455484709196e-05, + "loss": 0.9056, + "step": 2790 + }, + { + "epoch": 0.2804627635598738, + "grad_norm": 3.5085692405700684, + "learning_rate": 4.9638790228494106e-05, + "loss": 0.857, + "step": 2800 + }, + { + "epoch": 0.28146441628687335, + "grad_norm": 4.000670433044434, + "learning_rate": 4.9636115174860396e-05, + "loss": 0.8086, + "step": 2810 + }, + { + "epoch": 0.2824660690138729, + "grad_norm": 2.9119763374328613, + "learning_rate": 4.9633430324871776e-05, + "loss": 0.8537, + "step": 2820 + }, + { + "epoch": 0.2834677217408724, + "grad_norm": 1.9512786865234375, + "learning_rate": 4.963073567959587e-05, + "loss": 0.7311, + "step": 2830 + }, + { + "epoch": 0.284469374467872, + "grad_norm": 2.5956673622131348, + "learning_rate": 4.962803124010417e-05, + "loss": 0.8748, + "step": 2840 + }, + { + "epoch": 0.28547102719487155, + "grad_norm": 2.648887872695923, + "learning_rate": 4.9625317007472085e-05, + "loss": 0.911, + "step": 2850 + }, + { + "epoch": 0.2864726799218711, + "grad_norm": 2.7080471515655518, + "learning_rate": 4.96225929827789e-05, + "loss": 0.8843, + "step": 2860 + }, + { + "epoch": 0.2874743326488706, + "grad_norm": 2.9496068954467773, + "learning_rate": 4.961985916710781e-05, + "loss": 0.8772, + "step": 2870 + }, + { + "epoch": 0.28847598537587016, + "grad_norm": 3.3540945053100586, + "learning_rate": 4.961711556154588e-05, + "loss": 0.8951, + "step": 2880 + }, + { + "epoch": 0.28947763810286975, + "grad_norm": 2.877737045288086, + "learning_rate": 4.961436216718409e-05, + "loss": 0.9708, + "step": 2890 + }, + { + "epoch": 0.2904792908298693, + "grad_norm": 2.5713319778442383, + "learning_rate": 4.961159898511732e-05, + "loss": 0.8817, + "step": 2900 + }, + { + "epoch": 0.2914809435568688, + "grad_norm": 3.155395746231079, + "learning_rate": 4.960882601644431e-05, + "loss": 0.9464, + "step": 2910 + }, + { + "epoch": 0.29248259628386836, + "grad_norm": 2.9532198905944824, + "learning_rate": 4.960604326226771e-05, + "loss": 0.8817, + "step": 2920 + }, + { + "epoch": 0.29348424901086795, + "grad_norm": 3.1550869941711426, + "learning_rate": 4.960325072369407e-05, + "loss": 0.8659, + "step": 2930 + }, + { + "epoch": 0.2944859017378675, + "grad_norm": 2.6398537158966064, + "learning_rate": 4.960044840183381e-05, + "loss": 0.9133, + "step": 2940 + }, + { + "epoch": 0.295487554464867, + "grad_norm": 2.6897711753845215, + "learning_rate": 4.959763629780126e-05, + "loss": 0.8332, + "step": 2950 + }, + { + "epoch": 0.29648920719186656, + "grad_norm": 3.5697755813598633, + "learning_rate": 4.959481441271462e-05, + "loss": 0.9082, + "step": 2960 + }, + { + "epoch": 0.29749085991886615, + "grad_norm": 3.7671542167663574, + "learning_rate": 4.9591982747696006e-05, + "loss": 0.9394, + "step": 2970 + }, + { + "epoch": 0.2984925126458657, + "grad_norm": 2.823279857635498, + "learning_rate": 4.958914130387139e-05, + "loss": 0.8998, + "step": 2980 + }, + { + "epoch": 0.2994941653728652, + "grad_norm": 2.3585197925567627, + "learning_rate": 4.958629008237066e-05, + "loss": 0.8573, + "step": 2990 + }, + { + "epoch": 0.30049581809986475, + "grad_norm": 2.3386313915252686, + "learning_rate": 4.958342908432757e-05, + "loss": 0.7891, + "step": 3000 + }, + { + "epoch": 0.30149747082686434, + "grad_norm": 2.2583234310150146, + "learning_rate": 4.958055831087979e-05, + "loss": 0.912, + "step": 3010 + }, + { + "epoch": 0.3024991235538639, + "grad_norm": 3.0650525093078613, + "learning_rate": 4.957767776316885e-05, + "loss": 0.9714, + "step": 3020 + }, + { + "epoch": 0.3035007762808634, + "grad_norm": 2.90987491607666, + "learning_rate": 4.9574787442340185e-05, + "loss": 0.7965, + "step": 3030 + }, + { + "epoch": 0.30450242900786295, + "grad_norm": 2.936350107192993, + "learning_rate": 4.95718873495431e-05, + "loss": 0.8288, + "step": 3040 + }, + { + "epoch": 0.30550408173486254, + "grad_norm": 3.2421929836273193, + "learning_rate": 4.956897748593079e-05, + "loss": 0.8186, + "step": 3050 + }, + { + "epoch": 0.3065057344618621, + "grad_norm": 2.5979058742523193, + "learning_rate": 4.956605785266034e-05, + "loss": 0.875, + "step": 3060 + }, + { + "epoch": 0.3075073871888616, + "grad_norm": 2.861781597137451, + "learning_rate": 4.956312845089274e-05, + "loss": 0.8187, + "step": 3070 + }, + { + "epoch": 0.30850903991586115, + "grad_norm": 2.5205078125, + "learning_rate": 4.9560189281792815e-05, + "loss": 0.8718, + "step": 3080 + }, + { + "epoch": 0.30951069264286074, + "grad_norm": 2.3308043479919434, + "learning_rate": 4.955724034652931e-05, + "loss": 0.9551, + "step": 3090 + }, + { + "epoch": 0.3105123453698603, + "grad_norm": 2.916992425918579, + "learning_rate": 4.955428164627486e-05, + "loss": 0.828, + "step": 3100 + }, + { + "epoch": 0.3115139980968598, + "grad_norm": 2.490170478820801, + "learning_rate": 4.955131318220595e-05, + "loss": 0.8135, + "step": 3110 + }, + { + "epoch": 0.31251565082385935, + "grad_norm": 3.505068302154541, + "learning_rate": 4.954833495550297e-05, + "loss": 0.8689, + "step": 3120 + }, + { + "epoch": 0.31351730355085894, + "grad_norm": 2.6903154850006104, + "learning_rate": 4.9545346967350204e-05, + "loss": 0.9086, + "step": 3130 + }, + { + "epoch": 0.3145189562778585, + "grad_norm": 3.43782377243042, + "learning_rate": 4.954234921893579e-05, + "loss": 0.8903, + "step": 3140 + }, + { + "epoch": 0.315520609004858, + "grad_norm": 2.6618101596832275, + "learning_rate": 4.953934171145175e-05, + "loss": 0.9068, + "step": 3150 + }, + { + "epoch": 0.31652226173185755, + "grad_norm": 3.6636927127838135, + "learning_rate": 4.953632444609401e-05, + "loss": 0.8745, + "step": 3160 + }, + { + "epoch": 0.31752391445885714, + "grad_norm": 2.4644505977630615, + "learning_rate": 4.953329742406235e-05, + "loss": 0.9156, + "step": 3170 + }, + { + "epoch": 0.3185255671858567, + "grad_norm": 2.7352774143218994, + "learning_rate": 4.9530260646560455e-05, + "loss": 0.8799, + "step": 3180 + }, + { + "epoch": 0.3195272199128562, + "grad_norm": 2.6652235984802246, + "learning_rate": 4.952721411479587e-05, + "loss": 0.8353, + "step": 3190 + }, + { + "epoch": 0.32052887263985574, + "grad_norm": 2.7542600631713867, + "learning_rate": 4.952415782998001e-05, + "loss": 0.7625, + "step": 3200 + }, + { + "epoch": 0.32153052536685534, + "grad_norm": 2.849294662475586, + "learning_rate": 4.9521091793328204e-05, + "loss": 0.8022, + "step": 3210 + }, + { + "epoch": 0.32253217809385487, + "grad_norm": 2.48455548286438, + "learning_rate": 4.951801600605962e-05, + "loss": 0.8215, + "step": 3220 + }, + { + "epoch": 0.3235338308208544, + "grad_norm": 3.339585304260254, + "learning_rate": 4.9514930469397326e-05, + "loss": 0.8675, + "step": 3230 + }, + { + "epoch": 0.32453548354785394, + "grad_norm": 2.703068494796753, + "learning_rate": 4.951183518456827e-05, + "loss": 0.7836, + "step": 3240 + }, + { + "epoch": 0.32553713627485353, + "grad_norm": 2.545600414276123, + "learning_rate": 4.950873015280325e-05, + "loss": 0.8031, + "step": 3250 + }, + { + "epoch": 0.32653878900185307, + "grad_norm": 3.298454523086548, + "learning_rate": 4.9505615375336965e-05, + "loss": 0.8976, + "step": 3260 + }, + { + "epoch": 0.3275404417288526, + "grad_norm": 2.814692735671997, + "learning_rate": 4.950249085340799e-05, + "loss": 0.8475, + "step": 3270 + }, + { + "epoch": 0.32854209445585214, + "grad_norm": 2.775646448135376, + "learning_rate": 4.949935658825876e-05, + "loss": 0.8412, + "step": 3280 + }, + { + "epoch": 0.32954374718285173, + "grad_norm": 2.9351887702941895, + "learning_rate": 4.9496212581135585e-05, + "loss": 0.8447, + "step": 3290 + }, + { + "epoch": 0.33054539990985127, + "grad_norm": 2.701782464981079, + "learning_rate": 4.9493058833288666e-05, + "loss": 0.9201, + "step": 3300 + }, + { + "epoch": 0.3315470526368508, + "grad_norm": 3.1329259872436523, + "learning_rate": 4.948989534597205e-05, + "loss": 0.7931, + "step": 3310 + }, + { + "epoch": 0.33254870536385034, + "grad_norm": 3.1399502754211426, + "learning_rate": 4.948672212044369e-05, + "loss": 0.8213, + "step": 3320 + }, + { + "epoch": 0.33355035809084993, + "grad_norm": 2.487262725830078, + "learning_rate": 4.948353915796538e-05, + "loss": 0.82, + "step": 3330 + }, + { + "epoch": 0.33455201081784947, + "grad_norm": 2.3617560863494873, + "learning_rate": 4.948034645980281e-05, + "loss": 0.8816, + "step": 3340 + }, + { + "epoch": 0.335553663544849, + "grad_norm": 2.443876266479492, + "learning_rate": 4.947714402722552e-05, + "loss": 0.8592, + "step": 3350 + }, + { + "epoch": 0.33655531627184854, + "grad_norm": 3.451190948486328, + "learning_rate": 4.947393186150694e-05, + "loss": 0.8691, + "step": 3360 + }, + { + "epoch": 0.3375569689988481, + "grad_norm": 2.79589581489563, + "learning_rate": 4.947070996392435e-05, + "loss": 0.8485, + "step": 3370 + }, + { + "epoch": 0.33855862172584766, + "grad_norm": 3.133854389190674, + "learning_rate": 4.946747833575894e-05, + "loss": 0.8401, + "step": 3380 + }, + { + "epoch": 0.3395602744528472, + "grad_norm": 2.8713510036468506, + "learning_rate": 4.946423697829572e-05, + "loss": 0.7918, + "step": 3390 + }, + { + "epoch": 0.34056192717984674, + "grad_norm": 3.4451394081115723, + "learning_rate": 4.946098589282359e-05, + "loss": 0.8507, + "step": 3400 + }, + { + "epoch": 0.34156357990684627, + "grad_norm": 2.4211623668670654, + "learning_rate": 4.945772508063531e-05, + "loss": 0.9028, + "step": 3410 + }, + { + "epoch": 0.34256523263384586, + "grad_norm": 3.622356414794922, + "learning_rate": 4.945445454302754e-05, + "loss": 0.7338, + "step": 3420 + }, + { + "epoch": 0.3435668853608454, + "grad_norm": 2.8370401859283447, + "learning_rate": 4.945117428130076e-05, + "loss": 0.8819, + "step": 3430 + }, + { + "epoch": 0.34456853808784493, + "grad_norm": 2.372068405151367, + "learning_rate": 4.944788429675936e-05, + "loss": 0.817, + "step": 3440 + }, + { + "epoch": 0.34557019081484447, + "grad_norm": 2.5194711685180664, + "learning_rate": 4.944458459071156e-05, + "loss": 0.8744, + "step": 3450 + }, + { + "epoch": 0.34657184354184406, + "grad_norm": 2.253473997116089, + "learning_rate": 4.944127516446947e-05, + "loss": 0.8461, + "step": 3460 + }, + { + "epoch": 0.3475734962688436, + "grad_norm": 2.651766300201416, + "learning_rate": 4.9437956019349054e-05, + "loss": 0.8544, + "step": 3470 + }, + { + "epoch": 0.34857514899584313, + "grad_norm": 2.0998165607452393, + "learning_rate": 4.943462715667015e-05, + "loss": 0.8272, + "step": 3480 + }, + { + "epoch": 0.34957680172284267, + "grad_norm": 3.30053973197937, + "learning_rate": 4.9431288577756446e-05, + "loss": 0.8739, + "step": 3490 + }, + { + "epoch": 0.35057845444984226, + "grad_norm": 2.7177000045776367, + "learning_rate": 4.9427940283935504e-05, + "loss": 0.8858, + "step": 3500 + }, + { + "epoch": 0.3515801071768418, + "grad_norm": 2.158338785171509, + "learning_rate": 4.9424582276538746e-05, + "loss": 0.7827, + "step": 3510 + }, + { + "epoch": 0.35258175990384133, + "grad_norm": 3.1199424266815186, + "learning_rate": 4.9421214556901454e-05, + "loss": 0.8491, + "step": 3520 + }, + { + "epoch": 0.35358341263084087, + "grad_norm": 2.8127048015594482, + "learning_rate": 4.941783712636278e-05, + "loss": 0.8002, + "step": 3530 + }, + { + "epoch": 0.35458506535784046, + "grad_norm": 2.9365146160125732, + "learning_rate": 4.941444998626573e-05, + "loss": 0.9608, + "step": 3540 + }, + { + "epoch": 0.35558671808484, + "grad_norm": 3.4413721561431885, + "learning_rate": 4.9411053137957174e-05, + "loss": 0.8685, + "step": 3550 + }, + { + "epoch": 0.35658837081183953, + "grad_norm": 2.9965898990631104, + "learning_rate": 4.940764658278785e-05, + "loss": 0.8144, + "step": 3560 + }, + { + "epoch": 0.35759002353883906, + "grad_norm": 2.883406162261963, + "learning_rate": 4.9404230322112324e-05, + "loss": 0.8437, + "step": 3570 + }, + { + "epoch": 0.35859167626583865, + "grad_norm": 3.0368754863739014, + "learning_rate": 4.940080435728907e-05, + "loss": 0.8707, + "step": 3580 + }, + { + "epoch": 0.3595933289928382, + "grad_norm": 2.8644607067108154, + "learning_rate": 4.939736868968038e-05, + "loss": 0.8184, + "step": 3590 + }, + { + "epoch": 0.3605949817198377, + "grad_norm": 2.6353414058685303, + "learning_rate": 4.9393923320652424e-05, + "loss": 0.8343, + "step": 3600 + }, + { + "epoch": 0.36159663444683726, + "grad_norm": 3.469896078109741, + "learning_rate": 4.939046825157523e-05, + "loss": 0.845, + "step": 3610 + }, + { + "epoch": 0.36259828717383685, + "grad_norm": 2.8434600830078125, + "learning_rate": 4.9387003483822666e-05, + "loss": 0.8833, + "step": 3620 + }, + { + "epoch": 0.3635999399008364, + "grad_norm": 3.1709251403808594, + "learning_rate": 4.938352901877249e-05, + "loss": 0.826, + "step": 3630 + }, + { + "epoch": 0.3646015926278359, + "grad_norm": 2.7404329776763916, + "learning_rate": 4.938004485780628e-05, + "loss": 0.7455, + "step": 3640 + }, + { + "epoch": 0.36560324535483546, + "grad_norm": 4.057974338531494, + "learning_rate": 4.937655100230949e-05, + "loss": 0.7923, + "step": 3650 + }, + { + "epoch": 0.36660489808183505, + "grad_norm": 2.943912982940674, + "learning_rate": 4.937304745367143e-05, + "loss": 0.8386, + "step": 3660 + }, + { + "epoch": 0.3676065508088346, + "grad_norm": 3.354377269744873, + "learning_rate": 4.936953421328524e-05, + "loss": 0.8257, + "step": 3670 + }, + { + "epoch": 0.3686082035358341, + "grad_norm": 2.800428628921509, + "learning_rate": 4.936601128254794e-05, + "loss": 0.7587, + "step": 3680 + }, + { + "epoch": 0.36960985626283366, + "grad_norm": 3.1578376293182373, + "learning_rate": 4.936247866286041e-05, + "loss": 0.7953, + "step": 3690 + }, + { + "epoch": 0.37061150898983325, + "grad_norm": 2.962111711502075, + "learning_rate": 4.935893635562735e-05, + "loss": 0.826, + "step": 3700 + }, + { + "epoch": 0.3716131617168328, + "grad_norm": 2.9009153842926025, + "learning_rate": 4.935538436225733e-05, + "loss": 0.7948, + "step": 3710 + }, + { + "epoch": 0.3726148144438323, + "grad_norm": 2.9375569820404053, + "learning_rate": 4.9351822684162787e-05, + "loss": 0.8348, + "step": 3720 + }, + { + "epoch": 0.37361646717083186, + "grad_norm": 3.2307522296905518, + "learning_rate": 4.934825132275999e-05, + "loss": 0.8755, + "step": 3730 + }, + { + "epoch": 0.37461811989783145, + "grad_norm": 3.187584161758423, + "learning_rate": 4.934467027946905e-05, + "loss": 0.8315, + "step": 3740 + }, + { + "epoch": 0.375619772624831, + "grad_norm": 2.4784653186798096, + "learning_rate": 4.9341079555713946e-05, + "loss": 0.8161, + "step": 3750 + }, + { + "epoch": 0.3766214253518305, + "grad_norm": 3.0355725288391113, + "learning_rate": 4.9337479152922504e-05, + "loss": 0.7234, + "step": 3760 + }, + { + "epoch": 0.37762307807883005, + "grad_norm": 5.005615711212158, + "learning_rate": 4.93338690725264e-05, + "loss": 0.872, + "step": 3770 + }, + { + "epoch": 0.37862473080582965, + "grad_norm": 2.9800662994384766, + "learning_rate": 4.933024931596114e-05, + "loss": 0.8399, + "step": 3780 + }, + { + "epoch": 0.3796263835328292, + "grad_norm": 3.00419282913208, + "learning_rate": 4.93266198846661e-05, + "loss": 0.7835, + "step": 3790 + }, + { + "epoch": 0.3806280362598287, + "grad_norm": 2.776589870452881, + "learning_rate": 4.93229807800845e-05, + "loss": 0.8586, + "step": 3800 + }, + { + "epoch": 0.38162968898682825, + "grad_norm": 2.228154420852661, + "learning_rate": 4.9319332003663385e-05, + "loss": 0.7737, + "step": 3810 + }, + { + "epoch": 0.3826313417138278, + "grad_norm": 2.2705202102661133, + "learning_rate": 4.931567355685368e-05, + "loss": 0.8327, + "step": 3820 + }, + { + "epoch": 0.3836329944408274, + "grad_norm": 3.1992833614349365, + "learning_rate": 4.9312005441110126e-05, + "loss": 0.7732, + "step": 3830 + }, + { + "epoch": 0.3846346471678269, + "grad_norm": 2.792924642562866, + "learning_rate": 4.930832765789132e-05, + "loss": 0.7673, + "step": 3840 + }, + { + "epoch": 0.38563629989482645, + "grad_norm": 2.705148935317993, + "learning_rate": 4.9304640208659714e-05, + "loss": 0.7368, + "step": 3850 + }, + { + "epoch": 0.386637952621826, + "grad_norm": 2.6847712993621826, + "learning_rate": 4.930094309488158e-05, + "loss": 0.8845, + "step": 3860 + }, + { + "epoch": 0.3876396053488256, + "grad_norm": 2.3294968605041504, + "learning_rate": 4.929723631802705e-05, + "loss": 0.8204, + "step": 3870 + }, + { + "epoch": 0.3886412580758251, + "grad_norm": 3.4525415897369385, + "learning_rate": 4.9293519879570095e-05, + "loss": 0.7846, + "step": 3880 + }, + { + "epoch": 0.38964291080282465, + "grad_norm": 3.896613597869873, + "learning_rate": 4.9289793780988526e-05, + "loss": 0.7868, + "step": 3890 + }, + { + "epoch": 0.3906445635298242, + "grad_norm": 2.876314878463745, + "learning_rate": 4.928605802376399e-05, + "loss": 0.7856, + "step": 3900 + }, + { + "epoch": 0.3916462162568238, + "grad_norm": 3.1712610721588135, + "learning_rate": 4.9282312609382004e-05, + "loss": 0.748, + "step": 3910 + }, + { + "epoch": 0.3926478689838233, + "grad_norm": 3.0286600589752197, + "learning_rate": 4.927855753933188e-05, + "loss": 0.8701, + "step": 3920 + }, + { + "epoch": 0.39364952171082285, + "grad_norm": 2.6831576824188232, + "learning_rate": 4.9274792815106794e-05, + "loss": 0.8515, + "step": 3930 + }, + { + "epoch": 0.3946511744378224, + "grad_norm": 2.632242202758789, + "learning_rate": 4.9271018438203766e-05, + "loss": 0.8772, + "step": 3940 + }, + { + "epoch": 0.395652827164822, + "grad_norm": 3.1001410484313965, + "learning_rate": 4.9267234410123644e-05, + "loss": 0.7569, + "step": 3950 + }, + { + "epoch": 0.3966544798918215, + "grad_norm": 3.0256540775299072, + "learning_rate": 4.9263440732371116e-05, + "loss": 0.8183, + "step": 3960 + }, + { + "epoch": 0.39765613261882105, + "grad_norm": 3.7380168437957764, + "learning_rate": 4.925963740645471e-05, + "loss": 0.8241, + "step": 3970 + }, + { + "epoch": 0.3986577853458206, + "grad_norm": 3.397099256515503, + "learning_rate": 4.925582443388679e-05, + "loss": 0.8335, + "step": 3980 + }, + { + "epoch": 0.3996594380728202, + "grad_norm": 2.583353281021118, + "learning_rate": 4.925200181618354e-05, + "loss": 0.7761, + "step": 3990 + }, + { + "epoch": 0.4006610907998197, + "grad_norm": 2.2222886085510254, + "learning_rate": 4.924816955486501e-05, + "loss": 0.7627, + "step": 4000 + }, + { + "epoch": 0.40166274352681924, + "grad_norm": 2.9799551963806152, + "learning_rate": 4.9244327651455065e-05, + "loss": 0.8719, + "step": 4010 + }, + { + "epoch": 0.4026643962538188, + "grad_norm": 2.8081564903259277, + "learning_rate": 4.9240476107481405e-05, + "loss": 0.858, + "step": 4020 + }, + { + "epoch": 0.40366604898081837, + "grad_norm": 3.4415876865386963, + "learning_rate": 4.923661492447556e-05, + "loss": 0.828, + "step": 4030 + }, + { + "epoch": 0.4046677017078179, + "grad_norm": 3.00347900390625, + "learning_rate": 4.92327441039729e-05, + "loss": 0.7376, + "step": 4040 + }, + { + "epoch": 0.40566935443481744, + "grad_norm": 2.860119581222534, + "learning_rate": 4.922886364751263e-05, + "loss": 0.8286, + "step": 4050 + }, + { + "epoch": 0.406671007161817, + "grad_norm": 2.8311233520507812, + "learning_rate": 4.92249735566378e-05, + "loss": 0.8347, + "step": 4060 + }, + { + "epoch": 0.40767265988881657, + "grad_norm": 2.2550930976867676, + "learning_rate": 4.922107383289524e-05, + "loss": 0.8662, + "step": 4070 + }, + { + "epoch": 0.4086743126158161, + "grad_norm": 2.128415584564209, + "learning_rate": 4.921716447783566e-05, + "loss": 0.7438, + "step": 4080 + }, + { + "epoch": 0.40967596534281564, + "grad_norm": 2.728203773498535, + "learning_rate": 4.921324549301359e-05, + "loss": 0.8708, + "step": 4090 + }, + { + "epoch": 0.4106776180698152, + "grad_norm": 2.9716641902923584, + "learning_rate": 4.9209316879987374e-05, + "loss": 0.8483, + "step": 4100 + }, + { + "epoch": 0.41167927079681477, + "grad_norm": 2.1702470779418945, + "learning_rate": 4.92053786403192e-05, + "loss": 0.7749, + "step": 4110 + }, + { + "epoch": 0.4126809235238143, + "grad_norm": 2.6090810298919678, + "learning_rate": 4.9201430775575074e-05, + "loss": 0.7221, + "step": 4120 + }, + { + "epoch": 0.41368257625081384, + "grad_norm": 3.643763780593872, + "learning_rate": 4.9197473287324835e-05, + "loss": 0.8035, + "step": 4130 + }, + { + "epoch": 0.4146842289778134, + "grad_norm": 2.5547125339508057, + "learning_rate": 4.919350617714215e-05, + "loss": 0.779, + "step": 4140 + }, + { + "epoch": 0.41568588170481297, + "grad_norm": 2.8796565532684326, + "learning_rate": 4.9189529446604484e-05, + "loss": 0.8606, + "step": 4150 + }, + { + "epoch": 0.4166875344318125, + "grad_norm": 2.778782606124878, + "learning_rate": 4.918554309729318e-05, + "loss": 0.8208, + "step": 4160 + }, + { + "epoch": 0.41768918715881204, + "grad_norm": 3.649463176727295, + "learning_rate": 4.9181547130793385e-05, + "loss": 0.8072, + "step": 4170 + }, + { + "epoch": 0.41869083988581157, + "grad_norm": 2.776176929473877, + "learning_rate": 4.917754154869403e-05, + "loss": 0.7387, + "step": 4180 + }, + { + "epoch": 0.41969249261281116, + "grad_norm": 3.0687742233276367, + "learning_rate": 4.917352635258794e-05, + "loss": 0.8521, + "step": 4190 + }, + { + "epoch": 0.4206941453398107, + "grad_norm": 2.9265615940093994, + "learning_rate": 4.916950154407169e-05, + "loss": 0.8117, + "step": 4200 + }, + { + "epoch": 0.42169579806681023, + "grad_norm": 3.030052900314331, + "learning_rate": 4.916546712474573e-05, + "loss": 0.8334, + "step": 4210 + }, + { + "epoch": 0.42269745079380977, + "grad_norm": 3.2427847385406494, + "learning_rate": 4.916142309621432e-05, + "loss": 0.8247, + "step": 4220 + }, + { + "epoch": 0.42369910352080936, + "grad_norm": 2.7868199348449707, + "learning_rate": 4.9157369460085535e-05, + "loss": 0.8202, + "step": 4230 + }, + { + "epoch": 0.4247007562478089, + "grad_norm": 2.5167086124420166, + "learning_rate": 4.915330621797126e-05, + "loss": 0.8389, + "step": 4240 + }, + { + "epoch": 0.42570240897480843, + "grad_norm": 2.5550220012664795, + "learning_rate": 4.914923337148722e-05, + "loss": 0.8128, + "step": 4250 + }, + { + "epoch": 0.42670406170180797, + "grad_norm": 2.4748544692993164, + "learning_rate": 4.9145150922252944e-05, + "loss": 0.826, + "step": 4260 + }, + { + "epoch": 0.42770571442880756, + "grad_norm": 3.1754608154296875, + "learning_rate": 4.9141058871891793e-05, + "loss": 0.7927, + "step": 4270 + }, + { + "epoch": 0.4287073671558071, + "grad_norm": 3.4101622104644775, + "learning_rate": 4.9136957222030934e-05, + "loss": 0.8662, + "step": 4280 + }, + { + "epoch": 0.42970901988280663, + "grad_norm": 2.1540229320526123, + "learning_rate": 4.9132845974301357e-05, + "loss": 0.7403, + "step": 4290 + }, + { + "epoch": 0.43071067260980617, + "grad_norm": 3.1899423599243164, + "learning_rate": 4.912872513033786e-05, + "loss": 0.7887, + "step": 4300 + }, + { + "epoch": 0.4317123253368057, + "grad_norm": 2.9181227684020996, + "learning_rate": 4.912459469177907e-05, + "loss": 0.7683, + "step": 4310 + }, + { + "epoch": 0.4327139780638053, + "grad_norm": 2.837829351425171, + "learning_rate": 4.9120454660267426e-05, + "loss": 0.7659, + "step": 4320 + }, + { + "epoch": 0.43371563079080483, + "grad_norm": 2.8570311069488525, + "learning_rate": 4.911630503744916e-05, + "loss": 0.8271, + "step": 4330 + }, + { + "epoch": 0.43471728351780436, + "grad_norm": 2.584602117538452, + "learning_rate": 4.911214582497436e-05, + "loss": 0.8455, + "step": 4340 + }, + { + "epoch": 0.4357189362448039, + "grad_norm": 3.5033724308013916, + "learning_rate": 4.91079770244969e-05, + "loss": 0.8245, + "step": 4350 + }, + { + "epoch": 0.4367205889718035, + "grad_norm": 2.954213857650757, + "learning_rate": 4.910379863767446e-05, + "loss": 0.832, + "step": 4360 + }, + { + "epoch": 0.437722241698803, + "grad_norm": 3.0140154361724854, + "learning_rate": 4.909961066616855e-05, + "loss": 0.7357, + "step": 4370 + }, + { + "epoch": 0.43872389442580256, + "grad_norm": 2.633049726486206, + "learning_rate": 4.909541311164448e-05, + "loss": 0.7821, + "step": 4380 + }, + { + "epoch": 0.4397255471528021, + "grad_norm": 3.462796688079834, + "learning_rate": 4.909120597577137e-05, + "loss": 0.759, + "step": 4390 + }, + { + "epoch": 0.4407271998798017, + "grad_norm": 2.973219633102417, + "learning_rate": 4.9086989260222165e-05, + "loss": 0.7813, + "step": 4400 + }, + { + "epoch": 0.4417288526068012, + "grad_norm": 2.684271812438965, + "learning_rate": 4.90827629666736e-05, + "loss": 0.7966, + "step": 4410 + }, + { + "epoch": 0.44273050533380076, + "grad_norm": 2.947772264480591, + "learning_rate": 4.9078527096806225e-05, + "loss": 0.7481, + "step": 4420 + }, + { + "epoch": 0.4437321580608003, + "grad_norm": 3.4349887371063232, + "learning_rate": 4.907428165230441e-05, + "loss": 0.7553, + "step": 4430 + }, + { + "epoch": 0.4447338107877999, + "grad_norm": 2.8952155113220215, + "learning_rate": 4.907002663485632e-05, + "loss": 0.7569, + "step": 4440 + }, + { + "epoch": 0.4457354635147994, + "grad_norm": 2.7803821563720703, + "learning_rate": 4.9065762046153914e-05, + "loss": 0.8073, + "step": 4450 + }, + { + "epoch": 0.44673711624179896, + "grad_norm": 3.2252511978149414, + "learning_rate": 4.9061487887892985e-05, + "loss": 0.7818, + "step": 4460 + }, + { + "epoch": 0.4477387689687985, + "grad_norm": 3.153841018676758, + "learning_rate": 4.905720416177312e-05, + "loss": 0.839, + "step": 4470 + }, + { + "epoch": 0.4487404216957981, + "grad_norm": 2.7690632343292236, + "learning_rate": 4.9052910869497704e-05, + "loss": 0.7685, + "step": 4480 + }, + { + "epoch": 0.4497420744227976, + "grad_norm": 2.3396732807159424, + "learning_rate": 4.904860801277392e-05, + "loss": 0.6467, + "step": 4490 + }, + { + "epoch": 0.45074372714979716, + "grad_norm": 2.70893931388855, + "learning_rate": 4.904429559331279e-05, + "loss": 0.7481, + "step": 4500 + }, + { + "epoch": 0.4517453798767967, + "grad_norm": 3.1398801803588867, + "learning_rate": 4.9039973612829094e-05, + "loss": 0.7929, + "step": 4510 + }, + { + "epoch": 0.4527470326037963, + "grad_norm": 2.2686514854431152, + "learning_rate": 4.903564207304143e-05, + "loss": 0.8276, + "step": 4520 + }, + { + "epoch": 0.4537486853307958, + "grad_norm": 2.6630866527557373, + "learning_rate": 4.903130097567222e-05, + "loss": 0.7651, + "step": 4530 + }, + { + "epoch": 0.45475033805779536, + "grad_norm": 2.9019227027893066, + "learning_rate": 4.902695032244765e-05, + "loss": 0.8739, + "step": 4540 + }, + { + "epoch": 0.4557519907847949, + "grad_norm": 2.878720760345459, + "learning_rate": 4.9022590115097723e-05, + "loss": 0.8215, + "step": 4550 + }, + { + "epoch": 0.4567536435117945, + "grad_norm": 2.4830129146575928, + "learning_rate": 4.9018220355356246e-05, + "loss": 0.7917, + "step": 4560 + }, + { + "epoch": 0.457755296238794, + "grad_norm": 2.447023630142212, + "learning_rate": 4.901384104496083e-05, + "loss": 0.7182, + "step": 4570 + }, + { + "epoch": 0.45875694896579355, + "grad_norm": 2.4233357906341553, + "learning_rate": 4.900945218565285e-05, + "loss": 0.8353, + "step": 4580 + }, + { + "epoch": 0.4597586016927931, + "grad_norm": 2.6926865577697754, + "learning_rate": 4.900505377917751e-05, + "loss": 0.7566, + "step": 4590 + }, + { + "epoch": 0.4607602544197927, + "grad_norm": 3.171412229537964, + "learning_rate": 4.900064582728381e-05, + "loss": 0.8012, + "step": 4600 + }, + { + "epoch": 0.4617619071467922, + "grad_norm": 2.605421781539917, + "learning_rate": 4.899622833172452e-05, + "loss": 0.7585, + "step": 4610 + }, + { + "epoch": 0.46276355987379175, + "grad_norm": 2.7635443210601807, + "learning_rate": 4.899180129425625e-05, + "loss": 0.7919, + "step": 4620 + }, + { + "epoch": 0.4637652126007913, + "grad_norm": 3.7377936840057373, + "learning_rate": 4.8987364716639346e-05, + "loss": 0.7324, + "step": 4630 + }, + { + "epoch": 0.4647668653277909, + "grad_norm": 2.2805068492889404, + "learning_rate": 4.898291860063799e-05, + "loss": 0.7262, + "step": 4640 + }, + { + "epoch": 0.4657685180547904, + "grad_norm": 2.1092188358306885, + "learning_rate": 4.897846294802014e-05, + "loss": 0.7749, + "step": 4650 + }, + { + "epoch": 0.46677017078178995, + "grad_norm": 2.417418956756592, + "learning_rate": 4.8973997760557566e-05, + "loss": 0.8886, + "step": 4660 + }, + { + "epoch": 0.4677718235087895, + "grad_norm": 2.4715187549591064, + "learning_rate": 4.89695230400258e-05, + "loss": 0.7872, + "step": 4670 + }, + { + "epoch": 0.4687734762357891, + "grad_norm": 3.4084720611572266, + "learning_rate": 4.8965038788204185e-05, + "loss": 0.7903, + "step": 4680 + }, + { + "epoch": 0.4697751289627886, + "grad_norm": 2.9078190326690674, + "learning_rate": 4.8960545006875844e-05, + "loss": 0.7957, + "step": 4690 + }, + { + "epoch": 0.47077678168978815, + "grad_norm": 3.02535080909729, + "learning_rate": 4.895604169782769e-05, + "loss": 0.7782, + "step": 4700 + }, + { + "epoch": 0.4717784344167877, + "grad_norm": 2.9437060356140137, + "learning_rate": 4.8951528862850444e-05, + "loss": 0.8061, + "step": 4710 + }, + { + "epoch": 0.4727800871437873, + "grad_norm": 2.7083654403686523, + "learning_rate": 4.894700650373858e-05, + "loss": 0.6672, + "step": 4720 + }, + { + "epoch": 0.4737817398707868, + "grad_norm": 2.350159168243408, + "learning_rate": 4.894247462229038e-05, + "loss": 0.7149, + "step": 4730 + }, + { + "epoch": 0.47478339259778635, + "grad_norm": 2.328615665435791, + "learning_rate": 4.893793322030793e-05, + "loss": 0.734, + "step": 4740 + }, + { + "epoch": 0.4757850453247859, + "grad_norm": 2.403383255004883, + "learning_rate": 4.8933382299597063e-05, + "loss": 0.7973, + "step": 4750 + }, + { + "epoch": 0.4767866980517855, + "grad_norm": 3.2092747688293457, + "learning_rate": 4.892882186196742e-05, + "loss": 0.7937, + "step": 4760 + }, + { + "epoch": 0.477788350778785, + "grad_norm": 2.68188214302063, + "learning_rate": 4.892425190923242e-05, + "loss": 0.7354, + "step": 4770 + }, + { + "epoch": 0.47879000350578454, + "grad_norm": 2.7987594604492188, + "learning_rate": 4.891967244320929e-05, + "loss": 0.8261, + "step": 4780 + }, + { + "epoch": 0.4797916562327841, + "grad_norm": 2.9749321937561035, + "learning_rate": 4.8915083465718984e-05, + "loss": 0.7946, + "step": 4790 + }, + { + "epoch": 0.4807933089597836, + "grad_norm": 2.480320692062378, + "learning_rate": 4.891048497858629e-05, + "loss": 0.7868, + "step": 4800 + }, + { + "epoch": 0.4817949616867832, + "grad_norm": 3.2900009155273438, + "learning_rate": 4.8905876983639775e-05, + "loss": 0.7455, + "step": 4810 + }, + { + "epoch": 0.48279661441378274, + "grad_norm": 2.6279869079589844, + "learning_rate": 4.8901259482711744e-05, + "loss": 0.7786, + "step": 4820 + }, + { + "epoch": 0.4837982671407823, + "grad_norm": 2.4610986709594727, + "learning_rate": 4.8896632477638324e-05, + "loss": 0.7257, + "step": 4830 + }, + { + "epoch": 0.4847999198677818, + "grad_norm": 2.5319132804870605, + "learning_rate": 4.88919959702594e-05, + "loss": 0.8303, + "step": 4840 + }, + { + "epoch": 0.4858015725947814, + "grad_norm": 2.8233323097229004, + "learning_rate": 4.888734996241865e-05, + "loss": 0.7286, + "step": 4850 + }, + { + "epoch": 0.48680322532178094, + "grad_norm": 2.4413650035858154, + "learning_rate": 4.8882694455963516e-05, + "loss": 0.7694, + "step": 4860 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 3.778494358062744, + "learning_rate": 4.887802945274523e-05, + "loss": 0.77, + "step": 4870 + }, + { + "epoch": 0.48880653077578, + "grad_norm": 3.3177435398101807, + "learning_rate": 4.887335495461879e-05, + "loss": 0.8672, + "step": 4880 + }, + { + "epoch": 0.4898081835027796, + "grad_norm": 2.7683284282684326, + "learning_rate": 4.886867096344296e-05, + "loss": 0.7497, + "step": 4890 + }, + { + "epoch": 0.49080983622977914, + "grad_norm": 3.1127943992614746, + "learning_rate": 4.886397748108031e-05, + "loss": 0.7698, + "step": 4900 + }, + { + "epoch": 0.4918114889567787, + "grad_norm": 2.4643614292144775, + "learning_rate": 4.885927450939716e-05, + "loss": 0.8021, + "step": 4910 + }, + { + "epoch": 0.4928131416837782, + "grad_norm": 3.0037858486175537, + "learning_rate": 4.8854562050263614e-05, + "loss": 0.8111, + "step": 4920 + }, + { + "epoch": 0.4938147944107778, + "grad_norm": 2.6661083698272705, + "learning_rate": 4.8849840105553536e-05, + "loss": 0.7426, + "step": 4930 + }, + { + "epoch": 0.49481644713777734, + "grad_norm": 2.461570978164673, + "learning_rate": 4.8845108677144565e-05, + "loss": 0.8039, + "step": 4940 + }, + { + "epoch": 0.4958180998647769, + "grad_norm": 2.258551836013794, + "learning_rate": 4.884036776691813e-05, + "loss": 0.6808, + "step": 4950 + }, + { + "epoch": 0.4968197525917764, + "grad_norm": 2.733013153076172, + "learning_rate": 4.8835617376759405e-05, + "loss": 0.8166, + "step": 4960 + }, + { + "epoch": 0.497821405318776, + "grad_norm": 2.9279303550720215, + "learning_rate": 4.8830857508557344e-05, + "loss": 0.7596, + "step": 4970 + }, + { + "epoch": 0.49882305804577554, + "grad_norm": 2.5595335960388184, + "learning_rate": 4.882608816420467e-05, + "loss": 0.7144, + "step": 4980 + }, + { + "epoch": 0.49982471077277507, + "grad_norm": 1.9373695850372314, + "learning_rate": 4.88213093455979e-05, + "loss": 0.7014, + "step": 4990 + }, + { + "epoch": 0.5008263634997746, + "grad_norm": 3.3078603744506836, + "learning_rate": 4.8816521054637264e-05, + "loss": 0.7673, + "step": 5000 + }, + { + "epoch": 0.5018280162267742, + "grad_norm": 3.2751052379608154, + "learning_rate": 4.8811723293226796e-05, + "loss": 0.7277, + "step": 5010 + }, + { + "epoch": 0.5028296689537737, + "grad_norm": 2.650498628616333, + "learning_rate": 4.880691606327429e-05, + "loss": 0.8175, + "step": 5020 + }, + { + "epoch": 0.5038313216807733, + "grad_norm": 3.0442254543304443, + "learning_rate": 4.88020993666913e-05, + "loss": 0.7676, + "step": 5030 + }, + { + "epoch": 0.5048329744077729, + "grad_norm": 2.6046295166015625, + "learning_rate": 4.8797273205393144e-05, + "loss": 0.6961, + "step": 5040 + }, + { + "epoch": 0.5058346271347723, + "grad_norm": 2.5837275981903076, + "learning_rate": 4.8792437581298923e-05, + "loss": 0.7531, + "step": 5050 + }, + { + "epoch": 0.5068362798617719, + "grad_norm": 2.275136947631836, + "learning_rate": 4.8787592496331456e-05, + "loss": 0.7709, + "step": 5060 + }, + { + "epoch": 0.5078379325887715, + "grad_norm": 3.156888961791992, + "learning_rate": 4.8782737952417376e-05, + "loss": 0.818, + "step": 5070 + }, + { + "epoch": 0.508839585315771, + "grad_norm": 3.180203914642334, + "learning_rate": 4.877787395148705e-05, + "loss": 0.8009, + "step": 5080 + }, + { + "epoch": 0.5098412380427706, + "grad_norm": 2.5815541744232178, + "learning_rate": 4.877300049547461e-05, + "loss": 0.766, + "step": 5090 + }, + { + "epoch": 0.5108428907697701, + "grad_norm": 3.7498693466186523, + "learning_rate": 4.876811758631793e-05, + "loss": 0.6966, + "step": 5100 + }, + { + "epoch": 0.5118445434967697, + "grad_norm": 2.6118204593658447, + "learning_rate": 4.8763225225958686e-05, + "loss": 0.7633, + "step": 5110 + }, + { + "epoch": 0.5128461962237693, + "grad_norm": 2.1438217163085938, + "learning_rate": 4.875832341634227e-05, + "loss": 0.7183, + "step": 5120 + }, + { + "epoch": 0.5138478489507687, + "grad_norm": 3.222095012664795, + "learning_rate": 4.875341215941784e-05, + "loss": 0.7274, + "step": 5130 + }, + { + "epoch": 0.5148495016777683, + "grad_norm": 2.6931872367858887, + "learning_rate": 4.874849145713833e-05, + "loss": 0.7493, + "step": 5140 + }, + { + "epoch": 0.5158511544047679, + "grad_norm": 2.759004831314087, + "learning_rate": 4.8743561311460424e-05, + "loss": 0.7586, + "step": 5150 + }, + { + "epoch": 0.5168528071317674, + "grad_norm": 2.3311071395874023, + "learning_rate": 4.873862172434455e-05, + "loss": 0.7245, + "step": 5160 + }, + { + "epoch": 0.517854459858767, + "grad_norm": 2.3808846473693848, + "learning_rate": 4.8733672697754884e-05, + "loss": 0.7359, + "step": 5170 + }, + { + "epoch": 0.5188561125857665, + "grad_norm": 2.6579818725585938, + "learning_rate": 4.8728714233659376e-05, + "loss": 0.7077, + "step": 5180 + }, + { + "epoch": 0.5198577653127661, + "grad_norm": 2.8846435546875, + "learning_rate": 4.872374633402972e-05, + "loss": 0.7361, + "step": 5190 + }, + { + "epoch": 0.5208594180397657, + "grad_norm": 3.370232105255127, + "learning_rate": 4.871876900084137e-05, + "loss": 0.7259, + "step": 5200 + }, + { + "epoch": 0.5218610707667651, + "grad_norm": 2.5193967819213867, + "learning_rate": 4.871378223607351e-05, + "loss": 0.7675, + "step": 5210 + }, + { + "epoch": 0.5228627234937647, + "grad_norm": 2.9328339099884033, + "learning_rate": 4.870878604170909e-05, + "loss": 0.8851, + "step": 5220 + }, + { + "epoch": 0.5238643762207643, + "grad_norm": 3.101991891860962, + "learning_rate": 4.870378041973481e-05, + "loss": 0.776, + "step": 5230 + }, + { + "epoch": 0.5248660289477638, + "grad_norm": 3.0238535404205322, + "learning_rate": 4.8698765372141106e-05, + "loss": 0.7957, + "step": 5240 + }, + { + "epoch": 0.5258676816747634, + "grad_norm": 2.859778642654419, + "learning_rate": 4.8693740900922193e-05, + "loss": 0.7584, + "step": 5250 + }, + { + "epoch": 0.5268693344017629, + "grad_norm": 2.8081979751586914, + "learning_rate": 4.868870700807599e-05, + "loss": 0.8051, + "step": 5260 + }, + { + "epoch": 0.5278709871287625, + "grad_norm": 3.409069061279297, + "learning_rate": 4.86836636956042e-05, + "loss": 0.7246, + "step": 5270 + }, + { + "epoch": 0.528872639855762, + "grad_norm": 2.7888481616973877, + "learning_rate": 4.867861096551224e-05, + "loss": 0.7656, + "step": 5280 + }, + { + "epoch": 0.5298742925827615, + "grad_norm": 2.872657299041748, + "learning_rate": 4.867354881980931e-05, + "loss": 0.8549, + "step": 5290 + }, + { + "epoch": 0.5308759453097611, + "grad_norm": 3.0871007442474365, + "learning_rate": 4.8668477260508304e-05, + "loss": 0.757, + "step": 5300 + }, + { + "epoch": 0.5318775980367606, + "grad_norm": 2.824735641479492, + "learning_rate": 4.866339628962591e-05, + "loss": 0.8681, + "step": 5310 + }, + { + "epoch": 0.5328792507637602, + "grad_norm": 3.060173273086548, + "learning_rate": 4.8658305909182535e-05, + "loss": 0.7545, + "step": 5320 + }, + { + "epoch": 0.5338809034907598, + "grad_norm": 2.845705509185791, + "learning_rate": 4.865320612120231e-05, + "loss": 0.766, + "step": 5330 + }, + { + "epoch": 0.5348825562177593, + "grad_norm": 2.0810883045196533, + "learning_rate": 4.8648096927713135e-05, + "loss": 0.7974, + "step": 5340 + }, + { + "epoch": 0.5358842089447589, + "grad_norm": 2.593435764312744, + "learning_rate": 4.864297833074665e-05, + "loss": 0.758, + "step": 5350 + }, + { + "epoch": 0.5368858616717584, + "grad_norm": 2.5074310302734375, + "learning_rate": 4.863785033233821e-05, + "loss": 0.7201, + "step": 5360 + }, + { + "epoch": 0.5378875143987579, + "grad_norm": 2.603663444519043, + "learning_rate": 4.863271293452693e-05, + "loss": 0.6588, + "step": 5370 + }, + { + "epoch": 0.5388891671257575, + "grad_norm": 2.7700934410095215, + "learning_rate": 4.862756613935565e-05, + "loss": 0.6865, + "step": 5380 + }, + { + "epoch": 0.539890819852757, + "grad_norm": 2.748194932937622, + "learning_rate": 4.862240994887096e-05, + "loss": 0.7789, + "step": 5390 + }, + { + "epoch": 0.5408924725797566, + "grad_norm": 2.4352834224700928, + "learning_rate": 4.8617244365123174e-05, + "loss": 0.7766, + "step": 5400 + }, + { + "epoch": 0.5418941253067562, + "grad_norm": 2.7042815685272217, + "learning_rate": 4.8612069390166344e-05, + "loss": 0.7305, + "step": 5410 + }, + { + "epoch": 0.5428957780337557, + "grad_norm": 2.7817211151123047, + "learning_rate": 4.860688502605826e-05, + "loss": 0.8304, + "step": 5420 + }, + { + "epoch": 0.5438974307607553, + "grad_norm": 2.6968953609466553, + "learning_rate": 4.860169127486043e-05, + "loss": 0.6718, + "step": 5430 + }, + { + "epoch": 0.5448990834877548, + "grad_norm": 2.5667612552642822, + "learning_rate": 4.859648813863813e-05, + "loss": 0.7263, + "step": 5440 + }, + { + "epoch": 0.5459007362147543, + "grad_norm": 2.4719300270080566, + "learning_rate": 4.859127561946033e-05, + "loss": 0.7459, + "step": 5450 + }, + { + "epoch": 0.5469023889417539, + "grad_norm": 2.9818973541259766, + "learning_rate": 4.858605371939976e-05, + "loss": 0.7407, + "step": 5460 + }, + { + "epoch": 0.5479040416687534, + "grad_norm": 2.792579412460327, + "learning_rate": 4.8580822440532845e-05, + "loss": 0.6898, + "step": 5470 + }, + { + "epoch": 0.548905694395753, + "grad_norm": 2.4535489082336426, + "learning_rate": 4.857558178493979e-05, + "loss": 0.7494, + "step": 5480 + }, + { + "epoch": 0.5499073471227526, + "grad_norm": 2.920201539993286, + "learning_rate": 4.857033175470448e-05, + "loss": 0.7872, + "step": 5490 + }, + { + "epoch": 0.5509089998497521, + "grad_norm": 2.750412702560425, + "learning_rate": 4.856507235191454e-05, + "loss": 0.784, + "step": 5500 + }, + { + "epoch": 0.5519106525767516, + "grad_norm": 3.8967442512512207, + "learning_rate": 4.8559803578661356e-05, + "loss": 0.7216, + "step": 5510 + }, + { + "epoch": 0.5529123053037512, + "grad_norm": 2.851120948791504, + "learning_rate": 4.855452543704e-05, + "loss": 0.7928, + "step": 5520 + }, + { + "epoch": 0.5539139580307507, + "grad_norm": 2.98593807220459, + "learning_rate": 4.8549237929149275e-05, + "loss": 0.8081, + "step": 5530 + }, + { + "epoch": 0.5549156107577503, + "grad_norm": 3.0606110095977783, + "learning_rate": 4.854394105709173e-05, + "loss": 0.7919, + "step": 5540 + }, + { + "epoch": 0.5559172634847498, + "grad_norm": 2.4815075397491455, + "learning_rate": 4.8538634822973616e-05, + "loss": 0.8171, + "step": 5550 + }, + { + "epoch": 0.5569189162117494, + "grad_norm": 3.970489740371704, + "learning_rate": 4.853331922890492e-05, + "loss": 0.7901, + "step": 5560 + }, + { + "epoch": 0.557920568938749, + "grad_norm": 3.3668770790100098, + "learning_rate": 4.852799427699934e-05, + "loss": 0.7935, + "step": 5570 + }, + { + "epoch": 0.5589222216657485, + "grad_norm": 2.571666955947876, + "learning_rate": 4.8522659969374303e-05, + "loss": 0.7554, + "step": 5580 + }, + { + "epoch": 0.559923874392748, + "grad_norm": 2.3285953998565674, + "learning_rate": 4.851731630815095e-05, + "loss": 0.7063, + "step": 5590 + }, + { + "epoch": 0.5609255271197476, + "grad_norm": 2.6997599601745605, + "learning_rate": 4.8511963295454156e-05, + "loss": 0.6977, + "step": 5600 + }, + { + "epoch": 0.5619271798467471, + "grad_norm": 3.021113395690918, + "learning_rate": 4.8506600933412494e-05, + "loss": 0.8443, + "step": 5610 + }, + { + "epoch": 0.5629288325737467, + "grad_norm": 2.5922207832336426, + "learning_rate": 4.850122922415827e-05, + "loss": 0.7096, + "step": 5620 + }, + { + "epoch": 0.5639304853007462, + "grad_norm": 2.5463130474090576, + "learning_rate": 4.84958481698275e-05, + "loss": 0.7234, + "step": 5630 + }, + { + "epoch": 0.5649321380277458, + "grad_norm": 3.00300931930542, + "learning_rate": 4.8490457772559915e-05, + "loss": 0.7499, + "step": 5640 + }, + { + "epoch": 0.5659337907547454, + "grad_norm": 1.9227794408798218, + "learning_rate": 4.848505803449897e-05, + "loss": 0.7959, + "step": 5650 + }, + { + "epoch": 0.5669354434817448, + "grad_norm": 2.412663698196411, + "learning_rate": 4.847964895779181e-05, + "loss": 0.6528, + "step": 5660 + }, + { + "epoch": 0.5679370962087444, + "grad_norm": 2.4437978267669678, + "learning_rate": 4.847423054458933e-05, + "loss": 0.7367, + "step": 5670 + }, + { + "epoch": 0.568938748935744, + "grad_norm": 2.260493040084839, + "learning_rate": 4.846880279704612e-05, + "loss": 0.7545, + "step": 5680 + }, + { + "epoch": 0.5699404016627435, + "grad_norm": 2.8233468532562256, + "learning_rate": 4.846336571732046e-05, + "loss": 0.7347, + "step": 5690 + }, + { + "epoch": 0.5709420543897431, + "grad_norm": 2.773327350616455, + "learning_rate": 4.845791930757438e-05, + "loss": 0.7908, + "step": 5700 + }, + { + "epoch": 0.5719437071167426, + "grad_norm": 3.0723698139190674, + "learning_rate": 4.84524635699736e-05, + "loss": 0.7355, + "step": 5710 + }, + { + "epoch": 0.5729453598437422, + "grad_norm": 3.2027089595794678, + "learning_rate": 4.844699850668756e-05, + "loss": 0.792, + "step": 5720 + }, + { + "epoch": 0.5739470125707418, + "grad_norm": 2.687241554260254, + "learning_rate": 4.844152411988937e-05, + "loss": 0.6883, + "step": 5730 + }, + { + "epoch": 0.5749486652977412, + "grad_norm": 3.0531158447265625, + "learning_rate": 4.8436040411755887e-05, + "loss": 0.7003, + "step": 5740 + }, + { + "epoch": 0.5759503180247408, + "grad_norm": 3.0482845306396484, + "learning_rate": 4.8430547384467684e-05, + "loss": 0.701, + "step": 5750 + }, + { + "epoch": 0.5769519707517403, + "grad_norm": 2.8414599895477295, + "learning_rate": 4.8425045040209e-05, + "loss": 0.7466, + "step": 5760 + }, + { + "epoch": 0.5779536234787399, + "grad_norm": 2.379384994506836, + "learning_rate": 4.84195333811678e-05, + "loss": 0.7467, + "step": 5770 + }, + { + "epoch": 0.5789552762057395, + "grad_norm": 2.5822062492370605, + "learning_rate": 4.8414012409535755e-05, + "loss": 0.7882, + "step": 5780 + }, + { + "epoch": 0.579956928932739, + "grad_norm": 2.840921640396118, + "learning_rate": 4.840848212750824e-05, + "loss": 0.7207, + "step": 5790 + }, + { + "epoch": 0.5809585816597386, + "grad_norm": 2.9414560794830322, + "learning_rate": 4.840294253728431e-05, + "loss": 0.754, + "step": 5800 + }, + { + "epoch": 0.5819602343867382, + "grad_norm": 2.658170223236084, + "learning_rate": 4.8397393641066755e-05, + "loss": 0.7069, + "step": 5810 + }, + { + "epoch": 0.5829618871137376, + "grad_norm": 2.7854273319244385, + "learning_rate": 4.839183544106205e-05, + "loss": 0.7554, + "step": 5820 + }, + { + "epoch": 0.5839635398407372, + "grad_norm": 2.4555301666259766, + "learning_rate": 4.838626793948037e-05, + "loss": 0.7387, + "step": 5830 + }, + { + "epoch": 0.5849651925677367, + "grad_norm": 2.213578939437866, + "learning_rate": 4.838069113853557e-05, + "loss": 0.6798, + "step": 5840 + }, + { + "epoch": 0.5859668452947363, + "grad_norm": 3.014767646789551, + "learning_rate": 4.837510504044523e-05, + "loss": 0.6855, + "step": 5850 + }, + { + "epoch": 0.5869684980217359, + "grad_norm": 2.637495994567871, + "learning_rate": 4.836950964743063e-05, + "loss": 0.7603, + "step": 5860 + }, + { + "epoch": 0.5879701507487354, + "grad_norm": 2.4447574615478516, + "learning_rate": 4.8363904961716726e-05, + "loss": 0.7416, + "step": 5870 + }, + { + "epoch": 0.588971803475735, + "grad_norm": 2.8144404888153076, + "learning_rate": 4.835829098553217e-05, + "loss": 0.7945, + "step": 5880 + }, + { + "epoch": 0.5899734562027346, + "grad_norm": 2.2470645904541016, + "learning_rate": 4.8352667721109314e-05, + "loss": 0.6977, + "step": 5890 + }, + { + "epoch": 0.590975108929734, + "grad_norm": 4.678529739379883, + "learning_rate": 4.834703517068422e-05, + "loss": 0.695, + "step": 5900 + }, + { + "epoch": 0.5919767616567336, + "grad_norm": 3.0645904541015625, + "learning_rate": 4.83413933364966e-05, + "loss": 0.7326, + "step": 5910 + }, + { + "epoch": 0.5929784143837331, + "grad_norm": 3.1756017208099365, + "learning_rate": 4.833574222078991e-05, + "loss": 0.7561, + "step": 5920 + }, + { + "epoch": 0.5939800671107327, + "grad_norm": 2.3154969215393066, + "learning_rate": 4.833008182581127e-05, + "loss": 0.7144, + "step": 5930 + }, + { + "epoch": 0.5949817198377323, + "grad_norm": 3.0473766326904297, + "learning_rate": 4.832441215381147e-05, + "loss": 0.6792, + "step": 5940 + }, + { + "epoch": 0.5959833725647318, + "grad_norm": 2.6138970851898193, + "learning_rate": 4.8318733207045026e-05, + "loss": 0.7414, + "step": 5950 + }, + { + "epoch": 0.5969850252917314, + "grad_norm": 2.8817873001098633, + "learning_rate": 4.831304498777012e-05, + "loss": 0.6992, + "step": 5960 + }, + { + "epoch": 0.597986678018731, + "grad_norm": 2.776374340057373, + "learning_rate": 4.830734749824863e-05, + "loss": 0.7556, + "step": 5970 + }, + { + "epoch": 0.5989883307457304, + "grad_norm": 3.001796245574951, + "learning_rate": 4.830164074074612e-05, + "loss": 0.7315, + "step": 5980 + }, + { + "epoch": 0.59998998347273, + "grad_norm": 2.222621202468872, + "learning_rate": 4.8295924717531833e-05, + "loss": 0.6979, + "step": 5990 + }, + { + "epoch": 0.6009916361997295, + "grad_norm": 2.8218321800231934, + "learning_rate": 4.82901994308787e-05, + "loss": 0.6922, + "step": 6000 + }, + { + "epoch": 0.6019932889267291, + "grad_norm": 2.5931153297424316, + "learning_rate": 4.828446488306333e-05, + "loss": 0.6913, + "step": 6010 + }, + { + "epoch": 0.6029949416537287, + "grad_norm": 2.5786752700805664, + "learning_rate": 4.827872107636604e-05, + "loss": 0.7638, + "step": 6020 + }, + { + "epoch": 0.6039965943807282, + "grad_norm": 3.696964740753174, + "learning_rate": 4.8272968013070785e-05, + "loss": 0.726, + "step": 6030 + }, + { + "epoch": 0.6049982471077278, + "grad_norm": 3.4040944576263428, + "learning_rate": 4.8267205695465236e-05, + "loss": 0.712, + "step": 6040 + }, + { + "epoch": 0.6059998998347274, + "grad_norm": 2.7188923358917236, + "learning_rate": 4.8261434125840735e-05, + "loss": 0.6717, + "step": 6050 + }, + { + "epoch": 0.6070015525617268, + "grad_norm": 2.0458145141601562, + "learning_rate": 4.825565330649229e-05, + "loss": 0.6694, + "step": 6060 + }, + { + "epoch": 0.6080032052887264, + "grad_norm": 2.1476712226867676, + "learning_rate": 4.8249863239718604e-05, + "loss": 0.7041, + "step": 6070 + }, + { + "epoch": 0.6090048580157259, + "grad_norm": 2.831986904144287, + "learning_rate": 4.824406392782206e-05, + "loss": 0.6909, + "step": 6080 + }, + { + "epoch": 0.6100065107427255, + "grad_norm": 2.525928497314453, + "learning_rate": 4.82382553731087e-05, + "loss": 0.6722, + "step": 6090 + }, + { + "epoch": 0.6110081634697251, + "grad_norm": 2.2550671100616455, + "learning_rate": 4.823243757788825e-05, + "loss": 0.7381, + "step": 6100 + }, + { + "epoch": 0.6120098161967246, + "grad_norm": 2.4147746562957764, + "learning_rate": 4.822661054447411e-05, + "loss": 0.8251, + "step": 6110 + }, + { + "epoch": 0.6130114689237242, + "grad_norm": 2.8840737342834473, + "learning_rate": 4.822077427518335e-05, + "loss": 0.7597, + "step": 6120 + }, + { + "epoch": 0.6140131216507237, + "grad_norm": 2.4112513065338135, + "learning_rate": 4.821492877233672e-05, + "loss": 0.7331, + "step": 6130 + }, + { + "epoch": 0.6150147743777232, + "grad_norm": 2.4079248905181885, + "learning_rate": 4.8209074038258636e-05, + "loss": 0.8029, + "step": 6140 + }, + { + "epoch": 0.6160164271047228, + "grad_norm": 2.258690595626831, + "learning_rate": 4.8203210075277194e-05, + "loss": 0.6666, + "step": 6150 + }, + { + "epoch": 0.6170180798317223, + "grad_norm": 2.800661325454712, + "learning_rate": 4.819733688572414e-05, + "loss": 0.702, + "step": 6160 + }, + { + "epoch": 0.6180197325587219, + "grad_norm": 2.231372833251953, + "learning_rate": 4.81914544719349e-05, + "loss": 0.6428, + "step": 6170 + }, + { + "epoch": 0.6190213852857215, + "grad_norm": 2.4702677726745605, + "learning_rate": 4.818556283624858e-05, + "loss": 0.6676, + "step": 6180 + }, + { + "epoch": 0.620023038012721, + "grad_norm": 2.390310287475586, + "learning_rate": 4.817966198100794e-05, + "loss": 0.715, + "step": 6190 + }, + { + "epoch": 0.6210246907397206, + "grad_norm": 2.378634214401245, + "learning_rate": 4.81737519085594e-05, + "loss": 0.7538, + "step": 6200 + }, + { + "epoch": 0.62202634346672, + "grad_norm": 2.6578643321990967, + "learning_rate": 4.816783262125306e-05, + "loss": 0.7171, + "step": 6210 + }, + { + "epoch": 0.6230279961937196, + "grad_norm": 2.7009196281433105, + "learning_rate": 4.816190412144268e-05, + "loss": 0.7951, + "step": 6220 + }, + { + "epoch": 0.6240296489207192, + "grad_norm": 3.3209683895111084, + "learning_rate": 4.8155966411485676e-05, + "loss": 0.7156, + "step": 6230 + }, + { + "epoch": 0.6250313016477187, + "grad_norm": 2.5062389373779297, + "learning_rate": 4.8150019493743125e-05, + "loss": 0.7081, + "step": 6240 + }, + { + "epoch": 0.6260329543747183, + "grad_norm": 2.785179376602173, + "learning_rate": 4.8144063370579785e-05, + "loss": 0.6611, + "step": 6250 + }, + { + "epoch": 0.6270346071017179, + "grad_norm": 3.0619423389434814, + "learning_rate": 4.8138098044364056e-05, + "loss": 0.7037, + "step": 6260 + }, + { + "epoch": 0.6280362598287174, + "grad_norm": 3.4450790882110596, + "learning_rate": 4.8132123517467995e-05, + "loss": 0.7167, + "step": 6270 + }, + { + "epoch": 0.629037912555717, + "grad_norm": 3.5104589462280273, + "learning_rate": 4.8126139792267334e-05, + "loss": 0.6591, + "step": 6280 + }, + { + "epoch": 0.6300395652827164, + "grad_norm": 2.983975410461426, + "learning_rate": 4.812014687114145e-05, + "loss": 0.6971, + "step": 6290 + }, + { + "epoch": 0.631041218009716, + "grad_norm": 3.098531484603882, + "learning_rate": 4.811414475647337e-05, + "loss": 0.6375, + "step": 6300 + }, + { + "epoch": 0.6320428707367156, + "grad_norm": 2.6577069759368896, + "learning_rate": 4.81081334506498e-05, + "loss": 0.7211, + "step": 6310 + }, + { + "epoch": 0.6330445234637151, + "grad_norm": 2.649841547012329, + "learning_rate": 4.8102112956061105e-05, + "loss": 0.6979, + "step": 6320 + }, + { + "epoch": 0.6340461761907147, + "grad_norm": 3.851295232772827, + "learning_rate": 4.809608327510125e-05, + "loss": 0.7051, + "step": 6330 + }, + { + "epoch": 0.6350478289177143, + "grad_norm": 2.3493716716766357, + "learning_rate": 4.8090044410167914e-05, + "loss": 0.7381, + "step": 6340 + }, + { + "epoch": 0.6360494816447138, + "grad_norm": 2.7770872116088867, + "learning_rate": 4.8083996363662386e-05, + "loss": 0.7313, + "step": 6350 + }, + { + "epoch": 0.6370511343717133, + "grad_norm": 2.8262245655059814, + "learning_rate": 4.8077939137989645e-05, + "loss": 0.736, + "step": 6360 + }, + { + "epoch": 0.6380527870987128, + "grad_norm": 2.4160664081573486, + "learning_rate": 4.807187273555828e-05, + "loss": 0.6895, + "step": 6370 + }, + { + "epoch": 0.6390544398257124, + "grad_norm": 2.7470526695251465, + "learning_rate": 4.8065797158780556e-05, + "loss": 0.7013, + "step": 6380 + }, + { + "epoch": 0.640056092552712, + "grad_norm": 2.770486354827881, + "learning_rate": 4.8059712410072376e-05, + "loss": 0.8161, + "step": 6390 + }, + { + "epoch": 0.6410577452797115, + "grad_norm": 2.5913543701171875, + "learning_rate": 4.80536184918533e-05, + "loss": 0.6284, + "step": 6400 + }, + { + "epoch": 0.6420593980067111, + "grad_norm": 2.855409860610962, + "learning_rate": 4.804751540654651e-05, + "loss": 0.695, + "step": 6410 + }, + { + "epoch": 0.6430610507337107, + "grad_norm": 3.0769102573394775, + "learning_rate": 4.8041403156578864e-05, + "loss": 0.7523, + "step": 6420 + }, + { + "epoch": 0.6440627034607102, + "grad_norm": 3.437361240386963, + "learning_rate": 4.803528174438084e-05, + "loss": 0.7152, + "step": 6430 + }, + { + "epoch": 0.6450643561877097, + "grad_norm": 3.7562596797943115, + "learning_rate": 4.802915117238657e-05, + "loss": 0.7299, + "step": 6440 + }, + { + "epoch": 0.6460660089147092, + "grad_norm": 2.92394757270813, + "learning_rate": 4.8023011443033835e-05, + "loss": 0.7394, + "step": 6450 + }, + { + "epoch": 0.6470676616417088, + "grad_norm": 2.9074056148529053, + "learning_rate": 4.8016862558764034e-05, + "loss": 0.7502, + "step": 6460 + }, + { + "epoch": 0.6480693143687084, + "grad_norm": 2.7694075107574463, + "learning_rate": 4.801070452202224e-05, + "loss": 0.7541, + "step": 6470 + }, + { + "epoch": 0.6490709670957079, + "grad_norm": 3.2710673809051514, + "learning_rate": 4.800453733525714e-05, + "loss": 0.6723, + "step": 6480 + }, + { + "epoch": 0.6500726198227075, + "grad_norm": 2.673231601715088, + "learning_rate": 4.7998361000921055e-05, + "loss": 0.7593, + "step": 6490 + }, + { + "epoch": 0.6510742725497071, + "grad_norm": 2.528330087661743, + "learning_rate": 4.7992175521469975e-05, + "loss": 0.7758, + "step": 6500 + }, + { + "epoch": 0.6520759252767065, + "grad_norm": 2.416775703430176, + "learning_rate": 4.798598089936349e-05, + "loss": 0.7003, + "step": 6510 + }, + { + "epoch": 0.6530775780037061, + "grad_norm": 3.5243756771087646, + "learning_rate": 4.7979777137064854e-05, + "loss": 0.6824, + "step": 6520 + }, + { + "epoch": 0.6540792307307056, + "grad_norm": 2.3242297172546387, + "learning_rate": 4.7973564237040936e-05, + "loss": 0.6347, + "step": 6530 + }, + { + "epoch": 0.6550808834577052, + "grad_norm": 2.6728193759918213, + "learning_rate": 4.7967342201762244e-05, + "loss": 0.6672, + "step": 6540 + }, + { + "epoch": 0.6560825361847048, + "grad_norm": 2.3104145526885986, + "learning_rate": 4.7961111033702933e-05, + "loss": 0.6489, + "step": 6550 + }, + { + "epoch": 0.6570841889117043, + "grad_norm": 2.7398927211761475, + "learning_rate": 4.795487073534077e-05, + "loss": 0.6653, + "step": 6560 + }, + { + "epoch": 0.6580858416387039, + "grad_norm": 3.224385976791382, + "learning_rate": 4.794862130915716e-05, + "loss": 0.6914, + "step": 6570 + }, + { + "epoch": 0.6590874943657035, + "grad_norm": 2.406184196472168, + "learning_rate": 4.794236275763714e-05, + "loss": 0.6799, + "step": 6580 + }, + { + "epoch": 0.6600891470927029, + "grad_norm": 3.6518681049346924, + "learning_rate": 4.793609508326936e-05, + "loss": 0.7738, + "step": 6590 + }, + { + "epoch": 0.6610907998197025, + "grad_norm": 3.039585828781128, + "learning_rate": 4.7929818288546136e-05, + "loss": 0.641, + "step": 6600 + }, + { + "epoch": 0.662092452546702, + "grad_norm": 3.637732744216919, + "learning_rate": 4.792353237596336e-05, + "loss": 0.7518, + "step": 6610 + }, + { + "epoch": 0.6630941052737016, + "grad_norm": 2.7439217567443848, + "learning_rate": 4.7917237348020594e-05, + "loss": 0.6375, + "step": 6620 + }, + { + "epoch": 0.6640957580007012, + "grad_norm": 2.8755624294281006, + "learning_rate": 4.7910933207220985e-05, + "loss": 0.6667, + "step": 6630 + }, + { + "epoch": 0.6650974107277007, + "grad_norm": 2.3648972511291504, + "learning_rate": 4.790461995607135e-05, + "loss": 0.7319, + "step": 6640 + }, + { + "epoch": 0.6660990634547003, + "grad_norm": 2.573380470275879, + "learning_rate": 4.789829759708209e-05, + "loss": 0.7848, + "step": 6650 + }, + { + "epoch": 0.6671007161816999, + "grad_norm": 2.4021103382110596, + "learning_rate": 4.789196613276723e-05, + "loss": 0.6436, + "step": 6660 + }, + { + "epoch": 0.6681023689086993, + "grad_norm": 2.582141399383545, + "learning_rate": 4.7885625565644444e-05, + "loss": 0.7283, + "step": 6670 + }, + { + "epoch": 0.6691040216356989, + "grad_norm": 2.6833086013793945, + "learning_rate": 4.7879275898235e-05, + "loss": 0.6683, + "step": 6680 + }, + { + "epoch": 0.6701056743626984, + "grad_norm": 2.5321731567382812, + "learning_rate": 4.787291713306379e-05, + "loss": 0.7005, + "step": 6690 + }, + { + "epoch": 0.671107327089698, + "grad_norm": 2.673696279525757, + "learning_rate": 4.786654927265933e-05, + "loss": 0.7181, + "step": 6700 + }, + { + "epoch": 0.6721089798166976, + "grad_norm": 2.5562915802001953, + "learning_rate": 4.7860172319553753e-05, + "loss": 0.6797, + "step": 6710 + }, + { + "epoch": 0.6731106325436971, + "grad_norm": 2.444516897201538, + "learning_rate": 4.78537862762828e-05, + "loss": 0.6879, + "step": 6720 + }, + { + "epoch": 0.6741122852706967, + "grad_norm": 2.7865781784057617, + "learning_rate": 4.7847391145385834e-05, + "loss": 0.6981, + "step": 6730 + }, + { + "epoch": 0.6751139379976961, + "grad_norm": 2.3912670612335205, + "learning_rate": 4.784098692940582e-05, + "loss": 0.6845, + "step": 6740 + }, + { + "epoch": 0.6761155907246957, + "grad_norm": 3.1096134185791016, + "learning_rate": 4.7834573630889333e-05, + "loss": 0.7959, + "step": 6750 + }, + { + "epoch": 0.6771172434516953, + "grad_norm": 2.7713048458099365, + "learning_rate": 4.78281512523866e-05, + "loss": 0.7643, + "step": 6760 + }, + { + "epoch": 0.6781188961786948, + "grad_norm": 2.5605435371398926, + "learning_rate": 4.782171979645141e-05, + "loss": 0.6974, + "step": 6770 + }, + { + "epoch": 0.6791205489056944, + "grad_norm": 2.641727924346924, + "learning_rate": 4.7815279265641186e-05, + "loss": 0.7517, + "step": 6780 + }, + { + "epoch": 0.680122201632694, + "grad_norm": 2.469717502593994, + "learning_rate": 4.780882966251694e-05, + "loss": 0.7026, + "step": 6790 + }, + { + "epoch": 0.6811238543596935, + "grad_norm": 2.858966827392578, + "learning_rate": 4.7802370989643324e-05, + "loss": 0.7318, + "step": 6800 + }, + { + "epoch": 0.6821255070866931, + "grad_norm": 2.377610206604004, + "learning_rate": 4.779590324958857e-05, + "loss": 0.7191, + "step": 6810 + }, + { + "epoch": 0.6831271598136925, + "grad_norm": 2.691115379333496, + "learning_rate": 4.7789426444924525e-05, + "loss": 0.7618, + "step": 6820 + }, + { + "epoch": 0.6841288125406921, + "grad_norm": 2.288198947906494, + "learning_rate": 4.778294057822663e-05, + "loss": 0.7858, + "step": 6830 + }, + { + "epoch": 0.6851304652676917, + "grad_norm": 2.3845489025115967, + "learning_rate": 4.7776445652073944e-05, + "loss": 0.7049, + "step": 6840 + }, + { + "epoch": 0.6861321179946912, + "grad_norm": 2.66279673576355, + "learning_rate": 4.776994166904913e-05, + "loss": 0.7543, + "step": 6850 + }, + { + "epoch": 0.6871337707216908, + "grad_norm": 2.5185487270355225, + "learning_rate": 4.776342863173844e-05, + "loss": 0.7063, + "step": 6860 + }, + { + "epoch": 0.6881354234486904, + "grad_norm": 2.798473834991455, + "learning_rate": 4.775690654273172e-05, + "loss": 0.7993, + "step": 6870 + }, + { + "epoch": 0.6891370761756899, + "grad_norm": 4.163158893585205, + "learning_rate": 4.775037540462245e-05, + "loss": 0.6505, + "step": 6880 + }, + { + "epoch": 0.6901387289026895, + "grad_norm": 2.7145819664001465, + "learning_rate": 4.774383522000766e-05, + "loss": 0.7308, + "step": 6890 + }, + { + "epoch": 0.6911403816296889, + "grad_norm": 2.461956024169922, + "learning_rate": 4.7737285991488027e-05, + "loss": 0.6925, + "step": 6900 + }, + { + "epoch": 0.6921420343566885, + "grad_norm": 2.0583384037017822, + "learning_rate": 4.7730727721667776e-05, + "loss": 0.6763, + "step": 6910 + }, + { + "epoch": 0.6931436870836881, + "grad_norm": 3.1688992977142334, + "learning_rate": 4.7724160413154764e-05, + "loss": 0.6261, + "step": 6920 + }, + { + "epoch": 0.6941453398106876, + "grad_norm": 3.163015604019165, + "learning_rate": 4.771758406856043e-05, + "loss": 0.6836, + "step": 6930 + }, + { + "epoch": 0.6951469925376872, + "grad_norm": 2.387488842010498, + "learning_rate": 4.7710998690499794e-05, + "loss": 0.7193, + "step": 6940 + }, + { + "epoch": 0.6961486452646868, + "grad_norm": 3.6488964557647705, + "learning_rate": 4.770440428159149e-05, + "loss": 0.6738, + "step": 6950 + }, + { + "epoch": 0.6971502979916863, + "grad_norm": 2.8245675563812256, + "learning_rate": 4.769780084445773e-05, + "loss": 0.7247, + "step": 6960 + }, + { + "epoch": 0.6981519507186859, + "grad_norm": 2.353736162185669, + "learning_rate": 4.769118838172432e-05, + "loss": 0.6718, + "step": 6970 + }, + { + "epoch": 0.6991536034456853, + "grad_norm": 2.815807819366455, + "learning_rate": 4.7684566896020645e-05, + "loss": 0.7584, + "step": 6980 + }, + { + "epoch": 0.7001552561726849, + "grad_norm": 3.2099483013153076, + "learning_rate": 4.767793638997969e-05, + "loss": 0.6991, + "step": 6990 + }, + { + "epoch": 0.7011569088996845, + "grad_norm": 2.4797520637512207, + "learning_rate": 4.7671296866238025e-05, + "loss": 0.7686, + "step": 7000 + }, + { + "epoch": 0.702158561626684, + "grad_norm": 2.62703275680542, + "learning_rate": 4.76646483274358e-05, + "loss": 0.7068, + "step": 7010 + }, + { + "epoch": 0.7031602143536836, + "grad_norm": 2.9379310607910156, + "learning_rate": 4.765799077621677e-05, + "loss": 0.696, + "step": 7020 + }, + { + "epoch": 0.7041618670806832, + "grad_norm": 2.635833740234375, + "learning_rate": 4.765132421522823e-05, + "loss": 0.6948, + "step": 7030 + }, + { + "epoch": 0.7051635198076827, + "grad_norm": 2.5286707878112793, + "learning_rate": 4.7644648647121096e-05, + "loss": 0.6741, + "step": 7040 + }, + { + "epoch": 0.7061651725346823, + "grad_norm": 2.699763059616089, + "learning_rate": 4.7637964074549865e-05, + "loss": 0.721, + "step": 7050 + }, + { + "epoch": 0.7071668252616817, + "grad_norm": 2.9536404609680176, + "learning_rate": 4.763127050017259e-05, + "loss": 0.7539, + "step": 7060 + }, + { + "epoch": 0.7081684779886813, + "grad_norm": 1.9956881999969482, + "learning_rate": 4.762456792665093e-05, + "loss": 0.6176, + "step": 7070 + }, + { + "epoch": 0.7091701307156809, + "grad_norm": 2.333440065383911, + "learning_rate": 4.761785635665009e-05, + "loss": 0.7214, + "step": 7080 + }, + { + "epoch": 0.7101717834426804, + "grad_norm": 2.3856465816497803, + "learning_rate": 4.76111357928389e-05, + "loss": 0.681, + "step": 7090 + }, + { + "epoch": 0.71117343616968, + "grad_norm": 2.928725242614746, + "learning_rate": 4.760440623788972e-05, + "loss": 0.738, + "step": 7100 + }, + { + "epoch": 0.7121750888966796, + "grad_norm": 2.650521993637085, + "learning_rate": 4.75976676944785e-05, + "loss": 0.6664, + "step": 7110 + }, + { + "epoch": 0.7131767416236791, + "grad_norm": 2.572051525115967, + "learning_rate": 4.7590920165284785e-05, + "loss": 0.7096, + "step": 7120 + }, + { + "epoch": 0.7141783943506786, + "grad_norm": 2.118530750274658, + "learning_rate": 4.758416365299166e-05, + "loss": 0.7019, + "step": 7130 + }, + { + "epoch": 0.7151800470776781, + "grad_norm": 2.7526917457580566, + "learning_rate": 4.757739816028581e-05, + "loss": 0.6642, + "step": 7140 + }, + { + "epoch": 0.7161816998046777, + "grad_norm": 2.8893725872039795, + "learning_rate": 4.757062368985748e-05, + "loss": 0.685, + "step": 7150 + }, + { + "epoch": 0.7171833525316773, + "grad_norm": 2.5351388454437256, + "learning_rate": 4.756384024440047e-05, + "loss": 0.6915, + "step": 7160 + }, + { + "epoch": 0.7181850052586768, + "grad_norm": 2.42851185798645, + "learning_rate": 4.7557047826612176e-05, + "loss": 0.7623, + "step": 7170 + }, + { + "epoch": 0.7191866579856764, + "grad_norm": 3.844846725463867, + "learning_rate": 4.7550246439193546e-05, + "loss": 0.7131, + "step": 7180 + }, + { + "epoch": 0.7201883107126759, + "grad_norm": 2.4313747882843018, + "learning_rate": 4.7543436084849094e-05, + "loss": 0.6911, + "step": 7190 + }, + { + "epoch": 0.7211899634396755, + "grad_norm": 2.404658317565918, + "learning_rate": 4.7536616766286915e-05, + "loss": 0.6961, + "step": 7200 + }, + { + "epoch": 0.722191616166675, + "grad_norm": 2.1430392265319824, + "learning_rate": 4.752978848621863e-05, + "loss": 0.7149, + "step": 7210 + }, + { + "epoch": 0.7231932688936745, + "grad_norm": 2.6081392765045166, + "learning_rate": 4.7522951247359484e-05, + "loss": 0.7602, + "step": 7220 + }, + { + "epoch": 0.7241949216206741, + "grad_norm": 2.632668972015381, + "learning_rate": 4.751610505242822e-05, + "loss": 0.6545, + "step": 7230 + }, + { + "epoch": 0.7251965743476737, + "grad_norm": 2.9545376300811768, + "learning_rate": 4.750924990414719e-05, + "loss": 0.7311, + "step": 7240 + }, + { + "epoch": 0.7261982270746732, + "grad_norm": 2.353555679321289, + "learning_rate": 4.7502385805242286e-05, + "loss": 0.6705, + "step": 7250 + }, + { + "epoch": 0.7271998798016728, + "grad_norm": 2.0468590259552, + "learning_rate": 4.749551275844297e-05, + "loss": 0.6565, + "step": 7260 + }, + { + "epoch": 0.7282015325286723, + "grad_norm": 2.693467855453491, + "learning_rate": 4.748863076648224e-05, + "loss": 0.7045, + "step": 7270 + }, + { + "epoch": 0.7292031852556718, + "grad_norm": 2.921161651611328, + "learning_rate": 4.748173983209667e-05, + "loss": 0.6481, + "step": 7280 + }, + { + "epoch": 0.7302048379826714, + "grad_norm": 2.5260236263275146, + "learning_rate": 4.74748399580264e-05, + "loss": 0.6944, + "step": 7290 + }, + { + "epoch": 0.7312064907096709, + "grad_norm": 2.695580244064331, + "learning_rate": 4.746793114701508e-05, + "loss": 0.7933, + "step": 7300 + }, + { + "epoch": 0.7322081434366705, + "grad_norm": 1.944222092628479, + "learning_rate": 4.7461013401809974e-05, + "loss": 0.683, + "step": 7310 + }, + { + "epoch": 0.7332097961636701, + "grad_norm": 2.3878674507141113, + "learning_rate": 4.745408672516184e-05, + "loss": 0.6057, + "step": 7320 + }, + { + "epoch": 0.7342114488906696, + "grad_norm": 2.2860217094421387, + "learning_rate": 4.744715111982504e-05, + "loss": 0.6008, + "step": 7330 + }, + { + "epoch": 0.7352131016176692, + "grad_norm": 2.850374937057495, + "learning_rate": 4.744020658855745e-05, + "loss": 0.6024, + "step": 7340 + }, + { + "epoch": 0.7362147543446687, + "grad_norm": 2.4920895099639893, + "learning_rate": 4.7433253134120515e-05, + "loss": 0.7168, + "step": 7350 + }, + { + "epoch": 0.7372164070716682, + "grad_norm": 2.360792875289917, + "learning_rate": 4.742629075927921e-05, + "loss": 0.6677, + "step": 7360 + }, + { + "epoch": 0.7382180597986678, + "grad_norm": 2.3476812839508057, + "learning_rate": 4.7419319466802074e-05, + "loss": 0.7269, + "step": 7370 + }, + { + "epoch": 0.7392197125256673, + "grad_norm": 2.4540038108825684, + "learning_rate": 4.7412339259461194e-05, + "loss": 0.7336, + "step": 7380 + }, + { + "epoch": 0.7402213652526669, + "grad_norm": 2.768291473388672, + "learning_rate": 4.740535014003218e-05, + "loss": 0.6893, + "step": 7390 + }, + { + "epoch": 0.7412230179796665, + "grad_norm": 2.2983901500701904, + "learning_rate": 4.73983521112942e-05, + "loss": 0.6626, + "step": 7400 + }, + { + "epoch": 0.742224670706666, + "grad_norm": 3.18436336517334, + "learning_rate": 4.739134517602998e-05, + "loss": 0.7355, + "step": 7410 + }, + { + "epoch": 0.7432263234336656, + "grad_norm": 2.746232748031616, + "learning_rate": 4.738432933702575e-05, + "loss": 0.6544, + "step": 7420 + }, + { + "epoch": 0.744227976160665, + "grad_norm": 2.6315596103668213, + "learning_rate": 4.737730459707132e-05, + "loss": 0.6887, + "step": 7430 + }, + { + "epoch": 0.7452296288876646, + "grad_norm": 2.708827495574951, + "learning_rate": 4.737027095896002e-05, + "loss": 0.6689, + "step": 7440 + }, + { + "epoch": 0.7462312816146642, + "grad_norm": 2.332124948501587, + "learning_rate": 4.73632284254887e-05, + "loss": 0.6914, + "step": 7450 + }, + { + "epoch": 0.7472329343416637, + "grad_norm": 2.743593692779541, + "learning_rate": 4.73561769994578e-05, + "loss": 0.7856, + "step": 7460 + }, + { + "epoch": 0.7482345870686633, + "grad_norm": 2.3313615322113037, + "learning_rate": 4.7349116683671247e-05, + "loss": 0.6917, + "step": 7470 + }, + { + "epoch": 0.7492362397956629, + "grad_norm": 2.5874178409576416, + "learning_rate": 4.7342047480936516e-05, + "loss": 0.6347, + "step": 7480 + }, + { + "epoch": 0.7502378925226624, + "grad_norm": 2.8353426456451416, + "learning_rate": 4.733496939406462e-05, + "loss": 0.7379, + "step": 7490 + }, + { + "epoch": 0.751239545249662, + "grad_norm": 3.2078945636749268, + "learning_rate": 4.73278824258701e-05, + "loss": 0.7179, + "step": 7500 + }, + { + "epoch": 0.7522411979766614, + "grad_norm": 2.3015921115875244, + "learning_rate": 4.732078657917105e-05, + "loss": 0.745, + "step": 7510 + }, + { + "epoch": 0.753242850703661, + "grad_norm": 2.150043487548828, + "learning_rate": 4.7313681856789054e-05, + "loss": 0.5916, + "step": 7520 + }, + { + "epoch": 0.7542445034306606, + "grad_norm": 2.9892406463623047, + "learning_rate": 4.7306568261549264e-05, + "loss": 0.7063, + "step": 7530 + }, + { + "epoch": 0.7552461561576601, + "grad_norm": 2.488659143447876, + "learning_rate": 4.7299445796280345e-05, + "loss": 0.7205, + "step": 7540 + }, + { + "epoch": 0.7562478088846597, + "grad_norm": 2.3893909454345703, + "learning_rate": 4.729231446381448e-05, + "loss": 0.7307, + "step": 7550 + }, + { + "epoch": 0.7572494616116593, + "grad_norm": 2.462712049484253, + "learning_rate": 4.7285174266987395e-05, + "loss": 0.6747, + "step": 7560 + }, + { + "epoch": 0.7582511143386588, + "grad_norm": 3.0485661029815674, + "learning_rate": 4.727802520863832e-05, + "loss": 0.6703, + "step": 7570 + }, + { + "epoch": 0.7592527670656584, + "grad_norm": 3.7530717849731445, + "learning_rate": 4.727086729161003e-05, + "loss": 0.689, + "step": 7580 + }, + { + "epoch": 0.7602544197926578, + "grad_norm": 2.519653081893921, + "learning_rate": 4.726370051874882e-05, + "loss": 0.7283, + "step": 7590 + }, + { + "epoch": 0.7612560725196574, + "grad_norm": 2.17928409576416, + "learning_rate": 4.725652489290449e-05, + "loss": 0.6723, + "step": 7600 + }, + { + "epoch": 0.762257725246657, + "grad_norm": 2.4222123622894287, + "learning_rate": 4.724934041693036e-05, + "loss": 0.6834, + "step": 7610 + }, + { + "epoch": 0.7632593779736565, + "grad_norm": 2.6158435344696045, + "learning_rate": 4.724214709368331e-05, + "loss": 0.5397, + "step": 7620 + }, + { + "epoch": 0.7642610307006561, + "grad_norm": 2.603689193725586, + "learning_rate": 4.723494492602368e-05, + "loss": 0.7108, + "step": 7630 + }, + { + "epoch": 0.7652626834276556, + "grad_norm": 2.116384744644165, + "learning_rate": 4.722773391681536e-05, + "loss": 0.6177, + "step": 7640 + }, + { + "epoch": 0.7662643361546552, + "grad_norm": 2.3073859214782715, + "learning_rate": 4.722051406892577e-05, + "loss": 0.713, + "step": 7650 + }, + { + "epoch": 0.7672659888816548, + "grad_norm": 2.8427605628967285, + "learning_rate": 4.7213285385225803e-05, + "loss": 0.7059, + "step": 7660 + }, + { + "epoch": 0.7682676416086542, + "grad_norm": 3.5749852657318115, + "learning_rate": 4.72060478685899e-05, + "loss": 0.7576, + "step": 7670 + }, + { + "epoch": 0.7692692943356538, + "grad_norm": 2.713200569152832, + "learning_rate": 4.7198801521895985e-05, + "loss": 0.7588, + "step": 7680 + }, + { + "epoch": 0.7702709470626534, + "grad_norm": 3.1419548988342285, + "learning_rate": 4.7191546348025526e-05, + "loss": 0.7121, + "step": 7690 + }, + { + "epoch": 0.7712725997896529, + "grad_norm": 2.1218645572662354, + "learning_rate": 4.718428234986348e-05, + "loss": 0.6079, + "step": 7700 + }, + { + "epoch": 0.7722742525166525, + "grad_norm": 2.9028546810150146, + "learning_rate": 4.717700953029833e-05, + "loss": 0.7087, + "step": 7710 + }, + { + "epoch": 0.773275905243652, + "grad_norm": 2.609346389770508, + "learning_rate": 4.7169727892222023e-05, + "loss": 0.6647, + "step": 7720 + }, + { + "epoch": 0.7742775579706516, + "grad_norm": 2.2379143238067627, + "learning_rate": 4.716243743853008e-05, + "loss": 0.6019, + "step": 7730 + }, + { + "epoch": 0.7752792106976512, + "grad_norm": 2.9337775707244873, + "learning_rate": 4.7155138172121475e-05, + "loss": 0.6523, + "step": 7740 + }, + { + "epoch": 0.7762808634246506, + "grad_norm": 2.9232068061828613, + "learning_rate": 4.7147830095898704e-05, + "loss": 0.7737, + "step": 7750 + }, + { + "epoch": 0.7772825161516502, + "grad_norm": 2.630415916442871, + "learning_rate": 4.714051321276776e-05, + "loss": 0.6862, + "step": 7760 + }, + { + "epoch": 0.7782841688786498, + "grad_norm": 2.902622938156128, + "learning_rate": 4.7133187525638156e-05, + "loss": 0.7157, + "step": 7770 + }, + { + "epoch": 0.7792858216056493, + "grad_norm": 2.3229422569274902, + "learning_rate": 4.7125853037422885e-05, + "loss": 0.6656, + "step": 7780 + }, + { + "epoch": 0.7802874743326489, + "grad_norm": 3.199389934539795, + "learning_rate": 4.711850975103844e-05, + "loss": 0.6862, + "step": 7790 + }, + { + "epoch": 0.7812891270596484, + "grad_norm": 2.6107218265533447, + "learning_rate": 4.711115766940484e-05, + "loss": 0.6955, + "step": 7800 + }, + { + "epoch": 0.782290779786648, + "grad_norm": 2.342616558074951, + "learning_rate": 4.710379679544557e-05, + "loss": 0.7143, + "step": 7810 + }, + { + "epoch": 0.7832924325136476, + "grad_norm": 2.832531213760376, + "learning_rate": 4.709642713208762e-05, + "loss": 0.6895, + "step": 7820 + }, + { + "epoch": 0.784294085240647, + "grad_norm": 3.0061943531036377, + "learning_rate": 4.7089048682261485e-05, + "loss": 0.6991, + "step": 7830 + }, + { + "epoch": 0.7852957379676466, + "grad_norm": 2.8207926750183105, + "learning_rate": 4.7081661448901136e-05, + "loss": 0.7095, + "step": 7840 + }, + { + "epoch": 0.7862973906946462, + "grad_norm": 2.1883671283721924, + "learning_rate": 4.707426543494407e-05, + "loss": 0.7691, + "step": 7850 + }, + { + "epoch": 0.7872990434216457, + "grad_norm": 2.096417188644409, + "learning_rate": 4.7066860643331226e-05, + "loss": 0.6957, + "step": 7860 + }, + { + "epoch": 0.7883006961486453, + "grad_norm": 2.4886560440063477, + "learning_rate": 4.705944707700708e-05, + "loss": 0.7012, + "step": 7870 + }, + { + "epoch": 0.7893023488756448, + "grad_norm": 2.392404079437256, + "learning_rate": 4.705202473891957e-05, + "loss": 0.6981, + "step": 7880 + }, + { + "epoch": 0.7903040016026444, + "grad_norm": 1.8706591129302979, + "learning_rate": 4.704459363202012e-05, + "loss": 0.7211, + "step": 7890 + }, + { + "epoch": 0.791305654329644, + "grad_norm": 2.4096221923828125, + "learning_rate": 4.703715375926367e-05, + "loss": 0.6804, + "step": 7900 + }, + { + "epoch": 0.7923073070566434, + "grad_norm": 2.4984166622161865, + "learning_rate": 4.7029705123608604e-05, + "loss": 0.6623, + "step": 7910 + }, + { + "epoch": 0.793308959783643, + "grad_norm": 2.8774752616882324, + "learning_rate": 4.7022247728016836e-05, + "loss": 0.6281, + "step": 7920 + }, + { + "epoch": 0.7943106125106426, + "grad_norm": 2.1365795135498047, + "learning_rate": 4.7014781575453724e-05, + "loss": 0.6289, + "step": 7930 + }, + { + "epoch": 0.7953122652376421, + "grad_norm": 2.5661661624908447, + "learning_rate": 4.700730666888813e-05, + "loss": 0.6251, + "step": 7940 + }, + { + "epoch": 0.7963139179646417, + "grad_norm": 3.090698480606079, + "learning_rate": 4.69998230112924e-05, + "loss": 0.7738, + "step": 7950 + }, + { + "epoch": 0.7973155706916412, + "grad_norm": 2.6969759464263916, + "learning_rate": 4.699233060564233e-05, + "loss": 0.685, + "step": 7960 + }, + { + "epoch": 0.7983172234186408, + "grad_norm": 2.0235366821289062, + "learning_rate": 4.698482945491722e-05, + "loss": 0.6514, + "step": 7970 + }, + { + "epoch": 0.7993188761456403, + "grad_norm": 3.036466360092163, + "learning_rate": 4.6977319562099866e-05, + "loss": 0.662, + "step": 7980 + }, + { + "epoch": 0.8003205288726398, + "grad_norm": 2.331421375274658, + "learning_rate": 4.696980093017649e-05, + "loss": 0.6494, + "step": 7990 + }, + { + "epoch": 0.8013221815996394, + "grad_norm": 2.20649790763855, + "learning_rate": 4.6962273562136826e-05, + "loss": 0.6467, + "step": 8000 + }, + { + "epoch": 0.802323834326639, + "grad_norm": 2.333871603012085, + "learning_rate": 4.6954737460974074e-05, + "loss": 0.6229, + "step": 8010 + }, + { + "epoch": 0.8033254870536385, + "grad_norm": 2.6986920833587646, + "learning_rate": 4.69471926296849e-05, + "loss": 0.6443, + "step": 8020 + }, + { + "epoch": 0.8043271397806381, + "grad_norm": 2.474196434020996, + "learning_rate": 4.6939639071269454e-05, + "loss": 0.6341, + "step": 8030 + }, + { + "epoch": 0.8053287925076376, + "grad_norm": 2.205517053604126, + "learning_rate": 4.6932076788731336e-05, + "loss": 0.5968, + "step": 8040 + }, + { + "epoch": 0.8063304452346371, + "grad_norm": 2.337341785430908, + "learning_rate": 4.6924505785077645e-05, + "loss": 0.6785, + "step": 8050 + }, + { + "epoch": 0.8073320979616367, + "grad_norm": 3.4576077461242676, + "learning_rate": 4.6916926063318914e-05, + "loss": 0.7103, + "step": 8060 + }, + { + "epoch": 0.8083337506886362, + "grad_norm": 2.9819705486297607, + "learning_rate": 4.6909337626469155e-05, + "loss": 0.6929, + "step": 8070 + }, + { + "epoch": 0.8093354034156358, + "grad_norm": 2.672879457473755, + "learning_rate": 4.6901740477545874e-05, + "loss": 0.6572, + "step": 8080 + }, + { + "epoch": 0.8103370561426354, + "grad_norm": 3.234699010848999, + "learning_rate": 4.6894134619569996e-05, + "loss": 0.6925, + "step": 8090 + }, + { + "epoch": 0.8113387088696349, + "grad_norm": 2.5342490673065186, + "learning_rate": 4.688652005556593e-05, + "loss": 0.6874, + "step": 8100 + }, + { + "epoch": 0.8123403615966345, + "grad_norm": 4.199151515960693, + "learning_rate": 4.687889678856156e-05, + "loss": 0.7003, + "step": 8110 + }, + { + "epoch": 0.813342014323634, + "grad_norm": 2.5348429679870605, + "learning_rate": 4.6871264821588214e-05, + "loss": 0.6502, + "step": 8120 + }, + { + "epoch": 0.8143436670506335, + "grad_norm": 2.564166307449341, + "learning_rate": 4.686362415768066e-05, + "loss": 0.7122, + "step": 8130 + }, + { + "epoch": 0.8153453197776331, + "grad_norm": 3.15704607963562, + "learning_rate": 4.685597479987718e-05, + "loss": 0.6718, + "step": 8140 + }, + { + "epoch": 0.8163469725046326, + "grad_norm": 3.303502082824707, + "learning_rate": 4.684831675121947e-05, + "loss": 0.6489, + "step": 8150 + }, + { + "epoch": 0.8173486252316322, + "grad_norm": 2.5538973808288574, + "learning_rate": 4.6840650014752675e-05, + "loss": 0.6453, + "step": 8160 + }, + { + "epoch": 0.8183502779586317, + "grad_norm": 2.57222580909729, + "learning_rate": 4.683297459352544e-05, + "loss": 0.6897, + "step": 8170 + }, + { + "epoch": 0.8193519306856313, + "grad_norm": 2.3636786937713623, + "learning_rate": 4.68252904905898e-05, + "loss": 0.6485, + "step": 8180 + }, + { + "epoch": 0.8203535834126309, + "grad_norm": 2.2532460689544678, + "learning_rate": 4.6817597709001314e-05, + "loss": 0.7237, + "step": 8190 + }, + { + "epoch": 0.8213552361396304, + "grad_norm": 2.7108314037323, + "learning_rate": 4.680989625181893e-05, + "loss": 0.6412, + "step": 8200 + }, + { + "epoch": 0.8223568888666299, + "grad_norm": 2.512598991394043, + "learning_rate": 4.6802186122105084e-05, + "loss": 0.6971, + "step": 8210 + }, + { + "epoch": 0.8233585415936295, + "grad_norm": 2.5848026275634766, + "learning_rate": 4.6794467322925636e-05, + "loss": 0.6953, + "step": 8220 + }, + { + "epoch": 0.824360194320629, + "grad_norm": 2.2770142555236816, + "learning_rate": 4.678673985734992e-05, + "loss": 0.7248, + "step": 8230 + }, + { + "epoch": 0.8253618470476286, + "grad_norm": 1.9408689737319946, + "learning_rate": 4.677900372845069e-05, + "loss": 0.6522, + "step": 8240 + }, + { + "epoch": 0.8263634997746281, + "grad_norm": 1.805517315864563, + "learning_rate": 4.677125893930416e-05, + "loss": 0.7298, + "step": 8250 + }, + { + "epoch": 0.8273651525016277, + "grad_norm": 2.5754191875457764, + "learning_rate": 4.676350549298998e-05, + "loss": 0.6288, + "step": 8260 + }, + { + "epoch": 0.8283668052286273, + "grad_norm": 2.7753493785858154, + "learning_rate": 4.675574339259125e-05, + "loss": 0.6635, + "step": 8270 + }, + { + "epoch": 0.8293684579556267, + "grad_norm": 2.473208427429199, + "learning_rate": 4.674797264119451e-05, + "loss": 0.6893, + "step": 8280 + }, + { + "epoch": 0.8303701106826263, + "grad_norm": 5.45206356048584, + "learning_rate": 4.674019324188973e-05, + "loss": 0.7061, + "step": 8290 + }, + { + "epoch": 0.8313717634096259, + "grad_norm": 2.2079737186431885, + "learning_rate": 4.673240519777033e-05, + "loss": 0.7112, + "step": 8300 + }, + { + "epoch": 0.8323734161366254, + "grad_norm": 2.702819347381592, + "learning_rate": 4.6724608511933166e-05, + "loss": 0.6834, + "step": 8310 + }, + { + "epoch": 0.833375068863625, + "grad_norm": 2.0012543201446533, + "learning_rate": 4.671680318747852e-05, + "loss": 0.6652, + "step": 8320 + }, + { + "epoch": 0.8343767215906245, + "grad_norm": 2.19185209274292, + "learning_rate": 4.670898922751012e-05, + "loss": 0.6717, + "step": 8330 + }, + { + "epoch": 0.8353783743176241, + "grad_norm": 2.6093597412109375, + "learning_rate": 4.670116663513514e-05, + "loss": 0.7139, + "step": 8340 + }, + { + "epoch": 0.8363800270446237, + "grad_norm": 2.486466884613037, + "learning_rate": 4.6693335413464156e-05, + "loss": 0.6394, + "step": 8350 + }, + { + "epoch": 0.8373816797716231, + "grad_norm": 2.484245777130127, + "learning_rate": 4.6685495565611196e-05, + "loss": 0.7008, + "step": 8360 + }, + { + "epoch": 0.8383833324986227, + "grad_norm": 2.261580228805542, + "learning_rate": 4.66776470946937e-05, + "loss": 0.7157, + "step": 8370 + }, + { + "epoch": 0.8393849852256223, + "grad_norm": 2.329270362854004, + "learning_rate": 4.666979000383257e-05, + "loss": 0.6162, + "step": 8380 + }, + { + "epoch": 0.8403866379526218, + "grad_norm": 2.7080113887786865, + "learning_rate": 4.666192429615211e-05, + "loss": 0.6352, + "step": 8390 + }, + { + "epoch": 0.8413882906796214, + "grad_norm": 3.4409689903259277, + "learning_rate": 4.665404997478004e-05, + "loss": 0.6912, + "step": 8400 + }, + { + "epoch": 0.8423899434066209, + "grad_norm": 2.870689868927002, + "learning_rate": 4.6646167042847545e-05, + "loss": 0.6864, + "step": 8410 + }, + { + "epoch": 0.8433915961336205, + "grad_norm": 3.0154449939727783, + "learning_rate": 4.663827550348919e-05, + "loss": 0.649, + "step": 8420 + }, + { + "epoch": 0.8443932488606201, + "grad_norm": 2.5777080059051514, + "learning_rate": 4.663037535984299e-05, + "loss": 0.6965, + "step": 8430 + }, + { + "epoch": 0.8453949015876195, + "grad_norm": 2.8088622093200684, + "learning_rate": 4.6622466615050386e-05, + "loss": 0.5746, + "step": 8440 + }, + { + "epoch": 0.8463965543146191, + "grad_norm": 2.371058464050293, + "learning_rate": 4.6614549272256216e-05, + "loss": 0.6209, + "step": 8450 + }, + { + "epoch": 0.8473982070416187, + "grad_norm": 3.050855875015259, + "learning_rate": 4.660662333460874e-05, + "loss": 0.7257, + "step": 8460 + }, + { + "epoch": 0.8483998597686182, + "grad_norm": 2.714768409729004, + "learning_rate": 4.659868880525966e-05, + "loss": 0.6487, + "step": 8470 + }, + { + "epoch": 0.8494015124956178, + "grad_norm": 2.225267171859741, + "learning_rate": 4.659074568736407e-05, + "loss": 0.615, + "step": 8480 + }, + { + "epoch": 0.8504031652226173, + "grad_norm": 2.1432557106018066, + "learning_rate": 4.658279398408049e-05, + "loss": 0.6767, + "step": 8490 + }, + { + "epoch": 0.8514048179496169, + "grad_norm": 2.810725688934326, + "learning_rate": 4.657483369857085e-05, + "loss": 0.6305, + "step": 8500 + }, + { + "epoch": 0.8524064706766165, + "grad_norm": 2.351372003555298, + "learning_rate": 4.65668648340005e-05, + "loss": 0.6995, + "step": 8510 + }, + { + "epoch": 0.8534081234036159, + "grad_norm": 2.379453420639038, + "learning_rate": 4.6558887393538185e-05, + "loss": 0.6905, + "step": 8520 + }, + { + "epoch": 0.8544097761306155, + "grad_norm": 2.038651466369629, + "learning_rate": 4.655090138035607e-05, + "loss": 0.6281, + "step": 8530 + }, + { + "epoch": 0.8554114288576151, + "grad_norm": 2.4542579650878906, + "learning_rate": 4.654290679762975e-05, + "loss": 0.6665, + "step": 8540 + }, + { + "epoch": 0.8564130815846146, + "grad_norm": 2.596134662628174, + "learning_rate": 4.653490364853818e-05, + "loss": 0.6725, + "step": 8550 + }, + { + "epoch": 0.8574147343116142, + "grad_norm": 2.8084211349487305, + "learning_rate": 4.652689193626377e-05, + "loss": 0.7161, + "step": 8560 + }, + { + "epoch": 0.8584163870386137, + "grad_norm": 2.6709377765655518, + "learning_rate": 4.651887166399229e-05, + "loss": 0.7052, + "step": 8570 + }, + { + "epoch": 0.8594180397656133, + "grad_norm": 2.4577574729919434, + "learning_rate": 4.6510842834912966e-05, + "loss": 0.6728, + "step": 8580 + }, + { + "epoch": 0.8604196924926129, + "grad_norm": 3.887608766555786, + "learning_rate": 4.650280545221838e-05, + "loss": 0.577, + "step": 8590 + }, + { + "epoch": 0.8614213452196123, + "grad_norm": 2.4187510013580322, + "learning_rate": 4.649475951910454e-05, + "loss": 0.7278, + "step": 8600 + }, + { + "epoch": 0.8624229979466119, + "grad_norm": 2.964956760406494, + "learning_rate": 4.6486705038770836e-05, + "loss": 0.6633, + "step": 8610 + }, + { + "epoch": 0.8634246506736114, + "grad_norm": 3.0552287101745605, + "learning_rate": 4.647864201442008e-05, + "loss": 0.6312, + "step": 8620 + }, + { + "epoch": 0.864426303400611, + "grad_norm": 2.2803544998168945, + "learning_rate": 4.647057044925847e-05, + "loss": 0.6599, + "step": 8630 + }, + { + "epoch": 0.8654279561276106, + "grad_norm": 2.118612766265869, + "learning_rate": 4.646249034649559e-05, + "loss": 0.6478, + "step": 8640 + }, + { + "epoch": 0.8664296088546101, + "grad_norm": 2.494878053665161, + "learning_rate": 4.645440170934443e-05, + "loss": 0.7454, + "step": 8650 + }, + { + "epoch": 0.8674312615816097, + "grad_norm": 3.2278289794921875, + "learning_rate": 4.6446304541021384e-05, + "loss": 0.6687, + "step": 8660 + }, + { + "epoch": 0.8684329143086092, + "grad_norm": 3.0382726192474365, + "learning_rate": 4.6438198844746216e-05, + "loss": 0.7565, + "step": 8670 + }, + { + "epoch": 0.8694345670356087, + "grad_norm": 3.102384328842163, + "learning_rate": 4.643008462374209e-05, + "loss": 0.6778, + "step": 8680 + }, + { + "epoch": 0.8704362197626083, + "grad_norm": 3.0376174449920654, + "learning_rate": 4.6421961881235565e-05, + "loss": 0.6931, + "step": 8690 + }, + { + "epoch": 0.8714378724896078, + "grad_norm": 2.6684823036193848, + "learning_rate": 4.641383062045659e-05, + "loss": 0.602, + "step": 8700 + }, + { + "epoch": 0.8724395252166074, + "grad_norm": 2.0593066215515137, + "learning_rate": 4.640569084463849e-05, + "loss": 0.6166, + "step": 8710 + }, + { + "epoch": 0.873441177943607, + "grad_norm": 2.141124725341797, + "learning_rate": 4.639754255701798e-05, + "loss": 0.5958, + "step": 8720 + }, + { + "epoch": 0.8744428306706065, + "grad_norm": 2.7273616790771484, + "learning_rate": 4.638938576083517e-05, + "loss": 0.5938, + "step": 8730 + }, + { + "epoch": 0.875444483397606, + "grad_norm": 2.745358467102051, + "learning_rate": 4.638122045933353e-05, + "loss": 0.6457, + "step": 8740 + }, + { + "epoch": 0.8764461361246056, + "grad_norm": 2.2381041049957275, + "learning_rate": 4.637304665575994e-05, + "loss": 0.6258, + "step": 8750 + }, + { + "epoch": 0.8774477888516051, + "grad_norm": 2.4562036991119385, + "learning_rate": 4.6364864353364645e-05, + "loss": 0.6121, + "step": 8760 + }, + { + "epoch": 0.8784494415786047, + "grad_norm": 1.8846955299377441, + "learning_rate": 4.6356673555401274e-05, + "loss": 0.6452, + "step": 8770 + }, + { + "epoch": 0.8794510943056042, + "grad_norm": 1.8423177003860474, + "learning_rate": 4.6348474265126836e-05, + "loss": 0.6773, + "step": 8780 + }, + { + "epoch": 0.8804527470326038, + "grad_norm": 2.811655044555664, + "learning_rate": 4.63402664858017e-05, + "loss": 0.6714, + "step": 8790 + }, + { + "epoch": 0.8814543997596034, + "grad_norm": 2.5115389823913574, + "learning_rate": 4.633205022068963e-05, + "loss": 0.6618, + "step": 8800 + }, + { + "epoch": 0.8824560524866029, + "grad_norm": 2.644765615463257, + "learning_rate": 4.632382547305777e-05, + "loss": 0.6575, + "step": 8810 + }, + { + "epoch": 0.8834577052136025, + "grad_norm": 2.793731212615967, + "learning_rate": 4.6315592246176606e-05, + "loss": 0.6772, + "step": 8820 + }, + { + "epoch": 0.884459357940602, + "grad_norm": 1.6667630672454834, + "learning_rate": 4.630735054332003e-05, + "loss": 0.6543, + "step": 8830 + }, + { + "epoch": 0.8854610106676015, + "grad_norm": 3.318857192993164, + "learning_rate": 4.629910036776528e-05, + "loss": 0.668, + "step": 8840 + }, + { + "epoch": 0.8864626633946011, + "grad_norm": 2.305460214614868, + "learning_rate": 4.629084172279298e-05, + "loss": 0.6429, + "step": 8850 + }, + { + "epoch": 0.8874643161216006, + "grad_norm": 2.154136896133423, + "learning_rate": 4.628257461168711e-05, + "loss": 0.6032, + "step": 8860 + }, + { + "epoch": 0.8884659688486002, + "grad_norm": 2.2056803703308105, + "learning_rate": 4.627429903773502e-05, + "loss": 0.6624, + "step": 8870 + }, + { + "epoch": 0.8894676215755998, + "grad_norm": 2.4072306156158447, + "learning_rate": 4.626601500422743e-05, + "loss": 0.689, + "step": 8880 + }, + { + "epoch": 0.8904692743025993, + "grad_norm": 2.359260320663452, + "learning_rate": 4.6257722514458426e-05, + "loss": 0.6717, + "step": 8890 + }, + { + "epoch": 0.8914709270295988, + "grad_norm": 2.8342933654785156, + "learning_rate": 4.624942157172543e-05, + "loss": 0.6374, + "step": 8900 + }, + { + "epoch": 0.8924725797565984, + "grad_norm": 2.6110384464263916, + "learning_rate": 4.624111217932926e-05, + "loss": 0.6617, + "step": 8910 + }, + { + "epoch": 0.8934742324835979, + "grad_norm": 2.4755423069000244, + "learning_rate": 4.623279434057408e-05, + "loss": 0.6272, + "step": 8920 + }, + { + "epoch": 0.8944758852105975, + "grad_norm": 1.939832091331482, + "learning_rate": 4.6224468058767414e-05, + "loss": 0.6798, + "step": 8930 + }, + { + "epoch": 0.895477537937597, + "grad_norm": 2.495572566986084, + "learning_rate": 4.621613333722013e-05, + "loss": 0.6241, + "step": 8940 + }, + { + "epoch": 0.8964791906645966, + "grad_norm": 1.9067161083221436, + "learning_rate": 4.620779017924648e-05, + "loss": 0.5824, + "step": 8950 + }, + { + "epoch": 0.8974808433915962, + "grad_norm": 2.223557472229004, + "learning_rate": 4.619943858816403e-05, + "loss": 0.5958, + "step": 8960 + }, + { + "epoch": 0.8984824961185957, + "grad_norm": 2.710803747177124, + "learning_rate": 4.619107856729376e-05, + "loss": 0.6501, + "step": 8970 + }, + { + "epoch": 0.8994841488455952, + "grad_norm": 2.848140001296997, + "learning_rate": 4.618271011995994e-05, + "loss": 0.6274, + "step": 8980 + }, + { + "epoch": 0.9004858015725948, + "grad_norm": 2.5017592906951904, + "learning_rate": 4.617433324949021e-05, + "loss": 0.6688, + "step": 8990 + }, + { + "epoch": 0.9014874542995943, + "grad_norm": 2.8840415477752686, + "learning_rate": 4.616594795921558e-05, + "loss": 0.6324, + "step": 9000 + }, + { + "epoch": 0.9024891070265939, + "grad_norm": 2.0160205364227295, + "learning_rate": 4.61575542524704e-05, + "loss": 0.6435, + "step": 9010 + }, + { + "epoch": 0.9034907597535934, + "grad_norm": 2.5726449489593506, + "learning_rate": 4.6149152132592346e-05, + "loss": 0.6423, + "step": 9020 + }, + { + "epoch": 0.904492412480593, + "grad_norm": 2.614830493927002, + "learning_rate": 4.6140741602922466e-05, + "loss": 0.5871, + "step": 9030 + }, + { + "epoch": 0.9054940652075926, + "grad_norm": 2.377392530441284, + "learning_rate": 4.6132322666805125e-05, + "loss": 0.6431, + "step": 9040 + }, + { + "epoch": 0.906495717934592, + "grad_norm": 3.235520839691162, + "learning_rate": 4.612389532758806e-05, + "loss": 0.7138, + "step": 9050 + }, + { + "epoch": 0.9074973706615916, + "grad_norm": 2.785496473312378, + "learning_rate": 4.611545958862233e-05, + "loss": 0.6979, + "step": 9060 + }, + { + "epoch": 0.9084990233885911, + "grad_norm": 2.0526037216186523, + "learning_rate": 4.610701545326234e-05, + "loss": 0.6125, + "step": 9070 + }, + { + "epoch": 0.9095006761155907, + "grad_norm": 2.4618897438049316, + "learning_rate": 4.609856292486583e-05, + "loss": 0.6538, + "step": 9080 + }, + { + "epoch": 0.9105023288425903, + "grad_norm": 2.852219343185425, + "learning_rate": 4.6090102006793875e-05, + "loss": 0.5938, + "step": 9090 + }, + { + "epoch": 0.9115039815695898, + "grad_norm": 2.0149478912353516, + "learning_rate": 4.608163270241092e-05, + "loss": 0.6686, + "step": 9100 + }, + { + "epoch": 0.9125056342965894, + "grad_norm": 3.2436070442199707, + "learning_rate": 4.6073155015084676e-05, + "loss": 0.7302, + "step": 9110 + }, + { + "epoch": 0.913507287023589, + "grad_norm": 3.005521059036255, + "learning_rate": 4.606466894818625e-05, + "loss": 0.6218, + "step": 9120 + }, + { + "epoch": 0.9145089397505884, + "grad_norm": 2.209559679031372, + "learning_rate": 4.6056174505090066e-05, + "loss": 0.7016, + "step": 9130 + }, + { + "epoch": 0.915510592477588, + "grad_norm": 1.9661232233047485, + "learning_rate": 4.604767168917386e-05, + "loss": 0.6632, + "step": 9140 + }, + { + "epoch": 0.9165122452045875, + "grad_norm": 2.880694627761841, + "learning_rate": 4.603916050381871e-05, + "loss": 0.6389, + "step": 9150 + }, + { + "epoch": 0.9175138979315871, + "grad_norm": 2.2712185382843018, + "learning_rate": 4.603064095240902e-05, + "loss": 0.64, + "step": 9160 + }, + { + "epoch": 0.9185155506585867, + "grad_norm": 2.218029022216797, + "learning_rate": 4.6022113038332534e-05, + "loss": 0.5949, + "step": 9170 + }, + { + "epoch": 0.9195172033855862, + "grad_norm": 2.7205657958984375, + "learning_rate": 4.6013576764980293e-05, + "loss": 0.6492, + "step": 9180 + }, + { + "epoch": 0.9205188561125858, + "grad_norm": 2.42616605758667, + "learning_rate": 4.6005032135746684e-05, + "loss": 0.5645, + "step": 9190 + }, + { + "epoch": 0.9215205088395854, + "grad_norm": 2.109893560409546, + "learning_rate": 4.5996479154029406e-05, + "loss": 0.7036, + "step": 9200 + }, + { + "epoch": 0.9225221615665848, + "grad_norm": 2.179466485977173, + "learning_rate": 4.59879178232295e-05, + "loss": 0.6462, + "step": 9210 + }, + { + "epoch": 0.9235238142935844, + "grad_norm": 2.5310440063476562, + "learning_rate": 4.597934814675129e-05, + "loss": 0.6169, + "step": 9220 + }, + { + "epoch": 0.9245254670205839, + "grad_norm": 3.850490093231201, + "learning_rate": 4.597077012800245e-05, + "loss": 0.7126, + "step": 9230 + }, + { + "epoch": 0.9255271197475835, + "grad_norm": 1.9445955753326416, + "learning_rate": 4.596218377039397e-05, + "loss": 0.6866, + "step": 9240 + }, + { + "epoch": 0.9265287724745831, + "grad_norm": 2.2818901538848877, + "learning_rate": 4.595358907734013e-05, + "loss": 0.6533, + "step": 9250 + }, + { + "epoch": 0.9275304252015826, + "grad_norm": 3.0567245483398438, + "learning_rate": 4.5944986052258555e-05, + "loss": 0.6954, + "step": 9260 + }, + { + "epoch": 0.9285320779285822, + "grad_norm": 2.6508970260620117, + "learning_rate": 4.5936374698570154e-05, + "loss": 0.6175, + "step": 9270 + }, + { + "epoch": 0.9295337306555818, + "grad_norm": 2.2970590591430664, + "learning_rate": 4.5927755019699175e-05, + "loss": 0.5485, + "step": 9280 + }, + { + "epoch": 0.9305353833825812, + "grad_norm": 2.39508056640625, + "learning_rate": 4.591912701907316e-05, + "loss": 0.6005, + "step": 9290 + }, + { + "epoch": 0.9315370361095808, + "grad_norm": 2.541376829147339, + "learning_rate": 4.591049070012297e-05, + "loss": 0.6507, + "step": 9300 + }, + { + "epoch": 0.9325386888365803, + "grad_norm": 2.181727647781372, + "learning_rate": 4.590184606628276e-05, + "loss": 0.6837, + "step": 9310 + }, + { + "epoch": 0.9335403415635799, + "grad_norm": 2.3271026611328125, + "learning_rate": 4.589319312099001e-05, + "loss": 0.6608, + "step": 9320 + }, + { + "epoch": 0.9345419942905795, + "grad_norm": 2.447262763977051, + "learning_rate": 4.588453186768549e-05, + "loss": 0.662, + "step": 9330 + }, + { + "epoch": 0.935543647017579, + "grad_norm": 2.4956793785095215, + "learning_rate": 4.587586230981327e-05, + "loss": 0.6311, + "step": 9340 + }, + { + "epoch": 0.9365452997445786, + "grad_norm": 2.4352951049804688, + "learning_rate": 4.5867184450820746e-05, + "loss": 0.6221, + "step": 9350 + }, + { + "epoch": 0.9375469524715782, + "grad_norm": 2.322591543197632, + "learning_rate": 4.5858498294158594e-05, + "loss": 0.6063, + "step": 9360 + }, + { + "epoch": 0.9385486051985776, + "grad_norm": 3.020512819290161, + "learning_rate": 4.584980384328078e-05, + "loss": 0.6755, + "step": 9370 + }, + { + "epoch": 0.9395502579255772, + "grad_norm": 2.604862928390503, + "learning_rate": 4.5841101101644604e-05, + "loss": 0.6647, + "step": 9380 + }, + { + "epoch": 0.9405519106525767, + "grad_norm": 3.1676621437072754, + "learning_rate": 4.5832390072710634e-05, + "loss": 0.6585, + "step": 9390 + }, + { + "epoch": 0.9415535633795763, + "grad_norm": 2.4377002716064453, + "learning_rate": 4.582367075994274e-05, + "loss": 0.648, + "step": 9400 + }, + { + "epoch": 0.9425552161065759, + "grad_norm": 2.3394460678100586, + "learning_rate": 4.581494316680809e-05, + "loss": 0.5972, + "step": 9410 + }, + { + "epoch": 0.9435568688335754, + "grad_norm": 2.6361212730407715, + "learning_rate": 4.580620729677714e-05, + "loss": 0.6573, + "step": 9420 + }, + { + "epoch": 0.944558521560575, + "grad_norm": 3.2919678688049316, + "learning_rate": 4.5797463153323625e-05, + "loss": 0.6253, + "step": 9430 + }, + { + "epoch": 0.9455601742875746, + "grad_norm": 2.6798956394195557, + "learning_rate": 4.578871073992461e-05, + "loss": 0.6699, + "step": 9440 + }, + { + "epoch": 0.946561827014574, + "grad_norm": 3.6646926403045654, + "learning_rate": 4.577995006006042e-05, + "loss": 0.6509, + "step": 9450 + }, + { + "epoch": 0.9475634797415736, + "grad_norm": 2.593172788619995, + "learning_rate": 4.577118111721464e-05, + "loss": 0.6777, + "step": 9460 + }, + { + "epoch": 0.9485651324685731, + "grad_norm": 2.2215142250061035, + "learning_rate": 4.576240391487421e-05, + "loss": 0.6076, + "step": 9470 + }, + { + "epoch": 0.9495667851955727, + "grad_norm": 2.540949583053589, + "learning_rate": 4.575361845652928e-05, + "loss": 0.6286, + "step": 9480 + }, + { + "epoch": 0.9505684379225723, + "grad_norm": 2.5423927307128906, + "learning_rate": 4.574482474567334e-05, + "loss": 0.607, + "step": 9490 + }, + { + "epoch": 0.9515700906495718, + "grad_norm": 2.4569149017333984, + "learning_rate": 4.573602278580313e-05, + "loss": 0.662, + "step": 9500 + }, + { + "epoch": 0.9525717433765714, + "grad_norm": 2.495696783065796, + "learning_rate": 4.572721258041868e-05, + "loss": 0.5999, + "step": 9510 + }, + { + "epoch": 0.953573396103571, + "grad_norm": 2.62209153175354, + "learning_rate": 4.57183941330233e-05, + "loss": 0.6722, + "step": 9520 + }, + { + "epoch": 0.9545750488305704, + "grad_norm": 2.378762722015381, + "learning_rate": 4.5709567447123577e-05, + "loss": 0.6478, + "step": 9530 + }, + { + "epoch": 0.95557670155757, + "grad_norm": 4.010514259338379, + "learning_rate": 4.5700732526229364e-05, + "loss": 0.6729, + "step": 9540 + }, + { + "epoch": 0.9565783542845695, + "grad_norm": 2.489445447921753, + "learning_rate": 4.5691889373853806e-05, + "loss": 0.6039, + "step": 9550 + }, + { + "epoch": 0.9575800070115691, + "grad_norm": 2.4253251552581787, + "learning_rate": 4.56830379935133e-05, + "loss": 0.5937, + "step": 9560 + }, + { + "epoch": 0.9585816597385687, + "grad_norm": 2.035104513168335, + "learning_rate": 4.567417838872754e-05, + "loss": 0.6291, + "step": 9570 + }, + { + "epoch": 0.9595833124655682, + "grad_norm": 2.7530062198638916, + "learning_rate": 4.566531056301948e-05, + "loss": 0.6301, + "step": 9580 + }, + { + "epoch": 0.9605849651925678, + "grad_norm": 2.7969632148742676, + "learning_rate": 4.565643451991533e-05, + "loss": 0.6635, + "step": 9590 + }, + { + "epoch": 0.9615866179195672, + "grad_norm": 3.570622444152832, + "learning_rate": 4.564755026294457e-05, + "loss": 0.6416, + "step": 9600 + }, + { + "epoch": 0.9625882706465668, + "grad_norm": 2.4569382667541504, + "learning_rate": 4.563865779563997e-05, + "loss": 0.6606, + "step": 9610 + }, + { + "epoch": 0.9635899233735664, + "grad_norm": 2.543860912322998, + "learning_rate": 4.562975712153754e-05, + "loss": 0.6704, + "step": 9620 + }, + { + "epoch": 0.9645915761005659, + "grad_norm": 1.9001827239990234, + "learning_rate": 4.562084824417657e-05, + "loss": 0.6436, + "step": 9630 + }, + { + "epoch": 0.9655932288275655, + "grad_norm": 3.1932127475738525, + "learning_rate": 4.56119311670996e-05, + "loss": 0.5471, + "step": 9640 + }, + { + "epoch": 0.9665948815545651, + "grad_norm": 2.821469306945801, + "learning_rate": 4.560300589385243e-05, + "loss": 0.6969, + "step": 9650 + }, + { + "epoch": 0.9675965342815646, + "grad_norm": 2.833065986633301, + "learning_rate": 4.559407242798413e-05, + "loss": 0.6397, + "step": 9660 + }, + { + "epoch": 0.9685981870085641, + "grad_norm": 2.193622350692749, + "learning_rate": 4.558513077304703e-05, + "loss": 0.5948, + "step": 9670 + }, + { + "epoch": 0.9695998397355636, + "grad_norm": 2.1319234371185303, + "learning_rate": 4.557618093259668e-05, + "loss": 0.6201, + "step": 9680 + }, + { + "epoch": 0.9706014924625632, + "grad_norm": 2.4496283531188965, + "learning_rate": 4.5567222910191945e-05, + "loss": 0.6134, + "step": 9690 + }, + { + "epoch": 0.9716031451895628, + "grad_norm": 2.4668614864349365, + "learning_rate": 4.5558256709394884e-05, + "loss": 0.665, + "step": 9700 + }, + { + "epoch": 0.9726047979165623, + "grad_norm": 2.3747925758361816, + "learning_rate": 4.554928233377086e-05, + "loss": 0.6614, + "step": 9710 + }, + { + "epoch": 0.9736064506435619, + "grad_norm": 2.282470941543579, + "learning_rate": 4.5540299786888443e-05, + "loss": 0.6946, + "step": 9720 + }, + { + "epoch": 0.9746081033705615, + "grad_norm": 2.3198509216308594, + "learning_rate": 4.553130907231947e-05, + "loss": 0.7002, + "step": 9730 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 2.980468511581421, + "learning_rate": 4.552231019363904e-05, + "loss": 0.6124, + "step": 9740 + }, + { + "epoch": 0.9766114088245605, + "grad_norm": 2.8307383060455322, + "learning_rate": 4.551330315442549e-05, + "loss": 0.6939, + "step": 9750 + }, + { + "epoch": 0.97761306155156, + "grad_norm": 2.3871123790740967, + "learning_rate": 4.5504287958260376e-05, + "loss": 0.6207, + "step": 9760 + }, + { + "epoch": 0.9786147142785596, + "grad_norm": 2.3071370124816895, + "learning_rate": 4.5495264608728535e-05, + "loss": 0.6086, + "step": 9770 + }, + { + "epoch": 0.9796163670055592, + "grad_norm": 2.3521180152893066, + "learning_rate": 4.548623310941802e-05, + "loss": 0.7043, + "step": 9780 + }, + { + "epoch": 0.9806180197325587, + "grad_norm": 2.123138904571533, + "learning_rate": 4.547719346392015e-05, + "loss": 0.6656, + "step": 9790 + }, + { + "epoch": 0.9816196724595583, + "grad_norm": 2.3112244606018066, + "learning_rate": 4.546814567582946e-05, + "loss": 0.6143, + "step": 9800 + }, + { + "epoch": 0.9826213251865579, + "grad_norm": 4.398324966430664, + "learning_rate": 4.545908974874371e-05, + "loss": 0.6633, + "step": 9810 + }, + { + "epoch": 0.9836229779135574, + "grad_norm": 3.7052175998687744, + "learning_rate": 4.5450025686263955e-05, + "loss": 0.6285, + "step": 9820 + }, + { + "epoch": 0.9846246306405569, + "grad_norm": 2.8410427570343018, + "learning_rate": 4.544095349199442e-05, + "loss": 0.7186, + "step": 9830 + }, + { + "epoch": 0.9856262833675564, + "grad_norm": 2.3758535385131836, + "learning_rate": 4.5431873169542596e-05, + "loss": 0.6215, + "step": 9840 + }, + { + "epoch": 0.986627936094556, + "grad_norm": 2.0713741779327393, + "learning_rate": 4.542278472251921e-05, + "loss": 0.6362, + "step": 9850 + }, + { + "epoch": 0.9876295888215556, + "grad_norm": 2.075610637664795, + "learning_rate": 4.54136881545382e-05, + "loss": 0.5963, + "step": 9860 + }, + { + "epoch": 0.9886312415485551, + "grad_norm": 2.8744397163391113, + "learning_rate": 4.5404583469216756e-05, + "loss": 0.6866, + "step": 9870 + }, + { + "epoch": 0.9896328942755547, + "grad_norm": 2.514857769012451, + "learning_rate": 4.539547067017528e-05, + "loss": 0.6553, + "step": 9880 + }, + { + "epoch": 0.9906345470025543, + "grad_norm": 2.687924385070801, + "learning_rate": 4.538634976103738e-05, + "loss": 0.6553, + "step": 9890 + }, + { + "epoch": 0.9916361997295537, + "grad_norm": 3.2299506664276123, + "learning_rate": 4.5377220745429954e-05, + "loss": 0.6194, + "step": 9900 + }, + { + "epoch": 0.9926378524565533, + "grad_norm": 2.1460533142089844, + "learning_rate": 4.536808362698305e-05, + "loss": 0.6697, + "step": 9910 + }, + { + "epoch": 0.9936395051835528, + "grad_norm": 2.8532016277313232, + "learning_rate": 4.535893840932999e-05, + "loss": 0.6531, + "step": 9920 + }, + { + "epoch": 0.9946411579105524, + "grad_norm": 2.4520788192749023, + "learning_rate": 4.5349785096107275e-05, + "loss": 0.6417, + "step": 9930 + }, + { + "epoch": 0.995642810637552, + "grad_norm": 2.3643345832824707, + "learning_rate": 4.534062369095467e-05, + "loss": 0.6163, + "step": 9940 + }, + { + "epoch": 0.9966444633645515, + "grad_norm": 2.5428478717803955, + "learning_rate": 4.5331454197515126e-05, + "loss": 0.6432, + "step": 9950 + }, + { + "epoch": 0.9976461160915511, + "grad_norm": 2.155560255050659, + "learning_rate": 4.5322276619434814e-05, + "loss": 0.5903, + "step": 9960 + }, + { + "epoch": 0.9986477688185507, + "grad_norm": 2.393480062484741, + "learning_rate": 4.531309096036313e-05, + "loss": 0.6368, + "step": 9970 + }, + { + "epoch": 0.9996494215455501, + "grad_norm": 2.6624627113342285, + "learning_rate": 4.530389722395268e-05, + "loss": 0.5995, + "step": 9980 + }, + { + "epoch": 1.0006009916361998, + "grad_norm": 2.460616111755371, + "learning_rate": 4.529469541385928e-05, + "loss": 0.6235, + "step": 9990 + }, + { + "epoch": 1.0016026443631993, + "grad_norm": 2.41973614692688, + "learning_rate": 4.5285485533741946e-05, + "loss": 0.6751, + "step": 10000 + }, + { + "epoch": 1.0016026443631993, + "eval_bleu": 0.08263337231259318, + "eval_loss": 0.6337834000587463, + "eval_rouge1": 0.4374259000948164, + "eval_rouge2": 0.32409193917532053, + "eval_rougeL": 0.41067563698366927, + "eval_runtime": 106969.7121, + "eval_samples_per_second": 0.166, + "eval_steps_per_second": 0.021, + "eval_wer": 4.417791876035146, + "step": 10000 + }, + { + "epoch": 1.002604297090199, + "grad_norm": 2.241828680038452, + "learning_rate": 4.527626758726292e-05, + "loss": 0.5698, + "step": 10010 + }, + { + "epoch": 1.0036059498171983, + "grad_norm": 2.0964417457580566, + "learning_rate": 4.526704157808765e-05, + "loss": 0.6447, + "step": 10020 + }, + { + "epoch": 1.004607602544198, + "grad_norm": 2.1923632621765137, + "learning_rate": 4.525780750988479e-05, + "loss": 0.5937, + "step": 10030 + }, + { + "epoch": 1.0056092552711975, + "grad_norm": 2.193221092224121, + "learning_rate": 4.524856538632617e-05, + "loss": 0.5717, + "step": 10040 + }, + { + "epoch": 1.006610907998197, + "grad_norm": 1.8794662952423096, + "learning_rate": 4.5239315211086865e-05, + "loss": 0.574, + "step": 10050 + }, + { + "epoch": 1.0076125607251967, + "grad_norm": 2.7534232139587402, + "learning_rate": 4.523005698784514e-05, + "loss": 0.607, + "step": 10060 + }, + { + "epoch": 1.008614213452196, + "grad_norm": 1.9754960536956787, + "learning_rate": 4.522079072028243e-05, + "loss": 0.5808, + "step": 10070 + }, + { + "epoch": 1.0096158661791956, + "grad_norm": 2.343451499938965, + "learning_rate": 4.5211516412083416e-05, + "loss": 0.6064, + "step": 10080 + }, + { + "epoch": 1.0106175189061952, + "grad_norm": 2.7020604610443115, + "learning_rate": 4.520223406693594e-05, + "loss": 0.6037, + "step": 10090 + }, + { + "epoch": 1.0116191716331948, + "grad_norm": 2.809910774230957, + "learning_rate": 4.5192943688531056e-05, + "loss": 0.5117, + "step": 10100 + }, + { + "epoch": 1.0126208243601944, + "grad_norm": 2.667957305908203, + "learning_rate": 4.5183645280563e-05, + "loss": 0.583, + "step": 10110 + }, + { + "epoch": 1.0136224770871938, + "grad_norm": 2.3873534202575684, + "learning_rate": 4.517433884672924e-05, + "loss": 0.6762, + "step": 10120 + }, + { + "epoch": 1.0146241298141934, + "grad_norm": 2.1108696460723877, + "learning_rate": 4.516502439073037e-05, + "loss": 0.556, + "step": 10130 + }, + { + "epoch": 1.015625782541193, + "grad_norm": 2.5046160221099854, + "learning_rate": 4.5155701916270215e-05, + "loss": 0.5785, + "step": 10140 + }, + { + "epoch": 1.0166274352681925, + "grad_norm": 2.8896470069885254, + "learning_rate": 4.5146371427055803e-05, + "loss": 0.5929, + "step": 10150 + }, + { + "epoch": 1.0176290879951921, + "grad_norm": 1.979096531867981, + "learning_rate": 4.513703292679731e-05, + "loss": 0.6493, + "step": 10160 + }, + { + "epoch": 1.0186307407221915, + "grad_norm": 2.9013965129852295, + "learning_rate": 4.5127686419208125e-05, + "loss": 0.5868, + "step": 10170 + }, + { + "epoch": 1.019632393449191, + "grad_norm": 1.9182281494140625, + "learning_rate": 4.511833190800481e-05, + "loss": 0.5256, + "step": 10180 + }, + { + "epoch": 1.0206340461761907, + "grad_norm": 3.2245006561279297, + "learning_rate": 4.510896939690711e-05, + "loss": 0.6408, + "step": 10190 + }, + { + "epoch": 1.0216356989031903, + "grad_norm": 2.499907970428467, + "learning_rate": 4.509959888963795e-05, + "loss": 0.5626, + "step": 10200 + }, + { + "epoch": 1.0226373516301899, + "grad_norm": 1.939091682434082, + "learning_rate": 4.509022038992345e-05, + "loss": 0.5804, + "step": 10210 + }, + { + "epoch": 1.0236390043571895, + "grad_norm": 2.0463554859161377, + "learning_rate": 4.5080833901492884e-05, + "loss": 0.549, + "step": 10220 + }, + { + "epoch": 1.0246406570841888, + "grad_norm": 2.2921488285064697, + "learning_rate": 4.507143942807872e-05, + "loss": 0.6272, + "step": 10230 + }, + { + "epoch": 1.0256423098111884, + "grad_norm": 2.6805219650268555, + "learning_rate": 4.506203697341659e-05, + "loss": 0.6537, + "step": 10240 + }, + { + "epoch": 1.026643962538188, + "grad_norm": 2.7690038681030273, + "learning_rate": 4.505262654124531e-05, + "loss": 0.6039, + "step": 10250 + }, + { + "epoch": 1.0276456152651876, + "grad_norm": 2.6545157432556152, + "learning_rate": 4.504320813530687e-05, + "loss": 0.554, + "step": 10260 + }, + { + "epoch": 1.0286472679921872, + "grad_norm": 2.342416763305664, + "learning_rate": 4.5033781759346406e-05, + "loss": 0.6261, + "step": 10270 + }, + { + "epoch": 1.0296489207191866, + "grad_norm": 2.439129590988159, + "learning_rate": 4.502434741711226e-05, + "loss": 0.6274, + "step": 10280 + }, + { + "epoch": 1.0306505734461862, + "grad_norm": 3.0521368980407715, + "learning_rate": 4.501490511235591e-05, + "loss": 0.6079, + "step": 10290 + }, + { + "epoch": 1.0316522261731857, + "grad_norm": 2.864180326461792, + "learning_rate": 4.5005454848832014e-05, + "loss": 0.6524, + "step": 10300 + }, + { + "epoch": 1.0326538789001853, + "grad_norm": 2.0836775302886963, + "learning_rate": 4.4995996630298395e-05, + "loss": 0.5631, + "step": 10310 + }, + { + "epoch": 1.033655531627185, + "grad_norm": 1.885198712348938, + "learning_rate": 4.4986530460516054e-05, + "loss": 0.5759, + "step": 10320 + }, + { + "epoch": 1.0346571843541843, + "grad_norm": 1.9586544036865234, + "learning_rate": 4.497705634324912e-05, + "loss": 0.5971, + "step": 10330 + }, + { + "epoch": 1.035658837081184, + "grad_norm": 2.376250982284546, + "learning_rate": 4.49675742822649e-05, + "loss": 0.6532, + "step": 10340 + }, + { + "epoch": 1.0366604898081835, + "grad_norm": 2.651041269302368, + "learning_rate": 4.495808428133387e-05, + "loss": 0.658, + "step": 10350 + }, + { + "epoch": 1.037662142535183, + "grad_norm": 2.460192918777466, + "learning_rate": 4.494858634422965e-05, + "loss": 0.5724, + "step": 10360 + }, + { + "epoch": 1.0386637952621827, + "grad_norm": 2.251406669616699, + "learning_rate": 4.493908047472901e-05, + "loss": 0.5986, + "step": 10370 + }, + { + "epoch": 1.0396654479891823, + "grad_norm": 2.525148868560791, + "learning_rate": 4.49295666766119e-05, + "loss": 0.5119, + "step": 10380 + }, + { + "epoch": 1.0406671007161816, + "grad_norm": 2.9440059661865234, + "learning_rate": 4.492004495366139e-05, + "loss": 0.6081, + "step": 10390 + }, + { + "epoch": 1.0416687534431812, + "grad_norm": 2.5194528102874756, + "learning_rate": 4.491051530966372e-05, + "loss": 0.5246, + "step": 10400 + }, + { + "epoch": 1.0426704061701808, + "grad_norm": 2.512968063354492, + "learning_rate": 4.490097774840829e-05, + "loss": 0.6149, + "step": 10410 + }, + { + "epoch": 1.0436720588971804, + "grad_norm": 3.91227650642395, + "learning_rate": 4.489143227368763e-05, + "loss": 0.6073, + "step": 10420 + }, + { + "epoch": 1.04467371162418, + "grad_norm": 2.4288504123687744, + "learning_rate": 4.488187888929741e-05, + "loss": 0.6019, + "step": 10430 + }, + { + "epoch": 1.0456753643511794, + "grad_norm": 3.138383150100708, + "learning_rate": 4.487231759903647e-05, + "loss": 0.6624, + "step": 10440 + }, + { + "epoch": 1.046677017078179, + "grad_norm": 2.4413421154022217, + "learning_rate": 4.486274840670677e-05, + "loss": 0.5642, + "step": 10450 + }, + { + "epoch": 1.0476786698051785, + "grad_norm": 2.7690060138702393, + "learning_rate": 4.485317131611344e-05, + "loss": 0.6147, + "step": 10460 + }, + { + "epoch": 1.0486803225321781, + "grad_norm": 2.3034732341766357, + "learning_rate": 4.484358633106471e-05, + "loss": 0.5251, + "step": 10470 + }, + { + "epoch": 1.0496819752591777, + "grad_norm": 2.441673994064331, + "learning_rate": 4.483399345537199e-05, + "loss": 0.551, + "step": 10480 + }, + { + "epoch": 1.050683627986177, + "grad_norm": 2.078007936477661, + "learning_rate": 4.482439269284981e-05, + "loss": 0.6121, + "step": 10490 + }, + { + "epoch": 1.0516852807131767, + "grad_norm": 2.7018327713012695, + "learning_rate": 4.4814784047315836e-05, + "loss": 0.7006, + "step": 10500 + }, + { + "epoch": 1.0526869334401763, + "grad_norm": 2.6600396633148193, + "learning_rate": 4.480516752259086e-05, + "loss": 0.5935, + "step": 10510 + }, + { + "epoch": 1.0536885861671759, + "grad_norm": 2.5032167434692383, + "learning_rate": 4.4795543122498826e-05, + "loss": 0.6069, + "step": 10520 + }, + { + "epoch": 1.0546902388941755, + "grad_norm": 2.078220844268799, + "learning_rate": 4.478591085086681e-05, + "loss": 0.5766, + "step": 10530 + }, + { + "epoch": 1.055691891621175, + "grad_norm": 2.401156187057495, + "learning_rate": 4.477627071152498e-05, + "loss": 0.6457, + "step": 10540 + }, + { + "epoch": 1.0566935443481744, + "grad_norm": 2.45200514793396, + "learning_rate": 4.476662270830668e-05, + "loss": 0.6415, + "step": 10550 + }, + { + "epoch": 1.057695197075174, + "grad_norm": 2.3690593242645264, + "learning_rate": 4.4756966845048355e-05, + "loss": 0.5754, + "step": 10560 + }, + { + "epoch": 1.0586968498021736, + "grad_norm": 1.8617242574691772, + "learning_rate": 4.474730312558959e-05, + "loss": 0.5331, + "step": 10570 + }, + { + "epoch": 1.0596985025291732, + "grad_norm": 2.688246250152588, + "learning_rate": 4.473763155377307e-05, + "loss": 0.5913, + "step": 10580 + }, + { + "epoch": 1.0607001552561728, + "grad_norm": 2.441681146621704, + "learning_rate": 4.472795213344464e-05, + "loss": 0.6329, + "step": 10590 + }, + { + "epoch": 1.0617018079831722, + "grad_norm": 1.5429996252059937, + "learning_rate": 4.471826486845323e-05, + "loss": 0.5835, + "step": 10600 + }, + { + "epoch": 1.0627034607101717, + "grad_norm": 2.474956512451172, + "learning_rate": 4.4708569762650904e-05, + "loss": 0.6397, + "step": 10610 + }, + { + "epoch": 1.0637051134371713, + "grad_norm": 2.1213226318359375, + "learning_rate": 4.4698866819892846e-05, + "loss": 0.6741, + "step": 10620 + }, + { + "epoch": 1.064706766164171, + "grad_norm": 2.361389636993408, + "learning_rate": 4.4689156044037363e-05, + "loss": 0.6327, + "step": 10630 + }, + { + "epoch": 1.0657084188911705, + "grad_norm": 2.966655731201172, + "learning_rate": 4.4679437438945855e-05, + "loss": 0.6742, + "step": 10640 + }, + { + "epoch": 1.0667100716181699, + "grad_norm": 2.1844279766082764, + "learning_rate": 4.466971100848285e-05, + "loss": 0.6256, + "step": 10650 + }, + { + "epoch": 1.0677117243451695, + "grad_norm": 2.297964334487915, + "learning_rate": 4.465997675651599e-05, + "loss": 0.6274, + "step": 10660 + }, + { + "epoch": 1.068713377072169, + "grad_norm": 2.126434087753296, + "learning_rate": 4.465023468691601e-05, + "loss": 0.6694, + "step": 10670 + }, + { + "epoch": 1.0697150297991687, + "grad_norm": 2.612492799758911, + "learning_rate": 4.464048480355679e-05, + "loss": 0.5821, + "step": 10680 + }, + { + "epoch": 1.0707166825261683, + "grad_norm": 2.2019944190979004, + "learning_rate": 4.463072711031529e-05, + "loss": 0.6195, + "step": 10690 + }, + { + "epoch": 1.0717183352531676, + "grad_norm": 2.5340359210968018, + "learning_rate": 4.462096161107154e-05, + "loss": 0.67, + "step": 10700 + }, + { + "epoch": 1.0727199879801672, + "grad_norm": 2.8773465156555176, + "learning_rate": 4.4611188309708766e-05, + "loss": 0.5325, + "step": 10710 + }, + { + "epoch": 1.0737216407071668, + "grad_norm": 2.5338494777679443, + "learning_rate": 4.460140721011321e-05, + "loss": 0.674, + "step": 10720 + }, + { + "epoch": 1.0747232934341664, + "grad_norm": 2.4730725288391113, + "learning_rate": 4.459161831617426e-05, + "loss": 0.5787, + "step": 10730 + }, + { + "epoch": 1.075724946161166, + "grad_norm": 1.9959347248077393, + "learning_rate": 4.458182163178439e-05, + "loss": 0.5417, + "step": 10740 + }, + { + "epoch": 1.0767265988881656, + "grad_norm": 2.4486429691314697, + "learning_rate": 4.4572017160839176e-05, + "loss": 0.6154, + "step": 10750 + }, + { + "epoch": 1.077728251615165, + "grad_norm": 2.2543139457702637, + "learning_rate": 4.4562204907237274e-05, + "loss": 0.5951, + "step": 10760 + }, + { + "epoch": 1.0787299043421645, + "grad_norm": 4.040517807006836, + "learning_rate": 4.455238487488047e-05, + "loss": 0.5494, + "step": 10770 + }, + { + "epoch": 1.0797315570691641, + "grad_norm": 2.089376926422119, + "learning_rate": 4.454255706767361e-05, + "loss": 0.5377, + "step": 10780 + }, + { + "epoch": 1.0807332097961637, + "grad_norm": 2.3366119861602783, + "learning_rate": 4.453272148952464e-05, + "loss": 0.6139, + "step": 10790 + }, + { + "epoch": 1.0817348625231633, + "grad_norm": 2.543785333633423, + "learning_rate": 4.4522878144344606e-05, + "loss": 0.5641, + "step": 10800 + }, + { + "epoch": 1.0827365152501627, + "grad_norm": 2.731062173843384, + "learning_rate": 4.451302703604763e-05, + "loss": 0.6759, + "step": 10810 + }, + { + "epoch": 1.0837381679771623, + "grad_norm": 2.7335755825042725, + "learning_rate": 4.4503168168550934e-05, + "loss": 0.6713, + "step": 10820 + }, + { + "epoch": 1.0847398207041619, + "grad_norm": 2.4380624294281006, + "learning_rate": 4.449330154577481e-05, + "loss": 0.6745, + "step": 10830 + }, + { + "epoch": 1.0857414734311615, + "grad_norm": 2.1496427059173584, + "learning_rate": 4.4483427171642643e-05, + "loss": 0.6305, + "step": 10840 + }, + { + "epoch": 1.086743126158161, + "grad_norm": 2.2780048847198486, + "learning_rate": 4.4473545050080915e-05, + "loss": 0.5717, + "step": 10850 + }, + { + "epoch": 1.0877447788851604, + "grad_norm": 2.7438337802886963, + "learning_rate": 4.446365518501915e-05, + "loss": 0.5682, + "step": 10860 + }, + { + "epoch": 1.08874643161216, + "grad_norm": 2.0671546459198, + "learning_rate": 4.4453757580389984e-05, + "loss": 0.6222, + "step": 10870 + }, + { + "epoch": 1.0897480843391596, + "grad_norm": 2.4361698627471924, + "learning_rate": 4.444385224012912e-05, + "loss": 0.5047, + "step": 10880 + }, + { + "epoch": 1.0907497370661592, + "grad_norm": 2.4023778438568115, + "learning_rate": 4.443393916817535e-05, + "loss": 0.5796, + "step": 10890 + }, + { + "epoch": 1.0917513897931588, + "grad_norm": 2.4326682090759277, + "learning_rate": 4.442401836847051e-05, + "loss": 0.5218, + "step": 10900 + }, + { + "epoch": 1.0927530425201581, + "grad_norm": 1.928910493850708, + "learning_rate": 4.441408984495953e-05, + "loss": 0.5586, + "step": 10910 + }, + { + "epoch": 1.0937546952471577, + "grad_norm": 2.215165138244629, + "learning_rate": 4.4404153601590415e-05, + "loss": 0.5602, + "step": 10920 + }, + { + "epoch": 1.0947563479741573, + "grad_norm": 2.9248368740081787, + "learning_rate": 4.439420964231422e-05, + "loss": 0.6545, + "step": 10930 + }, + { + "epoch": 1.095758000701157, + "grad_norm": 2.902667999267578, + "learning_rate": 4.43842579710851e-05, + "loss": 0.5476, + "step": 10940 + }, + { + "epoch": 1.0967596534281565, + "grad_norm": 2.431466579437256, + "learning_rate": 4.437429859186025e-05, + "loss": 0.5683, + "step": 10950 + }, + { + "epoch": 1.097761306155156, + "grad_norm": 1.8508896827697754, + "learning_rate": 4.436433150859993e-05, + "loss": 0.5298, + "step": 10960 + }, + { + "epoch": 1.0987629588821555, + "grad_norm": 2.972052812576294, + "learning_rate": 4.4354356725267485e-05, + "loss": 0.6163, + "step": 10970 + }, + { + "epoch": 1.099764611609155, + "grad_norm": 3.3024284839630127, + "learning_rate": 4.4344374245829294e-05, + "loss": 0.5949, + "step": 10980 + }, + { + "epoch": 1.1007662643361547, + "grad_norm": 2.4315567016601562, + "learning_rate": 4.433438407425482e-05, + "loss": 0.5789, + "step": 10990 + }, + { + "epoch": 1.1017679170631542, + "grad_norm": 2.4305036067962646, + "learning_rate": 4.4324386214516576e-05, + "loss": 0.5941, + "step": 11000 + }, + { + "epoch": 1.1027695697901538, + "grad_norm": 2.330115795135498, + "learning_rate": 4.4314380670590125e-05, + "loss": 0.6021, + "step": 11010 + }, + { + "epoch": 1.1037712225171532, + "grad_norm": 2.429572582244873, + "learning_rate": 4.4304367446454084e-05, + "loss": 0.5992, + "step": 11020 + }, + { + "epoch": 1.1047728752441528, + "grad_norm": 2.6118149757385254, + "learning_rate": 4.429434654609016e-05, + "loss": 0.5482, + "step": 11030 + }, + { + "epoch": 1.1057745279711524, + "grad_norm": 2.079375982284546, + "learning_rate": 4.428431797348306e-05, + "loss": 0.6004, + "step": 11040 + }, + { + "epoch": 1.106776180698152, + "grad_norm": 2.0178675651550293, + "learning_rate": 4.4274281732620574e-05, + "loss": 0.5709, + "step": 11050 + }, + { + "epoch": 1.1077778334251516, + "grad_norm": 2.3021092414855957, + "learning_rate": 4.426423782749352e-05, + "loss": 0.5602, + "step": 11060 + }, + { + "epoch": 1.1087794861521512, + "grad_norm": 2.4679481983184814, + "learning_rate": 4.42541862620958e-05, + "loss": 0.6025, + "step": 11070 + }, + { + "epoch": 1.1097811388791505, + "grad_norm": 2.0906708240509033, + "learning_rate": 4.424412704042432e-05, + "loss": 0.5415, + "step": 11080 + }, + { + "epoch": 1.1107827916061501, + "grad_norm": 2.103607416152954, + "learning_rate": 4.423406016647906e-05, + "loss": 0.5777, + "step": 11090 + }, + { + "epoch": 1.1117844443331497, + "grad_norm": 2.2250356674194336, + "learning_rate": 4.422398564426303e-05, + "loss": 0.5818, + "step": 11100 + }, + { + "epoch": 1.1127860970601493, + "grad_norm": 2.02903151512146, + "learning_rate": 4.421390347778228e-05, + "loss": 0.6417, + "step": 11110 + }, + { + "epoch": 1.113787749787149, + "grad_norm": 2.283987283706665, + "learning_rate": 4.420381367104591e-05, + "loss": 0.603, + "step": 11120 + }, + { + "epoch": 1.1147894025141483, + "grad_norm": 2.4817943572998047, + "learning_rate": 4.419371622806604e-05, + "loss": 0.5143, + "step": 11130 + }, + { + "epoch": 1.1157910552411479, + "grad_norm": 1.9237391948699951, + "learning_rate": 4.4183611152857854e-05, + "loss": 0.5663, + "step": 11140 + }, + { + "epoch": 1.1167927079681474, + "grad_norm": 2.5406248569488525, + "learning_rate": 4.417349844943953e-05, + "loss": 0.564, + "step": 11150 + }, + { + "epoch": 1.117794360695147, + "grad_norm": 2.506094217300415, + "learning_rate": 4.416337812183233e-05, + "loss": 0.5951, + "step": 11160 + }, + { + "epoch": 1.1187960134221466, + "grad_norm": 2.317397117614746, + "learning_rate": 4.415325017406051e-05, + "loss": 0.5939, + "step": 11170 + }, + { + "epoch": 1.119797666149146, + "grad_norm": 2.1029767990112305, + "learning_rate": 4.4143114610151374e-05, + "loss": 0.5199, + "step": 11180 + }, + { + "epoch": 1.1207993188761456, + "grad_norm": 2.195216417312622, + "learning_rate": 4.413297143413523e-05, + "loss": 0.6576, + "step": 11190 + }, + { + "epoch": 1.1218009716031452, + "grad_norm": 2.5397441387176514, + "learning_rate": 4.412282065004546e-05, + "loss": 0.5166, + "step": 11200 + }, + { + "epoch": 1.1228026243301448, + "grad_norm": 2.8908276557922363, + "learning_rate": 4.4112662261918415e-05, + "loss": 0.6163, + "step": 11210 + }, + { + "epoch": 1.1238042770571444, + "grad_norm": 2.3265693187713623, + "learning_rate": 4.41024962737935e-05, + "loss": 0.5597, + "step": 11220 + }, + { + "epoch": 1.1248059297841437, + "grad_norm": 1.9278279542922974, + "learning_rate": 4.4092322689713164e-05, + "loss": 0.6404, + "step": 11230 + }, + { + "epoch": 1.1258075825111433, + "grad_norm": 2.3198344707489014, + "learning_rate": 4.408214151372283e-05, + "loss": 0.6297, + "step": 11240 + }, + { + "epoch": 1.126809235238143, + "grad_norm": 2.27579402923584, + "learning_rate": 4.407195274987096e-05, + "loss": 0.5715, + "step": 11250 + }, + { + "epoch": 1.1278108879651425, + "grad_norm": 2.439666748046875, + "learning_rate": 4.4061756402209047e-05, + "loss": 0.6156, + "step": 11260 + }, + { + "epoch": 1.128812540692142, + "grad_norm": 2.574446201324463, + "learning_rate": 4.4051552474791585e-05, + "loss": 0.6065, + "step": 11270 + }, + { + "epoch": 1.1298141934191417, + "grad_norm": 2.493682384490967, + "learning_rate": 4.404134097167608e-05, + "loss": 0.517, + "step": 11280 + }, + { + "epoch": 1.130815846146141, + "grad_norm": 2.36503529548645, + "learning_rate": 4.403112189692305e-05, + "loss": 0.5445, + "step": 11290 + }, + { + "epoch": 1.1318174988731406, + "grad_norm": 2.6429598331451416, + "learning_rate": 4.402089525459604e-05, + "loss": 0.5817, + "step": 11300 + }, + { + "epoch": 1.1328191516001402, + "grad_norm": 2.0238144397735596, + "learning_rate": 4.401066104876158e-05, + "loss": 0.6503, + "step": 11310 + }, + { + "epoch": 1.1338208043271398, + "grad_norm": 1.9457943439483643, + "learning_rate": 4.4000419283489234e-05, + "loss": 0.5834, + "step": 11320 + }, + { + "epoch": 1.1348224570541394, + "grad_norm": 3.049051523208618, + "learning_rate": 4.3990169962851556e-05, + "loss": 0.5707, + "step": 11330 + }, + { + "epoch": 1.1358241097811388, + "grad_norm": 2.3645284175872803, + "learning_rate": 4.39799130909241e-05, + "loss": 0.6106, + "step": 11340 + }, + { + "epoch": 1.1368257625081384, + "grad_norm": 2.4572527408599854, + "learning_rate": 4.3969648671785427e-05, + "loss": 0.576, + "step": 11350 + }, + { + "epoch": 1.137827415235138, + "grad_norm": 2.7629027366638184, + "learning_rate": 4.395937670951712e-05, + "loss": 0.5461, + "step": 11360 + }, + { + "epoch": 1.1388290679621376, + "grad_norm": 1.8488332033157349, + "learning_rate": 4.3949097208203715e-05, + "loss": 0.5847, + "step": 11370 + }, + { + "epoch": 1.1398307206891372, + "grad_norm": 2.3692805767059326, + "learning_rate": 4.3938810171932795e-05, + "loss": 0.58, + "step": 11380 + }, + { + "epoch": 1.1408323734161367, + "grad_norm": 2.6989333629608154, + "learning_rate": 4.392851560479492e-05, + "loss": 0.5461, + "step": 11390 + }, + { + "epoch": 1.1418340261431361, + "grad_norm": 2.201425552368164, + "learning_rate": 4.3918213510883624e-05, + "loss": 0.5628, + "step": 11400 + }, + { + "epoch": 1.1428356788701357, + "grad_norm": 2.2745614051818848, + "learning_rate": 4.390790389429546e-05, + "loss": 0.5372, + "step": 11410 + }, + { + "epoch": 1.1438373315971353, + "grad_norm": 1.8337105512619019, + "learning_rate": 4.3897586759129975e-05, + "loss": 0.5933, + "step": 11420 + }, + { + "epoch": 1.144838984324135, + "grad_norm": 2.181349992752075, + "learning_rate": 4.388726210948969e-05, + "loss": 0.6146, + "step": 11430 + }, + { + "epoch": 1.1458406370511343, + "grad_norm": 2.9680917263031006, + "learning_rate": 4.387692994948012e-05, + "loss": 0.6383, + "step": 11440 + }, + { + "epoch": 1.1468422897781338, + "grad_norm": 2.7716708183288574, + "learning_rate": 4.386659028320975e-05, + "loss": 0.6112, + "step": 11450 + }, + { + "epoch": 1.1478439425051334, + "grad_norm": 2.357621908187866, + "learning_rate": 4.385624311479009e-05, + "loss": 0.5798, + "step": 11460 + }, + { + "epoch": 1.148845595232133, + "grad_norm": 2.055521249771118, + "learning_rate": 4.3845888448335596e-05, + "loss": 0.5666, + "step": 11470 + }, + { + "epoch": 1.1498472479591326, + "grad_norm": 2.8949215412139893, + "learning_rate": 4.3835526287963726e-05, + "loss": 0.5668, + "step": 11480 + }, + { + "epoch": 1.1508489006861322, + "grad_norm": 2.6228830814361572, + "learning_rate": 4.38251566377949e-05, + "loss": 0.6357, + "step": 11490 + }, + { + "epoch": 1.1518505534131316, + "grad_norm": 2.118854284286499, + "learning_rate": 4.381477950195254e-05, + "loss": 0.5745, + "step": 11500 + }, + { + "epoch": 1.1528522061401312, + "grad_norm": 2.6163575649261475, + "learning_rate": 4.380439488456301e-05, + "loss": 0.5824, + "step": 11510 + }, + { + "epoch": 1.1538538588671308, + "grad_norm": 2.5866751670837402, + "learning_rate": 4.3794002789755705e-05, + "loss": 0.6282, + "step": 11520 + }, + { + "epoch": 1.1548555115941304, + "grad_norm": 1.9551018476486206, + "learning_rate": 4.3783603221662925e-05, + "loss": 0.5497, + "step": 11530 + }, + { + "epoch": 1.15585716432113, + "grad_norm": 2.2837166786193848, + "learning_rate": 4.3773196184419996e-05, + "loss": 0.5256, + "step": 11540 + }, + { + "epoch": 1.1568588170481293, + "grad_norm": 2.673213005065918, + "learning_rate": 4.376278168216518e-05, + "loss": 0.638, + "step": 11550 + }, + { + "epoch": 1.157860469775129, + "grad_norm": 2.9592199325561523, + "learning_rate": 4.375235971903973e-05, + "loss": 0.5934, + "step": 11560 + }, + { + "epoch": 1.1588621225021285, + "grad_norm": 2.324190855026245, + "learning_rate": 4.374193029918786e-05, + "loss": 0.5472, + "step": 11570 + }, + { + "epoch": 1.159863775229128, + "grad_norm": 3.0013883113861084, + "learning_rate": 4.3731493426756734e-05, + "loss": 0.5461, + "step": 11580 + }, + { + "epoch": 1.1608654279561277, + "grad_norm": 2.3880512714385986, + "learning_rate": 4.372104910589649e-05, + "loss": 0.6383, + "step": 11590 + }, + { + "epoch": 1.1618670806831273, + "grad_norm": 2.222216844558716, + "learning_rate": 4.371059734076024e-05, + "loss": 0.5615, + "step": 11600 + }, + { + "epoch": 1.1628687334101266, + "grad_norm": 2.4552252292633057, + "learning_rate": 4.3700138135504044e-05, + "loss": 0.5334, + "step": 11610 + }, + { + "epoch": 1.1638703861371262, + "grad_norm": 2.393084764480591, + "learning_rate": 4.3689671494286913e-05, + "loss": 0.6087, + "step": 11620 + }, + { + "epoch": 1.1648720388641258, + "grad_norm": 2.7061352729797363, + "learning_rate": 4.367919742127083e-05, + "loss": 0.5929, + "step": 11630 + }, + { + "epoch": 1.1658736915911254, + "grad_norm": 2.2179784774780273, + "learning_rate": 4.366871592062073e-05, + "loss": 0.5751, + "step": 11640 + }, + { + "epoch": 1.1668753443181248, + "grad_norm": 2.616370677947998, + "learning_rate": 4.3658226996504494e-05, + "loss": 0.6019, + "step": 11650 + }, + { + "epoch": 1.1678769970451244, + "grad_norm": 2.4288666248321533, + "learning_rate": 4.364773065309296e-05, + "loss": 0.5673, + "step": 11660 + }, + { + "epoch": 1.168878649772124, + "grad_norm": 2.0479538440704346, + "learning_rate": 4.36372268945599e-05, + "loss": 0.6108, + "step": 11670 + }, + { + "epoch": 1.1698803024991236, + "grad_norm": 2.7070276737213135, + "learning_rate": 4.362671572508207e-05, + "loss": 0.5864, + "step": 11680 + }, + { + "epoch": 1.1708819552261232, + "grad_norm": 3.1016364097595215, + "learning_rate": 4.3616197148839155e-05, + "loss": 0.5442, + "step": 11690 + }, + { + "epoch": 1.1718836079531227, + "grad_norm": 2.680513620376587, + "learning_rate": 4.360567117001377e-05, + "loss": 0.5797, + "step": 11700 + }, + { + "epoch": 1.172885260680122, + "grad_norm": 1.764623999595642, + "learning_rate": 4.359513779279149e-05, + "loss": 0.5252, + "step": 11710 + }, + { + "epoch": 1.1738869134071217, + "grad_norm": 2.167689323425293, + "learning_rate": 4.358459702136083e-05, + "loss": 0.5704, + "step": 11720 + }, + { + "epoch": 1.1748885661341213, + "grad_norm": 3.4920172691345215, + "learning_rate": 4.3574048859913247e-05, + "loss": 0.6426, + "step": 11730 + }, + { + "epoch": 1.1758902188611209, + "grad_norm": 2.359330415725708, + "learning_rate": 4.3563493312643125e-05, + "loss": 0.6073, + "step": 11740 + }, + { + "epoch": 1.1768918715881205, + "grad_norm": 2.428713798522949, + "learning_rate": 4.3552930383747806e-05, + "loss": 0.5856, + "step": 11750 + }, + { + "epoch": 1.1778935243151198, + "grad_norm": 2.476402521133423, + "learning_rate": 4.354236007742754e-05, + "loss": 0.5963, + "step": 11760 + }, + { + "epoch": 1.1788951770421194, + "grad_norm": 2.6675846576690674, + "learning_rate": 4.3531782397885534e-05, + "loss": 0.5157, + "step": 11770 + }, + { + "epoch": 1.179896829769119, + "grad_norm": 1.9533215761184692, + "learning_rate": 4.352119734932791e-05, + "loss": 0.6073, + "step": 11780 + }, + { + "epoch": 1.1808984824961186, + "grad_norm": 2.8943727016448975, + "learning_rate": 4.351060493596375e-05, + "loss": 0.6105, + "step": 11790 + }, + { + "epoch": 1.1819001352231182, + "grad_norm": 3.005053758621216, + "learning_rate": 4.350000516200501e-05, + "loss": 0.557, + "step": 11800 + }, + { + "epoch": 1.1829017879501178, + "grad_norm": 2.9034831523895264, + "learning_rate": 4.348939803166664e-05, + "loss": 0.5801, + "step": 11810 + }, + { + "epoch": 1.1839034406771172, + "grad_norm": 2.1721954345703125, + "learning_rate": 4.347878354916645e-05, + "loss": 0.6232, + "step": 11820 + }, + { + "epoch": 1.1849050934041168, + "grad_norm": 2.2449421882629395, + "learning_rate": 4.346816171872522e-05, + "loss": 0.5813, + "step": 11830 + }, + { + "epoch": 1.1859067461311164, + "grad_norm": 2.2640445232391357, + "learning_rate": 4.345753254456663e-05, + "loss": 0.6179, + "step": 11840 + }, + { + "epoch": 1.186908398858116, + "grad_norm": 1.8900947570800781, + "learning_rate": 4.34468960309173e-05, + "loss": 0.5509, + "step": 11850 + }, + { + "epoch": 1.1879100515851155, + "grad_norm": 2.5752103328704834, + "learning_rate": 4.343625218200674e-05, + "loss": 0.6068, + "step": 11860 + }, + { + "epoch": 1.188911704312115, + "grad_norm": 2.4647738933563232, + "learning_rate": 4.342560100206739e-05, + "loss": 0.5853, + "step": 11870 + }, + { + "epoch": 1.1899133570391145, + "grad_norm": 2.0032401084899902, + "learning_rate": 4.3414942495334634e-05, + "loss": 0.5757, + "step": 11880 + }, + { + "epoch": 1.190915009766114, + "grad_norm": 2.42807674407959, + "learning_rate": 4.340427666604671e-05, + "loss": 0.6069, + "step": 11890 + }, + { + "epoch": 1.1919166624931137, + "grad_norm": 2.145582914352417, + "learning_rate": 4.3393603518444803e-05, + "loss": 0.5653, + "step": 11900 + }, + { + "epoch": 1.1929183152201133, + "grad_norm": 2.337334632873535, + "learning_rate": 4.338292305677303e-05, + "loss": 0.5728, + "step": 11910 + }, + { + "epoch": 1.1939199679471129, + "grad_norm": 2.395955801010132, + "learning_rate": 4.337223528527836e-05, + "loss": 0.5418, + "step": 11920 + }, + { + "epoch": 1.1949216206741122, + "grad_norm": 2.5528903007507324, + "learning_rate": 4.3361540208210725e-05, + "loss": 0.5833, + "step": 11930 + }, + { + "epoch": 1.1959232734011118, + "grad_norm": 1.9089552164077759, + "learning_rate": 4.335083782982293e-05, + "loss": 0.556, + "step": 11940 + }, + { + "epoch": 1.1969249261281114, + "grad_norm": 2.391150951385498, + "learning_rate": 4.334012815437069e-05, + "loss": 0.6625, + "step": 11950 + }, + { + "epoch": 1.197926578855111, + "grad_norm": 2.3361966609954834, + "learning_rate": 4.3329411186112616e-05, + "loss": 0.5594, + "step": 11960 + }, + { + "epoch": 1.1989282315821104, + "grad_norm": 1.8724137544631958, + "learning_rate": 4.3318686929310235e-05, + "loss": 0.5774, + "step": 11970 + }, + { + "epoch": 1.19992988430911, + "grad_norm": 2.7264559268951416, + "learning_rate": 4.330795538822795e-05, + "loss": 0.5614, + "step": 11980 + }, + { + "epoch": 1.2009315370361096, + "grad_norm": 2.320357322692871, + "learning_rate": 4.3297216567133085e-05, + "loss": 0.5901, + "step": 11990 + }, + { + "epoch": 1.2019331897631091, + "grad_norm": 2.6622066497802734, + "learning_rate": 4.328647047029584e-05, + "loss": 0.6711, + "step": 12000 + }, + { + "epoch": 1.2029348424901087, + "grad_norm": 2.5785880088806152, + "learning_rate": 4.3275717101989316e-05, + "loss": 0.6132, + "step": 12010 + }, + { + "epoch": 1.2039364952171083, + "grad_norm": 2.7682085037231445, + "learning_rate": 4.3264956466489504e-05, + "loss": 0.5557, + "step": 12020 + }, + { + "epoch": 1.2049381479441077, + "grad_norm": 2.4161806106567383, + "learning_rate": 4.325418856807529e-05, + "loss": 0.6732, + "step": 12030 + }, + { + "epoch": 1.2059398006711073, + "grad_norm": 2.803616523742676, + "learning_rate": 4.324341341102843e-05, + "loss": 0.5855, + "step": 12040 + }, + { + "epoch": 1.2069414533981069, + "grad_norm": 2.3754525184631348, + "learning_rate": 4.3232630999633595e-05, + "loss": 0.6923, + "step": 12050 + }, + { + "epoch": 1.2079431061251065, + "grad_norm": 2.4744246006011963, + "learning_rate": 4.3221841338178316e-05, + "loss": 0.646, + "step": 12060 + }, + { + "epoch": 1.208944758852106, + "grad_norm": 2.2677700519561768, + "learning_rate": 4.321104443095302e-05, + "loss": 0.5519, + "step": 12070 + }, + { + "epoch": 1.2099464115791054, + "grad_norm": 2.31657075881958, + "learning_rate": 4.3200240282251005e-05, + "loss": 0.5602, + "step": 12080 + }, + { + "epoch": 1.210948064306105, + "grad_norm": 2.340644598007202, + "learning_rate": 4.3189428896368456e-05, + "loss": 0.6619, + "step": 12090 + }, + { + "epoch": 1.2119497170331046, + "grad_norm": 2.118570566177368, + "learning_rate": 4.317861027760444e-05, + "loss": 0.5768, + "step": 12100 + }, + { + "epoch": 1.2129513697601042, + "grad_norm": 2.9038562774658203, + "learning_rate": 4.3167784430260895e-05, + "loss": 0.5889, + "step": 12110 + }, + { + "epoch": 1.2139530224871038, + "grad_norm": 2.5005056858062744, + "learning_rate": 4.3156951358642626e-05, + "loss": 0.5664, + "step": 12120 + }, + { + "epoch": 1.2149546752141034, + "grad_norm": 2.4353575706481934, + "learning_rate": 4.314611106705732e-05, + "loss": 0.6251, + "step": 12130 + }, + { + "epoch": 1.2159563279411028, + "grad_norm": 2.5719215869903564, + "learning_rate": 4.313526355981554e-05, + "loss": 0.6044, + "step": 12140 + }, + { + "epoch": 1.2169579806681023, + "grad_norm": 2.214345932006836, + "learning_rate": 4.3124408841230696e-05, + "loss": 0.5722, + "step": 12150 + }, + { + "epoch": 1.217959633395102, + "grad_norm": 2.1811490058898926, + "learning_rate": 4.3113546915619095e-05, + "loss": 0.5788, + "step": 12160 + }, + { + "epoch": 1.2189612861221015, + "grad_norm": 2.520916700363159, + "learning_rate": 4.3102677787299886e-05, + "loss": 0.6075, + "step": 12170 + }, + { + "epoch": 1.219962938849101, + "grad_norm": 2.66408634185791, + "learning_rate": 4.309180146059509e-05, + "loss": 0.5751, + "step": 12180 + }, + { + "epoch": 1.2209645915761005, + "grad_norm": 2.4396579265594482, + "learning_rate": 4.3080917939829604e-05, + "loss": 0.5761, + "step": 12190 + }, + { + "epoch": 1.2219662443031, + "grad_norm": 1.9858520030975342, + "learning_rate": 4.3070027229331155e-05, + "loss": 0.617, + "step": 12200 + }, + { + "epoch": 1.2229678970300997, + "grad_norm": 2.1379098892211914, + "learning_rate": 4.305912933343037e-05, + "loss": 0.5834, + "step": 12210 + }, + { + "epoch": 1.2239695497570993, + "grad_norm": 2.563939094543457, + "learning_rate": 4.304822425646069e-05, + "loss": 0.569, + "step": 12220 + }, + { + "epoch": 1.2249712024840989, + "grad_norm": 2.471806287765503, + "learning_rate": 4.303731200275844e-05, + "loss": 0.5577, + "step": 12230 + }, + { + "epoch": 1.2259728552110982, + "grad_norm": 2.36328387260437, + "learning_rate": 4.302639257666279e-05, + "loss": 0.6297, + "step": 12240 + }, + { + "epoch": 1.2269745079380978, + "grad_norm": 2.4426753520965576, + "learning_rate": 4.3015465982515765e-05, + "loss": 0.5597, + "step": 12250 + }, + { + "epoch": 1.2279761606650974, + "grad_norm": 2.5594425201416016, + "learning_rate": 4.300453222466224e-05, + "loss": 0.5633, + "step": 12260 + }, + { + "epoch": 1.228977813392097, + "grad_norm": 2.1706836223602295, + "learning_rate": 4.299359130744993e-05, + "loss": 0.5809, + "step": 12270 + }, + { + "epoch": 1.2299794661190966, + "grad_norm": 2.1926965713500977, + "learning_rate": 4.298264323522941e-05, + "loss": 0.5436, + "step": 12280 + }, + { + "epoch": 1.230981118846096, + "grad_norm": 3.1730353832244873, + "learning_rate": 4.297168801235409e-05, + "loss": 0.6721, + "step": 12290 + }, + { + "epoch": 1.2319827715730955, + "grad_norm": 2.2085063457489014, + "learning_rate": 4.296072564318023e-05, + "loss": 0.5455, + "step": 12300 + }, + { + "epoch": 1.2329844243000951, + "grad_norm": 2.361359119415283, + "learning_rate": 4.2949756132066924e-05, + "loss": 0.551, + "step": 12310 + }, + { + "epoch": 1.2339860770270947, + "grad_norm": 2.3133087158203125, + "learning_rate": 4.29387794833761e-05, + "loss": 0.6182, + "step": 12320 + }, + { + "epoch": 1.2349877297540943, + "grad_norm": 2.7090024948120117, + "learning_rate": 4.2927795701472564e-05, + "loss": 0.6011, + "step": 12330 + }, + { + "epoch": 1.235989382481094, + "grad_norm": 2.7313599586486816, + "learning_rate": 4.291680479072391e-05, + "loss": 0.6484, + "step": 12340 + }, + { + "epoch": 1.2369910352080933, + "grad_norm": 2.4200663566589355, + "learning_rate": 4.290580675550059e-05, + "loss": 0.6089, + "step": 12350 + }, + { + "epoch": 1.2379926879350929, + "grad_norm": 2.2249794006347656, + "learning_rate": 4.2894801600175885e-05, + "loss": 0.609, + "step": 12360 + }, + { + "epoch": 1.2389943406620925, + "grad_norm": 2.8315131664276123, + "learning_rate": 4.2883789329125894e-05, + "loss": 0.551, + "step": 12370 + }, + { + "epoch": 1.239995993389092, + "grad_norm": 2.9285271167755127, + "learning_rate": 4.287276994672959e-05, + "loss": 0.5966, + "step": 12380 + }, + { + "epoch": 1.2409976461160916, + "grad_norm": 2.0900344848632812, + "learning_rate": 4.286174345736871e-05, + "loss": 0.488, + "step": 12390 + }, + { + "epoch": 1.241999298843091, + "grad_norm": 2.371091604232788, + "learning_rate": 4.285070986542787e-05, + "loss": 0.5586, + "step": 12400 + }, + { + "epoch": 1.2430009515700906, + "grad_norm": 2.1431174278259277, + "learning_rate": 4.283966917529448e-05, + "loss": 0.5699, + "step": 12410 + }, + { + "epoch": 1.2440026042970902, + "grad_norm": 2.3268375396728516, + "learning_rate": 4.282862139135879e-05, + "loss": 0.5179, + "step": 12420 + }, + { + "epoch": 1.2450042570240898, + "grad_norm": 2.512864112854004, + "learning_rate": 4.281756651801386e-05, + "loss": 0.6027, + "step": 12430 + }, + { + "epoch": 1.2460059097510894, + "grad_norm": 3.174145221710205, + "learning_rate": 4.280650455965557e-05, + "loss": 0.5509, + "step": 12440 + }, + { + "epoch": 1.247007562478089, + "grad_norm": 2.1517832279205322, + "learning_rate": 4.279543552068263e-05, + "loss": 0.618, + "step": 12450 + }, + { + "epoch": 1.2480092152050883, + "grad_norm": 2.6362404823303223, + "learning_rate": 4.278435940549653e-05, + "loss": 0.5599, + "step": 12460 + }, + { + "epoch": 1.249010867932088, + "grad_norm": 2.2011988162994385, + "learning_rate": 4.277327621850162e-05, + "loss": 0.5423, + "step": 12470 + }, + { + "epoch": 1.2500125206590875, + "grad_norm": 2.244523286819458, + "learning_rate": 4.2762185964105045e-05, + "loss": 0.5017, + "step": 12480 + }, + { + "epoch": 1.2510141733860871, + "grad_norm": 2.230452060699463, + "learning_rate": 4.275108864671674e-05, + "loss": 0.5543, + "step": 12490 + }, + { + "epoch": 1.2520158261130865, + "grad_norm": 2.8010337352752686, + "learning_rate": 4.273998427074948e-05, + "loss": 0.6166, + "step": 12500 + }, + { + "epoch": 1.253017478840086, + "grad_norm": 2.4748897552490234, + "learning_rate": 4.2728872840618814e-05, + "loss": 0.61, + "step": 12510 + }, + { + "epoch": 1.2540191315670857, + "grad_norm": 2.357698917388916, + "learning_rate": 4.271775436074313e-05, + "loss": 0.5702, + "step": 12520 + }, + { + "epoch": 1.2550207842940853, + "grad_norm": 2.1585423946380615, + "learning_rate": 4.270662883554361e-05, + "loss": 0.5865, + "step": 12530 + }, + { + "epoch": 1.2560224370210848, + "grad_norm": 2.516031503677368, + "learning_rate": 4.2695496269444196e-05, + "loss": 0.6007, + "step": 12540 + }, + { + "epoch": 1.2570240897480844, + "grad_norm": 2.1451737880706787, + "learning_rate": 4.2684356666871696e-05, + "loss": 0.6683, + "step": 12550 + }, + { + "epoch": 1.2580257424750838, + "grad_norm": 2.237550735473633, + "learning_rate": 4.267321003225567e-05, + "loss": 0.5501, + "step": 12560 + }, + { + "epoch": 1.2590273952020834, + "grad_norm": 2.1487345695495605, + "learning_rate": 4.266205637002849e-05, + "loss": 0.6097, + "step": 12570 + }, + { + "epoch": 1.260029047929083, + "grad_norm": 1.9155800342559814, + "learning_rate": 4.2650895684625325e-05, + "loss": 0.5589, + "step": 12580 + }, + { + "epoch": 1.2610307006560826, + "grad_norm": 2.6508138179779053, + "learning_rate": 4.263972798048413e-05, + "loss": 0.624, + "step": 12590 + }, + { + "epoch": 1.262032353383082, + "grad_norm": 3.7485527992248535, + "learning_rate": 4.262855326204565e-05, + "loss": 0.5505, + "step": 12600 + }, + { + "epoch": 1.2630340061100815, + "grad_norm": 2.4273035526275635, + "learning_rate": 4.2617371533753445e-05, + "loss": 0.5796, + "step": 12610 + }, + { + "epoch": 1.2640356588370811, + "grad_norm": 2.114297866821289, + "learning_rate": 4.2606182800053806e-05, + "loss": 0.614, + "step": 12620 + }, + { + "epoch": 1.2650373115640807, + "grad_norm": 2.203754425048828, + "learning_rate": 4.259498706539586e-05, + "loss": 0.5348, + "step": 12630 + }, + { + "epoch": 1.2660389642910803, + "grad_norm": 2.6585121154785156, + "learning_rate": 4.258378433423152e-05, + "loss": 0.5813, + "step": 12640 + }, + { + "epoch": 1.26704061701808, + "grad_norm": 3.2380688190460205, + "learning_rate": 4.257257461101542e-05, + "loss": 0.6336, + "step": 12650 + }, + { + "epoch": 1.2680422697450795, + "grad_norm": 2.2499895095825195, + "learning_rate": 4.256135790020506e-05, + "loss": 0.6303, + "step": 12660 + }, + { + "epoch": 1.2690439224720789, + "grad_norm": 1.870112419128418, + "learning_rate": 4.255013420626064e-05, + "loss": 0.622, + "step": 12670 + }, + { + "epoch": 1.2700455751990785, + "grad_norm": 2.667515277862549, + "learning_rate": 4.2538903533645206e-05, + "loss": 0.5994, + "step": 12680 + }, + { + "epoch": 1.271047227926078, + "grad_norm": 2.1761505603790283, + "learning_rate": 4.252766588682452e-05, + "loss": 0.5713, + "step": 12690 + }, + { + "epoch": 1.2720488806530776, + "grad_norm": 2.370919704437256, + "learning_rate": 4.251642127026715e-05, + "loss": 0.5416, + "step": 12700 + }, + { + "epoch": 1.273050533380077, + "grad_norm": 1.990342617034912, + "learning_rate": 4.2505169688444435e-05, + "loss": 0.5408, + "step": 12710 + }, + { + "epoch": 1.2740521861070766, + "grad_norm": 2.1336588859558105, + "learning_rate": 4.2493911145830464e-05, + "loss": 0.6172, + "step": 12720 + }, + { + "epoch": 1.2750538388340762, + "grad_norm": 2.30049467086792, + "learning_rate": 4.248264564690212e-05, + "loss": 0.5551, + "step": 12730 + }, + { + "epoch": 1.2760554915610758, + "grad_norm": 2.0460050106048584, + "learning_rate": 4.247137319613904e-05, + "loss": 0.5742, + "step": 12740 + }, + { + "epoch": 1.2770571442880754, + "grad_norm": 2.2391974925994873, + "learning_rate": 4.246009379802361e-05, + "loss": 0.5554, + "step": 12750 + }, + { + "epoch": 1.278058797015075, + "grad_norm": 2.327256917953491, + "learning_rate": 4.2448807457041006e-05, + "loss": 0.5254, + "step": 12760 + }, + { + "epoch": 1.2790604497420746, + "grad_norm": 3.1748154163360596, + "learning_rate": 4.243751417767915e-05, + "loss": 0.5259, + "step": 12770 + }, + { + "epoch": 1.280062102469074, + "grad_norm": 2.3406453132629395, + "learning_rate": 4.2426213964428704e-05, + "loss": 0.5984, + "step": 12780 + }, + { + "epoch": 1.2810637551960735, + "grad_norm": 2.08475399017334, + "learning_rate": 4.241490682178314e-05, + "loss": 0.5541, + "step": 12790 + }, + { + "epoch": 1.282065407923073, + "grad_norm": 2.3705079555511475, + "learning_rate": 4.240359275423863e-05, + "loss": 0.6299, + "step": 12800 + }, + { + "epoch": 1.2830670606500727, + "grad_norm": 2.232518196105957, + "learning_rate": 4.239227176629413e-05, + "loss": 0.5389, + "step": 12810 + }, + { + "epoch": 1.284068713377072, + "grad_norm": 2.1422078609466553, + "learning_rate": 4.238094386245134e-05, + "loss": 0.5609, + "step": 12820 + }, + { + "epoch": 1.2850703661040717, + "grad_norm": 2.5519092082977295, + "learning_rate": 4.236960904721472e-05, + "loss": 0.5631, + "step": 12830 + }, + { + "epoch": 1.2860720188310713, + "grad_norm": 3.0536038875579834, + "learning_rate": 4.2358267325091456e-05, + "loss": 0.5331, + "step": 12840 + }, + { + "epoch": 1.2870736715580708, + "grad_norm": 2.136219024658203, + "learning_rate": 4.2346918700591497e-05, + "loss": 0.6137, + "step": 12850 + }, + { + "epoch": 1.2880753242850704, + "grad_norm": 2.4962804317474365, + "learning_rate": 4.2335563178227544e-05, + "loss": 0.5247, + "step": 12860 + }, + { + "epoch": 1.28907697701207, + "grad_norm": 3.05644154548645, + "learning_rate": 4.232420076251501e-05, + "loss": 0.5721, + "step": 12870 + }, + { + "epoch": 1.2900786297390694, + "grad_norm": 2.8292741775512695, + "learning_rate": 4.231283145797208e-05, + "loss": 0.5837, + "step": 12880 + }, + { + "epoch": 1.291080282466069, + "grad_norm": 3.002732753753662, + "learning_rate": 4.2301455269119665e-05, + "loss": 0.5213, + "step": 12890 + }, + { + "epoch": 1.2920819351930686, + "grad_norm": 2.6589910984039307, + "learning_rate": 4.229007220048142e-05, + "loss": 0.5508, + "step": 12900 + }, + { + "epoch": 1.2930835879200682, + "grad_norm": 2.5175070762634277, + "learning_rate": 4.227868225658373e-05, + "loss": 0.5646, + "step": 12910 + }, + { + "epoch": 1.2940852406470675, + "grad_norm": 2.4398906230926514, + "learning_rate": 4.226728544195572e-05, + "loss": 0.5605, + "step": 12920 + }, + { + "epoch": 1.2950868933740671, + "grad_norm": 1.8536179065704346, + "learning_rate": 4.225588176112922e-05, + "loss": 0.5898, + "step": 12930 + }, + { + "epoch": 1.2960885461010667, + "grad_norm": 2.7143056392669678, + "learning_rate": 4.224447121863885e-05, + "loss": 0.6005, + "step": 12940 + }, + { + "epoch": 1.2970901988280663, + "grad_norm": 2.5145957469940186, + "learning_rate": 4.223305381902189e-05, + "loss": 0.5204, + "step": 12950 + }, + { + "epoch": 1.298091851555066, + "grad_norm": 1.898524522781372, + "learning_rate": 4.222162956681839e-05, + "loss": 0.5403, + "step": 12960 + }, + { + "epoch": 1.2990935042820655, + "grad_norm": 2.350985527038574, + "learning_rate": 4.221019846657112e-05, + "loss": 0.5891, + "step": 12970 + }, + { + "epoch": 1.300095157009065, + "grad_norm": 2.1936440467834473, + "learning_rate": 4.219876052282555e-05, + "loss": 0.5486, + "step": 12980 + }, + { + "epoch": 1.3010968097360645, + "grad_norm": 2.114114999771118, + "learning_rate": 4.21873157401299e-05, + "loss": 0.6083, + "step": 12990 + }, + { + "epoch": 1.302098462463064, + "grad_norm": 2.300907850265503, + "learning_rate": 4.2175864123035085e-05, + "loss": 0.6078, + "step": 13000 + }, + { + "epoch": 1.3031001151900636, + "grad_norm": 2.4883675575256348, + "learning_rate": 4.2164405676094766e-05, + "loss": 0.5808, + "step": 13010 + }, + { + "epoch": 1.3041017679170632, + "grad_norm": 2.3520798683166504, + "learning_rate": 4.215294040386528e-05, + "loss": 0.6279, + "step": 13020 + }, + { + "epoch": 1.3051034206440626, + "grad_norm": 2.238900661468506, + "learning_rate": 4.214146831090572e-05, + "loss": 0.6265, + "step": 13030 + }, + { + "epoch": 1.3061050733710622, + "grad_norm": 2.766570806503296, + "learning_rate": 4.2129989401777876e-05, + "loss": 0.6319, + "step": 13040 + }, + { + "epoch": 1.3071067260980618, + "grad_norm": 2.1312856674194336, + "learning_rate": 4.211850368104623e-05, + "loss": 0.5227, + "step": 13050 + }, + { + "epoch": 1.3081083788250614, + "grad_norm": 2.324946165084839, + "learning_rate": 4.210701115327799e-05, + "loss": 0.5999, + "step": 13060 + }, + { + "epoch": 1.309110031552061, + "grad_norm": 2.1552188396453857, + "learning_rate": 4.2095511823043064e-05, + "loss": 0.5514, + "step": 13070 + }, + { + "epoch": 1.3101116842790606, + "grad_norm": 2.5594635009765625, + "learning_rate": 4.208400569491408e-05, + "loss": 0.5999, + "step": 13080 + }, + { + "epoch": 1.31111333700606, + "grad_norm": 1.8165738582611084, + "learning_rate": 4.2072492773466366e-05, + "loss": 0.5742, + "step": 13090 + }, + { + "epoch": 1.3121149897330595, + "grad_norm": 2.341984987258911, + "learning_rate": 4.2060973063277924e-05, + "loss": 0.5683, + "step": 13100 + }, + { + "epoch": 1.313116642460059, + "grad_norm": 2.9836859703063965, + "learning_rate": 4.204944656892948e-05, + "loss": 0.6223, + "step": 13110 + }, + { + "epoch": 1.3141182951870587, + "grad_norm": 2.9429287910461426, + "learning_rate": 4.203791329500446e-05, + "loss": 0.5961, + "step": 13120 + }, + { + "epoch": 1.315119947914058, + "grad_norm": 2.433025598526001, + "learning_rate": 4.202637324608897e-05, + "loss": 0.5167, + "step": 13130 + }, + { + "epoch": 1.3161216006410577, + "grad_norm": 2.171271800994873, + "learning_rate": 4.2014826426771825e-05, + "loss": 0.5361, + "step": 13140 + }, + { + "epoch": 1.3171232533680572, + "grad_norm": 2.2172558307647705, + "learning_rate": 4.2003272841644525e-05, + "loss": 0.5201, + "step": 13150 + }, + { + "epoch": 1.3181249060950568, + "grad_norm": 2.2057085037231445, + "learning_rate": 4.199171249530125e-05, + "loss": 0.5465, + "step": 13160 + }, + { + "epoch": 1.3191265588220564, + "grad_norm": 2.7693426609039307, + "learning_rate": 4.1980145392338896e-05, + "loss": 0.586, + "step": 13170 + }, + { + "epoch": 1.320128211549056, + "grad_norm": 2.1529502868652344, + "learning_rate": 4.196857153735702e-05, + "loss": 0.5835, + "step": 13180 + }, + { + "epoch": 1.3211298642760556, + "grad_norm": 2.1191604137420654, + "learning_rate": 4.195699093495788e-05, + "loss": 0.6044, + "step": 13190 + }, + { + "epoch": 1.322131517003055, + "grad_norm": 1.6969443559646606, + "learning_rate": 4.194540358974639e-05, + "loss": 0.5782, + "step": 13200 + }, + { + "epoch": 1.3231331697300546, + "grad_norm": 2.8751461505889893, + "learning_rate": 4.19338095063302e-05, + "loss": 0.5474, + "step": 13210 + }, + { + "epoch": 1.3241348224570542, + "grad_norm": 1.8890577554702759, + "learning_rate": 4.192220868931958e-05, + "loss": 0.5827, + "step": 13220 + }, + { + "epoch": 1.3251364751840538, + "grad_norm": 2.4971513748168945, + "learning_rate": 4.1910601143327496e-05, + "loss": 0.566, + "step": 13230 + }, + { + "epoch": 1.3261381279110531, + "grad_norm": 2.1435883045196533, + "learning_rate": 4.1898986872969626e-05, + "loss": 0.5873, + "step": 13240 + }, + { + "epoch": 1.3271397806380527, + "grad_norm": 2.1177799701690674, + "learning_rate": 4.188736588286426e-05, + "loss": 0.5642, + "step": 13250 + }, + { + "epoch": 1.3281414333650523, + "grad_norm": 2.0428977012634277, + "learning_rate": 4.187573817763242e-05, + "loss": 0.5201, + "step": 13260 + }, + { + "epoch": 1.329143086092052, + "grad_norm": 2.122833490371704, + "learning_rate": 4.1864103761897746e-05, + "loss": 0.5834, + "step": 13270 + }, + { + "epoch": 1.3301447388190515, + "grad_norm": 2.3523943424224854, + "learning_rate": 4.185246264028659e-05, + "loss": 0.5832, + "step": 13280 + }, + { + "epoch": 1.331146391546051, + "grad_norm": 1.596888780593872, + "learning_rate": 4.184081481742794e-05, + "loss": 0.6354, + "step": 13290 + }, + { + "epoch": 1.3321480442730507, + "grad_norm": 2.279982328414917, + "learning_rate": 4.182916029795346e-05, + "loss": 0.5331, + "step": 13300 + }, + { + "epoch": 1.33314969700005, + "grad_norm": 2.457702875137329, + "learning_rate": 4.181749908649748e-05, + "loss": 0.6325, + "step": 13310 + }, + { + "epoch": 1.3341513497270496, + "grad_norm": 1.8950936794281006, + "learning_rate": 4.180583118769699e-05, + "loss": 0.5559, + "step": 13320 + }, + { + "epoch": 1.3351530024540492, + "grad_norm": 2.870814561843872, + "learning_rate": 4.179415660619164e-05, + "loss": 0.5552, + "step": 13330 + }, + { + "epoch": 1.3361546551810488, + "grad_norm": 1.7458791732788086, + "learning_rate": 4.178247534662372e-05, + "loss": 0.5206, + "step": 13340 + }, + { + "epoch": 1.3371563079080482, + "grad_norm": 2.9492814540863037, + "learning_rate": 4.17707874136382e-05, + "loss": 0.499, + "step": 13350 + }, + { + "epoch": 1.3381579606350478, + "grad_norm": 2.308802604675293, + "learning_rate": 4.1759092811882696e-05, + "loss": 0.6022, + "step": 13360 + }, + { + "epoch": 1.3391596133620474, + "grad_norm": 1.9808357954025269, + "learning_rate": 4.174739154600746e-05, + "loss": 0.5856, + "step": 13370 + }, + { + "epoch": 1.340161266089047, + "grad_norm": 2.5292766094207764, + "learning_rate": 4.173568362066542e-05, + "loss": 0.6017, + "step": 13380 + }, + { + "epoch": 1.3411629188160465, + "grad_norm": 1.9373970031738281, + "learning_rate": 4.172396904051215e-05, + "loss": 0.5282, + "step": 13390 + }, + { + "epoch": 1.3421645715430461, + "grad_norm": 3.495178699493408, + "learning_rate": 4.1712247810205824e-05, + "loss": 0.5122, + "step": 13400 + }, + { + "epoch": 1.3431662242700455, + "grad_norm": 2.2471678256988525, + "learning_rate": 4.170051993440733e-05, + "loss": 0.5307, + "step": 13410 + }, + { + "epoch": 1.344167876997045, + "grad_norm": 2.055947780609131, + "learning_rate": 4.1688785417780155e-05, + "loss": 0.5769, + "step": 13420 + }, + { + "epoch": 1.3451695297240447, + "grad_norm": 2.5820884704589844, + "learning_rate": 4.167704426499042e-05, + "loss": 0.5912, + "step": 13430 + }, + { + "epoch": 1.3461711824510443, + "grad_norm": 2.5380706787109375, + "learning_rate": 4.1665296480706917e-05, + "loss": 0.6127, + "step": 13440 + }, + { + "epoch": 1.3471728351780436, + "grad_norm": 2.453075885772705, + "learning_rate": 4.1653542069601055e-05, + "loss": 0.5635, + "step": 13450 + }, + { + "epoch": 1.3481744879050432, + "grad_norm": 2.359921932220459, + "learning_rate": 4.164178103634688e-05, + "loss": 0.5789, + "step": 13460 + }, + { + "epoch": 1.3491761406320428, + "grad_norm": 2.4070520401000977, + "learning_rate": 4.163001338562108e-05, + "loss": 0.6129, + "step": 13470 + }, + { + "epoch": 1.3501777933590424, + "grad_norm": 2.7085323333740234, + "learning_rate": 4.1618239122102965e-05, + "loss": 0.6108, + "step": 13480 + }, + { + "epoch": 1.351179446086042, + "grad_norm": 2.0704357624053955, + "learning_rate": 4.160645825047447e-05, + "loss": 0.6244, + "step": 13490 + }, + { + "epoch": 1.3521810988130416, + "grad_norm": 2.2796645164489746, + "learning_rate": 4.159467077542016e-05, + "loss": 0.5491, + "step": 13500 + }, + { + "epoch": 1.3531827515400412, + "grad_norm": 2.2350540161132812, + "learning_rate": 4.158287670162725e-05, + "loss": 0.5411, + "step": 13510 + }, + { + "epoch": 1.3541844042670406, + "grad_norm": 2.277599334716797, + "learning_rate": 4.1571076033785556e-05, + "loss": 0.5953, + "step": 13520 + }, + { + "epoch": 1.3551860569940402, + "grad_norm": 2.4963531494140625, + "learning_rate": 4.155926877658751e-05, + "loss": 0.5321, + "step": 13530 + }, + { + "epoch": 1.3561877097210397, + "grad_norm": 1.7574398517608643, + "learning_rate": 4.154745493472817e-05, + "loss": 0.5299, + "step": 13540 + }, + { + "epoch": 1.3571893624480393, + "grad_norm": 2.508462905883789, + "learning_rate": 4.1535634512905225e-05, + "loss": 0.5877, + "step": 13550 + }, + { + "epoch": 1.3581910151750387, + "grad_norm": 3.9622373580932617, + "learning_rate": 4.152380751581897e-05, + "loss": 0.5962, + "step": 13560 + }, + { + "epoch": 1.3591926679020383, + "grad_norm": 2.56437087059021, + "learning_rate": 4.151197394817231e-05, + "loss": 0.5113, + "step": 13570 + }, + { + "epoch": 1.360194320629038, + "grad_norm": 3.0979878902435303, + "learning_rate": 4.150013381467078e-05, + "loss": 0.5881, + "step": 13580 + }, + { + "epoch": 1.3611959733560375, + "grad_norm": 2.280622959136963, + "learning_rate": 4.148828712002252e-05, + "loss": 0.6048, + "step": 13590 + }, + { + "epoch": 1.362197626083037, + "grad_norm": 2.422917366027832, + "learning_rate": 4.147643386893825e-05, + "loss": 0.6643, + "step": 13600 + }, + { + "epoch": 1.3631992788100367, + "grad_norm": 1.9158912897109985, + "learning_rate": 4.146457406613134e-05, + "loss": 0.5696, + "step": 13610 + }, + { + "epoch": 1.364200931537036, + "grad_norm": 2.052349328994751, + "learning_rate": 4.145270771631773e-05, + "loss": 0.5746, + "step": 13620 + }, + { + "epoch": 1.3652025842640356, + "grad_norm": 3.113779306411743, + "learning_rate": 4.144083482421599e-05, + "loss": 0.5859, + "step": 13630 + }, + { + "epoch": 1.3662042369910352, + "grad_norm": 2.7686078548431396, + "learning_rate": 4.1428955394547286e-05, + "loss": 0.5829, + "step": 13640 + }, + { + "epoch": 1.3672058897180348, + "grad_norm": 2.467316150665283, + "learning_rate": 4.141706943203537e-05, + "loss": 0.51, + "step": 13650 + }, + { + "epoch": 1.3682075424450342, + "grad_norm": 2.354332447052002, + "learning_rate": 4.140517694140661e-05, + "loss": 0.5914, + "step": 13660 + }, + { + "epoch": 1.3692091951720338, + "grad_norm": 2.3805387020111084, + "learning_rate": 4.1393277927389946e-05, + "loss": 0.6216, + "step": 13670 + }, + { + "epoch": 1.3702108478990334, + "grad_norm": 1.6980246305465698, + "learning_rate": 4.138137239471693e-05, + "loss": 0.65, + "step": 13680 + }, + { + "epoch": 1.371212500626033, + "grad_norm": 2.8044466972351074, + "learning_rate": 4.136946034812171e-05, + "loss": 0.5768, + "step": 13690 + }, + { + "epoch": 1.3722141533530325, + "grad_norm": 2.284212112426758, + "learning_rate": 4.135754179234102e-05, + "loss": 0.548, + "step": 13700 + }, + { + "epoch": 1.3732158060800321, + "grad_norm": 2.20108699798584, + "learning_rate": 4.134561673211417e-05, + "loss": 0.5497, + "step": 13710 + }, + { + "epoch": 1.3742174588070317, + "grad_norm": 2.3220138549804688, + "learning_rate": 4.133368517218305e-05, + "loss": 0.6195, + "step": 13720 + }, + { + "epoch": 1.375219111534031, + "grad_norm": 2.1386282444000244, + "learning_rate": 4.132174711729217e-05, + "loss": 0.563, + "step": 13730 + }, + { + "epoch": 1.3762207642610307, + "grad_norm": 3.0128397941589355, + "learning_rate": 4.130980257218861e-05, + "loss": 0.5758, + "step": 13740 + }, + { + "epoch": 1.3772224169880303, + "grad_norm": 2.1034698486328125, + "learning_rate": 4.129785154162201e-05, + "loss": 0.536, + "step": 13750 + }, + { + "epoch": 1.3782240697150299, + "grad_norm": 2.1103596687316895, + "learning_rate": 4.12858940303446e-05, + "loss": 0.5476, + "step": 13760 + }, + { + "epoch": 1.3792257224420292, + "grad_norm": 2.607357978820801, + "learning_rate": 4.1273930043111185e-05, + "loss": 0.5823, + "step": 13770 + }, + { + "epoch": 1.3802273751690288, + "grad_norm": 2.7858710289001465, + "learning_rate": 4.1261959584679156e-05, + "loss": 0.5756, + "step": 13780 + }, + { + "epoch": 1.3812290278960284, + "grad_norm": 2.6473164558410645, + "learning_rate": 4.124998265980848e-05, + "loss": 0.6113, + "step": 13790 + }, + { + "epoch": 1.382230680623028, + "grad_norm": 2.2737371921539307, + "learning_rate": 4.1237999273261676e-05, + "loss": 0.6199, + "step": 13800 + }, + { + "epoch": 1.3832323333500276, + "grad_norm": 2.420541524887085, + "learning_rate": 4.1226009429803836e-05, + "loss": 0.5491, + "step": 13810 + }, + { + "epoch": 1.3842339860770272, + "grad_norm": 1.7622172832489014, + "learning_rate": 4.121401313420264e-05, + "loss": 0.4744, + "step": 13820 + }, + { + "epoch": 1.3852356388040268, + "grad_norm": 2.3782007694244385, + "learning_rate": 4.1202010391228306e-05, + "loss": 0.6206, + "step": 13830 + }, + { + "epoch": 1.3862372915310262, + "grad_norm": 2.8256185054779053, + "learning_rate": 4.1190001205653636e-05, + "loss": 0.5888, + "step": 13840 + }, + { + "epoch": 1.3872389442580257, + "grad_norm": 2.546830177307129, + "learning_rate": 4.117798558225399e-05, + "loss": 0.5943, + "step": 13850 + }, + { + "epoch": 1.3882405969850253, + "grad_norm": 3.37953782081604, + "learning_rate": 4.116596352580728e-05, + "loss": 0.5787, + "step": 13860 + }, + { + "epoch": 1.389242249712025, + "grad_norm": 2.2097134590148926, + "learning_rate": 4.1153935041093974e-05, + "loss": 0.5856, + "step": 13870 + }, + { + "epoch": 1.3902439024390243, + "grad_norm": 2.6662843227386475, + "learning_rate": 4.114190013289712e-05, + "loss": 0.5243, + "step": 13880 + }, + { + "epoch": 1.3912455551660239, + "grad_norm": 1.8162366151809692, + "learning_rate": 4.112985880600229e-05, + "loss": 0.5903, + "step": 13890 + }, + { + "epoch": 1.3922472078930235, + "grad_norm": 2.9583468437194824, + "learning_rate": 4.111781106519763e-05, + "loss": 0.5232, + "step": 13900 + }, + { + "epoch": 1.393248860620023, + "grad_norm": 2.8035693168640137, + "learning_rate": 4.1105756915273826e-05, + "loss": 0.5892, + "step": 13910 + }, + { + "epoch": 1.3942505133470227, + "grad_norm": 2.036820650100708, + "learning_rate": 4.10936963610241e-05, + "loss": 0.5729, + "step": 13920 + }, + { + "epoch": 1.3952521660740222, + "grad_norm": 2.5496792793273926, + "learning_rate": 4.108162940724427e-05, + "loss": 0.5305, + "step": 13930 + }, + { + "epoch": 1.3962538188010216, + "grad_norm": 2.4634103775024414, + "learning_rate": 4.1069556058732624e-05, + "loss": 0.6228, + "step": 13940 + }, + { + "epoch": 1.3972554715280212, + "grad_norm": 1.6737140417099, + "learning_rate": 4.105747632029006e-05, + "loss": 0.5499, + "step": 13950 + }, + { + "epoch": 1.3982571242550208, + "grad_norm": 2.705306053161621, + "learning_rate": 4.104539019671997e-05, + "loss": 0.5735, + "step": 13960 + }, + { + "epoch": 1.3992587769820204, + "grad_norm": 2.2632908821105957, + "learning_rate": 4.103329769282832e-05, + "loss": 0.5479, + "step": 13970 + }, + { + "epoch": 1.4002604297090198, + "grad_norm": 1.8262296915054321, + "learning_rate": 4.10211988134236e-05, + "loss": 0.5874, + "step": 13980 + }, + { + "epoch": 1.4012620824360194, + "grad_norm": 2.7786033153533936, + "learning_rate": 4.100909356331682e-05, + "loss": 0.5826, + "step": 13990 + }, + { + "epoch": 1.402263735163019, + "grad_norm": 2.1953024864196777, + "learning_rate": 4.099698194732154e-05, + "loss": 0.5816, + "step": 14000 + }, + { + "epoch": 1.4032653878900185, + "grad_norm": 2.1544125080108643, + "learning_rate": 4.098486397025386e-05, + "loss": 0.5336, + "step": 14010 + }, + { + "epoch": 1.4042670406170181, + "grad_norm": 2.1569459438323975, + "learning_rate": 4.097273963693239e-05, + "loss": 0.5122, + "step": 14020 + }, + { + "epoch": 1.4052686933440177, + "grad_norm": 2.009190559387207, + "learning_rate": 4.096060895217826e-05, + "loss": 0.5175, + "step": 14030 + }, + { + "epoch": 1.4062703460710173, + "grad_norm": 2.4000799655914307, + "learning_rate": 4.094847192081516e-05, + "loss": 0.5778, + "step": 14040 + }, + { + "epoch": 1.4072719987980167, + "grad_norm": 1.9345402717590332, + "learning_rate": 4.0936328547669264e-05, + "loss": 0.5855, + "step": 14050 + }, + { + "epoch": 1.4082736515250163, + "grad_norm": 2.416372060775757, + "learning_rate": 4.09241788375693e-05, + "loss": 0.627, + "step": 14060 + }, + { + "epoch": 1.4092753042520159, + "grad_norm": 2.3259639739990234, + "learning_rate": 4.091202279534651e-05, + "loss": 0.5788, + "step": 14070 + }, + { + "epoch": 1.4102769569790155, + "grad_norm": 2.430366277694702, + "learning_rate": 4.089986042583465e-05, + "loss": 0.5928, + "step": 14080 + }, + { + "epoch": 1.4112786097060148, + "grad_norm": 2.045180320739746, + "learning_rate": 4.088769173386996e-05, + "loss": 0.6082, + "step": 14090 + }, + { + "epoch": 1.4122802624330144, + "grad_norm": 2.674597978591919, + "learning_rate": 4.0875516724291255e-05, + "loss": 0.5751, + "step": 14100 + }, + { + "epoch": 1.413281915160014, + "grad_norm": 2.7040843963623047, + "learning_rate": 4.0863335401939815e-05, + "loss": 0.5585, + "step": 14110 + }, + { + "epoch": 1.4142835678870136, + "grad_norm": 2.3586766719818115, + "learning_rate": 4.085114777165945e-05, + "loss": 0.519, + "step": 14120 + }, + { + "epoch": 1.4152852206140132, + "grad_norm": 1.9052765369415283, + "learning_rate": 4.0838953838296464e-05, + "loss": 0.5796, + "step": 14130 + }, + { + "epoch": 1.4162868733410128, + "grad_norm": 1.9968295097351074, + "learning_rate": 4.08267536066997e-05, + "loss": 0.5678, + "step": 14140 + }, + { + "epoch": 1.4172885260680121, + "grad_norm": 2.5665669441223145, + "learning_rate": 4.081454708172047e-05, + "loss": 0.5907, + "step": 14150 + }, + { + "epoch": 1.4182901787950117, + "grad_norm": 2.1774866580963135, + "learning_rate": 4.080233426821259e-05, + "loss": 0.5392, + "step": 14160 + }, + { + "epoch": 1.4192918315220113, + "grad_norm": 2.24665904045105, + "learning_rate": 4.079011517103241e-05, + "loss": 0.5701, + "step": 14170 + }, + { + "epoch": 1.420293484249011, + "grad_norm": 2.3016774654388428, + "learning_rate": 4.0777889795038736e-05, + "loss": 0.5526, + "step": 14180 + }, + { + "epoch": 1.4212951369760103, + "grad_norm": 3.0300395488739014, + "learning_rate": 4.07656581450929e-05, + "loss": 0.5284, + "step": 14190 + }, + { + "epoch": 1.4222967897030099, + "grad_norm": 2.790093421936035, + "learning_rate": 4.0753420226058724e-05, + "loss": 0.5558, + "step": 14200 + }, + { + "epoch": 1.4232984424300095, + "grad_norm": 2.152724027633667, + "learning_rate": 4.074117604280252e-05, + "loss": 0.6086, + "step": 14210 + }, + { + "epoch": 1.424300095157009, + "grad_norm": 2.25683856010437, + "learning_rate": 4.0728925600193076e-05, + "loss": 0.5255, + "step": 14220 + }, + { + "epoch": 1.4253017478840087, + "grad_norm": 3.285686492919922, + "learning_rate": 4.07166689031017e-05, + "loss": 0.5968, + "step": 14230 + }, + { + "epoch": 1.4263034006110082, + "grad_norm": 2.408635377883911, + "learning_rate": 4.070440595640217e-05, + "loss": 0.5599, + "step": 14240 + }, + { + "epoch": 1.4273050533380078, + "grad_norm": 2.2016947269439697, + "learning_rate": 4.069213676497073e-05, + "loss": 0.5839, + "step": 14250 + }, + { + "epoch": 1.4283067060650072, + "grad_norm": 2.815704107284546, + "learning_rate": 4.067986133368614e-05, + "loss": 0.6123, + "step": 14260 + }, + { + "epoch": 1.4293083587920068, + "grad_norm": 2.9127774238586426, + "learning_rate": 4.0667579667429625e-05, + "loss": 0.5208, + "step": 14270 + }, + { + "epoch": 1.4303100115190064, + "grad_norm": 2.2251622676849365, + "learning_rate": 4.0655291771084896e-05, + "loss": 0.5773, + "step": 14280 + }, + { + "epoch": 1.431311664246006, + "grad_norm": 2.434687614440918, + "learning_rate": 4.064299764953813e-05, + "loss": 0.5782, + "step": 14290 + }, + { + "epoch": 1.4323133169730053, + "grad_norm": 2.1257598400115967, + "learning_rate": 4.0630697307678e-05, + "loss": 0.5693, + "step": 14300 + }, + { + "epoch": 1.433314969700005, + "grad_norm": 2.1355576515197754, + "learning_rate": 4.061839075039562e-05, + "loss": 0.5607, + "step": 14310 + }, + { + "epoch": 1.4343166224270045, + "grad_norm": 2.9941141605377197, + "learning_rate": 4.060607798258459e-05, + "loss": 0.6015, + "step": 14320 + }, + { + "epoch": 1.4353182751540041, + "grad_norm": 1.9984222650527954, + "learning_rate": 4.059375900914102e-05, + "loss": 0.5565, + "step": 14330 + }, + { + "epoch": 1.4363199278810037, + "grad_norm": 2.6559393405914307, + "learning_rate": 4.058143383496341e-05, + "loss": 0.5428, + "step": 14340 + }, + { + "epoch": 1.4373215806080033, + "grad_norm": 2.177220106124878, + "learning_rate": 4.05691024649528e-05, + "loss": 0.607, + "step": 14350 + }, + { + "epoch": 1.438323233335003, + "grad_norm": 2.5836830139160156, + "learning_rate": 4.055676490401264e-05, + "loss": 0.5665, + "step": 14360 + }, + { + "epoch": 1.4393248860620023, + "grad_norm": 3.489238977432251, + "learning_rate": 4.0544421157048875e-05, + "loss": 0.5957, + "step": 14370 + }, + { + "epoch": 1.4403265387890019, + "grad_norm": 2.5777053833007812, + "learning_rate": 4.053207122896989e-05, + "loss": 0.5574, + "step": 14380 + }, + { + "epoch": 1.4413281915160014, + "grad_norm": 2.3161325454711914, + "learning_rate": 4.0519715124686535e-05, + "loss": 0.5757, + "step": 14390 + }, + { + "epoch": 1.4423298442430008, + "grad_norm": 2.5164365768432617, + "learning_rate": 4.050735284911212e-05, + "loss": 0.5554, + "step": 14400 + }, + { + "epoch": 1.4433314969700004, + "grad_norm": 2.887172222137451, + "learning_rate": 4.049498440716241e-05, + "loss": 0.6013, + "step": 14410 + }, + { + "epoch": 1.444333149697, + "grad_norm": 1.7889225482940674, + "learning_rate": 4.0482609803755604e-05, + "loss": 0.5402, + "step": 14420 + }, + { + "epoch": 1.4453348024239996, + "grad_norm": 2.003990650177002, + "learning_rate": 4.047022904381238e-05, + "loss": 0.5742, + "step": 14430 + }, + { + "epoch": 1.4463364551509992, + "grad_norm": 2.028975248336792, + "learning_rate": 4.045784213225584e-05, + "loss": 0.5762, + "step": 14440 + }, + { + "epoch": 1.4473381078779988, + "grad_norm": 3.170283317565918, + "learning_rate": 4.0445449074011535e-05, + "loss": 0.5331, + "step": 14450 + }, + { + "epoch": 1.4483397606049984, + "grad_norm": 2.5726683139801025, + "learning_rate": 4.0433049874007475e-05, + "loss": 0.5711, + "step": 14460 + }, + { + "epoch": 1.4493414133319977, + "grad_norm": 2.1386616230010986, + "learning_rate": 4.042064453717411e-05, + "loss": 0.5832, + "step": 14470 + }, + { + "epoch": 1.4503430660589973, + "grad_norm": 3.0277740955352783, + "learning_rate": 4.040823306844431e-05, + "loss": 0.5874, + "step": 14480 + }, + { + "epoch": 1.451344718785997, + "grad_norm": 2.120304822921753, + "learning_rate": 4.039581547275339e-05, + "loss": 0.5645, + "step": 14490 + }, + { + "epoch": 1.4523463715129965, + "grad_norm": 1.4980655908584595, + "learning_rate": 4.038339175503914e-05, + "loss": 0.5484, + "step": 14500 + }, + { + "epoch": 1.4533480242399959, + "grad_norm": 2.6482741832733154, + "learning_rate": 4.037096192024171e-05, + "loss": 0.573, + "step": 14510 + }, + { + "epoch": 1.4543496769669955, + "grad_norm": 2.6279139518737793, + "learning_rate": 4.035852597330375e-05, + "loss": 0.5177, + "step": 14520 + }, + { + "epoch": 1.455351329693995, + "grad_norm": 2.7452750205993652, + "learning_rate": 4.034608391917032e-05, + "loss": 0.489, + "step": 14530 + }, + { + "epoch": 1.4563529824209946, + "grad_norm": 1.965122103691101, + "learning_rate": 4.033363576278889e-05, + "loss": 0.575, + "step": 14540 + }, + { + "epoch": 1.4573546351479942, + "grad_norm": 2.4850306510925293, + "learning_rate": 4.0321181509109374e-05, + "loss": 0.5551, + "step": 14550 + }, + { + "epoch": 1.4583562878749938, + "grad_norm": 2.552098274230957, + "learning_rate": 4.0308721163084105e-05, + "loss": 0.5781, + "step": 14560 + }, + { + "epoch": 1.4593579406019934, + "grad_norm": 2.414721727371216, + "learning_rate": 4.029625472966785e-05, + "loss": 0.5155, + "step": 14570 + }, + { + "epoch": 1.4603595933289928, + "grad_norm": 2.572413444519043, + "learning_rate": 4.028378221381778e-05, + "loss": 0.5722, + "step": 14580 + }, + { + "epoch": 1.4613612460559924, + "grad_norm": 2.4709455966949463, + "learning_rate": 4.027130362049348e-05, + "loss": 0.6083, + "step": 14590 + }, + { + "epoch": 1.462362898782992, + "grad_norm": 2.268739700317383, + "learning_rate": 4.025881895465699e-05, + "loss": 0.541, + "step": 14600 + }, + { + "epoch": 1.4633645515099916, + "grad_norm": 2.681468963623047, + "learning_rate": 4.024632822127271e-05, + "loss": 0.5514, + "step": 14610 + }, + { + "epoch": 1.464366204236991, + "grad_norm": 2.1005759239196777, + "learning_rate": 4.023383142530751e-05, + "loss": 0.5443, + "step": 14620 + }, + { + "epoch": 1.4653678569639905, + "grad_norm": 2.0630645751953125, + "learning_rate": 4.022132857173061e-05, + "loss": 0.5303, + "step": 14630 + }, + { + "epoch": 1.4663695096909901, + "grad_norm": 2.8516407012939453, + "learning_rate": 4.0208819665513684e-05, + "loss": 0.5339, + "step": 14640 + }, + { + "epoch": 1.4673711624179897, + "grad_norm": 2.4817099571228027, + "learning_rate": 4.0196304711630805e-05, + "loss": 0.5363, + "step": 14650 + }, + { + "epoch": 1.4683728151449893, + "grad_norm": 3.3822922706604004, + "learning_rate": 4.0183783715058444e-05, + "loss": 0.528, + "step": 14660 + }, + { + "epoch": 1.4693744678719889, + "grad_norm": 2.3686294555664062, + "learning_rate": 4.017125668077546e-05, + "loss": 0.5148, + "step": 14670 + }, + { + "epoch": 1.4703761205989883, + "grad_norm": 2.2232017517089844, + "learning_rate": 4.0158723613763145e-05, + "loss": 0.5897, + "step": 14680 + }, + { + "epoch": 1.4713777733259878, + "grad_norm": 2.1292519569396973, + "learning_rate": 4.014618451900517e-05, + "loss": 0.59, + "step": 14690 + }, + { + "epoch": 1.4723794260529874, + "grad_norm": 2.4203102588653564, + "learning_rate": 4.013363940148759e-05, + "loss": 0.5647, + "step": 14700 + }, + { + "epoch": 1.473381078779987, + "grad_norm": 1.9357800483703613, + "learning_rate": 4.0121088266198906e-05, + "loss": 0.5342, + "step": 14710 + }, + { + "epoch": 1.4743827315069864, + "grad_norm": 2.3694651126861572, + "learning_rate": 4.0108531118129934e-05, + "loss": 0.5526, + "step": 14720 + }, + { + "epoch": 1.475384384233986, + "grad_norm": 2.351116180419922, + "learning_rate": 4.009596796227396e-05, + "loss": 0.489, + "step": 14730 + }, + { + "epoch": 1.4763860369609856, + "grad_norm": 2.0364739894866943, + "learning_rate": 4.0083398803626606e-05, + "loss": 0.6545, + "step": 14740 + }, + { + "epoch": 1.4773876896879852, + "grad_norm": 3.0939629077911377, + "learning_rate": 4.00708236471859e-05, + "loss": 0.5798, + "step": 14750 + }, + { + "epoch": 1.4783893424149848, + "grad_norm": 1.8895127773284912, + "learning_rate": 4.005824249795225e-05, + "loss": 0.5247, + "step": 14760 + }, + { + "epoch": 1.4793909951419844, + "grad_norm": 2.274951934814453, + "learning_rate": 4.004565536092845e-05, + "loss": 0.5154, + "step": 14770 + }, + { + "epoch": 1.480392647868984, + "grad_norm": 2.3646650314331055, + "learning_rate": 4.0033062241119676e-05, + "loss": 0.5363, + "step": 14780 + }, + { + "epoch": 1.4813943005959833, + "grad_norm": 2.129894256591797, + "learning_rate": 4.002046314353348e-05, + "loss": 0.537, + "step": 14790 + }, + { + "epoch": 1.482395953322983, + "grad_norm": 2.5647120475769043, + "learning_rate": 4.000785807317981e-05, + "loss": 0.5885, + "step": 14800 + }, + { + "epoch": 1.4833976060499825, + "grad_norm": 1.9728856086730957, + "learning_rate": 3.999524703507095e-05, + "loss": 0.5233, + "step": 14810 + }, + { + "epoch": 1.484399258776982, + "grad_norm": 2.2294158935546875, + "learning_rate": 3.998263003422159e-05, + "loss": 0.5812, + "step": 14820 + }, + { + "epoch": 1.4854009115039815, + "grad_norm": 2.4784843921661377, + "learning_rate": 3.997000707564877e-05, + "loss": 0.5809, + "step": 14830 + }, + { + "epoch": 1.486402564230981, + "grad_norm": 1.835519552230835, + "learning_rate": 3.995737816437192e-05, + "loss": 0.5403, + "step": 14840 + }, + { + "epoch": 1.4874042169579806, + "grad_norm": 2.5397074222564697, + "learning_rate": 3.994474330541282e-05, + "loss": 0.6364, + "step": 14850 + }, + { + "epoch": 1.4884058696849802, + "grad_norm": 2.101902961730957, + "learning_rate": 3.9932102503795616e-05, + "loss": 0.5697, + "step": 14860 + }, + { + "epoch": 1.4894075224119798, + "grad_norm": 2.234866142272949, + "learning_rate": 3.991945576454683e-05, + "loss": 0.4622, + "step": 14870 + }, + { + "epoch": 1.4904091751389794, + "grad_norm": 2.1409013271331787, + "learning_rate": 3.990680309269534e-05, + "loss": 0.5951, + "step": 14880 + }, + { + "epoch": 1.4914108278659788, + "grad_norm": 1.8268654346466064, + "learning_rate": 3.9894144493272376e-05, + "loss": 0.6112, + "step": 14890 + }, + { + "epoch": 1.4924124805929784, + "grad_norm": 2.455508232116699, + "learning_rate": 3.988147997131152e-05, + "loss": 0.6239, + "step": 14900 + }, + { + "epoch": 1.493414133319978, + "grad_norm": 2.3702023029327393, + "learning_rate": 3.986880953184874e-05, + "loss": 0.5383, + "step": 14910 + }, + { + "epoch": 1.4944157860469776, + "grad_norm": 2.136279821395874, + "learning_rate": 3.985613317992231e-05, + "loss": 0.6343, + "step": 14920 + }, + { + "epoch": 1.495417438773977, + "grad_norm": 2.7083253860473633, + "learning_rate": 3.98434509205729e-05, + "loss": 0.5874, + "step": 14930 + }, + { + "epoch": 1.4964190915009765, + "grad_norm": 2.658341407775879, + "learning_rate": 3.9830762758843496e-05, + "loss": 0.5574, + "step": 14940 + }, + { + "epoch": 1.497420744227976, + "grad_norm": 1.8282076120376587, + "learning_rate": 3.981806869977945e-05, + "loss": 0.5252, + "step": 14950 + }, + { + "epoch": 1.4984223969549757, + "grad_norm": 2.072915554046631, + "learning_rate": 3.980536874842846e-05, + "loss": 0.5425, + "step": 14960 + }, + { + "epoch": 1.4994240496819753, + "grad_norm": 2.2089407444000244, + "learning_rate": 3.979266290984055e-05, + "loss": 0.5542, + "step": 14970 + }, + { + "epoch": 1.5004257024089749, + "grad_norm": 1.9677778482437134, + "learning_rate": 3.97799511890681e-05, + "loss": 0.59, + "step": 14980 + }, + { + "epoch": 1.5014273551359745, + "grad_norm": 1.9966034889221191, + "learning_rate": 3.976723359116583e-05, + "loss": 0.616, + "step": 14990 + }, + { + "epoch": 1.502429007862974, + "grad_norm": 2.4626691341400146, + "learning_rate": 3.975451012119078e-05, + "loss": 0.5608, + "step": 15000 + }, + { + "epoch": 1.5034306605899734, + "grad_norm": 1.9694550037384033, + "learning_rate": 3.974178078420234e-05, + "loss": 0.5136, + "step": 15010 + }, + { + "epoch": 1.504432313316973, + "grad_norm": 2.206510066986084, + "learning_rate": 3.9729045585262235e-05, + "loss": 0.5868, + "step": 15020 + }, + { + "epoch": 1.5054339660439724, + "grad_norm": 1.8147963285446167, + "learning_rate": 3.9716304529434504e-05, + "loss": 0.5395, + "step": 15030 + }, + { + "epoch": 1.506435618770972, + "grad_norm": 2.563148021697998, + "learning_rate": 3.970355762178555e-05, + "loss": 0.5693, + "step": 15040 + }, + { + "epoch": 1.5074372714979716, + "grad_norm": 3.0714714527130127, + "learning_rate": 3.9690804867384046e-05, + "loss": 0.6563, + "step": 15050 + }, + { + "epoch": 1.5084389242249712, + "grad_norm": 2.1805124282836914, + "learning_rate": 3.967804627130105e-05, + "loss": 0.5471, + "step": 15060 + }, + { + "epoch": 1.5094405769519708, + "grad_norm": 2.3275880813598633, + "learning_rate": 3.9665281838609905e-05, + "loss": 0.4971, + "step": 15070 + }, + { + "epoch": 1.5104422296789703, + "grad_norm": 2.1147117614746094, + "learning_rate": 3.9652511574386286e-05, + "loss": 0.5394, + "step": 15080 + }, + { + "epoch": 1.51144388240597, + "grad_norm": 2.3367505073547363, + "learning_rate": 3.9639735483708195e-05, + "loss": 0.5847, + "step": 15090 + }, + { + "epoch": 1.5124455351329695, + "grad_norm": 2.4513463973999023, + "learning_rate": 3.9626953571655926e-05, + "loss": 0.5493, + "step": 15100 + }, + { + "epoch": 1.513447187859969, + "grad_norm": 2.5679237842559814, + "learning_rate": 3.961416584331212e-05, + "loss": 0.6568, + "step": 15110 + }, + { + "epoch": 1.5144488405869685, + "grad_norm": 2.4463155269622803, + "learning_rate": 3.960137230376171e-05, + "loss": 0.5491, + "step": 15120 + }, + { + "epoch": 1.515450493313968, + "grad_norm": 3.207108497619629, + "learning_rate": 3.958857295809195e-05, + "loss": 0.6069, + "step": 15130 + }, + { + "epoch": 1.5164521460409675, + "grad_norm": 2.2444918155670166, + "learning_rate": 3.957576781139238e-05, + "loss": 0.5662, + "step": 15140 + }, + { + "epoch": 1.517453798767967, + "grad_norm": 3.2347822189331055, + "learning_rate": 3.9562956868754884e-05, + "loss": 0.6197, + "step": 15150 + }, + { + "epoch": 1.5184554514949666, + "grad_norm": 2.22012996673584, + "learning_rate": 3.955014013527363e-05, + "loss": 0.6081, + "step": 15160 + }, + { + "epoch": 1.5194571042219662, + "grad_norm": 2.368472099304199, + "learning_rate": 3.9537317616045075e-05, + "loss": 0.4841, + "step": 15170 + }, + { + "epoch": 1.5204587569489658, + "grad_norm": 1.771808385848999, + "learning_rate": 3.952448931616801e-05, + "loss": 0.5171, + "step": 15180 + }, + { + "epoch": 1.5214604096759654, + "grad_norm": 2.661496877670288, + "learning_rate": 3.9511655240743494e-05, + "loss": 0.5657, + "step": 15190 + }, + { + "epoch": 1.522462062402965, + "grad_norm": 2.4911489486694336, + "learning_rate": 3.949881539487489e-05, + "loss": 0.5156, + "step": 15200 + }, + { + "epoch": 1.5234637151299646, + "grad_norm": 2.6235358715057373, + "learning_rate": 3.948596978366787e-05, + "loss": 0.5028, + "step": 15210 + }, + { + "epoch": 1.524465367856964, + "grad_norm": 2.052980661392212, + "learning_rate": 3.9473118412230406e-05, + "loss": 0.5424, + "step": 15220 + }, + { + "epoch": 1.5254670205839636, + "grad_norm": 2.1379051208496094, + "learning_rate": 3.9460261285672716e-05, + "loss": 0.6054, + "step": 15230 + }, + { + "epoch": 1.5264686733109631, + "grad_norm": 2.0419297218322754, + "learning_rate": 3.944739840910733e-05, + "loss": 0.5623, + "step": 15240 + }, + { + "epoch": 1.5274703260379625, + "grad_norm": 1.9566359519958496, + "learning_rate": 3.9434529787649096e-05, + "loss": 0.579, + "step": 15250 + }, + { + "epoch": 1.528471978764962, + "grad_norm": 2.9252963066101074, + "learning_rate": 3.9421655426415094e-05, + "loss": 0.5774, + "step": 15260 + }, + { + "epoch": 1.5294736314919617, + "grad_norm": 1.8522799015045166, + "learning_rate": 3.940877533052473e-05, + "loss": 0.546, + "step": 15270 + }, + { + "epoch": 1.5304752842189613, + "grad_norm": 1.8291130065917969, + "learning_rate": 3.939588950509966e-05, + "loss": 0.4684, + "step": 15280 + }, + { + "epoch": 1.5314769369459609, + "grad_norm": 2.20719313621521, + "learning_rate": 3.9382997955263826e-05, + "loss": 0.6081, + "step": 15290 + }, + { + "epoch": 1.5324785896729605, + "grad_norm": 2.3221805095672607, + "learning_rate": 3.937010068614346e-05, + "loss": 0.5657, + "step": 15300 + }, + { + "epoch": 1.53348024239996, + "grad_norm": 2.302234172821045, + "learning_rate": 3.935719770286706e-05, + "loss": 0.6058, + "step": 15310 + }, + { + "epoch": 1.5344818951269596, + "grad_norm": 1.9883896112442017, + "learning_rate": 3.934428901056538e-05, + "loss": 0.5818, + "step": 15320 + }, + { + "epoch": 1.535483547853959, + "grad_norm": 2.019928216934204, + "learning_rate": 3.9331374614371485e-05, + "loss": 0.5142, + "step": 15330 + }, + { + "epoch": 1.5364852005809586, + "grad_norm": 2.7909915447235107, + "learning_rate": 3.931845451942065e-05, + "loss": 0.5924, + "step": 15340 + }, + { + "epoch": 1.537486853307958, + "grad_norm": 2.2816619873046875, + "learning_rate": 3.930552873085047e-05, + "loss": 0.4988, + "step": 15350 + }, + { + "epoch": 1.5384885060349576, + "grad_norm": 2.1884818077087402, + "learning_rate": 3.929259725380077e-05, + "loss": 0.4728, + "step": 15360 + }, + { + "epoch": 1.5394901587619572, + "grad_norm": 1.680711269378662, + "learning_rate": 3.927966009341365e-05, + "loss": 0.5373, + "step": 15370 + }, + { + "epoch": 1.5404918114889568, + "grad_norm": 1.9891623258590698, + "learning_rate": 3.9266717254833475e-05, + "loss": 0.4977, + "step": 15380 + }, + { + "epoch": 1.5414934642159563, + "grad_norm": 2.236722946166992, + "learning_rate": 3.9253768743206867e-05, + "loss": 0.5669, + "step": 15390 + }, + { + "epoch": 1.542495116942956, + "grad_norm": 2.8090734481811523, + "learning_rate": 3.924081456368268e-05, + "loss": 0.6334, + "step": 15400 + }, + { + "epoch": 1.5434967696699555, + "grad_norm": 2.2476439476013184, + "learning_rate": 3.922785472141205e-05, + "loss": 0.5412, + "step": 15410 + }, + { + "epoch": 1.5444984223969551, + "grad_norm": 2.49711012840271, + "learning_rate": 3.9214889221548365e-05, + "loss": 0.5622, + "step": 15420 + }, + { + "epoch": 1.5455000751239545, + "grad_norm": 2.0667357444763184, + "learning_rate": 3.920191806924723e-05, + "loss": 0.5434, + "step": 15430 + }, + { + "epoch": 1.546501727850954, + "grad_norm": 2.0976450443267822, + "learning_rate": 3.9188941269666544e-05, + "loss": 0.5828, + "step": 15440 + }, + { + "epoch": 1.5475033805779537, + "grad_norm": 2.21097469329834, + "learning_rate": 3.9175958827966416e-05, + "loss": 0.5162, + "step": 15450 + }, + { + "epoch": 1.548505033304953, + "grad_norm": 2.5535941123962402, + "learning_rate": 3.9162970749309207e-05, + "loss": 0.6123, + "step": 15460 + }, + { + "epoch": 1.5495066860319526, + "grad_norm": 2.2607061862945557, + "learning_rate": 3.9149977038859534e-05, + "loss": 0.6232, + "step": 15470 + }, + { + "epoch": 1.5505083387589522, + "grad_norm": 1.9447360038757324, + "learning_rate": 3.913697770178423e-05, + "loss": 0.5005, + "step": 15480 + }, + { + "epoch": 1.5515099914859518, + "grad_norm": 1.894853115081787, + "learning_rate": 3.9123972743252394e-05, + "loss": 0.5458, + "step": 15490 + }, + { + "epoch": 1.5525116442129514, + "grad_norm": 1.628859281539917, + "learning_rate": 3.9110962168435315e-05, + "loss": 0.5476, + "step": 15500 + }, + { + "epoch": 1.553513296939951, + "grad_norm": 2.6020190715789795, + "learning_rate": 3.9097945982506584e-05, + "loss": 0.5331, + "step": 15510 + }, + { + "epoch": 1.5545149496669506, + "grad_norm": 2.6441216468811035, + "learning_rate": 3.908492419064196e-05, + "loss": 0.578, + "step": 15520 + }, + { + "epoch": 1.5555166023939502, + "grad_norm": 1.7780492305755615, + "learning_rate": 3.907189679801945e-05, + "loss": 0.501, + "step": 15530 + }, + { + "epoch": 1.5565182551209495, + "grad_norm": 2.1867527961730957, + "learning_rate": 3.90588638098193e-05, + "loss": 0.5687, + "step": 15540 + }, + { + "epoch": 1.5575199078479491, + "grad_norm": 2.3934149742126465, + "learning_rate": 3.904582523122398e-05, + "loss": 0.595, + "step": 15550 + }, + { + "epoch": 1.5585215605749485, + "grad_norm": 2.72482967376709, + "learning_rate": 3.9032781067418176e-05, + "loss": 0.5956, + "step": 15560 + }, + { + "epoch": 1.559523213301948, + "grad_norm": 2.589207410812378, + "learning_rate": 3.9019731323588785e-05, + "loss": 0.5599, + "step": 15570 + }, + { + "epoch": 1.5605248660289477, + "grad_norm": 2.120452404022217, + "learning_rate": 3.900667600492494e-05, + "loss": 0.5554, + "step": 15580 + }, + { + "epoch": 1.5615265187559473, + "grad_norm": 2.300694704055786, + "learning_rate": 3.8993615116617985e-05, + "loss": 0.5926, + "step": 15590 + }, + { + "epoch": 1.5625281714829469, + "grad_norm": 2.3215324878692627, + "learning_rate": 3.8980548663861485e-05, + "loss": 0.5243, + "step": 15600 + }, + { + "epoch": 1.5635298242099465, + "grad_norm": 2.2662432193756104, + "learning_rate": 3.8967476651851196e-05, + "loss": 0.5805, + "step": 15610 + }, + { + "epoch": 1.564531476936946, + "grad_norm": 2.3407816886901855, + "learning_rate": 3.895439908578511e-05, + "loss": 0.5136, + "step": 15620 + }, + { + "epoch": 1.5655331296639456, + "grad_norm": 2.640747547149658, + "learning_rate": 3.894131597086341e-05, + "loss": 0.5318, + "step": 15630 + }, + { + "epoch": 1.566534782390945, + "grad_norm": 2.426098585128784, + "learning_rate": 3.89282273122885e-05, + "loss": 0.5826, + "step": 15640 + }, + { + "epoch": 1.5675364351179446, + "grad_norm": 2.3364737033843994, + "learning_rate": 3.891513311526498e-05, + "loss": 0.4968, + "step": 15650 + }, + { + "epoch": 1.5685380878449442, + "grad_norm": 2.151834011077881, + "learning_rate": 3.890203338499965e-05, + "loss": 0.5438, + "step": 15660 + }, + { + "epoch": 1.5695397405719436, + "grad_norm": 2.491666555404663, + "learning_rate": 3.8888928126701515e-05, + "loss": 0.5787, + "step": 15670 + }, + { + "epoch": 1.5705413932989432, + "grad_norm": 2.0937247276306152, + "learning_rate": 3.887581734558177e-05, + "loss": 0.5191, + "step": 15680 + }, + { + "epoch": 1.5715430460259427, + "grad_norm": 2.5927858352661133, + "learning_rate": 3.886270104685382e-05, + "loss": 0.5813, + "step": 15690 + }, + { + "epoch": 1.5725446987529423, + "grad_norm": 1.6065778732299805, + "learning_rate": 3.884957923573325e-05, + "loss": 0.5401, + "step": 15700 + }, + { + "epoch": 1.573546351479942, + "grad_norm": 3.2194879055023193, + "learning_rate": 3.883645191743786e-05, + "loss": 0.5814, + "step": 15710 + }, + { + "epoch": 1.5745480042069415, + "grad_norm": 2.245382308959961, + "learning_rate": 3.88233190971876e-05, + "loss": 0.586, + "step": 15720 + }, + { + "epoch": 1.5755496569339411, + "grad_norm": 2.3317959308624268, + "learning_rate": 3.8810180780204645e-05, + "loss": 0.5584, + "step": 15730 + }, + { + "epoch": 1.5765513096609407, + "grad_norm": 2.8339760303497314, + "learning_rate": 3.8797036971713344e-05, + "loss": 0.5879, + "step": 15740 + }, + { + "epoch": 1.57755296238794, + "grad_norm": 2.1983373165130615, + "learning_rate": 3.8783887676940225e-05, + "loss": 0.6056, + "step": 15750 + }, + { + "epoch": 1.5785546151149397, + "grad_norm": 2.6038060188293457, + "learning_rate": 3.8770732901113994e-05, + "loss": 0.5227, + "step": 15760 + }, + { + "epoch": 1.5795562678419393, + "grad_norm": 3.548153877258301, + "learning_rate": 3.875757264946555e-05, + "loss": 0.5142, + "step": 15770 + }, + { + "epoch": 1.5805579205689386, + "grad_norm": 2.2880027294158936, + "learning_rate": 3.874440692722796e-05, + "loss": 0.5454, + "step": 15780 + }, + { + "epoch": 1.5815595732959382, + "grad_norm": 2.094909191131592, + "learning_rate": 3.8731235739636476e-05, + "loss": 0.5399, + "step": 15790 + }, + { + "epoch": 1.5825612260229378, + "grad_norm": 2.4892055988311768, + "learning_rate": 3.87180590919285e-05, + "loss": 0.5292, + "step": 15800 + }, + { + "epoch": 1.5835628787499374, + "grad_norm": 2.3183560371398926, + "learning_rate": 3.870487698934363e-05, + "loss": 0.5417, + "step": 15810 + }, + { + "epoch": 1.584564531476937, + "grad_norm": 2.3660850524902344, + "learning_rate": 3.869168943712362e-05, + "loss": 0.5847, + "step": 15820 + }, + { + "epoch": 1.5855661842039366, + "grad_norm": 2.3487000465393066, + "learning_rate": 3.8678496440512415e-05, + "loss": 0.5391, + "step": 15830 + }, + { + "epoch": 1.5865678369309362, + "grad_norm": 1.8460687398910522, + "learning_rate": 3.8665298004756075e-05, + "loss": 0.5236, + "step": 15840 + }, + { + "epoch": 1.5875694896579355, + "grad_norm": 2.310218334197998, + "learning_rate": 3.8652094135102865e-05, + "loss": 0.5462, + "step": 15850 + }, + { + "epoch": 1.5885711423849351, + "grad_norm": 2.01802659034729, + "learning_rate": 3.8638884836803205e-05, + "loss": 0.5358, + "step": 15860 + }, + { + "epoch": 1.5895727951119347, + "grad_norm": 1.9128429889678955, + "learning_rate": 3.8625670115109667e-05, + "loss": 0.6124, + "step": 15870 + }, + { + "epoch": 1.590574447838934, + "grad_norm": 2.289938449859619, + "learning_rate": 3.8612449975276965e-05, + "loss": 0.5916, + "step": 15880 + }, + { + "epoch": 1.5915761005659337, + "grad_norm": 3.483588457107544, + "learning_rate": 3.8599224422561997e-05, + "loss": 0.5341, + "step": 15890 + }, + { + "epoch": 1.5925777532929333, + "grad_norm": 2.2982561588287354, + "learning_rate": 3.858599346222379e-05, + "loss": 0.5238, + "step": 15900 + }, + { + "epoch": 1.5935794060199329, + "grad_norm": 3.378920555114746, + "learning_rate": 3.857275709952354e-05, + "loss": 0.5332, + "step": 15910 + }, + { + "epoch": 1.5945810587469325, + "grad_norm": 1.8684322834014893, + "learning_rate": 3.855951533972457e-05, + "loss": 0.5602, + "step": 15920 + }, + { + "epoch": 1.595582711473932, + "grad_norm": 2.5684077739715576, + "learning_rate": 3.854626818809237e-05, + "loss": 0.6066, + "step": 15930 + }, + { + "epoch": 1.5965843642009316, + "grad_norm": 2.02518630027771, + "learning_rate": 3.853301564989455e-05, + "loss": 0.5548, + "step": 15940 + }, + { + "epoch": 1.5975860169279312, + "grad_norm": 2.17681884765625, + "learning_rate": 3.8519757730400894e-05, + "loss": 0.5176, + "step": 15950 + }, + { + "epoch": 1.5985876696549306, + "grad_norm": 2.2103607654571533, + "learning_rate": 3.85064944348833e-05, + "loss": 0.5658, + "step": 15960 + }, + { + "epoch": 1.5995893223819302, + "grad_norm": 1.7236950397491455, + "learning_rate": 3.849322576861582e-05, + "loss": 0.5217, + "step": 15970 + }, + { + "epoch": 1.6005909751089298, + "grad_norm": 1.8692692518234253, + "learning_rate": 3.847995173687461e-05, + "loss": 0.6032, + "step": 15980 + }, + { + "epoch": 1.6015926278359292, + "grad_norm": 2.6671016216278076, + "learning_rate": 3.8466672344938005e-05, + "loss": 0.6187, + "step": 15990 + }, + { + "epoch": 1.6025942805629287, + "grad_norm": 2.8001585006713867, + "learning_rate": 3.845338759808644e-05, + "loss": 0.5646, + "step": 16000 + }, + { + "epoch": 1.6035959332899283, + "grad_norm": 2.768728733062744, + "learning_rate": 3.844009750160249e-05, + "loss": 0.6015, + "step": 16010 + }, + { + "epoch": 1.604597586016928, + "grad_norm": 3.4323201179504395, + "learning_rate": 3.842680206077086e-05, + "loss": 0.5414, + "step": 16020 + }, + { + "epoch": 1.6055992387439275, + "grad_norm": 2.223552942276001, + "learning_rate": 3.841350128087837e-05, + "loss": 0.5509, + "step": 16030 + }, + { + "epoch": 1.606600891470927, + "grad_norm": 2.1531848907470703, + "learning_rate": 3.840019516721398e-05, + "loss": 0.6104, + "step": 16040 + }, + { + "epoch": 1.6076025441979267, + "grad_norm": 2.558938503265381, + "learning_rate": 3.8386883725068745e-05, + "loss": 0.5257, + "step": 16050 + }, + { + "epoch": 1.6086041969249263, + "grad_norm": 2.4752094745635986, + "learning_rate": 3.837356695973586e-05, + "loss": 0.5968, + "step": 16060 + }, + { + "epoch": 1.6096058496519257, + "grad_norm": 2.3830080032348633, + "learning_rate": 3.836024487651064e-05, + "loss": 0.5585, + "step": 16070 + }, + { + "epoch": 1.6106075023789252, + "grad_norm": 2.419825315475464, + "learning_rate": 3.834691748069049e-05, + "loss": 0.5552, + "step": 16080 + }, + { + "epoch": 1.6116091551059246, + "grad_norm": 2.2571637630462646, + "learning_rate": 3.833358477757496e-05, + "loss": 0.5709, + "step": 16090 + }, + { + "epoch": 1.6126108078329242, + "grad_norm": 2.4724700450897217, + "learning_rate": 3.8320246772465674e-05, + "loss": 0.553, + "step": 16100 + }, + { + "epoch": 1.6136124605599238, + "grad_norm": 2.4371488094329834, + "learning_rate": 3.8306903470666385e-05, + "loss": 0.5119, + "step": 16110 + }, + { + "epoch": 1.6146141132869234, + "grad_norm": 2.0440852642059326, + "learning_rate": 3.829355487748297e-05, + "loss": 0.6072, + "step": 16120 + }, + { + "epoch": 1.615615766013923, + "grad_norm": 1.7289458513259888, + "learning_rate": 3.828020099822338e-05, + "loss": 0.5656, + "step": 16130 + }, + { + "epoch": 1.6166174187409226, + "grad_norm": 2.366450071334839, + "learning_rate": 3.826684183819768e-05, + "loss": 0.5754, + "step": 16140 + }, + { + "epoch": 1.6176190714679222, + "grad_norm": 1.9177955389022827, + "learning_rate": 3.825347740271802e-05, + "loss": 0.5357, + "step": 16150 + }, + { + "epoch": 1.6186207241949218, + "grad_norm": 2.2635581493377686, + "learning_rate": 3.824010769709868e-05, + "loss": 0.5734, + "step": 16160 + }, + { + "epoch": 1.6196223769219211, + "grad_norm": 2.226444959640503, + "learning_rate": 3.8226732726656005e-05, + "loss": 0.5329, + "step": 16170 + }, + { + "epoch": 1.6206240296489207, + "grad_norm": 1.7379987239837646, + "learning_rate": 3.821335249670845e-05, + "loss": 0.5442, + "step": 16180 + }, + { + "epoch": 1.6216256823759203, + "grad_norm": 2.6636769771575928, + "learning_rate": 3.8199967012576566e-05, + "loss": 0.5654, + "step": 16190 + }, + { + "epoch": 1.6226273351029197, + "grad_norm": 2.491895914077759, + "learning_rate": 3.818657627958296e-05, + "loss": 0.5794, + "step": 16200 + }, + { + "epoch": 1.6236289878299193, + "grad_norm": 2.4376542568206787, + "learning_rate": 3.817318030305238e-05, + "loss": 0.5393, + "step": 16210 + }, + { + "epoch": 1.6246306405569189, + "grad_norm": 2.03060245513916, + "learning_rate": 3.815977908831161e-05, + "loss": 0.5238, + "step": 16220 + }, + { + "epoch": 1.6256322932839185, + "grad_norm": 1.902113914489746, + "learning_rate": 3.8146372640689536e-05, + "loss": 0.5333, + "step": 16230 + }, + { + "epoch": 1.626633946010918, + "grad_norm": 2.258549928665161, + "learning_rate": 3.8132960965517135e-05, + "loss": 0.5551, + "step": 16240 + }, + { + "epoch": 1.6276355987379176, + "grad_norm": 1.7955327033996582, + "learning_rate": 3.811954406812744e-05, + "loss": 0.5322, + "step": 16250 + }, + { + "epoch": 1.6286372514649172, + "grad_norm": 1.9605530500411987, + "learning_rate": 3.810612195385558e-05, + "loss": 0.5111, + "step": 16260 + }, + { + "epoch": 1.6296389041919168, + "grad_norm": 2.7651565074920654, + "learning_rate": 3.8092694628038764e-05, + "loss": 0.472, + "step": 16270 + }, + { + "epoch": 1.6306405569189162, + "grad_norm": 2.146113395690918, + "learning_rate": 3.807926209601624e-05, + "loss": 0.6197, + "step": 16280 + }, + { + "epoch": 1.6316422096459158, + "grad_norm": 2.629794120788574, + "learning_rate": 3.806582436312936e-05, + "loss": 0.5674, + "step": 16290 + }, + { + "epoch": 1.6326438623729151, + "grad_norm": 1.815111756324768, + "learning_rate": 3.805238143472154e-05, + "loss": 0.5137, + "step": 16300 + }, + { + "epoch": 1.6336455150999147, + "grad_norm": 2.728011131286621, + "learning_rate": 3.8038933316138225e-05, + "loss": 0.5547, + "step": 16310 + }, + { + "epoch": 1.6346471678269143, + "grad_norm": 1.9159232378005981, + "learning_rate": 3.802548001272698e-05, + "loss": 0.5681, + "step": 16320 + }, + { + "epoch": 1.635648820553914, + "grad_norm": 2.153987407684326, + "learning_rate": 3.801202152983738e-05, + "loss": 0.5581, + "step": 16330 + }, + { + "epoch": 1.6366504732809135, + "grad_norm": 2.767010450363159, + "learning_rate": 3.7998557872821104e-05, + "loss": 0.5055, + "step": 16340 + }, + { + "epoch": 1.637652126007913, + "grad_norm": 2.1452322006225586, + "learning_rate": 3.798508904703186e-05, + "loss": 0.5336, + "step": 16350 + }, + { + "epoch": 1.6386537787349127, + "grad_norm": 1.8051600456237793, + "learning_rate": 3.797161505782543e-05, + "loss": 0.566, + "step": 16360 + }, + { + "epoch": 1.6396554314619123, + "grad_norm": 2.4228758811950684, + "learning_rate": 3.795813591055961e-05, + "loss": 0.5281, + "step": 16370 + }, + { + "epoch": 1.6406570841889117, + "grad_norm": 2.133131742477417, + "learning_rate": 3.794465161059431e-05, + "loss": 0.5489, + "step": 16380 + }, + { + "epoch": 1.6416587369159112, + "grad_norm": 2.345156192779541, + "learning_rate": 3.793116216329143e-05, + "loss": 0.5207, + "step": 16390 + }, + { + "epoch": 1.6426603896429108, + "grad_norm": 2.2094056606292725, + "learning_rate": 3.791766757401495e-05, + "loss": 0.5303, + "step": 16400 + }, + { + "epoch": 1.6436620423699102, + "grad_norm": 2.3264756202697754, + "learning_rate": 3.790416784813088e-05, + "loss": 0.5847, + "step": 16410 + }, + { + "epoch": 1.6446636950969098, + "grad_norm": 2.0210537910461426, + "learning_rate": 3.7890662991007294e-05, + "loss": 0.6641, + "step": 16420 + }, + { + "epoch": 1.6456653478239094, + "grad_norm": 2.2783915996551514, + "learning_rate": 3.7877153008014275e-05, + "loss": 0.6113, + "step": 16430 + }, + { + "epoch": 1.646667000550909, + "grad_norm": 2.1935906410217285, + "learning_rate": 3.7863637904523956e-05, + "loss": 0.5113, + "step": 16440 + }, + { + "epoch": 1.6476686532779086, + "grad_norm": 2.478006601333618, + "learning_rate": 3.7850117685910535e-05, + "loss": 0.5166, + "step": 16450 + }, + { + "epoch": 1.6486703060049082, + "grad_norm": 2.1115570068359375, + "learning_rate": 3.783659235755019e-05, + "loss": 0.5136, + "step": 16460 + }, + { + "epoch": 1.6496719587319078, + "grad_norm": 2.320053815841675, + "learning_rate": 3.782306192482119e-05, + "loss": 0.5142, + "step": 16470 + }, + { + "epoch": 1.6506736114589073, + "grad_norm": 3.074125051498413, + "learning_rate": 3.7809526393103785e-05, + "loss": 0.5026, + "step": 16480 + }, + { + "epoch": 1.6516752641859067, + "grad_norm": 2.644097328186035, + "learning_rate": 3.779598576778026e-05, + "loss": 0.5428, + "step": 16490 + }, + { + "epoch": 1.6526769169129063, + "grad_norm": 2.510568857192993, + "learning_rate": 3.7782440054234966e-05, + "loss": 0.5376, + "step": 16500 + }, + { + "epoch": 1.653678569639906, + "grad_norm": 1.993364691734314, + "learning_rate": 3.7768889257854224e-05, + "loss": 0.5449, + "step": 16510 + }, + { + "epoch": 1.6546802223669053, + "grad_norm": 2.8113162517547607, + "learning_rate": 3.775533338402641e-05, + "loss": 0.5261, + "step": 16520 + }, + { + "epoch": 1.6556818750939049, + "grad_norm": 2.3818821907043457, + "learning_rate": 3.7741772438141916e-05, + "loss": 0.6036, + "step": 16530 + }, + { + "epoch": 1.6566835278209044, + "grad_norm": 2.2668466567993164, + "learning_rate": 3.7728206425593126e-05, + "loss": 0.5026, + "step": 16540 + }, + { + "epoch": 1.657685180547904, + "grad_norm": 2.132610321044922, + "learning_rate": 3.771463535177447e-05, + "loss": 0.5779, + "step": 16550 + }, + { + "epoch": 1.6586868332749036, + "grad_norm": 2.991370439529419, + "learning_rate": 3.770105922208239e-05, + "loss": 0.6101, + "step": 16560 + }, + { + "epoch": 1.6596884860019032, + "grad_norm": 2.392225742340088, + "learning_rate": 3.768747804191529e-05, + "loss": 0.5587, + "step": 16570 + }, + { + "epoch": 1.6606901387289028, + "grad_norm": 2.6654226779937744, + "learning_rate": 3.767389181667365e-05, + "loss": 0.4945, + "step": 16580 + }, + { + "epoch": 1.6616917914559024, + "grad_norm": 2.315359115600586, + "learning_rate": 3.766030055175991e-05, + "loss": 0.6415, + "step": 16590 + }, + { + "epoch": 1.6626934441829018, + "grad_norm": 3.1376960277557373, + "learning_rate": 3.764670425257853e-05, + "loss": 0.5494, + "step": 16600 + }, + { + "epoch": 1.6636950969099014, + "grad_norm": 2.255483865737915, + "learning_rate": 3.763310292453597e-05, + "loss": 0.5318, + "step": 16610 + }, + { + "epoch": 1.6646967496369007, + "grad_norm": 1.7115904092788696, + "learning_rate": 3.761949657304068e-05, + "loss": 0.5176, + "step": 16620 + }, + { + "epoch": 1.6656984023639003, + "grad_norm": 1.925130009651184, + "learning_rate": 3.760588520350315e-05, + "loss": 0.5681, + "step": 16630 + }, + { + "epoch": 1.6667000550909, + "grad_norm": 3.0387139320373535, + "learning_rate": 3.75922688213358e-05, + "loss": 0.4607, + "step": 16640 + }, + { + "epoch": 1.6677017078178995, + "grad_norm": 1.6796802282333374, + "learning_rate": 3.7578647431953086e-05, + "loss": 0.6343, + "step": 16650 + }, + { + "epoch": 1.668703360544899, + "grad_norm": 2.335080146789551, + "learning_rate": 3.756502104077145e-05, + "loss": 0.5031, + "step": 16660 + }, + { + "epoch": 1.6697050132718987, + "grad_norm": 1.8291679620742798, + "learning_rate": 3.75513896532093e-05, + "loss": 0.5351, + "step": 16670 + }, + { + "epoch": 1.6707066659988983, + "grad_norm": 2.4488725662231445, + "learning_rate": 3.753775327468708e-05, + "loss": 0.5344, + "step": 16680 + }, + { + "epoch": 1.6717083187258979, + "grad_norm": 2.563654661178589, + "learning_rate": 3.7524111910627157e-05, + "loss": 0.6013, + "step": 16690 + }, + { + "epoch": 1.6727099714528972, + "grad_norm": 2.365469455718994, + "learning_rate": 3.7510465566453924e-05, + "loss": 0.4762, + "step": 16700 + }, + { + "epoch": 1.6737116241798968, + "grad_norm": 2.369333028793335, + "learning_rate": 3.749681424759374e-05, + "loss": 0.6089, + "step": 16710 + }, + { + "epoch": 1.6747132769068964, + "grad_norm": 3.0419299602508545, + "learning_rate": 3.748315795947495e-05, + "loss": 0.5973, + "step": 16720 + }, + { + "epoch": 1.6757149296338958, + "grad_norm": 2.051492929458618, + "learning_rate": 3.7469496707527854e-05, + "loss": 0.5512, + "step": 16730 + }, + { + "epoch": 1.6767165823608954, + "grad_norm": 2.6813406944274902, + "learning_rate": 3.745583049718475e-05, + "loss": 0.5789, + "step": 16740 + }, + { + "epoch": 1.677718235087895, + "grad_norm": 1.9882440567016602, + "learning_rate": 3.74421593338799e-05, + "loss": 0.5064, + "step": 16750 + }, + { + "epoch": 1.6787198878148946, + "grad_norm": 2.8968048095703125, + "learning_rate": 3.742848322304952e-05, + "loss": 0.5373, + "step": 16760 + }, + { + "epoch": 1.6797215405418942, + "grad_norm": 2.5871849060058594, + "learning_rate": 3.741480217013182e-05, + "loss": 0.5037, + "step": 16770 + }, + { + "epoch": 1.6807231932688937, + "grad_norm": 2.5383193492889404, + "learning_rate": 3.7401116180566954e-05, + "loss": 0.4799, + "step": 16780 + }, + { + "epoch": 1.6817248459958933, + "grad_norm": 2.1125566959381104, + "learning_rate": 3.738742525979705e-05, + "loss": 0.469, + "step": 16790 + }, + { + "epoch": 1.682726498722893, + "grad_norm": 2.907052755355835, + "learning_rate": 3.737372941326619e-05, + "loss": 0.6347, + "step": 16800 + }, + { + "epoch": 1.6837281514498923, + "grad_norm": 2.6465771198272705, + "learning_rate": 3.736002864642042e-05, + "loss": 0.5437, + "step": 16810 + }, + { + "epoch": 1.6847298041768919, + "grad_norm": 2.14072322845459, + "learning_rate": 3.7346322964707744e-05, + "loss": 0.4944, + "step": 16820 + }, + { + "epoch": 1.6857314569038913, + "grad_norm": 2.4284651279449463, + "learning_rate": 3.733261237357812e-05, + "loss": 0.5874, + "step": 16830 + }, + { + "epoch": 1.6867331096308908, + "grad_norm": 2.144977569580078, + "learning_rate": 3.731889687848344e-05, + "loss": 0.5592, + "step": 16840 + }, + { + "epoch": 1.6877347623578904, + "grad_norm": 1.864293098449707, + "learning_rate": 3.730517648487758e-05, + "loss": 0.5409, + "step": 16850 + }, + { + "epoch": 1.68873641508489, + "grad_norm": 2.6154680252075195, + "learning_rate": 3.7291451198216334e-05, + "loss": 0.5337, + "step": 16860 + }, + { + "epoch": 1.6897380678118896, + "grad_norm": 3.94486403465271, + "learning_rate": 3.727772102395745e-05, + "loss": 0.4953, + "step": 16870 + }, + { + "epoch": 1.6907397205388892, + "grad_norm": 2.5961368083953857, + "learning_rate": 3.726398596756063e-05, + "loss": 0.5295, + "step": 16880 + }, + { + "epoch": 1.6917413732658888, + "grad_norm": 2.7672832012176514, + "learning_rate": 3.725024603448751e-05, + "loss": 0.5942, + "step": 16890 + }, + { + "epoch": 1.6927430259928884, + "grad_norm": 3.2715678215026855, + "learning_rate": 3.723650123020166e-05, + "loss": 0.6036, + "step": 16900 + }, + { + "epoch": 1.6937446787198878, + "grad_norm": 2.5985188484191895, + "learning_rate": 3.72227515601686e-05, + "loss": 0.5415, + "step": 16910 + }, + { + "epoch": 1.6947463314468874, + "grad_norm": 2.263279676437378, + "learning_rate": 3.7208997029855764e-05, + "loss": 0.5924, + "step": 16920 + }, + { + "epoch": 1.695747984173887, + "grad_norm": 2.0555026531219482, + "learning_rate": 3.7195237644732545e-05, + "loss": 0.5864, + "step": 16930 + }, + { + "epoch": 1.6967496369008863, + "grad_norm": 2.1603214740753174, + "learning_rate": 3.718147341027024e-05, + "loss": 0.5368, + "step": 16940 + }, + { + "epoch": 1.697751289627886, + "grad_norm": 3.1869144439697266, + "learning_rate": 3.71677043319421e-05, + "loss": 0.6791, + "step": 16950 + }, + { + "epoch": 1.6987529423548855, + "grad_norm": 2.106552839279175, + "learning_rate": 3.715393041522328e-05, + "loss": 0.5239, + "step": 16960 + }, + { + "epoch": 1.699754595081885, + "grad_norm": 2.6369104385375977, + "learning_rate": 3.714015166559087e-05, + "loss": 0.5722, + "step": 16970 + }, + { + "epoch": 1.7007562478088847, + "grad_norm": 2.095924139022827, + "learning_rate": 3.7126368088523884e-05, + "loss": 0.597, + "step": 16980 + }, + { + "epoch": 1.7017579005358843, + "grad_norm": 2.8147194385528564, + "learning_rate": 3.711257968950325e-05, + "loss": 0.5293, + "step": 16990 + }, + { + "epoch": 1.7027595532628839, + "grad_norm": 2.2241532802581787, + "learning_rate": 3.709878647401181e-05, + "loss": 0.5732, + "step": 17000 + }, + { + "epoch": 1.7037612059898835, + "grad_norm": 2.0934462547302246, + "learning_rate": 3.708498844753433e-05, + "loss": 0.5241, + "step": 17010 + }, + { + "epoch": 1.7047628587168828, + "grad_norm": 1.8266611099243164, + "learning_rate": 3.707118561555748e-05, + "loss": 0.4946, + "step": 17020 + }, + { + "epoch": 1.7057645114438824, + "grad_norm": 1.8321797847747803, + "learning_rate": 3.705737798356985e-05, + "loss": 0.6098, + "step": 17030 + }, + { + "epoch": 1.706766164170882, + "grad_norm": 2.4191246032714844, + "learning_rate": 3.704356555706195e-05, + "loss": 0.5116, + "step": 17040 + }, + { + "epoch": 1.7077678168978814, + "grad_norm": 2.142313003540039, + "learning_rate": 3.702974834152616e-05, + "loss": 0.5581, + "step": 17050 + }, + { + "epoch": 1.708769469624881, + "grad_norm": 2.1364362239837646, + "learning_rate": 3.70159263424568e-05, + "loss": 0.4971, + "step": 17060 + }, + { + "epoch": 1.7097711223518806, + "grad_norm": 2.3862712383270264, + "learning_rate": 3.7002099565350053e-05, + "loss": 0.5418, + "step": 17070 + }, + { + "epoch": 1.7107727750788801, + "grad_norm": 2.108258008956909, + "learning_rate": 3.698826801570406e-05, + "loss": 0.5097, + "step": 17080 + }, + { + "epoch": 1.7117744278058797, + "grad_norm": 2.0370752811431885, + "learning_rate": 3.6974431699018806e-05, + "loss": 0.5491, + "step": 17090 + }, + { + "epoch": 1.7127760805328793, + "grad_norm": 1.8179024457931519, + "learning_rate": 3.69605906207962e-05, + "loss": 0.5095, + "step": 17100 + }, + { + "epoch": 1.713777733259879, + "grad_norm": 2.070310592651367, + "learning_rate": 3.694674478654003e-05, + "loss": 0.579, + "step": 17110 + }, + { + "epoch": 1.7147793859868785, + "grad_norm": 2.2030398845672607, + "learning_rate": 3.693289420175599e-05, + "loss": 0.5857, + "step": 17120 + }, + { + "epoch": 1.7157810387138779, + "grad_norm": 2.334590196609497, + "learning_rate": 3.691903887195165e-05, + "loss": 0.5118, + "step": 17130 + }, + { + "epoch": 1.7167826914408775, + "grad_norm": 2.140634298324585, + "learning_rate": 3.690517880263647e-05, + "loss": 0.5775, + "step": 17140 + }, + { + "epoch": 1.7177843441678768, + "grad_norm": 1.9049276113510132, + "learning_rate": 3.68913139993218e-05, + "loss": 0.5149, + "step": 17150 + }, + { + "epoch": 1.7187859968948764, + "grad_norm": 2.7127888202667236, + "learning_rate": 3.687744446752086e-05, + "loss": 0.5779, + "step": 17160 + }, + { + "epoch": 1.719787649621876, + "grad_norm": 2.346618413925171, + "learning_rate": 3.686357021274877e-05, + "loss": 0.5229, + "step": 17170 + }, + { + "epoch": 1.7207893023488756, + "grad_norm": 2.411379337310791, + "learning_rate": 3.684969124052251e-05, + "loss": 0.5378, + "step": 17180 + }, + { + "epoch": 1.7217909550758752, + "grad_norm": 2.139772891998291, + "learning_rate": 3.683580755636094e-05, + "loss": 0.5433, + "step": 17190 + }, + { + "epoch": 1.7227926078028748, + "grad_norm": 2.1813013553619385, + "learning_rate": 3.682191916578481e-05, + "loss": 0.5352, + "step": 17200 + }, + { + "epoch": 1.7237942605298744, + "grad_norm": 2.3435051441192627, + "learning_rate": 3.680802607431673e-05, + "loss": 0.5517, + "step": 17210 + }, + { + "epoch": 1.724795913256874, + "grad_norm": 2.307100534439087, + "learning_rate": 3.6794128287481136e-05, + "loss": 0.5181, + "step": 17220 + }, + { + "epoch": 1.7257975659838733, + "grad_norm": 1.8775631189346313, + "learning_rate": 3.6780225810804426e-05, + "loss": 0.5047, + "step": 17230 + }, + { + "epoch": 1.726799218710873, + "grad_norm": 2.518465042114258, + "learning_rate": 3.676631864981478e-05, + "loss": 0.4953, + "step": 17240 + }, + { + "epoch": 1.7278008714378725, + "grad_norm": 2.4192512035369873, + "learning_rate": 3.675240681004227e-05, + "loss": 0.5995, + "step": 17250 + }, + { + "epoch": 1.728802524164872, + "grad_norm": 2.5736687183380127, + "learning_rate": 3.673849029701883e-05, + "loss": 0.6249, + "step": 17260 + }, + { + "epoch": 1.7298041768918715, + "grad_norm": 1.8964473009109497, + "learning_rate": 3.672456911627826e-05, + "loss": 0.6039, + "step": 17270 + }, + { + "epoch": 1.730805829618871, + "grad_norm": 1.9748671054840088, + "learning_rate": 3.6710643273356206e-05, + "loss": 0.5387, + "step": 17280 + }, + { + "epoch": 1.7318074823458707, + "grad_norm": 2.3679747581481934, + "learning_rate": 3.669671277379016e-05, + "loss": 0.5977, + "step": 17290 + }, + { + "epoch": 1.7328091350728703, + "grad_norm": 2.5027241706848145, + "learning_rate": 3.6682777623119474e-05, + "loss": 0.5304, + "step": 17300 + }, + { + "epoch": 1.7338107877998699, + "grad_norm": 1.9013116359710693, + "learning_rate": 3.666883782688535e-05, + "loss": 0.5139, + "step": 17310 + }, + { + "epoch": 1.7348124405268694, + "grad_norm": 2.220552682876587, + "learning_rate": 3.665489339063085e-05, + "loss": 0.5624, + "step": 17320 + }, + { + "epoch": 1.735814093253869, + "grad_norm": 1.7051708698272705, + "learning_rate": 3.664094431990085e-05, + "loss": 0.4726, + "step": 17330 + }, + { + "epoch": 1.7368157459808684, + "grad_norm": 1.9945650100708008, + "learning_rate": 3.662699062024209e-05, + "loss": 0.5907, + "step": 17340 + }, + { + "epoch": 1.737817398707868, + "grad_norm": 2.306936502456665, + "learning_rate": 3.661303229720316e-05, + "loss": 0.5114, + "step": 17350 + }, + { + "epoch": 1.7388190514348674, + "grad_norm": 2.131807565689087, + "learning_rate": 3.659906935633446e-05, + "loss": 0.5478, + "step": 17360 + }, + { + "epoch": 1.739820704161867, + "grad_norm": 2.530008554458618, + "learning_rate": 3.658510180318826e-05, + "loss": 0.558, + "step": 17370 + }, + { + "epoch": 1.7408223568888666, + "grad_norm": 1.6221373081207275, + "learning_rate": 3.657112964331862e-05, + "loss": 0.5436, + "step": 17380 + }, + { + "epoch": 1.7418240096158661, + "grad_norm": 2.6965410709381104, + "learning_rate": 3.6557152882281497e-05, + "loss": 0.5593, + "step": 17390 + }, + { + "epoch": 1.7428256623428657, + "grad_norm": 2.156054735183716, + "learning_rate": 3.65431715256346e-05, + "loss": 0.522, + "step": 17400 + }, + { + "epoch": 1.7438273150698653, + "grad_norm": 2.559577465057373, + "learning_rate": 3.652918557893753e-05, + "loss": 0.5416, + "step": 17410 + }, + { + "epoch": 1.744828967796865, + "grad_norm": 2.3171913623809814, + "learning_rate": 3.651519504775167e-05, + "loss": 0.5473, + "step": 17420 + }, + { + "epoch": 1.7458306205238645, + "grad_norm": 2.0408565998077393, + "learning_rate": 3.650119993764025e-05, + "loss": 0.4801, + "step": 17430 + }, + { + "epoch": 1.7468322732508639, + "grad_norm": 2.4444737434387207, + "learning_rate": 3.648720025416832e-05, + "loss": 0.5292, + "step": 17440 + }, + { + "epoch": 1.7478339259778635, + "grad_norm": 3.833693027496338, + "learning_rate": 3.647319600290273e-05, + "loss": 0.5295, + "step": 17450 + }, + { + "epoch": 1.748835578704863, + "grad_norm": 2.1103272438049316, + "learning_rate": 3.6459187189412175e-05, + "loss": 0.5395, + "step": 17460 + }, + { + "epoch": 1.7498372314318624, + "grad_norm": 2.4942052364349365, + "learning_rate": 3.6445173819267133e-05, + "loss": 0.5076, + "step": 17470 + }, + { + "epoch": 1.750838884158862, + "grad_norm": 2.432206869125366, + "learning_rate": 3.643115589803992e-05, + "loss": 0.5246, + "step": 17480 + }, + { + "epoch": 1.7518405368858616, + "grad_norm": 2.308864116668701, + "learning_rate": 3.641713343130465e-05, + "loss": 0.6066, + "step": 17490 + }, + { + "epoch": 1.7528421896128612, + "grad_norm": 2.465198516845703, + "learning_rate": 3.640310642463723e-05, + "loss": 0.5812, + "step": 17500 + }, + { + "epoch": 1.7538438423398608, + "grad_norm": 2.269287347793579, + "learning_rate": 3.6389074883615395e-05, + "loss": 0.6508, + "step": 17510 + }, + { + "epoch": 1.7548454950668604, + "grad_norm": 2.149580717086792, + "learning_rate": 3.637503881381869e-05, + "loss": 0.5685, + "step": 17520 + }, + { + "epoch": 1.75584714779386, + "grad_norm": 2.5178351402282715, + "learning_rate": 3.6360998220828436e-05, + "loss": 0.5466, + "step": 17530 + }, + { + "epoch": 1.7568488005208596, + "grad_norm": 2.1612465381622314, + "learning_rate": 3.634695311022775e-05, + "loss": 0.4816, + "step": 17540 + }, + { + "epoch": 1.757850453247859, + "grad_norm": 2.286977529525757, + "learning_rate": 3.6332903487601584e-05, + "loss": 0.5219, + "step": 17550 + }, + { + "epoch": 1.7588521059748585, + "grad_norm": 1.9696747064590454, + "learning_rate": 3.6318849358536635e-05, + "loss": 0.4647, + "step": 17560 + }, + { + "epoch": 1.7598537587018581, + "grad_norm": 2.1928446292877197, + "learning_rate": 3.630479072862143e-05, + "loss": 0.5435, + "step": 17570 + }, + { + "epoch": 1.7608554114288575, + "grad_norm": 2.580597162246704, + "learning_rate": 3.629072760344627e-05, + "loss": 0.5458, + "step": 17580 + }, + { + "epoch": 1.761857064155857, + "grad_norm": 1.8262009620666504, + "learning_rate": 3.6276659988603234e-05, + "loss": 0.5767, + "step": 17590 + }, + { + "epoch": 1.7628587168828567, + "grad_norm": 2.46189284324646, + "learning_rate": 3.6262587889686205e-05, + "loss": 0.5401, + "step": 17600 + }, + { + "epoch": 1.7638603696098563, + "grad_norm": 3.054136037826538, + "learning_rate": 3.624851131229084e-05, + "loss": 0.4778, + "step": 17610 + }, + { + "epoch": 1.7648620223368559, + "grad_norm": 2.439973831176758, + "learning_rate": 3.6234430262014594e-05, + "loss": 0.5884, + "step": 17620 + }, + { + "epoch": 1.7658636750638554, + "grad_norm": 1.986525058746338, + "learning_rate": 3.622034474445665e-05, + "loss": 0.5668, + "step": 17630 + }, + { + "epoch": 1.766865327790855, + "grad_norm": 1.8597817420959473, + "learning_rate": 3.620625476521803e-05, + "loss": 0.5563, + "step": 17640 + }, + { + "epoch": 1.7678669805178546, + "grad_norm": 1.821752905845642, + "learning_rate": 3.6192160329901484e-05, + "loss": 0.5131, + "step": 17650 + }, + { + "epoch": 1.768868633244854, + "grad_norm": 2.221463918685913, + "learning_rate": 3.617806144411156e-05, + "loss": 0.545, + "step": 17660 + }, + { + "epoch": 1.7698702859718536, + "grad_norm": 2.033578395843506, + "learning_rate": 3.6163958113454574e-05, + "loss": 0.5321, + "step": 17670 + }, + { + "epoch": 1.770871938698853, + "grad_norm": 1.9562311172485352, + "learning_rate": 3.61498503435386e-05, + "loss": 0.5289, + "step": 17680 + }, + { + "epoch": 1.7718735914258525, + "grad_norm": 2.811387062072754, + "learning_rate": 3.6135738139973466e-05, + "loss": 0.5456, + "step": 17690 + }, + { + "epoch": 1.7728752441528521, + "grad_norm": 1.598136067390442, + "learning_rate": 3.6121621508370805e-05, + "loss": 0.5147, + "step": 17700 + }, + { + "epoch": 1.7738768968798517, + "grad_norm": 1.8884917497634888, + "learning_rate": 3.610750045434396e-05, + "loss": 0.5152, + "step": 17710 + }, + { + "epoch": 1.7748785496068513, + "grad_norm": 1.971439003944397, + "learning_rate": 3.609337498350805e-05, + "loss": 0.5127, + "step": 17720 + }, + { + "epoch": 1.775880202333851, + "grad_norm": 2.07023286819458, + "learning_rate": 3.607924510147998e-05, + "loss": 0.5321, + "step": 17730 + }, + { + "epoch": 1.7768818550608505, + "grad_norm": 2.4199936389923096, + "learning_rate": 3.6065110813878365e-05, + "loss": 0.5243, + "step": 17740 + }, + { + "epoch": 1.77788350778785, + "grad_norm": 2.1403119564056396, + "learning_rate": 3.6050972126323615e-05, + "loss": 0.553, + "step": 17750 + }, + { + "epoch": 1.7788851605148495, + "grad_norm": 3.023754596710205, + "learning_rate": 3.6036829044437835e-05, + "loss": 0.5912, + "step": 17760 + }, + { + "epoch": 1.779886813241849, + "grad_norm": 1.9545767307281494, + "learning_rate": 3.602268157384493e-05, + "loss": 0.6121, + "step": 17770 + }, + { + "epoch": 1.7808884659688486, + "grad_norm": 1.6822097301483154, + "learning_rate": 3.6008529720170524e-05, + "loss": 0.5864, + "step": 17780 + }, + { + "epoch": 1.781890118695848, + "grad_norm": 2.0851354598999023, + "learning_rate": 3.5994373489041995e-05, + "loss": 0.5074, + "step": 17790 + }, + { + "epoch": 1.7828917714228476, + "grad_norm": 1.7990679740905762, + "learning_rate": 3.598021288608845e-05, + "loss": 0.4877, + "step": 17800 + }, + { + "epoch": 1.7838934241498472, + "grad_norm": 2.6405606269836426, + "learning_rate": 3.5966047916940734e-05, + "loss": 0.5953, + "step": 17810 + }, + { + "epoch": 1.7848950768768468, + "grad_norm": 2.476152181625366, + "learning_rate": 3.595187858723144e-05, + "loss": 0.532, + "step": 17820 + }, + { + "epoch": 1.7858967296038464, + "grad_norm": 2.792933940887451, + "learning_rate": 3.593770490259489e-05, + "loss": 0.5781, + "step": 17830 + }, + { + "epoch": 1.786898382330846, + "grad_norm": 2.29030179977417, + "learning_rate": 3.592352686866713e-05, + "loss": 0.5617, + "step": 17840 + }, + { + "epoch": 1.7879000350578456, + "grad_norm": 2.300529956817627, + "learning_rate": 3.590934449108596e-05, + "loss": 0.5806, + "step": 17850 + }, + { + "epoch": 1.7889016877848452, + "grad_norm": 2.4189414978027344, + "learning_rate": 3.589515777549087e-05, + "loss": 0.5626, + "step": 17860 + }, + { + "epoch": 1.7899033405118445, + "grad_norm": 1.9713430404663086, + "learning_rate": 3.588096672752309e-05, + "loss": 0.5583, + "step": 17870 + }, + { + "epoch": 1.7909049932388441, + "grad_norm": 2.270739793777466, + "learning_rate": 3.5866771352825605e-05, + "loss": 0.5607, + "step": 17880 + }, + { + "epoch": 1.7919066459658435, + "grad_norm": 2.855412006378174, + "learning_rate": 3.5852571657043075e-05, + "loss": 0.5981, + "step": 17890 + }, + { + "epoch": 1.792908298692843, + "grad_norm": 2.324159622192383, + "learning_rate": 3.583836764582189e-05, + "loss": 0.4861, + "step": 17900 + }, + { + "epoch": 1.7939099514198427, + "grad_norm": 2.3632209300994873, + "learning_rate": 3.5824159324810175e-05, + "loss": 0.5487, + "step": 17910 + }, + { + "epoch": 1.7949116041468423, + "grad_norm": 2.1716160774230957, + "learning_rate": 3.580994669965774e-05, + "loss": 0.4606, + "step": 17920 + }, + { + "epoch": 1.7959132568738418, + "grad_norm": 2.514984607696533, + "learning_rate": 3.579572977601615e-05, + "loss": 0.4827, + "step": 17930 + }, + { + "epoch": 1.7969149096008414, + "grad_norm": 2.0306577682495117, + "learning_rate": 3.578150855953861e-05, + "loss": 0.6076, + "step": 17940 + }, + { + "epoch": 1.797916562327841, + "grad_norm": 2.062478542327881, + "learning_rate": 3.576728305588012e-05, + "loss": 0.558, + "step": 17950 + }, + { + "epoch": 1.7989182150548406, + "grad_norm": 2.077491283416748, + "learning_rate": 3.57530532706973e-05, + "loss": 0.5423, + "step": 17960 + }, + { + "epoch": 1.79991986778184, + "grad_norm": 1.7659790515899658, + "learning_rate": 3.573881920964853e-05, + "loss": 0.5647, + "step": 17970 + }, + { + "epoch": 1.8009215205088396, + "grad_norm": 2.1615078449249268, + "learning_rate": 3.5724580878393867e-05, + "loss": 0.5863, + "step": 17980 + }, + { + "epoch": 1.8019231732358392, + "grad_norm": 1.5051380395889282, + "learning_rate": 3.571033828259507e-05, + "loss": 0.5236, + "step": 17990 + }, + { + "epoch": 1.8029248259628385, + "grad_norm": 2.394617795944214, + "learning_rate": 3.569609142791559e-05, + "loss": 0.5616, + "step": 18000 + }, + { + "epoch": 1.8039264786898381, + "grad_norm": 2.699385643005371, + "learning_rate": 3.5681840320020585e-05, + "loss": 0.6394, + "step": 18010 + }, + { + "epoch": 1.8049281314168377, + "grad_norm": 2.4347870349884033, + "learning_rate": 3.566758496457688e-05, + "loss": 0.5286, + "step": 18020 + }, + { + "epoch": 1.8059297841438373, + "grad_norm": 1.4760379791259766, + "learning_rate": 3.5653325367253016e-05, + "loss": 0.5071, + "step": 18030 + }, + { + "epoch": 1.806931436870837, + "grad_norm": 2.0530753135681152, + "learning_rate": 3.56390615337192e-05, + "loss": 0.5589, + "step": 18040 + }, + { + "epoch": 1.8079330895978365, + "grad_norm": 2.031733751296997, + "learning_rate": 3.5624793469647344e-05, + "loss": 0.4645, + "step": 18050 + }, + { + "epoch": 1.808934742324836, + "grad_norm": 2.617835283279419, + "learning_rate": 3.5610521180711015e-05, + "loss": 0.6479, + "step": 18060 + }, + { + "epoch": 1.8099363950518357, + "grad_norm": 2.419201374053955, + "learning_rate": 3.559624467258548e-05, + "loss": 0.4964, + "step": 18070 + }, + { + "epoch": 1.810938047778835, + "grad_norm": 1.7701033353805542, + "learning_rate": 3.5581963950947686e-05, + "loss": 0.525, + "step": 18080 + }, + { + "epoch": 1.8119397005058346, + "grad_norm": 2.738837480545044, + "learning_rate": 3.556767902147623e-05, + "loss": 0.5093, + "step": 18090 + }, + { + "epoch": 1.8129413532328342, + "grad_norm": 1.944037914276123, + "learning_rate": 3.5553389889851426e-05, + "loss": 0.5076, + "step": 18100 + }, + { + "epoch": 1.8139430059598336, + "grad_norm": 2.223052978515625, + "learning_rate": 3.553909656175522e-05, + "loss": 0.5436, + "step": 18110 + }, + { + "epoch": 1.8149446586868332, + "grad_norm": 2.251912832260132, + "learning_rate": 3.552479904287123e-05, + "loss": 0.5097, + "step": 18120 + }, + { + "epoch": 1.8159463114138328, + "grad_norm": 2.422960042953491, + "learning_rate": 3.5510497338884774e-05, + "loss": 0.5101, + "step": 18130 + }, + { + "epoch": 1.8169479641408324, + "grad_norm": 2.5609405040740967, + "learning_rate": 3.549619145548279e-05, + "loss": 0.542, + "step": 18140 + }, + { + "epoch": 1.817949616867832, + "grad_norm": 2.37107515335083, + "learning_rate": 3.54818813983539e-05, + "loss": 0.5574, + "step": 18150 + }, + { + "epoch": 1.8189512695948316, + "grad_norm": 2.355593681335449, + "learning_rate": 3.54675671731884e-05, + "loss": 0.5213, + "step": 18160 + }, + { + "epoch": 1.8199529223218311, + "grad_norm": 2.0474841594696045, + "learning_rate": 3.545324878567821e-05, + "loss": 0.5093, + "step": 18170 + }, + { + "epoch": 1.8209545750488307, + "grad_norm": 2.3665642738342285, + "learning_rate": 3.543892624151693e-05, + "loss": 0.5607, + "step": 18180 + }, + { + "epoch": 1.82195622777583, + "grad_norm": 2.511904239654541, + "learning_rate": 3.542459954639981e-05, + "loss": 0.5602, + "step": 18190 + }, + { + "epoch": 1.8229578805028297, + "grad_norm": 2.2814455032348633, + "learning_rate": 3.541026870602375e-05, + "loss": 0.5188, + "step": 18200 + }, + { + "epoch": 1.823959533229829, + "grad_norm": 2.252317428588867, + "learning_rate": 3.539593372608727e-05, + "loss": 0.4779, + "step": 18210 + }, + { + "epoch": 1.8249611859568287, + "grad_norm": 1.9406018257141113, + "learning_rate": 3.538159461229059e-05, + "loss": 0.6143, + "step": 18220 + }, + { + "epoch": 1.8259628386838282, + "grad_norm": 2.396653175354004, + "learning_rate": 3.5367251370335526e-05, + "loss": 0.5132, + "step": 18230 + }, + { + "epoch": 1.8269644914108278, + "grad_norm": 2.2335658073425293, + "learning_rate": 3.535290400592556e-05, + "loss": 0.5352, + "step": 18240 + }, + { + "epoch": 1.8279661441378274, + "grad_norm": 2.568760395050049, + "learning_rate": 3.53385525247658e-05, + "loss": 0.5214, + "step": 18250 + }, + { + "epoch": 1.828967796864827, + "grad_norm": 1.8453973531723022, + "learning_rate": 3.532419693256301e-05, + "loss": 0.5491, + "step": 18260 + }, + { + "epoch": 1.8299694495918266, + "grad_norm": 2.2260847091674805, + "learning_rate": 3.5309837235025574e-05, + "loss": 0.5602, + "step": 18270 + }, + { + "epoch": 1.8309711023188262, + "grad_norm": 1.965449333190918, + "learning_rate": 3.5295473437863505e-05, + "loss": 0.554, + "step": 18280 + }, + { + "epoch": 1.8319727550458256, + "grad_norm": 2.2575039863586426, + "learning_rate": 3.528110554678846e-05, + "loss": 0.597, + "step": 18290 + }, + { + "epoch": 1.8329744077728252, + "grad_norm": 1.884130835533142, + "learning_rate": 3.526673356751371e-05, + "loss": 0.5049, + "step": 18300 + }, + { + "epoch": 1.8339760604998248, + "grad_norm": 2.2805416584014893, + "learning_rate": 3.525235750575416e-05, + "loss": 0.5269, + "step": 18310 + }, + { + "epoch": 1.8349777132268241, + "grad_norm": 2.192514419555664, + "learning_rate": 3.523797736722634e-05, + "loss": 0.5541, + "step": 18320 + }, + { + "epoch": 1.8359793659538237, + "grad_norm": 2.4717044830322266, + "learning_rate": 3.52235931576484e-05, + "loss": 0.5149, + "step": 18330 + }, + { + "epoch": 1.8369810186808233, + "grad_norm": 2.0693359375, + "learning_rate": 3.520920488274009e-05, + "loss": 0.5642, + "step": 18340 + }, + { + "epoch": 1.837982671407823, + "grad_norm": 2.423137903213501, + "learning_rate": 3.51948125482228e-05, + "loss": 0.5392, + "step": 18350 + }, + { + "epoch": 1.8389843241348225, + "grad_norm": 2.4728684425354004, + "learning_rate": 3.518041615981954e-05, + "loss": 0.5382, + "step": 18360 + }, + { + "epoch": 1.839985976861822, + "grad_norm": 2.3842051029205322, + "learning_rate": 3.516601572325491e-05, + "loss": 0.5356, + "step": 18370 + }, + { + "epoch": 1.8409876295888217, + "grad_norm": 1.8995606899261475, + "learning_rate": 3.515161124425513e-05, + "loss": 0.505, + "step": 18380 + }, + { + "epoch": 1.8419892823158213, + "grad_norm": 2.8482911586761475, + "learning_rate": 3.513720272854802e-05, + "loss": 0.5478, + "step": 18390 + }, + { + "epoch": 1.8429909350428206, + "grad_norm": 1.9546759128570557, + "learning_rate": 3.5122790181863017e-05, + "loss": 0.4782, + "step": 18400 + }, + { + "epoch": 1.8439925877698202, + "grad_norm": 2.6709511280059814, + "learning_rate": 3.510837360993116e-05, + "loss": 0.5357, + "step": 18410 + }, + { + "epoch": 1.8449942404968196, + "grad_norm": 2.3356592655181885, + "learning_rate": 3.5093953018485076e-05, + "loss": 0.5344, + "step": 18420 + }, + { + "epoch": 1.8459958932238192, + "grad_norm": 2.1896042823791504, + "learning_rate": 3.507952841325899e-05, + "loss": 0.6113, + "step": 18430 + }, + { + "epoch": 1.8469975459508188, + "grad_norm": 1.9698134660720825, + "learning_rate": 3.5065099799988766e-05, + "loss": 0.4655, + "step": 18440 + }, + { + "epoch": 1.8479991986778184, + "grad_norm": 2.549419403076172, + "learning_rate": 3.505066718441179e-05, + "loss": 0.4673, + "step": 18450 + }, + { + "epoch": 1.849000851404818, + "grad_norm": 2.0355772972106934, + "learning_rate": 3.503623057226709e-05, + "loss": 0.5131, + "step": 18460 + }, + { + "epoch": 1.8500025041318175, + "grad_norm": 2.1255834102630615, + "learning_rate": 3.502178996929527e-05, + "loss": 0.5656, + "step": 18470 + }, + { + "epoch": 1.8510041568588171, + "grad_norm": 2.1015443801879883, + "learning_rate": 3.500734538123852e-05, + "loss": 0.5569, + "step": 18480 + }, + { + "epoch": 1.8520058095858167, + "grad_norm": 2.9126319885253906, + "learning_rate": 3.4992896813840624e-05, + "loss": 0.5279, + "step": 18490 + }, + { + "epoch": 1.853007462312816, + "grad_norm": 2.6113879680633545, + "learning_rate": 3.497844427284693e-05, + "loss": 0.6152, + "step": 18500 + }, + { + "epoch": 1.8540091150398157, + "grad_norm": 2.1508989334106445, + "learning_rate": 3.496398776400437e-05, + "loss": 0.5262, + "step": 18510 + }, + { + "epoch": 1.8550107677668153, + "grad_norm": 2.867258071899414, + "learning_rate": 3.4949527293061475e-05, + "loss": 0.6089, + "step": 18520 + }, + { + "epoch": 1.8560124204938147, + "grad_norm": 2.2705745697021484, + "learning_rate": 3.493506286576832e-05, + "loss": 0.5411, + "step": 18530 + }, + { + "epoch": 1.8570140732208142, + "grad_norm": 2.0853490829467773, + "learning_rate": 3.492059448787659e-05, + "loss": 0.5184, + "step": 18540 + }, + { + "epoch": 1.8580157259478138, + "grad_norm": 2.6328513622283936, + "learning_rate": 3.4906122165139496e-05, + "loss": 0.5298, + "step": 18550 + }, + { + "epoch": 1.8590173786748134, + "grad_norm": 2.7920444011688232, + "learning_rate": 3.489164590331186e-05, + "loss": 0.5228, + "step": 18560 + }, + { + "epoch": 1.860019031401813, + "grad_norm": 1.7941895723342896, + "learning_rate": 3.487716570815004e-05, + "loss": 0.5057, + "step": 18570 + }, + { + "epoch": 1.8610206841288126, + "grad_norm": 1.9120043516159058, + "learning_rate": 3.4862681585411984e-05, + "loss": 0.5185, + "step": 18580 + }, + { + "epoch": 1.8620223368558122, + "grad_norm": 1.4897717237472534, + "learning_rate": 3.484819354085717e-05, + "loss": 0.4982, + "step": 18590 + }, + { + "epoch": 1.8630239895828118, + "grad_norm": 1.8527815341949463, + "learning_rate": 3.483370158024667e-05, + "loss": 0.5612, + "step": 18600 + }, + { + "epoch": 1.8640256423098112, + "grad_norm": 2.3453290462493896, + "learning_rate": 3.481920570934308e-05, + "loss": 0.4998, + "step": 18610 + }, + { + "epoch": 1.8650272950368108, + "grad_norm": 2.2166287899017334, + "learning_rate": 3.48047059339106e-05, + "loss": 0.5478, + "step": 18620 + }, + { + "epoch": 1.8660289477638103, + "grad_norm": 1.6502530574798584, + "learning_rate": 3.479020225971491e-05, + "loss": 0.535, + "step": 18630 + }, + { + "epoch": 1.8670306004908097, + "grad_norm": 1.860781192779541, + "learning_rate": 3.4775694692523306e-05, + "loss": 0.4861, + "step": 18640 + }, + { + "epoch": 1.8680322532178093, + "grad_norm": 2.397423505783081, + "learning_rate": 3.476118323810459e-05, + "loss": 0.5373, + "step": 18650 + }, + { + "epoch": 1.869033905944809, + "grad_norm": 2.4832894802093506, + "learning_rate": 3.474666790222914e-05, + "loss": 0.5306, + "step": 18660 + }, + { + "epoch": 1.8700355586718085, + "grad_norm": 2.5092248916625977, + "learning_rate": 3.4732148690668866e-05, + "loss": 0.5647, + "step": 18670 + }, + { + "epoch": 1.871037211398808, + "grad_norm": 2.138489007949829, + "learning_rate": 3.471762560919719e-05, + "loss": 0.5775, + "step": 18680 + }, + { + "epoch": 1.8720388641258077, + "grad_norm": 2.200510263442993, + "learning_rate": 3.470309866358914e-05, + "loss": 0.5667, + "step": 18690 + }, + { + "epoch": 1.8730405168528073, + "grad_norm": 2.08661150932312, + "learning_rate": 3.46885678596212e-05, + "loss": 0.5655, + "step": 18700 + }, + { + "epoch": 1.8740421695798066, + "grad_norm": 2.4580740928649902, + "learning_rate": 3.4674033203071464e-05, + "loss": 0.522, + "step": 18710 + }, + { + "epoch": 1.8750438223068062, + "grad_norm": 2.8262064456939697, + "learning_rate": 3.46594946997195e-05, + "loss": 0.5338, + "step": 18720 + }, + { + "epoch": 1.8760454750338058, + "grad_norm": 2.0510544776916504, + "learning_rate": 3.4644952355346435e-05, + "loss": 0.588, + "step": 18730 + }, + { + "epoch": 1.8770471277608052, + "grad_norm": 2.456479549407959, + "learning_rate": 3.463040617573491e-05, + "loss": 0.6353, + "step": 18740 + }, + { + "epoch": 1.8780487804878048, + "grad_norm": 2.268458127975464, + "learning_rate": 3.461585616666911e-05, + "loss": 0.5018, + "step": 18750 + }, + { + "epoch": 1.8790504332148044, + "grad_norm": 2.170693874359131, + "learning_rate": 3.460130233393472e-05, + "loss": 0.5805, + "step": 18760 + }, + { + "epoch": 1.880052085941804, + "grad_norm": 2.764228582382202, + "learning_rate": 3.458674468331896e-05, + "loss": 0.5206, + "step": 18770 + }, + { + "epoch": 1.8810537386688035, + "grad_norm": 1.9646960496902466, + "learning_rate": 3.457218322061056e-05, + "loss": 0.4927, + "step": 18780 + }, + { + "epoch": 1.8820553913958031, + "grad_norm": 2.303696870803833, + "learning_rate": 3.455761795159978e-05, + "loss": 0.5437, + "step": 18790 + }, + { + "epoch": 1.8830570441228027, + "grad_norm": 2.2272274494171143, + "learning_rate": 3.454304888207837e-05, + "loss": 0.6098, + "step": 18800 + }, + { + "epoch": 1.8840586968498023, + "grad_norm": 2.036489248275757, + "learning_rate": 3.452847601783959e-05, + "loss": 0.5595, + "step": 18810 + }, + { + "epoch": 1.8850603495768017, + "grad_norm": 2.238173007965088, + "learning_rate": 3.451389936467827e-05, + "loss": 0.5055, + "step": 18820 + }, + { + "epoch": 1.8860620023038013, + "grad_norm": 2.1000025272369385, + "learning_rate": 3.4499318928390665e-05, + "loss": 0.5292, + "step": 18830 + }, + { + "epoch": 1.8870636550308009, + "grad_norm": 2.115056037902832, + "learning_rate": 3.448473471477457e-05, + "loss": 0.5193, + "step": 18840 + }, + { + "epoch": 1.8880653077578002, + "grad_norm": 2.3937647342681885, + "learning_rate": 3.44701467296293e-05, + "loss": 0.5607, + "step": 18850 + }, + { + "epoch": 1.8890669604847998, + "grad_norm": 2.403609037399292, + "learning_rate": 3.4455554978755634e-05, + "loss": 0.4612, + "step": 18860 + }, + { + "epoch": 1.8900686132117994, + "grad_norm": 2.042008399963379, + "learning_rate": 3.444095946795587e-05, + "loss": 0.5366, + "step": 18870 + }, + { + "epoch": 1.891070265938799, + "grad_norm": 2.9627082347869873, + "learning_rate": 3.44263602030338e-05, + "loss": 0.5807, + "step": 18880 + }, + { + "epoch": 1.8920719186657986, + "grad_norm": 1.7297526597976685, + "learning_rate": 3.4411757189794703e-05, + "loss": 0.4538, + "step": 18890 + }, + { + "epoch": 1.8930735713927982, + "grad_norm": 2.134171724319458, + "learning_rate": 3.439715043404535e-05, + "loss": 0.5695, + "step": 18900 + }, + { + "epoch": 1.8940752241197978, + "grad_norm": 3.0094010829925537, + "learning_rate": 3.4382539941594e-05, + "loss": 0.4792, + "step": 18910 + }, + { + "epoch": 1.8950768768467974, + "grad_norm": 1.9440160989761353, + "learning_rate": 3.4367925718250405e-05, + "loss": 0.5362, + "step": 18920 + }, + { + "epoch": 1.8960785295737967, + "grad_norm": 2.718689203262329, + "learning_rate": 3.4353307769825794e-05, + "loss": 0.4992, + "step": 18930 + }, + { + "epoch": 1.8970801823007963, + "grad_norm": 2.141523599624634, + "learning_rate": 3.433868610213286e-05, + "loss": 0.5421, + "step": 18940 + }, + { + "epoch": 1.8980818350277957, + "grad_norm": 2.2745018005371094, + "learning_rate": 3.4324060720985815e-05, + "loss": 0.5296, + "step": 18950 + }, + { + "epoch": 1.8990834877547953, + "grad_norm": 2.313880443572998, + "learning_rate": 3.4309431632200325e-05, + "loss": 0.5812, + "step": 18960 + }, + { + "epoch": 1.9000851404817949, + "grad_norm": 2.571535110473633, + "learning_rate": 3.429479884159351e-05, + "loss": 0.5244, + "step": 18970 + }, + { + "epoch": 1.9010867932087945, + "grad_norm": 2.148982286453247, + "learning_rate": 3.4280162354984e-05, + "loss": 0.4673, + "step": 18980 + }, + { + "epoch": 1.902088445935794, + "grad_norm": 3.1783385276794434, + "learning_rate": 3.426552217819187e-05, + "loss": 0.4956, + "step": 18990 + }, + { + "epoch": 1.9030900986627937, + "grad_norm": 2.790367841720581, + "learning_rate": 3.425087831703868e-05, + "loss": 0.5036, + "step": 19000 + }, + { + "epoch": 1.9040917513897933, + "grad_norm": 2.6144115924835205, + "learning_rate": 3.423623077734743e-05, + "loss": 0.5183, + "step": 19010 + }, + { + "epoch": 1.9050934041167928, + "grad_norm": 1.9311896562576294, + "learning_rate": 3.4221579564942604e-05, + "loss": 0.5641, + "step": 19020 + }, + { + "epoch": 1.9060950568437922, + "grad_norm": 2.1071672439575195, + "learning_rate": 3.4206924685650143e-05, + "loss": 0.5145, + "step": 19030 + }, + { + "epoch": 1.9070967095707918, + "grad_norm": 2.4376401901245117, + "learning_rate": 3.419226614529744e-05, + "loss": 0.4721, + "step": 19040 + }, + { + "epoch": 1.9080983622977914, + "grad_norm": 2.126660108566284, + "learning_rate": 3.417760394971335e-05, + "loss": 0.4731, + "step": 19050 + }, + { + "epoch": 1.9091000150247908, + "grad_norm": 2.3037705421447754, + "learning_rate": 3.4162938104728165e-05, + "loss": 0.5206, + "step": 19060 + }, + { + "epoch": 1.9101016677517904, + "grad_norm": 2.035919427871704, + "learning_rate": 3.4148268616173655e-05, + "loss": 0.564, + "step": 19070 + }, + { + "epoch": 1.91110332047879, + "grad_norm": 2.29209566116333, + "learning_rate": 3.413359548988303e-05, + "loss": 0.4901, + "step": 19080 + }, + { + "epoch": 1.9121049732057895, + "grad_norm": 3.5236032009124756, + "learning_rate": 3.4118918731690925e-05, + "loss": 0.564, + "step": 19090 + }, + { + "epoch": 1.9131066259327891, + "grad_norm": 3.52905011177063, + "learning_rate": 3.410423834743345e-05, + "loss": 0.6002, + "step": 19100 + }, + { + "epoch": 1.9141082786597887, + "grad_norm": 2.6410086154937744, + "learning_rate": 3.408955434294813e-05, + "loss": 0.5538, + "step": 19110 + }, + { + "epoch": 1.9151099313867883, + "grad_norm": 2.0407655239105225, + "learning_rate": 3.407486672407395e-05, + "loss": 0.5388, + "step": 19120 + }, + { + "epoch": 1.916111584113788, + "grad_norm": 2.2420051097869873, + "learning_rate": 3.406017549665134e-05, + "loss": 0.567, + "step": 19130 + }, + { + "epoch": 1.9171132368407873, + "grad_norm": 2.550873041152954, + "learning_rate": 3.404548066652211e-05, + "loss": 0.5538, + "step": 19140 + }, + { + "epoch": 1.9181148895677869, + "grad_norm": 2.334690570831299, + "learning_rate": 3.403078223952959e-05, + "loss": 0.5259, + "step": 19150 + }, + { + "epoch": 1.9191165422947862, + "grad_norm": 1.7644654512405396, + "learning_rate": 3.4016080221518455e-05, + "loss": 0.5255, + "step": 19160 + }, + { + "epoch": 1.9201181950217858, + "grad_norm": 2.645530939102173, + "learning_rate": 3.4001374618334856e-05, + "loss": 0.5159, + "step": 19170 + }, + { + "epoch": 1.9211198477487854, + "grad_norm": 2.168233871459961, + "learning_rate": 3.398666543582637e-05, + "loss": 0.5738, + "step": 19180 + }, + { + "epoch": 1.922121500475785, + "grad_norm": 2.2151002883911133, + "learning_rate": 3.397195267984197e-05, + "loss": 0.558, + "step": 19190 + }, + { + "epoch": 1.9231231532027846, + "grad_norm": 1.8769047260284424, + "learning_rate": 3.395723635623208e-05, + "loss": 0.5495, + "step": 19200 + }, + { + "epoch": 1.9241248059297842, + "grad_norm": 2.8317298889160156, + "learning_rate": 3.394251647084852e-05, + "loss": 0.5709, + "step": 19210 + }, + { + "epoch": 1.9251264586567838, + "grad_norm": 2.330803871154785, + "learning_rate": 3.392779302954454e-05, + "loss": 0.5585, + "step": 19220 + }, + { + "epoch": 1.9261281113837834, + "grad_norm": 2.302245616912842, + "learning_rate": 3.39130660381748e-05, + "loss": 0.5349, + "step": 19230 + }, + { + "epoch": 1.9271297641107827, + "grad_norm": 2.2176613807678223, + "learning_rate": 3.389833550259536e-05, + "loss": 0.47, + "step": 19240 + }, + { + "epoch": 1.9281314168377823, + "grad_norm": 1.9405122995376587, + "learning_rate": 3.388360142866371e-05, + "loss": 0.5174, + "step": 19250 + }, + { + "epoch": 1.929133069564782, + "grad_norm": 2.862607479095459, + "learning_rate": 3.386886382223874e-05, + "loss": 0.5595, + "step": 19260 + }, + { + "epoch": 1.9301347222917813, + "grad_norm": 1.8453691005706787, + "learning_rate": 3.385412268918073e-05, + "loss": 0.5051, + "step": 19270 + }, + { + "epoch": 1.9311363750187809, + "grad_norm": 2.0955848693847656, + "learning_rate": 3.383937803535139e-05, + "loss": 0.5681, + "step": 19280 + }, + { + "epoch": 1.9321380277457805, + "grad_norm": 2.8637776374816895, + "learning_rate": 3.3824629866613795e-05, + "loss": 0.5354, + "step": 19290 + }, + { + "epoch": 1.93313968047278, + "grad_norm": 1.8948688507080078, + "learning_rate": 3.380987818883245e-05, + "loss": 0.5965, + "step": 19300 + }, + { + "epoch": 1.9341413331997797, + "grad_norm": 2.108222246170044, + "learning_rate": 3.379512300787324e-05, + "loss": 0.5391, + "step": 19310 + }, + { + "epoch": 1.9351429859267792, + "grad_norm": 2.2994465827941895, + "learning_rate": 3.3780364329603445e-05, + "loss": 0.532, + "step": 19320 + }, + { + "epoch": 1.9361446386537788, + "grad_norm": 2.433121681213379, + "learning_rate": 3.376560215989174e-05, + "loss": 0.5057, + "step": 19330 + }, + { + "epoch": 1.9371462913807784, + "grad_norm": 2.2182536125183105, + "learning_rate": 3.3750836504608176e-05, + "loss": 0.5601, + "step": 19340 + }, + { + "epoch": 1.9381479441077778, + "grad_norm": 3.7288570404052734, + "learning_rate": 3.373606736962419e-05, + "loss": 0.587, + "step": 19350 + }, + { + "epoch": 1.9391495968347774, + "grad_norm": 2.0662009716033936, + "learning_rate": 3.372129476081264e-05, + "loss": 0.5102, + "step": 19360 + }, + { + "epoch": 1.940151249561777, + "grad_norm": 2.209134817123413, + "learning_rate": 3.37065186840477e-05, + "loss": 0.5374, + "step": 19370 + }, + { + "epoch": 1.9411529022887763, + "grad_norm": 1.951745867729187, + "learning_rate": 3.369173914520499e-05, + "loss": 0.5269, + "step": 19380 + }, + { + "epoch": 1.942154555015776, + "grad_norm": 2.849954605102539, + "learning_rate": 3.367695615016146e-05, + "loss": 0.5075, + "step": 19390 + }, + { + "epoch": 1.9431562077427755, + "grad_norm": 2.189919948577881, + "learning_rate": 3.3662169704795454e-05, + "loss": 0.5444, + "step": 19400 + }, + { + "epoch": 1.9441578604697751, + "grad_norm": 2.3285562992095947, + "learning_rate": 3.364737981498668e-05, + "loss": 0.527, + "step": 19410 + }, + { + "epoch": 1.9451595131967747, + "grad_norm": 2.5581185817718506, + "learning_rate": 3.363258648661623e-05, + "loss": 0.5201, + "step": 19420 + }, + { + "epoch": 1.9461611659237743, + "grad_norm": 2.3942644596099854, + "learning_rate": 3.361778972556655e-05, + "loss": 0.5137, + "step": 19430 + }, + { + "epoch": 1.947162818650774, + "grad_norm": 2.7398712635040283, + "learning_rate": 3.360298953772144e-05, + "loss": 0.597, + "step": 19440 + }, + { + "epoch": 1.9481644713777735, + "grad_norm": 2.240510940551758, + "learning_rate": 3.358818592896609e-05, + "loss": 0.5167, + "step": 19450 + }, + { + "epoch": 1.9491661241047729, + "grad_norm": 2.2413768768310547, + "learning_rate": 3.357337890518704e-05, + "loss": 0.4973, + "step": 19460 + }, + { + "epoch": 1.9501677768317724, + "grad_norm": 2.2250759601593018, + "learning_rate": 3.355856847227217e-05, + "loss": 0.5633, + "step": 19470 + }, + { + "epoch": 1.9511694295587718, + "grad_norm": 2.3619704246520996, + "learning_rate": 3.3543754636110755e-05, + "loss": 0.5079, + "step": 19480 + }, + { + "epoch": 1.9521710822857714, + "grad_norm": 1.8453508615493774, + "learning_rate": 3.3528937402593375e-05, + "loss": 0.535, + "step": 19490 + }, + { + "epoch": 1.953172735012771, + "grad_norm": 1.8270654678344727, + "learning_rate": 3.351411677761199e-05, + "loss": 0.4869, + "step": 19500 + }, + { + "epoch": 1.9541743877397706, + "grad_norm": 1.9440797567367554, + "learning_rate": 3.349929276705992e-05, + "loss": 0.5365, + "step": 19510 + }, + { + "epoch": 1.9551760404667702, + "grad_norm": 2.2624425888061523, + "learning_rate": 3.3484465376831784e-05, + "loss": 0.5449, + "step": 19520 + }, + { + "epoch": 1.9561776931937698, + "grad_norm": 2.4637839794158936, + "learning_rate": 3.3469634612823616e-05, + "loss": 0.5565, + "step": 19530 + }, + { + "epoch": 1.9571793459207694, + "grad_norm": 2.3287465572357178, + "learning_rate": 3.345480048093272e-05, + "loss": 0.5509, + "step": 19540 + }, + { + "epoch": 1.958180998647769, + "grad_norm": 1.6043740510940552, + "learning_rate": 3.3439962987057774e-05, + "loss": 0.5079, + "step": 19550 + }, + { + "epoch": 1.9591826513747683, + "grad_norm": 2.930209159851074, + "learning_rate": 3.3425122137098794e-05, + "loss": 0.5571, + "step": 19560 + }, + { + "epoch": 1.960184304101768, + "grad_norm": 2.760525941848755, + "learning_rate": 3.341027793695713e-05, + "loss": 0.5042, + "step": 19570 + }, + { + "epoch": 1.9611859568287675, + "grad_norm": 2.3335533142089844, + "learning_rate": 3.3395430392535455e-05, + "loss": 0.549, + "step": 19580 + }, + { + "epoch": 1.9621876095557669, + "grad_norm": 2.1368236541748047, + "learning_rate": 3.338057950973778e-05, + "loss": 0.4718, + "step": 19590 + }, + { + "epoch": 1.9631892622827665, + "grad_norm": 1.9596712589263916, + "learning_rate": 3.336572529446944e-05, + "loss": 0.5775, + "step": 19600 + }, + { + "epoch": 1.964190915009766, + "grad_norm": 2.1493420600891113, + "learning_rate": 3.335086775263709e-05, + "loss": 0.5564, + "step": 19610 + }, + { + "epoch": 1.9651925677367656, + "grad_norm": 2.2132809162139893, + "learning_rate": 3.333600689014872e-05, + "loss": 0.5046, + "step": 19620 + }, + { + "epoch": 1.9661942204637652, + "grad_norm": 2.2998745441436768, + "learning_rate": 3.3321142712913625e-05, + "loss": 0.4809, + "step": 19630 + }, + { + "epoch": 1.9671958731907648, + "grad_norm": 1.8581064939498901, + "learning_rate": 3.330627522684244e-05, + "loss": 0.5627, + "step": 19640 + }, + { + "epoch": 1.9681975259177644, + "grad_norm": 2.277087926864624, + "learning_rate": 3.329140443784709e-05, + "loss": 0.5622, + "step": 19650 + }, + { + "epoch": 1.969199178644764, + "grad_norm": 2.815183162689209, + "learning_rate": 3.3276530351840825e-05, + "loss": 0.4914, + "step": 19660 + }, + { + "epoch": 1.9702008313717634, + "grad_norm": 3.2514102458953857, + "learning_rate": 3.326165297473821e-05, + "loss": 0.5576, + "step": 19670 + }, + { + "epoch": 1.971202484098763, + "grad_norm": 1.6029939651489258, + "learning_rate": 3.324677231245512e-05, + "loss": 0.5235, + "step": 19680 + }, + { + "epoch": 1.9722041368257623, + "grad_norm": 1.8514877557754517, + "learning_rate": 3.323188837090874e-05, + "loss": 0.5727, + "step": 19690 + }, + { + "epoch": 1.973205789552762, + "grad_norm": 2.3635377883911133, + "learning_rate": 3.3217001156017526e-05, + "loss": 0.4909, + "step": 19700 + }, + { + "epoch": 1.9742074422797615, + "grad_norm": 2.3811750411987305, + "learning_rate": 3.320211067370128e-05, + "loss": 0.5762, + "step": 19710 + }, + { + "epoch": 1.9752090950067611, + "grad_norm": 2.026413679122925, + "learning_rate": 3.318721692988108e-05, + "loss": 0.4796, + "step": 19720 + }, + { + "epoch": 1.9762107477337607, + "grad_norm": 2.4324848651885986, + "learning_rate": 3.317231993047929e-05, + "loss": 0.4812, + "step": 19730 + }, + { + "epoch": 1.9772124004607603, + "grad_norm": 3.1195809841156006, + "learning_rate": 3.3157419681419616e-05, + "loss": 0.5272, + "step": 19740 + }, + { + "epoch": 1.97821405318776, + "grad_norm": 1.83283269405365, + "learning_rate": 3.314251618862699e-05, + "loss": 0.5282, + "step": 19750 + }, + { + "epoch": 1.9792157059147595, + "grad_norm": 1.9941926002502441, + "learning_rate": 3.3127609458027675e-05, + "loss": 0.4591, + "step": 19760 + }, + { + "epoch": 1.9802173586417589, + "grad_norm": 2.339540958404541, + "learning_rate": 3.311269949554923e-05, + "loss": 0.4986, + "step": 19770 + }, + { + "epoch": 1.9812190113687584, + "grad_norm": 1.8773863315582275, + "learning_rate": 3.309778630712047e-05, + "loss": 0.6094, + "step": 19780 + }, + { + "epoch": 1.982220664095758, + "grad_norm": 2.326773166656494, + "learning_rate": 3.30828698986715e-05, + "loss": 0.4437, + "step": 19790 + }, + { + "epoch": 1.9832223168227574, + "grad_norm": 2.306333541870117, + "learning_rate": 3.3067950276133716e-05, + "loss": 0.4891, + "step": 19800 + }, + { + "epoch": 1.984223969549757, + "grad_norm": 2.0173776149749756, + "learning_rate": 3.30530274454398e-05, + "loss": 0.5067, + "step": 19810 + }, + { + "epoch": 1.9852256222767566, + "grad_norm": 2.5315053462982178, + "learning_rate": 3.303810141252368e-05, + "loss": 0.4984, + "step": 19820 + }, + { + "epoch": 1.9862272750037562, + "grad_norm": 2.567328453063965, + "learning_rate": 3.302317218332058e-05, + "loss": 0.6134, + "step": 19830 + }, + { + "epoch": 1.9872289277307558, + "grad_norm": 2.731470823287964, + "learning_rate": 3.300823976376699e-05, + "loss": 0.4655, + "step": 19840 + }, + { + "epoch": 1.9882305804577554, + "grad_norm": 1.868723750114441, + "learning_rate": 3.2993304159800666e-05, + "loss": 0.5736, + "step": 19850 + }, + { + "epoch": 1.989232233184755, + "grad_norm": 2.778196334838867, + "learning_rate": 3.2978365377360625e-05, + "loss": 0.5701, + "step": 19860 + }, + { + "epoch": 1.9902338859117545, + "grad_norm": 2.3369123935699463, + "learning_rate": 3.2963423422387175e-05, + "loss": 0.5677, + "step": 19870 + }, + { + "epoch": 1.991235538638754, + "grad_norm": 2.2308642864227295, + "learning_rate": 3.294847830082184e-05, + "loss": 0.5402, + "step": 19880 + }, + { + "epoch": 1.9922371913657535, + "grad_norm": 3.2330994606018066, + "learning_rate": 3.293353001860745e-05, + "loss": 0.5364, + "step": 19890 + }, + { + "epoch": 1.993238844092753, + "grad_norm": 2.73323392868042, + "learning_rate": 3.291857858168805e-05, + "loss": 0.5763, + "step": 19900 + }, + { + "epoch": 1.9942404968197525, + "grad_norm": 1.7553918361663818, + "learning_rate": 3.2903623996008984e-05, + "loss": 0.5238, + "step": 19910 + }, + { + "epoch": 1.995242149546752, + "grad_norm": 1.860754370689392, + "learning_rate": 3.2888666267516806e-05, + "loss": 0.5092, + "step": 19920 + }, + { + "epoch": 1.9962438022737516, + "grad_norm": 2.701542377471924, + "learning_rate": 3.287370540215934e-05, + "loss": 0.4812, + "step": 19930 + }, + { + "epoch": 1.9972454550007512, + "grad_norm": 2.274592638015747, + "learning_rate": 3.285874140588566e-05, + "loss": 0.5499, + "step": 19940 + }, + { + "epoch": 1.9982471077277508, + "grad_norm": 2.010769844055176, + "learning_rate": 3.2843774284646074e-05, + "loss": 0.4987, + "step": 19950 + }, + { + "epoch": 1.9992487604547504, + "grad_norm": 1.8435958623886108, + "learning_rate": 3.282880404439214e-05, + "loss": 0.5644, + "step": 19960 + }, + { + "epoch": 2.0002003305454, + "grad_norm": 2.158094644546509, + "learning_rate": 3.281383069107666e-05, + "loss": 0.5283, + "step": 19970 + }, + { + "epoch": 2.0012019832723995, + "grad_norm": 1.6558252573013306, + "learning_rate": 3.279885423065367e-05, + "loss": 0.4891, + "step": 19980 + }, + { + "epoch": 2.002203635999399, + "grad_norm": 2.214022159576416, + "learning_rate": 3.278387466907843e-05, + "loss": 0.5128, + "step": 19990 + }, + { + "epoch": 2.0032052887263987, + "grad_norm": 2.0052435398101807, + "learning_rate": 3.276889201230745e-05, + "loss": 0.4326, + "step": 20000 + }, + { + "epoch": 2.0032052887263987, + "eval_bleu": 0.2002503205345812, + "eval_loss": 0.5508657097816467, + "eval_rouge1": 0.6375570480064021, + "eval_rouge2": 0.4898024330131999, + "eval_rougeL": 0.6005766639745327, + "eval_runtime": 89842.9594, + "eval_samples_per_second": 0.198, + "eval_steps_per_second": 0.025, + "eval_wer": 1.7028008589955927, + "step": 20000 + }, + { + "epoch": 2.0042069414533983, + "grad_norm": 2.0601089000701904, + "learning_rate": 3.275390626629846e-05, + "loss": 0.453, + "step": 20010 + }, + { + "epoch": 2.005208594180398, + "grad_norm": 2.3491313457489014, + "learning_rate": 3.273891743701043e-05, + "loss": 0.4925, + "step": 20020 + }, + { + "epoch": 2.006210246907397, + "grad_norm": 2.14555025100708, + "learning_rate": 3.272392553040354e-05, + "loss": 0.5125, + "step": 20030 + }, + { + "epoch": 2.0072118996343966, + "grad_norm": 2.575279712677002, + "learning_rate": 3.270893055243921e-05, + "loss": 0.5094, + "step": 20040 + }, + { + "epoch": 2.008213552361396, + "grad_norm": 2.2042462825775146, + "learning_rate": 3.2693932509080066e-05, + "loss": 0.4826, + "step": 20050 + }, + { + "epoch": 2.009215205088396, + "grad_norm": 1.9596309661865234, + "learning_rate": 3.267893140628996e-05, + "loss": 0.5047, + "step": 20060 + }, + { + "epoch": 2.0102168578153954, + "grad_norm": 1.733544111251831, + "learning_rate": 3.266392725003398e-05, + "loss": 0.539, + "step": 20070 + }, + { + "epoch": 2.011218510542395, + "grad_norm": 2.1472814083099365, + "learning_rate": 3.264892004627838e-05, + "loss": 0.5984, + "step": 20080 + }, + { + "epoch": 2.0122201632693946, + "grad_norm": 2.2403430938720703, + "learning_rate": 3.2633909800990666e-05, + "loss": 0.5175, + "step": 20090 + }, + { + "epoch": 2.013221815996394, + "grad_norm": 1.9223722219467163, + "learning_rate": 3.261889652013955e-05, + "loss": 0.4314, + "step": 20100 + }, + { + "epoch": 2.0142234687233938, + "grad_norm": 2.1457340717315674, + "learning_rate": 3.2603880209694946e-05, + "loss": 0.4796, + "step": 20110 + }, + { + "epoch": 2.0152251214503933, + "grad_norm": 2.3787331581115723, + "learning_rate": 3.258886087562796e-05, + "loss": 0.4678, + "step": 20120 + }, + { + "epoch": 2.0162267741773925, + "grad_norm": 2.1419434547424316, + "learning_rate": 3.2573838523910905e-05, + "loss": 0.5041, + "step": 20130 + }, + { + "epoch": 2.017228426904392, + "grad_norm": 2.3345863819122314, + "learning_rate": 3.255881316051732e-05, + "loss": 0.5159, + "step": 20140 + }, + { + "epoch": 2.0182300796313917, + "grad_norm": 2.441622734069824, + "learning_rate": 3.25437847914219e-05, + "loss": 0.4615, + "step": 20150 + }, + { + "epoch": 2.0192317323583913, + "grad_norm": 1.5690834522247314, + "learning_rate": 3.2528753422600564e-05, + "loss": 0.4809, + "step": 20160 + }, + { + "epoch": 2.020233385085391, + "grad_norm": 1.9578380584716797, + "learning_rate": 3.251371906003042e-05, + "loss": 0.5748, + "step": 20170 + }, + { + "epoch": 2.0212350378123904, + "grad_norm": 1.7934218645095825, + "learning_rate": 3.249868170968977e-05, + "loss": 0.4488, + "step": 20180 + }, + { + "epoch": 2.02223669053939, + "grad_norm": 2.1038122177124023, + "learning_rate": 3.248364137755808e-05, + "loss": 0.5006, + "step": 20190 + }, + { + "epoch": 2.0232383432663896, + "grad_norm": 1.5591011047363281, + "learning_rate": 3.2468598069616016e-05, + "loss": 0.4804, + "step": 20200 + }, + { + "epoch": 2.024239995993389, + "grad_norm": 2.1266472339630127, + "learning_rate": 3.245355179184545e-05, + "loss": 0.4946, + "step": 20210 + }, + { + "epoch": 2.025241648720389, + "grad_norm": 2.56298565864563, + "learning_rate": 3.24385025502294e-05, + "loss": 0.4698, + "step": 20220 + }, + { + "epoch": 2.0262433014473884, + "grad_norm": 1.669932246208191, + "learning_rate": 3.242345035075208e-05, + "loss": 0.4983, + "step": 20230 + }, + { + "epoch": 2.0272449541743875, + "grad_norm": 2.1388626098632812, + "learning_rate": 3.240839519939887e-05, + "loss": 0.491, + "step": 20240 + }, + { + "epoch": 2.028246606901387, + "grad_norm": 2.082549810409546, + "learning_rate": 3.239333710215635e-05, + "loss": 0.4602, + "step": 20250 + }, + { + "epoch": 2.0292482596283867, + "grad_norm": 3.176657199859619, + "learning_rate": 3.237827606501223e-05, + "loss": 0.5682, + "step": 20260 + }, + { + "epoch": 2.0302499123553863, + "grad_norm": 2.1459405422210693, + "learning_rate": 3.236321209395543e-05, + "loss": 0.4811, + "step": 20270 + }, + { + "epoch": 2.031251565082386, + "grad_norm": 2.0369656085968018, + "learning_rate": 3.2348145194976e-05, + "loss": 0.507, + "step": 20280 + }, + { + "epoch": 2.0322532178093855, + "grad_norm": 2.6629416942596436, + "learning_rate": 3.233307537406519e-05, + "loss": 0.4818, + "step": 20290 + }, + { + "epoch": 2.033254870536385, + "grad_norm": 2.0255343914031982, + "learning_rate": 3.231800263721538e-05, + "loss": 0.5273, + "step": 20300 + }, + { + "epoch": 2.0342565232633847, + "grad_norm": 1.8463060855865479, + "learning_rate": 3.230292699042014e-05, + "loss": 0.4931, + "step": 20310 + }, + { + "epoch": 2.0352581759903843, + "grad_norm": 2.6563429832458496, + "learning_rate": 3.228784843967416e-05, + "loss": 0.5148, + "step": 20320 + }, + { + "epoch": 2.036259828717384, + "grad_norm": 2.2401204109191895, + "learning_rate": 3.2272766990973316e-05, + "loss": 0.4384, + "step": 20330 + }, + { + "epoch": 2.037261481444383, + "grad_norm": 2.7477822303771973, + "learning_rate": 3.225768265031463e-05, + "loss": 0.5228, + "step": 20340 + }, + { + "epoch": 2.0382631341713826, + "grad_norm": 2.590658187866211, + "learning_rate": 3.2242595423696277e-05, + "loss": 0.4807, + "step": 20350 + }, + { + "epoch": 2.039264786898382, + "grad_norm": 2.0068235397338867, + "learning_rate": 3.222750531711755e-05, + "loss": 0.5156, + "step": 20360 + }, + { + "epoch": 2.040266439625382, + "grad_norm": 2.5355372428894043, + "learning_rate": 3.2212412336578926e-05, + "loss": 0.6042, + "step": 20370 + }, + { + "epoch": 2.0412680923523814, + "grad_norm": 2.3344709873199463, + "learning_rate": 3.2197316488081995e-05, + "loss": 0.5159, + "step": 20380 + }, + { + "epoch": 2.042269745079381, + "grad_norm": 2.0762157440185547, + "learning_rate": 3.218221777762952e-05, + "loss": 0.4592, + "step": 20390 + }, + { + "epoch": 2.0432713978063806, + "grad_norm": 2.093581199645996, + "learning_rate": 3.216711621122537e-05, + "loss": 0.4531, + "step": 20400 + }, + { + "epoch": 2.04427305053338, + "grad_norm": 2.9248385429382324, + "learning_rate": 3.215201179487456e-05, + "loss": 0.469, + "step": 20410 + }, + { + "epoch": 2.0452747032603797, + "grad_norm": 1.7484266757965088, + "learning_rate": 3.213690453458325e-05, + "loss": 0.4237, + "step": 20420 + }, + { + "epoch": 2.0462763559873793, + "grad_norm": 1.9926973581314087, + "learning_rate": 3.212179443635872e-05, + "loss": 0.471, + "step": 20430 + }, + { + "epoch": 2.047278008714379, + "grad_norm": 1.6716692447662354, + "learning_rate": 3.210668150620937e-05, + "loss": 0.4656, + "step": 20440 + }, + { + "epoch": 2.048279661441378, + "grad_norm": 1.3834606409072876, + "learning_rate": 3.209156575014475e-05, + "loss": 0.4969, + "step": 20450 + }, + { + "epoch": 2.0492813141683777, + "grad_norm": 1.8815782070159912, + "learning_rate": 3.207644717417551e-05, + "loss": 0.4847, + "step": 20460 + }, + { + "epoch": 2.0502829668953773, + "grad_norm": 1.73691987991333, + "learning_rate": 3.2061325784313436e-05, + "loss": 0.4458, + "step": 20470 + }, + { + "epoch": 2.051284619622377, + "grad_norm": 2.1354963779449463, + "learning_rate": 3.204620158657144e-05, + "loss": 0.4935, + "step": 20480 + }, + { + "epoch": 2.0522862723493764, + "grad_norm": 1.9221725463867188, + "learning_rate": 3.20310745869635e-05, + "loss": 0.4741, + "step": 20490 + }, + { + "epoch": 2.053287925076376, + "grad_norm": 2.065915822982788, + "learning_rate": 3.20159447915048e-05, + "loss": 0.5215, + "step": 20500 + }, + { + "epoch": 2.0542895778033756, + "grad_norm": 2.0788304805755615, + "learning_rate": 3.200081220621154e-05, + "loss": 0.5446, + "step": 20510 + }, + { + "epoch": 2.055291230530375, + "grad_norm": 2.2969303131103516, + "learning_rate": 3.1985676837101095e-05, + "loss": 0.5335, + "step": 20520 + }, + { + "epoch": 2.056292883257375, + "grad_norm": 1.6240125894546509, + "learning_rate": 3.1970538690191914e-05, + "loss": 0.5019, + "step": 20530 + }, + { + "epoch": 2.0572945359843744, + "grad_norm": 2.1084020137786865, + "learning_rate": 3.195539777150356e-05, + "loss": 0.4362, + "step": 20540 + }, + { + "epoch": 2.058296188711374, + "grad_norm": 1.850347876548767, + "learning_rate": 3.194025408705671e-05, + "loss": 0.4695, + "step": 20550 + }, + { + "epoch": 2.059297841438373, + "grad_norm": 2.2284724712371826, + "learning_rate": 3.192510764287311e-05, + "loss": 0.4978, + "step": 20560 + }, + { + "epoch": 2.0602994941653727, + "grad_norm": 2.4977900981903076, + "learning_rate": 3.1909958444975636e-05, + "loss": 0.5228, + "step": 20570 + }, + { + "epoch": 2.0613011468923723, + "grad_norm": 3.1662728786468506, + "learning_rate": 3.189480649938825e-05, + "loss": 0.4964, + "step": 20580 + }, + { + "epoch": 2.062302799619372, + "grad_norm": 1.9692362546920776, + "learning_rate": 3.1879651812135995e-05, + "loss": 0.4977, + "step": 20590 + }, + { + "epoch": 2.0633044523463715, + "grad_norm": 1.8094409704208374, + "learning_rate": 3.1864494389245016e-05, + "loss": 0.4916, + "step": 20600 + }, + { + "epoch": 2.064306105073371, + "grad_norm": 2.69331431388855, + "learning_rate": 3.184933423674254e-05, + "loss": 0.4704, + "step": 20610 + }, + { + "epoch": 2.0653077578003707, + "grad_norm": 1.9121180772781372, + "learning_rate": 3.183417136065686e-05, + "loss": 0.5054, + "step": 20620 + }, + { + "epoch": 2.0663094105273703, + "grad_norm": 1.9425513744354248, + "learning_rate": 3.1819005767017406e-05, + "loss": 0.437, + "step": 20630 + }, + { + "epoch": 2.06731106325437, + "grad_norm": 2.331561326980591, + "learning_rate": 3.180383746185464e-05, + "loss": 0.4489, + "step": 20640 + }, + { + "epoch": 2.0683127159813695, + "grad_norm": 2.1345226764678955, + "learning_rate": 3.178866645120012e-05, + "loss": 0.488, + "step": 20650 + }, + { + "epoch": 2.0693143687083686, + "grad_norm": 2.7039108276367188, + "learning_rate": 3.1773492741086474e-05, + "loss": 0.4713, + "step": 20660 + }, + { + "epoch": 2.070316021435368, + "grad_norm": 2.335880756378174, + "learning_rate": 3.1758316337547414e-05, + "loss": 0.5457, + "step": 20670 + }, + { + "epoch": 2.071317674162368, + "grad_norm": 1.7798826694488525, + "learning_rate": 3.1743137246617714e-05, + "loss": 0.5122, + "step": 20680 + }, + { + "epoch": 2.0723193268893674, + "grad_norm": 1.9185101985931396, + "learning_rate": 3.172795547433321e-05, + "loss": 0.5018, + "step": 20690 + }, + { + "epoch": 2.073320979616367, + "grad_norm": 2.214864730834961, + "learning_rate": 3.171277102673082e-05, + "loss": 0.4785, + "step": 20700 + }, + { + "epoch": 2.0743226323433666, + "grad_norm": 2.6873626708984375, + "learning_rate": 3.169758390984852e-05, + "loss": 0.4523, + "step": 20710 + }, + { + "epoch": 2.075324285070366, + "grad_norm": 1.6275378465652466, + "learning_rate": 3.168239412972534e-05, + "loss": 0.5009, + "step": 20720 + }, + { + "epoch": 2.0763259377973657, + "grad_norm": 1.7685691118240356, + "learning_rate": 3.166720169240138e-05, + "loss": 0.4612, + "step": 20730 + }, + { + "epoch": 2.0773275905243653, + "grad_norm": 2.1459972858428955, + "learning_rate": 3.165200660391779e-05, + "loss": 0.441, + "step": 20740 + }, + { + "epoch": 2.078329243251365, + "grad_norm": 2.075395107269287, + "learning_rate": 3.1636808870316775e-05, + "loss": 0.4499, + "step": 20750 + }, + { + "epoch": 2.0793308959783645, + "grad_norm": 2.5253570079803467, + "learning_rate": 3.1621608497641605e-05, + "loss": 0.5045, + "step": 20760 + }, + { + "epoch": 2.0803325487053637, + "grad_norm": 2.685187816619873, + "learning_rate": 3.1606405491936566e-05, + "loss": 0.5111, + "step": 20770 + }, + { + "epoch": 2.0813342014323633, + "grad_norm": 2.0310208797454834, + "learning_rate": 3.159119985924702e-05, + "loss": 0.4889, + "step": 20780 + }, + { + "epoch": 2.082335854159363, + "grad_norm": 2.0942888259887695, + "learning_rate": 3.157599160561937e-05, + "loss": 0.4823, + "step": 20790 + }, + { + "epoch": 2.0833375068863624, + "grad_norm": 2.0800859928131104, + "learning_rate": 3.156078073710105e-05, + "loss": 0.4514, + "step": 20800 + }, + { + "epoch": 2.084339159613362, + "grad_norm": 2.227480411529541, + "learning_rate": 3.1545567259740554e-05, + "loss": 0.4826, + "step": 20810 + }, + { + "epoch": 2.0853408123403616, + "grad_norm": 2.061187744140625, + "learning_rate": 3.153035117958739e-05, + "loss": 0.4673, + "step": 20820 + }, + { + "epoch": 2.086342465067361, + "grad_norm": 1.741713047027588, + "learning_rate": 3.15151325026921e-05, + "loss": 0.4825, + "step": 20830 + }, + { + "epoch": 2.087344117794361, + "grad_norm": 4.484399795532227, + "learning_rate": 3.14999112351063e-05, + "loss": 0.5185, + "step": 20840 + }, + { + "epoch": 2.0883457705213604, + "grad_norm": 1.8805603981018066, + "learning_rate": 3.148468738288258e-05, + "loss": 0.4914, + "step": 20850 + }, + { + "epoch": 2.08934742324836, + "grad_norm": 2.6285297870635986, + "learning_rate": 3.146946095207459e-05, + "loss": 0.4523, + "step": 20860 + }, + { + "epoch": 2.090349075975359, + "grad_norm": 2.502845525741577, + "learning_rate": 3.1454231948737e-05, + "loss": 0.4848, + "step": 20870 + }, + { + "epoch": 2.0913507287023587, + "grad_norm": 2.0391061305999756, + "learning_rate": 3.1439000378925496e-05, + "loss": 0.5003, + "step": 20880 + }, + { + "epoch": 2.0923523814293583, + "grad_norm": 2.1008460521698, + "learning_rate": 3.1423766248696804e-05, + "loss": 0.4715, + "step": 20890 + }, + { + "epoch": 2.093354034156358, + "grad_norm": 2.340632200241089, + "learning_rate": 3.1408529564108644e-05, + "loss": 0.5281, + "step": 20900 + }, + { + "epoch": 2.0943556868833575, + "grad_norm": 2.45475172996521, + "learning_rate": 3.139329033121977e-05, + "loss": 0.4497, + "step": 20910 + }, + { + "epoch": 2.095357339610357, + "grad_norm": 1.7948698997497559, + "learning_rate": 3.137804855608993e-05, + "loss": 0.4581, + "step": 20920 + }, + { + "epoch": 2.0963589923373567, + "grad_norm": 2.1844940185546875, + "learning_rate": 3.1362804244779906e-05, + "loss": 0.447, + "step": 20930 + }, + { + "epoch": 2.0973606450643563, + "grad_norm": 2.17244553565979, + "learning_rate": 3.134755740335147e-05, + "loss": 0.5166, + "step": 20940 + }, + { + "epoch": 2.098362297791356, + "grad_norm": 2.8408572673797607, + "learning_rate": 3.133230803786741e-05, + "loss": 0.4978, + "step": 20950 + }, + { + "epoch": 2.0993639505183554, + "grad_norm": 2.3953614234924316, + "learning_rate": 3.131705615439152e-05, + "loss": 0.4383, + "step": 20960 + }, + { + "epoch": 2.100365603245355, + "grad_norm": 1.9969464540481567, + "learning_rate": 3.130180175898857e-05, + "loss": 0.4647, + "step": 20970 + }, + { + "epoch": 2.101367255972354, + "grad_norm": 1.826015591621399, + "learning_rate": 3.1286544857724366e-05, + "loss": 0.4327, + "step": 20980 + }, + { + "epoch": 2.1023689086993538, + "grad_norm": 2.493840217590332, + "learning_rate": 3.1271285456665696e-05, + "loss": 0.4594, + "step": 20990 + }, + { + "epoch": 2.1033705614263534, + "grad_norm": 2.08193039894104, + "learning_rate": 3.125602356188032e-05, + "loss": 0.4489, + "step": 21000 + }, + { + "epoch": 2.104372214153353, + "grad_norm": 2.3533523082733154, + "learning_rate": 3.124075917943704e-05, + "loss": 0.4333, + "step": 21010 + }, + { + "epoch": 2.1053738668803526, + "grad_norm": 2.025791645050049, + "learning_rate": 3.122549231540558e-05, + "loss": 0.4217, + "step": 21020 + }, + { + "epoch": 2.106375519607352, + "grad_norm": 2.909153699874878, + "learning_rate": 3.121022297585672e-05, + "loss": 0.5087, + "step": 21030 + }, + { + "epoch": 2.1073771723343517, + "grad_norm": 3.3912322521209717, + "learning_rate": 3.1194951166862165e-05, + "loss": 0.4623, + "step": 21040 + }, + { + "epoch": 2.1083788250613513, + "grad_norm": 2.3475584983825684, + "learning_rate": 3.117967689449464e-05, + "loss": 0.5099, + "step": 21050 + }, + { + "epoch": 2.109380477788351, + "grad_norm": 2.320850133895874, + "learning_rate": 3.116440016482784e-05, + "loss": 0.4373, + "step": 21060 + }, + { + "epoch": 2.1103821305153505, + "grad_norm": 1.9819415807724, + "learning_rate": 3.1149120983936434e-05, + "loss": 0.469, + "step": 21070 + }, + { + "epoch": 2.11138378324235, + "grad_norm": 1.4946792125701904, + "learning_rate": 3.1133839357896055e-05, + "loss": 0.4708, + "step": 21080 + }, + { + "epoch": 2.1123854359693492, + "grad_norm": 2.409613847732544, + "learning_rate": 3.1118555292783336e-05, + "loss": 0.513, + "step": 21090 + }, + { + "epoch": 2.113387088696349, + "grad_norm": 2.309664011001587, + "learning_rate": 3.110326879467585e-05, + "loss": 0.521, + "step": 21100 + }, + { + "epoch": 2.1143887414233484, + "grad_norm": 1.7967244386672974, + "learning_rate": 3.108797986965217e-05, + "loss": 0.4797, + "step": 21110 + }, + { + "epoch": 2.115390394150348, + "grad_norm": 2.344731330871582, + "learning_rate": 3.107268852379179e-05, + "loss": 0.5341, + "step": 21120 + }, + { + "epoch": 2.1163920468773476, + "grad_norm": 1.8803640604019165, + "learning_rate": 3.105739476317521e-05, + "loss": 0.4447, + "step": 21130 + }, + { + "epoch": 2.117393699604347, + "grad_norm": 2.1133954524993896, + "learning_rate": 3.104209859388387e-05, + "loss": 0.5363, + "step": 21140 + }, + { + "epoch": 2.118395352331347, + "grad_norm": 2.5146517753601074, + "learning_rate": 3.102680002200017e-05, + "loss": 0.5577, + "step": 21150 + }, + { + "epoch": 2.1193970050583464, + "grad_norm": 1.94539213180542, + "learning_rate": 3.1011499053607445e-05, + "loss": 0.4741, + "step": 21160 + }, + { + "epoch": 2.120398657785346, + "grad_norm": 2.0953662395477295, + "learning_rate": 3.0996195694790037e-05, + "loss": 0.4667, + "step": 21170 + }, + { + "epoch": 2.1214003105123456, + "grad_norm": 2.344572067260742, + "learning_rate": 3.098088995163318e-05, + "loss": 0.5503, + "step": 21180 + }, + { + "epoch": 2.1224019632393447, + "grad_norm": 2.016023874282837, + "learning_rate": 3.096558183022309e-05, + "loss": 0.5102, + "step": 21190 + }, + { + "epoch": 2.1234036159663443, + "grad_norm": 1.793877363204956, + "learning_rate": 3.095027133664692e-05, + "loss": 0.4622, + "step": 21200 + }, + { + "epoch": 2.124405268693344, + "grad_norm": 1.8648747205734253, + "learning_rate": 3.093495847699276e-05, + "loss": 0.5166, + "step": 21210 + }, + { + "epoch": 2.1254069214203435, + "grad_norm": 2.274259328842163, + "learning_rate": 3.091964325734965e-05, + "loss": 0.503, + "step": 21220 + }, + { + "epoch": 2.126408574147343, + "grad_norm": 2.119114398956299, + "learning_rate": 3.090432568380757e-05, + "loss": 0.4296, + "step": 21230 + }, + { + "epoch": 2.1274102268743427, + "grad_norm": 2.1176578998565674, + "learning_rate": 3.088900576245742e-05, + "loss": 0.4331, + "step": 21240 + }, + { + "epoch": 2.1284118796013423, + "grad_norm": 2.4417192935943604, + "learning_rate": 3.087368349939106e-05, + "loss": 0.5897, + "step": 21250 + }, + { + "epoch": 2.129413532328342, + "grad_norm": 2.3847012519836426, + "learning_rate": 3.0858358900701254e-05, + "loss": 0.4446, + "step": 21260 + }, + { + "epoch": 2.1304151850553414, + "grad_norm": 2.534501791000366, + "learning_rate": 3.0843031972481706e-05, + "loss": 0.5521, + "step": 21270 + }, + { + "epoch": 2.131416837782341, + "grad_norm": 2.3288180828094482, + "learning_rate": 3.082770272082706e-05, + "loss": 0.4715, + "step": 21280 + }, + { + "epoch": 2.13241849050934, + "grad_norm": 2.3620693683624268, + "learning_rate": 3.081237115183285e-05, + "loss": 0.4759, + "step": 21290 + }, + { + "epoch": 2.1334201432363398, + "grad_norm": 1.644264578819275, + "learning_rate": 3.079703727159556e-05, + "loss": 0.4878, + "step": 21300 + }, + { + "epoch": 2.1344217959633394, + "grad_norm": 1.9679802656173706, + "learning_rate": 3.07817010862126e-05, + "loss": 0.4896, + "step": 21310 + }, + { + "epoch": 2.135423448690339, + "grad_norm": 1.9164067506790161, + "learning_rate": 3.0766362601782276e-05, + "loss": 0.4766, + "step": 21320 + }, + { + "epoch": 2.1364251014173385, + "grad_norm": 3.025406837463379, + "learning_rate": 3.075102182440379e-05, + "loss": 0.5287, + "step": 21330 + }, + { + "epoch": 2.137426754144338, + "grad_norm": 2.551084280014038, + "learning_rate": 3.073567876017732e-05, + "loss": 0.5036, + "step": 21340 + }, + { + "epoch": 2.1384284068713377, + "grad_norm": 1.9237477779388428, + "learning_rate": 3.072033341520388e-05, + "loss": 0.5097, + "step": 21350 + }, + { + "epoch": 2.1394300595983373, + "grad_norm": 1.697118878364563, + "learning_rate": 3.070498579558543e-05, + "loss": 0.4359, + "step": 21360 + }, + { + "epoch": 2.140431712325337, + "grad_norm": 1.9887810945510864, + "learning_rate": 3.0689635907424845e-05, + "loss": 0.4565, + "step": 21370 + }, + { + "epoch": 2.1414333650523365, + "grad_norm": 2.0479846000671387, + "learning_rate": 3.067428375682587e-05, + "loss": 0.4848, + "step": 21380 + }, + { + "epoch": 2.142435017779336, + "grad_norm": 1.8660268783569336, + "learning_rate": 3.0658929349893174e-05, + "loss": 0.4491, + "step": 21390 + }, + { + "epoch": 2.1434366705063352, + "grad_norm": 2.399012327194214, + "learning_rate": 3.0643572692732305e-05, + "loss": 0.5697, + "step": 21400 + }, + { + "epoch": 2.144438323233335, + "grad_norm": 2.653744697570801, + "learning_rate": 3.062821379144973e-05, + "loss": 0.5021, + "step": 21410 + }, + { + "epoch": 2.1454399759603344, + "grad_norm": 2.3516993522644043, + "learning_rate": 3.061285265215279e-05, + "loss": 0.4619, + "step": 21420 + }, + { + "epoch": 2.146441628687334, + "grad_norm": 2.4360647201538086, + "learning_rate": 3.059748928094971e-05, + "loss": 0.4099, + "step": 21430 + }, + { + "epoch": 2.1474432814143336, + "grad_norm": 1.9208985567092896, + "learning_rate": 3.058212368394962e-05, + "loss": 0.4671, + "step": 21440 + }, + { + "epoch": 2.148444934141333, + "grad_norm": 1.6549501419067383, + "learning_rate": 3.056675586726252e-05, + "loss": 0.5079, + "step": 21450 + }, + { + "epoch": 2.149446586868333, + "grad_norm": 2.041652202606201, + "learning_rate": 3.05513858369993e-05, + "loss": 0.4768, + "step": 21460 + }, + { + "epoch": 2.1504482395953324, + "grad_norm": 2.64847731590271, + "learning_rate": 3.053601359927174e-05, + "loss": 0.499, + "step": 21470 + }, + { + "epoch": 2.151449892322332, + "grad_norm": 2.6299538612365723, + "learning_rate": 3.0520639160192484e-05, + "loss": 0.4828, + "step": 21480 + }, + { + "epoch": 2.1524515450493316, + "grad_norm": 2.515505790710449, + "learning_rate": 3.0505262525875045e-05, + "loss": 0.4939, + "step": 21490 + }, + { + "epoch": 2.153453197776331, + "grad_norm": 2.789552927017212, + "learning_rate": 3.048988370243384e-05, + "loss": 0.4844, + "step": 21500 + }, + { + "epoch": 2.1544548505033303, + "grad_norm": 2.123347282409668, + "learning_rate": 3.0474502695984113e-05, + "loss": 0.5176, + "step": 21510 + }, + { + "epoch": 2.15545650323033, + "grad_norm": 2.637828826904297, + "learning_rate": 3.045911951264202e-05, + "loss": 0.4876, + "step": 21520 + }, + { + "epoch": 2.1564581559573295, + "grad_norm": 1.7636555433273315, + "learning_rate": 3.0443734158524538e-05, + "loss": 0.4821, + "step": 21530 + }, + { + "epoch": 2.157459808684329, + "grad_norm": 2.0353643894195557, + "learning_rate": 3.042834663974955e-05, + "loss": 0.4496, + "step": 21540 + }, + { + "epoch": 2.1584614614113287, + "grad_norm": 1.786335825920105, + "learning_rate": 3.0412956962435773e-05, + "loss": 0.4764, + "step": 21550 + }, + { + "epoch": 2.1594631141383283, + "grad_norm": 1.59462571144104, + "learning_rate": 3.039756513270278e-05, + "loss": 0.4608, + "step": 21560 + }, + { + "epoch": 2.160464766865328, + "grad_norm": 2.1465108394622803, + "learning_rate": 3.0382171156671028e-05, + "loss": 0.5007, + "step": 21570 + }, + { + "epoch": 2.1614664195923274, + "grad_norm": 1.9091776609420776, + "learning_rate": 3.0366775040461802e-05, + "loss": 0.5191, + "step": 21580 + }, + { + "epoch": 2.162468072319327, + "grad_norm": 2.044081449508667, + "learning_rate": 3.035137679019724e-05, + "loss": 0.4763, + "step": 21590 + }, + { + "epoch": 2.1634697250463266, + "grad_norm": 1.9604840278625488, + "learning_rate": 3.033597641200034e-05, + "loss": 0.4886, + "step": 21600 + }, + { + "epoch": 2.164471377773326, + "grad_norm": 2.2669661045074463, + "learning_rate": 3.032057391199493e-05, + "loss": 0.4859, + "step": 21610 + }, + { + "epoch": 2.1654730305003254, + "grad_norm": 2.0096001625061035, + "learning_rate": 3.0305169296305695e-05, + "loss": 0.5297, + "step": 21620 + }, + { + "epoch": 2.166474683227325, + "grad_norm": 2.0376088619232178, + "learning_rate": 3.0289762571058167e-05, + "loss": 0.47, + "step": 21630 + }, + { + "epoch": 2.1674763359543245, + "grad_norm": 2.465101718902588, + "learning_rate": 3.0274353742378697e-05, + "loss": 0.5087, + "step": 21640 + }, + { + "epoch": 2.168477988681324, + "grad_norm": 1.4533342123031616, + "learning_rate": 3.0258942816394492e-05, + "loss": 0.52, + "step": 21650 + }, + { + "epoch": 2.1694796414083237, + "grad_norm": 1.972916841506958, + "learning_rate": 3.0243529799233572e-05, + "loss": 0.4282, + "step": 21660 + }, + { + "epoch": 2.1704812941353233, + "grad_norm": 1.6645925045013428, + "learning_rate": 3.0228114697024813e-05, + "loss": 0.4742, + "step": 21670 + }, + { + "epoch": 2.171482946862323, + "grad_norm": 2.133258581161499, + "learning_rate": 3.0212697515897893e-05, + "loss": 0.5186, + "step": 21680 + }, + { + "epoch": 2.1724845995893225, + "grad_norm": 1.7237110137939453, + "learning_rate": 3.0197278261983343e-05, + "loss": 0.4855, + "step": 21690 + }, + { + "epoch": 2.173486252316322, + "grad_norm": 1.968004822731018, + "learning_rate": 3.0181856941412512e-05, + "loss": 0.4972, + "step": 21700 + }, + { + "epoch": 2.1744879050433217, + "grad_norm": 1.9006356000900269, + "learning_rate": 3.0166433560317543e-05, + "loss": 0.4153, + "step": 21710 + }, + { + "epoch": 2.175489557770321, + "grad_norm": 2.670501708984375, + "learning_rate": 3.0151008124831438e-05, + "loss": 0.5024, + "step": 21720 + }, + { + "epoch": 2.1764912104973204, + "grad_norm": 1.9438066482543945, + "learning_rate": 3.0135580641088e-05, + "loss": 0.4693, + "step": 21730 + }, + { + "epoch": 2.17749286322432, + "grad_norm": 2.635740280151367, + "learning_rate": 3.0120151115221834e-05, + "loss": 0.5429, + "step": 21740 + }, + { + "epoch": 2.1784945159513196, + "grad_norm": 1.8196395635604858, + "learning_rate": 3.010471955336838e-05, + "loss": 0.4438, + "step": 21750 + }, + { + "epoch": 2.179496168678319, + "grad_norm": 2.196422576904297, + "learning_rate": 3.008928596166386e-05, + "loss": 0.5454, + "step": 21760 + }, + { + "epoch": 2.180497821405319, + "grad_norm": 2.2442145347595215, + "learning_rate": 3.0073850346245337e-05, + "loss": 0.4403, + "step": 21770 + }, + { + "epoch": 2.1814994741323184, + "grad_norm": 1.8484165668487549, + "learning_rate": 3.005841271325065e-05, + "loss": 0.4638, + "step": 21780 + }, + { + "epoch": 2.182501126859318, + "grad_norm": 1.8003365993499756, + "learning_rate": 3.004297306881845e-05, + "loss": 0.4501, + "step": 21790 + }, + { + "epoch": 2.1835027795863176, + "grad_norm": 2.756234884262085, + "learning_rate": 3.002753141908819e-05, + "loss": 0.4455, + "step": 21800 + }, + { + "epoch": 2.184504432313317, + "grad_norm": 1.779693841934204, + "learning_rate": 3.001208777020012e-05, + "loss": 0.4788, + "step": 21810 + }, + { + "epoch": 2.1855060850403163, + "grad_norm": 1.9188098907470703, + "learning_rate": 2.999664212829528e-05, + "loss": 0.4916, + "step": 21820 + }, + { + "epoch": 2.186507737767316, + "grad_norm": 2.3162620067596436, + "learning_rate": 2.998119449951552e-05, + "loss": 0.4194, + "step": 21830 + }, + { + "epoch": 2.1875093904943155, + "grad_norm": 2.515458822250366, + "learning_rate": 2.996574489000345e-05, + "loss": 0.4606, + "step": 21840 + }, + { + "epoch": 2.188511043221315, + "grad_norm": 1.8557404279708862, + "learning_rate": 2.9950293305902494e-05, + "loss": 0.5494, + "step": 21850 + }, + { + "epoch": 2.1895126959483147, + "grad_norm": 1.8735612630844116, + "learning_rate": 2.9934839753356846e-05, + "loss": 0.4404, + "step": 21860 + }, + { + "epoch": 2.1905143486753142, + "grad_norm": 1.6917238235473633, + "learning_rate": 2.9919384238511484e-05, + "loss": 0.5173, + "step": 21870 + }, + { + "epoch": 2.191516001402314, + "grad_norm": 1.8293862342834473, + "learning_rate": 2.990392676751219e-05, + "loss": 0.4723, + "step": 21880 + }, + { + "epoch": 2.1925176541293134, + "grad_norm": 1.5702310800552368, + "learning_rate": 2.9888467346505478e-05, + "loss": 0.4483, + "step": 21890 + }, + { + "epoch": 2.193519306856313, + "grad_norm": 1.7046412229537964, + "learning_rate": 2.987300598163868e-05, + "loss": 0.4452, + "step": 21900 + }, + { + "epoch": 2.1945209595833126, + "grad_norm": 2.7863757610321045, + "learning_rate": 2.985754267905989e-05, + "loss": 0.574, + "step": 21910 + }, + { + "epoch": 2.195522612310312, + "grad_norm": 2.4093217849731445, + "learning_rate": 2.9842077444917944e-05, + "loss": 0.4968, + "step": 21920 + }, + { + "epoch": 2.1965242650373114, + "grad_norm": 2.1676666736602783, + "learning_rate": 2.9826610285362494e-05, + "loss": 0.4581, + "step": 21930 + }, + { + "epoch": 2.197525917764311, + "grad_norm": 2.1943066120147705, + "learning_rate": 2.9811141206543914e-05, + "loss": 0.4902, + "step": 21940 + }, + { + "epoch": 2.1985275704913105, + "grad_norm": 2.6076924800872803, + "learning_rate": 2.979567021461337e-05, + "loss": 0.5178, + "step": 21950 + }, + { + "epoch": 2.19952922321831, + "grad_norm": 1.8468619585037231, + "learning_rate": 2.978019731572278e-05, + "loss": 0.4675, + "step": 21960 + }, + { + "epoch": 2.2005308759453097, + "grad_norm": 2.3149871826171875, + "learning_rate": 2.976472251602481e-05, + "loss": 0.4665, + "step": 21970 + }, + { + "epoch": 2.2015325286723093, + "grad_norm": 2.723015308380127, + "learning_rate": 2.97492458216729e-05, + "loss": 0.4283, + "step": 21980 + }, + { + "epoch": 2.202534181399309, + "grad_norm": 2.4012863636016846, + "learning_rate": 2.9733767238821224e-05, + "loss": 0.5706, + "step": 21990 + }, + { + "epoch": 2.2035358341263085, + "grad_norm": 1.7116522789001465, + "learning_rate": 2.9718286773624733e-05, + "loss": 0.5375, + "step": 22000 + }, + { + "epoch": 2.204537486853308, + "grad_norm": 2.074054002761841, + "learning_rate": 2.9702804432239094e-05, + "loss": 0.4885, + "step": 22010 + }, + { + "epoch": 2.2055391395803077, + "grad_norm": 2.2811458110809326, + "learning_rate": 2.9687320220820748e-05, + "loss": 0.5012, + "step": 22020 + }, + { + "epoch": 2.2065407923073073, + "grad_norm": 1.5797665119171143, + "learning_rate": 2.967183414552686e-05, + "loss": 0.4347, + "step": 22030 + }, + { + "epoch": 2.2075424450343064, + "grad_norm": 1.8701937198638916, + "learning_rate": 2.965634621251535e-05, + "loss": 0.5473, + "step": 22040 + }, + { + "epoch": 2.208544097761306, + "grad_norm": 2.81683349609375, + "learning_rate": 2.9640856427944863e-05, + "loss": 0.511, + "step": 22050 + }, + { + "epoch": 2.2095457504883056, + "grad_norm": 4.241177082061768, + "learning_rate": 2.962536479797481e-05, + "loss": 0.4851, + "step": 22060 + }, + { + "epoch": 2.210547403215305, + "grad_norm": 2.56923246383667, + "learning_rate": 2.9609871328765288e-05, + "loss": 0.5112, + "step": 22070 + }, + { + "epoch": 2.2115490559423048, + "grad_norm": 1.703288197517395, + "learning_rate": 2.9594376026477173e-05, + "loss": 0.4989, + "step": 22080 + }, + { + "epoch": 2.2125507086693044, + "grad_norm": 2.1868696212768555, + "learning_rate": 2.9578878897272022e-05, + "loss": 0.4711, + "step": 22090 + }, + { + "epoch": 2.213552361396304, + "grad_norm": 2.30442214012146, + "learning_rate": 2.9563379947312176e-05, + "loss": 0.5014, + "step": 22100 + }, + { + "epoch": 2.2145540141233035, + "grad_norm": 1.7584575414657593, + "learning_rate": 2.954787918276065e-05, + "loss": 0.4397, + "step": 22110 + }, + { + "epoch": 2.215555666850303, + "grad_norm": 2.35834002494812, + "learning_rate": 2.9532376609781197e-05, + "loss": 0.4921, + "step": 22120 + }, + { + "epoch": 2.2165573195773027, + "grad_norm": 2.399610757827759, + "learning_rate": 2.9516872234538306e-05, + "loss": 0.4985, + "step": 22130 + }, + { + "epoch": 2.2175589723043023, + "grad_norm": 2.3085238933563232, + "learning_rate": 2.950136606319716e-05, + "loss": 0.5122, + "step": 22140 + }, + { + "epoch": 2.2185606250313015, + "grad_norm": 1.8348264694213867, + "learning_rate": 2.9485858101923664e-05, + "loss": 0.4883, + "step": 22150 + }, + { + "epoch": 2.219562277758301, + "grad_norm": 1.838392734527588, + "learning_rate": 2.9470348356884436e-05, + "loss": 0.4828, + "step": 22160 + }, + { + "epoch": 2.2205639304853007, + "grad_norm": 2.5261988639831543, + "learning_rate": 2.945483683424681e-05, + "loss": 0.4862, + "step": 22170 + }, + { + "epoch": 2.2215655832123002, + "grad_norm": 2.360555410385132, + "learning_rate": 2.9439323540178803e-05, + "loss": 0.4755, + "step": 22180 + }, + { + "epoch": 2.2225672359393, + "grad_norm": 1.8836420774459839, + "learning_rate": 2.9423808480849162e-05, + "loss": 0.5563, + "step": 22190 + }, + { + "epoch": 2.2235688886662994, + "grad_norm": 2.629767417907715, + "learning_rate": 2.9408291662427334e-05, + "loss": 0.4914, + "step": 22200 + }, + { + "epoch": 2.224570541393299, + "grad_norm": 1.7553353309631348, + "learning_rate": 2.939277309108345e-05, + "loss": 0.4442, + "step": 22210 + }, + { + "epoch": 2.2255721941202986, + "grad_norm": 1.9337244033813477, + "learning_rate": 2.937725277298834e-05, + "loss": 0.4718, + "step": 22220 + }, + { + "epoch": 2.226573846847298, + "grad_norm": 2.2966341972351074, + "learning_rate": 2.936173071431354e-05, + "loss": 0.4718, + "step": 22230 + }, + { + "epoch": 2.227575499574298, + "grad_norm": 1.795206069946289, + "learning_rate": 2.934620692123129e-05, + "loss": 0.4941, + "step": 22240 + }, + { + "epoch": 2.228577152301297, + "grad_norm": 2.291245698928833, + "learning_rate": 2.933068139991447e-05, + "loss": 0.47, + "step": 22250 + }, + { + "epoch": 2.2295788050282965, + "grad_norm": 2.183180570602417, + "learning_rate": 2.931515415653671e-05, + "loss": 0.4611, + "step": 22260 + }, + { + "epoch": 2.230580457755296, + "grad_norm": 2.6957666873931885, + "learning_rate": 2.9299625197272274e-05, + "loss": 0.5005, + "step": 22270 + }, + { + "epoch": 2.2315821104822957, + "grad_norm": 2.4472970962524414, + "learning_rate": 2.9284094528296147e-05, + "loss": 0.5073, + "step": 22280 + }, + { + "epoch": 2.2325837632092953, + "grad_norm": 2.379964590072632, + "learning_rate": 2.9268562155783957e-05, + "loss": 0.4925, + "step": 22290 + }, + { + "epoch": 2.233585415936295, + "grad_norm": 2.3397207260131836, + "learning_rate": 2.9253028085912044e-05, + "loss": 0.4374, + "step": 22300 + }, + { + "epoch": 2.2345870686632945, + "grad_norm": 1.8619310855865479, + "learning_rate": 2.9237492324857397e-05, + "loss": 0.4889, + "step": 22310 + }, + { + "epoch": 2.235588721390294, + "grad_norm": 1.8362969160079956, + "learning_rate": 2.9221954878797686e-05, + "loss": 0.43, + "step": 22320 + }, + { + "epoch": 2.2365903741172937, + "grad_norm": 3.204434633255005, + "learning_rate": 2.9206415753911266e-05, + "loss": 0.5093, + "step": 22330 + }, + { + "epoch": 2.2375920268442933, + "grad_norm": 2.4659273624420166, + "learning_rate": 2.9190874956377136e-05, + "loss": 0.4225, + "step": 22340 + }, + { + "epoch": 2.2385936795712924, + "grad_norm": 1.955161213874817, + "learning_rate": 2.9175332492374963e-05, + "loss": 0.5238, + "step": 22350 + }, + { + "epoch": 2.239595332298292, + "grad_norm": 1.948196291923523, + "learning_rate": 2.91597883680851e-05, + "loss": 0.4562, + "step": 22360 + }, + { + "epoch": 2.2405969850252916, + "grad_norm": 2.1601054668426514, + "learning_rate": 2.9144242589688535e-05, + "loss": 0.4433, + "step": 22370 + }, + { + "epoch": 2.241598637752291, + "grad_norm": 2.4899299144744873, + "learning_rate": 2.9128695163366927e-05, + "loss": 0.4281, + "step": 22380 + }, + { + "epoch": 2.2426002904792908, + "grad_norm": 2.4548087120056152, + "learning_rate": 2.9113146095302584e-05, + "loss": 0.5069, + "step": 22390 + }, + { + "epoch": 2.2436019432062904, + "grad_norm": 2.362183094024658, + "learning_rate": 2.9097595391678467e-05, + "loss": 0.4856, + "step": 22400 + }, + { + "epoch": 2.24460359593329, + "grad_norm": 2.1862354278564453, + "learning_rate": 2.9082043058678192e-05, + "loss": 0.4262, + "step": 22410 + }, + { + "epoch": 2.2456052486602895, + "grad_norm": 2.056276559829712, + "learning_rate": 2.9066489102486023e-05, + "loss": 0.4686, + "step": 22420 + }, + { + "epoch": 2.246606901387289, + "grad_norm": 2.3078675270080566, + "learning_rate": 2.9050933529286857e-05, + "loss": 0.5046, + "step": 22430 + }, + { + "epoch": 2.2476085541142887, + "grad_norm": 2.5429131984710693, + "learning_rate": 2.9035376345266256e-05, + "loss": 0.5805, + "step": 22440 + }, + { + "epoch": 2.2486102068412883, + "grad_norm": 2.4503445625305176, + "learning_rate": 2.90198175566104e-05, + "loss": 0.4483, + "step": 22450 + }, + { + "epoch": 2.2496118595682875, + "grad_norm": 2.3516530990600586, + "learning_rate": 2.9004257169506128e-05, + "loss": 0.4998, + "step": 22460 + }, + { + "epoch": 2.250613512295287, + "grad_norm": 2.0752251148223877, + "learning_rate": 2.898869519014089e-05, + "loss": 0.4373, + "step": 22470 + }, + { + "epoch": 2.2516151650222866, + "grad_norm": 1.8537724018096924, + "learning_rate": 2.8973131624702798e-05, + "loss": 0.4856, + "step": 22480 + }, + { + "epoch": 2.2526168177492862, + "grad_norm": 1.905326247215271, + "learning_rate": 2.8957566479380576e-05, + "loss": 0.49, + "step": 22490 + }, + { + "epoch": 2.253618470476286, + "grad_norm": 1.725106954574585, + "learning_rate": 2.8941999760363563e-05, + "loss": 0.4265, + "step": 22500 + }, + { + "epoch": 2.2546201232032854, + "grad_norm": 2.606820821762085, + "learning_rate": 2.8926431473841763e-05, + "loss": 0.4936, + "step": 22510 + }, + { + "epoch": 2.255621775930285, + "grad_norm": 2.2708587646484375, + "learning_rate": 2.8910861626005776e-05, + "loss": 0.4706, + "step": 22520 + }, + { + "epoch": 2.2566234286572846, + "grad_norm": 1.8929194211959839, + "learning_rate": 2.889529022304682e-05, + "loss": 0.4891, + "step": 22530 + }, + { + "epoch": 2.257625081384284, + "grad_norm": 2.43183970451355, + "learning_rate": 2.887971727115674e-05, + "loss": 0.5013, + "step": 22540 + }, + { + "epoch": 2.258626734111284, + "grad_norm": 1.686383605003357, + "learning_rate": 2.8864142776528007e-05, + "loss": 0.4561, + "step": 22550 + }, + { + "epoch": 2.2596283868382834, + "grad_norm": 1.827078104019165, + "learning_rate": 2.8848566745353683e-05, + "loss": 0.423, + "step": 22560 + }, + { + "epoch": 2.2606300395652825, + "grad_norm": 2.1075212955474854, + "learning_rate": 2.8832989183827464e-05, + "loss": 0.4196, + "step": 22570 + }, + { + "epoch": 2.261631692292282, + "grad_norm": 2.4093353748321533, + "learning_rate": 2.881741009814363e-05, + "loss": 0.5551, + "step": 22580 + }, + { + "epoch": 2.2626333450192817, + "grad_norm": 2.049708843231201, + "learning_rate": 2.8801829494497095e-05, + "loss": 0.4111, + "step": 22590 + }, + { + "epoch": 2.2636349977462813, + "grad_norm": 2.1682515144348145, + "learning_rate": 2.878624737908335e-05, + "loss": 0.516, + "step": 22600 + }, + { + "epoch": 2.264636650473281, + "grad_norm": 2.5212790966033936, + "learning_rate": 2.8770663758098494e-05, + "loss": 0.4576, + "step": 22610 + }, + { + "epoch": 2.2656383032002805, + "grad_norm": 2.4226863384246826, + "learning_rate": 2.8755078637739253e-05, + "loss": 0.5454, + "step": 22620 + }, + { + "epoch": 2.26663995592728, + "grad_norm": 1.4106111526489258, + "learning_rate": 2.8739492024202895e-05, + "loss": 0.3773, + "step": 22630 + }, + { + "epoch": 2.2676416086542797, + "grad_norm": 2.756021022796631, + "learning_rate": 2.8723903923687345e-05, + "loss": 0.4958, + "step": 22640 + }, + { + "epoch": 2.2686432613812793, + "grad_norm": 2.1808178424835205, + "learning_rate": 2.8708314342391062e-05, + "loss": 0.4745, + "step": 22650 + }, + { + "epoch": 2.269644914108279, + "grad_norm": 2.1560256481170654, + "learning_rate": 2.8692723286513133e-05, + "loss": 0.465, + "step": 22660 + }, + { + "epoch": 2.2706465668352784, + "grad_norm": 2.243764638900757, + "learning_rate": 2.8677130762253212e-05, + "loss": 0.4736, + "step": 22670 + }, + { + "epoch": 2.2716482195622776, + "grad_norm": 2.5886261463165283, + "learning_rate": 2.8661536775811542e-05, + "loss": 0.4174, + "step": 22680 + }, + { + "epoch": 2.272649872289277, + "grad_norm": 2.1652519702911377, + "learning_rate": 2.8645941333388948e-05, + "loss": 0.5013, + "step": 22690 + }, + { + "epoch": 2.2736515250162768, + "grad_norm": 2.6116833686828613, + "learning_rate": 2.863034444118683e-05, + "loss": 0.4973, + "step": 22700 + }, + { + "epoch": 2.2746531777432764, + "grad_norm": 2.0481340885162354, + "learning_rate": 2.8614746105407177e-05, + "loss": 0.4399, + "step": 22710 + }, + { + "epoch": 2.275654830470276, + "grad_norm": 2.0342276096343994, + "learning_rate": 2.859914633225253e-05, + "loss": 0.5249, + "step": 22720 + }, + { + "epoch": 2.2766564831972755, + "grad_norm": 2.232968330383301, + "learning_rate": 2.8583545127926025e-05, + "loss": 0.4847, + "step": 22730 + }, + { + "epoch": 2.277658135924275, + "grad_norm": 3.0498735904693604, + "learning_rate": 2.856794249863135e-05, + "loss": 0.5188, + "step": 22740 + }, + { + "epoch": 2.2786597886512747, + "grad_norm": 1.7310694456100464, + "learning_rate": 2.8552338450572768e-05, + "loss": 0.4896, + "step": 22750 + }, + { + "epoch": 2.2796614413782743, + "grad_norm": 2.2194247245788574, + "learning_rate": 2.8536732989955105e-05, + "loss": 0.4247, + "step": 22760 + }, + { + "epoch": 2.2806630941052735, + "grad_norm": 1.4236329793930054, + "learning_rate": 2.852112612298376e-05, + "loss": 0.4623, + "step": 22770 + }, + { + "epoch": 2.2816647468322735, + "grad_norm": 1.7755547761917114, + "learning_rate": 2.850551785586466e-05, + "loss": 0.4402, + "step": 22780 + }, + { + "epoch": 2.2826663995592726, + "grad_norm": 2.213015079498291, + "learning_rate": 2.848990819480431e-05, + "loss": 0.4715, + "step": 22790 + }, + { + "epoch": 2.2836680522862722, + "grad_norm": 1.86104154586792, + "learning_rate": 2.847429714600979e-05, + "loss": 0.4849, + "step": 22800 + }, + { + "epoch": 2.284669705013272, + "grad_norm": 2.5028235912323, + "learning_rate": 2.8458684715688676e-05, + "loss": 0.5567, + "step": 22810 + }, + { + "epoch": 2.2856713577402714, + "grad_norm": 2.2891948223114014, + "learning_rate": 2.8443070910049153e-05, + "loss": 0.5308, + "step": 22820 + }, + { + "epoch": 2.286673010467271, + "grad_norm": 2.1193156242370605, + "learning_rate": 2.8427455735299908e-05, + "loss": 0.5333, + "step": 22830 + }, + { + "epoch": 2.2876746631942706, + "grad_norm": 1.979323148727417, + "learning_rate": 2.841183919765021e-05, + "loss": 0.4608, + "step": 22840 + }, + { + "epoch": 2.28867631592127, + "grad_norm": 2.817578077316284, + "learning_rate": 2.8396221303309832e-05, + "loss": 0.3857, + "step": 22850 + }, + { + "epoch": 2.28967796864827, + "grad_norm": 2.252032518386841, + "learning_rate": 2.8380602058489115e-05, + "loss": 0.5232, + "step": 22860 + }, + { + "epoch": 2.2906796213752694, + "grad_norm": 4.845706939697266, + "learning_rate": 2.8364981469398925e-05, + "loss": 0.4782, + "step": 22870 + }, + { + "epoch": 2.2916812741022685, + "grad_norm": 2.3078505992889404, + "learning_rate": 2.834935954225067e-05, + "loss": 0.4634, + "step": 22880 + }, + { + "epoch": 2.292682926829268, + "grad_norm": 2.475879192352295, + "learning_rate": 2.8333736283256272e-05, + "loss": 0.4872, + "step": 22890 + }, + { + "epoch": 2.2936845795562677, + "grad_norm": 2.6143455505371094, + "learning_rate": 2.8318111698628214e-05, + "loss": 0.4772, + "step": 22900 + }, + { + "epoch": 2.2946862322832673, + "grad_norm": 2.3077473640441895, + "learning_rate": 2.830248579457947e-05, + "loss": 0.4862, + "step": 22910 + }, + { + "epoch": 2.295687885010267, + "grad_norm": 1.7800365686416626, + "learning_rate": 2.8286858577323566e-05, + "loss": 0.4625, + "step": 22920 + }, + { + "epoch": 2.2966895377372665, + "grad_norm": 2.5691211223602295, + "learning_rate": 2.827123005307454e-05, + "loss": 0.4215, + "step": 22930 + }, + { + "epoch": 2.297691190464266, + "grad_norm": 2.73599910736084, + "learning_rate": 2.8255600228046935e-05, + "loss": 0.4423, + "step": 22940 + }, + { + "epoch": 2.2986928431912657, + "grad_norm": 1.8676958084106445, + "learning_rate": 2.8239969108455856e-05, + "loss": 0.4174, + "step": 22950 + }, + { + "epoch": 2.2996944959182652, + "grad_norm": 1.835666537284851, + "learning_rate": 2.8224336700516873e-05, + "loss": 0.4988, + "step": 22960 + }, + { + "epoch": 2.300696148645265, + "grad_norm": 2.497913122177124, + "learning_rate": 2.8208703010446086e-05, + "loss": 0.5001, + "step": 22970 + }, + { + "epoch": 2.3016978013722644, + "grad_norm": 2.3494608402252197, + "learning_rate": 2.8193068044460118e-05, + "loss": 0.4715, + "step": 22980 + }, + { + "epoch": 2.3026994540992636, + "grad_norm": 1.8175679445266724, + "learning_rate": 2.8177431808776078e-05, + "loss": 0.4408, + "step": 22990 + }, + { + "epoch": 2.303701106826263, + "grad_norm": 2.166266918182373, + "learning_rate": 2.8161794309611612e-05, + "loss": 0.3983, + "step": 23000 + }, + { + "epoch": 2.3047027595532628, + "grad_norm": 1.8629862070083618, + "learning_rate": 2.8146155553184812e-05, + "loss": 0.4278, + "step": 23010 + }, + { + "epoch": 2.3057044122802623, + "grad_norm": 2.0270631313323975, + "learning_rate": 2.813051554571433e-05, + "loss": 0.5, + "step": 23020 + }, + { + "epoch": 2.306706065007262, + "grad_norm": 2.0581462383270264, + "learning_rate": 2.811487429341929e-05, + "loss": 0.4549, + "step": 23030 + }, + { + "epoch": 2.3077077177342615, + "grad_norm": 2.967930793762207, + "learning_rate": 2.8099231802519304e-05, + "loss": 0.5308, + "step": 23040 + }, + { + "epoch": 2.308709370461261, + "grad_norm": 1.926661491394043, + "learning_rate": 2.8083588079234485e-05, + "loss": 0.4796, + "step": 23050 + }, + { + "epoch": 2.3097110231882607, + "grad_norm": 2.6114249229431152, + "learning_rate": 2.806794312978544e-05, + "loss": 0.5267, + "step": 23060 + }, + { + "epoch": 2.3107126759152603, + "grad_norm": 1.7948778867721558, + "learning_rate": 2.805229696039325e-05, + "loss": 0.461, + "step": 23070 + }, + { + "epoch": 2.31171432864226, + "grad_norm": 1.9823604822158813, + "learning_rate": 2.8036649577279495e-05, + "loss": 0.4477, + "step": 23080 + }, + { + "epoch": 2.3127159813692595, + "grad_norm": 2.148059844970703, + "learning_rate": 2.8021000986666235e-05, + "loss": 0.4411, + "step": 23090 + }, + { + "epoch": 2.3137176340962586, + "grad_norm": 2.309586763381958, + "learning_rate": 2.8005351194775996e-05, + "loss": 0.5383, + "step": 23100 + }, + { + "epoch": 2.3147192868232582, + "grad_norm": 2.083111047744751, + "learning_rate": 2.7989700207831803e-05, + "loss": 0.4957, + "step": 23110 + }, + { + "epoch": 2.315720939550258, + "grad_norm": 2.7374930381774902, + "learning_rate": 2.7974048032057142e-05, + "loss": 0.466, + "step": 23120 + }, + { + "epoch": 2.3167225922772574, + "grad_norm": 2.315837860107422, + "learning_rate": 2.7958394673675986e-05, + "loss": 0.5647, + "step": 23130 + }, + { + "epoch": 2.317724245004257, + "grad_norm": 2.1665873527526855, + "learning_rate": 2.7942740138912748e-05, + "loss": 0.47, + "step": 23140 + }, + { + "epoch": 2.3187258977312566, + "grad_norm": 2.3246545791625977, + "learning_rate": 2.7927084433992355e-05, + "loss": 0.4727, + "step": 23150 + }, + { + "epoch": 2.319727550458256, + "grad_norm": 2.2014048099517822, + "learning_rate": 2.791142756514015e-05, + "loss": 0.4729, + "step": 23160 + }, + { + "epoch": 2.3207292031852558, + "grad_norm": 2.3125619888305664, + "learning_rate": 2.7895769538581973e-05, + "loss": 0.4782, + "step": 23170 + }, + { + "epoch": 2.3217308559122554, + "grad_norm": 1.8085143566131592, + "learning_rate": 2.788011036054412e-05, + "loss": 0.4332, + "step": 23180 + }, + { + "epoch": 2.322732508639255, + "grad_norm": 2.0354082584381104, + "learning_rate": 2.7864450037253316e-05, + "loss": 0.483, + "step": 23190 + }, + { + "epoch": 2.3237341613662545, + "grad_norm": 1.992011308670044, + "learning_rate": 2.7848788574936787e-05, + "loss": 0.4718, + "step": 23200 + }, + { + "epoch": 2.3247358140932537, + "grad_norm": 2.2824182510375977, + "learning_rate": 2.783312597982218e-05, + "loss": 0.5439, + "step": 23210 + }, + { + "epoch": 2.3257374668202533, + "grad_norm": 2.152175188064575, + "learning_rate": 2.78174622581376e-05, + "loss": 0.4938, + "step": 23220 + }, + { + "epoch": 2.326739119547253, + "grad_norm": 1.9049335718154907, + "learning_rate": 2.7801797416111597e-05, + "loss": 0.5244, + "step": 23230 + }, + { + "epoch": 2.3277407722742525, + "grad_norm": 1.704810619354248, + "learning_rate": 2.7786131459973185e-05, + "loss": 0.4052, + "step": 23240 + }, + { + "epoch": 2.328742425001252, + "grad_norm": 1.6601033210754395, + "learning_rate": 2.7770464395951788e-05, + "loss": 0.4639, + "step": 23250 + }, + { + "epoch": 2.3297440777282516, + "grad_norm": 2.091416597366333, + "learning_rate": 2.7754796230277307e-05, + "loss": 0.4594, + "step": 23260 + }, + { + "epoch": 2.3307457304552512, + "grad_norm": 2.0445423126220703, + "learning_rate": 2.7739126969180053e-05, + "loss": 0.4626, + "step": 23270 + }, + { + "epoch": 2.331747383182251, + "grad_norm": 2.797914981842041, + "learning_rate": 2.7723456618890782e-05, + "loss": 0.5565, + "step": 23280 + }, + { + "epoch": 2.3327490359092504, + "grad_norm": 2.423527956008911, + "learning_rate": 2.770778518564069e-05, + "loss": 0.5019, + "step": 23290 + }, + { + "epoch": 2.3337506886362496, + "grad_norm": 2.7250008583068848, + "learning_rate": 2.7692112675661385e-05, + "loss": 0.4707, + "step": 23300 + }, + { + "epoch": 2.3347523413632496, + "grad_norm": 2.50681734085083, + "learning_rate": 2.7676439095184936e-05, + "loss": 0.4785, + "step": 23310 + }, + { + "epoch": 2.3357539940902488, + "grad_norm": 2.4695465564727783, + "learning_rate": 2.76607644504438e-05, + "loss": 0.4693, + "step": 23320 + }, + { + "epoch": 2.3367556468172483, + "grad_norm": 2.140570640563965, + "learning_rate": 2.7645088747670885e-05, + "loss": 0.4847, + "step": 23330 + }, + { + "epoch": 2.337757299544248, + "grad_norm": 1.7873948812484741, + "learning_rate": 2.7629411993099497e-05, + "loss": 0.4984, + "step": 23340 + }, + { + "epoch": 2.3387589522712475, + "grad_norm": 2.5162179470062256, + "learning_rate": 2.7613734192963385e-05, + "loss": 0.5601, + "step": 23350 + }, + { + "epoch": 2.339760604998247, + "grad_norm": 1.9141145944595337, + "learning_rate": 2.7598055353496705e-05, + "loss": 0.4532, + "step": 23360 + }, + { + "epoch": 2.3407622577252467, + "grad_norm": 2.3677682876586914, + "learning_rate": 2.7582375480934e-05, + "loss": 0.4797, + "step": 23370 + }, + { + "epoch": 2.3417639104522463, + "grad_norm": 2.3838820457458496, + "learning_rate": 2.7566694581510272e-05, + "loss": 0.4768, + "step": 23380 + }, + { + "epoch": 2.342765563179246, + "grad_norm": 1.4900152683258057, + "learning_rate": 2.7551012661460897e-05, + "loss": 0.4558, + "step": 23390 + }, + { + "epoch": 2.3437672159062455, + "grad_norm": 2.4450631141662598, + "learning_rate": 2.7535329727021653e-05, + "loss": 0.4752, + "step": 23400 + }, + { + "epoch": 2.3447688686332446, + "grad_norm": 2.6570770740509033, + "learning_rate": 2.751964578442875e-05, + "loss": 0.4175, + "step": 23410 + }, + { + "epoch": 2.345770521360244, + "grad_norm": 2.1034457683563232, + "learning_rate": 2.7503960839918775e-05, + "loss": 0.5427, + "step": 23420 + }, + { + "epoch": 2.346772174087244, + "grad_norm": 1.7023869752883911, + "learning_rate": 2.7488274899728728e-05, + "loss": 0.4666, + "step": 23430 + }, + { + "epoch": 2.3477738268142434, + "grad_norm": 1.739460825920105, + "learning_rate": 2.7472587970095985e-05, + "loss": 0.479, + "step": 23440 + }, + { + "epoch": 2.348775479541243, + "grad_norm": 1.9922562837600708, + "learning_rate": 2.7456900057258344e-05, + "loss": 0.4813, + "step": 23450 + }, + { + "epoch": 2.3497771322682426, + "grad_norm": 2.244210720062256, + "learning_rate": 2.7441211167453973e-05, + "loss": 0.4797, + "step": 23460 + }, + { + "epoch": 2.350778784995242, + "grad_norm": 2.837597131729126, + "learning_rate": 2.7425521306921427e-05, + "loss": 0.4473, + "step": 23470 + }, + { + "epoch": 2.3517804377222418, + "grad_norm": 2.2177488803863525, + "learning_rate": 2.740983048189966e-05, + "loss": 0.5151, + "step": 23480 + }, + { + "epoch": 2.3527820904492414, + "grad_norm": 1.891855239868164, + "learning_rate": 2.7394138698628e-05, + "loss": 0.4739, + "step": 23490 + }, + { + "epoch": 2.353783743176241, + "grad_norm": 2.4679808616638184, + "learning_rate": 2.7378445963346165e-05, + "loss": 0.4497, + "step": 23500 + }, + { + "epoch": 2.3547853959032405, + "grad_norm": 1.8632603883743286, + "learning_rate": 2.7362752282294252e-05, + "loss": 0.4815, + "step": 23510 + }, + { + "epoch": 2.3557870486302397, + "grad_norm": 1.9275087118148804, + "learning_rate": 2.7347057661712706e-05, + "loss": 0.4674, + "step": 23520 + }, + { + "epoch": 2.3567887013572393, + "grad_norm": 2.283627986907959, + "learning_rate": 2.7331362107842388e-05, + "loss": 0.4662, + "step": 23530 + }, + { + "epoch": 2.357790354084239, + "grad_norm": 2.1892151832580566, + "learning_rate": 2.7315665626924515e-05, + "loss": 0.3666, + "step": 23540 + }, + { + "epoch": 2.3587920068112385, + "grad_norm": 1.8962342739105225, + "learning_rate": 2.7299968225200638e-05, + "loss": 0.4528, + "step": 23550 + }, + { + "epoch": 2.359793659538238, + "grad_norm": 2.1917147636413574, + "learning_rate": 2.7284269908912737e-05, + "loss": 0.5034, + "step": 23560 + }, + { + "epoch": 2.3607953122652376, + "grad_norm": 1.915441632270813, + "learning_rate": 2.7268570684303095e-05, + "loss": 0.5584, + "step": 23570 + }, + { + "epoch": 2.3617969649922372, + "grad_norm": 2.1486260890960693, + "learning_rate": 2.7252870557614402e-05, + "loss": 0.4875, + "step": 23580 + }, + { + "epoch": 2.362798617719237, + "grad_norm": 2.0003814697265625, + "learning_rate": 2.7237169535089686e-05, + "loss": 0.4692, + "step": 23590 + }, + { + "epoch": 2.3638002704462364, + "grad_norm": 2.1308910846710205, + "learning_rate": 2.7221467622972326e-05, + "loss": 0.4914, + "step": 23600 + }, + { + "epoch": 2.364801923173236, + "grad_norm": 2.5318915843963623, + "learning_rate": 2.7205764827506064e-05, + "loss": 0.5162, + "step": 23610 + }, + { + "epoch": 2.3658035759002356, + "grad_norm": 1.9081600904464722, + "learning_rate": 2.7190061154934993e-05, + "loss": 0.4971, + "step": 23620 + }, + { + "epoch": 2.3668052286272347, + "grad_norm": 3.0676238536834717, + "learning_rate": 2.717435661150356e-05, + "loss": 0.474, + "step": 23630 + }, + { + "epoch": 2.3678068813542343, + "grad_norm": 2.3040099143981934, + "learning_rate": 2.7158651203456542e-05, + "loss": 0.4463, + "step": 23640 + }, + { + "epoch": 2.368808534081234, + "grad_norm": 2.0134572982788086, + "learning_rate": 2.7142944937039072e-05, + "loss": 0.4961, + "step": 23650 + }, + { + "epoch": 2.3698101868082335, + "grad_norm": 2.1416115760803223, + "learning_rate": 2.712723781849662e-05, + "loss": 0.4916, + "step": 23660 + }, + { + "epoch": 2.370811839535233, + "grad_norm": 2.0588133335113525, + "learning_rate": 2.7111529854075002e-05, + "loss": 0.4562, + "step": 23670 + }, + { + "epoch": 2.3718134922622327, + "grad_norm": 1.623829960823059, + "learning_rate": 2.7095821050020358e-05, + "loss": 0.4011, + "step": 23680 + }, + { + "epoch": 2.3728151449892323, + "grad_norm": 2.447624921798706, + "learning_rate": 2.708011141257918e-05, + "loss": 0.4624, + "step": 23690 + }, + { + "epoch": 2.373816797716232, + "grad_norm": 3.0367507934570312, + "learning_rate": 2.7064400947998263e-05, + "loss": 0.4924, + "step": 23700 + }, + { + "epoch": 2.3748184504432315, + "grad_norm": 3.0571627616882324, + "learning_rate": 2.7048689662524767e-05, + "loss": 0.4647, + "step": 23710 + }, + { + "epoch": 2.375820103170231, + "grad_norm": 2.4752557277679443, + "learning_rate": 2.7032977562406147e-05, + "loss": 0.5044, + "step": 23720 + }, + { + "epoch": 2.3768217558972307, + "grad_norm": 2.2401139736175537, + "learning_rate": 2.7017264653890202e-05, + "loss": 0.5055, + "step": 23730 + }, + { + "epoch": 2.37782340862423, + "grad_norm": 2.167793035507202, + "learning_rate": 2.700155094322504e-05, + "loss": 0.4909, + "step": 23740 + }, + { + "epoch": 2.3788250613512294, + "grad_norm": 2.418596029281616, + "learning_rate": 2.6985836436659084e-05, + "loss": 0.5, + "step": 23750 + }, + { + "epoch": 2.379826714078229, + "grad_norm": 2.262390613555908, + "learning_rate": 2.6970121140441097e-05, + "loss": 0.5482, + "step": 23760 + }, + { + "epoch": 2.3808283668052286, + "grad_norm": 2.523616075515747, + "learning_rate": 2.6954405060820138e-05, + "loss": 0.4713, + "step": 23770 + }, + { + "epoch": 2.381830019532228, + "grad_norm": 1.510984182357788, + "learning_rate": 2.6938688204045582e-05, + "loss": 0.42, + "step": 23780 + }, + { + "epoch": 2.3828316722592278, + "grad_norm": 1.5943301916122437, + "learning_rate": 2.69229705763671e-05, + "loss": 0.4441, + "step": 23790 + }, + { + "epoch": 2.3838333249862274, + "grad_norm": 2.2615034580230713, + "learning_rate": 2.6907252184034697e-05, + "loss": 0.4397, + "step": 23800 + }, + { + "epoch": 2.384834977713227, + "grad_norm": 2.052757740020752, + "learning_rate": 2.6891533033298656e-05, + "loss": 0.4832, + "step": 23810 + }, + { + "epoch": 2.3858366304402265, + "grad_norm": 1.8588956594467163, + "learning_rate": 2.687581313040958e-05, + "loss": 0.4632, + "step": 23820 + }, + { + "epoch": 2.3868382831672257, + "grad_norm": 2.4952921867370605, + "learning_rate": 2.6860092481618355e-05, + "loss": 0.4788, + "step": 23830 + }, + { + "epoch": 2.3878399358942257, + "grad_norm": 2.3003437519073486, + "learning_rate": 2.684437109317618e-05, + "loss": 0.4824, + "step": 23840 + }, + { + "epoch": 2.388841588621225, + "grad_norm": 2.297075033187866, + "learning_rate": 2.682864897133453e-05, + "loss": 0.517, + "step": 23850 + }, + { + "epoch": 2.3898432413482245, + "grad_norm": 2.4082040786743164, + "learning_rate": 2.6812926122345185e-05, + "loss": 0.5695, + "step": 23860 + }, + { + "epoch": 2.390844894075224, + "grad_norm": 2.1202104091644287, + "learning_rate": 2.679720255246022e-05, + "loss": 0.5065, + "step": 23870 + }, + { + "epoch": 2.3918465468022236, + "grad_norm": 2.0936365127563477, + "learning_rate": 2.6781478267931975e-05, + "loss": 0.4594, + "step": 23880 + }, + { + "epoch": 2.3928481995292232, + "grad_norm": 2.3779218196868896, + "learning_rate": 2.67657532750131e-05, + "loss": 0.4314, + "step": 23890 + }, + { + "epoch": 2.393849852256223, + "grad_norm": 1.846227765083313, + "learning_rate": 2.6750027579956493e-05, + "loss": 0.4417, + "step": 23900 + }, + { + "epoch": 2.3948515049832224, + "grad_norm": 2.4832935333251953, + "learning_rate": 2.6734301189015363e-05, + "loss": 0.483, + "step": 23910 + }, + { + "epoch": 2.395853157710222, + "grad_norm": 2.117427110671997, + "learning_rate": 2.6718574108443196e-05, + "loss": 0.4926, + "step": 23920 + }, + { + "epoch": 2.3968548104372216, + "grad_norm": 1.825005292892456, + "learning_rate": 2.6702846344493714e-05, + "loss": 0.4941, + "step": 23930 + }, + { + "epoch": 2.3978564631642207, + "grad_norm": 2.119313955307007, + "learning_rate": 2.668711790342096e-05, + "loss": 0.4495, + "step": 23940 + }, + { + "epoch": 2.3988581158912203, + "grad_norm": 2.2307143211364746, + "learning_rate": 2.667138879147921e-05, + "loss": 0.5252, + "step": 23950 + }, + { + "epoch": 2.39985976861822, + "grad_norm": 2.309399127960205, + "learning_rate": 2.665565901492303e-05, + "loss": 0.5204, + "step": 23960 + }, + { + "epoch": 2.4008614213452195, + "grad_norm": 2.689640522003174, + "learning_rate": 2.663992858000723e-05, + "loss": 0.4835, + "step": 23970 + }, + { + "epoch": 2.401863074072219, + "grad_norm": 2.8619892597198486, + "learning_rate": 2.6624197492986897e-05, + "loss": 0.4268, + "step": 23980 + }, + { + "epoch": 2.4028647267992187, + "grad_norm": 1.9714316129684448, + "learning_rate": 2.6608465760117374e-05, + "loss": 0.4982, + "step": 23990 + }, + { + "epoch": 2.4038663795262183, + "grad_norm": 1.962856650352478, + "learning_rate": 2.6592733387654255e-05, + "loss": 0.4649, + "step": 24000 + }, + { + "epoch": 2.404868032253218, + "grad_norm": 2.0655150413513184, + "learning_rate": 2.65770003818534e-05, + "loss": 0.4452, + "step": 24010 + }, + { + "epoch": 2.4058696849802175, + "grad_norm": 2.481144905090332, + "learning_rate": 2.6561266748970902e-05, + "loss": 0.4823, + "step": 24020 + }, + { + "epoch": 2.406871337707217, + "grad_norm": 2.099876642227173, + "learning_rate": 2.6545532495263125e-05, + "loss": 0.4664, + "step": 24030 + }, + { + "epoch": 2.4078729904342167, + "grad_norm": 2.909708261489868, + "learning_rate": 2.6529797626986662e-05, + "loss": 0.5461, + "step": 24040 + }, + { + "epoch": 2.408874643161216, + "grad_norm": 2.3196239471435547, + "learning_rate": 2.651406215039837e-05, + "loss": 0.4675, + "step": 24050 + }, + { + "epoch": 2.4098762958882154, + "grad_norm": 2.3955135345458984, + "learning_rate": 2.649832607175532e-05, + "loss": 0.493, + "step": 24060 + }, + { + "epoch": 2.410877948615215, + "grad_norm": 1.9178582429885864, + "learning_rate": 2.6482589397314854e-05, + "loss": 0.4758, + "step": 24070 + }, + { + "epoch": 2.4118796013422146, + "grad_norm": 2.7419323921203613, + "learning_rate": 2.646685213333452e-05, + "loss": 0.4915, + "step": 24080 + }, + { + "epoch": 2.412881254069214, + "grad_norm": 2.2376959323883057, + "learning_rate": 2.645111428607212e-05, + "loss": 0.4918, + "step": 24090 + }, + { + "epoch": 2.4138829067962138, + "grad_norm": 1.6830248832702637, + "learning_rate": 2.6435375861785692e-05, + "loss": 0.4887, + "step": 24100 + }, + { + "epoch": 2.4148845595232133, + "grad_norm": 1.9309697151184082, + "learning_rate": 2.641963686673349e-05, + "loss": 0.498, + "step": 24110 + }, + { + "epoch": 2.415886212250213, + "grad_norm": 1.619993805885315, + "learning_rate": 2.6403897307173996e-05, + "loss": 0.3869, + "step": 24120 + }, + { + "epoch": 2.4168878649772125, + "grad_norm": 2.886514663696289, + "learning_rate": 2.638815718936593e-05, + "loss": 0.4794, + "step": 24130 + }, + { + "epoch": 2.417889517704212, + "grad_norm": 2.1846656799316406, + "learning_rate": 2.6372416519568216e-05, + "loss": 0.4546, + "step": 24140 + }, + { + "epoch": 2.4188911704312117, + "grad_norm": 2.40488338470459, + "learning_rate": 2.6356675304040003e-05, + "loss": 0.4387, + "step": 24150 + }, + { + "epoch": 2.419892823158211, + "grad_norm": 2.2383806705474854, + "learning_rate": 2.634093354904067e-05, + "loss": 0.49, + "step": 24160 + }, + { + "epoch": 2.4208944758852105, + "grad_norm": 2.0622711181640625, + "learning_rate": 2.6325191260829795e-05, + "loss": 0.5343, + "step": 24170 + }, + { + "epoch": 2.42189612861221, + "grad_norm": 1.8681621551513672, + "learning_rate": 2.6309448445667172e-05, + "loss": 0.4797, + "step": 24180 + }, + { + "epoch": 2.4228977813392096, + "grad_norm": 2.4106242656707764, + "learning_rate": 2.6293705109812806e-05, + "loss": 0.4516, + "step": 24190 + }, + { + "epoch": 2.4238994340662092, + "grad_norm": 2.238723039627075, + "learning_rate": 2.627796125952692e-05, + "loss": 0.4675, + "step": 24200 + }, + { + "epoch": 2.424901086793209, + "grad_norm": 2.0002214908599854, + "learning_rate": 2.6262216901069913e-05, + "loss": 0.4723, + "step": 24210 + }, + { + "epoch": 2.4259027395202084, + "grad_norm": 1.454474687576294, + "learning_rate": 2.6246472040702413e-05, + "loss": 0.4336, + "step": 24220 + }, + { + "epoch": 2.426904392247208, + "grad_norm": 2.259845495223999, + "learning_rate": 2.6230726684685246e-05, + "loss": 0.4863, + "step": 24230 + }, + { + "epoch": 2.4279060449742076, + "grad_norm": 2.448485851287842, + "learning_rate": 2.621498083927941e-05, + "loss": 0.4837, + "step": 24240 + }, + { + "epoch": 2.428907697701207, + "grad_norm": 1.6946465969085693, + "learning_rate": 2.619923451074613e-05, + "loss": 0.4887, + "step": 24250 + }, + { + "epoch": 2.4299093504282068, + "grad_norm": 2.0357892513275146, + "learning_rate": 2.6183487705346794e-05, + "loss": 0.5002, + "step": 24260 + }, + { + "epoch": 2.430911003155206, + "grad_norm": 2.7066149711608887, + "learning_rate": 2.6167740429343006e-05, + "loss": 0.4957, + "step": 24270 + }, + { + "epoch": 2.4319126558822055, + "grad_norm": 2.0033223628997803, + "learning_rate": 2.6151992688996546e-05, + "loss": 0.462, + "step": 24280 + }, + { + "epoch": 2.432914308609205, + "grad_norm": 2.564603328704834, + "learning_rate": 2.6136244490569366e-05, + "loss": 0.4292, + "step": 24290 + }, + { + "epoch": 2.4339159613362047, + "grad_norm": 3.0108890533447266, + "learning_rate": 2.6120495840323628e-05, + "loss": 0.537, + "step": 24300 + }, + { + "epoch": 2.4349176140632043, + "grad_norm": 2.422274351119995, + "learning_rate": 2.610474674452163e-05, + "loss": 0.4835, + "step": 24310 + }, + { + "epoch": 2.435919266790204, + "grad_norm": 2.4366953372955322, + "learning_rate": 2.6088997209425903e-05, + "loss": 0.4794, + "step": 24320 + }, + { + "epoch": 2.4369209195172035, + "grad_norm": 2.602489709854126, + "learning_rate": 2.607324724129911e-05, + "loss": 0.5207, + "step": 24330 + }, + { + "epoch": 2.437922572244203, + "grad_norm": 2.0761661529541016, + "learning_rate": 2.6057496846404105e-05, + "loss": 0.4881, + "step": 24340 + }, + { + "epoch": 2.4389242249712026, + "grad_norm": 2.906827211380005, + "learning_rate": 2.60417460310039e-05, + "loss": 0.5105, + "step": 24350 + }, + { + "epoch": 2.439925877698202, + "grad_norm": 1.975074052810669, + "learning_rate": 2.602599480136169e-05, + "loss": 0.4768, + "step": 24360 + }, + { + "epoch": 2.440927530425202, + "grad_norm": 2.0563461780548096, + "learning_rate": 2.6010243163740815e-05, + "loss": 0.5195, + "step": 24370 + }, + { + "epoch": 2.441929183152201, + "grad_norm": 1.6481271982192993, + "learning_rate": 2.599449112440481e-05, + "loss": 0.4575, + "step": 24380 + }, + { + "epoch": 2.4429308358792006, + "grad_norm": 1.454416275024414, + "learning_rate": 2.5978738689617322e-05, + "loss": 0.5094, + "step": 24390 + }, + { + "epoch": 2.4439324886062, + "grad_norm": 2.3923304080963135, + "learning_rate": 2.5962985865642198e-05, + "loss": 0.5516, + "step": 24400 + }, + { + "epoch": 2.4449341413331998, + "grad_norm": 2.3013486862182617, + "learning_rate": 2.5947232658743414e-05, + "loss": 0.4856, + "step": 24410 + }, + { + "epoch": 2.4459357940601993, + "grad_norm": 2.057035207748413, + "learning_rate": 2.593147907518511e-05, + "loss": 0.4566, + "step": 24420 + }, + { + "epoch": 2.446937446787199, + "grad_norm": 1.8558269739151, + "learning_rate": 2.5915725121231582e-05, + "loss": 0.4738, + "step": 24430 + }, + { + "epoch": 2.4479390995141985, + "grad_norm": 1.7031394243240356, + "learning_rate": 2.5899970803147246e-05, + "loss": 0.4891, + "step": 24440 + }, + { + "epoch": 2.448940752241198, + "grad_norm": 1.8119549751281738, + "learning_rate": 2.5884216127196694e-05, + "loss": 0.4586, + "step": 24450 + }, + { + "epoch": 2.4499424049681977, + "grad_norm": 2.4061672687530518, + "learning_rate": 2.5868461099644642e-05, + "loss": 0.5631, + "step": 24460 + }, + { + "epoch": 2.450944057695197, + "grad_norm": 1.9223402738571167, + "learning_rate": 2.5852705726755954e-05, + "loss": 0.5479, + "step": 24470 + }, + { + "epoch": 2.4519457104221964, + "grad_norm": 2.5272395610809326, + "learning_rate": 2.583695001479563e-05, + "loss": 0.5184, + "step": 24480 + }, + { + "epoch": 2.452947363149196, + "grad_norm": 2.009592056274414, + "learning_rate": 2.5821193970028806e-05, + "loss": 0.4291, + "step": 24490 + }, + { + "epoch": 2.4539490158761956, + "grad_norm": 2.079152822494507, + "learning_rate": 2.580543759872074e-05, + "loss": 0.4355, + "step": 24500 + }, + { + "epoch": 2.454950668603195, + "grad_norm": 1.9347796440124512, + "learning_rate": 2.5789680907136826e-05, + "loss": 0.5151, + "step": 24510 + }, + { + "epoch": 2.455952321330195, + "grad_norm": 3.365255117416382, + "learning_rate": 2.5773923901542597e-05, + "loss": 0.4749, + "step": 24520 + }, + { + "epoch": 2.4569539740571944, + "grad_norm": 2.0016565322875977, + "learning_rate": 2.575816658820369e-05, + "loss": 0.4134, + "step": 24530 + }, + { + "epoch": 2.457955626784194, + "grad_norm": 2.206012725830078, + "learning_rate": 2.5742408973385883e-05, + "loss": 0.4936, + "step": 24540 + }, + { + "epoch": 2.4589572795111936, + "grad_norm": 1.8435074090957642, + "learning_rate": 2.5726651063355057e-05, + "loss": 0.5471, + "step": 24550 + }, + { + "epoch": 2.459958932238193, + "grad_norm": 3.3343753814697266, + "learning_rate": 2.5710892864377234e-05, + "loss": 0.5009, + "step": 24560 + }, + { + "epoch": 2.4609605849651928, + "grad_norm": 2.1347665786743164, + "learning_rate": 2.5695134382718522e-05, + "loss": 0.4714, + "step": 24570 + }, + { + "epoch": 2.461962237692192, + "grad_norm": 2.361144542694092, + "learning_rate": 2.567937562464517e-05, + "loss": 0.531, + "step": 24580 + }, + { + "epoch": 2.4629638904191915, + "grad_norm": 1.9751992225646973, + "learning_rate": 2.5663616596423508e-05, + "loss": 0.4586, + "step": 24590 + }, + { + "epoch": 2.463965543146191, + "grad_norm": 2.1158523559570312, + "learning_rate": 2.564785730431999e-05, + "loss": 0.598, + "step": 24600 + }, + { + "epoch": 2.4649671958731907, + "grad_norm": 2.8670387268066406, + "learning_rate": 2.5632097754601185e-05, + "loss": 0.5161, + "step": 24610 + }, + { + "epoch": 2.4659688486001903, + "grad_norm": 2.2353668212890625, + "learning_rate": 2.5616337953533736e-05, + "loss": 0.5001, + "step": 24620 + }, + { + "epoch": 2.46697050132719, + "grad_norm": 2.321228504180908, + "learning_rate": 2.560057790738442e-05, + "loss": 0.5423, + "step": 24630 + }, + { + "epoch": 2.4679721540541895, + "grad_norm": 2.136357307434082, + "learning_rate": 2.5584817622420077e-05, + "loss": 0.4286, + "step": 24640 + }, + { + "epoch": 2.468973806781189, + "grad_norm": 1.9053417444229126, + "learning_rate": 2.5569057104907668e-05, + "loss": 0.4948, + "step": 24650 + }, + { + "epoch": 2.4699754595081886, + "grad_norm": 1.6119698286056519, + "learning_rate": 2.555329636111424e-05, + "loss": 0.5038, + "step": 24660 + }, + { + "epoch": 2.4709771122351882, + "grad_norm": 2.262643814086914, + "learning_rate": 2.5537535397306916e-05, + "loss": 0.5284, + "step": 24670 + }, + { + "epoch": 2.471978764962188, + "grad_norm": 2.5736396312713623, + "learning_rate": 2.5521774219752926e-05, + "loss": 0.4789, + "step": 24680 + }, + { + "epoch": 2.472980417689187, + "grad_norm": 1.8912774324417114, + "learning_rate": 2.5506012834719577e-05, + "loss": 0.5158, + "step": 24690 + }, + { + "epoch": 2.4739820704161866, + "grad_norm": 2.2454535961151123, + "learning_rate": 2.5490251248474257e-05, + "loss": 0.4481, + "step": 24700 + }, + { + "epoch": 2.474983723143186, + "grad_norm": 1.9818633794784546, + "learning_rate": 2.547448946728443e-05, + "loss": 0.4488, + "step": 24710 + }, + { + "epoch": 2.4759853758701857, + "grad_norm": 1.9652526378631592, + "learning_rate": 2.545872749741765e-05, + "loss": 0.472, + "step": 24720 + }, + { + "epoch": 2.4769870285971853, + "grad_norm": 2.1900923252105713, + "learning_rate": 2.5442965345141533e-05, + "loss": 0.426, + "step": 24730 + }, + { + "epoch": 2.477988681324185, + "grad_norm": 1.747668743133545, + "learning_rate": 2.5427203016723782e-05, + "loss": 0.4571, + "step": 24740 + }, + { + "epoch": 2.4789903340511845, + "grad_norm": 2.722621202468872, + "learning_rate": 2.5411440518432145e-05, + "loss": 0.533, + "step": 24750 + }, + { + "epoch": 2.479991986778184, + "grad_norm": 1.841064453125, + "learning_rate": 2.5395677856534477e-05, + "loss": 0.3963, + "step": 24760 + }, + { + "epoch": 2.4809936395051837, + "grad_norm": 1.9657102823257446, + "learning_rate": 2.537991503729865e-05, + "loss": 0.4979, + "step": 24770 + }, + { + "epoch": 2.4819952922321833, + "grad_norm": 2.6763949394226074, + "learning_rate": 2.536415206699264e-05, + "loss": 0.5105, + "step": 24780 + }, + { + "epoch": 2.482996944959183, + "grad_norm": 2.6003458499908447, + "learning_rate": 2.5348388951884472e-05, + "loss": 0.4752, + "step": 24790 + }, + { + "epoch": 2.483998597686182, + "grad_norm": 3.0447092056274414, + "learning_rate": 2.5332625698242195e-05, + "loss": 0.5319, + "step": 24800 + }, + { + "epoch": 2.4850002504131816, + "grad_norm": 1.5755362510681152, + "learning_rate": 2.5316862312333978e-05, + "loss": 0.4119, + "step": 24810 + }, + { + "epoch": 2.486001903140181, + "grad_norm": 1.8942782878875732, + "learning_rate": 2.530109880042797e-05, + "loss": 0.5605, + "step": 24820 + }, + { + "epoch": 2.487003555867181, + "grad_norm": 2.6547932624816895, + "learning_rate": 2.5285335168792435e-05, + "loss": 0.4788, + "step": 24830 + }, + { + "epoch": 2.4880052085941804, + "grad_norm": 2.3786137104034424, + "learning_rate": 2.5269571423695638e-05, + "loss": 0.4729, + "step": 24840 + }, + { + "epoch": 2.48900686132118, + "grad_norm": 2.6195545196533203, + "learning_rate": 2.5253807571405913e-05, + "loss": 0.5661, + "step": 24850 + }, + { + "epoch": 2.4900085140481796, + "grad_norm": 2.390591621398926, + "learning_rate": 2.523804361819163e-05, + "loss": 0.4755, + "step": 24860 + }, + { + "epoch": 2.491010166775179, + "grad_norm": 2.008957624435425, + "learning_rate": 2.52222795703212e-05, + "loss": 0.5102, + "step": 24870 + }, + { + "epoch": 2.4920118195021788, + "grad_norm": 1.8178975582122803, + "learning_rate": 2.520651543406307e-05, + "loss": 0.5363, + "step": 24880 + }, + { + "epoch": 2.493013472229178, + "grad_norm": 1.8573118448257446, + "learning_rate": 2.5190751215685727e-05, + "loss": 0.4536, + "step": 24890 + }, + { + "epoch": 2.494015124956178, + "grad_norm": 1.995599627494812, + "learning_rate": 2.5174986921457677e-05, + "loss": 0.5196, + "step": 24900 + }, + { + "epoch": 2.495016777683177, + "grad_norm": 2.5662119388580322, + "learning_rate": 2.5159222557647473e-05, + "loss": 0.4686, + "step": 24910 + }, + { + "epoch": 2.4960184304101767, + "grad_norm": 2.2614381313323975, + "learning_rate": 2.514345813052369e-05, + "loss": 0.4819, + "step": 24920 + }, + { + "epoch": 2.4970200831371763, + "grad_norm": 2.1325461864471436, + "learning_rate": 2.5127693646354915e-05, + "loss": 0.414, + "step": 24930 + }, + { + "epoch": 2.498021735864176, + "grad_norm": 2.0967817306518555, + "learning_rate": 2.5111929111409795e-05, + "loss": 0.4791, + "step": 24940 + }, + { + "epoch": 2.4990233885911755, + "grad_norm": 2.1351726055145264, + "learning_rate": 2.509616453195694e-05, + "loss": 0.4869, + "step": 24950 + }, + { + "epoch": 2.500025041318175, + "grad_norm": 2.286760091781616, + "learning_rate": 2.5080399914265034e-05, + "loss": 0.5563, + "step": 24960 + }, + { + "epoch": 2.5010266940451746, + "grad_norm": 2.3411989212036133, + "learning_rate": 2.506463526460274e-05, + "loss": 0.5398, + "step": 24970 + }, + { + "epoch": 2.5020283467721742, + "grad_norm": 2.220675230026245, + "learning_rate": 2.504887058923875e-05, + "loss": 0.5249, + "step": 24980 + }, + { + "epoch": 2.503029999499174, + "grad_norm": 2.0709569454193115, + "learning_rate": 2.503310589444176e-05, + "loss": 0.5202, + "step": 24990 + }, + { + "epoch": 2.504031652226173, + "grad_norm": 2.3817501068115234, + "learning_rate": 2.501734118648047e-05, + "loss": 0.4466, + "step": 25000 + }, + { + "epoch": 2.505033304953173, + "grad_norm": 2.626307487487793, + "learning_rate": 2.50015764716236e-05, + "loss": 0.4598, + "step": 25010 + }, + { + "epoch": 2.506034957680172, + "grad_norm": 2.288289785385132, + "learning_rate": 2.498581175613986e-05, + "loss": 0.4345, + "step": 25020 + }, + { + "epoch": 2.5070366104071717, + "grad_norm": 2.8735311031341553, + "learning_rate": 2.497004704629796e-05, + "loss": 0.4907, + "step": 25030 + }, + { + "epoch": 2.5080382631341713, + "grad_norm": 2.262436866760254, + "learning_rate": 2.4954282348366613e-05, + "loss": 0.5129, + "step": 25040 + }, + { + "epoch": 2.509039915861171, + "grad_norm": 2.4624216556549072, + "learning_rate": 2.493851766861452e-05, + "loss": 0.43, + "step": 25050 + }, + { + "epoch": 2.5100415685881705, + "grad_norm": 2.532715082168579, + "learning_rate": 2.4922753013310397e-05, + "loss": 0.4568, + "step": 25060 + }, + { + "epoch": 2.51104322131517, + "grad_norm": 2.2034213542938232, + "learning_rate": 2.490698838872292e-05, + "loss": 0.4944, + "step": 25070 + }, + { + "epoch": 2.5120448740421697, + "grad_norm": 1.553830623626709, + "learning_rate": 2.489122380112076e-05, + "loss": 0.4666, + "step": 25080 + }, + { + "epoch": 2.5130465267691693, + "grad_norm": 2.5888707637786865, + "learning_rate": 2.48754592567726e-05, + "loss": 0.4994, + "step": 25090 + }, + { + "epoch": 2.514048179496169, + "grad_norm": 2.0383970737457275, + "learning_rate": 2.4859694761947074e-05, + "loss": 0.4831, + "step": 25100 + }, + { + "epoch": 2.515049832223168, + "grad_norm": 2.4339680671691895, + "learning_rate": 2.4843930322912795e-05, + "loss": 0.4834, + "step": 25110 + }, + { + "epoch": 2.5160514849501676, + "grad_norm": 2.0037882328033447, + "learning_rate": 2.482816594593839e-05, + "loss": 0.5053, + "step": 25120 + }, + { + "epoch": 2.517053137677167, + "grad_norm": 2.148824691772461, + "learning_rate": 2.481240163729243e-05, + "loss": 0.4823, + "step": 25130 + }, + { + "epoch": 2.518054790404167, + "grad_norm": 2.539546251296997, + "learning_rate": 2.4796637403243462e-05, + "loss": 0.5396, + "step": 25140 + }, + { + "epoch": 2.5190564431311664, + "grad_norm": 2.2066774368286133, + "learning_rate": 2.478087325006e-05, + "loss": 0.4696, + "step": 25150 + }, + { + "epoch": 2.520058095858166, + "grad_norm": 2.336604595184326, + "learning_rate": 2.476510918401056e-05, + "loss": 0.4451, + "step": 25160 + }, + { + "epoch": 2.5210597485851656, + "grad_norm": 2.245626211166382, + "learning_rate": 2.4749345211363574e-05, + "loss": 0.4704, + "step": 25170 + }, + { + "epoch": 2.522061401312165, + "grad_norm": 2.162360429763794, + "learning_rate": 2.473358133838746e-05, + "loss": 0.4521, + "step": 25180 + }, + { + "epoch": 2.5230630540391648, + "grad_norm": 1.6677041053771973, + "learning_rate": 2.4717817571350617e-05, + "loss": 0.4513, + "step": 25190 + }, + { + "epoch": 2.524064706766164, + "grad_norm": 1.8102422952651978, + "learning_rate": 2.470205391652137e-05, + "loss": 0.4525, + "step": 25200 + }, + { + "epoch": 2.525066359493164, + "grad_norm": 3.0435376167297363, + "learning_rate": 2.468629038016799e-05, + "loss": 0.5204, + "step": 25210 + }, + { + "epoch": 2.526068012220163, + "grad_norm": 2.122325897216797, + "learning_rate": 2.4670526968558755e-05, + "loss": 0.4684, + "step": 25220 + }, + { + "epoch": 2.5270696649471627, + "grad_norm": 2.4607677459716797, + "learning_rate": 2.465476368796184e-05, + "loss": 0.554, + "step": 25230 + }, + { + "epoch": 2.5280713176741623, + "grad_norm": 1.8819490671157837, + "learning_rate": 2.4639000544645403e-05, + "loss": 0.4483, + "step": 25240 + }, + { + "epoch": 2.529072970401162, + "grad_norm": 2.0729873180389404, + "learning_rate": 2.4623237544877514e-05, + "loss": 0.4738, + "step": 25250 + }, + { + "epoch": 2.5300746231281614, + "grad_norm": 1.9538005590438843, + "learning_rate": 2.4607474694926213e-05, + "loss": 0.4464, + "step": 25260 + }, + { + "epoch": 2.531076275855161, + "grad_norm": 2.4094536304473877, + "learning_rate": 2.459171200105948e-05, + "loss": 0.4938, + "step": 25270 + }, + { + "epoch": 2.5320779285821606, + "grad_norm": 2.4971518516540527, + "learning_rate": 2.4575949469545213e-05, + "loss": 0.5492, + "step": 25280 + }, + { + "epoch": 2.53307958130916, + "grad_norm": 2.3595423698425293, + "learning_rate": 2.4560187106651257e-05, + "loss": 0.4369, + "step": 25290 + }, + { + "epoch": 2.53408123403616, + "grad_norm": 1.8320460319519043, + "learning_rate": 2.4544424918645396e-05, + "loss": 0.4955, + "step": 25300 + }, + { + "epoch": 2.535082886763159, + "grad_norm": 2.384916305541992, + "learning_rate": 2.4528662911795342e-05, + "loss": 0.4669, + "step": 25310 + }, + { + "epoch": 2.536084539490159, + "grad_norm": 2.4493815898895264, + "learning_rate": 2.451290109236872e-05, + "loss": 0.4963, + "step": 25320 + }, + { + "epoch": 2.537086192217158, + "grad_norm": 1.902748942375183, + "learning_rate": 2.449713946663309e-05, + "loss": 0.4792, + "step": 25330 + }, + { + "epoch": 2.5380878449441577, + "grad_norm": 2.6030337810516357, + "learning_rate": 2.4481378040855955e-05, + "loss": 0.4986, + "step": 25340 + }, + { + "epoch": 2.5390894976711573, + "grad_norm": 3.346127510070801, + "learning_rate": 2.446561682130471e-05, + "loss": 0.4734, + "step": 25350 + }, + { + "epoch": 2.540091150398157, + "grad_norm": 2.2294840812683105, + "learning_rate": 2.4449855814246668e-05, + "loss": 0.443, + "step": 25360 + }, + { + "epoch": 2.5410928031251565, + "grad_norm": 1.9707425832748413, + "learning_rate": 2.4434095025949085e-05, + "loss": 0.5229, + "step": 25370 + }, + { + "epoch": 2.542094455852156, + "grad_norm": 2.2785727977752686, + "learning_rate": 2.4418334462679112e-05, + "loss": 0.4674, + "step": 25380 + }, + { + "epoch": 2.5430961085791557, + "grad_norm": 2.2725393772125244, + "learning_rate": 2.4402574130703795e-05, + "loss": 0.4049, + "step": 25390 + }, + { + "epoch": 2.5440977613061553, + "grad_norm": 1.964576244354248, + "learning_rate": 2.438681403629012e-05, + "loss": 0.5268, + "step": 25400 + }, + { + "epoch": 2.545099414033155, + "grad_norm": 2.273085594177246, + "learning_rate": 2.437105418570495e-05, + "loss": 0.4817, + "step": 25410 + }, + { + "epoch": 2.546101066760154, + "grad_norm": 2.092212438583374, + "learning_rate": 2.4355294585215084e-05, + "loss": 0.4439, + "step": 25420 + }, + { + "epoch": 2.547102719487154, + "grad_norm": 2.514829635620117, + "learning_rate": 2.433953524108718e-05, + "loss": 0.492, + "step": 25430 + }, + { + "epoch": 2.548104372214153, + "grad_norm": 2.0971784591674805, + "learning_rate": 2.4323776159587828e-05, + "loss": 0.4162, + "step": 25440 + }, + { + "epoch": 2.549106024941153, + "grad_norm": 2.2613589763641357, + "learning_rate": 2.4308017346983508e-05, + "loss": 0.5042, + "step": 25450 + }, + { + "epoch": 2.5501076776681524, + "grad_norm": 2.3998570442199707, + "learning_rate": 2.429225880954056e-05, + "loss": 0.4472, + "step": 25460 + }, + { + "epoch": 2.551109330395152, + "grad_norm": 2.4489238262176514, + "learning_rate": 2.4276500553525267e-05, + "loss": 0.4959, + "step": 25470 + }, + { + "epoch": 2.5521109831221516, + "grad_norm": 2.075310468673706, + "learning_rate": 2.4260742585203755e-05, + "loss": 0.5041, + "step": 25480 + }, + { + "epoch": 2.553112635849151, + "grad_norm": 2.417213201522827, + "learning_rate": 2.4244984910842076e-05, + "loss": 0.4575, + "step": 25490 + }, + { + "epoch": 2.5541142885761507, + "grad_norm": 2.008910655975342, + "learning_rate": 2.4229227536706126e-05, + "loss": 0.4455, + "step": 25500 + }, + { + "epoch": 2.5551159413031503, + "grad_norm": 2.7786033153533936, + "learning_rate": 2.42134704690617e-05, + "loss": 0.4653, + "step": 25510 + }, + { + "epoch": 2.55611759403015, + "grad_norm": 1.7164369821548462, + "learning_rate": 2.4197713714174477e-05, + "loss": 0.4826, + "step": 25520 + }, + { + "epoch": 2.557119246757149, + "grad_norm": 2.7447457313537598, + "learning_rate": 2.418195727831001e-05, + "loss": 0.4789, + "step": 25530 + }, + { + "epoch": 2.558120899484149, + "grad_norm": 1.7751262187957764, + "learning_rate": 2.4166201167733705e-05, + "loss": 0.5007, + "step": 25540 + }, + { + "epoch": 2.5591225522111483, + "grad_norm": 2.5949177742004395, + "learning_rate": 2.415044538871086e-05, + "loss": 0.475, + "step": 25550 + }, + { + "epoch": 2.560124204938148, + "grad_norm": 2.2382097244262695, + "learning_rate": 2.413468994750665e-05, + "loss": 0.4627, + "step": 25560 + }, + { + "epoch": 2.5611258576651474, + "grad_norm": 3.2741479873657227, + "learning_rate": 2.4118934850386076e-05, + "loss": 0.4902, + "step": 25570 + }, + { + "epoch": 2.562127510392147, + "grad_norm": 1.8402247428894043, + "learning_rate": 2.4103180103614042e-05, + "loss": 0.5062, + "step": 25580 + }, + { + "epoch": 2.5631291631191466, + "grad_norm": 3.088942766189575, + "learning_rate": 2.408742571345529e-05, + "loss": 0.452, + "step": 25590 + }, + { + "epoch": 2.564130815846146, + "grad_norm": 2.6764485836029053, + "learning_rate": 2.4071671686174444e-05, + "loss": 0.4863, + "step": 25600 + }, + { + "epoch": 2.565132468573146, + "grad_norm": 1.7545393705368042, + "learning_rate": 2.4055918028035944e-05, + "loss": 0.5092, + "step": 25610 + }, + { + "epoch": 2.5661341213001454, + "grad_norm": 2.4561171531677246, + "learning_rate": 2.404016474530412e-05, + "loss": 0.4584, + "step": 25620 + }, + { + "epoch": 2.567135774027145, + "grad_norm": 2.4661591053009033, + "learning_rate": 2.4024411844243136e-05, + "loss": 0.47, + "step": 25630 + }, + { + "epoch": 2.568137426754144, + "grad_norm": 2.1460041999816895, + "learning_rate": 2.4008659331117018e-05, + "loss": 0.5056, + "step": 25640 + }, + { + "epoch": 2.5691390794811437, + "grad_norm": 2.1262660026550293, + "learning_rate": 2.3992907212189615e-05, + "loss": 0.511, + "step": 25650 + }, + { + "epoch": 2.5701407322081433, + "grad_norm": 2.1861934661865234, + "learning_rate": 2.3977155493724624e-05, + "loss": 0.4645, + "step": 25660 + }, + { + "epoch": 2.571142384935143, + "grad_norm": 4.291914939880371, + "learning_rate": 2.3961404181985613e-05, + "loss": 0.5166, + "step": 25670 + }, + { + "epoch": 2.5721440376621425, + "grad_norm": 2.78710675239563, + "learning_rate": 2.394565328323595e-05, + "loss": 0.5688, + "step": 25680 + }, + { + "epoch": 2.573145690389142, + "grad_norm": 2.88307523727417, + "learning_rate": 2.3929902803738852e-05, + "loss": 0.4476, + "step": 25690 + }, + { + "epoch": 2.5741473431161417, + "grad_norm": 2.102853298187256, + "learning_rate": 2.391415274975738e-05, + "loss": 0.522, + "step": 25700 + }, + { + "epoch": 2.5751489958431413, + "grad_norm": 2.1062817573547363, + "learning_rate": 2.3898403127554423e-05, + "loss": 0.5218, + "step": 25710 + }, + { + "epoch": 2.576150648570141, + "grad_norm": 2.3642542362213135, + "learning_rate": 2.3882653943392678e-05, + "loss": 0.4886, + "step": 25720 + }, + { + "epoch": 2.57715230129714, + "grad_norm": 1.5191824436187744, + "learning_rate": 2.3866905203534693e-05, + "loss": 0.4415, + "step": 25730 + }, + { + "epoch": 2.57815395402414, + "grad_norm": 2.072801113128662, + "learning_rate": 2.385115691424283e-05, + "loss": 0.4764, + "step": 25740 + }, + { + "epoch": 2.579155606751139, + "grad_norm": 2.228182554244995, + "learning_rate": 2.3835409081779252e-05, + "loss": 0.5007, + "step": 25750 + }, + { + "epoch": 2.580157259478139, + "grad_norm": 1.7731989622116089, + "learning_rate": 2.3819661712405983e-05, + "loss": 0.4685, + "step": 25760 + }, + { + "epoch": 2.5811589122051384, + "grad_norm": 1.9319185018539429, + "learning_rate": 2.3803914812384823e-05, + "loss": 0.4991, + "step": 25770 + }, + { + "epoch": 2.582160564932138, + "grad_norm": 2.0118401050567627, + "learning_rate": 2.3788168387977424e-05, + "loss": 0.5158, + "step": 25780 + }, + { + "epoch": 2.5831622176591376, + "grad_norm": 1.7415335178375244, + "learning_rate": 2.377242244544519e-05, + "loss": 0.4717, + "step": 25790 + }, + { + "epoch": 2.584163870386137, + "grad_norm": 2.1140804290771484, + "learning_rate": 2.3756676991049402e-05, + "loss": 0.4628, + "step": 25800 + }, + { + "epoch": 2.5851655231131367, + "grad_norm": 1.9852862358093262, + "learning_rate": 2.3740932031051092e-05, + "loss": 0.5444, + "step": 25810 + }, + { + "epoch": 2.5861671758401363, + "grad_norm": 1.569001317024231, + "learning_rate": 2.3725187571711138e-05, + "loss": 0.478, + "step": 25820 + }, + { + "epoch": 2.587168828567136, + "grad_norm": 1.7986372709274292, + "learning_rate": 2.3709443619290184e-05, + "loss": 0.4702, + "step": 25830 + }, + { + "epoch": 2.588170481294135, + "grad_norm": 2.3105223178863525, + "learning_rate": 2.3693700180048688e-05, + "loss": 0.4755, + "step": 25840 + }, + { + "epoch": 2.589172134021135, + "grad_norm": 1.8925827741622925, + "learning_rate": 2.3677957260246917e-05, + "loss": 0.4492, + "step": 25850 + }, + { + "epoch": 2.5901737867481343, + "grad_norm": 2.353822946548462, + "learning_rate": 2.36622148661449e-05, + "loss": 0.5658, + "step": 25860 + }, + { + "epoch": 2.591175439475134, + "grad_norm": 2.317485809326172, + "learning_rate": 2.364647300400248e-05, + "loss": 0.4531, + "step": 25870 + }, + { + "epoch": 2.5921770922021334, + "grad_norm": 1.9144796133041382, + "learning_rate": 2.363073168007929e-05, + "loss": 0.4664, + "step": 25880 + }, + { + "epoch": 2.593178744929133, + "grad_norm": 2.278697967529297, + "learning_rate": 2.361499090063474e-05, + "loss": 0.4907, + "step": 25890 + }, + { + "epoch": 2.5941803976561326, + "grad_norm": 2.4261505603790283, + "learning_rate": 2.3599250671928014e-05, + "loss": 0.4243, + "step": 25900 + }, + { + "epoch": 2.595182050383132, + "grad_norm": 2.083456039428711, + "learning_rate": 2.3583511000218103e-05, + "loss": 0.4674, + "step": 25910 + }, + { + "epoch": 2.596183703110132, + "grad_norm": 2.447664737701416, + "learning_rate": 2.3567771891763763e-05, + "loss": 0.493, + "step": 25920 + }, + { + "epoch": 2.5971853558371314, + "grad_norm": 2.034817934036255, + "learning_rate": 2.3552033352823505e-05, + "loss": 0.4827, + "step": 25930 + }, + { + "epoch": 2.598187008564131, + "grad_norm": 2.444237470626831, + "learning_rate": 2.3536295389655654e-05, + "loss": 0.4507, + "step": 25940 + }, + { + "epoch": 2.59918866129113, + "grad_norm": 2.6944849491119385, + "learning_rate": 2.3520558008518273e-05, + "loss": 0.5128, + "step": 25950 + }, + { + "epoch": 2.60019031401813, + "grad_norm": 1.7680457830429077, + "learning_rate": 2.3504821215669228e-05, + "loss": 0.4819, + "step": 25960 + }, + { + "epoch": 2.6011919667451293, + "grad_norm": 2.6250157356262207, + "learning_rate": 2.3489085017366097e-05, + "loss": 0.5471, + "step": 25970 + }, + { + "epoch": 2.602193619472129, + "grad_norm": 1.8260471820831299, + "learning_rate": 2.3473349419866275e-05, + "loss": 0.4382, + "step": 25980 + }, + { + "epoch": 2.6031952721991285, + "grad_norm": 2.2221004962921143, + "learning_rate": 2.345761442942689e-05, + "loss": 0.4822, + "step": 25990 + }, + { + "epoch": 2.604196924926128, + "grad_norm": 1.932762622833252, + "learning_rate": 2.3441880052304842e-05, + "loss": 0.5148, + "step": 26000 + }, + { + "epoch": 2.6051985776531277, + "grad_norm": 2.2098388671875, + "learning_rate": 2.3426146294756774e-05, + "loss": 0.5548, + "step": 26010 + }, + { + "epoch": 2.6062002303801273, + "grad_norm": 2.6943869590759277, + "learning_rate": 2.3410413163039088e-05, + "loss": 0.4284, + "step": 26020 + }, + { + "epoch": 2.607201883107127, + "grad_norm": 1.8194222450256348, + "learning_rate": 2.3394680663407954e-05, + "loss": 0.4428, + "step": 26030 + }, + { + "epoch": 2.6082035358341265, + "grad_norm": 2.8261239528656006, + "learning_rate": 2.3378948802119254e-05, + "loss": 0.5257, + "step": 26040 + }, + { + "epoch": 2.609205188561126, + "grad_norm": 2.3305418491363525, + "learning_rate": 2.3363217585428644e-05, + "loss": 0.3921, + "step": 26050 + }, + { + "epoch": 2.610206841288125, + "grad_norm": 1.934168815612793, + "learning_rate": 2.3347487019591524e-05, + "loss": 0.4497, + "step": 26060 + }, + { + "epoch": 2.6112084940151252, + "grad_norm": 2.6037657260894775, + "learning_rate": 2.3331757110863028e-05, + "loss": 0.5347, + "step": 26070 + }, + { + "epoch": 2.6122101467421244, + "grad_norm": 2.024768829345703, + "learning_rate": 2.3316027865498017e-05, + "loss": 0.4866, + "step": 26080 + }, + { + "epoch": 2.613211799469124, + "grad_norm": 2.0269052982330322, + "learning_rate": 2.330029928975111e-05, + "loss": 0.4654, + "step": 26090 + }, + { + "epoch": 2.6142134521961236, + "grad_norm": 2.378782033920288, + "learning_rate": 2.3284571389876643e-05, + "loss": 0.5363, + "step": 26100 + }, + { + "epoch": 2.615215104923123, + "grad_norm": 2.4418351650238037, + "learning_rate": 2.3268844172128703e-05, + "loss": 0.4381, + "step": 26110 + }, + { + "epoch": 2.6162167576501227, + "grad_norm": 2.0797080993652344, + "learning_rate": 2.325311764276108e-05, + "loss": 0.4586, + "step": 26120 + }, + { + "epoch": 2.6172184103771223, + "grad_norm": 2.9233202934265137, + "learning_rate": 2.323739180802731e-05, + "loss": 0.5051, + "step": 26130 + }, + { + "epoch": 2.618220063104122, + "grad_norm": 1.9914222955703735, + "learning_rate": 2.3221666674180647e-05, + "loss": 0.4812, + "step": 26140 + }, + { + "epoch": 2.6192217158311215, + "grad_norm": 1.946094036102295, + "learning_rate": 2.320594224747405e-05, + "loss": 0.5021, + "step": 26150 + }, + { + "epoch": 2.620223368558121, + "grad_norm": 2.0537545680999756, + "learning_rate": 2.3190218534160233e-05, + "loss": 0.4171, + "step": 26160 + }, + { + "epoch": 2.6212250212851202, + "grad_norm": 2.107255697250366, + "learning_rate": 2.3174495540491588e-05, + "loss": 0.4683, + "step": 26170 + }, + { + "epoch": 2.62222667401212, + "grad_norm": 1.7646708488464355, + "learning_rate": 2.3158773272720254e-05, + "loss": 0.4256, + "step": 26180 + }, + { + "epoch": 2.6232283267391194, + "grad_norm": 1.94224214553833, + "learning_rate": 2.3143051737098054e-05, + "loss": 0.4832, + "step": 26190 + }, + { + "epoch": 2.624229979466119, + "grad_norm": 2.5952892303466797, + "learning_rate": 2.312733093987653e-05, + "loss": 0.4565, + "step": 26200 + }, + { + "epoch": 2.6252316321931186, + "grad_norm": 1.9447942972183228, + "learning_rate": 2.3111610887306946e-05, + "loss": 0.4124, + "step": 26210 + }, + { + "epoch": 2.626233284920118, + "grad_norm": 2.0823614597320557, + "learning_rate": 2.3095891585640246e-05, + "loss": 0.5009, + "step": 26220 + }, + { + "epoch": 2.627234937647118, + "grad_norm": 2.4931070804595947, + "learning_rate": 2.3080173041127074e-05, + "loss": 0.4633, + "step": 26230 + }, + { + "epoch": 2.6282365903741174, + "grad_norm": 2.088278293609619, + "learning_rate": 2.3064455260017803e-05, + "loss": 0.4155, + "step": 26240 + }, + { + "epoch": 2.629238243101117, + "grad_norm": 2.1908822059631348, + "learning_rate": 2.3048738248562478e-05, + "loss": 0.3935, + "step": 26250 + }, + { + "epoch": 2.630239895828116, + "grad_norm": 2.7435402870178223, + "learning_rate": 2.3033022013010834e-05, + "loss": 0.4813, + "step": 26260 + }, + { + "epoch": 2.631241548555116, + "grad_norm": 2.0322299003601074, + "learning_rate": 2.301730655961232e-05, + "loss": 0.4314, + "step": 26270 + }, + { + "epoch": 2.6322432012821153, + "grad_norm": 2.4347589015960693, + "learning_rate": 2.300159189461605e-05, + "loss": 0.4644, + "step": 26280 + }, + { + "epoch": 2.633244854009115, + "grad_norm": 2.0065510272979736, + "learning_rate": 2.298587802427085e-05, + "loss": 0.4897, + "step": 26290 + }, + { + "epoch": 2.6342465067361145, + "grad_norm": 2.5837438106536865, + "learning_rate": 2.2970164954825192e-05, + "loss": 0.469, + "step": 26300 + }, + { + "epoch": 2.635248159463114, + "grad_norm": 2.4908740520477295, + "learning_rate": 2.2954452692527277e-05, + "loss": 0.4813, + "step": 26310 + }, + { + "epoch": 2.6362498121901137, + "grad_norm": 2.1361074447631836, + "learning_rate": 2.293874124362495e-05, + "loss": 0.4876, + "step": 26320 + }, + { + "epoch": 2.6372514649171133, + "grad_norm": 2.353395462036133, + "learning_rate": 2.2923030614365735e-05, + "loss": 0.4718, + "step": 26330 + }, + { + "epoch": 2.638253117644113, + "grad_norm": 1.8883154392242432, + "learning_rate": 2.290732081099685e-05, + "loss": 0.5121, + "step": 26340 + }, + { + "epoch": 2.6392547703711124, + "grad_norm": 2.065039873123169, + "learning_rate": 2.289161183976517e-05, + "loss": 0.488, + "step": 26350 + }, + { + "epoch": 2.640256423098112, + "grad_norm": 2.287727117538452, + "learning_rate": 2.287590370691725e-05, + "loss": 0.4939, + "step": 26360 + }, + { + "epoch": 2.641258075825111, + "grad_norm": 2.5591814517974854, + "learning_rate": 2.2860196418699293e-05, + "loss": 0.4275, + "step": 26370 + }, + { + "epoch": 2.642259728552111, + "grad_norm": 2.074220657348633, + "learning_rate": 2.284448998135717e-05, + "loss": 0.4809, + "step": 26380 + }, + { + "epoch": 2.6432613812791104, + "grad_norm": 1.7956701517105103, + "learning_rate": 2.282878440113644e-05, + "loss": 0.4729, + "step": 26390 + }, + { + "epoch": 2.64426303400611, + "grad_norm": 2.273275375366211, + "learning_rate": 2.28130796842823e-05, + "loss": 0.5459, + "step": 26400 + }, + { + "epoch": 2.6452646867331095, + "grad_norm": 2.206364393234253, + "learning_rate": 2.2797375837039586e-05, + "loss": 0.5378, + "step": 26410 + }, + { + "epoch": 2.646266339460109, + "grad_norm": 2.165149688720703, + "learning_rate": 2.2781672865652824e-05, + "loss": 0.4299, + "step": 26420 + }, + { + "epoch": 2.6472679921871087, + "grad_norm": 2.0546000003814697, + "learning_rate": 2.276597077636618e-05, + "loss": 0.4466, + "step": 26430 + }, + { + "epoch": 2.6482696449141083, + "grad_norm": 1.8035660982131958, + "learning_rate": 2.2750269575423447e-05, + "loss": 0.4712, + "step": 26440 + }, + { + "epoch": 2.649271297641108, + "grad_norm": 2.0470290184020996, + "learning_rate": 2.2734569269068092e-05, + "loss": 0.4758, + "step": 26450 + }, + { + "epoch": 2.6502729503681075, + "grad_norm": 2.314500570297241, + "learning_rate": 2.271886986354322e-05, + "loss": 0.5292, + "step": 26460 + }, + { + "epoch": 2.651274603095107, + "grad_norm": 1.8078278303146362, + "learning_rate": 2.2703171365091577e-05, + "loss": 0.4468, + "step": 26470 + }, + { + "epoch": 2.6522762558221062, + "grad_norm": 2.535794734954834, + "learning_rate": 2.2687473779955526e-05, + "loss": 0.4847, + "step": 26480 + }, + { + "epoch": 2.6532779085491063, + "grad_norm": 2.081583023071289, + "learning_rate": 2.267177711437711e-05, + "loss": 0.4684, + "step": 26490 + }, + { + "epoch": 2.6542795612761054, + "grad_norm": 2.319704294204712, + "learning_rate": 2.2656081374597976e-05, + "loss": 0.4822, + "step": 26500 + }, + { + "epoch": 2.655281214003105, + "grad_norm": 2.5577144622802734, + "learning_rate": 2.2640386566859398e-05, + "loss": 0.4674, + "step": 26510 + }, + { + "epoch": 2.6562828667301046, + "grad_norm": 1.8538458347320557, + "learning_rate": 2.2624692697402306e-05, + "loss": 0.4069, + "step": 26520 + }, + { + "epoch": 2.657284519457104, + "grad_norm": 2.406271457672119, + "learning_rate": 2.2608999772467228e-05, + "loss": 0.5484, + "step": 26530 + }, + { + "epoch": 2.658286172184104, + "grad_norm": 2.2202024459838867, + "learning_rate": 2.2593307798294355e-05, + "loss": 0.3974, + "step": 26540 + }, + { + "epoch": 2.6592878249111034, + "grad_norm": 1.886047601699829, + "learning_rate": 2.257761678112345e-05, + "loss": 0.4842, + "step": 26550 + }, + { + "epoch": 2.660289477638103, + "grad_norm": 2.4025936126708984, + "learning_rate": 2.256192672719393e-05, + "loss": 0.466, + "step": 26560 + }, + { + "epoch": 2.6612911303651026, + "grad_norm": 2.0745341777801514, + "learning_rate": 2.254623764274482e-05, + "loss": 0.4589, + "step": 26570 + }, + { + "epoch": 2.662292783092102, + "grad_norm": 1.9398627281188965, + "learning_rate": 2.2530549534014772e-05, + "loss": 0.507, + "step": 26580 + }, + { + "epoch": 2.6632944358191013, + "grad_norm": 2.9563961029052734, + "learning_rate": 2.2514862407242012e-05, + "loss": 0.4202, + "step": 26590 + }, + { + "epoch": 2.6642960885461013, + "grad_norm": 2.6384031772613525, + "learning_rate": 2.249917626866442e-05, + "loss": 0.4569, + "step": 26600 + }, + { + "epoch": 2.6652977412731005, + "grad_norm": 2.049692153930664, + "learning_rate": 2.2483491124519465e-05, + "loss": 0.4576, + "step": 26610 + }, + { + "epoch": 2.6662993940001, + "grad_norm": 2.3128302097320557, + "learning_rate": 2.2467806981044207e-05, + "loss": 0.4881, + "step": 26620 + }, + { + "epoch": 2.6673010467270997, + "grad_norm": 1.87588369846344, + "learning_rate": 2.2452123844475322e-05, + "loss": 0.4082, + "step": 26630 + }, + { + "epoch": 2.6683026994540993, + "grad_norm": 2.448084592819214, + "learning_rate": 2.2436441721049095e-05, + "loss": 0.4678, + "step": 26640 + }, + { + "epoch": 2.669304352181099, + "grad_norm": 2.088381290435791, + "learning_rate": 2.2420760617001395e-05, + "loss": 0.4423, + "step": 26650 + }, + { + "epoch": 2.6703060049080984, + "grad_norm": 2.1151883602142334, + "learning_rate": 2.2405080538567673e-05, + "loss": 0.4899, + "step": 26660 + }, + { + "epoch": 2.671307657635098, + "grad_norm": 2.232419729232788, + "learning_rate": 2.238940149198301e-05, + "loss": 0.5182, + "step": 26670 + }, + { + "epoch": 2.6723093103620976, + "grad_norm": 1.9108939170837402, + "learning_rate": 2.237372348348203e-05, + "loss": 0.4394, + "step": 26680 + }, + { + "epoch": 2.673310963089097, + "grad_norm": 2.0978970527648926, + "learning_rate": 2.2358046519298997e-05, + "loss": 0.5118, + "step": 26690 + }, + { + "epoch": 2.6743126158160964, + "grad_norm": 2.155327558517456, + "learning_rate": 2.234237060566771e-05, + "loss": 0.5335, + "step": 26700 + }, + { + "epoch": 2.675314268543096, + "grad_norm": 2.3690075874328613, + "learning_rate": 2.2326695748821565e-05, + "loss": 0.4261, + "step": 26710 + }, + { + "epoch": 2.6763159212700955, + "grad_norm": 1.9474866390228271, + "learning_rate": 2.2311021954993572e-05, + "loss": 0.5525, + "step": 26720 + }, + { + "epoch": 2.677317573997095, + "grad_norm": 2.135596990585327, + "learning_rate": 2.2295349230416266e-05, + "loss": 0.5035, + "step": 26730 + }, + { + "epoch": 2.6783192267240947, + "grad_norm": 2.132472276687622, + "learning_rate": 2.227967758132178e-05, + "loss": 0.5243, + "step": 26740 + }, + { + "epoch": 2.6793208794510943, + "grad_norm": 1.8780027627944946, + "learning_rate": 2.226400701394184e-05, + "loss": 0.4771, + "step": 26750 + }, + { + "epoch": 2.680322532178094, + "grad_norm": 1.9652684926986694, + "learning_rate": 2.224833753450771e-05, + "loss": 0.4811, + "step": 26760 + }, + { + "epoch": 2.6813241849050935, + "grad_norm": 2.099169969558716, + "learning_rate": 2.2232669149250227e-05, + "loss": 0.4777, + "step": 26770 + }, + { + "epoch": 2.682325837632093, + "grad_norm": 2.4007065296173096, + "learning_rate": 2.221700186439981e-05, + "loss": 0.4176, + "step": 26780 + }, + { + "epoch": 2.6833274903590922, + "grad_norm": 2.2021305561065674, + "learning_rate": 2.2201335686186435e-05, + "loss": 0.5119, + "step": 26790 + }, + { + "epoch": 2.6843291430860923, + "grad_norm": 2.159282922744751, + "learning_rate": 2.218567062083962e-05, + "loss": 0.4317, + "step": 26800 + }, + { + "epoch": 2.6853307958130914, + "grad_norm": 2.309873342514038, + "learning_rate": 2.217000667458845e-05, + "loss": 0.5505, + "step": 26810 + }, + { + "epoch": 2.686332448540091, + "grad_norm": 2.6905899047851562, + "learning_rate": 2.2154343853661586e-05, + "loss": 0.4556, + "step": 26820 + }, + { + "epoch": 2.6873341012670906, + "grad_norm": 1.9428925514221191, + "learning_rate": 2.2138682164287217e-05, + "loss": 0.4622, + "step": 26830 + }, + { + "epoch": 2.68833575399409, + "grad_norm": 2.6776959896087646, + "learning_rate": 2.212302161269308e-05, + "loss": 0.5363, + "step": 26840 + }, + { + "epoch": 2.68933740672109, + "grad_norm": 2.054520606994629, + "learning_rate": 2.2107362205106477e-05, + "loss": 0.4442, + "step": 26850 + }, + { + "epoch": 2.6903390594480894, + "grad_norm": 1.8479695320129395, + "learning_rate": 2.209170394775424e-05, + "loss": 0.4855, + "step": 26860 + }, + { + "epoch": 2.691340712175089, + "grad_norm": 2.017530679702759, + "learning_rate": 2.2076046846862767e-05, + "loss": 0.4774, + "step": 26870 + }, + { + "epoch": 2.6923423649020886, + "grad_norm": 2.2654614448547363, + "learning_rate": 2.2060390908657964e-05, + "loss": 0.5063, + "step": 26880 + }, + { + "epoch": 2.693344017629088, + "grad_norm": 2.8104443550109863, + "learning_rate": 2.2044736139365286e-05, + "loss": 0.4963, + "step": 26890 + }, + { + "epoch": 2.6943456703560873, + "grad_norm": 2.534173011779785, + "learning_rate": 2.2029082545209748e-05, + "loss": 0.5132, + "step": 26900 + }, + { + "epoch": 2.6953473230830873, + "grad_norm": 2.722679376602173, + "learning_rate": 2.2013430132415858e-05, + "loss": 0.4832, + "step": 26910 + }, + { + "epoch": 2.6963489758100865, + "grad_norm": 1.6218446493148804, + "learning_rate": 2.1997778907207673e-05, + "loss": 0.4437, + "step": 26920 + }, + { + "epoch": 2.697350628537086, + "grad_norm": 2.0946335792541504, + "learning_rate": 2.198212887580879e-05, + "loss": 0.4733, + "step": 26930 + }, + { + "epoch": 2.6983522812640857, + "grad_norm": 2.202590227127075, + "learning_rate": 2.196648004444232e-05, + "loss": 0.4577, + "step": 26940 + }, + { + "epoch": 2.6993539339910853, + "grad_norm": 2.196267604827881, + "learning_rate": 2.195083241933088e-05, + "loss": 0.5106, + "step": 26950 + }, + { + "epoch": 2.700355586718085, + "grad_norm": 2.427388906478882, + "learning_rate": 2.1935186006696634e-05, + "loss": 0.4277, + "step": 26960 + }, + { + "epoch": 2.7013572394450844, + "grad_norm": 2.214708089828491, + "learning_rate": 2.191954081276126e-05, + "loss": 0.4305, + "step": 26970 + }, + { + "epoch": 2.702358892172084, + "grad_norm": 2.6666109561920166, + "learning_rate": 2.1903896843745932e-05, + "loss": 0.4622, + "step": 26980 + }, + { + "epoch": 2.7033605448990836, + "grad_norm": 2.6057868003845215, + "learning_rate": 2.188825410587135e-05, + "loss": 0.4764, + "step": 26990 + }, + { + "epoch": 2.704362197626083, + "grad_norm": 2.430042266845703, + "learning_rate": 2.1872612605357735e-05, + "loss": 0.4984, + "step": 27000 + }, + { + "epoch": 2.7053638503530824, + "grad_norm": 2.1184325218200684, + "learning_rate": 2.1856972348424805e-05, + "loss": 0.4949, + "step": 27010 + }, + { + "epoch": 2.7063655030800824, + "grad_norm": 1.98492431640625, + "learning_rate": 2.184133334129176e-05, + "loss": 0.5301, + "step": 27020 + }, + { + "epoch": 2.7073671558070815, + "grad_norm": 1.8999614715576172, + "learning_rate": 2.1825695590177355e-05, + "loss": 0.4537, + "step": 27030 + }, + { + "epoch": 2.708368808534081, + "grad_norm": 3.519404649734497, + "learning_rate": 2.1810059101299802e-05, + "loss": 0.4338, + "step": 27040 + }, + { + "epoch": 2.7093704612610807, + "grad_norm": 2.756481409072876, + "learning_rate": 2.1794423880876842e-05, + "loss": 0.472, + "step": 27050 + }, + { + "epoch": 2.7103721139880803, + "grad_norm": 2.2912352085113525, + "learning_rate": 2.1778789935125674e-05, + "loss": 0.4935, + "step": 27060 + }, + { + "epoch": 2.71137376671508, + "grad_norm": 1.8153929710388184, + "learning_rate": 2.1763157270263017e-05, + "loss": 0.4422, + "step": 27070 + }, + { + "epoch": 2.7123754194420795, + "grad_norm": 2.11661434173584, + "learning_rate": 2.1747525892505094e-05, + "loss": 0.484, + "step": 27080 + }, + { + "epoch": 2.713377072169079, + "grad_norm": 2.561905860900879, + "learning_rate": 2.1731895808067583e-05, + "loss": 0.4715, + "step": 27090 + }, + { + "epoch": 2.7143787248960787, + "grad_norm": 2.2325327396392822, + "learning_rate": 2.171626702316565e-05, + "loss": 0.4948, + "step": 27100 + }, + { + "epoch": 2.7153803776230783, + "grad_norm": 2.3941023349761963, + "learning_rate": 2.170063954401398e-05, + "loss": 0.5549, + "step": 27110 + }, + { + "epoch": 2.7163820303500774, + "grad_norm": 1.9785363674163818, + "learning_rate": 2.1685013376826715e-05, + "loss": 0.4756, + "step": 27120 + }, + { + "epoch": 2.7173836830770774, + "grad_norm": 2.551440715789795, + "learning_rate": 2.166938852781746e-05, + "loss": 0.5079, + "step": 27130 + }, + { + "epoch": 2.7183853358040766, + "grad_norm": 2.1827147006988525, + "learning_rate": 2.1653765003199315e-05, + "loss": 0.4197, + "step": 27140 + }, + { + "epoch": 2.719386988531076, + "grad_norm": 1.9968968629837036, + "learning_rate": 2.163814280918486e-05, + "loss": 0.4311, + "step": 27150 + }, + { + "epoch": 2.720388641258076, + "grad_norm": 1.8684368133544922, + "learning_rate": 2.162252195198613e-05, + "loss": 0.416, + "step": 27160 + }, + { + "epoch": 2.7213902939850754, + "grad_norm": 2.0815813541412354, + "learning_rate": 2.160690243781463e-05, + "loss": 0.4611, + "step": 27170 + }, + { + "epoch": 2.722391946712075, + "grad_norm": 2.146352767944336, + "learning_rate": 2.159128427288134e-05, + "loss": 0.4603, + "step": 27180 + }, + { + "epoch": 2.7233935994390746, + "grad_norm": 2.8626322746276855, + "learning_rate": 2.157566746339671e-05, + "loss": 0.484, + "step": 27190 + }, + { + "epoch": 2.724395252166074, + "grad_norm": 1.959716796875, + "learning_rate": 2.1560052015570613e-05, + "loss": 0.4973, + "step": 27200 + }, + { + "epoch": 2.7253969048930737, + "grad_norm": 2.7703990936279297, + "learning_rate": 2.154443793561243e-05, + "loss": 0.4741, + "step": 27210 + }, + { + "epoch": 2.7263985576200733, + "grad_norm": 2.6336097717285156, + "learning_rate": 2.1528825229730966e-05, + "loss": 0.4971, + "step": 27220 + }, + { + "epoch": 2.7274002103470725, + "grad_norm": 2.6301822662353516, + "learning_rate": 2.1513213904134496e-05, + "loss": 0.4954, + "step": 27230 + }, + { + "epoch": 2.728401863074072, + "grad_norm": 2.201345920562744, + "learning_rate": 2.1497603965030736e-05, + "loss": 0.4635, + "step": 27240 + }, + { + "epoch": 2.7294035158010717, + "grad_norm": 2.9680652618408203, + "learning_rate": 2.1481995418626842e-05, + "loss": 0.4896, + "step": 27250 + }, + { + "epoch": 2.7304051685280712, + "grad_norm": 1.5185983180999756, + "learning_rate": 2.1466388271129458e-05, + "loss": 0.4887, + "step": 27260 + }, + { + "epoch": 2.731406821255071, + "grad_norm": 2.366058111190796, + "learning_rate": 2.1450782528744605e-05, + "loss": 0.4411, + "step": 27270 + }, + { + "epoch": 2.7324084739820704, + "grad_norm": 2.3551275730133057, + "learning_rate": 2.143517819767781e-05, + "loss": 0.3853, + "step": 27280 + }, + { + "epoch": 2.73341012670907, + "grad_norm": 1.7083995342254639, + "learning_rate": 2.1419575284133993e-05, + "loss": 0.4732, + "step": 27290 + }, + { + "epoch": 2.7344117794360696, + "grad_norm": 2.4988021850585938, + "learning_rate": 2.1403973794317543e-05, + "loss": 0.546, + "step": 27300 + }, + { + "epoch": 2.735413432163069, + "grad_norm": 2.1923186779022217, + "learning_rate": 2.1388373734432256e-05, + "loss": 0.4854, + "step": 27310 + }, + { + "epoch": 2.7364150848900683, + "grad_norm": 2.622400999069214, + "learning_rate": 2.1372775110681376e-05, + "loss": 0.4919, + "step": 27320 + }, + { + "epoch": 2.7374167376170684, + "grad_norm": 2.5335519313812256, + "learning_rate": 2.1357177929267574e-05, + "loss": 0.5083, + "step": 27330 + }, + { + "epoch": 2.7384183903440675, + "grad_norm": 2.4166882038116455, + "learning_rate": 2.134158219639295e-05, + "loss": 0.4475, + "step": 27340 + }, + { + "epoch": 2.739420043071067, + "grad_norm": 2.1072611808776855, + "learning_rate": 2.1325987918259006e-05, + "loss": 0.498, + "step": 27350 + }, + { + "epoch": 2.7404216957980667, + "grad_norm": 2.3170204162597656, + "learning_rate": 2.1310395101066692e-05, + "loss": 0.4514, + "step": 27360 + }, + { + "epoch": 2.7414233485250663, + "grad_norm": 2.4249660968780518, + "learning_rate": 2.129480375101638e-05, + "loss": 0.4603, + "step": 27370 + }, + { + "epoch": 2.742425001252066, + "grad_norm": 1.9688012599945068, + "learning_rate": 2.1279213874307818e-05, + "loss": 0.4447, + "step": 27380 + }, + { + "epoch": 2.7434266539790655, + "grad_norm": 2.5289130210876465, + "learning_rate": 2.126362547714022e-05, + "loss": 0.4885, + "step": 27390 + }, + { + "epoch": 2.744428306706065, + "grad_norm": 1.826878309249878, + "learning_rate": 2.1248038565712175e-05, + "loss": 0.4917, + "step": 27400 + }, + { + "epoch": 2.7454299594330647, + "grad_norm": 2.649292230606079, + "learning_rate": 2.123245314622171e-05, + "loss": 0.4552, + "step": 27410 + }, + { + "epoch": 2.7464316121600643, + "grad_norm": 3.107866048812866, + "learning_rate": 2.1216869224866226e-05, + "loss": 0.4791, + "step": 27420 + }, + { + "epoch": 2.7474332648870634, + "grad_norm": 2.3746352195739746, + "learning_rate": 2.120128680784255e-05, + "loss": 0.4838, + "step": 27430 + }, + { + "epoch": 2.7484349176140634, + "grad_norm": 1.829545497894287, + "learning_rate": 2.118570590134691e-05, + "loss": 0.4865, + "step": 27440 + }, + { + "epoch": 2.7494365703410626, + "grad_norm": 1.9630305767059326, + "learning_rate": 2.1170126511574927e-05, + "loss": 0.4979, + "step": 27450 + }, + { + "epoch": 2.750438223068062, + "grad_norm": 2.2091236114501953, + "learning_rate": 2.115454864472162e-05, + "loss": 0.5135, + "step": 27460 + }, + { + "epoch": 2.7514398757950618, + "grad_norm": 2.330111265182495, + "learning_rate": 2.1138972306981398e-05, + "loss": 0.4824, + "step": 27470 + }, + { + "epoch": 2.7524415285220614, + "grad_norm": 2.7530877590179443, + "learning_rate": 2.1123397504548087e-05, + "loss": 0.4567, + "step": 27480 + }, + { + "epoch": 2.753443181249061, + "grad_norm": 2.259148597717285, + "learning_rate": 2.1107824243614865e-05, + "loss": 0.4989, + "step": 27490 + }, + { + "epoch": 2.7544448339760605, + "grad_norm": 1.6459332704544067, + "learning_rate": 2.109225253037431e-05, + "loss": 0.4808, + "step": 27500 + }, + { + "epoch": 2.75544648670306, + "grad_norm": 2.2297933101654053, + "learning_rate": 2.107668237101841e-05, + "loss": 0.4954, + "step": 27510 + }, + { + "epoch": 2.7564481394300597, + "grad_norm": 2.1366751194000244, + "learning_rate": 2.106111377173851e-05, + "loss": 0.4906, + "step": 27520 + }, + { + "epoch": 2.7574497921570593, + "grad_norm": 2.337979555130005, + "learning_rate": 2.104554673872532e-05, + "loss": 0.5064, + "step": 27530 + }, + { + "epoch": 2.7584514448840585, + "grad_norm": 2.3288912773132324, + "learning_rate": 2.102998127816897e-05, + "loss": 0.4822, + "step": 27540 + }, + { + "epoch": 2.7594530976110585, + "grad_norm": 2.4183363914489746, + "learning_rate": 2.1014417396258936e-05, + "loss": 0.5117, + "step": 27550 + }, + { + "epoch": 2.7604547503380577, + "grad_norm": 2.034332752227783, + "learning_rate": 2.0998855099184058e-05, + "loss": 0.4364, + "step": 27560 + }, + { + "epoch": 2.7614564030650572, + "grad_norm": 2.2769012451171875, + "learning_rate": 2.0983294393132575e-05, + "loss": 0.4577, + "step": 27570 + }, + { + "epoch": 2.762458055792057, + "grad_norm": 2.3711085319519043, + "learning_rate": 2.0967735284292065e-05, + "loss": 0.5053, + "step": 27580 + }, + { + "epoch": 2.7634597085190564, + "grad_norm": 1.7482993602752686, + "learning_rate": 2.09521777788495e-05, + "loss": 0.4199, + "step": 27590 + }, + { + "epoch": 2.764461361246056, + "grad_norm": 1.9056187868118286, + "learning_rate": 2.0936621882991186e-05, + "loss": 0.4214, + "step": 27600 + }, + { + "epoch": 2.7654630139730556, + "grad_norm": 1.977642297744751, + "learning_rate": 2.0921067602902804e-05, + "loss": 0.4373, + "step": 27610 + }, + { + "epoch": 2.766464666700055, + "grad_norm": 2.014859199523926, + "learning_rate": 2.0905514944769382e-05, + "loss": 0.4806, + "step": 27620 + }, + { + "epoch": 2.767466319427055, + "grad_norm": 3.8795015811920166, + "learning_rate": 2.0889963914775333e-05, + "loss": 0.4404, + "step": 27630 + }, + { + "epoch": 2.7684679721540544, + "grad_norm": 2.2615413665771484, + "learning_rate": 2.0874414519104375e-05, + "loss": 0.5025, + "step": 27640 + }, + { + "epoch": 2.7694696248810535, + "grad_norm": 3.039480209350586, + "learning_rate": 2.085886676393961e-05, + "loss": 0.4841, + "step": 27650 + }, + { + "epoch": 2.7704712776080536, + "grad_norm": 2.475024700164795, + "learning_rate": 2.084332065546349e-05, + "loss": 0.5154, + "step": 27660 + }, + { + "epoch": 2.7714729303350527, + "grad_norm": 2.402113437652588, + "learning_rate": 2.082777619985778e-05, + "loss": 0.4713, + "step": 27670 + }, + { + "epoch": 2.7724745830620523, + "grad_norm": 1.7097605466842651, + "learning_rate": 2.0812233403303625e-05, + "loss": 0.4631, + "step": 27680 + }, + { + "epoch": 2.773476235789052, + "grad_norm": 2.578178644180298, + "learning_rate": 2.079669227198149e-05, + "loss": 0.4864, + "step": 27690 + }, + { + "epoch": 2.7744778885160515, + "grad_norm": 2.374117374420166, + "learning_rate": 2.0781152812071187e-05, + "loss": 0.4303, + "step": 27700 + }, + { + "epoch": 2.775479541243051, + "grad_norm": 2.412297010421753, + "learning_rate": 2.0765615029751843e-05, + "loss": 0.5018, + "step": 27710 + }, + { + "epoch": 2.7764811939700507, + "grad_norm": 2.166706085205078, + "learning_rate": 2.075007893120195e-05, + "loss": 0.4195, + "step": 27720 + }, + { + "epoch": 2.7774828466970503, + "grad_norm": 2.3247227668762207, + "learning_rate": 2.0734544522599298e-05, + "loss": 0.4862, + "step": 27730 + }, + { + "epoch": 2.77848449942405, + "grad_norm": 2.5825328826904297, + "learning_rate": 2.0719011810121043e-05, + "loss": 0.5212, + "step": 27740 + }, + { + "epoch": 2.7794861521510494, + "grad_norm": 3.512326955795288, + "learning_rate": 2.070348079994363e-05, + "loss": 0.5204, + "step": 27750 + }, + { + "epoch": 2.7804878048780486, + "grad_norm": 2.1720283031463623, + "learning_rate": 2.0687951498242833e-05, + "loss": 0.4784, + "step": 27760 + }, + { + "epoch": 2.781489457605048, + "grad_norm": 2.6881589889526367, + "learning_rate": 2.067242391119378e-05, + "loss": 0.4718, + "step": 27770 + }, + { + "epoch": 2.7824911103320478, + "grad_norm": 1.799422025680542, + "learning_rate": 2.0656898044970866e-05, + "loss": 0.4398, + "step": 27780 + }, + { + "epoch": 2.7834927630590474, + "grad_norm": 2.7559547424316406, + "learning_rate": 2.064137390574784e-05, + "loss": 0.5155, + "step": 27790 + }, + { + "epoch": 2.784494415786047, + "grad_norm": 2.2045767307281494, + "learning_rate": 2.0625851499697747e-05, + "loss": 0.5234, + "step": 27800 + }, + { + "epoch": 2.7854960685130465, + "grad_norm": 3.2407376766204834, + "learning_rate": 2.0610330832992966e-05, + "loss": 0.514, + "step": 27810 + }, + { + "epoch": 2.786497721240046, + "grad_norm": 1.689829707145691, + "learning_rate": 2.0594811911805145e-05, + "loss": 0.3743, + "step": 27820 + }, + { + "epoch": 2.7874993739670457, + "grad_norm": 2.45869517326355, + "learning_rate": 2.0579294742305263e-05, + "loss": 0.5228, + "step": 27830 + }, + { + "epoch": 2.7885010266940453, + "grad_norm": 2.170257806777954, + "learning_rate": 2.0563779330663614e-05, + "loss": 0.4958, + "step": 27840 + }, + { + "epoch": 2.7895026794210445, + "grad_norm": 2.5842323303222656, + "learning_rate": 2.0548265683049763e-05, + "loss": 0.5162, + "step": 27850 + }, + { + "epoch": 2.7905043321480445, + "grad_norm": 2.090120315551758, + "learning_rate": 2.0532753805632588e-05, + "loss": 0.4321, + "step": 27860 + }, + { + "epoch": 2.7915059848750436, + "grad_norm": 1.8928121328353882, + "learning_rate": 2.051724370458027e-05, + "loss": 0.4296, + "step": 27870 + }, + { + "epoch": 2.7925076376020432, + "grad_norm": 2.0915029048919678, + "learning_rate": 2.050173538606028e-05, + "loss": 0.4516, + "step": 27880 + }, + { + "epoch": 2.793509290329043, + "grad_norm": 2.43007230758667, + "learning_rate": 2.0486228856239366e-05, + "loss": 0.4801, + "step": 27890 + }, + { + "epoch": 2.7945109430560424, + "grad_norm": 1.7025607824325562, + "learning_rate": 2.0470724121283584e-05, + "loss": 0.5191, + "step": 27900 + }, + { + "epoch": 2.795512595783042, + "grad_norm": 2.6428115367889404, + "learning_rate": 2.0455221187358258e-05, + "loss": 0.4427, + "step": 27910 + }, + { + "epoch": 2.7965142485100416, + "grad_norm": 2.2131521701812744, + "learning_rate": 2.043972006062803e-05, + "loss": 0.5153, + "step": 27920 + }, + { + "epoch": 2.797515901237041, + "grad_norm": 2.5547220706939697, + "learning_rate": 2.0424220747256777e-05, + "loss": 0.4719, + "step": 27930 + }, + { + "epoch": 2.798517553964041, + "grad_norm": 2.3055248260498047, + "learning_rate": 2.040872325340769e-05, + "loss": 0.4596, + "step": 27940 + }, + { + "epoch": 2.7995192066910404, + "grad_norm": 2.9861836433410645, + "learning_rate": 2.039322758524322e-05, + "loss": 0.4723, + "step": 27950 + }, + { + "epoch": 2.8005208594180395, + "grad_norm": 1.8417468070983887, + "learning_rate": 2.0377733748925082e-05, + "loss": 0.5091, + "step": 27960 + }, + { + "epoch": 2.8015225121450396, + "grad_norm": 2.0634090900421143, + "learning_rate": 2.0362241750614298e-05, + "loss": 0.4419, + "step": 27970 + }, + { + "epoch": 2.8025241648720387, + "grad_norm": 1.7758311033248901, + "learning_rate": 2.034675159647112e-05, + "loss": 0.5149, + "step": 27980 + }, + { + "epoch": 2.8035258175990383, + "grad_norm": 2.2538533210754395, + "learning_rate": 2.0331263292655106e-05, + "loss": 0.5028, + "step": 27990 + }, + { + "epoch": 2.804527470326038, + "grad_norm": 2.3173768520355225, + "learning_rate": 2.0315776845325038e-05, + "loss": 0.4044, + "step": 28000 + }, + { + "epoch": 2.8055291230530375, + "grad_norm": 1.9004391431808472, + "learning_rate": 2.030029226063898e-05, + "loss": 0.4568, + "step": 28010 + }, + { + "epoch": 2.806530775780037, + "grad_norm": 2.0229578018188477, + "learning_rate": 2.028480954475427e-05, + "loss": 0.4042, + "step": 28020 + }, + { + "epoch": 2.8075324285070367, + "grad_norm": 2.288827657699585, + "learning_rate": 2.0269328703827468e-05, + "loss": 0.4943, + "step": 28030 + }, + { + "epoch": 2.8085340812340363, + "grad_norm": 2.060798168182373, + "learning_rate": 2.0253849744014405e-05, + "loss": 0.4906, + "step": 28040 + }, + { + "epoch": 2.809535733961036, + "grad_norm": 1.997622013092041, + "learning_rate": 2.0238372671470186e-05, + "loss": 0.4681, + "step": 28050 + }, + { + "epoch": 2.8105373866880354, + "grad_norm": 2.37577748298645, + "learning_rate": 2.0222897492349137e-05, + "loss": 0.4847, + "step": 28060 + }, + { + "epoch": 2.8115390394150346, + "grad_norm": 2.299891471862793, + "learning_rate": 2.0207424212804822e-05, + "loss": 0.4673, + "step": 28070 + }, + { + "epoch": 2.8125406921420346, + "grad_norm": 1.9214786291122437, + "learning_rate": 2.0191952838990093e-05, + "loss": 0.4824, + "step": 28080 + }, + { + "epoch": 2.8135423448690338, + "grad_norm": 2.1179656982421875, + "learning_rate": 2.0176483377056997e-05, + "loss": 0.5231, + "step": 28090 + }, + { + "epoch": 2.8145439975960334, + "grad_norm": 1.6549978256225586, + "learning_rate": 2.0161015833156877e-05, + "loss": 0.5406, + "step": 28100 + }, + { + "epoch": 2.815545650323033, + "grad_norm": 1.342764973640442, + "learning_rate": 2.0145550213440233e-05, + "loss": 0.3921, + "step": 28110 + }, + { + "epoch": 2.8165473030500325, + "grad_norm": 2.5125224590301514, + "learning_rate": 2.0130086524056873e-05, + "loss": 0.476, + "step": 28120 + }, + { + "epoch": 2.817548955777032, + "grad_norm": 2.1776862144470215, + "learning_rate": 2.011462477115581e-05, + "loss": 0.5087, + "step": 28130 + }, + { + "epoch": 2.8185506085040317, + "grad_norm": 2.026686668395996, + "learning_rate": 2.009916496088527e-05, + "loss": 0.461, + "step": 28140 + }, + { + "epoch": 2.8195522612310313, + "grad_norm": 2.4303576946258545, + "learning_rate": 2.008370709939274e-05, + "loss": 0.4916, + "step": 28150 + }, + { + "epoch": 2.820553913958031, + "grad_norm": 2.582199811935425, + "learning_rate": 2.0068251192824904e-05, + "loss": 0.4763, + "step": 28160 + }, + { + "epoch": 2.8215555666850305, + "grad_norm": 2.5607872009277344, + "learning_rate": 2.00527972473277e-05, + "loss": 0.5032, + "step": 28170 + }, + { + "epoch": 2.8225572194120296, + "grad_norm": 2.378352165222168, + "learning_rate": 2.003734526904624e-05, + "loss": 0.4109, + "step": 28180 + }, + { + "epoch": 2.8235588721390297, + "grad_norm": 1.7307512760162354, + "learning_rate": 2.0021895264124892e-05, + "loss": 0.4661, + "step": 28190 + }, + { + "epoch": 2.824560524866029, + "grad_norm": 2.8087072372436523, + "learning_rate": 2.000644723870723e-05, + "loss": 0.4496, + "step": 28200 + }, + { + "epoch": 2.8255621775930284, + "grad_norm": 1.670059323310852, + "learning_rate": 1.9991001198936043e-05, + "loss": 0.4691, + "step": 28210 + }, + { + "epoch": 2.826563830320028, + "grad_norm": 2.1640727519989014, + "learning_rate": 1.9975557150953307e-05, + "loss": 0.4691, + "step": 28220 + }, + { + "epoch": 2.8275654830470276, + "grad_norm": 2.104672431945801, + "learning_rate": 1.996011510090024e-05, + "loss": 0.4978, + "step": 28230 + }, + { + "epoch": 2.828567135774027, + "grad_norm": 2.842073917388916, + "learning_rate": 1.994467505491725e-05, + "loss": 0.4459, + "step": 28240 + }, + { + "epoch": 2.8295687885010268, + "grad_norm": 2.6502578258514404, + "learning_rate": 1.9929237019143937e-05, + "loss": 0.4711, + "step": 28250 + }, + { + "epoch": 2.8305704412280264, + "grad_norm": 1.7367525100708008, + "learning_rate": 1.9913800999719122e-05, + "loss": 0.4424, + "step": 28260 + }, + { + "epoch": 2.8315720939550255, + "grad_norm": 1.9925302267074585, + "learning_rate": 1.989836700278081e-05, + "loss": 0.4808, + "step": 28270 + }, + { + "epoch": 2.8325737466820256, + "grad_norm": 2.3260669708251953, + "learning_rate": 1.988293503446623e-05, + "loss": 0.401, + "step": 28280 + }, + { + "epoch": 2.8335753994090247, + "grad_norm": 2.675114154815674, + "learning_rate": 1.9867505100911744e-05, + "loss": 0.4942, + "step": 28290 + }, + { + "epoch": 2.8345770521360243, + "grad_norm": 2.485140085220337, + "learning_rate": 1.985207720825296e-05, + "loss": 0.5136, + "step": 28300 + }, + { + "epoch": 2.835578704863024, + "grad_norm": 2.7628536224365234, + "learning_rate": 1.9836651362624666e-05, + "loss": 0.4945, + "step": 28310 + }, + { + "epoch": 2.8365803575900235, + "grad_norm": 2.1196231842041016, + "learning_rate": 1.9821227570160804e-05, + "loss": 0.5113, + "step": 28320 + }, + { + "epoch": 2.837582010317023, + "grad_norm": 1.7910830974578857, + "learning_rate": 1.9805805836994546e-05, + "loss": 0.4488, + "step": 28330 + }, + { + "epoch": 2.8385836630440227, + "grad_norm": 2.0061752796173096, + "learning_rate": 1.9790386169258197e-05, + "loss": 0.4598, + "step": 28340 + }, + { + "epoch": 2.8395853157710222, + "grad_norm": 2.4097883701324463, + "learning_rate": 1.9774968573083294e-05, + "loss": 0.4771, + "step": 28350 + }, + { + "epoch": 2.840586968498022, + "grad_norm": 1.9780919551849365, + "learning_rate": 1.97595530546005e-05, + "loss": 0.5228, + "step": 28360 + }, + { + "epoch": 2.8415886212250214, + "grad_norm": 2.0265541076660156, + "learning_rate": 1.9744139619939673e-05, + "loss": 0.4618, + "step": 28370 + }, + { + "epoch": 2.8425902739520206, + "grad_norm": 2.245145082473755, + "learning_rate": 1.972872827522985e-05, + "loss": 0.5279, + "step": 28380 + }, + { + "epoch": 2.8435919266790206, + "grad_norm": 2.6057050228118896, + "learning_rate": 1.9713319026599235e-05, + "loss": 0.4666, + "step": 28390 + }, + { + "epoch": 2.8445935794060198, + "grad_norm": 1.8463797569274902, + "learning_rate": 1.9697911880175178e-05, + "loss": 0.4472, + "step": 28400 + }, + { + "epoch": 2.8455952321330193, + "grad_norm": 2.043485403060913, + "learning_rate": 1.9682506842084218e-05, + "loss": 0.4854, + "step": 28410 + }, + { + "epoch": 2.846596884860019, + "grad_norm": 2.281378746032715, + "learning_rate": 1.966710391845205e-05, + "loss": 0.499, + "step": 28420 + }, + { + "epoch": 2.8475985375870185, + "grad_norm": 1.9262113571166992, + "learning_rate": 1.9651703115403514e-05, + "loss": 0.5275, + "step": 28430 + }, + { + "epoch": 2.848600190314018, + "grad_norm": 1.9217075109481812, + "learning_rate": 1.9636304439062613e-05, + "loss": 0.4581, + "step": 28440 + }, + { + "epoch": 2.8496018430410177, + "grad_norm": 2.031644105911255, + "learning_rate": 1.9620907895552515e-05, + "loss": 0.4833, + "step": 28450 + }, + { + "epoch": 2.8506034957680173, + "grad_norm": 1.8989890813827515, + "learning_rate": 1.9605513490995543e-05, + "loss": 0.4247, + "step": 28460 + }, + { + "epoch": 2.851605148495017, + "grad_norm": 2.3920555114746094, + "learning_rate": 1.9590121231513137e-05, + "loss": 0.5115, + "step": 28470 + }, + { + "epoch": 2.8526068012220165, + "grad_norm": 2.1428468227386475, + "learning_rate": 1.9574731123225922e-05, + "loss": 0.5196, + "step": 28480 + }, + { + "epoch": 2.8536084539490156, + "grad_norm": 1.9174599647521973, + "learning_rate": 1.9559343172253645e-05, + "loss": 0.5146, + "step": 28490 + }, + { + "epoch": 2.8546101066760157, + "grad_norm": 1.9081904888153076, + "learning_rate": 1.9543957384715213e-05, + "loss": 0.4258, + "step": 28500 + }, + { + "epoch": 2.855611759403015, + "grad_norm": 2.5541326999664307, + "learning_rate": 1.952857376672865e-05, + "loss": 0.5465, + "step": 28510 + }, + { + "epoch": 2.8566134121300144, + "grad_norm": 2.1741418838500977, + "learning_rate": 1.951319232441113e-05, + "loss": 0.5138, + "step": 28520 + }, + { + "epoch": 2.857615064857014, + "grad_norm": 2.374300479888916, + "learning_rate": 1.9497813063878977e-05, + "loss": 0.5121, + "step": 28530 + }, + { + "epoch": 2.8586167175840136, + "grad_norm": 2.0950210094451904, + "learning_rate": 1.948243599124761e-05, + "loss": 0.4595, + "step": 28540 + }, + { + "epoch": 2.859618370311013, + "grad_norm": 1.7163071632385254, + "learning_rate": 1.94670611126316e-05, + "loss": 0.5161, + "step": 28550 + }, + { + "epoch": 2.8606200230380128, + "grad_norm": 2.6274149417877197, + "learning_rate": 1.945168843414466e-05, + "loss": 0.4492, + "step": 28560 + }, + { + "epoch": 2.8616216757650124, + "grad_norm": 2.2445178031921387, + "learning_rate": 1.9436317961899612e-05, + "loss": 0.4481, + "step": 28570 + }, + { + "epoch": 2.862623328492012, + "grad_norm": 2.0394294261932373, + "learning_rate": 1.9420949702008384e-05, + "loss": 0.4574, + "step": 28580 + }, + { + "epoch": 2.8636249812190115, + "grad_norm": 2.9116296768188477, + "learning_rate": 1.9405583660582062e-05, + "loss": 0.4636, + "step": 28590 + }, + { + "epoch": 2.8646266339460107, + "grad_norm": 2.4173495769500732, + "learning_rate": 1.9390219843730825e-05, + "loss": 0.4435, + "step": 28600 + }, + { + "epoch": 2.8656282866730107, + "grad_norm": 2.1900784969329834, + "learning_rate": 1.9374858257563968e-05, + "loss": 0.4313, + "step": 28610 + }, + { + "epoch": 2.86662993940001, + "grad_norm": 2.1137681007385254, + "learning_rate": 1.9359498908189893e-05, + "loss": 0.4938, + "step": 28620 + }, + { + "epoch": 2.8676315921270095, + "grad_norm": 2.0387117862701416, + "learning_rate": 1.934414180171614e-05, + "loss": 0.4846, + "step": 28630 + }, + { + "epoch": 2.868633244854009, + "grad_norm": 2.3030261993408203, + "learning_rate": 1.9328786944249347e-05, + "loss": 0.444, + "step": 28640 + }, + { + "epoch": 2.8696348975810086, + "grad_norm": 2.0840537548065186, + "learning_rate": 1.9313434341895224e-05, + "loss": 0.4349, + "step": 28650 + }, + { + "epoch": 2.8706365503080082, + "grad_norm": 2.0024921894073486, + "learning_rate": 1.929808400075863e-05, + "loss": 0.4305, + "step": 28660 + }, + { + "epoch": 2.871638203035008, + "grad_norm": 2.2334063053131104, + "learning_rate": 1.9282735926943492e-05, + "loss": 0.4807, + "step": 28670 + }, + { + "epoch": 2.8726398557620074, + "grad_norm": 2.153905153274536, + "learning_rate": 1.9267390126552876e-05, + "loss": 0.4767, + "step": 28680 + }, + { + "epoch": 2.873641508489007, + "grad_norm": 1.7628467082977295, + "learning_rate": 1.925204660568889e-05, + "loss": 0.4862, + "step": 28690 + }, + { + "epoch": 2.8746431612160066, + "grad_norm": 2.5055811405181885, + "learning_rate": 1.923670537045277e-05, + "loss": 0.5051, + "step": 28700 + }, + { + "epoch": 2.8756448139430058, + "grad_norm": 2.421543836593628, + "learning_rate": 1.9221366426944845e-05, + "loss": 0.4746, + "step": 28710 + }, + { + "epoch": 2.876646466670006, + "grad_norm": 2.5517852306365967, + "learning_rate": 1.9206029781264513e-05, + "loss": 0.4647, + "step": 28720 + }, + { + "epoch": 2.877648119397005, + "grad_norm": 1.869573712348938, + "learning_rate": 1.919069543951027e-05, + "loss": 0.4866, + "step": 28730 + }, + { + "epoch": 2.8786497721240045, + "grad_norm": 2.2313554286956787, + "learning_rate": 1.91753634077797e-05, + "loss": 0.4412, + "step": 28740 + }, + { + "epoch": 2.879651424851004, + "grad_norm": 2.8452532291412354, + "learning_rate": 1.916003369216947e-05, + "loss": 0.5655, + "step": 28750 + }, + { + "epoch": 2.8806530775780037, + "grad_norm": 2.1433467864990234, + "learning_rate": 1.91447062987753e-05, + "loss": 0.4176, + "step": 28760 + }, + { + "epoch": 2.8816547303050033, + "grad_norm": 1.7829129695892334, + "learning_rate": 1.9129381233692014e-05, + "loss": 0.4787, + "step": 28770 + }, + { + "epoch": 2.882656383032003, + "grad_norm": 2.395498275756836, + "learning_rate": 1.911405850301351e-05, + "loss": 0.4389, + "step": 28780 + }, + { + "epoch": 2.8836580357590025, + "grad_norm": 1.7302987575531006, + "learning_rate": 1.909873811283275e-05, + "loss": 0.459, + "step": 28790 + }, + { + "epoch": 2.8846596884860016, + "grad_norm": 2.646073341369629, + "learning_rate": 1.9083420069241747e-05, + "loss": 0.4666, + "step": 28800 + }, + { + "epoch": 2.8856613412130017, + "grad_norm": 1.9419676065444946, + "learning_rate": 1.9068104378331615e-05, + "loss": 0.4968, + "step": 28810 + }, + { + "epoch": 2.886662993940001, + "grad_norm": 1.7777568101882935, + "learning_rate": 1.905279104619252e-05, + "loss": 0.4824, + "step": 28820 + }, + { + "epoch": 2.8876646466670004, + "grad_norm": 1.9487704038619995, + "learning_rate": 1.903748007891367e-05, + "loss": 0.5306, + "step": 28830 + }, + { + "epoch": 2.888666299394, + "grad_norm": 2.5456743240356445, + "learning_rate": 1.902217148258336e-05, + "loss": 0.4403, + "step": 28840 + }, + { + "epoch": 2.8896679521209996, + "grad_norm": 3.1180431842803955, + "learning_rate": 1.9006865263288926e-05, + "loss": 0.531, + "step": 28850 + }, + { + "epoch": 2.890669604847999, + "grad_norm": 3.0304436683654785, + "learning_rate": 1.8991561427116777e-05, + "loss": 0.5187, + "step": 28860 + }, + { + "epoch": 2.8916712575749988, + "grad_norm": 2.2932333946228027, + "learning_rate": 1.8976259980152348e-05, + "loss": 0.4541, + "step": 28870 + }, + { + "epoch": 2.8926729103019984, + "grad_norm": 2.4942545890808105, + "learning_rate": 1.8960960928480136e-05, + "loss": 0.4583, + "step": 28880 + }, + { + "epoch": 2.893674563028998, + "grad_norm": 1.900305986404419, + "learning_rate": 1.8945664278183704e-05, + "loss": 0.413, + "step": 28890 + }, + { + "epoch": 2.8946762157559975, + "grad_norm": 2.3043212890625, + "learning_rate": 1.8930370035345623e-05, + "loss": 0.5049, + "step": 28900 + }, + { + "epoch": 2.8956778684829967, + "grad_norm": 3.232800006866455, + "learning_rate": 1.8915078206047535e-05, + "loss": 0.4768, + "step": 28910 + }, + { + "epoch": 2.8966795212099967, + "grad_norm": 1.8843035697937012, + "learning_rate": 1.889978879637012e-05, + "loss": 0.4959, + "step": 28920 + }, + { + "epoch": 2.897681173936996, + "grad_norm": 2.1841742992401123, + "learning_rate": 1.8884501812393086e-05, + "loss": 0.4738, + "step": 28930 + }, + { + "epoch": 2.8986828266639955, + "grad_norm": 2.26435923576355, + "learning_rate": 1.8869217260195178e-05, + "loss": 0.4499, + "step": 28940 + }, + { + "epoch": 2.899684479390995, + "grad_norm": 2.054779529571533, + "learning_rate": 1.885393514585417e-05, + "loss": 0.4213, + "step": 28950 + }, + { + "epoch": 2.9006861321179946, + "grad_norm": 2.1258368492126465, + "learning_rate": 1.883865547544688e-05, + "loss": 0.5051, + "step": 28960 + }, + { + "epoch": 2.9016877848449942, + "grad_norm": 1.691569447517395, + "learning_rate": 1.882337825504916e-05, + "loss": 0.4256, + "step": 28970 + }, + { + "epoch": 2.902689437571994, + "grad_norm": 2.8432114124298096, + "learning_rate": 1.880810349073585e-05, + "loss": 0.512, + "step": 28980 + }, + { + "epoch": 2.9036910902989934, + "grad_norm": 2.043947696685791, + "learning_rate": 1.8792831188580856e-05, + "loss": 0.5151, + "step": 28990 + }, + { + "epoch": 2.904692743025993, + "grad_norm": 1.4768218994140625, + "learning_rate": 1.8777561354657087e-05, + "loss": 0.4742, + "step": 29000 + }, + { + "epoch": 2.9056943957529926, + "grad_norm": 2.128981590270996, + "learning_rate": 1.8762293995036457e-05, + "loss": 0.5125, + "step": 29010 + }, + { + "epoch": 2.9066960484799917, + "grad_norm": 2.0752673149108887, + "learning_rate": 1.874702911578992e-05, + "loss": 0.5301, + "step": 29020 + }, + { + "epoch": 2.907697701206992, + "grad_norm": 2.5276637077331543, + "learning_rate": 1.873176672298743e-05, + "loss": 0.4494, + "step": 29030 + }, + { + "epoch": 2.908699353933991, + "grad_norm": 2.664680004119873, + "learning_rate": 1.871650682269797e-05, + "loss": 0.4918, + "step": 29040 + }, + { + "epoch": 2.9097010066609905, + "grad_norm": 2.063596725463867, + "learning_rate": 1.87012494209895e-05, + "loss": 0.4775, + "step": 29050 + }, + { + "epoch": 2.91070265938799, + "grad_norm": 1.8587756156921387, + "learning_rate": 1.8685994523929005e-05, + "loss": 0.4373, + "step": 29060 + }, + { + "epoch": 2.9117043121149897, + "grad_norm": 2.288957118988037, + "learning_rate": 1.8670742137582486e-05, + "loss": 0.5238, + "step": 29070 + }, + { + "epoch": 2.9127059648419893, + "grad_norm": 1.6254380941390991, + "learning_rate": 1.8655492268014922e-05, + "loss": 0.4024, + "step": 29080 + }, + { + "epoch": 2.913707617568989, + "grad_norm": 1.8339122533798218, + "learning_rate": 1.864024492129031e-05, + "loss": 0.429, + "step": 29090 + }, + { + "epoch": 2.9147092702959885, + "grad_norm": 2.6196937561035156, + "learning_rate": 1.8625000103471622e-05, + "loss": 0.5356, + "step": 29100 + }, + { + "epoch": 2.915710923022988, + "grad_norm": 2.130065679550171, + "learning_rate": 1.8609757820620856e-05, + "loss": 0.4308, + "step": 29110 + }, + { + "epoch": 2.9167125757499877, + "grad_norm": 2.1206037998199463, + "learning_rate": 1.8594518078798976e-05, + "loss": 0.4566, + "step": 29120 + }, + { + "epoch": 2.917714228476987, + "grad_norm": 2.1794984340667725, + "learning_rate": 1.8579280884065935e-05, + "loss": 0.4251, + "step": 29130 + }, + { + "epoch": 2.918715881203987, + "grad_norm": 1.4547064304351807, + "learning_rate": 1.8564046242480697e-05, + "loss": 0.4447, + "step": 29140 + }, + { + "epoch": 2.919717533930986, + "grad_norm": 2.497053623199463, + "learning_rate": 1.8548814160101197e-05, + "loss": 0.4754, + "step": 29150 + }, + { + "epoch": 2.9207191866579856, + "grad_norm": 2.3862662315368652, + "learning_rate": 1.8533584642984334e-05, + "loss": 0.5176, + "step": 29160 + }, + { + "epoch": 2.921720839384985, + "grad_norm": 2.6100847721099854, + "learning_rate": 1.8518357697186017e-05, + "loss": 0.4517, + "step": 29170 + }, + { + "epoch": 2.9227224921119848, + "grad_norm": 1.9027708768844604, + "learning_rate": 1.8503133328761123e-05, + "loss": 0.4236, + "step": 29180 + }, + { + "epoch": 2.9237241448389844, + "grad_norm": 2.5158753395080566, + "learning_rate": 1.8487911543763486e-05, + "loss": 0.4196, + "step": 29190 + }, + { + "epoch": 2.924725797565984, + "grad_norm": 2.278364896774292, + "learning_rate": 1.847269234824594e-05, + "loss": 0.5142, + "step": 29200 + }, + { + "epoch": 2.9257274502929835, + "grad_norm": 2.325521469116211, + "learning_rate": 1.8457475748260268e-05, + "loss": 0.524, + "step": 29210 + }, + { + "epoch": 2.926729103019983, + "grad_norm": 2.00109601020813, + "learning_rate": 1.844226174985725e-05, + "loss": 0.4382, + "step": 29220 + }, + { + "epoch": 2.9277307557469827, + "grad_norm": 2.5737013816833496, + "learning_rate": 1.8427050359086586e-05, + "loss": 0.4711, + "step": 29230 + }, + { + "epoch": 2.928732408473982, + "grad_norm": 2.1871094703674316, + "learning_rate": 1.8411841581996966e-05, + "loss": 0.4493, + "step": 29240 + }, + { + "epoch": 2.9297340612009815, + "grad_norm": 2.6685447692871094, + "learning_rate": 1.8396635424636056e-05, + "loss": 0.4532, + "step": 29250 + }, + { + "epoch": 2.930735713927981, + "grad_norm": 2.19049334526062, + "learning_rate": 1.838143189305046e-05, + "loss": 0.5213, + "step": 29260 + }, + { + "epoch": 2.9317373666549806, + "grad_norm": 2.1203505992889404, + "learning_rate": 1.836623099328573e-05, + "loss": 0.5022, + "step": 29270 + }, + { + "epoch": 2.9327390193819802, + "grad_norm": 2.115341901779175, + "learning_rate": 1.8351032731386385e-05, + "loss": 0.4569, + "step": 29280 + }, + { + "epoch": 2.93374067210898, + "grad_norm": 2.712913990020752, + "learning_rate": 1.833583711339591e-05, + "loss": 0.4849, + "step": 29290 + }, + { + "epoch": 2.9347423248359794, + "grad_norm": 2.8189609050750732, + "learning_rate": 1.8320644145356697e-05, + "loss": 0.5019, + "step": 29300 + }, + { + "epoch": 2.935743977562979, + "grad_norm": 1.6259757280349731, + "learning_rate": 1.830545383331012e-05, + "loss": 0.4915, + "step": 29310 + }, + { + "epoch": 2.9367456302899786, + "grad_norm": 2.498462677001953, + "learning_rate": 1.8290266183296483e-05, + "loss": 0.4389, + "step": 29320 + }, + { + "epoch": 2.9377472830169777, + "grad_norm": 2.082543134689331, + "learning_rate": 1.8275081201355044e-05, + "loss": 0.4655, + "step": 29330 + }, + { + "epoch": 2.9387489357439778, + "grad_norm": 2.135136842727661, + "learning_rate": 1.8259898893523976e-05, + "loss": 0.4485, + "step": 29340 + }, + { + "epoch": 2.939750588470977, + "grad_norm": 2.071990489959717, + "learning_rate": 1.824471926584041e-05, + "loss": 0.4787, + "step": 29350 + }, + { + "epoch": 2.9407522411979765, + "grad_norm": 1.6802020072937012, + "learning_rate": 1.822954232434041e-05, + "loss": 0.4796, + "step": 29360 + }, + { + "epoch": 2.941753893924976, + "grad_norm": 1.772538661956787, + "learning_rate": 1.8214368075058946e-05, + "loss": 0.4682, + "step": 29370 + }, + { + "epoch": 2.9427555466519757, + "grad_norm": 2.284114122390747, + "learning_rate": 1.8199196524029956e-05, + "loss": 0.4882, + "step": 29380 + }, + { + "epoch": 2.9437571993789753, + "grad_norm": 2.2266476154327393, + "learning_rate": 1.8184027677286275e-05, + "loss": 0.494, + "step": 29390 + }, + { + "epoch": 2.944758852105975, + "grad_norm": 3.315500497817993, + "learning_rate": 1.8168861540859693e-05, + "loss": 0.5365, + "step": 29400 + }, + { + "epoch": 2.9457605048329745, + "grad_norm": 1.9478274583816528, + "learning_rate": 1.815369812078088e-05, + "loss": 0.4552, + "step": 29410 + }, + { + "epoch": 2.946762157559974, + "grad_norm": 2.8757264614105225, + "learning_rate": 1.8138537423079463e-05, + "loss": 0.5344, + "step": 29420 + }, + { + "epoch": 2.9477638102869737, + "grad_norm": 2.388357400894165, + "learning_rate": 1.8123379453783962e-05, + "loss": 0.5292, + "step": 29430 + }, + { + "epoch": 2.948765463013973, + "grad_norm": 2.107295036315918, + "learning_rate": 1.8108224218921838e-05, + "loss": 0.4461, + "step": 29440 + }, + { + "epoch": 2.949767115740973, + "grad_norm": 2.5148425102233887, + "learning_rate": 1.809307172451944e-05, + "loss": 0.4258, + "step": 29450 + }, + { + "epoch": 2.950768768467972, + "grad_norm": 2.1600284576416016, + "learning_rate": 1.8077921976602025e-05, + "loss": 0.5051, + "step": 29460 + }, + { + "epoch": 2.9517704211949716, + "grad_norm": 1.8847415447235107, + "learning_rate": 1.8062774981193796e-05, + "loss": 0.4117, + "step": 29470 + }, + { + "epoch": 2.952772073921971, + "grad_norm": 1.3785723447799683, + "learning_rate": 1.8047630744317817e-05, + "loss": 0.4374, + "step": 29480 + }, + { + "epoch": 2.9537737266489708, + "grad_norm": 3.6868884563446045, + "learning_rate": 1.8032489271996073e-05, + "loss": 0.5149, + "step": 29490 + }, + { + "epoch": 2.9547753793759703, + "grad_norm": 2.474139451980591, + "learning_rate": 1.801735057024946e-05, + "loss": 0.4437, + "step": 29500 + }, + { + "epoch": 2.95577703210297, + "grad_norm": 2.2893331050872803, + "learning_rate": 1.8002214645097755e-05, + "loss": 0.4841, + "step": 29510 + }, + { + "epoch": 2.9567786848299695, + "grad_norm": 1.97756826877594, + "learning_rate": 1.7987081502559634e-05, + "loss": 0.4465, + "step": 29520 + }, + { + "epoch": 2.957780337556969, + "grad_norm": 2.1581039428710938, + "learning_rate": 1.7971951148652676e-05, + "loss": 0.4425, + "step": 29530 + }, + { + "epoch": 2.9587819902839687, + "grad_norm": 1.6784934997558594, + "learning_rate": 1.7956823589393345e-05, + "loss": 0.4253, + "step": 29540 + }, + { + "epoch": 2.959783643010968, + "grad_norm": 1.9420301914215088, + "learning_rate": 1.7941698830797005e-05, + "loss": 0.4639, + "step": 29550 + }, + { + "epoch": 2.960785295737968, + "grad_norm": 2.0235326290130615, + "learning_rate": 1.7926576878877876e-05, + "loss": 0.5391, + "step": 29560 + }, + { + "epoch": 2.961786948464967, + "grad_norm": 1.8747906684875488, + "learning_rate": 1.791145773964909e-05, + "loss": 0.4593, + "step": 29570 + }, + { + "epoch": 2.9627886011919666, + "grad_norm": 2.1775217056274414, + "learning_rate": 1.7896341419122665e-05, + "loss": 0.4645, + "step": 29580 + }, + { + "epoch": 2.9637902539189662, + "grad_norm": 2.069146156311035, + "learning_rate": 1.788122792330946e-05, + "loss": 0.4522, + "step": 29590 + }, + { + "epoch": 2.964791906645966, + "grad_norm": 2.7413077354431152, + "learning_rate": 1.786611725821926e-05, + "loss": 0.4479, + "step": 29600 + }, + { + "epoch": 2.9657935593729654, + "grad_norm": 2.3349380493164062, + "learning_rate": 1.785100942986068e-05, + "loss": 0.4827, + "step": 29610 + }, + { + "epoch": 2.966795212099965, + "grad_norm": 1.798128604888916, + "learning_rate": 1.783590444424125e-05, + "loss": 0.4739, + "step": 29620 + }, + { + "epoch": 2.9677968648269646, + "grad_norm": 3.001051425933838, + "learning_rate": 1.782080230736733e-05, + "loss": 0.448, + "step": 29630 + }, + { + "epoch": 2.968798517553964, + "grad_norm": 2.069577693939209, + "learning_rate": 1.7805703025244163e-05, + "loss": 0.378, + "step": 29640 + }, + { + "epoch": 2.9698001702809638, + "grad_norm": 1.8276804685592651, + "learning_rate": 1.779060660387588e-05, + "loss": 0.4602, + "step": 29650 + }, + { + "epoch": 2.970801823007963, + "grad_norm": 2.161472797393799, + "learning_rate": 1.7775513049265427e-05, + "loss": 0.454, + "step": 29660 + }, + { + "epoch": 2.971803475734963, + "grad_norm": 2.3388967514038086, + "learning_rate": 1.7760422367414643e-05, + "loss": 0.4632, + "step": 29670 + }, + { + "epoch": 2.972805128461962, + "grad_norm": 2.109787702560425, + "learning_rate": 1.774533456432423e-05, + "loss": 0.4374, + "step": 29680 + }, + { + "epoch": 2.9738067811889617, + "grad_norm": 2.1075479984283447, + "learning_rate": 1.7730249645993725e-05, + "loss": 0.4088, + "step": 29690 + }, + { + "epoch": 2.9748084339159613, + "grad_norm": 2.297384262084961, + "learning_rate": 1.7715167618421512e-05, + "loss": 0.5332, + "step": 29700 + }, + { + "epoch": 2.975810086642961, + "grad_norm": 2.0056192874908447, + "learning_rate": 1.7700088487604857e-05, + "loss": 0.4708, + "step": 29710 + }, + { + "epoch": 2.9768117393699605, + "grad_norm": 2.1155338287353516, + "learning_rate": 1.7685012259539847e-05, + "loss": 0.4678, + "step": 29720 + }, + { + "epoch": 2.97781339209696, + "grad_norm": 2.3071138858795166, + "learning_rate": 1.7669938940221438e-05, + "loss": 0.4392, + "step": 29730 + }, + { + "epoch": 2.9788150448239596, + "grad_norm": 2.3638877868652344, + "learning_rate": 1.7654868535643397e-05, + "loss": 0.4035, + "step": 29740 + }, + { + "epoch": 2.9798166975509592, + "grad_norm": 2.3334412574768066, + "learning_rate": 1.763980105179835e-05, + "loss": 0.494, + "step": 29750 + }, + { + "epoch": 2.980818350277959, + "grad_norm": 2.2591774463653564, + "learning_rate": 1.762473649467779e-05, + "loss": 0.3932, + "step": 29760 + }, + { + "epoch": 2.981820003004958, + "grad_norm": 2.066215991973877, + "learning_rate": 1.7609674870271985e-05, + "loss": 0.4833, + "step": 29770 + }, + { + "epoch": 2.9828216557319576, + "grad_norm": 1.8637421131134033, + "learning_rate": 1.7594616184570085e-05, + "loss": 0.4515, + "step": 29780 + }, + { + "epoch": 2.983823308458957, + "grad_norm": 2.29803729057312, + "learning_rate": 1.7579560443560045e-05, + "loss": 0.5153, + "step": 29790 + }, + { + "epoch": 2.9848249611859567, + "grad_norm": 2.6893725395202637, + "learning_rate": 1.7564507653228685e-05, + "loss": 0.4788, + "step": 29800 + }, + { + "epoch": 2.9858266139129563, + "grad_norm": 2.405869483947754, + "learning_rate": 1.7549457819561603e-05, + "loss": 0.4965, + "step": 29810 + }, + { + "epoch": 2.986828266639956, + "grad_norm": 2.2893760204315186, + "learning_rate": 1.7534410948543253e-05, + "loss": 0.4852, + "step": 29820 + }, + { + "epoch": 2.9878299193669555, + "grad_norm": 2.5346555709838867, + "learning_rate": 1.751936704615691e-05, + "loss": 0.52, + "step": 29830 + }, + { + "epoch": 2.988831572093955, + "grad_norm": 2.542680025100708, + "learning_rate": 1.7504326118384658e-05, + "loss": 0.433, + "step": 29840 + }, + { + "epoch": 2.9898332248209547, + "grad_norm": 3.150050163269043, + "learning_rate": 1.7489288171207394e-05, + "loss": 0.5064, + "step": 29850 + }, + { + "epoch": 2.990834877547954, + "grad_norm": 1.9536211490631104, + "learning_rate": 1.7474253210604847e-05, + "loss": 0.4579, + "step": 29860 + }, + { + "epoch": 2.991836530274954, + "grad_norm": 2.1609601974487305, + "learning_rate": 1.7459221242555553e-05, + "loss": 0.4633, + "step": 29870 + }, + { + "epoch": 2.992838183001953, + "grad_norm": 1.8260406255722046, + "learning_rate": 1.744419227303684e-05, + "loss": 0.4563, + "step": 29880 + }, + { + "epoch": 2.9938398357289526, + "grad_norm": 2.1514317989349365, + "learning_rate": 1.742916630802487e-05, + "loss": 0.4483, + "step": 29890 + }, + { + "epoch": 2.994841488455952, + "grad_norm": 2.2385141849517822, + "learning_rate": 1.7414143353494585e-05, + "loss": 0.4668, + "step": 29900 + }, + { + "epoch": 2.995843141182952, + "grad_norm": 2.0614023208618164, + "learning_rate": 1.739912341541977e-05, + "loss": 0.4083, + "step": 29910 + }, + { + "epoch": 2.9968447939099514, + "grad_norm": 2.2477142810821533, + "learning_rate": 1.7384106499772954e-05, + "loss": 0.5019, + "step": 29920 + }, + { + "epoch": 2.997846446636951, + "grad_norm": 1.8166592121124268, + "learning_rate": 1.736909261252551e-05, + "loss": 0.4157, + "step": 29930 + }, + { + "epoch": 2.9988480993639506, + "grad_norm": 1.802172303199768, + "learning_rate": 1.735408175964759e-05, + "loss": 0.4273, + "step": 29940 + }, + { + "epoch": 2.99984975209095, + "grad_norm": 2.223151445388794, + "learning_rate": 1.733907394710813e-05, + "loss": 0.3838, + "step": 29950 + }, + { + "epoch": 3.0008013221815997, + "grad_norm": 1.856826901435852, + "learning_rate": 1.7324069180874874e-05, + "loss": 0.3971, + "step": 29960 + }, + { + "epoch": 3.0018029749085993, + "grad_norm": 2.1584417819976807, + "learning_rate": 1.730906746691435e-05, + "loss": 0.4797, + "step": 29970 + }, + { + "epoch": 3.002804627635599, + "grad_norm": 2.594388484954834, + "learning_rate": 1.729406881119187e-05, + "loss": 0.4686, + "step": 29980 + }, + { + "epoch": 3.0038062803625984, + "grad_norm": 2.0008304119110107, + "learning_rate": 1.7279073219671525e-05, + "loss": 0.4361, + "step": 29990 + }, + { + "epoch": 3.004807933089598, + "grad_norm": 2.1979317665100098, + "learning_rate": 1.7264080698316186e-05, + "loss": 0.4022, + "step": 30000 + }, + { + "epoch": 3.004807933089598, + "eval_bleu": 0.3025730108532601, + "eval_loss": 0.5179290175437927, + "eval_rouge1": 0.6867038326183534, + "eval_rouge2": 0.5271660350662968, + "eval_rougeL": 0.6468106582725407, + "eval_runtime": 82898.3122, + "eval_samples_per_second": 0.214, + "eval_steps_per_second": 0.027, + "eval_wer": 0.9895713443561743, + "step": 30000 + }, + { + "epoch": 3.005809585816597, + "grad_norm": 1.4881905317306519, + "learning_rate": 1.7249091253087528e-05, + "loss": 0.4422, + "step": 30010 + }, + { + "epoch": 3.0068112385435968, + "grad_norm": 2.210646390914917, + "learning_rate": 1.7234104889945983e-05, + "loss": 0.435, + "step": 30020 + }, + { + "epoch": 3.0078128912705964, + "grad_norm": 2.2528607845306396, + "learning_rate": 1.7219121614850737e-05, + "loss": 0.4465, + "step": 30030 + }, + { + "epoch": 3.008814543997596, + "grad_norm": 1.7737597227096558, + "learning_rate": 1.720414143375979e-05, + "loss": 0.4228, + "step": 30040 + }, + { + "epoch": 3.0098161967245955, + "grad_norm": 2.0847411155700684, + "learning_rate": 1.7189164352629895e-05, + "loss": 0.3812, + "step": 30050 + }, + { + "epoch": 3.010817849451595, + "grad_norm": 2.101858615875244, + "learning_rate": 1.7174190377416545e-05, + "loss": 0.4121, + "step": 30060 + }, + { + "epoch": 3.0118195021785947, + "grad_norm": 2.346134662628174, + "learning_rate": 1.7159219514074042e-05, + "loss": 0.4031, + "step": 30070 + }, + { + "epoch": 3.0128211549055943, + "grad_norm": 2.2245075702667236, + "learning_rate": 1.714425176855542e-05, + "loss": 0.3885, + "step": 30080 + }, + { + "epoch": 3.013822807632594, + "grad_norm": 2.2482717037200928, + "learning_rate": 1.71292871468125e-05, + "loss": 0.4431, + "step": 30090 + }, + { + "epoch": 3.0148244603595935, + "grad_norm": 2.3442859649658203, + "learning_rate": 1.7114325654795823e-05, + "loss": 0.4431, + "step": 30100 + }, + { + "epoch": 3.015826113086593, + "grad_norm": 2.165501356124878, + "learning_rate": 1.7099367298454723e-05, + "loss": 0.4719, + "step": 30110 + }, + { + "epoch": 3.0168277658135922, + "grad_norm": 2.0025136470794678, + "learning_rate": 1.708441208373727e-05, + "loss": 0.4177, + "step": 30120 + }, + { + "epoch": 3.017829418540592, + "grad_norm": 1.7397187948226929, + "learning_rate": 1.7069460016590272e-05, + "loss": 0.4252, + "step": 30130 + }, + { + "epoch": 3.0188310712675914, + "grad_norm": 1.9813027381896973, + "learning_rate": 1.7054511102959315e-05, + "loss": 0.4546, + "step": 30140 + }, + { + "epoch": 3.019832723994591, + "grad_norm": 1.9185618162155151, + "learning_rate": 1.70395653487887e-05, + "loss": 0.4019, + "step": 30150 + }, + { + "epoch": 3.0208343767215906, + "grad_norm": 1.8481942415237427, + "learning_rate": 1.7024622760021508e-05, + "loss": 0.3715, + "step": 30160 + }, + { + "epoch": 3.02183602944859, + "grad_norm": 2.2238407135009766, + "learning_rate": 1.7009683342599526e-05, + "loss": 0.5105, + "step": 30170 + }, + { + "epoch": 3.02283768217559, + "grad_norm": 2.2952382564544678, + "learning_rate": 1.6994747102463292e-05, + "loss": 0.4362, + "step": 30180 + }, + { + "epoch": 3.0238393349025894, + "grad_norm": 2.0249791145324707, + "learning_rate": 1.6979814045552095e-05, + "loss": 0.4306, + "step": 30190 + }, + { + "epoch": 3.024840987629589, + "grad_norm": 2.3426079750061035, + "learning_rate": 1.6964884177803942e-05, + "loss": 0.4174, + "step": 30200 + }, + { + "epoch": 3.0258426403565886, + "grad_norm": 2.315671920776367, + "learning_rate": 1.694995750515556e-05, + "loss": 0.4406, + "step": 30210 + }, + { + "epoch": 3.0268442930835877, + "grad_norm": 2.683622121810913, + "learning_rate": 1.6935034033542443e-05, + "loss": 0.4369, + "step": 30220 + }, + { + "epoch": 3.0278459458105873, + "grad_norm": 2.1116085052490234, + "learning_rate": 1.692011376889879e-05, + "loss": 0.4556, + "step": 30230 + }, + { + "epoch": 3.028847598537587, + "grad_norm": 2.1275312900543213, + "learning_rate": 1.6905196717157505e-05, + "loss": 0.4808, + "step": 30240 + }, + { + "epoch": 3.0298492512645865, + "grad_norm": 1.7960904836654663, + "learning_rate": 1.6890282884250247e-05, + "loss": 0.4133, + "step": 30250 + }, + { + "epoch": 3.030850903991586, + "grad_norm": 1.9978631734848022, + "learning_rate": 1.687537227610739e-05, + "loss": 0.4625, + "step": 30260 + }, + { + "epoch": 3.0318525567185857, + "grad_norm": 2.269608497619629, + "learning_rate": 1.6860464898658013e-05, + "loss": 0.4151, + "step": 30270 + }, + { + "epoch": 3.0328542094455853, + "grad_norm": 2.2099547386169434, + "learning_rate": 1.6845560757829903e-05, + "loss": 0.4048, + "step": 30280 + }, + { + "epoch": 3.033855862172585, + "grad_norm": 2.1496224403381348, + "learning_rate": 1.6830659859549593e-05, + "loss": 0.3884, + "step": 30290 + }, + { + "epoch": 3.0348575148995844, + "grad_norm": 2.702627420425415, + "learning_rate": 1.681576220974229e-05, + "loss": 0.4569, + "step": 30300 + }, + { + "epoch": 3.035859167626584, + "grad_norm": 2.0695064067840576, + "learning_rate": 1.6800867814331943e-05, + "loss": 0.4137, + "step": 30310 + }, + { + "epoch": 3.0368608203535836, + "grad_norm": 2.1617956161499023, + "learning_rate": 1.678597667924118e-05, + "loss": 0.4098, + "step": 30320 + }, + { + "epoch": 3.0378624730805828, + "grad_norm": 2.5029983520507812, + "learning_rate": 1.6771088810391332e-05, + "loss": 0.4801, + "step": 30330 + }, + { + "epoch": 3.0388641258075824, + "grad_norm": 1.9497227668762207, + "learning_rate": 1.6756204213702467e-05, + "loss": 0.4275, + "step": 30340 + }, + { + "epoch": 3.039865778534582, + "grad_norm": 1.6746639013290405, + "learning_rate": 1.6741322895093304e-05, + "loss": 0.4753, + "step": 30350 + }, + { + "epoch": 3.0408674312615815, + "grad_norm": 1.430365800857544, + "learning_rate": 1.6726444860481283e-05, + "loss": 0.3356, + "step": 30360 + }, + { + "epoch": 3.041869083988581, + "grad_norm": 1.6194040775299072, + "learning_rate": 1.6711570115782542e-05, + "loss": 0.3804, + "step": 30370 + }, + { + "epoch": 3.0428707367155807, + "grad_norm": 2.530052423477173, + "learning_rate": 1.6696698666911913e-05, + "loss": 0.4464, + "step": 30380 + }, + { + "epoch": 3.0438723894425803, + "grad_norm": 1.8114992380142212, + "learning_rate": 1.668183051978289e-05, + "loss": 0.4295, + "step": 30390 + }, + { + "epoch": 3.04487404216958, + "grad_norm": 2.182450532913208, + "learning_rate": 1.666696568030769e-05, + "loss": 0.3924, + "step": 30400 + }, + { + "epoch": 3.0458756948965795, + "grad_norm": 1.8115897178649902, + "learning_rate": 1.665210415439719e-05, + "loss": 0.4182, + "step": 30410 + }, + { + "epoch": 3.046877347623579, + "grad_norm": 1.8325234651565552, + "learning_rate": 1.6637245947960955e-05, + "loss": 0.4205, + "step": 30420 + }, + { + "epoch": 3.0478790003505782, + "grad_norm": 2.4362452030181885, + "learning_rate": 1.662239106690723e-05, + "loss": 0.432, + "step": 30430 + }, + { + "epoch": 3.048880653077578, + "grad_norm": 2.317858934402466, + "learning_rate": 1.6607539517142952e-05, + "loss": 0.4567, + "step": 30440 + }, + { + "epoch": 3.0498823058045774, + "grad_norm": 2.2887580394744873, + "learning_rate": 1.6592691304573722e-05, + "loss": 0.4084, + "step": 30450 + }, + { + "epoch": 3.050883958531577, + "grad_norm": 2.171954393386841, + "learning_rate": 1.6577846435103794e-05, + "loss": 0.4411, + "step": 30460 + }, + { + "epoch": 3.0518856112585766, + "grad_norm": 2.362718105316162, + "learning_rate": 1.6563004914636132e-05, + "loss": 0.4254, + "step": 30470 + }, + { + "epoch": 3.052887263985576, + "grad_norm": 2.31545090675354, + "learning_rate": 1.6548166749072332e-05, + "loss": 0.4505, + "step": 30480 + }, + { + "epoch": 3.053888916712576, + "grad_norm": 1.8873096704483032, + "learning_rate": 1.6533331944312694e-05, + "loss": 0.3795, + "step": 30490 + }, + { + "epoch": 3.0548905694395754, + "grad_norm": 2.9692976474761963, + "learning_rate": 1.651850050625614e-05, + "loss": 0.418, + "step": 30500 + }, + { + "epoch": 3.055892222166575, + "grad_norm": 2.100770950317383, + "learning_rate": 1.650367244080028e-05, + "loss": 0.4898, + "step": 30510 + }, + { + "epoch": 3.0568938748935746, + "grad_norm": 2.251727819442749, + "learning_rate": 1.6488847753841382e-05, + "loss": 0.4811, + "step": 30520 + }, + { + "epoch": 3.057895527620574, + "grad_norm": 1.89990234375, + "learning_rate": 1.6474026451274356e-05, + "loss": 0.4516, + "step": 30530 + }, + { + "epoch": 3.0588971803475733, + "grad_norm": 2.172435998916626, + "learning_rate": 1.6459208538992772e-05, + "loss": 0.3815, + "step": 30540 + }, + { + "epoch": 3.059898833074573, + "grad_norm": 2.072516441345215, + "learning_rate": 1.6444394022888868e-05, + "loss": 0.4441, + "step": 30550 + }, + { + "epoch": 3.0609004858015725, + "grad_norm": 2.7983038425445557, + "learning_rate": 1.6429582908853515e-05, + "loss": 0.3968, + "step": 30560 + }, + { + "epoch": 3.061902138528572, + "grad_norm": 1.6153335571289062, + "learning_rate": 1.6414775202776224e-05, + "loss": 0.4167, + "step": 30570 + }, + { + "epoch": 3.0629037912555717, + "grad_norm": 1.913484811782837, + "learning_rate": 1.6399970910545165e-05, + "loss": 0.4488, + "step": 30580 + }, + { + "epoch": 3.0639054439825713, + "grad_norm": 1.8409531116485596, + "learning_rate": 1.6385170038047153e-05, + "loss": 0.4134, + "step": 30590 + }, + { + "epoch": 3.064907096709571, + "grad_norm": 2.3591482639312744, + "learning_rate": 1.637037259116764e-05, + "loss": 0.3989, + "step": 30600 + }, + { + "epoch": 3.0659087494365704, + "grad_norm": 2.223154306411743, + "learning_rate": 1.6355578575790696e-05, + "loss": 0.4391, + "step": 30610 + }, + { + "epoch": 3.06691040216357, + "grad_norm": 1.8484059572219849, + "learning_rate": 1.6340787997799058e-05, + "loss": 0.4701, + "step": 30620 + }, + { + "epoch": 3.0679120548905696, + "grad_norm": 2.000924825668335, + "learning_rate": 1.6326000863074084e-05, + "loss": 0.4881, + "step": 30630 + }, + { + "epoch": 3.068913707617569, + "grad_norm": 2.186718702316284, + "learning_rate": 1.6311217177495744e-05, + "loss": 0.4545, + "step": 30640 + }, + { + "epoch": 3.0699153603445684, + "grad_norm": 2.2570459842681885, + "learning_rate": 1.6296436946942666e-05, + "loss": 0.4912, + "step": 30650 + }, + { + "epoch": 3.070917013071568, + "grad_norm": 1.8138389587402344, + "learning_rate": 1.628166017729208e-05, + "loss": 0.4317, + "step": 30660 + }, + { + "epoch": 3.0719186657985675, + "grad_norm": 2.333362102508545, + "learning_rate": 1.6266886874419874e-05, + "loss": 0.4869, + "step": 30670 + }, + { + "epoch": 3.072920318525567, + "grad_norm": 2.178196907043457, + "learning_rate": 1.625211704420051e-05, + "loss": 0.5721, + "step": 30680 + }, + { + "epoch": 3.0739219712525667, + "grad_norm": 2.0177736282348633, + "learning_rate": 1.62373506925071e-05, + "loss": 0.4564, + "step": 30690 + }, + { + "epoch": 3.0749236239795663, + "grad_norm": 2.001838207244873, + "learning_rate": 1.622258782521137e-05, + "loss": 0.4209, + "step": 30700 + }, + { + "epoch": 3.075925276706566, + "grad_norm": 1.930259108543396, + "learning_rate": 1.6207828448183653e-05, + "loss": 0.4177, + "step": 30710 + }, + { + "epoch": 3.0769269294335655, + "grad_norm": 2.031282663345337, + "learning_rate": 1.6193072567292896e-05, + "loss": 0.4197, + "step": 30720 + }, + { + "epoch": 3.077928582160565, + "grad_norm": 1.463243007659912, + "learning_rate": 1.6178320188406665e-05, + "loss": 0.4557, + "step": 30730 + }, + { + "epoch": 3.0789302348875647, + "grad_norm": 1.8603028059005737, + "learning_rate": 1.6163571317391126e-05, + "loss": 0.4094, + "step": 30740 + }, + { + "epoch": 3.079931887614564, + "grad_norm": 2.061042547225952, + "learning_rate": 1.6148825960111038e-05, + "loss": 0.3696, + "step": 30750 + }, + { + "epoch": 3.0809335403415634, + "grad_norm": 2.072138547897339, + "learning_rate": 1.6134084122429778e-05, + "loss": 0.4335, + "step": 30760 + }, + { + "epoch": 3.081935193068563, + "grad_norm": 1.9868147373199463, + "learning_rate": 1.611934581020933e-05, + "loss": 0.4321, + "step": 30770 + }, + { + "epoch": 3.0829368457955626, + "grad_norm": 2.325610399246216, + "learning_rate": 1.610461102931027e-05, + "loss": 0.4099, + "step": 30780 + }, + { + "epoch": 3.083938498522562, + "grad_norm": 2.9362833499908447, + "learning_rate": 1.6089879785591738e-05, + "loss": 0.4579, + "step": 30790 + }, + { + "epoch": 3.084940151249562, + "grad_norm": 1.9224374294281006, + "learning_rate": 1.6075152084911526e-05, + "loss": 0.4099, + "step": 30800 + }, + { + "epoch": 3.0859418039765614, + "grad_norm": 2.3298308849334717, + "learning_rate": 1.6060427933125983e-05, + "loss": 0.4423, + "step": 30810 + }, + { + "epoch": 3.086943456703561, + "grad_norm": 2.235973834991455, + "learning_rate": 1.6045707336090026e-05, + "loss": 0.4588, + "step": 30820 + }, + { + "epoch": 3.0879451094305606, + "grad_norm": 2.449845790863037, + "learning_rate": 1.6030990299657213e-05, + "loss": 0.4757, + "step": 30830 + }, + { + "epoch": 3.08894676215756, + "grad_norm": 1.9638694524765015, + "learning_rate": 1.6016276829679633e-05, + "loss": 0.3865, + "step": 30840 + }, + { + "epoch": 3.0899484148845597, + "grad_norm": 1.989548921585083, + "learning_rate": 1.6001566932008e-05, + "loss": 0.393, + "step": 30850 + }, + { + "epoch": 3.090950067611559, + "grad_norm": 2.4953360557556152, + "learning_rate": 1.598686061249158e-05, + "loss": 0.4132, + "step": 30860 + }, + { + "epoch": 3.0919517203385585, + "grad_norm": 1.8345521688461304, + "learning_rate": 1.597215787697821e-05, + "loss": 0.4129, + "step": 30870 + }, + { + "epoch": 3.092953373065558, + "grad_norm": 2.3891615867614746, + "learning_rate": 1.5957458731314334e-05, + "loss": 0.4426, + "step": 30880 + }, + { + "epoch": 3.0939550257925577, + "grad_norm": 2.574930429458618, + "learning_rate": 1.594276318134495e-05, + "loss": 0.4353, + "step": 30890 + }, + { + "epoch": 3.0949566785195572, + "grad_norm": 2.4506924152374268, + "learning_rate": 1.5928071232913615e-05, + "loss": 0.4643, + "step": 30900 + }, + { + "epoch": 3.095958331246557, + "grad_norm": 3.072618246078491, + "learning_rate": 1.591338289186247e-05, + "loss": 0.4184, + "step": 30910 + }, + { + "epoch": 3.0969599839735564, + "grad_norm": 2.3220643997192383, + "learning_rate": 1.589869816403222e-05, + "loss": 0.4504, + "step": 30920 + }, + { + "epoch": 3.097961636700556, + "grad_norm": 1.746625542640686, + "learning_rate": 1.588401705526213e-05, + "loss": 0.4448, + "step": 30930 + }, + { + "epoch": 3.0989632894275556, + "grad_norm": 2.5139360427856445, + "learning_rate": 1.5869339571390012e-05, + "loss": 0.4606, + "step": 30940 + }, + { + "epoch": 3.099964942154555, + "grad_norm": 2.294275999069214, + "learning_rate": 1.585466571825227e-05, + "loss": 0.4509, + "step": 30950 + }, + { + "epoch": 3.1009665948815544, + "grad_norm": 2.091242551803589, + "learning_rate": 1.5839995501683842e-05, + "loss": 0.3922, + "step": 30960 + }, + { + "epoch": 3.101968247608554, + "grad_norm": 1.6841497421264648, + "learning_rate": 1.5825328927518203e-05, + "loss": 0.4624, + "step": 30970 + }, + { + "epoch": 3.1029699003355535, + "grad_norm": 2.2461984157562256, + "learning_rate": 1.5810666001587422e-05, + "loss": 0.4055, + "step": 30980 + }, + { + "epoch": 3.103971553062553, + "grad_norm": 2.257538318634033, + "learning_rate": 1.579600672972209e-05, + "loss": 0.4854, + "step": 30990 + }, + { + "epoch": 3.1049732057895527, + "grad_norm": 2.6897525787353516, + "learning_rate": 1.5781351117751336e-05, + "loss": 0.4658, + "step": 31000 + }, + { + "epoch": 3.1059748585165523, + "grad_norm": 2.213277816772461, + "learning_rate": 1.5766699171502864e-05, + "loss": 0.4478, + "step": 31010 + }, + { + "epoch": 3.106976511243552, + "grad_norm": 2.556442975997925, + "learning_rate": 1.575205089680289e-05, + "loss": 0.385, + "step": 31020 + }, + { + "epoch": 3.1079781639705515, + "grad_norm": 2.161545515060425, + "learning_rate": 1.5737406299476202e-05, + "loss": 0.3858, + "step": 31030 + }, + { + "epoch": 3.108979816697551, + "grad_norm": 2.166633367538452, + "learning_rate": 1.5722765385346088e-05, + "loss": 0.4702, + "step": 31040 + }, + { + "epoch": 3.1099814694245507, + "grad_norm": 1.7207388877868652, + "learning_rate": 1.5708128160234398e-05, + "loss": 0.4262, + "step": 31050 + }, + { + "epoch": 3.1109831221515503, + "grad_norm": 2.28712797164917, + "learning_rate": 1.569349462996151e-05, + "loss": 0.4538, + "step": 31060 + }, + { + "epoch": 3.1119847748785494, + "grad_norm": 2.292612075805664, + "learning_rate": 1.567886480034634e-05, + "loss": 0.41, + "step": 31070 + }, + { + "epoch": 3.112986427605549, + "grad_norm": 2.4569859504699707, + "learning_rate": 1.5664238677206303e-05, + "loss": 0.4326, + "step": 31080 + }, + { + "epoch": 3.1139880803325486, + "grad_norm": 2.032771110534668, + "learning_rate": 1.5649616266357364e-05, + "loss": 0.446, + "step": 31090 + }, + { + "epoch": 3.114989733059548, + "grad_norm": 2.4700424671173096, + "learning_rate": 1.563499757361403e-05, + "loss": 0.4004, + "step": 31100 + }, + { + "epoch": 3.1159913857865478, + "grad_norm": 2.9324538707733154, + "learning_rate": 1.562038260478928e-05, + "loss": 0.3842, + "step": 31110 + }, + { + "epoch": 3.1169930385135474, + "grad_norm": 2.291475534439087, + "learning_rate": 1.5605771365694653e-05, + "loss": 0.3809, + "step": 31120 + }, + { + "epoch": 3.117994691240547, + "grad_norm": 1.8152060508728027, + "learning_rate": 1.5591163862140193e-05, + "loss": 0.4574, + "step": 31130 + }, + { + "epoch": 3.1189963439675465, + "grad_norm": 2.637399911880493, + "learning_rate": 1.557656009993446e-05, + "loss": 0.4402, + "step": 31140 + }, + { + "epoch": 3.119997996694546, + "grad_norm": 2.726431131362915, + "learning_rate": 1.5561960084884507e-05, + "loss": 0.4582, + "step": 31150 + }, + { + "epoch": 3.1209996494215457, + "grad_norm": 1.8519651889801025, + "learning_rate": 1.5547363822795935e-05, + "loss": 0.3868, + "step": 31160 + }, + { + "epoch": 3.1220013021485453, + "grad_norm": 1.980515480041504, + "learning_rate": 1.5532771319472824e-05, + "loss": 0.4291, + "step": 31170 + }, + { + "epoch": 3.1230029548755445, + "grad_norm": 2.1525003910064697, + "learning_rate": 1.5518182580717756e-05, + "loss": 0.4091, + "step": 31180 + }, + { + "epoch": 3.124004607602544, + "grad_norm": 2.2667062282562256, + "learning_rate": 1.5503597612331834e-05, + "loss": 0.4454, + "step": 31190 + }, + { + "epoch": 3.1250062603295437, + "grad_norm": 1.6090610027313232, + "learning_rate": 1.5489016420114658e-05, + "loss": 0.4377, + "step": 31200 + }, + { + "epoch": 3.1260079130565432, + "grad_norm": 1.957115650177002, + "learning_rate": 1.5474439009864328e-05, + "loss": 0.4627, + "step": 31210 + }, + { + "epoch": 3.127009565783543, + "grad_norm": 2.4358901977539062, + "learning_rate": 1.545986538737742e-05, + "loss": 0.4202, + "step": 31220 + }, + { + "epoch": 3.1280112185105424, + "grad_norm": 2.5619661808013916, + "learning_rate": 1.5445295558449024e-05, + "loss": 0.4035, + "step": 31230 + }, + { + "epoch": 3.129012871237542, + "grad_norm": 3.102004289627075, + "learning_rate": 1.5430729528872722e-05, + "loss": 0.4026, + "step": 31240 + }, + { + "epoch": 3.1300145239645416, + "grad_norm": 1.8699795007705688, + "learning_rate": 1.541616730444058e-05, + "loss": 0.4918, + "step": 31250 + }, + { + "epoch": 3.131016176691541, + "grad_norm": 1.6903408765792847, + "learning_rate": 1.540160889094315e-05, + "loss": 0.37, + "step": 31260 + }, + { + "epoch": 3.132017829418541, + "grad_norm": 2.113809108734131, + "learning_rate": 1.5387054294169455e-05, + "loss": 0.4645, + "step": 31270 + }, + { + "epoch": 3.13301948214554, + "grad_norm": 1.8396596908569336, + "learning_rate": 1.5372503519907035e-05, + "loss": 0.3981, + "step": 31280 + }, + { + "epoch": 3.1340211348725395, + "grad_norm": 1.8806499242782593, + "learning_rate": 1.5357956573941872e-05, + "loss": 0.4145, + "step": 31290 + }, + { + "epoch": 3.135022787599539, + "grad_norm": 1.8449625968933105, + "learning_rate": 1.5343413462058453e-05, + "loss": 0.4274, + "step": 31300 + }, + { + "epoch": 3.1360244403265387, + "grad_norm": 2.39495849609375, + "learning_rate": 1.532887419003973e-05, + "loss": 0.4665, + "step": 31310 + }, + { + "epoch": 3.1370260930535383, + "grad_norm": 2.1482601165771484, + "learning_rate": 1.5314338763667132e-05, + "loss": 0.4653, + "step": 31320 + }, + { + "epoch": 3.138027745780538, + "grad_norm": 1.9213060140609741, + "learning_rate": 1.529980718872054e-05, + "loss": 0.4786, + "step": 31330 + }, + { + "epoch": 3.1390293985075375, + "grad_norm": 1.8243101835250854, + "learning_rate": 1.5285279470978335e-05, + "loss": 0.3736, + "step": 31340 + }, + { + "epoch": 3.140031051234537, + "grad_norm": 2.4560937881469727, + "learning_rate": 1.5270755616217343e-05, + "loss": 0.4372, + "step": 31350 + }, + { + "epoch": 3.1410327039615367, + "grad_norm": 1.697628140449524, + "learning_rate": 1.5256235630212864e-05, + "loss": 0.3558, + "step": 31360 + }, + { + "epoch": 3.1420343566885363, + "grad_norm": 1.9843086004257202, + "learning_rate": 1.5241719518738646e-05, + "loss": 0.4401, + "step": 31370 + }, + { + "epoch": 3.1430360094155354, + "grad_norm": 2.132075786590576, + "learning_rate": 1.5227207287566903e-05, + "loss": 0.4759, + "step": 31380 + }, + { + "epoch": 3.144037662142535, + "grad_norm": 1.9841755628585815, + "learning_rate": 1.521269894246833e-05, + "loss": 0.4012, + "step": 31390 + }, + { + "epoch": 3.1450393148695346, + "grad_norm": 2.36376690864563, + "learning_rate": 1.5198194489212025e-05, + "loss": 0.4353, + "step": 31400 + }, + { + "epoch": 3.146040967596534, + "grad_norm": 2.0222671031951904, + "learning_rate": 1.5183693933565585e-05, + "loss": 0.4259, + "step": 31410 + }, + { + "epoch": 3.1470426203235338, + "grad_norm": 1.9244353771209717, + "learning_rate": 1.5169197281295033e-05, + "loss": 0.4191, + "step": 31420 + }, + { + "epoch": 3.1480442730505334, + "grad_norm": 2.056471586227417, + "learning_rate": 1.515470453816486e-05, + "loss": 0.44, + "step": 31430 + }, + { + "epoch": 3.149045925777533, + "grad_norm": 2.2160956859588623, + "learning_rate": 1.514021570993798e-05, + "loss": 0.4678, + "step": 31440 + }, + { + "epoch": 3.1500475785045325, + "grad_norm": 2.4954745769500732, + "learning_rate": 1.5125730802375754e-05, + "loss": 0.4518, + "step": 31450 + }, + { + "epoch": 3.151049231231532, + "grad_norm": 2.0969278812408447, + "learning_rate": 1.511124982123801e-05, + "loss": 0.4325, + "step": 31460 + }, + { + "epoch": 3.1520508839585317, + "grad_norm": 2.045363664627075, + "learning_rate": 1.5096772772282977e-05, + "loss": 0.4275, + "step": 31470 + }, + { + "epoch": 3.1530525366855313, + "grad_norm": 2.1558549404144287, + "learning_rate": 1.5082299661267338e-05, + "loss": 0.4799, + "step": 31480 + }, + { + "epoch": 3.1540541894125305, + "grad_norm": 1.989071249961853, + "learning_rate": 1.5067830493946222e-05, + "loss": 0.444, + "step": 31490 + }, + { + "epoch": 3.15505584213953, + "grad_norm": 1.6298166513442993, + "learning_rate": 1.5053365276073176e-05, + "loss": 0.4246, + "step": 31500 + }, + { + "epoch": 3.1560574948665296, + "grad_norm": 1.942826747894287, + "learning_rate": 1.5038904013400165e-05, + "loss": 0.4383, + "step": 31510 + }, + { + "epoch": 3.1570591475935292, + "grad_norm": 2.3768038749694824, + "learning_rate": 1.502444671167761e-05, + "loss": 0.4235, + "step": 31520 + }, + { + "epoch": 3.158060800320529, + "grad_norm": 1.900918960571289, + "learning_rate": 1.500999337665433e-05, + "loss": 0.3984, + "step": 31530 + }, + { + "epoch": 3.1590624530475284, + "grad_norm": 2.272731304168701, + "learning_rate": 1.4995544014077595e-05, + "loss": 0.4221, + "step": 31540 + }, + { + "epoch": 3.160064105774528, + "grad_norm": 2.2893619537353516, + "learning_rate": 1.4981098629693066e-05, + "loss": 0.4546, + "step": 31550 + }, + { + "epoch": 3.1610657585015276, + "grad_norm": 2.42533016204834, + "learning_rate": 1.496665722924483e-05, + "loss": 0.3829, + "step": 31560 + }, + { + "epoch": 3.162067411228527, + "grad_norm": 2.7083005905151367, + "learning_rate": 1.4952219818475422e-05, + "loss": 0.4093, + "step": 31570 + }, + { + "epoch": 3.163069063955527, + "grad_norm": 2.4388954639434814, + "learning_rate": 1.4937786403125725e-05, + "loss": 0.4436, + "step": 31580 + }, + { + "epoch": 3.1640707166825264, + "grad_norm": 2.242216110229492, + "learning_rate": 1.4923356988935094e-05, + "loss": 0.4942, + "step": 31590 + }, + { + "epoch": 3.1650723694095255, + "grad_norm": 2.0267419815063477, + "learning_rate": 1.4908931581641267e-05, + "loss": 0.4141, + "step": 31600 + }, + { + "epoch": 3.166074022136525, + "grad_norm": 2.0003437995910645, + "learning_rate": 1.48945101869804e-05, + "loss": 0.4125, + "step": 31610 + }, + { + "epoch": 3.1670756748635247, + "grad_norm": 2.4626035690307617, + "learning_rate": 1.4880092810687034e-05, + "loss": 0.3956, + "step": 31620 + }, + { + "epoch": 3.1680773275905243, + "grad_norm": 2.163177967071533, + "learning_rate": 1.4865679458494123e-05, + "loss": 0.4299, + "step": 31630 + }, + { + "epoch": 3.169078980317524, + "grad_norm": 2.259563684463501, + "learning_rate": 1.485127013613303e-05, + "loss": 0.4229, + "step": 31640 + }, + { + "epoch": 3.1700806330445235, + "grad_norm": 1.6991698741912842, + "learning_rate": 1.4836864849333509e-05, + "loss": 0.4263, + "step": 31650 + }, + { + "epoch": 3.171082285771523, + "grad_norm": 2.012312889099121, + "learning_rate": 1.4822463603823694e-05, + "loss": 0.4329, + "step": 31660 + }, + { + "epoch": 3.1720839384985227, + "grad_norm": 1.6116600036621094, + "learning_rate": 1.480806640533014e-05, + "loss": 0.45, + "step": 31670 + }, + { + "epoch": 3.1730855912255223, + "grad_norm": 1.7727469205856323, + "learning_rate": 1.4793673259577773e-05, + "loss": 0.4319, + "step": 31680 + }, + { + "epoch": 3.174087243952522, + "grad_norm": 1.567363977432251, + "learning_rate": 1.4779284172289904e-05, + "loss": 0.4308, + "step": 31690 + }, + { + "epoch": 3.1750888966795214, + "grad_norm": 2.1986780166625977, + "learning_rate": 1.4764899149188256e-05, + "loss": 0.4781, + "step": 31700 + }, + { + "epoch": 3.1760905494065206, + "grad_norm": 2.068117380142212, + "learning_rate": 1.4750518195992897e-05, + "loss": 0.4237, + "step": 31710 + }, + { + "epoch": 3.17709220213352, + "grad_norm": 2.2476770877838135, + "learning_rate": 1.4736141318422331e-05, + "loss": 0.478, + "step": 31720 + }, + { + "epoch": 3.1780938548605198, + "grad_norm": 2.5276355743408203, + "learning_rate": 1.4721768522193372e-05, + "loss": 0.4347, + "step": 31730 + }, + { + "epoch": 3.1790955075875194, + "grad_norm": 2.2223894596099854, + "learning_rate": 1.4707399813021272e-05, + "loss": 0.466, + "step": 31740 + }, + { + "epoch": 3.180097160314519, + "grad_norm": 1.6248425245285034, + "learning_rate": 1.4693035196619629e-05, + "loss": 0.4342, + "step": 31750 + }, + { + "epoch": 3.1810988130415185, + "grad_norm": 1.8279471397399902, + "learning_rate": 1.4678674678700405e-05, + "loss": 0.4708, + "step": 31760 + }, + { + "epoch": 3.182100465768518, + "grad_norm": 2.691256523132324, + "learning_rate": 1.4664318264973965e-05, + "loss": 0.4571, + "step": 31770 + }, + { + "epoch": 3.1831021184955177, + "grad_norm": 4.807773590087891, + "learning_rate": 1.4649965961149003e-05, + "loss": 0.4275, + "step": 31780 + }, + { + "epoch": 3.1841037712225173, + "grad_norm": 1.9490879774093628, + "learning_rate": 1.4635617772932625e-05, + "loss": 0.4218, + "step": 31790 + }, + { + "epoch": 3.185105423949517, + "grad_norm": 2.2792632579803467, + "learning_rate": 1.462127370603025e-05, + "loss": 0.4507, + "step": 31800 + }, + { + "epoch": 3.186107076676516, + "grad_norm": 2.2887744903564453, + "learning_rate": 1.4606933766145686e-05, + "loss": 0.458, + "step": 31810 + }, + { + "epoch": 3.1871087294035156, + "grad_norm": 2.0166826248168945, + "learning_rate": 1.4592597958981102e-05, + "loss": 0.4476, + "step": 31820 + }, + { + "epoch": 3.1881103821305152, + "grad_norm": 1.9539391994476318, + "learning_rate": 1.457826629023702e-05, + "loss": 0.4931, + "step": 31830 + }, + { + "epoch": 3.189112034857515, + "grad_norm": 2.3129167556762695, + "learning_rate": 1.4563938765612304e-05, + "loss": 0.4348, + "step": 31840 + }, + { + "epoch": 3.1901136875845144, + "grad_norm": 2.1217124462127686, + "learning_rate": 1.4549615390804183e-05, + "loss": 0.4267, + "step": 31850 + }, + { + "epoch": 3.191115340311514, + "grad_norm": 2.5243582725524902, + "learning_rate": 1.453529617150824e-05, + "loss": 0.4407, + "step": 31860 + }, + { + "epoch": 3.1921169930385136, + "grad_norm": 2.3081367015838623, + "learning_rate": 1.4520981113418391e-05, + "loss": 0.5121, + "step": 31870 + }, + { + "epoch": 3.193118645765513, + "grad_norm": 2.267855167388916, + "learning_rate": 1.450667022222691e-05, + "loss": 0.4379, + "step": 31880 + }, + { + "epoch": 3.1941202984925128, + "grad_norm": 1.9463863372802734, + "learning_rate": 1.44923635036244e-05, + "loss": 0.4241, + "step": 31890 + }, + { + "epoch": 3.1951219512195124, + "grad_norm": 1.9318593740463257, + "learning_rate": 1.4478060963299825e-05, + "loss": 0.4678, + "step": 31900 + }, + { + "epoch": 3.1961236039465115, + "grad_norm": 2.170785665512085, + "learning_rate": 1.4463762606940467e-05, + "loss": 0.471, + "step": 31910 + }, + { + "epoch": 3.197125256673511, + "grad_norm": 2.2391130924224854, + "learning_rate": 1.4449468440231961e-05, + "loss": 0.4823, + "step": 31920 + }, + { + "epoch": 3.1981269094005107, + "grad_norm": 2.5374581813812256, + "learning_rate": 1.4435178468858263e-05, + "loss": 0.4299, + "step": 31930 + }, + { + "epoch": 3.1991285621275103, + "grad_norm": 1.7519168853759766, + "learning_rate": 1.4420892698501665e-05, + "loss": 0.4503, + "step": 31940 + }, + { + "epoch": 3.20013021485451, + "grad_norm": 2.36940860748291, + "learning_rate": 1.4406611134842795e-05, + "loss": 0.4593, + "step": 31950 + }, + { + "epoch": 3.2011318675815095, + "grad_norm": 2.056506872177124, + "learning_rate": 1.4392333783560602e-05, + "loss": 0.4437, + "step": 31960 + }, + { + "epoch": 3.202133520308509, + "grad_norm": 2.2406508922576904, + "learning_rate": 1.4378060650332359e-05, + "loss": 0.3915, + "step": 31970 + }, + { + "epoch": 3.2031351730355087, + "grad_norm": 2.973459482192993, + "learning_rate": 1.4363791740833666e-05, + "loss": 0.4502, + "step": 31980 + }, + { + "epoch": 3.2041368257625082, + "grad_norm": 2.4402339458465576, + "learning_rate": 1.4349527060738435e-05, + "loss": 0.4597, + "step": 31990 + }, + { + "epoch": 3.205138478489508, + "grad_norm": 2.2443811893463135, + "learning_rate": 1.43352666157189e-05, + "loss": 0.4083, + "step": 32000 + }, + { + "epoch": 3.2061401312165074, + "grad_norm": 1.9804131984710693, + "learning_rate": 1.432101041144564e-05, + "loss": 0.4083, + "step": 32010 + }, + { + "epoch": 3.2071417839435066, + "grad_norm": 2.6847102642059326, + "learning_rate": 1.4306758453587494e-05, + "loss": 0.4574, + "step": 32020 + }, + { + "epoch": 3.208143436670506, + "grad_norm": 2.2255048751831055, + "learning_rate": 1.4292510747811633e-05, + "loss": 0.3964, + "step": 32030 + }, + { + "epoch": 3.2091450893975058, + "grad_norm": 1.9805270433425903, + "learning_rate": 1.4278267299783576e-05, + "loss": 0.454, + "step": 32040 + }, + { + "epoch": 3.2101467421245053, + "grad_norm": 2.9171619415283203, + "learning_rate": 1.4264028115167091e-05, + "loss": 0.4639, + "step": 32050 + }, + { + "epoch": 3.211148394851505, + "grad_norm": 1.7909626960754395, + "learning_rate": 1.4249793199624284e-05, + "loss": 0.4419, + "step": 32060 + }, + { + "epoch": 3.2121500475785045, + "grad_norm": 1.8210331201553345, + "learning_rate": 1.4235562558815544e-05, + "loss": 0.3819, + "step": 32070 + }, + { + "epoch": 3.213151700305504, + "grad_norm": 2.2666754722595215, + "learning_rate": 1.4221336198399605e-05, + "loss": 0.4234, + "step": 32080 + }, + { + "epoch": 3.2141533530325037, + "grad_norm": 2.126728057861328, + "learning_rate": 1.420711412403343e-05, + "loss": 0.4262, + "step": 32090 + }, + { + "epoch": 3.2151550057595033, + "grad_norm": 2.2342824935913086, + "learning_rate": 1.419289634137232e-05, + "loss": 0.4111, + "step": 32100 + }, + { + "epoch": 3.216156658486503, + "grad_norm": 2.0711076259613037, + "learning_rate": 1.417868285606988e-05, + "loss": 0.4585, + "step": 32110 + }, + { + "epoch": 3.2171583112135025, + "grad_norm": 2.6071414947509766, + "learning_rate": 1.416447367377799e-05, + "loss": 0.4229, + "step": 32120 + }, + { + "epoch": 3.2181599639405016, + "grad_norm": 2.0745346546173096, + "learning_rate": 1.4150268800146787e-05, + "loss": 0.4607, + "step": 32130 + }, + { + "epoch": 3.2191616166675012, + "grad_norm": 2.3919355869293213, + "learning_rate": 1.4136068240824758e-05, + "loss": 0.4789, + "step": 32140 + }, + { + "epoch": 3.220163269394501, + "grad_norm": 1.9690043926239014, + "learning_rate": 1.4121872001458641e-05, + "loss": 0.4274, + "step": 32150 + }, + { + "epoch": 3.2211649221215004, + "grad_norm": 2.3856241703033447, + "learning_rate": 1.4107680087693425e-05, + "loss": 0.3834, + "step": 32160 + }, + { + "epoch": 3.2221665748485, + "grad_norm": 2.1940131187438965, + "learning_rate": 1.4093492505172445e-05, + "loss": 0.3788, + "step": 32170 + }, + { + "epoch": 3.2231682275754996, + "grad_norm": 2.333587169647217, + "learning_rate": 1.407930925953727e-05, + "loss": 0.4461, + "step": 32180 + }, + { + "epoch": 3.224169880302499, + "grad_norm": 2.39921236038208, + "learning_rate": 1.4065130356427753e-05, + "loss": 0.4476, + "step": 32190 + }, + { + "epoch": 3.2251715330294988, + "grad_norm": 2.1098825931549072, + "learning_rate": 1.4050955801482024e-05, + "loss": 0.4647, + "step": 32200 + }, + { + "epoch": 3.2261731857564984, + "grad_norm": 1.7223210334777832, + "learning_rate": 1.4036785600336477e-05, + "loss": 0.4261, + "step": 32210 + }, + { + "epoch": 3.227174838483498, + "grad_norm": 1.8587815761566162, + "learning_rate": 1.4022619758625793e-05, + "loss": 0.4039, + "step": 32220 + }, + { + "epoch": 3.2281764912104975, + "grad_norm": 2.4933252334594727, + "learning_rate": 1.4008458281982879e-05, + "loss": 0.4714, + "step": 32230 + }, + { + "epoch": 3.2291781439374967, + "grad_norm": 1.8548504114151, + "learning_rate": 1.399430117603896e-05, + "loss": 0.4457, + "step": 32240 + }, + { + "epoch": 3.2301797966644963, + "grad_norm": 1.866041660308838, + "learning_rate": 1.3980148446423483e-05, + "loss": 0.4801, + "step": 32250 + }, + { + "epoch": 3.231181449391496, + "grad_norm": 2.160381317138672, + "learning_rate": 1.3966000098764175e-05, + "loss": 0.4553, + "step": 32260 + }, + { + "epoch": 3.2321831021184955, + "grad_norm": 2.0052473545074463, + "learning_rate": 1.3951856138687009e-05, + "loss": 0.4554, + "step": 32270 + }, + { + "epoch": 3.233184754845495, + "grad_norm": 2.251857280731201, + "learning_rate": 1.3937716571816218e-05, + "loss": 0.3904, + "step": 32280 + }, + { + "epoch": 3.2341864075724946, + "grad_norm": 2.450507164001465, + "learning_rate": 1.3923581403774294e-05, + "loss": 0.4219, + "step": 32290 + }, + { + "epoch": 3.2351880602994942, + "grad_norm": 2.243173599243164, + "learning_rate": 1.3909450640181965e-05, + "loss": 0.4448, + "step": 32300 + }, + { + "epoch": 3.236189713026494, + "grad_norm": 2.0648858547210693, + "learning_rate": 1.3895324286658224e-05, + "loss": 0.471, + "step": 32310 + }, + { + "epoch": 3.2371913657534934, + "grad_norm": 2.1174683570861816, + "learning_rate": 1.38812023488203e-05, + "loss": 0.4278, + "step": 32320 + }, + { + "epoch": 3.238193018480493, + "grad_norm": 2.3641650676727295, + "learning_rate": 1.3867084832283667e-05, + "loss": 0.4653, + "step": 32330 + }, + { + "epoch": 3.239194671207492, + "grad_norm": 1.988904356956482, + "learning_rate": 1.3852971742662047e-05, + "loss": 0.4166, + "step": 32340 + }, + { + "epoch": 3.2401963239344918, + "grad_norm": 2.140962839126587, + "learning_rate": 1.3838863085567395e-05, + "loss": 0.4228, + "step": 32350 + }, + { + "epoch": 3.2411979766614913, + "grad_norm": 2.70149564743042, + "learning_rate": 1.3824758866609897e-05, + "loss": 0.393, + "step": 32360 + }, + { + "epoch": 3.242199629388491, + "grad_norm": 1.729416012763977, + "learning_rate": 1.3810659091398012e-05, + "loss": 0.5004, + "step": 32370 + }, + { + "epoch": 3.2432012821154905, + "grad_norm": 1.8394402265548706, + "learning_rate": 1.3796563765538376e-05, + "loss": 0.4351, + "step": 32380 + }, + { + "epoch": 3.24420293484249, + "grad_norm": 2.235522985458374, + "learning_rate": 1.378247289463589e-05, + "loss": 0.3838, + "step": 32390 + }, + { + "epoch": 3.2452045875694897, + "grad_norm": 1.8229221105575562, + "learning_rate": 1.376838648429367e-05, + "loss": 0.3934, + "step": 32400 + }, + { + "epoch": 3.2462062402964893, + "grad_norm": 1.658739686012268, + "learning_rate": 1.3754304540113089e-05, + "loss": 0.3869, + "step": 32410 + }, + { + "epoch": 3.247207893023489, + "grad_norm": 2.5649220943450928, + "learning_rate": 1.3740227067693697e-05, + "loss": 0.4916, + "step": 32420 + }, + { + "epoch": 3.2482095457504885, + "grad_norm": 1.8066800832748413, + "learning_rate": 1.3726154072633285e-05, + "loss": 0.456, + "step": 32430 + }, + { + "epoch": 3.2492111984774876, + "grad_norm": 2.1143670082092285, + "learning_rate": 1.37120855605279e-05, + "loss": 0.4308, + "step": 32440 + }, + { + "epoch": 3.250212851204487, + "grad_norm": 2.2256264686584473, + "learning_rate": 1.3698021536971733e-05, + "loss": 0.4165, + "step": 32450 + }, + { + "epoch": 3.251214503931487, + "grad_norm": 1.9437214136123657, + "learning_rate": 1.3683962007557247e-05, + "loss": 0.447, + "step": 32460 + }, + { + "epoch": 3.2522161566584864, + "grad_norm": 2.0992989540100098, + "learning_rate": 1.3669906977875113e-05, + "loss": 0.3604, + "step": 32470 + }, + { + "epoch": 3.253217809385486, + "grad_norm": 2.455745220184326, + "learning_rate": 1.3655856453514202e-05, + "loss": 0.3967, + "step": 32480 + }, + { + "epoch": 3.2542194621124856, + "grad_norm": 1.914459228515625, + "learning_rate": 1.3641810440061564e-05, + "loss": 0.4463, + "step": 32490 + }, + { + "epoch": 3.255221114839485, + "grad_norm": 2.0907294750213623, + "learning_rate": 1.3627768943102515e-05, + "loss": 0.448, + "step": 32500 + }, + { + "epoch": 3.2562227675664848, + "grad_norm": 2.3785226345062256, + "learning_rate": 1.3613731968220539e-05, + "loss": 0.4895, + "step": 32510 + }, + { + "epoch": 3.2572244202934844, + "grad_norm": 2.042774200439453, + "learning_rate": 1.3599699520997306e-05, + "loss": 0.4133, + "step": 32520 + }, + { + "epoch": 3.258226073020484, + "grad_norm": 2.2214324474334717, + "learning_rate": 1.3585671607012735e-05, + "loss": 0.3817, + "step": 32530 + }, + { + "epoch": 3.2592277257474835, + "grad_norm": 2.654376983642578, + "learning_rate": 1.3571648231844897e-05, + "loss": 0.4512, + "step": 32540 + }, + { + "epoch": 3.2602293784744827, + "grad_norm": 2.0407917499542236, + "learning_rate": 1.3557629401070094e-05, + "loss": 0.3947, + "step": 32550 + }, + { + "epoch": 3.2612310312014823, + "grad_norm": 1.8769006729125977, + "learning_rate": 1.3543615120262764e-05, + "loss": 0.4184, + "step": 32560 + }, + { + "epoch": 3.262232683928482, + "grad_norm": 1.9162296056747437, + "learning_rate": 1.3529605394995613e-05, + "loss": 0.3897, + "step": 32570 + }, + { + "epoch": 3.2632343366554815, + "grad_norm": 2.0198333263397217, + "learning_rate": 1.3515600230839478e-05, + "loss": 0.4421, + "step": 32580 + }, + { + "epoch": 3.264235989382481, + "grad_norm": 2.189434289932251, + "learning_rate": 1.350159963336341e-05, + "loss": 0.4541, + "step": 32590 + }, + { + "epoch": 3.2652376421094806, + "grad_norm": 1.9533107280731201, + "learning_rate": 1.3487603608134627e-05, + "loss": 0.4527, + "step": 32600 + }, + { + "epoch": 3.2662392948364802, + "grad_norm": 2.1534202098846436, + "learning_rate": 1.347361216071854e-05, + "loss": 0.4493, + "step": 32610 + }, + { + "epoch": 3.26724094756348, + "grad_norm": 1.7690802812576294, + "learning_rate": 1.345962529667874e-05, + "loss": 0.5048, + "step": 32620 + }, + { + "epoch": 3.2682426002904794, + "grad_norm": 2.1077520847320557, + "learning_rate": 1.3445643021576987e-05, + "loss": 0.43, + "step": 32630 + }, + { + "epoch": 3.269244253017479, + "grad_norm": 2.1861956119537354, + "learning_rate": 1.3431665340973223e-05, + "loss": 0.4492, + "step": 32640 + }, + { + "epoch": 3.2702459057444786, + "grad_norm": 2.0920753479003906, + "learning_rate": 1.3417692260425564e-05, + "loss": 0.4196, + "step": 32650 + }, + { + "epoch": 3.2712475584714777, + "grad_norm": 2.3200623989105225, + "learning_rate": 1.3403723785490286e-05, + "loss": 0.4897, + "step": 32660 + }, + { + "epoch": 3.2722492111984773, + "grad_norm": 2.933948516845703, + "learning_rate": 1.338975992172185e-05, + "loss": 0.4299, + "step": 32670 + }, + { + "epoch": 3.273250863925477, + "grad_norm": 2.2343332767486572, + "learning_rate": 1.3375800674672872e-05, + "loss": 0.3738, + "step": 32680 + }, + { + "epoch": 3.2742525166524765, + "grad_norm": 2.4421591758728027, + "learning_rate": 1.3361846049894127e-05, + "loss": 0.4757, + "step": 32690 + }, + { + "epoch": 3.275254169379476, + "grad_norm": 1.903268575668335, + "learning_rate": 1.3347896052934591e-05, + "loss": 0.3908, + "step": 32700 + }, + { + "epoch": 3.2762558221064757, + "grad_norm": 1.6958303451538086, + "learning_rate": 1.3333950689341335e-05, + "loss": 0.4063, + "step": 32710 + }, + { + "epoch": 3.2772574748334753, + "grad_norm": 2.532978057861328, + "learning_rate": 1.3320009964659624e-05, + "loss": 0.4859, + "step": 32720 + }, + { + "epoch": 3.278259127560475, + "grad_norm": 2.2470481395721436, + "learning_rate": 1.3306073884432907e-05, + "loss": 0.4377, + "step": 32730 + }, + { + "epoch": 3.2792607802874745, + "grad_norm": 2.7446441650390625, + "learning_rate": 1.3292142454202725e-05, + "loss": 0.4472, + "step": 32740 + }, + { + "epoch": 3.280262433014474, + "grad_norm": 2.225325107574463, + "learning_rate": 1.327821567950881e-05, + "loss": 0.4215, + "step": 32750 + }, + { + "epoch": 3.2812640857414737, + "grad_norm": 2.3718440532684326, + "learning_rate": 1.326429356588902e-05, + "loss": 0.4337, + "step": 32760 + }, + { + "epoch": 3.282265738468473, + "grad_norm": 1.878641963005066, + "learning_rate": 1.3250376118879408e-05, + "loss": 0.4516, + "step": 32770 + }, + { + "epoch": 3.2832673911954724, + "grad_norm": 2.452392578125, + "learning_rate": 1.32364633440141e-05, + "loss": 0.4819, + "step": 32780 + }, + { + "epoch": 3.284269043922472, + "grad_norm": 3.089430332183838, + "learning_rate": 1.3222555246825407e-05, + "loss": 0.4242, + "step": 32790 + }, + { + "epoch": 3.2852706966494716, + "grad_norm": 1.9483718872070312, + "learning_rate": 1.3208651832843796e-05, + "loss": 0.4246, + "step": 32800 + }, + { + "epoch": 3.286272349376471, + "grad_norm": 2.125678539276123, + "learning_rate": 1.3194753107597824e-05, + "loss": 0.4367, + "step": 32810 + }, + { + "epoch": 3.2872740021034708, + "grad_norm": 2.1187121868133545, + "learning_rate": 1.3180859076614205e-05, + "loss": 0.4101, + "step": 32820 + }, + { + "epoch": 3.2882756548304704, + "grad_norm": 1.9084290266036987, + "learning_rate": 1.3166969745417813e-05, + "loss": 0.3884, + "step": 32830 + }, + { + "epoch": 3.28927730755747, + "grad_norm": 2.143211603164673, + "learning_rate": 1.3153085119531624e-05, + "loss": 0.4027, + "step": 32840 + }, + { + "epoch": 3.2902789602844695, + "grad_norm": 1.8853137493133545, + "learning_rate": 1.3139205204476726e-05, + "loss": 0.4311, + "step": 32850 + }, + { + "epoch": 3.2912806130114687, + "grad_norm": 2.1946933269500732, + "learning_rate": 1.3125330005772385e-05, + "loss": 0.4407, + "step": 32860 + }, + { + "epoch": 3.2922822657384687, + "grad_norm": 2.14318585395813, + "learning_rate": 1.3111459528935949e-05, + "loss": 0.4017, + "step": 32870 + }, + { + "epoch": 3.293283918465468, + "grad_norm": 2.319744110107422, + "learning_rate": 1.3097593779482914e-05, + "loss": 0.4307, + "step": 32880 + }, + { + "epoch": 3.2942855711924675, + "grad_norm": 2.9636361598968506, + "learning_rate": 1.3083732762926859e-05, + "loss": 0.4675, + "step": 32890 + }, + { + "epoch": 3.295287223919467, + "grad_norm": 2.6737558841705322, + "learning_rate": 1.306987648477953e-05, + "loss": 0.5051, + "step": 32900 + }, + { + "epoch": 3.2962888766464666, + "grad_norm": 2.0904157161712646, + "learning_rate": 1.3056024950550772e-05, + "loss": 0.4611, + "step": 32910 + }, + { + "epoch": 3.2972905293734662, + "grad_norm": 2.091607093811035, + "learning_rate": 1.3042178165748507e-05, + "loss": 0.3842, + "step": 32920 + }, + { + "epoch": 3.298292182100466, + "grad_norm": 2.5052433013916016, + "learning_rate": 1.3028336135878822e-05, + "loss": 0.4381, + "step": 32930 + }, + { + "epoch": 3.2992938348274654, + "grad_norm": 1.78681480884552, + "learning_rate": 1.3014498866445887e-05, + "loss": 0.4373, + "step": 32940 + }, + { + "epoch": 3.300295487554465, + "grad_norm": 1.7345925569534302, + "learning_rate": 1.3000666362951979e-05, + "loss": 0.4163, + "step": 32950 + }, + { + "epoch": 3.3012971402814646, + "grad_norm": 2.144805431365967, + "learning_rate": 1.298683863089748e-05, + "loss": 0.4375, + "step": 32960 + }, + { + "epoch": 3.3022987930084637, + "grad_norm": 2.5673115253448486, + "learning_rate": 1.2973015675780883e-05, + "loss": 0.403, + "step": 32970 + }, + { + "epoch": 3.3033004457354633, + "grad_norm": 1.9869693517684937, + "learning_rate": 1.2959197503098775e-05, + "loss": 0.4217, + "step": 32980 + }, + { + "epoch": 3.304302098462463, + "grad_norm": 2.347975969314575, + "learning_rate": 1.2945384118345838e-05, + "loss": 0.4225, + "step": 32990 + }, + { + "epoch": 3.3053037511894625, + "grad_norm": 2.203904390335083, + "learning_rate": 1.2931575527014855e-05, + "loss": 0.457, + "step": 33000 + }, + { + "epoch": 3.306305403916462, + "grad_norm": 2.366241216659546, + "learning_rate": 1.2917771734596707e-05, + "loss": 0.4346, + "step": 33010 + }, + { + "epoch": 3.3073070566434617, + "grad_norm": 2.1328248977661133, + "learning_rate": 1.2903972746580361e-05, + "loss": 0.4801, + "step": 33020 + }, + { + "epoch": 3.3083087093704613, + "grad_norm": 1.9029438495635986, + "learning_rate": 1.2890178568452871e-05, + "loss": 0.4414, + "step": 33030 + }, + { + "epoch": 3.309310362097461, + "grad_norm": 1.8686634302139282, + "learning_rate": 1.2876389205699386e-05, + "loss": 0.4772, + "step": 33040 + }, + { + "epoch": 3.3103120148244605, + "grad_norm": 2.073631525039673, + "learning_rate": 1.2862604663803118e-05, + "loss": 0.544, + "step": 33050 + }, + { + "epoch": 3.31131366755146, + "grad_norm": 1.7073830366134644, + "learning_rate": 1.2848824948245423e-05, + "loss": 0.4515, + "step": 33060 + }, + { + "epoch": 3.3123153202784597, + "grad_norm": 2.569355010986328, + "learning_rate": 1.2835050064505655e-05, + "loss": 0.4384, + "step": 33070 + }, + { + "epoch": 3.313316973005459, + "grad_norm": 1.8745815753936768, + "learning_rate": 1.2821280018061305e-05, + "loss": 0.4025, + "step": 33080 + }, + { + "epoch": 3.3143186257324584, + "grad_norm": 2.1308634281158447, + "learning_rate": 1.280751481438791e-05, + "loss": 0.4674, + "step": 33090 + }, + { + "epoch": 3.315320278459458, + "grad_norm": 2.4393298625946045, + "learning_rate": 1.2793754458959109e-05, + "loss": 0.4237, + "step": 33100 + }, + { + "epoch": 3.3163219311864576, + "grad_norm": 1.8815350532531738, + "learning_rate": 1.2779998957246586e-05, + "loss": 0.502, + "step": 33110 + }, + { + "epoch": 3.317323583913457, + "grad_norm": 2.3304734230041504, + "learning_rate": 1.2766248314720102e-05, + "loss": 0.4362, + "step": 33120 + }, + { + "epoch": 3.3183252366404568, + "grad_norm": 2.8277807235717773, + "learning_rate": 1.2752502536847516e-05, + "loss": 0.4311, + "step": 33130 + }, + { + "epoch": 3.3193268893674563, + "grad_norm": 1.8867182731628418, + "learning_rate": 1.2738761629094703e-05, + "loss": 0.403, + "step": 33140 + }, + { + "epoch": 3.320328542094456, + "grad_norm": 2.2600784301757812, + "learning_rate": 1.2725025596925622e-05, + "loss": 0.4428, + "step": 33150 + }, + { + "epoch": 3.3213301948214555, + "grad_norm": 2.057344436645508, + "learning_rate": 1.2711294445802319e-05, + "loss": 0.3904, + "step": 33160 + }, + { + "epoch": 3.322331847548455, + "grad_norm": 2.2911007404327393, + "learning_rate": 1.2697568181184872e-05, + "loss": 0.4301, + "step": 33170 + }, + { + "epoch": 3.3233335002754547, + "grad_norm": 2.3054580688476562, + "learning_rate": 1.2683846808531396e-05, + "loss": 0.4788, + "step": 33180 + }, + { + "epoch": 3.324335153002454, + "grad_norm": 2.0127499103546143, + "learning_rate": 1.2670130333298113e-05, + "loss": 0.4351, + "step": 33190 + }, + { + "epoch": 3.3253368057294534, + "grad_norm": 1.7581689357757568, + "learning_rate": 1.2656418760939267e-05, + "loss": 0.3516, + "step": 33200 + }, + { + "epoch": 3.326338458456453, + "grad_norm": 2.3614418506622314, + "learning_rate": 1.2642712096907145e-05, + "loss": 0.4669, + "step": 33210 + }, + { + "epoch": 3.3273401111834526, + "grad_norm": 2.0278165340423584, + "learning_rate": 1.2629010346652082e-05, + "loss": 0.3568, + "step": 33220 + }, + { + "epoch": 3.3283417639104522, + "grad_norm": 2.4216668605804443, + "learning_rate": 1.2615313515622496e-05, + "loss": 0.417, + "step": 33230 + }, + { + "epoch": 3.329343416637452, + "grad_norm": 2.564865827560425, + "learning_rate": 1.260162160926482e-05, + "loss": 0.418, + "step": 33240 + }, + { + "epoch": 3.3303450693644514, + "grad_norm": 1.9982346296310425, + "learning_rate": 1.2587934633023505e-05, + "loss": 0.3987, + "step": 33250 + }, + { + "epoch": 3.331346722091451, + "grad_norm": 1.943405032157898, + "learning_rate": 1.2574252592341095e-05, + "loss": 0.4683, + "step": 33260 + }, + { + "epoch": 3.3323483748184506, + "grad_norm": 2.346015453338623, + "learning_rate": 1.2560575492658145e-05, + "loss": 0.4025, + "step": 33270 + }, + { + "epoch": 3.33335002754545, + "grad_norm": 1.8255881071090698, + "learning_rate": 1.2546903339413214e-05, + "loss": 0.4333, + "step": 33280 + }, + { + "epoch": 3.3343516802724498, + "grad_norm": 2.3527069091796875, + "learning_rate": 1.2533236138042956e-05, + "loss": 0.4259, + "step": 33290 + }, + { + "epoch": 3.335353332999449, + "grad_norm": 1.7683374881744385, + "learning_rate": 1.2519573893982014e-05, + "loss": 0.3926, + "step": 33300 + }, + { + "epoch": 3.3363549857264485, + "grad_norm": 1.8756691217422485, + "learning_rate": 1.2505916612663071e-05, + "loss": 0.4162, + "step": 33310 + }, + { + "epoch": 3.337356638453448, + "grad_norm": 2.328777313232422, + "learning_rate": 1.2492264299516837e-05, + "loss": 0.4315, + "step": 33320 + }, + { + "epoch": 3.3383582911804477, + "grad_norm": 2.066019058227539, + "learning_rate": 1.2478616959972044e-05, + "loss": 0.3843, + "step": 33330 + }, + { + "epoch": 3.3393599439074473, + "grad_norm": 2.2497713565826416, + "learning_rate": 1.2464974599455451e-05, + "loss": 0.4682, + "step": 33340 + }, + { + "epoch": 3.340361596634447, + "grad_norm": 2.539682149887085, + "learning_rate": 1.245133722339183e-05, + "loss": 0.4342, + "step": 33350 + }, + { + "epoch": 3.3413632493614465, + "grad_norm": 2.3787872791290283, + "learning_rate": 1.2437704837203972e-05, + "loss": 0.3957, + "step": 33360 + }, + { + "epoch": 3.342364902088446, + "grad_norm": 2.2526967525482178, + "learning_rate": 1.2424077446312696e-05, + "loss": 0.4469, + "step": 33370 + }, + { + "epoch": 3.3433665548154456, + "grad_norm": 2.094153642654419, + "learning_rate": 1.2410455056136818e-05, + "loss": 0.4025, + "step": 33380 + }, + { + "epoch": 3.344368207542445, + "grad_norm": 1.881780982017517, + "learning_rate": 1.2396837672093175e-05, + "loss": 0.4512, + "step": 33390 + }, + { + "epoch": 3.345369860269445, + "grad_norm": 2.2013063430786133, + "learning_rate": 1.2383225299596616e-05, + "loss": 0.4346, + "step": 33400 + }, + { + "epoch": 3.346371512996444, + "grad_norm": 1.754610300064087, + "learning_rate": 1.2369617944059983e-05, + "loss": 0.3997, + "step": 33410 + }, + { + "epoch": 3.3473731657234436, + "grad_norm": 2.1179208755493164, + "learning_rate": 1.2356015610894139e-05, + "loss": 0.4434, + "step": 33420 + }, + { + "epoch": 3.348374818450443, + "grad_norm": 1.6774661540985107, + "learning_rate": 1.2342418305507943e-05, + "loss": 0.4495, + "step": 33430 + }, + { + "epoch": 3.3493764711774427, + "grad_norm": 2.1426808834075928, + "learning_rate": 1.2328826033308252e-05, + "loss": 0.4384, + "step": 33440 + }, + { + "epoch": 3.3503781239044423, + "grad_norm": 1.60723876953125, + "learning_rate": 1.2315238799699922e-05, + "loss": 0.4383, + "step": 33450 + }, + { + "epoch": 3.351379776631442, + "grad_norm": 2.91386079788208, + "learning_rate": 1.2301656610085827e-05, + "loss": 0.4487, + "step": 33460 + }, + { + "epoch": 3.3523814293584415, + "grad_norm": 2.210634708404541, + "learning_rate": 1.2288079469866795e-05, + "loss": 0.4915, + "step": 33470 + }, + { + "epoch": 3.353383082085441, + "grad_norm": 2.1685736179351807, + "learning_rate": 1.2274507384441666e-05, + "loss": 0.4122, + "step": 33480 + }, + { + "epoch": 3.3543847348124407, + "grad_norm": 1.9470291137695312, + "learning_rate": 1.2260940359207299e-05, + "loss": 0.446, + "step": 33490 + }, + { + "epoch": 3.35538638753944, + "grad_norm": 1.978200912475586, + "learning_rate": 1.2247378399558488e-05, + "loss": 0.4294, + "step": 33500 + }, + { + "epoch": 3.3563880402664394, + "grad_norm": 1.9848915338516235, + "learning_rate": 1.2233821510888033e-05, + "loss": 0.5088, + "step": 33510 + }, + { + "epoch": 3.357389692993439, + "grad_norm": 1.9251346588134766, + "learning_rate": 1.2220269698586744e-05, + "loss": 0.5011, + "step": 33520 + }, + { + "epoch": 3.3583913457204386, + "grad_norm": 2.3554296493530273, + "learning_rate": 1.2206722968043396e-05, + "loss": 0.4617, + "step": 33530 + }, + { + "epoch": 3.359392998447438, + "grad_norm": 2.23020339012146, + "learning_rate": 1.2193181324644712e-05, + "loss": 0.4251, + "step": 33540 + }, + { + "epoch": 3.360394651174438, + "grad_norm": 1.555371642112732, + "learning_rate": 1.2179644773775422e-05, + "loss": 0.4204, + "step": 33550 + }, + { + "epoch": 3.3613963039014374, + "grad_norm": 3.28185772895813, + "learning_rate": 1.216611332081826e-05, + "loss": 0.4155, + "step": 33560 + }, + { + "epoch": 3.362397956628437, + "grad_norm": 2.456796884536743, + "learning_rate": 1.2152586971153865e-05, + "loss": 0.4124, + "step": 33570 + }, + { + "epoch": 3.3633996093554366, + "grad_norm": 3.379307270050049, + "learning_rate": 1.2139065730160884e-05, + "loss": 0.4497, + "step": 33580 + }, + { + "epoch": 3.364401262082436, + "grad_norm": 2.1304967403411865, + "learning_rate": 1.2125549603215952e-05, + "loss": 0.4305, + "step": 33590 + }, + { + "epoch": 3.3654029148094358, + "grad_norm": 2.1245455741882324, + "learning_rate": 1.2112038595693651e-05, + "loss": 0.4025, + "step": 33600 + }, + { + "epoch": 3.366404567536435, + "grad_norm": 1.967885971069336, + "learning_rate": 1.209853271296649e-05, + "loss": 0.4145, + "step": 33610 + }, + { + "epoch": 3.3674062202634345, + "grad_norm": 1.7838064432144165, + "learning_rate": 1.208503196040501e-05, + "loss": 0.3829, + "step": 33620 + }, + { + "epoch": 3.368407872990434, + "grad_norm": 2.147975444793701, + "learning_rate": 1.2071536343377668e-05, + "loss": 0.4521, + "step": 33630 + }, + { + "epoch": 3.3694095257174337, + "grad_norm": 2.331613063812256, + "learning_rate": 1.2058045867250884e-05, + "loss": 0.4155, + "step": 33640 + }, + { + "epoch": 3.3704111784444333, + "grad_norm": 2.6464388370513916, + "learning_rate": 1.2044560537389044e-05, + "loss": 0.4248, + "step": 33650 + }, + { + "epoch": 3.371412831171433, + "grad_norm": 1.795331597328186, + "learning_rate": 1.2031080359154476e-05, + "loss": 0.404, + "step": 33660 + }, + { + "epoch": 3.3724144838984325, + "grad_norm": 2.6335437297821045, + "learning_rate": 1.2017605337907472e-05, + "loss": 0.4048, + "step": 33670 + }, + { + "epoch": 3.373416136625432, + "grad_norm": 1.8416485786437988, + "learning_rate": 1.2004135479006263e-05, + "loss": 0.4342, + "step": 33680 + }, + { + "epoch": 3.3744177893524316, + "grad_norm": 2.2963712215423584, + "learning_rate": 1.1990670787807031e-05, + "loss": 0.4062, + "step": 33690 + }, + { + "epoch": 3.3754194420794312, + "grad_norm": 1.744742751121521, + "learning_rate": 1.1977211269663904e-05, + "loss": 0.3897, + "step": 33700 + }, + { + "epoch": 3.376421094806431, + "grad_norm": 1.9296424388885498, + "learning_rate": 1.1963756929928952e-05, + "loss": 0.4246, + "step": 33710 + }, + { + "epoch": 3.37742274753343, + "grad_norm": 2.5108537673950195, + "learning_rate": 1.1950307773952185e-05, + "loss": 0.4156, + "step": 33720 + }, + { + "epoch": 3.3784244002604296, + "grad_norm": 2.305176258087158, + "learning_rate": 1.1936863807081555e-05, + "loss": 0.4031, + "step": 33730 + }, + { + "epoch": 3.379426052987429, + "grad_norm": 2.122509002685547, + "learning_rate": 1.1923425034662946e-05, + "loss": 0.4498, + "step": 33740 + }, + { + "epoch": 3.3804277057144287, + "grad_norm": 2.2472517490386963, + "learning_rate": 1.1909991462040182e-05, + "loss": 0.4545, + "step": 33750 + }, + { + "epoch": 3.3814293584414283, + "grad_norm": 2.068202495574951, + "learning_rate": 1.1896563094555014e-05, + "loss": 0.4709, + "step": 33760 + }, + { + "epoch": 3.382431011168428, + "grad_norm": 2.0109431743621826, + "learning_rate": 1.1883139937547122e-05, + "loss": 0.4377, + "step": 33770 + }, + { + "epoch": 3.3834326638954275, + "grad_norm": 2.1727137565612793, + "learning_rate": 1.1869721996354124e-05, + "loss": 0.4618, + "step": 33780 + }, + { + "epoch": 3.384434316622427, + "grad_norm": 3.1651906967163086, + "learning_rate": 1.1856309276311553e-05, + "loss": 0.3812, + "step": 33790 + }, + { + "epoch": 3.3854359693494267, + "grad_norm": 2.4753785133361816, + "learning_rate": 1.1842901782752871e-05, + "loss": 0.4006, + "step": 33800 + }, + { + "epoch": 3.3864376220764263, + "grad_norm": 2.4785983562469482, + "learning_rate": 1.1829499521009452e-05, + "loss": 0.457, + "step": 33810 + }, + { + "epoch": 3.387439274803426, + "grad_norm": 1.973423719406128, + "learning_rate": 1.1816102496410627e-05, + "loss": 0.4479, + "step": 33820 + }, + { + "epoch": 3.388440927530425, + "grad_norm": 2.3075802326202393, + "learning_rate": 1.1802710714283588e-05, + "loss": 0.4552, + "step": 33830 + }, + { + "epoch": 3.3894425802574246, + "grad_norm": 2.3044092655181885, + "learning_rate": 1.1789324179953467e-05, + "loss": 0.4096, + "step": 33840 + }, + { + "epoch": 3.390444232984424, + "grad_norm": 2.105095863342285, + "learning_rate": 1.1775942898743347e-05, + "loss": 0.4348, + "step": 33850 + }, + { + "epoch": 3.391445885711424, + "grad_norm": 2.2408626079559326, + "learning_rate": 1.1762566875974157e-05, + "loss": 0.4708, + "step": 33860 + }, + { + "epoch": 3.3924475384384234, + "grad_norm": 2.347888469696045, + "learning_rate": 1.1749196116964764e-05, + "loss": 0.4569, + "step": 33870 + }, + { + "epoch": 3.393449191165423, + "grad_norm": 2.3559656143188477, + "learning_rate": 1.1735830627031966e-05, + "loss": 0.4546, + "step": 33880 + }, + { + "epoch": 3.3944508438924226, + "grad_norm": 1.7592754364013672, + "learning_rate": 1.1722470411490438e-05, + "loss": 0.4055, + "step": 33890 + }, + { + "epoch": 3.395452496619422, + "grad_norm": 2.887687921524048, + "learning_rate": 1.1709115475652751e-05, + "loss": 0.4309, + "step": 33900 + }, + { + "epoch": 3.3964541493464218, + "grad_norm": 2.2861642837524414, + "learning_rate": 1.1695765824829386e-05, + "loss": 0.4514, + "step": 33910 + }, + { + "epoch": 3.397455802073421, + "grad_norm": 2.557394504547119, + "learning_rate": 1.1682421464328745e-05, + "loss": 0.4279, + "step": 33920 + }, + { + "epoch": 3.398457454800421, + "grad_norm": 2.420703411102295, + "learning_rate": 1.1669082399457105e-05, + "loss": 0.4231, + "step": 33930 + }, + { + "epoch": 3.39945910752742, + "grad_norm": 1.7291672229766846, + "learning_rate": 1.1655748635518609e-05, + "loss": 0.4588, + "step": 33940 + }, + { + "epoch": 3.4004607602544197, + "grad_norm": 2.1979129314422607, + "learning_rate": 1.1642420177815352e-05, + "loss": 0.4296, + "step": 33950 + }, + { + "epoch": 3.4014624129814193, + "grad_norm": 2.324911594390869, + "learning_rate": 1.1629097031647293e-05, + "loss": 0.488, + "step": 33960 + }, + { + "epoch": 3.402464065708419, + "grad_norm": 2.866677761077881, + "learning_rate": 1.1615779202312241e-05, + "loss": 0.4562, + "step": 33970 + }, + { + "epoch": 3.4034657184354185, + "grad_norm": 1.6479003429412842, + "learning_rate": 1.1602466695105956e-05, + "loss": 0.4266, + "step": 33980 + }, + { + "epoch": 3.404467371162418, + "grad_norm": 2.6396446228027344, + "learning_rate": 1.158915951532204e-05, + "loss": 0.4387, + "step": 33990 + }, + { + "epoch": 3.4054690238894176, + "grad_norm": 2.2791261672973633, + "learning_rate": 1.1575857668251988e-05, + "loss": 0.4231, + "step": 34000 + }, + { + "epoch": 3.4064706766164172, + "grad_norm": 1.8278400897979736, + "learning_rate": 1.1562561159185174e-05, + "loss": 0.4151, + "step": 34010 + }, + { + "epoch": 3.407472329343417, + "grad_norm": 2.3506553173065186, + "learning_rate": 1.1549269993408846e-05, + "loss": 0.4128, + "step": 34020 + }, + { + "epoch": 3.408473982070416, + "grad_norm": 2.4189066886901855, + "learning_rate": 1.153598417620813e-05, + "loss": 0.4881, + "step": 34030 + }, + { + "epoch": 3.4094756347974156, + "grad_norm": 2.4470417499542236, + "learning_rate": 1.1522703712866029e-05, + "loss": 0.4015, + "step": 34040 + }, + { + "epoch": 3.410477287524415, + "grad_norm": 1.7285627126693726, + "learning_rate": 1.150942860866341e-05, + "loss": 0.4424, + "step": 34050 + }, + { + "epoch": 3.4114789402514147, + "grad_norm": 2.0729176998138428, + "learning_rate": 1.1496158868879015e-05, + "loss": 0.4989, + "step": 34060 + }, + { + "epoch": 3.4124805929784143, + "grad_norm": 2.230759382247925, + "learning_rate": 1.148289449878945e-05, + "loss": 0.4757, + "step": 34070 + }, + { + "epoch": 3.413482245705414, + "grad_norm": 1.9001071453094482, + "learning_rate": 1.1469635503669186e-05, + "loss": 0.4198, + "step": 34080 + }, + { + "epoch": 3.4144838984324135, + "grad_norm": 2.2974631786346436, + "learning_rate": 1.1456381888790554e-05, + "loss": 0.4217, + "step": 34090 + }, + { + "epoch": 3.415485551159413, + "grad_norm": 1.4676469564437866, + "learning_rate": 1.144313365942375e-05, + "loss": 0.4494, + "step": 34100 + }, + { + "epoch": 3.4164872038864127, + "grad_norm": 2.2653024196624756, + "learning_rate": 1.1429890820836829e-05, + "loss": 0.4153, + "step": 34110 + }, + { + "epoch": 3.4174888566134123, + "grad_norm": 1.8923325538635254, + "learning_rate": 1.1416653378295697e-05, + "loss": 0.4202, + "step": 34120 + }, + { + "epoch": 3.418490509340412, + "grad_norm": 2.0065672397613525, + "learning_rate": 1.1403421337064121e-05, + "loss": 0.4641, + "step": 34130 + }, + { + "epoch": 3.419492162067411, + "grad_norm": 1.9923065900802612, + "learning_rate": 1.1390194702403715e-05, + "loss": 0.4349, + "step": 34140 + }, + { + "epoch": 3.4204938147944106, + "grad_norm": 2.1380667686462402, + "learning_rate": 1.1376973479573942e-05, + "loss": 0.45, + "step": 34150 + }, + { + "epoch": 3.42149546752141, + "grad_norm": 1.5650336742401123, + "learning_rate": 1.136375767383212e-05, + "loss": 0.4001, + "step": 34160 + }, + { + "epoch": 3.42249712024841, + "grad_norm": 1.5955073833465576, + "learning_rate": 1.1350547290433399e-05, + "loss": 0.4333, + "step": 34170 + }, + { + "epoch": 3.4234987729754094, + "grad_norm": 1.952041506767273, + "learning_rate": 1.133734233463081e-05, + "loss": 0.4634, + "step": 34180 + }, + { + "epoch": 3.424500425702409, + "grad_norm": 2.256084680557251, + "learning_rate": 1.1324142811675167e-05, + "loss": 0.4104, + "step": 34190 + }, + { + "epoch": 3.4255020784294086, + "grad_norm": 1.7892006635665894, + "learning_rate": 1.1310948726815162e-05, + "loss": 0.4574, + "step": 34200 + }, + { + "epoch": 3.426503731156408, + "grad_norm": 2.5087509155273438, + "learning_rate": 1.1297760085297332e-05, + "loss": 0.4494, + "step": 34210 + }, + { + "epoch": 3.4275053838834078, + "grad_norm": 1.9253290891647339, + "learning_rate": 1.1284576892366037e-05, + "loss": 0.4651, + "step": 34220 + }, + { + "epoch": 3.4285070366104073, + "grad_norm": 2.0629913806915283, + "learning_rate": 1.1271399153263448e-05, + "loss": 0.3922, + "step": 34230 + }, + { + "epoch": 3.429508689337407, + "grad_norm": 2.1463217735290527, + "learning_rate": 1.1258226873229589e-05, + "loss": 0.3902, + "step": 34240 + }, + { + "epoch": 3.430510342064406, + "grad_norm": 2.2839205265045166, + "learning_rate": 1.124506005750234e-05, + "loss": 0.4098, + "step": 34250 + }, + { + "epoch": 3.4315119947914057, + "grad_norm": 2.252955198287964, + "learning_rate": 1.1231898711317352e-05, + "loss": 0.441, + "step": 34260 + }, + { + "epoch": 3.4325136475184053, + "grad_norm": 2.093442678451538, + "learning_rate": 1.1218742839908131e-05, + "loss": 0.3993, + "step": 34270 + }, + { + "epoch": 3.433515300245405, + "grad_norm": 2.032712459564209, + "learning_rate": 1.1205592448506022e-05, + "loss": 0.4433, + "step": 34280 + }, + { + "epoch": 3.4345169529724044, + "grad_norm": 2.3281428813934326, + "learning_rate": 1.1192447542340173e-05, + "loss": 0.4208, + "step": 34290 + }, + { + "epoch": 3.435518605699404, + "grad_norm": 2.4409408569335938, + "learning_rate": 1.1179308126637524e-05, + "loss": 0.3541, + "step": 34300 + }, + { + "epoch": 3.4365202584264036, + "grad_norm": 2.3380045890808105, + "learning_rate": 1.1166174206622887e-05, + "loss": 0.4353, + "step": 34310 + }, + { + "epoch": 3.437521911153403, + "grad_norm": 1.6690657138824463, + "learning_rate": 1.115304578751886e-05, + "loss": 0.4292, + "step": 34320 + }, + { + "epoch": 3.438523563880403, + "grad_norm": 2.014461040496826, + "learning_rate": 1.1139922874545824e-05, + "loss": 0.487, + "step": 34330 + }, + { + "epoch": 3.4395252166074024, + "grad_norm": 2.823160409927368, + "learning_rate": 1.1126805472922034e-05, + "loss": 0.4126, + "step": 34340 + }, + { + "epoch": 3.440526869334402, + "grad_norm": 2.1313488483428955, + "learning_rate": 1.1113693587863507e-05, + "loss": 0.4308, + "step": 34350 + }, + { + "epoch": 3.441528522061401, + "grad_norm": 1.7940298318862915, + "learning_rate": 1.1100587224584092e-05, + "loss": 0.4637, + "step": 34360 + }, + { + "epoch": 3.4425301747884007, + "grad_norm": 2.9107887744903564, + "learning_rate": 1.1087486388295399e-05, + "loss": 0.4597, + "step": 34370 + }, + { + "epoch": 3.4435318275154003, + "grad_norm": 1.9696946144104004, + "learning_rate": 1.10743910842069e-05, + "loss": 0.4005, + "step": 34380 + }, + { + "epoch": 3.4445334802424, + "grad_norm": 1.943562388420105, + "learning_rate": 1.106130131752583e-05, + "loss": 0.4398, + "step": 34390 + }, + { + "epoch": 3.4455351329693995, + "grad_norm": 2.523973226547241, + "learning_rate": 1.1048217093457228e-05, + "loss": 0.4519, + "step": 34400 + }, + { + "epoch": 3.446536785696399, + "grad_norm": 2.25036883354187, + "learning_rate": 1.1035138417203936e-05, + "loss": 0.4734, + "step": 34410 + }, + { + "epoch": 3.4475384384233987, + "grad_norm": 2.8198471069335938, + "learning_rate": 1.1022065293966579e-05, + "loss": 0.4328, + "step": 34420 + }, + { + "epoch": 3.4485400911503983, + "grad_norm": 1.8240866661071777, + "learning_rate": 1.1008997728943584e-05, + "loss": 0.4731, + "step": 34430 + }, + { + "epoch": 3.449541743877398, + "grad_norm": 2.043656349182129, + "learning_rate": 1.0995935727331166e-05, + "loss": 0.439, + "step": 34440 + }, + { + "epoch": 3.450543396604397, + "grad_norm": 2.4668407440185547, + "learning_rate": 1.0982879294323323e-05, + "loss": 0.4616, + "step": 34450 + }, + { + "epoch": 3.451545049331397, + "grad_norm": 1.7924633026123047, + "learning_rate": 1.0969828435111843e-05, + "loss": 0.4282, + "step": 34460 + }, + { + "epoch": 3.452546702058396, + "grad_norm": 1.8731104135513306, + "learning_rate": 1.0956783154886297e-05, + "loss": 0.395, + "step": 34470 + }, + { + "epoch": 3.453548354785396, + "grad_norm": 2.430614948272705, + "learning_rate": 1.0943743458834033e-05, + "loss": 0.4489, + "step": 34480 + }, + { + "epoch": 3.4545500075123954, + "grad_norm": 2.1472465991973877, + "learning_rate": 1.0930709352140183e-05, + "loss": 0.4062, + "step": 34490 + }, + { + "epoch": 3.455551660239395, + "grad_norm": 2.722684383392334, + "learning_rate": 1.0917680839987652e-05, + "loss": 0.4683, + "step": 34500 + }, + { + "epoch": 3.4565533129663946, + "grad_norm": 2.144862174987793, + "learning_rate": 1.0904657927557144e-05, + "loss": 0.4497, + "step": 34510 + }, + { + "epoch": 3.457554965693394, + "grad_norm": 2.7352027893066406, + "learning_rate": 1.0891640620027094e-05, + "loss": 0.4247, + "step": 34520 + }, + { + "epoch": 3.4585566184203937, + "grad_norm": 2.6462039947509766, + "learning_rate": 1.0878628922573727e-05, + "loss": 0.4004, + "step": 34530 + }, + { + "epoch": 3.4595582711473933, + "grad_norm": 2.273921012878418, + "learning_rate": 1.0865622840371074e-05, + "loss": 0.4683, + "step": 34540 + }, + { + "epoch": 3.460559923874393, + "grad_norm": 1.9296114444732666, + "learning_rate": 1.0852622378590866e-05, + "loss": 0.4534, + "step": 34550 + }, + { + "epoch": 3.461561576601392, + "grad_norm": 2.1889405250549316, + "learning_rate": 1.0839627542402644e-05, + "loss": 0.3771, + "step": 34560 + }, + { + "epoch": 3.4625632293283917, + "grad_norm": 2.109031915664673, + "learning_rate": 1.0826638336973696e-05, + "loss": 0.4224, + "step": 34570 + }, + { + "epoch": 3.4635648820553913, + "grad_norm": 3.0517172813415527, + "learning_rate": 1.08136547674691e-05, + "loss": 0.4029, + "step": 34580 + }, + { + "epoch": 3.464566534782391, + "grad_norm": 2.070493459701538, + "learning_rate": 1.0800676839051641e-05, + "loss": 0.4262, + "step": 34590 + }, + { + "epoch": 3.4655681875093904, + "grad_norm": 2.03619384765625, + "learning_rate": 1.0787704556881888e-05, + "loss": 0.4141, + "step": 34600 + }, + { + "epoch": 3.46656984023639, + "grad_norm": 2.0710926055908203, + "learning_rate": 1.0774737926118198e-05, + "loss": 0.4238, + "step": 34610 + }, + { + "epoch": 3.4675714929633896, + "grad_norm": 2.2858543395996094, + "learning_rate": 1.0761776951916616e-05, + "loss": 0.3914, + "step": 34620 + }, + { + "epoch": 3.468573145690389, + "grad_norm": 2.7241454124450684, + "learning_rate": 1.0748821639430973e-05, + "loss": 0.4691, + "step": 34630 + }, + { + "epoch": 3.469574798417389, + "grad_norm": 2.1155881881713867, + "learning_rate": 1.0735871993812863e-05, + "loss": 0.4633, + "step": 34640 + }, + { + "epoch": 3.4705764511443884, + "grad_norm": 1.7804661989212036, + "learning_rate": 1.0722928020211609e-05, + "loss": 0.4208, + "step": 34650 + }, + { + "epoch": 3.471578103871388, + "grad_norm": 1.557573676109314, + "learning_rate": 1.070998972377425e-05, + "loss": 0.4351, + "step": 34660 + }, + { + "epoch": 3.472579756598387, + "grad_norm": 2.085498094558716, + "learning_rate": 1.0697057109645629e-05, + "loss": 0.4311, + "step": 34670 + }, + { + "epoch": 3.4735814093253867, + "grad_norm": 2.409001111984253, + "learning_rate": 1.0684130182968284e-05, + "loss": 0.4173, + "step": 34680 + }, + { + "epoch": 3.4745830620523863, + "grad_norm": 1.4353440999984741, + "learning_rate": 1.0671208948882514e-05, + "loss": 0.441, + "step": 34690 + }, + { + "epoch": 3.475584714779386, + "grad_norm": 2.4345576763153076, + "learning_rate": 1.0658293412526316e-05, + "loss": 0.3898, + "step": 34700 + }, + { + "epoch": 3.4765863675063855, + "grad_norm": 2.105102777481079, + "learning_rate": 1.0645383579035478e-05, + "loss": 0.4628, + "step": 34710 + }, + { + "epoch": 3.477588020233385, + "grad_norm": 2.4883739948272705, + "learning_rate": 1.0632479453543498e-05, + "loss": 0.4361, + "step": 34720 + }, + { + "epoch": 3.4785896729603847, + "grad_norm": 1.9684547185897827, + "learning_rate": 1.0619581041181565e-05, + "loss": 0.4194, + "step": 34730 + }, + { + "epoch": 3.4795913256873843, + "grad_norm": 1.775962233543396, + "learning_rate": 1.0606688347078659e-05, + "loss": 0.4438, + "step": 34740 + }, + { + "epoch": 3.480592978414384, + "grad_norm": 2.144451856613159, + "learning_rate": 1.0593801376361448e-05, + "loss": 0.41, + "step": 34750 + }, + { + "epoch": 3.4815946311413835, + "grad_norm": 1.9578640460968018, + "learning_rate": 1.0580920134154332e-05, + "loss": 0.4127, + "step": 34760 + }, + { + "epoch": 3.482596283868383, + "grad_norm": 2.3081228733062744, + "learning_rate": 1.0568044625579437e-05, + "loss": 0.4418, + "step": 34770 + }, + { + "epoch": 3.483597936595382, + "grad_norm": 2.8557839393615723, + "learning_rate": 1.0555174855756605e-05, + "loss": 0.4509, + "step": 34780 + }, + { + "epoch": 3.484599589322382, + "grad_norm": 2.0425710678100586, + "learning_rate": 1.0542310829803396e-05, + "loss": 0.4134, + "step": 34790 + }, + { + "epoch": 3.4856012420493814, + "grad_norm": 1.7663625478744507, + "learning_rate": 1.052945255283509e-05, + "loss": 0.3944, + "step": 34800 + }, + { + "epoch": 3.486602894776381, + "grad_norm": 2.232553243637085, + "learning_rate": 1.0516600029964677e-05, + "loss": 0.4513, + "step": 34810 + }, + { + "epoch": 3.4876045475033806, + "grad_norm": 2.3331985473632812, + "learning_rate": 1.0503753266302863e-05, + "loss": 0.4078, + "step": 34820 + }, + { + "epoch": 3.48860620023038, + "grad_norm": 1.9388011693954468, + "learning_rate": 1.0490912266958055e-05, + "loss": 0.4758, + "step": 34830 + }, + { + "epoch": 3.4896078529573797, + "grad_norm": 2.1196694374084473, + "learning_rate": 1.047807703703638e-05, + "loss": 0.4602, + "step": 34840 + }, + { + "epoch": 3.4906095056843793, + "grad_norm": 1.5808969736099243, + "learning_rate": 1.0465247581641663e-05, + "loss": 0.3994, + "step": 34850 + }, + { + "epoch": 3.491611158411379, + "grad_norm": 1.7989665269851685, + "learning_rate": 1.0452423905875425e-05, + "loss": 0.407, + "step": 34860 + }, + { + "epoch": 3.4926128111383785, + "grad_norm": 2.2902822494506836, + "learning_rate": 1.0439606014836924e-05, + "loss": 0.3768, + "step": 34870 + }, + { + "epoch": 3.493614463865378, + "grad_norm": 2.374018430709839, + "learning_rate": 1.042679391362307e-05, + "loss": 0.4478, + "step": 34880 + }, + { + "epoch": 3.4946161165923773, + "grad_norm": 1.874822735786438, + "learning_rate": 1.0413987607328497e-05, + "loss": 0.451, + "step": 34890 + }, + { + "epoch": 3.495617769319377, + "grad_norm": 2.655895233154297, + "learning_rate": 1.0401187101045534e-05, + "loss": 0.4629, + "step": 34900 + }, + { + "epoch": 3.4966194220463764, + "grad_norm": 2.248385429382324, + "learning_rate": 1.03883923998642e-05, + "loss": 0.4135, + "step": 34910 + }, + { + "epoch": 3.497621074773376, + "grad_norm": 2.5209267139434814, + "learning_rate": 1.0375603508872203e-05, + "loss": 0.4222, + "step": 34920 + }, + { + "epoch": 3.4986227275003756, + "grad_norm": 1.8577488660812378, + "learning_rate": 1.0362820433154938e-05, + "loss": 0.4049, + "step": 34930 + }, + { + "epoch": 3.499624380227375, + "grad_norm": 2.4650468826293945, + "learning_rate": 1.0350043177795517e-05, + "loss": 0.4289, + "step": 34940 + }, + { + "epoch": 3.500626032954375, + "grad_norm": 2.639314889907837, + "learning_rate": 1.033727174787469e-05, + "loss": 0.4502, + "step": 34950 + }, + { + "epoch": 3.5016276856813744, + "grad_norm": 2.0178632736206055, + "learning_rate": 1.0324506148470917e-05, + "loss": 0.4272, + "step": 34960 + }, + { + "epoch": 3.502629338408374, + "grad_norm": 1.9113802909851074, + "learning_rate": 1.0311746384660346e-05, + "loss": 0.4286, + "step": 34970 + }, + { + "epoch": 3.503630991135373, + "grad_norm": 2.1833677291870117, + "learning_rate": 1.0298992461516802e-05, + "loss": 0.501, + "step": 34980 + }, + { + "epoch": 3.504632643862373, + "grad_norm": 2.2047367095947266, + "learning_rate": 1.028624438411175e-05, + "loss": 0.4528, + "step": 34990 + }, + { + "epoch": 3.5056342965893723, + "grad_norm": 1.7950270175933838, + "learning_rate": 1.0273502157514394e-05, + "loss": 0.4207, + "step": 35000 + }, + { + "epoch": 3.506635949316372, + "grad_norm": 1.9958217144012451, + "learning_rate": 1.0260765786791577e-05, + "loss": 0.3915, + "step": 35010 + }, + { + "epoch": 3.5076376020433715, + "grad_norm": 2.9710733890533447, + "learning_rate": 1.0248035277007783e-05, + "loss": 0.4248, + "step": 35020 + }, + { + "epoch": 3.508639254770371, + "grad_norm": 1.977878212928772, + "learning_rate": 1.0235310633225232e-05, + "loss": 0.4082, + "step": 35030 + }, + { + "epoch": 3.5096409074973707, + "grad_norm": 2.591295003890991, + "learning_rate": 1.0222591860503761e-05, + "loss": 0.4369, + "step": 35040 + }, + { + "epoch": 3.5106425602243703, + "grad_norm": 1.7341768741607666, + "learning_rate": 1.0209878963900904e-05, + "loss": 0.3957, + "step": 35050 + }, + { + "epoch": 3.51164421295137, + "grad_norm": 2.3429300785064697, + "learning_rate": 1.0197171948471812e-05, + "loss": 0.4407, + "step": 35060 + }, + { + "epoch": 3.5126458656783695, + "grad_norm": 2.3763067722320557, + "learning_rate": 1.0184470819269359e-05, + "loss": 0.4635, + "step": 35070 + }, + { + "epoch": 3.513647518405369, + "grad_norm": 2.520565986633301, + "learning_rate": 1.0171775581344044e-05, + "loss": 0.438, + "step": 35080 + }, + { + "epoch": 3.514649171132368, + "grad_norm": 2.132378578186035, + "learning_rate": 1.0159086239744003e-05, + "loss": 0.4628, + "step": 35090 + }, + { + "epoch": 3.5156508238593682, + "grad_norm": 2.1328794956207275, + "learning_rate": 1.0146402799515079e-05, + "loss": 0.449, + "step": 35100 + }, + { + "epoch": 3.5166524765863674, + "grad_norm": 2.2346129417419434, + "learning_rate": 1.0133725265700728e-05, + "loss": 0.4675, + "step": 35110 + }, + { + "epoch": 3.517654129313367, + "grad_norm": 2.5284063816070557, + "learning_rate": 1.0121053643342074e-05, + "loss": 0.4691, + "step": 35120 + }, + { + "epoch": 3.5186557820403666, + "grad_norm": 2.44215989112854, + "learning_rate": 1.0108387937477886e-05, + "loss": 0.4336, + "step": 35130 + }, + { + "epoch": 3.519657434767366, + "grad_norm": 2.0735301971435547, + "learning_rate": 1.0095728153144579e-05, + "loss": 0.4179, + "step": 35140 + }, + { + "epoch": 3.5206590874943657, + "grad_norm": 1.7370909452438354, + "learning_rate": 1.0083074295376219e-05, + "loss": 0.4225, + "step": 35150 + }, + { + "epoch": 3.5216607402213653, + "grad_norm": 2.455852746963501, + "learning_rate": 1.0070426369204511e-05, + "loss": 0.4091, + "step": 35160 + }, + { + "epoch": 3.522662392948365, + "grad_norm": 2.051345109939575, + "learning_rate": 1.0057784379658797e-05, + "loss": 0.4919, + "step": 35170 + }, + { + "epoch": 3.5236640456753645, + "grad_norm": 2.2854480743408203, + "learning_rate": 1.0045148331766069e-05, + "loss": 0.4539, + "step": 35180 + }, + { + "epoch": 3.524665698402364, + "grad_norm": 2.3247320652008057, + "learning_rate": 1.0032518230550947e-05, + "loss": 0.4547, + "step": 35190 + }, + { + "epoch": 3.5256673511293632, + "grad_norm": 2.1312015056610107, + "learning_rate": 1.0019894081035694e-05, + "loss": 0.4494, + "step": 35200 + }, + { + "epoch": 3.526669003856363, + "grad_norm": 1.9080195426940918, + "learning_rate": 1.00072758882402e-05, + "loss": 0.4012, + "step": 35210 + }, + { + "epoch": 3.5276706565833624, + "grad_norm": 1.9371165037155151, + "learning_rate": 9.994663657181991e-06, + "loss": 0.4325, + "step": 35220 + }, + { + "epoch": 3.528672309310362, + "grad_norm": 2.2770767211914062, + "learning_rate": 9.982057392876215e-06, + "loss": 0.4173, + "step": 35230 + }, + { + "epoch": 3.5296739620373616, + "grad_norm": 2.719331741333008, + "learning_rate": 9.969457100335658e-06, + "loss": 0.4215, + "step": 35240 + }, + { + "epoch": 3.530675614764361, + "grad_norm": 2.2385475635528564, + "learning_rate": 9.956862784570726e-06, + "loss": 0.4139, + "step": 35250 + }, + { + "epoch": 3.531677267491361, + "grad_norm": 2.103825330734253, + "learning_rate": 9.944274450589433e-06, + "loss": 0.4366, + "step": 35260 + }, + { + "epoch": 3.5326789202183604, + "grad_norm": 2.2733116149902344, + "learning_rate": 9.931692103397464e-06, + "loss": 0.4787, + "step": 35270 + }, + { + "epoch": 3.53368057294536, + "grad_norm": 1.8610239028930664, + "learning_rate": 9.919115747998059e-06, + "loss": 0.4291, + "step": 35280 + }, + { + "epoch": 3.534682225672359, + "grad_norm": 1.7383731603622437, + "learning_rate": 9.906545389392105e-06, + "loss": 0.4369, + "step": 35290 + }, + { + "epoch": 3.535683878399359, + "grad_norm": 3.8377766609191895, + "learning_rate": 9.89398103257813e-06, + "loss": 0.4582, + "step": 35300 + }, + { + "epoch": 3.5366855311263583, + "grad_norm": 1.6542972326278687, + "learning_rate": 9.88142268255223e-06, + "loss": 0.3911, + "step": 35310 + }, + { + "epoch": 3.537687183853358, + "grad_norm": 1.7561484575271606, + "learning_rate": 9.868870344308127e-06, + "loss": 0.3694, + "step": 35320 + }, + { + "epoch": 3.5386888365803575, + "grad_norm": 2.326295852661133, + "learning_rate": 9.856324022837177e-06, + "loss": 0.42, + "step": 35330 + }, + { + "epoch": 3.539690489307357, + "grad_norm": 1.6023072004318237, + "learning_rate": 9.843783723128328e-06, + "loss": 0.4183, + "step": 35340 + }, + { + "epoch": 3.5406921420343567, + "grad_norm": 2.0997536182403564, + "learning_rate": 9.8312494501681e-06, + "loss": 0.4093, + "step": 35350 + }, + { + "epoch": 3.5416937947613563, + "grad_norm": 2.6246097087860107, + "learning_rate": 9.818721208940674e-06, + "loss": 0.4217, + "step": 35360 + }, + { + "epoch": 3.542695447488356, + "grad_norm": 2.3198230266571045, + "learning_rate": 9.806199004427807e-06, + "loss": 0.4572, + "step": 35370 + }, + { + "epoch": 3.5436971002153554, + "grad_norm": 2.1230669021606445, + "learning_rate": 9.793682841608836e-06, + "loss": 0.4245, + "step": 35380 + }, + { + "epoch": 3.544698752942355, + "grad_norm": 2.395850896835327, + "learning_rate": 9.781172725460711e-06, + "loss": 0.4536, + "step": 35390 + }, + { + "epoch": 3.545700405669354, + "grad_norm": 2.717364549636841, + "learning_rate": 9.768668660957999e-06, + "loss": 0.4774, + "step": 35400 + }, + { + "epoch": 3.546702058396354, + "grad_norm": 2.4505763053894043, + "learning_rate": 9.756170653072844e-06, + "loss": 0.4056, + "step": 35410 + }, + { + "epoch": 3.5477037111233534, + "grad_norm": 3.048994779586792, + "learning_rate": 9.74367870677495e-06, + "loss": 0.4456, + "step": 35420 + }, + { + "epoch": 3.548705363850353, + "grad_norm": 2.351593017578125, + "learning_rate": 9.731192827031666e-06, + "loss": 0.4442, + "step": 35430 + }, + { + "epoch": 3.5497070165773525, + "grad_norm": 1.9275217056274414, + "learning_rate": 9.718713018807895e-06, + "loss": 0.4316, + "step": 35440 + }, + { + "epoch": 3.550708669304352, + "grad_norm": 2.4288742542266846, + "learning_rate": 9.706239287066132e-06, + "loss": 0.4615, + "step": 35450 + }, + { + "epoch": 3.5517103220313517, + "grad_norm": 2.2295823097229004, + "learning_rate": 9.69377163676646e-06, + "loss": 0.4165, + "step": 35460 + }, + { + "epoch": 3.5527119747583513, + "grad_norm": 2.53344464302063, + "learning_rate": 9.68131007286654e-06, + "loss": 0.483, + "step": 35470 + }, + { + "epoch": 3.553713627485351, + "grad_norm": 2.220214366912842, + "learning_rate": 9.668854600321612e-06, + "loss": 0.4092, + "step": 35480 + }, + { + "epoch": 3.5547152802123505, + "grad_norm": 2.255557060241699, + "learning_rate": 9.656405224084494e-06, + "loss": 0.4361, + "step": 35490 + }, + { + "epoch": 3.55571693293935, + "grad_norm": 2.5812127590179443, + "learning_rate": 9.643961949105584e-06, + "loss": 0.4721, + "step": 35500 + }, + { + "epoch": 3.5567185856663492, + "grad_norm": 1.8932257890701294, + "learning_rate": 9.631524780332851e-06, + "loss": 0.4126, + "step": 35510 + }, + { + "epoch": 3.5577202383933493, + "grad_norm": 2.0898537635803223, + "learning_rate": 9.619093722711833e-06, + "loss": 0.4027, + "step": 35520 + }, + { + "epoch": 3.5587218911203484, + "grad_norm": 2.0887091159820557, + "learning_rate": 9.606668781185646e-06, + "loss": 0.4123, + "step": 35530 + }, + { + "epoch": 3.559723543847348, + "grad_norm": 2.2083592414855957, + "learning_rate": 9.594249960694965e-06, + "loss": 0.4965, + "step": 35540 + }, + { + "epoch": 3.5607251965743476, + "grad_norm": 2.0207650661468506, + "learning_rate": 9.58183726617804e-06, + "loss": 0.4269, + "step": 35550 + }, + { + "epoch": 3.561726849301347, + "grad_norm": 1.9305944442749023, + "learning_rate": 9.569430702570672e-06, + "loss": 0.4924, + "step": 35560 + }, + { + "epoch": 3.562728502028347, + "grad_norm": 2.5445716381073, + "learning_rate": 9.557030274806239e-06, + "loss": 0.3865, + "step": 35570 + }, + { + "epoch": 3.5637301547553464, + "grad_norm": 2.159574031829834, + "learning_rate": 9.54463598781567e-06, + "loss": 0.4323, + "step": 35580 + }, + { + "epoch": 3.564731807482346, + "grad_norm": 2.1673364639282227, + "learning_rate": 9.532247846527453e-06, + "loss": 0.4703, + "step": 35590 + }, + { + "epoch": 3.5657334602093456, + "grad_norm": 2.177016496658325, + "learning_rate": 9.519865855867637e-06, + "loss": 0.4123, + "step": 35600 + }, + { + "epoch": 3.566735112936345, + "grad_norm": 1.9912105798721313, + "learning_rate": 9.507490020759818e-06, + "loss": 0.4372, + "step": 35610 + }, + { + "epoch": 3.5677367656633443, + "grad_norm": 1.8226884603500366, + "learning_rate": 9.495120346125145e-06, + "loss": 0.4321, + "step": 35620 + }, + { + "epoch": 3.5687384183903443, + "grad_norm": 1.9442039728164673, + "learning_rate": 9.48275683688234e-06, + "loss": 0.3922, + "step": 35630 + }, + { + "epoch": 3.5697400711173435, + "grad_norm": 1.86778724193573, + "learning_rate": 9.470399497947633e-06, + "loss": 0.4015, + "step": 35640 + }, + { + "epoch": 3.570741723844343, + "grad_norm": 2.015185832977295, + "learning_rate": 9.458048334234818e-06, + "loss": 0.4101, + "step": 35650 + }, + { + "epoch": 3.5717433765713427, + "grad_norm": 2.0542356967926025, + "learning_rate": 9.445703350655264e-06, + "loss": 0.4151, + "step": 35660 + }, + { + "epoch": 3.5727450292983423, + "grad_norm": 2.4986319541931152, + "learning_rate": 9.433364552117829e-06, + "loss": 0.4189, + "step": 35670 + }, + { + "epoch": 3.573746682025342, + "grad_norm": 2.3909826278686523, + "learning_rate": 9.421031943528941e-06, + "loss": 0.4168, + "step": 35680 + }, + { + "epoch": 3.5747483347523414, + "grad_norm": 2.0575599670410156, + "learning_rate": 9.408705529792577e-06, + "loss": 0.4215, + "step": 35690 + }, + { + "epoch": 3.575749987479341, + "grad_norm": 2.2897729873657227, + "learning_rate": 9.396385315810242e-06, + "loss": 0.4642, + "step": 35700 + }, + { + "epoch": 3.5767516402063406, + "grad_norm": 2.1848273277282715, + "learning_rate": 9.384071306480947e-06, + "loss": 0.4217, + "step": 35710 + }, + { + "epoch": 3.57775329293334, + "grad_norm": 2.4846177101135254, + "learning_rate": 9.371763506701265e-06, + "loss": 0.423, + "step": 35720 + }, + { + "epoch": 3.5787549456603394, + "grad_norm": 1.7969287633895874, + "learning_rate": 9.359461921365307e-06, + "loss": 0.4192, + "step": 35730 + }, + { + "epoch": 3.579756598387339, + "grad_norm": 2.2867469787597656, + "learning_rate": 9.347166555364704e-06, + "loss": 0.461, + "step": 35740 + }, + { + "epoch": 3.5807582511143385, + "grad_norm": 2.5535647869110107, + "learning_rate": 9.334877413588583e-06, + "loss": 0.4299, + "step": 35750 + }, + { + "epoch": 3.581759903841338, + "grad_norm": 1.96112859249115, + "learning_rate": 9.322594500923653e-06, + "loss": 0.4267, + "step": 35760 + }, + { + "epoch": 3.5827615565683377, + "grad_norm": 3.1883227825164795, + "learning_rate": 9.31031782225411e-06, + "loss": 0.4415, + "step": 35770 + }, + { + "epoch": 3.5837632092953373, + "grad_norm": 2.1661252975463867, + "learning_rate": 9.298047382461656e-06, + "loss": 0.46, + "step": 35780 + }, + { + "epoch": 3.584764862022337, + "grad_norm": 2.134152412414551, + "learning_rate": 9.285783186425559e-06, + "loss": 0.3904, + "step": 35790 + }, + { + "epoch": 3.5857665147493365, + "grad_norm": 2.428616523742676, + "learning_rate": 9.273525239022567e-06, + "loss": 0.4525, + "step": 35800 + }, + { + "epoch": 3.586768167476336, + "grad_norm": 2.4641940593719482, + "learning_rate": 9.26127354512696e-06, + "loss": 0.4485, + "step": 35810 + }, + { + "epoch": 3.5877698202033352, + "grad_norm": 2.321436643600464, + "learning_rate": 9.249028109610524e-06, + "loss": 0.4743, + "step": 35820 + }, + { + "epoch": 3.5887714729303353, + "grad_norm": 2.3309600353240967, + "learning_rate": 9.236788937342558e-06, + "loss": 0.4444, + "step": 35830 + }, + { + "epoch": 3.5897731256573344, + "grad_norm": 1.9578757286071777, + "learning_rate": 9.22455603318987e-06, + "loss": 0.4182, + "step": 35840 + }, + { + "epoch": 3.590774778384334, + "grad_norm": 1.8859965801239014, + "learning_rate": 9.212329402016784e-06, + "loss": 0.3995, + "step": 35850 + }, + { + "epoch": 3.5917764311113336, + "grad_norm": 2.524366855621338, + "learning_rate": 9.200109048685112e-06, + "loss": 0.4419, + "step": 35860 + }, + { + "epoch": 3.592778083838333, + "grad_norm": 2.6382808685302734, + "learning_rate": 9.18789497805419e-06, + "loss": 0.3921, + "step": 35870 + }, + { + "epoch": 3.593779736565333, + "grad_norm": 2.853111743927002, + "learning_rate": 9.175687194980839e-06, + "loss": 0.459, + "step": 35880 + }, + { + "epoch": 3.5947813892923324, + "grad_norm": 2.2526397705078125, + "learning_rate": 9.163485704319392e-06, + "loss": 0.4678, + "step": 35890 + }, + { + "epoch": 3.595783042019332, + "grad_norm": 1.7041195631027222, + "learning_rate": 9.15129051092167e-06, + "loss": 0.4143, + "step": 35900 + }, + { + "epoch": 3.5967846947463316, + "grad_norm": 1.7411357164382935, + "learning_rate": 9.139101619636995e-06, + "loss": 0.441, + "step": 35910 + }, + { + "epoch": 3.597786347473331, + "grad_norm": 1.8171757459640503, + "learning_rate": 9.126919035312186e-06, + "loss": 0.4283, + "step": 35920 + }, + { + "epoch": 3.5987880002003303, + "grad_norm": 2.0983874797821045, + "learning_rate": 9.114742762791547e-06, + "loss": 0.4392, + "step": 35930 + }, + { + "epoch": 3.5997896529273303, + "grad_norm": 2.409411907196045, + "learning_rate": 9.102572806916876e-06, + "loss": 0.4293, + "step": 35940 + }, + { + "epoch": 3.6007913056543295, + "grad_norm": 1.5346509218215942, + "learning_rate": 9.090409172527462e-06, + "loss": 0.3898, + "step": 35950 + }, + { + "epoch": 3.601792958381329, + "grad_norm": 1.5412535667419434, + "learning_rate": 9.078251864460074e-06, + "loss": 0.3976, + "step": 35960 + }, + { + "epoch": 3.6027946111083287, + "grad_norm": 1.62209153175354, + "learning_rate": 9.066100887548972e-06, + "loss": 0.4361, + "step": 35970 + }, + { + "epoch": 3.6037962638353283, + "grad_norm": 2.0511932373046875, + "learning_rate": 9.053956246625884e-06, + "loss": 0.4037, + "step": 35980 + }, + { + "epoch": 3.604797916562328, + "grad_norm": 1.7861130237579346, + "learning_rate": 9.041817946520054e-06, + "loss": 0.4244, + "step": 35990 + }, + { + "epoch": 3.6057995692893274, + "grad_norm": 1.4258286952972412, + "learning_rate": 9.029685992058159e-06, + "loss": 0.4323, + "step": 36000 + }, + { + "epoch": 3.606801222016327, + "grad_norm": 1.9314757585525513, + "learning_rate": 9.01756038806437e-06, + "loss": 0.4408, + "step": 36010 + }, + { + "epoch": 3.6078028747433266, + "grad_norm": 1.7380179166793823, + "learning_rate": 9.005441139360358e-06, + "loss": 0.3859, + "step": 36020 + }, + { + "epoch": 3.608804527470326, + "grad_norm": 2.3640384674072266, + "learning_rate": 8.99332825076524e-06, + "loss": 0.4027, + "step": 36030 + }, + { + "epoch": 3.6098061801973254, + "grad_norm": 3.133868455886841, + "learning_rate": 8.981221727095598e-06, + "loss": 0.4192, + "step": 36040 + }, + { + "epoch": 3.6108078329243254, + "grad_norm": 1.5783082246780396, + "learning_rate": 8.969121573165492e-06, + "loss": 0.4265, + "step": 36050 + }, + { + "epoch": 3.6118094856513245, + "grad_norm": 1.7218824625015259, + "learning_rate": 8.957027793786477e-06, + "loss": 0.4918, + "step": 36060 + }, + { + "epoch": 3.612811138378324, + "grad_norm": 2.190296173095703, + "learning_rate": 8.944940393767524e-06, + "loss": 0.4599, + "step": 36070 + }, + { + "epoch": 3.6138127911053237, + "grad_norm": 1.9028578996658325, + "learning_rate": 8.93285937791509e-06, + "loss": 0.3851, + "step": 36080 + }, + { + "epoch": 3.6148144438323233, + "grad_norm": 2.483093500137329, + "learning_rate": 8.920784751033115e-06, + "loss": 0.5199, + "step": 36090 + }, + { + "epoch": 3.615816096559323, + "grad_norm": 2.389422655105591, + "learning_rate": 8.908716517922972e-06, + "loss": 0.4206, + "step": 36100 + }, + { + "epoch": 3.6168177492863225, + "grad_norm": 2.4605555534362793, + "learning_rate": 8.89665468338348e-06, + "loss": 0.3872, + "step": 36110 + }, + { + "epoch": 3.617819402013322, + "grad_norm": 2.5835015773773193, + "learning_rate": 8.884599252210948e-06, + "loss": 0.422, + "step": 36120 + }, + { + "epoch": 3.6188210547403217, + "grad_norm": 2.2940011024475098, + "learning_rate": 8.872550229199128e-06, + "loss": 0.4322, + "step": 36130 + }, + { + "epoch": 3.6198227074673213, + "grad_norm": 2.135439872741699, + "learning_rate": 8.860507619139194e-06, + "loss": 0.3873, + "step": 36140 + }, + { + "epoch": 3.6208243601943204, + "grad_norm": 2.894531488418579, + "learning_rate": 8.848471426819813e-06, + "loss": 0.4451, + "step": 36150 + }, + { + "epoch": 3.6218260129213204, + "grad_norm": 2.076733350753784, + "learning_rate": 8.836441657027078e-06, + "loss": 0.4245, + "step": 36160 + }, + { + "epoch": 3.6228276656483196, + "grad_norm": 2.015641689300537, + "learning_rate": 8.82441831454453e-06, + "loss": 0.484, + "step": 36170 + }, + { + "epoch": 3.623829318375319, + "grad_norm": 2.4164772033691406, + "learning_rate": 8.812401404153153e-06, + "loss": 0.4671, + "step": 36180 + }, + { + "epoch": 3.6248309711023188, + "grad_norm": 2.1226491928100586, + "learning_rate": 8.80039093063138e-06, + "loss": 0.3972, + "step": 36190 + }, + { + "epoch": 3.6258326238293184, + "grad_norm": 2.2570536136627197, + "learning_rate": 8.788386898755075e-06, + "loss": 0.4411, + "step": 36200 + }, + { + "epoch": 3.626834276556318, + "grad_norm": 2.020395040512085, + "learning_rate": 8.776389313297551e-06, + "loss": 0.4889, + "step": 36210 + }, + { + "epoch": 3.6278359292833176, + "grad_norm": 2.354189157485962, + "learning_rate": 8.764398179029546e-06, + "loss": 0.4833, + "step": 36220 + }, + { + "epoch": 3.628837582010317, + "grad_norm": 1.986047625541687, + "learning_rate": 8.752413500719242e-06, + "loss": 0.3904, + "step": 36230 + }, + { + "epoch": 3.6298392347373167, + "grad_norm": 1.9563875198364258, + "learning_rate": 8.740435283132253e-06, + "loss": 0.4306, + "step": 36240 + }, + { + "epoch": 3.6308408874643163, + "grad_norm": 2.950375556945801, + "learning_rate": 8.72846353103162e-06, + "loss": 0.4219, + "step": 36250 + }, + { + "epoch": 3.6318425401913155, + "grad_norm": 2.0507826805114746, + "learning_rate": 8.716498249177814e-06, + "loss": 0.4712, + "step": 36260 + }, + { + "epoch": 3.632844192918315, + "grad_norm": 2.2555625438690186, + "learning_rate": 8.704539442328736e-06, + "loss": 0.4081, + "step": 36270 + }, + { + "epoch": 3.6338458456453147, + "grad_norm": 1.7647645473480225, + "learning_rate": 8.692587115239706e-06, + "loss": 0.425, + "step": 36280 + }, + { + "epoch": 3.6348474983723142, + "grad_norm": 2.237157106399536, + "learning_rate": 8.680641272663479e-06, + "loss": 0.3712, + "step": 36290 + }, + { + "epoch": 3.635849151099314, + "grad_norm": 1.650541067123413, + "learning_rate": 8.668701919350216e-06, + "loss": 0.4203, + "step": 36300 + }, + { + "epoch": 3.6368508038263134, + "grad_norm": 2.149442672729492, + "learning_rate": 8.656769060047504e-06, + "loss": 0.3975, + "step": 36310 + }, + { + "epoch": 3.637852456553313, + "grad_norm": 1.5350333452224731, + "learning_rate": 8.64484269950037e-06, + "loss": 0.3873, + "step": 36320 + }, + { + "epoch": 3.6388541092803126, + "grad_norm": 2.2094063758850098, + "learning_rate": 8.632922842451212e-06, + "loss": 0.3724, + "step": 36330 + }, + { + "epoch": 3.639855762007312, + "grad_norm": 2.3353826999664307, + "learning_rate": 8.621009493639867e-06, + "loss": 0.4148, + "step": 36340 + }, + { + "epoch": 3.6408574147343113, + "grad_norm": 2.6725997924804688, + "learning_rate": 8.60910265780361e-06, + "loss": 0.4197, + "step": 36350 + }, + { + "epoch": 3.6418590674613114, + "grad_norm": 2.070082664489746, + "learning_rate": 8.597202339677071e-06, + "loss": 0.3991, + "step": 36360 + }, + { + "epoch": 3.6428607201883105, + "grad_norm": 2.181931972503662, + "learning_rate": 8.585308543992329e-06, + "loss": 0.4527, + "step": 36370 + }, + { + "epoch": 3.64386237291531, + "grad_norm": 1.89093017578125, + "learning_rate": 8.573421275478844e-06, + "loss": 0.4001, + "step": 36380 + }, + { + "epoch": 3.6448640256423097, + "grad_norm": 3.096177101135254, + "learning_rate": 8.561540538863522e-06, + "loss": 0.4488, + "step": 36390 + }, + { + "epoch": 3.6458656783693093, + "grad_norm": 2.1697757244110107, + "learning_rate": 8.549666338870619e-06, + "loss": 0.4258, + "step": 36400 + }, + { + "epoch": 3.646867331096309, + "grad_norm": 2.5357141494750977, + "learning_rate": 8.537798680221808e-06, + "loss": 0.4039, + "step": 36410 + }, + { + "epoch": 3.6478689838233085, + "grad_norm": 2.1486408710479736, + "learning_rate": 8.525937567636208e-06, + "loss": 0.4074, + "step": 36420 + }, + { + "epoch": 3.648870636550308, + "grad_norm": 2.1422061920166016, + "learning_rate": 8.514083005830259e-06, + "loss": 0.449, + "step": 36430 + }, + { + "epoch": 3.6498722892773077, + "grad_norm": 2.3769538402557373, + "learning_rate": 8.502234999517839e-06, + "loss": 0.4385, + "step": 36440 + }, + { + "epoch": 3.6508739420043073, + "grad_norm": 2.6000874042510986, + "learning_rate": 8.490393553410225e-06, + "loss": 0.4101, + "step": 36450 + }, + { + "epoch": 3.6518755947313064, + "grad_norm": 2.888254165649414, + "learning_rate": 8.478558672216078e-06, + "loss": 0.4509, + "step": 36460 + }, + { + "epoch": 3.6528772474583064, + "grad_norm": 1.998104214668274, + "learning_rate": 8.46673036064142e-06, + "loss": 0.3462, + "step": 36470 + }, + { + "epoch": 3.6538789001853056, + "grad_norm": 1.9908857345581055, + "learning_rate": 8.454908623389706e-06, + "loss": 0.4169, + "step": 36480 + }, + { + "epoch": 3.654880552912305, + "grad_norm": 2.254089593887329, + "learning_rate": 8.44309346516175e-06, + "loss": 0.4894, + "step": 36490 + }, + { + "epoch": 3.6558822056393048, + "grad_norm": 2.5244266986846924, + "learning_rate": 8.431284890655752e-06, + "loss": 0.5155, + "step": 36500 + }, + { + "epoch": 3.6568838583663044, + "grad_norm": 2.26857328414917, + "learning_rate": 8.419482904567305e-06, + "loss": 0.4682, + "step": 36510 + }, + { + "epoch": 3.657885511093304, + "grad_norm": 3.812756299972534, + "learning_rate": 8.407687511589366e-06, + "loss": 0.4426, + "step": 36520 + }, + { + "epoch": 3.6588871638203035, + "grad_norm": 2.134424924850464, + "learning_rate": 8.395898716412293e-06, + "loss": 0.4639, + "step": 36530 + }, + { + "epoch": 3.659888816547303, + "grad_norm": 1.799182653427124, + "learning_rate": 8.384116523723778e-06, + "loss": 0.4578, + "step": 36540 + }, + { + "epoch": 3.6608904692743027, + "grad_norm": 2.2345874309539795, + "learning_rate": 8.372340938208948e-06, + "loss": 0.4465, + "step": 36550 + }, + { + "epoch": 3.6618921220013023, + "grad_norm": 1.7349672317504883, + "learning_rate": 8.36057196455025e-06, + "loss": 0.445, + "step": 36560 + }, + { + "epoch": 3.6628937747283015, + "grad_norm": 2.1134767532348633, + "learning_rate": 8.348809607427535e-06, + "loss": 0.4594, + "step": 36570 + }, + { + "epoch": 3.6638954274553015, + "grad_norm": 2.5449306964874268, + "learning_rate": 8.337053871518005e-06, + "loss": 0.4286, + "step": 36580 + }, + { + "epoch": 3.6648970801823006, + "grad_norm": 2.225423812866211, + "learning_rate": 8.325304761496234e-06, + "loss": 0.4111, + "step": 36590 + }, + { + "epoch": 3.6658987329093002, + "grad_norm": 1.876946210861206, + "learning_rate": 8.313562282034162e-06, + "loss": 0.419, + "step": 36600 + }, + { + "epoch": 3.6669003856363, + "grad_norm": 2.1918551921844482, + "learning_rate": 8.301826437801097e-06, + "loss": 0.4192, + "step": 36610 + }, + { + "epoch": 3.6679020383632994, + "grad_norm": 2.6870341300964355, + "learning_rate": 8.290097233463697e-06, + "loss": 0.386, + "step": 36620 + }, + { + "epoch": 3.668903691090299, + "grad_norm": 1.8027386665344238, + "learning_rate": 8.278374673685993e-06, + "loss": 0.4481, + "step": 36630 + }, + { + "epoch": 3.6699053438172986, + "grad_norm": 1.7095636129379272, + "learning_rate": 8.266658763129362e-06, + "loss": 0.4075, + "step": 36640 + }, + { + "epoch": 3.670906996544298, + "grad_norm": 1.323539137840271, + "learning_rate": 8.254949506452548e-06, + "loss": 0.4143, + "step": 36650 + }, + { + "epoch": 3.671908649271298, + "grad_norm": 2.6137101650238037, + "learning_rate": 8.243246908311639e-06, + "loss": 0.4067, + "step": 36660 + }, + { + "epoch": 3.6729103019982974, + "grad_norm": 2.2960166931152344, + "learning_rate": 8.231550973360072e-06, + "loss": 0.4166, + "step": 36670 + }, + { + "epoch": 3.6739119547252965, + "grad_norm": 1.6058300733566284, + "learning_rate": 8.219861706248672e-06, + "loss": 0.4128, + "step": 36680 + }, + { + "epoch": 3.6749136074522966, + "grad_norm": 1.9703116416931152, + "learning_rate": 8.208179111625552e-06, + "loss": 0.4412, + "step": 36690 + }, + { + "epoch": 3.6759152601792957, + "grad_norm": 2.711176633834839, + "learning_rate": 8.196503194136213e-06, + "loss": 0.4466, + "step": 36700 + }, + { + "epoch": 3.6769169129062953, + "grad_norm": 2.1494076251983643, + "learning_rate": 8.184833958423494e-06, + "loss": 0.4272, + "step": 36710 + }, + { + "epoch": 3.677918565633295, + "grad_norm": 2.330552339553833, + "learning_rate": 8.173171409127566e-06, + "loss": 0.4899, + "step": 36720 + }, + { + "epoch": 3.6789202183602945, + "grad_norm": 2.2153825759887695, + "learning_rate": 8.161515550885959e-06, + "loss": 0.3723, + "step": 36730 + }, + { + "epoch": 3.679921871087294, + "grad_norm": 1.7207276821136475, + "learning_rate": 8.149866388333515e-06, + "loss": 0.397, + "step": 36740 + }, + { + "epoch": 3.6809235238142937, + "grad_norm": 2.033440351486206, + "learning_rate": 8.138223926102462e-06, + "loss": 0.4444, + "step": 36750 + }, + { + "epoch": 3.6819251765412933, + "grad_norm": 2.009399175643921, + "learning_rate": 8.126588168822303e-06, + "loss": 0.4054, + "step": 36760 + }, + { + "epoch": 3.682926829268293, + "grad_norm": 1.952618956565857, + "learning_rate": 8.114959121119903e-06, + "loss": 0.3859, + "step": 36770 + }, + { + "epoch": 3.6839284819952924, + "grad_norm": 1.7253168821334839, + "learning_rate": 8.10333678761948e-06, + "loss": 0.3954, + "step": 36780 + }, + { + "epoch": 3.6849301347222916, + "grad_norm": 1.8752772808074951, + "learning_rate": 8.091721172942566e-06, + "loss": 0.4315, + "step": 36790 + }, + { + "epoch": 3.685931787449291, + "grad_norm": 1.8806124925613403, + "learning_rate": 8.080112281707985e-06, + "loss": 0.4244, + "step": 36800 + }, + { + "epoch": 3.6869334401762908, + "grad_norm": 2.6830246448516846, + "learning_rate": 8.068510118531949e-06, + "loss": 0.4661, + "step": 36810 + }, + { + "epoch": 3.6879350929032904, + "grad_norm": 2.2413887977600098, + "learning_rate": 8.056914688027964e-06, + "loss": 0.4661, + "step": 36820 + }, + { + "epoch": 3.68893674563029, + "grad_norm": 2.0021140575408936, + "learning_rate": 8.045325994806838e-06, + "loss": 0.4098, + "step": 36830 + }, + { + "epoch": 3.6899383983572895, + "grad_norm": 1.5996559858322144, + "learning_rate": 8.033744043476749e-06, + "loss": 0.4021, + "step": 36840 + }, + { + "epoch": 3.690940051084289, + "grad_norm": 3.1006476879119873, + "learning_rate": 8.022168838643152e-06, + "loss": 0.4296, + "step": 36850 + }, + { + "epoch": 3.6919417038112887, + "grad_norm": 2.16410231590271, + "learning_rate": 8.010600384908848e-06, + "loss": 0.4847, + "step": 36860 + }, + { + "epoch": 3.6929433565382883, + "grad_norm": 2.522000312805176, + "learning_rate": 7.99903868687392e-06, + "loss": 0.4312, + "step": 36870 + }, + { + "epoch": 3.6939450092652875, + "grad_norm": 1.8807637691497803, + "learning_rate": 7.9874837491358e-06, + "loss": 0.5074, + "step": 36880 + }, + { + "epoch": 3.6949466619922875, + "grad_norm": 1.5786900520324707, + "learning_rate": 7.975935576289218e-06, + "loss": 0.3864, + "step": 36890 + }, + { + "epoch": 3.6959483147192866, + "grad_norm": 2.4154186248779297, + "learning_rate": 7.964394172926206e-06, + "loss": 0.4542, + "step": 36900 + }, + { + "epoch": 3.6969499674462862, + "grad_norm": 1.6693038940429688, + "learning_rate": 7.952859543636116e-06, + "loss": 0.4141, + "step": 36910 + }, + { + "epoch": 3.697951620173286, + "grad_norm": 2.0245635509490967, + "learning_rate": 7.941331693005599e-06, + "loss": 0.4179, + "step": 36920 + }, + { + "epoch": 3.6989532729002854, + "grad_norm": 1.8546513319015503, + "learning_rate": 7.929810625618612e-06, + "loss": 0.4161, + "step": 36930 + }, + { + "epoch": 3.699954925627285, + "grad_norm": 1.8944015502929688, + "learning_rate": 7.91829634605642e-06, + "loss": 0.432, + "step": 36940 + }, + { + "epoch": 3.7009565783542846, + "grad_norm": 2.0617423057556152, + "learning_rate": 7.906788858897579e-06, + "loss": 0.4466, + "step": 36950 + }, + { + "epoch": 3.701958231081284, + "grad_norm": 2.174147129058838, + "learning_rate": 7.895288168717951e-06, + "loss": 0.4087, + "step": 36960 + }, + { + "epoch": 3.702959883808284, + "grad_norm": 2.1996078491210938, + "learning_rate": 7.883794280090698e-06, + "loss": 0.3939, + "step": 36970 + }, + { + "epoch": 3.7039615365352834, + "grad_norm": 1.9552979469299316, + "learning_rate": 7.872307197586271e-06, + "loss": 0.4228, + "step": 36980 + }, + { + "epoch": 3.7049631892622825, + "grad_norm": 2.0555949211120605, + "learning_rate": 7.860826925772414e-06, + "loss": 0.4539, + "step": 36990 + }, + { + "epoch": 3.7059648419892826, + "grad_norm": 2.5038888454437256, + "learning_rate": 7.849353469214165e-06, + "loss": 0.4057, + "step": 37000 + }, + { + "epoch": 3.7069664947162817, + "grad_norm": 2.1329166889190674, + "learning_rate": 7.837886832473859e-06, + "loss": 0.4751, + "step": 37010 + }, + { + "epoch": 3.7079681474432813, + "grad_norm": 2.378232479095459, + "learning_rate": 7.826427020111107e-06, + "loss": 0.4125, + "step": 37020 + }, + { + "epoch": 3.708969800170281, + "grad_norm": 2.8863255977630615, + "learning_rate": 7.814974036682814e-06, + "loss": 0.4706, + "step": 37030 + }, + { + "epoch": 3.7099714528972805, + "grad_norm": 1.9137957096099854, + "learning_rate": 7.80352788674317e-06, + "loss": 0.4149, + "step": 37040 + }, + { + "epoch": 3.71097310562428, + "grad_norm": 1.9049749374389648, + "learning_rate": 7.792088574843643e-06, + "loss": 0.3909, + "step": 37050 + }, + { + "epoch": 3.7119747583512797, + "grad_norm": 2.699704647064209, + "learning_rate": 7.780656105532983e-06, + "loss": 0.4085, + "step": 37060 + }, + { + "epoch": 3.7129764110782792, + "grad_norm": 2.358980178833008, + "learning_rate": 7.769230483357212e-06, + "loss": 0.4699, + "step": 37070 + }, + { + "epoch": 3.713978063805279, + "grad_norm": 2.233934164047241, + "learning_rate": 7.757811712859665e-06, + "loss": 0.4224, + "step": 37080 + }, + { + "epoch": 3.7149797165322784, + "grad_norm": 2.1187474727630615, + "learning_rate": 7.7463997985809e-06, + "loss": 0.4722, + "step": 37090 + }, + { + "epoch": 3.7159813692592776, + "grad_norm": 2.188882350921631, + "learning_rate": 7.734994745058771e-06, + "loss": 0.4373, + "step": 37100 + }, + { + "epoch": 3.7169830219862776, + "grad_norm": 2.354186773300171, + "learning_rate": 7.723596556828434e-06, + "loss": 0.4367, + "step": 37110 + }, + { + "epoch": 3.7179846747132768, + "grad_norm": 2.162371873855591, + "learning_rate": 7.71220523842226e-06, + "loss": 0.3975, + "step": 37120 + }, + { + "epoch": 3.7189863274402764, + "grad_norm": 2.0304088592529297, + "learning_rate": 7.700820794369923e-06, + "loss": 0.4807, + "step": 37130 + }, + { + "epoch": 3.719987980167276, + "grad_norm": 1.8520346879959106, + "learning_rate": 7.689443229198365e-06, + "loss": 0.4204, + "step": 37140 + }, + { + "epoch": 3.7209896328942755, + "grad_norm": 1.8495360612869263, + "learning_rate": 7.678072547431787e-06, + "loss": 0.4181, + "step": 37150 + }, + { + "epoch": 3.721991285621275, + "grad_norm": 2.0318777561187744, + "learning_rate": 7.666708753591626e-06, + "loss": 0.4456, + "step": 37160 + }, + { + "epoch": 3.7229929383482747, + "grad_norm": 2.1289327144622803, + "learning_rate": 7.655351852196627e-06, + "loss": 0.434, + "step": 37170 + }, + { + "epoch": 3.7239945910752743, + "grad_norm": 2.1364686489105225, + "learning_rate": 7.644001847762774e-06, + "loss": 0.4215, + "step": 37180 + }, + { + "epoch": 3.724996243802274, + "grad_norm": 2.0084264278411865, + "learning_rate": 7.63265874480329e-06, + "loss": 0.3936, + "step": 37190 + }, + { + "epoch": 3.7259978965292735, + "grad_norm": 1.8063205480575562, + "learning_rate": 7.621322547828663e-06, + "loss": 0.3598, + "step": 37200 + }, + { + "epoch": 3.7269995492562726, + "grad_norm": 2.089787006378174, + "learning_rate": 7.609993261346668e-06, + "loss": 0.4059, + "step": 37210 + }, + { + "epoch": 3.7280012019832727, + "grad_norm": 1.835331678390503, + "learning_rate": 7.598670889862297e-06, + "loss": 0.4411, + "step": 37220 + }, + { + "epoch": 3.729002854710272, + "grad_norm": 2.2048423290252686, + "learning_rate": 7.587355437877777e-06, + "loss": 0.4427, + "step": 37230 + }, + { + "epoch": 3.7300045074372714, + "grad_norm": 2.705153703689575, + "learning_rate": 7.576046909892637e-06, + "loss": 0.4109, + "step": 37240 + }, + { + "epoch": 3.731006160164271, + "grad_norm": 1.9492063522338867, + "learning_rate": 7.564745310403612e-06, + "loss": 0.3932, + "step": 37250 + }, + { + "epoch": 3.7320078128912706, + "grad_norm": 2.270702600479126, + "learning_rate": 7.553450643904692e-06, + "loss": 0.3898, + "step": 37260 + }, + { + "epoch": 3.73300946561827, + "grad_norm": 1.9737622737884521, + "learning_rate": 7.542162914887111e-06, + "loss": 0.4094, + "step": 37270 + }, + { + "epoch": 3.7340111183452698, + "grad_norm": 2.8277175426483154, + "learning_rate": 7.530882127839348e-06, + "loss": 0.3824, + "step": 37280 + }, + { + "epoch": 3.7350127710722694, + "grad_norm": 2.938880205154419, + "learning_rate": 7.519608287247113e-06, + "loss": 0.4285, + "step": 37290 + }, + { + "epoch": 3.736014423799269, + "grad_norm": 2.2028396129608154, + "learning_rate": 7.508341397593363e-06, + "loss": 0.3687, + "step": 37300 + }, + { + "epoch": 3.7370160765262685, + "grad_norm": 2.1324620246887207, + "learning_rate": 7.497081463358286e-06, + "loss": 0.4436, + "step": 37310 + }, + { + "epoch": 3.7380177292532677, + "grad_norm": 2.0066702365875244, + "learning_rate": 7.485828489019303e-06, + "loss": 0.369, + "step": 37320 + }, + { + "epoch": 3.7390193819802673, + "grad_norm": 2.5639231204986572, + "learning_rate": 7.47458247905107e-06, + "loss": 0.4182, + "step": 37330 + }, + { + "epoch": 3.740021034707267, + "grad_norm": 2.4227943420410156, + "learning_rate": 7.463343437925477e-06, + "loss": 0.4117, + "step": 37340 + }, + { + "epoch": 3.7410226874342665, + "grad_norm": 1.719071626663208, + "learning_rate": 7.452111370111636e-06, + "loss": 0.4173, + "step": 37350 + }, + { + "epoch": 3.742024340161266, + "grad_norm": 2.1645774841308594, + "learning_rate": 7.440886280075887e-06, + "loss": 0.4216, + "step": 37360 + }, + { + "epoch": 3.7430259928882657, + "grad_norm": 2.005495309829712, + "learning_rate": 7.429668172281803e-06, + "loss": 0.4413, + "step": 37370 + }, + { + "epoch": 3.7440276456152652, + "grad_norm": 2.228274345397949, + "learning_rate": 7.418457051190173e-06, + "loss": 0.3805, + "step": 37380 + }, + { + "epoch": 3.745029298342265, + "grad_norm": 2.2738568782806396, + "learning_rate": 7.407252921259008e-06, + "loss": 0.4104, + "step": 37390 + }, + { + "epoch": 3.7460309510692644, + "grad_norm": 2.142463445663452, + "learning_rate": 7.396055786943543e-06, + "loss": 0.4443, + "step": 37400 + }, + { + "epoch": 3.7470326037962636, + "grad_norm": 2.024423122406006, + "learning_rate": 7.3848656526962295e-06, + "loss": 0.3953, + "step": 37410 + }, + { + "epoch": 3.7480342565232636, + "grad_norm": 2.5991528034210205, + "learning_rate": 7.373682522966735e-06, + "loss": 0.4175, + "step": 37420 + }, + { + "epoch": 3.7490359092502628, + "grad_norm": 1.819032907485962, + "learning_rate": 7.362506402201932e-06, + "loss": 0.46, + "step": 37430 + }, + { + "epoch": 3.7500375619772623, + "grad_norm": 2.4464993476867676, + "learning_rate": 7.351337294845942e-06, + "loss": 0.4255, + "step": 37440 + }, + { + "epoch": 3.751039214704262, + "grad_norm": 2.157233715057373, + "learning_rate": 7.340175205340044e-06, + "loss": 0.4405, + "step": 37450 + }, + { + "epoch": 3.7520408674312615, + "grad_norm": 2.148372173309326, + "learning_rate": 7.329020138122761e-06, + "loss": 0.4301, + "step": 37460 + }, + { + "epoch": 3.753042520158261, + "grad_norm": 2.149197578430176, + "learning_rate": 7.317872097629836e-06, + "loss": 0.4719, + "step": 37470 + }, + { + "epoch": 3.7540441728852607, + "grad_norm": 2.0958821773529053, + "learning_rate": 7.306731088294172e-06, + "loss": 0.4272, + "step": 37480 + }, + { + "epoch": 3.7550458256122603, + "grad_norm": 2.766040086746216, + "learning_rate": 7.295597114545907e-06, + "loss": 0.4578, + "step": 37490 + }, + { + "epoch": 3.75604747833926, + "grad_norm": 2.5304043292999268, + "learning_rate": 7.284470180812392e-06, + "loss": 0.4636, + "step": 37500 + }, + { + "epoch": 3.7570491310662595, + "grad_norm": 2.753812313079834, + "learning_rate": 7.2733502915181604e-06, + "loss": 0.4124, + "step": 37510 + }, + { + "epoch": 3.7580507837932586, + "grad_norm": 1.706272840499878, + "learning_rate": 7.262237451084938e-06, + "loss": 0.3704, + "step": 37520 + }, + { + "epoch": 3.7590524365202587, + "grad_norm": 2.44985294342041, + "learning_rate": 7.251131663931652e-06, + "loss": 0.4415, + "step": 37530 + }, + { + "epoch": 3.760054089247258, + "grad_norm": 1.9321026802062988, + "learning_rate": 7.240032934474447e-06, + "loss": 0.4082, + "step": 37540 + }, + { + "epoch": 3.7610557419742574, + "grad_norm": 2.1816911697387695, + "learning_rate": 7.228941267126646e-06, + "loss": 0.4284, + "step": 37550 + }, + { + "epoch": 3.762057394701257, + "grad_norm": 2.1587181091308594, + "learning_rate": 7.217856666298736e-06, + "loss": 0.4536, + "step": 37560 + }, + { + "epoch": 3.7630590474282566, + "grad_norm": 2.2277698516845703, + "learning_rate": 7.206779136398445e-06, + "loss": 0.3842, + "step": 37570 + }, + { + "epoch": 3.764060700155256, + "grad_norm": 2.4164507389068604, + "learning_rate": 7.195708681830665e-06, + "loss": 0.3984, + "step": 37580 + }, + { + "epoch": 3.7650623528822558, + "grad_norm": 2.4073264598846436, + "learning_rate": 7.18464530699745e-06, + "loss": 0.3684, + "step": 37590 + }, + { + "epoch": 3.7660640056092554, + "grad_norm": 2.5888216495513916, + "learning_rate": 7.1735890162980855e-06, + "loss": 0.4355, + "step": 37600 + }, + { + "epoch": 3.767065658336255, + "grad_norm": 2.342641592025757, + "learning_rate": 7.162539814129013e-06, + "loss": 0.4305, + "step": 37610 + }, + { + "epoch": 3.7680673110632545, + "grad_norm": 2.2944300174713135, + "learning_rate": 7.151497704883855e-06, + "loss": 0.4233, + "step": 37620 + }, + { + "epoch": 3.7690689637902537, + "grad_norm": 2.190720796585083, + "learning_rate": 7.1404626929534206e-06, + "loss": 0.4602, + "step": 37630 + }, + { + "epoch": 3.7700706165172537, + "grad_norm": 2.474369764328003, + "learning_rate": 7.129434782725697e-06, + "loss": 0.4556, + "step": 37640 + }, + { + "epoch": 3.771072269244253, + "grad_norm": 3.5644686222076416, + "learning_rate": 7.118413978585839e-06, + "loss": 0.4676, + "step": 37650 + }, + { + "epoch": 3.7720739219712525, + "grad_norm": 2.2803492546081543, + "learning_rate": 7.107400284916185e-06, + "loss": 0.4021, + "step": 37660 + }, + { + "epoch": 3.773075574698252, + "grad_norm": 2.6325793266296387, + "learning_rate": 7.0963937060962435e-06, + "loss": 0.4832, + "step": 37670 + }, + { + "epoch": 3.7740772274252516, + "grad_norm": 2.1822357177734375, + "learning_rate": 7.085394246502692e-06, + "loss": 0.4158, + "step": 37680 + }, + { + "epoch": 3.7750788801522512, + "grad_norm": 2.2507176399230957, + "learning_rate": 7.074401910509376e-06, + "loss": 0.4426, + "step": 37690 + }, + { + "epoch": 3.776080532879251, + "grad_norm": 1.774174690246582, + "learning_rate": 7.063416702487313e-06, + "loss": 0.3955, + "step": 37700 + }, + { + "epoch": 3.7770821856062504, + "grad_norm": 2.155637264251709, + "learning_rate": 7.05243862680468e-06, + "loss": 0.3967, + "step": 37710 + }, + { + "epoch": 3.77808383833325, + "grad_norm": 2.492680311203003, + "learning_rate": 7.041467687826825e-06, + "loss": 0.4521, + "step": 37720 + }, + { + "epoch": 3.7790854910602496, + "grad_norm": 1.9159152507781982, + "learning_rate": 7.030503889916254e-06, + "loss": 0.4482, + "step": 37730 + }, + { + "epoch": 3.7800871437872487, + "grad_norm": 2.340388298034668, + "learning_rate": 7.0195472374326335e-06, + "loss": 0.4927, + "step": 37740 + }, + { + "epoch": 3.781088796514249, + "grad_norm": 2.2713913917541504, + "learning_rate": 7.008597734732786e-06, + "loss": 0.4053, + "step": 37750 + }, + { + "epoch": 3.782090449241248, + "grad_norm": 1.636338710784912, + "learning_rate": 6.997655386170698e-06, + "loss": 0.4071, + "step": 37760 + }, + { + "epoch": 3.7830921019682475, + "grad_norm": 1.8299113512039185, + "learning_rate": 6.986720196097507e-06, + "loss": 0.4461, + "step": 37770 + }, + { + "epoch": 3.784093754695247, + "grad_norm": 2.3852038383483887, + "learning_rate": 6.9757921688615e-06, + "loss": 0.4631, + "step": 37780 + }, + { + "epoch": 3.7850954074222467, + "grad_norm": 2.396932363510132, + "learning_rate": 6.964871308808118e-06, + "loss": 0.4139, + "step": 37790 + }, + { + "epoch": 3.7860970601492463, + "grad_norm": 1.9309535026550293, + "learning_rate": 6.953957620279971e-06, + "loss": 0.4465, + "step": 37800 + }, + { + "epoch": 3.787098712876246, + "grad_norm": 2.1654818058013916, + "learning_rate": 6.943051107616785e-06, + "loss": 0.3709, + "step": 37810 + }, + { + "epoch": 3.7881003656032455, + "grad_norm": 3.310075283050537, + "learning_rate": 6.93215177515544e-06, + "loss": 0.4081, + "step": 37820 + }, + { + "epoch": 3.789102018330245, + "grad_norm": 2.255657434463501, + "learning_rate": 6.921259627229989e-06, + "loss": 0.4686, + "step": 37830 + }, + { + "epoch": 3.7901036710572447, + "grad_norm": 1.773685336112976, + "learning_rate": 6.910374668171607e-06, + "loss": 0.4184, + "step": 37840 + }, + { + "epoch": 3.791105323784244, + "grad_norm": 1.9942299127578735, + "learning_rate": 6.899496902308592e-06, + "loss": 0.4079, + "step": 37850 + }, + { + "epoch": 3.7921069765112434, + "grad_norm": 2.451260805130005, + "learning_rate": 6.888626333966405e-06, + "loss": 0.4615, + "step": 37860 + }, + { + "epoch": 3.793108629238243, + "grad_norm": 2.6214842796325684, + "learning_rate": 6.877762967467666e-06, + "loss": 0.4257, + "step": 37870 + }, + { + "epoch": 3.7941102819652426, + "grad_norm": 2.6001267433166504, + "learning_rate": 6.866906807132079e-06, + "loss": 0.4699, + "step": 37880 + }, + { + "epoch": 3.795111934692242, + "grad_norm": 2.7137014865875244, + "learning_rate": 6.856057857276507e-06, + "loss": 0.4248, + "step": 37890 + }, + { + "epoch": 3.7961135874192418, + "grad_norm": 2.153409957885742, + "learning_rate": 6.845216122214973e-06, + "loss": 0.4262, + "step": 37900 + }, + { + "epoch": 3.7971152401462414, + "grad_norm": 2.0230278968811035, + "learning_rate": 6.834381606258597e-06, + "loss": 0.4167, + "step": 37910 + }, + { + "epoch": 3.798116892873241, + "grad_norm": 1.8823751211166382, + "learning_rate": 6.823554313715619e-06, + "loss": 0.4419, + "step": 37920 + }, + { + "epoch": 3.7991185456002405, + "grad_norm": 2.529693126678467, + "learning_rate": 6.81273424889145e-06, + "loss": 0.4636, + "step": 37930 + }, + { + "epoch": 3.8001201983272397, + "grad_norm": 2.7344167232513428, + "learning_rate": 6.801921416088597e-06, + "loss": 0.417, + "step": 37940 + }, + { + "epoch": 3.8011218510542397, + "grad_norm": 2.9459967613220215, + "learning_rate": 6.791115819606689e-06, + "loss": 0.4455, + "step": 37950 + }, + { + "epoch": 3.802123503781239, + "grad_norm": 2.277332305908203, + "learning_rate": 6.780317463742492e-06, + "loss": 0.4097, + "step": 37960 + }, + { + "epoch": 3.8031251565082385, + "grad_norm": 2.0806617736816406, + "learning_rate": 6.769526352789882e-06, + "loss": 0.4105, + "step": 37970 + }, + { + "epoch": 3.804126809235238, + "grad_norm": 2.3580572605133057, + "learning_rate": 6.75874249103986e-06, + "loss": 0.4319, + "step": 37980 + }, + { + "epoch": 3.8051284619622376, + "grad_norm": 2.0618369579315186, + "learning_rate": 6.7479658827805435e-06, + "loss": 0.4417, + "step": 37990 + }, + { + "epoch": 3.8061301146892372, + "grad_norm": 2.297889471054077, + "learning_rate": 6.737196532297163e-06, + "loss": 0.4363, + "step": 38000 + }, + { + "epoch": 3.807131767416237, + "grad_norm": 1.9872561693191528, + "learning_rate": 6.726434443872071e-06, + "loss": 0.4912, + "step": 38010 + }, + { + "epoch": 3.8081334201432364, + "grad_norm": 1.8478449583053589, + "learning_rate": 6.7156796217847155e-06, + "loss": 0.3763, + "step": 38020 + }, + { + "epoch": 3.809135072870236, + "grad_norm": 1.7361677885055542, + "learning_rate": 6.7049320703116756e-06, + "loss": 0.4135, + "step": 38030 + }, + { + "epoch": 3.8101367255972356, + "grad_norm": 2.6249921321868896, + "learning_rate": 6.694191793726623e-06, + "loss": 0.3936, + "step": 38040 + }, + { + "epoch": 3.8111383783242347, + "grad_norm": 1.9055705070495605, + "learning_rate": 6.683458796300349e-06, + "loss": 0.4196, + "step": 38050 + }, + { + "epoch": 3.812140031051235, + "grad_norm": 1.5801509618759155, + "learning_rate": 6.672733082300739e-06, + "loss": 0.3927, + "step": 38060 + }, + { + "epoch": 3.813141683778234, + "grad_norm": 2.310429573059082, + "learning_rate": 6.662014655992791e-06, + "loss": 0.4781, + "step": 38070 + }, + { + "epoch": 3.8141433365052335, + "grad_norm": 2.7247142791748047, + "learning_rate": 6.651303521638599e-06, + "loss": 0.4488, + "step": 38080 + }, + { + "epoch": 3.815144989232233, + "grad_norm": 1.6246259212493896, + "learning_rate": 6.640599683497364e-06, + "loss": 0.3836, + "step": 38090 + }, + { + "epoch": 3.8161466419592327, + "grad_norm": 1.9836045503616333, + "learning_rate": 6.629903145825378e-06, + "loss": 0.3884, + "step": 38100 + }, + { + "epoch": 3.8171482946862323, + "grad_norm": 1.8660048246383667, + "learning_rate": 6.619213912876038e-06, + "loss": 0.3775, + "step": 38110 + }, + { + "epoch": 3.818149947413232, + "grad_norm": 2.0846478939056396, + "learning_rate": 6.608531988899822e-06, + "loss": 0.4793, + "step": 38120 + }, + { + "epoch": 3.8191516001402315, + "grad_norm": 2.101534366607666, + "learning_rate": 6.5978573781443346e-06, + "loss": 0.4689, + "step": 38130 + }, + { + "epoch": 3.820153252867231, + "grad_norm": 2.1078479290008545, + "learning_rate": 6.587190084854228e-06, + "loss": 0.4624, + "step": 38140 + }, + { + "epoch": 3.8211549055942307, + "grad_norm": 1.6181825399398804, + "learning_rate": 6.576530113271265e-06, + "loss": 0.4424, + "step": 38150 + }, + { + "epoch": 3.82215655832123, + "grad_norm": 1.6215962171554565, + "learning_rate": 6.565877467634324e-06, + "loss": 0.4434, + "step": 38160 + }, + { + "epoch": 3.82315821104823, + "grad_norm": 2.2983250617980957, + "learning_rate": 6.5552321521793195e-06, + "loss": 0.4694, + "step": 38170 + }, + { + "epoch": 3.824159863775229, + "grad_norm": 2.1482555866241455, + "learning_rate": 6.5445941711392845e-06, + "loss": 0.4466, + "step": 38180 + }, + { + "epoch": 3.8251615165022286, + "grad_norm": 2.240410804748535, + "learning_rate": 6.533963528744319e-06, + "loss": 0.4196, + "step": 38190 + }, + { + "epoch": 3.826163169229228, + "grad_norm": 2.4457499980926514, + "learning_rate": 6.523340229221639e-06, + "loss": 0.4329, + "step": 38200 + }, + { + "epoch": 3.8271648219562278, + "grad_norm": 2.4264252185821533, + "learning_rate": 6.51272427679549e-06, + "loss": 0.3891, + "step": 38210 + }, + { + "epoch": 3.8281664746832273, + "grad_norm": 1.8337993621826172, + "learning_rate": 6.5021156756872175e-06, + "loss": 0.3847, + "step": 38220 + }, + { + "epoch": 3.829168127410227, + "grad_norm": 3.0118589401245117, + "learning_rate": 6.491514430115278e-06, + "loss": 0.4514, + "step": 38230 + }, + { + "epoch": 3.8301697801372265, + "grad_norm": 2.0712578296661377, + "learning_rate": 6.480920544295141e-06, + "loss": 0.4276, + "step": 38240 + }, + { + "epoch": 3.831171432864226, + "grad_norm": 1.8239521980285645, + "learning_rate": 6.4703340224393895e-06, + "loss": 0.3844, + "step": 38250 + }, + { + "epoch": 3.8321730855912257, + "grad_norm": 2.277188301086426, + "learning_rate": 6.459754868757675e-06, + "loss": 0.475, + "step": 38260 + }, + { + "epoch": 3.833174738318225, + "grad_norm": 2.219277858734131, + "learning_rate": 6.449183087456723e-06, + "loss": 0.4694, + "step": 38270 + }, + { + "epoch": 3.834176391045225, + "grad_norm": 2.4775564670562744, + "learning_rate": 6.4386186827402884e-06, + "loss": 0.4562, + "step": 38280 + }, + { + "epoch": 3.835178043772224, + "grad_norm": 2.250697612762451, + "learning_rate": 6.428061658809248e-06, + "loss": 0.4658, + "step": 38290 + }, + { + "epoch": 3.8361796964992236, + "grad_norm": 2.2010769844055176, + "learning_rate": 6.41751201986151e-06, + "loss": 0.4638, + "step": 38300 + }, + { + "epoch": 3.8371813492262232, + "grad_norm": 2.2128689289093018, + "learning_rate": 6.406969770092056e-06, + "loss": 0.4676, + "step": 38310 + }, + { + "epoch": 3.838183001953223, + "grad_norm": 2.170311450958252, + "learning_rate": 6.396434913692928e-06, + "loss": 0.4089, + "step": 38320 + }, + { + "epoch": 3.8391846546802224, + "grad_norm": 2.4297749996185303, + "learning_rate": 6.385907454853224e-06, + "loss": 0.4224, + "step": 38330 + }, + { + "epoch": 3.840186307407222, + "grad_norm": 1.6787548065185547, + "learning_rate": 6.375387397759114e-06, + "loss": 0.4002, + "step": 38340 + }, + { + "epoch": 3.8411879601342216, + "grad_norm": 2.9497814178466797, + "learning_rate": 6.364874746593793e-06, + "loss": 0.4372, + "step": 38350 + }, + { + "epoch": 3.8421896128612207, + "grad_norm": 2.112044334411621, + "learning_rate": 6.354369505537555e-06, + "loss": 0.4423, + "step": 38360 + }, + { + "epoch": 3.8431912655882208, + "grad_norm": 1.6409683227539062, + "learning_rate": 6.343871678767715e-06, + "loss": 0.4345, + "step": 38370 + }, + { + "epoch": 3.84419291831522, + "grad_norm": 2.136765241622925, + "learning_rate": 6.333381270458655e-06, + "loss": 0.4651, + "step": 38380 + }, + { + "epoch": 3.8451945710422195, + "grad_norm": 2.24963641166687, + "learning_rate": 6.322898284781801e-06, + "loss": 0.4305, + "step": 38390 + }, + { + "epoch": 3.846196223769219, + "grad_norm": 2.395167112350464, + "learning_rate": 6.312422725905626e-06, + "loss": 0.427, + "step": 38400 + }, + { + "epoch": 3.8471978764962187, + "grad_norm": 2.715379238128662, + "learning_rate": 6.3019545979956545e-06, + "loss": 0.483, + "step": 38410 + }, + { + "epoch": 3.8481995292232183, + "grad_norm": 2.4242758750915527, + "learning_rate": 6.291493905214454e-06, + "loss": 0.3938, + "step": 38420 + }, + { + "epoch": 3.849201181950218, + "grad_norm": 2.016488790512085, + "learning_rate": 6.281040651721637e-06, + "loss": 0.4412, + "step": 38430 + }, + { + "epoch": 3.8502028346772175, + "grad_norm": 2.018977403640747, + "learning_rate": 6.270594841673852e-06, + "loss": 0.4535, + "step": 38440 + }, + { + "epoch": 3.851204487404217, + "grad_norm": 2.2235140800476074, + "learning_rate": 6.260156479224797e-06, + "loss": 0.4212, + "step": 38450 + }, + { + "epoch": 3.8522061401312166, + "grad_norm": 2.009124279022217, + "learning_rate": 6.2497255685251995e-06, + "loss": 0.3964, + "step": 38460 + }, + { + "epoch": 3.853207792858216, + "grad_norm": 2.90751576423645, + "learning_rate": 6.239302113722833e-06, + "loss": 0.4423, + "step": 38470 + }, + { + "epoch": 3.854209445585216, + "grad_norm": 2.9282796382904053, + "learning_rate": 6.228886118962493e-06, + "loss": 0.4727, + "step": 38480 + }, + { + "epoch": 3.855211098312215, + "grad_norm": 2.084012985229492, + "learning_rate": 6.218477588386035e-06, + "loss": 0.456, + "step": 38490 + }, + { + "epoch": 3.8562127510392146, + "grad_norm": 2.1381776332855225, + "learning_rate": 6.208076526132306e-06, + "loss": 0.4246, + "step": 38500 + }, + { + "epoch": 3.857214403766214, + "grad_norm": 1.9279459714889526, + "learning_rate": 6.197682936337218e-06, + "loss": 0.404, + "step": 38510 + }, + { + "epoch": 3.8582160564932138, + "grad_norm": 1.9992539882659912, + "learning_rate": 6.187296823133698e-06, + "loss": 0.4197, + "step": 38520 + }, + { + "epoch": 3.8592177092202133, + "grad_norm": 1.7238306999206543, + "learning_rate": 6.1769181906516955e-06, + "loss": 0.3974, + "step": 38530 + }, + { + "epoch": 3.860219361947213, + "grad_norm": 1.9684339761734009, + "learning_rate": 6.1665470430181975e-06, + "loss": 0.4424, + "step": 38540 + }, + { + "epoch": 3.8612210146742125, + "grad_norm": 2.5566136837005615, + "learning_rate": 6.1561833843571975e-06, + "loss": 0.4095, + "step": 38550 + }, + { + "epoch": 3.862222667401212, + "grad_norm": 2.300997495651245, + "learning_rate": 6.1458272187897455e-06, + "loss": 0.4618, + "step": 38560 + }, + { + "epoch": 3.8632243201282117, + "grad_norm": 2.2941536903381348, + "learning_rate": 6.135478550433865e-06, + "loss": 0.4177, + "step": 38570 + }, + { + "epoch": 3.864225972855211, + "grad_norm": 1.9589506387710571, + "learning_rate": 6.125137383404622e-06, + "loss": 0.4183, + "step": 38580 + }, + { + "epoch": 3.865227625582211, + "grad_norm": 1.2186977863311768, + "learning_rate": 6.114803721814114e-06, + "loss": 0.4134, + "step": 38590 + }, + { + "epoch": 3.86622927830921, + "grad_norm": 1.7170480489730835, + "learning_rate": 6.104477569771439e-06, + "loss": 0.3788, + "step": 38600 + }, + { + "epoch": 3.8672309310362096, + "grad_norm": 3.361743211746216, + "learning_rate": 6.094158931382685e-06, + "loss": 0.4188, + "step": 38610 + }, + { + "epoch": 3.868232583763209, + "grad_norm": 2.1883580684661865, + "learning_rate": 6.083847810751004e-06, + "loss": 0.4335, + "step": 38620 + }, + { + "epoch": 3.869234236490209, + "grad_norm": 1.248789668083191, + "learning_rate": 6.073544211976523e-06, + "loss": 0.4231, + "step": 38630 + }, + { + "epoch": 3.8702358892172084, + "grad_norm": 2.8954248428344727, + "learning_rate": 6.063248139156372e-06, + "loss": 0.4248, + "step": 38640 + }, + { + "epoch": 3.871237541944208, + "grad_norm": 1.9824678897857666, + "learning_rate": 6.052959596384719e-06, + "loss": 0.4393, + "step": 38650 + }, + { + "epoch": 3.8722391946712076, + "grad_norm": 2.0736277103424072, + "learning_rate": 6.042678587752718e-06, + "loss": 0.3752, + "step": 38660 + }, + { + "epoch": 3.873240847398207, + "grad_norm": 2.2819983959198, + "learning_rate": 6.032405117348533e-06, + "loss": 0.4167, + "step": 38670 + }, + { + "epoch": 3.8742425001252068, + "grad_norm": 2.657353639602661, + "learning_rate": 6.022139189257306e-06, + "loss": 0.4253, + "step": 38680 + }, + { + "epoch": 3.875244152852206, + "grad_norm": 2.045306921005249, + "learning_rate": 6.011880807561227e-06, + "loss": 0.398, + "step": 38690 + }, + { + "epoch": 3.876245805579206, + "grad_norm": 1.6201101541519165, + "learning_rate": 6.001629976339448e-06, + "loss": 0.4137, + "step": 38700 + }, + { + "epoch": 3.877247458306205, + "grad_norm": 2.2877581119537354, + "learning_rate": 5.991386699668136e-06, + "loss": 0.3722, + "step": 38710 + }, + { + "epoch": 3.8782491110332047, + "grad_norm": 1.7606736421585083, + "learning_rate": 5.981150981620443e-06, + "loss": 0.3913, + "step": 38720 + }, + { + "epoch": 3.8792507637602043, + "grad_norm": 2.2930450439453125, + "learning_rate": 5.970922826266523e-06, + "loss": 0.4395, + "step": 38730 + }, + { + "epoch": 3.880252416487204, + "grad_norm": 2.470388650894165, + "learning_rate": 5.960702237673521e-06, + "loss": 0.4097, + "step": 38740 + }, + { + "epoch": 3.8812540692142035, + "grad_norm": 2.244858503341675, + "learning_rate": 5.950489219905572e-06, + "loss": 0.4404, + "step": 38750 + }, + { + "epoch": 3.882255721941203, + "grad_norm": 2.1141445636749268, + "learning_rate": 5.9402837770238e-06, + "loss": 0.437, + "step": 38760 + }, + { + "epoch": 3.8832573746682026, + "grad_norm": 2.3806586265563965, + "learning_rate": 5.930085913086322e-06, + "loss": 0.432, + "step": 38770 + }, + { + "epoch": 3.8842590273952022, + "grad_norm": 2.2542169094085693, + "learning_rate": 5.919895632148231e-06, + "loss": 0.3765, + "step": 38780 + }, + { + "epoch": 3.885260680122202, + "grad_norm": 2.2765283584594727, + "learning_rate": 5.909712938261616e-06, + "loss": 0.4707, + "step": 38790 + }, + { + "epoch": 3.886262332849201, + "grad_norm": 1.9423617124557495, + "learning_rate": 5.899537835475544e-06, + "loss": 0.4707, + "step": 38800 + }, + { + "epoch": 3.887263985576201, + "grad_norm": 2.1070590019226074, + "learning_rate": 5.889370327836061e-06, + "loss": 0.4737, + "step": 38810 + }, + { + "epoch": 3.8882656383032, + "grad_norm": 2.498217821121216, + "learning_rate": 5.879210419386197e-06, + "loss": 0.3988, + "step": 38820 + }, + { + "epoch": 3.8892672910301997, + "grad_norm": 2.292095184326172, + "learning_rate": 5.869058114165956e-06, + "loss": 0.444, + "step": 38830 + }, + { + "epoch": 3.8902689437571993, + "grad_norm": 2.990748882293701, + "learning_rate": 5.858913416212325e-06, + "loss": 0.4561, + "step": 38840 + }, + { + "epoch": 3.891270596484199, + "grad_norm": 2.656426429748535, + "learning_rate": 5.848776329559261e-06, + "loss": 0.4223, + "step": 38850 + }, + { + "epoch": 3.8922722492111985, + "grad_norm": 2.1715846061706543, + "learning_rate": 5.838646858237695e-06, + "loss": 0.4408, + "step": 38860 + }, + { + "epoch": 3.893273901938198, + "grad_norm": 2.254647970199585, + "learning_rate": 5.828525006275532e-06, + "loss": 0.4076, + "step": 38870 + }, + { + "epoch": 3.8942755546651977, + "grad_norm": 3.026210308074951, + "learning_rate": 5.818410777697639e-06, + "loss": 0.4982, + "step": 38880 + }, + { + "epoch": 3.895277207392197, + "grad_norm": 1.857161521911621, + "learning_rate": 5.808304176525875e-06, + "loss": 0.458, + "step": 38890 + }, + { + "epoch": 3.896278860119197, + "grad_norm": 1.916955828666687, + "learning_rate": 5.798205206779033e-06, + "loss": 0.3936, + "step": 38900 + }, + { + "epoch": 3.897280512846196, + "grad_norm": 2.5912342071533203, + "learning_rate": 5.788113872472886e-06, + "loss": 0.4142, + "step": 38910 + }, + { + "epoch": 3.8982821655731956, + "grad_norm": 2.2807843685150146, + "learning_rate": 5.778030177620198e-06, + "loss": 0.471, + "step": 38920 + }, + { + "epoch": 3.899283818300195, + "grad_norm": 1.8373711109161377, + "learning_rate": 5.767954126230641e-06, + "loss": 0.4649, + "step": 38930 + }, + { + "epoch": 3.900285471027195, + "grad_norm": 2.2042641639709473, + "learning_rate": 5.757885722310882e-06, + "loss": 0.4439, + "step": 38940 + }, + { + "epoch": 3.9012871237541944, + "grad_norm": 1.9236149787902832, + "learning_rate": 5.747824969864554e-06, + "loss": 0.4517, + "step": 38950 + }, + { + "epoch": 3.902288776481194, + "grad_norm": 2.47882080078125, + "learning_rate": 5.7377718728922365e-06, + "loss": 0.4453, + "step": 38960 + }, + { + "epoch": 3.9032904292081936, + "grad_norm": 1.873042106628418, + "learning_rate": 5.72772643539144e-06, + "loss": 0.4618, + "step": 38970 + }, + { + "epoch": 3.904292081935193, + "grad_norm": 2.677980661392212, + "learning_rate": 5.7176886613566735e-06, + "loss": 0.4366, + "step": 38980 + }, + { + "epoch": 3.9052937346621928, + "grad_norm": 2.4638803005218506, + "learning_rate": 5.707658554779374e-06, + "loss": 0.4097, + "step": 38990 + }, + { + "epoch": 3.906295387389192, + "grad_norm": 2.025956869125366, + "learning_rate": 5.697636119647939e-06, + "loss": 0.4124, + "step": 39000 + }, + { + "epoch": 3.907297040116192, + "grad_norm": 1.7988700866699219, + "learning_rate": 5.68762135994769e-06, + "loss": 0.3835, + "step": 39010 + }, + { + "epoch": 3.908298692843191, + "grad_norm": 1.9764509201049805, + "learning_rate": 5.677614279660934e-06, + "loss": 0.4574, + "step": 39020 + }, + { + "epoch": 3.9093003455701907, + "grad_norm": 2.4781711101531982, + "learning_rate": 5.667614882766908e-06, + "loss": 0.4962, + "step": 39030 + }, + { + "epoch": 3.9103019982971903, + "grad_norm": 1.637224555015564, + "learning_rate": 5.6576231732417745e-06, + "loss": 0.4241, + "step": 39040 + }, + { + "epoch": 3.91130365102419, + "grad_norm": 2.451122522354126, + "learning_rate": 5.647639155058676e-06, + "loss": 0.412, + "step": 39050 + }, + { + "epoch": 3.9123053037511895, + "grad_norm": 1.8591203689575195, + "learning_rate": 5.63766283218767e-06, + "loss": 0.4624, + "step": 39060 + }, + { + "epoch": 3.913306956478189, + "grad_norm": 2.224898099899292, + "learning_rate": 5.627694208595763e-06, + "loss": 0.4359, + "step": 39070 + }, + { + "epoch": 3.9143086092051886, + "grad_norm": 2.6811466217041016, + "learning_rate": 5.617733288246898e-06, + "loss": 0.3711, + "step": 39080 + }, + { + "epoch": 3.9153102619321882, + "grad_norm": 2.6933958530426025, + "learning_rate": 5.607780075101956e-06, + "loss": 0.4493, + "step": 39090 + }, + { + "epoch": 3.916311914659188, + "grad_norm": 1.5979939699172974, + "learning_rate": 5.597834573118754e-06, + "loss": 0.3847, + "step": 39100 + }, + { + "epoch": 3.917313567386187, + "grad_norm": 2.687382698059082, + "learning_rate": 5.587896786252039e-06, + "loss": 0.4065, + "step": 39110 + }, + { + "epoch": 3.918315220113187, + "grad_norm": 2.6313157081604004, + "learning_rate": 5.577966718453495e-06, + "loss": 0.4204, + "step": 39120 + }, + { + "epoch": 3.919316872840186, + "grad_norm": 2.1213088035583496, + "learning_rate": 5.5680443736717325e-06, + "loss": 0.4447, + "step": 39130 + }, + { + "epoch": 3.9203185255671857, + "grad_norm": 2.800734281539917, + "learning_rate": 5.558129755852295e-06, + "loss": 0.42, + "step": 39140 + }, + { + "epoch": 3.9213201782941853, + "grad_norm": 2.896596908569336, + "learning_rate": 5.548222868937649e-06, + "loss": 0.453, + "step": 39150 + }, + { + "epoch": 3.922321831021185, + "grad_norm": 2.1709372997283936, + "learning_rate": 5.538323716867194e-06, + "loss": 0.4739, + "step": 39160 + }, + { + "epoch": 3.9233234837481845, + "grad_norm": 2.2188522815704346, + "learning_rate": 5.528432303577244e-06, + "loss": 0.432, + "step": 39170 + }, + { + "epoch": 3.924325136475184, + "grad_norm": 2.2879672050476074, + "learning_rate": 5.518548633001039e-06, + "loss": 0.5457, + "step": 39180 + }, + { + "epoch": 3.9253267892021837, + "grad_norm": 2.0803894996643066, + "learning_rate": 5.508672709068746e-06, + "loss": 0.3965, + "step": 39190 + }, + { + "epoch": 3.9263284419291833, + "grad_norm": 1.8538846969604492, + "learning_rate": 5.498804535707447e-06, + "loss": 0.4526, + "step": 39200 + }, + { + "epoch": 3.927330094656183, + "grad_norm": 2.3934788703918457, + "learning_rate": 5.488944116841144e-06, + "loss": 0.3509, + "step": 39210 + }, + { + "epoch": 3.928331747383182, + "grad_norm": 1.6396085023880005, + "learning_rate": 5.4790914563907474e-06, + "loss": 0.4732, + "step": 39220 + }, + { + "epoch": 3.929333400110182, + "grad_norm": 1.8581525087356567, + "learning_rate": 5.469246558274096e-06, + "loss": 0.4159, + "step": 39230 + }, + { + "epoch": 3.930335052837181, + "grad_norm": 2.8066928386688232, + "learning_rate": 5.459409426405926e-06, + "loss": 0.493, + "step": 39240 + }, + { + "epoch": 3.931336705564181, + "grad_norm": 2.4748148918151855, + "learning_rate": 5.449580064697915e-06, + "loss": 0.4374, + "step": 39250 + }, + { + "epoch": 3.9323383582911804, + "grad_norm": 2.191162586212158, + "learning_rate": 5.43975847705861e-06, + "loss": 0.4382, + "step": 39260 + }, + { + "epoch": 3.93334001101818, + "grad_norm": 2.5699470043182373, + "learning_rate": 5.429944667393486e-06, + "loss": 0.3827, + "step": 39270 + }, + { + "epoch": 3.9343416637451796, + "grad_norm": 2.012481689453125, + "learning_rate": 5.420138639604947e-06, + "loss": 0.4211, + "step": 39280 + }, + { + "epoch": 3.935343316472179, + "grad_norm": 2.005993127822876, + "learning_rate": 5.410340397592262e-06, + "loss": 0.4031, + "step": 39290 + }, + { + "epoch": 3.9363449691991788, + "grad_norm": 2.065284490585327, + "learning_rate": 5.4005499452516234e-06, + "loss": 0.4853, + "step": 39300 + }, + { + "epoch": 3.9373466219261783, + "grad_norm": 3.1377053260803223, + "learning_rate": 5.3907672864761395e-06, + "loss": 0.4342, + "step": 39310 + }, + { + "epoch": 3.938348274653178, + "grad_norm": 1.8640254735946655, + "learning_rate": 5.380992425155809e-06, + "loss": 0.4401, + "step": 39320 + }, + { + "epoch": 3.939349927380177, + "grad_norm": 2.006674289703369, + "learning_rate": 5.371225365177513e-06, + "loss": 0.3803, + "step": 39330 + }, + { + "epoch": 3.9403515801071767, + "grad_norm": 1.887819766998291, + "learning_rate": 5.361466110425045e-06, + "loss": 0.4175, + "step": 39340 + }, + { + "epoch": 3.9413532328341763, + "grad_norm": 1.9000790119171143, + "learning_rate": 5.351714664779106e-06, + "loss": 0.4634, + "step": 39350 + }, + { + "epoch": 3.942354885561176, + "grad_norm": 2.495184898376465, + "learning_rate": 5.341971032117285e-06, + "loss": 0.4234, + "step": 39360 + }, + { + "epoch": 3.9433565382881755, + "grad_norm": 2.152071237564087, + "learning_rate": 5.332235216314035e-06, + "loss": 0.3666, + "step": 39370 + }, + { + "epoch": 3.944358191015175, + "grad_norm": 2.0491833686828613, + "learning_rate": 5.32250722124075e-06, + "loss": 0.4581, + "step": 39380 + }, + { + "epoch": 3.9453598437421746, + "grad_norm": 2.057877540588379, + "learning_rate": 5.312787050765689e-06, + "loss": 0.4422, + "step": 39390 + }, + { + "epoch": 3.9463614964691742, + "grad_norm": 2.5603151321411133, + "learning_rate": 5.303074708753977e-06, + "loss": 0.4315, + "step": 39400 + }, + { + "epoch": 3.947363149196174, + "grad_norm": 2.4202048778533936, + "learning_rate": 5.293370199067671e-06, + "loss": 0.4272, + "step": 39410 + }, + { + "epoch": 3.948364801923173, + "grad_norm": 1.896180510520935, + "learning_rate": 5.283673525565688e-06, + "loss": 0.3761, + "step": 39420 + }, + { + "epoch": 3.949366454650173, + "grad_norm": 2.138786554336548, + "learning_rate": 5.273984692103831e-06, + "loss": 0.418, + "step": 39430 + }, + { + "epoch": 3.950368107377172, + "grad_norm": 2.09519100189209, + "learning_rate": 5.264303702534784e-06, + "loss": 0.4182, + "step": 39440 + }, + { + "epoch": 3.9513697601041717, + "grad_norm": 2.606905460357666, + "learning_rate": 5.254630560708123e-06, + "loss": 0.4222, + "step": 39450 + }, + { + "epoch": 3.9523714128311713, + "grad_norm": 2.679440498352051, + "learning_rate": 5.24496527047029e-06, + "loss": 0.4478, + "step": 39460 + }, + { + "epoch": 3.953373065558171, + "grad_norm": 2.5487256050109863, + "learning_rate": 5.23530783566461e-06, + "loss": 0.373, + "step": 39470 + }, + { + "epoch": 3.9543747182851705, + "grad_norm": 2.0131003856658936, + "learning_rate": 5.225658260131289e-06, + "loss": 0.3875, + "step": 39480 + }, + { + "epoch": 3.95537637101217, + "grad_norm": 1.9472322463989258, + "learning_rate": 5.216016547707403e-06, + "loss": 0.38, + "step": 39490 + }, + { + "epoch": 3.9563780237391697, + "grad_norm": 1.9929423332214355, + "learning_rate": 5.2063827022269e-06, + "loss": 0.4123, + "step": 39500 + }, + { + "epoch": 3.9573796764661693, + "grad_norm": 2.648130416870117, + "learning_rate": 5.196756727520602e-06, + "loss": 0.4668, + "step": 39510 + }, + { + "epoch": 3.958381329193169, + "grad_norm": 2.119204521179199, + "learning_rate": 5.187138627416202e-06, + "loss": 0.3989, + "step": 39520 + }, + { + "epoch": 3.959382981920168, + "grad_norm": 2.0803627967834473, + "learning_rate": 5.177528405738261e-06, + "loss": 0.3887, + "step": 39530 + }, + { + "epoch": 3.960384634647168, + "grad_norm": 2.0360963344573975, + "learning_rate": 5.167926066308207e-06, + "loss": 0.4147, + "step": 39540 + }, + { + "epoch": 3.961386287374167, + "grad_norm": 2.736513376235962, + "learning_rate": 5.158331612944337e-06, + "loss": 0.4734, + "step": 39550 + }, + { + "epoch": 3.962387940101167, + "grad_norm": 2.068869113922119, + "learning_rate": 5.1487450494618004e-06, + "loss": 0.4155, + "step": 39560 + }, + { + "epoch": 3.9633895928281664, + "grad_norm": 2.3239376544952393, + "learning_rate": 5.139166379672627e-06, + "loss": 0.4444, + "step": 39570 + }, + { + "epoch": 3.964391245555166, + "grad_norm": 2.593562602996826, + "learning_rate": 5.129595607385693e-06, + "loss": 0.4024, + "step": 39580 + }, + { + "epoch": 3.9653928982821656, + "grad_norm": 1.6795233488082886, + "learning_rate": 5.120032736406744e-06, + "loss": 0.4005, + "step": 39590 + }, + { + "epoch": 3.966394551009165, + "grad_norm": 1.533920407295227, + "learning_rate": 5.110477770538366e-06, + "loss": 0.5235, + "step": 39600 + }, + { + "epoch": 3.9673962037361648, + "grad_norm": 2.483339309692383, + "learning_rate": 5.100930713580044e-06, + "loss": 0.4192, + "step": 39610 + }, + { + "epoch": 3.9683978564631643, + "grad_norm": 2.7518630027770996, + "learning_rate": 5.09139156932806e-06, + "loss": 0.4448, + "step": 39620 + }, + { + "epoch": 3.969399509190164, + "grad_norm": 2.8066458702087402, + "learning_rate": 5.081860341575584e-06, + "loss": 0.3969, + "step": 39630 + }, + { + "epoch": 3.970401161917163, + "grad_norm": 2.323917865753174, + "learning_rate": 5.072337034112645e-06, + "loss": 0.4005, + "step": 39640 + }, + { + "epoch": 3.971402814644163, + "grad_norm": 1.8697916269302368, + "learning_rate": 5.062821650726113e-06, + "loss": 0.361, + "step": 39650 + }, + { + "epoch": 3.9724044673711623, + "grad_norm": 2.3262734413146973, + "learning_rate": 5.053314195199685e-06, + "loss": 0.4364, + "step": 39660 + }, + { + "epoch": 3.973406120098162, + "grad_norm": 2.201371192932129, + "learning_rate": 5.043814671313932e-06, + "loss": 0.4244, + "step": 39670 + }, + { + "epoch": 3.9744077728251614, + "grad_norm": 2.123957633972168, + "learning_rate": 5.0343230828462764e-06, + "loss": 0.366, + "step": 39680 + }, + { + "epoch": 3.975409425552161, + "grad_norm": 2.200098752975464, + "learning_rate": 5.02483943357096e-06, + "loss": 0.503, + "step": 39690 + }, + { + "epoch": 3.9764110782791606, + "grad_norm": 1.9379470348358154, + "learning_rate": 5.015363727259076e-06, + "loss": 0.4664, + "step": 39700 + }, + { + "epoch": 3.97741273100616, + "grad_norm": 1.8062063455581665, + "learning_rate": 5.0058959676785785e-06, + "loss": 0.4212, + "step": 39710 + }, + { + "epoch": 3.97841438373316, + "grad_norm": 2.2486565113067627, + "learning_rate": 4.996436158594245e-06, + "loss": 0.4427, + "step": 39720 + }, + { + "epoch": 3.9794160364601594, + "grad_norm": 2.03916597366333, + "learning_rate": 4.986984303767675e-06, + "loss": 0.4555, + "step": 39730 + }, + { + "epoch": 3.980417689187159, + "grad_norm": 1.792839527130127, + "learning_rate": 4.9775404069573425e-06, + "loss": 0.3701, + "step": 39740 + }, + { + "epoch": 3.981419341914158, + "grad_norm": 2.415205240249634, + "learning_rate": 4.968104471918533e-06, + "loss": 0.5279, + "step": 39750 + }, + { + "epoch": 3.982420994641158, + "grad_norm": 1.9716744422912598, + "learning_rate": 4.958676502403367e-06, + "loss": 0.4309, + "step": 39760 + }, + { + "epoch": 3.9834226473681573, + "grad_norm": 1.593368649482727, + "learning_rate": 4.949256502160804e-06, + "loss": 0.4196, + "step": 39770 + }, + { + "epoch": 3.984424300095157, + "grad_norm": 2.1170709133148193, + "learning_rate": 4.939844474936634e-06, + "loss": 0.4474, + "step": 39780 + }, + { + "epoch": 3.9854259528221565, + "grad_norm": 2.5782203674316406, + "learning_rate": 4.930440424473467e-06, + "loss": 0.4556, + "step": 39790 + }, + { + "epoch": 3.986427605549156, + "grad_norm": 2.213399887084961, + "learning_rate": 4.921044354510759e-06, + "loss": 0.4286, + "step": 39800 + }, + { + "epoch": 3.9874292582761557, + "grad_norm": 2.359858989715576, + "learning_rate": 4.911656268784775e-06, + "loss": 0.4304, + "step": 39810 + }, + { + "epoch": 3.9884309110031553, + "grad_norm": 2.288480281829834, + "learning_rate": 4.902276171028617e-06, + "loss": 0.3837, + "step": 39820 + }, + { + "epoch": 3.989432563730155, + "grad_norm": 2.2940833568573, + "learning_rate": 4.8929040649722e-06, + "loss": 0.4507, + "step": 39830 + }, + { + "epoch": 3.9904342164571545, + "grad_norm": 2.6430327892303467, + "learning_rate": 4.883539954342276e-06, + "loss": 0.3704, + "step": 39840 + }, + { + "epoch": 3.991435869184154, + "grad_norm": 2.3419792652130127, + "learning_rate": 4.874183842862401e-06, + "loss": 0.4236, + "step": 39850 + }, + { + "epoch": 3.992437521911153, + "grad_norm": 2.221893310546875, + "learning_rate": 4.864835734252962e-06, + "loss": 0.4163, + "step": 39860 + }, + { + "epoch": 3.993439174638153, + "grad_norm": 2.3750827312469482, + "learning_rate": 4.855495632231161e-06, + "loss": 0.3974, + "step": 39870 + }, + { + "epoch": 3.9944408273651524, + "grad_norm": 1.812092900276184, + "learning_rate": 4.846163540511011e-06, + "loss": 0.43, + "step": 39880 + }, + { + "epoch": 3.995442480092152, + "grad_norm": 2.505802631378174, + "learning_rate": 4.836839462803347e-06, + "loss": 0.3909, + "step": 39890 + }, + { + "epoch": 3.9964441328191516, + "grad_norm": 2.1492509841918945, + "learning_rate": 4.827523402815815e-06, + "loss": 0.4526, + "step": 39900 + }, + { + "epoch": 3.997445785546151, + "grad_norm": 2.6524550914764404, + "learning_rate": 4.818215364252871e-06, + "loss": 0.4235, + "step": 39910 + }, + { + "epoch": 3.9984474382731507, + "grad_norm": 2.4931113719940186, + "learning_rate": 4.808915350815779e-06, + "loss": 0.4216, + "step": 39920 + }, + { + "epoch": 3.9994490910001503, + "grad_norm": 2.9122989177703857, + "learning_rate": 4.799623366202615e-06, + "loss": 0.4228, + "step": 39930 + }, + { + "epoch": 4.0004006610908, + "grad_norm": 2.093388319015503, + "learning_rate": 4.790339414108278e-06, + "loss": 0.3735, + "step": 39940 + }, + { + "epoch": 4.001402313817799, + "grad_norm": 2.304086685180664, + "learning_rate": 4.781063498224439e-06, + "loss": 0.4207, + "step": 39950 + }, + { + "epoch": 4.002403966544799, + "grad_norm": 1.982155442237854, + "learning_rate": 4.771795622239592e-06, + "loss": 0.3568, + "step": 39960 + }, + { + "epoch": 4.003405619271798, + "grad_norm": 2.5733442306518555, + "learning_rate": 4.762535789839054e-06, + "loss": 0.4407, + "step": 39970 + }, + { + "epoch": 4.004407271998798, + "grad_norm": 2.412712812423706, + "learning_rate": 4.753284004704902e-06, + "loss": 0.4066, + "step": 39980 + }, + { + "epoch": 4.005408924725797, + "grad_norm": 1.9959261417388916, + "learning_rate": 4.7440402705160425e-06, + "loss": 0.3478, + "step": 39990 + }, + { + "epoch": 4.006410577452797, + "grad_norm": 2.1611344814300537, + "learning_rate": 4.734804590948169e-06, + "loss": 0.4195, + "step": 40000 + }, + { + "epoch": 4.006410577452797, + "eval_bleu": 0.3662565942705902, + "eval_loss": 0.5082083940505981, + "eval_rouge1": 0.7014509543822172, + "eval_rouge2": 0.5362548944715757, + "eval_rougeL": 0.6602146413463208, + "eval_runtime": 86792.7268, + "eval_samples_per_second": 0.204, + "eval_steps_per_second": 0.026, + "eval_wer": 0.7582390590348932, + "step": 40000 + }, + { + "epoch": 4.0074122301797965, + "grad_norm": 2.3167426586151123, + "learning_rate": 4.725576969673789e-06, + "loss": 0.4305, + "step": 40010 + }, + { + "epoch": 4.008413882906797, + "grad_norm": 2.5307557582855225, + "learning_rate": 4.7163574103621825e-06, + "loss": 0.3887, + "step": 40020 + }, + { + "epoch": 4.009415535633796, + "grad_norm": 1.9515817165374756, + "learning_rate": 4.707145916679426e-06, + "loss": 0.4566, + "step": 40030 + }, + { + "epoch": 4.010417188360796, + "grad_norm": 2.6875641345977783, + "learning_rate": 4.6979424922884096e-06, + "loss": 0.3808, + "step": 40040 + }, + { + "epoch": 4.011418841087795, + "grad_norm": 2.634347677230835, + "learning_rate": 4.688747140848807e-06, + "loss": 0.3669, + "step": 40050 + }, + { + "epoch": 4.012420493814794, + "grad_norm": 2.083259344100952, + "learning_rate": 4.679559866017052e-06, + "loss": 0.4264, + "step": 40060 + }, + { + "epoch": 4.013422146541794, + "grad_norm": 2.376185655593872, + "learning_rate": 4.670380671446412e-06, + "loss": 0.379, + "step": 40070 + }, + { + "epoch": 4.014423799268793, + "grad_norm": 2.035813570022583, + "learning_rate": 4.661209560786922e-06, + "loss": 0.3986, + "step": 40080 + }, + { + "epoch": 4.015425451995793, + "grad_norm": 1.9220411777496338, + "learning_rate": 4.652046537685381e-06, + "loss": 0.3928, + "step": 40090 + }, + { + "epoch": 4.016427104722792, + "grad_norm": 2.600371837615967, + "learning_rate": 4.642891605785413e-06, + "loss": 0.3932, + "step": 40100 + }, + { + "epoch": 4.017428757449792, + "grad_norm": 1.7663367986679077, + "learning_rate": 4.633744768727394e-06, + "loss": 0.3915, + "step": 40110 + }, + { + "epoch": 4.018430410176792, + "grad_norm": 1.809329867362976, + "learning_rate": 4.624606030148493e-06, + "loss": 0.3715, + "step": 40120 + }, + { + "epoch": 4.019432062903792, + "grad_norm": 2.3340373039245605, + "learning_rate": 4.615475393682655e-06, + "loss": 0.3847, + "step": 40130 + }, + { + "epoch": 4.020433715630791, + "grad_norm": 2.9561421871185303, + "learning_rate": 4.606352862960606e-06, + "loss": 0.4141, + "step": 40140 + }, + { + "epoch": 4.02143536835779, + "grad_norm": 2.7550601959228516, + "learning_rate": 4.597238441609855e-06, + "loss": 0.3774, + "step": 40150 + }, + { + "epoch": 4.02243702108479, + "grad_norm": 2.112523317337036, + "learning_rate": 4.588132133254658e-06, + "loss": 0.3566, + "step": 40160 + }, + { + "epoch": 4.023438673811789, + "grad_norm": 2.55678653717041, + "learning_rate": 4.579033941516087e-06, + "loss": 0.4126, + "step": 40170 + }, + { + "epoch": 4.024440326538789, + "grad_norm": 2.198493719100952, + "learning_rate": 4.569943870011956e-06, + "loss": 0.3802, + "step": 40180 + }, + { + "epoch": 4.025441979265788, + "grad_norm": 2.324162006378174, + "learning_rate": 4.560861922356863e-06, + "loss": 0.4629, + "step": 40190 + }, + { + "epoch": 4.026443631992788, + "grad_norm": 1.9970417022705078, + "learning_rate": 4.551788102162172e-06, + "loss": 0.4426, + "step": 40200 + }, + { + "epoch": 4.0274452847197875, + "grad_norm": 2.143714427947998, + "learning_rate": 4.54272241303601e-06, + "loss": 0.4101, + "step": 40210 + }, + { + "epoch": 4.0284469374467875, + "grad_norm": 1.9812605381011963, + "learning_rate": 4.5336648585832835e-06, + "loss": 0.3751, + "step": 40220 + }, + { + "epoch": 4.029448590173787, + "grad_norm": 2.0631392002105713, + "learning_rate": 4.524615442405652e-06, + "loss": 0.3368, + "step": 40230 + }, + { + "epoch": 4.030450242900787, + "grad_norm": 1.7811130285263062, + "learning_rate": 4.5155741681015465e-06, + "loss": 0.3955, + "step": 40240 + }, + { + "epoch": 4.031451895627786, + "grad_norm": 2.1916298866271973, + "learning_rate": 4.506541039266154e-06, + "loss": 0.4335, + "step": 40250 + }, + { + "epoch": 4.032453548354785, + "grad_norm": 2.283879518508911, + "learning_rate": 4.49751605949143e-06, + "loss": 0.4194, + "step": 40260 + }, + { + "epoch": 4.033455201081785, + "grad_norm": 2.2963759899139404, + "learning_rate": 4.4884992323660835e-06, + "loss": 0.4499, + "step": 40270 + }, + { + "epoch": 4.034456853808784, + "grad_norm": 1.6694716215133667, + "learning_rate": 4.479490561475585e-06, + "loss": 0.4056, + "step": 40280 + }, + { + "epoch": 4.035458506535784, + "grad_norm": 2.5013225078582764, + "learning_rate": 4.470490050402154e-06, + "loss": 0.4222, + "step": 40290 + }, + { + "epoch": 4.036460159262783, + "grad_norm": 2.4829888343811035, + "learning_rate": 4.4614977027247924e-06, + "loss": 0.4054, + "step": 40300 + }, + { + "epoch": 4.037461811989783, + "grad_norm": 2.05537748336792, + "learning_rate": 4.452513522019214e-06, + "loss": 0.4458, + "step": 40310 + }, + { + "epoch": 4.0384634647167825, + "grad_norm": 2.26316499710083, + "learning_rate": 4.443537511857915e-06, + "loss": 0.4931, + "step": 40320 + }, + { + "epoch": 4.039465117443783, + "grad_norm": 2.514526605606079, + "learning_rate": 4.434569675810132e-06, + "loss": 0.4126, + "step": 40330 + }, + { + "epoch": 4.040466770170782, + "grad_norm": 1.9930896759033203, + "learning_rate": 4.425610017441855e-06, + "loss": 0.4017, + "step": 40340 + }, + { + "epoch": 4.041468422897782, + "grad_norm": 2.2072863578796387, + "learning_rate": 4.416658540315824e-06, + "loss": 0.4262, + "step": 40350 + }, + { + "epoch": 4.042470075624781, + "grad_norm": 2.070967435836792, + "learning_rate": 4.4077152479915115e-06, + "loss": 0.4335, + "step": 40360 + }, + { + "epoch": 4.04347172835178, + "grad_norm": 2.4259519577026367, + "learning_rate": 4.398780144025169e-06, + "loss": 0.3856, + "step": 40370 + }, + { + "epoch": 4.04447338107878, + "grad_norm": 2.0865633487701416, + "learning_rate": 4.3898532319697455e-06, + "loss": 0.3952, + "step": 40380 + }, + { + "epoch": 4.045475033805779, + "grad_norm": 2.032640218734741, + "learning_rate": 4.38093451537496e-06, + "loss": 0.3963, + "step": 40390 + }, + { + "epoch": 4.046476686532779, + "grad_norm": 2.034989595413208, + "learning_rate": 4.372023997787284e-06, + "loss": 0.3822, + "step": 40400 + }, + { + "epoch": 4.047478339259778, + "grad_norm": 2.2075562477111816, + "learning_rate": 4.363121682749907e-06, + "loss": 0.4047, + "step": 40410 + }, + { + "epoch": 4.048479991986778, + "grad_norm": 1.8768295049667358, + "learning_rate": 4.354227573802752e-06, + "loss": 0.3965, + "step": 40420 + }, + { + "epoch": 4.049481644713778, + "grad_norm": 1.8903871774673462, + "learning_rate": 4.345341674482503e-06, + "loss": 0.4633, + "step": 40430 + }, + { + "epoch": 4.050483297440778, + "grad_norm": 1.9325599670410156, + "learning_rate": 4.336463988322572e-06, + "loss": 0.405, + "step": 40440 + }, + { + "epoch": 4.051484950167777, + "grad_norm": 2.1987829208374023, + "learning_rate": 4.327594518853081e-06, + "loss": 0.4235, + "step": 40450 + }, + { + "epoch": 4.052486602894777, + "grad_norm": 2.2805888652801514, + "learning_rate": 4.318733269600919e-06, + "loss": 0.4807, + "step": 40460 + }, + { + "epoch": 4.053488255621776, + "grad_norm": 2.4149937629699707, + "learning_rate": 4.30988024408969e-06, + "loss": 0.4326, + "step": 40470 + }, + { + "epoch": 4.054489908348775, + "grad_norm": 1.7438945770263672, + "learning_rate": 4.3010354458397295e-06, + "loss": 0.4199, + "step": 40480 + }, + { + "epoch": 4.055491561075775, + "grad_norm": 1.9871585369110107, + "learning_rate": 4.292198878368086e-06, + "loss": 0.3935, + "step": 40490 + }, + { + "epoch": 4.056493213802774, + "grad_norm": 2.012375831604004, + "learning_rate": 4.2833705451885696e-06, + "loss": 0.3794, + "step": 40500 + }, + { + "epoch": 4.057494866529774, + "grad_norm": 2.4525644779205322, + "learning_rate": 4.274550449811687e-06, + "loss": 0.439, + "step": 40510 + }, + { + "epoch": 4.0584965192567735, + "grad_norm": 1.6219289302825928, + "learning_rate": 4.265738595744681e-06, + "loss": 0.4129, + "step": 40520 + }, + { + "epoch": 4.0594981719837735, + "grad_norm": 1.9297707080841064, + "learning_rate": 4.2569349864915175e-06, + "loss": 0.4051, + "step": 40530 + }, + { + "epoch": 4.060499824710773, + "grad_norm": 2.0720765590667725, + "learning_rate": 4.248139625552877e-06, + "loss": 0.3447, + "step": 40540 + }, + { + "epoch": 4.061501477437773, + "grad_norm": 1.5587975978851318, + "learning_rate": 4.239352516426167e-06, + "loss": 0.3451, + "step": 40550 + }, + { + "epoch": 4.062503130164772, + "grad_norm": 2.4509339332580566, + "learning_rate": 4.230573662605511e-06, + "loss": 0.4148, + "step": 40560 + }, + { + "epoch": 4.063504782891771, + "grad_norm": 2.0733141899108887, + "learning_rate": 4.221803067581751e-06, + "loss": 0.4208, + "step": 40570 + }, + { + "epoch": 4.064506435618771, + "grad_norm": 1.82527494430542, + "learning_rate": 4.213040734842444e-06, + "loss": 0.3823, + "step": 40580 + }, + { + "epoch": 4.06550808834577, + "grad_norm": 2.46215558052063, + "learning_rate": 4.204286667871859e-06, + "loss": 0.423, + "step": 40590 + }, + { + "epoch": 4.06650974107277, + "grad_norm": 2.0969185829162598, + "learning_rate": 4.1955408701509854e-06, + "loss": 0.4262, + "step": 40600 + }, + { + "epoch": 4.067511393799769, + "grad_norm": 1.8389031887054443, + "learning_rate": 4.186803345157517e-06, + "loss": 0.4392, + "step": 40610 + }, + { + "epoch": 4.068513046526769, + "grad_norm": 2.8035295009613037, + "learning_rate": 4.178074096365864e-06, + "loss": 0.4361, + "step": 40620 + }, + { + "epoch": 4.0695146992537685, + "grad_norm": 1.908402442932129, + "learning_rate": 4.169353127247139e-06, + "loss": 0.3687, + "step": 40630 + }, + { + "epoch": 4.0705163519807686, + "grad_norm": 1.9811326265335083, + "learning_rate": 4.160640441269168e-06, + "loss": 0.3685, + "step": 40640 + }, + { + "epoch": 4.071518004707768, + "grad_norm": 2.062203884124756, + "learning_rate": 4.151936041896482e-06, + "loss": 0.4121, + "step": 40650 + }, + { + "epoch": 4.072519657434768, + "grad_norm": 1.943377137184143, + "learning_rate": 4.143239932590312e-06, + "loss": 0.4364, + "step": 40660 + }, + { + "epoch": 4.073521310161767, + "grad_norm": 2.0794131755828857, + "learning_rate": 4.134552116808602e-06, + "loss": 0.3846, + "step": 40670 + }, + { + "epoch": 4.074522962888766, + "grad_norm": 2.561751365661621, + "learning_rate": 4.1258725980059865e-06, + "loss": 0.4058, + "step": 40680 + }, + { + "epoch": 4.075524615615766, + "grad_norm": 2.585106611251831, + "learning_rate": 4.117201379633809e-06, + "loss": 0.3959, + "step": 40690 + }, + { + "epoch": 4.076526268342765, + "grad_norm": 1.8804863691329956, + "learning_rate": 4.108538465140122e-06, + "loss": 0.391, + "step": 40700 + }, + { + "epoch": 4.077527921069765, + "grad_norm": 1.504548192024231, + "learning_rate": 4.099883857969647e-06, + "loss": 0.4662, + "step": 40710 + }, + { + "epoch": 4.078529573796764, + "grad_norm": 2.0218613147735596, + "learning_rate": 4.091237561563821e-06, + "loss": 0.4463, + "step": 40720 + }, + { + "epoch": 4.079531226523764, + "grad_norm": 1.8394734859466553, + "learning_rate": 4.082599579360794e-06, + "loss": 0.3977, + "step": 40730 + }, + { + "epoch": 4.080532879250764, + "grad_norm": 2.5202815532684326, + "learning_rate": 4.073969914795373e-06, + "loss": 0.4841, + "step": 40740 + }, + { + "epoch": 4.081534531977764, + "grad_norm": 2.3350720405578613, + "learning_rate": 4.065348571299071e-06, + "loss": 0.3872, + "step": 40750 + }, + { + "epoch": 4.082536184704763, + "grad_norm": 2.0294203758239746, + "learning_rate": 4.056735552300115e-06, + "loss": 0.349, + "step": 40760 + }, + { + "epoch": 4.083537837431763, + "grad_norm": 1.7993828058242798, + "learning_rate": 4.048130861223395e-06, + "loss": 0.3741, + "step": 40770 + }, + { + "epoch": 4.084539490158762, + "grad_norm": 1.6386586427688599, + "learning_rate": 4.0395345014904885e-06, + "loss": 0.3829, + "step": 40780 + }, + { + "epoch": 4.085541142885761, + "grad_norm": 2.2599360942840576, + "learning_rate": 4.030946476519684e-06, + "loss": 0.3841, + "step": 40790 + }, + { + "epoch": 4.086542795612761, + "grad_norm": 1.9661349058151245, + "learning_rate": 4.022366789725931e-06, + "loss": 0.3766, + "step": 40800 + }, + { + "epoch": 4.08754444833976, + "grad_norm": 2.0001981258392334, + "learning_rate": 4.0137954445208876e-06, + "loss": 0.4531, + "step": 40810 + }, + { + "epoch": 4.08854610106676, + "grad_norm": 1.8442391157150269, + "learning_rate": 4.0052324443128564e-06, + "loss": 0.3898, + "step": 40820 + }, + { + "epoch": 4.0895477537937595, + "grad_norm": 2.130378246307373, + "learning_rate": 3.99667779250687e-06, + "loss": 0.4561, + "step": 40830 + }, + { + "epoch": 4.0905494065207595, + "grad_norm": 2.1687450408935547, + "learning_rate": 3.98813149250461e-06, + "loss": 0.4071, + "step": 40840 + }, + { + "epoch": 4.091551059247759, + "grad_norm": 1.72993004322052, + "learning_rate": 3.979593547704433e-06, + "loss": 0.3537, + "step": 40850 + }, + { + "epoch": 4.092552711974759, + "grad_norm": 1.8413400650024414, + "learning_rate": 3.9710639615014e-06, + "loss": 0.4079, + "step": 40860 + }, + { + "epoch": 4.093554364701758, + "grad_norm": 2.123229742050171, + "learning_rate": 3.962542737287226e-06, + "loss": 0.3985, + "step": 40870 + }, + { + "epoch": 4.094556017428758, + "grad_norm": 1.9011422395706177, + "learning_rate": 3.954029878450311e-06, + "loss": 0.3487, + "step": 40880 + }, + { + "epoch": 4.095557670155757, + "grad_norm": 1.8646225929260254, + "learning_rate": 3.9455253883757234e-06, + "loss": 0.4002, + "step": 40890 + }, + { + "epoch": 4.096559322882756, + "grad_norm": 2.285891056060791, + "learning_rate": 3.937029270445206e-06, + "loss": 0.3964, + "step": 40900 + }, + { + "epoch": 4.097560975609756, + "grad_norm": 2.354170799255371, + "learning_rate": 3.928541528037172e-06, + "loss": 0.3985, + "step": 40910 + }, + { + "epoch": 4.098562628336755, + "grad_norm": 2.1918609142303467, + "learning_rate": 3.920062164526706e-06, + "loss": 0.4191, + "step": 40920 + }, + { + "epoch": 4.099564281063755, + "grad_norm": 2.174389362335205, + "learning_rate": 3.911591183285557e-06, + "loss": 0.3841, + "step": 40930 + }, + { + "epoch": 4.1005659337907545, + "grad_norm": 2.802354335784912, + "learning_rate": 3.903128587682147e-06, + "loss": 0.4132, + "step": 40940 + }, + { + "epoch": 4.1015675865177545, + "grad_norm": 1.9637892246246338, + "learning_rate": 3.894674381081556e-06, + "loss": 0.3879, + "step": 40950 + }, + { + "epoch": 4.102569239244754, + "grad_norm": 2.0116653442382812, + "learning_rate": 3.886228566845534e-06, + "loss": 0.4091, + "step": 40960 + }, + { + "epoch": 4.103570891971754, + "grad_norm": 2.062088966369629, + "learning_rate": 3.877791148332491e-06, + "loss": 0.4185, + "step": 40970 + }, + { + "epoch": 4.104572544698753, + "grad_norm": 2.834785223007202, + "learning_rate": 3.8693621288975e-06, + "loss": 0.3921, + "step": 40980 + }, + { + "epoch": 4.105574197425753, + "grad_norm": 1.9823628664016724, + "learning_rate": 3.860941511892291e-06, + "loss": 0.4165, + "step": 40990 + }, + { + "epoch": 4.106575850152752, + "grad_norm": 2.0273165702819824, + "learning_rate": 3.852529300665259e-06, + "loss": 0.4596, + "step": 41000 + }, + { + "epoch": 4.107577502879751, + "grad_norm": 1.7067807912826538, + "learning_rate": 3.844125498561449e-06, + "loss": 0.4131, + "step": 41010 + }, + { + "epoch": 4.108579155606751, + "grad_norm": 2.307382345199585, + "learning_rate": 3.8357301089225696e-06, + "loss": 0.4735, + "step": 41020 + }, + { + "epoch": 4.10958080833375, + "grad_norm": 2.346848964691162, + "learning_rate": 3.827343135086978e-06, + "loss": 0.4004, + "step": 41030 + }, + { + "epoch": 4.11058246106075, + "grad_norm": 2.429508686065674, + "learning_rate": 3.818964580389686e-06, + "loss": 0.4555, + "step": 41040 + }, + { + "epoch": 4.11158411378775, + "grad_norm": 2.165010929107666, + "learning_rate": 3.8105944481623578e-06, + "loss": 0.3992, + "step": 41050 + }, + { + "epoch": 4.11258576651475, + "grad_norm": 2.037492513656616, + "learning_rate": 3.802232741733325e-06, + "loss": 0.4485, + "step": 41060 + }, + { + "epoch": 4.113587419241749, + "grad_norm": 2.500692367553711, + "learning_rate": 3.7938794644275355e-06, + "loss": 0.4451, + "step": 41070 + }, + { + "epoch": 4.114589071968749, + "grad_norm": 1.6784147024154663, + "learning_rate": 3.7855346195666027e-06, + "loss": 0.3731, + "step": 41080 + }, + { + "epoch": 4.115590724695748, + "grad_norm": 2.7670488357543945, + "learning_rate": 3.777198210468802e-06, + "loss": 0.3972, + "step": 41090 + }, + { + "epoch": 4.116592377422748, + "grad_norm": 2.1402854919433594, + "learning_rate": 3.7688702404490406e-06, + "loss": 0.4219, + "step": 41100 + }, + { + "epoch": 4.117594030149747, + "grad_norm": 2.4333910942077637, + "learning_rate": 3.760550712818847e-06, + "loss": 0.3754, + "step": 41110 + }, + { + "epoch": 4.118595682876746, + "grad_norm": 2.354822874069214, + "learning_rate": 3.7522396308864367e-06, + "loss": 0.401, + "step": 41120 + }, + { + "epoch": 4.119597335603746, + "grad_norm": 1.6452901363372803, + "learning_rate": 3.743936997956646e-06, + "loss": 0.3948, + "step": 41130 + }, + { + "epoch": 4.1205989883307454, + "grad_norm": 2.6851041316986084, + "learning_rate": 3.7356428173309337e-06, + "loss": 0.4247, + "step": 41140 + }, + { + "epoch": 4.1216006410577455, + "grad_norm": 2.218315839767456, + "learning_rate": 3.7273570923074207e-06, + "loss": 0.3556, + "step": 41150 + }, + { + "epoch": 4.122602293784745, + "grad_norm": 2.302386522293091, + "learning_rate": 3.7190798261808657e-06, + "loss": 0.3815, + "step": 41160 + }, + { + "epoch": 4.123603946511745, + "grad_norm": 2.2736644744873047, + "learning_rate": 3.7108110222426636e-06, + "loss": 0.325, + "step": 41170 + }, + { + "epoch": 4.124605599238744, + "grad_norm": 1.8379311561584473, + "learning_rate": 3.7025506837808173e-06, + "loss": 0.3882, + "step": 41180 + }, + { + "epoch": 4.125607251965744, + "grad_norm": 1.8531453609466553, + "learning_rate": 3.6942988140800023e-06, + "loss": 0.382, + "step": 41190 + }, + { + "epoch": 4.126608904692743, + "grad_norm": 2.105767011642456, + "learning_rate": 3.686055416421508e-06, + "loss": 0.4276, + "step": 41200 + }, + { + "epoch": 4.127610557419743, + "grad_norm": 1.9618855714797974, + "learning_rate": 3.6778204940832394e-06, + "loss": 0.3956, + "step": 41210 + }, + { + "epoch": 4.128612210146742, + "grad_norm": 1.891269326210022, + "learning_rate": 3.6695940503397667e-06, + "loss": 0.3889, + "step": 41220 + }, + { + "epoch": 4.129613862873741, + "grad_norm": 1.8943488597869873, + "learning_rate": 3.661376088462262e-06, + "loss": 0.4319, + "step": 41230 + }, + { + "epoch": 4.130615515600741, + "grad_norm": 2.5317161083221436, + "learning_rate": 3.6531666117185335e-06, + "loss": 0.4331, + "step": 41240 + }, + { + "epoch": 4.1316171683277405, + "grad_norm": 1.882836937904358, + "learning_rate": 3.644965623373012e-06, + "loss": 0.4, + "step": 41250 + }, + { + "epoch": 4.1326188210547405, + "grad_norm": 1.9603805541992188, + "learning_rate": 3.636773126686757e-06, + "loss": 0.3485, + "step": 41260 + }, + { + "epoch": 4.13362047378174, + "grad_norm": 2.294837236404419, + "learning_rate": 3.6285891249174504e-06, + "loss": 0.4074, + "step": 41270 + }, + { + "epoch": 4.13462212650874, + "grad_norm": 2.164186477661133, + "learning_rate": 3.6204136213193935e-06, + "loss": 0.4174, + "step": 41280 + }, + { + "epoch": 4.135623779235739, + "grad_norm": 1.9150091409683228, + "learning_rate": 3.6122466191435112e-06, + "loss": 0.4222, + "step": 41290 + }, + { + "epoch": 4.136625431962739, + "grad_norm": 2.168137550354004, + "learning_rate": 3.6040881216373444e-06, + "loss": 0.4368, + "step": 41300 + }, + { + "epoch": 4.137627084689738, + "grad_norm": 2.411673069000244, + "learning_rate": 3.5959381320450596e-06, + "loss": 0.4143, + "step": 41310 + }, + { + "epoch": 4.138628737416737, + "grad_norm": 1.9745694398880005, + "learning_rate": 3.5877966536074283e-06, + "loss": 0.3476, + "step": 41320 + }, + { + "epoch": 4.139630390143737, + "grad_norm": 1.8774809837341309, + "learning_rate": 3.579663689561852e-06, + "loss": 0.3927, + "step": 41330 + }, + { + "epoch": 4.140632042870736, + "grad_norm": 1.9254579544067383, + "learning_rate": 3.5715392431423356e-06, + "loss": 0.3869, + "step": 41340 + }, + { + "epoch": 4.141633695597736, + "grad_norm": 2.5691428184509277, + "learning_rate": 3.563423317579498e-06, + "loss": 0.4281, + "step": 41350 + }, + { + "epoch": 4.142635348324736, + "grad_norm": 2.32572340965271, + "learning_rate": 3.5553159161005765e-06, + "loss": 0.4107, + "step": 41360 + }, + { + "epoch": 4.143637001051736, + "grad_norm": 2.5501484870910645, + "learning_rate": 3.547217041929413e-06, + "loss": 0.4378, + "step": 41370 + }, + { + "epoch": 4.144638653778735, + "grad_norm": 2.0283477306365967, + "learning_rate": 3.53912669828646e-06, + "loss": 0.3957, + "step": 41380 + }, + { + "epoch": 4.145640306505735, + "grad_norm": 2.043149709701538, + "learning_rate": 3.531044888388779e-06, + "loss": 0.4236, + "step": 41390 + }, + { + "epoch": 4.146641959232734, + "grad_norm": 1.4444963932037354, + "learning_rate": 3.5229716154500354e-06, + "loss": 0.3652, + "step": 41400 + }, + { + "epoch": 4.147643611959734, + "grad_norm": 2.416794776916504, + "learning_rate": 3.5149068826804993e-06, + "loss": 0.4204, + "step": 41410 + }, + { + "epoch": 4.148645264686733, + "grad_norm": 1.7935189008712769, + "learning_rate": 3.5068506932870616e-06, + "loss": 0.344, + "step": 41420 + }, + { + "epoch": 4.149646917413732, + "grad_norm": 2.3463051319122314, + "learning_rate": 3.498803050473187e-06, + "loss": 0.3847, + "step": 41430 + }, + { + "epoch": 4.150648570140732, + "grad_norm": 2.427424669265747, + "learning_rate": 3.490763957438953e-06, + "loss": 0.4119, + "step": 41440 + }, + { + "epoch": 4.151650222867731, + "grad_norm": 1.8780012130737305, + "learning_rate": 3.4827334173810565e-06, + "loss": 0.3799, + "step": 41450 + }, + { + "epoch": 4.1526518755947315, + "grad_norm": 2.543027639389038, + "learning_rate": 3.4747114334927777e-06, + "loss": 0.3964, + "step": 41460 + }, + { + "epoch": 4.153653528321731, + "grad_norm": 1.5053480863571167, + "learning_rate": 3.4666980089639783e-06, + "loss": 0.4012, + "step": 41470 + }, + { + "epoch": 4.154655181048731, + "grad_norm": 1.9852502346038818, + "learning_rate": 3.4586931469811373e-06, + "loss": 0.3834, + "step": 41480 + }, + { + "epoch": 4.15565683377573, + "grad_norm": 2.525178909301758, + "learning_rate": 3.450696850727339e-06, + "loss": 0.4066, + "step": 41490 + }, + { + "epoch": 4.15665848650273, + "grad_norm": 2.6303272247314453, + "learning_rate": 3.442709123382232e-06, + "loss": 0.3694, + "step": 41500 + }, + { + "epoch": 4.157660139229729, + "grad_norm": 1.9393866062164307, + "learning_rate": 3.4347299681220716e-06, + "loss": 0.4593, + "step": 41510 + }, + { + "epoch": 4.158661791956729, + "grad_norm": 1.95452082157135, + "learning_rate": 3.426759388119716e-06, + "loss": 0.4154, + "step": 41520 + }, + { + "epoch": 4.159663444683728, + "grad_norm": 2.5059969425201416, + "learning_rate": 3.4187973865446005e-06, + "loss": 0.4373, + "step": 41530 + }, + { + "epoch": 4.160665097410727, + "grad_norm": 2.185824394226074, + "learning_rate": 3.410843966562741e-06, + "loss": 0.3905, + "step": 41540 + }, + { + "epoch": 4.161666750137727, + "grad_norm": 2.5035550594329834, + "learning_rate": 3.402899131336762e-06, + "loss": 0.4118, + "step": 41550 + }, + { + "epoch": 4.1626684028647265, + "grad_norm": 1.755468726158142, + "learning_rate": 3.394962884025862e-06, + "loss": 0.4559, + "step": 41560 + }, + { + "epoch": 4.1636700555917265, + "grad_norm": 2.0787034034729004, + "learning_rate": 3.387035227785826e-06, + "loss": 0.4125, + "step": 41570 + }, + { + "epoch": 4.164671708318726, + "grad_norm": 2.5473291873931885, + "learning_rate": 3.3791161657690225e-06, + "loss": 0.4438, + "step": 41580 + }, + { + "epoch": 4.165673361045726, + "grad_norm": 2.753129243850708, + "learning_rate": 3.3712057011244023e-06, + "loss": 0.4606, + "step": 41590 + }, + { + "epoch": 4.166675013772725, + "grad_norm": 2.3650059700012207, + "learning_rate": 3.363303836997503e-06, + "loss": 0.3975, + "step": 41600 + }, + { + "epoch": 4.167676666499725, + "grad_norm": 2.2438313961029053, + "learning_rate": 3.3554105765304323e-06, + "loss": 0.4134, + "step": 41610 + }, + { + "epoch": 4.168678319226724, + "grad_norm": 2.5630788803100586, + "learning_rate": 3.3475259228618856e-06, + "loss": 0.3901, + "step": 41620 + }, + { + "epoch": 4.169679971953723, + "grad_norm": 1.7865986824035645, + "learning_rate": 3.3396498791271324e-06, + "loss": 0.431, + "step": 41630 + }, + { + "epoch": 4.170681624680723, + "grad_norm": 2.134347915649414, + "learning_rate": 3.3317824484580177e-06, + "loss": 0.4261, + "step": 41640 + }, + { + "epoch": 4.171683277407722, + "grad_norm": 1.7214415073394775, + "learning_rate": 3.3239236339829645e-06, + "loss": 0.4616, + "step": 41650 + }, + { + "epoch": 4.172684930134722, + "grad_norm": 1.7558671236038208, + "learning_rate": 3.3160734388269666e-06, + "loss": 0.4491, + "step": 41660 + }, + { + "epoch": 4.173686582861722, + "grad_norm": 2.4053850173950195, + "learning_rate": 3.308231866111591e-06, + "loss": 0.415, + "step": 41670 + }, + { + "epoch": 4.174688235588722, + "grad_norm": 2.2763030529022217, + "learning_rate": 3.300398918954978e-06, + "loss": 0.4713, + "step": 41680 + }, + { + "epoch": 4.175689888315721, + "grad_norm": 2.562437057495117, + "learning_rate": 3.2925746004718344e-06, + "loss": 0.4066, + "step": 41690 + }, + { + "epoch": 4.176691541042721, + "grad_norm": 1.9022046327590942, + "learning_rate": 3.284758913773442e-06, + "loss": 0.369, + "step": 41700 + }, + { + "epoch": 4.17769319376972, + "grad_norm": 1.930850625038147, + "learning_rate": 3.276951861967642e-06, + "loss": 0.4493, + "step": 41710 + }, + { + "epoch": 4.17869484649672, + "grad_norm": 2.1526312828063965, + "learning_rate": 3.269153448158846e-06, + "loss": 0.4159, + "step": 41720 + }, + { + "epoch": 4.179696499223719, + "grad_norm": 2.301459550857544, + "learning_rate": 3.261363675448037e-06, + "loss": 0.4742, + "step": 41730 + }, + { + "epoch": 4.180698151950718, + "grad_norm": 2.6906955242156982, + "learning_rate": 3.253582546932746e-06, + "loss": 0.4189, + "step": 41740 + }, + { + "epoch": 4.181699804677718, + "grad_norm": 1.7854713201522827, + "learning_rate": 3.2458100657070916e-06, + "loss": 0.4461, + "step": 41750 + }, + { + "epoch": 4.182701457404717, + "grad_norm": 2.6074023246765137, + "learning_rate": 3.2380462348617272e-06, + "loss": 0.4518, + "step": 41760 + }, + { + "epoch": 4.1837031101317175, + "grad_norm": 2.636751174926758, + "learning_rate": 3.230291057483875e-06, + "loss": 0.4189, + "step": 41770 + }, + { + "epoch": 4.184704762858717, + "grad_norm": 2.3315718173980713, + "learning_rate": 3.2225445366573376e-06, + "loss": 0.3592, + "step": 41780 + }, + { + "epoch": 4.185706415585717, + "grad_norm": 2.527083158493042, + "learning_rate": 3.214806675462442e-06, + "loss": 0.4188, + "step": 41790 + }, + { + "epoch": 4.186708068312716, + "grad_norm": 2.1696078777313232, + "learning_rate": 3.2070774769760892e-06, + "loss": 0.4505, + "step": 41800 + }, + { + "epoch": 4.187709721039716, + "grad_norm": 2.4959564208984375, + "learning_rate": 3.199356944271728e-06, + "loss": 0.4, + "step": 41810 + }, + { + "epoch": 4.188711373766715, + "grad_norm": 1.9425634145736694, + "learning_rate": 3.1916450804193865e-06, + "loss": 0.3721, + "step": 41820 + }, + { + "epoch": 4.189713026493715, + "grad_norm": 2.789562702178955, + "learning_rate": 3.1839418884856057e-06, + "loss": 0.3906, + "step": 41830 + }, + { + "epoch": 4.190714679220714, + "grad_norm": 2.2337393760681152, + "learning_rate": 3.176247371533503e-06, + "loss": 0.4394, + "step": 41840 + }, + { + "epoch": 4.191716331947713, + "grad_norm": 2.4459781646728516, + "learning_rate": 3.168561532622749e-06, + "loss": 0.4465, + "step": 41850 + }, + { + "epoch": 4.192717984674713, + "grad_norm": 2.3486621379852295, + "learning_rate": 3.1608843748095585e-06, + "loss": 0.3933, + "step": 41860 + }, + { + "epoch": 4.1937196374017125, + "grad_norm": 2.227614402770996, + "learning_rate": 3.1532159011466724e-06, + "loss": 0.4286, + "step": 41870 + }, + { + "epoch": 4.1947212901287125, + "grad_norm": 1.9478914737701416, + "learning_rate": 3.1455561146834178e-06, + "loss": 0.4396, + "step": 41880 + }, + { + "epoch": 4.195722942855712, + "grad_norm": 2.1462783813476562, + "learning_rate": 3.1379050184656477e-06, + "loss": 0.3786, + "step": 41890 + }, + { + "epoch": 4.196724595582712, + "grad_norm": 2.016328811645508, + "learning_rate": 3.1302626155357423e-06, + "loss": 0.462, + "step": 41900 + }, + { + "epoch": 4.197726248309711, + "grad_norm": 1.9448295831680298, + "learning_rate": 3.1226289089326593e-06, + "loss": 0.4233, + "step": 41910 + }, + { + "epoch": 4.198727901036711, + "grad_norm": 1.9269777536392212, + "learning_rate": 3.115003901691871e-06, + "loss": 0.3689, + "step": 41920 + }, + { + "epoch": 4.19972955376371, + "grad_norm": 2.250922679901123, + "learning_rate": 3.1073875968454096e-06, + "loss": 0.394, + "step": 41930 + }, + { + "epoch": 4.20073120649071, + "grad_norm": 2.7293055057525635, + "learning_rate": 3.099779997421831e-06, + "loss": 0.359, + "step": 41940 + }, + { + "epoch": 4.201732859217709, + "grad_norm": 2.0588979721069336, + "learning_rate": 3.0921811064462374e-06, + "loss": 0.352, + "step": 41950 + }, + { + "epoch": 4.202734511944708, + "grad_norm": 1.7878366708755493, + "learning_rate": 3.0845909269402756e-06, + "loss": 0.3826, + "step": 41960 + }, + { + "epoch": 4.203736164671708, + "grad_norm": 2.4124763011932373, + "learning_rate": 3.0770094619221036e-06, + "loss": 0.4297, + "step": 41970 + }, + { + "epoch": 4.2047378173987076, + "grad_norm": 1.2512212991714478, + "learning_rate": 3.0694367144064456e-06, + "loss": 0.4079, + "step": 41980 + }, + { + "epoch": 4.205739470125708, + "grad_norm": 1.5953153371810913, + "learning_rate": 3.061872687404538e-06, + "loss": 0.3754, + "step": 41990 + }, + { + "epoch": 4.206741122852707, + "grad_norm": 2.8178634643554688, + "learning_rate": 3.0543173839241546e-06, + "loss": 0.4456, + "step": 42000 + }, + { + "epoch": 4.207742775579707, + "grad_norm": 2.2212166786193848, + "learning_rate": 3.0467708069696037e-06, + "loss": 0.3952, + "step": 42010 + }, + { + "epoch": 4.208744428306706, + "grad_norm": 2.113723039627075, + "learning_rate": 3.039232959541724e-06, + "loss": 0.3944, + "step": 42020 + }, + { + "epoch": 4.209746081033706, + "grad_norm": 2.7201507091522217, + "learning_rate": 3.031703844637876e-06, + "loss": 0.3985, + "step": 42030 + }, + { + "epoch": 4.210747733760705, + "grad_norm": 2.516010284423828, + "learning_rate": 3.024183465251956e-06, + "loss": 0.4158, + "step": 42040 + }, + { + "epoch": 4.211749386487705, + "grad_norm": 2.367647647857666, + "learning_rate": 3.0166718243743785e-06, + "loss": 0.3951, + "step": 42050 + }, + { + "epoch": 4.212751039214704, + "grad_norm": 1.4118751287460327, + "learning_rate": 3.0091689249920923e-06, + "loss": 0.3785, + "step": 42060 + }, + { + "epoch": 4.213752691941703, + "grad_norm": 2.2263405323028564, + "learning_rate": 3.001674770088564e-06, + "loss": 0.3853, + "step": 42070 + }, + { + "epoch": 4.2147543446687035, + "grad_norm": 1.7431256771087646, + "learning_rate": 2.9941893626437817e-06, + "loss": 0.3816, + "step": 42080 + }, + { + "epoch": 4.215755997395703, + "grad_norm": 1.8744088411331177, + "learning_rate": 2.986712705634262e-06, + "loss": 0.4145, + "step": 42090 + }, + { + "epoch": 4.216757650122703, + "grad_norm": 1.928413987159729, + "learning_rate": 2.979244802033032e-06, + "loss": 0.3944, + "step": 42100 + }, + { + "epoch": 4.217759302849702, + "grad_norm": 2.396594762802124, + "learning_rate": 2.971785654809656e-06, + "loss": 0.4667, + "step": 42110 + }, + { + "epoch": 4.218760955576702, + "grad_norm": 2.273517608642578, + "learning_rate": 2.9643352669301933e-06, + "loss": 0.4142, + "step": 42120 + }, + { + "epoch": 4.219762608303701, + "grad_norm": 1.8810628652572632, + "learning_rate": 2.9568936413572363e-06, + "loss": 0.3943, + "step": 42130 + }, + { + "epoch": 4.220764261030701, + "grad_norm": 2.027733325958252, + "learning_rate": 2.9494607810498786e-06, + "loss": 0.4632, + "step": 42140 + }, + { + "epoch": 4.2217659137577, + "grad_norm": 2.457437753677368, + "learning_rate": 2.9420366889637592e-06, + "loss": 0.439, + "step": 42150 + }, + { + "epoch": 4.2227675664847, + "grad_norm": 2.446218252182007, + "learning_rate": 2.9346213680509882e-06, + "loss": 0.3905, + "step": 42160 + }, + { + "epoch": 4.223769219211699, + "grad_norm": 2.112877368927002, + "learning_rate": 2.927214821260213e-06, + "loss": 0.3998, + "step": 42170 + }, + { + "epoch": 4.2247708719386985, + "grad_norm": 2.958712577819824, + "learning_rate": 2.9198170515366023e-06, + "loss": 0.4534, + "step": 42180 + }, + { + "epoch": 4.2257725246656985, + "grad_norm": 2.4029061794281006, + "learning_rate": 2.9124280618218035e-06, + "loss": 0.4766, + "step": 42190 + }, + { + "epoch": 4.226774177392698, + "grad_norm": 2.6346077919006348, + "learning_rate": 2.905047855053991e-06, + "loss": 0.4245, + "step": 42200 + }, + { + "epoch": 4.227775830119698, + "grad_norm": 2.153205156326294, + "learning_rate": 2.8976764341678536e-06, + "loss": 0.4395, + "step": 42210 + }, + { + "epoch": 4.228777482846697, + "grad_norm": 1.5752555131912231, + "learning_rate": 2.8903138020945797e-06, + "loss": 0.3512, + "step": 42220 + }, + { + "epoch": 4.229779135573697, + "grad_norm": 2.1180450916290283, + "learning_rate": 2.882959961761847e-06, + "loss": 0.4136, + "step": 42230 + }, + { + "epoch": 4.230780788300696, + "grad_norm": 2.0195274353027344, + "learning_rate": 2.8756149160938635e-06, + "loss": 0.42, + "step": 42240 + }, + { + "epoch": 4.231782441027696, + "grad_norm": 2.2286198139190674, + "learning_rate": 2.8682786680113314e-06, + "loss": 0.42, + "step": 42250 + }, + { + "epoch": 4.232784093754695, + "grad_norm": 2.2767393589019775, + "learning_rate": 2.860951220431435e-06, + "loss": 0.4883, + "step": 42260 + }, + { + "epoch": 4.233785746481694, + "grad_norm": 2.395681858062744, + "learning_rate": 2.8536325762678895e-06, + "loss": 0.4004, + "step": 42270 + }, + { + "epoch": 4.234787399208694, + "grad_norm": 2.077942371368408, + "learning_rate": 2.8463227384308898e-06, + "loss": 0.4004, + "step": 42280 + }, + { + "epoch": 4.2357890519356935, + "grad_norm": 2.383152723312378, + "learning_rate": 2.8390217098271414e-06, + "loss": 0.422, + "step": 42290 + }, + { + "epoch": 4.236790704662694, + "grad_norm": 1.919995903968811, + "learning_rate": 2.8317294933598277e-06, + "loss": 0.391, + "step": 42300 + }, + { + "epoch": 4.237792357389693, + "grad_norm": 2.3750290870666504, + "learning_rate": 2.824446091928651e-06, + "loss": 0.4244, + "step": 42310 + }, + { + "epoch": 4.238794010116693, + "grad_norm": 2.0089025497436523, + "learning_rate": 2.8171715084297916e-06, + "loss": 0.381, + "step": 42320 + }, + { + "epoch": 4.239795662843692, + "grad_norm": 2.8085126876831055, + "learning_rate": 2.809905745755936e-06, + "loss": 0.3842, + "step": 42330 + }, + { + "epoch": 4.240797315570692, + "grad_norm": 2.5227251052856445, + "learning_rate": 2.8026488067962515e-06, + "loss": 0.4056, + "step": 42340 + }, + { + "epoch": 4.241798968297691, + "grad_norm": 2.0635979175567627, + "learning_rate": 2.7954006944364063e-06, + "loss": 0.4036, + "step": 42350 + }, + { + "epoch": 4.242800621024691, + "grad_norm": 2.093088150024414, + "learning_rate": 2.7881614115585485e-06, + "loss": 0.3837, + "step": 42360 + }, + { + "epoch": 4.24380227375169, + "grad_norm": 3.117953300476074, + "learning_rate": 2.780930961041328e-06, + "loss": 0.414, + "step": 42370 + }, + { + "epoch": 4.244803926478689, + "grad_norm": 2.0712902545928955, + "learning_rate": 2.77370934575987e-06, + "loss": 0.4608, + "step": 42380 + }, + { + "epoch": 4.2458055792056895, + "grad_norm": 2.027517080307007, + "learning_rate": 2.766496568585797e-06, + "loss": 0.4196, + "step": 42390 + }, + { + "epoch": 4.246807231932689, + "grad_norm": 2.0849087238311768, + "learning_rate": 2.7592926323872088e-06, + "loss": 0.4262, + "step": 42400 + }, + { + "epoch": 4.247808884659689, + "grad_norm": 1.7223390340805054, + "learning_rate": 2.7520975400286973e-06, + "loss": 0.3834, + "step": 42410 + }, + { + "epoch": 4.248810537386688, + "grad_norm": 2.2863972187042236, + "learning_rate": 2.7449112943713272e-06, + "loss": 0.4206, + "step": 42420 + }, + { + "epoch": 4.249812190113688, + "grad_norm": 1.9107893705368042, + "learning_rate": 2.73773389827266e-06, + "loss": 0.4077, + "step": 42430 + }, + { + "epoch": 4.250813842840687, + "grad_norm": 1.7265775203704834, + "learning_rate": 2.730565354586723e-06, + "loss": 0.4015, + "step": 42440 + }, + { + "epoch": 4.251815495567687, + "grad_norm": 2.400996208190918, + "learning_rate": 2.7234056661640335e-06, + "loss": 0.4373, + "step": 42450 + }, + { + "epoch": 4.252817148294686, + "grad_norm": 2.8759820461273193, + "learning_rate": 2.7162548358515865e-06, + "loss": 0.4431, + "step": 42460 + }, + { + "epoch": 4.253818801021686, + "grad_norm": 2.009953737258911, + "learning_rate": 2.709112866492847e-06, + "loss": 0.3766, + "step": 42470 + }, + { + "epoch": 4.254820453748685, + "grad_norm": 2.1563308238983154, + "learning_rate": 2.7019797609277696e-06, + "loss": 0.4626, + "step": 42480 + }, + { + "epoch": 4.2558221064756845, + "grad_norm": 2.7688074111938477, + "learning_rate": 2.69485552199277e-06, + "loss": 0.4418, + "step": 42490 + }, + { + "epoch": 4.2568237592026845, + "grad_norm": 2.4989662170410156, + "learning_rate": 2.6877401525207458e-06, + "loss": 0.39, + "step": 42500 + }, + { + "epoch": 4.257825411929684, + "grad_norm": 2.268092393875122, + "learning_rate": 2.6806336553410777e-06, + "loss": 0.4315, + "step": 42510 + }, + { + "epoch": 4.258827064656684, + "grad_norm": 2.241440773010254, + "learning_rate": 2.6735360332795917e-06, + "loss": 0.4064, + "step": 42520 + }, + { + "epoch": 4.259828717383683, + "grad_norm": 2.1825673580169678, + "learning_rate": 2.6664472891586077e-06, + "loss": 0.3797, + "step": 42530 + }, + { + "epoch": 4.260830370110683, + "grad_norm": 2.2951955795288086, + "learning_rate": 2.659367425796916e-06, + "loss": 0.4349, + "step": 42540 + }, + { + "epoch": 4.261832022837682, + "grad_norm": 1.7518614530563354, + "learning_rate": 2.6522964460097553e-06, + "loss": 0.3917, + "step": 42550 + }, + { + "epoch": 4.262833675564682, + "grad_norm": 2.920267343521118, + "learning_rate": 2.645234352608847e-06, + "loss": 0.412, + "step": 42560 + }, + { + "epoch": 4.263835328291681, + "grad_norm": 2.29345965385437, + "learning_rate": 2.638181148402383e-06, + "loss": 0.3755, + "step": 42570 + }, + { + "epoch": 4.26483698101868, + "grad_norm": 2.323241710662842, + "learning_rate": 2.631136836195014e-06, + "loss": 0.4501, + "step": 42580 + }, + { + "epoch": 4.26583863374568, + "grad_norm": 2.2796177864074707, + "learning_rate": 2.6241014187878417e-06, + "loss": 0.3953, + "step": 42590 + }, + { + "epoch": 4.2668402864726795, + "grad_norm": 2.3419172763824463, + "learning_rate": 2.6170748989784583e-06, + "loss": 0.3953, + "step": 42600 + }, + { + "epoch": 4.26784193919968, + "grad_norm": 2.1087424755096436, + "learning_rate": 2.610057279560896e-06, + "loss": 0.4198, + "step": 42610 + }, + { + "epoch": 4.268843591926679, + "grad_norm": 1.8118137121200562, + "learning_rate": 2.603048563325661e-06, + "loss": 0.4031, + "step": 42620 + }, + { + "epoch": 4.269845244653679, + "grad_norm": 2.0749223232269287, + "learning_rate": 2.5960487530597018e-06, + "loss": 0.4355, + "step": 42630 + }, + { + "epoch": 4.270846897380678, + "grad_norm": 2.66550612449646, + "learning_rate": 2.5890578515464476e-06, + "loss": 0.421, + "step": 42640 + }, + { + "epoch": 4.271848550107678, + "grad_norm": 1.960179090499878, + "learning_rate": 2.5820758615657776e-06, + "loss": 0.3849, + "step": 42650 + }, + { + "epoch": 4.272850202834677, + "grad_norm": 2.837324619293213, + "learning_rate": 2.575102785894007e-06, + "loss": 0.4429, + "step": 42660 + }, + { + "epoch": 4.273851855561677, + "grad_norm": 1.9536443948745728, + "learning_rate": 2.5681386273039426e-06, + "loss": 0.3737, + "step": 42670 + }, + { + "epoch": 4.274853508288676, + "grad_norm": 1.969543695449829, + "learning_rate": 2.5611833885648194e-06, + "loss": 0.4493, + "step": 42680 + }, + { + "epoch": 4.275855161015675, + "grad_norm": 2.2718586921691895, + "learning_rate": 2.554237072442331e-06, + "loss": 0.3987, + "step": 42690 + }, + { + "epoch": 4.2768568137426755, + "grad_norm": 2.5806398391723633, + "learning_rate": 2.5472996816986283e-06, + "loss": 0.4017, + "step": 42700 + }, + { + "epoch": 4.277858466469675, + "grad_norm": 1.761518120765686, + "learning_rate": 2.540371219092305e-06, + "loss": 0.3936, + "step": 42710 + }, + { + "epoch": 4.278860119196675, + "grad_norm": 2.4270286560058594, + "learning_rate": 2.533451687378413e-06, + "loss": 0.3815, + "step": 42720 + }, + { + "epoch": 4.279861771923674, + "grad_norm": 2.0313477516174316, + "learning_rate": 2.5265410893084485e-06, + "loss": 0.4022, + "step": 42730 + }, + { + "epoch": 4.280863424650674, + "grad_norm": 1.7014490365982056, + "learning_rate": 2.519639427630352e-06, + "loss": 0.4044, + "step": 42740 + }, + { + "epoch": 4.281865077377673, + "grad_norm": 2.5806448459625244, + "learning_rate": 2.51274670508852e-06, + "loss": 0.4435, + "step": 42750 + }, + { + "epoch": 4.282866730104673, + "grad_norm": 2.133436918258667, + "learning_rate": 2.505862924423785e-06, + "loss": 0.3617, + "step": 42760 + }, + { + "epoch": 4.283868382831672, + "grad_norm": 2.5142359733581543, + "learning_rate": 2.4989880883734273e-06, + "loss": 0.4125, + "step": 42770 + }, + { + "epoch": 4.284870035558672, + "grad_norm": 1.8463962078094482, + "learning_rate": 2.4921221996711707e-06, + "loss": 0.4152, + "step": 42780 + }, + { + "epoch": 4.285871688285671, + "grad_norm": 2.2918601036071777, + "learning_rate": 2.4852652610471842e-06, + "loss": 0.4054, + "step": 42790 + }, + { + "epoch": 4.2868733410126705, + "grad_norm": 2.54331374168396, + "learning_rate": 2.4784172752280733e-06, + "loss": 0.3861, + "step": 42800 + }, + { + "epoch": 4.2878749937396705, + "grad_norm": 1.8478418588638306, + "learning_rate": 2.471578244936881e-06, + "loss": 0.4161, + "step": 42810 + }, + { + "epoch": 4.28887664646667, + "grad_norm": 2.231584072113037, + "learning_rate": 2.4647481728931e-06, + "loss": 0.3572, + "step": 42820 + }, + { + "epoch": 4.28987829919367, + "grad_norm": 2.3038971424102783, + "learning_rate": 2.457927061812648e-06, + "loss": 0.3729, + "step": 42830 + }, + { + "epoch": 4.290879951920669, + "grad_norm": 1.7609002590179443, + "learning_rate": 2.45111491440789e-06, + "loss": 0.3543, + "step": 42840 + }, + { + "epoch": 4.291881604647669, + "grad_norm": 2.5485479831695557, + "learning_rate": 2.4443117333876194e-06, + "loss": 0.358, + "step": 42850 + }, + { + "epoch": 4.292883257374668, + "grad_norm": 2.667595148086548, + "learning_rate": 2.437517521457061e-06, + "loss": 0.3777, + "step": 42860 + }, + { + "epoch": 4.293884910101668, + "grad_norm": 2.162569522857666, + "learning_rate": 2.430732281317899e-06, + "loss": 0.4588, + "step": 42870 + }, + { + "epoch": 4.294886562828667, + "grad_norm": 2.4120516777038574, + "learning_rate": 2.4239560156682105e-06, + "loss": 0.4066, + "step": 42880 + }, + { + "epoch": 4.295888215555667, + "grad_norm": 2.75325345993042, + "learning_rate": 2.417188727202524e-06, + "loss": 0.4154, + "step": 42890 + }, + { + "epoch": 4.296889868282666, + "grad_norm": 1.9056499004364014, + "learning_rate": 2.4104304186118088e-06, + "loss": 0.4268, + "step": 42900 + }, + { + "epoch": 4.2978915210096655, + "grad_norm": 2.1397392749786377, + "learning_rate": 2.403681092583454e-06, + "loss": 0.3723, + "step": 42910 + }, + { + "epoch": 4.298893173736666, + "grad_norm": 2.5149714946746826, + "learning_rate": 2.3969407518012576e-06, + "loss": 0.5002, + "step": 42920 + }, + { + "epoch": 4.299894826463665, + "grad_norm": 1.9185551404953003, + "learning_rate": 2.390209398945478e-06, + "loss": 0.4161, + "step": 42930 + }, + { + "epoch": 4.300896479190665, + "grad_norm": 2.358262777328491, + "learning_rate": 2.383487036692786e-06, + "loss": 0.4237, + "step": 42940 + }, + { + "epoch": 4.301898131917664, + "grad_norm": 1.9108012914657593, + "learning_rate": 2.376773667716262e-06, + "loss": 0.4023, + "step": 42950 + }, + { + "epoch": 4.302899784644664, + "grad_norm": 2.148002862930298, + "learning_rate": 2.3700692946854286e-06, + "loss": 0.4459, + "step": 42960 + }, + { + "epoch": 4.303901437371663, + "grad_norm": 1.83115816116333, + "learning_rate": 2.363373920266229e-06, + "loss": 0.4049, + "step": 42970 + }, + { + "epoch": 4.304903090098663, + "grad_norm": 2.1353845596313477, + "learning_rate": 2.356687547121034e-06, + "loss": 0.4329, + "step": 42980 + }, + { + "epoch": 4.305904742825662, + "grad_norm": 2.241957902908325, + "learning_rate": 2.350010177908604e-06, + "loss": 0.3842, + "step": 42990 + }, + { + "epoch": 4.306906395552662, + "grad_norm": 2.3134796619415283, + "learning_rate": 2.343341815284164e-06, + "loss": 0.4726, + "step": 43000 + }, + { + "epoch": 4.3079080482796615, + "grad_norm": 1.6615601778030396, + "learning_rate": 2.336682461899328e-06, + "loss": 0.4071, + "step": 43010 + }, + { + "epoch": 4.308909701006661, + "grad_norm": 1.741769552230835, + "learning_rate": 2.3300321204021257e-06, + "loss": 0.331, + "step": 43020 + }, + { + "epoch": 4.309911353733661, + "grad_norm": 2.2427818775177, + "learning_rate": 2.323390793437022e-06, + "loss": 0.386, + "step": 43030 + }, + { + "epoch": 4.31091300646066, + "grad_norm": 3.0531322956085205, + "learning_rate": 2.3167584836448875e-06, + "loss": 0.3896, + "step": 43040 + }, + { + "epoch": 4.31191465918766, + "grad_norm": 1.993509292602539, + "learning_rate": 2.3101351936630047e-06, + "loss": 0.3983, + "step": 43050 + }, + { + "epoch": 4.312916311914659, + "grad_norm": 2.2616026401519775, + "learning_rate": 2.3035209261250716e-06, + "loss": 0.4106, + "step": 43060 + }, + { + "epoch": 4.313917964641659, + "grad_norm": 2.288247585296631, + "learning_rate": 2.296915683661202e-06, + "loss": 0.3551, + "step": 43070 + }, + { + "epoch": 4.314919617368658, + "grad_norm": 1.8383212089538574, + "learning_rate": 2.290319468897917e-06, + "loss": 0.4036, + "step": 43080 + }, + { + "epoch": 4.315921270095658, + "grad_norm": 2.345820665359497, + "learning_rate": 2.2837322844581454e-06, + "loss": 0.4122, + "step": 43090 + }, + { + "epoch": 4.316922922822657, + "grad_norm": 1.7840343713760376, + "learning_rate": 2.2771541329612317e-06, + "loss": 0.3799, + "step": 43100 + }, + { + "epoch": 4.317924575549657, + "grad_norm": 1.8226646184921265, + "learning_rate": 2.2705850170229246e-06, + "loss": 0.4293, + "step": 43110 + }, + { + "epoch": 4.3189262282766565, + "grad_norm": 1.8495908975601196, + "learning_rate": 2.2640249392553823e-06, + "loss": 0.3918, + "step": 43120 + }, + { + "epoch": 4.319927881003656, + "grad_norm": 2.4388835430145264, + "learning_rate": 2.257473902267165e-06, + "loss": 0.4003, + "step": 43130 + }, + { + "epoch": 4.320929533730656, + "grad_norm": 1.8674567937850952, + "learning_rate": 2.2509319086632425e-06, + "loss": 0.381, + "step": 43140 + }, + { + "epoch": 4.321931186457655, + "grad_norm": 1.9195691347122192, + "learning_rate": 2.2443989610449855e-06, + "loss": 0.383, + "step": 43150 + }, + { + "epoch": 4.322932839184655, + "grad_norm": 2.5186662673950195, + "learning_rate": 2.2378750620101667e-06, + "loss": 0.3618, + "step": 43160 + }, + { + "epoch": 4.323934491911654, + "grad_norm": 1.8908497095108032, + "learning_rate": 2.2313602141529668e-06, + "loss": 0.4159, + "step": 43170 + }, + { + "epoch": 4.324936144638654, + "grad_norm": 2.380554437637329, + "learning_rate": 2.22485442006396e-06, + "loss": 0.4289, + "step": 43180 + }, + { + "epoch": 4.325937797365653, + "grad_norm": 2.3493990898132324, + "learning_rate": 2.218357682330119e-06, + "loss": 0.4513, + "step": 43190 + }, + { + "epoch": 4.326939450092653, + "grad_norm": 2.776486396789551, + "learning_rate": 2.2118700035348328e-06, + "loss": 0.3659, + "step": 43200 + }, + { + "epoch": 4.327941102819652, + "grad_norm": 2.014625310897827, + "learning_rate": 2.2053913862578656e-06, + "loss": 0.3878, + "step": 43210 + }, + { + "epoch": 4.328942755546652, + "grad_norm": 2.328507423400879, + "learning_rate": 2.198921833075385e-06, + "loss": 0.4097, + "step": 43220 + }, + { + "epoch": 4.329944408273652, + "grad_norm": 2.105379104614258, + "learning_rate": 2.192461346559968e-06, + "loss": 0.3963, + "step": 43230 + }, + { + "epoch": 4.330946061000651, + "grad_norm": 2.268242120742798, + "learning_rate": 2.1860099292805664e-06, + "loss": 0.4335, + "step": 43240 + }, + { + "epoch": 4.331947713727651, + "grad_norm": 1.6855250597000122, + "learning_rate": 2.1795675838025333e-06, + "loss": 0.3711, + "step": 43250 + }, + { + "epoch": 4.33294936645465, + "grad_norm": 2.5439865589141846, + "learning_rate": 2.1731343126876276e-06, + "loss": 0.363, + "step": 43260 + }, + { + "epoch": 4.33395101918165, + "grad_norm": 1.8600542545318604, + "learning_rate": 2.1667101184939837e-06, + "loss": 0.3719, + "step": 43270 + }, + { + "epoch": 4.334952671908649, + "grad_norm": 2.798398494720459, + "learning_rate": 2.160295003776125e-06, + "loss": 0.3651, + "step": 43280 + }, + { + "epoch": 4.335954324635649, + "grad_norm": 1.7904213666915894, + "learning_rate": 2.153888971084969e-06, + "loss": 0.4151, + "step": 43290 + }, + { + "epoch": 4.336955977362648, + "grad_norm": 2.4006080627441406, + "learning_rate": 2.1474920229678396e-06, + "loss": 0.4356, + "step": 43300 + }, + { + "epoch": 4.337957630089648, + "grad_norm": 2.7240309715270996, + "learning_rate": 2.1411041619684186e-06, + "loss": 0.4338, + "step": 43310 + }, + { + "epoch": 4.3389592828166474, + "grad_norm": 1.9607129096984863, + "learning_rate": 2.134725390626785e-06, + "loss": 0.3561, + "step": 43320 + }, + { + "epoch": 4.3399609355436475, + "grad_norm": 2.6184070110321045, + "learning_rate": 2.1283557114794183e-06, + "loss": 0.4381, + "step": 43330 + }, + { + "epoch": 4.340962588270647, + "grad_norm": 1.8792089223861694, + "learning_rate": 2.1219951270591703e-06, + "loss": 0.3831, + "step": 43340 + }, + { + "epoch": 4.341964240997646, + "grad_norm": 1.6819710731506348, + "learning_rate": 2.1156436398952623e-06, + "loss": 0.3814, + "step": 43350 + }, + { + "epoch": 4.342965893724646, + "grad_norm": 2.5652620792388916, + "learning_rate": 2.1093012525133237e-06, + "loss": 0.4009, + "step": 43360 + }, + { + "epoch": 4.343967546451645, + "grad_norm": 1.6315181255340576, + "learning_rate": 2.102967967435354e-06, + "loss": 0.4335, + "step": 43370 + }, + { + "epoch": 4.344969199178645, + "grad_norm": 2.178917407989502, + "learning_rate": 2.0966437871797333e-06, + "loss": 0.4018, + "step": 43380 + }, + { + "epoch": 4.345970851905644, + "grad_norm": 1.893080711364746, + "learning_rate": 2.0903287142612193e-06, + "loss": 0.3939, + "step": 43390 + }, + { + "epoch": 4.346972504632644, + "grad_norm": 2.0967724323272705, + "learning_rate": 2.0840227511909504e-06, + "loss": 0.4255, + "step": 43400 + }, + { + "epoch": 4.347974157359643, + "grad_norm": 2.593022108078003, + "learning_rate": 2.077725900476446e-06, + "loss": 0.3903, + "step": 43410 + }, + { + "epoch": 4.348975810086643, + "grad_norm": 1.952826738357544, + "learning_rate": 2.071438164621595e-06, + "loss": 0.415, + "step": 43420 + }, + { + "epoch": 4.3499774628136425, + "grad_norm": 1.9698874950408936, + "learning_rate": 2.065159546126666e-06, + "loss": 0.3708, + "step": 43430 + }, + { + "epoch": 4.350979115540642, + "grad_norm": 2.352921962738037, + "learning_rate": 2.0588900474883017e-06, + "loss": 0.4387, + "step": 43440 + }, + { + "epoch": 4.351980768267642, + "grad_norm": 2.0334842205047607, + "learning_rate": 2.0526296711995184e-06, + "loss": 0.4526, + "step": 43450 + }, + { + "epoch": 4.352982420994641, + "grad_norm": 2.4401466846466064, + "learning_rate": 2.046378419749706e-06, + "loss": 0.3585, + "step": 43460 + }, + { + "epoch": 4.353984073721641, + "grad_norm": 2.077836751937866, + "learning_rate": 2.0401362956246256e-06, + "loss": 0.4503, + "step": 43470 + }, + { + "epoch": 4.35498572644864, + "grad_norm": 2.017932415008545, + "learning_rate": 2.033903301306403e-06, + "loss": 0.3723, + "step": 43480 + }, + { + "epoch": 4.35598737917564, + "grad_norm": 2.4183194637298584, + "learning_rate": 2.0276794392735444e-06, + "loss": 0.4098, + "step": 43490 + }, + { + "epoch": 4.356989031902639, + "grad_norm": 2.4014265537261963, + "learning_rate": 2.0214647120009173e-06, + "loss": 0.4118, + "step": 43500 + }, + { + "epoch": 4.357990684629639, + "grad_norm": 2.41623854637146, + "learning_rate": 2.015259121959759e-06, + "loss": 0.4008, + "step": 43510 + }, + { + "epoch": 4.358992337356638, + "grad_norm": 2.6848697662353516, + "learning_rate": 2.00906267161767e-06, + "loss": 0.4053, + "step": 43520 + }, + { + "epoch": 4.359993990083638, + "grad_norm": 2.032064437866211, + "learning_rate": 2.002875363438622e-06, + "loss": 0.3978, + "step": 43530 + }, + { + "epoch": 4.360995642810638, + "grad_norm": 1.95393967628479, + "learning_rate": 1.9966971998829463e-06, + "loss": 0.3873, + "step": 43540 + }, + { + "epoch": 4.361997295537637, + "grad_norm": 2.1058743000030518, + "learning_rate": 1.990528183407339e-06, + "loss": 0.4167, + "step": 43550 + }, + { + "epoch": 4.362998948264637, + "grad_norm": 2.3774189949035645, + "learning_rate": 1.984368316464874e-06, + "loss": 0.3911, + "step": 43560 + }, + { + "epoch": 4.364000600991636, + "grad_norm": 1.8962682485580444, + "learning_rate": 1.978217601504956e-06, + "loss": 0.3933, + "step": 43570 + }, + { + "epoch": 4.365002253718636, + "grad_norm": 2.469862937927246, + "learning_rate": 1.972076040973372e-06, + "loss": 0.4414, + "step": 43580 + }, + { + "epoch": 4.366003906445635, + "grad_norm": 2.292123794555664, + "learning_rate": 1.965943637312276e-06, + "loss": 0.3814, + "step": 43590 + }, + { + "epoch": 4.367005559172635, + "grad_norm": 2.4966185092926025, + "learning_rate": 1.9598203929601573e-06, + "loss": 0.4514, + "step": 43600 + }, + { + "epoch": 4.368007211899634, + "grad_norm": 2.058504104614258, + "learning_rate": 1.9537063103518772e-06, + "loss": 0.3639, + "step": 43610 + }, + { + "epoch": 4.369008864626634, + "grad_norm": 2.378732919692993, + "learning_rate": 1.947601391918649e-06, + "loss": 0.3669, + "step": 43620 + }, + { + "epoch": 4.370010517353633, + "grad_norm": 2.036379337310791, + "learning_rate": 1.9415056400880593e-06, + "loss": 0.3781, + "step": 43630 + }, + { + "epoch": 4.371012170080633, + "grad_norm": 2.2540555000305176, + "learning_rate": 1.9354190572840187e-06, + "loss": 0.4094, + "step": 43640 + }, + { + "epoch": 4.372013822807633, + "grad_norm": 1.785356879234314, + "learning_rate": 1.9293416459268134e-06, + "loss": 0.4068, + "step": 43650 + }, + { + "epoch": 4.373015475534632, + "grad_norm": 2.5779080390930176, + "learning_rate": 1.9232734084330824e-06, + "loss": 0.4351, + "step": 43660 + }, + { + "epoch": 4.374017128261632, + "grad_norm": 1.8296396732330322, + "learning_rate": 1.9172143472158122e-06, + "loss": 0.3672, + "step": 43670 + }, + { + "epoch": 4.375018780988631, + "grad_norm": 2.4608004093170166, + "learning_rate": 1.9111644646843284e-06, + "loss": 0.4167, + "step": 43680 + }, + { + "epoch": 4.376020433715631, + "grad_norm": 1.9104523658752441, + "learning_rate": 1.905123763244329e-06, + "loss": 0.375, + "step": 43690 + }, + { + "epoch": 4.37702208644263, + "grad_norm": 1.8407065868377686, + "learning_rate": 1.8990922452978565e-06, + "loss": 0.4115, + "step": 43700 + }, + { + "epoch": 4.37802373916963, + "grad_norm": 1.947320818901062, + "learning_rate": 1.8930699132432784e-06, + "loss": 0.4018, + "step": 43710 + }, + { + "epoch": 4.379025391896629, + "grad_norm": 2.074536085128784, + "learning_rate": 1.88705676947534e-06, + "loss": 0.4216, + "step": 43720 + }, + { + "epoch": 4.380027044623629, + "grad_norm": 2.215348958969116, + "learning_rate": 1.8810528163851177e-06, + "loss": 0.436, + "step": 43730 + }, + { + "epoch": 4.3810286973506285, + "grad_norm": 2.0011367797851562, + "learning_rate": 1.8750580563600351e-06, + "loss": 0.3784, + "step": 43740 + }, + { + "epoch": 4.382030350077628, + "grad_norm": 2.45017409324646, + "learning_rate": 1.8690724917838603e-06, + "loss": 0.3976, + "step": 43750 + }, + { + "epoch": 4.383032002804628, + "grad_norm": 2.329396963119507, + "learning_rate": 1.8630961250367062e-06, + "loss": 0.3553, + "step": 43760 + }, + { + "epoch": 4.384033655531627, + "grad_norm": 2.6597559452056885, + "learning_rate": 1.8571289584950302e-06, + "loss": 0.4332, + "step": 43770 + }, + { + "epoch": 4.385035308258627, + "grad_norm": 2.527765989303589, + "learning_rate": 1.8511709945316146e-06, + "loss": 0.4534, + "step": 43780 + }, + { + "epoch": 4.386036960985626, + "grad_norm": 2.524082899093628, + "learning_rate": 1.8452222355156146e-06, + "loss": 0.3753, + "step": 43790 + }, + { + "epoch": 4.387038613712626, + "grad_norm": 2.056520938873291, + "learning_rate": 1.8392826838124961e-06, + "loss": 0.4125, + "step": 43800 + }, + { + "epoch": 4.388040266439625, + "grad_norm": 1.816526174545288, + "learning_rate": 1.8333523417840782e-06, + "loss": 0.4369, + "step": 43810 + }, + { + "epoch": 4.389041919166625, + "grad_norm": 2.039801836013794, + "learning_rate": 1.8274312117885135e-06, + "loss": 0.3958, + "step": 43820 + }, + { + "epoch": 4.390043571893624, + "grad_norm": 2.485262155532837, + "learning_rate": 1.8215192961802907e-06, + "loss": 0.3797, + "step": 43830 + }, + { + "epoch": 4.391045224620624, + "grad_norm": 3.454650402069092, + "learning_rate": 1.8156165973102379e-06, + "loss": 0.3839, + "step": 43840 + }, + { + "epoch": 4.392046877347624, + "grad_norm": 2.490710735321045, + "learning_rate": 1.8097231175255163e-06, + "loss": 0.4095, + "step": 43850 + }, + { + "epoch": 4.393048530074623, + "grad_norm": 1.9275168180465698, + "learning_rate": 1.803838859169621e-06, + "loss": 0.3915, + "step": 43860 + }, + { + "epoch": 4.394050182801623, + "grad_norm": 1.8556205034255981, + "learning_rate": 1.7979638245823771e-06, + "loss": 0.457, + "step": 43870 + }, + { + "epoch": 4.395051835528622, + "grad_norm": 2.13421893119812, + "learning_rate": 1.7920980160999496e-06, + "loss": 0.4821, + "step": 43880 + }, + { + "epoch": 4.396053488255622, + "grad_norm": 1.8426682949066162, + "learning_rate": 1.786241436054828e-06, + "loss": 0.3621, + "step": 43890 + }, + { + "epoch": 4.397055140982621, + "grad_norm": 2.002091884613037, + "learning_rate": 1.7803940867758384e-06, + "loss": 0.4091, + "step": 43900 + }, + { + "epoch": 4.398056793709621, + "grad_norm": 2.000837564468384, + "learning_rate": 1.7745559705881236e-06, + "loss": 0.3843, + "step": 43910 + }, + { + "epoch": 4.39905844643662, + "grad_norm": 2.5199735164642334, + "learning_rate": 1.7687270898131796e-06, + "loss": 0.4583, + "step": 43920 + }, + { + "epoch": 4.40006009916362, + "grad_norm": 2.1583092212677, + "learning_rate": 1.7629074467687995e-06, + "loss": 0.3926, + "step": 43930 + }, + { + "epoch": 4.401061751890619, + "grad_norm": 1.873566746711731, + "learning_rate": 1.7570970437691238e-06, + "loss": 0.4649, + "step": 43940 + }, + { + "epoch": 4.4020634046176195, + "grad_norm": 1.9938627481460571, + "learning_rate": 1.7512958831246096e-06, + "loss": 0.4128, + "step": 43950 + }, + { + "epoch": 4.403065057344619, + "grad_norm": 2.516998529434204, + "learning_rate": 1.7455039671420537e-06, + "loss": 0.3994, + "step": 43960 + }, + { + "epoch": 4.404066710071618, + "grad_norm": 1.9853394031524658, + "learning_rate": 1.7397212981245526e-06, + "loss": 0.4124, + "step": 43970 + }, + { + "epoch": 4.405068362798618, + "grad_norm": 2.252943515777588, + "learning_rate": 1.733947878371539e-06, + "loss": 0.3939, + "step": 43980 + }, + { + "epoch": 4.406070015525617, + "grad_norm": 1.9540263414382935, + "learning_rate": 1.7281837101787797e-06, + "loss": 0.3688, + "step": 43990 + }, + { + "epoch": 4.407071668252617, + "grad_norm": 1.7471299171447754, + "learning_rate": 1.722428795838335e-06, + "loss": 0.3401, + "step": 44000 + }, + { + "epoch": 4.408073320979616, + "grad_norm": 1.9910095930099487, + "learning_rate": 1.7166831376386084e-06, + "loss": 0.4205, + "step": 44010 + }, + { + "epoch": 4.409074973706616, + "grad_norm": 2.40335750579834, + "learning_rate": 1.7109467378643135e-06, + "loss": 0.4245, + "step": 44020 + }, + { + "epoch": 4.410076626433615, + "grad_norm": 2.117269515991211, + "learning_rate": 1.7052195987964898e-06, + "loss": 0.3946, + "step": 44030 + }, + { + "epoch": 4.411078279160615, + "grad_norm": 1.9731988906860352, + "learning_rate": 1.6995017227124766e-06, + "loss": 0.3306, + "step": 44040 + }, + { + "epoch": 4.4120799318876145, + "grad_norm": 2.510343551635742, + "learning_rate": 1.6937931118859523e-06, + "loss": 0.4009, + "step": 44050 + }, + { + "epoch": 4.4130815846146145, + "grad_norm": 2.3110790252685547, + "learning_rate": 1.6880937685869013e-06, + "loss": 0.4243, + "step": 44060 + }, + { + "epoch": 4.414083237341614, + "grad_norm": 2.2889299392700195, + "learning_rate": 1.682403695081608e-06, + "loss": 0.3852, + "step": 44070 + }, + { + "epoch": 4.415084890068613, + "grad_norm": 2.237353563308716, + "learning_rate": 1.6767228936326984e-06, + "loss": 0.3906, + "step": 44080 + }, + { + "epoch": 4.416086542795613, + "grad_norm": 2.174852132797241, + "learning_rate": 1.6710513664990961e-06, + "loss": 0.3828, + "step": 44090 + }, + { + "epoch": 4.417088195522612, + "grad_norm": 2.5690200328826904, + "learning_rate": 1.6653891159360391e-06, + "loss": 0.4048, + "step": 44100 + }, + { + "epoch": 4.418089848249612, + "grad_norm": 2.094517230987549, + "learning_rate": 1.659736144195065e-06, + "loss": 0.4225, + "step": 44110 + }, + { + "epoch": 4.419091500976611, + "grad_norm": 2.416579246520996, + "learning_rate": 1.654092453524045e-06, + "loss": 0.4436, + "step": 44120 + }, + { + "epoch": 4.420093153703611, + "grad_norm": 2.3031387329101562, + "learning_rate": 1.648458046167145e-06, + "loss": 0.4566, + "step": 44130 + }, + { + "epoch": 4.42109480643061, + "grad_norm": 2.294541358947754, + "learning_rate": 1.64283292436484e-06, + "loss": 0.4522, + "step": 44140 + }, + { + "epoch": 4.42209645915761, + "grad_norm": 1.5399305820465088, + "learning_rate": 1.637217090353918e-06, + "loss": 0.409, + "step": 44150 + }, + { + "epoch": 4.4230981118846096, + "grad_norm": 2.1396875381469727, + "learning_rate": 1.631610546367468e-06, + "loss": 0.3919, + "step": 44160 + }, + { + "epoch": 4.42409976461161, + "grad_norm": 2.2197341918945312, + "learning_rate": 1.6260132946348899e-06, + "loss": 0.4115, + "step": 44170 + }, + { + "epoch": 4.425101417338609, + "grad_norm": 2.275446891784668, + "learning_rate": 1.620425337381884e-06, + "loss": 0.4239, + "step": 44180 + }, + { + "epoch": 4.426103070065608, + "grad_norm": 2.164278745651245, + "learning_rate": 1.6148466768304587e-06, + "loss": 0.4411, + "step": 44190 + }, + { + "epoch": 4.427104722792608, + "grad_norm": 2.4946508407592773, + "learning_rate": 1.6092773151989204e-06, + "loss": 0.4211, + "step": 44200 + }, + { + "epoch": 4.428106375519607, + "grad_norm": 2.1352880001068115, + "learning_rate": 1.6037172547018863e-06, + "loss": 0.3815, + "step": 44210 + }, + { + "epoch": 4.429108028246607, + "grad_norm": 2.4429593086242676, + "learning_rate": 1.5981664975502685e-06, + "loss": 0.474, + "step": 44220 + }, + { + "epoch": 4.430109680973606, + "grad_norm": 1.741686463356018, + "learning_rate": 1.5926250459512793e-06, + "loss": 0.3641, + "step": 44230 + }, + { + "epoch": 4.431111333700606, + "grad_norm": 1.501874327659607, + "learning_rate": 1.5870929021084307e-06, + "loss": 0.3908, + "step": 44240 + }, + { + "epoch": 4.432112986427605, + "grad_norm": 2.2431552410125732, + "learning_rate": 1.5815700682215439e-06, + "loss": 0.4295, + "step": 44250 + }, + { + "epoch": 4.4331146391546055, + "grad_norm": 2.8315837383270264, + "learning_rate": 1.576056546486726e-06, + "loss": 0.4185, + "step": 44260 + }, + { + "epoch": 4.434116291881605, + "grad_norm": 2.2130463123321533, + "learning_rate": 1.570552339096376e-06, + "loss": 0.4278, + "step": 44270 + }, + { + "epoch": 4.435117944608605, + "grad_norm": 2.1421968936920166, + "learning_rate": 1.5650574482392183e-06, + "loss": 0.4138, + "step": 44280 + }, + { + "epoch": 4.436119597335604, + "grad_norm": 2.1757049560546875, + "learning_rate": 1.5595718761002325e-06, + "loss": 0.4759, + "step": 44290 + }, + { + "epoch": 4.437121250062603, + "grad_norm": 2.8125619888305664, + "learning_rate": 1.5540956248607213e-06, + "loss": 0.4304, + "step": 44300 + }, + { + "epoch": 4.438122902789603, + "grad_norm": 2.888702154159546, + "learning_rate": 1.5486286966982677e-06, + "loss": 0.4594, + "step": 44310 + }, + { + "epoch": 4.439124555516602, + "grad_norm": 2.215304136276245, + "learning_rate": 1.5431710937867633e-06, + "loss": 0.3982, + "step": 44320 + }, + { + "epoch": 4.440126208243602, + "grad_norm": 2.1499667167663574, + "learning_rate": 1.537722818296372e-06, + "loss": 0.421, + "step": 44330 + }, + { + "epoch": 4.441127860970601, + "grad_norm": 2.082578420639038, + "learning_rate": 1.53228387239355e-06, + "loss": 0.4226, + "step": 44340 + }, + { + "epoch": 4.442129513697601, + "grad_norm": 2.056445598602295, + "learning_rate": 1.52685425824107e-06, + "loss": 0.4324, + "step": 44350 + }, + { + "epoch": 4.4431311664246005, + "grad_norm": 2.5467376708984375, + "learning_rate": 1.5214339779979576e-06, + "loss": 0.4099, + "step": 44360 + }, + { + "epoch": 4.4441328191516005, + "grad_norm": 2.7463908195495605, + "learning_rate": 1.5160230338195497e-06, + "loss": 0.4159, + "step": 44370 + }, + { + "epoch": 4.4451344718786, + "grad_norm": 2.3661184310913086, + "learning_rate": 1.510621427857467e-06, + "loss": 0.4228, + "step": 44380 + }, + { + "epoch": 4.4461361246056, + "grad_norm": 2.045476198196411, + "learning_rate": 1.5052291622596166e-06, + "loss": 0.3782, + "step": 44390 + }, + { + "epoch": 4.447137777332599, + "grad_norm": 2.825392007827759, + "learning_rate": 1.49984623917018e-06, + "loss": 0.4446, + "step": 44400 + }, + { + "epoch": 4.448139430059598, + "grad_norm": 1.9934977293014526, + "learning_rate": 1.4944726607296456e-06, + "loss": 0.3826, + "step": 44410 + }, + { + "epoch": 4.449141082786598, + "grad_norm": 2.085172414779663, + "learning_rate": 1.4891084290747704e-06, + "loss": 0.4111, + "step": 44420 + }, + { + "epoch": 4.450142735513597, + "grad_norm": 2.056011199951172, + "learning_rate": 1.4837535463385982e-06, + "loss": 0.4359, + "step": 44430 + }, + { + "epoch": 4.451144388240597, + "grad_norm": 2.0101840496063232, + "learning_rate": 1.478408014650448e-06, + "loss": 0.3443, + "step": 44440 + }, + { + "epoch": 4.452146040967596, + "grad_norm": 2.3599886894226074, + "learning_rate": 1.473071836135939e-06, + "loss": 0.4214, + "step": 44450 + }, + { + "epoch": 4.453147693694596, + "grad_norm": 2.235658884048462, + "learning_rate": 1.4677450129169574e-06, + "loss": 0.4071, + "step": 44460 + }, + { + "epoch": 4.4541493464215955, + "grad_norm": 2.4682376384735107, + "learning_rate": 1.4624275471116638e-06, + "loss": 0.3985, + "step": 44470 + }, + { + "epoch": 4.455150999148596, + "grad_norm": 2.524515390396118, + "learning_rate": 1.4571194408345146e-06, + "loss": 0.3838, + "step": 44480 + }, + { + "epoch": 4.456152651875595, + "grad_norm": 2.4610915184020996, + "learning_rate": 1.451820696196235e-06, + "loss": 0.3972, + "step": 44490 + }, + { + "epoch": 4.457154304602594, + "grad_norm": 1.954127311706543, + "learning_rate": 1.4465313153038284e-06, + "loss": 0.4061, + "step": 44500 + }, + { + "epoch": 4.458155957329594, + "grad_norm": 1.9836078882217407, + "learning_rate": 1.441251300260571e-06, + "loss": 0.4063, + "step": 44510 + }, + { + "epoch": 4.459157610056593, + "grad_norm": 2.0435895919799805, + "learning_rate": 1.435980653166022e-06, + "loss": 0.4643, + "step": 44520 + }, + { + "epoch": 4.460159262783593, + "grad_norm": 2.140763998031616, + "learning_rate": 1.4307193761160131e-06, + "loss": 0.3666, + "step": 44530 + }, + { + "epoch": 4.461160915510592, + "grad_norm": 2.599942445755005, + "learning_rate": 1.4254674712026488e-06, + "loss": 0.4522, + "step": 44540 + }, + { + "epoch": 4.462162568237592, + "grad_norm": 1.9349007606506348, + "learning_rate": 1.4202249405143032e-06, + "loss": 0.3737, + "step": 44550 + }, + { + "epoch": 4.463164220964591, + "grad_norm": 2.34694766998291, + "learning_rate": 1.4149917861356331e-06, + "loss": 0.4058, + "step": 44560 + }, + { + "epoch": 4.4641658736915915, + "grad_norm": 2.036736249923706, + "learning_rate": 1.4097680101475553e-06, + "loss": 0.3698, + "step": 44570 + }, + { + "epoch": 4.465167526418591, + "grad_norm": 2.2654614448547363, + "learning_rate": 1.404553614627266e-06, + "loss": 0.4287, + "step": 44580 + }, + { + "epoch": 4.466169179145591, + "grad_norm": 1.855678915977478, + "learning_rate": 1.399348601648226e-06, + "loss": 0.3928, + "step": 44590 + }, + { + "epoch": 4.46717083187259, + "grad_norm": 2.315638303756714, + "learning_rate": 1.3941529732801662e-06, + "loss": 0.424, + "step": 44600 + }, + { + "epoch": 4.468172484599589, + "grad_norm": 2.044198989868164, + "learning_rate": 1.3889667315890948e-06, + "loss": 0.4122, + "step": 44610 + }, + { + "epoch": 4.469174137326589, + "grad_norm": 2.004559278488159, + "learning_rate": 1.3837898786372704e-06, + "loss": 0.3801, + "step": 44620 + }, + { + "epoch": 4.470175790053588, + "grad_norm": 1.7977226972579956, + "learning_rate": 1.3786224164832302e-06, + "loss": 0.3668, + "step": 44630 + }, + { + "epoch": 4.471177442780588, + "grad_norm": 1.8870209455490112, + "learning_rate": 1.3734643471817743e-06, + "loss": 0.4103, + "step": 44640 + }, + { + "epoch": 4.472179095507587, + "grad_norm": 1.5372698307037354, + "learning_rate": 1.368315672783968e-06, + "loss": 0.4171, + "step": 44650 + }, + { + "epoch": 4.473180748234587, + "grad_norm": 2.3017232418060303, + "learning_rate": 1.3631763953371402e-06, + "loss": 0.3999, + "step": 44660 + }, + { + "epoch": 4.4741824009615865, + "grad_norm": 2.015127182006836, + "learning_rate": 1.358046516884881e-06, + "loss": 0.4281, + "step": 44670 + }, + { + "epoch": 4.4751840536885865, + "grad_norm": 2.389350175857544, + "learning_rate": 1.3529260394670562e-06, + "loss": 0.4312, + "step": 44680 + }, + { + "epoch": 4.476185706415586, + "grad_norm": 2.0506579875946045, + "learning_rate": 1.347814965119773e-06, + "loss": 0.4292, + "step": 44690 + }, + { + "epoch": 4.477187359142585, + "grad_norm": 2.1258811950683594, + "learning_rate": 1.3427132958754057e-06, + "loss": 0.3413, + "step": 44700 + }, + { + "epoch": 4.478189011869585, + "grad_norm": 2.5169951915740967, + "learning_rate": 1.3376210337626037e-06, + "loss": 0.5072, + "step": 44710 + }, + { + "epoch": 4.479190664596584, + "grad_norm": 2.3843328952789307, + "learning_rate": 1.3325381808062641e-06, + "loss": 0.4088, + "step": 44720 + }, + { + "epoch": 4.480192317323584, + "grad_norm": 2.526643753051758, + "learning_rate": 1.3274647390275314e-06, + "loss": 0.3923, + "step": 44730 + }, + { + "epoch": 4.481193970050583, + "grad_norm": 2.592135429382324, + "learning_rate": 1.3224007104438273e-06, + "loss": 0.4091, + "step": 44740 + }, + { + "epoch": 4.482195622777583, + "grad_norm": 2.1877357959747314, + "learning_rate": 1.3173460970688251e-06, + "loss": 0.4152, + "step": 44750 + }, + { + "epoch": 4.483197275504582, + "grad_norm": 2.017637014389038, + "learning_rate": 1.3123009009124442e-06, + "loss": 0.471, + "step": 44760 + }, + { + "epoch": 4.484198928231582, + "grad_norm": 2.706550359725952, + "learning_rate": 1.3072651239808692e-06, + "loss": 0.4349, + "step": 44770 + }, + { + "epoch": 4.4852005809585815, + "grad_norm": 2.889103889465332, + "learning_rate": 1.3022387682765398e-06, + "loss": 0.3704, + "step": 44780 + }, + { + "epoch": 4.486202233685582, + "grad_norm": 2.3987677097320557, + "learning_rate": 1.2972218357981458e-06, + "loss": 0.3974, + "step": 44790 + }, + { + "epoch": 4.487203886412581, + "grad_norm": 2.7593445777893066, + "learning_rate": 1.2922143285406224e-06, + "loss": 0.4262, + "step": 44800 + }, + { + "epoch": 4.48820553913958, + "grad_norm": 2.0148863792419434, + "learning_rate": 1.2872162484951738e-06, + "loss": 0.4055, + "step": 44810 + }, + { + "epoch": 4.48920719186658, + "grad_norm": 2.1033871173858643, + "learning_rate": 1.2822275976492493e-06, + "loss": 0.4186, + "step": 44820 + }, + { + "epoch": 4.490208844593579, + "grad_norm": 2.587449312210083, + "learning_rate": 1.277248377986534e-06, + "loss": 0.3816, + "step": 44830 + }, + { + "epoch": 4.491210497320579, + "grad_norm": 2.1474623680114746, + "learning_rate": 1.2722785914869862e-06, + "loss": 0.3941, + "step": 44840 + }, + { + "epoch": 4.492212150047578, + "grad_norm": 2.459506034851074, + "learning_rate": 1.267318240126794e-06, + "loss": 0.361, + "step": 44850 + }, + { + "epoch": 4.493213802774578, + "grad_norm": 2.14664363861084, + "learning_rate": 1.262367325878408e-06, + "loss": 0.4469, + "step": 44860 + }, + { + "epoch": 4.494215455501577, + "grad_norm": 2.295531749725342, + "learning_rate": 1.2574258507105168e-06, + "loss": 0.5106, + "step": 44870 + }, + { + "epoch": 4.4952171082285775, + "grad_norm": 2.360304594039917, + "learning_rate": 1.2524938165880601e-06, + "loss": 0.4323, + "step": 44880 + }, + { + "epoch": 4.496218760955577, + "grad_norm": 2.631624937057495, + "learning_rate": 1.2475712254722188e-06, + "loss": 0.4318, + "step": 44890 + }, + { + "epoch": 4.497220413682577, + "grad_norm": 1.8856874704360962, + "learning_rate": 1.2426580793204246e-06, + "loss": 0.4143, + "step": 44900 + }, + { + "epoch": 4.498222066409576, + "grad_norm": 2.2703697681427, + "learning_rate": 1.2377543800863505e-06, + "loss": 0.4279, + "step": 44910 + }, + { + "epoch": 4.499223719136575, + "grad_norm": 1.7943007946014404, + "learning_rate": 1.2328601297199121e-06, + "loss": 0.3452, + "step": 44920 + }, + { + "epoch": 4.500225371863575, + "grad_norm": 1.8310256004333496, + "learning_rate": 1.2279753301672691e-06, + "loss": 0.4461, + "step": 44930 + }, + { + "epoch": 4.501227024590574, + "grad_norm": 2.538496494293213, + "learning_rate": 1.2230999833708262e-06, + "loss": 0.3722, + "step": 44940 + }, + { + "epoch": 4.502228677317574, + "grad_norm": 1.8670365810394287, + "learning_rate": 1.218234091269224e-06, + "loss": 0.3366, + "step": 44950 + }, + { + "epoch": 4.503230330044573, + "grad_norm": 2.470527410507202, + "learning_rate": 1.2133776557973458e-06, + "loss": 0.3661, + "step": 44960 + }, + { + "epoch": 4.504231982771573, + "grad_norm": 2.3969390392303467, + "learning_rate": 1.208530678886316e-06, + "loss": 0.3995, + "step": 44970 + }, + { + "epoch": 4.5052336354985725, + "grad_norm": 1.7786046266555786, + "learning_rate": 1.2036931624634928e-06, + "loss": 0.3834, + "step": 44980 + }, + { + "epoch": 4.5062352882255725, + "grad_norm": 1.8966312408447266, + "learning_rate": 1.1988651084524794e-06, + "loss": 0.4019, + "step": 44990 + }, + { + "epoch": 4.507236940952572, + "grad_norm": 2.385106325149536, + "learning_rate": 1.1940465187731093e-06, + "loss": 0.3839, + "step": 45000 + }, + { + "epoch": 4.508238593679572, + "grad_norm": 1.5654442310333252, + "learning_rate": 1.1892373953414638e-06, + "loss": 0.3852, + "step": 45010 + }, + { + "epoch": 4.509240246406571, + "grad_norm": 2.0553817749023438, + "learning_rate": 1.1844377400698437e-06, + "loss": 0.4238, + "step": 45020 + }, + { + "epoch": 4.51024189913357, + "grad_norm": 1.7214395999908447, + "learning_rate": 1.1796475548667945e-06, + "loss": 0.3799, + "step": 45030 + }, + { + "epoch": 4.51124355186057, + "grad_norm": 2.3973593711853027, + "learning_rate": 1.1748668416371067e-06, + "loss": 0.4252, + "step": 45040 + }, + { + "epoch": 4.512245204587569, + "grad_norm": 1.9291691780090332, + "learning_rate": 1.1700956022817788e-06, + "loss": 0.4175, + "step": 45050 + }, + { + "epoch": 4.513246857314569, + "grad_norm": 2.493912696838379, + "learning_rate": 1.16533383869806e-06, + "loss": 0.3822, + "step": 45060 + }, + { + "epoch": 4.514248510041568, + "grad_norm": 1.8617504835128784, + "learning_rate": 1.1605815527794329e-06, + "loss": 0.3491, + "step": 45070 + }, + { + "epoch": 4.515250162768568, + "grad_norm": 2.023247003555298, + "learning_rate": 1.1558387464156024e-06, + "loss": 0.4175, + "step": 45080 + }, + { + "epoch": 4.5162518154955675, + "grad_norm": 2.2523674964904785, + "learning_rate": 1.151105421492507e-06, + "loss": 0.4757, + "step": 45090 + }, + { + "epoch": 4.517253468222568, + "grad_norm": 1.8958468437194824, + "learning_rate": 1.1463815798923138e-06, + "loss": 0.3856, + "step": 45100 + }, + { + "epoch": 4.518255120949567, + "grad_norm": 1.961393117904663, + "learning_rate": 1.1416672234934283e-06, + "loss": 0.451, + "step": 45110 + }, + { + "epoch": 4.519256773676567, + "grad_norm": 1.5756324529647827, + "learning_rate": 1.1369623541704706e-06, + "loss": 0.4175, + "step": 45120 + }, + { + "epoch": 4.520258426403566, + "grad_norm": 1.8311176300048828, + "learning_rate": 1.1322669737942908e-06, + "loss": 0.4113, + "step": 45130 + }, + { + "epoch": 4.521260079130565, + "grad_norm": 2.2952687740325928, + "learning_rate": 1.1275810842319767e-06, + "loss": 0.3718, + "step": 45140 + }, + { + "epoch": 4.522261731857565, + "grad_norm": 1.6738089323043823, + "learning_rate": 1.1229046873468374e-06, + "loss": 0.372, + "step": 45150 + }, + { + "epoch": 4.523263384584564, + "grad_norm": 2.8291242122650146, + "learning_rate": 1.118237784998394e-06, + "loss": 0.4321, + "step": 45160 + }, + { + "epoch": 4.524265037311564, + "grad_norm": 2.192211866378784, + "learning_rate": 1.1135803790424115e-06, + "loss": 0.3983, + "step": 45170 + }, + { + "epoch": 4.525266690038563, + "grad_norm": 2.6538708209991455, + "learning_rate": 1.1089324713308674e-06, + "loss": 0.3733, + "step": 45180 + }, + { + "epoch": 4.5262683427655634, + "grad_norm": 2.111126661300659, + "learning_rate": 1.1042940637119665e-06, + "loss": 0.3748, + "step": 45190 + }, + { + "epoch": 4.527269995492563, + "grad_norm": 2.8170530796051025, + "learning_rate": 1.099665158030133e-06, + "loss": 0.4561, + "step": 45200 + }, + { + "epoch": 4.528271648219563, + "grad_norm": 1.8179787397384644, + "learning_rate": 1.0950457561260174e-06, + "loss": 0.3934, + "step": 45210 + }, + { + "epoch": 4.529273300946562, + "grad_norm": 3.770756721496582, + "learning_rate": 1.0904358598364833e-06, + "loss": 0.3988, + "step": 45220 + }, + { + "epoch": 4.530274953673562, + "grad_norm": 2.2152037620544434, + "learning_rate": 1.085835470994623e-06, + "loss": 0.386, + "step": 45230 + }, + { + "epoch": 4.531276606400561, + "grad_norm": 2.2403817176818848, + "learning_rate": 1.0812445914297447e-06, + "loss": 0.4372, + "step": 45240 + }, + { + "epoch": 4.53227825912756, + "grad_norm": 2.0329346656799316, + "learning_rate": 1.0766632229673724e-06, + "loss": 0.3938, + "step": 45250 + }, + { + "epoch": 4.53327991185456, + "grad_norm": 2.1116416454315186, + "learning_rate": 1.0720913674292509e-06, + "loss": 0.4002, + "step": 45260 + }, + { + "epoch": 4.534281564581559, + "grad_norm": 1.6225544214248657, + "learning_rate": 1.0675290266333433e-06, + "loss": 0.4409, + "step": 45270 + }, + { + "epoch": 4.535283217308559, + "grad_norm": 2.520085334777832, + "learning_rate": 1.0629762023938283e-06, + "loss": 0.3987, + "step": 45280 + }, + { + "epoch": 4.5362848700355585, + "grad_norm": 1.736752986907959, + "learning_rate": 1.0584328965211e-06, + "loss": 0.3976, + "step": 45290 + }, + { + "epoch": 4.5372865227625585, + "grad_norm": 2.065598726272583, + "learning_rate": 1.0538991108217682e-06, + "loss": 0.4387, + "step": 45300 + }, + { + "epoch": 4.538288175489558, + "grad_norm": 2.372802495956421, + "learning_rate": 1.0493748470986554e-06, + "loss": 0.4333, + "step": 45310 + }, + { + "epoch": 4.539289828216558, + "grad_norm": 2.1573617458343506, + "learning_rate": 1.0448601071507996e-06, + "loss": 0.3845, + "step": 45320 + }, + { + "epoch": 4.540291480943557, + "grad_norm": 2.4107489585876465, + "learning_rate": 1.040354892773454e-06, + "loss": 0.439, + "step": 45330 + }, + { + "epoch": 4.541293133670557, + "grad_norm": 2.2853431701660156, + "learning_rate": 1.0358592057580746e-06, + "loss": 0.393, + "step": 45340 + }, + { + "epoch": 4.542294786397556, + "grad_norm": 2.2682673931121826, + "learning_rate": 1.0313730478923422e-06, + "loss": 0.3983, + "step": 45350 + }, + { + "epoch": 4.543296439124555, + "grad_norm": 1.9406390190124512, + "learning_rate": 1.0268964209601328e-06, + "loss": 0.3568, + "step": 45360 + }, + { + "epoch": 4.544298091851555, + "grad_norm": 2.364281415939331, + "learning_rate": 1.0224293267415558e-06, + "loss": 0.4477, + "step": 45370 + }, + { + "epoch": 4.545299744578554, + "grad_norm": 2.5253348350524902, + "learning_rate": 1.0179717670129041e-06, + "loss": 0.425, + "step": 45380 + }, + { + "epoch": 4.546301397305554, + "grad_norm": 2.046555995941162, + "learning_rate": 1.0135237435466932e-06, + "loss": 0.4468, + "step": 45390 + }, + { + "epoch": 4.5473030500325535, + "grad_norm": 1.9803757667541504, + "learning_rate": 1.0090852581116473e-06, + "loss": 0.4173, + "step": 45400 + }, + { + "epoch": 4.548304702759554, + "grad_norm": 2.417950391769409, + "learning_rate": 1.004656312472693e-06, + "loss": 0.3773, + "step": 45410 + }, + { + "epoch": 4.549306355486553, + "grad_norm": 2.228677749633789, + "learning_rate": 1.0002369083909612e-06, + "loss": 0.4586, + "step": 45420 + }, + { + "epoch": 4.550308008213553, + "grad_norm": 2.234309196472168, + "learning_rate": 9.958270476237957e-07, + "loss": 0.4367, + "step": 45430 + }, + { + "epoch": 4.551309660940552, + "grad_norm": 1.536161184310913, + "learning_rate": 9.914267319247495e-07, + "loss": 0.4218, + "step": 45440 + }, + { + "epoch": 4.552311313667552, + "grad_norm": 2.13627552986145, + "learning_rate": 9.870359630435616e-07, + "loss": 0.397, + "step": 45450 + }, + { + "epoch": 4.553312966394551, + "grad_norm": 2.022158145904541, + "learning_rate": 9.826547427261913e-07, + "loss": 0.4395, + "step": 45460 + }, + { + "epoch": 4.55431461912155, + "grad_norm": 2.3157637119293213, + "learning_rate": 9.782830727147974e-07, + "loss": 0.4107, + "step": 45470 + }, + { + "epoch": 4.55531627184855, + "grad_norm": 2.565974235534668, + "learning_rate": 9.739209547477396e-07, + "loss": 0.3774, + "step": 45480 + }, + { + "epoch": 4.556317924575549, + "grad_norm": 2.3096580505371094, + "learning_rate": 9.695683905595748e-07, + "loss": 0.458, + "step": 45490 + }, + { + "epoch": 4.557319577302549, + "grad_norm": 2.6375136375427246, + "learning_rate": 9.652253818810686e-07, + "loss": 0.4138, + "step": 45500 + }, + { + "epoch": 4.558321230029549, + "grad_norm": 2.0477283000946045, + "learning_rate": 9.608919304391895e-07, + "loss": 0.4362, + "step": 45510 + }, + { + "epoch": 4.559322882756549, + "grad_norm": 2.114675521850586, + "learning_rate": 9.565680379570867e-07, + "loss": 0.4118, + "step": 45520 + }, + { + "epoch": 4.560324535483548, + "grad_norm": 1.7504016160964966, + "learning_rate": 9.522537061541353e-07, + "loss": 0.3752, + "step": 45530 + }, + { + "epoch": 4.561326188210547, + "grad_norm": 2.3009567260742188, + "learning_rate": 9.47948936745885e-07, + "loss": 0.3958, + "step": 45540 + }, + { + "epoch": 4.562327840937547, + "grad_norm": 2.306684732437134, + "learning_rate": 9.436537314440996e-07, + "loss": 0.4089, + "step": 45550 + }, + { + "epoch": 4.563329493664547, + "grad_norm": 3.0576114654541016, + "learning_rate": 9.393680919567299e-07, + "loss": 0.4074, + "step": 45560 + }, + { + "epoch": 4.564331146391546, + "grad_norm": 1.9671968221664429, + "learning_rate": 9.350920199879265e-07, + "loss": 0.3332, + "step": 45570 + }, + { + "epoch": 4.565332799118545, + "grad_norm": 1.918282389640808, + "learning_rate": 9.308255172380376e-07, + "loss": 0.4116, + "step": 45580 + }, + { + "epoch": 4.566334451845545, + "grad_norm": 1.908010721206665, + "learning_rate": 9.265685854035977e-07, + "loss": 0.3648, + "step": 45590 + }, + { + "epoch": 4.5673361045725445, + "grad_norm": 1.6069321632385254, + "learning_rate": 9.2232122617735e-07, + "loss": 0.4405, + "step": 45600 + }, + { + "epoch": 4.5683377572995445, + "grad_norm": 2.416090726852417, + "learning_rate": 9.180834412482187e-07, + "loss": 0.3838, + "step": 45610 + }, + { + "epoch": 4.569339410026544, + "grad_norm": 1.9915950298309326, + "learning_rate": 9.13855232301325e-07, + "loss": 0.4142, + "step": 45620 + }, + { + "epoch": 4.570341062753544, + "grad_norm": 2.693861722946167, + "learning_rate": 9.096366010179852e-07, + "loss": 0.4403, + "step": 45630 + }, + { + "epoch": 4.571342715480543, + "grad_norm": 1.9319853782653809, + "learning_rate": 9.054275490757019e-07, + "loss": 0.4234, + "step": 45640 + }, + { + "epoch": 4.572344368207542, + "grad_norm": 2.010544538497925, + "learning_rate": 9.012280781481725e-07, + "loss": 0.4433, + "step": 45650 + }, + { + "epoch": 4.573346020934542, + "grad_norm": 2.1950645446777344, + "learning_rate": 8.970381899052804e-07, + "loss": 0.4073, + "step": 45660 + }, + { + "epoch": 4.574347673661541, + "grad_norm": 1.9796150922775269, + "learning_rate": 8.928578860131043e-07, + "loss": 0.4217, + "step": 45670 + }, + { + "epoch": 4.575349326388541, + "grad_norm": 2.374180316925049, + "learning_rate": 8.886871681339087e-07, + "loss": 0.3626, + "step": 45680 + }, + { + "epoch": 4.57635097911554, + "grad_norm": 3.04028582572937, + "learning_rate": 8.845260379261449e-07, + "loss": 0.4371, + "step": 45690 + }, + { + "epoch": 4.57735263184254, + "grad_norm": 2.386300563812256, + "learning_rate": 8.803744970444533e-07, + "loss": 0.3989, + "step": 45700 + }, + { + "epoch": 4.5783542845695395, + "grad_norm": 2.734030246734619, + "learning_rate": 8.762325471396632e-07, + "loss": 0.4327, + "step": 45710 + }, + { + "epoch": 4.57935593729654, + "grad_norm": 2.3858745098114014, + "learning_rate": 8.721001898587822e-07, + "loss": 0.4408, + "step": 45720 + }, + { + "epoch": 4.580357590023539, + "grad_norm": 2.001619577407837, + "learning_rate": 8.67977426845018e-07, + "loss": 0.421, + "step": 45730 + }, + { + "epoch": 4.581359242750539, + "grad_norm": 2.0118606090545654, + "learning_rate": 8.638642597377483e-07, + "loss": 0.442, + "step": 45740 + }, + { + "epoch": 4.582360895477538, + "grad_norm": 2.007488489151001, + "learning_rate": 8.597606901725397e-07, + "loss": 0.4262, + "step": 45750 + }, + { + "epoch": 4.583362548204537, + "grad_norm": 2.105195999145508, + "learning_rate": 8.55666719781148e-07, + "loss": 0.3884, + "step": 45760 + }, + { + "epoch": 4.584364200931537, + "grad_norm": 1.8013733625411987, + "learning_rate": 8.515823501915126e-07, + "loss": 0.4489, + "step": 45770 + }, + { + "epoch": 4.585365853658536, + "grad_norm": 2.1725831031799316, + "learning_rate": 8.475075830277401e-07, + "loss": 0.4059, + "step": 45780 + }, + { + "epoch": 4.586367506385536, + "grad_norm": 1.905267357826233, + "learning_rate": 8.434424199101315e-07, + "loss": 0.3964, + "step": 45790 + }, + { + "epoch": 4.587369159112535, + "grad_norm": 2.351881265640259, + "learning_rate": 8.393868624551743e-07, + "loss": 0.3971, + "step": 45800 + }, + { + "epoch": 4.588370811839535, + "grad_norm": 1.9069644212722778, + "learning_rate": 8.353409122755202e-07, + "loss": 0.4131, + "step": 45810 + }, + { + "epoch": 4.589372464566535, + "grad_norm": 2.0071794986724854, + "learning_rate": 8.313045709800071e-07, + "loss": 0.3534, + "step": 45820 + }, + { + "epoch": 4.590374117293535, + "grad_norm": 1.9949153661727905, + "learning_rate": 8.272778401736652e-07, + "loss": 0.4288, + "step": 45830 + }, + { + "epoch": 4.591375770020534, + "grad_norm": 2.0804128646850586, + "learning_rate": 8.232607214576859e-07, + "loss": 0.3968, + "step": 45840 + }, + { + "epoch": 4.592377422747534, + "grad_norm": 2.215404987335205, + "learning_rate": 8.192532164294414e-07, + "loss": 0.4034, + "step": 45850 + }, + { + "epoch": 4.593379075474533, + "grad_norm": 2.0437800884246826, + "learning_rate": 8.152553266824875e-07, + "loss": 0.4487, + "step": 45860 + }, + { + "epoch": 4.594380728201532, + "grad_norm": 2.4165077209472656, + "learning_rate": 8.112670538065553e-07, + "loss": 0.3931, + "step": 45870 + }, + { + "epoch": 4.595382380928532, + "grad_norm": 2.1966311931610107, + "learning_rate": 8.072883993875429e-07, + "loss": 0.3606, + "step": 45880 + }, + { + "epoch": 4.596384033655531, + "grad_norm": 1.7312675714492798, + "learning_rate": 8.033193650075349e-07, + "loss": 0.4133, + "step": 45890 + }, + { + "epoch": 4.597385686382531, + "grad_norm": 1.9515161514282227, + "learning_rate": 7.993599522447881e-07, + "loss": 0.4031, + "step": 45900 + }, + { + "epoch": 4.5983873391095305, + "grad_norm": 2.2079174518585205, + "learning_rate": 7.954101626737321e-07, + "loss": 0.431, + "step": 45910 + }, + { + "epoch": 4.5993889918365305, + "grad_norm": 2.438955783843994, + "learning_rate": 7.914699978649604e-07, + "loss": 0.3663, + "step": 45920 + }, + { + "epoch": 4.60039064456353, + "grad_norm": 2.7964272499084473, + "learning_rate": 7.875394593852559e-07, + "loss": 0.4036, + "step": 45930 + }, + { + "epoch": 4.60139229729053, + "grad_norm": 2.09883713722229, + "learning_rate": 7.836185487975655e-07, + "loss": 0.4253, + "step": 45940 + }, + { + "epoch": 4.602393950017529, + "grad_norm": 2.234827995300293, + "learning_rate": 7.797072676610062e-07, + "loss": 0.3641, + "step": 45950 + }, + { + "epoch": 4.603395602744529, + "grad_norm": 2.1803348064422607, + "learning_rate": 7.75805617530867e-07, + "loss": 0.38, + "step": 45960 + }, + { + "epoch": 4.604397255471528, + "grad_norm": 2.11966872215271, + "learning_rate": 7.719135999586125e-07, + "loss": 0.402, + "step": 45970 + }, + { + "epoch": 4.605398908198527, + "grad_norm": 1.670632243156433, + "learning_rate": 7.680312164918657e-07, + "loss": 0.356, + "step": 45980 + }, + { + "epoch": 4.606400560925527, + "grad_norm": 2.887688636779785, + "learning_rate": 7.641584686744308e-07, + "loss": 0.4281, + "step": 45990 + }, + { + "epoch": 4.607402213652526, + "grad_norm": 1.9670038223266602, + "learning_rate": 7.602953580462729e-07, + "loss": 0.4214, + "step": 46000 + }, + { + "epoch": 4.608403866379526, + "grad_norm": 2.14485239982605, + "learning_rate": 7.564418861435301e-07, + "loss": 0.3554, + "step": 46010 + }, + { + "epoch": 4.6094055191065255, + "grad_norm": 1.9791523218154907, + "learning_rate": 7.525980544984989e-07, + "loss": 0.3751, + "step": 46020 + }, + { + "epoch": 4.6104071718335256, + "grad_norm": 2.46907901763916, + "learning_rate": 7.487638646396539e-07, + "loss": 0.4055, + "step": 46030 + }, + { + "epoch": 4.611408824560525, + "grad_norm": 2.633350372314453, + "learning_rate": 7.449393180916281e-07, + "loss": 0.404, + "step": 46040 + }, + { + "epoch": 4.612410477287525, + "grad_norm": 2.210999011993408, + "learning_rate": 7.411244163752163e-07, + "loss": 0.4498, + "step": 46050 + }, + { + "epoch": 4.613412130014524, + "grad_norm": 2.5408952236175537, + "learning_rate": 7.373191610073965e-07, + "loss": 0.4118, + "step": 46060 + }, + { + "epoch": 4.614413782741524, + "grad_norm": 2.2600090503692627, + "learning_rate": 7.335235535012891e-07, + "loss": 0.411, + "step": 46070 + }, + { + "epoch": 4.615415435468523, + "grad_norm": 2.1320509910583496, + "learning_rate": 7.297375953661867e-07, + "loss": 0.4412, + "step": 46080 + }, + { + "epoch": 4.616417088195522, + "grad_norm": 2.4685146808624268, + "learning_rate": 7.259612881075517e-07, + "loss": 0.4401, + "step": 46090 + }, + { + "epoch": 4.617418740922522, + "grad_norm": 1.9967085123062134, + "learning_rate": 7.221946332269968e-07, + "loss": 0.4211, + "step": 46100 + }, + { + "epoch": 4.618420393649521, + "grad_norm": 1.9377949237823486, + "learning_rate": 7.184376322223019e-07, + "loss": 0.3698, + "step": 46110 + }, + { + "epoch": 4.619422046376521, + "grad_norm": 2.3374502658843994, + "learning_rate": 7.146902865874105e-07, + "loss": 0.3967, + "step": 46120 + }, + { + "epoch": 4.620423699103521, + "grad_norm": 3.4399402141571045, + "learning_rate": 7.10952597812431e-07, + "loss": 0.3948, + "step": 46130 + }, + { + "epoch": 4.621425351830521, + "grad_norm": 1.7531739473342896, + "learning_rate": 7.072245673836131e-07, + "loss": 0.4081, + "step": 46140 + }, + { + "epoch": 4.62242700455752, + "grad_norm": 2.1112916469573975, + "learning_rate": 7.03506196783385e-07, + "loss": 0.3658, + "step": 46150 + }, + { + "epoch": 4.62342865728452, + "grad_norm": 3.10821533203125, + "learning_rate": 6.997974874903334e-07, + "loss": 0.4272, + "step": 46160 + }, + { + "epoch": 4.624430310011519, + "grad_norm": 2.686772346496582, + "learning_rate": 6.960984409791871e-07, + "loss": 0.3991, + "step": 46170 + }, + { + "epoch": 4.625431962738519, + "grad_norm": 2.0568907260894775, + "learning_rate": 6.924090587208415e-07, + "loss": 0.4252, + "step": 46180 + }, + { + "epoch": 4.626433615465518, + "grad_norm": 2.369635581970215, + "learning_rate": 6.887293421823593e-07, + "loss": 0.4062, + "step": 46190 + }, + { + "epoch": 4.627435268192517, + "grad_norm": 2.34002947807312, + "learning_rate": 6.850592928269478e-07, + "loss": 0.3955, + "step": 46200 + }, + { + "epoch": 4.628436920919517, + "grad_norm": 2.856238603591919, + "learning_rate": 6.813989121139647e-07, + "loss": 0.4216, + "step": 46210 + }, + { + "epoch": 4.6294385736465165, + "grad_norm": 2.129307508468628, + "learning_rate": 6.7774820149894e-07, + "loss": 0.3689, + "step": 46220 + }, + { + "epoch": 4.6304402263735165, + "grad_norm": 2.2219908237457275, + "learning_rate": 6.741071624335459e-07, + "loss": 0.4228, + "step": 46230 + }, + { + "epoch": 4.631441879100516, + "grad_norm": 2.6033074855804443, + "learning_rate": 6.704757963656189e-07, + "loss": 0.4703, + "step": 46240 + }, + { + "epoch": 4.632443531827516, + "grad_norm": 2.086947441101074, + "learning_rate": 6.668541047391313e-07, + "loss": 0.3758, + "step": 46250 + }, + { + "epoch": 4.633445184554515, + "grad_norm": 1.4777419567108154, + "learning_rate": 6.632420889942287e-07, + "loss": 0.3655, + "step": 46260 + }, + { + "epoch": 4.634446837281515, + "grad_norm": 2.539104461669922, + "learning_rate": 6.596397505672009e-07, + "loss": 0.4336, + "step": 46270 + }, + { + "epoch": 4.635448490008514, + "grad_norm": 2.2465150356292725, + "learning_rate": 6.560470908904798e-07, + "loss": 0.4204, + "step": 46280 + }, + { + "epoch": 4.636450142735514, + "grad_norm": 2.106008529663086, + "learning_rate": 6.524641113926672e-07, + "loss": 0.4228, + "step": 46290 + }, + { + "epoch": 4.637451795462513, + "grad_norm": 3.1730639934539795, + "learning_rate": 6.488908134985011e-07, + "loss": 0.3906, + "step": 46300 + }, + { + "epoch": 4.638453448189512, + "grad_norm": 2.27172589302063, + "learning_rate": 6.453271986288812e-07, + "loss": 0.3889, + "step": 46310 + }, + { + "epoch": 4.639455100916512, + "grad_norm": 2.1306052207946777, + "learning_rate": 6.417732682008431e-07, + "loss": 0.4226, + "step": 46320 + }, + { + "epoch": 4.6404567536435115, + "grad_norm": 3.0258591175079346, + "learning_rate": 6.382290236275845e-07, + "loss": 0.4184, + "step": 46330 + }, + { + "epoch": 4.6414584063705115, + "grad_norm": 1.9228705167770386, + "learning_rate": 6.346944663184418e-07, + "loss": 0.3917, + "step": 46340 + }, + { + "epoch": 4.642460059097511, + "grad_norm": 2.0833024978637695, + "learning_rate": 6.311695976789073e-07, + "loss": 0.4407, + "step": 46350 + }, + { + "epoch": 4.643461711824511, + "grad_norm": 2.445340871810913, + "learning_rate": 6.276544191106154e-07, + "loss": 0.3477, + "step": 46360 + }, + { + "epoch": 4.64446336455151, + "grad_norm": 2.1367125511169434, + "learning_rate": 6.241489320113453e-07, + "loss": 0.3996, + "step": 46370 + }, + { + "epoch": 4.64546501727851, + "grad_norm": 2.3300392627716064, + "learning_rate": 6.206531377750319e-07, + "loss": 0.3829, + "step": 46380 + }, + { + "epoch": 4.646466670005509, + "grad_norm": 2.5135016441345215, + "learning_rate": 6.171670377917465e-07, + "loss": 0.3779, + "step": 46390 + }, + { + "epoch": 4.647468322732509, + "grad_norm": 2.5999858379364014, + "learning_rate": 6.136906334477111e-07, + "loss": 0.3559, + "step": 46400 + }, + { + "epoch": 4.648469975459508, + "grad_norm": 2.109624147415161, + "learning_rate": 6.102239261252862e-07, + "loss": 0.3592, + "step": 46410 + }, + { + "epoch": 4.649471628186507, + "grad_norm": 2.480128526687622, + "learning_rate": 6.067669172029888e-07, + "loss": 0.4487, + "step": 46420 + }, + { + "epoch": 4.650473280913507, + "grad_norm": 2.0874390602111816, + "learning_rate": 6.03319608055461e-07, + "loss": 0.4079, + "step": 46430 + }, + { + "epoch": 4.651474933640507, + "grad_norm": 2.6308510303497314, + "learning_rate": 5.998820000535005e-07, + "loss": 0.4327, + "step": 46440 + }, + { + "epoch": 4.652476586367507, + "grad_norm": 1.756169319152832, + "learning_rate": 5.964540945640501e-07, + "loss": 0.4047, + "step": 46450 + }, + { + "epoch": 4.653478239094506, + "grad_norm": 1.9640753269195557, + "learning_rate": 5.930358929501834e-07, + "loss": 0.4348, + "step": 46460 + }, + { + "epoch": 4.654479891821506, + "grad_norm": 1.7669875621795654, + "learning_rate": 5.896273965711213e-07, + "loss": 0.3855, + "step": 46470 + }, + { + "epoch": 4.655481544548505, + "grad_norm": 3.000284433364868, + "learning_rate": 5.86228606782227e-07, + "loss": 0.3811, + "step": 46480 + }, + { + "epoch": 4.656483197275505, + "grad_norm": 2.0170233249664307, + "learning_rate": 5.828395249350054e-07, + "loss": 0.3869, + "step": 46490 + }, + { + "epoch": 4.657484850002504, + "grad_norm": 2.0546176433563232, + "learning_rate": 5.794601523770926e-07, + "loss": 0.4393, + "step": 46500 + }, + { + "epoch": 4.658486502729504, + "grad_norm": 2.225003719329834, + "learning_rate": 5.76090490452269e-07, + "loss": 0.4438, + "step": 46510 + }, + { + "epoch": 4.659488155456503, + "grad_norm": 1.9720081090927124, + "learning_rate": 5.727305405004574e-07, + "loss": 0.3664, + "step": 46520 + }, + { + "epoch": 4.6604898081835024, + "grad_norm": 2.1545374393463135, + "learning_rate": 5.693803038577167e-07, + "loss": 0.3775, + "step": 46530 + }, + { + "epoch": 4.6614914609105025, + "grad_norm": 1.8879632949829102, + "learning_rate": 5.660397818562341e-07, + "loss": 0.4258, + "step": 46540 + }, + { + "epoch": 4.662493113637502, + "grad_norm": 2.4001457691192627, + "learning_rate": 5.627089758243498e-07, + "loss": 0.3909, + "step": 46550 + }, + { + "epoch": 4.663494766364502, + "grad_norm": 2.4746174812316895, + "learning_rate": 5.593878870865294e-07, + "loss": 0.4284, + "step": 46560 + }, + { + "epoch": 4.664496419091501, + "grad_norm": 2.5044631958007812, + "learning_rate": 5.56076516963372e-07, + "loss": 0.4111, + "step": 46570 + }, + { + "epoch": 4.665498071818501, + "grad_norm": 2.208523988723755, + "learning_rate": 5.527748667716243e-07, + "loss": 0.4011, + "step": 46580 + }, + { + "epoch": 4.6664997245455, + "grad_norm": 1.9320225715637207, + "learning_rate": 5.494829378241584e-07, + "loss": 0.4025, + "step": 46590 + }, + { + "epoch": 4.667501377272499, + "grad_norm": 2.7530622482299805, + "learning_rate": 5.462007314299883e-07, + "loss": 0.3827, + "step": 46600 + }, + { + "epoch": 4.668503029999499, + "grad_norm": 1.7465649843215942, + "learning_rate": 5.429282488942477e-07, + "loss": 0.3867, + "step": 46610 + }, + { + "epoch": 4.669504682726499, + "grad_norm": 1.7411669492721558, + "learning_rate": 5.396654915182209e-07, + "loss": 0.4054, + "step": 46620 + }, + { + "epoch": 4.670506335453498, + "grad_norm": 1.8928498029708862, + "learning_rate": 5.36412460599317e-07, + "loss": 0.4182, + "step": 46630 + }, + { + "epoch": 4.6715079881804975, + "grad_norm": 2.3412082195281982, + "learning_rate": 5.33169157431071e-07, + "loss": 0.4115, + "step": 46640 + }, + { + "epoch": 4.6725096409074975, + "grad_norm": 2.5557522773742676, + "learning_rate": 5.29935583303165e-07, + "loss": 0.3666, + "step": 46650 + }, + { + "epoch": 4.673511293634497, + "grad_norm": 2.372802972793579, + "learning_rate": 5.267117395014009e-07, + "loss": 0.4566, + "step": 46660 + }, + { + "epoch": 4.674512946361497, + "grad_norm": 2.2661385536193848, + "learning_rate": 5.234976273077147e-07, + "loss": 0.417, + "step": 46670 + }, + { + "epoch": 4.675514599088496, + "grad_norm": 2.0665102005004883, + "learning_rate": 5.202932480001699e-07, + "loss": 0.3484, + "step": 46680 + }, + { + "epoch": 4.676516251815496, + "grad_norm": 2.0566537380218506, + "learning_rate": 5.170986028529667e-07, + "loss": 0.3725, + "step": 46690 + }, + { + "epoch": 4.677517904542495, + "grad_norm": 1.9390207529067993, + "learning_rate": 5.139136931364252e-07, + "loss": 0.4094, + "step": 46700 + }, + { + "epoch": 4.678519557269494, + "grad_norm": 2.6145527362823486, + "learning_rate": 5.107385201170045e-07, + "loss": 0.4093, + "step": 46710 + }, + { + "epoch": 4.679521209996494, + "grad_norm": 1.9652702808380127, + "learning_rate": 5.075730850572835e-07, + "loss": 0.416, + "step": 46720 + }, + { + "epoch": 4.680522862723493, + "grad_norm": 2.30545973777771, + "learning_rate": 5.044173892159748e-07, + "loss": 0.3941, + "step": 46730 + }, + { + "epoch": 4.681524515450493, + "grad_norm": 1.984140157699585, + "learning_rate": 5.012714338479135e-07, + "loss": 0.4178, + "step": 46740 + }, + { + "epoch": 4.682526168177493, + "grad_norm": 2.0339691638946533, + "learning_rate": 4.981352202040628e-07, + "loss": 0.3753, + "step": 46750 + }, + { + "epoch": 4.683527820904493, + "grad_norm": 1.7999358177185059, + "learning_rate": 4.95008749531517e-07, + "loss": 0.3793, + "step": 46760 + }, + { + "epoch": 4.684529473631492, + "grad_norm": 1.8290388584136963, + "learning_rate": 4.91892023073487e-07, + "loss": 0.4243, + "step": 46770 + }, + { + "epoch": 4.685531126358492, + "grad_norm": 2.1667447090148926, + "learning_rate": 4.887850420693202e-07, + "loss": 0.4241, + "step": 46780 + }, + { + "epoch": 4.686532779085491, + "grad_norm": 2.310354471206665, + "learning_rate": 4.856878077544785e-07, + "loss": 0.4206, + "step": 46790 + }, + { + "epoch": 4.687534431812491, + "grad_norm": 2.7219927310943604, + "learning_rate": 4.826003213605545e-07, + "loss": 0.4295, + "step": 46800 + }, + { + "epoch": 4.68853608453949, + "grad_norm": 1.945172667503357, + "learning_rate": 4.795225841152579e-07, + "loss": 0.4499, + "step": 46810 + }, + { + "epoch": 4.689537737266489, + "grad_norm": 2.1988821029663086, + "learning_rate": 4.7645459724243444e-07, + "loss": 0.403, + "step": 46820 + }, + { + "epoch": 4.690539389993489, + "grad_norm": 2.19480562210083, + "learning_rate": 4.7339636196204184e-07, + "loss": 0.3844, + "step": 46830 + }, + { + "epoch": 4.691541042720488, + "grad_norm": 2.085426092147827, + "learning_rate": 4.703478794901572e-07, + "loss": 0.4018, + "step": 46840 + }, + { + "epoch": 4.6925426954474885, + "grad_norm": 2.0894999504089355, + "learning_rate": 4.673091510389943e-07, + "loss": 0.3954, + "step": 46850 + }, + { + "epoch": 4.693544348174488, + "grad_norm": 2.0696043968200684, + "learning_rate": 4.642801778168726e-07, + "loss": 0.4097, + "step": 46860 + }, + { + "epoch": 4.694546000901488, + "grad_norm": 2.10793137550354, + "learning_rate": 4.61260961028237e-07, + "loss": 0.3324, + "step": 46870 + }, + { + "epoch": 4.695547653628487, + "grad_norm": 2.361173629760742, + "learning_rate": 4.582515018736633e-07, + "loss": 0.4199, + "step": 46880 + }, + { + "epoch": 4.696549306355487, + "grad_norm": 2.2172489166259766, + "learning_rate": 4.552518015498386e-07, + "loss": 0.3685, + "step": 46890 + }, + { + "epoch": 4.697550959082486, + "grad_norm": 2.353889226913452, + "learning_rate": 4.522618612495588e-07, + "loss": 0.4178, + "step": 46900 + }, + { + "epoch": 4.698552611809486, + "grad_norm": 1.8032987117767334, + "learning_rate": 4.492816821617618e-07, + "loss": 0.4407, + "step": 46910 + }, + { + "epoch": 4.699554264536485, + "grad_norm": 2.0415968894958496, + "learning_rate": 4.463112654714885e-07, + "loss": 0.3827, + "step": 46920 + }, + { + "epoch": 4.700555917263484, + "grad_norm": 2.6588540077209473, + "learning_rate": 4.433506123598996e-07, + "loss": 0.3954, + "step": 46930 + }, + { + "epoch": 4.701557569990484, + "grad_norm": 2.428084373474121, + "learning_rate": 4.4039972400427286e-07, + "loss": 0.4364, + "step": 46940 + }, + { + "epoch": 4.7025592227174835, + "grad_norm": 2.2132630348205566, + "learning_rate": 4.374586015780113e-07, + "loss": 0.474, + "step": 46950 + }, + { + "epoch": 4.7035608754444835, + "grad_norm": 2.4782333374023438, + "learning_rate": 4.3452724625062946e-07, + "loss": 0.4194, + "step": 46960 + }, + { + "epoch": 4.704562528171483, + "grad_norm": 1.8694998025894165, + "learning_rate": 4.3160565918774767e-07, + "loss": 0.379, + "step": 46970 + }, + { + "epoch": 4.705564180898483, + "grad_norm": 1.9565788507461548, + "learning_rate": 4.286938415511227e-07, + "loss": 0.4134, + "step": 46980 + }, + { + "epoch": 4.706565833625482, + "grad_norm": 2.3455045223236084, + "learning_rate": 4.2579179449860896e-07, + "loss": 0.3826, + "step": 46990 + }, + { + "epoch": 4.707567486352482, + "grad_norm": 1.9453314542770386, + "learning_rate": 4.228995191841861e-07, + "loss": 0.3892, + "step": 47000 + }, + { + "epoch": 4.708569139079481, + "grad_norm": 1.8712366819381714, + "learning_rate": 4.200170167579426e-07, + "loss": 0.4426, + "step": 47010 + }, + { + "epoch": 4.709570791806481, + "grad_norm": 1.9517831802368164, + "learning_rate": 4.171442883660809e-07, + "loss": 0.371, + "step": 47020 + }, + { + "epoch": 4.71057244453348, + "grad_norm": 2.4578795433044434, + "learning_rate": 4.142813351509234e-07, + "loss": 0.4289, + "step": 47030 + }, + { + "epoch": 4.711574097260479, + "grad_norm": 1.4846292734146118, + "learning_rate": 4.114281582508955e-07, + "loss": 0.3575, + "step": 47040 + }, + { + "epoch": 4.712575749987479, + "grad_norm": 2.2234723567962646, + "learning_rate": 4.0858475880054537e-07, + "loss": 0.4322, + "step": 47050 + }, + { + "epoch": 4.713577402714479, + "grad_norm": 1.7275258302688599, + "learning_rate": 4.057511379305212e-07, + "loss": 0.3939, + "step": 47060 + }, + { + "epoch": 4.714579055441479, + "grad_norm": 2.4854557514190674, + "learning_rate": 4.029272967675968e-07, + "loss": 0.4135, + "step": 47070 + }, + { + "epoch": 4.715580708168478, + "grad_norm": 3.625199317932129, + "learning_rate": 4.0011323643464605e-07, + "loss": 0.4075, + "step": 47080 + }, + { + "epoch": 4.716582360895478, + "grad_norm": 1.9963372945785522, + "learning_rate": 3.9730895805066e-07, + "loss": 0.4061, + "step": 47090 + }, + { + "epoch": 4.717584013622477, + "grad_norm": 2.0822815895080566, + "learning_rate": 3.945144627307329e-07, + "loss": 0.42, + "step": 47100 + }, + { + "epoch": 4.718585666349477, + "grad_norm": 1.8344260454177856, + "learning_rate": 3.917297515860813e-07, + "loss": 0.4285, + "step": 47110 + }, + { + "epoch": 4.719587319076476, + "grad_norm": 2.9358978271484375, + "learning_rate": 3.8895482572401664e-07, + "loss": 0.4028, + "step": 47120 + }, + { + "epoch": 4.720588971803476, + "grad_norm": 1.8024274110794067, + "learning_rate": 3.861896862479675e-07, + "loss": 0.3655, + "step": 47130 + }, + { + "epoch": 4.721590624530475, + "grad_norm": 2.0380611419677734, + "learning_rate": 3.8343433425747365e-07, + "loss": 0.3879, + "step": 47140 + }, + { + "epoch": 4.722592277257474, + "grad_norm": 1.7357391119003296, + "learning_rate": 3.8068877084817267e-07, + "loss": 0.3821, + "step": 47150 + }, + { + "epoch": 4.7235939299844745, + "grad_norm": 2.1513285636901855, + "learning_rate": 3.7795299711182173e-07, + "loss": 0.4278, + "step": 47160 + }, + { + "epoch": 4.724595582711474, + "grad_norm": 2.3541009426116943, + "learning_rate": 3.752270141362729e-07, + "loss": 0.4123, + "step": 47170 + }, + { + "epoch": 4.725597235438474, + "grad_norm": 2.2936346530914307, + "learning_rate": 3.72510823005498e-07, + "loss": 0.4565, + "step": 47180 + }, + { + "epoch": 4.726598888165473, + "grad_norm": 2.1213440895080566, + "learning_rate": 3.6980442479956633e-07, + "loss": 0.3844, + "step": 47190 + }, + { + "epoch": 4.727600540892473, + "grad_norm": 2.0257651805877686, + "learning_rate": 3.671078205946532e-07, + "loss": 0.4299, + "step": 47200 + }, + { + "epoch": 4.728602193619472, + "grad_norm": 2.968496561050415, + "learning_rate": 3.644210114630481e-07, + "loss": 0.4463, + "step": 47210 + }, + { + "epoch": 4.729603846346472, + "grad_norm": 1.8193625211715698, + "learning_rate": 3.6174399847313525e-07, + "loss": 0.4118, + "step": 47220 + }, + { + "epoch": 4.730605499073471, + "grad_norm": 2.025061845779419, + "learning_rate": 3.5907678268940206e-07, + "loss": 0.3836, + "step": 47230 + }, + { + "epoch": 4.731607151800471, + "grad_norm": 1.9306206703186035, + "learning_rate": 3.564193651724557e-07, + "loss": 0.3626, + "step": 47240 + }, + { + "epoch": 4.73260880452747, + "grad_norm": 2.3328723907470703, + "learning_rate": 3.5377174697899253e-07, + "loss": 0.4453, + "step": 47250 + }, + { + "epoch": 4.7336104572544695, + "grad_norm": 2.4250965118408203, + "learning_rate": 3.5113392916181485e-07, + "loss": 0.4006, + "step": 47260 + }, + { + "epoch": 4.7346121099814695, + "grad_norm": 2.66080904006958, + "learning_rate": 3.485059127698309e-07, + "loss": 0.4236, + "step": 47270 + }, + { + "epoch": 4.735613762708469, + "grad_norm": 2.2034497261047363, + "learning_rate": 3.4588769884805473e-07, + "loss": 0.3963, + "step": 47280 + }, + { + "epoch": 4.736615415435469, + "grad_norm": 2.1475400924682617, + "learning_rate": 3.4327928843759517e-07, + "loss": 0.4313, + "step": 47290 + }, + { + "epoch": 4.737617068162468, + "grad_norm": 2.439648389816284, + "learning_rate": 3.406806825756614e-07, + "loss": 0.4619, + "step": 47300 + }, + { + "epoch": 4.738618720889468, + "grad_norm": 1.9085047245025635, + "learning_rate": 3.380918822955742e-07, + "loss": 0.3627, + "step": 47310 + }, + { + "epoch": 4.739620373616467, + "grad_norm": 2.736255168914795, + "learning_rate": 3.3551288862675167e-07, + "loss": 0.415, + "step": 47320 + }, + { + "epoch": 4.740622026343467, + "grad_norm": 2.4652960300445557, + "learning_rate": 3.329437025947013e-07, + "loss": 0.4147, + "step": 47330 + }, + { + "epoch": 4.741623679070466, + "grad_norm": 2.4739601612091064, + "learning_rate": 3.303843252210448e-07, + "loss": 0.4556, + "step": 47340 + }, + { + "epoch": 4.742625331797466, + "grad_norm": 2.4230849742889404, + "learning_rate": 3.278347575234986e-07, + "loss": 0.3888, + "step": 47350 + }, + { + "epoch": 4.743626984524465, + "grad_norm": 2.326728105545044, + "learning_rate": 3.252950005158767e-07, + "loss": 0.4818, + "step": 47360 + }, + { + "epoch": 4.7446286372514646, + "grad_norm": 2.3454596996307373, + "learning_rate": 3.2276505520809353e-07, + "loss": 0.3918, + "step": 47370 + }, + { + "epoch": 4.745630289978465, + "grad_norm": 2.66456937789917, + "learning_rate": 3.20244922606161e-07, + "loss": 0.4257, + "step": 47380 + }, + { + "epoch": 4.746631942705464, + "grad_norm": 2.293207883834839, + "learning_rate": 3.1773460371219144e-07, + "loss": 0.4507, + "step": 47390 + }, + { + "epoch": 4.747633595432464, + "grad_norm": 2.403235673904419, + "learning_rate": 3.1523409952439465e-07, + "loss": 0.4127, + "step": 47400 + }, + { + "epoch": 4.748635248159463, + "grad_norm": 2.0613138675689697, + "learning_rate": 3.1274341103706973e-07, + "loss": 0.351, + "step": 47410 + }, + { + "epoch": 4.749636900886463, + "grad_norm": 2.4735984802246094, + "learning_rate": 3.102625392406244e-07, + "loss": 0.44, + "step": 47420 + }, + { + "epoch": 4.750638553613462, + "grad_norm": 1.6152466535568237, + "learning_rate": 3.077914851215585e-07, + "loss": 0.4251, + "step": 47430 + }, + { + "epoch": 4.751640206340462, + "grad_norm": 1.6947752237319946, + "learning_rate": 3.05330249662461e-07, + "loss": 0.3719, + "step": 47440 + }, + { + "epoch": 4.752641859067461, + "grad_norm": 2.6465158462524414, + "learning_rate": 3.0287883384202965e-07, + "loss": 0.4549, + "step": 47450 + }, + { + "epoch": 4.753643511794461, + "grad_norm": 2.3784332275390625, + "learning_rate": 3.004372386350457e-07, + "loss": 0.4461, + "step": 47460 + }, + { + "epoch": 4.7546451645214605, + "grad_norm": 2.1705985069274902, + "learning_rate": 2.980054650123909e-07, + "loss": 0.3915, + "step": 47470 + }, + { + "epoch": 4.75564681724846, + "grad_norm": 2.2914929389953613, + "learning_rate": 2.955835139410418e-07, + "loss": 0.4398, + "step": 47480 + }, + { + "epoch": 4.75664846997546, + "grad_norm": 2.312525749206543, + "learning_rate": 2.9317138638406684e-07, + "loss": 0.4375, + "step": 47490 + }, + { + "epoch": 4.757650122702459, + "grad_norm": 1.8366079330444336, + "learning_rate": 2.9076908330062937e-07, + "loss": 0.3871, + "step": 47500 + }, + { + "epoch": 4.758651775429459, + "grad_norm": 2.4751739501953125, + "learning_rate": 2.8837660564598747e-07, + "loss": 0.4007, + "step": 47510 + }, + { + "epoch": 4.759653428156458, + "grad_norm": 1.7309223413467407, + "learning_rate": 2.859939543714912e-07, + "loss": 0.4545, + "step": 47520 + }, + { + "epoch": 4.760655080883458, + "grad_norm": 2.232672929763794, + "learning_rate": 2.836211304245773e-07, + "loss": 0.4088, + "step": 47530 + }, + { + "epoch": 4.761656733610457, + "grad_norm": 2.060687303543091, + "learning_rate": 2.812581347487908e-07, + "loss": 0.3797, + "step": 47540 + }, + { + "epoch": 4.762658386337456, + "grad_norm": 2.090240716934204, + "learning_rate": 2.789049682837469e-07, + "loss": 0.4377, + "step": 47550 + }, + { + "epoch": 4.763660039064456, + "grad_norm": 1.645607590675354, + "learning_rate": 2.765616319651693e-07, + "loss": 0.3978, + "step": 47560 + }, + { + "epoch": 4.764661691791456, + "grad_norm": 1.79109525680542, + "learning_rate": 2.742281267248681e-07, + "loss": 0.4024, + "step": 47570 + }, + { + "epoch": 4.7656633445184555, + "grad_norm": 2.5542709827423096, + "learning_rate": 2.7190445349074e-07, + "loss": 0.4798, + "step": 47580 + }, + { + "epoch": 4.766664997245455, + "grad_norm": 2.0201637744903564, + "learning_rate": 2.6959061318677645e-07, + "loss": 0.4226, + "step": 47590 + }, + { + "epoch": 4.767666649972455, + "grad_norm": 2.7899389266967773, + "learning_rate": 2.672866067330554e-07, + "loss": 0.4479, + "step": 47600 + }, + { + "epoch": 4.768668302699454, + "grad_norm": 2.2652668952941895, + "learning_rate": 2.649924350457522e-07, + "loss": 0.4183, + "step": 47610 + }, + { + "epoch": 4.769669955426454, + "grad_norm": 2.208167314529419, + "learning_rate": 2.6270809903712057e-07, + "loss": 0.379, + "step": 47620 + }, + { + "epoch": 4.770671608153453, + "grad_norm": 2.410855293273926, + "learning_rate": 2.604335996155088e-07, + "loss": 0.4361, + "step": 47630 + }, + { + "epoch": 4.771673260880453, + "grad_norm": 1.9786897897720337, + "learning_rate": 2.5816893768535744e-07, + "loss": 0.4211, + "step": 47640 + }, + { + "epoch": 4.772674913607452, + "grad_norm": 2.5731799602508545, + "learning_rate": 2.5591411414719046e-07, + "loss": 0.3395, + "step": 47650 + }, + { + "epoch": 4.773676566334451, + "grad_norm": 2.1998531818389893, + "learning_rate": 2.5366912989761573e-07, + "loss": 0.412, + "step": 47660 + }, + { + "epoch": 4.774678219061451, + "grad_norm": 2.2904112339019775, + "learning_rate": 2.5143398582933575e-07, + "loss": 0.3507, + "step": 47670 + }, + { + "epoch": 4.775679871788451, + "grad_norm": 2.1148123741149902, + "learning_rate": 2.4920868283114243e-07, + "loss": 0.4345, + "step": 47680 + }, + { + "epoch": 4.776681524515451, + "grad_norm": 2.2035341262817383, + "learning_rate": 2.4699322178790285e-07, + "loss": 0.3879, + "step": 47690 + }, + { + "epoch": 4.77768317724245, + "grad_norm": 2.235149621963501, + "learning_rate": 2.4478760358057904e-07, + "loss": 0.3872, + "step": 47700 + }, + { + "epoch": 4.77868482996945, + "grad_norm": 2.893347978591919, + "learning_rate": 2.4259182908622226e-07, + "loss": 0.4255, + "step": 47710 + }, + { + "epoch": 4.779686482696449, + "grad_norm": 2.0718188285827637, + "learning_rate": 2.4040589917796175e-07, + "loss": 0.4655, + "step": 47720 + }, + { + "epoch": 4.780688135423449, + "grad_norm": 2.6403911113739014, + "learning_rate": 2.382298147250106e-07, + "loss": 0.3789, + "step": 47730 + }, + { + "epoch": 4.781689788150448, + "grad_norm": 2.3864388465881348, + "learning_rate": 2.3606357659267942e-07, + "loss": 0.373, + "step": 47740 + }, + { + "epoch": 4.782691440877448, + "grad_norm": 2.413670539855957, + "learning_rate": 2.3390718564235137e-07, + "loss": 0.4291, + "step": 47750 + }, + { + "epoch": 4.783693093604447, + "grad_norm": 1.797200083732605, + "learning_rate": 2.3176064273149612e-07, + "loss": 0.3736, + "step": 47760 + }, + { + "epoch": 4.784694746331446, + "grad_norm": 2.2276856899261475, + "learning_rate": 2.2962394871367533e-07, + "loss": 0.4219, + "step": 47770 + }, + { + "epoch": 4.7856963990584465, + "grad_norm": 2.2116262912750244, + "learning_rate": 2.2749710443852047e-07, + "loss": 0.3695, + "step": 47780 + }, + { + "epoch": 4.786698051785446, + "grad_norm": 2.3747339248657227, + "learning_rate": 2.2538011075176059e-07, + "loss": 0.4255, + "step": 47790 + }, + { + "epoch": 4.787699704512446, + "grad_norm": 2.656726360321045, + "learning_rate": 2.2327296849520008e-07, + "loss": 0.3903, + "step": 47800 + }, + { + "epoch": 4.788701357239445, + "grad_norm": 2.5891129970550537, + "learning_rate": 2.2117567850672705e-07, + "loss": 0.4349, + "step": 47810 + }, + { + "epoch": 4.789703009966445, + "grad_norm": 2.4755101203918457, + "learning_rate": 2.190882416203105e-07, + "loss": 0.3904, + "step": 47820 + }, + { + "epoch": 4.790704662693444, + "grad_norm": 2.4286439418792725, + "learning_rate": 2.1701065866600312e-07, + "loss": 0.3761, + "step": 47830 + }, + { + "epoch": 4.791706315420444, + "grad_norm": 2.4156734943389893, + "learning_rate": 2.149429304699413e-07, + "loss": 0.4208, + "step": 47840 + }, + { + "epoch": 4.792707968147443, + "grad_norm": 2.1973776817321777, + "learning_rate": 2.1288505785433954e-07, + "loss": 0.3716, + "step": 47850 + }, + { + "epoch": 4.793709620874443, + "grad_norm": 2.4909114837646484, + "learning_rate": 2.108370416374933e-07, + "loss": 0.3857, + "step": 47860 + }, + { + "epoch": 4.794711273601442, + "grad_norm": 2.1599183082580566, + "learning_rate": 2.0879888263378167e-07, + "loss": 0.4253, + "step": 47870 + }, + { + "epoch": 4.7957129263284415, + "grad_norm": 1.7978217601776123, + "learning_rate": 2.067705816536647e-07, + "loss": 0.367, + "step": 47880 + }, + { + "epoch": 4.7967145790554415, + "grad_norm": 2.390063524246216, + "learning_rate": 2.0475213950367221e-07, + "loss": 0.4664, + "step": 47890 + }, + { + "epoch": 4.797716231782441, + "grad_norm": 2.1187353134155273, + "learning_rate": 2.0274355698643166e-07, + "loss": 0.4457, + "step": 47900 + }, + { + "epoch": 4.798717884509441, + "grad_norm": 2.258744239807129, + "learning_rate": 2.007448349006319e-07, + "loss": 0.4192, + "step": 47910 + }, + { + "epoch": 4.79971953723644, + "grad_norm": 2.4565742015838623, + "learning_rate": 1.9875597404105383e-07, + "loss": 0.4753, + "step": 47920 + }, + { + "epoch": 4.80072118996344, + "grad_norm": 2.1519105434417725, + "learning_rate": 1.967769751985482e-07, + "loss": 0.4224, + "step": 47930 + }, + { + "epoch": 4.801722842690439, + "grad_norm": 1.9558358192443848, + "learning_rate": 1.9480783916005217e-07, + "loss": 0.3739, + "step": 47940 + }, + { + "epoch": 4.802724495417439, + "grad_norm": 2.899057388305664, + "learning_rate": 1.9284856670857276e-07, + "loss": 0.4697, + "step": 47950 + }, + { + "epoch": 4.803726148144438, + "grad_norm": 2.3547844886779785, + "learning_rate": 1.908991586232006e-07, + "loss": 0.3781, + "step": 47960 + }, + { + "epoch": 4.804727800871438, + "grad_norm": 2.6609480381011963, + "learning_rate": 1.8895961567910737e-07, + "loss": 0.388, + "step": 47970 + }, + { + "epoch": 4.805729453598437, + "grad_norm": 2.470571994781494, + "learning_rate": 1.8702993864752883e-07, + "loss": 0.4113, + "step": 47980 + }, + { + "epoch": 4.8067311063254365, + "grad_norm": 2.2260477542877197, + "learning_rate": 1.8511012829578733e-07, + "loss": 0.3831, + "step": 47990 + }, + { + "epoch": 4.807732759052437, + "grad_norm": 1.9835573434829712, + "learning_rate": 1.832001853872861e-07, + "loss": 0.4476, + "step": 48000 + }, + { + "epoch": 4.808734411779436, + "grad_norm": 2.08286190032959, + "learning_rate": 1.8130011068149266e-07, + "loss": 0.3969, + "step": 48010 + }, + { + "epoch": 4.809736064506436, + "grad_norm": 2.29129958152771, + "learning_rate": 1.7940990493395815e-07, + "loss": 0.4017, + "step": 48020 + }, + { + "epoch": 4.810737717233435, + "grad_norm": 2.6982805728912354, + "learning_rate": 1.775295688963091e-07, + "loss": 0.4785, + "step": 48030 + }, + { + "epoch": 4.811739369960435, + "grad_norm": 2.0841140747070312, + "learning_rate": 1.7565910331624468e-07, + "loss": 0.4117, + "step": 48040 + }, + { + "epoch": 4.812741022687434, + "grad_norm": 2.39800763130188, + "learning_rate": 1.7379850893754212e-07, + "loss": 0.4054, + "step": 48050 + }, + { + "epoch": 4.813742675414434, + "grad_norm": 1.6849093437194824, + "learning_rate": 1.719477865000513e-07, + "loss": 0.4053, + "step": 48060 + }, + { + "epoch": 4.814744328141433, + "grad_norm": 2.5003740787506104, + "learning_rate": 1.7010693673969736e-07, + "loss": 0.3939, + "step": 48070 + }, + { + "epoch": 4.815745980868433, + "grad_norm": 2.4821770191192627, + "learning_rate": 1.6827596038848092e-07, + "loss": 0.4269, + "step": 48080 + }, + { + "epoch": 4.8167476335954325, + "grad_norm": 1.7561688423156738, + "learning_rate": 1.664548581744696e-07, + "loss": 0.3697, + "step": 48090 + }, + { + "epoch": 4.817749286322432, + "grad_norm": 2.127955436706543, + "learning_rate": 1.6464363082181467e-07, + "loss": 0.3987, + "step": 48100 + }, + { + "epoch": 4.818750939049432, + "grad_norm": 1.8868879079818726, + "learning_rate": 1.6284227905073722e-07, + "loss": 0.4484, + "step": 48110 + }, + { + "epoch": 4.819752591776431, + "grad_norm": 1.6073931455612183, + "learning_rate": 1.6105080357752822e-07, + "loss": 0.3723, + "step": 48120 + }, + { + "epoch": 4.820754244503431, + "grad_norm": 2.3502357006073, + "learning_rate": 1.592692051145539e-07, + "loss": 0.3868, + "step": 48130 + }, + { + "epoch": 4.82175589723043, + "grad_norm": 2.4274137020111084, + "learning_rate": 1.5749748437025314e-07, + "loss": 0.423, + "step": 48140 + }, + { + "epoch": 4.82275754995743, + "grad_norm": 2.140169143676758, + "learning_rate": 1.557356420491346e-07, + "loss": 0.4156, + "step": 48150 + }, + { + "epoch": 4.823759202684429, + "grad_norm": 2.5988030433654785, + "learning_rate": 1.539836788517851e-07, + "loss": 0.4077, + "step": 48160 + }, + { + "epoch": 4.824760855411429, + "grad_norm": 2.1025750637054443, + "learning_rate": 1.5224159547485573e-07, + "loss": 0.4326, + "step": 48170 + }, + { + "epoch": 4.825762508138428, + "grad_norm": 1.7013978958129883, + "learning_rate": 1.505093926110729e-07, + "loss": 0.396, + "step": 48180 + }, + { + "epoch": 4.826764160865428, + "grad_norm": 2.384352207183838, + "learning_rate": 1.4878707094923283e-07, + "loss": 0.3865, + "step": 48190 + }, + { + "epoch": 4.8277658135924275, + "grad_norm": 2.697436571121216, + "learning_rate": 1.470746311742044e-07, + "loss": 0.3775, + "step": 48200 + }, + { + "epoch": 4.828767466319427, + "grad_norm": 2.4065449237823486, + "learning_rate": 1.4537207396692343e-07, + "loss": 0.4029, + "step": 48210 + }, + { + "epoch": 4.829769119046427, + "grad_norm": 1.876576542854309, + "learning_rate": 1.436794000043984e-07, + "loss": 0.4067, + "step": 48220 + }, + { + "epoch": 4.830770771773426, + "grad_norm": 2.1361072063446045, + "learning_rate": 1.419966099597103e-07, + "loss": 0.3742, + "step": 48230 + }, + { + "epoch": 4.831772424500426, + "grad_norm": 2.7625808715820312, + "learning_rate": 1.4032370450200728e-07, + "loss": 0.4557, + "step": 48240 + }, + { + "epoch": 4.832774077227425, + "grad_norm": 2.028794527053833, + "learning_rate": 1.386606842965016e-07, + "loss": 0.4347, + "step": 48250 + }, + { + "epoch": 4.833775729954425, + "grad_norm": 2.240504026412964, + "learning_rate": 1.3700755000448373e-07, + "loss": 0.4257, + "step": 48260 + }, + { + "epoch": 4.834777382681424, + "grad_norm": 2.295073986053467, + "learning_rate": 1.3536430228331122e-07, + "loss": 0.3854, + "step": 48270 + }, + { + "epoch": 4.835779035408424, + "grad_norm": 2.3853859901428223, + "learning_rate": 1.3373094178640576e-07, + "loss": 0.3957, + "step": 48280 + }, + { + "epoch": 4.836780688135423, + "grad_norm": 2.4576685428619385, + "learning_rate": 1.32107469163259e-07, + "loss": 0.3913, + "step": 48290 + }, + { + "epoch": 4.837782340862423, + "grad_norm": 1.7085213661193848, + "learning_rate": 1.3049388505943504e-07, + "loss": 0.3868, + "step": 48300 + }, + { + "epoch": 4.838783993589423, + "grad_norm": 2.529207706451416, + "learning_rate": 1.2889019011655955e-07, + "loss": 0.4383, + "step": 48310 + }, + { + "epoch": 4.839785646316422, + "grad_norm": 2.305237054824829, + "learning_rate": 1.2729638497233077e-07, + "loss": 0.4403, + "step": 48320 + }, + { + "epoch": 4.840787299043422, + "grad_norm": 1.8845841884613037, + "learning_rate": 1.2571247026051392e-07, + "loss": 0.3387, + "step": 48330 + }, + { + "epoch": 4.841788951770421, + "grad_norm": 2.760739803314209, + "learning_rate": 1.241384466109413e-07, + "loss": 0.4136, + "step": 48340 + }, + { + "epoch": 4.842790604497421, + "grad_norm": 2.0073342323303223, + "learning_rate": 1.2257431464950396e-07, + "loss": 0.4204, + "step": 48350 + }, + { + "epoch": 4.84379225722442, + "grad_norm": 2.3097691535949707, + "learning_rate": 1.2102007499817103e-07, + "loss": 0.3863, + "step": 48360 + }, + { + "epoch": 4.84479390995142, + "grad_norm": 2.4525158405303955, + "learning_rate": 1.1947572827497588e-07, + "loss": 0.425, + "step": 48370 + }, + { + "epoch": 4.845795562678419, + "grad_norm": 1.7297872304916382, + "learning_rate": 1.1794127509401065e-07, + "loss": 0.3882, + "step": 48380 + }, + { + "epoch": 4.846797215405419, + "grad_norm": 2.2076919078826904, + "learning_rate": 1.164167160654428e-07, + "loss": 0.4148, + "step": 48390 + }, + { + "epoch": 4.8477988681324184, + "grad_norm": 2.070524215698242, + "learning_rate": 1.1490205179549851e-07, + "loss": 0.3745, + "step": 48400 + }, + { + "epoch": 4.8488005208594185, + "grad_norm": 2.7982497215270996, + "learning_rate": 1.1339728288647378e-07, + "loss": 0.4531, + "step": 48410 + }, + { + "epoch": 4.849802173586418, + "grad_norm": 1.602648377418518, + "learning_rate": 1.1190240993672607e-07, + "loss": 0.3816, + "step": 48420 + }, + { + "epoch": 4.850803826313417, + "grad_norm": 1.842578411102295, + "learning_rate": 1.1041743354067991e-07, + "loss": 0.3725, + "step": 48430 + }, + { + "epoch": 4.851805479040417, + "grad_norm": 2.1316118240356445, + "learning_rate": 1.0894235428882682e-07, + "loss": 0.4026, + "step": 48440 + }, + { + "epoch": 4.852807131767416, + "grad_norm": 2.16373348236084, + "learning_rate": 1.0747717276771707e-07, + "loss": 0.3965, + "step": 48450 + }, + { + "epoch": 4.853808784494416, + "grad_norm": 2.0309717655181885, + "learning_rate": 1.0602188955996795e-07, + "loss": 0.4534, + "step": 48460 + }, + { + "epoch": 4.854810437221415, + "grad_norm": 1.7845313549041748, + "learning_rate": 1.0457650524426654e-07, + "loss": 0.4438, + "step": 48470 + }, + { + "epoch": 4.855812089948415, + "grad_norm": 1.8658897876739502, + "learning_rate": 1.0314102039535312e-07, + "loss": 0.3743, + "step": 48480 + }, + { + "epoch": 4.856813742675414, + "grad_norm": 2.0582423210144043, + "learning_rate": 1.0171543558403774e-07, + "loss": 0.4112, + "step": 48490 + }, + { + "epoch": 4.857815395402414, + "grad_norm": 2.1272170543670654, + "learning_rate": 1.0029975137719472e-07, + "loss": 0.3999, + "step": 48500 + }, + { + "epoch": 4.8588170481294135, + "grad_norm": 2.131007432937622, + "learning_rate": 9.88939683377571e-08, + "loss": 0.384, + "step": 48510 + }, + { + "epoch": 4.8598187008564135, + "grad_norm": 2.36995005607605, + "learning_rate": 9.749808702472774e-08, + "loss": 0.3686, + "step": 48520 + }, + { + "epoch": 4.860820353583413, + "grad_norm": 2.426204204559326, + "learning_rate": 9.611210799316262e-08, + "loss": 0.4627, + "step": 48530 + }, + { + "epoch": 4.861822006310412, + "grad_norm": 1.8704348802566528, + "learning_rate": 9.473603179418756e-08, + "loss": 0.4085, + "step": 48540 + }, + { + "epoch": 4.862823659037412, + "grad_norm": 2.140582799911499, + "learning_rate": 9.336985897498706e-08, + "loss": 0.3729, + "step": 48550 + }, + { + "epoch": 4.863825311764411, + "grad_norm": 1.9879549741744995, + "learning_rate": 9.201359007881271e-08, + "loss": 0.4406, + "step": 48560 + }, + { + "epoch": 4.864826964491411, + "grad_norm": 2.478294610977173, + "learning_rate": 9.066722564496921e-08, + "loss": 0.3528, + "step": 48570 + }, + { + "epoch": 4.86582861721841, + "grad_norm": 2.1455304622650146, + "learning_rate": 8.93307662088283e-08, + "loss": 0.3928, + "step": 48580 + }, + { + "epoch": 4.86683026994541, + "grad_norm": 2.190230369567871, + "learning_rate": 8.800421230182599e-08, + "loss": 0.3745, + "step": 48590 + }, + { + "epoch": 4.867831922672409, + "grad_norm": 2.3964619636535645, + "learning_rate": 8.668756445145421e-08, + "loss": 0.4348, + "step": 48600 + }, + { + "epoch": 4.8688335753994085, + "grad_norm": 2.1755192279815674, + "learning_rate": 8.53808231812664e-08, + "loss": 0.3612, + "step": 48610 + }, + { + "epoch": 4.869835228126409, + "grad_norm": 2.1091182231903076, + "learning_rate": 8.408398901087466e-08, + "loss": 0.4825, + "step": 48620 + }, + { + "epoch": 4.870836880853409, + "grad_norm": 2.6433656215667725, + "learning_rate": 8.279706245596375e-08, + "loss": 0.4309, + "step": 48630 + }, + { + "epoch": 4.871838533580408, + "grad_norm": 2.1513772010803223, + "learning_rate": 8.152004402826319e-08, + "loss": 0.3659, + "step": 48640 + }, + { + "epoch": 4.872840186307407, + "grad_norm": 2.3869216442108154, + "learning_rate": 8.025293423556956e-08, + "loss": 0.4321, + "step": 48650 + }, + { + "epoch": 4.873841839034407, + "grad_norm": 2.6791770458221436, + "learning_rate": 7.899573358174094e-08, + "loss": 0.4111, + "step": 48660 + }, + { + "epoch": 4.874843491761406, + "grad_norm": 2.2372169494628906, + "learning_rate": 7.774844256669134e-08, + "loss": 0.4593, + "step": 48670 + }, + { + "epoch": 4.875845144488406, + "grad_norm": 2.1360697746276855, + "learning_rate": 7.651106168639344e-08, + "loss": 0.443, + "step": 48680 + }, + { + "epoch": 4.876846797215405, + "grad_norm": 2.072798490524292, + "learning_rate": 7.528359143288977e-08, + "loss": 0.3797, + "step": 48690 + }, + { + "epoch": 4.877848449942405, + "grad_norm": 2.050501823425293, + "learning_rate": 7.406603229427045e-08, + "loss": 0.3611, + "step": 48700 + }, + { + "epoch": 4.878850102669404, + "grad_norm": 2.0490472316741943, + "learning_rate": 7.285838475468431e-08, + "loss": 0.4052, + "step": 48710 + }, + { + "epoch": 4.879851755396404, + "grad_norm": 2.0858280658721924, + "learning_rate": 7.166064929434446e-08, + "loss": 0.4111, + "step": 48720 + }, + { + "epoch": 4.880853408123404, + "grad_norm": 1.834701657295227, + "learning_rate": 7.047282638952545e-08, + "loss": 0.4064, + "step": 48730 + }, + { + "epoch": 4.881855060850404, + "grad_norm": 1.9594968557357788, + "learning_rate": 6.929491651255226e-08, + "loss": 0.4009, + "step": 48740 + }, + { + "epoch": 4.882856713577403, + "grad_norm": 1.9518842697143555, + "learning_rate": 6.812692013181132e-08, + "loss": 0.3914, + "step": 48750 + }, + { + "epoch": 4.883858366304402, + "grad_norm": 3.005319118499756, + "learning_rate": 6.6968837711745e-08, + "loss": 0.4469, + "step": 48760 + }, + { + "epoch": 4.884860019031402, + "grad_norm": 2.5086545944213867, + "learning_rate": 6.582066971285995e-08, + "loss": 0.4403, + "step": 48770 + }, + { + "epoch": 4.885861671758401, + "grad_norm": 2.4275851249694824, + "learning_rate": 6.468241659171315e-08, + "loss": 0.4217, + "step": 48780 + }, + { + "epoch": 4.886863324485401, + "grad_norm": 2.5610122680664062, + "learning_rate": 6.355407880092313e-08, + "loss": 0.4648, + "step": 48790 + }, + { + "epoch": 4.8878649772124, + "grad_norm": 1.8257973194122314, + "learning_rate": 6.24356567891643e-08, + "loss": 0.4074, + "step": 48800 + }, + { + "epoch": 4.8888666299394, + "grad_norm": 1.4888192415237427, + "learning_rate": 6.132715100116704e-08, + "loss": 0.4077, + "step": 48810 + }, + { + "epoch": 4.8898682826663995, + "grad_norm": 2.0692968368530273, + "learning_rate": 6.022856187772041e-08, + "loss": 0.4498, + "step": 48820 + }, + { + "epoch": 4.890869935393399, + "grad_norm": 1.9948792457580566, + "learning_rate": 5.913988985566943e-08, + "loss": 0.3604, + "step": 48830 + }, + { + "epoch": 4.891871588120399, + "grad_norm": 2.2797932624816895, + "learning_rate": 5.806113536791779e-08, + "loss": 0.4214, + "step": 48840 + }, + { + "epoch": 4.892873240847398, + "grad_norm": 1.757158637046814, + "learning_rate": 5.699229884341961e-08, + "loss": 0.4587, + "step": 48850 + }, + { + "epoch": 4.893874893574398, + "grad_norm": 2.277294874191284, + "learning_rate": 5.593338070719323e-08, + "loss": 0.3885, + "step": 48860 + }, + { + "epoch": 4.894876546301397, + "grad_norm": 2.3876211643218994, + "learning_rate": 5.488438138030738e-08, + "loss": 0.365, + "step": 48870 + }, + { + "epoch": 4.895878199028397, + "grad_norm": 2.418525457382202, + "learning_rate": 5.3845301279889514e-08, + "loss": 0.3895, + "step": 48880 + }, + { + "epoch": 4.896879851755396, + "grad_norm": 2.0426852703094482, + "learning_rate": 5.2816140819120233e-08, + "loss": 0.4293, + "step": 48890 + }, + { + "epoch": 4.897881504482396, + "grad_norm": 2.254586696624756, + "learning_rate": 5.179690040723606e-08, + "loss": 0.3793, + "step": 48900 + }, + { + "epoch": 4.898883157209395, + "grad_norm": 2.491412878036499, + "learning_rate": 5.078758044952947e-08, + "loss": 0.3904, + "step": 48910 + }, + { + "epoch": 4.899884809936395, + "grad_norm": 2.3205726146698, + "learning_rate": 4.978818134735164e-08, + "loss": 0.3826, + "step": 48920 + }, + { + "epoch": 4.900886462663395, + "grad_norm": 2.237753391265869, + "learning_rate": 4.879870349810689e-08, + "loss": 0.3971, + "step": 48930 + }, + { + "epoch": 4.901888115390394, + "grad_norm": 1.7564687728881836, + "learning_rate": 4.781914729524717e-08, + "loss": 0.4064, + "step": 48940 + }, + { + "epoch": 4.902889768117394, + "grad_norm": 2.1869213581085205, + "learning_rate": 4.684951312828867e-08, + "loss": 0.4179, + "step": 48950 + }, + { + "epoch": 4.903891420844393, + "grad_norm": 1.9542244672775269, + "learning_rate": 4.588980138279797e-08, + "loss": 0.4201, + "step": 48960 + }, + { + "epoch": 4.904893073571393, + "grad_norm": 2.3261213302612305, + "learning_rate": 4.4940012440397583e-08, + "loss": 0.3918, + "step": 48970 + }, + { + "epoch": 4.905894726298392, + "grad_norm": 2.3012595176696777, + "learning_rate": 4.400014667876318e-08, + "loss": 0.4794, + "step": 48980 + }, + { + "epoch": 4.906896379025392, + "grad_norm": 2.335749387741089, + "learning_rate": 4.307020447162635e-08, + "loss": 0.3636, + "step": 48990 + }, + { + "epoch": 4.907898031752391, + "grad_norm": 2.5763540267944336, + "learning_rate": 4.215018618876632e-08, + "loss": 0.388, + "step": 49000 + }, + { + "epoch": 4.908899684479391, + "grad_norm": 2.0762269496917725, + "learning_rate": 4.124009219602654e-08, + "loss": 0.4073, + "step": 49010 + }, + { + "epoch": 4.90990133720639, + "grad_norm": 2.1076955795288086, + "learning_rate": 4.033992285529809e-08, + "loss": 0.4247, + "step": 49020 + }, + { + "epoch": 4.9109029899333905, + "grad_norm": 2.1044840812683105, + "learning_rate": 3.9449678524522415e-08, + "loss": 0.449, + "step": 49030 + }, + { + "epoch": 4.91190464266039, + "grad_norm": 2.3766860961914062, + "learning_rate": 3.856935955769969e-08, + "loss": 0.4182, + "step": 49040 + }, + { + "epoch": 4.912906295387389, + "grad_norm": 2.158463716506958, + "learning_rate": 3.769896630488323e-08, + "loss": 0.4147, + "step": 49050 + }, + { + "epoch": 4.913907948114389, + "grad_norm": 2.246264696121216, + "learning_rate": 3.683849911217674e-08, + "loss": 0.4409, + "step": 49060 + }, + { + "epoch": 4.914909600841388, + "grad_norm": 1.723870873451233, + "learning_rate": 3.5987958321739844e-08, + "loss": 0.4226, + "step": 49070 + }, + { + "epoch": 4.915911253568388, + "grad_norm": 2.071007490158081, + "learning_rate": 3.514734427177979e-08, + "loss": 0.5269, + "step": 49080 + }, + { + "epoch": 4.916912906295387, + "grad_norm": 2.3110177516937256, + "learning_rate": 3.431665729656253e-08, + "loss": 0.3906, + "step": 49090 + }, + { + "epoch": 4.917914559022387, + "grad_norm": 1.8781073093414307, + "learning_rate": 3.349589772640715e-08, + "loss": 0.3827, + "step": 49100 + }, + { + "epoch": 4.918916211749386, + "grad_norm": 2.104565143585205, + "learning_rate": 3.2685065887674834e-08, + "loss": 0.4316, + "step": 49110 + }, + { + "epoch": 4.919917864476386, + "grad_norm": 2.219674587249756, + "learning_rate": 3.188416210279099e-08, + "loss": 0.363, + "step": 49120 + }, + { + "epoch": 4.9209195172033855, + "grad_norm": 2.114689588546753, + "learning_rate": 3.1093186690228645e-08, + "loss": 0.4162, + "step": 49130 + }, + { + "epoch": 4.9219211699303855, + "grad_norm": 2.0361156463623047, + "learning_rate": 3.031213996451121e-08, + "loss": 0.4122, + "step": 49140 + }, + { + "epoch": 4.922922822657385, + "grad_norm": 1.689947247505188, + "learning_rate": 2.954102223621802e-08, + "loss": 0.4286, + "step": 49150 + }, + { + "epoch": 4.923924475384384, + "grad_norm": 1.9771041870117188, + "learning_rate": 2.877983381197602e-08, + "loss": 0.3823, + "step": 49160 + }, + { + "epoch": 4.924926128111384, + "grad_norm": 2.4345974922180176, + "learning_rate": 2.8028574994465317e-08, + "loss": 0.4479, + "step": 49170 + }, + { + "epoch": 4.925927780838383, + "grad_norm": 2.2519659996032715, + "learning_rate": 2.728724608241917e-08, + "loss": 0.3989, + "step": 49180 + }, + { + "epoch": 4.926929433565383, + "grad_norm": 2.584491729736328, + "learning_rate": 2.6555847370621222e-08, + "loss": 0.441, + "step": 49190 + }, + { + "epoch": 4.927931086292382, + "grad_norm": 2.2947561740875244, + "learning_rate": 2.5834379149905495e-08, + "loss": 0.4375, + "step": 49200 + }, + { + "epoch": 4.928932739019382, + "grad_norm": 2.1762804985046387, + "learning_rate": 2.5122841707159172e-08, + "loss": 0.3976, + "step": 49210 + }, + { + "epoch": 4.929934391746381, + "grad_norm": 2.127589225769043, + "learning_rate": 2.4421235325319815e-08, + "loss": 0.3811, + "step": 49220 + }, + { + "epoch": 4.930936044473381, + "grad_norm": 2.0069942474365234, + "learning_rate": 2.3729560283372586e-08, + "loss": 0.4151, + "step": 49230 + }, + { + "epoch": 4.9319376972003806, + "grad_norm": 1.6562203168869019, + "learning_rate": 2.3047816856358595e-08, + "loss": 0.3697, + "step": 49240 + }, + { + "epoch": 4.932939349927381, + "grad_norm": 2.022047996520996, + "learning_rate": 2.2376005315369318e-08, + "loss": 0.4432, + "step": 49250 + }, + { + "epoch": 4.93394100265438, + "grad_norm": 2.255925416946411, + "learning_rate": 2.1714125927543848e-08, + "loss": 0.3806, + "step": 49260 + }, + { + "epoch": 4.934942655381379, + "grad_norm": 1.9878329038619995, + "learning_rate": 2.1062178956071655e-08, + "loss": 0.3847, + "step": 49270 + }, + { + "epoch": 4.935944308108379, + "grad_norm": 1.935551404953003, + "learning_rate": 2.0420164660195362e-08, + "loss": 0.4273, + "step": 49280 + }, + { + "epoch": 4.936945960835378, + "grad_norm": 2.561936378479004, + "learning_rate": 1.978808329520798e-08, + "loss": 0.4051, + "step": 49290 + }, + { + "epoch": 4.937947613562378, + "grad_norm": 2.0825181007385254, + "learning_rate": 1.916593511245013e-08, + "loss": 0.4425, + "step": 49300 + }, + { + "epoch": 4.938949266289377, + "grad_norm": 1.867803692817688, + "learning_rate": 1.8553720359315574e-08, + "loss": 0.442, + "step": 49310 + }, + { + "epoch": 4.939950919016377, + "grad_norm": 2.4296388626098633, + "learning_rate": 1.7951439279245697e-08, + "loss": 0.4022, + "step": 49320 + }, + { + "epoch": 4.940952571743376, + "grad_norm": 1.7822569608688354, + "learning_rate": 1.7359092111732255e-08, + "loss": 0.42, + "step": 49330 + }, + { + "epoch": 4.9419542244703765, + "grad_norm": 2.034141778945923, + "learning_rate": 1.6776679092320168e-08, + "loss": 0.4095, + "step": 49340 + }, + { + "epoch": 4.942955877197376, + "grad_norm": 2.480480194091797, + "learning_rate": 1.6204200452596407e-08, + "loss": 0.4123, + "step": 49350 + }, + { + "epoch": 4.943957529924376, + "grad_norm": 1.9543697834014893, + "learning_rate": 1.564165642020665e-08, + "loss": 0.4178, + "step": 49360 + }, + { + "epoch": 4.944959182651375, + "grad_norm": 2.275780439376831, + "learning_rate": 1.5089047218838637e-08, + "loss": 0.4003, + "step": 49370 + }, + { + "epoch": 4.945960835378374, + "grad_norm": 2.82968807220459, + "learning_rate": 1.454637306823603e-08, + "loss": 0.4349, + "step": 49380 + }, + { + "epoch": 4.946962488105374, + "grad_norm": 1.861443042755127, + "learning_rate": 1.4013634184190105e-08, + "loss": 0.3798, + "step": 49390 + }, + { + "epoch": 4.947964140832373, + "grad_norm": 2.3817360401153564, + "learning_rate": 1.3490830778534192e-08, + "loss": 0.3854, + "step": 49400 + }, + { + "epoch": 4.948965793559373, + "grad_norm": 2.194943904876709, + "learning_rate": 1.2977963059163101e-08, + "loss": 0.422, + "step": 49410 + }, + { + "epoch": 4.949967446286372, + "grad_norm": 2.164625644683838, + "learning_rate": 1.24750312300137e-08, + "loss": 0.399, + "step": 49420 + }, + { + "epoch": 4.950969099013372, + "grad_norm": 2.4973015785217285, + "learning_rate": 1.198203549106769e-08, + "loss": 0.4427, + "step": 49430 + }, + { + "epoch": 4.9519707517403715, + "grad_norm": 1.9972591400146484, + "learning_rate": 1.1498976038368248e-08, + "loss": 0.3763, + "step": 49440 + }, + { + "epoch": 4.9529724044673715, + "grad_norm": 2.178032875061035, + "learning_rate": 1.1025853063992287e-08, + "loss": 0.4024, + "step": 49450 + }, + { + "epoch": 4.953974057194371, + "grad_norm": 1.7938523292541504, + "learning_rate": 1.0562666756080974e-08, + "loss": 0.3719, + "step": 49460 + }, + { + "epoch": 4.954975709921371, + "grad_norm": 1.7838163375854492, + "learning_rate": 1.0109417298811985e-08, + "loss": 0.3619, + "step": 49470 + }, + { + "epoch": 4.95597736264837, + "grad_norm": 1.8792906999588013, + "learning_rate": 9.666104872416148e-09, + "loss": 0.4221, + "step": 49480 + }, + { + "epoch": 4.956979015375369, + "grad_norm": 1.9721959829330444, + "learning_rate": 9.232729653177452e-09, + "loss": 0.3949, + "step": 49490 + }, + { + "epoch": 4.957980668102369, + "grad_norm": 2.2299644947052, + "learning_rate": 8.809291813419163e-09, + "loss": 0.4221, + "step": 49500 + }, + { + "epoch": 4.958982320829368, + "grad_norm": 2.5148696899414062, + "learning_rate": 8.39579152152048e-09, + "loss": 0.4239, + "step": 49510 + }, + { + "epoch": 4.959983973556368, + "grad_norm": 2.1798460483551025, + "learning_rate": 7.992228941905433e-09, + "loss": 0.427, + "step": 49520 + }, + { + "epoch": 4.960985626283367, + "grad_norm": 2.080162525177002, + "learning_rate": 7.598604235048434e-09, + "loss": 0.4076, + "step": 49530 + }, + { + "epoch": 4.961987279010367, + "grad_norm": 1.9533112049102783, + "learning_rate": 7.214917557471501e-09, + "loss": 0.3875, + "step": 49540 + }, + { + "epoch": 4.9629889317373665, + "grad_norm": 2.1065080165863037, + "learning_rate": 6.841169061744257e-09, + "loss": 0.4761, + "step": 49550 + }, + { + "epoch": 4.963990584464367, + "grad_norm": 2.6746320724487305, + "learning_rate": 6.477358896483932e-09, + "loss": 0.3773, + "step": 49560 + }, + { + "epoch": 4.964992237191366, + "grad_norm": 1.8172719478607178, + "learning_rate": 6.1234872063553604e-09, + "loss": 0.3882, + "step": 49570 + }, + { + "epoch": 4.965993889918366, + "grad_norm": 2.7890677452087402, + "learning_rate": 5.779554132076537e-09, + "loss": 0.3805, + "step": 49580 + }, + { + "epoch": 4.966995542645365, + "grad_norm": 2.2248497009277344, + "learning_rate": 5.445559810407508e-09, + "loss": 0.4279, + "step": 49590 + }, + { + "epoch": 4.967997195372364, + "grad_norm": 2.016270399093628, + "learning_rate": 5.1215043741587034e-09, + "loss": 0.3731, + "step": 49600 + }, + { + "epoch": 4.968998848099364, + "grad_norm": 2.032270669937134, + "learning_rate": 4.8073879521909335e-09, + "loss": 0.4135, + "step": 49610 + }, + { + "epoch": 4.970000500826363, + "grad_norm": 2.85500431060791, + "learning_rate": 4.50321066940429e-09, + "loss": 0.4544, + "step": 49620 + }, + { + "epoch": 4.971002153553363, + "grad_norm": 1.6594706773757935, + "learning_rate": 4.2089726467547945e-09, + "loss": 0.4161, + "step": 49630 + }, + { + "epoch": 4.972003806280362, + "grad_norm": 2.2570676803588867, + "learning_rate": 3.9246740012488515e-09, + "loss": 0.4116, + "step": 49640 + }, + { + "epoch": 4.9730054590073625, + "grad_norm": 2.1982297897338867, + "learning_rate": 3.6503148459265944e-09, + "loss": 0.394, + "step": 49650 + }, + { + "epoch": 4.974007111734362, + "grad_norm": 2.184145450592041, + "learning_rate": 3.3858952898924156e-09, + "loss": 0.4166, + "step": 49660 + }, + { + "epoch": 4.975008764461361, + "grad_norm": 1.760334849357605, + "learning_rate": 3.1314154382872108e-09, + "loss": 0.4014, + "step": 49670 + }, + { + "epoch": 4.976010417188361, + "grad_norm": 2.06929612159729, + "learning_rate": 2.8868753923022573e-09, + "loss": 0.4085, + "step": 49680 + }, + { + "epoch": 4.977012069915361, + "grad_norm": 2.60774827003479, + "learning_rate": 2.6522752491792147e-09, + "loss": 0.3767, + "step": 49690 + }, + { + "epoch": 4.97801372264236, + "grad_norm": 2.44492769241333, + "learning_rate": 2.4276151022045724e-09, + "loss": 0.4283, + "step": 49700 + }, + { + "epoch": 4.979015375369359, + "grad_norm": 2.499879837036133, + "learning_rate": 2.212895040712426e-09, + "loss": 0.4121, + "step": 49710 + }, + { + "epoch": 4.980017028096359, + "grad_norm": 1.796566367149353, + "learning_rate": 2.008115150081702e-09, + "loss": 0.3966, + "step": 49720 + }, + { + "epoch": 4.981018680823358, + "grad_norm": 2.4814465045928955, + "learning_rate": 1.8132755117444832e-09, + "loss": 0.3729, + "step": 49730 + }, + { + "epoch": 4.982020333550358, + "grad_norm": 2.5805459022521973, + "learning_rate": 1.628376203177684e-09, + "loss": 0.3886, + "step": 49740 + }, + { + "epoch": 4.9830219862773575, + "grad_norm": 2.3878934383392334, + "learning_rate": 1.453417297903048e-09, + "loss": 0.42, + "step": 49750 + }, + { + "epoch": 4.9840236390043575, + "grad_norm": 1.974474549293518, + "learning_rate": 1.2883988654927015e-09, + "loss": 0.4083, + "step": 49760 + }, + { + "epoch": 4.985025291731357, + "grad_norm": 2.5575461387634277, + "learning_rate": 1.1333209715636006e-09, + "loss": 0.4061, + "step": 49770 + }, + { + "epoch": 4.986026944458356, + "grad_norm": 1.7831733226776123, + "learning_rate": 9.881836777830832e-10, + "loss": 0.4559, + "step": 49780 + }, + { + "epoch": 4.987028597185356, + "grad_norm": 2.3388864994049072, + "learning_rate": 8.529870418633179e-10, + "loss": 0.4285, + "step": 49790 + }, + { + "epoch": 4.988030249912356, + "grad_norm": 2.1213908195495605, + "learning_rate": 7.277311175640789e-10, + "loss": 0.4158, + "step": 49800 + }, + { + "epoch": 4.989031902639355, + "grad_norm": 2.5922374725341797, + "learning_rate": 6.124159546899711e-10, + "loss": 0.4285, + "step": 49810 + }, + { + "epoch": 4.990033555366354, + "grad_norm": 1.7242733240127563, + "learning_rate": 5.070415990987565e-10, + "loss": 0.421, + "step": 49820 + }, + { + "epoch": 4.991035208093354, + "grad_norm": 2.315945863723755, + "learning_rate": 4.1160809269025213e-10, + "loss": 0.4358, + "step": 49830 + }, + { + "epoch": 4.992036860820353, + "grad_norm": 2.1397175788879395, + "learning_rate": 3.261154734146565e-10, + "loss": 0.4419, + "step": 49840 + }, + { + "epoch": 4.993038513547353, + "grad_norm": 1.9984654188156128, + "learning_rate": 2.505637752642231e-10, + "loss": 0.4377, + "step": 49850 + }, + { + "epoch": 4.9940401662743525, + "grad_norm": 1.9078891277313232, + "learning_rate": 1.84953028281587e-10, + "loss": 0.4617, + "step": 49860 + }, + { + "epoch": 4.995041819001353, + "grad_norm": 2.45648455619812, + "learning_rate": 1.2928325855976475e-10, + "loss": 0.4741, + "step": 49870 + }, + { + "epoch": 4.996043471728352, + "grad_norm": 1.3880661725997925, + "learning_rate": 8.355448823105238e-11, + "loss": 0.3786, + "step": 49880 + }, + { + "epoch": 4.997045124455351, + "grad_norm": 1.9489227533340454, + "learning_rate": 4.77667354836786e-11, + "loss": 0.3738, + "step": 49890 + }, + { + "epoch": 4.998046777182351, + "grad_norm": 2.0824508666992188, + "learning_rate": 2.192001454515147e-11, + "loss": 0.4642, + "step": 49900 + }, + { + "epoch": 4.99904842990935, + "grad_norm": 2.111239433288574, + "learning_rate": 6.014335693360629e-12, + "loss": 0.3691, + "step": 49910 + }, + { + "epoch": 5.0, + "grad_norm": 1.7227387428283691, + "learning_rate": 4.970525657732594e-14, + "loss": 0.4032, + "step": 49920 + }, + { + "epoch": 5.001001652726999, + "grad_norm": 1.9057866334915161, + "learning_rate": 9.429981463315144e-06, + "loss": 0.3777, + "step": 49930 + }, + { + "epoch": 5.002003305453999, + "grad_norm": 2.094399929046631, + "learning_rate": 9.421178086170654e-06, + "loss": 0.3618, + "step": 49940 + }, + { + "epoch": 5.003004958180998, + "grad_norm": 2.0568583011627197, + "learning_rate": 9.412377866013538e-06, + "loss": 0.3926, + "step": 49950 + }, + { + "epoch": 5.004006610907998, + "grad_norm": 2.5123448371887207, + "learning_rate": 9.403580804627127e-06, + "loss": 0.4442, + "step": 49960 + }, + { + "epoch": 5.0050082636349975, + "grad_norm": 2.6346914768218994, + "learning_rate": 9.394786903794133e-06, + "loss": 0.4134, + "step": 49970 + }, + { + "epoch": 5.0060099163619975, + "grad_norm": 2.1011178493499756, + "learning_rate": 9.385996165296584e-06, + "loss": 0.3734, + "step": 49980 + }, + { + "epoch": 5.007011569088997, + "grad_norm": 2.0437891483306885, + "learning_rate": 9.377208590915892e-06, + "loss": 0.3885, + "step": 49990 + }, + { + "epoch": 5.008013221815997, + "grad_norm": 2.111875295639038, + "learning_rate": 9.368424182432825e-06, + "loss": 0.4173, + "step": 50000 + }, + { + "epoch": 5.008013221815997, + "eval_bleu": 0.4009660835826073, + "eval_loss": 0.508514940738678, + "eval_rouge1": 0.715396568133781, + "eval_rouge2": 0.550282125226329, + "eval_rougeL": 0.6740186461553936, + "eval_runtime": 77959.3622, + "eval_samples_per_second": 0.228, + "eval_steps_per_second": 0.028, + "eval_wer": 0.6738413384611066, + "step": 50000 + }, + { + "epoch": 5.009014874542996, + "grad_norm": 2.5804548263549805, + "learning_rate": 9.359642941627524e-06, + "loss": 0.4246, + "step": 50010 + }, + { + "epoch": 5.010016527269996, + "grad_norm": 2.5093562602996826, + "learning_rate": 9.350864870279457e-06, + "loss": 0.3901, + "step": 50020 + }, + { + "epoch": 5.011018179996995, + "grad_norm": 2.047625780105591, + "learning_rate": 9.342089970167458e-06, + "loss": 0.3293, + "step": 50030 + }, + { + "epoch": 5.012019832723994, + "grad_norm": 2.4648852348327637, + "learning_rate": 9.33331824306975e-06, + "loss": 0.47, + "step": 50040 + }, + { + "epoch": 5.013021485450994, + "grad_norm": 2.168367862701416, + "learning_rate": 9.324549690763887e-06, + "loss": 0.4254, + "step": 50050 + }, + { + "epoch": 5.014023138177993, + "grad_norm": 2.1796538829803467, + "learning_rate": 9.31578431502676e-06, + "loss": 0.4144, + "step": 50060 + }, + { + "epoch": 5.015024790904993, + "grad_norm": 2.1317245960235596, + "learning_rate": 9.307022117634646e-06, + "loss": 0.4444, + "step": 50070 + }, + { + "epoch": 5.016026443631993, + "grad_norm": 2.2961223125457764, + "learning_rate": 9.298263100363188e-06, + "loss": 0.4129, + "step": 50080 + }, + { + "epoch": 5.017028096358993, + "grad_norm": 2.3871800899505615, + "learning_rate": 9.289507264987348e-06, + "loss": 0.3687, + "step": 50090 + }, + { + "epoch": 5.018029749085992, + "grad_norm": 2.189589500427246, + "learning_rate": 9.280754613281456e-06, + "loss": 0.3685, + "step": 50100 + }, + { + "epoch": 5.019031401812992, + "grad_norm": 2.174671173095703, + "learning_rate": 9.272005147019225e-06, + "loss": 0.388, + "step": 50110 + }, + { + "epoch": 5.020033054539991, + "grad_norm": 1.7764248847961426, + "learning_rate": 9.263258867973696e-06, + "loss": 0.4141, + "step": 50120 + }, + { + "epoch": 5.021034707266991, + "grad_norm": 2.3782589435577393, + "learning_rate": 9.254515777917253e-06, + "loss": 0.4332, + "step": 50130 + }, + { + "epoch": 5.02203635999399, + "grad_norm": 1.9943517446517944, + "learning_rate": 9.245775878621649e-06, + "loss": 0.4191, + "step": 50140 + }, + { + "epoch": 5.023038012720989, + "grad_norm": 2.292137384414673, + "learning_rate": 9.237039171858006e-06, + "loss": 0.3959, + "step": 50150 + }, + { + "epoch": 5.024039665447989, + "grad_norm": 2.6247379779815674, + "learning_rate": 9.228305659396785e-06, + "loss": 0.3956, + "step": 50160 + }, + { + "epoch": 5.0250413181749884, + "grad_norm": 1.960654854774475, + "learning_rate": 9.219575343007771e-06, + "loss": 0.3702, + "step": 50170 + }, + { + "epoch": 5.0260429709019885, + "grad_norm": 1.9130282402038574, + "learning_rate": 9.210848224460158e-06, + "loss": 0.3923, + "step": 50180 + }, + { + "epoch": 5.027044623628988, + "grad_norm": 2.1862452030181885, + "learning_rate": 9.202124305522462e-06, + "loss": 0.3833, + "step": 50190 + }, + { + "epoch": 5.028046276355988, + "grad_norm": 1.7818164825439453, + "learning_rate": 9.193403587962527e-06, + "loss": 0.3926, + "step": 50200 + }, + { + "epoch": 5.029047929082987, + "grad_norm": 2.3955230712890625, + "learning_rate": 9.184686073547576e-06, + "loss": 0.4152, + "step": 50210 + }, + { + "epoch": 5.030049581809987, + "grad_norm": 2.290144681930542, + "learning_rate": 9.175971764044202e-06, + "loss": 0.4072, + "step": 50220 + }, + { + "epoch": 5.031051234536986, + "grad_norm": 1.9459701776504517, + "learning_rate": 9.167260661218322e-06, + "loss": 0.413, + "step": 50230 + }, + { + "epoch": 5.032052887263985, + "grad_norm": 1.623977541923523, + "learning_rate": 9.158552766835176e-06, + "loss": 0.3658, + "step": 50240 + }, + { + "epoch": 5.033054539990985, + "grad_norm": 2.4459660053253174, + "learning_rate": 9.149848082659417e-06, + "loss": 0.4204, + "step": 50250 + }, + { + "epoch": 5.034056192717984, + "grad_norm": 1.9712905883789062, + "learning_rate": 9.141146610455006e-06, + "loss": 0.4554, + "step": 50260 + }, + { + "epoch": 5.035057845444984, + "grad_norm": 2.139141798019409, + "learning_rate": 9.13244835198526e-06, + "loss": 0.3885, + "step": 50270 + }, + { + "epoch": 5.0360594981719835, + "grad_norm": 2.5080270767211914, + "learning_rate": 9.123753309012848e-06, + "loss": 0.4311, + "step": 50280 + }, + { + "epoch": 5.0370611508989835, + "grad_norm": 1.8112452030181885, + "learning_rate": 9.115061483299786e-06, + "loss": 0.3111, + "step": 50290 + }, + { + "epoch": 5.038062803625983, + "grad_norm": 2.099536180496216, + "learning_rate": 9.10637287660745e-06, + "loss": 0.3742, + "step": 50300 + }, + { + "epoch": 5.039064456352983, + "grad_norm": 2.1379730701446533, + "learning_rate": 9.097687490696522e-06, + "loss": 0.3914, + "step": 50310 + }, + { + "epoch": 5.040066109079982, + "grad_norm": 2.5523993968963623, + "learning_rate": 9.089005327327088e-06, + "loss": 0.4206, + "step": 50320 + }, + { + "epoch": 5.041067761806982, + "grad_norm": 1.9698617458343506, + "learning_rate": 9.08032638825855e-06, + "loss": 0.3978, + "step": 50330 + }, + { + "epoch": 5.042069414533981, + "grad_norm": 2.262073516845703, + "learning_rate": 9.071650675249658e-06, + "loss": 0.4593, + "step": 50340 + }, + { + "epoch": 5.04307106726098, + "grad_norm": 2.4841954708099365, + "learning_rate": 9.06297819005851e-06, + "loss": 0.3381, + "step": 50350 + }, + { + "epoch": 5.04407271998798, + "grad_norm": 2.5123629570007324, + "learning_rate": 9.054308934442554e-06, + "loss": 0.4633, + "step": 50360 + }, + { + "epoch": 5.045074372714979, + "grad_norm": 2.472378969192505, + "learning_rate": 9.045642910158581e-06, + "loss": 0.3572, + "step": 50370 + }, + { + "epoch": 5.046076025441979, + "grad_norm": 2.2146644592285156, + "learning_rate": 9.036980118962723e-06, + "loss": 0.4085, + "step": 50380 + }, + { + "epoch": 5.047077678168979, + "grad_norm": 2.2886135578155518, + "learning_rate": 9.028320562610465e-06, + "loss": 0.4098, + "step": 50390 + }, + { + "epoch": 5.048079330895979, + "grad_norm": 2.3172385692596436, + "learning_rate": 9.019664242856632e-06, + "loss": 0.401, + "step": 50400 + }, + { + "epoch": 5.049080983622978, + "grad_norm": 2.2763006687164307, + "learning_rate": 9.01101116145539e-06, + "loss": 0.4277, + "step": 50410 + }, + { + "epoch": 5.050082636349978, + "grad_norm": 2.075080633163452, + "learning_rate": 9.002361320160255e-06, + "loss": 0.4076, + "step": 50420 + }, + { + "epoch": 5.051084289076977, + "grad_norm": 2.275829315185547, + "learning_rate": 8.993714720724084e-06, + "loss": 0.4366, + "step": 50430 + }, + { + "epoch": 5.052085941803977, + "grad_norm": 2.0190999507904053, + "learning_rate": 8.985071364899072e-06, + "loss": 0.4001, + "step": 50440 + }, + { + "epoch": 5.053087594530976, + "grad_norm": 1.6728298664093018, + "learning_rate": 8.976431254436769e-06, + "loss": 0.3575, + "step": 50450 + }, + { + "epoch": 5.054089247257975, + "grad_norm": 2.0061964988708496, + "learning_rate": 8.967794391088052e-06, + "loss": 0.3538, + "step": 50460 + }, + { + "epoch": 5.055090899984975, + "grad_norm": 2.0933334827423096, + "learning_rate": 8.959160776603152e-06, + "loss": 0.3712, + "step": 50470 + }, + { + "epoch": 5.056092552711974, + "grad_norm": 2.1422982215881348, + "learning_rate": 8.950530412731634e-06, + "loss": 0.3951, + "step": 50480 + }, + { + "epoch": 5.0570942054389745, + "grad_norm": 2.5245361328125, + "learning_rate": 8.941903301222412e-06, + "loss": 0.3951, + "step": 50490 + }, + { + "epoch": 5.058095858165974, + "grad_norm": 1.8500747680664062, + "learning_rate": 8.933279443823733e-06, + "loss": 0.3443, + "step": 50500 + }, + { + "epoch": 5.059097510892974, + "grad_norm": 1.7166526317596436, + "learning_rate": 8.92465884228319e-06, + "loss": 0.4024, + "step": 50510 + }, + { + "epoch": 5.060099163619973, + "grad_norm": 1.9748985767364502, + "learning_rate": 8.916041498347712e-06, + "loss": 0.3806, + "step": 50520 + }, + { + "epoch": 5.061100816346973, + "grad_norm": 2.0966179370880127, + "learning_rate": 8.907427413763573e-06, + "loss": 0.3958, + "step": 50530 + }, + { + "epoch": 5.062102469073972, + "grad_norm": 3.496039867401123, + "learning_rate": 8.898816590276379e-06, + "loss": 0.3929, + "step": 50540 + }, + { + "epoch": 5.063104121800972, + "grad_norm": 1.860559105873108, + "learning_rate": 8.890209029631086e-06, + "loss": 0.3346, + "step": 50550 + }, + { + "epoch": 5.064105774527971, + "grad_norm": 2.681044340133667, + "learning_rate": 8.881604733571977e-06, + "loss": 0.3904, + "step": 50560 + }, + { + "epoch": 5.06510742725497, + "grad_norm": 1.6064103841781616, + "learning_rate": 8.873003703842681e-06, + "loss": 0.4072, + "step": 50570 + }, + { + "epoch": 5.06610907998197, + "grad_norm": 2.7219972610473633, + "learning_rate": 8.864405942186163e-06, + "loss": 0.4058, + "step": 50580 + }, + { + "epoch": 5.0671107327089695, + "grad_norm": 2.393876314163208, + "learning_rate": 8.855811450344729e-06, + "loss": 0.455, + "step": 50590 + }, + { + "epoch": 5.0681123854359695, + "grad_norm": 2.1746304035186768, + "learning_rate": 8.847220230060014e-06, + "loss": 0.4109, + "step": 50600 + }, + { + "epoch": 5.069114038162969, + "grad_norm": 1.8715713024139404, + "learning_rate": 8.838632283072998e-06, + "loss": 0.346, + "step": 50610 + }, + { + "epoch": 5.070115690889969, + "grad_norm": 2.0678677558898926, + "learning_rate": 8.830047611123992e-06, + "loss": 0.3984, + "step": 50620 + }, + { + "epoch": 5.071117343616968, + "grad_norm": 1.8897209167480469, + "learning_rate": 8.821466215952651e-06, + "loss": 0.3794, + "step": 50630 + }, + { + "epoch": 5.072118996343968, + "grad_norm": 3.4548091888427734, + "learning_rate": 8.81288809929796e-06, + "loss": 0.3808, + "step": 50640 + }, + { + "epoch": 5.073120649070967, + "grad_norm": 1.9121617078781128, + "learning_rate": 8.804313262898234e-06, + "loss": 0.4143, + "step": 50650 + }, + { + "epoch": 5.074122301797967, + "grad_norm": 2.0208704471588135, + "learning_rate": 8.795741708491139e-06, + "loss": 0.3962, + "step": 50660 + }, + { + "epoch": 5.075123954524966, + "grad_norm": 2.1051366329193115, + "learning_rate": 8.787173437813664e-06, + "loss": 0.3973, + "step": 50670 + }, + { + "epoch": 5.076125607251965, + "grad_norm": 2.504681348800659, + "learning_rate": 8.778608452602136e-06, + "loss": 0.406, + "step": 50680 + }, + { + "epoch": 5.077127259978965, + "grad_norm": 2.3707287311553955, + "learning_rate": 8.770046754592211e-06, + "loss": 0.4546, + "step": 50690 + }, + { + "epoch": 5.078128912705965, + "grad_norm": 2.108637809753418, + "learning_rate": 8.761488345518893e-06, + "loss": 0.3693, + "step": 50700 + }, + { + "epoch": 5.079130565432965, + "grad_norm": 2.4608542919158936, + "learning_rate": 8.752933227116503e-06, + "loss": 0.4373, + "step": 50710 + }, + { + "epoch": 5.080132218159964, + "grad_norm": 2.6883091926574707, + "learning_rate": 8.7443814011187e-06, + "loss": 0.4489, + "step": 50720 + }, + { + "epoch": 5.081133870886964, + "grad_norm": 1.7556604146957397, + "learning_rate": 8.735832869258486e-06, + "loss": 0.3975, + "step": 50730 + }, + { + "epoch": 5.082135523613963, + "grad_norm": 2.1072566509246826, + "learning_rate": 8.727287633268182e-06, + "loss": 0.4055, + "step": 50740 + }, + { + "epoch": 5.083137176340963, + "grad_norm": 2.7997405529022217, + "learning_rate": 8.718745694879451e-06, + "loss": 0.4077, + "step": 50750 + }, + { + "epoch": 5.084138829067962, + "grad_norm": 2.5607924461364746, + "learning_rate": 8.710207055823272e-06, + "loss": 0.4069, + "step": 50760 + }, + { + "epoch": 5.085140481794961, + "grad_norm": 2.0641298294067383, + "learning_rate": 8.701671717829993e-06, + "loss": 0.371, + "step": 50770 + }, + { + "epoch": 5.086142134521961, + "grad_norm": 2.205183982849121, + "learning_rate": 8.69313968262924e-06, + "loss": 0.4074, + "step": 50780 + }, + { + "epoch": 5.08714378724896, + "grad_norm": 2.3521621227264404, + "learning_rate": 8.684610951950006e-06, + "loss": 0.3741, + "step": 50790 + }, + { + "epoch": 5.0881454399759605, + "grad_norm": 2.01767635345459, + "learning_rate": 8.676085527520605e-06, + "loss": 0.4264, + "step": 50800 + }, + { + "epoch": 5.08914709270296, + "grad_norm": 2.336679458618164, + "learning_rate": 8.66756341106868e-06, + "loss": 0.4016, + "step": 50810 + }, + { + "epoch": 5.09014874542996, + "grad_norm": 2.2592053413391113, + "learning_rate": 8.659044604321206e-06, + "loss": 0.4801, + "step": 50820 + }, + { + "epoch": 5.091150398156959, + "grad_norm": 2.573158025741577, + "learning_rate": 8.65052910900448e-06, + "loss": 0.4175, + "step": 50830 + }, + { + "epoch": 5.092152050883959, + "grad_norm": 2.452357292175293, + "learning_rate": 8.642016926844154e-06, + "loss": 0.4282, + "step": 50840 + }, + { + "epoch": 5.093153703610958, + "grad_norm": 1.6255464553833008, + "learning_rate": 8.633508059565166e-06, + "loss": 0.4136, + "step": 50850 + }, + { + "epoch": 5.094155356337958, + "grad_norm": 1.7420393228530884, + "learning_rate": 8.625002508891813e-06, + "loss": 0.3775, + "step": 50860 + }, + { + "epoch": 5.095157009064957, + "grad_norm": 2.3295178413391113, + "learning_rate": 8.6165002765477e-06, + "loss": 0.4032, + "step": 50870 + }, + { + "epoch": 5.096158661791956, + "grad_norm": 1.9889898300170898, + "learning_rate": 8.608001364255803e-06, + "loss": 0.4529, + "step": 50880 + }, + { + "epoch": 5.097160314518956, + "grad_norm": 2.4790945053100586, + "learning_rate": 8.599505773738365e-06, + "loss": 0.3701, + "step": 50890 + }, + { + "epoch": 5.0981619672459555, + "grad_norm": 2.510486602783203, + "learning_rate": 8.59101350671698e-06, + "loss": 0.4356, + "step": 50900 + }, + { + "epoch": 5.0991636199729555, + "grad_norm": 2.439235210418701, + "learning_rate": 8.582524564912604e-06, + "loss": 0.4114, + "step": 50910 + }, + { + "epoch": 5.100165272699955, + "grad_norm": 2.2638766765594482, + "learning_rate": 8.574038950045457e-06, + "loss": 0.3951, + "step": 50920 + }, + { + "epoch": 5.101166925426955, + "grad_norm": 2.317436933517456, + "learning_rate": 8.565556663835131e-06, + "loss": 0.4255, + "step": 50930 + }, + { + "epoch": 5.102168578153954, + "grad_norm": 2.248082160949707, + "learning_rate": 8.557077708000514e-06, + "loss": 0.445, + "step": 50940 + }, + { + "epoch": 5.103170230880954, + "grad_norm": 2.4589478969573975, + "learning_rate": 8.54860208425986e-06, + "loss": 0.3887, + "step": 50950 + }, + { + "epoch": 5.104171883607953, + "grad_norm": 2.3282318115234375, + "learning_rate": 8.540129794330699e-06, + "loss": 0.3674, + "step": 50960 + }, + { + "epoch": 5.105173536334953, + "grad_norm": 1.6831501722335815, + "learning_rate": 8.5316608399299e-06, + "loss": 0.4535, + "step": 50970 + }, + { + "epoch": 5.106175189061952, + "grad_norm": 1.7259416580200195, + "learning_rate": 8.523195222773688e-06, + "loss": 0.3896, + "step": 50980 + }, + { + "epoch": 5.107176841788951, + "grad_norm": 2.5421462059020996, + "learning_rate": 8.514732944577583e-06, + "loss": 0.3938, + "step": 50990 + }, + { + "epoch": 5.108178494515951, + "grad_norm": 2.577516794204712, + "learning_rate": 8.506274007056412e-06, + "loss": 0.4579, + "step": 51000 + }, + { + "epoch": 5.1091801472429506, + "grad_norm": 2.2617828845977783, + "learning_rate": 8.497818411924363e-06, + "loss": 0.4584, + "step": 51010 + }, + { + "epoch": 5.110181799969951, + "grad_norm": 2.734596014022827, + "learning_rate": 8.489366160894937e-06, + "loss": 0.4102, + "step": 51020 + }, + { + "epoch": 5.11118345269695, + "grad_norm": 2.1711714267730713, + "learning_rate": 8.480917255680929e-06, + "loss": 0.4107, + "step": 51030 + }, + { + "epoch": 5.11218510542395, + "grad_norm": 2.1046245098114014, + "learning_rate": 8.472471697994478e-06, + "loss": 0.4126, + "step": 51040 + }, + { + "epoch": 5.113186758150949, + "grad_norm": 2.018583059310913, + "learning_rate": 8.464029489547057e-06, + "loss": 0.3639, + "step": 51050 + }, + { + "epoch": 5.114188410877949, + "grad_norm": 2.4939472675323486, + "learning_rate": 8.455590632049451e-06, + "loss": 0.391, + "step": 51060 + }, + { + "epoch": 5.115190063604948, + "grad_norm": 2.2978515625, + "learning_rate": 8.447155127211734e-06, + "loss": 0.3951, + "step": 51070 + }, + { + "epoch": 5.116191716331948, + "grad_norm": 2.4101288318634033, + "learning_rate": 8.438722976743352e-06, + "loss": 0.4014, + "step": 51080 + }, + { + "epoch": 5.117193369058947, + "grad_norm": 1.3514302968978882, + "learning_rate": 8.430294182353049e-06, + "loss": 0.3879, + "step": 51090 + }, + { + "epoch": 5.118195021785946, + "grad_norm": 3.136054277420044, + "learning_rate": 8.421868745748873e-06, + "loss": 0.4316, + "step": 51100 + }, + { + "epoch": 5.1191966745129465, + "grad_norm": 2.85766339302063, + "learning_rate": 8.4134466686382e-06, + "loss": 0.3821, + "step": 51110 + }, + { + "epoch": 5.120198327239946, + "grad_norm": 2.4241206645965576, + "learning_rate": 8.405027952727754e-06, + "loss": 0.5232, + "step": 51120 + }, + { + "epoch": 5.121199979966946, + "grad_norm": 1.8085269927978516, + "learning_rate": 8.39661259972355e-06, + "loss": 0.4257, + "step": 51130 + }, + { + "epoch": 5.122201632693945, + "grad_norm": 1.9889925718307495, + "learning_rate": 8.388200611330902e-06, + "loss": 0.4042, + "step": 51140 + }, + { + "epoch": 5.123203285420945, + "grad_norm": 1.835249662399292, + "learning_rate": 8.379791989254493e-06, + "loss": 0.3501, + "step": 51150 + }, + { + "epoch": 5.124204938147944, + "grad_norm": 2.209379196166992, + "learning_rate": 8.371386735198292e-06, + "loss": 0.3846, + "step": 51160 + }, + { + "epoch": 5.125206590874944, + "grad_norm": 1.8162431716918945, + "learning_rate": 8.36298485086559e-06, + "loss": 0.4411, + "step": 51170 + }, + { + "epoch": 5.126208243601943, + "grad_norm": 2.113241195678711, + "learning_rate": 8.354586337958983e-06, + "loss": 0.4041, + "step": 51180 + }, + { + "epoch": 5.127209896328942, + "grad_norm": 1.9983274936676025, + "learning_rate": 8.346191198180414e-06, + "loss": 0.379, + "step": 51190 + }, + { + "epoch": 5.128211549055942, + "grad_norm": 2.5187878608703613, + "learning_rate": 8.337799433231126e-06, + "loss": 0.4103, + "step": 51200 + }, + { + "epoch": 5.1292132017829415, + "grad_norm": 2.1544573307037354, + "learning_rate": 8.329411044811653e-06, + "loss": 0.4325, + "step": 51210 + }, + { + "epoch": 5.1302148545099415, + "grad_norm": 2.4598071575164795, + "learning_rate": 8.321026034621896e-06, + "loss": 0.3789, + "step": 51220 + }, + { + "epoch": 5.131216507236941, + "grad_norm": 2.138986349105835, + "learning_rate": 8.312644404361033e-06, + "loss": 0.4073, + "step": 51230 + }, + { + "epoch": 5.132218159963941, + "grad_norm": 2.2500813007354736, + "learning_rate": 8.30426615572758e-06, + "loss": 0.4308, + "step": 51240 + }, + { + "epoch": 5.13321981269094, + "grad_norm": 2.7701327800750732, + "learning_rate": 8.295891290419334e-06, + "loss": 0.3836, + "step": 51250 + }, + { + "epoch": 5.13422146541794, + "grad_norm": 1.6952491998672485, + "learning_rate": 8.287519810133443e-06, + "loss": 0.4298, + "step": 51260 + }, + { + "epoch": 5.135223118144939, + "grad_norm": 1.9191524982452393, + "learning_rate": 8.279151716566358e-06, + "loss": 0.3448, + "step": 51270 + }, + { + "epoch": 5.136224770871939, + "grad_norm": 1.756516695022583, + "learning_rate": 8.270787011413833e-06, + "loss": 0.4178, + "step": 51280 + }, + { + "epoch": 5.137226423598938, + "grad_norm": 1.841568112373352, + "learning_rate": 8.262425696370949e-06, + "loss": 0.3687, + "step": 51290 + }, + { + "epoch": 5.138228076325937, + "grad_norm": 2.1284594535827637, + "learning_rate": 8.254067773132085e-06, + "loss": 0.3736, + "step": 51300 + }, + { + "epoch": 5.139229729052937, + "grad_norm": 1.7997220754623413, + "learning_rate": 8.24571324339096e-06, + "loss": 0.3807, + "step": 51310 + }, + { + "epoch": 5.1402313817799365, + "grad_norm": 2.3964970111846924, + "learning_rate": 8.237362108840555e-06, + "loss": 0.4019, + "step": 51320 + }, + { + "epoch": 5.141233034506937, + "grad_norm": 2.325812816619873, + "learning_rate": 8.22901437117322e-06, + "loss": 0.3861, + "step": 51330 + }, + { + "epoch": 5.142234687233936, + "grad_norm": 2.397308826446533, + "learning_rate": 8.220670032080587e-06, + "loss": 0.4339, + "step": 51340 + }, + { + "epoch": 5.143236339960936, + "grad_norm": 2.3338468074798584, + "learning_rate": 8.212329093253605e-06, + "loss": 0.4321, + "step": 51350 + }, + { + "epoch": 5.144237992687935, + "grad_norm": 1.8569867610931396, + "learning_rate": 8.203991556382523e-06, + "loss": 0.3943, + "step": 51360 + }, + { + "epoch": 5.145239645414935, + "grad_norm": 1.970132827758789, + "learning_rate": 8.195657423156921e-06, + "loss": 0.3841, + "step": 51370 + }, + { + "epoch": 5.146241298141934, + "grad_norm": 2.166137933731079, + "learning_rate": 8.187326695265671e-06, + "loss": 0.4063, + "step": 51380 + }, + { + "epoch": 5.147242950868934, + "grad_norm": 2.590151786804199, + "learning_rate": 8.178999374396967e-06, + "loss": 0.4072, + "step": 51390 + }, + { + "epoch": 5.148244603595933, + "grad_norm": 2.1858596801757812, + "learning_rate": 8.170675462238306e-06, + "loss": 0.3971, + "step": 51400 + }, + { + "epoch": 5.149246256322932, + "grad_norm": 2.6428606510162354, + "learning_rate": 8.162354960476498e-06, + "loss": 0.4001, + "step": 51410 + }, + { + "epoch": 5.1502479090499325, + "grad_norm": 2.1872785091400146, + "learning_rate": 8.154037870797657e-06, + "loss": 0.3665, + "step": 51420 + }, + { + "epoch": 5.151249561776932, + "grad_norm": 2.6396920680999756, + "learning_rate": 8.14572419488721e-06, + "loss": 0.3787, + "step": 51430 + }, + { + "epoch": 5.152251214503932, + "grad_norm": 2.438502073287964, + "learning_rate": 8.137413934429893e-06, + "loss": 0.3889, + "step": 51440 + }, + { + "epoch": 5.153252867230931, + "grad_norm": 2.2407338619232178, + "learning_rate": 8.12910709110975e-06, + "loss": 0.4352, + "step": 51450 + }, + { + "epoch": 5.154254519957931, + "grad_norm": 1.8680403232574463, + "learning_rate": 8.120803666610122e-06, + "loss": 0.367, + "step": 51460 + }, + { + "epoch": 5.15525617268493, + "grad_norm": 2.1846930980682373, + "learning_rate": 8.112503662613672e-06, + "loss": 0.3574, + "step": 51470 + }, + { + "epoch": 5.15625782541193, + "grad_norm": 1.9968862533569336, + "learning_rate": 8.104207080802361e-06, + "loss": 0.4072, + "step": 51480 + }, + { + "epoch": 5.157259478138929, + "grad_norm": 2.1309328079223633, + "learning_rate": 8.09591392285746e-06, + "loss": 0.4554, + "step": 51490 + }, + { + "epoch": 5.158261130865929, + "grad_norm": 2.3871166706085205, + "learning_rate": 8.087624190459545e-06, + "loss": 0.4323, + "step": 51500 + }, + { + "epoch": 5.159262783592928, + "grad_norm": 2.2008261680603027, + "learning_rate": 8.079337885288496e-06, + "loss": 0.4549, + "step": 51510 + }, + { + "epoch": 5.1602644363199275, + "grad_norm": 2.131897211074829, + "learning_rate": 8.071055009023505e-06, + "loss": 0.4221, + "step": 51520 + }, + { + "epoch": 5.1612660890469275, + "grad_norm": 2.741530418395996, + "learning_rate": 8.062775563343056e-06, + "loss": 0.4023, + "step": 51530 + }, + { + "epoch": 5.162267741773927, + "grad_norm": 1.911015272140503, + "learning_rate": 8.054499549924955e-06, + "loss": 0.4323, + "step": 51540 + }, + { + "epoch": 5.163269394500927, + "grad_norm": 2.1528337001800537, + "learning_rate": 8.046226970446299e-06, + "loss": 0.4609, + "step": 51550 + }, + { + "epoch": 5.164271047227926, + "grad_norm": 2.0719454288482666, + "learning_rate": 8.037957826583497e-06, + "loss": 0.3626, + "step": 51560 + }, + { + "epoch": 5.165272699954926, + "grad_norm": 2.4185614585876465, + "learning_rate": 8.029692120012255e-06, + "loss": 0.4371, + "step": 51570 + }, + { + "epoch": 5.166274352681925, + "grad_norm": 2.706132411956787, + "learning_rate": 8.021429852407592e-06, + "loss": 0.4581, + "step": 51580 + }, + { + "epoch": 5.167276005408925, + "grad_norm": 2.2206037044525146, + "learning_rate": 8.013171025443816e-06, + "loss": 0.3972, + "step": 51590 + }, + { + "epoch": 5.168277658135924, + "grad_norm": 1.9738082885742188, + "learning_rate": 8.004915640794553e-06, + "loss": 0.3954, + "step": 51600 + }, + { + "epoch": 5.169279310862924, + "grad_norm": 2.3320820331573486, + "learning_rate": 7.996663700132723e-06, + "loss": 0.4122, + "step": 51610 + }, + { + "epoch": 5.170280963589923, + "grad_norm": 1.627105474472046, + "learning_rate": 7.988415205130545e-06, + "loss": 0.3838, + "step": 51620 + }, + { + "epoch": 5.1712826163169225, + "grad_norm": 1.9464679956436157, + "learning_rate": 7.980170157459549e-06, + "loss": 0.3813, + "step": 51630 + }, + { + "epoch": 5.172284269043923, + "grad_norm": 2.1501245498657227, + "learning_rate": 7.971928558790562e-06, + "loss": 0.4326, + "step": 51640 + }, + { + "epoch": 5.173285921770922, + "grad_norm": 1.8441542387008667, + "learning_rate": 7.963690410793709e-06, + "loss": 0.3666, + "step": 51650 + }, + { + "epoch": 5.174287574497922, + "grad_norm": 1.986935019493103, + "learning_rate": 7.955455715138419e-06, + "loss": 0.4267, + "step": 51660 + }, + { + "epoch": 5.175289227224921, + "grad_norm": 2.072371006011963, + "learning_rate": 7.94722447349342e-06, + "loss": 0.4365, + "step": 51670 + }, + { + "epoch": 5.176290879951921, + "grad_norm": 1.5192880630493164, + "learning_rate": 7.938996687526745e-06, + "loss": 0.3503, + "step": 51680 + }, + { + "epoch": 5.17729253267892, + "grad_norm": 2.655179500579834, + "learning_rate": 7.930772358905719e-06, + "loss": 0.4375, + "step": 51690 + }, + { + "epoch": 5.17829418540592, + "grad_norm": 2.249799966812134, + "learning_rate": 7.92255148929697e-06, + "loss": 0.3607, + "step": 51700 + }, + { + "epoch": 5.179295838132919, + "grad_norm": 2.191415548324585, + "learning_rate": 7.914334080366428e-06, + "loss": 0.3986, + "step": 51710 + }, + { + "epoch": 5.180297490859919, + "grad_norm": 2.2049078941345215, + "learning_rate": 7.906120133779318e-06, + "loss": 0.3843, + "step": 51720 + }, + { + "epoch": 5.1812991435869185, + "grad_norm": 3.0092198848724365, + "learning_rate": 7.897909651200152e-06, + "loss": 0.4309, + "step": 51730 + }, + { + "epoch": 5.182300796313918, + "grad_norm": 2.194514751434326, + "learning_rate": 7.889702634292785e-06, + "loss": 0.4127, + "step": 51740 + }, + { + "epoch": 5.183302449040918, + "grad_norm": 2.2221009731292725, + "learning_rate": 7.881499084720301e-06, + "loss": 0.3811, + "step": 51750 + }, + { + "epoch": 5.184304101767917, + "grad_norm": 1.7878801822662354, + "learning_rate": 7.873299004145136e-06, + "loss": 0.3726, + "step": 51760 + }, + { + "epoch": 5.185305754494917, + "grad_norm": 2.5555198192596436, + "learning_rate": 7.86510239422899e-06, + "loss": 0.4467, + "step": 51770 + }, + { + "epoch": 5.186307407221916, + "grad_norm": 2.4053938388824463, + "learning_rate": 7.8569092566329e-06, + "loss": 0.3929, + "step": 51780 + }, + { + "epoch": 5.187309059948916, + "grad_norm": 2.1048076152801514, + "learning_rate": 7.84871959301715e-06, + "loss": 0.3874, + "step": 51790 + }, + { + "epoch": 5.188310712675915, + "grad_norm": 2.1604602336883545, + "learning_rate": 7.840533405041343e-06, + "loss": 0.381, + "step": 51800 + }, + { + "epoch": 5.189312365402915, + "grad_norm": 2.2942380905151367, + "learning_rate": 7.8323506943644e-06, + "loss": 0.4038, + "step": 51810 + }, + { + "epoch": 5.190314018129914, + "grad_norm": 2.568305015563965, + "learning_rate": 7.824171462644493e-06, + "loss": 0.4428, + "step": 51820 + }, + { + "epoch": 5.1913156708569135, + "grad_norm": 2.6570141315460205, + "learning_rate": 7.81599571153912e-06, + "loss": 0.4239, + "step": 51830 + }, + { + "epoch": 5.1923173235839135, + "grad_norm": 2.121030807495117, + "learning_rate": 7.807823442705056e-06, + "loss": 0.4091, + "step": 51840 + }, + { + "epoch": 5.193318976310913, + "grad_norm": 2.315417528152466, + "learning_rate": 7.799654657798402e-06, + "loss": 0.39, + "step": 51850 + }, + { + "epoch": 5.194320629037913, + "grad_norm": 2.200780153274536, + "learning_rate": 7.79148935847451e-06, + "loss": 0.3821, + "step": 51860 + }, + { + "epoch": 5.195322281764912, + "grad_norm": 2.2050414085388184, + "learning_rate": 7.783327546388045e-06, + "loss": 0.414, + "step": 51870 + }, + { + "epoch": 5.196323934491912, + "grad_norm": 2.513406991958618, + "learning_rate": 7.77516922319298e-06, + "loss": 0.4066, + "step": 51880 + }, + { + "epoch": 5.197325587218911, + "grad_norm": 2.4052371978759766, + "learning_rate": 7.767014390542565e-06, + "loss": 0.3507, + "step": 51890 + }, + { + "epoch": 5.198327239945911, + "grad_norm": 2.228341579437256, + "learning_rate": 7.758863050089337e-06, + "loss": 0.4601, + "step": 51900 + }, + { + "epoch": 5.19932889267291, + "grad_norm": 2.076115608215332, + "learning_rate": 7.750715203485127e-06, + "loss": 0.3405, + "step": 51910 + }, + { + "epoch": 5.20033054539991, + "grad_norm": 2.2756433486938477, + "learning_rate": 7.742570852381092e-06, + "loss": 0.4388, + "step": 51920 + }, + { + "epoch": 5.201332198126909, + "grad_norm": 1.7769001722335815, + "learning_rate": 7.734429998427626e-06, + "loss": 0.3725, + "step": 51930 + }, + { + "epoch": 5.2023338508539085, + "grad_norm": 2.7252211570739746, + "learning_rate": 7.726292643274441e-06, + "loss": 0.4404, + "step": 51940 + }, + { + "epoch": 5.203335503580909, + "grad_norm": 2.693631887435913, + "learning_rate": 7.718158788570557e-06, + "loss": 0.3966, + "step": 51950 + }, + { + "epoch": 5.204337156307908, + "grad_norm": 1.9786999225616455, + "learning_rate": 7.71002843596427e-06, + "loss": 0.3878, + "step": 51960 + }, + { + "epoch": 5.205338809034908, + "grad_norm": 2.3402976989746094, + "learning_rate": 7.701901587103146e-06, + "loss": 0.391, + "step": 51970 + }, + { + "epoch": 5.206340461761907, + "grad_norm": 1.6999900341033936, + "learning_rate": 7.69377824363406e-06, + "loss": 0.4084, + "step": 51980 + }, + { + "epoch": 5.207342114488907, + "grad_norm": 2.1598548889160156, + "learning_rate": 7.685658407203192e-06, + "loss": 0.3995, + "step": 51990 + }, + { + "epoch": 5.208343767215906, + "grad_norm": 2.034207582473755, + "learning_rate": 7.677542079455994e-06, + "loss": 0.4061, + "step": 52000 + }, + { + "epoch": 5.209345419942906, + "grad_norm": 2.343937635421753, + "learning_rate": 7.669429262037183e-06, + "loss": 0.355, + "step": 52010 + }, + { + "epoch": 5.210347072669905, + "grad_norm": 2.3966968059539795, + "learning_rate": 7.661319956590817e-06, + "loss": 0.3928, + "step": 52020 + }, + { + "epoch": 5.211348725396905, + "grad_norm": 3.0318691730499268, + "learning_rate": 7.653214164760217e-06, + "loss": 0.4378, + "step": 52030 + }, + { + "epoch": 5.2123503781239044, + "grad_norm": 2.464893341064453, + "learning_rate": 7.645111888187967e-06, + "loss": 0.3915, + "step": 52040 + }, + { + "epoch": 5.213352030850904, + "grad_norm": 1.6739379167556763, + "learning_rate": 7.637013128515966e-06, + "loss": 0.3769, + "step": 52050 + }, + { + "epoch": 5.214353683577904, + "grad_norm": 1.666397213935852, + "learning_rate": 7.6289178873854086e-06, + "loss": 0.427, + "step": 52060 + }, + { + "epoch": 5.215355336304903, + "grad_norm": 2.2799434661865234, + "learning_rate": 7.62082616643677e-06, + "loss": 0.3725, + "step": 52070 + }, + { + "epoch": 5.216356989031903, + "grad_norm": 2.4019501209259033, + "learning_rate": 7.612737967309777e-06, + "loss": 0.3981, + "step": 52080 + }, + { + "epoch": 5.217358641758902, + "grad_norm": 2.1799919605255127, + "learning_rate": 7.604653291643496e-06, + "loss": 0.4186, + "step": 52090 + }, + { + "epoch": 5.218360294485902, + "grad_norm": 2.0130274295806885, + "learning_rate": 7.59657214107625e-06, + "loss": 0.4132, + "step": 52100 + }, + { + "epoch": 5.219361947212901, + "grad_norm": 1.9630812406539917, + "learning_rate": 7.588494517245656e-06, + "loss": 0.351, + "step": 52110 + }, + { + "epoch": 5.220363599939901, + "grad_norm": 2.2186408042907715, + "learning_rate": 7.5804204217885925e-06, + "loss": 0.4274, + "step": 52120 + }, + { + "epoch": 5.2213652526669, + "grad_norm": 1.6884628534317017, + "learning_rate": 7.572349856341265e-06, + "loss": 0.3911, + "step": 52130 + }, + { + "epoch": 5.2223669053939, + "grad_norm": 2.2494137287139893, + "learning_rate": 7.564282822539143e-06, + "loss": 0.4608, + "step": 52140 + }, + { + "epoch": 5.2233685581208995, + "grad_norm": 2.3293933868408203, + "learning_rate": 7.556219322016958e-06, + "loss": 0.4025, + "step": 52150 + }, + { + "epoch": 5.224370210847899, + "grad_norm": 2.4375548362731934, + "learning_rate": 7.548159356408766e-06, + "loss": 0.4536, + "step": 52160 + }, + { + "epoch": 5.225371863574899, + "grad_norm": 1.501153826713562, + "learning_rate": 7.540102927347883e-06, + "loss": 0.393, + "step": 52170 + }, + { + "epoch": 5.226373516301898, + "grad_norm": 2.5104174613952637, + "learning_rate": 7.53205003646692e-06, + "loss": 0.4326, + "step": 52180 + }, + { + "epoch": 5.227375169028898, + "grad_norm": 2.1039373874664307, + "learning_rate": 7.524000685397739e-06, + "loss": 0.4187, + "step": 52190 + }, + { + "epoch": 5.228376821755897, + "grad_norm": 2.2166614532470703, + "learning_rate": 7.515954875771533e-06, + "loss": 0.4832, + "step": 52200 + }, + { + "epoch": 5.229378474482897, + "grad_norm": 2.8615593910217285, + "learning_rate": 7.507912609218759e-06, + "loss": 0.3768, + "step": 52210 + }, + { + "epoch": 5.230380127209896, + "grad_norm": 2.372994899749756, + "learning_rate": 7.499873887369119e-06, + "loss": 0.4151, + "step": 52220 + }, + { + "epoch": 5.231381779936896, + "grad_norm": 2.192148447036743, + "learning_rate": 7.491838711851659e-06, + "loss": 0.4568, + "step": 52230 + }, + { + "epoch": 5.232383432663895, + "grad_norm": 2.341484785079956, + "learning_rate": 7.483807084294664e-06, + "loss": 0.3927, + "step": 52240 + }, + { + "epoch": 5.2333850853908945, + "grad_norm": 1.8965469598770142, + "learning_rate": 7.475779006325723e-06, + "loss": 0.4777, + "step": 52250 + }, + { + "epoch": 5.234386738117895, + "grad_norm": 2.039476156234741, + "learning_rate": 7.467754479571667e-06, + "loss": 0.4218, + "step": 52260 + }, + { + "epoch": 5.235388390844894, + "grad_norm": 1.3191801309585571, + "learning_rate": 7.459733505658661e-06, + "loss": 0.3756, + "step": 52270 + }, + { + "epoch": 5.236390043571894, + "grad_norm": 2.1452765464782715, + "learning_rate": 7.4517160862121185e-06, + "loss": 0.4151, + "step": 52280 + }, + { + "epoch": 5.237391696298893, + "grad_norm": 1.9325929880142212, + "learning_rate": 7.443702222856735e-06, + "loss": 0.3947, + "step": 52290 + }, + { + "epoch": 5.238393349025893, + "grad_norm": 2.707630157470703, + "learning_rate": 7.435691917216489e-06, + "loss": 0.4178, + "step": 52300 + }, + { + "epoch": 5.239395001752892, + "grad_norm": 2.664762020111084, + "learning_rate": 7.427685170914636e-06, + "loss": 0.3769, + "step": 52310 + }, + { + "epoch": 5.240396654479892, + "grad_norm": 1.8347071409225464, + "learning_rate": 7.4196819855737255e-06, + "loss": 0.4197, + "step": 52320 + }, + { + "epoch": 5.241398307206891, + "grad_norm": 1.889101266860962, + "learning_rate": 7.411682362815542e-06, + "loss": 0.411, + "step": 52330 + }, + { + "epoch": 5.242399959933891, + "grad_norm": 3.006286144256592, + "learning_rate": 7.403686304261204e-06, + "loss": 0.4188, + "step": 52340 + }, + { + "epoch": 5.24340161266089, + "grad_norm": 2.348801851272583, + "learning_rate": 7.3956938115310734e-06, + "loss": 0.4278, + "step": 52350 + }, + { + "epoch": 5.24440326538789, + "grad_norm": 2.0983736515045166, + "learning_rate": 7.387704886244798e-06, + "loss": 0.4544, + "step": 52360 + }, + { + "epoch": 5.24540491811489, + "grad_norm": 3.7822070121765137, + "learning_rate": 7.3797195300213005e-06, + "loss": 0.4355, + "step": 52370 + }, + { + "epoch": 5.246406570841889, + "grad_norm": 2.2507216930389404, + "learning_rate": 7.371737744478785e-06, + "loss": 0.4098, + "step": 52380 + }, + { + "epoch": 5.247408223568889, + "grad_norm": 2.440915107727051, + "learning_rate": 7.363759531234729e-06, + "loss": 0.3955, + "step": 52390 + }, + { + "epoch": 5.248409876295888, + "grad_norm": 1.8399525880813599, + "learning_rate": 7.355784891905882e-06, + "loss": 0.3885, + "step": 52400 + }, + { + "epoch": 5.249411529022888, + "grad_norm": 2.2914631366729736, + "learning_rate": 7.347813828108277e-06, + "loss": 0.411, + "step": 52410 + }, + { + "epoch": 5.250413181749887, + "grad_norm": 2.0582337379455566, + "learning_rate": 7.339846341457221e-06, + "loss": 0.3832, + "step": 52420 + }, + { + "epoch": 5.251414834476887, + "grad_norm": 1.9887348413467407, + "learning_rate": 7.331882433567289e-06, + "loss": 0.4259, + "step": 52430 + }, + { + "epoch": 5.252416487203886, + "grad_norm": 2.355299472808838, + "learning_rate": 7.323922106052339e-06, + "loss": 0.3626, + "step": 52440 + }, + { + "epoch": 5.253418139930886, + "grad_norm": 2.0583131313323975, + "learning_rate": 7.315965360525498e-06, + "loss": 0.4364, + "step": 52450 + }, + { + "epoch": 5.2544197926578855, + "grad_norm": 2.732943296432495, + "learning_rate": 7.308012198599173e-06, + "loss": 0.4869, + "step": 52460 + }, + { + "epoch": 5.255421445384885, + "grad_norm": 2.507206439971924, + "learning_rate": 7.300062621885037e-06, + "loss": 0.4206, + "step": 52470 + }, + { + "epoch": 5.256423098111885, + "grad_norm": 2.7191555500030518, + "learning_rate": 7.292116631994045e-06, + "loss": 0.411, + "step": 52480 + }, + { + "epoch": 5.257424750838884, + "grad_norm": 1.7782527208328247, + "learning_rate": 7.284174230536417e-06, + "loss": 0.3589, + "step": 52490 + }, + { + "epoch": 5.258426403565884, + "grad_norm": 2.6875455379486084, + "learning_rate": 7.276235419121649e-06, + "loss": 0.3732, + "step": 52500 + }, + { + "epoch": 5.259428056292883, + "grad_norm": 2.146796464920044, + "learning_rate": 7.268300199358516e-06, + "loss": 0.3572, + "step": 52510 + }, + { + "epoch": 5.260429709019883, + "grad_norm": 2.5086498260498047, + "learning_rate": 7.260368572855053e-06, + "loss": 0.4919, + "step": 52520 + }, + { + "epoch": 5.261431361746882, + "grad_norm": 2.6334359645843506, + "learning_rate": 7.2524405412185775e-06, + "loss": 0.383, + "step": 52530 + }, + { + "epoch": 5.262433014473882, + "grad_norm": 2.3560729026794434, + "learning_rate": 7.244516106055671e-06, + "loss": 0.4374, + "step": 52540 + }, + { + "epoch": 5.263434667200881, + "grad_norm": 1.7651573419570923, + "learning_rate": 7.23659526897219e-06, + "loss": 0.3754, + "step": 52550 + }, + { + "epoch": 5.264436319927881, + "grad_norm": 2.338216781616211, + "learning_rate": 7.228678031573263e-06, + "loss": 0.3835, + "step": 52560 + }, + { + "epoch": 5.265437972654881, + "grad_norm": 2.0452308654785156, + "learning_rate": 7.2207643954632865e-06, + "loss": 0.4685, + "step": 52570 + }, + { + "epoch": 5.26643962538188, + "grad_norm": 1.7141194343566895, + "learning_rate": 7.212854362245924e-06, + "loss": 0.3844, + "step": 52580 + }, + { + "epoch": 5.26744127810888, + "grad_norm": 2.3181238174438477, + "learning_rate": 7.20494793352412e-06, + "loss": 0.4083, + "step": 52590 + }, + { + "epoch": 5.268442930835879, + "grad_norm": 2.631800889968872, + "learning_rate": 7.197045110900069e-06, + "loss": 0.359, + "step": 52600 + }, + { + "epoch": 5.269444583562879, + "grad_norm": 1.7754448652267456, + "learning_rate": 7.189145895975272e-06, + "loss": 0.3636, + "step": 52610 + }, + { + "epoch": 5.270446236289878, + "grad_norm": 2.115633964538574, + "learning_rate": 7.181250290350447e-06, + "loss": 0.3871, + "step": 52620 + }, + { + "epoch": 5.271447889016878, + "grad_norm": 2.290174961090088, + "learning_rate": 7.173358295625621e-06, + "loss": 0.4456, + "step": 52630 + }, + { + "epoch": 5.272449541743877, + "grad_norm": 2.639821767807007, + "learning_rate": 7.165469913400072e-06, + "loss": 0.4184, + "step": 52640 + }, + { + "epoch": 5.273451194470877, + "grad_norm": 2.0272226333618164, + "learning_rate": 7.1575851452723496e-06, + "loss": 0.4, + "step": 52650 + }, + { + "epoch": 5.274452847197876, + "grad_norm": 2.293170213699341, + "learning_rate": 7.149703992840276e-06, + "loss": 0.3766, + "step": 52660 + }, + { + "epoch": 5.2754544999248765, + "grad_norm": 3.4424662590026855, + "learning_rate": 7.141826457700923e-06, + "loss": 0.351, + "step": 52670 + }, + { + "epoch": 5.276456152651876, + "grad_norm": 1.8073848485946655, + "learning_rate": 7.133952541450669e-06, + "loss": 0.4577, + "step": 52680 + }, + { + "epoch": 5.277457805378875, + "grad_norm": 2.518890619277954, + "learning_rate": 7.126082245685106e-06, + "loss": 0.3885, + "step": 52690 + }, + { + "epoch": 5.278459458105875, + "grad_norm": 2.2708356380462646, + "learning_rate": 7.118215571999129e-06, + "loss": 0.3841, + "step": 52700 + }, + { + "epoch": 5.279461110832874, + "grad_norm": 1.920397162437439, + "learning_rate": 7.1103525219868795e-06, + "loss": 0.4228, + "step": 52710 + }, + { + "epoch": 5.280462763559874, + "grad_norm": 2.008230686187744, + "learning_rate": 7.102493097241797e-06, + "loss": 0.4011, + "step": 52720 + }, + { + "epoch": 5.281464416286873, + "grad_norm": 1.935886025428772, + "learning_rate": 7.094637299356544e-06, + "loss": 0.3789, + "step": 52730 + }, + { + "epoch": 5.282466069013873, + "grad_norm": 2.69047212600708, + "learning_rate": 7.0867851299230595e-06, + "loss": 0.4479, + "step": 52740 + }, + { + "epoch": 5.283467721740872, + "grad_norm": 1.8354910612106323, + "learning_rate": 7.078936590532584e-06, + "loss": 0.3983, + "step": 52750 + }, + { + "epoch": 5.284469374467872, + "grad_norm": 2.452239990234375, + "learning_rate": 7.071091682775569e-06, + "loss": 0.4333, + "step": 52760 + }, + { + "epoch": 5.2854710271948715, + "grad_norm": 2.979823112487793, + "learning_rate": 7.063250408241761e-06, + "loss": 0.427, + "step": 52770 + }, + { + "epoch": 5.2864726799218715, + "grad_norm": 2.0987963676452637, + "learning_rate": 7.055412768520156e-06, + "loss": 0.3653, + "step": 52780 + }, + { + "epoch": 5.287474332648871, + "grad_norm": 2.366989850997925, + "learning_rate": 7.047578765199048e-06, + "loss": 0.4078, + "step": 52790 + }, + { + "epoch": 5.28847598537587, + "grad_norm": 2.4074864387512207, + "learning_rate": 7.039748399865937e-06, + "loss": 0.4018, + "step": 52800 + }, + { + "epoch": 5.28947763810287, + "grad_norm": 2.3981125354766846, + "learning_rate": 7.031921674107622e-06, + "loss": 0.4609, + "step": 52810 + }, + { + "epoch": 5.290479290829869, + "grad_norm": 2.6520919799804688, + "learning_rate": 7.024098589510181e-06, + "loss": 0.4054, + "step": 52820 + }, + { + "epoch": 5.291480943556869, + "grad_norm": 2.526301145553589, + "learning_rate": 7.016279147658903e-06, + "loss": 0.4026, + "step": 52830 + }, + { + "epoch": 5.292482596283868, + "grad_norm": 1.9764511585235596, + "learning_rate": 7.008463350138381e-06, + "loss": 0.406, + "step": 52840 + }, + { + "epoch": 5.293484249010868, + "grad_norm": 2.1861753463745117, + "learning_rate": 7.000651198532446e-06, + "loss": 0.4219, + "step": 52850 + }, + { + "epoch": 5.294485901737867, + "grad_norm": 1.8499925136566162, + "learning_rate": 6.992842694424221e-06, + "loss": 0.3846, + "step": 52860 + }, + { + "epoch": 5.295487554464867, + "grad_norm": 2.1721975803375244, + "learning_rate": 6.9850378393960495e-06, + "loss": 0.4378, + "step": 52870 + }, + { + "epoch": 5.2964892071918666, + "grad_norm": 2.3228940963745117, + "learning_rate": 6.977236635029552e-06, + "loss": 0.4515, + "step": 52880 + }, + { + "epoch": 5.297490859918867, + "grad_norm": 2.2590081691741943, + "learning_rate": 6.96943908290563e-06, + "loss": 0.4124, + "step": 52890 + }, + { + "epoch": 5.298492512645866, + "grad_norm": 2.0431101322174072, + "learning_rate": 6.961645184604423e-06, + "loss": 0.3689, + "step": 52900 + }, + { + "epoch": 5.299494165372865, + "grad_norm": 1.8751521110534668, + "learning_rate": 6.953854941705323e-06, + "loss": 0.4161, + "step": 52910 + }, + { + "epoch": 5.300495818099865, + "grad_norm": 2.33186936378479, + "learning_rate": 6.946068355786992e-06, + "loss": 0.4703, + "step": 52920 + }, + { + "epoch": 5.301497470826864, + "grad_norm": 2.136018991470337, + "learning_rate": 6.9382854284273715e-06, + "loss": 0.3905, + "step": 52930 + }, + { + "epoch": 5.302499123553864, + "grad_norm": 1.790964126586914, + "learning_rate": 6.930506161203618e-06, + "loss": 0.4172, + "step": 52940 + }, + { + "epoch": 5.303500776280863, + "grad_norm": 1.893381118774414, + "learning_rate": 6.922730555692172e-06, + "loss": 0.4716, + "step": 52950 + }, + { + "epoch": 5.304502429007863, + "grad_norm": 2.6119513511657715, + "learning_rate": 6.914958613468744e-06, + "loss": 0.4113, + "step": 52960 + }, + { + "epoch": 5.305504081734862, + "grad_norm": 1.9697766304016113, + "learning_rate": 6.907190336108288e-06, + "loss": 0.419, + "step": 52970 + }, + { + "epoch": 5.3065057344618625, + "grad_norm": 2.429619312286377, + "learning_rate": 6.899425725184999e-06, + "loss": 0.3799, + "step": 52980 + }, + { + "epoch": 5.307507387188862, + "grad_norm": 2.6430892944335938, + "learning_rate": 6.891664782272347e-06, + "loss": 0.3845, + "step": 52990 + }, + { + "epoch": 5.308509039915861, + "grad_norm": 2.1307921409606934, + "learning_rate": 6.883907508943069e-06, + "loss": 0.3771, + "step": 53000 + }, + { + "epoch": 5.309510692642861, + "grad_norm": 2.107192277908325, + "learning_rate": 6.876153906769148e-06, + "loss": 0.4124, + "step": 53010 + }, + { + "epoch": 5.31051234536986, + "grad_norm": 2.150707721710205, + "learning_rate": 6.868403977321799e-06, + "loss": 0.3855, + "step": 53020 + }, + { + "epoch": 5.31151399809686, + "grad_norm": 2.0903549194335938, + "learning_rate": 6.860657722171534e-06, + "loss": 0.4535, + "step": 53030 + }, + { + "epoch": 5.312515650823859, + "grad_norm": 2.294050455093384, + "learning_rate": 6.852915142888108e-06, + "loss": 0.4359, + "step": 53040 + }, + { + "epoch": 5.313517303550859, + "grad_norm": 2.1415610313415527, + "learning_rate": 6.845176241040505e-06, + "loss": 0.4042, + "step": 53050 + }, + { + "epoch": 5.314518956277858, + "grad_norm": 2.3479254245758057, + "learning_rate": 6.83744101819698e-06, + "loss": 0.4422, + "step": 53060 + }, + { + "epoch": 5.315520609004858, + "grad_norm": 2.104673385620117, + "learning_rate": 6.8297094759250665e-06, + "loss": 0.3555, + "step": 53070 + }, + { + "epoch": 5.3165222617318575, + "grad_norm": 3.1974856853485107, + "learning_rate": 6.8219816157915286e-06, + "loss": 0.3773, + "step": 53080 + }, + { + "epoch": 5.3175239144588575, + "grad_norm": 2.344932794570923, + "learning_rate": 6.814257439362368e-06, + "loss": 0.3502, + "step": 53090 + }, + { + "epoch": 5.318525567185857, + "grad_norm": 2.2202727794647217, + "learning_rate": 6.806536948202874e-06, + "loss": 0.4425, + "step": 53100 + }, + { + "epoch": 5.319527219912856, + "grad_norm": 3.03460955619812, + "learning_rate": 6.798820143877574e-06, + "loss": 0.5061, + "step": 53110 + }, + { + "epoch": 5.320528872639856, + "grad_norm": 1.9732153415679932, + "learning_rate": 6.791107027950244e-06, + "loss": 0.3563, + "step": 53120 + }, + { + "epoch": 5.321530525366855, + "grad_norm": 2.2756731510162354, + "learning_rate": 6.783397601983918e-06, + "loss": 0.3928, + "step": 53130 + }, + { + "epoch": 5.322532178093855, + "grad_norm": 1.9389190673828125, + "learning_rate": 6.775691867540882e-06, + "loss": 0.4561, + "step": 53140 + }, + { + "epoch": 5.323533830820854, + "grad_norm": 2.05364727973938, + "learning_rate": 6.7679898261826775e-06, + "loss": 0.4577, + "step": 53150 + }, + { + "epoch": 5.324535483547854, + "grad_norm": 2.402559995651245, + "learning_rate": 6.760291479470074e-06, + "loss": 0.3989, + "step": 53160 + }, + { + "epoch": 5.325537136274853, + "grad_norm": 2.256415843963623, + "learning_rate": 6.752596828963132e-06, + "loss": 0.4353, + "step": 53170 + }, + { + "epoch": 5.326538789001853, + "grad_norm": 2.008063793182373, + "learning_rate": 6.744905876221133e-06, + "loss": 0.4673, + "step": 53180 + }, + { + "epoch": 5.3275404417288525, + "grad_norm": 1.8382278680801392, + "learning_rate": 6.737218622802621e-06, + "loss": 0.3791, + "step": 53190 + }, + { + "epoch": 5.328542094455852, + "grad_norm": 3.4138710498809814, + "learning_rate": 6.729535070265389e-06, + "loss": 0.4359, + "step": 53200 + }, + { + "epoch": 5.329543747182852, + "grad_norm": 2.198563575744629, + "learning_rate": 6.721855220166479e-06, + "loss": 0.4072, + "step": 53210 + }, + { + "epoch": 5.330545399909851, + "grad_norm": 3.2445294857025146, + "learning_rate": 6.71417907406218e-06, + "loss": 0.4526, + "step": 53220 + }, + { + "epoch": 5.331547052636851, + "grad_norm": 2.4834206104278564, + "learning_rate": 6.706506633508033e-06, + "loss": 0.4159, + "step": 53230 + }, + { + "epoch": 5.33254870536385, + "grad_norm": 2.6208958625793457, + "learning_rate": 6.69883790005883e-06, + "loss": 0.4058, + "step": 53240 + }, + { + "epoch": 5.33355035809085, + "grad_norm": 2.0602033138275146, + "learning_rate": 6.691172875268609e-06, + "loss": 0.3935, + "step": 53250 + }, + { + "epoch": 5.334552010817849, + "grad_norm": 2.335693359375, + "learning_rate": 6.683511560690658e-06, + "loss": 0.4759, + "step": 53260 + }, + { + "epoch": 5.335553663544849, + "grad_norm": 3.5803706645965576, + "learning_rate": 6.675853957877512e-06, + "loss": 0.4427, + "step": 53270 + }, + { + "epoch": 5.336555316271848, + "grad_norm": 1.9978817701339722, + "learning_rate": 6.668200068380953e-06, + "loss": 0.4553, + "step": 53280 + }, + { + "epoch": 5.3375569689988485, + "grad_norm": 2.189866065979004, + "learning_rate": 6.660549893752013e-06, + "loss": 0.4083, + "step": 53290 + }, + { + "epoch": 5.338558621725848, + "grad_norm": 1.6272187232971191, + "learning_rate": 6.652903435540972e-06, + "loss": 0.373, + "step": 53300 + }, + { + "epoch": 5.339560274452847, + "grad_norm": 2.4417366981506348, + "learning_rate": 6.6452606952973534e-06, + "loss": 0.4228, + "step": 53310 + }, + { + "epoch": 5.340561927179847, + "grad_norm": 2.1256210803985596, + "learning_rate": 6.637621674569925e-06, + "loss": 0.4603, + "step": 53320 + }, + { + "epoch": 5.341563579906846, + "grad_norm": 2.3436925411224365, + "learning_rate": 6.629986374906707e-06, + "loss": 0.4088, + "step": 53330 + }, + { + "epoch": 5.342565232633846, + "grad_norm": 1.9916789531707764, + "learning_rate": 6.622354797854965e-06, + "loss": 0.4398, + "step": 53340 + }, + { + "epoch": 5.343566885360845, + "grad_norm": 3.09892201423645, + "learning_rate": 6.614726944961208e-06, + "loss": 0.4802, + "step": 53350 + }, + { + "epoch": 5.344568538087845, + "grad_norm": 2.0518476963043213, + "learning_rate": 6.607102817771191e-06, + "loss": 0.4275, + "step": 53360 + }, + { + "epoch": 5.345570190814844, + "grad_norm": 2.4747159481048584, + "learning_rate": 6.599482417829908e-06, + "loss": 0.457, + "step": 53370 + }, + { + "epoch": 5.346571843541844, + "grad_norm": 2.272169351577759, + "learning_rate": 6.591865746681608e-06, + "loss": 0.3775, + "step": 53380 + }, + { + "epoch": 5.3475734962688435, + "grad_norm": 2.412099599838257, + "learning_rate": 6.584252805869781e-06, + "loss": 0.3946, + "step": 53390 + }, + { + "epoch": 5.3485751489958435, + "grad_norm": 2.5232436656951904, + "learning_rate": 6.576643596937157e-06, + "loss": 0.4018, + "step": 53400 + }, + { + "epoch": 5.349576801722843, + "grad_norm": 2.147260904312134, + "learning_rate": 6.569038121425711e-06, + "loss": 0.4361, + "step": 53410 + }, + { + "epoch": 5.350578454449842, + "grad_norm": 3.0588362216949463, + "learning_rate": 6.561436380876668e-06, + "loss": 0.4696, + "step": 53420 + }, + { + "epoch": 5.351580107176842, + "grad_norm": 2.22107195854187, + "learning_rate": 6.553838376830485e-06, + "loss": 0.3865, + "step": 53430 + }, + { + "epoch": 5.352581759903841, + "grad_norm": 1.650066614151001, + "learning_rate": 6.546244110826874e-06, + "loss": 0.3675, + "step": 53440 + }, + { + "epoch": 5.353583412630841, + "grad_norm": 2.833719253540039, + "learning_rate": 6.5386535844047775e-06, + "loss": 0.3676, + "step": 53450 + }, + { + "epoch": 5.35458506535784, + "grad_norm": 1.93025803565979, + "learning_rate": 6.53106679910239e-06, + "loss": 0.4342, + "step": 53460 + }, + { + "epoch": 5.35558671808484, + "grad_norm": 2.8691656589508057, + "learning_rate": 6.523483756457144e-06, + "loss": 0.4971, + "step": 53470 + }, + { + "epoch": 5.356588370811839, + "grad_norm": 2.115009069442749, + "learning_rate": 6.515904458005709e-06, + "loss": 0.417, + "step": 53480 + }, + { + "epoch": 5.357590023538839, + "grad_norm": 1.9475973844528198, + "learning_rate": 6.508328905284006e-06, + "loss": 0.3852, + "step": 53490 + }, + { + "epoch": 5.3585916762658385, + "grad_norm": 2.8739306926727295, + "learning_rate": 6.500757099827187e-06, + "loss": 0.3797, + "step": 53500 + }, + { + "epoch": 5.359593328992839, + "grad_norm": 2.277933359146118, + "learning_rate": 6.493189043169651e-06, + "loss": 0.3827, + "step": 53510 + }, + { + "epoch": 5.360594981719838, + "grad_norm": 2.5828804969787598, + "learning_rate": 6.485624736845031e-06, + "loss": 0.3898, + "step": 53520 + }, + { + "epoch": 5.361596634446837, + "grad_norm": 1.9495731592178345, + "learning_rate": 6.478064182386212e-06, + "loss": 0.406, + "step": 53530 + }, + { + "epoch": 5.362598287173837, + "grad_norm": 1.9667346477508545, + "learning_rate": 6.470507381325303e-06, + "loss": 0.4068, + "step": 53540 + }, + { + "epoch": 5.363599939900836, + "grad_norm": 1.973042368888855, + "learning_rate": 6.462954335193666e-06, + "loss": 0.4642, + "step": 53550 + }, + { + "epoch": 5.364601592627836, + "grad_norm": 2.1213204860687256, + "learning_rate": 6.455405045521892e-06, + "loss": 0.3665, + "step": 53560 + }, + { + "epoch": 5.365603245354835, + "grad_norm": 2.8752615451812744, + "learning_rate": 6.4478595138398185e-06, + "loss": 0.4114, + "step": 53570 + }, + { + "epoch": 5.366604898081835, + "grad_norm": 1.7899949550628662, + "learning_rate": 6.440317741676513e-06, + "loss": 0.3377, + "step": 53580 + }, + { + "epoch": 5.367606550808834, + "grad_norm": 2.2464749813079834, + "learning_rate": 6.432779730560292e-06, + "loss": 0.3849, + "step": 53590 + }, + { + "epoch": 5.3686082035358345, + "grad_norm": 2.21551513671875, + "learning_rate": 6.425245482018702e-06, + "loss": 0.3827, + "step": 53600 + }, + { + "epoch": 5.369609856262834, + "grad_norm": 2.1476612091064453, + "learning_rate": 6.41771499757852e-06, + "loss": 0.3988, + "step": 53610 + }, + { + "epoch": 5.370611508989834, + "grad_norm": 2.1985154151916504, + "learning_rate": 6.4101882787657916e-06, + "loss": 0.4127, + "step": 53620 + }, + { + "epoch": 5.371613161716833, + "grad_norm": 2.32558012008667, + "learning_rate": 6.402665327105756e-06, + "loss": 0.455, + "step": 53630 + }, + { + "epoch": 5.372614814443832, + "grad_norm": 1.937166690826416, + "learning_rate": 6.3951461441229185e-06, + "loss": 0.3793, + "step": 53640 + }, + { + "epoch": 5.373616467170832, + "grad_norm": 3.1271660327911377, + "learning_rate": 6.38763073134101e-06, + "loss": 0.4362, + "step": 53650 + }, + { + "epoch": 5.374618119897831, + "grad_norm": 2.246852159500122, + "learning_rate": 6.3801190902829985e-06, + "loss": 0.3768, + "step": 53660 + }, + { + "epoch": 5.375619772624831, + "grad_norm": 2.1570510864257812, + "learning_rate": 6.372611222471092e-06, + "loss": 0.3875, + "step": 53670 + }, + { + "epoch": 5.37662142535183, + "grad_norm": 2.5383236408233643, + "learning_rate": 6.365107129426723e-06, + "loss": 0.3872, + "step": 53680 + }, + { + "epoch": 5.37762307807883, + "grad_norm": 2.381948709487915, + "learning_rate": 6.3576068126705855e-06, + "loss": 0.3625, + "step": 53690 + }, + { + "epoch": 5.3786247308058295, + "grad_norm": 2.5966947078704834, + "learning_rate": 6.350110273722571e-06, + "loss": 0.4216, + "step": 53700 + }, + { + "epoch": 5.3796263835328295, + "grad_norm": 2.2546310424804688, + "learning_rate": 6.342617514101826e-06, + "loss": 0.4408, + "step": 53710 + }, + { + "epoch": 5.380628036259829, + "grad_norm": 2.3283772468566895, + "learning_rate": 6.335128535326726e-06, + "loss": 0.4264, + "step": 53720 + }, + { + "epoch": 5.381629688986829, + "grad_norm": 2.2052276134490967, + "learning_rate": 6.3276433389149045e-06, + "loss": 0.436, + "step": 53730 + }, + { + "epoch": 5.382631341713828, + "grad_norm": 2.5837795734405518, + "learning_rate": 6.320161926383186e-06, + "loss": 0.423, + "step": 53740 + }, + { + "epoch": 5.383632994440827, + "grad_norm": 1.8887791633605957, + "learning_rate": 6.312684299247648e-06, + "loss": 0.3555, + "step": 53750 + }, + { + "epoch": 5.384634647167827, + "grad_norm": 2.3950350284576416, + "learning_rate": 6.305210459023625e-06, + "loss": 0.4179, + "step": 53760 + }, + { + "epoch": 5.385636299894826, + "grad_norm": 3.177776336669922, + "learning_rate": 6.297740407225638e-06, + "loss": 0.4403, + "step": 53770 + }, + { + "epoch": 5.386637952621826, + "grad_norm": 2.376591920852661, + "learning_rate": 6.290274145367475e-06, + "loss": 0.5005, + "step": 53780 + }, + { + "epoch": 5.387639605348825, + "grad_norm": 2.027867078781128, + "learning_rate": 6.282811674962135e-06, + "loss": 0.3684, + "step": 53790 + }, + { + "epoch": 5.388641258075825, + "grad_norm": 2.313084602355957, + "learning_rate": 6.27535299752188e-06, + "loss": 0.3768, + "step": 53800 + }, + { + "epoch": 5.3896429108028245, + "grad_norm": 2.342231273651123, + "learning_rate": 6.267898114558157e-06, + "loss": 0.4171, + "step": 53810 + }, + { + "epoch": 5.390644563529825, + "grad_norm": 2.253715991973877, + "learning_rate": 6.260447027581676e-06, + "loss": 0.4569, + "step": 53820 + }, + { + "epoch": 5.391646216256824, + "grad_norm": 2.658900022506714, + "learning_rate": 6.252999738102381e-06, + "loss": 0.436, + "step": 53830 + }, + { + "epoch": 5.392647868983824, + "grad_norm": 2.1993579864501953, + "learning_rate": 6.245556247629436e-06, + "loss": 0.3874, + "step": 53840 + }, + { + "epoch": 5.393649521710823, + "grad_norm": 2.270911931991577, + "learning_rate": 6.238116557671217e-06, + "loss": 0.3653, + "step": 53850 + }, + { + "epoch": 5.394651174437822, + "grad_norm": 2.0035555362701416, + "learning_rate": 6.230680669735361e-06, + "loss": 0.3763, + "step": 53860 + }, + { + "epoch": 5.395652827164822, + "grad_norm": 3.234086513519287, + "learning_rate": 6.223248585328734e-06, + "loss": 0.4419, + "step": 53870 + }, + { + "epoch": 5.396654479891821, + "grad_norm": 2.189328193664551, + "learning_rate": 6.215820305957393e-06, + "loss": 0.389, + "step": 53880 + }, + { + "epoch": 5.397656132618821, + "grad_norm": 2.4266152381896973, + "learning_rate": 6.208395833126657e-06, + "loss": 0.3803, + "step": 53890 + }, + { + "epoch": 5.39865778534582, + "grad_norm": 2.4305834770202637, + "learning_rate": 6.200975168341081e-06, + "loss": 0.3779, + "step": 53900 + }, + { + "epoch": 5.3996594380728204, + "grad_norm": 2.4199655055999756, + "learning_rate": 6.193558313104425e-06, + "loss": 0.417, + "step": 53910 + }, + { + "epoch": 5.40066109079982, + "grad_norm": 2.372927188873291, + "learning_rate": 6.186145268919677e-06, + "loss": 0.4141, + "step": 53920 + }, + { + "epoch": 5.40166274352682, + "grad_norm": 2.0953314304351807, + "learning_rate": 6.178736037289074e-06, + "loss": 0.394, + "step": 53930 + }, + { + "epoch": 5.402664396253819, + "grad_norm": 2.165076971054077, + "learning_rate": 6.1713306197140605e-06, + "loss": 0.3865, + "step": 53940 + }, + { + "epoch": 5.403666048980819, + "grad_norm": 2.225846290588379, + "learning_rate": 6.163929017695328e-06, + "loss": 0.3702, + "step": 53950 + }, + { + "epoch": 5.404667701707818, + "grad_norm": 2.3943235874176025, + "learning_rate": 6.156531232732756e-06, + "loss": 0.444, + "step": 53960 + }, + { + "epoch": 5.405669354434817, + "grad_norm": 2.743006706237793, + "learning_rate": 6.1491372663255025e-06, + "loss": 0.4253, + "step": 53970 + }, + { + "epoch": 5.406671007161817, + "grad_norm": 2.2035491466522217, + "learning_rate": 6.141747119971925e-06, + "loss": 0.4603, + "step": 53980 + }, + { + "epoch": 5.407672659888816, + "grad_norm": 2.0817651748657227, + "learning_rate": 6.1343607951695805e-06, + "loss": 0.4126, + "step": 53990 + }, + { + "epoch": 5.408674312615816, + "grad_norm": 2.3498198986053467, + "learning_rate": 6.126978293415306e-06, + "loss": 0.3927, + "step": 54000 + }, + { + "epoch": 5.4096759653428155, + "grad_norm": 2.3940587043762207, + "learning_rate": 6.1195996162051295e-06, + "loss": 0.4416, + "step": 54010 + }, + { + "epoch": 5.4106776180698155, + "grad_norm": 1.8981224298477173, + "learning_rate": 6.112224765034316e-06, + "loss": 0.3833, + "step": 54020 + }, + { + "epoch": 5.411679270796815, + "grad_norm": 2.426468849182129, + "learning_rate": 6.104853741397332e-06, + "loss": 0.4342, + "step": 54030 + }, + { + "epoch": 5.412680923523815, + "grad_norm": 3.046752452850342, + "learning_rate": 6.097486546787903e-06, + "loss": 0.4214, + "step": 54040 + }, + { + "epoch": 5.413682576250814, + "grad_norm": 1.7482635974884033, + "learning_rate": 6.090123182698965e-06, + "loss": 0.4063, + "step": 54050 + }, + { + "epoch": 5.414684228977813, + "grad_norm": 2.8719656467437744, + "learning_rate": 6.082763650622655e-06, + "loss": 0.4556, + "step": 54060 + }, + { + "epoch": 5.415685881704813, + "grad_norm": 2.3082518577575684, + "learning_rate": 6.075407952050374e-06, + "loss": 0.4332, + "step": 54070 + }, + { + "epoch": 5.416687534431812, + "grad_norm": 1.7820711135864258, + "learning_rate": 6.068056088472715e-06, + "loss": 0.3846, + "step": 54080 + }, + { + "epoch": 5.417689187158812, + "grad_norm": 2.5791473388671875, + "learning_rate": 6.060708061379516e-06, + "loss": 0.4377, + "step": 54090 + }, + { + "epoch": 5.418690839885811, + "grad_norm": 2.1430938243865967, + "learning_rate": 6.053363872259802e-06, + "loss": 0.3801, + "step": 54100 + }, + { + "epoch": 5.419692492612811, + "grad_norm": 2.625718355178833, + "learning_rate": 6.046023522601868e-06, + "loss": 0.3715, + "step": 54110 + }, + { + "epoch": 5.4206941453398105, + "grad_norm": 2.7916579246520996, + "learning_rate": 6.038687013893199e-06, + "loss": 0.3915, + "step": 54120 + }, + { + "epoch": 5.421695798066811, + "grad_norm": 1.9555777311325073, + "learning_rate": 6.031354347620508e-06, + "loss": 0.3917, + "step": 54130 + }, + { + "epoch": 5.42269745079381, + "grad_norm": 2.3919904232025146, + "learning_rate": 6.024025525269733e-06, + "loss": 0.4028, + "step": 54140 + }, + { + "epoch": 5.42369910352081, + "grad_norm": 2.119253635406494, + "learning_rate": 6.016700548326029e-06, + "loss": 0.3997, + "step": 54150 + }, + { + "epoch": 5.424700756247809, + "grad_norm": 1.9236420392990112, + "learning_rate": 6.0093794182737866e-06, + "loss": 0.4038, + "step": 54160 + }, + { + "epoch": 5.425702408974808, + "grad_norm": 1.5495247840881348, + "learning_rate": 6.002062136596578e-06, + "loss": 0.3885, + "step": 54170 + }, + { + "epoch": 5.426704061701808, + "grad_norm": 2.620678424835205, + "learning_rate": 5.9947487047772425e-06, + "loss": 0.4028, + "step": 54180 + }, + { + "epoch": 5.427705714428807, + "grad_norm": 2.5425150394439697, + "learning_rate": 5.987439124297814e-06, + "loss": 0.4059, + "step": 54190 + }, + { + "epoch": 5.428707367155807, + "grad_norm": 2.143843173980713, + "learning_rate": 5.980133396639551e-06, + "loss": 0.4257, + "step": 54200 + }, + { + "epoch": 5.429709019882806, + "grad_norm": 2.344255208969116, + "learning_rate": 5.972831523282927e-06, + "loss": 0.4211, + "step": 54210 + }, + { + "epoch": 5.4307106726098064, + "grad_norm": 1.8500442504882812, + "learning_rate": 5.965533505707641e-06, + "loss": 0.3479, + "step": 54220 + }, + { + "epoch": 5.431712325336806, + "grad_norm": 1.6079330444335938, + "learning_rate": 5.958239345392605e-06, + "loss": 0.3915, + "step": 54230 + }, + { + "epoch": 5.432713978063806, + "grad_norm": 2.237523317337036, + "learning_rate": 5.950949043815956e-06, + "loss": 0.4472, + "step": 54240 + }, + { + "epoch": 5.433715630790805, + "grad_norm": 1.8473663330078125, + "learning_rate": 5.943662602455044e-06, + "loss": 0.3923, + "step": 54250 + }, + { + "epoch": 5.434717283517804, + "grad_norm": 2.521827220916748, + "learning_rate": 5.936380022786436e-06, + "loss": 0.4243, + "step": 54260 + }, + { + "epoch": 5.435718936244804, + "grad_norm": 1.908718466758728, + "learning_rate": 5.929101306285919e-06, + "loss": 0.3434, + "step": 54270 + }, + { + "epoch": 5.436720588971803, + "grad_norm": 2.581207752227783, + "learning_rate": 5.9218264544285e-06, + "loss": 0.4207, + "step": 54280 + }, + { + "epoch": 5.437722241698803, + "grad_norm": 2.1982924938201904, + "learning_rate": 5.914555468688393e-06, + "loss": 0.3784, + "step": 54290 + }, + { + "epoch": 5.438723894425802, + "grad_norm": 2.5535848140716553, + "learning_rate": 5.9072883505390395e-06, + "loss": 0.4427, + "step": 54300 + }, + { + "epoch": 5.439725547152802, + "grad_norm": 2.093137741088867, + "learning_rate": 5.900025101453089e-06, + "loss": 0.4415, + "step": 54310 + }, + { + "epoch": 5.4407271998798015, + "grad_norm": 2.2132139205932617, + "learning_rate": 5.892765722902413e-06, + "loss": 0.4316, + "step": 54320 + }, + { + "epoch": 5.4417288526068015, + "grad_norm": 1.9173887968063354, + "learning_rate": 5.885510216358098e-06, + "loss": 0.4156, + "step": 54330 + }, + { + "epoch": 5.442730505333801, + "grad_norm": 1.8375977277755737, + "learning_rate": 5.878258583290441e-06, + "loss": 0.3712, + "step": 54340 + }, + { + "epoch": 5.443732158060801, + "grad_norm": 2.173393726348877, + "learning_rate": 5.871010825168957e-06, + "loss": 0.3781, + "step": 54350 + }, + { + "epoch": 5.4447338107878, + "grad_norm": 2.204589605331421, + "learning_rate": 5.863766943462379e-06, + "loss": 0.3748, + "step": 54360 + }, + { + "epoch": 5.445735463514799, + "grad_norm": 2.3114230632781982, + "learning_rate": 5.856526939638646e-06, + "loss": 0.3875, + "step": 54370 + }, + { + "epoch": 5.446737116241799, + "grad_norm": 2.2886545658111572, + "learning_rate": 5.849290815164921e-06, + "loss": 0.4018, + "step": 54380 + }, + { + "epoch": 5.447738768968798, + "grad_norm": 2.2720205783843994, + "learning_rate": 5.8420585715075744e-06, + "loss": 0.4316, + "step": 54390 + }, + { + "epoch": 5.448740421695798, + "grad_norm": 2.6107401847839355, + "learning_rate": 5.8348302101321944e-06, + "loss": 0.435, + "step": 54400 + }, + { + "epoch": 5.449742074422797, + "grad_norm": 2.0635311603546143, + "learning_rate": 5.827605732503577e-06, + "loss": 0.4727, + "step": 54410 + }, + { + "epoch": 5.450743727149797, + "grad_norm": 2.109706401824951, + "learning_rate": 5.820385140085735e-06, + "loss": 0.3891, + "step": 54420 + }, + { + "epoch": 5.4517453798767965, + "grad_norm": 2.1004483699798584, + "learning_rate": 5.813168434341898e-06, + "loss": 0.4171, + "step": 54430 + }, + { + "epoch": 5.452747032603797, + "grad_norm": 2.433253765106201, + "learning_rate": 5.805955616734496e-06, + "loss": 0.4314, + "step": 54440 + }, + { + "epoch": 5.453748685330796, + "grad_norm": 1.5983706712722778, + "learning_rate": 5.798746688725182e-06, + "loss": 0.4282, + "step": 54450 + }, + { + "epoch": 5.454750338057796, + "grad_norm": 1.6324114799499512, + "learning_rate": 5.791541651774817e-06, + "loss": 0.4018, + "step": 54460 + }, + { + "epoch": 5.455751990784795, + "grad_norm": 2.1272900104522705, + "learning_rate": 5.784340507343472e-06, + "loss": 0.3705, + "step": 54470 + }, + { + "epoch": 5.456753643511794, + "grad_norm": 1.5174928903579712, + "learning_rate": 5.777143256890435e-06, + "loss": 0.3686, + "step": 54480 + }, + { + "epoch": 5.457755296238794, + "grad_norm": 2.0583887100219727, + "learning_rate": 5.769949901874194e-06, + "loss": 0.4044, + "step": 54490 + }, + { + "epoch": 5.458756948965793, + "grad_norm": 2.4094724655151367, + "learning_rate": 5.76276044375246e-06, + "loss": 0.3918, + "step": 54500 + }, + { + "epoch": 5.459758601692793, + "grad_norm": 3.136207103729248, + "learning_rate": 5.75557488398214e-06, + "loss": 0.374, + "step": 54510 + }, + { + "epoch": 5.460760254419792, + "grad_norm": 1.848882794380188, + "learning_rate": 5.74839322401938e-06, + "loss": 0.3436, + "step": 54520 + }, + { + "epoch": 5.461761907146792, + "grad_norm": 2.511730432510376, + "learning_rate": 5.741215465319494e-06, + "loss": 0.4469, + "step": 54530 + }, + { + "epoch": 5.462763559873792, + "grad_norm": 1.784735918045044, + "learning_rate": 5.7340416093370345e-06, + "loss": 0.3822, + "step": 54540 + }, + { + "epoch": 5.463765212600792, + "grad_norm": 2.1342051029205322, + "learning_rate": 5.726871657525751e-06, + "loss": 0.4132, + "step": 54550 + }, + { + "epoch": 5.464766865327791, + "grad_norm": 3.0335421562194824, + "learning_rate": 5.7197056113386215e-06, + "loss": 0.4171, + "step": 54560 + }, + { + "epoch": 5.465768518054791, + "grad_norm": 2.3769993782043457, + "learning_rate": 5.712543472227797e-06, + "loss": 0.3798, + "step": 54570 + }, + { + "epoch": 5.46677017078179, + "grad_norm": 2.6821632385253906, + "learning_rate": 5.705385241644662e-06, + "loss": 0.3973, + "step": 54580 + }, + { + "epoch": 5.467771823508789, + "grad_norm": 2.379179000854492, + "learning_rate": 5.698230921039821e-06, + "loss": 0.3875, + "step": 54590 + }, + { + "epoch": 5.468773476235789, + "grad_norm": 2.252995729446411, + "learning_rate": 5.691080511863051e-06, + "loss": 0.3976, + "step": 54600 + }, + { + "epoch": 5.469775128962788, + "grad_norm": 2.438122510910034, + "learning_rate": 5.683934015563358e-06, + "loss": 0.4273, + "step": 54610 + }, + { + "epoch": 5.470776781689788, + "grad_norm": 2.041632652282715, + "learning_rate": 5.676791433588946e-06, + "loss": 0.4138, + "step": 54620 + }, + { + "epoch": 5.4717784344167875, + "grad_norm": 2.2904317378997803, + "learning_rate": 5.669652767387254e-06, + "loss": 0.4218, + "step": 54630 + }, + { + "epoch": 5.4727800871437875, + "grad_norm": 2.181467294692993, + "learning_rate": 5.662518018404883e-06, + "loss": 0.3501, + "step": 54640 + }, + { + "epoch": 5.473781739870787, + "grad_norm": 1.8415119647979736, + "learning_rate": 5.6553871880876575e-06, + "loss": 0.3538, + "step": 54650 + }, + { + "epoch": 5.474783392597787, + "grad_norm": 2.321047782897949, + "learning_rate": 5.64826027788064e-06, + "loss": 0.4236, + "step": 54660 + }, + { + "epoch": 5.475785045324786, + "grad_norm": 2.017359972000122, + "learning_rate": 5.641137289228049e-06, + "loss": 0.4154, + "step": 54670 + }, + { + "epoch": 5.476786698051786, + "grad_norm": 2.092787504196167, + "learning_rate": 5.6340182235733315e-06, + "loss": 0.3638, + "step": 54680 + }, + { + "epoch": 5.477788350778785, + "grad_norm": 2.5516960620880127, + "learning_rate": 5.626903082359139e-06, + "loss": 0.4228, + "step": 54690 + }, + { + "epoch": 5.478790003505784, + "grad_norm": 2.765859603881836, + "learning_rate": 5.619791867027343e-06, + "loss": 0.3729, + "step": 54700 + }, + { + "epoch": 5.479791656232784, + "grad_norm": 1.8943431377410889, + "learning_rate": 5.612684579018984e-06, + "loss": 0.3901, + "step": 54710 + }, + { + "epoch": 5.480793308959783, + "grad_norm": 2.0830743312835693, + "learning_rate": 5.605581219774325e-06, + "loss": 0.449, + "step": 54720 + }, + { + "epoch": 5.481794961686783, + "grad_norm": 2.621382474899292, + "learning_rate": 5.598481790732851e-06, + "loss": 0.3602, + "step": 54730 + }, + { + "epoch": 5.4827966144137825, + "grad_norm": 2.1246063709259033, + "learning_rate": 5.591386293333231e-06, + "loss": 0.4236, + "step": 54740 + }, + { + "epoch": 5.483798267140783, + "grad_norm": 1.4775830507278442, + "learning_rate": 5.584294729013325e-06, + "loss": 0.3476, + "step": 54750 + }, + { + "epoch": 5.484799919867782, + "grad_norm": 2.423421859741211, + "learning_rate": 5.577207099210216e-06, + "loss": 0.3928, + "step": 54760 + }, + { + "epoch": 5.485801572594782, + "grad_norm": 2.1227428913116455, + "learning_rate": 5.570123405360198e-06, + "loss": 0.4252, + "step": 54770 + }, + { + "epoch": 5.486803225321781, + "grad_norm": 2.768158435821533, + "learning_rate": 5.563043648898738e-06, + "loss": 0.4303, + "step": 54780 + }, + { + "epoch": 5.487804878048781, + "grad_norm": 3.048062324523926, + "learning_rate": 5.55596783126052e-06, + "loss": 0.4353, + "step": 54790 + }, + { + "epoch": 5.48880653077578, + "grad_norm": 2.053222179412842, + "learning_rate": 5.548895953879443e-06, + "loss": 0.4145, + "step": 54800 + }, + { + "epoch": 5.489808183502779, + "grad_norm": 3.174201011657715, + "learning_rate": 5.541828018188599e-06, + "loss": 0.4039, + "step": 54810 + }, + { + "epoch": 5.490809836229779, + "grad_norm": 1.8603386878967285, + "learning_rate": 5.5347640256202595e-06, + "loss": 0.4172, + "step": 54820 + }, + { + "epoch": 5.491811488956778, + "grad_norm": 2.1023452281951904, + "learning_rate": 5.527703977605919e-06, + "loss": 0.4082, + "step": 54830 + }, + { + "epoch": 5.492813141683778, + "grad_norm": 2.3197402954101562, + "learning_rate": 5.520647875576279e-06, + "loss": 0.3994, + "step": 54840 + }, + { + "epoch": 5.493814794410778, + "grad_norm": 1.8299857378005981, + "learning_rate": 5.513595720961231e-06, + "loss": 0.3643, + "step": 54850 + }, + { + "epoch": 5.494816447137778, + "grad_norm": 1.9272540807724, + "learning_rate": 5.50654751518985e-06, + "loss": 0.4442, + "step": 54860 + }, + { + "epoch": 5.495818099864777, + "grad_norm": 2.175856113433838, + "learning_rate": 5.499503259690442e-06, + "loss": 0.3711, + "step": 54870 + }, + { + "epoch": 5.496819752591777, + "grad_norm": 2.690469741821289, + "learning_rate": 5.492462955890504e-06, + "loss": 0.4243, + "step": 54880 + }, + { + "epoch": 5.497821405318776, + "grad_norm": 2.87835693359375, + "learning_rate": 5.4854266052167065e-06, + "loss": 0.4216, + "step": 54890 + }, + { + "epoch": 5.498823058045776, + "grad_norm": 2.505415439605713, + "learning_rate": 5.478394209094942e-06, + "loss": 0.4195, + "step": 54900 + }, + { + "epoch": 5.499824710772775, + "grad_norm": 2.257429361343384, + "learning_rate": 5.471365768950313e-06, + "loss": 0.3791, + "step": 54910 + }, + { + "epoch": 5.500826363499774, + "grad_norm": 1.8191579580307007, + "learning_rate": 5.464341286207103e-06, + "loss": 0.4309, + "step": 54920 + }, + { + "epoch": 5.501828016226774, + "grad_norm": 2.082589626312256, + "learning_rate": 5.4573207622887755e-06, + "loss": 0.4079, + "step": 54930 + }, + { + "epoch": 5.5028296689537735, + "grad_norm": 2.1472620964050293, + "learning_rate": 5.450304198618034e-06, + "loss": 0.3812, + "step": 54940 + }, + { + "epoch": 5.5038313216807735, + "grad_norm": 2.3610379695892334, + "learning_rate": 5.443291596616748e-06, + "loss": 0.4051, + "step": 54950 + }, + { + "epoch": 5.504832974407773, + "grad_norm": 1.5136091709136963, + "learning_rate": 5.436282957706004e-06, + "loss": 0.3947, + "step": 54960 + }, + { + "epoch": 5.505834627134773, + "grad_norm": 2.1798224449157715, + "learning_rate": 5.429278283306055e-06, + "loss": 0.3891, + "step": 54970 + }, + { + "epoch": 5.506836279861772, + "grad_norm": 2.1945409774780273, + "learning_rate": 5.42227757483639e-06, + "loss": 0.3617, + "step": 54980 + }, + { + "epoch": 5.507837932588772, + "grad_norm": 1.7211298942565918, + "learning_rate": 5.415280833715675e-06, + "loss": 0.4423, + "step": 54990 + }, + { + "epoch": 5.508839585315771, + "grad_norm": 2.1400156021118164, + "learning_rate": 5.408288061361749e-06, + "loss": 0.4237, + "step": 55000 + } + ], + "logging_steps": 10, + "max_steps": 69888, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.02157876235862e+21, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}