{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 2844, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004219409282700422, "grad_norm": 14.014748573303223, "learning_rate": 2.797202797202797e-08, "loss": 2.1982650756835938, "step": 2 }, { "epoch": 0.008438818565400843, "grad_norm": 12.766752243041992, "learning_rate": 8.391608391608391e-08, "loss": 1.7798584699630737, "step": 4 }, { "epoch": 0.012658227848101266, "grad_norm": 2.7050275802612305, "learning_rate": 1.3986013986013987e-07, "loss": 1.9378855228424072, "step": 6 }, { "epoch": 0.016877637130801686, "grad_norm": 6.62382173538208, "learning_rate": 1.958041958041958e-07, "loss": 1.9494550228118896, "step": 8 }, { "epoch": 0.02109704641350211, "grad_norm": 6.674213886260986, "learning_rate": 2.517482517482518e-07, "loss": 1.8559846878051758, "step": 10 }, { "epoch": 0.02531645569620253, "grad_norm": 2.3814892768859863, "learning_rate": 3.076923076923077e-07, "loss": 1.3210176229476929, "step": 12 }, { "epoch": 0.029535864978902954, "grad_norm": 10.13178825378418, "learning_rate": 3.636363636363636e-07, "loss": 1.6618098020553589, "step": 14 }, { "epoch": 0.03375527426160337, "grad_norm": 10.594144821166992, "learning_rate": 4.1958041958041957e-07, "loss": 2.217015266418457, "step": 16 }, { "epoch": 0.0379746835443038, "grad_norm": 2.019131898880005, "learning_rate": 4.755244755244755e-07, "loss": 1.8345531225204468, "step": 18 }, { "epoch": 0.04219409282700422, "grad_norm": 2.9380502700805664, "learning_rate": 5.314685314685314e-07, "loss": 1.9022338390350342, "step": 20 }, { "epoch": 0.046413502109704644, "grad_norm": 2.7503366470336914, "learning_rate": 5.874125874125873e-07, "loss": 1.5993010997772217, "step": 22 }, { "epoch": 0.05063291139240506, "grad_norm": 9.334077835083008, "learning_rate": 6.433566433566433e-07, "loss": 1.841583013534546, "step": 24 }, { "epoch": 0.05485232067510549, "grad_norm": 4.110686302185059, "learning_rate": 6.993006993006993e-07, "loss": 1.7132441997528076, "step": 26 }, { "epoch": 0.05907172995780591, "grad_norm": 5.482414245605469, "learning_rate": 7.552447552447552e-07, "loss": 1.9471144676208496, "step": 28 }, { "epoch": 0.06329113924050633, "grad_norm": 7.981461524963379, "learning_rate": 8.111888111888111e-07, "loss": 1.8706644773483276, "step": 30 }, { "epoch": 0.06751054852320675, "grad_norm": 2.825488567352295, "learning_rate": 8.67132867132867e-07, "loss": 1.7588139772415161, "step": 32 }, { "epoch": 0.07172995780590717, "grad_norm": 27.41690444946289, "learning_rate": 9.230769230769231e-07, "loss": 1.775272011756897, "step": 34 }, { "epoch": 0.0759493670886076, "grad_norm": 3.1006486415863037, "learning_rate": 9.79020979020979e-07, "loss": 1.784650444984436, "step": 36 }, { "epoch": 0.08016877637130802, "grad_norm": 7.034008026123047, "learning_rate": 1.034965034965035e-06, "loss": 1.5920319557189941, "step": 38 }, { "epoch": 0.08438818565400844, "grad_norm": 2.675238847732544, "learning_rate": 1.0909090909090908e-06, "loss": 1.7519234418869019, "step": 40 }, { "epoch": 0.08860759493670886, "grad_norm": 10.898767471313477, "learning_rate": 1.1468531468531469e-06, "loss": 1.3292922973632812, "step": 42 }, { "epoch": 0.09282700421940929, "grad_norm": 5.946654796600342, "learning_rate": 1.2027972027972026e-06, "loss": 1.9151337146759033, "step": 44 }, { "epoch": 0.0970464135021097, "grad_norm": 4.006372451782227, "learning_rate": 1.2587412587412587e-06, "loss": 1.734480619430542, "step": 46 }, { "epoch": 0.10126582278481013, "grad_norm": 1.6106147766113281, "learning_rate": 1.3146853146853144e-06, "loss": 1.6714043617248535, "step": 48 }, { "epoch": 0.10548523206751055, "grad_norm": 3.555082321166992, "learning_rate": 1.3706293706293705e-06, "loss": 0.9601479172706604, "step": 50 }, { "epoch": 0.10970464135021098, "grad_norm": 18.376201629638672, "learning_rate": 1.4265734265734267e-06, "loss": 0.9682204723358154, "step": 52 }, { "epoch": 0.11392405063291139, "grad_norm": 3.829577684402466, "learning_rate": 1.4825174825174824e-06, "loss": 0.9149891138076782, "step": 54 }, { "epoch": 0.11814345991561181, "grad_norm": 8.751733779907227, "learning_rate": 1.5384615384615385e-06, "loss": 1.5466492176055908, "step": 56 }, { "epoch": 0.12236286919831224, "grad_norm": 43.25166702270508, "learning_rate": 1.5944055944055942e-06, "loss": 0.8738414645195007, "step": 58 }, { "epoch": 0.12658227848101267, "grad_norm": 2.858604669570923, "learning_rate": 1.6503496503496503e-06, "loss": 1.5882339477539062, "step": 60 }, { "epoch": 0.1308016877637131, "grad_norm": 2.080610990524292, "learning_rate": 1.7062937062937063e-06, "loss": 1.6133513450622559, "step": 62 }, { "epoch": 0.1350210970464135, "grad_norm": 1.6210132837295532, "learning_rate": 1.7622377622377622e-06, "loss": 1.1352812051773071, "step": 64 }, { "epoch": 0.13924050632911392, "grad_norm": 4.165830135345459, "learning_rate": 1.818181818181818e-06, "loss": 0.8928266763687134, "step": 66 }, { "epoch": 0.14345991561181434, "grad_norm": 2.4804110527038574, "learning_rate": 1.874125874125874e-06, "loss": 1.182489275932312, "step": 68 }, { "epoch": 0.14767932489451477, "grad_norm": 11.683263778686523, "learning_rate": 1.9300699300699297e-06, "loss": 1.0528309345245361, "step": 70 }, { "epoch": 0.1518987341772152, "grad_norm": 5.113679885864258, "learning_rate": 1.986013986013986e-06, "loss": 1.3555092811584473, "step": 72 }, { "epoch": 0.15611814345991562, "grad_norm": 3.419110059738159, "learning_rate": 2.041958041958042e-06, "loss": 1.1131813526153564, "step": 74 }, { "epoch": 0.16033755274261605, "grad_norm": 5.5904622077941895, "learning_rate": 2.097902097902098e-06, "loss": 0.9376708269119263, "step": 76 }, { "epoch": 0.16455696202531644, "grad_norm": 4.4593892097473145, "learning_rate": 2.1538461538461538e-06, "loss": 1.4518260955810547, "step": 78 }, { "epoch": 0.16877637130801687, "grad_norm": 1.9147013425827026, "learning_rate": 2.2097902097902093e-06, "loss": 1.4421272277832031, "step": 80 }, { "epoch": 0.1729957805907173, "grad_norm": 4.915895462036133, "learning_rate": 2.2657342657342656e-06, "loss": 1.4590272903442383, "step": 82 }, { "epoch": 0.17721518987341772, "grad_norm": 6.905501842498779, "learning_rate": 2.3216783216783215e-06, "loss": 0.9708279371261597, "step": 84 }, { "epoch": 0.18143459915611815, "grad_norm": 7.524752140045166, "learning_rate": 2.3776223776223774e-06, "loss": 1.141646385192871, "step": 86 }, { "epoch": 0.18565400843881857, "grad_norm": 1.9856427907943726, "learning_rate": 2.4335664335664338e-06, "loss": 1.3669147491455078, "step": 88 }, { "epoch": 0.189873417721519, "grad_norm": 5.223474025726318, "learning_rate": 2.4895104895104893e-06, "loss": 0.6930243968963623, "step": 90 }, { "epoch": 0.1940928270042194, "grad_norm": 3.9480249881744385, "learning_rate": 2.545454545454545e-06, "loss": 1.7789967060089111, "step": 92 }, { "epoch": 0.19831223628691982, "grad_norm": 6.213054180145264, "learning_rate": 2.601398601398601e-06, "loss": 0.9946894645690918, "step": 94 }, { "epoch": 0.20253164556962025, "grad_norm": 2.132254123687744, "learning_rate": 2.6573426573426574e-06, "loss": 1.4530797004699707, "step": 96 }, { "epoch": 0.20675105485232068, "grad_norm": 1.8356496095657349, "learning_rate": 2.7132867132867134e-06, "loss": 1.5200846195220947, "step": 98 }, { "epoch": 0.2109704641350211, "grad_norm": 14.19537353515625, "learning_rate": 2.769230769230769e-06, "loss": 1.292062759399414, "step": 100 }, { "epoch": 0.21518987341772153, "grad_norm": 2.111111640930176, "learning_rate": 2.8251748251748248e-06, "loss": 1.1042956113815308, "step": 102 }, { "epoch": 0.21940928270042195, "grad_norm": 1.8971158266067505, "learning_rate": 2.881118881118881e-06, "loss": 1.0220731496810913, "step": 104 }, { "epoch": 0.22362869198312235, "grad_norm": 5.727835178375244, "learning_rate": 2.937062937062937e-06, "loss": 1.0205355882644653, "step": 106 }, { "epoch": 0.22784810126582278, "grad_norm": 3.1581368446350098, "learning_rate": 2.993006993006993e-06, "loss": 1.0161347389221191, "step": 108 }, { "epoch": 0.2320675105485232, "grad_norm": 2.3190581798553467, "learning_rate": 3.0489510489510484e-06, "loss": 1.0544636249542236, "step": 110 }, { "epoch": 0.23628691983122363, "grad_norm": 5.929664611816406, "learning_rate": 3.1048951048951048e-06, "loss": 1.4253602027893066, "step": 112 }, { "epoch": 0.24050632911392406, "grad_norm": 2.6725683212280273, "learning_rate": 3.1608391608391607e-06, "loss": 1.318920612335205, "step": 114 }, { "epoch": 0.24472573839662448, "grad_norm": 7.776963710784912, "learning_rate": 3.2167832167832166e-06, "loss": 1.6443480253219604, "step": 116 }, { "epoch": 0.2489451476793249, "grad_norm": 2.3923261165618896, "learning_rate": 3.272727272727273e-06, "loss": 1.3153703212738037, "step": 118 }, { "epoch": 0.25316455696202533, "grad_norm": 3.2848472595214844, "learning_rate": 3.3286713286713284e-06, "loss": 1.0184035301208496, "step": 120 }, { "epoch": 0.25738396624472576, "grad_norm": 4.440483093261719, "learning_rate": 3.3846153846153843e-06, "loss": 1.312201976776123, "step": 122 }, { "epoch": 0.2616033755274262, "grad_norm": 4.970678806304932, "learning_rate": 3.4405594405594402e-06, "loss": 1.3157330751419067, "step": 124 }, { "epoch": 0.26582278481012656, "grad_norm": 3.659862995147705, "learning_rate": 3.4965034965034966e-06, "loss": 1.4062931537628174, "step": 126 }, { "epoch": 0.270042194092827, "grad_norm": 4.357997894287109, "learning_rate": 3.5524475524475525e-06, "loss": 0.9154614210128784, "step": 128 }, { "epoch": 0.2742616033755274, "grad_norm": 4.5792341232299805, "learning_rate": 3.608391608391608e-06, "loss": 1.1704046726226807, "step": 130 }, { "epoch": 0.27848101265822783, "grad_norm": 5.039772033691406, "learning_rate": 3.664335664335664e-06, "loss": 1.2377243041992188, "step": 132 }, { "epoch": 0.28270042194092826, "grad_norm": 6.672406196594238, "learning_rate": 3.7202797202797202e-06, "loss": 0.7351927757263184, "step": 134 }, { "epoch": 0.2869198312236287, "grad_norm": 2.329267740249634, "learning_rate": 3.776223776223776e-06, "loss": 0.9117053151130676, "step": 136 }, { "epoch": 0.2911392405063291, "grad_norm": 4.902188777923584, "learning_rate": 3.832167832167832e-06, "loss": 1.4102413654327393, "step": 138 }, { "epoch": 0.29535864978902954, "grad_norm": 7.462285041809082, "learning_rate": 3.888111888111888e-06, "loss": 0.9595804214477539, "step": 140 }, { "epoch": 0.29957805907172996, "grad_norm": 4.3409953117370605, "learning_rate": 3.944055944055944e-06, "loss": 1.2982699871063232, "step": 142 }, { "epoch": 0.3037974683544304, "grad_norm": 5.797815799713135, "learning_rate": 4e-06, "loss": 1.0992412567138672, "step": 144 }, { "epoch": 0.3080168776371308, "grad_norm": 3.4705042839050293, "learning_rate": 3.999995129731755e-06, "loss": 1.4175902605056763, "step": 146 }, { "epoch": 0.31223628691983124, "grad_norm": 3.2805113792419434, "learning_rate": 3.999980518953377e-06, "loss": 1.3948296308517456, "step": 148 }, { "epoch": 0.31645569620253167, "grad_norm": 2.5500190258026123, "learning_rate": 3.9999561677439284e-06, "loss": 1.2504572868347168, "step": 150 }, { "epoch": 0.3206751054852321, "grad_norm": 2.943164825439453, "learning_rate": 3.999922076235186e-06, "loss": 1.3152413368225098, "step": 152 }, { "epoch": 0.32489451476793246, "grad_norm": 1.8291728496551514, "learning_rate": 3.999878244611632e-06, "loss": 1.4914839267730713, "step": 154 }, { "epoch": 0.3291139240506329, "grad_norm": 3.691744327545166, "learning_rate": 3.999824673110458e-06, "loss": 1.2806551456451416, "step": 156 }, { "epoch": 0.3333333333333333, "grad_norm": 3.6490440368652344, "learning_rate": 3.999761362021559e-06, "loss": 1.3481640815734863, "step": 158 }, { "epoch": 0.33755274261603374, "grad_norm": 2.0211308002471924, "learning_rate": 3.999688311687539e-06, "loss": 1.3426798582077026, "step": 160 }, { "epoch": 0.34177215189873417, "grad_norm": 3.4758718013763428, "learning_rate": 3.9996055225037035e-06, "loss": 0.8756759762763977, "step": 162 }, { "epoch": 0.3459915611814346, "grad_norm": 3.027031660079956, "learning_rate": 3.999512994918057e-06, "loss": 1.2513983249664307, "step": 164 }, { "epoch": 0.350210970464135, "grad_norm": 4.0340094566345215, "learning_rate": 3.999410729431306e-06, "loss": 0.83528733253479, "step": 166 }, { "epoch": 0.35443037974683544, "grad_norm": 4.2334747314453125, "learning_rate": 3.9992987265968506e-06, "loss": 1.2495150566101074, "step": 168 }, { "epoch": 0.35864978902953587, "grad_norm": 2.4250214099884033, "learning_rate": 3.999176987020782e-06, "loss": 1.3424336910247803, "step": 170 }, { "epoch": 0.3628691983122363, "grad_norm": 2.0446016788482666, "learning_rate": 3.999045511361886e-06, "loss": 1.2304866313934326, "step": 172 }, { "epoch": 0.3670886075949367, "grad_norm": 2.2647955417633057, "learning_rate": 3.998904300331629e-06, "loss": 1.0302658081054688, "step": 174 }, { "epoch": 0.37130801687763715, "grad_norm": 4.148885250091553, "learning_rate": 3.998753354694162e-06, "loss": 1.3435766696929932, "step": 176 }, { "epoch": 0.3755274261603376, "grad_norm": 2.1456167697906494, "learning_rate": 3.998592675266313e-06, "loss": 1.3384077548980713, "step": 178 }, { "epoch": 0.379746835443038, "grad_norm": 1.8021888732910156, "learning_rate": 3.998422262917586e-06, "loss": 1.0130809545516968, "step": 180 }, { "epoch": 0.38396624472573837, "grad_norm": 1.8628857135772705, "learning_rate": 3.99824211857015e-06, "loss": 1.3068010807037354, "step": 182 }, { "epoch": 0.3881856540084388, "grad_norm": 2.337610960006714, "learning_rate": 3.998052243198841e-06, "loss": 1.3072583675384521, "step": 184 }, { "epoch": 0.3924050632911392, "grad_norm": 4.762563228607178, "learning_rate": 3.997852637831152e-06, "loss": 0.5184736847877502, "step": 186 }, { "epoch": 0.39662447257383965, "grad_norm": 5.280208110809326, "learning_rate": 3.9976433035472296e-06, "loss": 0.9710695743560791, "step": 188 }, { "epoch": 0.4008438818565401, "grad_norm": 2.887589693069458, "learning_rate": 3.997424241479867e-06, "loss": 1.0692715644836426, "step": 190 }, { "epoch": 0.4050632911392405, "grad_norm": 1.577860951423645, "learning_rate": 3.997195452814498e-06, "loss": 1.315537452697754, "step": 192 }, { "epoch": 0.4092827004219409, "grad_norm": 3.5055530071258545, "learning_rate": 3.996956938789193e-06, "loss": 1.0743625164031982, "step": 194 }, { "epoch": 0.41350210970464135, "grad_norm": 2.70391583442688, "learning_rate": 3.996708700694647e-06, "loss": 1.2994472980499268, "step": 196 }, { "epoch": 0.4177215189873418, "grad_norm": 2.665532112121582, "learning_rate": 3.99645073987418e-06, "loss": 1.0376091003417969, "step": 198 }, { "epoch": 0.4219409282700422, "grad_norm": 3.4091718196868896, "learning_rate": 3.9961830577237225e-06, "loss": 1.1265370845794678, "step": 200 }, { "epoch": 0.42616033755274263, "grad_norm": 3.360374689102173, "learning_rate": 3.9959056556918125e-06, "loss": 1.1382226943969727, "step": 202 }, { "epoch": 0.43037974683544306, "grad_norm": 3.247422218322754, "learning_rate": 3.9956185352795864e-06, "loss": 0.9122767448425293, "step": 204 }, { "epoch": 0.4345991561181435, "grad_norm": 3.775322198867798, "learning_rate": 3.995321698040768e-06, "loss": 1.5471869707107544, "step": 206 }, { "epoch": 0.4388185654008439, "grad_norm": 11.316990852355957, "learning_rate": 3.995015145581668e-06, "loss": 0.7269084453582764, "step": 208 }, { "epoch": 0.4430379746835443, "grad_norm": 1.767858862876892, "learning_rate": 3.994698879561165e-06, "loss": 1.2886333465576172, "step": 210 }, { "epoch": 0.4472573839662447, "grad_norm": 3.727637767791748, "learning_rate": 3.994372901690705e-06, "loss": 0.8034701943397522, "step": 212 }, { "epoch": 0.45147679324894513, "grad_norm": 2.0933773517608643, "learning_rate": 3.994037213734287e-06, "loss": 1.209691047668457, "step": 214 }, { "epoch": 0.45569620253164556, "grad_norm": 2.345202684402466, "learning_rate": 3.993691817508457e-06, "loss": 1.2683181762695312, "step": 216 }, { "epoch": 0.459915611814346, "grad_norm": 6.4172868728637695, "learning_rate": 3.993336714882294e-06, "loss": 1.3031342029571533, "step": 218 }, { "epoch": 0.4641350210970464, "grad_norm": 4.881870269775391, "learning_rate": 3.992971907777404e-06, "loss": 1.259873390197754, "step": 220 }, { "epoch": 0.46835443037974683, "grad_norm": 4.619325637817383, "learning_rate": 3.992597398167907e-06, "loss": 1.2213921546936035, "step": 222 }, { "epoch": 0.47257383966244726, "grad_norm": 2.6401724815368652, "learning_rate": 3.99221318808043e-06, "loss": 1.2425501346588135, "step": 224 }, { "epoch": 0.4767932489451477, "grad_norm": 2.318206548690796, "learning_rate": 3.9918192795940875e-06, "loss": 1.2931036949157715, "step": 226 }, { "epoch": 0.4810126582278481, "grad_norm": 3.360222339630127, "learning_rate": 3.991415674840482e-06, "loss": 0.7865722179412842, "step": 228 }, { "epoch": 0.48523206751054854, "grad_norm": 7.906117916107178, "learning_rate": 3.9910023760036835e-06, "loss": 0.920839250087738, "step": 230 }, { "epoch": 0.48945147679324896, "grad_norm": 4.246833324432373, "learning_rate": 3.99057938532022e-06, "loss": 0.8984707593917847, "step": 232 }, { "epoch": 0.4936708860759494, "grad_norm": 1.9855449199676514, "learning_rate": 3.990146705079069e-06, "loss": 1.2834184169769287, "step": 234 }, { "epoch": 0.4978902953586498, "grad_norm": 2.732619285583496, "learning_rate": 3.989704337621639e-06, "loss": 1.3313374519348145, "step": 236 }, { "epoch": 0.5021097046413502, "grad_norm": 2.2487165927886963, "learning_rate": 3.989252285341761e-06, "loss": 0.9914782047271729, "step": 238 }, { "epoch": 0.5063291139240507, "grad_norm": 2.918333053588867, "learning_rate": 3.988790550685677e-06, "loss": 0.4503798186779022, "step": 240 }, { "epoch": 0.510548523206751, "grad_norm": 5.367257118225098, "learning_rate": 3.98831913615202e-06, "loss": 1.4287300109863281, "step": 242 }, { "epoch": 0.5147679324894515, "grad_norm": 4.372511863708496, "learning_rate": 3.987838044291807e-06, "loss": 0.8704193830490112, "step": 244 }, { "epoch": 0.5189873417721519, "grad_norm": 2.685379981994629, "learning_rate": 3.987347277708424e-06, "loss": 1.4937043190002441, "step": 246 }, { "epoch": 0.5232067510548524, "grad_norm": 2.241354465484619, "learning_rate": 3.986846839057609e-06, "loss": 1.2054930925369263, "step": 248 }, { "epoch": 0.5274261603375527, "grad_norm": 2.666008472442627, "learning_rate": 3.98633673104744e-06, "loss": 1.322192907333374, "step": 250 }, { "epoch": 0.5316455696202531, "grad_norm": 3.0313169956207275, "learning_rate": 3.985816956438322e-06, "loss": 1.1353508234024048, "step": 252 }, { "epoch": 0.5358649789029536, "grad_norm": 1.7615196704864502, "learning_rate": 3.985287518042965e-06, "loss": 1.2446702718734741, "step": 254 }, { "epoch": 0.540084388185654, "grad_norm": 2.7614693641662598, "learning_rate": 3.984748418726381e-06, "loss": 1.2152833938598633, "step": 256 }, { "epoch": 0.5443037974683544, "grad_norm": 1.3947678804397583, "learning_rate": 3.9841996614058536e-06, "loss": 1.0362350940704346, "step": 258 }, { "epoch": 0.5485232067510548, "grad_norm": 3.6117563247680664, "learning_rate": 3.983641249050933e-06, "loss": 0.9856378436088562, "step": 260 }, { "epoch": 0.5527426160337553, "grad_norm": 2.348914861679077, "learning_rate": 3.983073184683419e-06, "loss": 1.2900649309158325, "step": 262 }, { "epoch": 0.5569620253164557, "grad_norm": 2.4478940963745117, "learning_rate": 3.98249547137734e-06, "loss": 1.30060613155365, "step": 264 }, { "epoch": 0.5611814345991561, "grad_norm": 1.8957366943359375, "learning_rate": 3.981908112258938e-06, "loss": 1.2571529150009155, "step": 266 }, { "epoch": 0.5654008438818565, "grad_norm": 2.468729257583618, "learning_rate": 3.981311110506654e-06, "loss": 1.522542119026184, "step": 268 }, { "epoch": 0.569620253164557, "grad_norm": 5.101961612701416, "learning_rate": 3.9807044693511086e-06, "loss": 1.0608189105987549, "step": 270 }, { "epoch": 0.5738396624472574, "grad_norm": 3.0331854820251465, "learning_rate": 3.980088192075085e-06, "loss": 1.3017442226409912, "step": 272 }, { "epoch": 0.5780590717299579, "grad_norm": 2.463477373123169, "learning_rate": 3.979462282013513e-06, "loss": 1.099843144416809, "step": 274 }, { "epoch": 0.5822784810126582, "grad_norm": 1.7117162942886353, "learning_rate": 3.978826742553447e-06, "loss": 1.2798070907592773, "step": 276 }, { "epoch": 0.5864978902953587, "grad_norm": 3.3944342136383057, "learning_rate": 3.978181577134051e-06, "loss": 1.4166996479034424, "step": 278 }, { "epoch": 0.5907172995780591, "grad_norm": 2.0399510860443115, "learning_rate": 3.97752678924658e-06, "loss": 0.9708434343338013, "step": 280 }, { "epoch": 0.5949367088607594, "grad_norm": 5.146090984344482, "learning_rate": 3.976862382434358e-06, "loss": 1.3494899272918701, "step": 282 }, { "epoch": 0.5991561181434599, "grad_norm": 2.0854623317718506, "learning_rate": 3.976188360292762e-06, "loss": 1.551278829574585, "step": 284 }, { "epoch": 0.6033755274261603, "grad_norm": 0.7903197407722473, "learning_rate": 3.975504726469204e-06, "loss": 1.1335902214050293, "step": 286 }, { "epoch": 0.6075949367088608, "grad_norm": 1.5145395994186401, "learning_rate": 3.9748114846631025e-06, "loss": 1.2714455127716064, "step": 288 }, { "epoch": 0.6118143459915611, "grad_norm": 2.4970903396606445, "learning_rate": 3.974108638625875e-06, "loss": 0.8297945857048035, "step": 290 }, { "epoch": 0.6160337552742616, "grad_norm": 1.9116922616958618, "learning_rate": 3.973396192160909e-06, "loss": 0.6557431221008301, "step": 292 }, { "epoch": 0.620253164556962, "grad_norm": 1.597800374031067, "learning_rate": 3.972674149123543e-06, "loss": 1.251997709274292, "step": 294 }, { "epoch": 0.6244725738396625, "grad_norm": 5.221956253051758, "learning_rate": 3.971942513421049e-06, "loss": 0.7073361873626709, "step": 296 }, { "epoch": 0.6286919831223629, "grad_norm": 8.381784439086914, "learning_rate": 3.971201289012605e-06, "loss": 0.6594762802124023, "step": 298 }, { "epoch": 0.6329113924050633, "grad_norm": 4.704819202423096, "learning_rate": 3.97045047990928e-06, "loss": 1.7869096994400024, "step": 300 }, { "epoch": 0.6371308016877637, "grad_norm": 1.746824026107788, "learning_rate": 3.969690090174009e-06, "loss": 1.2827584743499756, "step": 302 }, { "epoch": 0.6413502109704642, "grad_norm": 2.3811588287353516, "learning_rate": 3.968920123921574e-06, "loss": 0.8861095905303955, "step": 304 }, { "epoch": 0.6455696202531646, "grad_norm": 2.874070644378662, "learning_rate": 3.968140585318575e-06, "loss": 1.0074717998504639, "step": 306 }, { "epoch": 0.6497890295358649, "grad_norm": 1.4178441762924194, "learning_rate": 3.967351478583417e-06, "loss": 1.271646499633789, "step": 308 }, { "epoch": 0.6540084388185654, "grad_norm": 2.7072203159332275, "learning_rate": 3.9665528079862766e-06, "loss": 1.2094981670379639, "step": 310 }, { "epoch": 0.6582278481012658, "grad_norm": 2.434222936630249, "learning_rate": 3.965744577849089e-06, "loss": 1.016772747039795, "step": 312 }, { "epoch": 0.6624472573839663, "grad_norm": 1.4761089086532593, "learning_rate": 3.964926792545517e-06, "loss": 1.2257163524627686, "step": 314 }, { "epoch": 0.6666666666666666, "grad_norm": 1.9905054569244385, "learning_rate": 3.964099456500932e-06, "loss": 1.1116795539855957, "step": 316 }, { "epoch": 0.6708860759493671, "grad_norm": 2.5824759006500244, "learning_rate": 3.963262574192388e-06, "loss": 1.0979809761047363, "step": 318 }, { "epoch": 0.6751054852320675, "grad_norm": 2.126721143722534, "learning_rate": 3.962416150148598e-06, "loss": 1.0931775569915771, "step": 320 }, { "epoch": 0.679324894514768, "grad_norm": 2.355828046798706, "learning_rate": 3.961560188949909e-06, "loss": 0.8760429620742798, "step": 322 }, { "epoch": 0.6835443037974683, "grad_norm": 2.159811496734619, "learning_rate": 3.9606946952282745e-06, "loss": 0.8803830146789551, "step": 324 }, { "epoch": 0.6877637130801688, "grad_norm": 2.930659294128418, "learning_rate": 3.959819673667239e-06, "loss": 0.8701751232147217, "step": 326 }, { "epoch": 0.6919831223628692, "grad_norm": 1.615522027015686, "learning_rate": 3.958935129001899e-06, "loss": 0.8708148002624512, "step": 328 }, { "epoch": 0.6962025316455697, "grad_norm": 5.590799331665039, "learning_rate": 3.958041066018891e-06, "loss": 1.591496229171753, "step": 330 }, { "epoch": 0.70042194092827, "grad_norm": 3.333008050918579, "learning_rate": 3.957137489556352e-06, "loss": 1.1004414558410645, "step": 332 }, { "epoch": 0.7046413502109705, "grad_norm": 2.3116865158081055, "learning_rate": 3.956224404503906e-06, "loss": 1.4001518487930298, "step": 334 }, { "epoch": 0.7088607594936709, "grad_norm": 4.017354965209961, "learning_rate": 3.955301815802629e-06, "loss": 1.2720857858657837, "step": 336 }, { "epoch": 0.7130801687763713, "grad_norm": 2.1855573654174805, "learning_rate": 3.954369728445028e-06, "loss": 1.2939956188201904, "step": 338 }, { "epoch": 0.7172995780590717, "grad_norm": 2.4703359603881836, "learning_rate": 3.953428147475006e-06, "loss": 1.2735445499420166, "step": 340 }, { "epoch": 0.7215189873417721, "grad_norm": 2.1738462448120117, "learning_rate": 3.952477077987845e-06, "loss": 1.2617197036743164, "step": 342 }, { "epoch": 0.7257383966244726, "grad_norm": 2.704313278198242, "learning_rate": 3.95151652513017e-06, "loss": 1.0853008031845093, "step": 344 }, { "epoch": 0.729957805907173, "grad_norm": 10.06601333618164, "learning_rate": 3.950546494099926e-06, "loss": 0.8921165466308594, "step": 346 }, { "epoch": 0.7341772151898734, "grad_norm": 1.9999581575393677, "learning_rate": 3.949566990146349e-06, "loss": 1.256639003753662, "step": 348 }, { "epoch": 0.7383966244725738, "grad_norm": 5.633319854736328, "learning_rate": 3.948578018569932e-06, "loss": 1.1841363906860352, "step": 350 }, { "epoch": 0.7426160337552743, "grad_norm": 7.676711559295654, "learning_rate": 3.94757958472241e-06, "loss": 1.0944801568984985, "step": 352 }, { "epoch": 0.7468354430379747, "grad_norm": 4.892640590667725, "learning_rate": 3.946571694006712e-06, "loss": 0.6508228182792664, "step": 354 }, { "epoch": 0.7510548523206751, "grad_norm": 2.564443349838257, "learning_rate": 3.945554351876951e-06, "loss": 1.0562660694122314, "step": 356 }, { "epoch": 0.7552742616033755, "grad_norm": 4.787500858306885, "learning_rate": 3.94452756383838e-06, "loss": 0.998831033706665, "step": 358 }, { "epoch": 0.759493670886076, "grad_norm": 1.8746553659439087, "learning_rate": 3.943491335447368e-06, "loss": 1.2303812503814697, "step": 360 }, { "epoch": 0.7637130801687764, "grad_norm": 2.7792534828186035, "learning_rate": 3.942445672311373e-06, "loss": 0.9920629858970642, "step": 362 }, { "epoch": 0.7679324894514767, "grad_norm": 5.023082733154297, "learning_rate": 3.941390580088905e-06, "loss": 1.5890564918518066, "step": 364 }, { "epoch": 0.7721518987341772, "grad_norm": 3.2253143787384033, "learning_rate": 3.940326064489499e-06, "loss": 0.7020189166069031, "step": 366 }, { "epoch": 0.7763713080168776, "grad_norm": 4.751897811889648, "learning_rate": 3.939252131273686e-06, "loss": 1.1662057638168335, "step": 368 }, { "epoch": 0.7805907172995781, "grad_norm": 4.707884788513184, "learning_rate": 3.938168786252957e-06, "loss": 1.490715742111206, "step": 370 }, { "epoch": 0.7848101265822784, "grad_norm": 4.896017074584961, "learning_rate": 3.937076035289735e-06, "loss": 0.9990431070327759, "step": 372 }, { "epoch": 0.7890295358649789, "grad_norm": 5.3917059898376465, "learning_rate": 3.935973884297344e-06, "loss": 1.06167471408844, "step": 374 }, { "epoch": 0.7932489451476793, "grad_norm": 1.6713993549346924, "learning_rate": 3.934862339239972e-06, "loss": 1.1578385829925537, "step": 376 }, { "epoch": 0.7974683544303798, "grad_norm": 1.267899990081787, "learning_rate": 3.933741406132645e-06, "loss": 1.1280488967895508, "step": 378 }, { "epoch": 0.8016877637130801, "grad_norm": 2.5772478580474854, "learning_rate": 3.932611091041192e-06, "loss": 0.7228022217750549, "step": 380 }, { "epoch": 0.8059071729957806, "grad_norm": 2.877981185913086, "learning_rate": 3.931471400082208e-06, "loss": 1.275989294052124, "step": 382 }, { "epoch": 0.810126582278481, "grad_norm": 4.086211204528809, "learning_rate": 3.930322339423029e-06, "loss": 1.0468356609344482, "step": 384 }, { "epoch": 0.8143459915611815, "grad_norm": 3.2680745124816895, "learning_rate": 3.929163915281692e-06, "loss": 1.2617956399917603, "step": 386 }, { "epoch": 0.8185654008438819, "grad_norm": 2.128434181213379, "learning_rate": 3.927996133926903e-06, "loss": 0.9376715421676636, "step": 388 }, { "epoch": 0.8227848101265823, "grad_norm": 1.895815372467041, "learning_rate": 3.926819001678005e-06, "loss": 1.2338812351226807, "step": 390 }, { "epoch": 0.8270042194092827, "grad_norm": 2.774864435195923, "learning_rate": 3.925632524904943e-06, "loss": 0.9890301823616028, "step": 392 }, { "epoch": 0.8312236286919831, "grad_norm": 3.1884961128234863, "learning_rate": 3.924436710028228e-06, "loss": 0.9189957976341248, "step": 394 }, { "epoch": 0.8354430379746836, "grad_norm": 2.6990184783935547, "learning_rate": 3.923231563518904e-06, "loss": 1.2466810941696167, "step": 396 }, { "epoch": 0.8396624472573839, "grad_norm": 0.5147901773452759, "learning_rate": 3.922017091898511e-06, "loss": 1.0888053178787231, "step": 398 }, { "epoch": 0.8438818565400844, "grad_norm": 2.1018054485321045, "learning_rate": 3.920793301739052e-06, "loss": 0.9585396647453308, "step": 400 }, { "epoch": 0.8481012658227848, "grad_norm": 4.2678046226501465, "learning_rate": 3.9195601996629564e-06, "loss": 0.702578067779541, "step": 402 }, { "epoch": 0.8523206751054853, "grad_norm": 3.8625717163085938, "learning_rate": 3.9183177923430445e-06, "loss": 1.2020361423492432, "step": 404 }, { "epoch": 0.8565400843881856, "grad_norm": 2.158465623855591, "learning_rate": 3.917066086502491e-06, "loss": 0.9442514181137085, "step": 406 }, { "epoch": 0.8607594936708861, "grad_norm": 2.1818642616271973, "learning_rate": 3.915805088914787e-06, "loss": 0.9750051498413086, "step": 408 }, { "epoch": 0.8649789029535865, "grad_norm": 1.9446742534637451, "learning_rate": 3.914534806403707e-06, "loss": 1.247662901878357, "step": 410 }, { "epoch": 0.869198312236287, "grad_norm": 2.9858086109161377, "learning_rate": 3.913255245843269e-06, "loss": 0.8505547642707825, "step": 412 }, { "epoch": 0.8734177215189873, "grad_norm": 3.3264975547790527, "learning_rate": 3.911966414157699e-06, "loss": 1.222496509552002, "step": 414 }, { "epoch": 0.8776371308016878, "grad_norm": 1.9070676565170288, "learning_rate": 3.910668318321395e-06, "loss": 1.0650990009307861, "step": 416 }, { "epoch": 0.8818565400843882, "grad_norm": 49.54351806640625, "learning_rate": 3.90936096535888e-06, "loss": 1.170435905456543, "step": 418 }, { "epoch": 0.8860759493670886, "grad_norm": 3.324521064758301, "learning_rate": 3.90804436234478e-06, "loss": 0.9483715295791626, "step": 420 }, { "epoch": 0.890295358649789, "grad_norm": 2.001574754714966, "learning_rate": 3.9067185164037705e-06, "loss": 1.3522322177886963, "step": 422 }, { "epoch": 0.8945147679324894, "grad_norm": 13.360381126403809, "learning_rate": 3.905383434710546e-06, "loss": 0.980687141418457, "step": 424 }, { "epoch": 0.8987341772151899, "grad_norm": 2.832037925720215, "learning_rate": 3.904039124489782e-06, "loss": 1.1890883445739746, "step": 426 }, { "epoch": 0.9029535864978903, "grad_norm": 3.036261796951294, "learning_rate": 3.902685593016088e-06, "loss": 1.0536837577819824, "step": 428 }, { "epoch": 0.9071729957805907, "grad_norm": 3.503538131713867, "learning_rate": 3.90132284761398e-06, "loss": 1.016420602798462, "step": 430 }, { "epoch": 0.9113924050632911, "grad_norm": 2.102992534637451, "learning_rate": 3.899950895657829e-06, "loss": 1.0863244533538818, "step": 432 }, { "epoch": 0.9156118143459916, "grad_norm": 2.5443339347839355, "learning_rate": 3.8985697445718275e-06, "loss": 1.2617383003234863, "step": 434 }, { "epoch": 0.919831223628692, "grad_norm": 13.239272117614746, "learning_rate": 3.8971794018299515e-06, "loss": 0.8763201832771301, "step": 436 }, { "epoch": 0.9240506329113924, "grad_norm": 1.944677710533142, "learning_rate": 3.895779874955913e-06, "loss": 1.2141039371490479, "step": 438 }, { "epoch": 0.9282700421940928, "grad_norm": 1.6930376291275024, "learning_rate": 3.894371171523124e-06, "loss": 0.9925521016120911, "step": 440 }, { "epoch": 0.9324894514767933, "grad_norm": 2.417435646057129, "learning_rate": 3.892953299154657e-06, "loss": 0.9523521661758423, "step": 442 }, { "epoch": 0.9367088607594937, "grad_norm": 4.125819683074951, "learning_rate": 3.8915262655231985e-06, "loss": 1.1057894229888916, "step": 444 }, { "epoch": 0.9409282700421941, "grad_norm": 5.843780517578125, "learning_rate": 3.890090078351011e-06, "loss": 1.3123371601104736, "step": 446 }, { "epoch": 0.9451476793248945, "grad_norm": 1.6658388376235962, "learning_rate": 3.8886447454098914e-06, "loss": 1.013564109802246, "step": 448 }, { "epoch": 0.9493670886075949, "grad_norm": 12.320473670959473, "learning_rate": 3.887190274521128e-06, "loss": 0.9290477633476257, "step": 450 }, { "epoch": 0.9535864978902954, "grad_norm": 1.9203139543533325, "learning_rate": 3.885726673555457e-06, "loss": 1.2885007858276367, "step": 452 }, { "epoch": 0.9578059071729957, "grad_norm": 1.4699382781982422, "learning_rate": 3.884253950433022e-06, "loss": 1.0010005235671997, "step": 454 }, { "epoch": 0.9620253164556962, "grad_norm": 2.8639562129974365, "learning_rate": 3.882772113123332e-06, "loss": 1.1654586791992188, "step": 456 }, { "epoch": 0.9662447257383966, "grad_norm": 3.0785443782806396, "learning_rate": 3.881281169645212e-06, "loss": 0.8937104940414429, "step": 458 }, { "epoch": 0.9704641350210971, "grad_norm": 4.43109655380249, "learning_rate": 3.879781128066771e-06, "loss": 0.7110123634338379, "step": 460 }, { "epoch": 0.9746835443037974, "grad_norm": 2.914869785308838, "learning_rate": 3.878271996505345e-06, "loss": 0.8978859186172485, "step": 462 }, { "epoch": 0.9789029535864979, "grad_norm": 2.207864999771118, "learning_rate": 3.876753783127464e-06, "loss": 1.1789137125015259, "step": 464 }, { "epoch": 0.9831223628691983, "grad_norm": 3.6072731018066406, "learning_rate": 3.875226496148799e-06, "loss": 0.7317770719528198, "step": 466 }, { "epoch": 0.9873417721518988, "grad_norm": 6.211583614349365, "learning_rate": 3.873690143834129e-06, "loss": 1.3279008865356445, "step": 468 }, { "epoch": 0.9915611814345991, "grad_norm": 8.5204496383667, "learning_rate": 3.872144734497281e-06, "loss": 0.8850146532058716, "step": 470 }, { "epoch": 0.9957805907172996, "grad_norm": 2.666997194290161, "learning_rate": 3.870590276501099e-06, "loss": 1.220442295074463, "step": 472 }, { "epoch": 1.0, "grad_norm": 3.6749136447906494, "learning_rate": 3.869026778257392e-06, "loss": 1.2717642784118652, "step": 474 }, { "epoch": 1.0042194092827004, "grad_norm": 5.584980010986328, "learning_rate": 3.867454248226887e-06, "loss": 1.0543200969696045, "step": 476 }, { "epoch": 1.0084388185654007, "grad_norm": 4.494807243347168, "learning_rate": 3.86587269491919e-06, "loss": 1.1978576183319092, "step": 478 }, { "epoch": 1.0126582278481013, "grad_norm": 2.680006504058838, "learning_rate": 3.86428212689273e-06, "loss": 1.1809672117233276, "step": 480 }, { "epoch": 1.0168776371308017, "grad_norm": 3.9369754791259766, "learning_rate": 3.862682552754722e-06, "loss": 0.9172142744064331, "step": 482 }, { "epoch": 1.021097046413502, "grad_norm": 7.680518627166748, "learning_rate": 3.861073981161118e-06, "loss": 1.1449049711227417, "step": 484 }, { "epoch": 1.0253164556962024, "grad_norm": 2.746133804321289, "learning_rate": 3.859456420816556e-06, "loss": 0.5115264654159546, "step": 486 }, { "epoch": 1.029535864978903, "grad_norm": 2.72514271736145, "learning_rate": 3.857829880474316e-06, "loss": 0.9918684363365173, "step": 488 }, { "epoch": 1.0337552742616034, "grad_norm": 2.223912000656128, "learning_rate": 3.856194368936275e-06, "loss": 0.8463398814201355, "step": 490 }, { "epoch": 1.0379746835443038, "grad_norm": 2.9955148696899414, "learning_rate": 3.8545498950528535e-06, "loss": 1.173925757408142, "step": 492 }, { "epoch": 1.0421940928270041, "grad_norm": 4.594770431518555, "learning_rate": 3.852896467722974e-06, "loss": 0.8562051057815552, "step": 494 }, { "epoch": 1.0464135021097047, "grad_norm": 9.129888534545898, "learning_rate": 3.851234095894007e-06, "loss": 0.9281083345413208, "step": 496 }, { "epoch": 1.0506329113924051, "grad_norm": 2.604607105255127, "learning_rate": 3.849562788561727e-06, "loss": 1.2945480346679688, "step": 498 }, { "epoch": 1.0548523206751055, "grad_norm": 2.3840718269348145, "learning_rate": 3.847882554770263e-06, "loss": 1.1486706733703613, "step": 500 }, { "epoch": 1.0590717299578059, "grad_norm": 1.9679715633392334, "learning_rate": 3.846193403612046e-06, "loss": 1.1716930866241455, "step": 502 }, { "epoch": 1.0632911392405062, "grad_norm": 1.7950235605239868, "learning_rate": 3.844495344227765e-06, "loss": 1.2809019088745117, "step": 504 }, { "epoch": 1.0675105485232068, "grad_norm": 2.0246713161468506, "learning_rate": 3.842788385806312e-06, "loss": 0.7856377363204956, "step": 506 }, { "epoch": 1.0717299578059072, "grad_norm": 2.0895354747772217, "learning_rate": 3.841072537584741e-06, "loss": 1.1074151992797852, "step": 508 }, { "epoch": 1.0759493670886076, "grad_norm": 2.316358804702759, "learning_rate": 3.8393478088482065e-06, "loss": 1.1439809799194336, "step": 510 }, { "epoch": 1.080168776371308, "grad_norm": 4.703127384185791, "learning_rate": 3.837614208929921e-06, "loss": 1.035994291305542, "step": 512 }, { "epoch": 1.0843881856540085, "grad_norm": 7.031744003295898, "learning_rate": 3.835871747211105e-06, "loss": 1.151397705078125, "step": 514 }, { "epoch": 1.0886075949367089, "grad_norm": 2.653866767883301, "learning_rate": 3.83412043312093e-06, "loss": 1.16837739944458, "step": 516 }, { "epoch": 1.0928270042194093, "grad_norm": 2.976186752319336, "learning_rate": 3.832360276136474e-06, "loss": 0.9901262521743774, "step": 518 }, { "epoch": 1.0970464135021096, "grad_norm": 4.738975524902344, "learning_rate": 3.830591285782666e-06, "loss": 0.9500905871391296, "step": 520 }, { "epoch": 1.1012658227848102, "grad_norm": 8.483416557312012, "learning_rate": 3.828813471632237e-06, "loss": 0.8555248975753784, "step": 522 }, { "epoch": 1.1054852320675106, "grad_norm": 4.0885467529296875, "learning_rate": 3.827026843305667e-06, "loss": 1.0695732831954956, "step": 524 }, { "epoch": 1.109704641350211, "grad_norm": 2.929239273071289, "learning_rate": 3.825231410471132e-06, "loss": 0.868694543838501, "step": 526 }, { "epoch": 1.1139240506329113, "grad_norm": 2.6514179706573486, "learning_rate": 3.823427182844455e-06, "loss": 1.3674180507659912, "step": 528 }, { "epoch": 1.1181434599156117, "grad_norm": 3.984480142593384, "learning_rate": 3.821614170189049e-06, "loss": 1.2144532203674316, "step": 530 }, { "epoch": 1.1223628691983123, "grad_norm": 7.298747539520264, "learning_rate": 3.819792382315868e-06, "loss": 0.6592221260070801, "step": 532 }, { "epoch": 1.1265822784810127, "grad_norm": 5.481675624847412, "learning_rate": 3.81796182908335e-06, "loss": 1.1008317470550537, "step": 534 }, { "epoch": 1.130801687763713, "grad_norm": 2.6566853523254395, "learning_rate": 3.816122520397369e-06, "loss": 1.1687147617340088, "step": 536 }, { "epoch": 1.1350210970464134, "grad_norm": 2.098435163497925, "learning_rate": 3.8142744662111767e-06, "loss": 0.8460148572921753, "step": 538 }, { "epoch": 1.139240506329114, "grad_norm": 2.0900216102600098, "learning_rate": 3.81241767652535e-06, "loss": 0.7578733563423157, "step": 540 }, { "epoch": 1.1434599156118144, "grad_norm": 2.375847578048706, "learning_rate": 3.8105521613877386e-06, "loss": 0.8102576732635498, "step": 542 }, { "epoch": 1.1476793248945147, "grad_norm": 3.2528064250946045, "learning_rate": 3.8086779308934066e-06, "loss": 0.8352131247520447, "step": 544 }, { "epoch": 1.1518987341772151, "grad_norm": 2.7880918979644775, "learning_rate": 3.8067949951845836e-06, "loss": 1.108149766921997, "step": 546 }, { "epoch": 1.1561181434599157, "grad_norm": 2.515939712524414, "learning_rate": 3.8049033644506043e-06, "loss": 1.1225923299789429, "step": 548 }, { "epoch": 1.160337552742616, "grad_norm": 7.062304973602295, "learning_rate": 3.8030030489278563e-06, "loss": 0.9247970581054688, "step": 550 }, { "epoch": 1.1645569620253164, "grad_norm": 4.359371662139893, "learning_rate": 3.8010940588997253e-06, "loss": 1.4258188009262085, "step": 552 }, { "epoch": 1.1687763713080168, "grad_norm": 2.2747061252593994, "learning_rate": 3.799176404696537e-06, "loss": 1.1855448484420776, "step": 554 }, { "epoch": 1.1729957805907172, "grad_norm": 4.772888660430908, "learning_rate": 3.797250096695503e-06, "loss": 0.6528091430664062, "step": 556 }, { "epoch": 1.1772151898734178, "grad_norm": 6.059512138366699, "learning_rate": 3.7953151453206635e-06, "loss": 1.0413281917572021, "step": 558 }, { "epoch": 1.1814345991561181, "grad_norm": 3.8079075813293457, "learning_rate": 3.793371561042833e-06, "loss": 0.6656049489974976, "step": 560 }, { "epoch": 1.1856540084388185, "grad_norm": 3.2168707847595215, "learning_rate": 3.791419354379541e-06, "loss": 0.8556336164474487, "step": 562 }, { "epoch": 1.189873417721519, "grad_norm": 6.392472267150879, "learning_rate": 3.7894585358949758e-06, "loss": 1.3849632740020752, "step": 564 }, { "epoch": 1.1940928270042195, "grad_norm": 6.333314418792725, "learning_rate": 3.78748911619993e-06, "loss": 1.1986020803451538, "step": 566 }, { "epoch": 1.1983122362869199, "grad_norm": 3.8843421936035156, "learning_rate": 3.7855111059517376e-06, "loss": 0.834921658039093, "step": 568 }, { "epoch": 1.2025316455696202, "grad_norm": 2.22169828414917, "learning_rate": 3.7835245158542225e-06, "loss": 1.1095911264419556, "step": 570 }, { "epoch": 1.2067510548523206, "grad_norm": 2.5398857593536377, "learning_rate": 3.7815293566576367e-06, "loss": 1.06223464012146, "step": 572 }, { "epoch": 1.2109704641350212, "grad_norm": 1.9426056146621704, "learning_rate": 3.779525639158602e-06, "loss": 1.1437506675720215, "step": 574 }, { "epoch": 1.2151898734177216, "grad_norm": 3.523289203643799, "learning_rate": 3.7775133742000542e-06, "loss": 0.9638210535049438, "step": 576 }, { "epoch": 1.219409282700422, "grad_norm": 2.9455223083496094, "learning_rate": 3.7754925726711832e-06, "loss": 0.6213325262069702, "step": 578 }, { "epoch": 1.2236286919831223, "grad_norm": 1.926129698753357, "learning_rate": 3.773463245507371e-06, "loss": 0.9760810732841492, "step": 580 }, { "epoch": 1.2278481012658227, "grad_norm": 5.75839900970459, "learning_rate": 3.7714254036901382e-06, "loss": 0.8893729448318481, "step": 582 }, { "epoch": 1.2320675105485233, "grad_norm": 2.0400707721710205, "learning_rate": 3.7693790582470815e-06, "loss": 0.7321144342422485, "step": 584 }, { "epoch": 1.2362869198312236, "grad_norm": 9.54411792755127, "learning_rate": 3.767324220251812e-06, "loss": 0.938395082950592, "step": 586 }, { "epoch": 1.240506329113924, "grad_norm": 3.1993234157562256, "learning_rate": 3.7652609008238994e-06, "loss": 0.8318843841552734, "step": 588 }, { "epoch": 1.2447257383966246, "grad_norm": 2.4239490032196045, "learning_rate": 3.76318911112881e-06, "loss": 1.1875081062316895, "step": 590 }, { "epoch": 1.248945147679325, "grad_norm": 7.202500820159912, "learning_rate": 3.761108862377844e-06, "loss": 0.6182510852813721, "step": 592 }, { "epoch": 1.2531645569620253, "grad_norm": 1.383612871170044, "learning_rate": 3.75902016582808e-06, "loss": 0.8994504809379578, "step": 594 }, { "epoch": 1.2573839662447257, "grad_norm": 4.613704204559326, "learning_rate": 3.756923032782309e-06, "loss": 0.7695854902267456, "step": 596 }, { "epoch": 1.261603375527426, "grad_norm": 3.9212303161621094, "learning_rate": 3.754817474588976e-06, "loss": 0.6324819922447205, "step": 598 }, { "epoch": 1.2658227848101267, "grad_norm": 2.7459237575531006, "learning_rate": 3.752703502642118e-06, "loss": 1.0705938339233398, "step": 600 }, { "epoch": 1.270042194092827, "grad_norm": 6.447327613830566, "learning_rate": 3.7505811283813028e-06, "loss": 1.4245244264602661, "step": 602 }, { "epoch": 1.2742616033755274, "grad_norm": 1.7515556812286377, "learning_rate": 3.7484503632915642e-06, "loss": 1.0706822872161865, "step": 604 }, { "epoch": 1.2784810126582278, "grad_norm": 4.614502429962158, "learning_rate": 3.7463112189033452e-06, "loss": 0.9431329965591431, "step": 606 }, { "epoch": 1.2827004219409281, "grad_norm": 8.263338088989258, "learning_rate": 3.7441637067924314e-06, "loss": 0.8352319598197937, "step": 608 }, { "epoch": 1.2869198312236287, "grad_norm": 3.6502585411071777, "learning_rate": 3.7420078385798895e-06, "loss": 0.9339005351066589, "step": 610 }, { "epoch": 1.2911392405063291, "grad_norm": 8.820695877075195, "learning_rate": 3.739843625932004e-06, "loss": 0.6273094415664673, "step": 612 }, { "epoch": 1.2953586497890295, "grad_norm": 2.1156527996063232, "learning_rate": 3.737671080560215e-06, "loss": 0.6872820854187012, "step": 614 }, { "epoch": 1.29957805907173, "grad_norm": 2.442565679550171, "learning_rate": 3.7354902142210548e-06, "loss": 1.1194093227386475, "step": 616 }, { "epoch": 1.3037974683544304, "grad_norm": 1.8104244470596313, "learning_rate": 3.7333010387160834e-06, "loss": 1.1286826133728027, "step": 618 }, { "epoch": 1.3080168776371308, "grad_norm": 2.462080955505371, "learning_rate": 3.7311035658918248e-06, "loss": 0.7162832617759705, "step": 620 }, { "epoch": 1.3122362869198312, "grad_norm": 3.075747013092041, "learning_rate": 3.728897807639705e-06, "loss": 0.9384140968322754, "step": 622 }, { "epoch": 1.3164556962025316, "grad_norm": 30.50847053527832, "learning_rate": 3.7266837758959825e-06, "loss": 0.8814220428466797, "step": 624 }, { "epoch": 1.3206751054852321, "grad_norm": 2.7363264560699463, "learning_rate": 3.7244614826416896e-06, "loss": 1.1194790601730347, "step": 626 }, { "epoch": 1.3248945147679325, "grad_norm": 11.446985244750977, "learning_rate": 3.722230939902565e-06, "loss": 1.6146903038024902, "step": 628 }, { "epoch": 1.3291139240506329, "grad_norm": 1.5937474966049194, "learning_rate": 3.7199921597489876e-06, "loss": 0.8981386423110962, "step": 630 }, { "epoch": 1.3333333333333333, "grad_norm": 1.8236477375030518, "learning_rate": 3.717745154295913e-06, "loss": 1.0962973833084106, "step": 632 }, { "epoch": 1.3375527426160336, "grad_norm": 1.031929850578308, "learning_rate": 3.7154899357028072e-06, "loss": 0.8632595539093018, "step": 634 }, { "epoch": 1.3417721518987342, "grad_norm": 6.748950958251953, "learning_rate": 3.7132265161735803e-06, "loss": 0.6589536666870117, "step": 636 }, { "epoch": 1.3459915611814346, "grad_norm": 9.24288558959961, "learning_rate": 3.710954907956522e-06, "loss": 0.8823557496070862, "step": 638 }, { "epoch": 1.350210970464135, "grad_norm": 5.132577419281006, "learning_rate": 3.7086751233442327e-06, "loss": 1.2359545230865479, "step": 640 }, { "epoch": 1.3544303797468356, "grad_norm": 2.1931583881378174, "learning_rate": 3.7063871746735615e-06, "loss": 0.839038610458374, "step": 642 }, { "epoch": 1.358649789029536, "grad_norm": 1.920567512512207, "learning_rate": 3.704091074325534e-06, "loss": 1.2603816986083984, "step": 644 }, { "epoch": 1.3628691983122363, "grad_norm": 1.3721178770065308, "learning_rate": 3.7017868347252882e-06, "loss": 1.1347554922103882, "step": 646 }, { "epoch": 1.3670886075949367, "grad_norm": 6.712429523468018, "learning_rate": 3.699474468342008e-06, "loss": 0.8782555460929871, "step": 648 }, { "epoch": 1.371308016877637, "grad_norm": 3.626140594482422, "learning_rate": 3.6971539876888525e-06, "loss": 1.3546593189239502, "step": 650 }, { "epoch": 1.3755274261603376, "grad_norm": 2.531872034072876, "learning_rate": 3.694825405322894e-06, "loss": 1.1074378490447998, "step": 652 }, { "epoch": 1.379746835443038, "grad_norm": 1.418874740600586, "learning_rate": 3.692488733845044e-06, "loss": 0.8609563112258911, "step": 654 }, { "epoch": 1.3839662447257384, "grad_norm": 1.9295591115951538, "learning_rate": 3.690143985899987e-06, "loss": 1.2149752378463745, "step": 656 }, { "epoch": 1.3881856540084387, "grad_norm": 9.573609352111816, "learning_rate": 3.687791174176115e-06, "loss": 0.6435118317604065, "step": 658 }, { "epoch": 1.3924050632911391, "grad_norm": 2.0520520210266113, "learning_rate": 3.685430311405453e-06, "loss": 1.1482752561569214, "step": 660 }, { "epoch": 1.3966244725738397, "grad_norm": 5.835472583770752, "learning_rate": 3.6830614103635977e-06, "loss": 0.6969774961471558, "step": 662 }, { "epoch": 1.40084388185654, "grad_norm": 1.448106288909912, "learning_rate": 3.6806844838696397e-06, "loss": 1.1494622230529785, "step": 664 }, { "epoch": 1.4050632911392404, "grad_norm": 2.3839871883392334, "learning_rate": 3.6782995447861017e-06, "loss": 0.7210063934326172, "step": 666 }, { "epoch": 1.409282700421941, "grad_norm": 3.103909492492676, "learning_rate": 3.675906606018865e-06, "loss": 1.1002976894378662, "step": 668 }, { "epoch": 1.4135021097046414, "grad_norm": 1.7114917039871216, "learning_rate": 3.6735056805171012e-06, "loss": 1.154873013496399, "step": 670 }, { "epoch": 1.4177215189873418, "grad_norm": 3.427095651626587, "learning_rate": 3.6710967812731994e-06, "loss": 1.3804283142089844, "step": 672 }, { "epoch": 1.4219409282700421, "grad_norm": 2.9029994010925293, "learning_rate": 3.6686799213226984e-06, "loss": 0.7311358451843262, "step": 674 }, { "epoch": 1.4261603375527425, "grad_norm": 2.845263719558716, "learning_rate": 3.666255113744218e-06, "loss": 0.6623574495315552, "step": 676 }, { "epoch": 1.4303797468354431, "grad_norm": 5.403914451599121, "learning_rate": 3.663822371659383e-06, "loss": 0.9805995225906372, "step": 678 }, { "epoch": 1.4345991561181435, "grad_norm": 3.444819927215576, "learning_rate": 3.6613817082327565e-06, "loss": 1.088465690612793, "step": 680 }, { "epoch": 1.4388185654008439, "grad_norm": 4.646100997924805, "learning_rate": 3.658933136671767e-06, "loss": 0.8819342851638794, "step": 682 }, { "epoch": 1.4430379746835442, "grad_norm": 3.1290183067321777, "learning_rate": 3.656476670226637e-06, "loss": 1.2142698764801025, "step": 684 }, { "epoch": 1.4472573839662446, "grad_norm": 4.68398904800415, "learning_rate": 3.6540123221903123e-06, "loss": 0.7775373458862305, "step": 686 }, { "epoch": 1.4514767932489452, "grad_norm": 3.9637718200683594, "learning_rate": 3.651540105898387e-06, "loss": 0.9440705180168152, "step": 688 }, { "epoch": 1.4556962025316456, "grad_norm": 6.741257190704346, "learning_rate": 3.6490600347290353e-06, "loss": 1.0546112060546875, "step": 690 }, { "epoch": 1.459915611814346, "grad_norm": 4.779881000518799, "learning_rate": 3.6465721221029376e-06, "loss": 0.7046493887901306, "step": 692 }, { "epoch": 1.4641350210970465, "grad_norm": 5.674314498901367, "learning_rate": 3.6440763814832075e-06, "loss": 1.2944858074188232, "step": 694 }, { "epoch": 1.4683544303797469, "grad_norm": 2.4671552181243896, "learning_rate": 3.6415728263753176e-06, "loss": 0.6650893688201904, "step": 696 }, { "epoch": 1.4725738396624473, "grad_norm": 3.0560495853424072, "learning_rate": 3.63906147032703e-06, "loss": 1.177491545677185, "step": 698 }, { "epoch": 1.4767932489451476, "grad_norm": 2.7282063961029053, "learning_rate": 3.6365423269283187e-06, "loss": 1.2095248699188232, "step": 700 }, { "epoch": 1.481012658227848, "grad_norm": 5.56691837310791, "learning_rate": 3.6340154098113e-06, "loss": 1.0211296081542969, "step": 702 }, { "epoch": 1.4852320675105486, "grad_norm": 11.867128372192383, "learning_rate": 3.631480732650156e-06, "loss": 0.8005210161209106, "step": 704 }, { "epoch": 1.489451476793249, "grad_norm": 1.5090935230255127, "learning_rate": 3.6289383091610625e-06, "loss": 1.1544265747070312, "step": 706 }, { "epoch": 1.4936708860759493, "grad_norm": 1.969177484512329, "learning_rate": 3.626388153102113e-06, "loss": 1.180321455001831, "step": 708 }, { "epoch": 1.49789029535865, "grad_norm": 1.4724305868148804, "learning_rate": 3.6238302782732446e-06, "loss": 1.0343523025512695, "step": 710 }, { "epoch": 1.50210970464135, "grad_norm": 4.455009937286377, "learning_rate": 3.621264698516166e-06, "loss": 0.48465144634246826, "step": 712 }, { "epoch": 1.5063291139240507, "grad_norm": 2.1380884647369385, "learning_rate": 3.6186914277142776e-06, "loss": 1.1161589622497559, "step": 714 }, { "epoch": 1.510548523206751, "grad_norm": 3.7489266395568848, "learning_rate": 3.6161104797926013e-06, "loss": 1.091984510421753, "step": 716 }, { "epoch": 1.5147679324894514, "grad_norm": 2.2989237308502197, "learning_rate": 3.613521868717703e-06, "loss": 1.1017979383468628, "step": 718 }, { "epoch": 1.518987341772152, "grad_norm": 4.086328506469727, "learning_rate": 3.6109256084976147e-06, "loss": 1.0278382301330566, "step": 720 }, { "epoch": 1.5232067510548524, "grad_norm": 4.82416296005249, "learning_rate": 3.608321713181764e-06, "loss": 1.198899745941162, "step": 722 }, { "epoch": 1.5274261603375527, "grad_norm": 2.247619867324829, "learning_rate": 3.6057101968608936e-06, "loss": 1.2113308906555176, "step": 724 }, { "epoch": 1.5316455696202531, "grad_norm": 5.557096004486084, "learning_rate": 3.603091073666987e-06, "loss": 0.5562316179275513, "step": 726 }, { "epoch": 1.5358649789029535, "grad_norm": 8.159991264343262, "learning_rate": 3.600464357773191e-06, "loss": 0.414279580116272, "step": 728 }, { "epoch": 1.540084388185654, "grad_norm": 2.0832576751708984, "learning_rate": 3.5978300633937403e-06, "loss": 0.9449454545974731, "step": 730 }, { "epoch": 1.5443037974683544, "grad_norm": 2.1067464351654053, "learning_rate": 3.5951882047838798e-06, "loss": 0.9659292101860046, "step": 732 }, { "epoch": 1.5485232067510548, "grad_norm": 1.711477518081665, "learning_rate": 3.5925387962397866e-06, "loss": 1.1613965034484863, "step": 734 }, { "epoch": 1.5527426160337554, "grad_norm": 3.1845133304595947, "learning_rate": 3.589881852098495e-06, "loss": 0.864007830619812, "step": 736 }, { "epoch": 1.5569620253164556, "grad_norm": 3.9110360145568848, "learning_rate": 3.5872173867378177e-06, "loss": 0.902462363243103, "step": 738 }, { "epoch": 1.5611814345991561, "grad_norm": 3.437896490097046, "learning_rate": 3.5845454145762657e-06, "loss": 1.0834063291549683, "step": 740 }, { "epoch": 1.5654008438818565, "grad_norm": 1.5851118564605713, "learning_rate": 3.5818659500729735e-06, "loss": 0.7697902917861938, "step": 742 }, { "epoch": 1.5696202531645569, "grad_norm": 7.4633588790893555, "learning_rate": 3.5791790077276214e-06, "loss": 0.5523649454116821, "step": 744 }, { "epoch": 1.5738396624472575, "grad_norm": 1.9582291841506958, "learning_rate": 3.576484602080352e-06, "loss": 0.6860834360122681, "step": 746 }, { "epoch": 1.5780590717299579, "grad_norm": 3.9132864475250244, "learning_rate": 3.573782747711697e-06, "loss": 0.6468961834907532, "step": 748 }, { "epoch": 1.5822784810126582, "grad_norm": 2.304565906524658, "learning_rate": 3.571073459242498e-06, "loss": 1.1524250507354736, "step": 750 }, { "epoch": 1.5864978902953588, "grad_norm": 2.1101715564727783, "learning_rate": 3.56835675133382e-06, "loss": 0.7160176038742065, "step": 752 }, { "epoch": 1.590717299578059, "grad_norm": 2.8462789058685303, "learning_rate": 3.565632638686884e-06, "loss": 0.7810688018798828, "step": 754 }, { "epoch": 1.5949367088607596, "grad_norm": 2.3834588527679443, "learning_rate": 3.562901136042977e-06, "loss": 0.6207853555679321, "step": 756 }, { "epoch": 1.59915611814346, "grad_norm": 3.6158013343811035, "learning_rate": 3.560162258183377e-06, "loss": 0.8702360987663269, "step": 758 }, { "epoch": 1.6033755274261603, "grad_norm": 2.5689971446990967, "learning_rate": 3.5574160199292737e-06, "loss": 1.1135127544403076, "step": 760 }, { "epoch": 1.6075949367088609, "grad_norm": 1.0458358526229858, "learning_rate": 3.5546624361416855e-06, "loss": 0.7249690294265747, "step": 762 }, { "epoch": 1.611814345991561, "grad_norm": 1.9451916217803955, "learning_rate": 3.55190152172138e-06, "loss": 1.1511328220367432, "step": 764 }, { "epoch": 1.6160337552742616, "grad_norm": 3.351893901824951, "learning_rate": 3.549133291608796e-06, "loss": 1.0460021495819092, "step": 766 }, { "epoch": 1.620253164556962, "grad_norm": 4.358265399932861, "learning_rate": 3.5463577607839588e-06, "loss": 0.9370321035385132, "step": 768 }, { "epoch": 1.6244725738396624, "grad_norm": 3.3822832107543945, "learning_rate": 3.5435749442664016e-06, "loss": 1.1469030380249023, "step": 770 }, { "epoch": 1.628691983122363, "grad_norm": 2.77669358253479, "learning_rate": 3.540784857115084e-06, "loss": 1.1965186595916748, "step": 772 }, { "epoch": 1.6329113924050633, "grad_norm": 2.8289971351623535, "learning_rate": 3.537987514428307e-06, "loss": 1.1629645824432373, "step": 774 }, { "epoch": 1.6371308016877637, "grad_norm": 2.216648817062378, "learning_rate": 3.535182931343638e-06, "loss": 1.1647021770477295, "step": 776 }, { "epoch": 1.6413502109704643, "grad_norm": 8.33935546875, "learning_rate": 3.5323711230378236e-06, "loss": 0.8370733261108398, "step": 778 }, { "epoch": 1.6455696202531644, "grad_norm": 11.60359001159668, "learning_rate": 3.5295521047267085e-06, "loss": 0.3443516492843628, "step": 780 }, { "epoch": 1.649789029535865, "grad_norm": 2.730212688446045, "learning_rate": 3.5267258916651543e-06, "loss": 1.091811180114746, "step": 782 }, { "epoch": 1.6540084388185654, "grad_norm": 4.5352888107299805, "learning_rate": 3.5238924991469567e-06, "loss": 0.916614830493927, "step": 784 }, { "epoch": 1.6582278481012658, "grad_norm": 11.15390682220459, "learning_rate": 3.5210519425047618e-06, "loss": 1.0898263454437256, "step": 786 }, { "epoch": 1.6624472573839664, "grad_norm": 4.555877208709717, "learning_rate": 3.518204237109983e-06, "loss": 0.5768306255340576, "step": 788 }, { "epoch": 1.6666666666666665, "grad_norm": 1.4780551195144653, "learning_rate": 3.51534939837272e-06, "loss": 1.1375391483306885, "step": 790 }, { "epoch": 1.6708860759493671, "grad_norm": 6.170176982879639, "learning_rate": 3.5124874417416734e-06, "loss": 0.6376422643661499, "step": 792 }, { "epoch": 1.6751054852320675, "grad_norm": 0.6833944916725159, "learning_rate": 3.509618382704061e-06, "loss": 0.900169849395752, "step": 794 }, { "epoch": 1.6793248945147679, "grad_norm": 1.5629899501800537, "learning_rate": 3.5067422367855364e-06, "loss": 1.1173095703125, "step": 796 }, { "epoch": 1.6835443037974684, "grad_norm": 3.2922439575195312, "learning_rate": 3.5038590195501006e-06, "loss": 0.8964512348175049, "step": 798 }, { "epoch": 1.6877637130801688, "grad_norm": 4.654068470001221, "learning_rate": 3.5009687466000224e-06, "loss": 1.2155747413635254, "step": 800 }, { "epoch": 1.6919831223628692, "grad_norm": 7.5080437660217285, "learning_rate": 3.498071433575751e-06, "loss": 0.5988451242446899, "step": 802 }, { "epoch": 1.6962025316455698, "grad_norm": 1.9102202653884888, "learning_rate": 3.495167096155834e-06, "loss": 1.2323973178863525, "step": 804 }, { "epoch": 1.70042194092827, "grad_norm": 3.7390973567962646, "learning_rate": 3.4922557500568272e-06, "loss": 1.1244511604309082, "step": 806 }, { "epoch": 1.7046413502109705, "grad_norm": 32.09159851074219, "learning_rate": 3.489337411033217e-06, "loss": 0.8318772912025452, "step": 808 }, { "epoch": 1.7088607594936709, "grad_norm": 3.8693466186523438, "learning_rate": 3.48641209487733e-06, "loss": 0.8628280758857727, "step": 810 }, { "epoch": 1.7130801687763713, "grad_norm": 2.1568024158477783, "learning_rate": 3.4834798174192476e-06, "loss": 1.1509721279144287, "step": 812 }, { "epoch": 1.7172995780590719, "grad_norm": 6.118010997772217, "learning_rate": 3.4805405945267245e-06, "loss": 1.4755480289459229, "step": 814 }, { "epoch": 1.721518987341772, "grad_norm": 1.7534123659133911, "learning_rate": 3.4775944421050976e-06, "loss": 1.1487780809402466, "step": 816 }, { "epoch": 1.7257383966244726, "grad_norm": 12.342169761657715, "learning_rate": 3.4746413760972033e-06, "loss": 0.8102009296417236, "step": 818 }, { "epoch": 1.729957805907173, "grad_norm": 7.229720115661621, "learning_rate": 3.4716814124832895e-06, "loss": 0.38379400968551636, "step": 820 }, { "epoch": 1.7341772151898733, "grad_norm": 5.0790886878967285, "learning_rate": 3.468714567280931e-06, "loss": 0.6532369256019592, "step": 822 }, { "epoch": 1.738396624472574, "grad_norm": 9.148484230041504, "learning_rate": 3.4657408565449413e-06, "loss": 0.7519415616989136, "step": 824 }, { "epoch": 1.7426160337552743, "grad_norm": 3.0881879329681396, "learning_rate": 3.4627602963672854e-06, "loss": 0.9758714437484741, "step": 826 }, { "epoch": 1.7468354430379747, "grad_norm": 4.036842346191406, "learning_rate": 3.459772902876994e-06, "loss": 0.9723775386810303, "step": 828 }, { "epoch": 1.7510548523206753, "grad_norm": 2.8862991333007812, "learning_rate": 3.4567786922400757e-06, "loss": 1.1287617683410645, "step": 830 }, { "epoch": 1.7552742616033754, "grad_norm": 2.3025224208831787, "learning_rate": 3.4537776806594293e-06, "loss": 1.1016814708709717, "step": 832 }, { "epoch": 1.759493670886076, "grad_norm": 2.3911185264587402, "learning_rate": 3.4507698843747567e-06, "loss": 0.8698973655700684, "step": 834 }, { "epoch": 1.7637130801687764, "grad_norm": 2.9084486961364746, "learning_rate": 3.4477553196624734e-06, "loss": 1.1183581352233887, "step": 836 }, { "epoch": 1.7679324894514767, "grad_norm": 2.349198579788208, "learning_rate": 3.444734002835624e-06, "loss": 1.0136666297912598, "step": 838 }, { "epoch": 1.7721518987341773, "grad_norm": 3.0843915939331055, "learning_rate": 3.441705950243789e-06, "loss": 0.8606936931610107, "step": 840 }, { "epoch": 1.7763713080168775, "grad_norm": 3.7463226318359375, "learning_rate": 3.4386711782729996e-06, "loss": 0.9574577808380127, "step": 842 }, { "epoch": 1.780590717299578, "grad_norm": 1.5658468008041382, "learning_rate": 3.4356297033456496e-06, "loss": 0.46850845217704773, "step": 844 }, { "epoch": 1.7848101265822784, "grad_norm": 1.2881762981414795, "learning_rate": 3.432581541920404e-06, "loss": 0.7656896114349365, "step": 846 }, { "epoch": 1.7890295358649788, "grad_norm": 4.494737148284912, "learning_rate": 3.429526710492111e-06, "loss": 0.6177375912666321, "step": 848 }, { "epoch": 1.7932489451476794, "grad_norm": 2.9015707969665527, "learning_rate": 3.426465225591713e-06, "loss": 0.8043622374534607, "step": 850 }, { "epoch": 1.7974683544303798, "grad_norm": 2.33308482170105, "learning_rate": 3.4233971037861587e-06, "loss": 1.1262691020965576, "step": 852 }, { "epoch": 1.8016877637130801, "grad_norm": 3.47825026512146, "learning_rate": 3.4203223616783097e-06, "loss": 1.4144643545150757, "step": 854 }, { "epoch": 1.8059071729957807, "grad_norm": 0.613185703754425, "learning_rate": 3.4172410159068545e-06, "loss": 0.9285470247268677, "step": 856 }, { "epoch": 1.810126582278481, "grad_norm": 4.0782623291015625, "learning_rate": 3.414153083146215e-06, "loss": 1.0604450702667236, "step": 858 }, { "epoch": 1.8143459915611815, "grad_norm": 4.575808525085449, "learning_rate": 3.411058580106458e-06, "loss": 0.7167332172393799, "step": 860 }, { "epoch": 1.8185654008438819, "grad_norm": 2.6179590225219727, "learning_rate": 3.4079575235332077e-06, "loss": 1.1503570079803467, "step": 862 }, { "epoch": 1.8227848101265822, "grad_norm": 1.5741914510726929, "learning_rate": 3.4048499302075485e-06, "loss": 1.0776422023773193, "step": 864 }, { "epoch": 1.8270042194092828, "grad_norm": 2.200496196746826, "learning_rate": 3.40173581694594e-06, "loss": 1.0765013694763184, "step": 866 }, { "epoch": 1.831223628691983, "grad_norm": 7.11644172668457, "learning_rate": 3.3986152006001233e-06, "loss": 0.9683362245559692, "step": 868 }, { "epoch": 1.8354430379746836, "grad_norm": 2.3128275871276855, "learning_rate": 3.3954880980570296e-06, "loss": 1.044558048248291, "step": 870 }, { "epoch": 1.839662447257384, "grad_norm": 10.811915397644043, "learning_rate": 3.392354526238691e-06, "loss": 0.8069396615028381, "step": 872 }, { "epoch": 1.8438818565400843, "grad_norm": 2.664677858352661, "learning_rate": 3.3892145021021462e-06, "loss": 0.9714232683181763, "step": 874 }, { "epoch": 1.8481012658227849, "grad_norm": 2.777123212814331, "learning_rate": 3.3860680426393515e-06, "loss": 1.1506626605987549, "step": 876 }, { "epoch": 1.8523206751054853, "grad_norm": 4.2269368171691895, "learning_rate": 3.3829151648770855e-06, "loss": 0.8257066011428833, "step": 878 }, { "epoch": 1.8565400843881856, "grad_norm": 3.8701000213623047, "learning_rate": 3.3797558858768593e-06, "loss": 0.7449560761451721, "step": 880 }, { "epoch": 1.8607594936708862, "grad_norm": 3.4201698303222656, "learning_rate": 3.3765902227348255e-06, "loss": 1.0331380367279053, "step": 882 }, { "epoch": 1.8649789029535864, "grad_norm": 3.0394904613494873, "learning_rate": 3.3734181925816826e-06, "loss": 0.7403502464294434, "step": 884 }, { "epoch": 1.869198312236287, "grad_norm": 2.232851266860962, "learning_rate": 3.370239812582583e-06, "loss": 0.7928322553634644, "step": 886 }, { "epoch": 1.8734177215189873, "grad_norm": 1.918642282485962, "learning_rate": 3.367055099937041e-06, "loss": 1.0973682403564453, "step": 888 }, { "epoch": 1.8776371308016877, "grad_norm": 4.839916229248047, "learning_rate": 3.3638640718788406e-06, "loss": 0.5104875564575195, "step": 890 }, { "epoch": 1.8818565400843883, "grad_norm": 7.4713239669799805, "learning_rate": 3.3606667456759397e-06, "loss": 0.7245833873748779, "step": 892 }, { "epoch": 1.8860759493670884, "grad_norm": 2.0137648582458496, "learning_rate": 3.3574631386303797e-06, "loss": 1.1190528869628906, "step": 894 }, { "epoch": 1.890295358649789, "grad_norm": 1.844823956489563, "learning_rate": 3.3542532680781876e-06, "loss": 1.3033103942871094, "step": 896 }, { "epoch": 1.8945147679324894, "grad_norm": 9.570866584777832, "learning_rate": 3.351037151389287e-06, "loss": 0.8090759515762329, "step": 898 }, { "epoch": 1.8987341772151898, "grad_norm": 5.827152252197266, "learning_rate": 3.3478148059674016e-06, "loss": 1.06083083152771, "step": 900 }, { "epoch": 1.9029535864978904, "grad_norm": 4.6404595375061035, "learning_rate": 3.3445862492499595e-06, "loss": 1.226179838180542, "step": 902 }, { "epoch": 1.9071729957805907, "grad_norm": 4.473128318786621, "learning_rate": 3.3413514987080043e-06, "loss": 1.0048933029174805, "step": 904 }, { "epoch": 1.9113924050632911, "grad_norm": 2.088918924331665, "learning_rate": 3.338110571846093e-06, "loss": 1.325439214706421, "step": 906 }, { "epoch": 1.9156118143459917, "grad_norm": 7.492137432098389, "learning_rate": 3.3348634862022074e-06, "loss": 0.5317611694335938, "step": 908 }, { "epoch": 1.9198312236286919, "grad_norm": 5.070749759674072, "learning_rate": 3.331610259347657e-06, "loss": 1.0684950351715088, "step": 910 }, { "epoch": 1.9240506329113924, "grad_norm": 4.511446952819824, "learning_rate": 3.328350908886983e-06, "loss": 0.8111604452133179, "step": 912 }, { "epoch": 1.9282700421940928, "grad_norm": 9.428959846496582, "learning_rate": 3.3250854524578636e-06, "loss": 1.1320171356201172, "step": 914 }, { "epoch": 1.9324894514767932, "grad_norm": 13.564945220947266, "learning_rate": 3.3218139077310206e-06, "loss": 0.8104444742202759, "step": 916 }, { "epoch": 1.9367088607594938, "grad_norm": 2.054192543029785, "learning_rate": 3.3185362924101207e-06, "loss": 1.0631756782531738, "step": 918 }, { "epoch": 1.9409282700421941, "grad_norm": 3.2311954498291016, "learning_rate": 3.315252624231682e-06, "loss": 0.5999157428741455, "step": 920 }, { "epoch": 1.9451476793248945, "grad_norm": 1.8943932056427002, "learning_rate": 3.3119629209649763e-06, "loss": 1.0982520580291748, "step": 922 }, { "epoch": 1.9493670886075949, "grad_norm": 1.940902590751648, "learning_rate": 3.3086672004119335e-06, "loss": 1.226811408996582, "step": 924 }, { "epoch": 1.9535864978902953, "grad_norm": 3.3977231979370117, "learning_rate": 3.305365480407046e-06, "loss": 0.9012327194213867, "step": 926 }, { "epoch": 1.9578059071729959, "grad_norm": 3.1414709091186523, "learning_rate": 3.3020577788172725e-06, "loss": 0.7510135173797607, "step": 928 }, { "epoch": 1.9620253164556962, "grad_norm": 2.9762823581695557, "learning_rate": 3.2987441135419394e-06, "loss": 1.1897534132003784, "step": 930 }, { "epoch": 1.9662447257383966, "grad_norm": 3.8375062942504883, "learning_rate": 3.2954245025126446e-06, "loss": 0.9271247982978821, "step": 932 }, { "epoch": 1.9704641350210972, "grad_norm": 1.9467542171478271, "learning_rate": 3.292098963693163e-06, "loss": 1.2084356546401978, "step": 934 }, { "epoch": 1.9746835443037973, "grad_norm": 2.952320098876953, "learning_rate": 3.2887675150793443e-06, "loss": 1.1498595476150513, "step": 936 }, { "epoch": 1.978902953586498, "grad_norm": 1.555445909500122, "learning_rate": 3.2854301746990206e-06, "loss": 0.8107820749282837, "step": 938 }, { "epoch": 1.9831223628691983, "grad_norm": 1.9152470827102661, "learning_rate": 3.2820869606119068e-06, "loss": 1.1318726539611816, "step": 940 }, { "epoch": 1.9873417721518987, "grad_norm": 3.219928026199341, "learning_rate": 3.278737890909502e-06, "loss": 0.9334742426872253, "step": 942 }, { "epoch": 1.9915611814345993, "grad_norm": 1.992208480834961, "learning_rate": 3.275382983714992e-06, "loss": 0.7602829933166504, "step": 944 }, { "epoch": 1.9957805907172996, "grad_norm": 2.6617956161499023, "learning_rate": 3.272022257183153e-06, "loss": 1.0931661128997803, "step": 946 }, { "epoch": 2.0, "grad_norm": 12.275853157043457, "learning_rate": 3.268655729500251e-06, "loss": 0.5812578797340393, "step": 948 }, { "epoch": 2.0042194092827006, "grad_norm": 3.4581050872802734, "learning_rate": 3.265283418883945e-06, "loss": 0.8604273200035095, "step": 950 }, { "epoch": 2.0084388185654007, "grad_norm": 5.053099155426025, "learning_rate": 3.2619053435831878e-06, "loss": 0.6394712924957275, "step": 952 }, { "epoch": 2.0126582278481013, "grad_norm": 2.949049711227417, "learning_rate": 3.258521521878126e-06, "loss": 0.8134095072746277, "step": 954 }, { "epoch": 2.0168776371308015, "grad_norm": 3.0072250366210938, "learning_rate": 3.2551319720800043e-06, "loss": 0.9163396954536438, "step": 956 }, { "epoch": 2.021097046413502, "grad_norm": 4.474330902099609, "learning_rate": 3.251736712531063e-06, "loss": 0.7234617471694946, "step": 958 }, { "epoch": 2.0253164556962027, "grad_norm": 3.9642207622528076, "learning_rate": 3.2483357616044418e-06, "loss": 0.7650543451309204, "step": 960 }, { "epoch": 2.029535864978903, "grad_norm": 4.6968793869018555, "learning_rate": 3.244929137704076e-06, "loss": 1.1930127143859863, "step": 962 }, { "epoch": 2.0337552742616034, "grad_norm": 1.5408298969268799, "learning_rate": 3.241516859264602e-06, "loss": 0.7401737570762634, "step": 964 }, { "epoch": 2.037974683544304, "grad_norm": 4.210058689117432, "learning_rate": 3.238098944751256e-06, "loss": 0.756514310836792, "step": 966 }, { "epoch": 2.042194092827004, "grad_norm": 3.6998515129089355, "learning_rate": 3.23467541265977e-06, "loss": 0.750130295753479, "step": 968 }, { "epoch": 2.0464135021097047, "grad_norm": 2.7548975944519043, "learning_rate": 3.2312462815162777e-06, "loss": 1.0819189548492432, "step": 970 }, { "epoch": 2.050632911392405, "grad_norm": 4.967726707458496, "learning_rate": 3.2278115698772116e-06, "loss": 0.923316240310669, "step": 972 }, { "epoch": 2.0548523206751055, "grad_norm": 2.2812294960021973, "learning_rate": 3.2243712963292003e-06, "loss": 0.8755730390548706, "step": 974 }, { "epoch": 2.059071729957806, "grad_norm": 3.7565250396728516, "learning_rate": 3.2209254794889724e-06, "loss": 0.6916130781173706, "step": 976 }, { "epoch": 2.0632911392405062, "grad_norm": 2.0674679279327393, "learning_rate": 3.2174741380032523e-06, "loss": 0.6281135082244873, "step": 978 }, { "epoch": 2.067510548523207, "grad_norm": 3.7574315071105957, "learning_rate": 3.2140172905486612e-06, "loss": 0.7170443534851074, "step": 980 }, { "epoch": 2.071729957805907, "grad_norm": 3.4279699325561523, "learning_rate": 3.210554955831615e-06, "loss": 1.0432848930358887, "step": 982 }, { "epoch": 2.0759493670886076, "grad_norm": 2.687915802001953, "learning_rate": 3.207087152588224e-06, "loss": 0.9696755409240723, "step": 984 }, { "epoch": 2.080168776371308, "grad_norm": 2.2797346115112305, "learning_rate": 3.203613899584189e-06, "loss": 1.0136628150939941, "step": 986 }, { "epoch": 2.0843881856540083, "grad_norm": 2.3300132751464844, "learning_rate": 3.2001352156147045e-06, "loss": 1.0422950983047485, "step": 988 }, { "epoch": 2.088607594936709, "grad_norm": 6.217328071594238, "learning_rate": 3.1966511195043527e-06, "loss": 0.5632253289222717, "step": 990 }, { "epoch": 2.0928270042194095, "grad_norm": 2.278618335723877, "learning_rate": 3.193161630107003e-06, "loss": 0.5706143379211426, "step": 992 }, { "epoch": 2.0970464135021096, "grad_norm": 2.097888946533203, "learning_rate": 3.18966676630571e-06, "loss": 1.1316472291946411, "step": 994 }, { "epoch": 2.1012658227848102, "grad_norm": 4.8473286628723145, "learning_rate": 3.186166547012612e-06, "loss": 1.068217158317566, "step": 996 }, { "epoch": 2.1054852320675104, "grad_norm": 1.3159743547439575, "learning_rate": 3.1826609911688273e-06, "loss": 0.643653154373169, "step": 998 }, { "epoch": 2.109704641350211, "grad_norm": 2.744520425796509, "learning_rate": 3.1791501177443533e-06, "loss": 1.1834640502929688, "step": 1000 }, { "epoch": 2.1139240506329116, "grad_norm": 5.579896926879883, "learning_rate": 3.1756339457379626e-06, "loss": 1.023376703262329, "step": 1002 }, { "epoch": 2.1181434599156117, "grad_norm": 2.515099048614502, "learning_rate": 3.1721124941771005e-06, "loss": 1.092795491218567, "step": 1004 }, { "epoch": 2.1223628691983123, "grad_norm": 1.9233348369598389, "learning_rate": 3.1685857821177832e-06, "loss": 0.6104440689086914, "step": 1006 }, { "epoch": 2.1265822784810124, "grad_norm": 1.7998379468917847, "learning_rate": 3.1650538286444902e-06, "loss": 0.7144567966461182, "step": 1008 }, { "epoch": 2.130801687763713, "grad_norm": 1.6687654256820679, "learning_rate": 3.16151665287007e-06, "loss": 0.6989231109619141, "step": 1010 }, { "epoch": 2.1350210970464136, "grad_norm": 3.730558156967163, "learning_rate": 3.1579742739356252e-06, "loss": 0.8780606985092163, "step": 1012 }, { "epoch": 2.1392405063291138, "grad_norm": 3.9646623134613037, "learning_rate": 3.154426711010419e-06, "loss": 1.304856300354004, "step": 1014 }, { "epoch": 2.1434599156118144, "grad_norm": 4.966225624084473, "learning_rate": 3.1508739832917664e-06, "loss": 0.5962163209915161, "step": 1016 }, { "epoch": 2.147679324894515, "grad_norm": 3.8472814559936523, "learning_rate": 3.147316110004929e-06, "loss": 0.8961644768714905, "step": 1018 }, { "epoch": 2.151898734177215, "grad_norm": 16.210412979125977, "learning_rate": 3.1437531104030172e-06, "loss": 0.7574584484100342, "step": 1020 }, { "epoch": 2.1561181434599157, "grad_norm": 6.170048713684082, "learning_rate": 3.1401850037668773e-06, "loss": 0.8245753049850464, "step": 1022 }, { "epoch": 2.160337552742616, "grad_norm": 7.539897918701172, "learning_rate": 3.1366118094049962e-06, "loss": 0.8227906227111816, "step": 1024 }, { "epoch": 2.1645569620253164, "grad_norm": 2.6890225410461426, "learning_rate": 3.133033546653389e-06, "loss": 1.0590184926986694, "step": 1026 }, { "epoch": 2.168776371308017, "grad_norm": 2.2687880992889404, "learning_rate": 3.129450234875501e-06, "loss": 1.1196215152740479, "step": 1028 }, { "epoch": 2.172995780590717, "grad_norm": 1.5322057008743286, "learning_rate": 3.1258618934620977e-06, "loss": 1.0350878238677979, "step": 1030 }, { "epoch": 2.1772151898734178, "grad_norm": 1.7896466255187988, "learning_rate": 3.1222685418311624e-06, "loss": 1.0621168613433838, "step": 1032 }, { "epoch": 2.181434599156118, "grad_norm": 1.9826382398605347, "learning_rate": 3.1186701994277913e-06, "loss": 1.0807254314422607, "step": 1034 }, { "epoch": 2.1856540084388185, "grad_norm": 2.4833922386169434, "learning_rate": 3.115066885724087e-06, "loss": 1.0103787183761597, "step": 1036 }, { "epoch": 2.189873417721519, "grad_norm": 4.183322429656982, "learning_rate": 3.111458620219056e-06, "loss": 1.0446069240570068, "step": 1038 }, { "epoch": 2.1940928270042193, "grad_norm": 5.676382064819336, "learning_rate": 3.107845422438497e-06, "loss": 1.1852116584777832, "step": 1040 }, { "epoch": 2.19831223628692, "grad_norm": 12.257451057434082, "learning_rate": 3.1042273119349024e-06, "loss": 0.3302527964115143, "step": 1042 }, { "epoch": 2.2025316455696204, "grad_norm": 1.8637685775756836, "learning_rate": 3.10060430828735e-06, "loss": 1.0095632076263428, "step": 1044 }, { "epoch": 2.2067510548523206, "grad_norm": 6.286106109619141, "learning_rate": 3.0969764311013927e-06, "loss": 0.6037812232971191, "step": 1046 }, { "epoch": 2.210970464135021, "grad_norm": 2.026481866836548, "learning_rate": 3.09334370000896e-06, "loss": 0.8940553665161133, "step": 1048 }, { "epoch": 2.2151898734177213, "grad_norm": 2.958310604095459, "learning_rate": 3.089706134668245e-06, "loss": 1.070237636566162, "step": 1050 }, { "epoch": 2.219409282700422, "grad_norm": 5.202909469604492, "learning_rate": 3.0860637547636023e-06, "loss": 0.9080023765563965, "step": 1052 }, { "epoch": 2.2236286919831225, "grad_norm": 4.214676856994629, "learning_rate": 3.082416580005441e-06, "loss": 0.9310380220413208, "step": 1054 }, { "epoch": 2.2278481012658227, "grad_norm": 4.913782119750977, "learning_rate": 3.0787646301301143e-06, "loss": 0.8610812425613403, "step": 1056 }, { "epoch": 2.2320675105485233, "grad_norm": 11.496319770812988, "learning_rate": 3.0751079248998183e-06, "loss": 0.5102381706237793, "step": 1058 }, { "epoch": 2.2362869198312234, "grad_norm": 2.501431703567505, "learning_rate": 3.0714464841024817e-06, "loss": 1.026395559310913, "step": 1060 }, { "epoch": 2.240506329113924, "grad_norm": 1.0209457874298096, "learning_rate": 3.067780327551658e-06, "loss": 0.7514087557792664, "step": 1062 }, { "epoch": 2.2447257383966246, "grad_norm": 10.08558464050293, "learning_rate": 3.06410947508642e-06, "loss": 0.4998623728752136, "step": 1064 }, { "epoch": 2.2489451476793247, "grad_norm": 2.017042875289917, "learning_rate": 3.060433946571253e-06, "loss": 0.9955783486366272, "step": 1066 }, { "epoch": 2.2531645569620253, "grad_norm": 3.0692787170410156, "learning_rate": 3.0567537618959453e-06, "loss": 1.24436616897583, "step": 1068 }, { "epoch": 2.257383966244726, "grad_norm": 2.2183597087860107, "learning_rate": 3.0530689409754826e-06, "loss": 1.1389007568359375, "step": 1070 }, { "epoch": 2.261603375527426, "grad_norm": 3.1245839595794678, "learning_rate": 3.0493795037499374e-06, "loss": 1.1064579486846924, "step": 1072 }, { "epoch": 2.2658227848101267, "grad_norm": 5.401794910430908, "learning_rate": 3.0456854701843647e-06, "loss": 1.280016303062439, "step": 1074 }, { "epoch": 2.270042194092827, "grad_norm": 2.5527584552764893, "learning_rate": 3.041986860268693e-06, "loss": 1.0337902307510376, "step": 1076 }, { "epoch": 2.2742616033755274, "grad_norm": 1.6811496019363403, "learning_rate": 3.0382836940176112e-06, "loss": 0.7087812423706055, "step": 1078 }, { "epoch": 2.278481012658228, "grad_norm": 4.886277675628662, "learning_rate": 3.034575991470468e-06, "loss": 0.8468987941741943, "step": 1080 }, { "epoch": 2.282700421940928, "grad_norm": 10.467023849487305, "learning_rate": 3.03086377269116e-06, "loss": 0.46134668588638306, "step": 1082 }, { "epoch": 2.2869198312236287, "grad_norm": 4.281970500946045, "learning_rate": 3.027147057768022e-06, "loss": 0.6730149984359741, "step": 1084 }, { "epoch": 2.291139240506329, "grad_norm": 1.6377662420272827, "learning_rate": 3.023425866813718e-06, "loss": 0.5801299810409546, "step": 1086 }, { "epoch": 2.2953586497890295, "grad_norm": 4.013052940368652, "learning_rate": 3.0197002199651353e-06, "loss": 0.900696873664856, "step": 1088 }, { "epoch": 2.29957805907173, "grad_norm": 1.2075470685958862, "learning_rate": 3.015970137383273e-06, "loss": 0.557762861251831, "step": 1090 }, { "epoch": 2.3037974683544302, "grad_norm": 6.79136848449707, "learning_rate": 3.0122356392531345e-06, "loss": 0.8252531290054321, "step": 1092 }, { "epoch": 2.308016877637131, "grad_norm": 1.973429560661316, "learning_rate": 3.008496745783617e-06, "loss": 0.6639243364334106, "step": 1094 }, { "epoch": 2.3122362869198314, "grad_norm": 5.644299507141113, "learning_rate": 3.0047534772074038e-06, "loss": 0.41757094860076904, "step": 1096 }, { "epoch": 2.3164556962025316, "grad_norm": 4.321779727935791, "learning_rate": 3.001005853780852e-06, "loss": 1.101494550704956, "step": 1098 }, { "epoch": 2.320675105485232, "grad_norm": 2.1912591457366943, "learning_rate": 2.9972538957838848e-06, "loss": 0.9152376055717468, "step": 1100 }, { "epoch": 2.3248945147679323, "grad_norm": 2.042452335357666, "learning_rate": 2.9934976235198827e-06, "loss": 1.0394017696380615, "step": 1102 }, { "epoch": 2.329113924050633, "grad_norm": 1.53744637966156, "learning_rate": 2.989737057315572e-06, "loss": 1.2090572118759155, "step": 1104 }, { "epoch": 2.3333333333333335, "grad_norm": 2.0143048763275146, "learning_rate": 2.9859722175209153e-06, "loss": 0.7863491773605347, "step": 1106 }, { "epoch": 2.3375527426160336, "grad_norm": 10.555294036865234, "learning_rate": 2.9822031245090002e-06, "loss": 0.5064557790756226, "step": 1108 }, { "epoch": 2.3417721518987342, "grad_norm": 3.0460026264190674, "learning_rate": 2.978429798675931e-06, "loss": 1.0185744762420654, "step": 1110 }, { "epoch": 2.3459915611814344, "grad_norm": 1.6025739908218384, "learning_rate": 2.97465226044072e-06, "loss": 1.0687915086746216, "step": 1112 }, { "epoch": 2.350210970464135, "grad_norm": 2.336373805999756, "learning_rate": 2.9708705302451697e-06, "loss": 1.1018157005310059, "step": 1114 }, { "epoch": 2.3544303797468356, "grad_norm": 1.2120983600616455, "learning_rate": 2.96708462855377e-06, "loss": 0.6393563747406006, "step": 1116 }, { "epoch": 2.3586497890295357, "grad_norm": 5.554210186004639, "learning_rate": 2.9632945758535847e-06, "loss": 0.9500521421432495, "step": 1118 }, { "epoch": 2.3628691983122363, "grad_norm": 13.489524841308594, "learning_rate": 2.9595003926541398e-06, "loss": 0.6889848709106445, "step": 1120 }, { "epoch": 2.367088607594937, "grad_norm": 6.1560187339782715, "learning_rate": 2.9557020994873125e-06, "loss": 0.9626091718673706, "step": 1122 }, { "epoch": 2.371308016877637, "grad_norm": 1.836715579032898, "learning_rate": 2.951899716907221e-06, "loss": 0.5855181217193604, "step": 1124 }, { "epoch": 2.3755274261603376, "grad_norm": 1.9696272611618042, "learning_rate": 2.9480932654901142e-06, "loss": 0.8846515417098999, "step": 1126 }, { "epoch": 2.379746835443038, "grad_norm": 2.0595052242279053, "learning_rate": 2.944282765834257e-06, "loss": 1.0026812553405762, "step": 1128 }, { "epoch": 2.3839662447257384, "grad_norm": 8.984773635864258, "learning_rate": 2.9404682385598225e-06, "loss": 0.4564356803894043, "step": 1130 }, { "epoch": 2.388185654008439, "grad_norm": 9.524094581604004, "learning_rate": 2.9366497043087794e-06, "loss": 0.3366748094558716, "step": 1132 }, { "epoch": 2.392405063291139, "grad_norm": 2.6163482666015625, "learning_rate": 2.932827183744778e-06, "loss": 0.46002885699272156, "step": 1134 }, { "epoch": 2.3966244725738397, "grad_norm": 7.858697414398193, "learning_rate": 2.929000697553041e-06, "loss": 0.5188404321670532, "step": 1136 }, { "epoch": 2.40084388185654, "grad_norm": 2.04315447807312, "learning_rate": 2.925170266440252e-06, "loss": 1.063408613204956, "step": 1138 }, { "epoch": 2.4050632911392404, "grad_norm": 3.0201163291931152, "learning_rate": 2.921335911134439e-06, "loss": 0.7606229186058044, "step": 1140 }, { "epoch": 2.409282700421941, "grad_norm": 5.318437576293945, "learning_rate": 2.91749765238487e-06, "loss": 0.2792668044567108, "step": 1142 }, { "epoch": 2.413502109704641, "grad_norm": 1.64540433883667, "learning_rate": 2.9136555109619316e-06, "loss": 0.7836066484451294, "step": 1144 }, { "epoch": 2.4177215189873418, "grad_norm": 7.265844821929932, "learning_rate": 2.9098095076570235e-06, "loss": 1.0778812170028687, "step": 1146 }, { "epoch": 2.4219409282700424, "grad_norm": 4.908560752868652, "learning_rate": 2.9059596632824432e-06, "loss": 0.8231828212738037, "step": 1148 }, { "epoch": 2.4261603375527425, "grad_norm": 3.473619222640991, "learning_rate": 2.902105998671275e-06, "loss": 1.0785859823226929, "step": 1150 }, { "epoch": 2.430379746835443, "grad_norm": 5.009274959564209, "learning_rate": 2.8982485346772733e-06, "loss": 0.6990054845809937, "step": 1152 }, { "epoch": 2.4345991561181437, "grad_norm": 1.6592916250228882, "learning_rate": 2.894387292174754e-06, "loss": 1.1584959030151367, "step": 1154 }, { "epoch": 2.438818565400844, "grad_norm": 1.9908864498138428, "learning_rate": 2.8905222920584814e-06, "loss": 0.3479560613632202, "step": 1156 }, { "epoch": 2.4430379746835444, "grad_norm": 2.59413743019104, "learning_rate": 2.886653555243553e-06, "loss": 0.7740304470062256, "step": 1158 }, { "epoch": 2.4472573839662446, "grad_norm": 3.607126235961914, "learning_rate": 2.882781102665284e-06, "loss": 1.0350849628448486, "step": 1160 }, { "epoch": 2.451476793248945, "grad_norm": 2.7151076793670654, "learning_rate": 2.8789049552791024e-06, "loss": 0.6460145711898804, "step": 1162 }, { "epoch": 2.4556962025316453, "grad_norm": 1.7807066440582275, "learning_rate": 2.8750251340604255e-06, "loss": 1.0453755855560303, "step": 1164 }, { "epoch": 2.459915611814346, "grad_norm": 2.944485664367676, "learning_rate": 2.8711416600045556e-06, "loss": 1.079903483390808, "step": 1166 }, { "epoch": 2.4641350210970465, "grad_norm": 0.9675163626670837, "learning_rate": 2.8672545541265583e-06, "loss": 0.5578194856643677, "step": 1168 }, { "epoch": 2.4683544303797467, "grad_norm": 1.795234203338623, "learning_rate": 2.8633638374611544e-06, "loss": 1.0072107315063477, "step": 1170 }, { "epoch": 2.4725738396624473, "grad_norm": 3.3494741916656494, "learning_rate": 2.8594695310626034e-06, "loss": 1.0281925201416016, "step": 1172 }, { "epoch": 2.476793248945148, "grad_norm": 2.088599920272827, "learning_rate": 2.8555716560045917e-06, "loss": 1.0314571857452393, "step": 1174 }, { "epoch": 2.481012658227848, "grad_norm": 2.605670213699341, "learning_rate": 2.851670233380114e-06, "loss": 0.7644580602645874, "step": 1176 }, { "epoch": 2.4852320675105486, "grad_norm": 13.257305145263672, "learning_rate": 2.8477652843013666e-06, "loss": 0.42062222957611084, "step": 1178 }, { "epoch": 2.489451476793249, "grad_norm": 7.103763103485107, "learning_rate": 2.8438568298996265e-06, "loss": 0.7796779274940491, "step": 1180 }, { "epoch": 2.4936708860759493, "grad_norm": 3.013402223587036, "learning_rate": 2.8399448913251374e-06, "loss": 0.9339659214019775, "step": 1182 }, { "epoch": 2.49789029535865, "grad_norm": 7.224562644958496, "learning_rate": 2.836029489747002e-06, "loss": 0.49434345960617065, "step": 1184 }, { "epoch": 2.50210970464135, "grad_norm": 17.112947463989258, "learning_rate": 2.8321106463530592e-06, "loss": 0.6316568851470947, "step": 1186 }, { "epoch": 2.5063291139240507, "grad_norm": 5.573176383972168, "learning_rate": 2.8281883823497745e-06, "loss": 0.7511799335479736, "step": 1188 }, { "epoch": 2.510548523206751, "grad_norm": 2.383787155151367, "learning_rate": 2.824262718962122e-06, "loss": 1.03713858127594, "step": 1190 }, { "epoch": 2.5147679324894514, "grad_norm": 5.0437116622924805, "learning_rate": 2.820333677433474e-06, "loss": 0.510556697845459, "step": 1192 }, { "epoch": 2.518987341772152, "grad_norm": 6.297809600830078, "learning_rate": 2.816401279025482e-06, "loss": 1.3623912334442139, "step": 1194 }, { "epoch": 2.523206751054852, "grad_norm": 2.4292147159576416, "learning_rate": 2.8124655450179618e-06, "loss": 1.1327567100524902, "step": 1196 }, { "epoch": 2.5274261603375527, "grad_norm": 2.8005106449127197, "learning_rate": 2.808526496708781e-06, "loss": 0.980167031288147, "step": 1198 }, { "epoch": 2.5316455696202533, "grad_norm": 6.94888162612915, "learning_rate": 2.804584155413741e-06, "loss": 0.6094427704811096, "step": 1200 }, { "epoch": 2.5358649789029535, "grad_norm": 2.302324056625366, "learning_rate": 2.8006385424664638e-06, "loss": 0.7884533405303955, "step": 1202 }, { "epoch": 2.540084388185654, "grad_norm": 7.919814586639404, "learning_rate": 2.7966896792182755e-06, "loss": 0.6705489754676819, "step": 1204 }, { "epoch": 2.5443037974683547, "grad_norm": 2.791510581970215, "learning_rate": 2.792737587038092e-06, "loss": 0.9616777300834656, "step": 1206 }, { "epoch": 2.548523206751055, "grad_norm": 5.007606029510498, "learning_rate": 2.7887822873122995e-06, "loss": 0.7277128100395203, "step": 1208 }, { "epoch": 2.5527426160337554, "grad_norm": 2.232788562774658, "learning_rate": 2.7848238014446447e-06, "loss": 1.1262240409851074, "step": 1210 }, { "epoch": 2.5569620253164556, "grad_norm": 3.4404702186584473, "learning_rate": 2.7808621508561123e-06, "loss": 1.0465441942214966, "step": 1212 }, { "epoch": 2.561181434599156, "grad_norm": 8.573604583740234, "learning_rate": 2.776897356984816e-06, "loss": 0.30951395630836487, "step": 1214 }, { "epoch": 2.5654008438818563, "grad_norm": 3.45868182182312, "learning_rate": 2.7729294412858776e-06, "loss": 0.7883036136627197, "step": 1216 }, { "epoch": 2.569620253164557, "grad_norm": 1.7647202014923096, "learning_rate": 2.7689584252313128e-06, "loss": 1.0650732517242432, "step": 1218 }, { "epoch": 2.5738396624472575, "grad_norm": 8.709357261657715, "learning_rate": 2.7649843303099127e-06, "loss": 0.6637066602706909, "step": 1220 }, { "epoch": 2.5780590717299576, "grad_norm": 4.496120929718018, "learning_rate": 2.761007178027132e-06, "loss": 0.9158288240432739, "step": 1222 }, { "epoch": 2.5822784810126582, "grad_norm": 11.006595611572266, "learning_rate": 2.75702698990497e-06, "loss": 0.7496324777603149, "step": 1224 }, { "epoch": 2.586497890295359, "grad_norm": 4.899750232696533, "learning_rate": 2.7530437874818515e-06, "loss": 0.6235587000846863, "step": 1226 }, { "epoch": 2.590717299578059, "grad_norm": 1.8441094160079956, "learning_rate": 2.749057592312515e-06, "loss": 1.0314083099365234, "step": 1228 }, { "epoch": 2.5949367088607596, "grad_norm": 2.7288100719451904, "learning_rate": 2.7450684259678943e-06, "loss": 1.0736459493637085, "step": 1230 }, { "epoch": 2.59915611814346, "grad_norm": 3.8577749729156494, "learning_rate": 2.7410763100350004e-06, "loss": 0.9584764838218689, "step": 1232 }, { "epoch": 2.6033755274261603, "grad_norm": 9.928874969482422, "learning_rate": 2.7370812661168046e-06, "loss": 0.2811320722103119, "step": 1234 }, { "epoch": 2.607594936708861, "grad_norm": 3.457975387573242, "learning_rate": 2.7330833158321267e-06, "loss": 1.1292645931243896, "step": 1236 }, { "epoch": 2.611814345991561, "grad_norm": 6.1282172203063965, "learning_rate": 2.7290824808155096e-06, "loss": 1.2942759990692139, "step": 1238 }, { "epoch": 2.6160337552742616, "grad_norm": 6.050518035888672, "learning_rate": 2.7250787827171085e-06, "loss": 0.7845382690429688, "step": 1240 }, { "epoch": 2.620253164556962, "grad_norm": 2.2712647914886475, "learning_rate": 2.721072243202573e-06, "loss": 0.9927393794059753, "step": 1242 }, { "epoch": 2.6244725738396624, "grad_norm": 12.99117660522461, "learning_rate": 2.7170628839529277e-06, "loss": 0.4361240863800049, "step": 1244 }, { "epoch": 2.628691983122363, "grad_norm": 2.062415599822998, "learning_rate": 2.7130507266644555e-06, "loss": 0.7296593189239502, "step": 1246 }, { "epoch": 2.632911392405063, "grad_norm": 6.197027206420898, "learning_rate": 2.709035793048581e-06, "loss": 1.5014359951019287, "step": 1248 }, { "epoch": 2.6371308016877637, "grad_norm": 1.7749969959259033, "learning_rate": 2.705018104831753e-06, "loss": 1.0191712379455566, "step": 1250 }, { "epoch": 2.6413502109704643, "grad_norm": 3.7179009914398193, "learning_rate": 2.700997683755326e-06, "loss": 0.9707983732223511, "step": 1252 }, { "epoch": 2.6455696202531644, "grad_norm": 7.614749431610107, "learning_rate": 2.6969745515754444e-06, "loss": 0.47567054629325867, "step": 1254 }, { "epoch": 2.649789029535865, "grad_norm": 3.8538355827331543, "learning_rate": 2.6929487300629206e-06, "loss": 0.5580261945724487, "step": 1256 }, { "epoch": 2.6540084388185656, "grad_norm": 3.0637574195861816, "learning_rate": 2.6889202410031237e-06, "loss": 0.9232720136642456, "step": 1258 }, { "epoch": 2.6582278481012658, "grad_norm": 1.9953484535217285, "learning_rate": 2.6848891061958565e-06, "loss": 1.007423996925354, "step": 1260 }, { "epoch": 2.6624472573839664, "grad_norm": 10.962545394897461, "learning_rate": 2.680855347455238e-06, "loss": 1.0483016967773438, "step": 1262 }, { "epoch": 2.6666666666666665, "grad_norm": 2.6327028274536133, "learning_rate": 2.6768189866095867e-06, "loss": 0.5767178535461426, "step": 1264 }, { "epoch": 2.670886075949367, "grad_norm": 5.506629943847656, "learning_rate": 2.6727800455013037e-06, "loss": 0.8919286727905273, "step": 1266 }, { "epoch": 2.6751054852320673, "grad_norm": 1.8910753726959229, "learning_rate": 2.6687385459867514e-06, "loss": 0.7154239416122437, "step": 1268 }, { "epoch": 2.679324894514768, "grad_norm": 4.416780948638916, "learning_rate": 2.6646945099361382e-06, "loss": 0.4701068103313446, "step": 1270 }, { "epoch": 2.6835443037974684, "grad_norm": 1.5386635065078735, "learning_rate": 2.6606479592333965e-06, "loss": 0.9448637962341309, "step": 1272 }, { "epoch": 2.6877637130801686, "grad_norm": 6.68757963180542, "learning_rate": 2.6565989157760678e-06, "loss": 0.735755443572998, "step": 1274 }, { "epoch": 2.691983122362869, "grad_norm": 23.566585540771484, "learning_rate": 2.652547401475184e-06, "loss": 0.8000218868255615, "step": 1276 }, { "epoch": 2.6962025316455698, "grad_norm": 1.7401084899902344, "learning_rate": 2.6484934382551465e-06, "loss": 0.35548001527786255, "step": 1278 }, { "epoch": 2.70042194092827, "grad_norm": 10.348366737365723, "learning_rate": 2.644437048053609e-06, "loss": 0.8879528641700745, "step": 1280 }, { "epoch": 2.7046413502109705, "grad_norm": 2.0043532848358154, "learning_rate": 2.6403782528213577e-06, "loss": 1.076289415359497, "step": 1282 }, { "epoch": 2.708860759493671, "grad_norm": 14.326828956604004, "learning_rate": 2.6363170745221958e-06, "loss": 0.5147005915641785, "step": 1284 }, { "epoch": 2.7130801687763713, "grad_norm": 2.707928419113159, "learning_rate": 2.6322535351328193e-06, "loss": 0.502042293548584, "step": 1286 }, { "epoch": 2.717299578059072, "grad_norm": 1.4950000047683716, "learning_rate": 2.6281876566427034e-06, "loss": 0.6342880129814148, "step": 1288 }, { "epoch": 2.721518987341772, "grad_norm": 0.5780206918716431, "learning_rate": 2.624119461053979e-06, "loss": 0.7421303391456604, "step": 1290 }, { "epoch": 2.7257383966244726, "grad_norm": 1.3298128843307495, "learning_rate": 2.620048970381319e-06, "loss": 0.9955764412879944, "step": 1292 }, { "epoch": 2.7299578059071727, "grad_norm": 2.542677879333496, "learning_rate": 2.6159762066518117e-06, "loss": 0.5678607821464539, "step": 1294 }, { "epoch": 2.7341772151898733, "grad_norm": 2.9699714183807373, "learning_rate": 2.61190119190485e-06, "loss": 1.0441884994506836, "step": 1296 }, { "epoch": 2.738396624472574, "grad_norm": 1.9846669435501099, "learning_rate": 2.607823948192005e-06, "loss": 1.0227396488189697, "step": 1298 }, { "epoch": 2.742616033755274, "grad_norm": 3.1612093448638916, "learning_rate": 2.6037444975769104e-06, "loss": 0.7024236917495728, "step": 1300 }, { "epoch": 2.7468354430379747, "grad_norm": 1.8448959589004517, "learning_rate": 2.5996628621351437e-06, "loss": 1.156023621559143, "step": 1302 }, { "epoch": 2.7510548523206753, "grad_norm": 4.011197566986084, "learning_rate": 2.5955790639541036e-06, "loss": 0.6238597631454468, "step": 1304 }, { "epoch": 2.7552742616033754, "grad_norm": 3.856045961380005, "learning_rate": 2.591493125132893e-06, "loss": 1.281459093093872, "step": 1306 }, { "epoch": 2.759493670886076, "grad_norm": 2.341705083847046, "learning_rate": 2.5874050677821984e-06, "loss": 0.9869955778121948, "step": 1308 }, { "epoch": 2.7637130801687766, "grad_norm": 10.147032737731934, "learning_rate": 2.5833149140241718e-06, "loss": 0.8909780979156494, "step": 1310 }, { "epoch": 2.7679324894514767, "grad_norm": 1.7961941957473755, "learning_rate": 2.579222685992307e-06, "loss": 1.0535545349121094, "step": 1312 }, { "epoch": 2.7721518987341773, "grad_norm": 3.12715482711792, "learning_rate": 2.5751284058313266e-06, "loss": 1.1261003017425537, "step": 1314 }, { "epoch": 2.7763713080168775, "grad_norm": 3.8387131690979004, "learning_rate": 2.5710320956970536e-06, "loss": 0.7698974609375, "step": 1316 }, { "epoch": 2.780590717299578, "grad_norm": 1.3000264167785645, "learning_rate": 2.5669337777562996e-06, "loss": 0.5697190761566162, "step": 1318 }, { "epoch": 2.7848101265822782, "grad_norm": 1.9856594800949097, "learning_rate": 2.5628334741867385e-06, "loss": 1.1043368577957153, "step": 1320 }, { "epoch": 2.789029535864979, "grad_norm": 3.5784945487976074, "learning_rate": 2.5587312071767923e-06, "loss": 0.6595450639724731, "step": 1322 }, { "epoch": 2.7932489451476794, "grad_norm": 5.370586395263672, "learning_rate": 2.554626998925505e-06, "loss": 1.2037230730056763, "step": 1324 }, { "epoch": 2.7974683544303796, "grad_norm": 6.791380882263184, "learning_rate": 2.5505208716424275e-06, "loss": 0.899883508682251, "step": 1326 }, { "epoch": 2.80168776371308, "grad_norm": 1.783818006515503, "learning_rate": 2.5464128475474937e-06, "loss": 0.7012801170349121, "step": 1328 }, { "epoch": 2.8059071729957807, "grad_norm": 1.9667185544967651, "learning_rate": 2.542302948870904e-06, "loss": 1.041996955871582, "step": 1330 }, { "epoch": 2.810126582278481, "grad_norm": 13.571832656860352, "learning_rate": 2.5381911978530006e-06, "loss": 0.9141802787780762, "step": 1332 }, { "epoch": 2.8143459915611815, "grad_norm": 2.473447799682617, "learning_rate": 2.5340776167441508e-06, "loss": 0.5973923206329346, "step": 1334 }, { "epoch": 2.818565400843882, "grad_norm": 1.2413594722747803, "learning_rate": 2.529962227804626e-06, "loss": 0.8588274717330933, "step": 1336 }, { "epoch": 2.8227848101265822, "grad_norm": 5.830739498138428, "learning_rate": 2.525845053304479e-06, "loss": 0.7775506973266602, "step": 1338 }, { "epoch": 2.827004219409283, "grad_norm": 5.612140655517578, "learning_rate": 2.521726115523425e-06, "loss": 0.9469473361968994, "step": 1340 }, { "epoch": 2.831223628691983, "grad_norm": 2.9371390342712402, "learning_rate": 2.517605436750723e-06, "loss": 1.0295050144195557, "step": 1342 }, { "epoch": 2.8354430379746836, "grad_norm": 2.6451170444488525, "learning_rate": 2.513483039285051e-06, "loss": 1.1780718564987183, "step": 1344 }, { "epoch": 2.8396624472573837, "grad_norm": 12.214982032775879, "learning_rate": 2.5093589454343883e-06, "loss": 0.7536942362785339, "step": 1346 }, { "epoch": 2.8438818565400843, "grad_norm": 2.7933950424194336, "learning_rate": 2.505233177515894e-06, "loss": 0.607318639755249, "step": 1348 }, { "epoch": 2.848101265822785, "grad_norm": 2.1484858989715576, "learning_rate": 2.501105757855787e-06, "loss": 1.0892062187194824, "step": 1350 }, { "epoch": 2.852320675105485, "grad_norm": 3.0315003395080566, "learning_rate": 2.4969767087892236e-06, "loss": 0.7782174348831177, "step": 1352 }, { "epoch": 2.8565400843881856, "grad_norm": 2.569249153137207, "learning_rate": 2.492846052660178e-06, "loss": 0.8103134632110596, "step": 1354 }, { "epoch": 2.8607594936708862, "grad_norm": 8.901324272155762, "learning_rate": 2.4887138118213206e-06, "loss": 0.5044631361961365, "step": 1356 }, { "epoch": 2.8649789029535864, "grad_norm": 2.725210428237915, "learning_rate": 2.4845800086338972e-06, "loss": 1.0778303146362305, "step": 1358 }, { "epoch": 2.869198312236287, "grad_norm": 3.4764597415924072, "learning_rate": 2.4804446654676076e-06, "loss": 0.8491913080215454, "step": 1360 }, { "epoch": 2.8734177215189876, "grad_norm": 1.586370587348938, "learning_rate": 2.4763078047004863e-06, "loss": 0.6659104824066162, "step": 1362 }, { "epoch": 2.8776371308016877, "grad_norm": 6.09430456161499, "learning_rate": 2.47216944871878e-06, "loss": 0.950684130191803, "step": 1364 }, { "epoch": 2.8818565400843883, "grad_norm": 2.1875460147857666, "learning_rate": 2.468029619916825e-06, "loss": 0.9997307062149048, "step": 1366 }, { "epoch": 2.8860759493670884, "grad_norm": 3.9469892978668213, "learning_rate": 2.46388834069693e-06, "loss": 1.1051433086395264, "step": 1368 }, { "epoch": 2.890295358649789, "grad_norm": 1.705639123916626, "learning_rate": 2.4597456334692505e-06, "loss": 1.03743577003479, "step": 1370 }, { "epoch": 2.894514767932489, "grad_norm": 22.948728561401367, "learning_rate": 2.455601520651671e-06, "loss": 0.4580141305923462, "step": 1372 }, { "epoch": 2.8987341772151898, "grad_norm": 1.9022364616394043, "learning_rate": 2.451456024669681e-06, "loss": 0.92431640625, "step": 1374 }, { "epoch": 2.9029535864978904, "grad_norm": 1.598383903503418, "learning_rate": 2.4473091679562555e-06, "loss": 1.1237053871154785, "step": 1376 }, { "epoch": 2.9071729957805905, "grad_norm": 4.576679706573486, "learning_rate": 2.443160972951733e-06, "loss": 0.8321917653083801, "step": 1378 }, { "epoch": 2.911392405063291, "grad_norm": 3.267960786819458, "learning_rate": 2.4390114621036948e-06, "loss": 1.2134051322937012, "step": 1380 }, { "epoch": 2.9156118143459917, "grad_norm": 9.497183799743652, "learning_rate": 2.43486065786684e-06, "loss": 0.6116930842399597, "step": 1382 }, { "epoch": 2.919831223628692, "grad_norm": 9.528655052185059, "learning_rate": 2.43070858270287e-06, "loss": 0.7370846271514893, "step": 1384 }, { "epoch": 2.9240506329113924, "grad_norm": 2.335017204284668, "learning_rate": 2.4265552590803616e-06, "loss": 0.6520988941192627, "step": 1386 }, { "epoch": 2.928270042194093, "grad_norm": 3.7409374713897705, "learning_rate": 2.4224007094746495e-06, "loss": 1.0449352264404297, "step": 1388 }, { "epoch": 2.932489451476793, "grad_norm": 2.975673198699951, "learning_rate": 2.418244956367701e-06, "loss": 0.9698547124862671, "step": 1390 }, { "epoch": 2.9367088607594938, "grad_norm": 2.086550712585449, "learning_rate": 2.4140880222479963e-06, "loss": 0.6123561859130859, "step": 1392 }, { "epoch": 2.9409282700421944, "grad_norm": 2.2701752185821533, "learning_rate": 2.4099299296104063e-06, "loss": 0.6262718439102173, "step": 1394 }, { "epoch": 2.9451476793248945, "grad_norm": 4.327895164489746, "learning_rate": 2.405770700956073e-06, "loss": 1.0023303031921387, "step": 1396 }, { "epoch": 2.9493670886075947, "grad_norm": 1.0873901844024658, "learning_rate": 2.401610358792283e-06, "loss": 0.8893314599990845, "step": 1398 }, { "epoch": 2.9535864978902953, "grad_norm": 3.0334839820861816, "learning_rate": 2.3974489256323508e-06, "loss": 0.8417981266975403, "step": 1400 }, { "epoch": 2.957805907172996, "grad_norm": 5.334658622741699, "learning_rate": 2.3932864239954937e-06, "loss": 0.7297941446304321, "step": 1402 }, { "epoch": 2.962025316455696, "grad_norm": 2.946950674057007, "learning_rate": 2.3891228764067106e-06, "loss": 1.0070791244506836, "step": 1404 }, { "epoch": 2.9662447257383966, "grad_norm": 3.0521016120910645, "learning_rate": 2.384958305396662e-06, "loss": 0.8960994482040405, "step": 1406 }, { "epoch": 2.970464135021097, "grad_norm": 4.832094192504883, "learning_rate": 2.380792733501545e-06, "loss": 0.577763557434082, "step": 1408 }, { "epoch": 2.9746835443037973, "grad_norm": 3.717233419418335, "learning_rate": 2.376626183262975e-06, "loss": 0.8571799993515015, "step": 1410 }, { "epoch": 2.978902953586498, "grad_norm": 5.3040547370910645, "learning_rate": 2.3724586772278574e-06, "loss": 1.0527344942092896, "step": 1412 }, { "epoch": 2.9831223628691985, "grad_norm": 5.977110385894775, "learning_rate": 2.368290237948275e-06, "loss": 0.8416517972946167, "step": 1414 }, { "epoch": 2.9873417721518987, "grad_norm": 10.9218111038208, "learning_rate": 2.3641208879813567e-06, "loss": 0.8895251750946045, "step": 1416 }, { "epoch": 2.9915611814345993, "grad_norm": 2.910202741622925, "learning_rate": 2.3599506498891625e-06, "loss": 0.9375064373016357, "step": 1418 }, { "epoch": 2.9957805907173, "grad_norm": 4.632609844207764, "learning_rate": 2.355779546238555e-06, "loss": 1.054133415222168, "step": 1420 }, { "epoch": 3.0, "grad_norm": 2.0572307109832764, "learning_rate": 2.3516075996010844e-06, "loss": 0.47653502225875854, "step": 1422 }, { "epoch": 3.0042194092827006, "grad_norm": 6.811531066894531, "learning_rate": 2.3474348325528613e-06, "loss": 0.7990585565567017, "step": 1424 }, { "epoch": 3.0084388185654007, "grad_norm": 4.26148796081543, "learning_rate": 2.3432612676744338e-06, "loss": 0.6641910672187805, "step": 1426 }, { "epoch": 3.0126582278481013, "grad_norm": 14.421019554138184, "learning_rate": 2.3390869275506704e-06, "loss": 0.6507161855697632, "step": 1428 }, { "epoch": 3.0168776371308015, "grad_norm": 3.2018351554870605, "learning_rate": 2.334911834770633e-06, "loss": 0.3902518153190613, "step": 1430 }, { "epoch": 3.021097046413502, "grad_norm": 1.717463731765747, "learning_rate": 2.330736011927458e-06, "loss": 1.0567653179168701, "step": 1432 }, { "epoch": 3.0253164556962027, "grad_norm": 1.948553442955017, "learning_rate": 2.326559481618229e-06, "loss": 0.9750782251358032, "step": 1434 }, { "epoch": 3.029535864978903, "grad_norm": 3.6881439685821533, "learning_rate": 2.322382266443863e-06, "loss": 1.128783106803894, "step": 1436 }, { "epoch": 3.0337552742616034, "grad_norm": 3.5191917419433594, "learning_rate": 2.3182043890089784e-06, "loss": 0.5267306566238403, "step": 1438 }, { "epoch": 3.037974683544304, "grad_norm": 2.0311169624328613, "learning_rate": 2.3140258719217808e-06, "loss": 0.9317551851272583, "step": 1440 }, { "epoch": 3.042194092827004, "grad_norm": 5.587665557861328, "learning_rate": 2.309846737793935e-06, "loss": 0.6537089943885803, "step": 1442 }, { "epoch": 3.0464135021097047, "grad_norm": 2.518566131591797, "learning_rate": 2.3056670092404463e-06, "loss": 0.8329222202301025, "step": 1444 }, { "epoch": 3.050632911392405, "grad_norm": 2.06829833984375, "learning_rate": 2.3014867088795357e-06, "loss": 1.0246927738189697, "step": 1446 }, { "epoch": 3.0548523206751055, "grad_norm": 9.406811714172363, "learning_rate": 2.297305859332519e-06, "loss": 0.6608364582061768, "step": 1448 }, { "epoch": 3.059071729957806, "grad_norm": 18.555009841918945, "learning_rate": 2.2931244832236837e-06, "loss": 0.8099187612533569, "step": 1450 }, { "epoch": 3.0632911392405062, "grad_norm": 2.87371563911438, "learning_rate": 2.288942603180167e-06, "loss": 1.048098087310791, "step": 1452 }, { "epoch": 3.067510548523207, "grad_norm": 5.4316205978393555, "learning_rate": 2.2847602418318327e-06, "loss": 0.7442044019699097, "step": 1454 }, { "epoch": 3.071729957805907, "grad_norm": 6.081297397613525, "learning_rate": 2.2805774218111496e-06, "loss": 0.6251615285873413, "step": 1456 }, { "epoch": 3.0759493670886076, "grad_norm": 10.227375984191895, "learning_rate": 2.276394165753067e-06, "loss": 0.6871986389160156, "step": 1458 }, { "epoch": 3.080168776371308, "grad_norm": 7.270413398742676, "learning_rate": 2.272210496294896e-06, "loss": 0.7179367542266846, "step": 1460 }, { "epoch": 3.0843881856540083, "grad_norm": 2.082552194595337, "learning_rate": 2.268026436076185e-06, "loss": 0.9696202278137207, "step": 1462 }, { "epoch": 3.088607594936709, "grad_norm": 2.518341302871704, "learning_rate": 2.263842007738594e-06, "loss": 0.9051344394683838, "step": 1464 }, { "epoch": 3.0928270042194095, "grad_norm": 2.340363025665283, "learning_rate": 2.2596572339257777e-06, "loss": 0.8250648975372314, "step": 1466 }, { "epoch": 3.0970464135021096, "grad_norm": 11.077507019042969, "learning_rate": 2.255472137283259e-06, "loss": 0.6344802975654602, "step": 1468 }, { "epoch": 3.1012658227848102, "grad_norm": 7.140725612640381, "learning_rate": 2.2512867404583085e-06, "loss": 0.1541098654270172, "step": 1470 }, { "epoch": 3.1054852320675104, "grad_norm": 4.636476993560791, "learning_rate": 2.2471010660998215e-06, "loss": 1.4155219793319702, "step": 1472 }, { "epoch": 3.109704641350211, "grad_norm": 3.027451276779175, "learning_rate": 2.242915136858193e-06, "loss": 0.49524158239364624, "step": 1474 }, { "epoch": 3.1139240506329116, "grad_norm": 3.410243034362793, "learning_rate": 2.2387289753852e-06, "loss": 1.0359880924224854, "step": 1476 }, { "epoch": 3.1181434599156117, "grad_norm": 2.0419440269470215, "learning_rate": 2.234542604333875e-06, "loss": 1.03524911403656, "step": 1478 }, { "epoch": 3.1223628691983123, "grad_norm": 2.8948912620544434, "learning_rate": 2.230356046358384e-06, "loss": 0.9543738961219788, "step": 1480 }, { "epoch": 3.1265822784810124, "grad_norm": 2.4057018756866455, "learning_rate": 2.2261693241139065e-06, "loss": 0.9722020030021667, "step": 1482 }, { "epoch": 3.130801687763713, "grad_norm": 1.9602731466293335, "learning_rate": 2.2219824602565087e-06, "loss": 0.9750865697860718, "step": 1484 }, { "epoch": 3.1350210970464136, "grad_norm": 2.10933780670166, "learning_rate": 2.2177954774430234e-06, "loss": 0.6285134553909302, "step": 1486 }, { "epoch": 3.1392405063291138, "grad_norm": 1.8953523635864258, "learning_rate": 2.2136083983309286e-06, "loss": 0.6080442667007446, "step": 1488 }, { "epoch": 3.1434599156118144, "grad_norm": 10.058272361755371, "learning_rate": 2.2094212455782227e-06, "loss": 1.1448235511779785, "step": 1490 }, { "epoch": 3.147679324894515, "grad_norm": 3.1135060787200928, "learning_rate": 2.2052340418433024e-06, "loss": 0.6743577718734741, "step": 1492 }, { "epoch": 3.151898734177215, "grad_norm": 6.616735458374023, "learning_rate": 2.2010468097848396e-06, "loss": 0.737909197807312, "step": 1494 }, { "epoch": 3.1561181434599157, "grad_norm": 3.229160785675049, "learning_rate": 2.1968595720616606e-06, "loss": 0.8287728428840637, "step": 1496 }, { "epoch": 3.160337552742616, "grad_norm": 0.8820177316665649, "learning_rate": 2.192672351332623e-06, "loss": 0.4992554783821106, "step": 1498 }, { "epoch": 3.1645569620253164, "grad_norm": 2.330535411834717, "learning_rate": 2.1884851702564897e-06, "loss": 0.5810240507125854, "step": 1500 }, { "epoch": 3.168776371308017, "grad_norm": 1.8228909969329834, "learning_rate": 2.1842980514918117e-06, "loss": 0.9471129179000854, "step": 1502 }, { "epoch": 3.172995780590717, "grad_norm": 2.3642683029174805, "learning_rate": 2.1801110176968016e-06, "loss": 0.8418397903442383, "step": 1504 }, { "epoch": 3.1772151898734178, "grad_norm": 1.8827167749404907, "learning_rate": 2.1759240915292135e-06, "loss": 0.9700140357017517, "step": 1506 }, { "epoch": 3.181434599156118, "grad_norm": 3.775982618331909, "learning_rate": 2.171737295646216e-06, "loss": 1.1170215606689453, "step": 1508 }, { "epoch": 3.1856540084388185, "grad_norm": 4.465606212615967, "learning_rate": 2.167550652704276e-06, "loss": 0.9244706630706787, "step": 1510 }, { "epoch": 3.189873417721519, "grad_norm": 2.6871254444122314, "learning_rate": 2.1633641853590318e-06, "loss": 0.25759080052375793, "step": 1512 }, { "epoch": 3.1940928270042193, "grad_norm": 3.345410108566284, "learning_rate": 2.15917791626517e-06, "loss": 0.9588069319725037, "step": 1514 }, { "epoch": 3.19831223628692, "grad_norm": 10.61077880859375, "learning_rate": 2.154991868076306e-06, "loss": 0.5874932408332825, "step": 1516 }, { "epoch": 3.2025316455696204, "grad_norm": 2.544962167739868, "learning_rate": 2.1508060634448595e-06, "loss": 0.991689920425415, "step": 1518 }, { "epoch": 3.2067510548523206, "grad_norm": 3.6874847412109375, "learning_rate": 2.1466205250219315e-06, "loss": 0.9816372990608215, "step": 1520 }, { "epoch": 3.210970464135021, "grad_norm": 5.628320217132568, "learning_rate": 2.142435275457184e-06, "loss": 0.30518054962158203, "step": 1522 }, { "epoch": 3.2151898734177213, "grad_norm": 3.656771659851074, "learning_rate": 2.1382503373987133e-06, "loss": 0.7900766134262085, "step": 1524 }, { "epoch": 3.219409282700422, "grad_norm": 2.2453036308288574, "learning_rate": 2.1340657334929335e-06, "loss": 0.8744317293167114, "step": 1526 }, { "epoch": 3.2236286919831225, "grad_norm": 0.9895398616790771, "learning_rate": 2.1298814863844476e-06, "loss": 0.47598880529403687, "step": 1528 }, { "epoch": 3.2278481012658227, "grad_norm": 5.763035297393799, "learning_rate": 2.1256976187159278e-06, "loss": 0.7799667119979858, "step": 1530 }, { "epoch": 3.2320675105485233, "grad_norm": 0.7087782621383667, "learning_rate": 2.121514153127995e-06, "loss": 0.2722686529159546, "step": 1532 }, { "epoch": 3.2362869198312234, "grad_norm": 3.2583420276641846, "learning_rate": 2.1173311122590932e-06, "loss": 0.7357510328292847, "step": 1534 }, { "epoch": 3.240506329113924, "grad_norm": 3.8085386753082275, "learning_rate": 2.1131485187453676e-06, "loss": 0.9901435375213623, "step": 1536 }, { "epoch": 3.2447257383966246, "grad_norm": 2.8548874855041504, "learning_rate": 2.1089663952205435e-06, "loss": 0.9335240721702576, "step": 1538 }, { "epoch": 3.2489451476793247, "grad_norm": 9.909287452697754, "learning_rate": 2.104784764315802e-06, "loss": 0.752236008644104, "step": 1540 }, { "epoch": 3.2531645569620253, "grad_norm": 9.005875587463379, "learning_rate": 2.100603648659659e-06, "loss": 0.741628885269165, "step": 1542 }, { "epoch": 3.257383966244726, "grad_norm": 8.307135581970215, "learning_rate": 2.096423070877843e-06, "loss": 0.6267164945602417, "step": 1544 }, { "epoch": 3.261603375527426, "grad_norm": 6.679696559906006, "learning_rate": 2.092243053593169e-06, "loss": 0.5680997371673584, "step": 1546 }, { "epoch": 3.2658227848101267, "grad_norm": 3.8873493671417236, "learning_rate": 2.0880636194254225e-06, "loss": 0.874029278755188, "step": 1548 }, { "epoch": 3.270042194092827, "grad_norm": 6.6328301429748535, "learning_rate": 2.0838847909912307e-06, "loss": 0.4085759222507477, "step": 1550 }, { "epoch": 3.2742616033755274, "grad_norm": 2.145261526107788, "learning_rate": 2.0797065909039457e-06, "loss": 0.36501544713974, "step": 1552 }, { "epoch": 3.278481012658228, "grad_norm": 7.986878395080566, "learning_rate": 2.0755290417735156e-06, "loss": 0.4557437002658844, "step": 1554 }, { "epoch": 3.282700421940928, "grad_norm": 2.874678373336792, "learning_rate": 2.071352166206369e-06, "loss": 0.962173581123352, "step": 1556 }, { "epoch": 3.2869198312236287, "grad_norm": 1.1194981336593628, "learning_rate": 2.0671759868052893e-06, "loss": 0.7566915154457092, "step": 1558 }, { "epoch": 3.291139240506329, "grad_norm": 2.609232187271118, "learning_rate": 2.0630005261692905e-06, "loss": 0.6619813442230225, "step": 1560 }, { "epoch": 3.2953586497890295, "grad_norm": 9.03283977508545, "learning_rate": 2.0588258068935002e-06, "loss": 0.5809231400489807, "step": 1562 }, { "epoch": 3.29957805907173, "grad_norm": 3.785902500152588, "learning_rate": 2.0546518515690316e-06, "loss": 0.8656713366508484, "step": 1564 }, { "epoch": 3.3037974683544302, "grad_norm": 11.584537506103516, "learning_rate": 2.0504786827828648e-06, "loss": 0.7611091136932373, "step": 1566 }, { "epoch": 3.308016877637131, "grad_norm": 8.480819702148438, "learning_rate": 2.0463063231177236e-06, "loss": 0.5610800981521606, "step": 1568 }, { "epoch": 3.3122362869198314, "grad_norm": 5.454217910766602, "learning_rate": 2.0421347951519535e-06, "loss": 0.5264372229576111, "step": 1570 }, { "epoch": 3.3164556962025316, "grad_norm": 3.4205212593078613, "learning_rate": 2.037964121459399e-06, "loss": 0.5730254650115967, "step": 1572 }, { "epoch": 3.320675105485232, "grad_norm": 3.719339609146118, "learning_rate": 2.033794324609282e-06, "loss": 1.091575026512146, "step": 1574 }, { "epoch": 3.3248945147679323, "grad_norm": 2.0314159393310547, "learning_rate": 2.0296254271660795e-06, "loss": 0.8482744693756104, "step": 1576 }, { "epoch": 3.329113924050633, "grad_norm": 2.4221394062042236, "learning_rate": 2.025457451689401e-06, "loss": 0.9338847398757935, "step": 1578 }, { "epoch": 3.3333333333333335, "grad_norm": 3.0065743923187256, "learning_rate": 2.0212904207338672e-06, "loss": 0.7377324104309082, "step": 1580 }, { "epoch": 3.3375527426160336, "grad_norm": 1.4527244567871094, "learning_rate": 2.0171243568489883e-06, "loss": 0.48970168828964233, "step": 1582 }, { "epoch": 3.3417721518987342, "grad_norm": 5.6556010246276855, "learning_rate": 2.0129592825790397e-06, "loss": 0.7742688655853271, "step": 1584 }, { "epoch": 3.3459915611814344, "grad_norm": 5.596024990081787, "learning_rate": 2.0087952204629422e-06, "loss": 0.641385555267334, "step": 1586 }, { "epoch": 3.350210970464135, "grad_norm": 2.5374205112457275, "learning_rate": 2.0046321930341405e-06, "loss": 0.5972579717636108, "step": 1588 }, { "epoch": 3.3544303797468356, "grad_norm": 1.8447959423065186, "learning_rate": 2.0004702228204797e-06, "loss": 0.8615912199020386, "step": 1590 }, { "epoch": 3.3586497890295357, "grad_norm": 6.396413326263428, "learning_rate": 1.9963093323440824e-06, "loss": 0.9015900492668152, "step": 1592 }, { "epoch": 3.3628691983122363, "grad_norm": 0.825650155544281, "learning_rate": 1.99214954412123e-06, "loss": 0.6481198072433472, "step": 1594 }, { "epoch": 3.367088607594937, "grad_norm": 6.811497211456299, "learning_rate": 1.9879908806622385e-06, "loss": 0.4374066889286041, "step": 1596 }, { "epoch": 3.371308016877637, "grad_norm": 0.7065004706382751, "learning_rate": 1.9838333644713377e-06, "loss": 0.4804467558860779, "step": 1598 }, { "epoch": 3.3755274261603376, "grad_norm": 10.63037109375, "learning_rate": 1.9796770180465484e-06, "loss": 0.6881888508796692, "step": 1600 }, { "epoch": 3.379746835443038, "grad_norm": 5.440734386444092, "learning_rate": 1.9755218638795626e-06, "loss": 0.547875165939331, "step": 1602 }, { "epoch": 3.3839662447257384, "grad_norm": 1.4273031949996948, "learning_rate": 1.971367924455618e-06, "loss": 0.5285290479660034, "step": 1604 }, { "epoch": 3.388185654008439, "grad_norm": 3.3389201164245605, "learning_rate": 1.9672152222533822e-06, "loss": 1.0279819965362549, "step": 1606 }, { "epoch": 3.392405063291139, "grad_norm": 2.3950865268707275, "learning_rate": 1.9630637797448248e-06, "loss": 0.6111994981765747, "step": 1608 }, { "epoch": 3.3966244725738397, "grad_norm": 1.5823328495025635, "learning_rate": 1.9589136193951e-06, "loss": 0.5231560468673706, "step": 1610 }, { "epoch": 3.40084388185654, "grad_norm": 2.5763416290283203, "learning_rate": 1.9547647636624243e-06, "loss": 0.916947603225708, "step": 1612 }, { "epoch": 3.4050632911392404, "grad_norm": 2.1279733180999756, "learning_rate": 1.9506172349979523e-06, "loss": 0.39490947127342224, "step": 1614 }, { "epoch": 3.409282700421941, "grad_norm": 13.062804222106934, "learning_rate": 1.9464710558456595e-06, "loss": 0.8276299238204956, "step": 1616 }, { "epoch": 3.413502109704641, "grad_norm": 2.3977434635162354, "learning_rate": 1.942326248642218e-06, "loss": 1.0900508165359497, "step": 1618 }, { "epoch": 3.4177215189873418, "grad_norm": 2.819269895553589, "learning_rate": 1.9381828358168748e-06, "loss": 0.9528172016143799, "step": 1620 }, { "epoch": 3.4219409282700424, "grad_norm": 23.19445037841797, "learning_rate": 1.934040839791332e-06, "loss": 0.5396543145179749, "step": 1622 }, { "epoch": 3.4261603375527425, "grad_norm": 7.595386028289795, "learning_rate": 1.9299002829796253e-06, "loss": 0.3888126611709595, "step": 1624 }, { "epoch": 3.430379746835443, "grad_norm": 5.8151960372924805, "learning_rate": 1.925761187788002e-06, "loss": 0.3526824712753296, "step": 1626 }, { "epoch": 3.4345991561181437, "grad_norm": 5.015478134155273, "learning_rate": 1.921623576614799e-06, "loss": 1.0127757787704468, "step": 1628 }, { "epoch": 3.438818565400844, "grad_norm": 3.884026050567627, "learning_rate": 1.917487471850323e-06, "loss": 0.3786028325557709, "step": 1630 }, { "epoch": 3.4430379746835444, "grad_norm": 4.3548784255981445, "learning_rate": 1.91335289587673e-06, "loss": 1.0020424127578735, "step": 1632 }, { "epoch": 3.4472573839662446, "grad_norm": 1.1631532907485962, "learning_rate": 1.909219871067902e-06, "loss": 0.5979082584381104, "step": 1634 }, { "epoch": 3.451476793248945, "grad_norm": 5.268650531768799, "learning_rate": 1.9050884197893278e-06, "loss": 1.1838793754577637, "step": 1636 }, { "epoch": 3.4556962025316453, "grad_norm": 4.640054702758789, "learning_rate": 1.90095856439798e-06, "loss": 1.1896966695785522, "step": 1638 }, { "epoch": 3.459915611814346, "grad_norm": 3.0583033561706543, "learning_rate": 1.8968303272421968e-06, "loss": 0.9648596048355103, "step": 1640 }, { "epoch": 3.4641350210970465, "grad_norm": 30.096092224121094, "learning_rate": 1.8927037306615578e-06, "loss": 1.0935192108154297, "step": 1642 }, { "epoch": 3.4683544303797467, "grad_norm": 2.4771618843078613, "learning_rate": 1.8885787969867656e-06, "loss": 0.35215988755226135, "step": 1644 }, { "epoch": 3.4725738396624473, "grad_norm": 10.702546119689941, "learning_rate": 1.884455548539524e-06, "loss": 0.839633584022522, "step": 1646 }, { "epoch": 3.476793248945148, "grad_norm": 1.648725152015686, "learning_rate": 1.8803340076324181e-06, "loss": 0.9294931888580322, "step": 1648 }, { "epoch": 3.481012658227848, "grad_norm": 2.6175386905670166, "learning_rate": 1.876214196568791e-06, "loss": 0.5126534104347229, "step": 1650 }, { "epoch": 3.4852320675105486, "grad_norm": 2.2899160385131836, "learning_rate": 1.872096137642627e-06, "loss": 0.8264724612236023, "step": 1652 }, { "epoch": 3.489451476793249, "grad_norm": 1.0500420331954956, "learning_rate": 1.8679798531384274e-06, "loss": 0.4854082465171814, "step": 1654 }, { "epoch": 3.4936708860759493, "grad_norm": 1.5645257234573364, "learning_rate": 1.8638653653310926e-06, "loss": 0.7242560386657715, "step": 1656 }, { "epoch": 3.49789029535865, "grad_norm": 3.557481050491333, "learning_rate": 1.8597526964857985e-06, "loss": 0.7009620666503906, "step": 1658 }, { "epoch": 3.50210970464135, "grad_norm": 2.4170994758605957, "learning_rate": 1.8556418688578797e-06, "loss": 1.0089216232299805, "step": 1660 }, { "epoch": 3.5063291139240507, "grad_norm": 5.906785488128662, "learning_rate": 1.8515329046927058e-06, "loss": 1.111635446548462, "step": 1662 }, { "epoch": 3.510548523206751, "grad_norm": 21.11191749572754, "learning_rate": 1.8474258262255642e-06, "loss": 0.4738878309726715, "step": 1664 }, { "epoch": 3.5147679324894514, "grad_norm": 6.232138633728027, "learning_rate": 1.843320655681536e-06, "loss": 1.019901990890503, "step": 1666 }, { "epoch": 3.518987341772152, "grad_norm": 7.000395774841309, "learning_rate": 1.839217415275379e-06, "loss": 0.6458152532577515, "step": 1668 }, { "epoch": 3.523206751054852, "grad_norm": 2.109321355819702, "learning_rate": 1.835116127211406e-06, "loss": 0.9234386086463928, "step": 1670 }, { "epoch": 3.5274261603375527, "grad_norm": 8.41999340057373, "learning_rate": 1.8310168136833646e-06, "loss": 0.382904052734375, "step": 1672 }, { "epoch": 3.5316455696202533, "grad_norm": 2.0964558124542236, "learning_rate": 1.8269194968743178e-06, "loss": 0.585561990737915, "step": 1674 }, { "epoch": 3.5358649789029535, "grad_norm": 10.49689769744873, "learning_rate": 1.8228241989565239e-06, "loss": 0.6187952160835266, "step": 1676 }, { "epoch": 3.540084388185654, "grad_norm": 7.462824821472168, "learning_rate": 1.8187309420913142e-06, "loss": 0.7788501977920532, "step": 1678 }, { "epoch": 3.5443037974683547, "grad_norm": 3.3341939449310303, "learning_rate": 1.8146397484289774e-06, "loss": 0.9248118996620178, "step": 1680 }, { "epoch": 3.548523206751055, "grad_norm": 2.9744744300842285, "learning_rate": 1.810550640108636e-06, "loss": 0.7860240936279297, "step": 1682 }, { "epoch": 3.5527426160337554, "grad_norm": 16.682893753051758, "learning_rate": 1.8064636392581285e-06, "loss": 0.7947289347648621, "step": 1684 }, { "epoch": 3.5569620253164556, "grad_norm": 11.304174423217773, "learning_rate": 1.8023787679938884e-06, "loss": 0.32021385431289673, "step": 1686 }, { "epoch": 3.561181434599156, "grad_norm": 3.4476826190948486, "learning_rate": 1.7982960484208255e-06, "loss": 0.5928635597229004, "step": 1688 }, { "epoch": 3.5654008438818563, "grad_norm": 4.565676689147949, "learning_rate": 1.7942155026322064e-06, "loss": 1.007154941558838, "step": 1690 }, { "epoch": 3.569620253164557, "grad_norm": 54.17780685424805, "learning_rate": 1.7901371527095336e-06, "loss": 0.20298929512500763, "step": 1692 }, { "epoch": 3.5738396624472575, "grad_norm": 0.7691044807434082, "learning_rate": 1.7860610207224266e-06, "loss": 0.610919713973999, "step": 1694 }, { "epoch": 3.5780590717299576, "grad_norm": 7.206573486328125, "learning_rate": 1.7819871287285042e-06, "loss": 0.2613908350467682, "step": 1696 }, { "epoch": 3.5822784810126582, "grad_norm": 2.2238030433654785, "learning_rate": 1.7779154987732628e-06, "loss": 0.7429696321487427, "step": 1698 }, { "epoch": 3.586497890295359, "grad_norm": 2.2671563625335693, "learning_rate": 1.7738461528899582e-06, "loss": 0.6627340912818909, "step": 1700 }, { "epoch": 3.590717299578059, "grad_norm": 1.9748802185058594, "learning_rate": 1.769779113099485e-06, "loss": 0.5637974739074707, "step": 1702 }, { "epoch": 3.5949367088607596, "grad_norm": 2.075197696685791, "learning_rate": 1.7657144014102605e-06, "loss": 1.022030234336853, "step": 1704 }, { "epoch": 3.59915611814346, "grad_norm": 2.7764699459075928, "learning_rate": 1.7616520398181019e-06, "loss": 0.6542642116546631, "step": 1706 }, { "epoch": 3.6033755274261603, "grad_norm": 5.018822193145752, "learning_rate": 1.757592050306111e-06, "loss": 0.7118390202522278, "step": 1708 }, { "epoch": 3.607594936708861, "grad_norm": 1.829730749130249, "learning_rate": 1.7535344548445523e-06, "loss": 0.5238461494445801, "step": 1710 }, { "epoch": 3.611814345991561, "grad_norm": 2.2571935653686523, "learning_rate": 1.7494792753907342e-06, "loss": 0.9762560129165649, "step": 1712 }, { "epoch": 3.6160337552742616, "grad_norm": 11.215494155883789, "learning_rate": 1.7454265338888923e-06, "loss": 1.1840991973876953, "step": 1714 }, { "epoch": 3.620253164556962, "grad_norm": 5.1113972663879395, "learning_rate": 1.741376252270069e-06, "loss": 0.5932983160018921, "step": 1716 }, { "epoch": 3.6244725738396624, "grad_norm": 3.276780843734741, "learning_rate": 1.7373284524519956e-06, "loss": 0.654528021812439, "step": 1718 }, { "epoch": 3.628691983122363, "grad_norm": 4.502676486968994, "learning_rate": 1.733283156338973e-06, "loss": 0.329173743724823, "step": 1720 }, { "epoch": 3.632911392405063, "grad_norm": 4.122840404510498, "learning_rate": 1.7292403858217534e-06, "loss": 1.0182509422302246, "step": 1722 }, { "epoch": 3.6371308016877637, "grad_norm": 8.013359069824219, "learning_rate": 1.7252001627774227e-06, "loss": 0.5020068287849426, "step": 1724 }, { "epoch": 3.6413502109704643, "grad_norm": 7.430994987487793, "learning_rate": 1.72116250906928e-06, "loss": 0.45291832089424133, "step": 1726 }, { "epoch": 3.6455696202531644, "grad_norm": 5.890309810638428, "learning_rate": 1.7171274465467224e-06, "loss": 0.8754688501358032, "step": 1728 }, { "epoch": 3.649789029535865, "grad_norm": 9.963774681091309, "learning_rate": 1.7130949970451245e-06, "loss": 0.2187124788761139, "step": 1730 }, { "epoch": 3.6540084388185656, "grad_norm": 6.262022972106934, "learning_rate": 1.709065182385719e-06, "loss": 0.886106014251709, "step": 1732 }, { "epoch": 3.6582278481012658, "grad_norm": 9.15018367767334, "learning_rate": 1.7050380243754838e-06, "loss": 0.3278903365135193, "step": 1734 }, { "epoch": 3.6624472573839664, "grad_norm": 30.086578369140625, "learning_rate": 1.7010135448070169e-06, "loss": 0.3603389263153076, "step": 1736 }, { "epoch": 3.6666666666666665, "grad_norm": 24.306060791015625, "learning_rate": 1.6969917654584247e-06, "loss": 0.6651766300201416, "step": 1738 }, { "epoch": 3.670886075949367, "grad_norm": 4.77196741104126, "learning_rate": 1.692972708093201e-06, "loss": 0.33792465925216675, "step": 1740 }, { "epoch": 3.6751054852320673, "grad_norm": 1.7918250560760498, "learning_rate": 1.688956394460109e-06, "loss": 1.0997920036315918, "step": 1742 }, { "epoch": 3.679324894514768, "grad_norm": 19.624130249023438, "learning_rate": 1.6849428462930653e-06, "loss": 0.5909217596054077, "step": 1744 }, { "epoch": 3.6835443037974684, "grad_norm": 7.293959140777588, "learning_rate": 1.6809320853110215e-06, "loss": 0.563459038734436, "step": 1746 }, { "epoch": 3.6877637130801686, "grad_norm": 2.4896528720855713, "learning_rate": 1.6769241332178469e-06, "loss": 1.0555415153503418, "step": 1748 }, { "epoch": 3.691983122362869, "grad_norm": 2.973538398742676, "learning_rate": 1.6729190117022095e-06, "loss": 0.8185904026031494, "step": 1750 }, { "epoch": 3.6962025316455698, "grad_norm": 3.3849141597747803, "learning_rate": 1.6689167424374597e-06, "loss": 0.8749343752861023, "step": 1752 }, { "epoch": 3.70042194092827, "grad_norm": 2.0385217666625977, "learning_rate": 1.664917347081516e-06, "loss": 1.026354432106018, "step": 1754 }, { "epoch": 3.7046413502109705, "grad_norm": 5.828520774841309, "learning_rate": 1.660920847276741e-06, "loss": 0.8060284852981567, "step": 1756 }, { "epoch": 3.708860759493671, "grad_norm": 5.976668357849121, "learning_rate": 1.6569272646498318e-06, "loss": 0.7234772443771362, "step": 1758 }, { "epoch": 3.7130801687763713, "grad_norm": 9.543655395507812, "learning_rate": 1.6529366208116974e-06, "loss": 0.7528952360153198, "step": 1760 }, { "epoch": 3.717299578059072, "grad_norm": 4.140414237976074, "learning_rate": 1.6489489373573443e-06, "loss": 0.26903659105300903, "step": 1762 }, { "epoch": 3.721518987341772, "grad_norm": 12.051411628723145, "learning_rate": 1.64496423586576e-06, "loss": 0.5374072790145874, "step": 1764 }, { "epoch": 3.7257383966244726, "grad_norm": 2.326197624206543, "learning_rate": 1.6409825378997941e-06, "loss": 0.9479004740715027, "step": 1766 }, { "epoch": 3.7299578059071727, "grad_norm": 4.621135234832764, "learning_rate": 1.6370038650060437e-06, "loss": 0.5748968124389648, "step": 1768 }, { "epoch": 3.7341772151898733, "grad_norm": 2.885585069656372, "learning_rate": 1.6330282387147349e-06, "loss": 0.5932916402816772, "step": 1770 }, { "epoch": 3.738396624472574, "grad_norm": 1.9321597814559937, "learning_rate": 1.6290556805396093e-06, "loss": 0.9674075245857239, "step": 1772 }, { "epoch": 3.742616033755274, "grad_norm": 3.254708766937256, "learning_rate": 1.6250862119778046e-06, "loss": 0.4991704523563385, "step": 1774 }, { "epoch": 3.7468354430379747, "grad_norm": 1.1030203104019165, "learning_rate": 1.6211198545097381e-06, "loss": 0.5824090242385864, "step": 1776 }, { "epoch": 3.7510548523206753, "grad_norm": 2.4272022247314453, "learning_rate": 1.6171566295989947e-06, "loss": 0.8916751146316528, "step": 1778 }, { "epoch": 3.7552742616033754, "grad_norm": 2.7834560871124268, "learning_rate": 1.6131965586922039e-06, "loss": 0.9039870500564575, "step": 1780 }, { "epoch": 3.759493670886076, "grad_norm": 3.2108805179595947, "learning_rate": 1.6092396632189317e-06, "loss": 0.8393138647079468, "step": 1782 }, { "epoch": 3.7637130801687766, "grad_norm": 8.731537818908691, "learning_rate": 1.6052859645915575e-06, "loss": 0.8530555963516235, "step": 1784 }, { "epoch": 3.7679324894514767, "grad_norm": 2.2591445446014404, "learning_rate": 1.6013354842051624e-06, "loss": 1.0453441143035889, "step": 1786 }, { "epoch": 3.7721518987341773, "grad_norm": 18.5029296875, "learning_rate": 1.5973882434374124e-06, "loss": 0.2866585850715637, "step": 1788 }, { "epoch": 3.7763713080168775, "grad_norm": 2.598447561264038, "learning_rate": 1.5934442636484425e-06, "loss": 0.5377147197723389, "step": 1790 }, { "epoch": 3.780590717299578, "grad_norm": 2.245370864868164, "learning_rate": 1.5895035661807397e-06, "loss": 0.9374682903289795, "step": 1792 }, { "epoch": 3.7848101265822782, "grad_norm": 10.506272315979004, "learning_rate": 1.5855661723590319e-06, "loss": 0.7131825685501099, "step": 1794 }, { "epoch": 3.789029535864979, "grad_norm": 5.187559127807617, "learning_rate": 1.581632103490168e-06, "loss": 0.9631250500679016, "step": 1796 }, { "epoch": 3.7932489451476794, "grad_norm": 5.299999713897705, "learning_rate": 1.577701380863003e-06, "loss": 1.1112829446792603, "step": 1798 }, { "epoch": 3.7974683544303796, "grad_norm": 2.1457207202911377, "learning_rate": 1.5737740257482867e-06, "loss": 0.8928860425949097, "step": 1800 }, { "epoch": 3.80168776371308, "grad_norm": 2.5547454357147217, "learning_rate": 1.569850059398544e-06, "loss": 1.004746675491333, "step": 1802 }, { "epoch": 3.8059071729957807, "grad_norm": 3.674745798110962, "learning_rate": 1.565929503047963e-06, "loss": 0.49736908078193665, "step": 1804 }, { "epoch": 3.810126582278481, "grad_norm": 7.80587100982666, "learning_rate": 1.562012377912277e-06, "loss": 0.23617342114448547, "step": 1806 }, { "epoch": 3.8143459915611815, "grad_norm": 5.4438958168029785, "learning_rate": 1.5580987051886533e-06, "loss": 0.8461598753929138, "step": 1808 }, { "epoch": 3.818565400843882, "grad_norm": 2.466731071472168, "learning_rate": 1.554188506055577e-06, "loss": 0.9447206258773804, "step": 1810 }, { "epoch": 3.8227848101265822, "grad_norm": 5.592019081115723, "learning_rate": 1.550281801672735e-06, "loss": 0.47888684272766113, "step": 1812 }, { "epoch": 3.827004219409283, "grad_norm": 2.1095151901245117, "learning_rate": 1.5463786131809031e-06, "loss": 0.9347876310348511, "step": 1814 }, { "epoch": 3.831223628691983, "grad_norm": 4.567122936248779, "learning_rate": 1.542478961701831e-06, "loss": 0.8219131231307983, "step": 1816 }, { "epoch": 3.8354430379746836, "grad_norm": 3.2872185707092285, "learning_rate": 1.5385828683381293e-06, "loss": 0.7965229749679565, "step": 1818 }, { "epoch": 3.8396624472573837, "grad_norm": 4.746089935302734, "learning_rate": 1.5346903541731524e-06, "loss": 0.6401727199554443, "step": 1820 }, { "epoch": 3.8438818565400843, "grad_norm": 3.5851891040802, "learning_rate": 1.530801440270888e-06, "loss": 0.9646581411361694, "step": 1822 }, { "epoch": 3.848101265822785, "grad_norm": 7.018674373626709, "learning_rate": 1.5269161476758404e-06, "loss": 0.7993499636650085, "step": 1824 }, { "epoch": 3.852320675105485, "grad_norm": 3.83168888092041, "learning_rate": 1.523034497412916e-06, "loss": 0.9415961503982544, "step": 1826 }, { "epoch": 3.8565400843881856, "grad_norm": 3.7820115089416504, "learning_rate": 1.5191565104873144e-06, "loss": 0.9054951667785645, "step": 1828 }, { "epoch": 3.8607594936708862, "grad_norm": 5.366248607635498, "learning_rate": 1.5152822078844088e-06, "loss": 0.9999287128448486, "step": 1830 }, { "epoch": 3.8649789029535864, "grad_norm": 5.807839393615723, "learning_rate": 1.511411610569636e-06, "loss": 0.3293692171573639, "step": 1832 }, { "epoch": 3.869198312236287, "grad_norm": 3.83225679397583, "learning_rate": 1.5075447394883814e-06, "loss": 0.6949493885040283, "step": 1834 }, { "epoch": 3.8734177215189876, "grad_norm": 10.349047660827637, "learning_rate": 1.5036816155658665e-06, "loss": 0.7142183184623718, "step": 1836 }, { "epoch": 3.8776371308016877, "grad_norm": 4.179904460906982, "learning_rate": 1.4998222597070362e-06, "loss": 0.6529619097709656, "step": 1838 }, { "epoch": 3.8818565400843883, "grad_norm": 11.569310188293457, "learning_rate": 1.4959666927964437e-06, "loss": 0.8389513492584229, "step": 1840 }, { "epoch": 3.8860759493670884, "grad_norm": 4.005336761474609, "learning_rate": 1.4921149356981397e-06, "loss": 0.5777831077575684, "step": 1842 }, { "epoch": 3.890295358649789, "grad_norm": 5.133764266967773, "learning_rate": 1.4882670092555567e-06, "loss": 0.5414679050445557, "step": 1844 }, { "epoch": 3.894514767932489, "grad_norm": 2.863504409790039, "learning_rate": 1.4844229342913996e-06, "loss": 0.9309226870536804, "step": 1846 }, { "epoch": 3.8987341772151898, "grad_norm": 22.195985794067383, "learning_rate": 1.480582731607531e-06, "loss": 0.4635329842567444, "step": 1848 }, { "epoch": 3.9029535864978904, "grad_norm": 2.475642204284668, "learning_rate": 1.4767464219848593e-06, "loss": 0.9393260478973389, "step": 1850 }, { "epoch": 3.9071729957805905, "grad_norm": 3.141064405441284, "learning_rate": 1.4729140261832246e-06, "loss": 0.9542742967605591, "step": 1852 }, { "epoch": 3.911392405063291, "grad_norm": 2.667790174484253, "learning_rate": 1.4690855649412895e-06, "loss": 0.9756711721420288, "step": 1854 }, { "epoch": 3.9156118143459917, "grad_norm": 2.641533374786377, "learning_rate": 1.4652610589764235e-06, "loss": 0.9634566903114319, "step": 1856 }, { "epoch": 3.919831223628692, "grad_norm": 2.9647128582000732, "learning_rate": 1.461440528984594e-06, "loss": 0.9994820356369019, "step": 1858 }, { "epoch": 3.9240506329113924, "grad_norm": 5.323459625244141, "learning_rate": 1.4576239956402514e-06, "loss": 0.9943286180496216, "step": 1860 }, { "epoch": 3.928270042194093, "grad_norm": 2.4466195106506348, "learning_rate": 1.4538114795962195e-06, "loss": 0.6168838143348694, "step": 1862 }, { "epoch": 3.932489451476793, "grad_norm": 3.8990132808685303, "learning_rate": 1.4500030014835822e-06, "loss": 0.6228926777839661, "step": 1864 }, { "epoch": 3.9367088607594938, "grad_norm": 6.640925407409668, "learning_rate": 1.4461985819115733e-06, "loss": 1.230762243270874, "step": 1866 }, { "epoch": 3.9409282700421944, "grad_norm": 1.7788114547729492, "learning_rate": 1.4423982414674635e-06, "loss": 0.9199753999710083, "step": 1868 }, { "epoch": 3.9451476793248945, "grad_norm": 11.634161949157715, "learning_rate": 1.4386020007164494e-06, "loss": 0.702942967414856, "step": 1870 }, { "epoch": 3.9493670886075947, "grad_norm": 0.652026355266571, "learning_rate": 1.4348098802015446e-06, "loss": 0.5037093162536621, "step": 1872 }, { "epoch": 3.9535864978902953, "grad_norm": 10.706385612487793, "learning_rate": 1.4310219004434632e-06, "loss": 0.45475533604621887, "step": 1874 }, { "epoch": 3.957805907172996, "grad_norm": 7.073146820068359, "learning_rate": 1.4272380819405139e-06, "loss": 0.8023735284805298, "step": 1876 }, { "epoch": 3.962025316455696, "grad_norm": 3.564532518386841, "learning_rate": 1.4234584451684866e-06, "loss": 0.716842770576477, "step": 1878 }, { "epoch": 3.9662447257383966, "grad_norm": 1.7148876190185547, "learning_rate": 1.4196830105805432e-06, "loss": 0.5358736515045166, "step": 1880 }, { "epoch": 3.970464135021097, "grad_norm": 3.4616918563842773, "learning_rate": 1.4159117986071038e-06, "loss": 0.9063611030578613, "step": 1882 }, { "epoch": 3.9746835443037973, "grad_norm": 5.480584144592285, "learning_rate": 1.4121448296557406e-06, "loss": 0.40525734424591064, "step": 1884 }, { "epoch": 3.978902953586498, "grad_norm": 4.338303565979004, "learning_rate": 1.4083821241110637e-06, "loss": 0.9141275882720947, "step": 1886 }, { "epoch": 3.9831223628691985, "grad_norm": 7.042728900909424, "learning_rate": 1.4046237023346113e-06, "loss": 0.6083638668060303, "step": 1888 }, { "epoch": 3.9873417721518987, "grad_norm": 7.335713863372803, "learning_rate": 1.400869584664743e-06, "loss": 0.9237312078475952, "step": 1890 }, { "epoch": 3.9915611814345993, "grad_norm": 7.168555736541748, "learning_rate": 1.3971197914165238e-06, "loss": 0.6043530702590942, "step": 1892 }, { "epoch": 3.9957805907173, "grad_norm": 2.8935647010803223, "learning_rate": 1.3933743428816209e-06, "loss": 0.9517507553100586, "step": 1894 }, { "epoch": 4.0, "grad_norm": 2.358701705932617, "learning_rate": 1.3896332593281876e-06, "loss": 0.9641570448875427, "step": 1896 }, { "epoch": 4.0042194092827, "grad_norm": 4.007087230682373, "learning_rate": 1.385896561000759e-06, "loss": 0.8658764362335205, "step": 1898 }, { "epoch": 4.008438818565401, "grad_norm": 6.783811092376709, "learning_rate": 1.382164268120137e-06, "loss": 0.7082722187042236, "step": 1900 }, { "epoch": 4.012658227848101, "grad_norm": 2.4722962379455566, "learning_rate": 1.3784364008832867e-06, "loss": 0.7488058805465698, "step": 1902 }, { "epoch": 4.0168776371308015, "grad_norm": 7.7128705978393555, "learning_rate": 1.3747129794632236e-06, "loss": 0.5546174049377441, "step": 1904 }, { "epoch": 4.0210970464135025, "grad_norm": 1.1015756130218506, "learning_rate": 1.3709940240089027e-06, "loss": 0.5142375826835632, "step": 1906 }, { "epoch": 4.025316455696203, "grad_norm": 5.702658653259277, "learning_rate": 1.3672795546451144e-06, "loss": 0.9443526268005371, "step": 1908 }, { "epoch": 4.029535864978903, "grad_norm": 6.516256809234619, "learning_rate": 1.3635695914723724e-06, "loss": 0.11540517210960388, "step": 1910 }, { "epoch": 4.033755274261603, "grad_norm": 3.0924103260040283, "learning_rate": 1.359864154566805e-06, "loss": 0.7493268251419067, "step": 1912 }, { "epoch": 4.037974683544304, "grad_norm": 5.080263614654541, "learning_rate": 1.356163263980048e-06, "loss": 0.793247401714325, "step": 1914 }, { "epoch": 4.042194092827004, "grad_norm": 0.8498378396034241, "learning_rate": 1.352466939739134e-06, "loss": 0.4381150007247925, "step": 1916 }, { "epoch": 4.046413502109704, "grad_norm": 5.049806594848633, "learning_rate": 1.3487752018463865e-06, "loss": 0.23625794053077698, "step": 1918 }, { "epoch": 4.050632911392405, "grad_norm": 6.505473613739014, "learning_rate": 1.34508807027931e-06, "loss": 0.8553643226623535, "step": 1920 }, { "epoch": 4.0548523206751055, "grad_norm": 2.442864418029785, "learning_rate": 1.341405564990481e-06, "loss": 0.9089441895484924, "step": 1922 }, { "epoch": 4.059071729957806, "grad_norm": 3.782691717147827, "learning_rate": 1.3377277059074428e-06, "loss": 0.6086368560791016, "step": 1924 }, { "epoch": 4.063291139240507, "grad_norm": 7.667325019836426, "learning_rate": 1.3340545129325956e-06, "loss": 0.5529667139053345, "step": 1926 }, { "epoch": 4.067510548523207, "grad_norm": 4.649930953979492, "learning_rate": 1.330386005943089e-06, "loss": 0.7499093413352966, "step": 1928 }, { "epoch": 4.071729957805907, "grad_norm": 6.8586602210998535, "learning_rate": 1.3267222047907167e-06, "loss": 0.2909429967403412, "step": 1930 }, { "epoch": 4.075949367088608, "grad_norm": 5.850220680236816, "learning_rate": 1.323063129301806e-06, "loss": 0.5432990789413452, "step": 1932 }, { "epoch": 4.080168776371308, "grad_norm": 5.898839473724365, "learning_rate": 1.3194087992771097e-06, "loss": 0.6550246477127075, "step": 1934 }, { "epoch": 4.084388185654008, "grad_norm": 3.0061066150665283, "learning_rate": 1.3157592344917036e-06, "loss": 0.7705998420715332, "step": 1936 }, { "epoch": 4.0886075949367084, "grad_norm": 2.5635762214660645, "learning_rate": 1.3121144546948766e-06, "loss": 0.44453972578048706, "step": 1938 }, { "epoch": 4.0928270042194095, "grad_norm": 4.0387773513793945, "learning_rate": 1.3084744796100229e-06, "loss": 0.5306001901626587, "step": 1940 }, { "epoch": 4.09704641350211, "grad_norm": 4.215574264526367, "learning_rate": 1.3048393289345369e-06, "loss": 0.5609068870544434, "step": 1942 }, { "epoch": 4.10126582278481, "grad_norm": 2.5985476970672607, "learning_rate": 1.3012090223397066e-06, "loss": 0.503987193107605, "step": 1944 }, { "epoch": 4.105485232067511, "grad_norm": 2.6729464530944824, "learning_rate": 1.2975835794706063e-06, "loss": 0.8981311321258545, "step": 1946 }, { "epoch": 4.109704641350211, "grad_norm": 8.088824272155762, "learning_rate": 1.2939630199459914e-06, "loss": 0.502710223197937, "step": 1948 }, { "epoch": 4.113924050632911, "grad_norm": 0.9990053772926331, "learning_rate": 1.2903473633581894e-06, "loss": 0.5058774948120117, "step": 1950 }, { "epoch": 4.118143459915612, "grad_norm": 2.116455554962158, "learning_rate": 1.2867366292729984e-06, "loss": 0.8362418413162231, "step": 1952 }, { "epoch": 4.122362869198312, "grad_norm": 4.284731388092041, "learning_rate": 1.283130837229578e-06, "loss": 0.9526023864746094, "step": 1954 }, { "epoch": 4.1265822784810124, "grad_norm": 27.23639678955078, "learning_rate": 1.2795300067403436e-06, "loss": 0.16982686519622803, "step": 1956 }, { "epoch": 4.1308016877637135, "grad_norm": 9.439923286437988, "learning_rate": 1.275934157290863e-06, "loss": 0.844666600227356, "step": 1958 }, { "epoch": 4.135021097046414, "grad_norm": 52.91316604614258, "learning_rate": 1.2723433083397486e-06, "loss": 0.8215901255607605, "step": 1960 }, { "epoch": 4.139240506329114, "grad_norm": 12.898977279663086, "learning_rate": 1.2687574793185535e-06, "loss": 0.5214605331420898, "step": 1962 }, { "epoch": 4.143459915611814, "grad_norm": 1.9493759870529175, "learning_rate": 1.2651766896316653e-06, "loss": 0.8226008415222168, "step": 1964 }, { "epoch": 4.147679324894515, "grad_norm": 0.4556528925895691, "learning_rate": 1.2616009586562021e-06, "loss": 0.43690699338912964, "step": 1966 }, { "epoch": 4.151898734177215, "grad_norm": 3.7246546745300293, "learning_rate": 1.2580303057419079e-06, "loss": 0.871078610420227, "step": 1968 }, { "epoch": 4.156118143459915, "grad_norm": 3.5394413471221924, "learning_rate": 1.2544647502110464e-06, "loss": 0.9380326271057129, "step": 1970 }, { "epoch": 4.160337552742616, "grad_norm": 4.74537467956543, "learning_rate": 1.2509043113582969e-06, "loss": 1.0427074432373047, "step": 1972 }, { "epoch": 4.1645569620253164, "grad_norm": 5.703405380249023, "learning_rate": 1.247349008450651e-06, "loss": 0.17169350385665894, "step": 1974 }, { "epoch": 4.168776371308017, "grad_norm": 8.463484764099121, "learning_rate": 1.243798860727308e-06, "loss": 0.5819951891899109, "step": 1976 }, { "epoch": 4.172995780590718, "grad_norm": 5.530209541320801, "learning_rate": 1.2402538873995701e-06, "loss": 0.40900328755378723, "step": 1978 }, { "epoch": 4.177215189873418, "grad_norm": 6.495384216308594, "learning_rate": 1.236714107650737e-06, "loss": 0.42087459564208984, "step": 1980 }, { "epoch": 4.181434599156118, "grad_norm": 3.931180953979492, "learning_rate": 1.233179540636006e-06, "loss": 0.7898563742637634, "step": 1982 }, { "epoch": 4.185654008438819, "grad_norm": 5.3524322509765625, "learning_rate": 1.2296502054823655e-06, "loss": 0.335269570350647, "step": 1984 }, { "epoch": 4.189873417721519, "grad_norm": 6.85384464263916, "learning_rate": 1.226126121288492e-06, "loss": 0.220280259847641, "step": 1986 }, { "epoch": 4.194092827004219, "grad_norm": 17.49827003479004, "learning_rate": 1.222607307124647e-06, "loss": 0.5092884302139282, "step": 1988 }, { "epoch": 4.198312236286919, "grad_norm": 5.663785934448242, "learning_rate": 1.2190937820325733e-06, "loss": 0.4246003031730652, "step": 1990 }, { "epoch": 4.2025316455696204, "grad_norm": 4.241413116455078, "learning_rate": 1.215585565025394e-06, "loss": 0.8379718065261841, "step": 1992 }, { "epoch": 4.206751054852321, "grad_norm": 6.015312194824219, "learning_rate": 1.2120826750875059e-06, "loss": 0.5074017643928528, "step": 1994 }, { "epoch": 4.210970464135021, "grad_norm": 3.0558958053588867, "learning_rate": 1.2085851311744794e-06, "loss": 0.8118472099304199, "step": 1996 }, { "epoch": 4.215189873417722, "grad_norm": 6.353532314300537, "learning_rate": 1.205092952212956e-06, "loss": 1.135847568511963, "step": 1998 }, { "epoch": 4.219409282700422, "grad_norm": 2.435732126235962, "learning_rate": 1.201606157100544e-06, "loss": 0.9003854990005493, "step": 2000 }, { "epoch": 4.223628691983122, "grad_norm": 8.28079891204834, "learning_rate": 1.1981247647057202e-06, "loss": 0.6943663358688354, "step": 2002 }, { "epoch": 4.227848101265823, "grad_norm": 7.8127264976501465, "learning_rate": 1.1946487938677226e-06, "loss": 0.16587281227111816, "step": 2004 }, { "epoch": 4.232067510548523, "grad_norm": 1.957531213760376, "learning_rate": 1.1911782633964518e-06, "loss": 0.9451367855072021, "step": 2006 }, { "epoch": 4.236286919831223, "grad_norm": 2.2095224857330322, "learning_rate": 1.1877131920723674e-06, "loss": 0.4541362524032593, "step": 2008 }, { "epoch": 4.2405063291139244, "grad_norm": 2.0317702293395996, "learning_rate": 1.1842535986463885e-06, "loss": 0.9444383382797241, "step": 2010 }, { "epoch": 4.244725738396625, "grad_norm": 8.539976119995117, "learning_rate": 1.180799501839791e-06, "loss": 0.19654181599617004, "step": 2012 }, { "epoch": 4.248945147679325, "grad_norm": 3.8399620056152344, "learning_rate": 1.1773509203441052e-06, "loss": 0.5152616500854492, "step": 2014 }, { "epoch": 4.253164556962025, "grad_norm": 2.4427969455718994, "learning_rate": 1.1739078728210175e-06, "loss": 0.89030921459198, "step": 2016 }, { "epoch": 4.257383966244726, "grad_norm": 3.564229726791382, "learning_rate": 1.170470377902266e-06, "loss": 0.9515880346298218, "step": 2018 }, { "epoch": 4.261603375527426, "grad_norm": 17.13824462890625, "learning_rate": 1.167038454189543e-06, "loss": 0.0852670818567276, "step": 2020 }, { "epoch": 4.265822784810126, "grad_norm": 1.1132172346115112, "learning_rate": 1.163612120254392e-06, "loss": 0.3325420618057251, "step": 2022 }, { "epoch": 4.270042194092827, "grad_norm": 2.2386295795440674, "learning_rate": 1.1601913946381068e-06, "loss": 0.8490246534347534, "step": 2024 }, { "epoch": 4.274261603375527, "grad_norm": 4.5493927001953125, "learning_rate": 1.1567762958516336e-06, "loss": 0.30698156356811523, "step": 2026 }, { "epoch": 4.2784810126582276, "grad_norm": 2.7599310874938965, "learning_rate": 1.1533668423754703e-06, "loss": 0.3949320912361145, "step": 2028 }, { "epoch": 4.282700421940929, "grad_norm": 0.7302427291870117, "learning_rate": 1.1499630526595632e-06, "loss": 0.4672113060951233, "step": 2030 }, { "epoch": 4.286919831223629, "grad_norm": 6.222799777984619, "learning_rate": 1.1465649451232121e-06, "loss": 1.0849535465240479, "step": 2032 }, { "epoch": 4.291139240506329, "grad_norm": 2.6900506019592285, "learning_rate": 1.1431725381549675e-06, "loss": 0.12843787670135498, "step": 2034 }, { "epoch": 4.29535864978903, "grad_norm": 7.403899669647217, "learning_rate": 1.1397858501125304e-06, "loss": 0.3389854431152344, "step": 2036 }, { "epoch": 4.29957805907173, "grad_norm": 5.636825084686279, "learning_rate": 1.1364048993226566e-06, "loss": 0.6659049391746521, "step": 2038 }, { "epoch": 4.30379746835443, "grad_norm": 10.65471363067627, "learning_rate": 1.1330297040810534e-06, "loss": 1.0959115028381348, "step": 2040 }, { "epoch": 4.308016877637131, "grad_norm": 6.164623737335205, "learning_rate": 1.129660282652284e-06, "loss": 0.8495713472366333, "step": 2042 }, { "epoch": 4.312236286919831, "grad_norm": 21.337953567504883, "learning_rate": 1.1262966532696658e-06, "loss": 0.4679602384567261, "step": 2044 }, { "epoch": 4.3164556962025316, "grad_norm": 3.735825300216675, "learning_rate": 1.1229388341351739e-06, "loss": 1.0504865646362305, "step": 2046 }, { "epoch": 4.320675105485232, "grad_norm": 2.874302387237549, "learning_rate": 1.1195868434193413e-06, "loss": 0.9641183614730835, "step": 2048 }, { "epoch": 4.324894514767933, "grad_norm": 2.697021722793579, "learning_rate": 1.1162406992611618e-06, "loss": 0.24490822851657867, "step": 2050 }, { "epoch": 4.329113924050633, "grad_norm": 8.153789520263672, "learning_rate": 1.1129004197679907e-06, "loss": 0.43832969665527344, "step": 2052 }, { "epoch": 4.333333333333333, "grad_norm": 2.9143199920654297, "learning_rate": 1.1095660230154457e-06, "loss": 0.7494316101074219, "step": 2054 }, { "epoch": 4.337552742616034, "grad_norm": 22.089580535888672, "learning_rate": 1.1062375270473129e-06, "loss": 0.4954107403755188, "step": 2056 }, { "epoch": 4.341772151898734, "grad_norm": 5.983814716339111, "learning_rate": 1.1029149498754458e-06, "loss": 0.39451485872268677, "step": 2058 }, { "epoch": 4.345991561181434, "grad_norm": 3.4319894313812256, "learning_rate": 1.0995983094796688e-06, "loss": 0.816379189491272, "step": 2060 }, { "epoch": 4.350210970464135, "grad_norm": 2.19193172454834, "learning_rate": 1.0962876238076799e-06, "loss": 0.9197038412094116, "step": 2062 }, { "epoch": 4.3544303797468356, "grad_norm": 8.006820678710938, "learning_rate": 1.0929829107749547e-06, "loss": 0.8574424982070923, "step": 2064 }, { "epoch": 4.358649789029536, "grad_norm": 3.324010133743286, "learning_rate": 1.0896841882646471e-06, "loss": 0.9916654825210571, "step": 2066 }, { "epoch": 4.362869198312236, "grad_norm": 3.6000797748565674, "learning_rate": 1.0863914741274944e-06, "loss": 0.4570949077606201, "step": 2068 }, { "epoch": 4.367088607594937, "grad_norm": 1.9650532007217407, "learning_rate": 1.0831047861817193e-06, "loss": 0.9559861421585083, "step": 2070 }, { "epoch": 4.371308016877637, "grad_norm": 2.6903204917907715, "learning_rate": 1.079824142212936e-06, "loss": 0.9988477230072021, "step": 2072 }, { "epoch": 4.375527426160337, "grad_norm": 3.71533203125, "learning_rate": 1.07654955997405e-06, "loss": 0.8142194747924805, "step": 2074 }, { "epoch": 4.379746835443038, "grad_norm": 6.335799694061279, "learning_rate": 1.0732810571851677e-06, "loss": 0.6120598316192627, "step": 2076 }, { "epoch": 4.383966244725738, "grad_norm": 2.123081684112549, "learning_rate": 1.0700186515334939e-06, "loss": 0.4905482232570648, "step": 2078 }, { "epoch": 4.3881856540084385, "grad_norm": 14.160784721374512, "learning_rate": 1.0667623606732408e-06, "loss": 0.9914622902870178, "step": 2080 }, { "epoch": 4.3924050632911396, "grad_norm": 1.714659571647644, "learning_rate": 1.0635122022255298e-06, "loss": 0.6109655499458313, "step": 2082 }, { "epoch": 4.39662447257384, "grad_norm": 4.592569351196289, "learning_rate": 1.0602681937782985e-06, "loss": 0.7499299049377441, "step": 2084 }, { "epoch": 4.40084388185654, "grad_norm": 3.3476827144622803, "learning_rate": 1.0570303528862044e-06, "loss": 0.9557301998138428, "step": 2086 }, { "epoch": 4.405063291139241, "grad_norm": 3.5681612491607666, "learning_rate": 1.0537986970705284e-06, "loss": 0.9052315354347229, "step": 2088 }, { "epoch": 4.409282700421941, "grad_norm": 4.310785293579102, "learning_rate": 1.0505732438190832e-06, "loss": 0.6285467147827148, "step": 2090 }, { "epoch": 4.413502109704641, "grad_norm": 8.120601654052734, "learning_rate": 1.0473540105861158e-06, "loss": 0.8778185844421387, "step": 2092 }, { "epoch": 4.417721518987342, "grad_norm": 2.5801761150360107, "learning_rate": 1.0441410147922142e-06, "loss": 0.8876914381980896, "step": 2094 }, { "epoch": 4.421940928270042, "grad_norm": 22.094350814819336, "learning_rate": 1.0409342738242145e-06, "loss": 0.5706854462623596, "step": 2096 }, { "epoch": 4.4261603375527425, "grad_norm": 2.56339955329895, "learning_rate": 1.0377338050351023e-06, "loss": 0.8818637132644653, "step": 2098 }, { "epoch": 4.430379746835443, "grad_norm": 15.67695140838623, "learning_rate": 1.0345396257439248e-06, "loss": 0.6227443814277649, "step": 2100 }, { "epoch": 4.434599156118144, "grad_norm": 2.2217981815338135, "learning_rate": 1.0313517532356928e-06, "loss": 0.7605068683624268, "step": 2102 }, { "epoch": 4.438818565400844, "grad_norm": 2.441141128540039, "learning_rate": 1.0281702047612885e-06, "loss": 0.7203768491744995, "step": 2104 }, { "epoch": 4.443037974683544, "grad_norm": 4.200733184814453, "learning_rate": 1.024994997537373e-06, "loss": 0.8852105736732483, "step": 2106 }, { "epoch": 4.447257383966245, "grad_norm": 9.651650428771973, "learning_rate": 1.0218261487462916e-06, "loss": 0.5270538330078125, "step": 2108 }, { "epoch": 4.451476793248945, "grad_norm": 2.550156593322754, "learning_rate": 1.0186636755359814e-06, "loss": 0.8197285532951355, "step": 2110 }, { "epoch": 4.455696202531645, "grad_norm": 2.093350887298584, "learning_rate": 1.0155075950198794e-06, "loss": 0.8607369065284729, "step": 2112 }, { "epoch": 4.459915611814346, "grad_norm": 1.2756742238998413, "learning_rate": 1.0123579242768282e-06, "loss": 0.6345518827438354, "step": 2114 }, { "epoch": 4.4641350210970465, "grad_norm": 3.17000675201416, "learning_rate": 1.0092146803509854e-06, "loss": 0.48864442110061646, "step": 2116 }, { "epoch": 4.468354430379747, "grad_norm": 2.0671489238739014, "learning_rate": 1.006077880251729e-06, "loss": 0.862575888633728, "step": 2118 }, { "epoch": 4.472573839662447, "grad_norm": 2.851736307144165, "learning_rate": 1.0029475409535692e-06, "loss": 0.4032348692417145, "step": 2120 }, { "epoch": 4.476793248945148, "grad_norm": 4.477703094482422, "learning_rate": 9.998236793960514e-07, "loss": 0.36202433705329895, "step": 2122 }, { "epoch": 4.481012658227848, "grad_norm": 8.475764274597168, "learning_rate": 9.967063124836695e-07, "loss": 0.21301576495170593, "step": 2124 }, { "epoch": 4.485232067510548, "grad_norm": 3.3703811168670654, "learning_rate": 9.935954570857717e-07, "loss": 0.39527398347854614, "step": 2126 }, { "epoch": 4.489451476793249, "grad_norm": 2.7759153842926025, "learning_rate": 9.90491130036468e-07, "loss": 0.6493411064147949, "step": 2128 }, { "epoch": 4.493670886075949, "grad_norm": 9.41816520690918, "learning_rate": 9.873933481345432e-07, "loss": 0.484800785779953, "step": 2130 }, { "epoch": 4.4978902953586495, "grad_norm": 2.5206875801086426, "learning_rate": 9.843021281433624e-07, "loss": 1.0602920055389404, "step": 2132 }, { "epoch": 4.5021097046413505, "grad_norm": 4.042180061340332, "learning_rate": 9.81217486790782e-07, "loss": 0.7310470342636108, "step": 2134 }, { "epoch": 4.506329113924051, "grad_norm": 4.009156703948975, "learning_rate": 9.781394407690582e-07, "loss": 0.12923167645931244, "step": 2136 }, { "epoch": 4.510548523206751, "grad_norm": 3.74722957611084, "learning_rate": 9.750680067347574e-07, "loss": 0.3252981901168823, "step": 2138 }, { "epoch": 4.514767932489452, "grad_norm": 39.80788040161133, "learning_rate": 9.720032013086665e-07, "loss": 0.25149163603782654, "step": 2140 }, { "epoch": 4.518987341772152, "grad_norm": 1.6690428256988525, "learning_rate": 9.689450410757014e-07, "loss": 0.6628930568695068, "step": 2142 }, { "epoch": 4.523206751054852, "grad_norm": 12.622380256652832, "learning_rate": 9.658935425848178e-07, "loss": 0.17167873680591583, "step": 2144 }, { "epoch": 4.527426160337553, "grad_norm": 3.543349504470825, "learning_rate": 9.628487223489232e-07, "loss": 0.5717638731002808, "step": 2146 }, { "epoch": 4.531645569620253, "grad_norm": 3.6029629707336426, "learning_rate": 9.598105968447845e-07, "loss": 0.5759022831916809, "step": 2148 }, { "epoch": 4.5358649789029535, "grad_norm": 0.09577035158872604, "learning_rate": 9.567791825129436e-07, "loss": 0.45371395349502563, "step": 2150 }, { "epoch": 4.540084388185654, "grad_norm": 2.7558352947235107, "learning_rate": 9.537544957576232e-07, "loss": 0.5172098875045776, "step": 2152 }, { "epoch": 4.544303797468355, "grad_norm": 6.613936424255371, "learning_rate": 9.507365529466414e-07, "loss": 1.0241069793701172, "step": 2154 }, { "epoch": 4.548523206751055, "grad_norm": 3.1837728023529053, "learning_rate": 9.477253704113204e-07, "loss": 0.9064798355102539, "step": 2156 }, { "epoch": 4.552742616033755, "grad_norm": 2.8910419940948486, "learning_rate": 9.447209644464014e-07, "loss": 0.8971297740936279, "step": 2158 }, { "epoch": 4.556962025316456, "grad_norm": 3.6541380882263184, "learning_rate": 9.417233513099545e-07, "loss": 0.7274525165557861, "step": 2160 }, { "epoch": 4.561181434599156, "grad_norm": 2.7287378311157227, "learning_rate": 9.387325472232908e-07, "loss": 0.7473336458206177, "step": 2162 }, { "epoch": 4.565400843881856, "grad_norm": 5.9793500900268555, "learning_rate": 9.357485683708752e-07, "loss": 0.6158387660980225, "step": 2164 }, { "epoch": 4.569620253164557, "grad_norm": 2.9492175579071045, "learning_rate": 9.327714309002378e-07, "loss": 0.8946245312690735, "step": 2166 }, { "epoch": 4.5738396624472575, "grad_norm": 2.516920566558838, "learning_rate": 9.298011509218878e-07, "loss": 0.7441626787185669, "step": 2168 }, { "epoch": 4.578059071729958, "grad_norm": 9.32639217376709, "learning_rate": 9.268377445092257e-07, "loss": 0.18001851439476013, "step": 2170 }, { "epoch": 4.582278481012658, "grad_norm": 2.5811736583709717, "learning_rate": 9.238812276984563e-07, "loss": 0.6168837547302246, "step": 2172 }, { "epoch": 4.586497890295359, "grad_norm": 79.09625244140625, "learning_rate": 9.209316164885007e-07, "loss": 0.7156883478164673, "step": 2174 }, { "epoch": 4.590717299578059, "grad_norm": 3.27329683303833, "learning_rate": 9.179889268409126e-07, "loss": 0.9324935078620911, "step": 2176 }, { "epoch": 4.594936708860759, "grad_norm": 10.160416603088379, "learning_rate": 9.150531746797897e-07, "loss": 0.6166714429855347, "step": 2178 }, { "epoch": 4.59915611814346, "grad_norm": 3.8143131732940674, "learning_rate": 9.121243758916885e-07, "loss": 0.5362197756767273, "step": 2180 }, { "epoch": 4.60337552742616, "grad_norm": 2.629331111907959, "learning_rate": 9.092025463255371e-07, "loss": 0.9286479949951172, "step": 2182 }, { "epoch": 4.6075949367088604, "grad_norm": 3.1048662662506104, "learning_rate": 9.062877017925509e-07, "loss": 0.23398178815841675, "step": 2184 }, { "epoch": 4.6118143459915615, "grad_norm": 2.799243211746216, "learning_rate": 9.033798580661465e-07, "loss": 0.9572643041610718, "step": 2186 }, { "epoch": 4.616033755274262, "grad_norm": 1.9954626560211182, "learning_rate": 9.00479030881856e-07, "loss": 0.5257174372673035, "step": 2188 }, { "epoch": 4.620253164556962, "grad_norm": 2.1326327323913574, "learning_rate": 8.975852359372421e-07, "loss": 0.907311737537384, "step": 2190 }, { "epoch": 4.624472573839663, "grad_norm": 2.5456783771514893, "learning_rate": 8.946984888918133e-07, "loss": 0.48332294821739197, "step": 2192 }, { "epoch": 4.628691983122363, "grad_norm": 4.807265758514404, "learning_rate": 8.918188053669391e-07, "loss": 0.8633521199226379, "step": 2194 }, { "epoch": 4.632911392405063, "grad_norm": 3.1572585105895996, "learning_rate": 8.889462009457651e-07, "loss": 0.4701206088066101, "step": 2196 }, { "epoch": 4.637130801687764, "grad_norm": 2.305100440979004, "learning_rate": 8.860806911731295e-07, "loss": 0.4662626385688782, "step": 2198 }, { "epoch": 4.641350210970464, "grad_norm": 2.7214598655700684, "learning_rate": 8.832222915554783e-07, "loss": 0.8649228811264038, "step": 2200 }, { "epoch": 4.6455696202531644, "grad_norm": 4.30544900894165, "learning_rate": 8.803710175607808e-07, "loss": 0.8740881085395813, "step": 2202 }, { "epoch": 4.649789029535865, "grad_norm": 1.4659613370895386, "learning_rate": 8.775268846184471e-07, "loss": 0.5230797529220581, "step": 2204 }, { "epoch": 4.654008438818566, "grad_norm": 3.350233316421509, "learning_rate": 8.74689908119245e-07, "loss": 0.5945952534675598, "step": 2206 }, { "epoch": 4.658227848101266, "grad_norm": 1.1876442432403564, "learning_rate": 8.718601034152144e-07, "loss": 0.5520751476287842, "step": 2208 }, { "epoch": 4.662447257383966, "grad_norm": 2.593919277191162, "learning_rate": 8.690374858195868e-07, "loss": 0.8659783601760864, "step": 2210 }, { "epoch": 4.666666666666667, "grad_norm": 2.051456928253174, "learning_rate": 8.662220706067007e-07, "loss": 0.7441516518592834, "step": 2212 }, { "epoch": 4.670886075949367, "grad_norm": 9.695352554321289, "learning_rate": 8.634138730119199e-07, "loss": 0.6046957969665527, "step": 2214 }, { "epoch": 4.675105485232067, "grad_norm": 3.217013359069824, "learning_rate": 8.606129082315514e-07, "loss": 0.8700679540634155, "step": 2216 }, { "epoch": 4.679324894514768, "grad_norm": 2.5168628692626953, "learning_rate": 8.578191914227602e-07, "loss": 0.5581780076026917, "step": 2218 }, { "epoch": 4.6835443037974684, "grad_norm": 3.7480080127716064, "learning_rate": 8.550327377034915e-07, "loss": 0.7154510617256165, "step": 2220 }, { "epoch": 4.687763713080169, "grad_norm": 2.187389373779297, "learning_rate": 8.522535621523864e-07, "loss": 0.17346470057964325, "step": 2222 }, { "epoch": 4.691983122362869, "grad_norm": 2.2572085857391357, "learning_rate": 8.494816798087014e-07, "loss": 0.8721593618392944, "step": 2224 }, { "epoch": 4.69620253164557, "grad_norm": 8.44543170928955, "learning_rate": 8.467171056722262e-07, "loss": 0.5838876962661743, "step": 2226 }, { "epoch": 4.70042194092827, "grad_norm": 10.777728080749512, "learning_rate": 8.439598547032021e-07, "loss": 0.15432819724082947, "step": 2228 }, { "epoch": 4.70464135021097, "grad_norm": 2.5708587169647217, "learning_rate": 8.412099418222429e-07, "loss": 0.8907821178436279, "step": 2230 }, { "epoch": 4.708860759493671, "grad_norm": 1.8050702810287476, "learning_rate": 8.384673819102515e-07, "loss": 0.8190984725952148, "step": 2232 }, { "epoch": 4.713080168776371, "grad_norm": 13.505372047424316, "learning_rate": 8.357321898083417e-07, "loss": 0.5908716917037964, "step": 2234 }, { "epoch": 4.717299578059071, "grad_norm": 4.608894348144531, "learning_rate": 8.330043803177576e-07, "loss": 0.43208563327789307, "step": 2236 }, { "epoch": 4.7215189873417724, "grad_norm": 6.133680820465088, "learning_rate": 8.302839681997924e-07, "loss": 0.7111215591430664, "step": 2238 }, { "epoch": 4.725738396624473, "grad_norm": 1.9396830797195435, "learning_rate": 8.275709681757091e-07, "loss": 0.8701183795928955, "step": 2240 }, { "epoch": 4.729957805907173, "grad_norm": 2.7942826747894287, "learning_rate": 8.248653949266609e-07, "loss": 0.9508087635040283, "step": 2242 }, { "epoch": 4.734177215189874, "grad_norm": 1.77509343624115, "learning_rate": 8.221672630936114e-07, "loss": 0.14094747602939606, "step": 2244 }, { "epoch": 4.738396624472574, "grad_norm": 9.949209213256836, "learning_rate": 8.194765872772569e-07, "loss": 0.7157829999923706, "step": 2246 }, { "epoch": 4.742616033755274, "grad_norm": 3.089747667312622, "learning_rate": 8.167933820379438e-07, "loss": 0.9330974817276001, "step": 2248 }, { "epoch": 4.746835443037975, "grad_norm": 8.302231788635254, "learning_rate": 8.141176618955941e-07, "loss": 0.18974465131759644, "step": 2250 }, { "epoch": 4.751054852320675, "grad_norm": 17.27684211730957, "learning_rate": 8.114494413296242e-07, "loss": 0.6534916162490845, "step": 2252 }, { "epoch": 4.755274261603375, "grad_norm": 13.299623489379883, "learning_rate": 8.087887347788675e-07, "loss": 0.5243600606918335, "step": 2254 }, { "epoch": 4.759493670886076, "grad_norm": 1.3798922300338745, "learning_rate": 8.061355566414959e-07, "loss": 0.46594005823135376, "step": 2256 }, { "epoch": 4.763713080168777, "grad_norm": 6.182672023773193, "learning_rate": 8.034899212749415e-07, "loss": 0.22735753655433655, "step": 2258 }, { "epoch": 4.767932489451477, "grad_norm": 4.455085277557373, "learning_rate": 8.0085184299582e-07, "loss": 0.22588486969470978, "step": 2260 }, { "epoch": 4.772151898734177, "grad_norm": 2.398963212966919, "learning_rate": 7.982213360798524e-07, "loss": 0.5842011570930481, "step": 2262 }, { "epoch": 4.776371308016878, "grad_norm": 3.986417055130005, "learning_rate": 7.955984147617878e-07, "loss": 0.8581550121307373, "step": 2264 }, { "epoch": 4.780590717299578, "grad_norm": 2.5186336040496826, "learning_rate": 7.929830932353267e-07, "loss": 0.9678604602813721, "step": 2266 }, { "epoch": 4.784810126582278, "grad_norm": 17.917510986328125, "learning_rate": 7.903753856530439e-07, "loss": 0.776985764503479, "step": 2268 }, { "epoch": 4.789029535864979, "grad_norm": 4.219602108001709, "learning_rate": 7.877753061263124e-07, "loss": 0.49661773443222046, "step": 2270 }, { "epoch": 4.793248945147679, "grad_norm": 2.524501323699951, "learning_rate": 7.851828687252258e-07, "loss": 0.9214498996734619, "step": 2272 }, { "epoch": 4.7974683544303796, "grad_norm": 24.021936416625977, "learning_rate": 7.825980874785245e-07, "loss": 0.2861242890357971, "step": 2274 }, { "epoch": 4.80168776371308, "grad_norm": 1.766944169998169, "learning_rate": 7.800209763735166e-07, "loss": 0.2682395279407501, "step": 2276 }, { "epoch": 4.805907172995781, "grad_norm": 3.6635119915008545, "learning_rate": 7.774515493560047e-07, "loss": 0.5065731406211853, "step": 2278 }, { "epoch": 4.810126582278481, "grad_norm": 0.9169036746025085, "learning_rate": 7.748898203302101e-07, "loss": 0.4213840365409851, "step": 2280 }, { "epoch": 4.814345991561181, "grad_norm": 2.111497402191162, "learning_rate": 7.723358031586968e-07, "loss": 0.8279630541801453, "step": 2282 }, { "epoch": 4.818565400843882, "grad_norm": 3.6885154247283936, "learning_rate": 7.697895116622962e-07, "loss": 0.721439003944397, "step": 2284 }, { "epoch": 4.822784810126582, "grad_norm": 4.22064733505249, "learning_rate": 7.672509596200339e-07, "loss": 0.8761791586875916, "step": 2286 }, { "epoch": 4.827004219409282, "grad_norm": 2.2504615783691406, "learning_rate": 7.647201607690535e-07, "loss": 0.43095457553863525, "step": 2288 }, { "epoch": 4.831223628691983, "grad_norm": 2.19746470451355, "learning_rate": 7.621971288045436e-07, "loss": 0.7216506004333496, "step": 2290 }, { "epoch": 4.8354430379746836, "grad_norm": 2.588840961456299, "learning_rate": 7.596818773796616e-07, "loss": 0.8444218039512634, "step": 2292 }, { "epoch": 4.839662447257384, "grad_norm": 2.1437089443206787, "learning_rate": 7.571744201054619e-07, "loss": 0.9132941961288452, "step": 2294 }, { "epoch": 4.843881856540085, "grad_norm": 2.2970213890075684, "learning_rate": 7.54674770550823e-07, "loss": 0.8675155639648438, "step": 2296 }, { "epoch": 4.848101265822785, "grad_norm": 2.469003438949585, "learning_rate": 7.521829422423707e-07, "loss": 0.8924763202667236, "step": 2298 }, { "epoch": 4.852320675105485, "grad_norm": 5.6491169929504395, "learning_rate": 7.496989486644074e-07, "loss": 1.2289131879806519, "step": 2300 }, { "epoch": 4.856540084388186, "grad_norm": 0.6651078462600708, "learning_rate": 7.472228032588392e-07, "loss": 0.5435088872909546, "step": 2302 }, { "epoch": 4.860759493670886, "grad_norm": 1.8895771503448486, "learning_rate": 7.447545194251021e-07, "loss": 0.4832010865211487, "step": 2304 }, { "epoch": 4.864978902953586, "grad_norm": 4.667498588562012, "learning_rate": 7.422941105200888e-07, "loss": 0.7593515515327454, "step": 2306 }, { "epoch": 4.869198312236287, "grad_norm": 2.6413588523864746, "learning_rate": 7.398415898580795e-07, "loss": 0.5025730729103088, "step": 2308 }, { "epoch": 4.8734177215189876, "grad_norm": 2.2257080078125, "learning_rate": 7.373969707106667e-07, "loss": 0.5178145170211792, "step": 2310 }, { "epoch": 4.877637130801688, "grad_norm": 4.63566255569458, "learning_rate": 7.349602663066848e-07, "loss": 0.8785790801048279, "step": 2312 }, { "epoch": 4.881856540084388, "grad_norm": 11.207052230834961, "learning_rate": 7.325314898321387e-07, "loss": 0.6604704260826111, "step": 2314 }, { "epoch": 4.886075949367089, "grad_norm": 2.7186286449432373, "learning_rate": 7.30110654430131e-07, "loss": 0.8655844330787659, "step": 2316 }, { "epoch": 4.890295358649789, "grad_norm": 9.436038970947266, "learning_rate": 7.276977732007934e-07, "loss": 0.6372033357620239, "step": 2318 }, { "epoch": 4.894514767932489, "grad_norm": 9.619095802307129, "learning_rate": 7.252928592012131e-07, "loss": 0.5399308204650879, "step": 2320 }, { "epoch": 4.89873417721519, "grad_norm": 3.560415267944336, "learning_rate": 7.228959254453634e-07, "loss": 0.5512664318084717, "step": 2322 }, { "epoch": 4.90295358649789, "grad_norm": 2.261822462081909, "learning_rate": 7.20506984904034e-07, "loss": 0.965155839920044, "step": 2324 }, { "epoch": 4.9071729957805905, "grad_norm": 5.737890243530273, "learning_rate": 7.181260505047593e-07, "loss": 0.5091350078582764, "step": 2326 }, { "epoch": 4.911392405063291, "grad_norm": 2.460875988006592, "learning_rate": 7.157531351317499e-07, "loss": 0.6960829496383667, "step": 2328 }, { "epoch": 4.915611814345992, "grad_norm": 2.570103883743286, "learning_rate": 7.133882516258215e-07, "loss": 1.0476431846618652, "step": 2330 }, { "epoch": 4.919831223628692, "grad_norm": 0.8946544528007507, "learning_rate": 7.110314127843266e-07, "loss": 0.5339324474334717, "step": 2332 }, { "epoch": 4.924050632911392, "grad_norm": 6.283257007598877, "learning_rate": 7.086826313610843e-07, "loss": 0.6191664934158325, "step": 2334 }, { "epoch": 4.928270042194093, "grad_norm": 2.9751780033111572, "learning_rate": 7.063419200663121e-07, "loss": 0.9971131086349487, "step": 2336 }, { "epoch": 4.932489451476793, "grad_norm": 30.684070587158203, "learning_rate": 7.040092915665563e-07, "loss": 0.5671279430389404, "step": 2338 }, { "epoch": 4.936708860759493, "grad_norm": 3.855710506439209, "learning_rate": 7.016847584846243e-07, "loss": 0.5699124336242676, "step": 2340 }, { "epoch": 4.940928270042194, "grad_norm": 5.847226142883301, "learning_rate": 6.993683333995155e-07, "loss": 0.8012879490852356, "step": 2342 }, { "epoch": 4.9451476793248945, "grad_norm": 6.018973350524902, "learning_rate": 6.970600288463544e-07, "loss": 0.5165205597877502, "step": 2344 }, { "epoch": 4.949367088607595, "grad_norm": 2.1352529525756836, "learning_rate": 6.947598573163207e-07, "loss": 0.9921296834945679, "step": 2346 }, { "epoch": 4.953586497890296, "grad_norm": 2.2737410068511963, "learning_rate": 6.924678312565846e-07, "loss": 0.5466551780700684, "step": 2348 }, { "epoch": 4.957805907172996, "grad_norm": 2.052476167678833, "learning_rate": 6.901839630702358e-07, "loss": 0.7028835415840149, "step": 2350 }, { "epoch": 4.962025316455696, "grad_norm": 5.379444599151611, "learning_rate": 6.879082651162198e-07, "loss": 0.4037717580795288, "step": 2352 }, { "epoch": 4.966244725738397, "grad_norm": 2.764251470565796, "learning_rate": 6.856407497092698e-07, "loss": 0.8569744825363159, "step": 2354 }, { "epoch": 4.970464135021097, "grad_norm": 11.355871200561523, "learning_rate": 6.833814291198395e-07, "loss": 0.5073586106300354, "step": 2356 }, { "epoch": 4.974683544303797, "grad_norm": 3.204270124435425, "learning_rate": 6.811303155740364e-07, "loss": 0.8562701344490051, "step": 2358 }, { "epoch": 4.978902953586498, "grad_norm": 0.8829920887947083, "learning_rate": 6.788874212535576e-07, "loss": 0.5263558626174927, "step": 2360 }, { "epoch": 4.9831223628691985, "grad_norm": 4.092447757720947, "learning_rate": 6.766527582956217e-07, "loss": 0.8253353238105774, "step": 2362 }, { "epoch": 4.987341772151899, "grad_norm": 5.725378513336182, "learning_rate": 6.744263387929044e-07, "loss": 1.0841920375823975, "step": 2364 }, { "epoch": 4.991561181434599, "grad_norm": 2.338805913925171, "learning_rate": 6.722081747934722e-07, "loss": 0.9890093803405762, "step": 2366 }, { "epoch": 4.9957805907173, "grad_norm": 6.7110137939453125, "learning_rate": 6.699982783007181e-07, "loss": 0.6056183576583862, "step": 2368 }, { "epoch": 5.0, "grad_norm": 8.881539344787598, "learning_rate": 6.677966612732969e-07, "loss": 0.19234615564346313, "step": 2370 }, { "epoch": 5.0042194092827, "grad_norm": 2.8525843620300293, "learning_rate": 6.656033356250588e-07, "loss": 0.5725005865097046, "step": 2372 }, { "epoch": 5.008438818565401, "grad_norm": 2.8133490085601807, "learning_rate": 6.634183132249862e-07, "loss": 0.930966317653656, "step": 2374 }, { "epoch": 5.012658227848101, "grad_norm": 6.63195276260376, "learning_rate": 6.612416058971295e-07, "loss": 0.6420108079910278, "step": 2376 }, { "epoch": 5.0168776371308015, "grad_norm": 3.1725778579711914, "learning_rate": 6.590732254205429e-07, "loss": 0.8284573554992676, "step": 2378 }, { "epoch": 5.0210970464135025, "grad_norm": 4.44754695892334, "learning_rate": 6.569131835292196e-07, "loss": 0.48461687564849854, "step": 2380 }, { "epoch": 5.025316455696203, "grad_norm": 3.432302474975586, "learning_rate": 6.547614919120305e-07, "loss": 0.7024222016334534, "step": 2382 }, { "epoch": 5.029535864978903, "grad_norm": 2.8031928539276123, "learning_rate": 6.526181622126594e-07, "loss": 0.6068092584609985, "step": 2384 }, { "epoch": 5.033755274261603, "grad_norm": 6.872043132781982, "learning_rate": 6.504832060295403e-07, "loss": 0.5841951370239258, "step": 2386 }, { "epoch": 5.037974683544304, "grad_norm": 2.907249927520752, "learning_rate": 6.483566349157945e-07, "loss": 0.6709692478179932, "step": 2388 }, { "epoch": 5.042194092827004, "grad_norm": 0.09023797512054443, "learning_rate": 6.462384603791684e-07, "loss": 0.5061817765235901, "step": 2390 }, { "epoch": 5.046413502109704, "grad_norm": 11.977813720703125, "learning_rate": 6.441286938819714e-07, "loss": 0.31504881381988525, "step": 2392 }, { "epoch": 5.050632911392405, "grad_norm": 5.734729766845703, "learning_rate": 6.420273468410131e-07, "loss": 0.48150938749313354, "step": 2394 }, { "epoch": 5.0548523206751055, "grad_norm": 1.924585223197937, "learning_rate": 6.399344306275419e-07, "loss": 0.3540734052658081, "step": 2396 }, { "epoch": 5.059071729957806, "grad_norm": 3.9746241569519043, "learning_rate": 6.378499565671839e-07, "loss": 0.8421119451522827, "step": 2398 }, { "epoch": 5.063291139240507, "grad_norm": 7.219764232635498, "learning_rate": 6.35773935939881e-07, "loss": 1.0435487031936646, "step": 2400 }, { "epoch": 5.067510548523207, "grad_norm": 5.04371976852417, "learning_rate": 6.337063799798305e-07, "loss": 0.9782629013061523, "step": 2402 }, { "epoch": 5.071729957805907, "grad_norm": 16.103092193603516, "learning_rate": 6.316472998754234e-07, "loss": 0.033330727368593216, "step": 2404 }, { "epoch": 5.075949367088608, "grad_norm": 4.6494526863098145, "learning_rate": 6.29596706769185e-07, "loss": 0.7846492528915405, "step": 2406 }, { "epoch": 5.080168776371308, "grad_norm": 2.9837498664855957, "learning_rate": 6.275546117577132e-07, "loss": 0.48354560136795044, "step": 2408 }, { "epoch": 5.084388185654008, "grad_norm": 5.440659999847412, "learning_rate": 6.255210258916199e-07, "loss": 0.5124998688697815, "step": 2410 }, { "epoch": 5.0886075949367084, "grad_norm": 2.850675106048584, "learning_rate": 6.234959601754703e-07, "loss": 0.7655423879623413, "step": 2412 }, { "epoch": 5.0928270042194095, "grad_norm": 2.1231069564819336, "learning_rate": 6.214794255677234e-07, "loss": 0.7977665662765503, "step": 2414 }, { "epoch": 5.09704641350211, "grad_norm": 17.62323760986328, "learning_rate": 6.194714329806732e-07, "loss": 0.34903600811958313, "step": 2416 }, { "epoch": 5.10126582278481, "grad_norm": 2.3470253944396973, "learning_rate": 6.174719932803891e-07, "loss": 0.5935072898864746, "step": 2418 }, { "epoch": 5.105485232067511, "grad_norm": 2.2750778198242188, "learning_rate": 6.154811172866576e-07, "loss": 1.007997751235962, "step": 2420 }, { "epoch": 5.109704641350211, "grad_norm": 2.6111321449279785, "learning_rate": 6.13498815772923e-07, "loss": 0.7840423583984375, "step": 2422 }, { "epoch": 5.113924050632911, "grad_norm": 1.3846306800842285, "learning_rate": 6.115250994662303e-07, "loss": 0.5133131742477417, "step": 2424 }, { "epoch": 5.118143459915612, "grad_norm": 2.471632480621338, "learning_rate": 6.095599790471655e-07, "loss": 0.5239850282669067, "step": 2426 }, { "epoch": 5.122362869198312, "grad_norm": 6.427463054656982, "learning_rate": 6.076034651497995e-07, "loss": 0.46869874000549316, "step": 2428 }, { "epoch": 5.1265822784810124, "grad_norm": 2.414717674255371, "learning_rate": 6.056555683616291e-07, "loss": 0.5103088617324829, "step": 2430 }, { "epoch": 5.1308016877637135, "grad_norm": 2.512148380279541, "learning_rate": 6.037162992235214e-07, "loss": 0.8223515152931213, "step": 2432 }, { "epoch": 5.135021097046414, "grad_norm": 2.9548940658569336, "learning_rate": 6.017856682296551e-07, "loss": 0.917111873626709, "step": 2434 }, { "epoch": 5.139240506329114, "grad_norm": 3.2818551063537598, "learning_rate": 5.998636858274642e-07, "loss": 0.4495956301689148, "step": 2436 }, { "epoch": 5.143459915611814, "grad_norm": 7.414487838745117, "learning_rate": 5.97950362417582e-07, "loss": 0.10738074779510498, "step": 2438 }, { "epoch": 5.147679324894515, "grad_norm": 3.5307252407073975, "learning_rate": 5.960457083537848e-07, "loss": 0.6862280368804932, "step": 2440 }, { "epoch": 5.151898734177215, "grad_norm": 5.234355449676514, "learning_rate": 5.941497339429337e-07, "loss": 0.790778636932373, "step": 2442 }, { "epoch": 5.156118143459915, "grad_norm": 9.668578147888184, "learning_rate": 5.922624494449232e-07, "loss": 0.44245994091033936, "step": 2444 }, { "epoch": 5.160337552742616, "grad_norm": 2.4678266048431396, "learning_rate": 5.903838650726219e-07, "loss": 0.9481706023216248, "step": 2446 }, { "epoch": 5.1645569620253164, "grad_norm": 6.675557613372803, "learning_rate": 5.885139909918178e-07, "loss": 0.5106003284454346, "step": 2448 }, { "epoch": 5.168776371308017, "grad_norm": 2.8948278427124023, "learning_rate": 5.866528373211652e-07, "loss": 0.818520188331604, "step": 2450 }, { "epoch": 5.172995780590718, "grad_norm": 0.031267765909433365, "learning_rate": 5.848004141321279e-07, "loss": 0.4252956509590149, "step": 2452 }, { "epoch": 5.177215189873418, "grad_norm": 5.288304805755615, "learning_rate": 5.82956731448926e-07, "loss": 0.17302009463310242, "step": 2454 }, { "epoch": 5.181434599156118, "grad_norm": 2.205019950866699, "learning_rate": 5.811217992484801e-07, "loss": 0.44998836517333984, "step": 2456 }, { "epoch": 5.185654008438819, "grad_norm": 2.3904027938842773, "learning_rate": 5.792956274603598e-07, "loss": 0.5072075128555298, "step": 2458 }, { "epoch": 5.189873417721519, "grad_norm": 10.959815979003906, "learning_rate": 5.774782259667278e-07, "loss": 0.5302789807319641, "step": 2460 }, { "epoch": 5.194092827004219, "grad_norm": 3.0402653217315674, "learning_rate": 5.756696046022868e-07, "loss": 0.8277729749679565, "step": 2462 }, { "epoch": 5.198312236286919, "grad_norm": 1.877632737159729, "learning_rate": 5.738697731542275e-07, "loss": 0.8515483736991882, "step": 2464 }, { "epoch": 5.2025316455696204, "grad_norm": 3.8875975608825684, "learning_rate": 5.720787413621739e-07, "loss": 0.3267098069190979, "step": 2466 }, { "epoch": 5.206751054852321, "grad_norm": 2.2627320289611816, "learning_rate": 5.702965189181324e-07, "loss": 0.786805272102356, "step": 2468 }, { "epoch": 5.210970464135021, "grad_norm": 6.892368793487549, "learning_rate": 5.685231154664372e-07, "loss": 0.6648309826850891, "step": 2470 }, { "epoch": 5.215189873417722, "grad_norm": 3.592425584793091, "learning_rate": 5.667585406036999e-07, "loss": 0.6738979816436768, "step": 2472 }, { "epoch": 5.219409282700422, "grad_norm": 4.459148406982422, "learning_rate": 5.650028038787577e-07, "loss": 0.7590001821517944, "step": 2474 }, { "epoch": 5.223628691983122, "grad_norm": 2.538756847381592, "learning_rate": 5.632559147926202e-07, "loss": 0.42987027764320374, "step": 2476 }, { "epoch": 5.227848101265823, "grad_norm": 2.191638946533203, "learning_rate": 5.615178827984186e-07, "loss": 0.0880412608385086, "step": 2478 }, { "epoch": 5.232067510548523, "grad_norm": 0.6364640593528748, "learning_rate": 5.597887173013555e-07, "loss": 0.48929768800735474, "step": 2480 }, { "epoch": 5.236286919831223, "grad_norm": 3.1823930740356445, "learning_rate": 5.580684276586535e-07, "loss": 0.7606073617935181, "step": 2482 }, { "epoch": 5.2405063291139244, "grad_norm": 3.0193521976470947, "learning_rate": 5.563570231795027e-07, "loss": 0.4337414503097534, "step": 2484 }, { "epoch": 5.244725738396625, "grad_norm": 5.9851298332214355, "learning_rate": 5.546545131250133e-07, "loss": 1.1480921506881714, "step": 2486 }, { "epoch": 5.248945147679325, "grad_norm": 2.274847984313965, "learning_rate": 5.52960906708164e-07, "loss": 0.8605833053588867, "step": 2488 }, { "epoch": 5.253164556962025, "grad_norm": 4.390803813934326, "learning_rate": 5.512762130937521e-07, "loss": 0.891315221786499, "step": 2490 }, { "epoch": 5.257383966244726, "grad_norm": 3.777196168899536, "learning_rate": 5.496004413983437e-07, "loss": 0.9285299777984619, "step": 2492 }, { "epoch": 5.261603375527426, "grad_norm": 2.7875325679779053, "learning_rate": 5.479336006902255e-07, "loss": 0.6960370540618896, "step": 2494 }, { "epoch": 5.265822784810126, "grad_norm": 5.017436981201172, "learning_rate": 5.462756999893543e-07, "loss": 0.42756134271621704, "step": 2496 }, { "epoch": 5.270042194092827, "grad_norm": 4.370994567871094, "learning_rate": 5.446267482673096e-07, "loss": 0.9004020690917969, "step": 2498 }, { "epoch": 5.274261603375527, "grad_norm": 8.342371940612793, "learning_rate": 5.429867544472434e-07, "loss": 0.49218082427978516, "step": 2500 }, { "epoch": 5.2784810126582276, "grad_norm": 4.241844654083252, "learning_rate": 5.413557274038332e-07, "loss": 0.6770671606063843, "step": 2502 }, { "epoch": 5.282700421940929, "grad_norm": 2.6205804347991943, "learning_rate": 5.397336759632338e-07, "loss": 0.660459041595459, "step": 2504 }, { "epoch": 5.286919831223629, "grad_norm": 8.33484935760498, "learning_rate": 5.381206089030293e-07, "loss": 0.9731260538101196, "step": 2506 }, { "epoch": 5.291139240506329, "grad_norm": 2.1884765625, "learning_rate": 5.365165349521859e-07, "loss": 0.9394969940185547, "step": 2508 }, { "epoch": 5.29535864978903, "grad_norm": 13.115405082702637, "learning_rate": 5.349214627910034e-07, "loss": 0.3471090793609619, "step": 2510 }, { "epoch": 5.29957805907173, "grad_norm": 4.7298970222473145, "learning_rate": 5.333354010510703e-07, "loss": 0.49661415815353394, "step": 2512 }, { "epoch": 5.30379746835443, "grad_norm": 8.904556274414062, "learning_rate": 5.31758358315216e-07, "loss": 0.9580909609794617, "step": 2514 }, { "epoch": 5.308016877637131, "grad_norm": 4.521732807159424, "learning_rate": 5.301903431174628e-07, "loss": 0.6797637939453125, "step": 2516 }, { "epoch": 5.312236286919831, "grad_norm": 3.3015851974487305, "learning_rate": 5.286313639429837e-07, "loss": 0.8633707761764526, "step": 2518 }, { "epoch": 5.3164556962025316, "grad_norm": 6.265556812286377, "learning_rate": 5.270814292280526e-07, "loss": 0.9207254648208618, "step": 2520 }, { "epoch": 5.320675105485232, "grad_norm": 2.132657051086426, "learning_rate": 5.255405473600001e-07, "loss": 0.8656923174858093, "step": 2522 }, { "epoch": 5.324894514767933, "grad_norm": 3.075263738632202, "learning_rate": 5.240087266771686e-07, "loss": 0.8665053844451904, "step": 2524 }, { "epoch": 5.329113924050633, "grad_norm": 25.491024017333984, "learning_rate": 5.22485975468867e-07, "loss": 0.9272741675376892, "step": 2526 }, { "epoch": 5.333333333333333, "grad_norm": 5.792654514312744, "learning_rate": 5.209723019753245e-07, "loss": 0.6649227142333984, "step": 2528 }, { "epoch": 5.337552742616034, "grad_norm": 3.785661220550537, "learning_rate": 5.19467714387648e-07, "loss": 0.7637553215026855, "step": 2530 }, { "epoch": 5.341772151898734, "grad_norm": 3.9980387687683105, "learning_rate": 5.179722208477764e-07, "loss": 0.8297359347343445, "step": 2532 }, { "epoch": 5.345991561181434, "grad_norm": 3.7611520290374756, "learning_rate": 5.164858294484372e-07, "loss": 0.5959780216217041, "step": 2534 }, { "epoch": 5.350210970464135, "grad_norm": 3.5796737670898438, "learning_rate": 5.150085482331025e-07, "loss": 0.8286501169204712, "step": 2536 }, { "epoch": 5.3544303797468356, "grad_norm": 3.5265612602233887, "learning_rate": 5.135403851959455e-07, "loss": 0.7233340740203857, "step": 2538 }, { "epoch": 5.358649789029536, "grad_norm": 3.3139536380767822, "learning_rate": 5.120813482817971e-07, "loss": 0.5095676183700562, "step": 2540 }, { "epoch": 5.362869198312236, "grad_norm": 4.816203594207764, "learning_rate": 5.106314453861031e-07, "loss": 0.10940656065940857, "step": 2542 }, { "epoch": 5.367088607594937, "grad_norm": 0.789928138256073, "learning_rate": 5.091906843548809e-07, "loss": 0.4012370705604553, "step": 2544 }, { "epoch": 5.371308016877637, "grad_norm": 6.561746120452881, "learning_rate": 5.077590729846782e-07, "loss": 0.6537183523178101, "step": 2546 }, { "epoch": 5.375527426160337, "grad_norm": 2.8327221870422363, "learning_rate": 5.063366190225298e-07, "loss": 0.8231172561645508, "step": 2548 }, { "epoch": 5.379746835443038, "grad_norm": 8.452791213989258, "learning_rate": 5.049233301659161e-07, "loss": 0.5680804252624512, "step": 2550 }, { "epoch": 5.383966244725738, "grad_norm": 3.7673988342285156, "learning_rate": 5.035192140627213e-07, "loss": 0.1833023726940155, "step": 2552 }, { "epoch": 5.3881856540084385, "grad_norm": 2.091782569885254, "learning_rate": 5.021242783111924e-07, "loss": 0.7948375344276428, "step": 2554 }, { "epoch": 5.3924050632911396, "grad_norm": 14.723713874816895, "learning_rate": 5.007385304598978e-07, "loss": 0.6941039562225342, "step": 2556 }, { "epoch": 5.39662447257384, "grad_norm": 14.07388973236084, "learning_rate": 4.993619780076855e-07, "loss": 0.43440479040145874, "step": 2558 }, { "epoch": 5.40084388185654, "grad_norm": 7.218398094177246, "learning_rate": 4.979946284036441e-07, "loss": 0.21915487945079803, "step": 2560 }, { "epoch": 5.405063291139241, "grad_norm": 4.034780979156494, "learning_rate": 4.966364890470618e-07, "loss": 0.547726571559906, "step": 2562 }, { "epoch": 5.409282700421941, "grad_norm": 2.1914002895355225, "learning_rate": 4.952875672873867e-07, "loss": 0.9137965440750122, "step": 2564 }, { "epoch": 5.413502109704641, "grad_norm": 2.9248530864715576, "learning_rate": 4.939478704241859e-07, "loss": 0.4639781713485718, "step": 2566 }, { "epoch": 5.417721518987342, "grad_norm": 3.954902172088623, "learning_rate": 4.926174057071077e-07, "loss": 0.7315584421157837, "step": 2568 }, { "epoch": 5.421940928270042, "grad_norm": 3.5057809352874756, "learning_rate": 4.912961803358409e-07, "loss": 0.17236268520355225, "step": 2570 }, { "epoch": 5.4261603375527425, "grad_norm": 7.570180416107178, "learning_rate": 4.899842014600768e-07, "loss": 0.542130708694458, "step": 2572 }, { "epoch": 5.430379746835443, "grad_norm": 0.8830806612968445, "learning_rate": 4.886814761794694e-07, "loss": 0.08617094159126282, "step": 2574 }, { "epoch": 5.434599156118144, "grad_norm": 2.9515278339385986, "learning_rate": 4.873880115435982e-07, "loss": 0.6731958389282227, "step": 2576 }, { "epoch": 5.438818565400844, "grad_norm": 14.474751472473145, "learning_rate": 4.861038145519302e-07, "loss": 0.8146198987960815, "step": 2578 }, { "epoch": 5.443037974683544, "grad_norm": 4.252223968505859, "learning_rate": 4.848288921537804e-07, "loss": 0.7910962104797363, "step": 2580 }, { "epoch": 5.447257383966245, "grad_norm": 3.409487009048462, "learning_rate": 4.835632512482754e-07, "loss": 0.4601414203643799, "step": 2582 }, { "epoch": 5.451476793248945, "grad_norm": 2.4231197834014893, "learning_rate": 4.823068986843162e-07, "loss": 0.5326846837997437, "step": 2584 }, { "epoch": 5.455696202531645, "grad_norm": 11.150148391723633, "learning_rate": 4.810598412605407e-07, "loss": 0.6682008504867554, "step": 2586 }, { "epoch": 5.459915611814346, "grad_norm": 1.4135065078735352, "learning_rate": 4.798220857252866e-07, "loss": 0.30620691180229187, "step": 2588 }, { "epoch": 5.4641350210970465, "grad_norm": 6.7066755294799805, "learning_rate": 4.785936387765555e-07, "loss": 0.7434167861938477, "step": 2590 }, { "epoch": 5.468354430379747, "grad_norm": 1.7110621929168701, "learning_rate": 4.773745070619767e-07, "loss": 0.5532716512680054, "step": 2592 }, { "epoch": 5.472573839662447, "grad_norm": 3.8936519622802734, "learning_rate": 4.761646971787707e-07, "loss": 0.47537532448768616, "step": 2594 }, { "epoch": 5.476793248945148, "grad_norm": 4.289298057556152, "learning_rate": 4.749642156737138e-07, "loss": 0.34944185614585876, "step": 2596 }, { "epoch": 5.481012658227848, "grad_norm": 2.747558116912842, "learning_rate": 4.7377306904310233e-07, "loss": 0.16377092897891998, "step": 2598 }, { "epoch": 5.485232067510548, "grad_norm": 0.3106602132320404, "learning_rate": 4.7259126373271865e-07, "loss": 0.42584800720214844, "step": 2600 }, { "epoch": 5.489451476793249, "grad_norm": 5.100452899932861, "learning_rate": 4.714188061377942e-07, "loss": 0.8994771242141724, "step": 2602 }, { "epoch": 5.493670886075949, "grad_norm": 7.287261962890625, "learning_rate": 4.7025570260297703e-07, "loss": 0.8067635297775269, "step": 2604 }, { "epoch": 5.4978902953586495, "grad_norm": 15.138601303100586, "learning_rate": 4.6910195942229627e-07, "loss": 0.13593333959579468, "step": 2606 }, { "epoch": 5.5021097046413505, "grad_norm": 2.714247226715088, "learning_rate": 4.6795758283912836e-07, "loss": 0.3896440267562866, "step": 2608 }, { "epoch": 5.506329113924051, "grad_norm": 3.3304672241210938, "learning_rate": 4.668225790461631e-07, "loss": 0.0639631599187851, "step": 2610 }, { "epoch": 5.510548523206751, "grad_norm": 19.265941619873047, "learning_rate": 4.6569695418537063e-07, "loss": 0.2734604477882385, "step": 2612 }, { "epoch": 5.514767932489452, "grad_norm": 2.5918619632720947, "learning_rate": 4.645807143479674e-07, "loss": 0.8366518616676331, "step": 2614 }, { "epoch": 5.518987341772152, "grad_norm": 1.1537539958953857, "learning_rate": 4.634738655743843e-07, "loss": 0.4462703466415405, "step": 2616 }, { "epoch": 5.523206751054852, "grad_norm": 2.6978986263275146, "learning_rate": 4.6237641385423225e-07, "loss": 0.4549875259399414, "step": 2618 }, { "epoch": 5.527426160337553, "grad_norm": 2.130697727203369, "learning_rate": 4.6128836512627204e-07, "loss": 0.8581835627555847, "step": 2620 }, { "epoch": 5.531645569620253, "grad_norm": 4.347284317016602, "learning_rate": 4.602097252783805e-07, "loss": 0.5586264133453369, "step": 2622 }, { "epoch": 5.5358649789029535, "grad_norm": 14.522599220275879, "learning_rate": 4.591405001475189e-07, "loss": 0.8266869783401489, "step": 2624 }, { "epoch": 5.540084388185654, "grad_norm": 7.911047458648682, "learning_rate": 4.58080695519702e-07, "loss": 0.44375789165496826, "step": 2626 }, { "epoch": 5.544303797468355, "grad_norm": 4.837867736816406, "learning_rate": 4.570303171299666e-07, "loss": 0.6062820553779602, "step": 2628 }, { "epoch": 5.548523206751055, "grad_norm": 5.2242021560668945, "learning_rate": 4.5598937066233973e-07, "loss": 0.7080090641975403, "step": 2630 }, { "epoch": 5.552742616033755, "grad_norm": 4.157374858856201, "learning_rate": 4.5495786174980867e-07, "loss": 0.45279741287231445, "step": 2632 }, { "epoch": 5.556962025316456, "grad_norm": 3.1067519187927246, "learning_rate": 4.539357959742899e-07, "loss": 0.4694240689277649, "step": 2634 }, { "epoch": 5.561181434599156, "grad_norm": 2.8363306522369385, "learning_rate": 4.5292317886659993e-07, "loss": 0.37042319774627686, "step": 2636 }, { "epoch": 5.565400843881856, "grad_norm": 5.392505168914795, "learning_rate": 4.51920015906424e-07, "loss": 0.4348013401031494, "step": 2638 }, { "epoch": 5.569620253164557, "grad_norm": 1.3840664625167847, "learning_rate": 4.5092631252228734e-07, "loss": 0.2230294644832611, "step": 2640 }, { "epoch": 5.5738396624472575, "grad_norm": 2.5080552101135254, "learning_rate": 4.4994207409152575e-07, "loss": 0.8967776298522949, "step": 2642 }, { "epoch": 5.578059071729958, "grad_norm": 3.2199008464813232, "learning_rate": 4.48967305940256e-07, "loss": 0.9706035852432251, "step": 2644 }, { "epoch": 5.582278481012658, "grad_norm": 11.19129753112793, "learning_rate": 4.480020133433474e-07, "loss": 0.626300573348999, "step": 2646 }, { "epoch": 5.586497890295359, "grad_norm": 14.880667686462402, "learning_rate": 4.47046201524393e-07, "loss": 0.06408479064702988, "step": 2648 }, { "epoch": 5.590717299578059, "grad_norm": 2.3462014198303223, "learning_rate": 4.460998756556818e-07, "loss": 0.44877690076828003, "step": 2650 }, { "epoch": 5.594936708860759, "grad_norm": 3.08370041847229, "learning_rate": 4.451630408581701e-07, "loss": 0.3830834925174713, "step": 2652 }, { "epoch": 5.59915611814346, "grad_norm": 7.73508358001709, "learning_rate": 4.442357022014546e-07, "loss": 0.15033870935440063, "step": 2654 }, { "epoch": 5.60337552742616, "grad_norm": 4.020411014556885, "learning_rate": 4.43317864703744e-07, "loss": 0.5552294850349426, "step": 2656 }, { "epoch": 5.6075949367088604, "grad_norm": 12.524031639099121, "learning_rate": 4.4240953333183257e-07, "loss": 0.1009381040930748, "step": 2658 }, { "epoch": 5.6118143459915615, "grad_norm": 3.7477056980133057, "learning_rate": 4.4151071300107296e-07, "loss": 0.4878613352775574, "step": 2660 }, { "epoch": 5.616033755274262, "grad_norm": 25.352882385253906, "learning_rate": 4.406214085753499e-07, "loss": 0.0786014273762703, "step": 2662 }, { "epoch": 5.620253164556962, "grad_norm": 5.754502773284912, "learning_rate": 4.3974162486705327e-07, "loss": 0.424061119556427, "step": 2664 }, { "epoch": 5.624472573839663, "grad_norm": 4.437866687774658, "learning_rate": 4.38871366637053e-07, "loss": 0.07941263914108276, "step": 2666 }, { "epoch": 5.628691983122363, "grad_norm": 3.537459373474121, "learning_rate": 4.380106385946721e-07, "loss": 0.30082571506500244, "step": 2668 }, { "epoch": 5.632911392405063, "grad_norm": 2.312814474105835, "learning_rate": 4.3715944539766257e-07, "loss": 0.71795254945755, "step": 2670 }, { "epoch": 5.637130801687764, "grad_norm": 5.115408897399902, "learning_rate": 4.3631779165217875e-07, "loss": 0.811305820941925, "step": 2672 }, { "epoch": 5.641350210970464, "grad_norm": 8.744047164916992, "learning_rate": 4.354856819127537e-07, "loss": 0.6766564249992371, "step": 2674 }, { "epoch": 5.6455696202531644, "grad_norm": 2.2004096508026123, "learning_rate": 4.346631206822732e-07, "loss": 0.8192415237426758, "step": 2676 }, { "epoch": 5.649789029535865, "grad_norm": 1.8391209840774536, "learning_rate": 4.338501124119533e-07, "loss": 0.5205031037330627, "step": 2678 }, { "epoch": 5.654008438818566, "grad_norm": 3.9403841495513916, "learning_rate": 4.330466615013138e-07, "loss": 0.2361564040184021, "step": 2680 }, { "epoch": 5.658227848101266, "grad_norm": 4.0212554931640625, "learning_rate": 4.3225277229815673e-07, "loss": 0.45385825634002686, "step": 2682 }, { "epoch": 5.662447257383966, "grad_norm": 3.5017166137695312, "learning_rate": 4.314684490985411e-07, "loss": 0.2712249159812927, "step": 2684 }, { "epoch": 5.666666666666667, "grad_norm": 2.6000726222991943, "learning_rate": 4.3069369614676086e-07, "loss": 0.9603966474533081, "step": 2686 }, { "epoch": 5.670886075949367, "grad_norm": 2.9337501525878906, "learning_rate": 4.2992851763532125e-07, "loss": 0.5593338012695312, "step": 2688 }, { "epoch": 5.675105485232067, "grad_norm": 3.656930923461914, "learning_rate": 4.291729177049159e-07, "loss": 1.0005125999450684, "step": 2690 }, { "epoch": 5.679324894514768, "grad_norm": 12.878107070922852, "learning_rate": 4.28426900444406e-07, "loss": 0.04988168552517891, "step": 2692 }, { "epoch": 5.6835443037974684, "grad_norm": 2.371689558029175, "learning_rate": 4.2769046989079543e-07, "loss": 0.8081762790679932, "step": 2694 }, { "epoch": 5.687763713080169, "grad_norm": 5.237072944641113, "learning_rate": 4.2696363002921135e-07, "loss": 0.4558332860469818, "step": 2696 }, { "epoch": 5.691983122362869, "grad_norm": 2.5988988876342773, "learning_rate": 4.262463847928818e-07, "loss": 0.8788666129112244, "step": 2698 }, { "epoch": 5.69620253164557, "grad_norm": 3.3628621101379395, "learning_rate": 4.2553873806311424e-07, "loss": 0.8370002508163452, "step": 2700 }, { "epoch": 5.70042194092827, "grad_norm": 3.688671588897705, "learning_rate": 4.248406936692747e-07, "loss": 0.6099220514297485, "step": 2702 }, { "epoch": 5.70464135021097, "grad_norm": 1.2157199382781982, "learning_rate": 4.2415225538876686e-07, "loss": 0.49759507179260254, "step": 2704 }, { "epoch": 5.708860759493671, "grad_norm": 0.465036541223526, "learning_rate": 4.2347342694701206e-07, "loss": 0.40582969784736633, "step": 2706 }, { "epoch": 5.713080168776371, "grad_norm": 13.82797622680664, "learning_rate": 4.2280421201742874e-07, "loss": 0.11880761384963989, "step": 2708 }, { "epoch": 5.717299578059071, "grad_norm": 1.5762630701065063, "learning_rate": 4.221446142214125e-07, "loss": 0.620478630065918, "step": 2710 }, { "epoch": 5.7215189873417724, "grad_norm": 4.519263744354248, "learning_rate": 4.214946371283172e-07, "loss": 0.8996577262878418, "step": 2712 }, { "epoch": 5.725738396624473, "grad_norm": 8.791622161865234, "learning_rate": 4.2085428425543474e-07, "loss": 0.6637638807296753, "step": 2714 }, { "epoch": 5.729957805907173, "grad_norm": 3.510023832321167, "learning_rate": 4.202235590679763e-07, "loss": 0.77869713306427, "step": 2716 }, { "epoch": 5.734177215189874, "grad_norm": 5.473074913024902, "learning_rate": 4.1960246497905417e-07, "loss": 0.8682685494422913, "step": 2718 }, { "epoch": 5.738396624472574, "grad_norm": 2.2831952571868896, "learning_rate": 4.1899100534966263e-07, "loss": 0.8572003841400146, "step": 2720 }, { "epoch": 5.742616033755274, "grad_norm": 4.826292037963867, "learning_rate": 4.183891834886598e-07, "loss": 0.834069013595581, "step": 2722 }, { "epoch": 5.746835443037975, "grad_norm": 32.02092742919922, "learning_rate": 4.177970026527499e-07, "loss": 0.22675754129886627, "step": 2724 }, { "epoch": 5.751054852320675, "grad_norm": 2.374525308609009, "learning_rate": 4.1721446604646607e-07, "loss": 0.6690686345100403, "step": 2726 }, { "epoch": 5.755274261603375, "grad_norm": 2.256140947341919, "learning_rate": 4.1664157682215173e-07, "loss": 0.7398881316184998, "step": 2728 }, { "epoch": 5.759493670886076, "grad_norm": 4.3521504402160645, "learning_rate": 4.1607833807994547e-07, "loss": 0.8732868432998657, "step": 2730 }, { "epoch": 5.763713080168777, "grad_norm": 6.75162410736084, "learning_rate": 4.155247528677621e-07, "loss": 0.7909585237503052, "step": 2732 }, { "epoch": 5.767932489451477, "grad_norm": 31.269031524658203, "learning_rate": 4.1498082418127807e-07, "loss": 0.2190740704536438, "step": 2734 }, { "epoch": 5.772151898734177, "grad_norm": 7.679251194000244, "learning_rate": 4.1444655496391376e-07, "loss": 0.46999984979629517, "step": 2736 }, { "epoch": 5.776371308016878, "grad_norm": 2.543074607849121, "learning_rate": 4.139219481068185e-07, "loss": 0.884986162185669, "step": 2738 }, { "epoch": 5.780590717299578, "grad_norm": 2.4317591190338135, "learning_rate": 4.13407006448855e-07, "loss": 0.5444875955581665, "step": 2740 }, { "epoch": 5.784810126582278, "grad_norm": 2.9350624084472656, "learning_rate": 4.1290173277658303e-07, "loss": 0.8912389278411865, "step": 2742 }, { "epoch": 5.789029535864979, "grad_norm": 7.446691513061523, "learning_rate": 4.124061298242451e-07, "loss": 0.5339520573616028, "step": 2744 }, { "epoch": 5.793248945147679, "grad_norm": 5.2088704109191895, "learning_rate": 4.119202002737515e-07, "loss": 0.45539939403533936, "step": 2746 }, { "epoch": 5.7974683544303796, "grad_norm": 3.678557872772217, "learning_rate": 4.1144394675466634e-07, "loss": 0.8749001026153564, "step": 2748 }, { "epoch": 5.80168776371308, "grad_norm": 10.216012954711914, "learning_rate": 4.109773718441916e-07, "loss": 0.7841247320175171, "step": 2750 }, { "epoch": 5.805907172995781, "grad_norm": 2.440023422241211, "learning_rate": 4.105204780671556e-07, "loss": 0.8511307239532471, "step": 2752 }, { "epoch": 5.810126582278481, "grad_norm": 7.605076789855957, "learning_rate": 4.100732678959971e-07, "loss": 1.0421419143676758, "step": 2754 }, { "epoch": 5.814345991561181, "grad_norm": 4.731003284454346, "learning_rate": 4.0963574375075354e-07, "loss": 0.4821122884750366, "step": 2756 }, { "epoch": 5.818565400843882, "grad_norm": 3.862736463546753, "learning_rate": 4.092079079990471e-07, "loss": 0.05994529277086258, "step": 2758 }, { "epoch": 5.822784810126582, "grad_norm": 2.8706905841827393, "learning_rate": 4.087897629560719e-07, "loss": 0.6597020626068115, "step": 2760 }, { "epoch": 5.827004219409282, "grad_norm": 10.528990745544434, "learning_rate": 4.0838131088458207e-07, "loss": 0.5567920804023743, "step": 2762 }, { "epoch": 5.831223628691983, "grad_norm": 12.403848648071289, "learning_rate": 4.079825539948785e-07, "loss": 0.22084438800811768, "step": 2764 }, { "epoch": 5.8354430379746836, "grad_norm": 3.479530096054077, "learning_rate": 4.0759349444479853e-07, "loss": 0.8606102466583252, "step": 2766 }, { "epoch": 5.839662447257384, "grad_norm": 2.724365711212158, "learning_rate": 4.072141343397021e-07, "loss": 0.45490285754203796, "step": 2768 }, { "epoch": 5.843881856540085, "grad_norm": 6.362490653991699, "learning_rate": 4.068444757324621e-07, "loss": 0.8239868879318237, "step": 2770 }, { "epoch": 5.848101265822785, "grad_norm": 10.339574813842773, "learning_rate": 4.064845206234523e-07, "loss": 0.5215486884117126, "step": 2772 }, { "epoch": 5.852320675105485, "grad_norm": 186.8642578125, "learning_rate": 4.061342709605374e-07, "loss": 0.5665589570999146, "step": 2774 }, { "epoch": 5.856540084388186, "grad_norm": 2.5368990898132324, "learning_rate": 4.057937286390615e-07, "loss": 0.7514277100563049, "step": 2776 }, { "epoch": 5.860759493670886, "grad_norm": 7.951842784881592, "learning_rate": 4.0546289550183833e-07, "loss": 0.8747674822807312, "step": 2778 }, { "epoch": 5.864978902953586, "grad_norm": 4.173673152923584, "learning_rate": 4.0514177333914147e-07, "loss": 0.8620109558105469, "step": 2780 }, { "epoch": 5.869198312236287, "grad_norm": 2.6262011528015137, "learning_rate": 4.0483036388869426e-07, "loss": 0.8278003931045532, "step": 2782 }, { "epoch": 5.8734177215189876, "grad_norm": 3.4531075954437256, "learning_rate": 4.045286688356607e-07, "loss": 0.8439078330993652, "step": 2784 }, { "epoch": 5.877637130801688, "grad_norm": 17.26287269592285, "learning_rate": 4.0423668981263635e-07, "loss": 0.2546153664588928, "step": 2786 }, { "epoch": 5.881856540084388, "grad_norm": 2.9670450687408447, "learning_rate": 4.039544283996389e-07, "loss": 0.803874135017395, "step": 2788 }, { "epoch": 5.886075949367089, "grad_norm": 1.728909969329834, "learning_rate": 4.036818861241004e-07, "loss": 0.11378484964370728, "step": 2790 }, { "epoch": 5.890295358649789, "grad_norm": 15.379825592041016, "learning_rate": 4.0341906446085865e-07, "loss": 0.40370649099349976, "step": 2792 }, { "epoch": 5.894514767932489, "grad_norm": 14.338972091674805, "learning_rate": 4.0316596483214915e-07, "loss": 0.7983355522155762, "step": 2794 }, { "epoch": 5.89873417721519, "grad_norm": 3.563936710357666, "learning_rate": 4.0292258860759767e-07, "loss": 0.9050275087356567, "step": 2796 }, { "epoch": 5.90295358649789, "grad_norm": 2.441664934158325, "learning_rate": 4.026889371042125e-07, "loss": 0.4420316219329834, "step": 2798 }, { "epoch": 5.9071729957805905, "grad_norm": 2.3287241458892822, "learning_rate": 4.024650115863774e-07, "loss": 0.7599180936813354, "step": 2800 }, { "epoch": 5.911392405063291, "grad_norm": 3.5945613384246826, "learning_rate": 4.022508132658452e-07, "loss": 0.6878820657730103, "step": 2802 }, { "epoch": 5.915611814345992, "grad_norm": 12.153562545776367, "learning_rate": 4.020463433017305e-07, "loss": 0.40130820870399475, "step": 2804 }, { "epoch": 5.919831223628692, "grad_norm": 3.069974899291992, "learning_rate": 4.0185160280050384e-07, "loss": 0.095822274684906, "step": 2806 }, { "epoch": 5.924050632911392, "grad_norm": 8.717458724975586, "learning_rate": 4.01666592815986e-07, "loss": 0.9885622262954712, "step": 2808 }, { "epoch": 5.928270042194093, "grad_norm": 7.206968307495117, "learning_rate": 4.014913143493415e-07, "loss": 0.04864209145307541, "step": 2810 }, { "epoch": 5.932489451476793, "grad_norm": 3.1413886547088623, "learning_rate": 4.0132576834907404e-07, "loss": 0.43854427337646484, "step": 2812 }, { "epoch": 5.936708860759493, "grad_norm": 0.46113014221191406, "learning_rate": 4.0116995571102056e-07, "loss": 0.4027542471885681, "step": 2814 }, { "epoch": 5.940928270042194, "grad_norm": 3.120668888092041, "learning_rate": 4.0102387727834705e-07, "loss": 0.6854231357574463, "step": 2816 }, { "epoch": 5.9451476793248945, "grad_norm": 2.3229949474334717, "learning_rate": 4.008875338415438e-07, "loss": 0.5028409361839294, "step": 2818 }, { "epoch": 5.949367088607595, "grad_norm": 0.521416187286377, "learning_rate": 4.007609261384207e-07, "loss": 0.43289196491241455, "step": 2820 }, { "epoch": 5.953586497890296, "grad_norm": 2.9964866638183594, "learning_rate": 4.006440548541041e-07, "loss": 0.9015544652938843, "step": 2822 }, { "epoch": 5.957805907172996, "grad_norm": 1.731990933418274, "learning_rate": 4.005369206210321e-07, "loss": 0.43057486414909363, "step": 2824 }, { "epoch": 5.962025316455696, "grad_norm": 2.3747055530548096, "learning_rate": 4.0043952401895207e-07, "loss": 0.8347324132919312, "step": 2826 }, { "epoch": 5.966244725738397, "grad_norm": 1.1531779766082764, "learning_rate": 4.0035186557491683e-07, "loss": 0.44332531094551086, "step": 2828 }, { "epoch": 5.970464135021097, "grad_norm": 2.191092014312744, "learning_rate": 4.0027394576328213e-07, "loss": 0.39579838514328003, "step": 2830 }, { "epoch": 5.974683544303797, "grad_norm": 7.761366367340088, "learning_rate": 4.0020576500570355e-07, "loss": 1.0412178039550781, "step": 2832 }, { "epoch": 5.978902953586498, "grad_norm": 0.7852330207824707, "learning_rate": 4.0014732367113567e-07, "loss": 0.36100465059280396, "step": 2834 }, { "epoch": 5.9831223628691985, "grad_norm": 21.701784133911133, "learning_rate": 4.000986220758279e-07, "loss": 0.07913509011268616, "step": 2836 }, { "epoch": 5.987341772151899, "grad_norm": 5.154250621795654, "learning_rate": 4.0005966048332503e-07, "loss": 0.5702348351478577, "step": 2838 }, { "epoch": 5.991561181434599, "grad_norm": 1.1740047931671143, "learning_rate": 4.0003043910446375e-07, "loss": 0.47653162479400635, "step": 2840 }, { "epoch": 5.9957805907173, "grad_norm": 6.092247009277344, "learning_rate": 4.000109580973733e-07, "loss": 0.811444878578186, "step": 2842 }, { "epoch": 6.0, "grad_norm": 3.5838730335235596, "learning_rate": 4.0000121756747285e-07, "loss": 0.7996691465377808, "step": 2844 }, { "epoch": 6.0, "step": 2844, "total_flos": 5.392281114922451e+18, "train_loss": 0.8344338661696338, "train_runtime": 6866.9503, "train_samples_per_second": 12.425, "train_steps_per_second": 0.414 } ], "logging_steps": 2, "max_steps": 2844, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.392281114922451e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }