{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1896, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004219409282700422, "grad_norm": 7.742424964904785, "learning_rate": 1.263157894736842e-08, "loss": 2.195801019668579, "step": 2 }, { "epoch": 0.008438818565400843, "grad_norm": 9.064373016357422, "learning_rate": 3.7894736842105265e-08, "loss": 1.77604079246521, "step": 4 }, { "epoch": 0.012658227848101266, "grad_norm": 1.5363776683807373, "learning_rate": 6.31578947368421e-08, "loss": 1.9369428157806396, "step": 6 }, { "epoch": 0.016877637130801686, "grad_norm": 3.058716297149658, "learning_rate": 8.842105263157893e-08, "loss": 1.949758768081665, "step": 8 }, { "epoch": 0.02109704641350211, "grad_norm": 2.559150457382202, "learning_rate": 1.1368421052631579e-07, "loss": 1.855597972869873, "step": 10 }, { "epoch": 0.02531645569620253, "grad_norm": 1.0957772731781006, "learning_rate": 1.3894736842105263e-07, "loss": 1.3166148662567139, "step": 12 }, { "epoch": 0.029535864978902954, "grad_norm": 2.902003049850464, "learning_rate": 1.642105263157895e-07, "loss": 1.662704348564148, "step": 14 }, { "epoch": 0.03375527426160337, "grad_norm": 7.220351219177246, "learning_rate": 1.894736842105263e-07, "loss": 2.225470542907715, "step": 16 }, { "epoch": 0.0379746835443038, "grad_norm": 1.3415447473526, "learning_rate": 2.1473684210526315e-07, "loss": 1.8447153568267822, "step": 18 }, { "epoch": 0.04219409282700422, "grad_norm": 1.5747126340866089, "learning_rate": 2.4e-07, "loss": 1.9135501384735107, "step": 20 }, { "epoch": 0.046413502109704644, "grad_norm": 1.7888033390045166, "learning_rate": 2.6526315789473684e-07, "loss": 1.6221210956573486, "step": 22 }, { "epoch": 0.05063291139240506, "grad_norm": 2.052851915359497, "learning_rate": 2.905263157894737e-07, "loss": 1.8789974451065063, "step": 24 }, { "epoch": 0.05485232067510549, "grad_norm": 2.472539186477661, "learning_rate": 3.157894736842105e-07, "loss": 1.7755436897277832, "step": 26 }, { "epoch": 0.05907172995780591, "grad_norm": 2.0235300064086914, "learning_rate": 3.4105263157894735e-07, "loss": 1.9977495670318604, "step": 28 }, { "epoch": 0.06329113924050633, "grad_norm": 3.731635808944702, "learning_rate": 3.663157894736842e-07, "loss": 2.019644021987915, "step": 30 }, { "epoch": 0.06751054852320675, "grad_norm": 1.7156628370285034, "learning_rate": 3.9157894736842107e-07, "loss": 1.8407037258148193, "step": 32 }, { "epoch": 0.07172995780590717, "grad_norm": 7.599488735198975, "learning_rate": 4.168421052631579e-07, "loss": 2.1601576805114746, "step": 34 }, { "epoch": 0.0759493670886076, "grad_norm": 1.4482383728027344, "learning_rate": 4.4210526315789467e-07, "loss": 1.8958334922790527, "step": 36 }, { "epoch": 0.08016877637130802, "grad_norm": 3.731816530227661, "learning_rate": 4.6736842105263153e-07, "loss": 2.1196088790893555, "step": 38 }, { "epoch": 0.08438818565400844, "grad_norm": 1.4769682884216309, "learning_rate": 4.926315789473684e-07, "loss": 1.9134398698806763, "step": 40 }, { "epoch": 0.08860759493670886, "grad_norm": 13.363183975219727, "learning_rate": 5.178947368421052e-07, "loss": 1.8113877773284912, "step": 42 }, { "epoch": 0.09282700421940929, "grad_norm": 3.724055051803589, "learning_rate": 5.431578947368421e-07, "loss": 2.103419542312622, "step": 44 }, { "epoch": 0.0970464135021097, "grad_norm": 2.3887927532196045, "learning_rate": 5.684210526315788e-07, "loss": 1.8661903142929077, "step": 46 }, { "epoch": 0.10126582278481013, "grad_norm": 1.5525636672973633, "learning_rate": 5.936842105263157e-07, "loss": 1.8071659803390503, "step": 48 }, { "epoch": 0.10548523206751055, "grad_norm": 5.839144229888916, "learning_rate": 6.189473684210527e-07, "loss": 1.561785340309143, "step": 50 }, { "epoch": 0.10970464135021098, "grad_norm": 5.124746799468994, "learning_rate": 6.442105263157894e-07, "loss": 1.3494197130203247, "step": 52 }, { "epoch": 0.11392405063291139, "grad_norm": 5.008734703063965, "learning_rate": 6.694736842105263e-07, "loss": 1.3085637092590332, "step": 54 }, { "epoch": 0.11814345991561181, "grad_norm": 1.1680630445480347, "learning_rate": 6.947368421052631e-07, "loss": 1.710934042930603, "step": 56 }, { "epoch": 0.12236286919831224, "grad_norm": 11.505062103271484, "learning_rate": 7.2e-07, "loss": 1.460722804069519, "step": 58 }, { "epoch": 0.12658227848101267, "grad_norm": 7.362393856048584, "learning_rate": 7.452631578947368e-07, "loss": 1.740147590637207, "step": 60 }, { "epoch": 0.1308016877637131, "grad_norm": 1.551930546760559, "learning_rate": 7.705263157894736e-07, "loss": 1.7590422630310059, "step": 62 }, { "epoch": 0.1350210970464135, "grad_norm": 1.2569609880447388, "learning_rate": 7.957894736842105e-07, "loss": 1.2291865348815918, "step": 64 }, { "epoch": 0.13924050632911392, "grad_norm": 2.3231699466705322, "learning_rate": 8.210526315789473e-07, "loss": 1.040055513381958, "step": 66 }, { "epoch": 0.14345991561181434, "grad_norm": 1.0935379266738892, "learning_rate": 8.463157894736842e-07, "loss": 1.300035834312439, "step": 68 }, { "epoch": 0.14767932489451477, "grad_norm": 4.188493728637695, "learning_rate": 8.71578947368421e-07, "loss": 1.1873421669006348, "step": 70 }, { "epoch": 0.1518987341772152, "grad_norm": 1.0681216716766357, "learning_rate": 8.968421052631579e-07, "loss": 1.4782516956329346, "step": 72 }, { "epoch": 0.15611814345991562, "grad_norm": 2.1197876930236816, "learning_rate": 9.221052631578946e-07, "loss": 1.2450737953186035, "step": 74 }, { "epoch": 0.16033755274261605, "grad_norm": 4.197497844696045, "learning_rate": 9.473684210526316e-07, "loss": 1.0491926670074463, "step": 76 }, { "epoch": 0.16455696202531644, "grad_norm": 1.161306619644165, "learning_rate": 9.726315789473682e-07, "loss": 1.60398268699646, "step": 78 }, { "epoch": 0.16877637130801687, "grad_norm": 1.0192948579788208, "learning_rate": 9.978947368421053e-07, "loss": 1.5951966047286987, "step": 80 }, { "epoch": 0.1729957805907173, "grad_norm": 2.497844934463501, "learning_rate": 1.023157894736842e-06, "loss": 1.564704179763794, "step": 82 }, { "epoch": 0.17721518987341772, "grad_norm": 9.568504333496094, "learning_rate": 1.048421052631579e-06, "loss": 1.1448438167572021, "step": 84 }, { "epoch": 0.18143459915611815, "grad_norm": 1.6581389904022217, "learning_rate": 1.0736842105263157e-06, "loss": 1.2066229581832886, "step": 86 }, { "epoch": 0.18565400843881857, "grad_norm": 2.0455548763275146, "learning_rate": 1.0989473684210525e-06, "loss": 1.475752353668213, "step": 88 }, { "epoch": 0.189873417721519, "grad_norm": 1.7110133171081543, "learning_rate": 1.1242105263157894e-06, "loss": 0.7750993967056274, "step": 90 }, { "epoch": 0.1940928270042194, "grad_norm": 1.7819750308990479, "learning_rate": 1.1494736842105262e-06, "loss": 1.8029006719589233, "step": 92 }, { "epoch": 0.19831223628691982, "grad_norm": 2.48787784576416, "learning_rate": 1.174736842105263e-06, "loss": 1.0352967977523804, "step": 94 }, { "epoch": 0.20253164556962025, "grad_norm": 1.236476182937622, "learning_rate": 1.2e-06, "loss": 1.5603318214416504, "step": 96 }, { "epoch": 0.20675105485232068, "grad_norm": 1.040940761566162, "learning_rate": 1.1999967137875644e-06, "loss": 1.6248691082000732, "step": 98 }, { "epoch": 0.2109704641350211, "grad_norm": 2.421082019805908, "learning_rate": 1.199986855190255e-06, "loss": 1.3791676759719849, "step": 100 }, { "epoch": 0.21518987341772153, "grad_norm": 1.4071706533432007, "learning_rate": 1.1999704243280622e-06, "loss": 1.1831879615783691, "step": 102 }, { "epoch": 0.21940928270042195, "grad_norm": 0.912856936454773, "learning_rate": 1.1999474214009684e-06, "loss": 1.097001552581787, "step": 104 }, { "epoch": 0.22362869198312235, "grad_norm": 2.3082234859466553, "learning_rate": 1.1999178466889462e-06, "loss": 1.089848518371582, "step": 106 }, { "epoch": 0.22784810126582278, "grad_norm": 1.7334387302398682, "learning_rate": 1.1998817005519536e-06, "loss": 1.0864239931106567, "step": 108 }, { "epoch": 0.2320675105485232, "grad_norm": 1.158636212348938, "learning_rate": 1.1998389834299315e-06, "loss": 1.135922908782959, "step": 110 }, { "epoch": 0.23628691983122363, "grad_norm": 1.3626004457473755, "learning_rate": 1.1997896958427962e-06, "loss": 1.511846661567688, "step": 112 }, { "epoch": 0.24050632911392406, "grad_norm": 2.417889356613159, "learning_rate": 1.199733838390435e-06, "loss": 1.387232780456543, "step": 114 }, { "epoch": 0.24472573839662448, "grad_norm": 2.854128837585449, "learning_rate": 1.1996714117526975e-06, "loss": 1.7170121669769287, "step": 116 }, { "epoch": 0.2489451476793249, "grad_norm": 1.1655317544937134, "learning_rate": 1.1996024166893883e-06, "loss": 1.4113752841949463, "step": 118 }, { "epoch": 0.25316455696202533, "grad_norm": 0.6216784715652466, "learning_rate": 1.199526854040257e-06, "loss": 1.0603108406066895, "step": 120 }, { "epoch": 0.25738396624472576, "grad_norm": 8.596741676330566, "learning_rate": 1.1994447247249886e-06, "loss": 1.3766067028045654, "step": 122 }, { "epoch": 0.2616033755274262, "grad_norm": 6.650040149688721, "learning_rate": 1.199356029743192e-06, "loss": 1.3911750316619873, "step": 124 }, { "epoch": 0.26582278481012656, "grad_norm": 1.3944730758666992, "learning_rate": 1.1992607701743877e-06, "loss": 1.479828953742981, "step": 126 }, { "epoch": 0.270042194092827, "grad_norm": 1.8790662288665771, "learning_rate": 1.1991589471779944e-06, "loss": 1.0149238109588623, "step": 128 }, { "epoch": 0.2742616033755274, "grad_norm": 2.3544161319732666, "learning_rate": 1.1990505619933166e-06, "loss": 1.2252846956253052, "step": 130 }, { "epoch": 0.27848101265822783, "grad_norm": 1.664776086807251, "learning_rate": 1.1989356159395268e-06, "loss": 1.3019721508026123, "step": 132 }, { "epoch": 0.28270042194092826, "grad_norm": 9.16057014465332, "learning_rate": 1.1988141104156518e-06, "loss": 0.8186183571815491, "step": 134 }, { "epoch": 0.2869198312236287, "grad_norm": 1.194026231765747, "learning_rate": 1.1986860469005543e-06, "loss": 1.0649269819259644, "step": 136 }, { "epoch": 0.2911392405063291, "grad_norm": 1.9782294034957886, "learning_rate": 1.1985514269529155e-06, "loss": 1.479400873184204, "step": 138 }, { "epoch": 0.29535864978902954, "grad_norm": 5.742581367492676, "learning_rate": 1.1984102522112159e-06, "loss": 1.0161385536193848, "step": 140 }, { "epoch": 0.29957805907172996, "grad_norm": 1.9590661525726318, "learning_rate": 1.1982625243937158e-06, "loss": 1.3290033340454102, "step": 142 }, { "epoch": 0.3037974683544304, "grad_norm": 7.880887031555176, "learning_rate": 1.198108245298433e-06, "loss": 1.1459200382232666, "step": 144 }, { "epoch": 0.3080168776371308, "grad_norm": 3.6530940532684326, "learning_rate": 1.1979474168031232e-06, "loss": 1.4865257740020752, "step": 146 }, { "epoch": 0.31223628691983124, "grad_norm": 1.9205989837646484, "learning_rate": 1.1977800408652552e-06, "loss": 1.4399386644363403, "step": 148 }, { "epoch": 0.31645569620253167, "grad_norm": 1.6300798654556274, "learning_rate": 1.1976061195219877e-06, "loss": 1.3092478513717651, "step": 150 }, { "epoch": 0.3206751054852321, "grad_norm": 1.216039776802063, "learning_rate": 1.1974256548901447e-06, "loss": 1.3857874870300293, "step": 152 }, { "epoch": 0.32489451476793246, "grad_norm": 1.1664661169052124, "learning_rate": 1.1972386491661896e-06, "loss": 1.5414711236953735, "step": 154 }, { "epoch": 0.3291139240506329, "grad_norm": 6.513598918914795, "learning_rate": 1.1970451046261986e-06, "loss": 1.3435574769973755, "step": 156 }, { "epoch": 0.3333333333333333, "grad_norm": 1.0399960279464722, "learning_rate": 1.196845023625833e-06, "loss": 1.4113006591796875, "step": 158 }, { "epoch": 0.33755274261603374, "grad_norm": 1.2614532709121704, "learning_rate": 1.196638408600309e-06, "loss": 1.4127811193466187, "step": 160 }, { "epoch": 0.34177215189873417, "grad_norm": 1.940744161605835, "learning_rate": 1.1964252620643718e-06, "loss": 0.9027857184410095, "step": 162 }, { "epoch": 0.3459915611814346, "grad_norm": 2.2784762382507324, "learning_rate": 1.1962055866122608e-06, "loss": 1.305877447128296, "step": 164 }, { "epoch": 0.350210970464135, "grad_norm": 2.4151477813720703, "learning_rate": 1.1959793849176804e-06, "loss": 0.8810802698135376, "step": 166 }, { "epoch": 0.35443037974683544, "grad_norm": 1.4606540203094482, "learning_rate": 1.195746659733767e-06, "loss": 1.3153032064437866, "step": 168 }, { "epoch": 0.35864978902953587, "grad_norm": 2.1664512157440186, "learning_rate": 1.1955074138930558e-06, "loss": 1.409055233001709, "step": 170 }, { "epoch": 0.3628691983122363, "grad_norm": 1.8031892776489258, "learning_rate": 1.1952616503074452e-06, "loss": 1.288240909576416, "step": 172 }, { "epoch": 0.3670886075949367, "grad_norm": 1.2246201038360596, "learning_rate": 1.1950093719681623e-06, "loss": 1.0962798595428467, "step": 174 }, { "epoch": 0.37130801687763715, "grad_norm": 2.090580701828003, "learning_rate": 1.1947505819457264e-06, "loss": 1.4130232334136963, "step": 176 }, { "epoch": 0.3755274261603376, "grad_norm": 0.9846044778823853, "learning_rate": 1.1944852833899122e-06, "loss": 1.4005430936813354, "step": 178 }, { "epoch": 0.379746835443038, "grad_norm": 1.043843388557434, "learning_rate": 1.1942134795297092e-06, "loss": 1.0696699619293213, "step": 180 }, { "epoch": 0.38396624472573837, "grad_norm": 1.0868744850158691, "learning_rate": 1.1939351736732854e-06, "loss": 1.3760430812835693, "step": 182 }, { "epoch": 0.3881856540084388, "grad_norm": 1.203220009803772, "learning_rate": 1.193650369207945e-06, "loss": 1.3777748346328735, "step": 184 }, { "epoch": 0.3924050632911392, "grad_norm": 2.734304666519165, "learning_rate": 1.1933590696000883e-06, "loss": 0.5890464186668396, "step": 186 }, { "epoch": 0.39662447257383965, "grad_norm": 1.4362255334854126, "learning_rate": 1.193061278395168e-06, "loss": 1.0182992219924927, "step": 188 }, { "epoch": 0.4008438818565401, "grad_norm": 1.4891494512557983, "learning_rate": 1.1927569992176479e-06, "loss": 1.1124638319015503, "step": 190 }, { "epoch": 0.4050632911392405, "grad_norm": 1.0771912336349487, "learning_rate": 1.1924462357709577e-06, "loss": 1.3731889724731445, "step": 192 }, { "epoch": 0.4092827004219409, "grad_norm": 1.2685896158218384, "learning_rate": 1.1921289918374481e-06, "loss": 1.1032942533493042, "step": 194 }, { "epoch": 0.41350210970464135, "grad_norm": 1.8579025268554688, "learning_rate": 1.1918052712783451e-06, "loss": 1.364923357963562, "step": 196 }, { "epoch": 0.4177215189873418, "grad_norm": 1.3035788536071777, "learning_rate": 1.1914750780337023e-06, "loss": 1.0887572765350342, "step": 198 }, { "epoch": 0.4219409282700422, "grad_norm": 1.1359857320785522, "learning_rate": 1.1911384161223538e-06, "loss": 1.1425602436065674, "step": 200 }, { "epoch": 0.42616033755274263, "grad_norm": 1.8307956457138062, "learning_rate": 1.1907952896418643e-06, "loss": 1.177668809890747, "step": 202 }, { "epoch": 0.43037974683544306, "grad_norm": 1.2392957210540771, "learning_rate": 1.1904457027684802e-06, "loss": 0.9585235714912415, "step": 204 }, { "epoch": 0.4345991561181435, "grad_norm": 1.794783115386963, "learning_rate": 1.1900896597570784e-06, "loss": 1.6650797128677368, "step": 206 }, { "epoch": 0.4388185654008439, "grad_norm": 1.6542989015579224, "learning_rate": 1.1897271649411145e-06, "loss": 0.7469709515571594, "step": 208 }, { "epoch": 0.4430379746835443, "grad_norm": 1.3410842418670654, "learning_rate": 1.1893582227325694e-06, "loss": 1.3532118797302246, "step": 210 }, { "epoch": 0.4472573839662447, "grad_norm": 3.8952300548553467, "learning_rate": 1.1889828376218972e-06, "loss": 0.8239259719848633, "step": 212 }, { "epoch": 0.45147679324894513, "grad_norm": 1.3895829916000366, "learning_rate": 1.1886010141779688e-06, "loss": 1.2587556838989258, "step": 214 }, { "epoch": 0.45569620253164556, "grad_norm": 0.944612443447113, "learning_rate": 1.1882127570480174e-06, "loss": 1.3315932750701904, "step": 216 }, { "epoch": 0.459915611814346, "grad_norm": 1.4940253496170044, "learning_rate": 1.1878180709575815e-06, "loss": 1.3548877239227295, "step": 218 }, { "epoch": 0.4641350210970464, "grad_norm": 1.9045593738555908, "learning_rate": 1.1874169607104478e-06, "loss": 1.3191989660263062, "step": 220 }, { "epoch": 0.46835443037974683, "grad_norm": 2.8294312953948975, "learning_rate": 1.187009431188592e-06, "loss": 1.270053505897522, "step": 222 }, { "epoch": 0.47257383966244726, "grad_norm": 1.5677937269210815, "learning_rate": 1.1865954873521197e-06, "loss": 1.4200479984283447, "step": 224 }, { "epoch": 0.4767932489451477, "grad_norm": 1.1733640432357788, "learning_rate": 1.1861751342392067e-06, "loss": 1.3603910207748413, "step": 226 }, { "epoch": 0.4810126582278481, "grad_norm": 2.0203163623809814, "learning_rate": 1.185748376966037e-06, "loss": 0.8049441576004028, "step": 228 }, { "epoch": 0.48523206751054854, "grad_norm": 1.1351509094238281, "learning_rate": 1.18531522072674e-06, "loss": 0.9551719427108765, "step": 230 }, { "epoch": 0.48945147679324896, "grad_norm": 2.4217798709869385, "learning_rate": 1.1848756707933284e-06, "loss": 0.9277099967002869, "step": 232 }, { "epoch": 0.4936708860759494, "grad_norm": 1.298305869102478, "learning_rate": 1.1844297325156337e-06, "loss": 1.334661602973938, "step": 234 }, { "epoch": 0.4978902953586498, "grad_norm": 1.7692943811416626, "learning_rate": 1.183977411321241e-06, "loss": 1.372158169746399, "step": 236 }, { "epoch": 0.5021097046413502, "grad_norm": 2.271902322769165, "learning_rate": 1.1835187127154221e-06, "loss": 1.036437749862671, "step": 238 }, { "epoch": 0.5063291139240507, "grad_norm": 2.205810070037842, "learning_rate": 1.18305364228107e-06, "loss": 0.470305472612381, "step": 240 }, { "epoch": 0.510548523206751, "grad_norm": 1.814501404762268, "learning_rate": 1.1825822056786304e-06, "loss": 1.4641677141189575, "step": 242 }, { "epoch": 0.5147679324894515, "grad_norm": 1.974550724029541, "learning_rate": 1.182104408646032e-06, "loss": 0.9201871156692505, "step": 244 }, { "epoch": 0.5189873417721519, "grad_norm": 1.1704046726226807, "learning_rate": 1.1816202569986176e-06, "loss": 1.5398619174957275, "step": 246 }, { "epoch": 0.5232067510548524, "grad_norm": 2.400852918624878, "learning_rate": 1.181129756629073e-06, "loss": 1.265621542930603, "step": 248 }, { "epoch": 0.5274261603375527, "grad_norm": 1.2383782863616943, "learning_rate": 1.1806329135073552e-06, "loss": 1.3679600954055786, "step": 250 }, { "epoch": 0.5316455696202531, "grad_norm": 1.7232836484909058, "learning_rate": 1.18012973368062e-06, "loss": 1.171999216079712, "step": 252 }, { "epoch": 0.5358649789029536, "grad_norm": 1.5587044954299927, "learning_rate": 1.1796202232731485e-06, "loss": 1.2946254014968872, "step": 254 }, { "epoch": 0.540084388185654, "grad_norm": 2.5242385864257812, "learning_rate": 1.1791043884862711e-06, "loss": 1.254220724105835, "step": 256 }, { "epoch": 0.5443037974683544, "grad_norm": 2.730691432952881, "learning_rate": 1.178582235598295e-06, "loss": 1.078680396080017, "step": 258 }, { "epoch": 0.5485232067510548, "grad_norm": 4.930171489715576, "learning_rate": 1.1780537709644245e-06, "loss": 1.0161340236663818, "step": 260 }, { "epoch": 0.5527426160337553, "grad_norm": 1.1866450309753418, "learning_rate": 1.177519001016686e-06, "loss": 1.352670431137085, "step": 262 }, { "epoch": 0.5569620253164557, "grad_norm": 1.242305040359497, "learning_rate": 1.1769779322638483e-06, "loss": 1.3570655584335327, "step": 264 }, { "epoch": 0.5611814345991561, "grad_norm": 3.957782745361328, "learning_rate": 1.1764305712913445e-06, "loss": 1.311238169670105, "step": 266 }, { "epoch": 0.5654008438818565, "grad_norm": 2.596217393875122, "learning_rate": 1.1758769247611908e-06, "loss": 1.5630828142166138, "step": 268 }, { "epoch": 0.569620253164557, "grad_norm": 2.2337851524353027, "learning_rate": 1.1753169994119063e-06, "loss": 1.0898045301437378, "step": 270 }, { "epoch": 0.5738396624472574, "grad_norm": 1.5337291955947876, "learning_rate": 1.1747508020584302e-06, "loss": 1.3198161125183105, "step": 272 }, { "epoch": 0.5780590717299579, "grad_norm": 4.55377721786499, "learning_rate": 1.17417833959204e-06, "loss": 1.1360241174697876, "step": 274 }, { "epoch": 0.5822784810126582, "grad_norm": 10.100658416748047, "learning_rate": 1.173599618980266e-06, "loss": 1.3351401090621948, "step": 276 }, { "epoch": 0.5864978902953587, "grad_norm": 1.0856088399887085, "learning_rate": 1.1730146472668075e-06, "loss": 1.4669663906097412, "step": 278 }, { "epoch": 0.5907172995780591, "grad_norm": 1.9721051454544067, "learning_rate": 1.1724234315714474e-06, "loss": 1.003104329109192, "step": 280 }, { "epoch": 0.5949367088607594, "grad_norm": 1.5582189559936523, "learning_rate": 1.1718259790899647e-06, "loss": 1.405082106590271, "step": 282 }, { "epoch": 0.5991561181434599, "grad_norm": 1.6864080429077148, "learning_rate": 1.1712222970940478e-06, "loss": 1.595037579536438, "step": 284 }, { "epoch": 0.6033755274261603, "grad_norm": 0.3938112258911133, "learning_rate": 1.1706123929312049e-06, "loss": 1.1622782945632935, "step": 286 }, { "epoch": 0.6075949367088608, "grad_norm": 1.0948325395584106, "learning_rate": 1.1699962740246754e-06, "loss": 1.325197458267212, "step": 288 }, { "epoch": 0.6118143459915611, "grad_norm": 1.514491081237793, "learning_rate": 1.1693739478733393e-06, "loss": 0.8543146848678589, "step": 290 }, { "epoch": 0.6160337552742616, "grad_norm": 1.247450590133667, "learning_rate": 1.1687454220516262e-06, "loss": 0.6629498600959778, "step": 292 }, { "epoch": 0.620253164556962, "grad_norm": 0.9832797050476074, "learning_rate": 1.1681107042094227e-06, "loss": 1.3061555624008179, "step": 294 }, { "epoch": 0.6244725738396625, "grad_norm": 1.6159130334854126, "learning_rate": 1.1674698020719791e-06, "loss": 0.7377375364303589, "step": 296 }, { "epoch": 0.6286919831223629, "grad_norm": 2.47055983543396, "learning_rate": 1.1668227234398165e-06, "loss": 0.6730928421020508, "step": 298 }, { "epoch": 0.6329113924050633, "grad_norm": 3.4460909366607666, "learning_rate": 1.16616947618863e-06, "loss": 1.8364771604537964, "step": 300 }, { "epoch": 0.6371308016877637, "grad_norm": 1.025884985923767, "learning_rate": 1.1655100682691951e-06, "loss": 1.3243968486785889, "step": 302 }, { "epoch": 0.6413502109704642, "grad_norm": 4.711573600769043, "learning_rate": 1.1648445077072692e-06, "loss": 0.9149092435836792, "step": 304 }, { "epoch": 0.6455696202531646, "grad_norm": 1.3360319137573242, "learning_rate": 1.164172802603494e-06, "loss": 1.0405997037887573, "step": 306 }, { "epoch": 0.6497890295358649, "grad_norm": 0.9437978267669678, "learning_rate": 1.1634949611332986e-06, "loss": 1.3173035383224487, "step": 308 }, { "epoch": 0.6540084388185654, "grad_norm": 1.0983142852783203, "learning_rate": 1.1628109915467975e-06, "loss": 1.251430869102478, "step": 310 }, { "epoch": 0.6582278481012658, "grad_norm": 2.2314326763153076, "learning_rate": 1.1621209021686924e-06, "loss": 1.0687130689620972, "step": 312 }, { "epoch": 0.6624472573839663, "grad_norm": 1.2847200632095337, "learning_rate": 1.1614247013981692e-06, "loss": 1.2770864963531494, "step": 314 }, { "epoch": 0.6666666666666666, "grad_norm": 2.491546630859375, "learning_rate": 1.1607223977087972e-06, "loss": 1.1677052974700928, "step": 316 }, { "epoch": 0.6708860759493671, "grad_norm": 1.112823486328125, "learning_rate": 1.160013999648425e-06, "loss": 1.1452233791351318, "step": 318 }, { "epoch": 0.6751054852320675, "grad_norm": 2.3609695434570312, "learning_rate": 1.1592995158390764e-06, "loss": 1.1290454864501953, "step": 320 }, { "epoch": 0.679324894514768, "grad_norm": 1.2427384853363037, "learning_rate": 1.1585789549768468e-06, "loss": 0.9067545533180237, "step": 322 }, { "epoch": 0.6835443037974683, "grad_norm": 1.1474452018737793, "learning_rate": 1.157852325831795e-06, "loss": 1.0441116094589233, "step": 324 }, { "epoch": 0.6877637130801688, "grad_norm": 2.173767328262329, "learning_rate": 1.157119637247839e-06, "loss": 0.8966209292411804, "step": 326 }, { "epoch": 0.6919831223628692, "grad_norm": 1.1117126941680908, "learning_rate": 1.1563808981426463e-06, "loss": 0.9047636985778809, "step": 328 }, { "epoch": 0.6962025316455697, "grad_norm": 5.761388778686523, "learning_rate": 1.155636117507527e-06, "loss": 1.6347975730895996, "step": 330 }, { "epoch": 0.70042194092827, "grad_norm": 2.036907434463501, "learning_rate": 1.1548853044073231e-06, "loss": 1.1312888860702515, "step": 332 }, { "epoch": 0.7046413502109705, "grad_norm": 1.1030317544937134, "learning_rate": 1.1541284679802987e-06, "loss": 1.441202163696289, "step": 334 }, { "epoch": 0.7088607594936709, "grad_norm": 1.0779141187667847, "learning_rate": 1.1533656174380295e-06, "loss": 1.3240406513214111, "step": 336 }, { "epoch": 0.7130801687763713, "grad_norm": 1.0553290843963623, "learning_rate": 1.1525967620652888e-06, "loss": 1.355104684829712, "step": 338 }, { "epoch": 0.7172995780590717, "grad_norm": 1.1798067092895508, "learning_rate": 1.151821911219936e-06, "loss": 1.3105881214141846, "step": 340 }, { "epoch": 0.7215189873417721, "grad_norm": 1.144148349761963, "learning_rate": 1.151041074332803e-06, "loss": 1.3141815662384033, "step": 342 }, { "epoch": 0.7257383966244726, "grad_norm": 7.016842365264893, "learning_rate": 1.1502542609075783e-06, "loss": 1.1324222087860107, "step": 344 }, { "epoch": 0.729957805907173, "grad_norm": 2.61010479927063, "learning_rate": 1.1494614805206915e-06, "loss": 0.908640444278717, "step": 346 }, { "epoch": 0.7341772151898734, "grad_norm": 1.0395554304122925, "learning_rate": 1.1486627428211974e-06, "loss": 1.308266282081604, "step": 348 }, { "epoch": 0.7383966244725738, "grad_norm": 2.721623659133911, "learning_rate": 1.147858057530658e-06, "loss": 1.228571891784668, "step": 350 }, { "epoch": 0.7426160337552743, "grad_norm": 22.471782684326172, "learning_rate": 1.1470474344430244e-06, "loss": 1.149246335029602, "step": 352 }, { "epoch": 0.7468354430379747, "grad_norm": 2.0179569721221924, "learning_rate": 1.1462308834245177e-06, "loss": 0.6629557013511658, "step": 354 }, { "epoch": 0.7510548523206751, "grad_norm": 2.0603108406066895, "learning_rate": 1.1454084144135089e-06, "loss": 1.0916632413864136, "step": 356 }, { "epoch": 0.7552742616033755, "grad_norm": 12.69516372680664, "learning_rate": 1.1445800374203972e-06, "loss": 1.026712417602539, "step": 358 }, { "epoch": 0.759493670886076, "grad_norm": 1.3232800960540771, "learning_rate": 1.1437457625274893e-06, "loss": 1.2708055973052979, "step": 360 }, { "epoch": 0.7637130801687764, "grad_norm": 2.96313738822937, "learning_rate": 1.1429055998888764e-06, "loss": 1.0283684730529785, "step": 362 }, { "epoch": 0.7679324894514767, "grad_norm": 2.2174739837646484, "learning_rate": 1.1420595597303093e-06, "loss": 1.6322853565216064, "step": 364 }, { "epoch": 0.7721518987341772, "grad_norm": 3.5580825805664062, "learning_rate": 1.1412076523490762e-06, "loss": 0.7543882727622986, "step": 366 }, { "epoch": 0.7763713080168776, "grad_norm": 1.3570228815078735, "learning_rate": 1.140349888113876e-06, "loss": 1.1952807903289795, "step": 368 }, { "epoch": 0.7805907172995781, "grad_norm": 3.309746503829956, "learning_rate": 1.1394862774646915e-06, "loss": 1.5346460342407227, "step": 370 }, { "epoch": 0.7848101265822784, "grad_norm": 2.107482433319092, "learning_rate": 1.1386168309126637e-06, "loss": 1.0200968980789185, "step": 372 }, { "epoch": 0.7890295358649789, "grad_norm": 1.8146216869354248, "learning_rate": 1.1377415590399635e-06, "loss": 1.0982069969177246, "step": 374 }, { "epoch": 0.7932489451476793, "grad_norm": 1.029420256614685, "learning_rate": 1.1368604724996625e-06, "loss": 1.197360873222351, "step": 376 }, { "epoch": 0.7974683544303798, "grad_norm": 1.1954386234283447, "learning_rate": 1.1359735820156029e-06, "loss": 1.1520774364471436, "step": 378 }, { "epoch": 0.8016877637130801, "grad_norm": 1.2728540897369385, "learning_rate": 1.1350808983822688e-06, "loss": 0.7453869581222534, "step": 380 }, { "epoch": 0.8059071729957806, "grad_norm": 2.447286605834961, "learning_rate": 1.134182432464653e-06, "loss": 1.3108738660812378, "step": 382 }, { "epoch": 0.810126582278481, "grad_norm": 5.362612247467041, "learning_rate": 1.1332781951981248e-06, "loss": 1.0827962160110474, "step": 384 }, { "epoch": 0.8143459915611815, "grad_norm": 1.7063276767730713, "learning_rate": 1.1323681975882984e-06, "loss": 1.3062907457351685, "step": 386 }, { "epoch": 0.8185654008438819, "grad_norm": 2.8370184898376465, "learning_rate": 1.131452450710898e-06, "loss": 0.9684048295021057, "step": 388 }, { "epoch": 0.8227848101265823, "grad_norm": 1.1811680793762207, "learning_rate": 1.1305309657116222e-06, "loss": 1.2863088846206665, "step": 390 }, { "epoch": 0.8270042194092827, "grad_norm": 3.667228937149048, "learning_rate": 1.1296037538060104e-06, "loss": 1.0412209033966064, "step": 392 }, { "epoch": 0.8312236286919831, "grad_norm": 4.117892265319824, "learning_rate": 1.128670826279304e-06, "loss": 0.9639609456062317, "step": 394 }, { "epoch": 0.8354430379746836, "grad_norm": 1.29248046875, "learning_rate": 1.1277321944863108e-06, "loss": 1.2934151887893677, "step": 396 }, { "epoch": 0.8396624472573839, "grad_norm": 0.26427099108695984, "learning_rate": 1.1267878698512655e-06, "loss": 1.1188089847564697, "step": 398 }, { "epoch": 0.8438818565400844, "grad_norm": 0.8574454188346863, "learning_rate": 1.125837863867692e-06, "loss": 0.9975463151931763, "step": 400 }, { "epoch": 0.8481012658227848, "grad_norm": 1.629779577255249, "learning_rate": 1.1248821880982622e-06, "loss": 0.7363186478614807, "step": 402 }, { "epoch": 0.8523206751054853, "grad_norm": 1.8325449228286743, "learning_rate": 1.1239208541746565e-06, "loss": 1.2270734310150146, "step": 404 }, { "epoch": 0.8565400843881856, "grad_norm": 0.7708742618560791, "learning_rate": 1.1229538737974207e-06, "loss": 0.9653185606002808, "step": 406 }, { "epoch": 0.8607594936708861, "grad_norm": 2.376756429672241, "learning_rate": 1.1219812587358254e-06, "loss": 0.997606098651886, "step": 408 }, { "epoch": 0.8649789029535865, "grad_norm": 1.2060413360595703, "learning_rate": 1.121003020827721e-06, "loss": 1.2897322177886963, "step": 410 }, { "epoch": 0.869198312236287, "grad_norm": 1.555523157119751, "learning_rate": 1.1200191719793948e-06, "loss": 0.876572847366333, "step": 412 }, { "epoch": 0.8734177215189873, "grad_norm": 3.1254689693450928, "learning_rate": 1.1190297241654262e-06, "loss": 1.2611523866653442, "step": 414 }, { "epoch": 0.8776371308016878, "grad_norm": 1.103677749633789, "learning_rate": 1.1180346894285397e-06, "loss": 1.0928722620010376, "step": 416 }, { "epoch": 0.8818565400843882, "grad_norm": 2.221696615219116, "learning_rate": 1.1170340798794594e-06, "loss": 1.2073904275894165, "step": 418 }, { "epoch": 0.8860759493670886, "grad_norm": 1.7576788663864136, "learning_rate": 1.1160279076967616e-06, "loss": 0.9891563057899475, "step": 420 }, { "epoch": 0.890295358649789, "grad_norm": 2.0383450984954834, "learning_rate": 1.1150161851267262e-06, "loss": 1.399549126625061, "step": 422 }, { "epoch": 0.8945147679324894, "grad_norm": 3.365711212158203, "learning_rate": 1.1139989244831874e-06, "loss": 1.029995083808899, "step": 424 }, { "epoch": 0.8987341772151899, "grad_norm": 2.773817539215088, "learning_rate": 1.1129761381473842e-06, "loss": 1.2264801263809204, "step": 426 }, { "epoch": 0.9029535864978903, "grad_norm": 2.2570652961730957, "learning_rate": 1.11194783856781e-06, "loss": 1.0824590921401978, "step": 428 }, { "epoch": 0.9071729957805907, "grad_norm": 5.947412967681885, "learning_rate": 1.1109140382600606e-06, "loss": 1.057291865348816, "step": 430 }, { "epoch": 0.9113924050632911, "grad_norm": 5.3977580070495605, "learning_rate": 1.1098747498066824e-06, "loss": 1.1226750612258911, "step": 432 }, { "epoch": 0.9156118143459916, "grad_norm": 3.355656385421753, "learning_rate": 1.108829985857018e-06, "loss": 1.3119703531265259, "step": 434 }, { "epoch": 0.919831223628692, "grad_norm": 3.1750526428222656, "learning_rate": 1.1077797591270538e-06, "loss": 0.9117200970649719, "step": 436 }, { "epoch": 0.9240506329113924, "grad_norm": 1.7613136768341064, "learning_rate": 1.1067240823992643e-06, "loss": 1.2639193534851074, "step": 438 }, { "epoch": 0.9282700421940928, "grad_norm": 1.1626863479614258, "learning_rate": 1.105662968522457e-06, "loss": 1.0154443979263306, "step": 440 }, { "epoch": 0.9324894514767933, "grad_norm": 4.403058052062988, "learning_rate": 1.1045964304116158e-06, "loss": 0.9742609262466431, "step": 442 }, { "epoch": 0.9367088607594937, "grad_norm": 4.4023237228393555, "learning_rate": 1.1035244810477435e-06, "loss": 1.161311388015747, "step": 444 }, { "epoch": 0.9409282700421941, "grad_norm": 1.6864513158798218, "learning_rate": 1.1024471334777044e-06, "loss": 1.3747820854187012, "step": 446 }, { "epoch": 0.9451476793248945, "grad_norm": 1.197826623916626, "learning_rate": 1.1013644008140647e-06, "loss": 1.0570836067199707, "step": 448 }, { "epoch": 0.9493670886075949, "grad_norm": 4.425671577453613, "learning_rate": 1.1002762962349342e-06, "loss": 1.066590666770935, "step": 450 }, { "epoch": 0.9535864978902954, "grad_norm": 1.4791566133499146, "learning_rate": 1.0991828329838048e-06, "loss": 1.3325567245483398, "step": 452 }, { "epoch": 0.9578059071729957, "grad_norm": 1.0424067974090576, "learning_rate": 1.0980840243693891e-06, "loss": 1.0253040790557861, "step": 454 }, { "epoch": 0.9620253164556962, "grad_norm": 1.6803632974624634, "learning_rate": 1.0969798837654603e-06, "loss": 1.2115472555160522, "step": 456 }, { "epoch": 0.9662447257383966, "grad_norm": 1.7260364294052124, "learning_rate": 1.0958704246106864e-06, "loss": 0.9136871695518494, "step": 458 }, { "epoch": 0.9704641350210971, "grad_norm": 4.201066493988037, "learning_rate": 1.0947556604084698e-06, "loss": 0.7265217304229736, "step": 460 }, { "epoch": 0.9746835443037974, "grad_norm": 1.5730266571044922, "learning_rate": 1.09363560472678e-06, "loss": 0.9232859015464783, "step": 462 }, { "epoch": 0.9789029535864979, "grad_norm": 1.9785159826278687, "learning_rate": 1.0925102711979916e-06, "loss": 1.2320111989974976, "step": 464 }, { "epoch": 0.9831223628691983, "grad_norm": 2.112661123275757, "learning_rate": 1.0913796735187152e-06, "loss": 0.7564235925674438, "step": 466 }, { "epoch": 0.9873417721518988, "grad_norm": 3.1255481243133545, "learning_rate": 1.0902438254496335e-06, "loss": 1.3790355920791626, "step": 468 }, { "epoch": 0.9915611814345991, "grad_norm": 2.643756866455078, "learning_rate": 1.0891027408153311e-06, "loss": 0.8968592286109924, "step": 470 }, { "epoch": 0.9957805907172996, "grad_norm": 1.2613961696624756, "learning_rate": 1.087956433504129e-06, "loss": 1.2724238634109497, "step": 472 }, { "epoch": 1.0, "grad_norm": 1.897484302520752, "learning_rate": 1.0868049174679133e-06, "loss": 1.3249882459640503, "step": 474 }, { "epoch": 1.0042194092827004, "grad_norm": 15.572712898254395, "learning_rate": 1.0856482067219672e-06, "loss": 1.1418063640594482, "step": 476 }, { "epoch": 1.0084388185654007, "grad_norm": 0.9338006377220154, "learning_rate": 1.0844863153447983e-06, "loss": 1.2509591579437256, "step": 478 }, { "epoch": 1.0126582278481013, "grad_norm": 1.7618024349212646, "learning_rate": 1.0833192574779696e-06, "loss": 1.2292466163635254, "step": 480 }, { "epoch": 1.0168776371308017, "grad_norm": 2.1609816551208496, "learning_rate": 1.0821470473259254e-06, "loss": 0.9470843076705933, "step": 482 }, { "epoch": 1.021097046413502, "grad_norm": 1.7773792743682861, "learning_rate": 1.0809696991558202e-06, "loss": 1.2175320386886597, "step": 484 }, { "epoch": 1.0253164556962024, "grad_norm": 3.248727560043335, "learning_rate": 1.0797872272973435e-06, "loss": 0.5157210230827332, "step": 486 }, { "epoch": 1.029535864978903, "grad_norm": 4.235684394836426, "learning_rate": 1.078599646142546e-06, "loss": 1.0747886896133423, "step": 488 }, { "epoch": 1.0337552742616034, "grad_norm": 1.0086420774459839, "learning_rate": 1.0774069701456646e-06, "loss": 0.91233229637146, "step": 490 }, { "epoch": 1.0379746835443038, "grad_norm": 1.760449767112732, "learning_rate": 1.0762092138229461e-06, "loss": 1.2355482578277588, "step": 492 }, { "epoch": 1.0421940928270041, "grad_norm": 2.7897939682006836, "learning_rate": 1.0750063917524715e-06, "loss": 0.876376748085022, "step": 494 }, { "epoch": 1.0464135021097047, "grad_norm": 1.583694577217102, "learning_rate": 1.073798518573977e-06, "loss": 0.9621012806892395, "step": 496 }, { "epoch": 1.0506329113924051, "grad_norm": 1.2283833026885986, "learning_rate": 1.0725856089886768e-06, "loss": 1.3705410957336426, "step": 498 }, { "epoch": 1.0548523206751055, "grad_norm": 1.892619013786316, "learning_rate": 1.071367677759084e-06, "loss": 1.2057194709777832, "step": 500 }, { "epoch": 1.0590717299578059, "grad_norm": 1.5852138996124268, "learning_rate": 1.0701447397088314e-06, "loss": 1.225092887878418, "step": 502 }, { "epoch": 1.0632911392405062, "grad_norm": 1.6359580755233765, "learning_rate": 1.0689168097224896e-06, "loss": 1.3359899520874023, "step": 504 }, { "epoch": 1.0675105485232068, "grad_norm": 2.100905418395996, "learning_rate": 1.0676839027453882e-06, "loss": 0.8091757297515869, "step": 506 }, { "epoch": 1.0717299578059072, "grad_norm": 0.9373227953910828, "learning_rate": 1.0664460337834312e-06, "loss": 1.20570969581604, "step": 508 }, { "epoch": 1.0759493670886076, "grad_norm": 1.1788032054901123, "learning_rate": 1.0652032179029165e-06, "loss": 1.2286429405212402, "step": 510 }, { "epoch": 1.080168776371308, "grad_norm": 2.070732355117798, "learning_rate": 1.0639554702303516e-06, "loss": 1.1464022397994995, "step": 512 }, { "epoch": 1.0843881856540085, "grad_norm": 1.4905924797058105, "learning_rate": 1.0627028059522697e-06, "loss": 1.2270240783691406, "step": 514 }, { "epoch": 1.0886075949367089, "grad_norm": 1.2873064279556274, "learning_rate": 1.061445240315044e-06, "loss": 1.2191872596740723, "step": 516 }, { "epoch": 1.0928270042194093, "grad_norm": 1.5833098888397217, "learning_rate": 1.060182788624704e-06, "loss": 1.0899208784103394, "step": 518 }, { "epoch": 1.0970464135021096, "grad_norm": 1.2056680917739868, "learning_rate": 1.0589154662467476e-06, "loss": 1.002990484237671, "step": 520 }, { "epoch": 1.1012658227848102, "grad_norm": 2.3490617275238037, "learning_rate": 1.0576432886059546e-06, "loss": 0.9123169779777527, "step": 522 }, { "epoch": 1.1054852320675106, "grad_norm": 1.398703694343567, "learning_rate": 1.056366271186199e-06, "loss": 1.1336543560028076, "step": 524 }, { "epoch": 1.109704641350211, "grad_norm": 5.56015682220459, "learning_rate": 1.0550844295302604e-06, "loss": 0.8910406231880188, "step": 526 }, { "epoch": 1.1139240506329113, "grad_norm": 1.501484751701355, "learning_rate": 1.0537977792396352e-06, "loss": 1.4902470111846924, "step": 528 }, { "epoch": 1.1181434599156117, "grad_norm": 3.0271458625793457, "learning_rate": 1.0525063359743461e-06, "loss": 1.2566696405410767, "step": 530 }, { "epoch": 1.1223628691983123, "grad_norm": 3.5508389472961426, "learning_rate": 1.0512101154527524e-06, "loss": 0.6722557544708252, "step": 532 }, { "epoch": 1.1265822784810127, "grad_norm": 5.183070182800293, "learning_rate": 1.049909133451358e-06, "loss": 1.16892409324646, "step": 534 }, { "epoch": 1.130801687763713, "grad_norm": 1.5916978120803833, "learning_rate": 1.0486034058046184e-06, "loss": 1.2602534294128418, "step": 536 }, { "epoch": 1.1350210970464134, "grad_norm": 2.219564914703369, "learning_rate": 1.0472929484047508e-06, "loss": 0.9274411797523499, "step": 538 }, { "epoch": 1.139240506329114, "grad_norm": 2.6550791263580322, "learning_rate": 1.0459777772015377e-06, "loss": 0.7955924868583679, "step": 540 }, { "epoch": 1.1434599156118144, "grad_norm": 1.0860302448272705, "learning_rate": 1.044657908202135e-06, "loss": 0.8460701704025269, "step": 542 }, { "epoch": 1.1476793248945147, "grad_norm": 2.0455169677734375, "learning_rate": 1.0433333574708754e-06, "loss": 0.9194719791412354, "step": 544 }, { "epoch": 1.1518987341772151, "grad_norm": 0.8281774520874023, "learning_rate": 1.042004141129074e-06, "loss": 1.209435224533081, "step": 546 }, { "epoch": 1.1561181434599157, "grad_norm": 3.4284002780914307, "learning_rate": 1.040670275354832e-06, "loss": 1.1639091968536377, "step": 548 }, { "epoch": 1.160337552742616, "grad_norm": 3.964017152786255, "learning_rate": 1.0393317763828394e-06, "loss": 1.0248503684997559, "step": 550 }, { "epoch": 1.1645569620253164, "grad_norm": 1.5311849117279053, "learning_rate": 1.0379886605041773e-06, "loss": 1.5549976825714111, "step": 552 }, { "epoch": 1.1687763713080168, "grad_norm": 1.1133424043655396, "learning_rate": 1.0366409440661203e-06, "loss": 1.2537164688110352, "step": 554 }, { "epoch": 1.1729957805907172, "grad_norm": 2.7838144302368164, "learning_rate": 1.035288643471937e-06, "loss": 0.6379430890083313, "step": 556 }, { "epoch": 1.1772151898734178, "grad_norm": 1.4451053142547607, "learning_rate": 1.0339317751806905e-06, "loss": 1.1707175970077515, "step": 558 }, { "epoch": 1.1814345991561181, "grad_norm": 1.5942399501800537, "learning_rate": 1.0325703557070377e-06, "loss": 0.7751450538635254, "step": 560 }, { "epoch": 1.1856540084388185, "grad_norm": 0.90716153383255, "learning_rate": 1.0312044016210299e-06, "loss": 0.9596038460731506, "step": 562 }, { "epoch": 1.189873417721519, "grad_norm": 3.479564905166626, "learning_rate": 1.029833929547908e-06, "loss": 1.4486083984375, "step": 564 }, { "epoch": 1.1940928270042195, "grad_norm": 1.8199161291122437, "learning_rate": 1.028458956167903e-06, "loss": 1.2960246801376343, "step": 566 }, { "epoch": 1.1983122362869199, "grad_norm": 2.1159839630126953, "learning_rate": 1.0270794982160328e-06, "loss": 0.9260680079460144, "step": 568 }, { "epoch": 1.2025316455696202, "grad_norm": 1.1287225484848022, "learning_rate": 1.0256955724818963e-06, "loss": 1.1793110370635986, "step": 570 }, { "epoch": 1.2067510548523206, "grad_norm": 3.6729793548583984, "learning_rate": 1.0243071958094713e-06, "loss": 1.1447832584381104, "step": 572 }, { "epoch": 1.2109704641350212, "grad_norm": 1.18032705783844, "learning_rate": 1.0229143850969086e-06, "loss": 1.230734944343567, "step": 574 }, { "epoch": 1.2151898734177216, "grad_norm": 1.3310970067977905, "learning_rate": 1.0215171572963262e-06, "loss": 1.0188127756118774, "step": 576 }, { "epoch": 1.219409282700422, "grad_norm": 1.4861373901367188, "learning_rate": 1.020115529413603e-06, "loss": 0.6179706454277039, "step": 578 }, { "epoch": 1.2236286919831223, "grad_norm": 1.791669249534607, "learning_rate": 1.0187095185081726e-06, "loss": 1.0826208591461182, "step": 580 }, { "epoch": 1.2278481012658227, "grad_norm": 2.172065258026123, "learning_rate": 1.0172991416928149e-06, "loss": 0.9076665639877319, "step": 582 }, { "epoch": 1.2320675105485233, "grad_norm": 1.2151978015899658, "learning_rate": 1.0158844161334472e-06, "loss": 0.9629290103912354, "step": 584 }, { "epoch": 1.2362869198312236, "grad_norm": 3.7680106163024902, "learning_rate": 1.014465359048917e-06, "loss": 1.0498627424240112, "step": 586 }, { "epoch": 1.240506329113924, "grad_norm": 1.3523688316345215, "learning_rate": 1.0130419877107911e-06, "loss": 0.8714591860771179, "step": 588 }, { "epoch": 1.2447257383966246, "grad_norm": 1.3897624015808105, "learning_rate": 1.0116143194431453e-06, "loss": 1.247403860092163, "step": 590 }, { "epoch": 1.248945147679325, "grad_norm": 0.8131434917449951, "learning_rate": 1.0101823716223555e-06, "loss": 0.6486424207687378, "step": 592 }, { "epoch": 1.2531645569620253, "grad_norm": 0.7691041827201843, "learning_rate": 1.0087461616768827e-06, "loss": 0.8923141956329346, "step": 594 }, { "epoch": 1.2573839662447257, "grad_norm": 3.8645286560058594, "learning_rate": 1.0073057070870643e-06, "loss": 0.8870598673820496, "step": 596 }, { "epoch": 1.261603375527426, "grad_norm": 1.9958916902542114, "learning_rate": 1.0058610253848993e-06, "loss": 0.6330664753913879, "step": 598 }, { "epoch": 1.2658227848101267, "grad_norm": 0.7626611590385437, "learning_rate": 1.0044121341538363e-06, "loss": 1.0315228700637817, "step": 600 }, { "epoch": 1.270042194092827, "grad_norm": 3.175126791000366, "learning_rate": 1.0029590510285573e-06, "loss": 1.4815832376480103, "step": 602 }, { "epoch": 1.2742616033755274, "grad_norm": 1.2133333683013916, "learning_rate": 1.001501793694766e-06, "loss": 1.1330845355987549, "step": 604 }, { "epoch": 1.2784810126582278, "grad_norm": 6.052526473999023, "learning_rate": 1.0000403798889702e-06, "loss": 1.0692338943481445, "step": 606 }, { "epoch": 1.2827004219409281, "grad_norm": 1.9215449094772339, "learning_rate": 9.985748273982674e-07, "loss": 0.8957496285438538, "step": 608 }, { "epoch": 1.2869198312236287, "grad_norm": 2.6900336742401123, "learning_rate": 9.97105154060127e-07, "loss": 1.0588608980178833, "step": 610 }, { "epoch": 1.2911392405063291, "grad_norm": 4.2595014572143555, "learning_rate": 9.956313777621743e-07, "loss": 0.6556817293167114, "step": 612 }, { "epoch": 1.2953586497890295, "grad_norm": 8.070136070251465, "learning_rate": 9.941535164419721e-07, "loss": 0.718927800655365, "step": 614 }, { "epoch": 1.29957805907173, "grad_norm": 1.7044512033462524, "learning_rate": 9.926715880868028e-07, "loss": 1.1856049299240112, "step": 616 }, { "epoch": 1.3037974683544304, "grad_norm": 1.7254352569580078, "learning_rate": 9.911856107334497e-07, "loss": 1.2073801755905151, "step": 618 }, { "epoch": 1.3080168776371308, "grad_norm": 1.0367379188537598, "learning_rate": 9.896956024679761e-07, "loss": 0.7765376567840576, "step": 620 }, { "epoch": 1.3122362869198312, "grad_norm": 1.8108422756195068, "learning_rate": 9.882015814255073e-07, "loss": 1.221542477607727, "step": 622 }, { "epoch": 1.3164556962025316, "grad_norm": 1.8170989751815796, "learning_rate": 9.867035657900079e-07, "loss": 0.9256758689880371, "step": 624 }, { "epoch": 1.3206751054852321, "grad_norm": 1.397377610206604, "learning_rate": 9.852015737940618e-07, "loss": 1.1996105909347534, "step": 626 }, { "epoch": 1.3248945147679325, "grad_norm": 2.3642630577087402, "learning_rate": 9.836956237186495e-07, "loss": 1.7291648387908936, "step": 628 }, { "epoch": 1.3291139240506329, "grad_norm": 1.3181530237197876, "learning_rate": 9.821857338929266e-07, "loss": 0.9664700627326965, "step": 630 }, { "epoch": 1.3333333333333333, "grad_norm": 1.441937804222107, "learning_rate": 9.806719226939986e-07, "loss": 1.2257553339004517, "step": 632 }, { "epoch": 1.3375527426160336, "grad_norm": 0.5612751841545105, "learning_rate": 9.791542085467003e-07, "loss": 0.9133172035217285, "step": 634 }, { "epoch": 1.3417721518987342, "grad_norm": 1.5648330450057983, "learning_rate": 9.776326099233684e-07, "loss": 0.7176555395126343, "step": 636 }, { "epoch": 1.3459915611814346, "grad_norm": 21.417667388916016, "learning_rate": 9.761071453436195e-07, "loss": 0.9039233326911926, "step": 638 }, { "epoch": 1.350210970464135, "grad_norm": 1.9340534210205078, "learning_rate": 9.745778333741227e-07, "loss": 1.2601927518844604, "step": 640 }, { "epoch": 1.3544303797468356, "grad_norm": 2.36677885055542, "learning_rate": 9.73044692628374e-07, "loss": 0.9230378866195679, "step": 642 }, { "epoch": 1.358649789029536, "grad_norm": 0.7526681423187256, "learning_rate": 9.715077417664705e-07, "loss": 1.3141403198242188, "step": 644 }, { "epoch": 1.3628691983122363, "grad_norm": 1.5295689105987549, "learning_rate": 9.699669994948829e-07, "loss": 1.20694899559021, "step": 646 }, { "epoch": 1.3670886075949367, "grad_norm": 8.918047904968262, "learning_rate": 9.684224845662273e-07, "loss": 0.9112899899482727, "step": 648 }, { "epoch": 1.371308016877637, "grad_norm": 2.3975322246551514, "learning_rate": 9.668742157790378e-07, "loss": 1.4381672143936157, "step": 650 }, { "epoch": 1.3755274261603376, "grad_norm": 1.3441905975341797, "learning_rate": 9.653222119775373e-07, "loss": 1.224355936050415, "step": 652 }, { "epoch": 1.379746835443038, "grad_norm": 8.170360565185547, "learning_rate": 9.637664920514075e-07, "loss": 0.9496920108795166, "step": 654 }, { "epoch": 1.3839662447257384, "grad_norm": 1.037866234779358, "learning_rate": 9.622070749355605e-07, "loss": 1.2685517072677612, "step": 656 }, { "epoch": 1.3881856540084387, "grad_norm": 5.456931114196777, "learning_rate": 9.60643979609907e-07, "loss": 0.676283597946167, "step": 658 }, { "epoch": 1.3924050632911391, "grad_norm": 1.2280727624893188, "learning_rate": 9.59077225099126e-07, "loss": 1.1961660385131836, "step": 660 }, { "epoch": 1.3966244725738397, "grad_norm": 2.4184653759002686, "learning_rate": 9.57506830472433e-07, "loss": 0.7105515599250793, "step": 662 }, { "epoch": 1.40084388185654, "grad_norm": 2.039471387863159, "learning_rate": 9.559328148433473e-07, "loss": 1.2236860990524292, "step": 664 }, { "epoch": 1.4050632911392404, "grad_norm": 3.3632094860076904, "learning_rate": 9.54355197369461e-07, "loss": 0.8225454092025757, "step": 666 }, { "epoch": 1.409282700421941, "grad_norm": 2.8079605102539062, "learning_rate": 9.527739972522041e-07, "loss": 1.224509835243225, "step": 668 }, { "epoch": 1.4135021097046414, "grad_norm": 1.0237503051757812, "learning_rate": 9.511892337366117e-07, "loss": 1.2146466970443726, "step": 670 }, { "epoch": 1.4177215189873418, "grad_norm": 2.5967676639556885, "learning_rate": 9.496009261110901e-07, "loss": 1.5150516033172607, "step": 672 }, { "epoch": 1.4219409282700421, "grad_norm": 1.5330960750579834, "learning_rate": 9.480090937071802e-07, "loss": 0.8809629082679749, "step": 674 }, { "epoch": 1.4261603375527425, "grad_norm": 1.5120795965194702, "learning_rate": 9.464137558993251e-07, "loss": 0.7257891893386841, "step": 676 }, { "epoch": 1.4303797468354431, "grad_norm": 1.8336877822875977, "learning_rate": 9.448149321046316e-07, "loss": 1.0394529104232788, "step": 678 }, { "epoch": 1.4345991561181435, "grad_norm": 3.8357579708099365, "learning_rate": 9.432126417826358e-07, "loss": 1.1556706428527832, "step": 680 }, { "epoch": 1.4388185654008439, "grad_norm": 1.9536528587341309, "learning_rate": 9.416069044350646e-07, "loss": 0.9677222967147827, "step": 682 }, { "epoch": 1.4430379746835442, "grad_norm": 3.1365737915039062, "learning_rate": 9.399977396055995e-07, "loss": 1.2571027278900146, "step": 684 }, { "epoch": 1.4472573839662446, "grad_norm": 3.375725746154785, "learning_rate": 9.383851668796392e-07, "loss": 0.7981452345848083, "step": 686 }, { "epoch": 1.4514767932489452, "grad_norm": 2.0981504917144775, "learning_rate": 9.367692058840594e-07, "loss": 0.9887269735336304, "step": 688 }, { "epoch": 1.4556962025316456, "grad_norm": 5.0241265296936035, "learning_rate": 9.351498762869752e-07, "loss": 1.1597225666046143, "step": 690 }, { "epoch": 1.459915611814346, "grad_norm": 3.389521598815918, "learning_rate": 9.33527197797502e-07, "loss": 0.7292091846466064, "step": 692 }, { "epoch": 1.4641350210970465, "grad_norm": 2.6481471061706543, "learning_rate": 9.319011901655145e-07, "loss": 1.3359123468399048, "step": 694 }, { "epoch": 1.4683544303797469, "grad_norm": 3.0631887912750244, "learning_rate": 9.302718731814072e-07, "loss": 0.7314563393592834, "step": 696 }, { "epoch": 1.4725738396624473, "grad_norm": 1.1294174194335938, "learning_rate": 9.286392666758532e-07, "loss": 1.202915072441101, "step": 698 }, { "epoch": 1.4767932489451476, "grad_norm": 0.9764413237571716, "learning_rate": 9.270033905195628e-07, "loss": 1.2414040565490723, "step": 700 }, { "epoch": 1.481012658227848, "grad_norm": 2.000211000442505, "learning_rate": 9.25364264623042e-07, "loss": 1.1095331907272339, "step": 702 }, { "epoch": 1.4852320675105486, "grad_norm": 2.9179534912109375, "learning_rate": 9.237219089363494e-07, "loss": 0.8434455990791321, "step": 704 }, { "epoch": 1.489451476793249, "grad_norm": 1.4670263528823853, "learning_rate": 9.220763434488545e-07, "loss": 1.1951138973236084, "step": 706 }, { "epoch": 1.4936708860759493, "grad_norm": 1.2732899188995361, "learning_rate": 9.204275881889934e-07, "loss": 1.2532763481140137, "step": 708 }, { "epoch": 1.49789029535865, "grad_norm": 1.302393913269043, "learning_rate": 9.187756632240253e-07, "loss": 1.1061906814575195, "step": 710 }, { "epoch": 1.50210970464135, "grad_norm": 3.0175118446350098, "learning_rate": 9.171205886597887e-07, "loss": 0.5435208082199097, "step": 712 }, { "epoch": 1.5063291139240507, "grad_norm": 2.06999135017395, "learning_rate": 9.154623846404564e-07, "loss": 1.2072559595108032, "step": 714 }, { "epoch": 1.510548523206751, "grad_norm": 0.7862725853919983, "learning_rate": 9.138010713482899e-07, "loss": 1.1671605110168457, "step": 716 }, { "epoch": 1.5147679324894514, "grad_norm": 1.871321439743042, "learning_rate": 9.121366690033944e-07, "loss": 1.1794459819793701, "step": 718 }, { "epoch": 1.518987341772152, "grad_norm": 2.5938880443573, "learning_rate": 9.104691978634728e-07, "loss": 1.0995539426803589, "step": 720 }, { "epoch": 1.5232067510548524, "grad_norm": 3.1606552600860596, "learning_rate": 9.08798678223578e-07, "loss": 1.231619954109192, "step": 722 }, { "epoch": 1.5274261603375527, "grad_norm": 0.9736570715904236, "learning_rate": 9.071251304158672e-07, "loss": 1.250243067741394, "step": 724 }, { "epoch": 1.5316455696202531, "grad_norm": 3.1312761306762695, "learning_rate": 9.054485748093538e-07, "loss": 0.6082893013954163, "step": 726 }, { "epoch": 1.5358649789029535, "grad_norm": 2.6216931343078613, "learning_rate": 9.037690318096597e-07, "loss": 0.4211277663707733, "step": 728 }, { "epoch": 1.540084388185654, "grad_norm": 1.668655276298523, "learning_rate": 9.020865218587668e-07, "loss": 1.0038397312164307, "step": 730 }, { "epoch": 1.5443037974683544, "grad_norm": 2.301889419555664, "learning_rate": 9.004010654347677e-07, "loss": 0.9896605610847473, "step": 732 }, { "epoch": 1.5485232067510548, "grad_norm": 1.4354939460754395, "learning_rate": 8.98712683051618e-07, "loss": 1.235560417175293, "step": 734 }, { "epoch": 1.5527426160337554, "grad_norm": 1.394313097000122, "learning_rate": 8.970213952588844e-07, "loss": 0.986316442489624, "step": 736 }, { "epoch": 1.5569620253164556, "grad_norm": 2.5624778270721436, "learning_rate": 8.953272226414971e-07, "loss": 0.9202096462249756, "step": 738 }, { "epoch": 1.5611814345991561, "grad_norm": 2.074122667312622, "learning_rate": 8.936301858194968e-07, "loss": 1.1290022134780884, "step": 740 }, { "epoch": 1.5654008438818565, "grad_norm": 1.5154873132705688, "learning_rate": 8.919303054477857e-07, "loss": 0.8514289855957031, "step": 742 }, { "epoch": 1.5696202531645569, "grad_norm": 6.750024318695068, "learning_rate": 8.90227602215875e-07, "loss": 0.5599585175514221, "step": 744 }, { "epoch": 1.5738396624472575, "grad_norm": 1.8320426940917969, "learning_rate": 8.885220968476331e-07, "loss": 0.780780017375946, "step": 746 }, { "epoch": 1.5780590717299579, "grad_norm": 3.8813395500183105, "learning_rate": 8.868138101010339e-07, "loss": 0.656001091003418, "step": 748 }, { "epoch": 1.5822784810126582, "grad_norm": 1.3229504823684692, "learning_rate": 8.85102762767904e-07, "loss": 1.215933084487915, "step": 750 }, { "epoch": 1.5864978902953588, "grad_norm": 1.5343960523605347, "learning_rate": 8.833889756736696e-07, "loss": 0.7347640991210938, "step": 752 }, { "epoch": 1.590717299578059, "grad_norm": 1.3849875926971436, "learning_rate": 8.816724696771023e-07, "loss": 0.8356782793998718, "step": 754 }, { "epoch": 1.5949367088607596, "grad_norm": 1.4311785697937012, "learning_rate": 8.799532656700668e-07, "loss": 0.6571628451347351, "step": 756 }, { "epoch": 1.59915611814346, "grad_norm": 1.954759955406189, "learning_rate": 8.78231384577265e-07, "loss": 0.8940713405609131, "step": 758 }, { "epoch": 1.6033755274261603, "grad_norm": 1.7239028215408325, "learning_rate": 8.765068473559826e-07, "loss": 1.1826146841049194, "step": 760 }, { "epoch": 1.6075949367088609, "grad_norm": 0.5811662673950195, "learning_rate": 8.747796749958329e-07, "loss": 0.8342135548591614, "step": 762 }, { "epoch": 1.611814345991561, "grad_norm": 2.9614205360412598, "learning_rate": 8.730498885185022e-07, "loss": 1.2261645793914795, "step": 764 }, { "epoch": 1.6160337552742616, "grad_norm": 1.8469215631484985, "learning_rate": 8.713175089774935e-07, "loss": 1.0828239917755127, "step": 766 }, { "epoch": 1.620253164556962, "grad_norm": 0.7462615966796875, "learning_rate": 8.695825574578708e-07, "loss": 1.08014976978302, "step": 768 }, { "epoch": 1.6244725738396624, "grad_norm": 1.5869735479354858, "learning_rate": 8.678450550760013e-07, "loss": 1.2228014469146729, "step": 770 }, { "epoch": 1.628691983122363, "grad_norm": 1.267874836921692, "learning_rate": 8.661050229793e-07, "loss": 1.2381342649459839, "step": 772 }, { "epoch": 1.6329113924050633, "grad_norm": 1.8816311359405518, "learning_rate": 8.643624823459705e-07, "loss": 1.2392218112945557, "step": 774 }, { "epoch": 1.6371308016877637, "grad_norm": 2.270045280456543, "learning_rate": 8.626174543847494e-07, "loss": 1.2957593202590942, "step": 776 }, { "epoch": 1.6413502109704643, "grad_norm": 0.9629765152931213, "learning_rate": 8.608699603346457e-07, "loss": 0.8434277772903442, "step": 778 }, { "epoch": 1.6455696202531644, "grad_norm": 6.275035381317139, "learning_rate": 8.591200214646842e-07, "loss": 0.3582332730293274, "step": 780 }, { "epoch": 1.649789029535865, "grad_norm": 1.902158260345459, "learning_rate": 8.573676590736464e-07, "loss": 1.1803405284881592, "step": 782 }, { "epoch": 1.6540084388185654, "grad_norm": 1.5325170755386353, "learning_rate": 8.556128944898098e-07, "loss": 0.9606213569641113, "step": 784 }, { "epoch": 1.6582278481012658, "grad_norm": 1.5803859233856201, "learning_rate": 8.538557490706904e-07, "loss": 1.1115106344223022, "step": 786 }, { "epoch": 1.6624472573839664, "grad_norm": 1.5015286207199097, "learning_rate": 8.520962442027808e-07, "loss": 0.5854233503341675, "step": 788 }, { "epoch": 1.6666666666666665, "grad_norm": 1.0308812856674194, "learning_rate": 8.503344013012916e-07, "loss": 1.2016632556915283, "step": 790 }, { "epoch": 1.6708860759493671, "grad_norm": 3.8510706424713135, "learning_rate": 8.485702418098897e-07, "loss": 0.648362398147583, "step": 792 }, { "epoch": 1.6751054852320675, "grad_norm": 0.47625789046287537, "learning_rate": 8.468037872004374e-07, "loss": 1.0536069869995117, "step": 794 }, { "epoch": 1.6793248945147679, "grad_norm": 1.4682717323303223, "learning_rate": 8.450350589727312e-07, "loss": 1.2215386629104614, "step": 796 }, { "epoch": 1.6835443037974684, "grad_norm": 4.169456481933594, "learning_rate": 8.432640786542407e-07, "loss": 0.9762102961540222, "step": 798 }, { "epoch": 1.6877637130801688, "grad_norm": 4.598972797393799, "learning_rate": 8.414908677998456e-07, "loss": 1.2525511980056763, "step": 800 }, { "epoch": 1.6919831223628692, "grad_norm": 4.160123825073242, "learning_rate": 8.39715447991574e-07, "loss": 0.6331847906112671, "step": 802 }, { "epoch": 1.6962025316455698, "grad_norm": 1.725115180015564, "learning_rate": 8.379378408383392e-07, "loss": 1.2866941690444946, "step": 804 }, { "epoch": 1.70042194092827, "grad_norm": 1.8015742301940918, "learning_rate": 8.361580679756771e-07, "loss": 1.1813989877700806, "step": 806 }, { "epoch": 1.7046413502109705, "grad_norm": 2.260415554046631, "learning_rate": 8.343761510654834e-07, "loss": 0.8856143355369568, "step": 808 }, { "epoch": 1.7088607594936709, "grad_norm": 3.748908042907715, "learning_rate": 8.325921117957487e-07, "loss": 0.9216241240501404, "step": 810 }, { "epoch": 1.7130801687763713, "grad_norm": 1.0273261070251465, "learning_rate": 8.308059718802953e-07, "loss": 1.1896474361419678, "step": 812 }, { "epoch": 1.7172995780590719, "grad_norm": 5.24169921875, "learning_rate": 8.290177530585126e-07, "loss": 1.525089144706726, "step": 814 }, { "epoch": 1.721518987341772, "grad_norm": 2.1483330726623535, "learning_rate": 8.272274770950934e-07, "loss": 1.2185280323028564, "step": 816 }, { "epoch": 1.7257383966244726, "grad_norm": 3.7641489505767822, "learning_rate": 8.254351657797674e-07, "loss": 0.8334339261054993, "step": 818 }, { "epoch": 1.729957805907173, "grad_norm": 1.9054092168807983, "learning_rate": 8.236408409270376e-07, "loss": 0.4915008842945099, "step": 820 }, { "epoch": 1.7341772151898733, "grad_norm": 18.341960906982422, "learning_rate": 8.218445243759137e-07, "loss": 0.7150586843490601, "step": 822 }, { "epoch": 1.738396624472574, "grad_norm": 1.671027421951294, "learning_rate": 8.200462379896468e-07, "loss": 0.7935347557067871, "step": 824 }, { "epoch": 1.7426160337552743, "grad_norm": 1.1407239437103271, "learning_rate": 8.182460036554631e-07, "loss": 1.0441514253616333, "step": 826 }, { "epoch": 1.7468354430379747, "grad_norm": 1.8266801834106445, "learning_rate": 8.164438432842973e-07, "loss": 1.0361227989196777, "step": 828 }, { "epoch": 1.7510548523206753, "grad_norm": 1.3403441905975342, "learning_rate": 8.146397788105272e-07, "loss": 1.1865990161895752, "step": 830 }, { "epoch": 1.7552742616033754, "grad_norm": 0.9253147840499878, "learning_rate": 8.128338321917045e-07, "loss": 1.1751179695129395, "step": 832 }, { "epoch": 1.759493670886076, "grad_norm": 3.3475301265716553, "learning_rate": 8.110260254082898e-07, "loss": 0.9232848286628723, "step": 834 }, { "epoch": 1.7637130801687764, "grad_norm": 1.080689549446106, "learning_rate": 8.092163804633832e-07, "loss": 1.2128963470458984, "step": 836 }, { "epoch": 1.7679324894514767, "grad_norm": 1.2559332847595215, "learning_rate": 8.074049193824579e-07, "loss": 1.0571973323822021, "step": 838 }, { "epoch": 1.7721518987341773, "grad_norm": 1.474366307258606, "learning_rate": 8.055916642130914e-07, "loss": 1.1260405778884888, "step": 840 }, { "epoch": 1.7763713080168775, "grad_norm": 2.3742828369140625, "learning_rate": 8.037766370246972e-07, "loss": 1.0088326930999756, "step": 842 }, { "epoch": 1.780590717299578, "grad_norm": 0.6461037397384644, "learning_rate": 8.019598599082567e-07, "loss": 0.5369378328323364, "step": 844 }, { "epoch": 1.7848101265822784, "grad_norm": 0.5748505592346191, "learning_rate": 8.001413549760496e-07, "loss": 0.8393441438674927, "step": 846 }, { "epoch": 1.7890295358649788, "grad_norm": 3.184478282928467, "learning_rate": 7.983211443613853e-07, "loss": 0.7285841107368469, "step": 848 }, { "epoch": 1.7932489451476794, "grad_norm": 6.552399635314941, "learning_rate": 7.964992502183333e-07, "loss": 0.8242054581642151, "step": 850 }, { "epoch": 1.7974683544303798, "grad_norm": 1.3520995378494263, "learning_rate": 7.946756947214536e-07, "loss": 1.210748314857483, "step": 852 }, { "epoch": 1.8016877637130801, "grad_norm": 2.167078733444214, "learning_rate": 7.928505000655264e-07, "loss": 1.4572898149490356, "step": 854 }, { "epoch": 1.8059071729957807, "grad_norm": 0.506519615650177, "learning_rate": 7.910236884652833e-07, "loss": 1.0607579946517944, "step": 856 }, { "epoch": 1.810126582278481, "grad_norm": 2.641108512878418, "learning_rate": 7.891952821551348e-07, "loss": 1.0674760341644287, "step": 858 }, { "epoch": 1.8143459915611815, "grad_norm": 2.2153725624084473, "learning_rate": 7.87365303388902e-07, "loss": 0.8174270987510681, "step": 860 }, { "epoch": 1.8185654008438819, "grad_norm": 2.081165075302124, "learning_rate": 7.855337744395437e-07, "loss": 1.2201720476150513, "step": 862 }, { "epoch": 1.8227848101265822, "grad_norm": 1.6072843074798584, "learning_rate": 7.837007175988869e-07, "loss": 1.0889828205108643, "step": 864 }, { "epoch": 1.8270042194092828, "grad_norm": 5.830190181732178, "learning_rate": 7.818661551773542e-07, "loss": 1.2174073457717896, "step": 866 }, { "epoch": 1.831223628691983, "grad_norm": 3.3919594287872314, "learning_rate": 7.800301095036933e-07, "loss": 0.9814926385879517, "step": 868 }, { "epoch": 1.8354430379746836, "grad_norm": 3.2243967056274414, "learning_rate": 7.781926029247048e-07, "loss": 1.1042759418487549, "step": 870 }, { "epoch": 1.839662447257384, "grad_norm": 3.5643749237060547, "learning_rate": 7.763536578049699e-07, "loss": 0.8058743476867676, "step": 872 }, { "epoch": 1.8438818565400843, "grad_norm": 1.4269522428512573, "learning_rate": 7.745132965265788e-07, "loss": 0.987337052822113, "step": 874 }, { "epoch": 1.8481012658227849, "grad_norm": 0.985817015171051, "learning_rate": 7.726715414888577e-07, "loss": 1.2107572555541992, "step": 876 }, { "epoch": 1.8523206751054853, "grad_norm": 1.9935749769210815, "learning_rate": 7.708284151080968e-07, "loss": 0.9476048946380615, "step": 878 }, { "epoch": 1.8565400843881856, "grad_norm": 3.2491185665130615, "learning_rate": 7.689839398172767e-07, "loss": 0.9019596576690674, "step": 880 }, { "epoch": 1.8607594936708862, "grad_norm": 2.6126883029937744, "learning_rate": 7.671381380657965e-07, "loss": 1.1691335439682007, "step": 882 }, { "epoch": 1.8649789029535864, "grad_norm": 1.3800222873687744, "learning_rate": 7.65291032319199e-07, "loss": 0.8417655229568481, "step": 884 }, { "epoch": 1.869198312236287, "grad_norm": 2.0879945755004883, "learning_rate": 7.634426450588988e-07, "loss": 0.8084736466407776, "step": 886 }, { "epoch": 1.8734177215189873, "grad_norm": 1.3853508234024048, "learning_rate": 7.615929987819075e-07, "loss": 1.136643648147583, "step": 888 }, { "epoch": 1.8776371308016877, "grad_norm": 7.130331993103027, "learning_rate": 7.597421160005612e-07, "loss": 0.4776380956172943, "step": 890 }, { "epoch": 1.8818565400843883, "grad_norm": 3.002958059310913, "learning_rate": 7.578900192422443e-07, "loss": 0.7818654179573059, "step": 892 }, { "epoch": 1.8860759493670884, "grad_norm": 1.7899680137634277, "learning_rate": 7.560367310491182e-07, "loss": 1.1894859075546265, "step": 894 }, { "epoch": 1.890295358649789, "grad_norm": 1.8424100875854492, "learning_rate": 7.541822739778445e-07, "loss": 1.3867307901382446, "step": 896 }, { "epoch": 1.8945147679324894, "grad_norm": 11.084500312805176, "learning_rate": 7.523266705993115e-07, "loss": 0.8175121545791626, "step": 898 }, { "epoch": 1.8987341772151898, "grad_norm": 2.078657388687134, "learning_rate": 7.504699434983602e-07, "loss": 1.1003247499465942, "step": 900 }, { "epoch": 1.9029535864978904, "grad_norm": 1.9573427438735962, "learning_rate": 7.486121152735074e-07, "loss": 1.3067007064819336, "step": 902 }, { "epoch": 1.9071729957805907, "grad_norm": 6.904273509979248, "learning_rate": 7.467532085366726e-07, "loss": 1.073278784751892, "step": 904 }, { "epoch": 1.9113924050632911, "grad_norm": 0.9965894818305969, "learning_rate": 7.448932459129016e-07, "loss": 1.3775935173034668, "step": 906 }, { "epoch": 1.9156118143459917, "grad_norm": 4.340345859527588, "learning_rate": 7.430322500400924e-07, "loss": 0.5346195697784424, "step": 908 }, { "epoch": 1.9198312236286919, "grad_norm": 1.2633851766586304, "learning_rate": 7.411702435687177e-07, "loss": 1.1176321506500244, "step": 910 }, { "epoch": 1.9240506329113924, "grad_norm": 2.212251901626587, "learning_rate": 7.393072491615511e-07, "loss": 0.8476999402046204, "step": 912 }, { "epoch": 1.9282700421940928, "grad_norm": 1.6803202629089355, "learning_rate": 7.374432894933905e-07, "loss": 1.2019180059432983, "step": 914 }, { "epoch": 1.9324894514767932, "grad_norm": 2.0837478637695312, "learning_rate": 7.355783872507818e-07, "loss": 0.9530687928199768, "step": 916 }, { "epoch": 1.9367088607594938, "grad_norm": 2.765504837036133, "learning_rate": 7.337125651317433e-07, "loss": 1.0955183506011963, "step": 918 }, { "epoch": 1.9409282700421941, "grad_norm": 2.821669101715088, "learning_rate": 7.318458458454892e-07, "loss": 0.5842803120613098, "step": 920 }, { "epoch": 1.9451476793248945, "grad_norm": 1.241335153579712, "learning_rate": 7.299782521121536e-07, "loss": 1.1832818984985352, "step": 922 }, { "epoch": 1.9493670886075949, "grad_norm": 1.0857776403427124, "learning_rate": 7.281098066625129e-07, "loss": 1.262142539024353, "step": 924 }, { "epoch": 1.9535864978902953, "grad_norm": 1.392290472984314, "learning_rate": 7.262405322377109e-07, "loss": 0.9511996507644653, "step": 926 }, { "epoch": 1.9578059071729959, "grad_norm": 1.1899210214614868, "learning_rate": 7.243704515889799e-07, "loss": 0.797012448310852, "step": 928 }, { "epoch": 1.9620253164556962, "grad_norm": 3.176696300506592, "learning_rate": 7.224995874773657e-07, "loss": 1.2408126592636108, "step": 930 }, { "epoch": 1.9662447257383966, "grad_norm": 3.178877592086792, "learning_rate": 7.206279626734492e-07, "loss": 0.9860198497772217, "step": 932 }, { "epoch": 1.9704641350210972, "grad_norm": 1.3276498317718506, "learning_rate": 7.187555999570705e-07, "loss": 1.2460663318634033, "step": 934 }, { "epoch": 1.9746835443037973, "grad_norm": 1.3889448642730713, "learning_rate": 7.1688252211705e-07, "loss": 1.2101694345474243, "step": 936 }, { "epoch": 1.978902953586498, "grad_norm": 0.8887938261032104, "learning_rate": 7.150087519509128e-07, "loss": 0.8580332398414612, "step": 938 }, { "epoch": 1.9831223628691983, "grad_norm": 1.4792304039001465, "learning_rate": 7.131343122646098e-07, "loss": 1.231054663658142, "step": 940 }, { "epoch": 1.9873417721518987, "grad_norm": 2.457331418991089, "learning_rate": 7.11259225872241e-07, "loss": 1.006805658340454, "step": 942 }, { "epoch": 1.9915611814345993, "grad_norm": 1.956710696220398, "learning_rate": 7.093835155957782e-07, "loss": 0.7936272025108337, "step": 944 }, { "epoch": 1.9957805907172996, "grad_norm": 1.3758666515350342, "learning_rate": 7.075072042647852e-07, "loss": 1.1611456871032715, "step": 946 }, { "epoch": 2.0, "grad_norm": 3.4326858520507812, "learning_rate": 7.056303147161428e-07, "loss": 0.5819499492645264, "step": 948 }, { "epoch": 2.0042194092827006, "grad_norm": 6.350503921508789, "learning_rate": 7.03752869793768e-07, "loss": 0.9798819422721863, "step": 950 }, { "epoch": 2.0084388185654007, "grad_norm": 3.770968437194824, "learning_rate": 7.018748923483386e-07, "loss": 0.6936891078948975, "step": 952 }, { "epoch": 2.0126582278481013, "grad_norm": 3.1057989597320557, "learning_rate": 6.99996405237013e-07, "loss": 0.857315182685852, "step": 954 }, { "epoch": 2.0168776371308015, "grad_norm": 1.2099494934082031, "learning_rate": 6.98117431323153e-07, "loss": 1.0093313455581665, "step": 956 }, { "epoch": 2.021097046413502, "grad_norm": 2.805772542953491, "learning_rate": 6.962379934760456e-07, "loss": 0.7519159913063049, "step": 958 }, { "epoch": 2.0253164556962027, "grad_norm": 2.69637131690979, "learning_rate": 6.94358114570624e-07, "loss": 0.8004332780838013, "step": 960 }, { "epoch": 2.029535864978903, "grad_norm": 4.524166584014893, "learning_rate": 6.924778174871901e-07, "loss": 1.2693367004394531, "step": 962 }, { "epoch": 2.0337552742616034, "grad_norm": 1.710188388824463, "learning_rate": 6.905971251111349e-07, "loss": 0.8327010869979858, "step": 964 }, { "epoch": 2.037974683544304, "grad_norm": 1.4968762397766113, "learning_rate": 6.887160603326612e-07, "loss": 0.8057103753089905, "step": 966 }, { "epoch": 2.042194092827004, "grad_norm": 2.4308996200561523, "learning_rate": 6.868346460465038e-07, "loss": 0.7996687889099121, "step": 968 }, { "epoch": 2.0464135021097047, "grad_norm": 1.531032681465149, "learning_rate": 6.849529051516521e-07, "loss": 1.125715732574463, "step": 970 }, { "epoch": 2.050632911392405, "grad_norm": 3.428903579711914, "learning_rate": 6.830708605510697e-07, "loss": 1.0384615659713745, "step": 972 }, { "epoch": 2.0548523206751055, "grad_norm": 1.0824832916259766, "learning_rate": 6.811885351514176e-07, "loss": 0.9185305237770081, "step": 974 }, { "epoch": 2.059071729957806, "grad_norm": 1.7839653491973877, "learning_rate": 6.793059518627739e-07, "loss": 0.8305885195732117, "step": 976 }, { "epoch": 2.0632911392405062, "grad_norm": 0.7381780743598938, "learning_rate": 6.77423133598356e-07, "loss": 0.8384730815887451, "step": 978 }, { "epoch": 2.067510548523207, "grad_norm": 1.6481800079345703, "learning_rate": 6.755401032742407e-07, "loss": 0.8727558255195618, "step": 980 }, { "epoch": 2.071729957805907, "grad_norm": 5.477509021759033, "learning_rate": 6.736568838090859e-07, "loss": 1.1277180910110474, "step": 982 }, { "epoch": 2.0759493670886076, "grad_norm": 2.758972644805908, "learning_rate": 6.71773498123852e-07, "loss": 1.0967183113098145, "step": 984 }, { "epoch": 2.080168776371308, "grad_norm": 1.1603978872299194, "learning_rate": 6.698899691415218e-07, "loss": 1.1284269094467163, "step": 986 }, { "epoch": 2.0843881856540083, "grad_norm": 1.3078337907791138, "learning_rate": 6.680063197868228e-07, "loss": 1.166777491569519, "step": 988 }, { "epoch": 2.088607594936709, "grad_norm": 3.5238006114959717, "learning_rate": 6.661225729859475e-07, "loss": 0.5711318850517273, "step": 990 }, { "epoch": 2.0928270042194095, "grad_norm": 2.0197713375091553, "learning_rate": 6.64238751666274e-07, "loss": 0.608964204788208, "step": 992 }, { "epoch": 2.0970464135021096, "grad_norm": 1.3378883600234985, "learning_rate": 6.623548787560878e-07, "loss": 1.175323247909546, "step": 994 }, { "epoch": 2.1012658227848102, "grad_norm": 1.223233938217163, "learning_rate": 6.604709771843022e-07, "loss": 1.1399847269058228, "step": 996 }, { "epoch": 2.1054852320675104, "grad_norm": 0.5097165703773499, "learning_rate": 6.585870698801791e-07, "loss": 0.8538580536842346, "step": 998 }, { "epoch": 2.109704641350211, "grad_norm": 1.8075917959213257, "learning_rate": 6.567031797730507e-07, "loss": 1.2541990280151367, "step": 1000 }, { "epoch": 2.1139240506329116, "grad_norm": 1.6272530555725098, "learning_rate": 6.548193297920393e-07, "loss": 1.182500958442688, "step": 1002 }, { "epoch": 2.1181434599156117, "grad_norm": 1.8821264505386353, "learning_rate": 6.529355428657795e-07, "loss": 1.1924080848693848, "step": 1004 }, { "epoch": 2.1223628691983123, "grad_norm": 1.0999635457992554, "learning_rate": 6.510518419221377e-07, "loss": 0.6417333483695984, "step": 1006 }, { "epoch": 2.1265822784810124, "grad_norm": 1.3833292722702026, "learning_rate": 6.49168249887934e-07, "loss": 0.7661027908325195, "step": 1008 }, { "epoch": 2.130801687763713, "grad_norm": 1.4525195360183716, "learning_rate": 6.472847896886636e-07, "loss": 0.7349141240119934, "step": 1010 }, { "epoch": 2.1350210970464136, "grad_norm": 3.5440096855163574, "learning_rate": 6.454014842482162e-07, "loss": 0.9432771801948547, "step": 1012 }, { "epoch": 2.1392405063291138, "grad_norm": 4.978313446044922, "learning_rate": 6.435183564885985e-07, "loss": 1.375197172164917, "step": 1014 }, { "epoch": 2.1434599156118144, "grad_norm": 1.7762482166290283, "learning_rate": 6.416354293296542e-07, "loss": 0.8380042910575867, "step": 1016 }, { "epoch": 2.147679324894515, "grad_norm": 1.8821486234664917, "learning_rate": 6.39752725688786e-07, "loss": 0.9462857842445374, "step": 1018 }, { "epoch": 2.151898734177215, "grad_norm": 1.470024585723877, "learning_rate": 6.378702684806757e-07, "loss": 0.8377196192741394, "step": 1020 }, { "epoch": 2.1561181434599157, "grad_norm": 2.115182638168335, "learning_rate": 6.359880806170058e-07, "loss": 0.9362459182739258, "step": 1022 }, { "epoch": 2.160337552742616, "grad_norm": 2.337805986404419, "learning_rate": 6.341061850061807e-07, "loss": 0.8514955639839172, "step": 1024 }, { "epoch": 2.1645569620253164, "grad_norm": 9.63266372680664, "learning_rate": 6.322246045530474e-07, "loss": 1.1533026695251465, "step": 1026 }, { "epoch": 2.168776371308017, "grad_norm": 1.6961092948913574, "learning_rate": 6.303433621586177e-07, "loss": 1.1458700895309448, "step": 1028 }, { "epoch": 2.172995780590717, "grad_norm": 1.3575078248977661, "learning_rate": 6.28462480719788e-07, "loss": 1.1239484548568726, "step": 1030 }, { "epoch": 2.1772151898734178, "grad_norm": 1.2787476778030396, "learning_rate": 6.265819831290624e-07, "loss": 1.1294289827346802, "step": 1032 }, { "epoch": 2.181434599156118, "grad_norm": 4.088858604431152, "learning_rate": 6.247018922742722e-07, "loss": 1.1388219594955444, "step": 1034 }, { "epoch": 2.1856540084388185, "grad_norm": 6.764144420623779, "learning_rate": 6.228222310382992e-07, "loss": 1.0533146858215332, "step": 1036 }, { "epoch": 2.189873417721519, "grad_norm": 2.094905138015747, "learning_rate": 6.209430222987952e-07, "loss": 1.132552146911621, "step": 1038 }, { "epoch": 2.1940928270042193, "grad_norm": 1.7523225545883179, "learning_rate": 6.190642889279052e-07, "loss": 1.2820512056350708, "step": 1040 }, { "epoch": 2.19831223628692, "grad_norm": 4.281554222106934, "learning_rate": 6.171860537919886e-07, "loss": 0.39310938119888306, "step": 1042 }, { "epoch": 2.2025316455696204, "grad_norm": 2.323817491531372, "learning_rate": 6.153083397513404e-07, "loss": 1.1017502546310425, "step": 1044 }, { "epoch": 2.2067510548523206, "grad_norm": 4.524064064025879, "learning_rate": 6.134311696599129e-07, "loss": 0.6054593324661255, "step": 1046 }, { "epoch": 2.210970464135021, "grad_norm": 2.6248085498809814, "learning_rate": 6.115545663650389e-07, "loss": 0.9862580299377441, "step": 1048 }, { "epoch": 2.2151898734177213, "grad_norm": 1.9876245260238647, "learning_rate": 6.096785527071516e-07, "loss": 1.1376148462295532, "step": 1050 }, { "epoch": 2.219409282700422, "grad_norm": 2.210066080093384, "learning_rate": 6.078031515195085e-07, "loss": 0.9529132843017578, "step": 1052 }, { "epoch": 2.2236286919831225, "grad_norm": 3.2140283584594727, "learning_rate": 6.059283856279118e-07, "loss": 1.0213066339492798, "step": 1054 }, { "epoch": 2.2278481012658227, "grad_norm": 6.621954917907715, "learning_rate": 6.040542778504319e-07, "loss": 0.9980672001838684, "step": 1056 }, { "epoch": 2.2320675105485233, "grad_norm": 10.540366172790527, "learning_rate": 6.021808509971293e-07, "loss": 0.5453277826309204, "step": 1058 }, { "epoch": 2.2362869198312234, "grad_norm": 1.3416770696640015, "learning_rate": 6.003081278697764e-07, "loss": 1.1391900777816772, "step": 1060 }, { "epoch": 2.240506329113924, "grad_norm": 0.30088382959365845, "learning_rate": 5.984361312615811e-07, "loss": 0.9888620972633362, "step": 1062 }, { "epoch": 2.2447257383966246, "grad_norm": 1.483581781387329, "learning_rate": 5.96564883956908e-07, "loss": 0.6946426033973694, "step": 1064 }, { "epoch": 2.2489451476793247, "grad_norm": 2.5259406566619873, "learning_rate": 5.946944087310022e-07, "loss": 1.0866342782974243, "step": 1066 }, { "epoch": 2.2531645569620253, "grad_norm": 2.395719528198242, "learning_rate": 5.928247283497117e-07, "loss": 1.3847568035125732, "step": 1068 }, { "epoch": 2.257383966244726, "grad_norm": 2.874040126800537, "learning_rate": 5.909558655692104e-07, "loss": 1.1452842950820923, "step": 1070 }, { "epoch": 2.261603375527426, "grad_norm": 2.1399810314178467, "learning_rate": 5.890878431357208e-07, "loss": 1.1274282932281494, "step": 1072 }, { "epoch": 2.2658227848101267, "grad_norm": 3.358569383621216, "learning_rate": 5.872206837852376e-07, "loss": 1.3512498140335083, "step": 1074 }, { "epoch": 2.270042194092827, "grad_norm": 1.4806420803070068, "learning_rate": 5.853544102432505e-07, "loss": 1.14762282371521, "step": 1076 }, { "epoch": 2.2742616033755274, "grad_norm": 1.1972980499267578, "learning_rate": 5.834890452244685e-07, "loss": 0.9154924750328064, "step": 1078 }, { "epoch": 2.278481012658228, "grad_norm": 10.489628791809082, "learning_rate": 5.816246114325421e-07, "loss": 0.9368666410446167, "step": 1080 }, { "epoch": 2.282700421940928, "grad_norm": 5.601263046264648, "learning_rate": 5.79761131559788e-07, "loss": 0.6107386350631714, "step": 1082 }, { "epoch": 2.2869198312236287, "grad_norm": 2.7437796592712402, "learning_rate": 5.778986282869127e-07, "loss": 0.7205576300621033, "step": 1084 }, { "epoch": 2.291139240506329, "grad_norm": 0.8865097761154175, "learning_rate": 5.760371242827363e-07, "loss": 0.6305662393569946, "step": 1086 }, { "epoch": 2.2953586497890295, "grad_norm": 2.2365691661834717, "learning_rate": 5.741766422039167e-07, "loss": 0.9999610781669617, "step": 1088 }, { "epoch": 2.29957805907173, "grad_norm": 1.5019956827163696, "learning_rate": 5.723172046946733e-07, "loss": 0.589636504650116, "step": 1090 }, { "epoch": 2.3037974683544302, "grad_norm": 2.1107327938079834, "learning_rate": 5.704588343865127e-07, "loss": 0.8981572389602661, "step": 1092 }, { "epoch": 2.308016877637131, "grad_norm": 4.003733158111572, "learning_rate": 5.686015538979518e-07, "loss": 0.732837438583374, "step": 1094 }, { "epoch": 2.3122362869198314, "grad_norm": 2.012057065963745, "learning_rate": 5.667453858342434e-07, "loss": 0.4853237271308899, "step": 1096 }, { "epoch": 2.3164556962025316, "grad_norm": 2.796154260635376, "learning_rate": 5.648903527871006e-07, "loss": 1.1909679174423218, "step": 1098 }, { "epoch": 2.320675105485232, "grad_norm": 1.6839478015899658, "learning_rate": 5.630364773344224e-07, "loss": 1.0224688053131104, "step": 1100 }, { "epoch": 2.3248945147679323, "grad_norm": 1.592947006225586, "learning_rate": 5.611837820400182e-07, "loss": 1.1030757427215576, "step": 1102 }, { "epoch": 2.329113924050633, "grad_norm": 1.691872239112854, "learning_rate": 5.593322894533334e-07, "loss": 1.2941904067993164, "step": 1104 }, { "epoch": 2.3333333333333335, "grad_norm": 1.7891680002212524, "learning_rate": 5.574820221091757e-07, "loss": 0.8782735466957092, "step": 1106 }, { "epoch": 2.3375527426160336, "grad_norm": 3.5078885555267334, "learning_rate": 5.556330025274393e-07, "loss": 0.5180922150611877, "step": 1108 }, { "epoch": 2.3417721518987342, "grad_norm": 1.8680453300476074, "learning_rate": 5.537852532128322e-07, "loss": 1.1475764513015747, "step": 1110 }, { "epoch": 2.3459915611814344, "grad_norm": 1.262511968612671, "learning_rate": 5.519387966546021e-07, "loss": 1.1460936069488525, "step": 1112 }, { "epoch": 2.350210970464135, "grad_norm": 12.242781639099121, "learning_rate": 5.500936553262616e-07, "loss": 1.1747325658798218, "step": 1114 }, { "epoch": 2.3544303797468356, "grad_norm": 0.7147314548492432, "learning_rate": 5.48249851685316e-07, "loss": 0.7451015114784241, "step": 1116 }, { "epoch": 2.3586497890295357, "grad_norm": 4.066142559051514, "learning_rate": 5.464074081729892e-07, "loss": 1.0633448362350464, "step": 1118 }, { "epoch": 2.3628691983122363, "grad_norm": 1.6116374731063843, "learning_rate": 5.445663472139506e-07, "loss": 0.8038894534111023, "step": 1120 }, { "epoch": 2.367088607594937, "grad_norm": 2.5959835052490234, "learning_rate": 5.427266912160427e-07, "loss": 1.0548654794692993, "step": 1122 }, { "epoch": 2.371308016877637, "grad_norm": 1.4511165618896484, "learning_rate": 5.408884625700076e-07, "loss": 0.744436502456665, "step": 1124 }, { "epoch": 2.3755274261603376, "grad_norm": 2.0259265899658203, "learning_rate": 5.390516836492152e-07, "loss": 1.0626447200775146, "step": 1126 }, { "epoch": 2.379746835443038, "grad_norm": 1.5352128744125366, "learning_rate": 5.372163768093903e-07, "loss": 1.1404402256011963, "step": 1128 }, { "epoch": 2.3839662447257384, "grad_norm": 3.401780366897583, "learning_rate": 5.35382564388341e-07, "loss": 0.5039758086204529, "step": 1130 }, { "epoch": 2.388185654008439, "grad_norm": 1.8972293138504028, "learning_rate": 5.335502687056865e-07, "loss": 0.345048725605011, "step": 1132 }, { "epoch": 2.392405063291139, "grad_norm": 4.107486248016357, "learning_rate": 5.317195120625855e-07, "loss": 0.4859941303730011, "step": 1134 }, { "epoch": 2.3966244725738397, "grad_norm": 2.5772571563720703, "learning_rate": 5.298903167414648e-07, "loss": 0.5732159614562988, "step": 1136 }, { "epoch": 2.40084388185654, "grad_norm": 1.3114792108535767, "learning_rate": 5.280627050057483e-07, "loss": 1.1417685747146606, "step": 1138 }, { "epoch": 2.4050632911392404, "grad_norm": 7.5032267570495605, "learning_rate": 5.262366990995852e-07, "loss": 0.8103894591331482, "step": 1140 }, { "epoch": 2.409282700421941, "grad_norm": 3.7041962146759033, "learning_rate": 5.244123212475811e-07, "loss": 0.3755455017089844, "step": 1142 }, { "epoch": 2.413502109704641, "grad_norm": 1.3423445224761963, "learning_rate": 5.22589593654525e-07, "loss": 0.8771740198135376, "step": 1144 }, { "epoch": 2.4177215189873418, "grad_norm": 1.499751329421997, "learning_rate": 5.207685385051213e-07, "loss": 1.168401837348938, "step": 1146 }, { "epoch": 2.4219409282700424, "grad_norm": 4.436310291290283, "learning_rate": 5.189491779637181e-07, "loss": 0.8418995141983032, "step": 1148 }, { "epoch": 2.4261603375527425, "grad_norm": 1.6216802597045898, "learning_rate": 5.171315341740387e-07, "loss": 1.147579550743103, "step": 1150 }, { "epoch": 2.430379746835443, "grad_norm": 43.39120864868164, "learning_rate": 5.153156292589112e-07, "loss": 0.8518908619880676, "step": 1152 }, { "epoch": 2.4345991561181437, "grad_norm": 1.7255734205245972, "learning_rate": 5.1350148532e-07, "loss": 1.205424427986145, "step": 1154 }, { "epoch": 2.438818565400844, "grad_norm": 6.3630475997924805, "learning_rate": 5.116891244375358e-07, "loss": 0.43493425846099854, "step": 1156 }, { "epoch": 2.4430379746835444, "grad_norm": 2.129798412322998, "learning_rate": 5.098785686700478e-07, "loss": 0.9413697719573975, "step": 1158 }, { "epoch": 2.4472573839662446, "grad_norm": 1.4703646898269653, "learning_rate": 5.080698400540949e-07, "loss": 1.1531509160995483, "step": 1160 }, { "epoch": 2.451476793248945, "grad_norm": 1.771309494972229, "learning_rate": 5.062629606039975e-07, "loss": 0.7602155208587646, "step": 1162 }, { "epoch": 2.4556962025316453, "grad_norm": 1.3786754608154297, "learning_rate": 5.04457952311569e-07, "loss": 1.1161296367645264, "step": 1164 }, { "epoch": 2.459915611814346, "grad_norm": 2.190340280532837, "learning_rate": 5.026548371458493e-07, "loss": 1.1393266916275024, "step": 1166 }, { "epoch": 2.4641350210970465, "grad_norm": 0.6447933316230774, "learning_rate": 5.008536370528365e-07, "loss": 0.728462815284729, "step": 1168 }, { "epoch": 2.4683544303797467, "grad_norm": 1.4482827186584473, "learning_rate": 4.990543739552197e-07, "loss": 1.0875799655914307, "step": 1170 }, { "epoch": 2.4725738396624473, "grad_norm": 1.4591885805130005, "learning_rate": 4.972570697521133e-07, "loss": 1.124202013015747, "step": 1172 }, { "epoch": 2.476793248945148, "grad_norm": 1.3290364742279053, "learning_rate": 4.954617463187888e-07, "loss": 1.1189545392990112, "step": 1174 }, { "epoch": 2.481012658227848, "grad_norm": 2.232417106628418, "learning_rate": 4.936684255064102e-07, "loss": 0.8213171362876892, "step": 1176 }, { "epoch": 2.4852320675105486, "grad_norm": 1.8304226398468018, "learning_rate": 4.918771291417669e-07, "loss": 0.40340158343315125, "step": 1178 }, { "epoch": 2.489451476793249, "grad_norm": 8.472685813903809, "learning_rate": 4.900878790270084e-07, "loss": 0.9105018973350525, "step": 1180 }, { "epoch": 2.4936708860759493, "grad_norm": 4.292173385620117, "learning_rate": 4.883006969393791e-07, "loss": 1.0442423820495605, "step": 1182 }, { "epoch": 2.49789029535865, "grad_norm": 4.11490535736084, "learning_rate": 4.865156046309528e-07, "loss": 0.5216444730758667, "step": 1184 }, { "epoch": 2.50210970464135, "grad_norm": 0.9223915338516235, "learning_rate": 4.847326238283692e-07, "loss": 0.7885441780090332, "step": 1186 }, { "epoch": 2.5063291139240507, "grad_norm": 1.5045500993728638, "learning_rate": 4.829517762325671e-07, "loss": 0.8654785752296448, "step": 1188 }, { "epoch": 2.510548523206751, "grad_norm": 1.1637619733810425, "learning_rate": 4.811730835185232e-07, "loss": 1.1407520771026611, "step": 1190 }, { "epoch": 2.5147679324894514, "grad_norm": 6.294982433319092, "learning_rate": 4.793965673349857e-07, "loss": 0.5034950971603394, "step": 1192 }, { "epoch": 2.518987341772152, "grad_norm": 3.0815131664276123, "learning_rate": 4.776222493042122e-07, "loss": 1.443105697631836, "step": 1194 }, { "epoch": 2.523206751054852, "grad_norm": 1.1364555358886719, "learning_rate": 4.758501510217066e-07, "loss": 1.1503655910491943, "step": 1196 }, { "epoch": 2.5274261603375527, "grad_norm": 2.4966022968292236, "learning_rate": 4.740802940559553e-07, "loss": 1.0758484601974487, "step": 1198 }, { "epoch": 2.5316455696202533, "grad_norm": 2.249464511871338, "learning_rate": 4.7231269994816584e-07, "loss": 0.6718664765357971, "step": 1200 }, { "epoch": 2.5358649789029535, "grad_norm": 1.933441400527954, "learning_rate": 4.705473902120039e-07, "loss": 0.8221999406814575, "step": 1202 }, { "epoch": 2.540084388185654, "grad_norm": 2.7778429985046387, "learning_rate": 4.687843863333317e-07, "loss": 0.7672927975654602, "step": 1204 }, { "epoch": 2.5443037974683547, "grad_norm": 1.3384257555007935, "learning_rate": 4.670237097699464e-07, "loss": 1.0449153184890747, "step": 1206 }, { "epoch": 2.548523206751055, "grad_norm": 4.268535137176514, "learning_rate": 4.6526538195131944e-07, "loss": 0.7585489749908447, "step": 1208 }, { "epoch": 2.5527426160337554, "grad_norm": 1.7707507610321045, "learning_rate": 4.6350942427833463e-07, "loss": 1.191308617591858, "step": 1210 }, { "epoch": 2.5569620253164556, "grad_norm": 1.2368897199630737, "learning_rate": 4.6175585812302914e-07, "loss": 1.115039348602295, "step": 1212 }, { "epoch": 2.561181434599156, "grad_norm": 6.681468486785889, "learning_rate": 4.600047048283323e-07, "loss": 0.35992902517318726, "step": 1214 }, { "epoch": 2.5654008438818563, "grad_norm": 2.588292360305786, "learning_rate": 4.582559857078059e-07, "loss": 0.831079363822937, "step": 1216 }, { "epoch": 2.569620253164557, "grad_norm": 1.0166317224502563, "learning_rate": 4.565097220453852e-07, "loss": 1.160988211631775, "step": 1218 }, { "epoch": 2.5738396624472575, "grad_norm": 4.55487060546875, "learning_rate": 4.5476593509511975e-07, "loss": 0.8059465289115906, "step": 1220 }, { "epoch": 2.5780590717299576, "grad_norm": 1.7319614887237549, "learning_rate": 4.5302464608091444e-07, "loss": 0.9973964095115662, "step": 1222 }, { "epoch": 2.5822784810126582, "grad_norm": 2.5319430828094482, "learning_rate": 4.512858761962719e-07, "loss": 0.8335304260253906, "step": 1224 }, { "epoch": 2.586497890295359, "grad_norm": 2.232879161834717, "learning_rate": 4.495496466040333e-07, "loss": 0.7188448309898376, "step": 1226 }, { "epoch": 2.590717299578059, "grad_norm": 1.1091829538345337, "learning_rate": 4.478159784361222e-07, "loss": 1.0995886325836182, "step": 1228 }, { "epoch": 2.5949367088607596, "grad_norm": 1.6400138139724731, "learning_rate": 4.4608489279328616e-07, "loss": 1.197192907333374, "step": 1230 }, { "epoch": 2.59915611814346, "grad_norm": 2.313340187072754, "learning_rate": 4.443564107448406e-07, "loss": 1.024308204650879, "step": 1232 }, { "epoch": 2.6033755274261603, "grad_norm": 3.4521191120147705, "learning_rate": 4.4263055332841223e-07, "loss": 0.30793383717536926, "step": 1234 }, { "epoch": 2.607594936708861, "grad_norm": 3.510732889175415, "learning_rate": 4.409073415496829e-07, "loss": 1.2074471712112427, "step": 1236 }, { "epoch": 2.611814345991561, "grad_norm": 5.145284652709961, "learning_rate": 4.391867963821341e-07, "loss": 1.3546441793441772, "step": 1238 }, { "epoch": 2.6160337552742616, "grad_norm": 1.3848285675048828, "learning_rate": 4.374689387667913e-07, "loss": 0.7564114332199097, "step": 1240 }, { "epoch": 2.620253164556962, "grad_norm": 1.5747932195663452, "learning_rate": 4.3575378961196987e-07, "loss": 1.1020171642303467, "step": 1242 }, { "epoch": 2.6244725738396624, "grad_norm": 7.985814094543457, "learning_rate": 4.340413697930193e-07, "loss": 0.6297235488891602, "step": 1244 }, { "epoch": 2.628691983122363, "grad_norm": 1.5127233266830444, "learning_rate": 4.3233170015207045e-07, "loss": 0.7452877163887024, "step": 1246 }, { "epoch": 2.632911392405063, "grad_norm": 2.9100558757781982, "learning_rate": 4.306248014977816e-07, "loss": 1.5952140092849731, "step": 1248 }, { "epoch": 2.6371308016877637, "grad_norm": 2.4428465366363525, "learning_rate": 4.2892069460508416e-07, "loss": 1.142899990081787, "step": 1250 }, { "epoch": 2.6413502109704643, "grad_norm": 1.664016604423523, "learning_rate": 4.27219400214931e-07, "loss": 1.067954182624817, "step": 1252 }, { "epoch": 2.6455696202531644, "grad_norm": 14.406804084777832, "learning_rate": 4.255209390340436e-07, "loss": 0.608812689781189, "step": 1254 }, { "epoch": 2.649789029535865, "grad_norm": 1.7003490924835205, "learning_rate": 4.238253317346602e-07, "loss": 0.5725827813148499, "step": 1256 }, { "epoch": 2.6540084388185656, "grad_norm": 2.5306010246276855, "learning_rate": 4.221325989542832e-07, "loss": 0.9772995710372925, "step": 1258 }, { "epoch": 2.6582278481012658, "grad_norm": 1.6626094579696655, "learning_rate": 4.2044276129542956e-07, "loss": 1.0970871448516846, "step": 1260 }, { "epoch": 2.6624472573839664, "grad_norm": 1.178289771080017, "learning_rate": 4.1875583932537926e-07, "loss": 1.2285281419754028, "step": 1262 }, { "epoch": 2.6666666666666665, "grad_norm": 2.0838637351989746, "learning_rate": 4.1707185357592434e-07, "loss": 0.6816955208778381, "step": 1264 }, { "epoch": 2.670886075949367, "grad_norm": 2.826573133468628, "learning_rate": 4.1539082454312016e-07, "loss": 0.9266291856765747, "step": 1266 }, { "epoch": 2.6751054852320673, "grad_norm": 2.5557596683502197, "learning_rate": 4.1371277268703537e-07, "loss": 0.7625723481178284, "step": 1268 }, { "epoch": 2.679324894514768, "grad_norm": 1.970330834388733, "learning_rate": 4.120377184315029e-07, "loss": 0.6248302459716797, "step": 1270 }, { "epoch": 2.6835443037974684, "grad_norm": 1.4884620904922485, "learning_rate": 4.103656821638711e-07, "loss": 1.0319654941558838, "step": 1272 }, { "epoch": 2.6877637130801686, "grad_norm": 7.556288242340088, "learning_rate": 4.086966842347563e-07, "loss": 0.745881199836731, "step": 1274 }, { "epoch": 2.691983122362869, "grad_norm": 9.8837890625, "learning_rate": 4.0703074495779464e-07, "loss": 0.8159171342849731, "step": 1276 }, { "epoch": 2.6962025316455698, "grad_norm": 0.8211575746536255, "learning_rate": 4.053678846093952e-07, "loss": 0.4533369243144989, "step": 1278 }, { "epoch": 2.70042194092827, "grad_norm": 2.684162139892578, "learning_rate": 4.03708123428492e-07, "loss": 0.9859198331832886, "step": 1280 }, { "epoch": 2.7046413502109705, "grad_norm": 2.0273070335388184, "learning_rate": 4.0205148161629964e-07, "loss": 1.131312608718872, "step": 1282 }, { "epoch": 2.708860759493671, "grad_norm": 1.63193678855896, "learning_rate": 4.003979793360661e-07, "loss": 0.5977147221565247, "step": 1284 }, { "epoch": 2.7130801687763713, "grad_norm": 4.451190948486328, "learning_rate": 3.987476367128271e-07, "loss": 0.6326662302017212, "step": 1286 }, { "epoch": 2.717299578059072, "grad_norm": 0.9102901816368103, "learning_rate": 3.9710047383316225e-07, "loss": 0.8215235471725464, "step": 1288 }, { "epoch": 2.721518987341772, "grad_norm": 0.5546138286590576, "learning_rate": 3.954565107449499e-07, "loss": 1.0023081302642822, "step": 1290 }, { "epoch": 2.7257383966244726, "grad_norm": 1.5768028497695923, "learning_rate": 3.9381576745712347e-07, "loss": 1.1236157417297363, "step": 1292 }, { "epoch": 2.7299578059071727, "grad_norm": 1.9878129959106445, "learning_rate": 3.921782639394268e-07, "loss": 0.7208356857299805, "step": 1294 }, { "epoch": 2.7341772151898733, "grad_norm": 1.5068297386169434, "learning_rate": 3.905440201221729e-07, "loss": 1.1069350242614746, "step": 1296 }, { "epoch": 2.738396624472574, "grad_norm": 1.8150042295455933, "learning_rate": 3.8891305589600005e-07, "loss": 1.1665513515472412, "step": 1298 }, { "epoch": 2.742616033755274, "grad_norm": 1.199442744255066, "learning_rate": 3.872853911116304e-07, "loss": 0.8418156504631042, "step": 1300 }, { "epoch": 2.7468354430379747, "grad_norm": 1.2970781326293945, "learning_rate": 3.856610455796275e-07, "loss": 1.1775513887405396, "step": 1302 }, { "epoch": 2.7510548523206753, "grad_norm": 2.245298147201538, "learning_rate": 3.840400390701562e-07, "loss": 0.740407407283783, "step": 1304 }, { "epoch": 2.7552742616033754, "grad_norm": 1.889854073524475, "learning_rate": 3.824223913127419e-07, "loss": 1.4258309602737427, "step": 1306 }, { "epoch": 2.759493670886076, "grad_norm": 1.5117489099502563, "learning_rate": 3.808081219960292e-07, "loss": 1.0796724557876587, "step": 1308 }, { "epoch": 2.7637130801687766, "grad_norm": 5.329049110412598, "learning_rate": 3.791972507675438e-07, "loss": 0.9499403834342957, "step": 1310 }, { "epoch": 2.7679324894514767, "grad_norm": 1.1606426239013672, "learning_rate": 3.775897972334526e-07, "loss": 1.1145509481430054, "step": 1312 }, { "epoch": 2.7721518987341773, "grad_norm": 5.624031066894531, "learning_rate": 3.759857809583255e-07, "loss": 1.1557338237762451, "step": 1314 }, { "epoch": 2.7763713080168775, "grad_norm": 2.0686893463134766, "learning_rate": 3.7438522146489624e-07, "loss": 0.9982014894485474, "step": 1316 }, { "epoch": 2.780590717299578, "grad_norm": 7.816877841949463, "learning_rate": 3.727881382338262e-07, "loss": 0.642890453338623, "step": 1318 }, { "epoch": 2.7848101265822782, "grad_norm": 8.81386661529541, "learning_rate": 3.711945507034663e-07, "loss": 1.1752903461456299, "step": 1320 }, { "epoch": 2.789029535864979, "grad_norm": 0.7249853014945984, "learning_rate": 3.696044782696211e-07, "loss": 0.7932354807853699, "step": 1322 }, { "epoch": 2.7932489451476794, "grad_norm": 4.055787563323975, "learning_rate": 3.680179402853118e-07, "loss": 1.365350604057312, "step": 1324 }, { "epoch": 2.7974683544303796, "grad_norm": 3.504054546356201, "learning_rate": 3.6643495606054153e-07, "loss": 0.9429040551185608, "step": 1326 }, { "epoch": 2.80168776371308, "grad_norm": 1.6331150531768799, "learning_rate": 3.6485554486206035e-07, "loss": 0.8298648595809937, "step": 1328 }, { "epoch": 2.8059071729957807, "grad_norm": 1.505171298980713, "learning_rate": 3.632797259131301e-07, "loss": 1.119720458984375, "step": 1330 }, { "epoch": 2.810126582278481, "grad_norm": 1.824774980545044, "learning_rate": 3.6170751839329087e-07, "loss": 1.1578552722930908, "step": 1332 }, { "epoch": 2.8143459915611815, "grad_norm": 1.4384500980377197, "learning_rate": 3.601389414381272e-07, "loss": 0.7492596507072449, "step": 1334 }, { "epoch": 2.818565400843882, "grad_norm": 0.6595861911773682, "learning_rate": 3.585740141390362e-07, "loss": 1.0319997072219849, "step": 1336 }, { "epoch": 2.8227848101265822, "grad_norm": 1.1182562112808228, "learning_rate": 3.570127555429937e-07, "loss": 0.8679478168487549, "step": 1338 }, { "epoch": 2.827004219409283, "grad_norm": 2.751737117767334, "learning_rate": 3.554551846523234e-07, "loss": 0.992285430431366, "step": 1340 }, { "epoch": 2.831223628691983, "grad_norm": 1.0553044080734253, "learning_rate": 3.5390132042446593e-07, "loss": 1.0697180032730103, "step": 1342 }, { "epoch": 2.8354430379746836, "grad_norm": 6.8535590171813965, "learning_rate": 3.5235118177174633e-07, "loss": 1.3121901750564575, "step": 1344 }, { "epoch": 2.8396624472573837, "grad_norm": 3.6509854793548584, "learning_rate": 3.5080478756114603e-07, "loss": 0.7838273048400879, "step": 1346 }, { "epoch": 2.8438818565400843, "grad_norm": 1.319709062576294, "learning_rate": 3.4926215661407224e-07, "loss": 0.6845376491546631, "step": 1348 }, { "epoch": 2.848101265822785, "grad_norm": 1.3884485960006714, "learning_rate": 3.4772330770612856e-07, "loss": 1.1699258089065552, "step": 1350 }, { "epoch": 2.852320675105485, "grad_norm": 1.5885751247406006, "learning_rate": 3.4618825956688674e-07, "loss": 1.0469439029693604, "step": 1352 }, { "epoch": 2.8565400843881856, "grad_norm": 3.556104898452759, "learning_rate": 3.4465703087965895e-07, "loss": 0.8466750383377075, "step": 1354 }, { "epoch": 2.8607594936708862, "grad_norm": 4.589015960693359, "learning_rate": 3.4312964028127036e-07, "loss": 0.5300393104553223, "step": 1356 }, { "epoch": 2.8649789029535864, "grad_norm": 1.3245023488998413, "learning_rate": 3.416061063618321e-07, "loss": 1.1446274518966675, "step": 1358 }, { "epoch": 2.869198312236287, "grad_norm": 2.1873321533203125, "learning_rate": 3.400864476645146e-07, "loss": 0.9219729900360107, "step": 1360 }, { "epoch": 2.8734177215189876, "grad_norm": 8.96743392944336, "learning_rate": 3.3857068268532285e-07, "loss": 0.7180023789405823, "step": 1362 }, { "epoch": 2.8776371308016877, "grad_norm": 2.621079444885254, "learning_rate": 3.3705882987287096e-07, "loss": 1.0849711894989014, "step": 1364 }, { "epoch": 2.8818565400843883, "grad_norm": 1.501388669013977, "learning_rate": 3.355509076281567e-07, "loss": 1.1244922876358032, "step": 1366 }, { "epoch": 2.8860759493670884, "grad_norm": 2.502393960952759, "learning_rate": 3.3404693430433883e-07, "loss": 1.1720871925354004, "step": 1368 }, { "epoch": 2.890295358649789, "grad_norm": 1.8280988931655884, "learning_rate": 3.32546928206513e-07, "loss": 1.0966358184814453, "step": 1370 }, { "epoch": 2.894514767932489, "grad_norm": 19.55905532836914, "learning_rate": 3.3105090759148967e-07, "loss": 0.48501160740852356, "step": 1372 }, { "epoch": 2.8987341772151898, "grad_norm": 1.4679443836212158, "learning_rate": 3.2955889066757016e-07, "loss": 0.8926799297332764, "step": 1374 }, { "epoch": 2.9029535864978904, "grad_norm": 1.4338287115097046, "learning_rate": 3.280708955943272e-07, "loss": 1.1578876972198486, "step": 1376 }, { "epoch": 2.9071729957805905, "grad_norm": 3.8778481483459473, "learning_rate": 3.265869404823828e-07, "loss": 0.9735660552978516, "step": 1378 }, { "epoch": 2.911392405063291, "grad_norm": 2.506485939025879, "learning_rate": 3.2510704339318803e-07, "loss": 1.3276560306549072, "step": 1380 }, { "epoch": 2.9156118143459917, "grad_norm": 2.2147409915924072, "learning_rate": 3.2363122233880246e-07, "loss": 0.7593087553977966, "step": 1382 }, { "epoch": 2.919831223628692, "grad_norm": 5.710489749908447, "learning_rate": 3.221594952816764e-07, "loss": 0.7504158616065979, "step": 1384 }, { "epoch": 2.9240506329113924, "grad_norm": 1.8435603380203247, "learning_rate": 3.2069188013443137e-07, "loss": 0.8508476614952087, "step": 1386 }, { "epoch": 2.928270042194093, "grad_norm": 1.3359285593032837, "learning_rate": 3.192283947596416e-07, "loss": 1.1549383401870728, "step": 1388 }, { "epoch": 2.932489451476793, "grad_norm": 1.435840129852295, "learning_rate": 3.1776905696961776e-07, "loss": 1.0318659543991089, "step": 1390 }, { "epoch": 2.9367088607594938, "grad_norm": 1.5136826038360596, "learning_rate": 3.163138845261895e-07, "loss": 0.7768437266349792, "step": 1392 }, { "epoch": 2.9409282700421944, "grad_norm": 2.1672825813293457, "learning_rate": 3.148628951404894e-07, "loss": 0.7318160533905029, "step": 1394 }, { "epoch": 2.9451476793248945, "grad_norm": 2.231234550476074, "learning_rate": 3.134161064727371e-07, "loss": 1.1114449501037598, "step": 1396 }, { "epoch": 2.9493670886075947, "grad_norm": 1.1347553730010986, "learning_rate": 3.1197353613202493e-07, "loss": 0.98956298828125, "step": 1398 }, { "epoch": 2.9535864978902953, "grad_norm": 2.6513452529907227, "learning_rate": 3.1053520167610327e-07, "loss": 0.8672858476638794, "step": 1400 }, { "epoch": 2.957805907172996, "grad_norm": 5.388697147369385, "learning_rate": 3.0910112061116706e-07, "loss": 0.8895928263664246, "step": 1402 }, { "epoch": 2.962025316455696, "grad_norm": 1.2345776557922363, "learning_rate": 3.07671310391642e-07, "loss": 1.0770245790481567, "step": 1404 }, { "epoch": 2.9662447257383966, "grad_norm": 2.581882953643799, "learning_rate": 3.06245788419973e-07, "loss": 0.9955227971076965, "step": 1406 }, { "epoch": 2.970464135021097, "grad_norm": 1.2137199640274048, "learning_rate": 3.0482457204641244e-07, "loss": 0.6025493741035461, "step": 1408 }, { "epoch": 2.9746835443037973, "grad_norm": 7.410897731781006, "learning_rate": 3.0340767856880765e-07, "loss": 0.9356251358985901, "step": 1410 }, { "epoch": 2.978902953586498, "grad_norm": 1.3034472465515137, "learning_rate": 3.019951252323922e-07, "loss": 1.1050803661346436, "step": 1412 }, { "epoch": 2.9831223628691985, "grad_norm": 4.278261184692383, "learning_rate": 3.005869292295745e-07, "loss": 0.9199661016464233, "step": 1414 }, { "epoch": 2.9873417721518987, "grad_norm": 1.678770661354065, "learning_rate": 2.9918310769972974e-07, "loss": 1.006180763244629, "step": 1416 }, { "epoch": 2.9915611814345993, "grad_norm": 1.6135848760604858, "learning_rate": 2.9778367772899007e-07, "loss": 1.0220967531204224, "step": 1418 }, { "epoch": 2.9957805907173, "grad_norm": 4.610077857971191, "learning_rate": 2.963886563500377e-07, "loss": 1.10872220993042, "step": 1420 }, { "epoch": 3.0, "grad_norm": 1.7171008586883545, "learning_rate": 2.949980605418972e-07, "loss": 0.4870656132698059, "step": 1422 }, { "epoch": 3.0042194092827006, "grad_norm": 1.3645132780075073, "learning_rate": 2.936119072297288e-07, "loss": 0.8511791825294495, "step": 1424 }, { "epoch": 3.0084388185654007, "grad_norm": 1.9679698944091797, "learning_rate": 2.9223021328462197e-07, "loss": 0.7651324272155762, "step": 1426 }, { "epoch": 3.0126582278481013, "grad_norm": 3.9621288776397705, "learning_rate": 2.908529955233911e-07, "loss": 0.699533224105835, "step": 1428 }, { "epoch": 3.0168776371308015, "grad_norm": 3.126701831817627, "learning_rate": 2.8948027070836994e-07, "loss": 0.4490070939064026, "step": 1430 }, { "epoch": 3.021097046413502, "grad_norm": 2.446420431137085, "learning_rate": 2.881120555472082e-07, "loss": 1.0999044179916382, "step": 1432 }, { "epoch": 3.0253164556962027, "grad_norm": 1.639694333076477, "learning_rate": 2.867483666926673e-07, "loss": 1.0761295557022095, "step": 1434 }, { "epoch": 3.029535864978903, "grad_norm": 2.0383009910583496, "learning_rate": 2.853892207424188e-07, "loss": 1.2911527156829834, "step": 1436 }, { "epoch": 3.0337552742616034, "grad_norm": 2.1497604846954346, "learning_rate": 2.840346342388418e-07, "loss": 0.7010747790336609, "step": 1438 }, { "epoch": 3.037974683544304, "grad_norm": 1.3137015104293823, "learning_rate": 2.8268462366882116e-07, "loss": 1.0549767017364502, "step": 1440 }, { "epoch": 3.042194092827004, "grad_norm": 2.2534055709838867, "learning_rate": 2.81339205463548e-07, "loss": 0.7904849052429199, "step": 1442 }, { "epoch": 3.0464135021097047, "grad_norm": 1.8378784656524658, "learning_rate": 2.7999839599831866e-07, "loss": 0.9793230891227722, "step": 1444 }, { "epoch": 3.050632911392405, "grad_norm": 1.6699494123458862, "learning_rate": 2.786622115923361e-07, "loss": 1.100398302078247, "step": 1446 }, { "epoch": 3.0548523206751055, "grad_norm": 4.9398722648620605, "learning_rate": 2.773306685085103e-07, "loss": 0.7494297027587891, "step": 1448 }, { "epoch": 3.059071729957806, "grad_norm": 2.751260757446289, "learning_rate": 2.760037829532616e-07, "loss": 0.9139360189437866, "step": 1450 }, { "epoch": 3.0632911392405062, "grad_norm": 1.659805178642273, "learning_rate": 2.746815710763228e-07, "loss": 1.121703028678894, "step": 1452 }, { "epoch": 3.067510548523207, "grad_norm": 17.309215545654297, "learning_rate": 2.733640489705424e-07, "loss": 0.8850579261779785, "step": 1454 }, { "epoch": 3.071729957805907, "grad_norm": 1.963599443435669, "learning_rate": 2.7205123267168884e-07, "loss": 0.7342712879180908, "step": 1456 }, { "epoch": 3.0759493670886076, "grad_norm": 1.344913125038147, "learning_rate": 2.7074313815825577e-07, "loss": 0.8235659003257751, "step": 1458 }, { "epoch": 3.080168776371308, "grad_norm": 2.194878101348877, "learning_rate": 2.694397813512672e-07, "loss": 0.8748940229415894, "step": 1460 }, { "epoch": 3.0843881856540083, "grad_norm": 1.611878514289856, "learning_rate": 2.6814117811408343e-07, "loss": 1.0315779447555542, "step": 1462 }, { "epoch": 3.088607594936709, "grad_norm": 1.17129647731781, "learning_rate": 2.668473442522087e-07, "loss": 1.089264154434204, "step": 1464 }, { "epoch": 3.0928270042194095, "grad_norm": 1.8487638235092163, "learning_rate": 2.655582955130983e-07, "loss": 0.9789541959762573, "step": 1466 }, { "epoch": 3.0970464135021096, "grad_norm": 2.393946409225464, "learning_rate": 2.6427404758596716e-07, "loss": 0.7049380540847778, "step": 1468 }, { "epoch": 3.1012658227848102, "grad_norm": 6.393697261810303, "learning_rate": 2.6299461610159823e-07, "loss": 0.2891662120819092, "step": 1470 }, { "epoch": 3.1054852320675104, "grad_norm": 3.184678316116333, "learning_rate": 2.617200166321536e-07, "loss": 1.5170872211456299, "step": 1472 }, { "epoch": 3.109704641350211, "grad_norm": 0.9037976264953613, "learning_rate": 2.604502646909835e-07, "loss": 0.6711030602455139, "step": 1474 }, { "epoch": 3.1139240506329116, "grad_norm": 1.8876357078552246, "learning_rate": 2.591853757324387e-07, "loss": 1.0795202255249023, "step": 1476 }, { "epoch": 3.1181434599156117, "grad_norm": 2.756838083267212, "learning_rate": 2.579253651516811e-07, "loss": 1.132811427116394, "step": 1478 }, { "epoch": 3.1223628691983123, "grad_norm": 1.3386019468307495, "learning_rate": 2.566702482844977e-07, "loss": 1.08835768699646, "step": 1480 }, { "epoch": 3.1265822784810124, "grad_norm": 2.480353593826294, "learning_rate": 2.554200404071133e-07, "loss": 1.070718765258789, "step": 1482 }, { "epoch": 3.130801687763713, "grad_norm": 1.40932297706604, "learning_rate": 2.541747567360042e-07, "loss": 1.0528981685638428, "step": 1484 }, { "epoch": 3.1350210970464136, "grad_norm": 1.5161710977554321, "learning_rate": 2.529344124277137e-07, "loss": 0.701133131980896, "step": 1486 }, { "epoch": 3.1392405063291138, "grad_norm": 1.7773646116256714, "learning_rate": 2.516990225786675e-07, "loss": 0.714127242565155, "step": 1488 }, { "epoch": 3.1434599156118144, "grad_norm": 1.856155276298523, "learning_rate": 2.5046860222498974e-07, "loss": 1.374661922454834, "step": 1490 }, { "epoch": 3.147679324894515, "grad_norm": 1.7023481130599976, "learning_rate": 2.492431663423195e-07, "loss": 0.7714812159538269, "step": 1492 }, { "epoch": 3.151898734177215, "grad_norm": 2.849262237548828, "learning_rate": 2.480227298456298e-07, "loss": 0.9089514017105103, "step": 1494 }, { "epoch": 3.1561181434599157, "grad_norm": 1.426505208015442, "learning_rate": 2.468073075890449e-07, "loss": 0.885564386844635, "step": 1496 }, { "epoch": 3.160337552742616, "grad_norm": 1.386016845703125, "learning_rate": 2.455969143656604e-07, "loss": 0.6194628477096558, "step": 1498 }, { "epoch": 3.1645569620253164, "grad_norm": 3.545844316482544, "learning_rate": 2.4439156490736206e-07, "loss": 0.6920610070228577, "step": 1500 }, { "epoch": 3.168776371308017, "grad_norm": 2.4662020206451416, "learning_rate": 2.431912738846479e-07, "loss": 1.0780019760131836, "step": 1502 }, { "epoch": 3.172995780590717, "grad_norm": 1.5884943008422852, "learning_rate": 2.4199605590644834e-07, "loss": 0.987308144569397, "step": 1504 }, { "epoch": 3.1772151898734178, "grad_norm": 1.7786238193511963, "learning_rate": 2.4080592551994957e-07, "loss": 1.1196187734603882, "step": 1506 }, { "epoch": 3.181434599156118, "grad_norm": 1.3663359880447388, "learning_rate": 2.396208972104153e-07, "loss": 1.2225620746612549, "step": 1508 }, { "epoch": 3.1856540084388185, "grad_norm": 2.5622196197509766, "learning_rate": 2.384409854010114e-07, "loss": 1.0651240348815918, "step": 1510 }, { "epoch": 3.189873417721519, "grad_norm": 0.9567521214485168, "learning_rate": 2.372662044526301e-07, "loss": 0.3738023042678833, "step": 1512 }, { "epoch": 3.1940928270042193, "grad_norm": 1.9998040199279785, "learning_rate": 2.3609656866371468e-07, "loss": 1.1397721767425537, "step": 1514 }, { "epoch": 3.19831223628692, "grad_norm": 0.7790340781211853, "learning_rate": 2.3493209227008635e-07, "loss": 0.7803550958633423, "step": 1516 }, { "epoch": 3.2025316455696204, "grad_norm": 1.4339203834533691, "learning_rate": 2.3377278944477026e-07, "loss": 1.136408805847168, "step": 1518 }, { "epoch": 3.2067510548523206, "grad_norm": 2.4172418117523193, "learning_rate": 2.3261867429782352e-07, "loss": 1.0867120027542114, "step": 1520 }, { "epoch": 3.210970464135021, "grad_norm": 5.30928373336792, "learning_rate": 2.3146976087616251e-07, "loss": 0.40863823890686035, "step": 1522 }, { "epoch": 3.2151898734177213, "grad_norm": 1.3400903940200806, "learning_rate": 2.3032606316339343e-07, "loss": 0.9426780343055725, "step": 1524 }, { "epoch": 3.219409282700422, "grad_norm": 2.4984984397888184, "learning_rate": 2.2918759507964067e-07, "loss": 1.065047025680542, "step": 1526 }, { "epoch": 3.2236286919831225, "grad_norm": 1.2303318977355957, "learning_rate": 2.280543704813786e-07, "loss": 0.7552684545516968, "step": 1528 }, { "epoch": 3.2278481012658227, "grad_norm": 8.25938606262207, "learning_rate": 2.2692640316126142e-07, "loss": 0.8803672790527344, "step": 1530 }, { "epoch": 3.2320675105485233, "grad_norm": 0.49941709637641907, "learning_rate": 2.258037068479569e-07, "loss": 0.4145871102809906, "step": 1532 }, { "epoch": 3.2362869198312234, "grad_norm": 4.213127613067627, "learning_rate": 2.246862952059784e-07, "loss": 0.8059659600257874, "step": 1534 }, { "epoch": 3.240506329113924, "grad_norm": 1.6297084093093872, "learning_rate": 2.2357418183551847e-07, "loss": 1.0444282293319702, "step": 1536 }, { "epoch": 3.2447257383966246, "grad_norm": 1.131995677947998, "learning_rate": 2.2246738027228375e-07, "loss": 1.0914216041564941, "step": 1538 }, { "epoch": 3.2489451476793247, "grad_norm": 4.478993892669678, "learning_rate": 2.2136590398733008e-07, "loss": 0.9430460929870605, "step": 1540 }, { "epoch": 3.2531645569620253, "grad_norm": 3.0573625564575195, "learning_rate": 2.2026976638689858e-07, "loss": 0.911579966545105, "step": 1542 }, { "epoch": 3.257383966244726, "grad_norm": 3.4742343425750732, "learning_rate": 2.1917898081225196e-07, "loss": 0.7584477066993713, "step": 1544 }, { "epoch": 3.261603375527426, "grad_norm": 8.812678337097168, "learning_rate": 2.1809356053951312e-07, "loss": 0.8638182878494263, "step": 1546 }, { "epoch": 3.2658227848101267, "grad_norm": 2.5531651973724365, "learning_rate": 2.1701351877950265e-07, "loss": 0.9924852848052979, "step": 1548 }, { "epoch": 3.270042194092827, "grad_norm": 2.971946954727173, "learning_rate": 2.1593886867757877e-07, "loss": 0.4322529435157776, "step": 1550 }, { "epoch": 3.2742616033755274, "grad_norm": 1.717172384262085, "learning_rate": 2.148696233134765e-07, "loss": 0.550542414188385, "step": 1552 }, { "epoch": 3.278481012658228, "grad_norm": 5.607646942138672, "learning_rate": 2.1380579570114936e-07, "loss": 0.5011199116706848, "step": 1554 }, { "epoch": 3.282700421940928, "grad_norm": 1.612561821937561, "learning_rate": 2.1274739878861052e-07, "loss": 1.0595111846923828, "step": 1556 }, { "epoch": 3.2869198312236287, "grad_norm": 0.5656753182411194, "learning_rate": 2.1169444545777492e-07, "loss": 0.9489805102348328, "step": 1558 }, { "epoch": 3.291139240506329, "grad_norm": 1.542765736579895, "learning_rate": 2.1064694852430298e-07, "loss": 0.7409214377403259, "step": 1560 }, { "epoch": 3.2953586497890295, "grad_norm": 4.1754326820373535, "learning_rate": 2.0960492073744497e-07, "loss": 0.6657558679580688, "step": 1562 }, { "epoch": 3.29957805907173, "grad_norm": 2.3946285247802734, "learning_rate": 2.0856837477988444e-07, "loss": 1.0093276500701904, "step": 1564 }, { "epoch": 3.3037974683544302, "grad_norm": 54.370628356933594, "learning_rate": 2.075373232675853e-07, "loss": 0.911258339881897, "step": 1566 }, { "epoch": 3.308016877637131, "grad_norm": 0.5367670655250549, "learning_rate": 2.0651177874963756e-07, "loss": 0.5720005035400391, "step": 1568 }, { "epoch": 3.3122362869198314, "grad_norm": 2.9743804931640625, "learning_rate": 2.054917537081048e-07, "loss": 0.7077758312225342, "step": 1570 }, { "epoch": 3.3164556962025316, "grad_norm": 1.33404541015625, "learning_rate": 2.0447726055787184e-07, "loss": 0.7469961047172546, "step": 1572 }, { "epoch": 3.320675105485232, "grad_norm": 5.848537445068359, "learning_rate": 2.0346831164649456e-07, "loss": 1.2882143259048462, "step": 1574 }, { "epoch": 3.3248945147679323, "grad_norm": 2.0500552654266357, "learning_rate": 2.024649192540486e-07, "loss": 1.0107818841934204, "step": 1576 }, { "epoch": 3.329113924050633, "grad_norm": 1.4133131504058838, "learning_rate": 2.0146709559298057e-07, "loss": 1.098578929901123, "step": 1578 }, { "epoch": 3.3333333333333335, "grad_norm": 2.475172281265259, "learning_rate": 2.004748528079589e-07, "loss": 0.907584547996521, "step": 1580 }, { "epoch": 3.3375527426160336, "grad_norm": 1.8427865505218506, "learning_rate": 1.9948820297572654e-07, "loss": 0.5680180191993713, "step": 1582 }, { "epoch": 3.3417721518987342, "grad_norm": 2.7834925651550293, "learning_rate": 1.9850715810495388e-07, "loss": 0.8737412095069885, "step": 1584 }, { "epoch": 3.3459915611814344, "grad_norm": 3.1142473220825195, "learning_rate": 1.9753173013609188e-07, "loss": 0.9088540077209473, "step": 1586 }, { "epoch": 3.350210970464135, "grad_norm": 1.0896648168563843, "learning_rate": 1.9656193094122788e-07, "loss": 0.6729345917701721, "step": 1588 }, { "epoch": 3.3544303797468356, "grad_norm": 0.7042174339294434, "learning_rate": 1.955977723239402e-07, "loss": 1.0873976945877075, "step": 1590 }, { "epoch": 3.3586497890295357, "grad_norm": 2.3321895599365234, "learning_rate": 1.946392660191551e-07, "loss": 1.0663033723831177, "step": 1592 }, { "epoch": 3.3628691983122363, "grad_norm": 0.490595281124115, "learning_rate": 1.9368642369300324e-07, "loss": 0.9354673624038696, "step": 1594 }, { "epoch": 3.367088607594937, "grad_norm": 10.656190872192383, "learning_rate": 1.927392569426783e-07, "loss": 0.4992368817329407, "step": 1596 }, { "epoch": 3.371308016877637, "grad_norm": 0.5064166784286499, "learning_rate": 1.917977772962959e-07, "loss": 0.528096616268158, "step": 1598 }, { "epoch": 3.3755274261603376, "grad_norm": 5.858240604400635, "learning_rate": 1.9086199621275264e-07, "loss": 0.8440109491348267, "step": 1600 }, { "epoch": 3.379746835443038, "grad_norm": 8.531730651855469, "learning_rate": 1.899319250815872e-07, "loss": 0.6302809119224548, "step": 1602 }, { "epoch": 3.3839662447257384, "grad_norm": 0.5061826705932617, "learning_rate": 1.8900757522284133e-07, "loss": 0.8138654828071594, "step": 1604 }, { "epoch": 3.388185654008439, "grad_norm": 2.710231065750122, "learning_rate": 1.880889578869227e-07, "loss": 1.1358734369277954, "step": 1606 }, { "epoch": 3.392405063291139, "grad_norm": 2.9734416007995605, "learning_rate": 1.8717608425446727e-07, "loss": 0.7783518433570862, "step": 1608 }, { "epoch": 3.3966244725738397, "grad_norm": 1.6831233501434326, "learning_rate": 1.8626896543620322e-07, "loss": 0.7331032156944275, "step": 1610 }, { "epoch": 3.40084388185654, "grad_norm": 1.832513451576233, "learning_rate": 1.853676124728165e-07, "loss": 1.0596171617507935, "step": 1612 }, { "epoch": 3.4050632911392404, "grad_norm": 1.777066946029663, "learning_rate": 1.8447203633481567e-07, "loss": 0.5832729935646057, "step": 1614 }, { "epoch": 3.409282700421941, "grad_norm": 3.6729393005371094, "learning_rate": 1.8358224792239858e-07, "loss": 0.9451841115951538, "step": 1616 }, { "epoch": 3.413502109704641, "grad_norm": 1.5150253772735596, "learning_rate": 1.8269825806531981e-07, "loss": 1.205118179321289, "step": 1618 }, { "epoch": 3.4177215189873418, "grad_norm": 1.506641149520874, "learning_rate": 1.8182007752275897e-07, "loss": 1.1017844676971436, "step": 1620 }, { "epoch": 3.4219409282700424, "grad_norm": 1.7625582218170166, "learning_rate": 1.8094771698318949e-07, "loss": 0.7701492309570312, "step": 1622 }, { "epoch": 3.4261603375527425, "grad_norm": 3.7757952213287354, "learning_rate": 1.8008118706424835e-07, "loss": 0.47009673714637756, "step": 1624 }, { "epoch": 3.430379746835443, "grad_norm": 86.28419494628906, "learning_rate": 1.792204983126077e-07, "loss": 0.3835935592651367, "step": 1626 }, { "epoch": 3.4345991561181437, "grad_norm": 1.6593104600906372, "learning_rate": 1.7836566120384535e-07, "loss": 1.0729460716247559, "step": 1628 }, { "epoch": 3.438818565400844, "grad_norm": 1.3321086168289185, "learning_rate": 1.7751668614231838e-07, "loss": 0.5311670303344727, "step": 1630 }, { "epoch": 3.4430379746835444, "grad_norm": 1.7757083177566528, "learning_rate": 1.7667358346103543e-07, "loss": 1.0757611989974976, "step": 1632 }, { "epoch": 3.4472573839662446, "grad_norm": 0.7050431370735168, "learning_rate": 1.7583636342153186e-07, "loss": 0.8372207283973694, "step": 1634 }, { "epoch": 3.451476793248945, "grad_norm": 3.041806221008301, "learning_rate": 1.7500503621374447e-07, "loss": 1.3023487329483032, "step": 1636 }, { "epoch": 3.4556962025316453, "grad_norm": 2.8929758071899414, "learning_rate": 1.7417961195588712e-07, "loss": 1.2805616855621338, "step": 1638 }, { "epoch": 3.459915611814346, "grad_norm": 1.4591811895370483, "learning_rate": 1.733601006943283e-07, "loss": 1.0746394395828247, "step": 1640 }, { "epoch": 3.4641350210970465, "grad_norm": 21.10038185119629, "learning_rate": 1.7254651240346834e-07, "loss": 1.2883800268173218, "step": 1642 }, { "epoch": 3.4683544303797467, "grad_norm": 0.8419481515884399, "learning_rate": 1.717388569856184e-07, "loss": 0.4558939039707184, "step": 1644 }, { "epoch": 3.4725738396624473, "grad_norm": 1.598176121711731, "learning_rate": 1.7093714427087921e-07, "loss": 1.1013548374176025, "step": 1646 }, { "epoch": 3.476793248945148, "grad_norm": 1.9482252597808838, "learning_rate": 1.7014138401702235e-07, "loss": 1.064300537109375, "step": 1648 }, { "epoch": 3.481012658227848, "grad_norm": 2.4247756004333496, "learning_rate": 1.6935158590937102e-07, "loss": 0.5595088005065918, "step": 1650 }, { "epoch": 3.4852320675105486, "grad_norm": 1.5676363706588745, "learning_rate": 1.685677595606821e-07, "loss": 0.9377724528312683, "step": 1652 }, { "epoch": 3.489451476793249, "grad_norm": 1.7761136293411255, "learning_rate": 1.6778991451102917e-07, "loss": 0.6129472255706787, "step": 1654 }, { "epoch": 3.4936708860759493, "grad_norm": 1.6247411966323853, "learning_rate": 1.6701806022768664e-07, "loss": 0.9987605214118958, "step": 1656 }, { "epoch": 3.49789029535865, "grad_norm": 7.621754169464111, "learning_rate": 1.662522061050143e-07, "loss": 0.7994301319122314, "step": 1658 }, { "epoch": 3.50210970464135, "grad_norm": 1.6483778953552246, "learning_rate": 1.6549236146434306e-07, "loss": 1.0804067850112915, "step": 1660 }, { "epoch": 3.5063291139240507, "grad_norm": 2.4437475204467773, "learning_rate": 1.6473853555386138e-07, "loss": 1.301591396331787, "step": 1662 }, { "epoch": 3.510548523206751, "grad_norm": 6.270905017852783, "learning_rate": 1.63990737548503e-07, "loss": 0.5238262414932251, "step": 1664 }, { "epoch": 3.5147679324894514, "grad_norm": 1.6719293594360352, "learning_rate": 1.6324897654983497e-07, "loss": 1.1141690015792847, "step": 1666 }, { "epoch": 3.518987341772152, "grad_norm": 3.9029476642608643, "learning_rate": 1.6251326158594697e-07, "loss": 0.9623671770095825, "step": 1668 }, { "epoch": 3.523206751054852, "grad_norm": 1.746028184890747, "learning_rate": 1.617836016113414e-07, "loss": 1.0135071277618408, "step": 1670 }, { "epoch": 3.5274261603375527, "grad_norm": 3.1107168197631836, "learning_rate": 1.610600055068245e-07, "loss": 0.4389096200466156, "step": 1672 }, { "epoch": 3.5316455696202533, "grad_norm": 1.7027398347854614, "learning_rate": 1.603424820793983e-07, "loss": 0.6981071829795837, "step": 1674 }, { "epoch": 3.5358649789029535, "grad_norm": 2.8486416339874268, "learning_rate": 1.5963104006215308e-07, "loss": 0.7279437780380249, "step": 1676 }, { "epoch": 3.540084388185654, "grad_norm": 3.396284341812134, "learning_rate": 1.589256881141614e-07, "loss": 0.9122246503829956, "step": 1678 }, { "epoch": 3.5443037974683547, "grad_norm": 3.791874647140503, "learning_rate": 1.5822643482037287e-07, "loss": 1.1270490884780884, "step": 1680 }, { "epoch": 3.548523206751055, "grad_norm": 1.448197364807129, "learning_rate": 1.5753328869150915e-07, "loss": 0.958101749420166, "step": 1682 }, { "epoch": 3.5527426160337554, "grad_norm": 2.4740562438964844, "learning_rate": 1.5684625816396065e-07, "loss": 0.9169100522994995, "step": 1684 }, { "epoch": 3.5569620253164556, "grad_norm": 4.803852081298828, "learning_rate": 1.5616535159968395e-07, "loss": 0.4023887515068054, "step": 1686 }, { "epoch": 3.561181434599156, "grad_norm": 1.6774110794067383, "learning_rate": 1.5549057728609994e-07, "loss": 0.7174091935157776, "step": 1688 }, { "epoch": 3.5654008438818563, "grad_norm": 2.055140972137451, "learning_rate": 1.5482194343599262e-07, "loss": 1.1519484519958496, "step": 1690 }, { "epoch": 3.569620253164557, "grad_norm": 2.408010482788086, "learning_rate": 1.5415945818740984e-07, "loss": 0.2424314320087433, "step": 1692 }, { "epoch": 3.5738396624472575, "grad_norm": 0.4710818827152252, "learning_rate": 1.5350312960356366e-07, "loss": 0.975223183631897, "step": 1694 }, { "epoch": 3.5780590717299576, "grad_norm": 7.146688461303711, "learning_rate": 1.5285296567273247e-07, "loss": 0.2773347795009613, "step": 1696 }, { "epoch": 3.5822784810126582, "grad_norm": 2.3770270347595215, "learning_rate": 1.5220897430816355e-07, "loss": 0.8169768452644348, "step": 1698 }, { "epoch": 3.586497890295359, "grad_norm": 1.4037396907806396, "learning_rate": 1.5157116334797708e-07, "loss": 0.900860071182251, "step": 1700 }, { "epoch": 3.590717299578059, "grad_norm": 1.6098082065582275, "learning_rate": 1.5093954055507043e-07, "loss": 0.6856269240379333, "step": 1702 }, { "epoch": 3.5949367088607596, "grad_norm": 1.4536845684051514, "learning_rate": 1.5031411361702408e-07, "loss": 1.1157587766647339, "step": 1704 }, { "epoch": 3.59915611814346, "grad_norm": 3.0524935722351074, "learning_rate": 1.4969489014600732e-07, "loss": 0.812619149684906, "step": 1706 }, { "epoch": 3.6033755274261603, "grad_norm": 4.811793804168701, "learning_rate": 1.4908187767868651e-07, "loss": 0.7652060389518738, "step": 1708 }, { "epoch": 3.607594936708861, "grad_norm": 0.5443377494812012, "learning_rate": 1.484750836761328e-07, "loss": 0.677264392375946, "step": 1710 }, { "epoch": 3.611814345991561, "grad_norm": 6.806301593780518, "learning_rate": 1.4787451552373115e-07, "loss": 1.052730679512024, "step": 1712 }, { "epoch": 3.6160337552742616, "grad_norm": 4.058206081390381, "learning_rate": 1.4728018053109103e-07, "loss": 1.285649299621582, "step": 1714 }, { "epoch": 3.620253164556962, "grad_norm": 3.216102361679077, "learning_rate": 1.4669208593195704e-07, "loss": 0.6992135047912598, "step": 1716 }, { "epoch": 3.6244725738396624, "grad_norm": 2.728694438934326, "learning_rate": 1.4611023888412115e-07, "loss": 0.8372994065284729, "step": 1718 }, { "epoch": 3.628691983122363, "grad_norm": 8.481232643127441, "learning_rate": 1.4553464646933492e-07, "loss": 0.5174750685691833, "step": 1720 }, { "epoch": 3.632911392405063, "grad_norm": 3.1336352825164795, "learning_rate": 1.4496531569322426e-07, "loss": 1.101250410079956, "step": 1722 }, { "epoch": 3.6371308016877637, "grad_norm": 3.442155122756958, "learning_rate": 1.4440225348520354e-07, "loss": 0.6749483346939087, "step": 1724 }, { "epoch": 3.6413502109704643, "grad_norm": 3.023040771484375, "learning_rate": 1.4384546669839147e-07, "loss": 0.48659658432006836, "step": 1726 }, { "epoch": 3.6455696202531644, "grad_norm": 6.006860733032227, "learning_rate": 1.432949621095273e-07, "loss": 1.0057132244110107, "step": 1728 }, { "epoch": 3.649789029535865, "grad_norm": 5.072360992431641, "learning_rate": 1.4275074641888904e-07, "loss": 0.29357773065567017, "step": 1730 }, { "epoch": 3.6540084388185656, "grad_norm": 18.242097854614258, "learning_rate": 1.4221282625021142e-07, "loss": 1.019067406654358, "step": 1732 }, { "epoch": 3.6582278481012658, "grad_norm": 2.2106029987335205, "learning_rate": 1.4168120815060542e-07, "loss": 0.5755662322044373, "step": 1734 }, { "epoch": 3.6624472573839664, "grad_norm": 2.0836057662963867, "learning_rate": 1.4115589859047829e-07, "loss": 0.5893323421478271, "step": 1736 }, { "epoch": 3.6666666666666665, "grad_norm": 1.689981460571289, "learning_rate": 1.4063690396345539e-07, "loss": 0.8215257525444031, "step": 1738 }, { "epoch": 3.670886075949367, "grad_norm": 2.466362714767456, "learning_rate": 1.401242305863019e-07, "loss": 0.5873066782951355, "step": 1740 }, { "epoch": 3.6751054852320673, "grad_norm": 2.1418519020080566, "learning_rate": 1.3961788469884597e-07, "loss": 1.2188622951507568, "step": 1742 }, { "epoch": 3.679324894514768, "grad_norm": 2.1476902961730957, "learning_rate": 1.39117872463903e-07, "loss": 0.6782402396202087, "step": 1744 }, { "epoch": 3.6835443037974684, "grad_norm": 2.313478946685791, "learning_rate": 1.3862419996720055e-07, "loss": 0.6638330817222595, "step": 1746 }, { "epoch": 3.6877637130801686, "grad_norm": 1.2573710680007935, "learning_rate": 1.381368732173042e-07, "loss": 1.1310936212539673, "step": 1748 }, { "epoch": 3.691983122362869, "grad_norm": 4.773893356323242, "learning_rate": 1.376558981455443e-07, "loss": 0.9830767512321472, "step": 1750 }, { "epoch": 3.6962025316455698, "grad_norm": 1.9760856628417969, "learning_rate": 1.371812806059441e-07, "loss": 1.0266754627227783, "step": 1752 }, { "epoch": 3.70042194092827, "grad_norm": 1.8001806735992432, "learning_rate": 1.3671302637514825e-07, "loss": 1.1445378065109253, "step": 1754 }, { "epoch": 3.7046413502109705, "grad_norm": 2.3651130199432373, "learning_rate": 1.3625114115235267e-07, "loss": 0.8746024370193481, "step": 1756 }, { "epoch": 3.708860759493671, "grad_norm": 2.966754913330078, "learning_rate": 1.357956305592349e-07, "loss": 0.8632293343544006, "step": 1758 }, { "epoch": 3.7130801687763713, "grad_norm": 2.7932474613189697, "learning_rate": 1.35346500139886e-07, "loss": 0.8797197937965393, "step": 1760 }, { "epoch": 3.717299578059072, "grad_norm": 3.4520580768585205, "learning_rate": 1.3490375536074293e-07, "loss": 0.4202856123447418, "step": 1762 }, { "epoch": 3.721518987341772, "grad_norm": 5.053709506988525, "learning_rate": 1.3446740161052182e-07, "loss": 0.7906475067138672, "step": 1764 }, { "epoch": 3.7257383966244726, "grad_norm": 1.492531418800354, "learning_rate": 1.3403744420015293e-07, "loss": 1.0731313228607178, "step": 1766 }, { "epoch": 3.7299578059071727, "grad_norm": 4.506521701812744, "learning_rate": 1.3361388836271545e-07, "loss": 0.6830440163612366, "step": 1768 }, { "epoch": 3.7341772151898733, "grad_norm": 2.127143383026123, "learning_rate": 1.33196739253374e-07, "loss": 0.7407412528991699, "step": 1770 }, { "epoch": 3.738396624472574, "grad_norm": 2.162644147872925, "learning_rate": 1.3278600194931595e-07, "loss": 1.099405288696289, "step": 1772 }, { "epoch": 3.742616033755274, "grad_norm": 0.9268086552619934, "learning_rate": 1.323816814496896e-07, "loss": 0.7270370721817017, "step": 1774 }, { "epoch": 3.7468354430379747, "grad_norm": 0.7520632743835449, "learning_rate": 1.3198378267554327e-07, "loss": 0.7462360262870789, "step": 1776 }, { "epoch": 3.7510548523206753, "grad_norm": 1.411445140838623, "learning_rate": 1.3159231046976552e-07, "loss": 1.026281476020813, "step": 1778 }, { "epoch": 3.7552742616033754, "grad_norm": 2.437485456466675, "learning_rate": 1.3120726959702608e-07, "loss": 1.0296030044555664, "step": 1780 }, { "epoch": 3.759493670886076, "grad_norm": 2.7060513496398926, "learning_rate": 1.308286647437179e-07, "loss": 0.9808471202850342, "step": 1782 }, { "epoch": 3.7637130801687766, "grad_norm": 2.146833658218384, "learning_rate": 1.3045650051790027e-07, "loss": 0.9502108097076416, "step": 1784 }, { "epoch": 3.7679324894514767, "grad_norm": 1.3278952836990356, "learning_rate": 1.300907814492422e-07, "loss": 1.123317003250122, "step": 1786 }, { "epoch": 3.7721518987341773, "grad_norm": 14.552665710449219, "learning_rate": 1.2973151198896823e-07, "loss": 0.525389552116394, "step": 1788 }, { "epoch": 3.7763713080168775, "grad_norm": 1.6734447479248047, "learning_rate": 1.2937869650980342e-07, "loss": 0.7029292583465576, "step": 1790 }, { "epoch": 3.780590717299578, "grad_norm": 1.3970534801483154, "learning_rate": 1.2903233930592022e-07, "loss": 1.0671159029006958, "step": 1792 }, { "epoch": 3.7848101265822782, "grad_norm": 2.8452141284942627, "learning_rate": 1.2869244459288677e-07, "loss": 0.7484707832336426, "step": 1794 }, { "epoch": 3.789029535864979, "grad_norm": 2.7676146030426025, "learning_rate": 1.2835901650761496e-07, "loss": 1.1054531335830688, "step": 1796 }, { "epoch": 3.7932489451476794, "grad_norm": 2.6690499782562256, "learning_rate": 1.2803205910831044e-07, "loss": 1.1910511255264282, "step": 1798 }, { "epoch": 3.7974683544303796, "grad_norm": 2.3067097663879395, "learning_rate": 1.2771157637442308e-07, "loss": 1.0350401401519775, "step": 1800 }, { "epoch": 3.80168776371308, "grad_norm": 2.0456929206848145, "learning_rate": 1.273975722065986e-07, "loss": 1.1489591598510742, "step": 1802 }, { "epoch": 3.8059071729957807, "grad_norm": 1.7378591299057007, "learning_rate": 1.2709005042663118e-07, "loss": 0.6581465005874634, "step": 1804 }, { "epoch": 3.810126582278481, "grad_norm": 6.99116849899292, "learning_rate": 1.267890147774167e-07, "loss": 0.29897159337997437, "step": 1806 }, { "epoch": 3.8143459915611815, "grad_norm": 4.381340026855469, "learning_rate": 1.264944689229072e-07, "loss": 1.0360081195831299, "step": 1808 }, { "epoch": 3.818565400843882, "grad_norm": 2.1093826293945312, "learning_rate": 1.2620641644806678e-07, "loss": 1.0628427267074585, "step": 1810 }, { "epoch": 3.8227848101265822, "grad_norm": 9.374409675598145, "learning_rate": 1.2592486085882725e-07, "loss": 0.7481462955474854, "step": 1812 }, { "epoch": 3.827004219409283, "grad_norm": 1.5822006464004517, "learning_rate": 1.25649805582046e-07, "loss": 1.0469331741333008, "step": 1814 }, { "epoch": 3.831223628691983, "grad_norm": 2.6007158756256104, "learning_rate": 1.25381253965464e-07, "loss": 0.9370917081832886, "step": 1816 }, { "epoch": 3.8354430379746836, "grad_norm": 3.8402206897735596, "learning_rate": 1.2511920927766525e-07, "loss": 0.9214923977851868, "step": 1818 }, { "epoch": 3.8396624472573837, "grad_norm": 11.853067398071289, "learning_rate": 1.2486367470803673e-07, "loss": 0.8060356378555298, "step": 1820 }, { "epoch": 3.8438818565400843, "grad_norm": 1.407483696937561, "learning_rate": 1.246146533667299e-07, "loss": 1.076265573501587, "step": 1822 }, { "epoch": 3.848101265822785, "grad_norm": 3.0918633937835693, "learning_rate": 1.243721482846227e-07, "loss": 0.9416312575340271, "step": 1824 }, { "epoch": 3.852320675105485, "grad_norm": 2.7018940448760986, "learning_rate": 1.2413616241328252e-07, "loss": 1.026483416557312, "step": 1826 }, { "epoch": 3.8565400843881856, "grad_norm": 2.9005277156829834, "learning_rate": 1.2390669862493044e-07, "loss": 1.033530354499817, "step": 1828 }, { "epoch": 3.8607594936708862, "grad_norm": 1.5697400569915771, "learning_rate": 1.2368375971240647e-07, "loss": 1.0893433094024658, "step": 1830 }, { "epoch": 3.8649789029535864, "grad_norm": 7.101255893707275, "learning_rate": 1.2346734838913498e-07, "loss": 0.4264039397239685, "step": 1832 }, { "epoch": 3.869198312236287, "grad_norm": 1.942752718925476, "learning_rate": 1.2325746728909227e-07, "loss": 0.6822599172592163, "step": 1834 }, { "epoch": 3.8734177215189876, "grad_norm": 2.711249351501465, "learning_rate": 1.2305411896677423e-07, "loss": 0.8705965280532837, "step": 1836 }, { "epoch": 3.8776371308016877, "grad_norm": 3.3902530670166016, "learning_rate": 1.228573058971652e-07, "loss": 0.7575594186782837, "step": 1838 }, { "epoch": 3.8818565400843883, "grad_norm": 5.287688732147217, "learning_rate": 1.2266703047570794e-07, "loss": 0.8974352478981018, "step": 1840 }, { "epoch": 3.8860759493670884, "grad_norm": 2.1966428756713867, "learning_rate": 1.2248329501827461e-07, "loss": 0.7821562886238098, "step": 1842 }, { "epoch": 3.890295358649789, "grad_norm": 2.125584125518799, "learning_rate": 1.2230610176113828e-07, "loss": 0.7629109621047974, "step": 1844 }, { "epoch": 3.894514767932489, "grad_norm": 1.5011521577835083, "learning_rate": 1.2213545286094602e-07, "loss": 1.0465257167816162, "step": 1846 }, { "epoch": 3.8987341772151898, "grad_norm": 3.0355629920959473, "learning_rate": 1.219713503946922e-07, "loss": 0.5780481100082397, "step": 1848 }, { "epoch": 3.9029535864978904, "grad_norm": 2.1277599334716797, "learning_rate": 1.21813796359694e-07, "loss": 1.0891631841659546, "step": 1850 }, { "epoch": 3.9071729957805905, "grad_norm": 1.4144175052642822, "learning_rate": 1.2166279267356617e-07, "loss": 1.0926233530044556, "step": 1852 }, { "epoch": 3.911392405063291, "grad_norm": 1.8264589309692383, "learning_rate": 1.2151834117419832e-07, "loss": 1.0842887163162231, "step": 1854 }, { "epoch": 3.9156118143459917, "grad_norm": 1.3971328735351562, "learning_rate": 1.2138044361973238e-07, "loss": 1.1029634475708008, "step": 1856 }, { "epoch": 3.919831223628692, "grad_norm": 1.3931989669799805, "learning_rate": 1.2124910168854125e-07, "loss": 1.092046856880188, "step": 1858 }, { "epoch": 3.9240506329113924, "grad_norm": 1.2768291234970093, "learning_rate": 1.21124316979208e-07, "loss": 1.0661836862564087, "step": 1860 }, { "epoch": 3.928270042194093, "grad_norm": 5.809596538543701, "learning_rate": 1.210060910105071e-07, "loss": 0.9497167468070984, "step": 1862 }, { "epoch": 3.932489451476793, "grad_norm": 1.809336543083191, "learning_rate": 1.208944252213854e-07, "loss": 0.7419611811637878, "step": 1864 }, { "epoch": 3.9367088607594938, "grad_norm": 3.3719143867492676, "learning_rate": 1.2078932097094474e-07, "loss": 1.3616517782211304, "step": 1866 }, { "epoch": 3.9409282700421944, "grad_norm": 1.4581533670425415, "learning_rate": 1.2069077953842544e-07, "loss": 1.0452879667282104, "step": 1868 }, { "epoch": 3.9451476793248945, "grad_norm": 3.9840292930603027, "learning_rate": 1.2059880212319078e-07, "loss": 0.7806097269058228, "step": 1870 }, { "epoch": 3.9493670886075947, "grad_norm": 0.3003561794757843, "learning_rate": 1.2051338984471242e-07, "loss": 0.568496584892273, "step": 1872 }, { "epoch": 3.9535864978902953, "grad_norm": 3.0003912448883057, "learning_rate": 1.2043454374255645e-07, "loss": 0.5840458273887634, "step": 1874 }, { "epoch": 3.957805907172996, "grad_norm": 17.255149841308594, "learning_rate": 1.203622647763713e-07, "loss": 0.9891324639320374, "step": 1876 }, { "epoch": 3.962025316455696, "grad_norm": 4.442596435546875, "learning_rate": 1.2029655382587557e-07, "loss": 0.937990665435791, "step": 1878 }, { "epoch": 3.9662447257383966, "grad_norm": 1.3784996271133423, "learning_rate": 1.2023741169084767e-07, "loss": 0.6944407224655151, "step": 1880 }, { "epoch": 3.970464135021097, "grad_norm": 1.8049193620681763, "learning_rate": 1.2018483909111572e-07, "loss": 1.0277503728866577, "step": 1882 }, { "epoch": 3.9746835443037973, "grad_norm": 6.727908611297607, "learning_rate": 1.2013883666654907e-07, "loss": 0.509749174118042, "step": 1884 }, { "epoch": 3.978902953586498, "grad_norm": 2.5991525650024414, "learning_rate": 1.2009940497705058e-07, "loss": 1.0679656267166138, "step": 1886 }, { "epoch": 3.9831223628691985, "grad_norm": 2.2473011016845703, "learning_rate": 1.2006654450254938e-07, "loss": 0.7142981290817261, "step": 1888 }, { "epoch": 3.9873417721518987, "grad_norm": 3.0477726459503174, "learning_rate": 1.2004025564299563e-07, "loss": 1.0713993310928345, "step": 1890 }, { "epoch": 3.9915611814345993, "grad_norm": 8.713078498840332, "learning_rate": 1.2002053871835507e-07, "loss": 0.6879635453224182, "step": 1892 }, { "epoch": 3.9957805907173, "grad_norm": 1.979125738143921, "learning_rate": 1.2000739396860554e-07, "loss": 1.0905542373657227, "step": 1894 }, { "epoch": 4.0, "grad_norm": 1.923147439956665, "learning_rate": 1.2000082155373382e-07, "loss": 1.1579601764678955, "step": 1896 }, { "epoch": 4.0, "step": 1896, "total_flos": 3.5948540672197263e+18, "train_loss": 1.0285371271618309, "train_runtime": 8697.1879, "train_samples_per_second": 6.54, "train_steps_per_second": 0.218 } ], "logging_steps": 2, "max_steps": 1896, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.5948540672197263e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }