{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9994236311239195, "eval_steps": 500, "global_step": 3903, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001536983669548511, "grad_norm": 25.51269023102587, "learning_rate": 3.5805626598465474e-08, "loss": 1.1796, "step": 2 }, { "epoch": 0.003073967339097022, "grad_norm": 35.249053162120475, "learning_rate": 7.161125319693095e-08, "loss": 1.176, "step": 4 }, { "epoch": 0.004610951008645533, "grad_norm": 40.978012392510365, "learning_rate": 1.0741687979539642e-07, "loss": 1.1898, "step": 6 }, { "epoch": 0.006147934678194044, "grad_norm": 37.791685318747554, "learning_rate": 1.432225063938619e-07, "loss": 1.1456, "step": 8 }, { "epoch": 0.007684918347742555, "grad_norm": 42.35883366195493, "learning_rate": 1.7902813299232735e-07, "loss": 1.2175, "step": 10 }, { "epoch": 0.009221902017291067, "grad_norm": 48.99379222065678, "learning_rate": 2.1483375959079283e-07, "loss": 1.192, "step": 12 }, { "epoch": 0.010758885686839578, "grad_norm": 44.18044117170443, "learning_rate": 2.506393861892583e-07, "loss": 1.1534, "step": 14 }, { "epoch": 0.012295869356388088, "grad_norm": 53.20894364313595, "learning_rate": 2.864450127877238e-07, "loss": 1.1731, "step": 16 }, { "epoch": 0.0138328530259366, "grad_norm": 24.63444222442823, "learning_rate": 3.2225063938618927e-07, "loss": 1.1526, "step": 18 }, { "epoch": 0.01536983669548511, "grad_norm": 26.218955749561335, "learning_rate": 3.580562659846547e-07, "loss": 1.1429, "step": 20 }, { "epoch": 0.01690682036503362, "grad_norm": 24.488025247790667, "learning_rate": 3.938618925831202e-07, "loss": 1.103, "step": 22 }, { "epoch": 0.018443804034582133, "grad_norm": 36.781877179954705, "learning_rate": 4.2966751918158566e-07, "loss": 1.119, "step": 24 }, { "epoch": 0.019980787704130643, "grad_norm": 20.069997876411783, "learning_rate": 4.654731457800512e-07, "loss": 1.0916, "step": 26 }, { "epoch": 0.021517771373679155, "grad_norm": 18.663749580298237, "learning_rate": 5.012787723785166e-07, "loss": 1.0403, "step": 28 }, { "epoch": 0.023054755043227664, "grad_norm": 30.587706942463424, "learning_rate": 5.37084398976982e-07, "loss": 0.9996, "step": 30 }, { "epoch": 0.024591738712776177, "grad_norm": 25.127131893569143, "learning_rate": 5.728900255754476e-07, "loss": 0.9013, "step": 32 }, { "epoch": 0.02612872238232469, "grad_norm": 24.505390983983318, "learning_rate": 6.08695652173913e-07, "loss": 0.8338, "step": 34 }, { "epoch": 0.0276657060518732, "grad_norm": 28.03624180091526, "learning_rate": 6.445012787723785e-07, "loss": 0.7863, "step": 36 }, { "epoch": 0.02920268972142171, "grad_norm": 18.04703186938572, "learning_rate": 6.80306905370844e-07, "loss": 0.8265, "step": 38 }, { "epoch": 0.03073967339097022, "grad_norm": 19.463425683379242, "learning_rate": 7.161125319693094e-07, "loss": 0.775, "step": 40 }, { "epoch": 0.03227665706051873, "grad_norm": 17.677748750715033, "learning_rate": 7.519181585677749e-07, "loss": 0.6703, "step": 42 }, { "epoch": 0.03381364073006724, "grad_norm": 10.700820532554927, "learning_rate": 7.877237851662404e-07, "loss": 0.5609, "step": 44 }, { "epoch": 0.035350624399615754, "grad_norm": 8.475129109313396, "learning_rate": 8.235294117647059e-07, "loss": 0.4812, "step": 46 }, { "epoch": 0.03688760806916427, "grad_norm": 8.038325889019223, "learning_rate": 8.593350383631713e-07, "loss": 0.5001, "step": 48 }, { "epoch": 0.03842459173871278, "grad_norm": 5.218235069976644, "learning_rate": 8.951406649616369e-07, "loss": 0.4479, "step": 50 }, { "epoch": 0.039961575408261285, "grad_norm": 4.835373589811063, "learning_rate": 9.309462915601024e-07, "loss": 0.4323, "step": 52 }, { "epoch": 0.0414985590778098, "grad_norm": 6.306114301931914, "learning_rate": 9.667519181585676e-07, "loss": 0.4035, "step": 54 }, { "epoch": 0.04303554274735831, "grad_norm": 4.0054619844059, "learning_rate": 1.0025575447570332e-06, "loss": 0.3893, "step": 56 }, { "epoch": 0.04457252641690682, "grad_norm": 6.071833866129103, "learning_rate": 1.0383631713554987e-06, "loss": 0.3929, "step": 58 }, { "epoch": 0.04610951008645533, "grad_norm": 6.484184007577541, "learning_rate": 1.074168797953964e-06, "loss": 0.3561, "step": 60 }, { "epoch": 0.04764649375600384, "grad_norm": 3.3031873501656173, "learning_rate": 1.1099744245524297e-06, "loss": 0.3598, "step": 62 }, { "epoch": 0.049183477425552354, "grad_norm": 3.1979226953240896, "learning_rate": 1.1457800511508952e-06, "loss": 0.3505, "step": 64 }, { "epoch": 0.050720461095100866, "grad_norm": 3.2686119354900667, "learning_rate": 1.1815856777493606e-06, "loss": 0.3402, "step": 66 }, { "epoch": 0.05225744476464938, "grad_norm": 3.4755981271030425, "learning_rate": 1.217391304347826e-06, "loss": 0.3354, "step": 68 }, { "epoch": 0.053794428434197884, "grad_norm": 3.7786309539549294, "learning_rate": 1.2531969309462915e-06, "loss": 0.3222, "step": 70 }, { "epoch": 0.0553314121037464, "grad_norm": 3.189215314687861, "learning_rate": 1.289002557544757e-06, "loss": 0.3269, "step": 72 }, { "epoch": 0.05686839577329491, "grad_norm": 3.231319202267947, "learning_rate": 1.3248081841432225e-06, "loss": 0.317, "step": 74 }, { "epoch": 0.05840537944284342, "grad_norm": 3.4059003373513526, "learning_rate": 1.360613810741688e-06, "loss": 0.3116, "step": 76 }, { "epoch": 0.05994236311239193, "grad_norm": 3.118126356582086, "learning_rate": 1.3964194373401534e-06, "loss": 0.3138, "step": 78 }, { "epoch": 0.06147934678194044, "grad_norm": 3.4693555336883417, "learning_rate": 1.4322250639386188e-06, "loss": 0.3019, "step": 80 }, { "epoch": 0.06301633045148895, "grad_norm": 2.970297866234449, "learning_rate": 1.4680306905370844e-06, "loss": 0.293, "step": 82 }, { "epoch": 0.06455331412103746, "grad_norm": 2.9935089318553216, "learning_rate": 1.5038363171355499e-06, "loss": 0.2807, "step": 84 }, { "epoch": 0.06609029779058598, "grad_norm": 2.9320835508202747, "learning_rate": 1.5396419437340153e-06, "loss": 0.2981, "step": 86 }, { "epoch": 0.06762728146013448, "grad_norm": 2.7902350790145958, "learning_rate": 1.5754475703324807e-06, "loss": 0.2882, "step": 88 }, { "epoch": 0.069164265129683, "grad_norm": 2.913110266507068, "learning_rate": 1.6112531969309462e-06, "loss": 0.2963, "step": 90 }, { "epoch": 0.07070124879923151, "grad_norm": 3.0624887135891905, "learning_rate": 1.6470588235294118e-06, "loss": 0.2841, "step": 92 }, { "epoch": 0.07223823246878001, "grad_norm": 2.9694480736928193, "learning_rate": 1.6828644501278772e-06, "loss": 0.2745, "step": 94 }, { "epoch": 0.07377521613832853, "grad_norm": 2.9922295547415168, "learning_rate": 1.7186700767263426e-06, "loss": 0.2702, "step": 96 }, { "epoch": 0.07531219980787704, "grad_norm": 2.909619857546067, "learning_rate": 1.754475703324808e-06, "loss": 0.2731, "step": 98 }, { "epoch": 0.07684918347742556, "grad_norm": 2.894052835979119, "learning_rate": 1.7902813299232737e-06, "loss": 0.28, "step": 100 }, { "epoch": 0.07838616714697406, "grad_norm": 3.0298785356363256, "learning_rate": 1.8260869565217391e-06, "loss": 0.2604, "step": 102 }, { "epoch": 0.07992315081652257, "grad_norm": 2.7510733379017367, "learning_rate": 1.8618925831202048e-06, "loss": 0.2745, "step": 104 }, { "epoch": 0.08146013448607109, "grad_norm": 2.8755196777892267, "learning_rate": 1.89769820971867e-06, "loss": 0.2649, "step": 106 }, { "epoch": 0.0829971181556196, "grad_norm": 2.7823559727995715, "learning_rate": 1.933503836317135e-06, "loss": 0.2617, "step": 108 }, { "epoch": 0.0845341018251681, "grad_norm": 2.777608095721294, "learning_rate": 1.9693094629156013e-06, "loss": 0.2421, "step": 110 }, { "epoch": 0.08607108549471662, "grad_norm": 2.8340487973855075, "learning_rate": 2.0051150895140665e-06, "loss": 0.2725, "step": 112 }, { "epoch": 0.08760806916426513, "grad_norm": 2.7460631775841913, "learning_rate": 2.040920716112532e-06, "loss": 0.2553, "step": 114 }, { "epoch": 0.08914505283381365, "grad_norm": 2.930465300765411, "learning_rate": 2.0767263427109973e-06, "loss": 0.2563, "step": 116 }, { "epoch": 0.09068203650336215, "grad_norm": 2.9408506186054297, "learning_rate": 2.1125319693094626e-06, "loss": 0.2522, "step": 118 }, { "epoch": 0.09221902017291066, "grad_norm": 2.9075494103912516, "learning_rate": 2.148337595907928e-06, "loss": 0.252, "step": 120 }, { "epoch": 0.09375600384245918, "grad_norm": 2.772712298254247, "learning_rate": 2.184143222506394e-06, "loss": 0.2443, "step": 122 }, { "epoch": 0.09529298751200768, "grad_norm": 2.9572827571923654, "learning_rate": 2.2199488491048595e-06, "loss": 0.2371, "step": 124 }, { "epoch": 0.0968299711815562, "grad_norm": 2.939257511565131, "learning_rate": 2.2557544757033247e-06, "loss": 0.2553, "step": 126 }, { "epoch": 0.09836695485110471, "grad_norm": 2.8729850658382055, "learning_rate": 2.2915601023017903e-06, "loss": 0.2468, "step": 128 }, { "epoch": 0.09990393852065321, "grad_norm": 2.7047070980502474, "learning_rate": 2.3273657289002556e-06, "loss": 0.2314, "step": 130 }, { "epoch": 0.10144092219020173, "grad_norm": 2.7576396912344876, "learning_rate": 2.363171355498721e-06, "loss": 0.231, "step": 132 }, { "epoch": 0.10297790585975024, "grad_norm": 2.9300704934262054, "learning_rate": 2.398976982097187e-06, "loss": 0.2386, "step": 134 }, { "epoch": 0.10451488952929876, "grad_norm": 2.630373862177306, "learning_rate": 2.434782608695652e-06, "loss": 0.2259, "step": 136 }, { "epoch": 0.10605187319884726, "grad_norm": 2.7320707506261286, "learning_rate": 2.4705882352941177e-06, "loss": 0.2393, "step": 138 }, { "epoch": 0.10758885686839577, "grad_norm": 2.7386829967371384, "learning_rate": 2.506393861892583e-06, "loss": 0.2391, "step": 140 }, { "epoch": 0.10912584053794429, "grad_norm": 2.5434808295139653, "learning_rate": 2.5421994884910485e-06, "loss": 0.2254, "step": 142 }, { "epoch": 0.1106628242074928, "grad_norm": 2.59627848497483, "learning_rate": 2.578005115089514e-06, "loss": 0.2323, "step": 144 }, { "epoch": 0.11219980787704131, "grad_norm": 2.6102361814153836, "learning_rate": 2.6138107416879794e-06, "loss": 0.2212, "step": 146 }, { "epoch": 0.11373679154658982, "grad_norm": 2.4837198822431157, "learning_rate": 2.649616368286445e-06, "loss": 0.2174, "step": 148 }, { "epoch": 0.11527377521613832, "grad_norm": 2.487984443326589, "learning_rate": 2.6854219948849103e-06, "loss": 0.2171, "step": 150 }, { "epoch": 0.11681075888568684, "grad_norm": 2.491267047584632, "learning_rate": 2.721227621483376e-06, "loss": 0.2303, "step": 152 }, { "epoch": 0.11834774255523535, "grad_norm": 2.2658939217150302, "learning_rate": 2.7570332480818415e-06, "loss": 0.211, "step": 154 }, { "epoch": 0.11988472622478386, "grad_norm": 2.2884352286474727, "learning_rate": 2.7928388746803067e-06, "loss": 0.2155, "step": 156 }, { "epoch": 0.12142170989433237, "grad_norm": 2.175477310058195, "learning_rate": 2.8286445012787724e-06, "loss": 0.2101, "step": 158 }, { "epoch": 0.12295869356388088, "grad_norm": 2.04586024150522, "learning_rate": 2.8644501278772376e-06, "loss": 0.2072, "step": 160 }, { "epoch": 0.1244956772334294, "grad_norm": 1.937221714698271, "learning_rate": 2.9002557544757037e-06, "loss": 0.2128, "step": 162 }, { "epoch": 0.1260326609029779, "grad_norm": 1.9127369194130952, "learning_rate": 2.936061381074169e-06, "loss": 0.2164, "step": 164 }, { "epoch": 0.12756964457252642, "grad_norm": 1.9467054527515646, "learning_rate": 2.971867007672634e-06, "loss": 0.216, "step": 166 }, { "epoch": 0.12910662824207492, "grad_norm": 1.7444615717696514, "learning_rate": 3.0076726342710997e-06, "loss": 0.185, "step": 168 }, { "epoch": 0.13064361191162344, "grad_norm": 1.721455823281585, "learning_rate": 3.043478260869565e-06, "loss": 0.2041, "step": 170 }, { "epoch": 0.13218059558117196, "grad_norm": 1.6944968403589584, "learning_rate": 3.0792838874680306e-06, "loss": 0.2001, "step": 172 }, { "epoch": 0.13371757925072045, "grad_norm": 1.5902843900468329, "learning_rate": 3.1150895140664962e-06, "loss": 0.1969, "step": 174 }, { "epoch": 0.13525456292026897, "grad_norm": 1.564694037006993, "learning_rate": 3.1508951406649614e-06, "loss": 0.1945, "step": 176 }, { "epoch": 0.1367915465898175, "grad_norm": 1.5404270105425055, "learning_rate": 3.186700767263427e-06, "loss": 0.1919, "step": 178 }, { "epoch": 0.138328530259366, "grad_norm": 1.4600068175795704, "learning_rate": 3.2225063938618923e-06, "loss": 0.2144, "step": 180 }, { "epoch": 0.1398655139289145, "grad_norm": 1.4972009484455417, "learning_rate": 3.258312020460358e-06, "loss": 0.2029, "step": 182 }, { "epoch": 0.14140249759846302, "grad_norm": 1.5216950664348157, "learning_rate": 3.2941176470588236e-06, "loss": 0.1997, "step": 184 }, { "epoch": 0.14293948126801154, "grad_norm": 1.4985918972641317, "learning_rate": 3.3299232736572892e-06, "loss": 0.2036, "step": 186 }, { "epoch": 0.14447646493756003, "grad_norm": 1.4780907494983677, "learning_rate": 3.3657289002557544e-06, "loss": 0.1981, "step": 188 }, { "epoch": 0.14601344860710855, "grad_norm": 1.3886567101627987, "learning_rate": 3.4015345268542197e-06, "loss": 0.185, "step": 190 }, { "epoch": 0.14755043227665707, "grad_norm": 1.398832954652927, "learning_rate": 3.4373401534526853e-06, "loss": 0.2029, "step": 192 }, { "epoch": 0.14908741594620556, "grad_norm": 1.4597381544671573, "learning_rate": 3.473145780051151e-06, "loss": 0.1881, "step": 194 }, { "epoch": 0.15062439961575408, "grad_norm": 1.3840318868108508, "learning_rate": 3.508951406649616e-06, "loss": 0.2078, "step": 196 }, { "epoch": 0.1521613832853026, "grad_norm": 1.4070697917261799, "learning_rate": 3.544757033248082e-06, "loss": 0.1974, "step": 198 }, { "epoch": 0.15369836695485112, "grad_norm": 1.361393353227654, "learning_rate": 3.5805626598465474e-06, "loss": 0.197, "step": 200 }, { "epoch": 0.1552353506243996, "grad_norm": 1.434775426937055, "learning_rate": 3.6163682864450126e-06, "loss": 0.2038, "step": 202 }, { "epoch": 0.15677233429394813, "grad_norm": 1.5065947786249383, "learning_rate": 3.6521739130434783e-06, "loss": 0.2037, "step": 204 }, { "epoch": 0.15830931796349665, "grad_norm": 1.3442194224119404, "learning_rate": 3.6879795396419435e-06, "loss": 0.1964, "step": 206 }, { "epoch": 0.15984630163304514, "grad_norm": 1.3670203145371058, "learning_rate": 3.7237851662404096e-06, "loss": 0.195, "step": 208 }, { "epoch": 0.16138328530259366, "grad_norm": 1.4035980721723869, "learning_rate": 3.7595907928388748e-06, "loss": 0.1961, "step": 210 }, { "epoch": 0.16292026897214218, "grad_norm": 1.30606746520172, "learning_rate": 3.79539641943734e-06, "loss": 0.1877, "step": 212 }, { "epoch": 0.16445725264169067, "grad_norm": 1.4103433650854988, "learning_rate": 3.831202046035806e-06, "loss": 0.1941, "step": 214 }, { "epoch": 0.1659942363112392, "grad_norm": 1.394026552172561, "learning_rate": 3.86700767263427e-06, "loss": 0.1922, "step": 216 }, { "epoch": 0.1675312199807877, "grad_norm": 1.3829946302657223, "learning_rate": 3.902813299232737e-06, "loss": 0.2048, "step": 218 }, { "epoch": 0.1690682036503362, "grad_norm": 1.3276260104745685, "learning_rate": 3.9386189258312025e-06, "loss": 0.1892, "step": 220 }, { "epoch": 0.17060518731988472, "grad_norm": 1.3432125610119714, "learning_rate": 3.974424552429667e-06, "loss": 0.1892, "step": 222 }, { "epoch": 0.17214217098943324, "grad_norm": 1.3597738268768664, "learning_rate": 4.010230179028133e-06, "loss": 0.1933, "step": 224 }, { "epoch": 0.17367915465898176, "grad_norm": 1.3784514898304991, "learning_rate": 4.046035805626598e-06, "loss": 0.187, "step": 226 }, { "epoch": 0.17521613832853025, "grad_norm": 1.283311780182546, "learning_rate": 4.081841432225064e-06, "loss": 0.1889, "step": 228 }, { "epoch": 0.17675312199807877, "grad_norm": 1.3048663711388646, "learning_rate": 4.11764705882353e-06, "loss": 0.1958, "step": 230 }, { "epoch": 0.1782901056676273, "grad_norm": 1.3644400089186173, "learning_rate": 4.153452685421995e-06, "loss": 0.1939, "step": 232 }, { "epoch": 0.17982708933717578, "grad_norm": 1.2592600425837408, "learning_rate": 4.18925831202046e-06, "loss": 0.2031, "step": 234 }, { "epoch": 0.1813640730067243, "grad_norm": 30.200615105516786, "learning_rate": 4.225063938618925e-06, "loss": 0.1891, "step": 236 }, { "epoch": 0.18290105667627282, "grad_norm": 1.4965751336406534, "learning_rate": 4.260869565217392e-06, "loss": 0.1878, "step": 238 }, { "epoch": 0.1844380403458213, "grad_norm": 5.563286529626895, "learning_rate": 4.296675191815856e-06, "loss": 0.1982, "step": 240 }, { "epoch": 0.18597502401536983, "grad_norm": 1.4805149402757245, "learning_rate": 4.332480818414322e-06, "loss": 0.1942, "step": 242 }, { "epoch": 0.18751200768491835, "grad_norm": 1.6629859442097377, "learning_rate": 4.368286445012788e-06, "loss": 0.2067, "step": 244 }, { "epoch": 0.18904899135446687, "grad_norm": 9.155421397230691, "learning_rate": 4.4040920716112525e-06, "loss": 0.1872, "step": 246 }, { "epoch": 0.19058597502401536, "grad_norm": 1.32114090472292, "learning_rate": 4.439897698209719e-06, "loss": 0.1844, "step": 248 }, { "epoch": 0.19212295869356388, "grad_norm": 1.2320314387569729, "learning_rate": 4.475703324808184e-06, "loss": 0.1791, "step": 250 }, { "epoch": 0.1936599423631124, "grad_norm": 1.2291776990139922, "learning_rate": 4.511508951406649e-06, "loss": 0.1799, "step": 252 }, { "epoch": 0.1951969260326609, "grad_norm": 1.270813260985118, "learning_rate": 4.547314578005115e-06, "loss": 0.1877, "step": 254 }, { "epoch": 0.19673390970220941, "grad_norm": 1.3202011345679392, "learning_rate": 4.583120204603581e-06, "loss": 0.2013, "step": 256 }, { "epoch": 0.19827089337175793, "grad_norm": 1.3290195976171792, "learning_rate": 4.618925831202046e-06, "loss": 0.1915, "step": 258 }, { "epoch": 0.19980787704130643, "grad_norm": 1.2091766956810812, "learning_rate": 4.654731457800511e-06, "loss": 0.1847, "step": 260 }, { "epoch": 0.20134486071085494, "grad_norm": 1.3107091320741244, "learning_rate": 4.690537084398977e-06, "loss": 0.1844, "step": 262 }, { "epoch": 0.20288184438040346, "grad_norm": 1.448237078882646, "learning_rate": 4.726342710997442e-06, "loss": 0.1948, "step": 264 }, { "epoch": 0.20441882804995196, "grad_norm": 1.2336655516702122, "learning_rate": 4.762148337595908e-06, "loss": 0.1949, "step": 266 }, { "epoch": 0.20595581171950048, "grad_norm": 1.3658099392493064, "learning_rate": 4.797953964194374e-06, "loss": 0.1804, "step": 268 }, { "epoch": 0.207492795389049, "grad_norm": 1.373403969669977, "learning_rate": 4.8337595907928385e-06, "loss": 0.1905, "step": 270 }, { "epoch": 0.20902977905859751, "grad_norm": 1.277467955296089, "learning_rate": 4.869565217391304e-06, "loss": 0.1987, "step": 272 }, { "epoch": 0.210566762728146, "grad_norm": 1.3159132642953701, "learning_rate": 4.90537084398977e-06, "loss": 0.1957, "step": 274 }, { "epoch": 0.21210374639769453, "grad_norm": 1.278799705973987, "learning_rate": 4.941176470588235e-06, "loss": 0.1903, "step": 276 }, { "epoch": 0.21364073006724305, "grad_norm": 1.363154755717981, "learning_rate": 4.976982097186701e-06, "loss": 0.181, "step": 278 }, { "epoch": 0.21517771373679154, "grad_norm": 1.226652497702008, "learning_rate": 5.012787723785166e-06, "loss": 0.185, "step": 280 }, { "epoch": 0.21671469740634006, "grad_norm": 1.2660471639800883, "learning_rate": 5.0485933503836314e-06, "loss": 0.184, "step": 282 }, { "epoch": 0.21825168107588858, "grad_norm": 1.204935258599295, "learning_rate": 5.084398976982097e-06, "loss": 0.1816, "step": 284 }, { "epoch": 0.21978866474543707, "grad_norm": 1.2808963980429877, "learning_rate": 5.120204603580563e-06, "loss": 0.1911, "step": 286 }, { "epoch": 0.2213256484149856, "grad_norm": 1.194046158945937, "learning_rate": 5.156010230179028e-06, "loss": 0.1838, "step": 288 }, { "epoch": 0.2228626320845341, "grad_norm": 1.3438720779184878, "learning_rate": 5.191815856777493e-06, "loss": 0.1925, "step": 290 }, { "epoch": 0.22439961575408263, "grad_norm": 1.1388013545101945, "learning_rate": 5.227621483375959e-06, "loss": 0.1732, "step": 292 }, { "epoch": 0.22593659942363112, "grad_norm": 1.151418121106973, "learning_rate": 5.2634271099744244e-06, "loss": 0.1936, "step": 294 }, { "epoch": 0.22747358309317964, "grad_norm": 1.205495348613184, "learning_rate": 5.29923273657289e-06, "loss": 0.1752, "step": 296 }, { "epoch": 0.22901056676272816, "grad_norm": 1.2040139295608776, "learning_rate": 5.335038363171356e-06, "loss": 0.1873, "step": 298 }, { "epoch": 0.23054755043227665, "grad_norm": 1.2348832685626474, "learning_rate": 5.3708439897698205e-06, "loss": 0.1898, "step": 300 }, { "epoch": 0.23208453410182517, "grad_norm": 1.1372824585247818, "learning_rate": 5.406649616368286e-06, "loss": 0.1757, "step": 302 }, { "epoch": 0.2336215177713737, "grad_norm": 1.231905059549878, "learning_rate": 5.442455242966752e-06, "loss": 0.1952, "step": 304 }, { "epoch": 0.23515850144092218, "grad_norm": 1.1803349744715428, "learning_rate": 5.478260869565217e-06, "loss": 0.2001, "step": 306 }, { "epoch": 0.2366954851104707, "grad_norm": 1.277265628143273, "learning_rate": 5.514066496163683e-06, "loss": 0.2008, "step": 308 }, { "epoch": 0.23823246878001922, "grad_norm": 1.3927848001575907, "learning_rate": 5.549872122762148e-06, "loss": 0.1963, "step": 310 }, { "epoch": 0.2397694524495677, "grad_norm": 1.2403730899787904, "learning_rate": 5.5856777493606135e-06, "loss": 0.1961, "step": 312 }, { "epoch": 0.24130643611911623, "grad_norm": 1.163059501211908, "learning_rate": 5.62148337595908e-06, "loss": 0.175, "step": 314 }, { "epoch": 0.24284341978866475, "grad_norm": 1.1639642316766559, "learning_rate": 5.657289002557545e-06, "loss": 0.1886, "step": 316 }, { "epoch": 0.24438040345821327, "grad_norm": 1.2382118437766167, "learning_rate": 5.69309462915601e-06, "loss": 0.1973, "step": 318 }, { "epoch": 0.24591738712776176, "grad_norm": 1.2074114040937993, "learning_rate": 5.728900255754475e-06, "loss": 0.1837, "step": 320 }, { "epoch": 0.24745437079731028, "grad_norm": 1.295214887261411, "learning_rate": 5.764705882352941e-06, "loss": 0.1979, "step": 322 }, { "epoch": 0.2489913544668588, "grad_norm": 1.2581192996807238, "learning_rate": 5.800511508951407e-06, "loss": 0.1789, "step": 324 }, { "epoch": 0.2505283381364073, "grad_norm": 1.254789252666287, "learning_rate": 5.836317135549872e-06, "loss": 0.1941, "step": 326 }, { "epoch": 0.2520653218059558, "grad_norm": 1.251310029743626, "learning_rate": 5.872122762148338e-06, "loss": 0.189, "step": 328 }, { "epoch": 0.25360230547550433, "grad_norm": 1.1550514720366614, "learning_rate": 5.9079283887468026e-06, "loss": 0.1968, "step": 330 }, { "epoch": 0.25513928914505285, "grad_norm": 1.1535871144055385, "learning_rate": 5.943734015345268e-06, "loss": 0.1844, "step": 332 }, { "epoch": 0.25667627281460137, "grad_norm": 1.2208131864363314, "learning_rate": 5.979539641943734e-06, "loss": 0.1919, "step": 334 }, { "epoch": 0.25821325648414983, "grad_norm": 1.1683405652224148, "learning_rate": 6.0153452685421995e-06, "loss": 0.1759, "step": 336 }, { "epoch": 0.25975024015369835, "grad_norm": 1.1952164336575553, "learning_rate": 6.051150895140665e-06, "loss": 0.1795, "step": 338 }, { "epoch": 0.2612872238232469, "grad_norm": 1.1820897323143373, "learning_rate": 6.08695652173913e-06, "loss": 0.19, "step": 340 }, { "epoch": 0.2628242074927954, "grad_norm": 1.1800564956613375, "learning_rate": 6.1227621483375955e-06, "loss": 0.1978, "step": 342 }, { "epoch": 0.2643611911623439, "grad_norm": 1.1467821141624814, "learning_rate": 6.158567774936061e-06, "loss": 0.1911, "step": 344 }, { "epoch": 0.26589817483189243, "grad_norm": 1.1627264267097417, "learning_rate": 6.194373401534527e-06, "loss": 0.1921, "step": 346 }, { "epoch": 0.2674351585014409, "grad_norm": 1.195438253587923, "learning_rate": 6.2301790281329925e-06, "loss": 0.1898, "step": 348 }, { "epoch": 0.2689721421709894, "grad_norm": 1.1780466341960676, "learning_rate": 6.265984654731457e-06, "loss": 0.1792, "step": 350 }, { "epoch": 0.27050912584053793, "grad_norm": 1.1731330647661304, "learning_rate": 6.301790281329923e-06, "loss": 0.1955, "step": 352 }, { "epoch": 0.27204610951008645, "grad_norm": 1.1745568091917384, "learning_rate": 6.3375959079283885e-06, "loss": 0.1852, "step": 354 }, { "epoch": 0.273583093179635, "grad_norm": 1.1783143021627482, "learning_rate": 6.373401534526854e-06, "loss": 0.1845, "step": 356 }, { "epoch": 0.2751200768491835, "grad_norm": 1.1827170581568967, "learning_rate": 6.40920716112532e-06, "loss": 0.201, "step": 358 }, { "epoch": 0.276657060518732, "grad_norm": 1.1115484126704642, "learning_rate": 6.445012787723785e-06, "loss": 0.17, "step": 360 }, { "epoch": 0.2781940441882805, "grad_norm": 1.0326502834598414, "learning_rate": 6.48081841432225e-06, "loss": 0.186, "step": 362 }, { "epoch": 0.279731027857829, "grad_norm": 1.259244016304707, "learning_rate": 6.516624040920716e-06, "loss": 0.1885, "step": 364 }, { "epoch": 0.2812680115273775, "grad_norm": 1.1678776079796356, "learning_rate": 6.5524296675191815e-06, "loss": 0.1921, "step": 366 }, { "epoch": 0.28280499519692603, "grad_norm": 1.0738638308920176, "learning_rate": 6.588235294117647e-06, "loss": 0.1707, "step": 368 }, { "epoch": 0.28434197886647455, "grad_norm": 1.1594023002547749, "learning_rate": 6.624040920716112e-06, "loss": 0.1903, "step": 370 }, { "epoch": 0.2858789625360231, "grad_norm": 1.1471065906349844, "learning_rate": 6.6598465473145784e-06, "loss": 0.194, "step": 372 }, { "epoch": 0.2874159462055716, "grad_norm": 1.1901453575596241, "learning_rate": 6.695652173913043e-06, "loss": 0.1854, "step": 374 }, { "epoch": 0.28895292987512006, "grad_norm": 1.1928859018793583, "learning_rate": 6.731457800511509e-06, "loss": 0.1906, "step": 376 }, { "epoch": 0.2904899135446686, "grad_norm": 1.2778835898977514, "learning_rate": 6.7672634271099745e-06, "loss": 0.1871, "step": 378 }, { "epoch": 0.2920268972142171, "grad_norm": 1.2992357301881277, "learning_rate": 6.803069053708439e-06, "loss": 0.2, "step": 380 }, { "epoch": 0.2935638808837656, "grad_norm": 1.0621001143169835, "learning_rate": 6.838874680306906e-06, "loss": 0.1735, "step": 382 }, { "epoch": 0.29510086455331414, "grad_norm": 1.1201679749220157, "learning_rate": 6.874680306905371e-06, "loss": 0.1736, "step": 384 }, { "epoch": 0.29663784822286265, "grad_norm": 1.1884134035644247, "learning_rate": 6.910485933503836e-06, "loss": 0.1765, "step": 386 }, { "epoch": 0.2981748318924111, "grad_norm": 1.0331590850791008, "learning_rate": 6.946291560102302e-06, "loss": 0.1751, "step": 388 }, { "epoch": 0.29971181556195964, "grad_norm": 1.0474474582479285, "learning_rate": 6.982097186700767e-06, "loss": 0.1899, "step": 390 }, { "epoch": 0.30124879923150816, "grad_norm": 1.1045197145968615, "learning_rate": 6.999998599675296e-06, "loss": 0.1873, "step": 392 }, { "epoch": 0.3027857829010567, "grad_norm": 1.0461247560001137, "learning_rate": 6.999987397084384e-06, "loss": 0.1844, "step": 394 }, { "epoch": 0.3043227665706052, "grad_norm": 1.074110109904979, "learning_rate": 6.999964991938417e-06, "loss": 0.1867, "step": 396 }, { "epoch": 0.3058597502401537, "grad_norm": 1.1795081212239336, "learning_rate": 6.999931384309108e-06, "loss": 0.1927, "step": 398 }, { "epoch": 0.30739673390970224, "grad_norm": 1.0767670064958226, "learning_rate": 6.999886574304027e-06, "loss": 0.1756, "step": 400 }, { "epoch": 0.3089337175792507, "grad_norm": 1.1180599613087716, "learning_rate": 6.999830562066599e-06, "loss": 0.1777, "step": 402 }, { "epoch": 0.3104707012487992, "grad_norm": 1.1283551526882591, "learning_rate": 6.999763347776102e-06, "loss": 0.1806, "step": 404 }, { "epoch": 0.31200768491834774, "grad_norm": 1.2128389787316596, "learning_rate": 6.999684931647677e-06, "loss": 0.1989, "step": 406 }, { "epoch": 0.31354466858789626, "grad_norm": 1.0231317030587057, "learning_rate": 6.999595313932308e-06, "loss": 0.1737, "step": 408 }, { "epoch": 0.3150816522574448, "grad_norm": 1.2500344807446935, "learning_rate": 6.999494494916842e-06, "loss": 0.1906, "step": 410 }, { "epoch": 0.3166186359269933, "grad_norm": 1.1142795967528607, "learning_rate": 6.999382474923973e-06, "loss": 0.1754, "step": 412 }, { "epoch": 0.31815561959654176, "grad_norm": 1.0688175628422532, "learning_rate": 6.999259254312248e-06, "loss": 0.1738, "step": 414 }, { "epoch": 0.3196926032660903, "grad_norm": 1.075624317236362, "learning_rate": 6.999124833476066e-06, "loss": 0.1785, "step": 416 }, { "epoch": 0.3212295869356388, "grad_norm": 1.0898379845370674, "learning_rate": 6.9989792128456716e-06, "loss": 0.1992, "step": 418 }, { "epoch": 0.3227665706051873, "grad_norm": 1.027575992474251, "learning_rate": 6.998822392887159e-06, "loss": 0.1773, "step": 420 }, { "epoch": 0.32430355427473584, "grad_norm": 1.1417631076293755, "learning_rate": 6.9986543741024684e-06, "loss": 0.1905, "step": 422 }, { "epoch": 0.32584053794428436, "grad_norm": 1.0666179275230614, "learning_rate": 6.998475157029385e-06, "loss": 0.1771, "step": 424 }, { "epoch": 0.3273775216138329, "grad_norm": 1.178184842738573, "learning_rate": 6.998284742241536e-06, "loss": 0.2027, "step": 426 }, { "epoch": 0.32891450528338134, "grad_norm": 1.1389042968070178, "learning_rate": 6.99808313034839e-06, "loss": 0.1761, "step": 428 }, { "epoch": 0.33045148895292986, "grad_norm": 1.1267197030584097, "learning_rate": 6.997870321995255e-06, "loss": 0.1907, "step": 430 }, { "epoch": 0.3319884726224784, "grad_norm": 1.07793801111443, "learning_rate": 6.9976463178632756e-06, "loss": 0.1826, "step": 432 }, { "epoch": 0.3335254562920269, "grad_norm": 1.0872443642223155, "learning_rate": 6.99741111866943e-06, "loss": 0.1913, "step": 434 }, { "epoch": 0.3350624399615754, "grad_norm": 1.1636846292387626, "learning_rate": 6.997164725166531e-06, "loss": 0.1891, "step": 436 }, { "epoch": 0.33659942363112394, "grad_norm": 1.1189017028456263, "learning_rate": 6.996907138143219e-06, "loss": 0.178, "step": 438 }, { "epoch": 0.3381364073006724, "grad_norm": 1.0292145655593512, "learning_rate": 6.996638358423965e-06, "loss": 0.1803, "step": 440 }, { "epoch": 0.3396733909702209, "grad_norm": 1.0390558930376024, "learning_rate": 6.996358386869064e-06, "loss": 0.1881, "step": 442 }, { "epoch": 0.34121037463976944, "grad_norm": 1.1559870900026559, "learning_rate": 6.996067224374631e-06, "loss": 0.1825, "step": 444 }, { "epoch": 0.34274735830931796, "grad_norm": 1.1284033103165994, "learning_rate": 6.995764871872603e-06, "loss": 0.1985, "step": 446 }, { "epoch": 0.3442843419788665, "grad_norm": 1.0182449603206558, "learning_rate": 6.995451330330732e-06, "loss": 0.1963, "step": 448 }, { "epoch": 0.345821325648415, "grad_norm": 1.0080327819510069, "learning_rate": 6.995126600752583e-06, "loss": 0.189, "step": 450 }, { "epoch": 0.3473583093179635, "grad_norm": 1.0876359081678704, "learning_rate": 6.994790684177531e-06, "loss": 0.1672, "step": 452 }, { "epoch": 0.348895292987512, "grad_norm": 1.0950900793628788, "learning_rate": 6.99444358168076e-06, "loss": 0.196, "step": 454 }, { "epoch": 0.3504322766570605, "grad_norm": 1.0087724624388277, "learning_rate": 6.9940852943732534e-06, "loss": 0.1779, "step": 456 }, { "epoch": 0.351969260326609, "grad_norm": 1.0777984809693466, "learning_rate": 6.993715823401798e-06, "loss": 0.1867, "step": 458 }, { "epoch": 0.35350624399615754, "grad_norm": 1.0121221411630168, "learning_rate": 6.993335169948972e-06, "loss": 0.1715, "step": 460 }, { "epoch": 0.35504322766570606, "grad_norm": 1.011168047129122, "learning_rate": 6.992943335233152e-06, "loss": 0.1707, "step": 462 }, { "epoch": 0.3565802113352546, "grad_norm": 1.0120465143177686, "learning_rate": 6.992540320508498e-06, "loss": 0.1875, "step": 464 }, { "epoch": 0.3581171950048031, "grad_norm": 0.9550620727028469, "learning_rate": 6.992126127064956e-06, "loss": 0.1635, "step": 466 }, { "epoch": 0.35965417867435157, "grad_norm": 1.1350686459542934, "learning_rate": 6.9917007562282535e-06, "loss": 0.1751, "step": 468 }, { "epoch": 0.3611911623439001, "grad_norm": 1.1572256160083796, "learning_rate": 6.991264209359891e-06, "loss": 0.1883, "step": 470 }, { "epoch": 0.3627281460134486, "grad_norm": 1.021485377857385, "learning_rate": 6.9908164878571425e-06, "loss": 0.1798, "step": 472 }, { "epoch": 0.3642651296829971, "grad_norm": 0.9632560183914399, "learning_rate": 6.99035759315305e-06, "loss": 0.1655, "step": 474 }, { "epoch": 0.36580211335254564, "grad_norm": 1.0013270641998204, "learning_rate": 6.989887526716415e-06, "loss": 0.1927, "step": 476 }, { "epoch": 0.36733909702209416, "grad_norm": 0.9745686321374288, "learning_rate": 6.9894062900517996e-06, "loss": 0.1674, "step": 478 }, { "epoch": 0.3688760806916426, "grad_norm": 1.0407042923473835, "learning_rate": 6.988913884699518e-06, "loss": 0.1925, "step": 480 }, { "epoch": 0.37041306436119115, "grad_norm": 1.0062661091753247, "learning_rate": 6.988410312235632e-06, "loss": 0.1965, "step": 482 }, { "epoch": 0.37195004803073967, "grad_norm": 1.1013102934828658, "learning_rate": 6.987895574271948e-06, "loss": 0.1998, "step": 484 }, { "epoch": 0.3734870317002882, "grad_norm": 1.0597643936100356, "learning_rate": 6.987369672456009e-06, "loss": 0.1772, "step": 486 }, { "epoch": 0.3750240153698367, "grad_norm": 0.9866253310752731, "learning_rate": 6.986832608471089e-06, "loss": 0.1673, "step": 488 }, { "epoch": 0.3765609990393852, "grad_norm": 1.0602452001077722, "learning_rate": 6.986284384036193e-06, "loss": 0.1739, "step": 490 }, { "epoch": 0.37809798270893374, "grad_norm": 1.041644897151479, "learning_rate": 6.985725000906045e-06, "loss": 0.1695, "step": 492 }, { "epoch": 0.3796349663784822, "grad_norm": 0.952300652923444, "learning_rate": 6.985154460871086e-06, "loss": 0.1801, "step": 494 }, { "epoch": 0.3811719500480307, "grad_norm": 0.93795302994956, "learning_rate": 6.984572765757467e-06, "loss": 0.1691, "step": 496 }, { "epoch": 0.38270893371757925, "grad_norm": 0.9322450383167038, "learning_rate": 6.983979917427043e-06, "loss": 0.1789, "step": 498 }, { "epoch": 0.38424591738712777, "grad_norm": 1.0051761272259416, "learning_rate": 6.98337591777737e-06, "loss": 0.1713, "step": 500 }, { "epoch": 0.38424591738712777, "eval_loss": 0.16142325103282928, "eval_runtime": 365.2501, "eval_samples_per_second": 50.664, "eval_steps_per_second": 6.335, "step": 500 }, { "epoch": 0.3857829010566763, "grad_norm": 1.008256178397373, "learning_rate": 6.982760768741694e-06, "loss": 0.178, "step": 502 }, { "epoch": 0.3873198847262248, "grad_norm": 0.9324814287025068, "learning_rate": 6.982134472288947e-06, "loss": 0.1633, "step": 504 }, { "epoch": 0.38885686839577327, "grad_norm": 1.0289506080987343, "learning_rate": 6.981497030423744e-06, "loss": 0.1805, "step": 506 }, { "epoch": 0.3903938520653218, "grad_norm": 0.9347810671889286, "learning_rate": 6.980848445186369e-06, "loss": 0.1727, "step": 508 }, { "epoch": 0.3919308357348703, "grad_norm": 1.052017862885602, "learning_rate": 6.980188718652778e-06, "loss": 0.1797, "step": 510 }, { "epoch": 0.39346781940441883, "grad_norm": 1.056395915175445, "learning_rate": 6.9795178529345855e-06, "loss": 0.186, "step": 512 }, { "epoch": 0.39500480307396735, "grad_norm": 1.0283503364288544, "learning_rate": 6.978835850179057e-06, "loss": 0.1844, "step": 514 }, { "epoch": 0.39654178674351587, "grad_norm": 0.9433928178661478, "learning_rate": 6.978142712569109e-06, "loss": 0.1719, "step": 516 }, { "epoch": 0.3980787704130644, "grad_norm": 0.9623302440193999, "learning_rate": 6.9774384423232945e-06, "loss": 0.1613, "step": 518 }, { "epoch": 0.39961575408261285, "grad_norm": 1.0414121844166375, "learning_rate": 6.976723041695802e-06, "loss": 0.1841, "step": 520 }, { "epoch": 0.40115273775216137, "grad_norm": 1.0155405045344752, "learning_rate": 6.9759965129764425e-06, "loss": 0.1706, "step": 522 }, { "epoch": 0.4026897214217099, "grad_norm": 1.0347436462429636, "learning_rate": 6.975258858490648e-06, "loss": 0.1898, "step": 524 }, { "epoch": 0.4042267050912584, "grad_norm": 0.9905491545072838, "learning_rate": 6.974510080599458e-06, "loss": 0.1701, "step": 526 }, { "epoch": 0.40576368876080693, "grad_norm": 1.0065784769120174, "learning_rate": 6.973750181699518e-06, "loss": 0.173, "step": 528 }, { "epoch": 0.40730067243035545, "grad_norm": 1.0028920218026682, "learning_rate": 6.972979164223069e-06, "loss": 0.1815, "step": 530 }, { "epoch": 0.4088376560999039, "grad_norm": 0.9926648764778461, "learning_rate": 6.972197030637938e-06, "loss": 0.1701, "step": 532 }, { "epoch": 0.41037463976945243, "grad_norm": 0.9362580434331931, "learning_rate": 6.971403783447532e-06, "loss": 0.162, "step": 534 }, { "epoch": 0.41191162343900095, "grad_norm": 1.0313345974784995, "learning_rate": 6.97059942519083e-06, "loss": 0.1793, "step": 536 }, { "epoch": 0.41344860710854947, "grad_norm": 1.053038519408416, "learning_rate": 6.969783958442376e-06, "loss": 0.1744, "step": 538 }, { "epoch": 0.414985590778098, "grad_norm": 1.09503010697989, "learning_rate": 6.968957385812268e-06, "loss": 0.1837, "step": 540 }, { "epoch": 0.4165225744476465, "grad_norm": 0.9963387156020179, "learning_rate": 6.968119709946151e-06, "loss": 0.1824, "step": 542 }, { "epoch": 0.41805955811719503, "grad_norm": 0.9881607706879252, "learning_rate": 6.9672709335252075e-06, "loss": 0.1864, "step": 544 }, { "epoch": 0.4195965417867435, "grad_norm": 0.9849697365361947, "learning_rate": 6.966411059266153e-06, "loss": 0.1676, "step": 546 }, { "epoch": 0.421133525456292, "grad_norm": 0.99285568761607, "learning_rate": 6.965540089921224e-06, "loss": 0.1788, "step": 548 }, { "epoch": 0.42267050912584053, "grad_norm": 1.0020197625846368, "learning_rate": 6.964658028278167e-06, "loss": 0.1717, "step": 550 }, { "epoch": 0.42420749279538905, "grad_norm": 1.0446372613390171, "learning_rate": 6.963764877160232e-06, "loss": 0.1724, "step": 552 }, { "epoch": 0.42574447646493757, "grad_norm": 1.0489345870285778, "learning_rate": 6.962860639426168e-06, "loss": 0.1751, "step": 554 }, { "epoch": 0.4272814601344861, "grad_norm": 0.9913965985343988, "learning_rate": 6.9619453179702036e-06, "loss": 0.1753, "step": 556 }, { "epoch": 0.42881844380403455, "grad_norm": 0.9746372697415868, "learning_rate": 6.9610189157220465e-06, "loss": 0.1849, "step": 558 }, { "epoch": 0.4303554274735831, "grad_norm": 1.0035507982106044, "learning_rate": 6.960081435646872e-06, "loss": 0.1887, "step": 560 }, { "epoch": 0.4318924111431316, "grad_norm": 1.0189535936717342, "learning_rate": 6.95913288074531e-06, "loss": 0.1761, "step": 562 }, { "epoch": 0.4334293948126801, "grad_norm": 1.0604859650996308, "learning_rate": 6.958173254053442e-06, "loss": 0.1698, "step": 564 }, { "epoch": 0.43496637848222863, "grad_norm": 1.04513958829485, "learning_rate": 6.957202558642782e-06, "loss": 0.1842, "step": 566 }, { "epoch": 0.43650336215177715, "grad_norm": 0.9420876517823673, "learning_rate": 6.9562207976202775e-06, "loss": 0.1614, "step": 568 }, { "epoch": 0.43804034582132567, "grad_norm": 1.0001567101709503, "learning_rate": 6.9552279741282916e-06, "loss": 0.1692, "step": 570 }, { "epoch": 0.43957732949087414, "grad_norm": 1.0163931461175468, "learning_rate": 6.954224091344593e-06, "loss": 0.1687, "step": 572 }, { "epoch": 0.44111431316042266, "grad_norm": 0.98643009928601, "learning_rate": 6.953209152482355e-06, "loss": 0.1853, "step": 574 }, { "epoch": 0.4426512968299712, "grad_norm": 0.9353513380661882, "learning_rate": 6.952183160790133e-06, "loss": 0.1675, "step": 576 }, { "epoch": 0.4441882804995197, "grad_norm": 0.9020207899109111, "learning_rate": 6.951146119551859e-06, "loss": 0.1679, "step": 578 }, { "epoch": 0.4457252641690682, "grad_norm": 0.9841310128546708, "learning_rate": 6.950098032086837e-06, "loss": 0.1694, "step": 580 }, { "epoch": 0.44726224783861673, "grad_norm": 1.0255232655982072, "learning_rate": 6.949038901749723e-06, "loss": 0.1736, "step": 582 }, { "epoch": 0.44879923150816525, "grad_norm": 0.9032841104351299, "learning_rate": 6.947968731930519e-06, "loss": 0.163, "step": 584 }, { "epoch": 0.4503362151777137, "grad_norm": 0.9005602498337584, "learning_rate": 6.946887526054563e-06, "loss": 0.1593, "step": 586 }, { "epoch": 0.45187319884726224, "grad_norm": 0.9439866081688895, "learning_rate": 6.945795287582514e-06, "loss": 0.172, "step": 588 }, { "epoch": 0.45341018251681076, "grad_norm": 1.015175377754709, "learning_rate": 6.9446920200103465e-06, "loss": 0.1658, "step": 590 }, { "epoch": 0.4549471661863593, "grad_norm": 0.9730430866278983, "learning_rate": 6.943577726869334e-06, "loss": 0.1607, "step": 592 }, { "epoch": 0.4564841498559078, "grad_norm": 0.9619897655090097, "learning_rate": 6.942452411726042e-06, "loss": 0.1799, "step": 594 }, { "epoch": 0.4580211335254563, "grad_norm": 0.9934784984855407, "learning_rate": 6.941316078182312e-06, "loss": 0.1692, "step": 596 }, { "epoch": 0.4595581171950048, "grad_norm": 0.9337603705095543, "learning_rate": 6.940168729875255e-06, "loss": 0.1787, "step": 598 }, { "epoch": 0.4610951008645533, "grad_norm": 0.9057632316451069, "learning_rate": 6.939010370477235e-06, "loss": 0.1713, "step": 600 }, { "epoch": 0.4626320845341018, "grad_norm": 0.989678630378062, "learning_rate": 6.9378410036958635e-06, "loss": 0.1761, "step": 602 }, { "epoch": 0.46416906820365034, "grad_norm": 0.9385046490524968, "learning_rate": 6.936660633273979e-06, "loss": 0.1745, "step": 604 }, { "epoch": 0.46570605187319886, "grad_norm": 0.955462629075025, "learning_rate": 6.935469262989644e-06, "loss": 0.1711, "step": 606 }, { "epoch": 0.4672430355427474, "grad_norm": 0.8795030512447493, "learning_rate": 6.9342668966561245e-06, "loss": 0.169, "step": 608 }, { "epoch": 0.4687800192122959, "grad_norm": 0.9495870025972418, "learning_rate": 6.933053538121886e-06, "loss": 0.1682, "step": 610 }, { "epoch": 0.47031700288184436, "grad_norm": 0.9330347326137807, "learning_rate": 6.931829191270576e-06, "loss": 0.1662, "step": 612 }, { "epoch": 0.4718539865513929, "grad_norm": 0.9017675551372555, "learning_rate": 6.930593860021012e-06, "loss": 0.1628, "step": 614 }, { "epoch": 0.4733909702209414, "grad_norm": 0.9466561162964194, "learning_rate": 6.929347548327168e-06, "loss": 0.1807, "step": 616 }, { "epoch": 0.4749279538904899, "grad_norm": 0.9881577337000146, "learning_rate": 6.928090260178169e-06, "loss": 0.1872, "step": 618 }, { "epoch": 0.47646493756003844, "grad_norm": 0.9646614031070989, "learning_rate": 6.926821999598266e-06, "loss": 0.1709, "step": 620 }, { "epoch": 0.47800192122958696, "grad_norm": 0.9609197980971407, "learning_rate": 6.9255427706468375e-06, "loss": 0.1774, "step": 622 }, { "epoch": 0.4795389048991354, "grad_norm": 0.9541380042708364, "learning_rate": 6.92425257741836e-06, "loss": 0.1777, "step": 624 }, { "epoch": 0.48107588856868394, "grad_norm": 0.898383778954144, "learning_rate": 6.922951424042412e-06, "loss": 0.1691, "step": 626 }, { "epoch": 0.48261287223823246, "grad_norm": 0.960885030497367, "learning_rate": 6.921639314683648e-06, "loss": 0.1576, "step": 628 }, { "epoch": 0.484149855907781, "grad_norm": 0.9454433168177289, "learning_rate": 6.92031625354179e-06, "loss": 0.1796, "step": 630 }, { "epoch": 0.4856868395773295, "grad_norm": 0.9852240424660407, "learning_rate": 6.918982244851616e-06, "loss": 0.171, "step": 632 }, { "epoch": 0.487223823246878, "grad_norm": 1.0414345737302746, "learning_rate": 6.917637292882944e-06, "loss": 0.1776, "step": 634 }, { "epoch": 0.48876080691642654, "grad_norm": 0.9675949339406987, "learning_rate": 6.916281401940615e-06, "loss": 0.1806, "step": 636 }, { "epoch": 0.490297790585975, "grad_norm": 0.982314844842542, "learning_rate": 6.914914576364487e-06, "loss": 0.1785, "step": 638 }, { "epoch": 0.4918347742555235, "grad_norm": 0.9775652590949767, "learning_rate": 6.913536820529416e-06, "loss": 0.1751, "step": 640 }, { "epoch": 0.49337175792507204, "grad_norm": 0.8855730733866225, "learning_rate": 6.912148138845241e-06, "loss": 0.1659, "step": 642 }, { "epoch": 0.49490874159462056, "grad_norm": 1.0573307642737217, "learning_rate": 6.910748535756774e-06, "loss": 0.1781, "step": 644 }, { "epoch": 0.4964457252641691, "grad_norm": 0.948011615943263, "learning_rate": 6.909338015743782e-06, "loss": 0.1521, "step": 646 }, { "epoch": 0.4979827089337176, "grad_norm": 1.006209266616953, "learning_rate": 6.907916583320976e-06, "loss": 0.1671, "step": 648 }, { "epoch": 0.49951969260326606, "grad_norm": 0.9744730979666755, "learning_rate": 6.906484243037992e-06, "loss": 0.1777, "step": 650 }, { "epoch": 0.5010566762728146, "grad_norm": 1.0140793456518158, "learning_rate": 6.9050409994793835e-06, "loss": 0.1724, "step": 652 }, { "epoch": 0.5025936599423632, "grad_norm": 0.9953513391992594, "learning_rate": 6.903586857264598e-06, "loss": 0.1612, "step": 654 }, { "epoch": 0.5041306436119116, "grad_norm": 0.9120318862748673, "learning_rate": 6.9021218210479715e-06, "loss": 0.1565, "step": 656 }, { "epoch": 0.5056676272814601, "grad_norm": 0.8703090747149029, "learning_rate": 6.900645895518703e-06, "loss": 0.1616, "step": 658 }, { "epoch": 0.5072046109510087, "grad_norm": 0.9188361211761223, "learning_rate": 6.899159085400851e-06, "loss": 0.1702, "step": 660 }, { "epoch": 0.5087415946205571, "grad_norm": 0.9052706219772494, "learning_rate": 6.897661395453309e-06, "loss": 0.1671, "step": 662 }, { "epoch": 0.5102785782901057, "grad_norm": 0.9790524922779839, "learning_rate": 6.896152830469797e-06, "loss": 0.1707, "step": 664 }, { "epoch": 0.5118155619596542, "grad_norm": 0.9737405664828704, "learning_rate": 6.894633395278839e-06, "loss": 0.1749, "step": 666 }, { "epoch": 0.5133525456292027, "grad_norm": 0.8786179004815331, "learning_rate": 6.893103094743758e-06, "loss": 0.1662, "step": 668 }, { "epoch": 0.5148895292987512, "grad_norm": 0.9648594871932485, "learning_rate": 6.891561933762648e-06, "loss": 0.1777, "step": 670 }, { "epoch": 0.5164265129682997, "grad_norm": 0.9273934239395568, "learning_rate": 6.8900099172683675e-06, "loss": 0.1762, "step": 672 }, { "epoch": 0.5179634966378482, "grad_norm": 0.9132188955471342, "learning_rate": 6.8884470502285195e-06, "loss": 0.1638, "step": 674 }, { "epoch": 0.5195004803073967, "grad_norm": 1.0216457284197233, "learning_rate": 6.886873337645439e-06, "loss": 0.1663, "step": 676 }, { "epoch": 0.5210374639769453, "grad_norm": 0.9342404527623865, "learning_rate": 6.885288784556172e-06, "loss": 0.1692, "step": 678 }, { "epoch": 0.5225744476464937, "grad_norm": 0.8975547579280767, "learning_rate": 6.883693396032463e-06, "loss": 0.1669, "step": 680 }, { "epoch": 0.5241114313160423, "grad_norm": 0.8921639188810697, "learning_rate": 6.88208717718074e-06, "loss": 0.1665, "step": 682 }, { "epoch": 0.5256484149855908, "grad_norm": 0.9687694730815803, "learning_rate": 6.880470133142094e-06, "loss": 0.1674, "step": 684 }, { "epoch": 0.5271853986551392, "grad_norm": 0.9456241155960425, "learning_rate": 6.878842269092263e-06, "loss": 0.1685, "step": 686 }, { "epoch": 0.5287223823246878, "grad_norm": 1.0077089550967064, "learning_rate": 6.877203590241621e-06, "loss": 0.1755, "step": 688 }, { "epoch": 0.5302593659942363, "grad_norm": 0.9499550767483984, "learning_rate": 6.875554101835156e-06, "loss": 0.1727, "step": 690 }, { "epoch": 0.5317963496637849, "grad_norm": 0.9677648466898067, "learning_rate": 6.873893809152453e-06, "loss": 0.1768, "step": 692 }, { "epoch": 0.5333333333333333, "grad_norm": 0.9430584650028841, "learning_rate": 6.872222717507679e-06, "loss": 0.1736, "step": 694 }, { "epoch": 0.5348703170028818, "grad_norm": 0.8724957576172335, "learning_rate": 6.870540832249567e-06, "loss": 0.1586, "step": 696 }, { "epoch": 0.5364073006724304, "grad_norm": 0.8628795594957057, "learning_rate": 6.868848158761398e-06, "loss": 0.1617, "step": 698 }, { "epoch": 0.5379442843419788, "grad_norm": 0.9609714958221978, "learning_rate": 6.867144702460982e-06, "loss": 0.1831, "step": 700 }, { "epoch": 0.5394812680115274, "grad_norm": 0.9942806006570933, "learning_rate": 6.865430468800642e-06, "loss": 0.181, "step": 702 }, { "epoch": 0.5410182516810759, "grad_norm": 0.9153357695300892, "learning_rate": 6.863705463267197e-06, "loss": 0.1616, "step": 704 }, { "epoch": 0.5425552353506244, "grad_norm": 0.903669480913219, "learning_rate": 6.861969691381943e-06, "loss": 0.176, "step": 706 }, { "epoch": 0.5440922190201729, "grad_norm": 0.88720714763806, "learning_rate": 6.860223158700639e-06, "loss": 0.17, "step": 708 }, { "epoch": 0.5456292026897214, "grad_norm": 0.9051200845778055, "learning_rate": 6.8584658708134825e-06, "loss": 0.1686, "step": 710 }, { "epoch": 0.54716618635927, "grad_norm": 0.9513875655430767, "learning_rate": 6.856697833345101e-06, "loss": 0.1723, "step": 712 }, { "epoch": 0.5487031700288184, "grad_norm": 0.9099315275605531, "learning_rate": 6.8549190519545206e-06, "loss": 0.1641, "step": 714 }, { "epoch": 0.550240153698367, "grad_norm": 0.8702057226720046, "learning_rate": 6.8531295323351655e-06, "loss": 0.1583, "step": 716 }, { "epoch": 0.5517771373679154, "grad_norm": 0.9086133226107327, "learning_rate": 6.851329280214823e-06, "loss": 0.1637, "step": 718 }, { "epoch": 0.553314121037464, "grad_norm": 0.908182677978134, "learning_rate": 6.8495183013556365e-06, "loss": 0.1596, "step": 720 }, { "epoch": 0.5548511047070125, "grad_norm": 0.9856738002008828, "learning_rate": 6.847696601554079e-06, "loss": 0.1701, "step": 722 }, { "epoch": 0.556388088376561, "grad_norm": 0.9119819563123349, "learning_rate": 6.845864186640944e-06, "loss": 0.1563, "step": 724 }, { "epoch": 0.5579250720461095, "grad_norm": 0.9080236111588762, "learning_rate": 6.844021062481314e-06, "loss": 0.1753, "step": 726 }, { "epoch": 0.559462055715658, "grad_norm": 0.8990390737752408, "learning_rate": 6.842167234974556e-06, "loss": 0.1647, "step": 728 }, { "epoch": 0.5609990393852066, "grad_norm": 0.9761008697066089, "learning_rate": 6.840302710054292e-06, "loss": 0.1708, "step": 730 }, { "epoch": 0.562536023054755, "grad_norm": 0.9552176939231856, "learning_rate": 6.838427493688384e-06, "loss": 0.1675, "step": 732 }, { "epoch": 0.5640730067243036, "grad_norm": 0.8757204503224848, "learning_rate": 6.836541591878915e-06, "loss": 0.1624, "step": 734 }, { "epoch": 0.5656099903938521, "grad_norm": 0.8887299929134649, "learning_rate": 6.834645010662169e-06, "loss": 0.1579, "step": 736 }, { "epoch": 0.5671469740634005, "grad_norm": 0.9268495074535574, "learning_rate": 6.832737756108613e-06, "loss": 0.1817, "step": 738 }, { "epoch": 0.5686839577329491, "grad_norm": 0.8930331129406648, "learning_rate": 6.830819834322875e-06, "loss": 0.1722, "step": 740 }, { "epoch": 0.5702209414024976, "grad_norm": 0.9181728216635983, "learning_rate": 6.828891251443729e-06, "loss": 0.1751, "step": 742 }, { "epoch": 0.5717579250720461, "grad_norm": 0.9429482896947388, "learning_rate": 6.826952013644067e-06, "loss": 0.1715, "step": 744 }, { "epoch": 0.5732949087415946, "grad_norm": 0.9118821908902819, "learning_rate": 6.825002127130891e-06, "loss": 0.1696, "step": 746 }, { "epoch": 0.5748318924111432, "grad_norm": 0.9132457028236339, "learning_rate": 6.823041598145282e-06, "loss": 0.1699, "step": 748 }, { "epoch": 0.5763688760806917, "grad_norm": 0.8756070623659923, "learning_rate": 6.821070432962387e-06, "loss": 0.1635, "step": 750 }, { "epoch": 0.5779058597502401, "grad_norm": 0.8779903149650536, "learning_rate": 6.819088637891397e-06, "loss": 0.1571, "step": 752 }, { "epoch": 0.5794428434197887, "grad_norm": 0.951553518645419, "learning_rate": 6.817096219275525e-06, "loss": 0.177, "step": 754 }, { "epoch": 0.5809798270893372, "grad_norm": 0.8815920502748197, "learning_rate": 6.815093183491988e-06, "loss": 0.1723, "step": 756 }, { "epoch": 0.5825168107588857, "grad_norm": 0.8943776455140009, "learning_rate": 6.813079536951986e-06, "loss": 0.1675, "step": 758 }, { "epoch": 0.5840537944284342, "grad_norm": 0.9042515471791829, "learning_rate": 6.811055286100681e-06, "loss": 0.1668, "step": 760 }, { "epoch": 0.5855907780979827, "grad_norm": 0.8985203223108896, "learning_rate": 6.809020437417178e-06, "loss": 0.1683, "step": 762 }, { "epoch": 0.5871277617675312, "grad_norm": 0.8854626460275703, "learning_rate": 6.8069749974145e-06, "loss": 0.1664, "step": 764 }, { "epoch": 0.5886647454370797, "grad_norm": 0.9849706761614369, "learning_rate": 6.804918972639572e-06, "loss": 0.1603, "step": 766 }, { "epoch": 0.5902017291066283, "grad_norm": 0.9492268475374204, "learning_rate": 6.802852369673199e-06, "loss": 0.1712, "step": 768 }, { "epoch": 0.5917387127761767, "grad_norm": 0.9656447681787407, "learning_rate": 6.8007751951300425e-06, "loss": 0.1571, "step": 770 }, { "epoch": 0.5932756964457253, "grad_norm": 0.8950523002371793, "learning_rate": 6.798687455658602e-06, "loss": 0.166, "step": 772 }, { "epoch": 0.5948126801152738, "grad_norm": 0.9273600651692718, "learning_rate": 6.79658915794119e-06, "loss": 0.1617, "step": 774 }, { "epoch": 0.5963496637848222, "grad_norm": 0.9160428728026682, "learning_rate": 6.794480308693916e-06, "loss": 0.1718, "step": 776 }, { "epoch": 0.5978866474543708, "grad_norm": 0.9675151521212334, "learning_rate": 6.792360914666662e-06, "loss": 0.1618, "step": 778 }, { "epoch": 0.5994236311239193, "grad_norm": 0.9613967630774912, "learning_rate": 6.79023098264306e-06, "loss": 0.1707, "step": 780 }, { "epoch": 0.6009606147934679, "grad_norm": 0.8548759973613992, "learning_rate": 6.7880905194404735e-06, "loss": 0.1514, "step": 782 }, { "epoch": 0.6024975984630163, "grad_norm": 0.890219782689619, "learning_rate": 6.78593953190997e-06, "loss": 0.1715, "step": 784 }, { "epoch": 0.6040345821325649, "grad_norm": 0.9086155715910085, "learning_rate": 6.783778026936305e-06, "loss": 0.1663, "step": 786 }, { "epoch": 0.6055715658021134, "grad_norm": 0.8095718809326027, "learning_rate": 6.781606011437898e-06, "loss": 0.1506, "step": 788 }, { "epoch": 0.6071085494716618, "grad_norm": 0.8993402244627612, "learning_rate": 6.779423492366808e-06, "loss": 0.1778, "step": 790 }, { "epoch": 0.6086455331412104, "grad_norm": 0.9179176055571641, "learning_rate": 6.777230476708715e-06, "loss": 0.1683, "step": 792 }, { "epoch": 0.6101825168107589, "grad_norm": 0.880916421665006, "learning_rate": 6.775026971482896e-06, "loss": 0.1679, "step": 794 }, { "epoch": 0.6117195004803074, "grad_norm": 0.9118630511257958, "learning_rate": 6.7728129837422016e-06, "loss": 0.1647, "step": 796 }, { "epoch": 0.6132564841498559, "grad_norm": 0.9503223570669749, "learning_rate": 6.770588520573034e-06, "loss": 0.1653, "step": 798 }, { "epoch": 0.6147934678194045, "grad_norm": 0.914092375436958, "learning_rate": 6.768353589095325e-06, "loss": 0.1706, "step": 800 }, { "epoch": 0.6163304514889529, "grad_norm": 0.8780710561728171, "learning_rate": 6.766108196462512e-06, "loss": 0.1581, "step": 802 }, { "epoch": 0.6178674351585014, "grad_norm": 0.9908640319756162, "learning_rate": 6.763852349861517e-06, "loss": 0.1746, "step": 804 }, { "epoch": 0.61940441882805, "grad_norm": 0.9382036222959125, "learning_rate": 6.761586056512721e-06, "loss": 0.1467, "step": 806 }, { "epoch": 0.6209414024975984, "grad_norm": 0.8796176529284295, "learning_rate": 6.759309323669945e-06, "loss": 0.1638, "step": 808 }, { "epoch": 0.622478386167147, "grad_norm": 0.9402322649840099, "learning_rate": 6.757022158620422e-06, "loss": 0.1644, "step": 810 }, { "epoch": 0.6240153698366955, "grad_norm": 0.9036305217983724, "learning_rate": 6.754724568684775e-06, "loss": 0.1625, "step": 812 }, { "epoch": 0.6255523535062439, "grad_norm": 0.8200742046612647, "learning_rate": 6.752416561216997e-06, "loss": 0.1617, "step": 814 }, { "epoch": 0.6270893371757925, "grad_norm": 0.9107960160473824, "learning_rate": 6.750098143604423e-06, "loss": 0.1687, "step": 816 }, { "epoch": 0.628626320845341, "grad_norm": 0.8972272932978446, "learning_rate": 6.747769323267706e-06, "loss": 0.1746, "step": 818 }, { "epoch": 0.6301633045148896, "grad_norm": 0.9499481842269516, "learning_rate": 6.7454301076608025e-06, "loss": 0.1696, "step": 820 }, { "epoch": 0.631700288184438, "grad_norm": 0.8409255790993163, "learning_rate": 6.743080504270933e-06, "loss": 0.1598, "step": 822 }, { "epoch": 0.6332372718539866, "grad_norm": 0.9027842885644961, "learning_rate": 6.740720520618574e-06, "loss": 0.1585, "step": 824 }, { "epoch": 0.6347742555235351, "grad_norm": 0.8719606536944999, "learning_rate": 6.738350164257421e-06, "loss": 0.1547, "step": 826 }, { "epoch": 0.6363112391930835, "grad_norm": 0.9247004952412972, "learning_rate": 6.735969442774372e-06, "loss": 0.173, "step": 828 }, { "epoch": 0.6378482228626321, "grad_norm": 0.8506134884934429, "learning_rate": 6.733578363789503e-06, "loss": 0.154, "step": 830 }, { "epoch": 0.6393852065321806, "grad_norm": 0.9171064482180226, "learning_rate": 6.731176934956039e-06, "loss": 0.1652, "step": 832 }, { "epoch": 0.6409221902017291, "grad_norm": 0.9248200687434835, "learning_rate": 6.728765163960333e-06, "loss": 0.1704, "step": 834 }, { "epoch": 0.6424591738712776, "grad_norm": 0.8545348365058903, "learning_rate": 6.726343058521839e-06, "loss": 0.1569, "step": 836 }, { "epoch": 0.6439961575408262, "grad_norm": 0.9043067784462817, "learning_rate": 6.723910626393091e-06, "loss": 0.1715, "step": 838 }, { "epoch": 0.6455331412103746, "grad_norm": 1.0091403589112276, "learning_rate": 6.721467875359678e-06, "loss": 0.1743, "step": 840 }, { "epoch": 0.6470701248799231, "grad_norm": 0.8452475270597983, "learning_rate": 6.719014813240213e-06, "loss": 0.1637, "step": 842 }, { "epoch": 0.6486071085494717, "grad_norm": 0.9000907310330241, "learning_rate": 6.716551447886314e-06, "loss": 0.1664, "step": 844 }, { "epoch": 0.6501440922190201, "grad_norm": 0.9367454591962908, "learning_rate": 6.714077787182576e-06, "loss": 0.1609, "step": 846 }, { "epoch": 0.6516810758885687, "grad_norm": 0.8639855347461337, "learning_rate": 6.71159383904655e-06, "loss": 0.1619, "step": 848 }, { "epoch": 0.6532180595581172, "grad_norm": 0.8444570709766536, "learning_rate": 6.709099611428709e-06, "loss": 0.1569, "step": 850 }, { "epoch": 0.6547550432276658, "grad_norm": 0.8660206440875692, "learning_rate": 6.706595112312432e-06, "loss": 0.1638, "step": 852 }, { "epoch": 0.6562920268972142, "grad_norm": 0.9309827842770139, "learning_rate": 6.704080349713974e-06, "loss": 0.1605, "step": 854 }, { "epoch": 0.6578290105667627, "grad_norm": 0.8714646054346427, "learning_rate": 6.70155533168244e-06, "loss": 0.1595, "step": 856 }, { "epoch": 0.6593659942363113, "grad_norm": 0.8712554159426241, "learning_rate": 6.699020066299759e-06, "loss": 0.1678, "step": 858 }, { "epoch": 0.6609029779058597, "grad_norm": 0.9250738676547778, "learning_rate": 6.696474561680663e-06, "loss": 0.1789, "step": 860 }, { "epoch": 0.6624399615754083, "grad_norm": 0.9111129955962746, "learning_rate": 6.693918825972651e-06, "loss": 0.168, "step": 862 }, { "epoch": 0.6639769452449568, "grad_norm": 0.9141522182958596, "learning_rate": 6.691352867355973e-06, "loss": 0.16, "step": 864 }, { "epoch": 0.6655139289145053, "grad_norm": 0.881012224572559, "learning_rate": 6.688776694043602e-06, "loss": 0.1653, "step": 866 }, { "epoch": 0.6670509125840538, "grad_norm": 0.8632782839785611, "learning_rate": 6.6861903142812e-06, "loss": 0.1564, "step": 868 }, { "epoch": 0.6685878962536023, "grad_norm": 0.8720327769692922, "learning_rate": 6.683593736347102e-06, "loss": 0.1534, "step": 870 }, { "epoch": 0.6701248799231508, "grad_norm": 0.876964430748599, "learning_rate": 6.680986968552282e-06, "loss": 0.1657, "step": 872 }, { "epoch": 0.6716618635926993, "grad_norm": 0.9631837226264072, "learning_rate": 6.6783700192403296e-06, "loss": 0.1683, "step": 874 }, { "epoch": 0.6731988472622479, "grad_norm": 0.846575004663177, "learning_rate": 6.675742896787425e-06, "loss": 0.1476, "step": 876 }, { "epoch": 0.6747358309317963, "grad_norm": 0.9308686661082001, "learning_rate": 6.6731056096023065e-06, "loss": 0.1595, "step": 878 }, { "epoch": 0.6762728146013448, "grad_norm": 0.8915035477659844, "learning_rate": 6.6704581661262486e-06, "loss": 0.1575, "step": 880 }, { "epoch": 0.6778097982708934, "grad_norm": 0.8362332131345471, "learning_rate": 6.6678005748330346e-06, "loss": 0.1601, "step": 882 }, { "epoch": 0.6793467819404418, "grad_norm": 0.9249603517376113, "learning_rate": 6.665132844228926e-06, "loss": 0.1654, "step": 884 }, { "epoch": 0.6808837656099904, "grad_norm": 0.8814575750008693, "learning_rate": 6.662454982852641e-06, "loss": 0.1749, "step": 886 }, { "epoch": 0.6824207492795389, "grad_norm": 0.899221716183914, "learning_rate": 6.65976699927532e-06, "loss": 0.1682, "step": 888 }, { "epoch": 0.6839577329490875, "grad_norm": 0.9139541897412309, "learning_rate": 6.657068902100504e-06, "loss": 0.1662, "step": 890 }, { "epoch": 0.6854947166186359, "grad_norm": 0.9291186479929943, "learning_rate": 6.6543606999641065e-06, "loss": 0.1677, "step": 892 }, { "epoch": 0.6870317002881844, "grad_norm": 0.8803357017385457, "learning_rate": 6.6516424015343795e-06, "loss": 0.1528, "step": 894 }, { "epoch": 0.688568683957733, "grad_norm": 0.9230606940030847, "learning_rate": 6.6489140155118964e-06, "loss": 0.1676, "step": 896 }, { "epoch": 0.6901056676272814, "grad_norm": 0.8703699992143266, "learning_rate": 6.6461755506295145e-06, "loss": 0.1554, "step": 898 }, { "epoch": 0.69164265129683, "grad_norm": 0.9524256265392508, "learning_rate": 6.643427015652351e-06, "loss": 0.1704, "step": 900 }, { "epoch": 0.6931796349663785, "grad_norm": 0.9415591239679129, "learning_rate": 6.640668419377758e-06, "loss": 0.1592, "step": 902 }, { "epoch": 0.694716618635927, "grad_norm": 0.8738330937457875, "learning_rate": 6.6378997706352885e-06, "loss": 0.1692, "step": 904 }, { "epoch": 0.6962536023054755, "grad_norm": 0.9005686509467422, "learning_rate": 6.635121078286671e-06, "loss": 0.1684, "step": 906 }, { "epoch": 0.697790585975024, "grad_norm": 0.8296366500080525, "learning_rate": 6.632332351225783e-06, "loss": 0.1593, "step": 908 }, { "epoch": 0.6993275696445725, "grad_norm": 0.8623498006286683, "learning_rate": 6.629533598378617e-06, "loss": 0.1683, "step": 910 }, { "epoch": 0.700864553314121, "grad_norm": 0.8462969581032185, "learning_rate": 6.626724828703259e-06, "loss": 0.1534, "step": 912 }, { "epoch": 0.7024015369836696, "grad_norm": 0.8628311131127484, "learning_rate": 6.623906051189854e-06, "loss": 0.1604, "step": 914 }, { "epoch": 0.703938520653218, "grad_norm": 0.9145692129791568, "learning_rate": 6.621077274860581e-06, "loss": 0.1594, "step": 916 }, { "epoch": 0.7054755043227666, "grad_norm": 0.9195013649008027, "learning_rate": 6.618238508769621e-06, "loss": 0.1716, "step": 918 }, { "epoch": 0.7070124879923151, "grad_norm": 0.9005634587923742, "learning_rate": 6.615389762003131e-06, "loss": 0.169, "step": 920 }, { "epoch": 0.7085494716618636, "grad_norm": 0.9087616214061071, "learning_rate": 6.612531043679213e-06, "loss": 0.1584, "step": 922 }, { "epoch": 0.7100864553314121, "grad_norm": 0.8929943346561857, "learning_rate": 6.609662362947886e-06, "loss": 0.1543, "step": 924 }, { "epoch": 0.7116234390009606, "grad_norm": 0.8923387750870015, "learning_rate": 6.606783728991054e-06, "loss": 0.1576, "step": 926 }, { "epoch": 0.7131604226705092, "grad_norm": 0.8545883642264139, "learning_rate": 6.603895151022483e-06, "loss": 0.1593, "step": 928 }, { "epoch": 0.7146974063400576, "grad_norm": 0.8581885831090912, "learning_rate": 6.600996638287762e-06, "loss": 0.1603, "step": 930 }, { "epoch": 0.7162343900096062, "grad_norm": 0.9326356897470247, "learning_rate": 6.598088200064284e-06, "loss": 0.1731, "step": 932 }, { "epoch": 0.7177713736791547, "grad_norm": 0.8874229434905837, "learning_rate": 6.595169845661204e-06, "loss": 0.1561, "step": 934 }, { "epoch": 0.7193083573487031, "grad_norm": 0.880472017082445, "learning_rate": 6.592241584419424e-06, "loss": 0.1589, "step": 936 }, { "epoch": 0.7208453410182517, "grad_norm": 0.8802659879981632, "learning_rate": 6.589303425711548e-06, "loss": 0.1699, "step": 938 }, { "epoch": 0.7223823246878002, "grad_norm": 0.8605735232186383, "learning_rate": 6.586355378941866e-06, "loss": 0.1508, "step": 940 }, { "epoch": 0.7239193083573487, "grad_norm": 0.9218559892018848, "learning_rate": 6.58339745354631e-06, "loss": 0.1536, "step": 942 }, { "epoch": 0.7254562920268972, "grad_norm": 0.8848127640286009, "learning_rate": 6.580429658992438e-06, "loss": 0.1595, "step": 944 }, { "epoch": 0.7269932756964457, "grad_norm": 0.9210041621285134, "learning_rate": 6.577452004779393e-06, "loss": 0.1493, "step": 946 }, { "epoch": 0.7285302593659942, "grad_norm": 0.9094329668162617, "learning_rate": 6.574464500437875e-06, "loss": 0.1656, "step": 948 }, { "epoch": 0.7300672430355427, "grad_norm": 0.8599101226794522, "learning_rate": 6.571467155530114e-06, "loss": 0.1621, "step": 950 }, { "epoch": 0.7316042267050913, "grad_norm": 0.8800370037884307, "learning_rate": 6.568459979649836e-06, "loss": 0.1613, "step": 952 }, { "epoch": 0.7331412103746398, "grad_norm": 0.8133202498369466, "learning_rate": 6.565442982422233e-06, "loss": 0.1498, "step": 954 }, { "epoch": 0.7346781940441883, "grad_norm": 0.8854492713891652, "learning_rate": 6.5624161735039365e-06, "loss": 0.1576, "step": 956 }, { "epoch": 0.7362151777137368, "grad_norm": 0.8860066937387789, "learning_rate": 6.559379562582976e-06, "loss": 0.1567, "step": 958 }, { "epoch": 0.7377521613832853, "grad_norm": 0.801524450434012, "learning_rate": 6.556333159378761e-06, "loss": 0.1507, "step": 960 }, { "epoch": 0.7392891450528338, "grad_norm": 0.826356057888294, "learning_rate": 6.553276973642037e-06, "loss": 0.1693, "step": 962 }, { "epoch": 0.7408261287223823, "grad_norm": 0.8294938538531003, "learning_rate": 6.550211015154869e-06, "loss": 0.1558, "step": 964 }, { "epoch": 0.7423631123919309, "grad_norm": 0.8307597065545927, "learning_rate": 6.547135293730595e-06, "loss": 0.1533, "step": 966 }, { "epoch": 0.7439000960614793, "grad_norm": 0.8014991418925953, "learning_rate": 6.544049819213806e-06, "loss": 0.1634, "step": 968 }, { "epoch": 0.7454370797310279, "grad_norm": 0.8998803118527309, "learning_rate": 6.540954601480307e-06, "loss": 0.1678, "step": 970 }, { "epoch": 0.7469740634005764, "grad_norm": 0.8815150629442536, "learning_rate": 6.537849650437091e-06, "loss": 0.1549, "step": 972 }, { "epoch": 0.7485110470701248, "grad_norm": 0.825376179394833, "learning_rate": 6.534734976022302e-06, "loss": 0.153, "step": 974 }, { "epoch": 0.7500480307396734, "grad_norm": 0.8179612985435794, "learning_rate": 6.53161058820521e-06, "loss": 0.1444, "step": 976 }, { "epoch": 0.7515850144092219, "grad_norm": 0.8727593413193312, "learning_rate": 6.528476496986172e-06, "loss": 0.1517, "step": 978 }, { "epoch": 0.7531219980787704, "grad_norm": 0.8512575927431834, "learning_rate": 6.525332712396604e-06, "loss": 0.1573, "step": 980 }, { "epoch": 0.7546589817483189, "grad_norm": 0.8582941888601493, "learning_rate": 6.522179244498948e-06, "loss": 0.1541, "step": 982 }, { "epoch": 0.7561959654178675, "grad_norm": 0.8767811552173622, "learning_rate": 6.519016103386639e-06, "loss": 0.1546, "step": 984 }, { "epoch": 0.757732949087416, "grad_norm": 0.8210784117243145, "learning_rate": 6.5158432991840755e-06, "loss": 0.145, "step": 986 }, { "epoch": 0.7592699327569644, "grad_norm": 0.8326696940302717, "learning_rate": 6.512660842046582e-06, "loss": 0.1597, "step": 988 }, { "epoch": 0.760806916426513, "grad_norm": 0.953434296685044, "learning_rate": 6.509468742160382e-06, "loss": 0.153, "step": 990 }, { "epoch": 0.7623439000960615, "grad_norm": 0.8362202034275996, "learning_rate": 6.506267009742564e-06, "loss": 0.1422, "step": 992 }, { "epoch": 0.76388088376561, "grad_norm": 0.857361945741671, "learning_rate": 6.503055655041042e-06, "loss": 0.1735, "step": 994 }, { "epoch": 0.7654178674351585, "grad_norm": 0.891793041703015, "learning_rate": 6.499834688334537e-06, "loss": 0.1555, "step": 996 }, { "epoch": 0.766954851104707, "grad_norm": 0.9034407482465361, "learning_rate": 6.496604119932528e-06, "loss": 0.1692, "step": 998 }, { "epoch": 0.7684918347742555, "grad_norm": 0.9404179935204194, "learning_rate": 6.493363960175231e-06, "loss": 0.1549, "step": 1000 }, { "epoch": 0.7684918347742555, "eval_loss": 0.1432075798511505, "eval_runtime": 362.8754, "eval_samples_per_second": 50.995, "eval_steps_per_second": 6.377, "step": 1000 }, { "epoch": 0.770028818443804, "grad_norm": 0.8281134972466698, "learning_rate": 6.490114219433558e-06, "loss": 0.1476, "step": 1002 }, { "epoch": 0.7715658021133526, "grad_norm": 0.8720452925619442, "learning_rate": 6.486854908109089e-06, "loss": 0.1433, "step": 1004 }, { "epoch": 0.773102785782901, "grad_norm": 0.8720479928054387, "learning_rate": 6.483586036634041e-06, "loss": 0.1553, "step": 1006 }, { "epoch": 0.7746397694524496, "grad_norm": 0.8412137015802439, "learning_rate": 6.480307615471223e-06, "loss": 0.157, "step": 1008 }, { "epoch": 0.7761767531219981, "grad_norm": 0.8527177593745339, "learning_rate": 6.4770196551140155e-06, "loss": 0.1614, "step": 1010 }, { "epoch": 0.7777137367915465, "grad_norm": 0.8821801330028585, "learning_rate": 6.473722166086329e-06, "loss": 0.1525, "step": 1012 }, { "epoch": 0.7792507204610951, "grad_norm": 0.9018994294254266, "learning_rate": 6.470415158942574e-06, "loss": 0.16, "step": 1014 }, { "epoch": 0.7807877041306436, "grad_norm": 0.8703842227810923, "learning_rate": 6.467098644267625e-06, "loss": 0.1585, "step": 1016 }, { "epoch": 0.7823246878001922, "grad_norm": 0.8051081010409243, "learning_rate": 6.46377263267679e-06, "loss": 0.1527, "step": 1018 }, { "epoch": 0.7838616714697406, "grad_norm": 0.8888259620906063, "learning_rate": 6.460437134815771e-06, "loss": 0.167, "step": 1020 }, { "epoch": 0.7853986551392892, "grad_norm": 0.9029947228741374, "learning_rate": 6.457092161360633e-06, "loss": 0.1668, "step": 1022 }, { "epoch": 0.7869356388088377, "grad_norm": 0.8981361575693058, "learning_rate": 6.453737723017775e-06, "loss": 0.1612, "step": 1024 }, { "epoch": 0.7884726224783861, "grad_norm": 0.9352139489971404, "learning_rate": 6.450373830523886e-06, "loss": 0.1553, "step": 1026 }, { "epoch": 0.7900096061479347, "grad_norm": 0.8227972574038364, "learning_rate": 6.447000494645916e-06, "loss": 0.1578, "step": 1028 }, { "epoch": 0.7915465898174832, "grad_norm": 0.8530604144128302, "learning_rate": 6.4436177261810395e-06, "loss": 0.1541, "step": 1030 }, { "epoch": 0.7930835734870317, "grad_norm": 0.8897585617894813, "learning_rate": 6.440225535956627e-06, "loss": 0.1552, "step": 1032 }, { "epoch": 0.7946205571565802, "grad_norm": 0.8139899114750058, "learning_rate": 6.436823934830201e-06, "loss": 0.1397, "step": 1034 }, { "epoch": 0.7961575408261288, "grad_norm": 0.9155078950589843, "learning_rate": 6.433412933689408e-06, "loss": 0.1638, "step": 1036 }, { "epoch": 0.7976945244956772, "grad_norm": 0.7986982196116793, "learning_rate": 6.429992543451982e-06, "loss": 0.1492, "step": 1038 }, { "epoch": 0.7992315081652257, "grad_norm": 0.9067813768385491, "learning_rate": 6.426562775065706e-06, "loss": 0.166, "step": 1040 }, { "epoch": 0.8007684918347743, "grad_norm": 0.8546237308090272, "learning_rate": 6.4231236395083835e-06, "loss": 0.1528, "step": 1042 }, { "epoch": 0.8023054755043227, "grad_norm": 0.8301245146373359, "learning_rate": 6.419675147787799e-06, "loss": 0.169, "step": 1044 }, { "epoch": 0.8038424591738713, "grad_norm": 0.9357811284934665, "learning_rate": 6.416217310941682e-06, "loss": 0.1567, "step": 1046 }, { "epoch": 0.8053794428434198, "grad_norm": 0.8763494765723041, "learning_rate": 6.412750140037675e-06, "loss": 0.1578, "step": 1048 }, { "epoch": 0.8069164265129684, "grad_norm": 0.8853921013266428, "learning_rate": 6.409273646173296e-06, "loss": 0.1753, "step": 1050 }, { "epoch": 0.8084534101825168, "grad_norm": 0.8836427743865589, "learning_rate": 6.405787840475904e-06, "loss": 0.1599, "step": 1052 }, { "epoch": 0.8099903938520653, "grad_norm": 0.8292826766651644, "learning_rate": 6.402292734102661e-06, "loss": 0.1603, "step": 1054 }, { "epoch": 0.8115273775216139, "grad_norm": 0.7981150884405339, "learning_rate": 6.3987883382405e-06, "loss": 0.152, "step": 1056 }, { "epoch": 0.8130643611911623, "grad_norm": 0.8941084452594006, "learning_rate": 6.395274664106086e-06, "loss": 0.1675, "step": 1058 }, { "epoch": 0.8146013448607109, "grad_norm": 0.8843029348683886, "learning_rate": 6.39175172294578e-06, "loss": 0.1647, "step": 1060 }, { "epoch": 0.8161383285302594, "grad_norm": 0.8607622983535753, "learning_rate": 6.38821952603561e-06, "loss": 0.1642, "step": 1062 }, { "epoch": 0.8176753121998078, "grad_norm": 0.8712768415639528, "learning_rate": 6.3846780846812234e-06, "loss": 0.165, "step": 1064 }, { "epoch": 0.8192122958693564, "grad_norm": 0.839138399301467, "learning_rate": 6.381127410217858e-06, "loss": 0.1679, "step": 1066 }, { "epoch": 0.8207492795389049, "grad_norm": 0.8561911550355878, "learning_rate": 6.377567514010304e-06, "loss": 0.1582, "step": 1068 }, { "epoch": 0.8222862632084534, "grad_norm": 0.8290678486131811, "learning_rate": 6.373998407452873e-06, "loss": 0.1545, "step": 1070 }, { "epoch": 0.8238232468780019, "grad_norm": 0.847760798773364, "learning_rate": 6.370420101969349e-06, "loss": 0.1459, "step": 1072 }, { "epoch": 0.8253602305475505, "grad_norm": 0.8321417732926941, "learning_rate": 6.3668326090129645e-06, "loss": 0.1524, "step": 1074 }, { "epoch": 0.8268972142170989, "grad_norm": 0.8458379024110787, "learning_rate": 6.363235940066358e-06, "loss": 0.1485, "step": 1076 }, { "epoch": 0.8284341978866474, "grad_norm": 0.9374727516067862, "learning_rate": 6.359630106641535e-06, "loss": 0.1641, "step": 1078 }, { "epoch": 0.829971181556196, "grad_norm": 0.8470538370988849, "learning_rate": 6.356015120279837e-06, "loss": 0.1571, "step": 1080 }, { "epoch": 0.8315081652257444, "grad_norm": 0.8292449900966895, "learning_rate": 6.352390992551903e-06, "loss": 0.1668, "step": 1082 }, { "epoch": 0.833045148895293, "grad_norm": 0.8873775326690251, "learning_rate": 6.348757735057628e-06, "loss": 0.1573, "step": 1084 }, { "epoch": 0.8345821325648415, "grad_norm": 0.8792119220842931, "learning_rate": 6.345115359426129e-06, "loss": 0.1578, "step": 1086 }, { "epoch": 0.8361191162343901, "grad_norm": 0.9098125786437564, "learning_rate": 6.341463877315711e-06, "loss": 0.1631, "step": 1088 }, { "epoch": 0.8376560999039385, "grad_norm": 0.801489722135179, "learning_rate": 6.337803300413822e-06, "loss": 0.1479, "step": 1090 }, { "epoch": 0.839193083573487, "grad_norm": 0.9244844513428183, "learning_rate": 6.334133640437025e-06, "loss": 0.1632, "step": 1092 }, { "epoch": 0.8407300672430356, "grad_norm": 0.881534980036438, "learning_rate": 6.330454909130952e-06, "loss": 0.1595, "step": 1094 }, { "epoch": 0.842267050912584, "grad_norm": 0.8598183603447899, "learning_rate": 6.326767118270271e-06, "loss": 0.1616, "step": 1096 }, { "epoch": 0.8438040345821326, "grad_norm": 0.889022724838577, "learning_rate": 6.323070279658648e-06, "loss": 0.1601, "step": 1098 }, { "epoch": 0.8453410182516811, "grad_norm": 0.8912527206906096, "learning_rate": 6.319364405128706e-06, "loss": 0.1605, "step": 1100 }, { "epoch": 0.8468780019212296, "grad_norm": 0.8035411272312927, "learning_rate": 6.315649506541995e-06, "loss": 0.1514, "step": 1102 }, { "epoch": 0.8484149855907781, "grad_norm": 0.866003931969349, "learning_rate": 6.311925595788942e-06, "loss": 0.1535, "step": 1104 }, { "epoch": 0.8499519692603266, "grad_norm": 0.8292860317473079, "learning_rate": 6.308192684788825e-06, "loss": 0.1517, "step": 1106 }, { "epoch": 0.8514889529298751, "grad_norm": 0.8869266953820677, "learning_rate": 6.3044507854897265e-06, "loss": 0.1603, "step": 1108 }, { "epoch": 0.8530259365994236, "grad_norm": 0.8577246546870172, "learning_rate": 6.3006999098684985e-06, "loss": 0.157, "step": 1110 }, { "epoch": 0.8545629202689722, "grad_norm": 0.8403234008489573, "learning_rate": 6.296940069930725e-06, "loss": 0.1592, "step": 1112 }, { "epoch": 0.8560999039385206, "grad_norm": 0.8702914831113494, "learning_rate": 6.293171277710682e-06, "loss": 0.1575, "step": 1114 }, { "epoch": 0.8576368876080691, "grad_norm": 0.8870545228720214, "learning_rate": 6.289393545271299e-06, "loss": 0.163, "step": 1116 }, { "epoch": 0.8591738712776177, "grad_norm": 0.8835871253233961, "learning_rate": 6.285606884704122e-06, "loss": 0.1505, "step": 1118 }, { "epoch": 0.8607108549471661, "grad_norm": 0.9222189898206439, "learning_rate": 6.281811308129271e-06, "loss": 0.1565, "step": 1120 }, { "epoch": 0.8622478386167147, "grad_norm": 0.865733104634757, "learning_rate": 6.278006827695407e-06, "loss": 0.1657, "step": 1122 }, { "epoch": 0.8637848222862632, "grad_norm": 0.7873625454059449, "learning_rate": 6.274193455579688e-06, "loss": 0.152, "step": 1124 }, { "epoch": 0.8653218059558118, "grad_norm": 0.9036534746354465, "learning_rate": 6.270371203987733e-06, "loss": 0.1524, "step": 1126 }, { "epoch": 0.8668587896253602, "grad_norm": 0.8874043561419231, "learning_rate": 6.266540085153581e-06, "loss": 0.1623, "step": 1128 }, { "epoch": 0.8683957732949087, "grad_norm": 0.8520395043888209, "learning_rate": 6.262700111339654e-06, "loss": 0.1511, "step": 1130 }, { "epoch": 0.8699327569644573, "grad_norm": 0.814438124524513, "learning_rate": 6.2588512948367144e-06, "loss": 0.143, "step": 1132 }, { "epoch": 0.8714697406340057, "grad_norm": 0.8620767282571578, "learning_rate": 6.254993647963831e-06, "loss": 0.1616, "step": 1134 }, { "epoch": 0.8730067243035543, "grad_norm": 0.892703951825046, "learning_rate": 6.251127183068331e-06, "loss": 0.1579, "step": 1136 }, { "epoch": 0.8745437079731028, "grad_norm": 0.8623333160707259, "learning_rate": 6.247251912525773e-06, "loss": 0.1576, "step": 1138 }, { "epoch": 0.8760806916426513, "grad_norm": 0.8199322821651265, "learning_rate": 6.243367848739894e-06, "loss": 0.153, "step": 1140 }, { "epoch": 0.8776176753121998, "grad_norm": 0.8392074331450308, "learning_rate": 6.23947500414258e-06, "loss": 0.1606, "step": 1142 }, { "epoch": 0.8791546589817483, "grad_norm": 0.8281996951649421, "learning_rate": 6.235573391193819e-06, "loss": 0.1469, "step": 1144 }, { "epoch": 0.8806916426512968, "grad_norm": 0.8544291264871913, "learning_rate": 6.231663022381666e-06, "loss": 0.17, "step": 1146 }, { "epoch": 0.8822286263208453, "grad_norm": 0.8699715620381219, "learning_rate": 6.227743910222202e-06, "loss": 0.1543, "step": 1148 }, { "epoch": 0.8837656099903939, "grad_norm": 0.8096799920711654, "learning_rate": 6.22381606725949e-06, "loss": 0.1562, "step": 1150 }, { "epoch": 0.8853025936599423, "grad_norm": 0.8090700874638417, "learning_rate": 6.219879506065542e-06, "loss": 0.1499, "step": 1152 }, { "epoch": 0.8868395773294909, "grad_norm": 0.7867446095253069, "learning_rate": 6.215934239240272e-06, "loss": 0.1558, "step": 1154 }, { "epoch": 0.8883765609990394, "grad_norm": 0.788584493474745, "learning_rate": 6.211980279411459e-06, "loss": 0.1433, "step": 1156 }, { "epoch": 0.8899135446685879, "grad_norm": 0.8362819878588067, "learning_rate": 6.208017639234708e-06, "loss": 0.1552, "step": 1158 }, { "epoch": 0.8914505283381364, "grad_norm": 0.8817509955093219, "learning_rate": 6.204046331393405e-06, "loss": 0.1499, "step": 1160 }, { "epoch": 0.8929875120076849, "grad_norm": 0.8313021295240178, "learning_rate": 6.20006636859868e-06, "loss": 0.1671, "step": 1162 }, { "epoch": 0.8945244956772335, "grad_norm": 0.8485741130804667, "learning_rate": 6.196077763589365e-06, "loss": 0.1609, "step": 1164 }, { "epoch": 0.8960614793467819, "grad_norm": 0.861196387787182, "learning_rate": 6.192080529131955e-06, "loss": 0.1609, "step": 1166 }, { "epoch": 0.8975984630163305, "grad_norm": 0.8527772834985116, "learning_rate": 6.188074678020563e-06, "loss": 0.1571, "step": 1168 }, { "epoch": 0.899135446685879, "grad_norm": 0.820498232150602, "learning_rate": 6.184060223076884e-06, "loss": 0.1599, "step": 1170 }, { "epoch": 0.9006724303554274, "grad_norm": 0.8519749575925564, "learning_rate": 6.180037177150149e-06, "loss": 0.1548, "step": 1172 }, { "epoch": 0.902209414024976, "grad_norm": 0.8442399564211222, "learning_rate": 6.176005553117091e-06, "loss": 0.151, "step": 1174 }, { "epoch": 0.9037463976945245, "grad_norm": 0.8937597645248366, "learning_rate": 6.171965363881894e-06, "loss": 0.1682, "step": 1176 }, { "epoch": 0.905283381364073, "grad_norm": 0.9004779698121571, "learning_rate": 6.167916622376161e-06, "loss": 0.1648, "step": 1178 }, { "epoch": 0.9068203650336215, "grad_norm": 0.8739876161966016, "learning_rate": 6.163859341558867e-06, "loss": 0.1466, "step": 1180 }, { "epoch": 0.90835734870317, "grad_norm": 0.8359328154212177, "learning_rate": 6.159793534416318e-06, "loss": 0.1609, "step": 1182 }, { "epoch": 0.9098943323727186, "grad_norm": 0.7772963511251748, "learning_rate": 6.155719213962113e-06, "loss": 0.1396, "step": 1184 }, { "epoch": 0.911431316042267, "grad_norm": 0.8267806690490346, "learning_rate": 6.151636393237099e-06, "loss": 0.1557, "step": 1186 }, { "epoch": 0.9129682997118156, "grad_norm": 0.8299194507951076, "learning_rate": 6.147545085309329e-06, "loss": 0.1535, "step": 1188 }, { "epoch": 0.914505283381364, "grad_norm": 0.8107731050092446, "learning_rate": 6.143445303274022e-06, "loss": 0.1533, "step": 1190 }, { "epoch": 0.9160422670509126, "grad_norm": 0.8664903101940159, "learning_rate": 6.139337060253521e-06, "loss": 0.1509, "step": 1192 }, { "epoch": 0.9175792507204611, "grad_norm": 0.9096007236323014, "learning_rate": 6.135220369397252e-06, "loss": 0.1576, "step": 1194 }, { "epoch": 0.9191162343900096, "grad_norm": 0.7794208493232124, "learning_rate": 6.131095243881675e-06, "loss": 0.1353, "step": 1196 }, { "epoch": 0.9206532180595581, "grad_norm": 0.8177662161991649, "learning_rate": 6.1269616969102546e-06, "loss": 0.1409, "step": 1198 }, { "epoch": 0.9221902017291066, "grad_norm": 0.7668271695157493, "learning_rate": 6.122819741713402e-06, "loss": 0.1442, "step": 1200 }, { "epoch": 0.9237271853986552, "grad_norm": 0.7607151071645437, "learning_rate": 6.118669391548449e-06, "loss": 0.1497, "step": 1202 }, { "epoch": 0.9252641690682036, "grad_norm": 0.8748866630574149, "learning_rate": 6.114510659699591e-06, "loss": 0.1611, "step": 1204 }, { "epoch": 0.9268011527377522, "grad_norm": 0.8598578438295764, "learning_rate": 6.110343559477855e-06, "loss": 0.1542, "step": 1206 }, { "epoch": 0.9283381364073007, "grad_norm": 0.8347211442345176, "learning_rate": 6.106168104221052e-06, "loss": 0.1488, "step": 1208 }, { "epoch": 0.9298751200768491, "grad_norm": 0.8798272771765616, "learning_rate": 6.101984307293735e-06, "loss": 0.1528, "step": 1210 }, { "epoch": 0.9314121037463977, "grad_norm": 0.809406948238894, "learning_rate": 6.097792182087156e-06, "loss": 0.1605, "step": 1212 }, { "epoch": 0.9329490874159462, "grad_norm": 0.8465513093584821, "learning_rate": 6.093591742019225e-06, "loss": 0.1515, "step": 1214 }, { "epoch": 0.9344860710854948, "grad_norm": 0.8717872500128261, "learning_rate": 6.089383000534465e-06, "loss": 0.1537, "step": 1216 }, { "epoch": 0.9360230547550432, "grad_norm": 0.7761882043003053, "learning_rate": 6.085165971103969e-06, "loss": 0.137, "step": 1218 }, { "epoch": 0.9375600384245918, "grad_norm": 0.8779867026896587, "learning_rate": 6.080940667225359e-06, "loss": 0.1477, "step": 1220 }, { "epoch": 0.9390970220941403, "grad_norm": 0.8339829528733452, "learning_rate": 6.076707102422741e-06, "loss": 0.1511, "step": 1222 }, { "epoch": 0.9406340057636887, "grad_norm": 0.8459206215885985, "learning_rate": 6.072465290246659e-06, "loss": 0.1429, "step": 1224 }, { "epoch": 0.9421709894332373, "grad_norm": 0.8156133417127609, "learning_rate": 6.068215244274061e-06, "loss": 0.1545, "step": 1226 }, { "epoch": 0.9437079731027858, "grad_norm": 0.8160966643134648, "learning_rate": 6.063956978108245e-06, "loss": 0.1427, "step": 1228 }, { "epoch": 0.9452449567723343, "grad_norm": 0.8239874532076136, "learning_rate": 6.059690505378819e-06, "loss": 0.1534, "step": 1230 }, { "epoch": 0.9467819404418828, "grad_norm": 0.8475441460391161, "learning_rate": 6.0554158397416596e-06, "loss": 0.1507, "step": 1232 }, { "epoch": 0.9483189241114313, "grad_norm": 0.839569089678522, "learning_rate": 6.051132994878868e-06, "loss": 0.1527, "step": 1234 }, { "epoch": 0.9498559077809798, "grad_norm": 0.7802196203437861, "learning_rate": 6.046841984498722e-06, "loss": 0.1376, "step": 1236 }, { "epoch": 0.9513928914505283, "grad_norm": 0.8574149706930279, "learning_rate": 6.042542822335638e-06, "loss": 0.1637, "step": 1238 }, { "epoch": 0.9529298751200769, "grad_norm": 0.8161418800215421, "learning_rate": 6.038235522150124e-06, "loss": 0.1479, "step": 1240 }, { "epoch": 0.9544668587896253, "grad_norm": 0.882673614104903, "learning_rate": 6.0339200977287316e-06, "loss": 0.1414, "step": 1242 }, { "epoch": 0.9560038424591739, "grad_norm": 0.8368270452723762, "learning_rate": 6.029596562884021e-06, "loss": 0.158, "step": 1244 }, { "epoch": 0.9575408261287224, "grad_norm": 0.8428259430927917, "learning_rate": 6.025264931454509e-06, "loss": 0.1474, "step": 1246 }, { "epoch": 0.9590778097982708, "grad_norm": 0.8478092566057043, "learning_rate": 6.020925217304629e-06, "loss": 0.1526, "step": 1248 }, { "epoch": 0.9606147934678194, "grad_norm": 0.8333839896657882, "learning_rate": 6.016577434324684e-06, "loss": 0.1419, "step": 1250 }, { "epoch": 0.9621517771373679, "grad_norm": 0.8445286054177732, "learning_rate": 6.012221596430804e-06, "loss": 0.1596, "step": 1252 }, { "epoch": 0.9636887608069165, "grad_norm": 0.851762036838009, "learning_rate": 6.0078577175649e-06, "loss": 0.1546, "step": 1254 }, { "epoch": 0.9652257444764649, "grad_norm": 0.8437848350814667, "learning_rate": 6.00348581169462e-06, "loss": 0.1508, "step": 1256 }, { "epoch": 0.9667627281460135, "grad_norm": 0.8945339085483407, "learning_rate": 5.9991058928133054e-06, "loss": 0.1576, "step": 1258 }, { "epoch": 0.968299711815562, "grad_norm": 0.7944819259983578, "learning_rate": 5.994717974939944e-06, "loss": 0.1476, "step": 1260 }, { "epoch": 0.9698366954851104, "grad_norm": 0.8729176506312757, "learning_rate": 5.990322072119126e-06, "loss": 0.1596, "step": 1262 }, { "epoch": 0.971373679154659, "grad_norm": 0.8725237140356162, "learning_rate": 5.985918198421002e-06, "loss": 0.1571, "step": 1264 }, { "epoch": 0.9729106628242075, "grad_norm": 0.7991775531475158, "learning_rate": 5.981506367941233e-06, "loss": 0.1434, "step": 1266 }, { "epoch": 0.974447646493756, "grad_norm": 0.8382327345169207, "learning_rate": 5.977086594800948e-06, "loss": 0.1596, "step": 1268 }, { "epoch": 0.9759846301633045, "grad_norm": 0.9529464464579128, "learning_rate": 5.972658893146697e-06, "loss": 0.1514, "step": 1270 }, { "epoch": 0.9775216138328531, "grad_norm": 0.8484469791654419, "learning_rate": 5.96822327715041e-06, "loss": 0.152, "step": 1272 }, { "epoch": 0.9790585975024015, "grad_norm": 0.81835031937336, "learning_rate": 5.963779761009348e-06, "loss": 0.1528, "step": 1274 }, { "epoch": 0.98059558117195, "grad_norm": 0.8310249942970027, "learning_rate": 5.959328358946056e-06, "loss": 0.1473, "step": 1276 }, { "epoch": 0.9821325648414986, "grad_norm": 0.8856986735480396, "learning_rate": 5.954869085208323e-06, "loss": 0.1609, "step": 1278 }, { "epoch": 0.983669548511047, "grad_norm": 0.8166362545867224, "learning_rate": 5.9504019540691305e-06, "loss": 0.1465, "step": 1280 }, { "epoch": 0.9852065321805956, "grad_norm": 0.8533786504203714, "learning_rate": 5.945926979826612e-06, "loss": 0.1555, "step": 1282 }, { "epoch": 0.9867435158501441, "grad_norm": 0.8994585946450427, "learning_rate": 5.941444176804002e-06, "loss": 0.1495, "step": 1284 }, { "epoch": 0.9882804995196927, "grad_norm": 0.8108578942364258, "learning_rate": 5.936953559349596e-06, "loss": 0.1505, "step": 1286 }, { "epoch": 0.9898174831892411, "grad_norm": 0.8282612807656491, "learning_rate": 5.932455141836697e-06, "loss": 0.1561, "step": 1288 }, { "epoch": 0.9913544668587896, "grad_norm": 0.8532043323116557, "learning_rate": 5.927948938663581e-06, "loss": 0.1559, "step": 1290 }, { "epoch": 0.9928914505283382, "grad_norm": 0.8582728286463194, "learning_rate": 5.923434964253437e-06, "loss": 0.1628, "step": 1292 }, { "epoch": 0.9944284341978866, "grad_norm": 0.8145713992914184, "learning_rate": 5.91891323305433e-06, "loss": 0.1503, "step": 1294 }, { "epoch": 0.9959654178674352, "grad_norm": 0.8011507540671976, "learning_rate": 5.914383759539153e-06, "loss": 0.1439, "step": 1296 }, { "epoch": 0.9975024015369837, "grad_norm": 0.9045409623476682, "learning_rate": 5.909846558205582e-06, "loss": 0.1533, "step": 1298 }, { "epoch": 0.9990393852065321, "grad_norm": 0.8994142078460239, "learning_rate": 5.905301643576025e-06, "loss": 0.1635, "step": 1300 }, { "epoch": 1.0005763688760807, "grad_norm": 1.9268010896112178, "learning_rate": 5.900749030197578e-06, "loss": 0.2109, "step": 1302 }, { "epoch": 1.0021133525456292, "grad_norm": 0.7330614000908158, "learning_rate": 5.8961887326419804e-06, "loss": 0.0988, "step": 1304 }, { "epoch": 1.0036503362151776, "grad_norm": 0.7392181978150959, "learning_rate": 5.891620765505566e-06, "loss": 0.1003, "step": 1306 }, { "epoch": 1.0051873198847263, "grad_norm": 0.7663128310167143, "learning_rate": 5.887045143409216e-06, "loss": 0.1016, "step": 1308 }, { "epoch": 1.0067243035542748, "grad_norm": 0.7607110644159815, "learning_rate": 5.882461880998317e-06, "loss": 0.0916, "step": 1310 }, { "epoch": 1.0082612872238232, "grad_norm": 0.8238052235863006, "learning_rate": 5.877870992942704e-06, "loss": 0.0931, "step": 1312 }, { "epoch": 1.0097982708933717, "grad_norm": 0.8892662986360846, "learning_rate": 5.873272493936625e-06, "loss": 0.1033, "step": 1314 }, { "epoch": 1.0113352545629202, "grad_norm": 0.7704764998566338, "learning_rate": 5.868666398698687e-06, "loss": 0.1002, "step": 1316 }, { "epoch": 1.0128722382324689, "grad_norm": 0.7709377348356039, "learning_rate": 5.864052721971809e-06, "loss": 0.0967, "step": 1318 }, { "epoch": 1.0144092219020173, "grad_norm": 0.7969803446335071, "learning_rate": 5.859431478523179e-06, "loss": 0.1048, "step": 1320 }, { "epoch": 1.0159462055715658, "grad_norm": 0.6999689392816087, "learning_rate": 5.854802683144201e-06, "loss": 0.088, "step": 1322 }, { "epoch": 1.0174831892411142, "grad_norm": 0.7760114509424976, "learning_rate": 5.850166350650456e-06, "loss": 0.0991, "step": 1324 }, { "epoch": 1.019020172910663, "grad_norm": 0.7880085543010041, "learning_rate": 5.845522495881642e-06, "loss": 0.0971, "step": 1326 }, { "epoch": 1.0205571565802114, "grad_norm": 0.8289396285343062, "learning_rate": 5.840871133701542e-06, "loss": 0.102, "step": 1328 }, { "epoch": 1.0220941402497599, "grad_norm": 0.7142434734819356, "learning_rate": 5.836212278997961e-06, "loss": 0.0883, "step": 1330 }, { "epoch": 1.0236311239193083, "grad_norm": 0.7250033576245166, "learning_rate": 5.8315459466826895e-06, "loss": 0.0915, "step": 1332 }, { "epoch": 1.0251681075888568, "grad_norm": 0.7570476788775075, "learning_rate": 5.826872151691452e-06, "loss": 0.0868, "step": 1334 }, { "epoch": 1.0267050912584055, "grad_norm": 0.7965255668625996, "learning_rate": 5.822190908983859e-06, "loss": 0.0883, "step": 1336 }, { "epoch": 1.028242074927954, "grad_norm": 0.8376697395117676, "learning_rate": 5.817502233543355e-06, "loss": 0.0991, "step": 1338 }, { "epoch": 1.0297790585975024, "grad_norm": 0.7468897147308834, "learning_rate": 5.8128061403771815e-06, "loss": 0.0885, "step": 1340 }, { "epoch": 1.0313160422670509, "grad_norm": 0.8645299006954267, "learning_rate": 5.8081026445163184e-06, "loss": 0.1036, "step": 1342 }, { "epoch": 1.0328530259365993, "grad_norm": 0.8029697731003538, "learning_rate": 5.80339176101544e-06, "loss": 0.1027, "step": 1344 }, { "epoch": 1.034390009606148, "grad_norm": 0.7986403468821157, "learning_rate": 5.798673504952866e-06, "loss": 0.0982, "step": 1346 }, { "epoch": 1.0359269932756965, "grad_norm": 0.8210887006805161, "learning_rate": 5.793947891430516e-06, "loss": 0.0973, "step": 1348 }, { "epoch": 1.037463976945245, "grad_norm": 0.7710919739110469, "learning_rate": 5.789214935573857e-06, "loss": 0.097, "step": 1350 }, { "epoch": 1.0390009606147934, "grad_norm": 0.7754287784899241, "learning_rate": 5.784474652531857e-06, "loss": 0.0962, "step": 1352 }, { "epoch": 1.0405379442843419, "grad_norm": 0.7811613718003362, "learning_rate": 5.779727057476938e-06, "loss": 0.101, "step": 1354 }, { "epoch": 1.0420749279538906, "grad_norm": 0.7683352035150061, "learning_rate": 5.774972165604923e-06, "loss": 0.0926, "step": 1356 }, { "epoch": 1.043611911623439, "grad_norm": 0.8121646663638018, "learning_rate": 5.770209992134992e-06, "loss": 0.1025, "step": 1358 }, { "epoch": 1.0451488952929875, "grad_norm": 0.7946845821407413, "learning_rate": 5.765440552309633e-06, "loss": 0.098, "step": 1360 }, { "epoch": 1.046685878962536, "grad_norm": 0.8107985568570227, "learning_rate": 5.760663861394589e-06, "loss": 0.0939, "step": 1362 }, { "epoch": 1.0482228626320846, "grad_norm": 0.7738201688286012, "learning_rate": 5.755879934678815e-06, "loss": 0.0977, "step": 1364 }, { "epoch": 1.049759846301633, "grad_norm": 0.8043266326845157, "learning_rate": 5.751088787474421e-06, "loss": 0.104, "step": 1366 }, { "epoch": 1.0512968299711816, "grad_norm": 0.8212861671574773, "learning_rate": 5.746290435116633e-06, "loss": 0.1012, "step": 1368 }, { "epoch": 1.05283381364073, "grad_norm": 0.8217655361068997, "learning_rate": 5.7414848929637344e-06, "loss": 0.0948, "step": 1370 }, { "epoch": 1.0543707973102785, "grad_norm": 0.7398326798927687, "learning_rate": 5.7366721763970276e-06, "loss": 0.0926, "step": 1372 }, { "epoch": 1.0559077809798272, "grad_norm": 0.8684068565935559, "learning_rate": 5.73185230082077e-06, "loss": 0.0977, "step": 1374 }, { "epoch": 1.0574447646493756, "grad_norm": 0.812673696526823, "learning_rate": 5.727025281662141e-06, "loss": 0.0922, "step": 1376 }, { "epoch": 1.058981748318924, "grad_norm": 0.8179613499881782, "learning_rate": 5.722191134371179e-06, "loss": 0.0944, "step": 1378 }, { "epoch": 1.0605187319884726, "grad_norm": 0.810924645502839, "learning_rate": 5.717349874420742e-06, "loss": 0.0924, "step": 1380 }, { "epoch": 1.062055715658021, "grad_norm": 0.763255489678059, "learning_rate": 5.71250151730645e-06, "loss": 0.0896, "step": 1382 }, { "epoch": 1.0635926993275697, "grad_norm": 0.8239338886221861, "learning_rate": 5.707646078546642e-06, "loss": 0.1043, "step": 1384 }, { "epoch": 1.0651296829971182, "grad_norm": 0.7862418625776587, "learning_rate": 5.702783573682323e-06, "loss": 0.0927, "step": 1386 }, { "epoch": 1.0666666666666667, "grad_norm": 0.7694175148908957, "learning_rate": 5.697914018277113e-06, "loss": 0.0905, "step": 1388 }, { "epoch": 1.0682036503362151, "grad_norm": 0.8056119084783939, "learning_rate": 5.693037427917201e-06, "loss": 0.0964, "step": 1390 }, { "epoch": 1.0697406340057638, "grad_norm": 0.827006265409657, "learning_rate": 5.688153818211293e-06, "loss": 0.0905, "step": 1392 }, { "epoch": 1.0712776176753123, "grad_norm": 0.8282583664130263, "learning_rate": 5.683263204790561e-06, "loss": 0.0915, "step": 1394 }, { "epoch": 1.0728146013448607, "grad_norm": 0.7937251940987193, "learning_rate": 5.678365603308593e-06, "loss": 0.0969, "step": 1396 }, { "epoch": 1.0743515850144092, "grad_norm": 0.7954663404413561, "learning_rate": 5.673461029441347e-06, "loss": 0.0996, "step": 1398 }, { "epoch": 1.0758885686839577, "grad_norm": 0.8270927064110352, "learning_rate": 5.668549498887098e-06, "loss": 0.1034, "step": 1400 }, { "epoch": 1.0774255523535063, "grad_norm": 0.7670514687913251, "learning_rate": 5.663631027366382e-06, "loss": 0.0913, "step": 1402 }, { "epoch": 1.0789625360230548, "grad_norm": 0.8715536438358069, "learning_rate": 5.658705630621959e-06, "loss": 0.1004, "step": 1404 }, { "epoch": 1.0804995196926033, "grad_norm": 0.8161604730434415, "learning_rate": 5.653773324418748e-06, "loss": 0.0943, "step": 1406 }, { "epoch": 1.0820365033621517, "grad_norm": 0.8667760108772686, "learning_rate": 5.648834124543787e-06, "loss": 0.0954, "step": 1408 }, { "epoch": 1.0835734870317002, "grad_norm": 0.8231577798724082, "learning_rate": 5.643888046806179e-06, "loss": 0.1, "step": 1410 }, { "epoch": 1.0851104707012489, "grad_norm": 0.7413599439846686, "learning_rate": 5.638935107037041e-06, "loss": 0.0866, "step": 1412 }, { "epoch": 1.0866474543707973, "grad_norm": 0.7754362451670733, "learning_rate": 5.633975321089452e-06, "loss": 0.0964, "step": 1414 }, { "epoch": 1.0881844380403458, "grad_norm": 0.7854818133378504, "learning_rate": 5.629008704838404e-06, "loss": 0.0978, "step": 1416 }, { "epoch": 1.0897214217098943, "grad_norm": 0.8230491957163081, "learning_rate": 5.624035274180755e-06, "loss": 0.0904, "step": 1418 }, { "epoch": 1.0912584053794427, "grad_norm": 0.8415692370246974, "learning_rate": 5.619055045035168e-06, "loss": 0.0949, "step": 1420 }, { "epoch": 1.0927953890489914, "grad_norm": 0.7752546061977003, "learning_rate": 5.6140680333420714e-06, "loss": 0.0977, "step": 1422 }, { "epoch": 1.09433237271854, "grad_norm": 0.796060251145471, "learning_rate": 5.609074255063598e-06, "loss": 0.0986, "step": 1424 }, { "epoch": 1.0958693563880884, "grad_norm": 0.8014273381014567, "learning_rate": 5.604073726183545e-06, "loss": 0.0996, "step": 1426 }, { "epoch": 1.0974063400576368, "grad_norm": 0.8574802894140412, "learning_rate": 5.599066462707311e-06, "loss": 0.0915, "step": 1428 }, { "epoch": 1.0989433237271853, "grad_norm": 0.7973180847732659, "learning_rate": 5.594052480661852e-06, "loss": 0.098, "step": 1430 }, { "epoch": 1.100480307396734, "grad_norm": 0.7978978724248726, "learning_rate": 5.589031796095629e-06, "loss": 0.095, "step": 1432 }, { "epoch": 1.1020172910662824, "grad_norm": 0.7735364885990075, "learning_rate": 5.584004425078556e-06, "loss": 0.0977, "step": 1434 }, { "epoch": 1.103554274735831, "grad_norm": 0.824645677844322, "learning_rate": 5.5789703837019465e-06, "loss": 0.101, "step": 1436 }, { "epoch": 1.1050912584053794, "grad_norm": 0.8045096798761333, "learning_rate": 5.5739296880784685e-06, "loss": 0.0995, "step": 1438 }, { "epoch": 1.106628242074928, "grad_norm": 0.7785417170752582, "learning_rate": 5.568882354342084e-06, "loss": 0.0944, "step": 1440 }, { "epoch": 1.1081652257444765, "grad_norm": 0.7865456166072924, "learning_rate": 5.5638283986480055e-06, "loss": 0.0993, "step": 1442 }, { "epoch": 1.109702209414025, "grad_norm": 0.7482822898483027, "learning_rate": 5.5587678371726365e-06, "loss": 0.0923, "step": 1444 }, { "epoch": 1.1112391930835734, "grad_norm": 0.847672498751733, "learning_rate": 5.553700686113528e-06, "loss": 0.0924, "step": 1446 }, { "epoch": 1.112776176753122, "grad_norm": 0.8088910111550272, "learning_rate": 5.54862696168932e-06, "loss": 0.0969, "step": 1448 }, { "epoch": 1.1143131604226706, "grad_norm": 0.7477679679317595, "learning_rate": 5.543546680139695e-06, "loss": 0.089, "step": 1450 }, { "epoch": 1.115850144092219, "grad_norm": 0.8008986821135872, "learning_rate": 5.5384598577253185e-06, "loss": 0.0979, "step": 1452 }, { "epoch": 1.1173871277617675, "grad_norm": 0.8322762593719337, "learning_rate": 5.533366510727797e-06, "loss": 0.0959, "step": 1454 }, { "epoch": 1.118924111431316, "grad_norm": 0.7810694389767996, "learning_rate": 5.528266655449615e-06, "loss": 0.1004, "step": 1456 }, { "epoch": 1.1204610951008647, "grad_norm": 0.8168615959182653, "learning_rate": 5.523160308214093e-06, "loss": 0.0934, "step": 1458 }, { "epoch": 1.1219980787704131, "grad_norm": 0.8202613520761898, "learning_rate": 5.518047485365328e-06, "loss": 0.0923, "step": 1460 }, { "epoch": 1.1235350624399616, "grad_norm": 0.7506685980798529, "learning_rate": 5.512928203268145e-06, "loss": 0.0983, "step": 1462 }, { "epoch": 1.12507204610951, "grad_norm": 0.7964645064908876, "learning_rate": 5.507802478308043e-06, "loss": 0.1018, "step": 1464 }, { "epoch": 1.1266090297790585, "grad_norm": 0.7856943241580147, "learning_rate": 5.502670326891141e-06, "loss": 0.0897, "step": 1466 }, { "epoch": 1.1281460134486072, "grad_norm": 0.9369819821403769, "learning_rate": 5.497531765444132e-06, "loss": 0.0997, "step": 1468 }, { "epoch": 1.1296829971181557, "grad_norm": 0.8411240897070034, "learning_rate": 5.492386810414222e-06, "loss": 0.099, "step": 1470 }, { "epoch": 1.1312199807877041, "grad_norm": 0.7880140283795076, "learning_rate": 5.48723547826908e-06, "loss": 0.0962, "step": 1472 }, { "epoch": 1.1327569644572526, "grad_norm": 0.7705964805253543, "learning_rate": 5.482077785496794e-06, "loss": 0.0931, "step": 1474 }, { "epoch": 1.134293948126801, "grad_norm": 0.8165812210553237, "learning_rate": 5.4769137486058e-06, "loss": 0.1001, "step": 1476 }, { "epoch": 1.1358309317963498, "grad_norm": 0.8057592991161191, "learning_rate": 5.471743384124848e-06, "loss": 0.0943, "step": 1478 }, { "epoch": 1.1373679154658982, "grad_norm": 0.7946279160573669, "learning_rate": 5.466566708602939e-06, "loss": 0.0968, "step": 1480 }, { "epoch": 1.1389048991354467, "grad_norm": 0.7589266561686325, "learning_rate": 5.461383738609272e-06, "loss": 0.0919, "step": 1482 }, { "epoch": 1.1404418828049951, "grad_norm": 0.7750698355623663, "learning_rate": 5.456194490733194e-06, "loss": 0.0905, "step": 1484 }, { "epoch": 1.1419788664745436, "grad_norm": 0.7612447169194039, "learning_rate": 5.450998981584148e-06, "loss": 0.0876, "step": 1486 }, { "epoch": 1.1435158501440923, "grad_norm": 0.7659227044968493, "learning_rate": 5.445797227791616e-06, "loss": 0.0913, "step": 1488 }, { "epoch": 1.1450528338136408, "grad_norm": 0.8296191241219513, "learning_rate": 5.440589246005064e-06, "loss": 0.1023, "step": 1490 }, { "epoch": 1.1465898174831892, "grad_norm": 0.8332033205007243, "learning_rate": 5.4353750528938995e-06, "loss": 0.1018, "step": 1492 }, { "epoch": 1.1481268011527377, "grad_norm": 0.755633743693491, "learning_rate": 5.430154665147406e-06, "loss": 0.0982, "step": 1494 }, { "epoch": 1.1496637848222861, "grad_norm": 0.8294945642825525, "learning_rate": 5.424928099474694e-06, "loss": 0.101, "step": 1496 }, { "epoch": 1.1512007684918348, "grad_norm": 0.8705045376216894, "learning_rate": 5.419695372604652e-06, "loss": 0.0966, "step": 1498 }, { "epoch": 1.1527377521613833, "grad_norm": 0.8386991034461505, "learning_rate": 5.414456501285883e-06, "loss": 0.0948, "step": 1500 }, { "epoch": 1.1527377521613833, "eval_loss": 0.14147287607192993, "eval_runtime": 362.5666, "eval_samples_per_second": 51.039, "eval_steps_per_second": 6.382, "step": 1500 }, { "epoch": 1.1542747358309318, "grad_norm": 0.808895295948306, "learning_rate": 5.409211502286663e-06, "loss": 0.1023, "step": 1502 }, { "epoch": 1.1558117195004802, "grad_norm": 0.8129387791807478, "learning_rate": 5.403960392394877e-06, "loss": 0.0917, "step": 1504 }, { "epoch": 1.157348703170029, "grad_norm": 0.8709858446003281, "learning_rate": 5.398703188417971e-06, "loss": 0.1076, "step": 1506 }, { "epoch": 1.1588856868395774, "grad_norm": 0.7731592974500305, "learning_rate": 5.393439907182895e-06, "loss": 0.0932, "step": 1508 }, { "epoch": 1.1604226705091258, "grad_norm": 0.8193191652163263, "learning_rate": 5.388170565536052e-06, "loss": 0.087, "step": 1510 }, { "epoch": 1.1619596541786743, "grad_norm": 0.8821413504599133, "learning_rate": 5.382895180343243e-06, "loss": 0.1016, "step": 1512 }, { "epoch": 1.1634966378482228, "grad_norm": 0.8330757982422947, "learning_rate": 5.377613768489613e-06, "loss": 0.0902, "step": 1514 }, { "epoch": 1.1650336215177715, "grad_norm": 0.7755643958777008, "learning_rate": 5.372326346879597e-06, "loss": 0.0935, "step": 1516 }, { "epoch": 1.16657060518732, "grad_norm": 0.7798742092931141, "learning_rate": 5.367032932436863e-06, "loss": 0.0907, "step": 1518 }, { "epoch": 1.1681075888568684, "grad_norm": 0.7669670923992181, "learning_rate": 5.3617335421042644e-06, "loss": 0.1023, "step": 1520 }, { "epoch": 1.1696445725264168, "grad_norm": 0.8341959799418742, "learning_rate": 5.3564281928437785e-06, "loss": 0.1016, "step": 1522 }, { "epoch": 1.1711815561959655, "grad_norm": 0.7444403179993622, "learning_rate": 5.351116901636459e-06, "loss": 0.0895, "step": 1524 }, { "epoch": 1.172718539865514, "grad_norm": 0.788295757514909, "learning_rate": 5.3457996854823736e-06, "loss": 0.0955, "step": 1526 }, { "epoch": 1.1742555235350625, "grad_norm": 0.8581606629276641, "learning_rate": 5.340476561400559e-06, "loss": 0.0956, "step": 1528 }, { "epoch": 1.175792507204611, "grad_norm": 0.7680372748823804, "learning_rate": 5.33514754642896e-06, "loss": 0.0892, "step": 1530 }, { "epoch": 1.1773294908741594, "grad_norm": 0.8579631291966535, "learning_rate": 5.329812657624374e-06, "loss": 0.0947, "step": 1532 }, { "epoch": 1.178866474543708, "grad_norm": 0.8028770597268665, "learning_rate": 5.324471912062402e-06, "loss": 0.0919, "step": 1534 }, { "epoch": 1.1804034582132565, "grad_norm": 0.8274846559591892, "learning_rate": 5.319125326837392e-06, "loss": 0.0931, "step": 1536 }, { "epoch": 1.181940441882805, "grad_norm": 0.8099150871185563, "learning_rate": 5.3137729190623784e-06, "loss": 0.1059, "step": 1538 }, { "epoch": 1.1834774255523535, "grad_norm": 0.7888074507502317, "learning_rate": 5.308414705869037e-06, "loss": 0.0948, "step": 1540 }, { "epoch": 1.185014409221902, "grad_norm": 0.736344371575378, "learning_rate": 5.3030507044076244e-06, "loss": 0.0927, "step": 1542 }, { "epoch": 1.1865513928914506, "grad_norm": 0.8067822180532194, "learning_rate": 5.29768093184692e-06, "loss": 0.0916, "step": 1544 }, { "epoch": 1.188088376560999, "grad_norm": 0.8588531827838762, "learning_rate": 5.29230540537418e-06, "loss": 0.0962, "step": 1546 }, { "epoch": 1.1896253602305475, "grad_norm": 0.8183423934241679, "learning_rate": 5.286924142195075e-06, "loss": 0.0908, "step": 1548 }, { "epoch": 1.191162343900096, "grad_norm": 0.7893384100991108, "learning_rate": 5.2815371595336375e-06, "loss": 0.0949, "step": 1550 }, { "epoch": 1.1926993275696445, "grad_norm": 0.8369129068810596, "learning_rate": 5.2761444746322054e-06, "loss": 0.1004, "step": 1552 }, { "epoch": 1.1942363112391932, "grad_norm": 0.852519706321052, "learning_rate": 5.270746104751371e-06, "loss": 0.1071, "step": 1554 }, { "epoch": 1.1957732949087416, "grad_norm": 0.7067264280513246, "learning_rate": 5.265342067169921e-06, "loss": 0.0866, "step": 1556 }, { "epoch": 1.19731027857829, "grad_norm": 0.7807816607071811, "learning_rate": 5.259932379184782e-06, "loss": 0.0947, "step": 1558 }, { "epoch": 1.1988472622478386, "grad_norm": 0.7595283315156356, "learning_rate": 5.254517058110968e-06, "loss": 0.093, "step": 1560 }, { "epoch": 1.200384245917387, "grad_norm": 0.7355421808089894, "learning_rate": 5.249096121281521e-06, "loss": 0.0929, "step": 1562 }, { "epoch": 1.2019212295869357, "grad_norm": 0.8309945335729965, "learning_rate": 5.243669586047459e-06, "loss": 0.1038, "step": 1564 }, { "epoch": 1.2034582132564842, "grad_norm": 0.7783243017854818, "learning_rate": 5.238237469777719e-06, "loss": 0.0919, "step": 1566 }, { "epoch": 1.2049951969260326, "grad_norm": 0.7536478224038158, "learning_rate": 5.232799789859102e-06, "loss": 0.0832, "step": 1568 }, { "epoch": 1.206532180595581, "grad_norm": 0.8375220966676139, "learning_rate": 5.227356563696215e-06, "loss": 0.0933, "step": 1570 }, { "epoch": 1.2080691642651298, "grad_norm": 0.8168530200077441, "learning_rate": 5.221907808711418e-06, "loss": 0.0962, "step": 1572 }, { "epoch": 1.2096061479346782, "grad_norm": 0.7669312918946458, "learning_rate": 5.216453542344768e-06, "loss": 0.0876, "step": 1574 }, { "epoch": 1.2111431316042267, "grad_norm": 0.8257303860560026, "learning_rate": 5.210993782053961e-06, "loss": 0.0923, "step": 1576 }, { "epoch": 1.2126801152737752, "grad_norm": 0.8359416165543359, "learning_rate": 5.205528545314281e-06, "loss": 0.09, "step": 1578 }, { "epoch": 1.2142170989433236, "grad_norm": 0.8022199675695576, "learning_rate": 5.200057849618535e-06, "loss": 0.0928, "step": 1580 }, { "epoch": 1.2157540826128723, "grad_norm": 0.809397949668016, "learning_rate": 5.194581712477007e-06, "loss": 0.1011, "step": 1582 }, { "epoch": 1.2172910662824208, "grad_norm": 0.7620719694922747, "learning_rate": 5.1891001514173994e-06, "loss": 0.0947, "step": 1584 }, { "epoch": 1.2188280499519693, "grad_norm": 0.8748786606947648, "learning_rate": 5.183613183984768e-06, "loss": 0.1022, "step": 1586 }, { "epoch": 1.2203650336215177, "grad_norm": 0.8471054549064284, "learning_rate": 5.178120827741481e-06, "loss": 0.0994, "step": 1588 }, { "epoch": 1.2219020172910664, "grad_norm": 0.8328172662255555, "learning_rate": 5.172623100267148e-06, "loss": 0.09, "step": 1590 }, { "epoch": 1.2234390009606149, "grad_norm": 0.7751726026024895, "learning_rate": 5.167120019158578e-06, "loss": 0.0872, "step": 1592 }, { "epoch": 1.2249759846301633, "grad_norm": 0.8122210518131329, "learning_rate": 5.1616116020297065e-06, "loss": 0.0901, "step": 1594 }, { "epoch": 1.2265129682997118, "grad_norm": 0.7575239070267672, "learning_rate": 5.1560978665115555e-06, "loss": 0.0911, "step": 1596 }, { "epoch": 1.2280499519692603, "grad_norm": 0.7802284165444789, "learning_rate": 5.150578830252168e-06, "loss": 0.0943, "step": 1598 }, { "epoch": 1.229586935638809, "grad_norm": 0.8020724802994549, "learning_rate": 5.145054510916552e-06, "loss": 0.0861, "step": 1600 }, { "epoch": 1.2311239193083574, "grad_norm": 0.9021178290335383, "learning_rate": 5.139524926186624e-06, "loss": 0.1049, "step": 1602 }, { "epoch": 1.2326609029779059, "grad_norm": 0.7954364435694474, "learning_rate": 5.133990093761158e-06, "loss": 0.0975, "step": 1604 }, { "epoch": 1.2341978866474543, "grad_norm": 0.862849089104852, "learning_rate": 5.1284500313557214e-06, "loss": 0.1001, "step": 1606 }, { "epoch": 1.2357348703170028, "grad_norm": 0.8234510165839098, "learning_rate": 5.122904756702622e-06, "loss": 0.096, "step": 1608 }, { "epoch": 1.2372718539865515, "grad_norm": 0.8180910858318627, "learning_rate": 5.1173542875508495e-06, "loss": 0.0929, "step": 1610 }, { "epoch": 1.2388088376561, "grad_norm": 0.8254978234033816, "learning_rate": 5.111798641666022e-06, "loss": 0.0986, "step": 1612 }, { "epoch": 1.2403458213256484, "grad_norm": 0.7970300036804161, "learning_rate": 5.1062378368303286e-06, "loss": 0.0968, "step": 1614 }, { "epoch": 1.2418828049951969, "grad_norm": 0.7942368711186357, "learning_rate": 5.100671890842464e-06, "loss": 0.0946, "step": 1616 }, { "epoch": 1.2434197886647453, "grad_norm": 0.781461470784258, "learning_rate": 5.095100821517586e-06, "loss": 0.0909, "step": 1618 }, { "epoch": 1.244956772334294, "grad_norm": 0.7637849604119566, "learning_rate": 5.089524646687245e-06, "loss": 0.089, "step": 1620 }, { "epoch": 1.2464937560038425, "grad_norm": 0.8322634659167238, "learning_rate": 5.083943384199339e-06, "loss": 0.0946, "step": 1622 }, { "epoch": 1.248030739673391, "grad_norm": 0.8673983066924847, "learning_rate": 5.078357051918042e-06, "loss": 0.0997, "step": 1624 }, { "epoch": 1.2495677233429394, "grad_norm": 0.8093921155750359, "learning_rate": 5.072765667723763e-06, "loss": 0.0924, "step": 1626 }, { "epoch": 1.2511047070124879, "grad_norm": 0.7682990135468413, "learning_rate": 5.067169249513078e-06, "loss": 0.0908, "step": 1628 }, { "epoch": 1.2526416906820366, "grad_norm": 0.9219480609910328, "learning_rate": 5.061567815198674e-06, "loss": 0.0992, "step": 1630 }, { "epoch": 1.254178674351585, "grad_norm": 0.7864964134564114, "learning_rate": 5.055961382709294e-06, "loss": 0.0856, "step": 1632 }, { "epoch": 1.2557156580211335, "grad_norm": 0.789369364367871, "learning_rate": 5.05034996998968e-06, "loss": 0.101, "step": 1634 }, { "epoch": 1.257252641690682, "grad_norm": 0.785618855267339, "learning_rate": 5.044733595000514e-06, "loss": 0.0913, "step": 1636 }, { "epoch": 1.2587896253602304, "grad_norm": 0.8540857446211212, "learning_rate": 5.0391122757183605e-06, "loss": 0.1008, "step": 1638 }, { "epoch": 1.260326609029779, "grad_norm": 0.8815323610676825, "learning_rate": 5.03348603013561e-06, "loss": 0.0967, "step": 1640 }, { "epoch": 1.2618635926993276, "grad_norm": 0.8589693358904777, "learning_rate": 5.02785487626042e-06, "loss": 0.0979, "step": 1642 }, { "epoch": 1.263400576368876, "grad_norm": 0.8415206231916414, "learning_rate": 5.022218832116659e-06, "loss": 0.1038, "step": 1644 }, { "epoch": 1.2649375600384247, "grad_norm": 0.8392847365814639, "learning_rate": 5.016577915743848e-06, "loss": 0.0999, "step": 1646 }, { "epoch": 1.266474543707973, "grad_norm": 0.840680092317177, "learning_rate": 5.010932145197101e-06, "loss": 0.0959, "step": 1648 }, { "epoch": 1.2680115273775217, "grad_norm": 0.8176605216769884, "learning_rate": 5.005281538547071e-06, "loss": 0.0968, "step": 1650 }, { "epoch": 1.2695485110470701, "grad_norm": 0.7636164672938492, "learning_rate": 4.999626113879891e-06, "loss": 0.0855, "step": 1652 }, { "epoch": 1.2710854947166186, "grad_norm": 0.8820138872988845, "learning_rate": 4.9939658892971106e-06, "loss": 0.0963, "step": 1654 }, { "epoch": 1.2726224783861673, "grad_norm": 0.8230226501954376, "learning_rate": 4.9883008829156475e-06, "loss": 0.0908, "step": 1656 }, { "epoch": 1.2741594620557157, "grad_norm": 0.7954482370699276, "learning_rate": 4.982631112867724e-06, "loss": 0.0949, "step": 1658 }, { "epoch": 1.2756964457252642, "grad_norm": 0.7446617995311455, "learning_rate": 4.976956597300806e-06, "loss": 0.0922, "step": 1660 }, { "epoch": 1.2772334293948127, "grad_norm": 0.8316466452619663, "learning_rate": 4.971277354377554e-06, "loss": 0.0961, "step": 1662 }, { "epoch": 1.2787704130643611, "grad_norm": 0.8717333357230952, "learning_rate": 4.965593402275754e-06, "loss": 0.0989, "step": 1664 }, { "epoch": 1.2803073967339098, "grad_norm": 0.8635438600294546, "learning_rate": 4.959904759188271e-06, "loss": 0.0965, "step": 1666 }, { "epoch": 1.2818443804034583, "grad_norm": 0.8348320154619787, "learning_rate": 4.954211443322978e-06, "loss": 0.0982, "step": 1668 }, { "epoch": 1.2833813640730067, "grad_norm": 0.7587760801820468, "learning_rate": 4.948513472902709e-06, "loss": 0.0894, "step": 1670 }, { "epoch": 1.2849183477425552, "grad_norm": 0.823101148418266, "learning_rate": 4.942810866165194e-06, "loss": 0.1011, "step": 1672 }, { "epoch": 1.2864553314121037, "grad_norm": 0.8615662400758028, "learning_rate": 4.937103641363004e-06, "loss": 0.0965, "step": 1674 }, { "epoch": 1.2879923150816524, "grad_norm": 0.7508304626813055, "learning_rate": 4.931391816763491e-06, "loss": 0.0902, "step": 1676 }, { "epoch": 1.2895292987512008, "grad_norm": 0.7605099804332683, "learning_rate": 4.925675410648728e-06, "loss": 0.0957, "step": 1678 }, { "epoch": 1.2910662824207493, "grad_norm": 0.8276765994390878, "learning_rate": 4.919954441315453e-06, "loss": 0.0928, "step": 1680 }, { "epoch": 1.2926032660902977, "grad_norm": 0.8394305545603477, "learning_rate": 4.914228927075014e-06, "loss": 0.0933, "step": 1682 }, { "epoch": 1.2941402497598462, "grad_norm": 0.8228704133694662, "learning_rate": 4.908498886253298e-06, "loss": 0.0954, "step": 1684 }, { "epoch": 1.295677233429395, "grad_norm": 0.8428554748260062, "learning_rate": 4.902764337190685e-06, "loss": 0.101, "step": 1686 }, { "epoch": 1.2972142170989434, "grad_norm": 0.8148117938502604, "learning_rate": 4.897025298241987e-06, "loss": 0.0939, "step": 1688 }, { "epoch": 1.2987512007684918, "grad_norm": 0.7520640222305418, "learning_rate": 4.891281787776383e-06, "loss": 0.0897, "step": 1690 }, { "epoch": 1.3002881844380403, "grad_norm": 0.7757726426358977, "learning_rate": 4.885533824177365e-06, "loss": 0.0907, "step": 1692 }, { "epoch": 1.3018251681075887, "grad_norm": 0.8281519655081531, "learning_rate": 4.87978142584268e-06, "loss": 0.102, "step": 1694 }, { "epoch": 1.3033621517771374, "grad_norm": 0.845824623898633, "learning_rate": 4.874024611184266e-06, "loss": 0.0958, "step": 1696 }, { "epoch": 1.304899135446686, "grad_norm": 0.8422886597995062, "learning_rate": 4.868263398628203e-06, "loss": 0.0944, "step": 1698 }, { "epoch": 1.3064361191162344, "grad_norm": 0.8314043674269285, "learning_rate": 4.86249780661464e-06, "loss": 0.1048, "step": 1700 }, { "epoch": 1.3079731027857828, "grad_norm": 0.8629695831852081, "learning_rate": 4.8567278535977475e-06, "loss": 0.1105, "step": 1702 }, { "epoch": 1.3095100864553313, "grad_norm": 0.8379948759443979, "learning_rate": 4.850953558045653e-06, "loss": 0.0973, "step": 1704 }, { "epoch": 1.31104707012488, "grad_norm": 0.7985221017745798, "learning_rate": 4.845174938440386e-06, "loss": 0.0994, "step": 1706 }, { "epoch": 1.3125840537944284, "grad_norm": 0.8271445315963422, "learning_rate": 4.8393920132778144e-06, "loss": 0.089, "step": 1708 }, { "epoch": 1.314121037463977, "grad_norm": 0.8199327418927324, "learning_rate": 4.833604801067585e-06, "loss": 0.0979, "step": 1710 }, { "epoch": 1.3156580211335256, "grad_norm": 0.7802159907934838, "learning_rate": 4.827813320333071e-06, "loss": 0.0891, "step": 1712 }, { "epoch": 1.3171950048030738, "grad_norm": 0.8828098223970235, "learning_rate": 4.822017589611302e-06, "loss": 0.0968, "step": 1714 }, { "epoch": 1.3187319884726225, "grad_norm": 0.7743361926366948, "learning_rate": 4.816217627452917e-06, "loss": 0.0901, "step": 1716 }, { "epoch": 1.320268972142171, "grad_norm": 0.8452670586980338, "learning_rate": 4.810413452422094e-06, "loss": 0.0923, "step": 1718 }, { "epoch": 1.3218059558117194, "grad_norm": 0.8330276027385624, "learning_rate": 4.804605083096499e-06, "loss": 0.0952, "step": 1720 }, { "epoch": 1.3233429394812681, "grad_norm": 0.805109828914744, "learning_rate": 4.798792538067218e-06, "loss": 0.0946, "step": 1722 }, { "epoch": 1.3248799231508166, "grad_norm": 0.8508784027134717, "learning_rate": 4.792975835938709e-06, "loss": 0.0957, "step": 1724 }, { "epoch": 1.326416906820365, "grad_norm": 0.7937254392396291, "learning_rate": 4.787154995328729e-06, "loss": 0.0948, "step": 1726 }, { "epoch": 1.3279538904899135, "grad_norm": 0.7723472021275456, "learning_rate": 4.781330034868287e-06, "loss": 0.0954, "step": 1728 }, { "epoch": 1.329490874159462, "grad_norm": 0.8147194013177234, "learning_rate": 4.775500973201573e-06, "loss": 0.0994, "step": 1730 }, { "epoch": 1.3310278578290107, "grad_norm": 0.7865073470553021, "learning_rate": 4.76966782898591e-06, "loss": 0.0908, "step": 1732 }, { "epoch": 1.3325648414985591, "grad_norm": 0.8909496268629883, "learning_rate": 4.763830620891682e-06, "loss": 0.0994, "step": 1734 }, { "epoch": 1.3341018251681076, "grad_norm": 0.8439047859296126, "learning_rate": 4.757989367602286e-06, "loss": 0.0948, "step": 1736 }, { "epoch": 1.335638808837656, "grad_norm": 0.8146110101810443, "learning_rate": 4.752144087814062e-06, "loss": 0.0932, "step": 1738 }, { "epoch": 1.3371757925072045, "grad_norm": 0.8639874055710914, "learning_rate": 4.746294800236241e-06, "loss": 0.103, "step": 1740 }, { "epoch": 1.3387127761767532, "grad_norm": 0.8343222963762533, "learning_rate": 4.740441523590881e-06, "loss": 0.0933, "step": 1742 }, { "epoch": 1.3402497598463017, "grad_norm": 0.9191398577485145, "learning_rate": 4.734584276612807e-06, "loss": 0.0983, "step": 1744 }, { "epoch": 1.3417867435158501, "grad_norm": 0.8075250938783436, "learning_rate": 4.7287230780495525e-06, "loss": 0.0958, "step": 1746 }, { "epoch": 1.3433237271853986, "grad_norm": 0.8741221440894832, "learning_rate": 4.722857946661299e-06, "loss": 0.0997, "step": 1748 }, { "epoch": 1.344860710854947, "grad_norm": 0.8738624905786072, "learning_rate": 4.7169889012208174e-06, "loss": 0.1046, "step": 1750 }, { "epoch": 1.3463976945244958, "grad_norm": 0.8003724652951454, "learning_rate": 4.711115960513405e-06, "loss": 0.099, "step": 1752 }, { "epoch": 1.3479346781940442, "grad_norm": 0.7908228630871454, "learning_rate": 4.705239143336827e-06, "loss": 0.0991, "step": 1754 }, { "epoch": 1.3494716618635927, "grad_norm": 0.7736523132228244, "learning_rate": 4.6993584685012554e-06, "loss": 0.1028, "step": 1756 }, { "epoch": 1.3510086455331412, "grad_norm": 0.8141238639955589, "learning_rate": 4.693473954829211e-06, "loss": 0.0954, "step": 1758 }, { "epoch": 1.3525456292026896, "grad_norm": 0.7526341018348459, "learning_rate": 4.687585621155502e-06, "loss": 0.0905, "step": 1760 }, { "epoch": 1.3540826128722383, "grad_norm": 0.776573467548187, "learning_rate": 4.68169348632716e-06, "loss": 0.098, "step": 1762 }, { "epoch": 1.3556195965417868, "grad_norm": 0.8263354203916096, "learning_rate": 4.675797569203389e-06, "loss": 0.1029, "step": 1764 }, { "epoch": 1.3571565802113352, "grad_norm": 0.7840394777513143, "learning_rate": 4.669897888655494e-06, "loss": 0.0938, "step": 1766 }, { "epoch": 1.3586935638808837, "grad_norm": 0.8026477022187467, "learning_rate": 4.663994463566828e-06, "loss": 0.0972, "step": 1768 }, { "epoch": 1.3602305475504322, "grad_norm": 0.7984909693964071, "learning_rate": 4.658087312832729e-06, "loss": 0.0934, "step": 1770 }, { "epoch": 1.3617675312199808, "grad_norm": 0.8445059001868617, "learning_rate": 4.652176455360459e-06, "loss": 0.0969, "step": 1772 }, { "epoch": 1.3633045148895293, "grad_norm": 0.8508994597739256, "learning_rate": 4.646261910069147e-06, "loss": 0.0895, "step": 1774 }, { "epoch": 1.3648414985590778, "grad_norm": 0.7381165146693729, "learning_rate": 4.640343695889721e-06, "loss": 0.0893, "step": 1776 }, { "epoch": 1.3663784822286265, "grad_norm": 0.7966384986609926, "learning_rate": 4.634421831764857e-06, "loss": 0.0992, "step": 1778 }, { "epoch": 1.3679154658981747, "grad_norm": 0.7322872414451935, "learning_rate": 4.628496336648911e-06, "loss": 0.0868, "step": 1780 }, { "epoch": 1.3694524495677234, "grad_norm": 0.7490827880894491, "learning_rate": 4.6225672295078615e-06, "loss": 0.0898, "step": 1782 }, { "epoch": 1.3709894332372718, "grad_norm": 0.8010232538915442, "learning_rate": 4.616634529319249e-06, "loss": 0.0917, "step": 1784 }, { "epoch": 1.3725264169068203, "grad_norm": 0.8095923768198692, "learning_rate": 4.61069825507211e-06, "loss": 0.092, "step": 1786 }, { "epoch": 1.374063400576369, "grad_norm": 0.7861781833880861, "learning_rate": 4.604758425766928e-06, "loss": 0.0891, "step": 1788 }, { "epoch": 1.3756003842459175, "grad_norm": 0.8541796594703853, "learning_rate": 4.5988150604155585e-06, "loss": 0.0962, "step": 1790 }, { "epoch": 1.377137367915466, "grad_norm": 0.8065855822446965, "learning_rate": 4.592868178041181e-06, "loss": 0.0976, "step": 1792 }, { "epoch": 1.3786743515850144, "grad_norm": 0.8147810473519597, "learning_rate": 4.586917797678225e-06, "loss": 0.0931, "step": 1794 }, { "epoch": 1.3802113352545629, "grad_norm": 0.7931514880597085, "learning_rate": 4.5809639383723245e-06, "loss": 0.0952, "step": 1796 }, { "epoch": 1.3817483189241115, "grad_norm": 0.8182579050054116, "learning_rate": 4.57500661918024e-06, "loss": 0.0971, "step": 1798 }, { "epoch": 1.38328530259366, "grad_norm": 0.7690232401474136, "learning_rate": 4.569045859169814e-06, "loss": 0.083, "step": 1800 }, { "epoch": 1.3848222862632085, "grad_norm": 0.7899387190866939, "learning_rate": 4.563081677419897e-06, "loss": 0.0908, "step": 1802 }, { "epoch": 1.386359269932757, "grad_norm": 0.8326942252427081, "learning_rate": 4.557114093020294e-06, "loss": 0.1004, "step": 1804 }, { "epoch": 1.3878962536023054, "grad_norm": 0.868404572818935, "learning_rate": 4.551143125071698e-06, "loss": 0.0989, "step": 1806 }, { "epoch": 1.389433237271854, "grad_norm": 0.7829935600541224, "learning_rate": 4.545168792685637e-06, "loss": 0.0904, "step": 1808 }, { "epoch": 1.3909702209414025, "grad_norm": 0.8030113132084329, "learning_rate": 4.539191114984403e-06, "loss": 0.0948, "step": 1810 }, { "epoch": 1.392507204610951, "grad_norm": 0.8322013967452164, "learning_rate": 4.533210111101e-06, "loss": 0.0889, "step": 1812 }, { "epoch": 1.3940441882804995, "grad_norm": 0.854566482621519, "learning_rate": 4.527225800179074e-06, "loss": 0.0926, "step": 1814 }, { "epoch": 1.395581171950048, "grad_norm": 0.8546853000055383, "learning_rate": 4.521238201372854e-06, "loss": 0.0985, "step": 1816 }, { "epoch": 1.3971181556195966, "grad_norm": 0.7861004953262382, "learning_rate": 4.5152473338471025e-06, "loss": 0.0842, "step": 1818 }, { "epoch": 1.398655139289145, "grad_norm": 0.8279303392404915, "learning_rate": 4.509253216777034e-06, "loss": 0.093, "step": 1820 }, { "epoch": 1.4001921229586936, "grad_norm": 0.7776536059593128, "learning_rate": 4.50325586934827e-06, "loss": 0.0892, "step": 1822 }, { "epoch": 1.401729106628242, "grad_norm": 0.823356682996185, "learning_rate": 4.497255310756767e-06, "loss": 0.0873, "step": 1824 }, { "epoch": 1.4032660902977905, "grad_norm": 0.781617011211452, "learning_rate": 4.491251560208766e-06, "loss": 0.0962, "step": 1826 }, { "epoch": 1.4048030739673392, "grad_norm": 0.8182872551902259, "learning_rate": 4.485244636920716e-06, "loss": 0.09, "step": 1828 }, { "epoch": 1.4063400576368876, "grad_norm": 0.7732731640678324, "learning_rate": 4.479234560119231e-06, "loss": 0.0909, "step": 1830 }, { "epoch": 1.407877041306436, "grad_norm": 0.8358121488236012, "learning_rate": 4.473221349041009e-06, "loss": 0.0945, "step": 1832 }, { "epoch": 1.4094140249759846, "grad_norm": 0.783898477086607, "learning_rate": 4.467205022932788e-06, "loss": 0.0868, "step": 1834 }, { "epoch": 1.410951008645533, "grad_norm": 0.8347502314619144, "learning_rate": 4.4611856010512696e-06, "loss": 0.0859, "step": 1836 }, { "epoch": 1.4124879923150817, "grad_norm": 0.8029436387846094, "learning_rate": 4.455163102663071e-06, "loss": 0.0986, "step": 1838 }, { "epoch": 1.4140249759846302, "grad_norm": 0.780720619644275, "learning_rate": 4.449137547044651e-06, "loss": 0.092, "step": 1840 }, { "epoch": 1.4155619596541786, "grad_norm": 0.7647611604463465, "learning_rate": 4.443108953482255e-06, "loss": 0.0843, "step": 1842 }, { "epoch": 1.4170989433237273, "grad_norm": 0.7569931424177674, "learning_rate": 4.437077341271854e-06, "loss": 0.1, "step": 1844 }, { "epoch": 1.4186359269932756, "grad_norm": 0.780426436469334, "learning_rate": 4.431042729719081e-06, "loss": 0.0922, "step": 1846 }, { "epoch": 1.4201729106628243, "grad_norm": 0.8266309005493522, "learning_rate": 4.425005138139165e-06, "loss": 0.091, "step": 1848 }, { "epoch": 1.4217098943323727, "grad_norm": 0.7919591525646804, "learning_rate": 4.418964585856878e-06, "loss": 0.0914, "step": 1850 }, { "epoch": 1.4232468780019212, "grad_norm": 0.8676995223573973, "learning_rate": 4.4129210922064655e-06, "loss": 0.0933, "step": 1852 }, { "epoch": 1.4247838616714699, "grad_norm": 0.8245651321649498, "learning_rate": 4.406874676531591e-06, "loss": 0.099, "step": 1854 }, { "epoch": 1.4263208453410183, "grad_norm": 0.8084380870931988, "learning_rate": 4.400825358185267e-06, "loss": 0.094, "step": 1856 }, { "epoch": 1.4278578290105668, "grad_norm": 0.8383642555183834, "learning_rate": 4.394773156529796e-06, "loss": 0.1035, "step": 1858 }, { "epoch": 1.4293948126801153, "grad_norm": 0.8496971372491992, "learning_rate": 4.388718090936714e-06, "loss": 0.0961, "step": 1860 }, { "epoch": 1.4309317963496637, "grad_norm": 0.778441978283022, "learning_rate": 4.38266018078672e-06, "loss": 0.0903, "step": 1862 }, { "epoch": 1.4324687800192124, "grad_norm": 0.8399196603924103, "learning_rate": 4.376599445469619e-06, "loss": 0.0951, "step": 1864 }, { "epoch": 1.4340057636887609, "grad_norm": 0.8277510344863601, "learning_rate": 4.370535904384257e-06, "loss": 0.0947, "step": 1866 }, { "epoch": 1.4355427473583093, "grad_norm": 0.8546854576792136, "learning_rate": 4.3644695769384634e-06, "loss": 0.0978, "step": 1868 }, { "epoch": 1.4370797310278578, "grad_norm": 0.8038219716990682, "learning_rate": 4.358400482548984e-06, "loss": 0.0901, "step": 1870 }, { "epoch": 1.4386167146974063, "grad_norm": 0.8079184472772435, "learning_rate": 4.352328640641422e-06, "loss": 0.0885, "step": 1872 }, { "epoch": 1.440153698366955, "grad_norm": 0.7459154763602515, "learning_rate": 4.346254070650172e-06, "loss": 0.0886, "step": 1874 }, { "epoch": 1.4416906820365034, "grad_norm": 0.7909488135828993, "learning_rate": 4.340176792018365e-06, "loss": 0.0928, "step": 1876 }, { "epoch": 1.4432276657060519, "grad_norm": 0.8321374668536021, "learning_rate": 4.3340968241977975e-06, "loss": 0.0932, "step": 1878 }, { "epoch": 1.4447646493756003, "grad_norm": 0.7923523546601555, "learning_rate": 4.328014186648875e-06, "loss": 0.0867, "step": 1880 }, { "epoch": 1.4463016330451488, "grad_norm": 0.8123704722222743, "learning_rate": 4.321928898840549e-06, "loss": 0.0883, "step": 1882 }, { "epoch": 1.4478386167146975, "grad_norm": 0.8443684423046657, "learning_rate": 4.315840980250253e-06, "loss": 0.0941, "step": 1884 }, { "epoch": 1.449375600384246, "grad_norm": 0.8218259476991273, "learning_rate": 4.30975045036384e-06, "loss": 0.0934, "step": 1886 }, { "epoch": 1.4509125840537944, "grad_norm": 0.7796868880134117, "learning_rate": 4.3036573286755225e-06, "loss": 0.0904, "step": 1888 }, { "epoch": 1.4524495677233429, "grad_norm": 0.8382801166615858, "learning_rate": 4.297561634687809e-06, "loss": 0.0963, "step": 1890 }, { "epoch": 1.4539865513928913, "grad_norm": 0.7638917693205276, "learning_rate": 4.291463387911439e-06, "loss": 0.0846, "step": 1892 }, { "epoch": 1.45552353506244, "grad_norm": 0.8232905374055427, "learning_rate": 4.2853626078653255e-06, "loss": 0.0907, "step": 1894 }, { "epoch": 1.4570605187319885, "grad_norm": 0.8327882309952656, "learning_rate": 4.279259314076488e-06, "loss": 0.0924, "step": 1896 }, { "epoch": 1.458597502401537, "grad_norm": 0.8362322094432861, "learning_rate": 4.273153526079994e-06, "loss": 0.0961, "step": 1898 }, { "epoch": 1.4601344860710854, "grad_norm": 0.8371159641695654, "learning_rate": 4.2670452634188895e-06, "loss": 0.0857, "step": 1900 }, { "epoch": 1.4616714697406339, "grad_norm": 0.7922677007185823, "learning_rate": 4.260934545644148e-06, "loss": 0.0919, "step": 1902 }, { "epoch": 1.4632084534101826, "grad_norm": 0.7923726734910232, "learning_rate": 4.254821392314595e-06, "loss": 0.0903, "step": 1904 }, { "epoch": 1.464745437079731, "grad_norm": 0.7722229174216336, "learning_rate": 4.248705822996856e-06, "loss": 0.0879, "step": 1906 }, { "epoch": 1.4662824207492795, "grad_norm": 0.8218964345538021, "learning_rate": 4.242587857265288e-06, "loss": 0.0993, "step": 1908 }, { "epoch": 1.4678194044188282, "grad_norm": 0.8010254286113982, "learning_rate": 4.236467514701916e-06, "loss": 0.0897, "step": 1910 }, { "epoch": 1.4693563880883764, "grad_norm": 0.7744562817644429, "learning_rate": 4.230344814896378e-06, "loss": 0.0913, "step": 1912 }, { "epoch": 1.4708933717579251, "grad_norm": 0.8343016463549663, "learning_rate": 4.22421977744585e-06, "loss": 0.0993, "step": 1914 }, { "epoch": 1.4724303554274736, "grad_norm": 0.8688236869752541, "learning_rate": 4.2180924219549964e-06, "loss": 0.0997, "step": 1916 }, { "epoch": 1.473967339097022, "grad_norm": 0.7734307337086462, "learning_rate": 4.211962768035896e-06, "loss": 0.0984, "step": 1918 }, { "epoch": 1.4755043227665707, "grad_norm": 0.7679518244992084, "learning_rate": 4.205830835307989e-06, "loss": 0.0885, "step": 1920 }, { "epoch": 1.4770413064361192, "grad_norm": 0.7926045518052727, "learning_rate": 4.199696643398008e-06, "loss": 0.0914, "step": 1922 }, { "epoch": 1.4785782901056677, "grad_norm": 0.8283722584130274, "learning_rate": 4.193560211939913e-06, "loss": 0.094, "step": 1924 }, { "epoch": 1.4801152737752161, "grad_norm": 0.7250571779590265, "learning_rate": 4.1874215605748366e-06, "loss": 0.079, "step": 1926 }, { "epoch": 1.4816522574447646, "grad_norm": 0.8580770642845996, "learning_rate": 4.181280708951015e-06, "loss": 0.1018, "step": 1928 }, { "epoch": 1.4831892411143133, "grad_norm": 0.8728686047506982, "learning_rate": 4.175137676723726e-06, "loss": 0.0964, "step": 1930 }, { "epoch": 1.4847262247838617, "grad_norm": 0.7923177951847387, "learning_rate": 4.168992483555231e-06, "loss": 0.095, "step": 1932 }, { "epoch": 1.4862632084534102, "grad_norm": 0.8120910936666785, "learning_rate": 4.162845149114702e-06, "loss": 0.0969, "step": 1934 }, { "epoch": 1.4878001921229587, "grad_norm": 0.788706989115151, "learning_rate": 4.15669569307817e-06, "loss": 0.0885, "step": 1936 }, { "epoch": 1.4893371757925071, "grad_norm": 0.830625147311433, "learning_rate": 4.1505441351284526e-06, "loss": 0.0972, "step": 1938 }, { "epoch": 1.4908741594620558, "grad_norm": 0.8072145444813656, "learning_rate": 4.144390494955098e-06, "loss": 0.0887, "step": 1940 }, { "epoch": 1.4924111431316043, "grad_norm": 0.8139524053166517, "learning_rate": 4.138234792254317e-06, "loss": 0.0939, "step": 1942 }, { "epoch": 1.4939481268011527, "grad_norm": 0.7807279012822973, "learning_rate": 4.132077046728924e-06, "loss": 0.0804, "step": 1944 }, { "epoch": 1.4954851104707012, "grad_norm": 0.7814387788672251, "learning_rate": 4.125917278088269e-06, "loss": 0.0924, "step": 1946 }, { "epoch": 1.4970220941402497, "grad_norm": 0.884806793139695, "learning_rate": 4.1197555060481836e-06, "loss": 0.0977, "step": 1948 }, { "epoch": 1.4985590778097984, "grad_norm": 0.8315758503149716, "learning_rate": 4.1135917503309026e-06, "loss": 0.0895, "step": 1950 }, { "epoch": 1.5000960614793468, "grad_norm": 0.8674422252417301, "learning_rate": 4.107426030665016e-06, "loss": 0.0882, "step": 1952 }, { "epoch": 1.5016330451488953, "grad_norm": 0.8449293445755316, "learning_rate": 4.101258366785402e-06, "loss": 0.0967, "step": 1954 }, { "epoch": 1.5031700288184437, "grad_norm": 0.8359354740918613, "learning_rate": 4.095088778433156e-06, "loss": 0.0998, "step": 1956 }, { "epoch": 1.5047070124879922, "grad_norm": 0.8262628504512934, "learning_rate": 4.088917285355536e-06, "loss": 0.0945, "step": 1958 }, { "epoch": 1.506243996157541, "grad_norm": 0.7479486694048768, "learning_rate": 4.082743907305897e-06, "loss": 0.0943, "step": 1960 }, { "epoch": 1.5077809798270894, "grad_norm": 0.8436782180522755, "learning_rate": 4.076568664043625e-06, "loss": 0.1004, "step": 1962 }, { "epoch": 1.5093179634966378, "grad_norm": 0.7748345501977325, "learning_rate": 4.0703915753340804e-06, "loss": 0.0875, "step": 1964 }, { "epoch": 1.5108549471661865, "grad_norm": 0.8382516980717638, "learning_rate": 4.064212660948524e-06, "loss": 0.094, "step": 1966 }, { "epoch": 1.5123919308357348, "grad_norm": 0.7852255919020612, "learning_rate": 4.058031940664067e-06, "loss": 0.0912, "step": 1968 }, { "epoch": 1.5139289145052834, "grad_norm": 0.8541412693322231, "learning_rate": 4.051849434263595e-06, "loss": 0.0968, "step": 1970 }, { "epoch": 1.515465898174832, "grad_norm": 0.8000408705444431, "learning_rate": 4.0456651615357155e-06, "loss": 0.0874, "step": 1972 }, { "epoch": 1.5170028818443804, "grad_norm": 0.7870384108772879, "learning_rate": 4.039479142274686e-06, "loss": 0.089, "step": 1974 }, { "epoch": 1.518539865513929, "grad_norm": 0.820652722071049, "learning_rate": 4.033291396280355e-06, "loss": 0.0926, "step": 1976 }, { "epoch": 1.5200768491834773, "grad_norm": 0.8154568505994646, "learning_rate": 4.027101943358098e-06, "loss": 0.0969, "step": 1978 }, { "epoch": 1.521613832853026, "grad_norm": 0.8770625870074841, "learning_rate": 4.020910803318756e-06, "loss": 0.0993, "step": 1980 }, { "epoch": 1.5231508165225744, "grad_norm": 0.6758733981469482, "learning_rate": 4.014717995978565e-06, "loss": 0.081, "step": 1982 }, { "epoch": 1.524687800192123, "grad_norm": 0.8441728203990428, "learning_rate": 4.008523541159104e-06, "loss": 0.1009, "step": 1984 }, { "epoch": 1.5262247838616716, "grad_norm": 0.8110301274459378, "learning_rate": 4.002327458687218e-06, "loss": 0.0872, "step": 1986 }, { "epoch": 1.5277617675312198, "grad_norm": 0.8833546884887249, "learning_rate": 3.996129768394969e-06, "loss": 0.0982, "step": 1988 }, { "epoch": 1.5292987512007685, "grad_norm": 0.8265167766685495, "learning_rate": 3.989930490119561e-06, "loss": 0.0967, "step": 1990 }, { "epoch": 1.530835734870317, "grad_norm": 0.8707509834765963, "learning_rate": 3.98372964370328e-06, "loss": 0.0956, "step": 1992 }, { "epoch": 1.5323727185398655, "grad_norm": 0.7763758628665975, "learning_rate": 3.977527248993434e-06, "loss": 0.0867, "step": 1994 }, { "epoch": 1.5339097022094141, "grad_norm": 0.7789258378178834, "learning_rate": 3.9713233258422855e-06, "loss": 0.0878, "step": 1996 }, { "epoch": 1.5354466858789624, "grad_norm": 0.7521669320980235, "learning_rate": 3.965117894106988e-06, "loss": 0.0888, "step": 1998 }, { "epoch": 1.536983669548511, "grad_norm": 0.8412076854628533, "learning_rate": 3.958910973649527e-06, "loss": 0.0846, "step": 2000 }, { "epoch": 1.536983669548511, "eval_loss": 0.1369212120771408, "eval_runtime": 362.0502, "eval_samples_per_second": 51.112, "eval_steps_per_second": 6.391, "step": 2000 }, { "epoch": 1.5385206532180595, "grad_norm": 0.7902219827328915, "learning_rate": 3.952702584336648e-06, "loss": 0.0984, "step": 2002 }, { "epoch": 1.540057636887608, "grad_norm": 0.7801431687989913, "learning_rate": 3.9464927460398e-06, "loss": 0.0979, "step": 2004 }, { "epoch": 1.5415946205571567, "grad_norm": 0.8810086083716847, "learning_rate": 3.940281478635074e-06, "loss": 0.0916, "step": 2006 }, { "epoch": 1.5431316042267051, "grad_norm": 0.8496438323924863, "learning_rate": 3.934068802003129e-06, "loss": 0.0943, "step": 2008 }, { "epoch": 1.5446685878962536, "grad_norm": 0.8624052138866306, "learning_rate": 3.927854736029138e-06, "loss": 0.0941, "step": 2010 }, { "epoch": 1.546205571565802, "grad_norm": 0.803691628281103, "learning_rate": 3.921639300602719e-06, "loss": 0.0964, "step": 2012 }, { "epoch": 1.5477425552353505, "grad_norm": 0.7838512381219084, "learning_rate": 3.915422515617876e-06, "loss": 0.0877, "step": 2014 }, { "epoch": 1.5492795389048992, "grad_norm": 0.8154032044869861, "learning_rate": 3.90920440097293e-06, "loss": 0.0898, "step": 2016 }, { "epoch": 1.5508165225744477, "grad_norm": 0.7774689436181877, "learning_rate": 3.902984976570459e-06, "loss": 0.0942, "step": 2018 }, { "epoch": 1.5523535062439962, "grad_norm": 0.7714965765671886, "learning_rate": 3.896764262317232e-06, "loss": 0.0882, "step": 2020 }, { "epoch": 1.5538904899135446, "grad_norm": 0.8482244169935028, "learning_rate": 3.890542278124151e-06, "loss": 0.092, "step": 2022 }, { "epoch": 1.555427473583093, "grad_norm": 0.8089857501256293, "learning_rate": 3.884319043906175e-06, "loss": 0.0885, "step": 2024 }, { "epoch": 1.5569644572526418, "grad_norm": 0.7732451008458415, "learning_rate": 3.878094579582271e-06, "loss": 0.0852, "step": 2026 }, { "epoch": 1.5585014409221902, "grad_norm": 0.8485314140958313, "learning_rate": 3.871868905075339e-06, "loss": 0.097, "step": 2028 }, { "epoch": 1.5600384245917387, "grad_norm": 0.7295949942508037, "learning_rate": 3.865642040312155e-06, "loss": 0.0765, "step": 2030 }, { "epoch": 1.5615754082612874, "grad_norm": 0.8109480864302396, "learning_rate": 3.859414005223303e-06, "loss": 0.0879, "step": 2032 }, { "epoch": 1.5631123919308356, "grad_norm": 0.8308781255627026, "learning_rate": 3.8531848197431155e-06, "loss": 0.0957, "step": 2034 }, { "epoch": 1.5646493756003843, "grad_norm": 0.8436684145142693, "learning_rate": 3.846954503809602e-06, "loss": 0.0941, "step": 2036 }, { "epoch": 1.5661863592699328, "grad_norm": 0.7993009937419601, "learning_rate": 3.840723077364396e-06, "loss": 0.0907, "step": 2038 }, { "epoch": 1.5677233429394812, "grad_norm": 0.7917608535459787, "learning_rate": 3.834490560352682e-06, "loss": 0.0946, "step": 2040 }, { "epoch": 1.56926032660903, "grad_norm": 0.775435052389592, "learning_rate": 3.828256972723137e-06, "loss": 0.0866, "step": 2042 }, { "epoch": 1.5707973102785782, "grad_norm": 0.8417542619925643, "learning_rate": 3.822022334427863e-06, "loss": 0.0956, "step": 2044 }, { "epoch": 1.5723342939481268, "grad_norm": 0.7534098817985344, "learning_rate": 3.815786665422326e-06, "loss": 0.0907, "step": 2046 }, { "epoch": 1.5738712776176753, "grad_norm": 0.7454391500744776, "learning_rate": 3.8095499856652907e-06, "loss": 0.0942, "step": 2048 }, { "epoch": 1.5754082612872238, "grad_norm": 0.8059844013874078, "learning_rate": 3.803312315118758e-06, "loss": 0.0833, "step": 2050 }, { "epoch": 1.5769452449567725, "grad_norm": 0.831173234796314, "learning_rate": 3.7970736737478976e-06, "loss": 0.0937, "step": 2052 }, { "epoch": 1.5784822286263207, "grad_norm": 0.8319311638041307, "learning_rate": 3.790834081520988e-06, "loss": 0.0941, "step": 2054 }, { "epoch": 1.5800192122958694, "grad_norm": 0.8017768441792715, "learning_rate": 3.7845935584093535e-06, "loss": 0.088, "step": 2056 }, { "epoch": 1.5815561959654179, "grad_norm": 0.8146738006411376, "learning_rate": 3.7783521243872945e-06, "loss": 0.0929, "step": 2058 }, { "epoch": 1.5830931796349663, "grad_norm": 0.7862299852946094, "learning_rate": 3.7721097994320274e-06, "loss": 0.088, "step": 2060 }, { "epoch": 1.584630163304515, "grad_norm": 0.7644295350251104, "learning_rate": 3.765866603523621e-06, "loss": 0.0971, "step": 2062 }, { "epoch": 1.5861671469740632, "grad_norm": 0.8145450168514264, "learning_rate": 3.7596225566449326e-06, "loss": 0.0902, "step": 2064 }, { "epoch": 1.587704130643612, "grad_norm": 0.8269635685870652, "learning_rate": 3.753377678781543e-06, "loss": 0.0923, "step": 2066 }, { "epoch": 1.5892411143131604, "grad_norm": 0.7490940289331177, "learning_rate": 3.7471319899216904e-06, "loss": 0.0779, "step": 2068 }, { "epoch": 1.5907780979827089, "grad_norm": 0.7723884968412873, "learning_rate": 3.7408855100562114e-06, "loss": 0.0901, "step": 2070 }, { "epoch": 1.5923150816522575, "grad_norm": 0.8319441486290872, "learning_rate": 3.7346382591784747e-06, "loss": 0.0956, "step": 2072 }, { "epoch": 1.593852065321806, "grad_norm": 0.8690168846750308, "learning_rate": 3.728390257284314e-06, "loss": 0.0953, "step": 2074 }, { "epoch": 1.5953890489913545, "grad_norm": 0.7909069572145911, "learning_rate": 3.722141524371969e-06, "loss": 0.086, "step": 2076 }, { "epoch": 1.596926032660903, "grad_norm": 0.8062074972797826, "learning_rate": 3.7158920804420203e-06, "loss": 0.0892, "step": 2078 }, { "epoch": 1.5984630163304514, "grad_norm": 0.7611492040694079, "learning_rate": 3.7096419454973193e-06, "loss": 0.0969, "step": 2080 }, { "epoch": 1.6, "grad_norm": 0.8268228564890713, "learning_rate": 3.703391139542937e-06, "loss": 0.0927, "step": 2082 }, { "epoch": 1.6015369836695486, "grad_norm": 0.7570539959602375, "learning_rate": 3.6971396825860836e-06, "loss": 0.0833, "step": 2084 }, { "epoch": 1.603073967339097, "grad_norm": 0.797065245492781, "learning_rate": 3.6908875946360597e-06, "loss": 0.0895, "step": 2086 }, { "epoch": 1.6046109510086455, "grad_norm": 0.8082111005522078, "learning_rate": 3.68463489570418e-06, "loss": 0.0891, "step": 2088 }, { "epoch": 1.606147934678194, "grad_norm": 0.8233331752704219, "learning_rate": 3.6783816058037215e-06, "loss": 0.0948, "step": 2090 }, { "epoch": 1.6076849183477426, "grad_norm": 0.8349017726953393, "learning_rate": 3.672127744949847e-06, "loss": 0.0908, "step": 2092 }, { "epoch": 1.609221902017291, "grad_norm": 0.7751955893816385, "learning_rate": 3.6658733331595493e-06, "loss": 0.0878, "step": 2094 }, { "epoch": 1.6107588856868396, "grad_norm": 0.806749618964119, "learning_rate": 3.6596183904515817e-06, "loss": 0.0908, "step": 2096 }, { "epoch": 1.6122958693563882, "grad_norm": 0.798020323683331, "learning_rate": 3.6533629368464026e-06, "loss": 0.085, "step": 2098 }, { "epoch": 1.6138328530259365, "grad_norm": 0.8845913837287194, "learning_rate": 3.6471069923661e-06, "loss": 0.0923, "step": 2100 }, { "epoch": 1.6153698366954852, "grad_norm": 0.7975937926027286, "learning_rate": 3.6408505770343365e-06, "loss": 0.0945, "step": 2102 }, { "epoch": 1.6169068203650336, "grad_norm": 0.7977338905828745, "learning_rate": 3.6345937108762803e-06, "loss": 0.0886, "step": 2104 }, { "epoch": 1.618443804034582, "grad_norm": 0.887493292191394, "learning_rate": 3.628336413918541e-06, "loss": 0.0909, "step": 2106 }, { "epoch": 1.6199807877041308, "grad_norm": 0.7939745654491456, "learning_rate": 3.622078706189112e-06, "loss": 0.0929, "step": 2108 }, { "epoch": 1.621517771373679, "grad_norm": 0.8183822731345303, "learning_rate": 3.615820607717296e-06, "loss": 0.0858, "step": 2110 }, { "epoch": 1.6230547550432277, "grad_norm": 0.7807975084192331, "learning_rate": 3.6095621385336505e-06, "loss": 0.0957, "step": 2112 }, { "epoch": 1.6245917387127762, "grad_norm": 0.8063009489440389, "learning_rate": 3.6033033186699152e-06, "loss": 0.0919, "step": 2114 }, { "epoch": 1.6261287223823246, "grad_norm": 0.8112570440921344, "learning_rate": 3.597044168158958e-06, "loss": 0.0933, "step": 2116 }, { "epoch": 1.6276657060518733, "grad_norm": 0.8331881631759025, "learning_rate": 3.5907847070346997e-06, "loss": 0.0927, "step": 2118 }, { "epoch": 1.6292026897214216, "grad_norm": 0.7415764943928016, "learning_rate": 3.5845249553320595e-06, "loss": 0.0889, "step": 2120 }, { "epoch": 1.6307396733909703, "grad_norm": 0.8003668361045585, "learning_rate": 3.5782649330868817e-06, "loss": 0.0971, "step": 2122 }, { "epoch": 1.6322766570605187, "grad_norm": 0.7911658768866721, "learning_rate": 3.5720046603358823e-06, "loss": 0.0878, "step": 2124 }, { "epoch": 1.6338136407300672, "grad_norm": 0.8045681908855453, "learning_rate": 3.5657441571165754e-06, "loss": 0.0874, "step": 2126 }, { "epoch": 1.6353506243996159, "grad_norm": 0.893368127193528, "learning_rate": 3.5594834434672142e-06, "loss": 0.0904, "step": 2128 }, { "epoch": 1.6368876080691641, "grad_norm": 0.7865118382233219, "learning_rate": 3.553222539426724e-06, "loss": 0.0841, "step": 2130 }, { "epoch": 1.6384245917387128, "grad_norm": 0.7830147718405267, "learning_rate": 3.546961465034641e-06, "loss": 0.0936, "step": 2132 }, { "epoch": 1.6399615754082613, "grad_norm": 0.8530754022949045, "learning_rate": 3.5407002403310453e-06, "loss": 0.094, "step": 2134 }, { "epoch": 1.6414985590778097, "grad_norm": 0.7937086579061391, "learning_rate": 3.5344388853565013e-06, "loss": 0.0919, "step": 2136 }, { "epoch": 1.6430355427473584, "grad_norm": 0.8490157093915195, "learning_rate": 3.528177420151984e-06, "loss": 0.0978, "step": 2138 }, { "epoch": 1.6445725264169067, "grad_norm": 0.8109475329632754, "learning_rate": 3.521915864758829e-06, "loss": 0.0891, "step": 2140 }, { "epoch": 1.6461095100864553, "grad_norm": 0.7936184951568639, "learning_rate": 3.5156542392186554e-06, "loss": 0.0916, "step": 2142 }, { "epoch": 1.6476464937560038, "grad_norm": 0.916031078014482, "learning_rate": 3.5093925635733084e-06, "loss": 0.1031, "step": 2144 }, { "epoch": 1.6491834774255523, "grad_norm": 0.7265572887246778, "learning_rate": 3.503130857864792e-06, "loss": 0.0872, "step": 2146 }, { "epoch": 1.650720461095101, "grad_norm": 0.7277873762559617, "learning_rate": 3.496869142135209e-06, "loss": 0.086, "step": 2148 }, { "epoch": 1.6522574447646494, "grad_norm": 0.7780905988859983, "learning_rate": 3.4906074364266932e-06, "loss": 0.0887, "step": 2150 }, { "epoch": 1.6537944284341979, "grad_norm": 0.7713751406958493, "learning_rate": 3.4843457607813445e-06, "loss": 0.0871, "step": 2152 }, { "epoch": 1.6553314121037463, "grad_norm": 0.8650115956133952, "learning_rate": 3.478084135241171e-06, "loss": 0.0948, "step": 2154 }, { "epoch": 1.6568683957732948, "grad_norm": 0.8279315849380673, "learning_rate": 3.4718225798480157e-06, "loss": 0.0931, "step": 2156 }, { "epoch": 1.6584053794428435, "grad_norm": 0.8057279660058387, "learning_rate": 3.4655611146435003e-06, "loss": 0.0906, "step": 2158 }, { "epoch": 1.659942363112392, "grad_norm": 0.8002672116393091, "learning_rate": 3.459299759668954e-06, "loss": 0.0889, "step": 2160 }, { "epoch": 1.6614793467819404, "grad_norm": 0.7562188109651065, "learning_rate": 3.4530385349653597e-06, "loss": 0.0846, "step": 2162 }, { "epoch": 1.663016330451489, "grad_norm": 0.9071744451213971, "learning_rate": 3.4467774605732763e-06, "loss": 0.0975, "step": 2164 }, { "epoch": 1.6645533141210374, "grad_norm": 0.7996156080383284, "learning_rate": 3.440516556532787e-06, "loss": 0.0924, "step": 2166 }, { "epoch": 1.666090297790586, "grad_norm": 0.7950473833323731, "learning_rate": 3.434255842883424e-06, "loss": 0.0879, "step": 2168 }, { "epoch": 1.6676272814601345, "grad_norm": 0.7899806697862958, "learning_rate": 3.427995339664118e-06, "loss": 0.0937, "step": 2170 }, { "epoch": 1.669164265129683, "grad_norm": 0.7336053866465901, "learning_rate": 3.421735066913118e-06, "loss": 0.0835, "step": 2172 }, { "epoch": 1.6707012487992317, "grad_norm": 0.7728149688439113, "learning_rate": 3.415475044667942e-06, "loss": 0.0927, "step": 2174 }, { "epoch": 1.67223823246878, "grad_norm": 0.8305527642036483, "learning_rate": 3.4092152929653e-06, "loss": 0.0969, "step": 2176 }, { "epoch": 1.6737752161383286, "grad_norm": 0.773715213471355, "learning_rate": 3.4029558318410426e-06, "loss": 0.0937, "step": 2178 }, { "epoch": 1.675312199807877, "grad_norm": 0.8059092352832676, "learning_rate": 3.396696681330085e-06, "loss": 0.0863, "step": 2180 }, { "epoch": 1.6768491834774255, "grad_norm": 0.803662636009022, "learning_rate": 3.3904378614663507e-06, "loss": 0.084, "step": 2182 }, { "epoch": 1.6783861671469742, "grad_norm": 0.837705389217021, "learning_rate": 3.3841793922827035e-06, "loss": 0.0882, "step": 2184 }, { "epoch": 1.6799231508165224, "grad_norm": 0.7922764494195353, "learning_rate": 3.3779212938108883e-06, "loss": 0.0827, "step": 2186 }, { "epoch": 1.6814601344860711, "grad_norm": 0.7860318255345747, "learning_rate": 3.3716635860814593e-06, "loss": 0.0876, "step": 2188 }, { "epoch": 1.6829971181556196, "grad_norm": 0.8066504756055669, "learning_rate": 3.365406289123721e-06, "loss": 0.0938, "step": 2190 }, { "epoch": 1.684534101825168, "grad_norm": 0.7921416532988432, "learning_rate": 3.3591494229656634e-06, "loss": 0.0934, "step": 2192 }, { "epoch": 1.6860710854947167, "grad_norm": 0.745529905487878, "learning_rate": 3.3528930076339002e-06, "loss": 0.0813, "step": 2194 }, { "epoch": 1.687608069164265, "grad_norm": 0.832234842686226, "learning_rate": 3.346637063153598e-06, "loss": 0.09, "step": 2196 }, { "epoch": 1.6891450528338137, "grad_norm": 0.787715663482219, "learning_rate": 3.3403816095484177e-06, "loss": 0.0883, "step": 2198 }, { "epoch": 1.6906820365033621, "grad_norm": 0.8626434473168217, "learning_rate": 3.3341266668404514e-06, "loss": 0.1017, "step": 2200 }, { "epoch": 1.6922190201729106, "grad_norm": 0.804548679573353, "learning_rate": 3.3278722550501534e-06, "loss": 0.0894, "step": 2202 }, { "epoch": 1.6937560038424593, "grad_norm": 0.7757347097137762, "learning_rate": 3.3216183941962793e-06, "loss": 0.0841, "step": 2204 }, { "epoch": 1.6952929875120075, "grad_norm": 0.7833610959261708, "learning_rate": 3.3153651042958196e-06, "loss": 0.096, "step": 2206 }, { "epoch": 1.6968299711815562, "grad_norm": 0.7848923969699126, "learning_rate": 3.309112405363941e-06, "loss": 0.0952, "step": 2208 }, { "epoch": 1.6983669548511047, "grad_norm": 0.8052464321733062, "learning_rate": 3.302860317413917e-06, "loss": 0.0915, "step": 2210 }, { "epoch": 1.6999039385206531, "grad_norm": 0.8512154630818453, "learning_rate": 3.2966088604570648e-06, "loss": 0.0876, "step": 2212 }, { "epoch": 1.7014409221902018, "grad_norm": 0.7790903398816384, "learning_rate": 3.2903580545026797e-06, "loss": 0.087, "step": 2214 }, { "epoch": 1.7029779058597503, "grad_norm": 0.8376240131132313, "learning_rate": 3.28410791955798e-06, "loss": 0.0967, "step": 2216 }, { "epoch": 1.7045148895292987, "grad_norm": 0.7967627035735608, "learning_rate": 3.2778584756280307e-06, "loss": 0.0915, "step": 2218 }, { "epoch": 1.7060518731988472, "grad_norm": 0.7702278491400715, "learning_rate": 3.271609742715687e-06, "loss": 0.0862, "step": 2220 }, { "epoch": 1.7075888568683957, "grad_norm": 0.7544840615892947, "learning_rate": 3.265361740821525e-06, "loss": 0.0833, "step": 2222 }, { "epoch": 1.7091258405379444, "grad_norm": 0.7621768245734015, "learning_rate": 3.2591144899437885e-06, "loss": 0.0871, "step": 2224 }, { "epoch": 1.7106628242074928, "grad_norm": 0.8831931871572729, "learning_rate": 3.25286801007831e-06, "loss": 0.0882, "step": 2226 }, { "epoch": 1.7121998078770413, "grad_norm": 0.8197620745142645, "learning_rate": 3.246622321218458e-06, "loss": 0.0928, "step": 2228 }, { "epoch": 1.71373679154659, "grad_norm": 0.77604756798234, "learning_rate": 3.2403774433550673e-06, "loss": 0.0832, "step": 2230 }, { "epoch": 1.7152737752161382, "grad_norm": 0.8842522339003976, "learning_rate": 3.2341333964763795e-06, "loss": 0.0988, "step": 2232 }, { "epoch": 1.716810758885687, "grad_norm": 0.7768312377818398, "learning_rate": 3.2278902005679734e-06, "loss": 0.0882, "step": 2234 }, { "epoch": 1.7183477425552354, "grad_norm": 0.8327531068216764, "learning_rate": 3.2216478756127067e-06, "loss": 0.085, "step": 2236 }, { "epoch": 1.7198847262247838, "grad_norm": 0.8186258517055159, "learning_rate": 3.215406441590646e-06, "loss": 0.0885, "step": 2238 }, { "epoch": 1.7214217098943325, "grad_norm": 0.6874479376398842, "learning_rate": 3.209165918479012e-06, "loss": 0.0781, "step": 2240 }, { "epoch": 1.7229586935638808, "grad_norm": 0.8115322816676634, "learning_rate": 3.202926326252103e-06, "loss": 0.0969, "step": 2242 }, { "epoch": 1.7244956772334294, "grad_norm": 0.8146826671168621, "learning_rate": 3.1966876848812434e-06, "loss": 0.089, "step": 2244 }, { "epoch": 1.726032660902978, "grad_norm": 0.8248049130832095, "learning_rate": 3.1904500143347092e-06, "loss": 0.0882, "step": 2246 }, { "epoch": 1.7275696445725264, "grad_norm": 0.8015905899655976, "learning_rate": 3.184213334577675e-06, "loss": 0.0914, "step": 2248 }, { "epoch": 1.729106628242075, "grad_norm": 0.8160931505603131, "learning_rate": 3.1779776655721374e-06, "loss": 0.0898, "step": 2250 }, { "epoch": 1.7306436119116233, "grad_norm": 0.8170663856878996, "learning_rate": 3.1717430272768637e-06, "loss": 0.0925, "step": 2252 }, { "epoch": 1.732180595581172, "grad_norm": 0.8051970712819015, "learning_rate": 3.1655094396473175e-06, "loss": 0.0929, "step": 2254 }, { "epoch": 1.7337175792507205, "grad_norm": 0.8967076051389512, "learning_rate": 3.1592769226356045e-06, "loss": 0.0931, "step": 2256 }, { "epoch": 1.735254562920269, "grad_norm": 0.8217397870462823, "learning_rate": 3.153045496190398e-06, "loss": 0.0885, "step": 2258 }, { "epoch": 1.7367915465898176, "grad_norm": 0.7584555916208618, "learning_rate": 3.1468151802568857e-06, "loss": 0.0868, "step": 2260 }, { "epoch": 1.7383285302593658, "grad_norm": 0.8377345355524379, "learning_rate": 3.1405859947766965e-06, "loss": 0.084, "step": 2262 }, { "epoch": 1.7398655139289145, "grad_norm": 0.7974250660258231, "learning_rate": 3.1343579596878455e-06, "loss": 0.0896, "step": 2264 }, { "epoch": 1.741402497598463, "grad_norm": 0.8423863324362034, "learning_rate": 3.128131094924661e-06, "loss": 0.0914, "step": 2266 }, { "epoch": 1.7429394812680115, "grad_norm": 0.7998587118371564, "learning_rate": 3.12190542041773e-06, "loss": 0.0877, "step": 2268 }, { "epoch": 1.7444764649375601, "grad_norm": 0.8420892166338229, "learning_rate": 3.1156809560938246e-06, "loss": 0.0892, "step": 2270 }, { "epoch": 1.7460134486071084, "grad_norm": 0.7811169390028642, "learning_rate": 3.1094577218758497e-06, "loss": 0.0826, "step": 2272 }, { "epoch": 1.747550432276657, "grad_norm": 0.856105762521616, "learning_rate": 3.103235737682768e-06, "loss": 0.0925, "step": 2274 }, { "epoch": 1.7490874159462055, "grad_norm": 0.8434878009386567, "learning_rate": 3.0970150234295416e-06, "loss": 0.0895, "step": 2276 }, { "epoch": 1.750624399615754, "grad_norm": 0.7912672991129802, "learning_rate": 3.09079559902707e-06, "loss": 0.0844, "step": 2278 }, { "epoch": 1.7521613832853027, "grad_norm": 0.8093805440566595, "learning_rate": 3.0845774843821242e-06, "loss": 0.0857, "step": 2280 }, { "epoch": 1.7536983669548512, "grad_norm": 0.7841327968506241, "learning_rate": 3.0783606993972816e-06, "loss": 0.0824, "step": 2282 }, { "epoch": 1.7552353506243996, "grad_norm": 0.8784822733296287, "learning_rate": 3.072145263970863e-06, "loss": 0.0896, "step": 2284 }, { "epoch": 1.756772334293948, "grad_norm": 0.785311512287812, "learning_rate": 3.0659311979968707e-06, "loss": 0.0787, "step": 2286 }, { "epoch": 1.7583093179634965, "grad_norm": 0.8489973269201797, "learning_rate": 3.059718521364926e-06, "loss": 0.0965, "step": 2288 }, { "epoch": 1.7598463016330452, "grad_norm": 0.8687972454705519, "learning_rate": 3.0535072539602004e-06, "loss": 0.0875, "step": 2290 }, { "epoch": 1.7613832853025937, "grad_norm": 0.8170562404470127, "learning_rate": 3.0472974156633535e-06, "loss": 0.0947, "step": 2292 }, { "epoch": 1.7629202689721422, "grad_norm": 0.7542361214197597, "learning_rate": 3.0410890263504736e-06, "loss": 0.0838, "step": 2294 }, { "epoch": 1.7644572526416908, "grad_norm": 0.8228131888155036, "learning_rate": 3.0348821058930117e-06, "loss": 0.0956, "step": 2296 }, { "epoch": 1.765994236311239, "grad_norm": 0.8281839285188529, "learning_rate": 3.0286766741577156e-06, "loss": 0.0897, "step": 2298 }, { "epoch": 1.7675312199807878, "grad_norm": 0.7587083990715107, "learning_rate": 3.022472751006566e-06, "loss": 0.0805, "step": 2300 }, { "epoch": 1.7690682036503362, "grad_norm": 0.8251420202843136, "learning_rate": 3.0162703562967197e-06, "loss": 0.0845, "step": 2302 }, { "epoch": 1.7706051873198847, "grad_norm": 0.8065903172521137, "learning_rate": 3.01006950988044e-06, "loss": 0.0891, "step": 2304 }, { "epoch": 1.7721421709894334, "grad_norm": 0.8124974824000051, "learning_rate": 3.0038702316050317e-06, "loss": 0.0896, "step": 2306 }, { "epoch": 1.7736791546589816, "grad_norm": 0.7529764285672882, "learning_rate": 2.997672541312782e-06, "loss": 0.0776, "step": 2308 }, { "epoch": 1.7752161383285303, "grad_norm": 0.8091489227633613, "learning_rate": 2.9914764588408966e-06, "loss": 0.0922, "step": 2310 }, { "epoch": 1.7767531219980788, "grad_norm": 0.8938294981287168, "learning_rate": 2.985282004021435e-06, "loss": 0.0924, "step": 2312 }, { "epoch": 1.7782901056676272, "grad_norm": 0.9013544600990553, "learning_rate": 2.979089196681245e-06, "loss": 0.091, "step": 2314 }, { "epoch": 1.779827089337176, "grad_norm": 0.8597516613442064, "learning_rate": 2.9728980566419016e-06, "loss": 0.0847, "step": 2316 }, { "epoch": 1.7813640730067242, "grad_norm": 0.8090979220931821, "learning_rate": 2.966708603719645e-06, "loss": 0.091, "step": 2318 }, { "epoch": 1.7829010566762729, "grad_norm": 0.7916631388831536, "learning_rate": 2.960520857725314e-06, "loss": 0.0867, "step": 2320 }, { "epoch": 1.7844380403458213, "grad_norm": 0.7616856425912224, "learning_rate": 2.954334838464285e-06, "loss": 0.0892, "step": 2322 }, { "epoch": 1.7859750240153698, "grad_norm": 0.7512880043816689, "learning_rate": 2.948150565736404e-06, "loss": 0.0825, "step": 2324 }, { "epoch": 1.7875120076849185, "grad_norm": 0.7903862314768638, "learning_rate": 2.9419680593359335e-06, "loss": 0.0815, "step": 2326 }, { "epoch": 1.7890489913544667, "grad_norm": 0.826491887413533, "learning_rate": 2.9357873390514757e-06, "loss": 0.0868, "step": 2328 }, { "epoch": 1.7905859750240154, "grad_norm": 0.8078642556595113, "learning_rate": 2.929608424665921e-06, "loss": 0.0945, "step": 2330 }, { "epoch": 1.7921229586935639, "grad_norm": 0.8097155781369342, "learning_rate": 2.9234313359563744e-06, "loss": 0.0938, "step": 2332 }, { "epoch": 1.7936599423631123, "grad_norm": 0.8150186954176805, "learning_rate": 2.9172560926941037e-06, "loss": 0.0828, "step": 2334 }, { "epoch": 1.795196926032661, "grad_norm": 0.7813243625319553, "learning_rate": 2.9110827146444643e-06, "loss": 0.0858, "step": 2336 }, { "epoch": 1.7967339097022093, "grad_norm": 0.7791024554992239, "learning_rate": 2.904911221566845e-06, "loss": 0.0844, "step": 2338 }, { "epoch": 1.798270893371758, "grad_norm": 0.8124678993462557, "learning_rate": 2.898741633214598e-06, "loss": 0.0893, "step": 2340 }, { "epoch": 1.7998078770413064, "grad_norm": 0.829294783575689, "learning_rate": 2.8925739693349833e-06, "loss": 0.0861, "step": 2342 }, { "epoch": 1.8013448607108549, "grad_norm": 0.7957217580914165, "learning_rate": 2.886408249669098e-06, "loss": 0.0836, "step": 2344 }, { "epoch": 1.8028818443804036, "grad_norm": 0.8631788141169373, "learning_rate": 2.880244493951818e-06, "loss": 0.1018, "step": 2346 }, { "epoch": 1.804418828049952, "grad_norm": 0.6989805954411332, "learning_rate": 2.8740827219117302e-06, "loss": 0.0827, "step": 2348 }, { "epoch": 1.8059558117195005, "grad_norm": 0.8472076650897804, "learning_rate": 2.867922953271077e-06, "loss": 0.0912, "step": 2350 }, { "epoch": 1.807492795389049, "grad_norm": 0.8663805858998409, "learning_rate": 2.861765207745683e-06, "loss": 0.094, "step": 2352 }, { "epoch": 1.8090297790585974, "grad_norm": 0.7756793455854972, "learning_rate": 2.8556095050449032e-06, "loss": 0.086, "step": 2354 }, { "epoch": 1.810566762728146, "grad_norm": 0.8038188721900218, "learning_rate": 2.8494558648715473e-06, "loss": 0.0895, "step": 2356 }, { "epoch": 1.8121037463976946, "grad_norm": 0.7788550764950732, "learning_rate": 2.8433043069218307e-06, "loss": 0.0912, "step": 2358 }, { "epoch": 1.813640730067243, "grad_norm": 0.8331357355337144, "learning_rate": 2.8371548508852977e-06, "loss": 0.0982, "step": 2360 }, { "epoch": 1.8151777137367917, "grad_norm": 0.8118854152234461, "learning_rate": 2.8310075164447696e-06, "loss": 0.0925, "step": 2362 }, { "epoch": 1.81671469740634, "grad_norm": 0.8017256660340538, "learning_rate": 2.824862323276273e-06, "loss": 0.0855, "step": 2364 }, { "epoch": 1.8182516810758886, "grad_norm": 0.7800752551508296, "learning_rate": 2.8187192910489856e-06, "loss": 0.0834, "step": 2366 }, { "epoch": 1.819788664745437, "grad_norm": 0.8380624718020749, "learning_rate": 2.812578439425164e-06, "loss": 0.0945, "step": 2368 }, { "epoch": 1.8213256484149856, "grad_norm": 0.8276945914691816, "learning_rate": 2.806439788060088e-06, "loss": 0.0889, "step": 2370 }, { "epoch": 1.8228626320845343, "grad_norm": 0.7985257731425042, "learning_rate": 2.8003033566019922e-06, "loss": 0.0841, "step": 2372 }, { "epoch": 1.8243996157540825, "grad_norm": 0.7637963888810898, "learning_rate": 2.7941691646920105e-06, "loss": 0.0751, "step": 2374 }, { "epoch": 1.8259365994236312, "grad_norm": 0.8413312901210165, "learning_rate": 2.7880372319641042e-06, "loss": 0.089, "step": 2376 }, { "epoch": 1.8274735830931796, "grad_norm": 0.8586650363460536, "learning_rate": 2.781907578045005e-06, "loss": 0.0887, "step": 2378 }, { "epoch": 1.829010566762728, "grad_norm": 0.8360563017968606, "learning_rate": 2.77578022255415e-06, "loss": 0.0939, "step": 2380 }, { "epoch": 1.8305475504322768, "grad_norm": 0.877204652341484, "learning_rate": 2.769655185103623e-06, "loss": 0.0893, "step": 2382 }, { "epoch": 1.832084534101825, "grad_norm": 0.8297837842829701, "learning_rate": 2.7635324852980843e-06, "loss": 0.0907, "step": 2384 }, { "epoch": 1.8336215177713737, "grad_norm": 0.8431709281215559, "learning_rate": 2.7574121427347133e-06, "loss": 0.0982, "step": 2386 }, { "epoch": 1.8351585014409222, "grad_norm": 0.7604258311920689, "learning_rate": 2.751294177003143e-06, "loss": 0.0789, "step": 2388 }, { "epoch": 1.8366954851104706, "grad_norm": 0.7739652641897983, "learning_rate": 2.745178607685405e-06, "loss": 0.0928, "step": 2390 }, { "epoch": 1.8382324687800193, "grad_norm": 0.8167833653726546, "learning_rate": 2.7390654543558534e-06, "loss": 0.0842, "step": 2392 }, { "epoch": 1.8397694524495676, "grad_norm": 0.7908210162723227, "learning_rate": 2.7329547365811104e-06, "loss": 0.0843, "step": 2394 }, { "epoch": 1.8413064361191163, "grad_norm": 0.8201467727248781, "learning_rate": 2.726846473920006e-06, "loss": 0.0919, "step": 2396 }, { "epoch": 1.8428434197886647, "grad_norm": 0.8484929622858332, "learning_rate": 2.7207406859235117e-06, "loss": 0.0817, "step": 2398 }, { "epoch": 1.8443804034582132, "grad_norm": 0.7641741013611666, "learning_rate": 2.714637392134675e-06, "loss": 0.0875, "step": 2400 }, { "epoch": 1.8459173871277619, "grad_norm": 0.8726969374519596, "learning_rate": 2.708536612088561e-06, "loss": 0.0883, "step": 2402 }, { "epoch": 1.8474543707973101, "grad_norm": 0.7568052808653818, "learning_rate": 2.702438365312191e-06, "loss": 0.0847, "step": 2404 }, { "epoch": 1.8489913544668588, "grad_norm": 0.774661999809938, "learning_rate": 2.696342671324478e-06, "loss": 0.0912, "step": 2406 }, { "epoch": 1.8505283381364073, "grad_norm": 0.8750231519779115, "learning_rate": 2.6902495496361613e-06, "loss": 0.094, "step": 2408 }, { "epoch": 1.8520653218059557, "grad_norm": 0.7904137960913173, "learning_rate": 2.6841590197497476e-06, "loss": 0.0858, "step": 2410 }, { "epoch": 1.8536023054755044, "grad_norm": 0.7711313526212195, "learning_rate": 2.678071101159451e-06, "loss": 0.0825, "step": 2412 }, { "epoch": 1.8551392891450529, "grad_norm": 0.8221962336381063, "learning_rate": 2.6719858133511257e-06, "loss": 0.0906, "step": 2414 }, { "epoch": 1.8566762728146013, "grad_norm": 0.8337437421475409, "learning_rate": 2.665903175802204e-06, "loss": 0.0868, "step": 2416 }, { "epoch": 1.8582132564841498, "grad_norm": 0.8378451337853939, "learning_rate": 2.6598232079816353e-06, "loss": 0.0933, "step": 2418 }, { "epoch": 1.8597502401536983, "grad_norm": 0.7639730128714044, "learning_rate": 2.6537459293498277e-06, "loss": 0.0846, "step": 2420 }, { "epoch": 1.861287223823247, "grad_norm": 0.7830022284803336, "learning_rate": 2.6476713593585783e-06, "loss": 0.0863, "step": 2422 }, { "epoch": 1.8628242074927954, "grad_norm": 0.816812769548626, "learning_rate": 2.641599517451016e-06, "loss": 0.0937, "step": 2424 }, { "epoch": 1.864361191162344, "grad_norm": 0.8742317390360426, "learning_rate": 2.6355304230615356e-06, "loss": 0.0975, "step": 2426 }, { "epoch": 1.8658981748318926, "grad_norm": 0.8026788777916881, "learning_rate": 2.629464095615743e-06, "loss": 0.0848, "step": 2428 }, { "epoch": 1.8674351585014408, "grad_norm": 0.8437565172328019, "learning_rate": 2.623400554530382e-06, "loss": 0.0947, "step": 2430 }, { "epoch": 1.8689721421709895, "grad_norm": 0.801208290733419, "learning_rate": 2.6173398192132812e-06, "loss": 0.0899, "step": 2432 }, { "epoch": 1.870509125840538, "grad_norm": 0.7932315598508702, "learning_rate": 2.6112819090632854e-06, "loss": 0.0895, "step": 2434 }, { "epoch": 1.8720461095100864, "grad_norm": 0.9042900524043032, "learning_rate": 2.6052268434702042e-06, "loss": 0.0946, "step": 2436 }, { "epoch": 1.8735830931796351, "grad_norm": 0.7894581411758911, "learning_rate": 2.599174641814734e-06, "loss": 0.0867, "step": 2438 }, { "epoch": 1.8751200768491834, "grad_norm": 0.7854020543576818, "learning_rate": 2.59312532346841e-06, "loss": 0.0911, "step": 2440 }, { "epoch": 1.876657060518732, "grad_norm": 0.815693549418664, "learning_rate": 2.5870789077935335e-06, "loss": 0.0895, "step": 2442 }, { "epoch": 1.8781940441882805, "grad_norm": 0.7887109470073577, "learning_rate": 2.5810354141431226e-06, "loss": 0.0903, "step": 2444 }, { "epoch": 1.879731027857829, "grad_norm": 0.7896777569740607, "learning_rate": 2.5749948618608356e-06, "loss": 0.0809, "step": 2446 }, { "epoch": 1.8812680115273777, "grad_norm": 0.7615729562820233, "learning_rate": 2.5689572702809203e-06, "loss": 0.0838, "step": 2448 }, { "epoch": 1.882804995196926, "grad_norm": 0.8582942386637096, "learning_rate": 2.562922658728145e-06, "loss": 0.09, "step": 2450 }, { "epoch": 1.8843419788664746, "grad_norm": 0.7611707316427125, "learning_rate": 2.556891046517745e-06, "loss": 0.0775, "step": 2452 }, { "epoch": 1.885878962536023, "grad_norm": 0.7455166273658402, "learning_rate": 2.5508624529553496e-06, "loss": 0.0866, "step": 2454 }, { "epoch": 1.8874159462055715, "grad_norm": 0.8572197727943554, "learning_rate": 2.5448368973369295e-06, "loss": 0.0867, "step": 2456 }, { "epoch": 1.8889529298751202, "grad_norm": 0.8079812248707947, "learning_rate": 2.5388143989487295e-06, "loss": 0.09, "step": 2458 }, { "epoch": 1.8904899135446684, "grad_norm": 0.8079920947645508, "learning_rate": 2.5327949770672125e-06, "loss": 0.0872, "step": 2460 }, { "epoch": 1.8920268972142171, "grad_norm": 0.8329864584858913, "learning_rate": 2.5267786509589907e-06, "loss": 0.0892, "step": 2462 }, { "epoch": 1.8935638808837656, "grad_norm": 0.8044295715601575, "learning_rate": 2.52076543988077e-06, "loss": 0.0874, "step": 2464 }, { "epoch": 1.895100864553314, "grad_norm": 0.7566977212719446, "learning_rate": 2.5147553630792827e-06, "loss": 0.079, "step": 2466 }, { "epoch": 1.8966378482228627, "grad_norm": 0.7703458660050836, "learning_rate": 2.5087484397912354e-06, "loss": 0.0803, "step": 2468 }, { "epoch": 1.898174831892411, "grad_norm": 0.7472342418402756, "learning_rate": 2.5027446892432335e-06, "loss": 0.0831, "step": 2470 }, { "epoch": 1.8997118155619597, "grad_norm": 0.7860659793588969, "learning_rate": 2.496744130651731e-06, "loss": 0.0861, "step": 2472 }, { "epoch": 1.9012487992315081, "grad_norm": 0.7782192620932303, "learning_rate": 2.4907467832229655e-06, "loss": 0.0904, "step": 2474 }, { "epoch": 1.9027857829010566, "grad_norm": 0.7976874165660952, "learning_rate": 2.4847526661528974e-06, "loss": 0.0868, "step": 2476 }, { "epoch": 1.9043227665706053, "grad_norm": 0.8130144869116065, "learning_rate": 2.4787617986271457e-06, "loss": 0.0887, "step": 2478 }, { "epoch": 1.9058597502401537, "grad_norm": 0.8306432379209792, "learning_rate": 2.4727741998209278e-06, "loss": 0.0817, "step": 2480 }, { "epoch": 1.9073967339097022, "grad_norm": 0.7830733463300852, "learning_rate": 2.4667898888989997e-06, "loss": 0.0894, "step": 2482 }, { "epoch": 1.9089337175792507, "grad_norm": 0.7962828362447372, "learning_rate": 2.460808885015596e-06, "loss": 0.0833, "step": 2484 }, { "epoch": 1.9104707012487991, "grad_norm": 0.84538791014193, "learning_rate": 2.454831207314364e-06, "loss": 0.0898, "step": 2486 }, { "epoch": 1.9120076849183478, "grad_norm": 0.7980052835225272, "learning_rate": 2.4488568749283024e-06, "loss": 0.0867, "step": 2488 }, { "epoch": 1.9135446685878963, "grad_norm": 0.7823748772923179, "learning_rate": 2.4428859069797065e-06, "loss": 0.086, "step": 2490 }, { "epoch": 1.9150816522574448, "grad_norm": 0.8497213682534336, "learning_rate": 2.4369183225801037e-06, "loss": 0.0876, "step": 2492 }, { "epoch": 1.9166186359269934, "grad_norm": 0.729672453173719, "learning_rate": 2.430954140830187e-06, "loss": 0.08, "step": 2494 }, { "epoch": 1.9181556195965417, "grad_norm": 0.8535703260306475, "learning_rate": 2.424993380819759e-06, "loss": 0.0809, "step": 2496 }, { "epoch": 1.9196926032660904, "grad_norm": 0.8459889716819726, "learning_rate": 2.419036061627676e-06, "loss": 0.0898, "step": 2498 }, { "epoch": 1.9212295869356388, "grad_norm": 0.8483259441228239, "learning_rate": 2.4130822023217745e-06, "loss": 0.0841, "step": 2500 }, { "epoch": 1.9212295869356388, "eval_loss": 0.13174882531166077, "eval_runtime": 362.8867, "eval_samples_per_second": 50.994, "eval_steps_per_second": 6.377, "step": 2500 }, { "epoch": 1.9227665706051873, "grad_norm": 0.7537082873290889, "learning_rate": 2.40713182195882e-06, "loss": 0.0835, "step": 2502 }, { "epoch": 1.924303554274736, "grad_norm": 0.8025926008593399, "learning_rate": 2.401184939584441e-06, "loss": 0.091, "step": 2504 }, { "epoch": 1.9258405379442842, "grad_norm": 0.8587481263834186, "learning_rate": 2.3952415742330715e-06, "loss": 0.0817, "step": 2506 }, { "epoch": 1.927377521613833, "grad_norm": 0.8496889861192923, "learning_rate": 2.38930174492789e-06, "loss": 0.0871, "step": 2508 }, { "epoch": 1.9289145052833814, "grad_norm": 0.7449830563434601, "learning_rate": 2.383365470680753e-06, "loss": 0.0889, "step": 2510 }, { "epoch": 1.9304514889529298, "grad_norm": 0.7842488010652338, "learning_rate": 2.377432770492138e-06, "loss": 0.0896, "step": 2512 }, { "epoch": 1.9319884726224785, "grad_norm": 0.7604948684079603, "learning_rate": 2.3715036633510887e-06, "loss": 0.0768, "step": 2514 }, { "epoch": 1.9335254562920268, "grad_norm": 0.8622740265461863, "learning_rate": 2.365578168235143e-06, "loss": 0.0884, "step": 2516 }, { "epoch": 1.9350624399615755, "grad_norm": 0.8075457215159016, "learning_rate": 2.3596563041102794e-06, "loss": 0.0859, "step": 2518 }, { "epoch": 1.936599423631124, "grad_norm": 0.7735312713920628, "learning_rate": 2.3537380899308532e-06, "loss": 0.0888, "step": 2520 }, { "epoch": 1.9381364073006724, "grad_norm": 0.7250205794783319, "learning_rate": 2.347823544639541e-06, "loss": 0.0812, "step": 2522 }, { "epoch": 1.939673390970221, "grad_norm": 0.8168723067376802, "learning_rate": 2.3419126871672716e-06, "loss": 0.085, "step": 2524 }, { "epoch": 1.9412103746397693, "grad_norm": 0.7558226843999474, "learning_rate": 2.3360055364331726e-06, "loss": 0.0842, "step": 2526 }, { "epoch": 1.942747358309318, "grad_norm": 0.8367673308249525, "learning_rate": 2.3301021113445057e-06, "loss": 0.0859, "step": 2528 }, { "epoch": 1.9442843419788665, "grad_norm": 0.771933781721656, "learning_rate": 2.3242024307966115e-06, "loss": 0.0853, "step": 2530 }, { "epoch": 1.945821325648415, "grad_norm": 0.8022281457204921, "learning_rate": 2.3183065136728395e-06, "loss": 0.0829, "step": 2532 }, { "epoch": 1.9473583093179636, "grad_norm": 0.8770111872133347, "learning_rate": 2.3124143788444994e-06, "loss": 0.0871, "step": 2534 }, { "epoch": 1.9488952929875119, "grad_norm": 0.7289175750768725, "learning_rate": 2.3065260451707887e-06, "loss": 0.0765, "step": 2536 }, { "epoch": 1.9504322766570605, "grad_norm": 0.8573432100144109, "learning_rate": 2.3006415314987453e-06, "loss": 0.0926, "step": 2538 }, { "epoch": 1.951969260326609, "grad_norm": 0.8049758874747357, "learning_rate": 2.2947608566631738e-06, "loss": 0.09, "step": 2540 }, { "epoch": 1.9535062439961575, "grad_norm": 0.8070974588287757, "learning_rate": 2.288884039486595e-06, "loss": 0.0805, "step": 2542 }, { "epoch": 1.9550432276657062, "grad_norm": 0.7988980263528441, "learning_rate": 2.2830110987791816e-06, "loss": 0.0868, "step": 2544 }, { "epoch": 1.9565802113352546, "grad_norm": 0.8018076390389216, "learning_rate": 2.277142053338701e-06, "loss": 0.0833, "step": 2546 }, { "epoch": 1.958117195004803, "grad_norm": 0.8761877037291468, "learning_rate": 2.271276921950448e-06, "loss": 0.0889, "step": 2548 }, { "epoch": 1.9596541786743515, "grad_norm": 0.8222253983712655, "learning_rate": 2.265415723387194e-06, "loss": 0.0884, "step": 2550 }, { "epoch": 1.9611911623439, "grad_norm": 0.8067488111444437, "learning_rate": 2.259558476409119e-06, "loss": 0.0878, "step": 2552 }, { "epoch": 1.9627281460134487, "grad_norm": 0.7772228994300671, "learning_rate": 2.253705199763759e-06, "loss": 0.0827, "step": 2554 }, { "epoch": 1.9642651296829972, "grad_norm": 0.8171211068617626, "learning_rate": 2.247855912185938e-06, "loss": 0.0853, "step": 2556 }, { "epoch": 1.9658021133525456, "grad_norm": 0.7146445332349611, "learning_rate": 2.242010632397715e-06, "loss": 0.0782, "step": 2558 }, { "epoch": 1.967339097022094, "grad_norm": 0.7891447556881424, "learning_rate": 2.2361693791083176e-06, "loss": 0.0887, "step": 2560 }, { "epoch": 1.9688760806916425, "grad_norm": 0.7846780714035666, "learning_rate": 2.230332171014091e-06, "loss": 0.0851, "step": 2562 }, { "epoch": 1.9704130643611912, "grad_norm": 0.8098130826581111, "learning_rate": 2.2244990267984265e-06, "loss": 0.0915, "step": 2564 }, { "epoch": 1.9719500480307397, "grad_norm": 0.777976273941637, "learning_rate": 2.2186699651317143e-06, "loss": 0.0802, "step": 2566 }, { "epoch": 1.9734870317002882, "grad_norm": 0.8355719530974074, "learning_rate": 2.2128450046712702e-06, "loss": 0.0904, "step": 2568 }, { "epoch": 1.9750240153698368, "grad_norm": 0.7798614757356649, "learning_rate": 2.2070241640612915e-06, "loss": 0.0838, "step": 2570 }, { "epoch": 1.976560999039385, "grad_norm": 0.8328604297726137, "learning_rate": 2.2012074619327824e-06, "loss": 0.0865, "step": 2572 }, { "epoch": 1.9780979827089338, "grad_norm": 0.793696845619225, "learning_rate": 2.195394916903502e-06, "loss": 0.0782, "step": 2574 }, { "epoch": 1.9796349663784822, "grad_norm": 0.7815626290849819, "learning_rate": 2.1895865475779054e-06, "loss": 0.0788, "step": 2576 }, { "epoch": 1.9811719500480307, "grad_norm": 0.8439158338940291, "learning_rate": 2.1837823725470835e-06, "loss": 0.0889, "step": 2578 }, { "epoch": 1.9827089337175794, "grad_norm": 0.8288860856763497, "learning_rate": 2.177982410388699e-06, "loss": 0.0884, "step": 2580 }, { "epoch": 1.9842459173871276, "grad_norm": 0.8187063512218921, "learning_rate": 2.1721866796669302e-06, "loss": 0.088, "step": 2582 }, { "epoch": 1.9857829010566763, "grad_norm": 0.7887200768254294, "learning_rate": 2.166395198932414e-06, "loss": 0.0836, "step": 2584 }, { "epoch": 1.9873198847262248, "grad_norm": 0.8058422750958129, "learning_rate": 2.160607986722186e-06, "loss": 0.0872, "step": 2586 }, { "epoch": 1.9888568683957732, "grad_norm": 0.8186786966538023, "learning_rate": 2.154825061559614e-06, "loss": 0.0921, "step": 2588 }, { "epoch": 1.990393852065322, "grad_norm": 0.8648916595771557, "learning_rate": 2.149046441954347e-06, "loss": 0.0859, "step": 2590 }, { "epoch": 1.9919308357348702, "grad_norm": 0.8612159243706975, "learning_rate": 2.1432721464022532e-06, "loss": 0.0997, "step": 2592 }, { "epoch": 1.9934678194044189, "grad_norm": 0.8214954020731858, "learning_rate": 2.137502193385361e-06, "loss": 0.0875, "step": 2594 }, { "epoch": 1.9950048030739673, "grad_norm": 0.7979371383049858, "learning_rate": 2.1317366013717983e-06, "loss": 0.0843, "step": 2596 }, { "epoch": 1.9965417867435158, "grad_norm": 0.7620709623218802, "learning_rate": 2.125975388815733e-06, "loss": 0.0793, "step": 2598 }, { "epoch": 1.9980787704130645, "grad_norm": 0.7554666873391905, "learning_rate": 2.1202185741573206e-06, "loss": 0.0871, "step": 2600 }, { "epoch": 1.9996157540826127, "grad_norm": 0.7689027561104417, "learning_rate": 2.1144661758226355e-06, "loss": 0.0787, "step": 2602 }, { "epoch": 2.0011527377521614, "grad_norm": 0.6010513132525606, "learning_rate": 2.108718212223618e-06, "loss": 0.0835, "step": 2604 }, { "epoch": 2.00268972142171, "grad_norm": 0.5735836702570751, "learning_rate": 2.1029747017580132e-06, "loss": 0.0485, "step": 2606 }, { "epoch": 2.0042267050912583, "grad_norm": 0.5702538041925809, "learning_rate": 2.0972356628093154e-06, "loss": 0.0452, "step": 2608 }, { "epoch": 2.005763688760807, "grad_norm": 0.5895447128781794, "learning_rate": 2.091501113746703e-06, "loss": 0.0446, "step": 2610 }, { "epoch": 2.0073006724303553, "grad_norm": 0.6069626452449407, "learning_rate": 2.085771072924988e-06, "loss": 0.0439, "step": 2612 }, { "epoch": 2.008837656099904, "grad_norm": 0.6625893476183761, "learning_rate": 2.080045558684546e-06, "loss": 0.0426, "step": 2614 }, { "epoch": 2.0103746397694526, "grad_norm": 0.7034714400173309, "learning_rate": 2.0743245893512725e-06, "loss": 0.0433, "step": 2616 }, { "epoch": 2.011911623439001, "grad_norm": 0.7707738025199298, "learning_rate": 2.0686081832365095e-06, "loss": 0.045, "step": 2618 }, { "epoch": 2.0134486071085496, "grad_norm": 0.793812018682344, "learning_rate": 2.0628963586369966e-06, "loss": 0.043, "step": 2620 }, { "epoch": 2.014985590778098, "grad_norm": 0.7298066829996939, "learning_rate": 2.057189133834806e-06, "loss": 0.0413, "step": 2622 }, { "epoch": 2.0165225744476465, "grad_norm": 0.7317715663050453, "learning_rate": 2.051486527097292e-06, "loss": 0.0413, "step": 2624 }, { "epoch": 2.018059558117195, "grad_norm": 0.7006778779832138, "learning_rate": 2.045788556677023e-06, "loss": 0.0425, "step": 2626 }, { "epoch": 2.0195965417867434, "grad_norm": 0.747566077495069, "learning_rate": 2.04009524081173e-06, "loss": 0.0416, "step": 2628 }, { "epoch": 2.021133525456292, "grad_norm": 0.6905039170767364, "learning_rate": 2.034406597724246e-06, "loss": 0.0366, "step": 2630 }, { "epoch": 2.0226705091258403, "grad_norm": 0.6513726296876595, "learning_rate": 2.0287226456224464e-06, "loss": 0.0417, "step": 2632 }, { "epoch": 2.024207492795389, "grad_norm": 0.6710660115404629, "learning_rate": 2.0230434026991936e-06, "loss": 0.0439, "step": 2634 }, { "epoch": 2.0257444764649377, "grad_norm": 0.6687092945349834, "learning_rate": 2.0173688871322763e-06, "loss": 0.0396, "step": 2636 }, { "epoch": 2.027281460134486, "grad_norm": 0.7735624904374961, "learning_rate": 2.011699117084352e-06, "loss": 0.0431, "step": 2638 }, { "epoch": 2.0288184438040346, "grad_norm": 0.7166923451943271, "learning_rate": 2.0060341107028893e-06, "loss": 0.0439, "step": 2640 }, { "epoch": 2.030355427473583, "grad_norm": 0.7077961035403634, "learning_rate": 2.0003738861201104e-06, "loss": 0.0499, "step": 2642 }, { "epoch": 2.0318924111431316, "grad_norm": 0.6883244536280981, "learning_rate": 1.994718461452929e-06, "loss": 0.0443, "step": 2644 }, { "epoch": 2.0334293948126803, "grad_norm": 0.6214200682417671, "learning_rate": 1.9890678548028994e-06, "loss": 0.0425, "step": 2646 }, { "epoch": 2.0349663784822285, "grad_norm": 0.6365455258734565, "learning_rate": 1.9834220842561525e-06, "loss": 0.0438, "step": 2648 }, { "epoch": 2.036503362151777, "grad_norm": 0.6181452869288234, "learning_rate": 1.9777811678833405e-06, "loss": 0.0363, "step": 2650 }, { "epoch": 2.038040345821326, "grad_norm": 0.6491919777612177, "learning_rate": 1.972145123739581e-06, "loss": 0.0397, "step": 2652 }, { "epoch": 2.039577329490874, "grad_norm": 0.6585608787162763, "learning_rate": 1.9665139698643894e-06, "loss": 0.0368, "step": 2654 }, { "epoch": 2.041114313160423, "grad_norm": 0.7727352776818418, "learning_rate": 1.960887724281639e-06, "loss": 0.0454, "step": 2656 }, { "epoch": 2.042651296829971, "grad_norm": 0.6357364480821968, "learning_rate": 1.955266404999487e-06, "loss": 0.0415, "step": 2658 }, { "epoch": 2.0441882804995197, "grad_norm": 0.6281847150403672, "learning_rate": 1.9496500300103206e-06, "loss": 0.0371, "step": 2660 }, { "epoch": 2.0457252641690684, "grad_norm": 0.6912027377771374, "learning_rate": 1.944038617290707e-06, "loss": 0.04, "step": 2662 }, { "epoch": 2.0472622478386167, "grad_norm": 0.7127284309719673, "learning_rate": 1.938432184801327e-06, "loss": 0.0444, "step": 2664 }, { "epoch": 2.0487992315081653, "grad_norm": 0.6753527235959302, "learning_rate": 1.9328307504869223e-06, "loss": 0.0406, "step": 2666 }, { "epoch": 2.0503362151777136, "grad_norm": 0.6882383050291725, "learning_rate": 1.9272343322762377e-06, "loss": 0.0393, "step": 2668 }, { "epoch": 2.0518731988472623, "grad_norm": 0.7047552202339743, "learning_rate": 1.9216429480819575e-06, "loss": 0.0438, "step": 2670 }, { "epoch": 2.053410182516811, "grad_norm": 0.7596140785601401, "learning_rate": 1.9160566158006613e-06, "loss": 0.0414, "step": 2672 }, { "epoch": 2.054947166186359, "grad_norm": 0.6985050704664985, "learning_rate": 1.9104753533127555e-06, "loss": 0.0397, "step": 2674 }, { "epoch": 2.056484149855908, "grad_norm": 0.6562480639793371, "learning_rate": 1.9048991784824146e-06, "loss": 0.0401, "step": 2676 }, { "epoch": 2.058021133525456, "grad_norm": 0.69494658065462, "learning_rate": 1.8993281091575362e-06, "loss": 0.0404, "step": 2678 }, { "epoch": 2.059558117195005, "grad_norm": 0.7358984218115102, "learning_rate": 1.8937621631696722e-06, "loss": 0.0422, "step": 2680 }, { "epoch": 2.0610951008645535, "grad_norm": 0.6513459605049308, "learning_rate": 1.8882013583339773e-06, "loss": 0.0401, "step": 2682 }, { "epoch": 2.0626320845341017, "grad_norm": 0.6939024794365927, "learning_rate": 1.8826457124491504e-06, "loss": 0.0445, "step": 2684 }, { "epoch": 2.0641690682036504, "grad_norm": 0.6668447837064994, "learning_rate": 1.8770952432973784e-06, "loss": 0.0392, "step": 2686 }, { "epoch": 2.0657060518731987, "grad_norm": 0.6666480416435122, "learning_rate": 1.87154996864428e-06, "loss": 0.0409, "step": 2688 }, { "epoch": 2.0672430355427474, "grad_norm": 0.6709721661234623, "learning_rate": 1.8660099062388431e-06, "loss": 0.0405, "step": 2690 }, { "epoch": 2.068780019212296, "grad_norm": 0.685700314026161, "learning_rate": 1.8604750738133756e-06, "loss": 0.0397, "step": 2692 }, { "epoch": 2.0703170028818443, "grad_norm": 0.6515559887232024, "learning_rate": 1.8549454890834497e-06, "loss": 0.0384, "step": 2694 }, { "epoch": 2.071853986551393, "grad_norm": 0.709788654208775, "learning_rate": 1.8494211697478323e-06, "loss": 0.0415, "step": 2696 }, { "epoch": 2.073390970220941, "grad_norm": 0.6775450512874873, "learning_rate": 1.8439021334884444e-06, "loss": 0.0386, "step": 2698 }, { "epoch": 2.07492795389049, "grad_norm": 0.7750933505013274, "learning_rate": 1.8383883979702934e-06, "loss": 0.0426, "step": 2700 }, { "epoch": 2.0764649375600386, "grad_norm": 0.6584818113536601, "learning_rate": 1.8328799808414227e-06, "loss": 0.038, "step": 2702 }, { "epoch": 2.078001921229587, "grad_norm": 0.7829082201798115, "learning_rate": 1.8273768997328525e-06, "loss": 0.04, "step": 2704 }, { "epoch": 2.0795389048991355, "grad_norm": 0.734651522047483, "learning_rate": 1.8218791722585205e-06, "loss": 0.0415, "step": 2706 }, { "epoch": 2.0810758885686838, "grad_norm": 0.6479085477298625, "learning_rate": 1.8163868160152308e-06, "loss": 0.0372, "step": 2708 }, { "epoch": 2.0826128722382324, "grad_norm": 0.7161779560377441, "learning_rate": 1.8108998485826017e-06, "loss": 0.0439, "step": 2710 }, { "epoch": 2.084149855907781, "grad_norm": 0.6464214115435033, "learning_rate": 1.8054182875229925e-06, "loss": 0.0401, "step": 2712 }, { "epoch": 2.0856868395773294, "grad_norm": 0.694251707574077, "learning_rate": 1.799942150381465e-06, "loss": 0.0387, "step": 2714 }, { "epoch": 2.087223823246878, "grad_norm": 0.7296255965243119, "learning_rate": 1.7944714546857195e-06, "loss": 0.0428, "step": 2716 }, { "epoch": 2.0887608069164267, "grad_norm": 0.693023285089645, "learning_rate": 1.7890062179460383e-06, "loss": 0.0425, "step": 2718 }, { "epoch": 2.090297790585975, "grad_norm": 0.8091932276327041, "learning_rate": 1.7835464576552334e-06, "loss": 0.0442, "step": 2720 }, { "epoch": 2.0918347742555237, "grad_norm": 0.7647763579617219, "learning_rate": 1.7780921912885828e-06, "loss": 0.0423, "step": 2722 }, { "epoch": 2.093371757925072, "grad_norm": 0.7252365506572918, "learning_rate": 1.7726434363037843e-06, "loss": 0.0401, "step": 2724 }, { "epoch": 2.0949087415946206, "grad_norm": 0.7457710289722953, "learning_rate": 1.7672002101408983e-06, "loss": 0.0441, "step": 2726 }, { "epoch": 2.0964457252641693, "grad_norm": 0.8268423722666511, "learning_rate": 1.761762530222281e-06, "loss": 0.0428, "step": 2728 }, { "epoch": 2.0979827089337175, "grad_norm": 0.7059199502684411, "learning_rate": 1.756330413952541e-06, "loss": 0.0415, "step": 2730 }, { "epoch": 2.099519692603266, "grad_norm": 0.6799281357494354, "learning_rate": 1.7509038787184795e-06, "loss": 0.0433, "step": 2732 }, { "epoch": 2.1010566762728144, "grad_norm": 0.6560319432868118, "learning_rate": 1.7454829418890321e-06, "loss": 0.0404, "step": 2734 }, { "epoch": 2.102593659942363, "grad_norm": 0.6785113040948152, "learning_rate": 1.7400676208152185e-06, "loss": 0.0387, "step": 2736 }, { "epoch": 2.104130643611912, "grad_norm": 0.6843492876225117, "learning_rate": 1.7346579328300795e-06, "loss": 0.0437, "step": 2738 }, { "epoch": 2.10566762728146, "grad_norm": 0.6887111820251511, "learning_rate": 1.7292538952486288e-06, "loss": 0.0384, "step": 2740 }, { "epoch": 2.1072046109510087, "grad_norm": 0.7567201231202126, "learning_rate": 1.7238555253677945e-06, "loss": 0.0438, "step": 2742 }, { "epoch": 2.108741594620557, "grad_norm": 0.6548000735118099, "learning_rate": 1.7184628404663628e-06, "loss": 0.0392, "step": 2744 }, { "epoch": 2.1102785782901057, "grad_norm": 0.6916814777807151, "learning_rate": 1.713075857804926e-06, "loss": 0.039, "step": 2746 }, { "epoch": 2.1118155619596544, "grad_norm": 0.675062961353931, "learning_rate": 1.7076945946258195e-06, "loss": 0.0409, "step": 2748 }, { "epoch": 2.1133525456292026, "grad_norm": 0.7099794758199118, "learning_rate": 1.702319068153079e-06, "loss": 0.0422, "step": 2750 }, { "epoch": 2.1148895292987513, "grad_norm": 0.8255111866724547, "learning_rate": 1.6969492955923765e-06, "loss": 0.0467, "step": 2752 }, { "epoch": 2.1164265129682995, "grad_norm": 0.7089306864322071, "learning_rate": 1.6915852941309628e-06, "loss": 0.0372, "step": 2754 }, { "epoch": 2.117963496637848, "grad_norm": 0.7195267991677364, "learning_rate": 1.6862270809376217e-06, "loss": 0.0441, "step": 2756 }, { "epoch": 2.119500480307397, "grad_norm": 0.7600552648629298, "learning_rate": 1.6808746731626085e-06, "loss": 0.0498, "step": 2758 }, { "epoch": 2.121037463976945, "grad_norm": 0.6719618311917168, "learning_rate": 1.6755280879375975e-06, "loss": 0.0365, "step": 2760 }, { "epoch": 2.122574447646494, "grad_norm": 0.6725979801550628, "learning_rate": 1.6701873423756275e-06, "loss": 0.038, "step": 2762 }, { "epoch": 2.124111431316042, "grad_norm": 0.7133205530228818, "learning_rate": 1.6648524535710401e-06, "loss": 0.041, "step": 2764 }, { "epoch": 2.1256484149855908, "grad_norm": 0.6458222389869777, "learning_rate": 1.6595234385994398e-06, "loss": 0.0433, "step": 2766 }, { "epoch": 2.1271853986551394, "grad_norm": 0.7422996801701388, "learning_rate": 1.6542003145176265e-06, "loss": 0.0425, "step": 2768 }, { "epoch": 2.1287223823246877, "grad_norm": 0.7076408311621847, "learning_rate": 1.648883098363542e-06, "loss": 0.0416, "step": 2770 }, { "epoch": 2.1302593659942364, "grad_norm": 0.7798719878560143, "learning_rate": 1.6435718071562212e-06, "loss": 0.0495, "step": 2772 }, { "epoch": 2.1317963496637846, "grad_norm": 0.7240342459534611, "learning_rate": 1.6382664578957359e-06, "loss": 0.0392, "step": 2774 }, { "epoch": 2.1333333333333333, "grad_norm": 0.7052969420907925, "learning_rate": 1.6329670675631369e-06, "loss": 0.0419, "step": 2776 }, { "epoch": 2.134870317002882, "grad_norm": 0.7301031702593861, "learning_rate": 1.6276736531204044e-06, "loss": 0.0386, "step": 2778 }, { "epoch": 2.1364073006724302, "grad_norm": 0.6875032643695771, "learning_rate": 1.6223862315103865e-06, "loss": 0.0393, "step": 2780 }, { "epoch": 2.137944284341979, "grad_norm": 0.6775172137731008, "learning_rate": 1.617104819656758e-06, "loss": 0.0399, "step": 2782 }, { "epoch": 2.1394812680115276, "grad_norm": 0.7408298536972419, "learning_rate": 1.6118294344639496e-06, "loss": 0.0431, "step": 2784 }, { "epoch": 2.141018251681076, "grad_norm": 0.7531648570461281, "learning_rate": 1.6065600928171054e-06, "loss": 0.0401, "step": 2786 }, { "epoch": 2.1425552353506245, "grad_norm": 0.6778919207003176, "learning_rate": 1.6012968115820306e-06, "loss": 0.0399, "step": 2788 }, { "epoch": 2.1440922190201728, "grad_norm": 0.7039227096859256, "learning_rate": 1.5960396076051233e-06, "loss": 0.0402, "step": 2790 }, { "epoch": 2.1456292026897215, "grad_norm": 0.6789632610059323, "learning_rate": 1.5907884977133366e-06, "loss": 0.0373, "step": 2792 }, { "epoch": 2.1471661863592697, "grad_norm": 0.6884435472545695, "learning_rate": 1.5855434987141163e-06, "loss": 0.0447, "step": 2794 }, { "epoch": 2.1487031700288184, "grad_norm": 0.6920660658427317, "learning_rate": 1.580304627395348e-06, "loss": 0.0454, "step": 2796 }, { "epoch": 2.150240153698367, "grad_norm": 0.7745737611033345, "learning_rate": 1.5750719005253062e-06, "loss": 0.0413, "step": 2798 }, { "epoch": 2.1517771373679153, "grad_norm": 0.6907792630115484, "learning_rate": 1.569845334852595e-06, "loss": 0.041, "step": 2800 }, { "epoch": 2.153314121037464, "grad_norm": 0.6604941051087728, "learning_rate": 1.5646249471060995e-06, "loss": 0.0431, "step": 2802 }, { "epoch": 2.1548511047070127, "grad_norm": 0.7080559060460522, "learning_rate": 1.5594107539949365e-06, "loss": 0.0421, "step": 2804 }, { "epoch": 2.156388088376561, "grad_norm": 0.7149705649090393, "learning_rate": 1.5542027722083853e-06, "loss": 0.0388, "step": 2806 }, { "epoch": 2.1579250720461096, "grad_norm": 0.7146661785693218, "learning_rate": 1.5490010184158523e-06, "loss": 0.0398, "step": 2808 }, { "epoch": 2.159462055715658, "grad_norm": 0.7240758501470305, "learning_rate": 1.543805509266806e-06, "loss": 0.045, "step": 2810 }, { "epoch": 2.1609990393852065, "grad_norm": 0.7001839005699572, "learning_rate": 1.5386162613907287e-06, "loss": 0.0377, "step": 2812 }, { "epoch": 2.1625360230547552, "grad_norm": 0.7094421429704845, "learning_rate": 1.5334332913970623e-06, "loss": 0.0425, "step": 2814 }, { "epoch": 2.1640730067243035, "grad_norm": 0.6847460522981713, "learning_rate": 1.5282566158751524e-06, "loss": 0.0402, "step": 2816 }, { "epoch": 2.165609990393852, "grad_norm": 0.7120209696945222, "learning_rate": 1.5230862513941995e-06, "loss": 0.0399, "step": 2818 }, { "epoch": 2.1671469740634004, "grad_norm": 0.6907346621733982, "learning_rate": 1.5179222145032073e-06, "loss": 0.0447, "step": 2820 }, { "epoch": 2.168683957732949, "grad_norm": 0.7773144554098375, "learning_rate": 1.5127645217309192e-06, "loss": 0.0434, "step": 2822 }, { "epoch": 2.1702209414024978, "grad_norm": 0.7117848076597437, "learning_rate": 1.5076131895857784e-06, "loss": 0.0423, "step": 2824 }, { "epoch": 2.171757925072046, "grad_norm": 0.7027544209261267, "learning_rate": 1.5024682345558675e-06, "loss": 0.0401, "step": 2826 }, { "epoch": 2.1732949087415947, "grad_norm": 0.7033554856080803, "learning_rate": 1.4973296731088581e-06, "loss": 0.0388, "step": 2828 }, { "epoch": 2.174831892411143, "grad_norm": 0.7715815363157628, "learning_rate": 1.4921975216919582e-06, "loss": 0.04, "step": 2830 }, { "epoch": 2.1763688760806916, "grad_norm": 0.7862111914567127, "learning_rate": 1.4870717967318554e-06, "loss": 0.0392, "step": 2832 }, { "epoch": 2.1779058597502403, "grad_norm": 0.7221877404149011, "learning_rate": 1.4819525146346723e-06, "loss": 0.0461, "step": 2834 }, { "epoch": 2.1794428434197886, "grad_norm": 0.6911027142163597, "learning_rate": 1.4768396917859073e-06, "loss": 0.0382, "step": 2836 }, { "epoch": 2.1809798270893372, "grad_norm": 0.7119349195163982, "learning_rate": 1.4717333445503851e-06, "loss": 0.0384, "step": 2838 }, { "epoch": 2.1825168107588855, "grad_norm": 0.7025660651802623, "learning_rate": 1.4666334892722047e-06, "loss": 0.0393, "step": 2840 }, { "epoch": 2.184053794428434, "grad_norm": 0.7083054615445171, "learning_rate": 1.4615401422746807e-06, "loss": 0.0411, "step": 2842 }, { "epoch": 2.185590778097983, "grad_norm": 0.7283963127887443, "learning_rate": 1.4564533198603048e-06, "loss": 0.0417, "step": 2844 }, { "epoch": 2.187127761767531, "grad_norm": 0.6929199237523709, "learning_rate": 1.45137303831068e-06, "loss": 0.0378, "step": 2846 }, { "epoch": 2.18866474543708, "grad_norm": 0.7003853343345394, "learning_rate": 1.4462993138864725e-06, "loss": 0.0389, "step": 2848 }, { "epoch": 2.1902017291066285, "grad_norm": 0.6983653676006323, "learning_rate": 1.441232162827364e-06, "loss": 0.039, "step": 2850 }, { "epoch": 2.1917387127761767, "grad_norm": 0.7159975682093909, "learning_rate": 1.4361716013519952e-06, "loss": 0.0409, "step": 2852 }, { "epoch": 2.1932756964457254, "grad_norm": 0.7754416019144194, "learning_rate": 1.4311176456579158e-06, "loss": 0.0427, "step": 2854 }, { "epoch": 2.1948126801152736, "grad_norm": 0.7345157001852429, "learning_rate": 1.4260703119215324e-06, "loss": 0.0425, "step": 2856 }, { "epoch": 2.1963496637848223, "grad_norm": 0.6537911636031369, "learning_rate": 1.4210296162980526e-06, "loss": 0.04, "step": 2858 }, { "epoch": 2.1978866474543706, "grad_norm": 0.6733339321649235, "learning_rate": 1.415995574921444e-06, "loss": 0.0358, "step": 2860 }, { "epoch": 2.1994236311239193, "grad_norm": 0.6520397122582371, "learning_rate": 1.4109682039043717e-06, "loss": 0.0367, "step": 2862 }, { "epoch": 2.200960614793468, "grad_norm": 0.7851918798134458, "learning_rate": 1.4059475193381485e-06, "loss": 0.044, "step": 2864 }, { "epoch": 2.202497598463016, "grad_norm": 0.6944754890985299, "learning_rate": 1.4009335372926895e-06, "loss": 0.0388, "step": 2866 }, { "epoch": 2.204034582132565, "grad_norm": 0.7185388294778169, "learning_rate": 1.3959262738164548e-06, "loss": 0.0434, "step": 2868 }, { "epoch": 2.2055715658021136, "grad_norm": 0.7581514840792837, "learning_rate": 1.3909257449364012e-06, "loss": 0.0419, "step": 2870 }, { "epoch": 2.207108549471662, "grad_norm": 0.6783363501218831, "learning_rate": 1.38593196665793e-06, "loss": 0.0374, "step": 2872 }, { "epoch": 2.2086455331412105, "grad_norm": 0.6911512020404058, "learning_rate": 1.3809449549648313e-06, "loss": 0.037, "step": 2874 }, { "epoch": 2.2101825168107587, "grad_norm": 0.6909020138454802, "learning_rate": 1.3759647258192446e-06, "loss": 0.0388, "step": 2876 }, { "epoch": 2.2117195004803074, "grad_norm": 0.6798676329168656, "learning_rate": 1.370991295161596e-06, "loss": 0.0371, "step": 2878 }, { "epoch": 2.213256484149856, "grad_norm": 0.7757828920522275, "learning_rate": 1.3660246789105472e-06, "loss": 0.0383, "step": 2880 }, { "epoch": 2.2147934678194043, "grad_norm": 0.7289028293318881, "learning_rate": 1.3610648929629593e-06, "loss": 0.0408, "step": 2882 }, { "epoch": 2.216330451488953, "grad_norm": 0.7420967431557955, "learning_rate": 1.3561119531938205e-06, "loss": 0.043, "step": 2884 }, { "epoch": 2.2178674351585013, "grad_norm": 0.7163966573368784, "learning_rate": 1.3511658754562126e-06, "loss": 0.0396, "step": 2886 }, { "epoch": 2.21940441882805, "grad_norm": 0.7419349249047851, "learning_rate": 1.3462266755812522e-06, "loss": 0.0417, "step": 2888 }, { "epoch": 2.2209414024975986, "grad_norm": 0.7800199446876597, "learning_rate": 1.3412943693780412e-06, "loss": 0.0422, "step": 2890 }, { "epoch": 2.222478386167147, "grad_norm": 0.6961688291543298, "learning_rate": 1.3363689726336185e-06, "loss": 0.0407, "step": 2892 }, { "epoch": 2.2240153698366956, "grad_norm": 0.7098183143734048, "learning_rate": 1.3314505011129031e-06, "loss": 0.0441, "step": 2894 }, { "epoch": 2.225552353506244, "grad_norm": 0.6599491965120293, "learning_rate": 1.3265389705586513e-06, "loss": 0.0391, "step": 2896 }, { "epoch": 2.2270893371757925, "grad_norm": 0.6447579173261855, "learning_rate": 1.3216343966914074e-06, "loss": 0.0346, "step": 2898 }, { "epoch": 2.228626320845341, "grad_norm": 0.7146194112938924, "learning_rate": 1.31673679520944e-06, "loss": 0.0414, "step": 2900 }, { "epoch": 2.2301633045148894, "grad_norm": 0.7599787345630702, "learning_rate": 1.3118461817887071e-06, "loss": 0.0426, "step": 2902 }, { "epoch": 2.231700288184438, "grad_norm": 0.7045861225059481, "learning_rate": 1.3069625720827984e-06, "loss": 0.0369, "step": 2904 }, { "epoch": 2.2332372718539863, "grad_norm": 0.6678597621037076, "learning_rate": 1.302085981722887e-06, "loss": 0.0378, "step": 2906 }, { "epoch": 2.234774255523535, "grad_norm": 0.7318552001241747, "learning_rate": 1.2972164263176784e-06, "loss": 0.043, "step": 2908 }, { "epoch": 2.2363112391930837, "grad_norm": 0.7347881359720747, "learning_rate": 1.2923539214533586e-06, "loss": 0.0435, "step": 2910 }, { "epoch": 2.237848222862632, "grad_norm": 0.7833005087501644, "learning_rate": 1.2874984826935493e-06, "loss": 0.0411, "step": 2912 }, { "epoch": 2.2393852065321806, "grad_norm": 0.7340931719392261, "learning_rate": 1.2826501255792588e-06, "loss": 0.0419, "step": 2914 }, { "epoch": 2.2409221902017293, "grad_norm": 0.7244781359138952, "learning_rate": 1.277808865628821e-06, "loss": 0.0403, "step": 2916 }, { "epoch": 2.2424591738712776, "grad_norm": 0.7012819390876723, "learning_rate": 1.2729747183378591e-06, "loss": 0.041, "step": 2918 }, { "epoch": 2.2439961575408263, "grad_norm": 0.7456233730244353, "learning_rate": 1.2681476991792295e-06, "loss": 0.0421, "step": 2920 }, { "epoch": 2.2455331412103745, "grad_norm": 0.6802488303834758, "learning_rate": 1.2633278236029728e-06, "loss": 0.0362, "step": 2922 }, { "epoch": 2.247070124879923, "grad_norm": 0.7518464030635749, "learning_rate": 1.2585151070362655e-06, "loss": 0.0404, "step": 2924 }, { "epoch": 2.2486071085494714, "grad_norm": 0.6909362534084197, "learning_rate": 1.253709564883368e-06, "loss": 0.0404, "step": 2926 }, { "epoch": 2.25014409221902, "grad_norm": 0.752505311933665, "learning_rate": 1.2489112125255795e-06, "loss": 0.0387, "step": 2928 }, { "epoch": 2.251681075888569, "grad_norm": 0.74911032053493, "learning_rate": 1.2441200653211853e-06, "loss": 0.0383, "step": 2930 }, { "epoch": 2.253218059558117, "grad_norm": 0.7048961647841477, "learning_rate": 1.2393361386054102e-06, "loss": 0.0359, "step": 2932 }, { "epoch": 2.2547550432276657, "grad_norm": 0.7455505928042365, "learning_rate": 1.2345594476903678e-06, "loss": 0.0388, "step": 2934 }, { "epoch": 2.2562920268972144, "grad_norm": 0.7618852898444295, "learning_rate": 1.2297900078650073e-06, "loss": 0.0406, "step": 2936 }, { "epoch": 2.2578290105667627, "grad_norm": 0.7877656713372072, "learning_rate": 1.225027834395077e-06, "loss": 0.0418, "step": 2938 }, { "epoch": 2.2593659942363113, "grad_norm": 0.7487876696738023, "learning_rate": 1.2202729425230628e-06, "loss": 0.0435, "step": 2940 }, { "epoch": 2.2609029779058596, "grad_norm": 0.7549339920384163, "learning_rate": 1.215525347468143e-06, "loss": 0.039, "step": 2942 }, { "epoch": 2.2624399615754083, "grad_norm": 0.7381797557455133, "learning_rate": 1.210785064426143e-06, "loss": 0.0401, "step": 2944 }, { "epoch": 2.263976945244957, "grad_norm": 0.7303121808696214, "learning_rate": 1.2060521085694836e-06, "loss": 0.0393, "step": 2946 }, { "epoch": 2.265513928914505, "grad_norm": 0.7091244479813347, "learning_rate": 1.2013264950471333e-06, "loss": 0.0426, "step": 2948 }, { "epoch": 2.267050912584054, "grad_norm": 0.8368399757213874, "learning_rate": 1.196608238984561e-06, "loss": 0.0488, "step": 2950 }, { "epoch": 2.268587896253602, "grad_norm": 0.7827298872789893, "learning_rate": 1.191897355483681e-06, "loss": 0.0439, "step": 2952 }, { "epoch": 2.270124879923151, "grad_norm": 0.7171969138664943, "learning_rate": 1.1871938596228177e-06, "loss": 0.0388, "step": 2954 }, { "epoch": 2.2716618635926995, "grad_norm": 0.7079483043665579, "learning_rate": 1.1824977664566453e-06, "loss": 0.0418, "step": 2956 }, { "epoch": 2.2731988472622477, "grad_norm": 0.7574665616251981, "learning_rate": 1.1778090910161426e-06, "loss": 0.0416, "step": 2958 }, { "epoch": 2.2747358309317964, "grad_norm": 0.7111652785046836, "learning_rate": 1.1731278483085481e-06, "loss": 0.0427, "step": 2960 }, { "epoch": 2.2762728146013447, "grad_norm": 0.7535827297817603, "learning_rate": 1.1684540533173104e-06, "loss": 0.041, "step": 2962 }, { "epoch": 2.2778097982708934, "grad_norm": 0.7519434123378339, "learning_rate": 1.1637877210020395e-06, "loss": 0.0408, "step": 2964 }, { "epoch": 2.279346781940442, "grad_norm": 0.7008462349602121, "learning_rate": 1.1591288662984594e-06, "loss": 0.0397, "step": 2966 }, { "epoch": 2.2808837656099903, "grad_norm": 0.7152887905478794, "learning_rate": 1.154477504118357e-06, "loss": 0.038, "step": 2968 }, { "epoch": 2.282420749279539, "grad_norm": 0.7322203327399467, "learning_rate": 1.1498336493495444e-06, "loss": 0.0424, "step": 2970 }, { "epoch": 2.283957732949087, "grad_norm": 0.705019954856083, "learning_rate": 1.1451973168557992e-06, "loss": 0.0393, "step": 2972 }, { "epoch": 2.285494716618636, "grad_norm": 0.7314831773427871, "learning_rate": 1.1405685214768224e-06, "loss": 0.042, "step": 2974 }, { "epoch": 2.2870317002881846, "grad_norm": 0.7024944801405867, "learning_rate": 1.1359472780281916e-06, "loss": 0.04, "step": 2976 }, { "epoch": 2.288568683957733, "grad_norm": 0.7216103123781318, "learning_rate": 1.1313336013013139e-06, "loss": 0.04, "step": 2978 }, { "epoch": 2.2901056676272815, "grad_norm": 0.6819781243729732, "learning_rate": 1.126727506063375e-06, "loss": 0.0383, "step": 2980 }, { "epoch": 2.29164265129683, "grad_norm": 0.6997754986778041, "learning_rate": 1.122129007057296e-06, "loss": 0.0402, "step": 2982 }, { "epoch": 2.2931796349663784, "grad_norm": 0.6626184454931626, "learning_rate": 1.1175381190016833e-06, "loss": 0.038, "step": 2984 }, { "epoch": 2.294716618635927, "grad_norm": 0.6890948335939371, "learning_rate": 1.1129548565907842e-06, "loss": 0.041, "step": 2986 }, { "epoch": 2.2962536023054754, "grad_norm": 0.6433359932240779, "learning_rate": 1.108379234494435e-06, "loss": 0.0351, "step": 2988 }, { "epoch": 2.297790585975024, "grad_norm": 0.7392619695394935, "learning_rate": 1.103811267358019e-06, "loss": 0.0414, "step": 2990 }, { "epoch": 2.2993275696445723, "grad_norm": 0.7246127735420398, "learning_rate": 1.0992509698024226e-06, "loss": 0.0389, "step": 2992 }, { "epoch": 2.300864553314121, "grad_norm": 0.7447654773215131, "learning_rate": 1.0946983564239754e-06, "loss": 0.0459, "step": 2994 }, { "epoch": 2.3024015369836697, "grad_norm": 0.7324507127817105, "learning_rate": 1.0901534417944175e-06, "loss": 0.0377, "step": 2996 }, { "epoch": 2.303938520653218, "grad_norm": 0.6915535971116652, "learning_rate": 1.0856162404608466e-06, "loss": 0.0361, "step": 2998 }, { "epoch": 2.3054755043227666, "grad_norm": 0.7490224319565619, "learning_rate": 1.0810867669456705e-06, "loss": 0.0375, "step": 3000 }, { "epoch": 2.3054755043227666, "eval_loss": 0.1596132069826126, "eval_runtime": 361.7237, "eval_samples_per_second": 51.158, "eval_steps_per_second": 6.397, "step": 3000 }, { "epoch": 2.3070124879923153, "grad_norm": 0.7500197718141116, "learning_rate": 1.0765650357465648e-06, "loss": 0.0439, "step": 3002 }, { "epoch": 2.3085494716618635, "grad_norm": 0.7509126020866208, "learning_rate": 1.07205106133642e-06, "loss": 0.0399, "step": 3004 }, { "epoch": 2.310086455331412, "grad_norm": 0.6985233959996506, "learning_rate": 1.0675448581633016e-06, "loss": 0.0368, "step": 3006 }, { "epoch": 2.3116234390009605, "grad_norm": 0.7109894469419616, "learning_rate": 1.063046440650405e-06, "loss": 0.0394, "step": 3008 }, { "epoch": 2.313160422670509, "grad_norm": 0.6685986841427571, "learning_rate": 1.0585558231959986e-06, "loss": 0.0354, "step": 3010 }, { "epoch": 2.314697406340058, "grad_norm": 0.6537755179648417, "learning_rate": 1.0540730201733887e-06, "loss": 0.0341, "step": 3012 }, { "epoch": 2.316234390009606, "grad_norm": 0.7680457966259363, "learning_rate": 1.0495980459308696e-06, "loss": 0.0401, "step": 3014 }, { "epoch": 2.3177713736791548, "grad_norm": 0.7160760279013273, "learning_rate": 1.0451309147916773e-06, "loss": 0.0385, "step": 3016 }, { "epoch": 2.319308357348703, "grad_norm": 0.6930648625031354, "learning_rate": 1.040671641053945e-06, "loss": 0.0385, "step": 3018 }, { "epoch": 2.3208453410182517, "grad_norm": 0.7212560558538098, "learning_rate": 1.036220238990653e-06, "loss": 0.038, "step": 3020 }, { "epoch": 2.3223823246878004, "grad_norm": 0.7246474758799715, "learning_rate": 1.0317767228495906e-06, "loss": 0.0371, "step": 3022 }, { "epoch": 2.3239193083573486, "grad_norm": 0.7534642781877979, "learning_rate": 1.0273411068533037e-06, "loss": 0.0387, "step": 3024 }, { "epoch": 2.3254562920268973, "grad_norm": 0.7525530298880164, "learning_rate": 1.0229134051990528e-06, "loss": 0.0406, "step": 3026 }, { "epoch": 2.3269932756964455, "grad_norm": 0.7939486095316215, "learning_rate": 1.018493632058767e-06, "loss": 0.0445, "step": 3028 }, { "epoch": 2.3285302593659942, "grad_norm": 0.7218972892190336, "learning_rate": 1.0140818015789975e-06, "loss": 0.0405, "step": 3030 }, { "epoch": 2.330067243035543, "grad_norm": 0.7983391471167234, "learning_rate": 1.009677927880873e-06, "loss": 0.04, "step": 3032 }, { "epoch": 2.331604226705091, "grad_norm": 0.7867565276794657, "learning_rate": 1.0052820250600568e-06, "loss": 0.0433, "step": 3034 }, { "epoch": 2.33314121037464, "grad_norm": 0.7385399550662357, "learning_rate": 1.000894107186695e-06, "loss": 0.0358, "step": 3036 }, { "epoch": 2.334678194044188, "grad_norm": 0.7023204396957986, "learning_rate": 9.965141883053803e-07, "loss": 0.0409, "step": 3038 }, { "epoch": 2.3362151777137368, "grad_norm": 0.7239223187447263, "learning_rate": 9.921422824351001e-07, "loss": 0.0401, "step": 3040 }, { "epoch": 2.3377521613832855, "grad_norm": 0.7201483530659722, "learning_rate": 9.87778403569196e-07, "loss": 0.0374, "step": 3042 }, { "epoch": 2.3392891450528337, "grad_norm": 0.7067782590935108, "learning_rate": 9.834225656753167e-07, "loss": 0.0364, "step": 3044 }, { "epoch": 2.3408261287223824, "grad_norm": 0.767002733961986, "learning_rate": 9.790747826953707e-07, "loss": 0.0382, "step": 3046 }, { "epoch": 2.342363112391931, "grad_norm": 0.7748316117552638, "learning_rate": 9.747350685454906e-07, "loss": 0.0416, "step": 3048 }, { "epoch": 2.3439000960614793, "grad_norm": 0.6930253730222432, "learning_rate": 9.704034371159801e-07, "loss": 0.0409, "step": 3050 }, { "epoch": 2.345437079731028, "grad_norm": 0.7552114221352043, "learning_rate": 9.660799022712694e-07, "loss": 0.0412, "step": 3052 }, { "epoch": 2.3469740634005762, "grad_norm": 0.7612525976410068, "learning_rate": 9.617644778498773e-07, "loss": 0.0383, "step": 3054 }, { "epoch": 2.348511047070125, "grad_norm": 0.7280586427447336, "learning_rate": 9.574571776643617e-07, "loss": 0.0433, "step": 3056 }, { "epoch": 2.350048030739673, "grad_norm": 0.6693940571370718, "learning_rate": 9.531580155012778e-07, "loss": 0.04, "step": 3058 }, { "epoch": 2.351585014409222, "grad_norm": 0.7048821090945985, "learning_rate": 9.488670051211332e-07, "loss": 0.0394, "step": 3060 }, { "epoch": 2.3531219980787705, "grad_norm": 0.7505448748331673, "learning_rate": 9.4458416025834e-07, "loss": 0.0392, "step": 3062 }, { "epoch": 2.354658981748319, "grad_norm": 0.6771190590157495, "learning_rate": 9.403094946211808e-07, "loss": 0.0413, "step": 3064 }, { "epoch": 2.3561959654178675, "grad_norm": 0.7260675282763582, "learning_rate": 9.360430218917558e-07, "loss": 0.0398, "step": 3066 }, { "epoch": 2.357732949087416, "grad_norm": 0.687104316635614, "learning_rate": 9.317847557259387e-07, "loss": 0.0388, "step": 3068 }, { "epoch": 2.3592699327569644, "grad_norm": 0.6959356324751542, "learning_rate": 9.275347097533404e-07, "loss": 0.0409, "step": 3070 }, { "epoch": 2.360806916426513, "grad_norm": 0.7566129208877186, "learning_rate": 9.232928975772597e-07, "loss": 0.0386, "step": 3072 }, { "epoch": 2.3623439000960613, "grad_norm": 0.7158570072440973, "learning_rate": 9.190593327746406e-07, "loss": 0.0389, "step": 3074 }, { "epoch": 2.36388088376561, "grad_norm": 0.7207667687469005, "learning_rate": 9.148340288960307e-07, "loss": 0.0409, "step": 3076 }, { "epoch": 2.3654178674351587, "grad_norm": 0.7445605342380751, "learning_rate": 9.106169994655347e-07, "loss": 0.0407, "step": 3078 }, { "epoch": 2.366954851104707, "grad_norm": 0.7736558101051751, "learning_rate": 9.064082579807752e-07, "loss": 0.0387, "step": 3080 }, { "epoch": 2.3684918347742556, "grad_norm": 0.7435778083456558, "learning_rate": 9.022078179128439e-07, "loss": 0.0422, "step": 3082 }, { "epoch": 2.370028818443804, "grad_norm": 0.680093837395743, "learning_rate": 8.980156927062642e-07, "loss": 0.0341, "step": 3084 }, { "epoch": 2.3715658021133526, "grad_norm": 0.801244212730542, "learning_rate": 8.93831895778948e-07, "loss": 0.0372, "step": 3086 }, { "epoch": 2.3731027857829012, "grad_norm": 0.7357307681580836, "learning_rate": 8.896564405221447e-07, "loss": 0.0377, "step": 3088 }, { "epoch": 2.3746397694524495, "grad_norm": 0.7209769877560075, "learning_rate": 8.85489340300409e-07, "loss": 0.0423, "step": 3090 }, { "epoch": 2.376176753121998, "grad_norm": 0.7330652449279741, "learning_rate": 8.813306084515513e-07, "loss": 0.0416, "step": 3092 }, { "epoch": 2.3777137367915464, "grad_norm": 0.7504076654960494, "learning_rate": 8.771802582865972e-07, "loss": 0.0422, "step": 3094 }, { "epoch": 2.379250720461095, "grad_norm": 0.6852298932331159, "learning_rate": 8.730383030897463e-07, "loss": 0.038, "step": 3096 }, { "epoch": 2.380787704130644, "grad_norm": 0.6881568732248152, "learning_rate": 8.689047561183245e-07, "loss": 0.0382, "step": 3098 }, { "epoch": 2.382324687800192, "grad_norm": 0.6634157742716904, "learning_rate": 8.647796306027476e-07, "loss": 0.0379, "step": 3100 }, { "epoch": 2.3838616714697407, "grad_norm": 0.7328461753734546, "learning_rate": 8.606629397464787e-07, "loss": 0.0408, "step": 3102 }, { "epoch": 2.385398655139289, "grad_norm": 0.7159969869187992, "learning_rate": 8.565546967259782e-07, "loss": 0.0351, "step": 3104 }, { "epoch": 2.3869356388088376, "grad_norm": 0.7267286204071172, "learning_rate": 8.524549146906714e-07, "loss": 0.0399, "step": 3106 }, { "epoch": 2.3884726224783863, "grad_norm": 0.6908926932230512, "learning_rate": 8.483636067629014e-07, "loss": 0.0434, "step": 3108 }, { "epoch": 2.3900096061479346, "grad_norm": 0.697297173531623, "learning_rate": 8.442807860378868e-07, "loss": 0.0398, "step": 3110 }, { "epoch": 2.3915465898174832, "grad_norm": 0.704515687419953, "learning_rate": 8.40206465583683e-07, "loss": 0.041, "step": 3112 }, { "epoch": 2.393083573487032, "grad_norm": 0.717830937968017, "learning_rate": 8.361406584411343e-07, "loss": 0.0382, "step": 3114 }, { "epoch": 2.39462055715658, "grad_norm": 0.6666435251973266, "learning_rate": 8.320833776238384e-07, "loss": 0.0355, "step": 3116 }, { "epoch": 2.396157540826129, "grad_norm": 0.7678187211880796, "learning_rate": 8.280346361181063e-07, "loss": 0.0445, "step": 3118 }, { "epoch": 2.397694524495677, "grad_norm": 0.6358919496522946, "learning_rate": 8.239944468829094e-07, "loss": 0.038, "step": 3120 }, { "epoch": 2.399231508165226, "grad_norm": 0.6575677433822967, "learning_rate": 8.199628228498507e-07, "loss": 0.0382, "step": 3122 }, { "epoch": 2.400768491834774, "grad_norm": 0.7346980326551955, "learning_rate": 8.159397769231166e-07, "loss": 0.0406, "step": 3124 }, { "epoch": 2.4023054755043227, "grad_norm": 0.6780951881965817, "learning_rate": 8.119253219794369e-07, "loss": 0.0376, "step": 3126 }, { "epoch": 2.4038424591738714, "grad_norm": 0.7518449654269455, "learning_rate": 8.079194708680458e-07, "loss": 0.0432, "step": 3128 }, { "epoch": 2.4053794428434196, "grad_norm": 0.6735026739540719, "learning_rate": 8.039222364106351e-07, "loss": 0.0353, "step": 3130 }, { "epoch": 2.4069164265129683, "grad_norm": 0.6342721214297197, "learning_rate": 7.999336314013204e-07, "loss": 0.0402, "step": 3132 }, { "epoch": 2.408453410182517, "grad_norm": 0.7230053307776508, "learning_rate": 7.959536686065951e-07, "loss": 0.0391, "step": 3134 }, { "epoch": 2.4099903938520653, "grad_norm": 0.7653416656760699, "learning_rate": 7.919823607652921e-07, "loss": 0.0399, "step": 3136 }, { "epoch": 2.411527377521614, "grad_norm": 0.8030771738565683, "learning_rate": 7.880197205885418e-07, "loss": 0.0429, "step": 3138 }, { "epoch": 2.413064361191162, "grad_norm": 0.7931388452091263, "learning_rate": 7.840657607597282e-07, "loss": 0.0428, "step": 3140 }, { "epoch": 2.414601344860711, "grad_norm": 0.7430780202524496, "learning_rate": 7.80120493934458e-07, "loss": 0.0431, "step": 3142 }, { "epoch": 2.4161383285302596, "grad_norm": 0.6756429003384381, "learning_rate": 7.761839327405105e-07, "loss": 0.038, "step": 3144 }, { "epoch": 2.417675312199808, "grad_norm": 0.7445655284954013, "learning_rate": 7.722560897777989e-07, "loss": 0.0378, "step": 3146 }, { "epoch": 2.4192122958693565, "grad_norm": 0.780771575546592, "learning_rate": 7.683369776183342e-07, "loss": 0.0398, "step": 3148 }, { "epoch": 2.4207492795389047, "grad_norm": 0.7856619296684908, "learning_rate": 7.644266088061811e-07, "loss": 0.0436, "step": 3150 }, { "epoch": 2.4222862632084534, "grad_norm": 0.7727263360626014, "learning_rate": 7.605249958574199e-07, "loss": 0.0435, "step": 3152 }, { "epoch": 2.423823246878002, "grad_norm": 0.7022557826012432, "learning_rate": 7.566321512601064e-07, "loss": 0.039, "step": 3154 }, { "epoch": 2.4253602305475503, "grad_norm": 0.7135877902071903, "learning_rate": 7.527480874742269e-07, "loss": 0.0404, "step": 3156 }, { "epoch": 2.426897214217099, "grad_norm": 0.726091607679137, "learning_rate": 7.488728169316684e-07, "loss": 0.039, "step": 3158 }, { "epoch": 2.4284341978866473, "grad_norm": 0.6803423609158856, "learning_rate": 7.450063520361706e-07, "loss": 0.0382, "step": 3160 }, { "epoch": 2.429971181556196, "grad_norm": 0.631085298189217, "learning_rate": 7.411487051632861e-07, "loss": 0.0369, "step": 3162 }, { "epoch": 2.4315081652257446, "grad_norm": 0.68946022968755, "learning_rate": 7.372998886603466e-07, "loss": 0.0383, "step": 3164 }, { "epoch": 2.433045148895293, "grad_norm": 0.6831510201370967, "learning_rate": 7.33459914846419e-07, "loss": 0.0431, "step": 3166 }, { "epoch": 2.4345821325648416, "grad_norm": 0.7684038963667288, "learning_rate": 7.296287960122672e-07, "loss": 0.0414, "step": 3168 }, { "epoch": 2.43611911623439, "grad_norm": 0.7347215978088572, "learning_rate": 7.258065444203128e-07, "loss": 0.0416, "step": 3170 }, { "epoch": 2.4376560999039385, "grad_norm": 0.7643085623961072, "learning_rate": 7.219931723045929e-07, "loss": 0.0433, "step": 3172 }, { "epoch": 2.439193083573487, "grad_norm": 0.7071330978188225, "learning_rate": 7.181886918707297e-07, "loss": 0.037, "step": 3174 }, { "epoch": 2.4407300672430354, "grad_norm": 0.6841434498931844, "learning_rate": 7.143931152958791e-07, "loss": 0.0372, "step": 3176 }, { "epoch": 2.442267050912584, "grad_norm": 0.692314572344589, "learning_rate": 7.106064547287002e-07, "loss": 0.0345, "step": 3178 }, { "epoch": 2.443804034582133, "grad_norm": 0.7363574996455537, "learning_rate": 7.068287222893179e-07, "loss": 0.0379, "step": 3180 }, { "epoch": 2.445341018251681, "grad_norm": 0.6822850019837609, "learning_rate": 7.030599300692748e-07, "loss": 0.0368, "step": 3182 }, { "epoch": 2.4468780019212297, "grad_norm": 0.7168612788689864, "learning_rate": 6.993000901315013e-07, "loss": 0.0375, "step": 3184 }, { "epoch": 2.448414985590778, "grad_norm": 0.7882428320516394, "learning_rate": 6.955492145102735e-07, "loss": 0.0411, "step": 3186 }, { "epoch": 2.4499519692603267, "grad_norm": 0.7300434190827599, "learning_rate": 6.918073152111746e-07, "loss": 0.0411, "step": 3188 }, { "epoch": 2.451488952929875, "grad_norm": 0.7866316609425517, "learning_rate": 6.88074404211058e-07, "loss": 0.0439, "step": 3190 }, { "epoch": 2.4530259365994236, "grad_norm": 0.7171956812535517, "learning_rate": 6.843504934580055e-07, "loss": 0.0388, "step": 3192 }, { "epoch": 2.4545629202689723, "grad_norm": 0.704134115079434, "learning_rate": 6.806355948712931e-07, "loss": 0.0366, "step": 3194 }, { "epoch": 2.4560999039385205, "grad_norm": 0.7589267365486366, "learning_rate": 6.76929720341353e-07, "loss": 0.0402, "step": 3196 }, { "epoch": 2.457636887608069, "grad_norm": 0.7438363100690424, "learning_rate": 6.732328817297294e-07, "loss": 0.0383, "step": 3198 }, { "epoch": 2.459173871277618, "grad_norm": 0.7366824621405825, "learning_rate": 6.695450908690482e-07, "loss": 0.0402, "step": 3200 }, { "epoch": 2.460710854947166, "grad_norm": 0.7557689558570855, "learning_rate": 6.658663595629751e-07, "loss": 0.0386, "step": 3202 }, { "epoch": 2.462247838616715, "grad_norm": 0.7592608254472845, "learning_rate": 6.621966995861775e-07, "loss": 0.0383, "step": 3204 }, { "epoch": 2.463784822286263, "grad_norm": 0.7295386054315369, "learning_rate": 6.585361226842899e-07, "loss": 0.0361, "step": 3206 }, { "epoch": 2.4653218059558117, "grad_norm": 0.6625997315002057, "learning_rate": 6.548846405738713e-07, "loss": 0.038, "step": 3208 }, { "epoch": 2.46685878962536, "grad_norm": 0.7014497663989586, "learning_rate": 6.512422649423715e-07, "loss": 0.0412, "step": 3210 }, { "epoch": 2.4683957732949087, "grad_norm": 0.783558414213728, "learning_rate": 6.476090074480966e-07, "loss": 0.0393, "step": 3212 }, { "epoch": 2.4699327569644574, "grad_norm": 0.7244776560799225, "learning_rate": 6.439848797201623e-07, "loss": 0.038, "step": 3214 }, { "epoch": 2.4714697406340056, "grad_norm": 0.7224162285528907, "learning_rate": 6.403698933584653e-07, "loss": 0.0409, "step": 3216 }, { "epoch": 2.4730067243035543, "grad_norm": 0.761865817385147, "learning_rate": 6.367640599336425e-07, "loss": 0.0397, "step": 3218 }, { "epoch": 2.474543707973103, "grad_norm": 0.6853505678703133, "learning_rate": 6.331673909870353e-07, "loss": 0.0364, "step": 3220 }, { "epoch": 2.476080691642651, "grad_norm": 0.716004689687549, "learning_rate": 6.295798980306516e-07, "loss": 0.0384, "step": 3222 }, { "epoch": 2.4776176753122, "grad_norm": 0.6629622601789238, "learning_rate": 6.260015925471279e-07, "loss": 0.0382, "step": 3224 }, { "epoch": 2.479154658981748, "grad_norm": 0.6579568031971793, "learning_rate": 6.224324859896957e-07, "loss": 0.0355, "step": 3226 }, { "epoch": 2.480691642651297, "grad_norm": 0.7068591652816046, "learning_rate": 6.18872589782143e-07, "loss": 0.041, "step": 3228 }, { "epoch": 2.4822286263208455, "grad_norm": 0.7304546722170435, "learning_rate": 6.153219153187772e-07, "loss": 0.04, "step": 3230 }, { "epoch": 2.4837656099903938, "grad_norm": 0.7585585312793138, "learning_rate": 6.117804739643907e-07, "loss": 0.0394, "step": 3232 }, { "epoch": 2.4853025936599424, "grad_norm": 0.7054671882965444, "learning_rate": 6.082482770542192e-07, "loss": 0.0378, "step": 3234 }, { "epoch": 2.4868395773294907, "grad_norm": 0.703977106591607, "learning_rate": 6.047253358939142e-07, "loss": 0.0383, "step": 3236 }, { "epoch": 2.4883765609990394, "grad_norm": 0.6400130530644246, "learning_rate": 6.01211661759501e-07, "loss": 0.0397, "step": 3238 }, { "epoch": 2.489913544668588, "grad_norm": 0.7278600446549451, "learning_rate": 5.977072658973393e-07, "loss": 0.0384, "step": 3240 }, { "epoch": 2.4914505283381363, "grad_norm": 0.746204962644141, "learning_rate": 5.942121595240963e-07, "loss": 0.0362, "step": 3242 }, { "epoch": 2.492987512007685, "grad_norm": 0.7292907227238307, "learning_rate": 5.907263538267036e-07, "loss": 0.0386, "step": 3244 }, { "epoch": 2.4945244956772337, "grad_norm": 0.7163392398345111, "learning_rate": 5.872498599623248e-07, "loss": 0.0401, "step": 3246 }, { "epoch": 2.496061479346782, "grad_norm": 0.7435508674073057, "learning_rate": 5.837826890583188e-07, "loss": 0.0416, "step": 3248 }, { "epoch": 2.4975984630163306, "grad_norm": 0.72066787754061, "learning_rate": 5.803248522122008e-07, "loss": 0.0394, "step": 3250 }, { "epoch": 2.499135446685879, "grad_norm": 0.732919013969617, "learning_rate": 5.768763604916157e-07, "loss": 0.0411, "step": 3252 }, { "epoch": 2.5006724303554275, "grad_norm": 0.7405159195872731, "learning_rate": 5.734372249342942e-07, "loss": 0.0417, "step": 3254 }, { "epoch": 2.5022094140249758, "grad_norm": 0.7197000994936994, "learning_rate": 5.700074565480184e-07, "loss": 0.041, "step": 3256 }, { "epoch": 2.5037463976945245, "grad_norm": 0.7015912360666993, "learning_rate": 5.665870663105918e-07, "loss": 0.0384, "step": 3258 }, { "epoch": 2.505283381364073, "grad_norm": 0.7221454603303137, "learning_rate": 5.631760651697987e-07, "loss": 0.0406, "step": 3260 }, { "epoch": 2.5068203650336214, "grad_norm": 0.6898827529762818, "learning_rate": 5.59774464043373e-07, "loss": 0.0347, "step": 3262 }, { "epoch": 2.50835734870317, "grad_norm": 0.7561654186730838, "learning_rate": 5.56382273818961e-07, "loss": 0.0401, "step": 3264 }, { "epoch": 2.5098943323727188, "grad_norm": 0.7634858564317603, "learning_rate": 5.529995053540845e-07, "loss": 0.0393, "step": 3266 }, { "epoch": 2.511431316042267, "grad_norm": 0.7573259002994046, "learning_rate": 5.496261694761138e-07, "loss": 0.0369, "step": 3268 }, { "epoch": 2.5129682997118157, "grad_norm": 0.712091317631427, "learning_rate": 5.46262276982225e-07, "loss": 0.0364, "step": 3270 }, { "epoch": 2.514505283381364, "grad_norm": 0.7452058266292231, "learning_rate": 5.429078386393659e-07, "loss": 0.0376, "step": 3272 }, { "epoch": 2.5160422670509126, "grad_norm": 0.6945382662735007, "learning_rate": 5.3956286518423e-07, "loss": 0.0383, "step": 3274 }, { "epoch": 2.517579250720461, "grad_norm": 0.7542364397801513, "learning_rate": 5.362273673232104e-07, "loss": 0.0404, "step": 3276 }, { "epoch": 2.5191162343900095, "grad_norm": 0.6802822757529954, "learning_rate": 5.329013557323747e-07, "loss": 0.0373, "step": 3278 }, { "epoch": 2.520653218059558, "grad_norm": 0.6897346854529715, "learning_rate": 5.295848410574261e-07, "loss": 0.0371, "step": 3280 }, { "epoch": 2.5221902017291065, "grad_norm": 0.7751087080675236, "learning_rate": 5.262778339136709e-07, "loss": 0.0395, "step": 3282 }, { "epoch": 2.523727185398655, "grad_norm": 0.739466897171713, "learning_rate": 5.229803448859851e-07, "loss": 0.0387, "step": 3284 }, { "epoch": 2.525264169068204, "grad_norm": 0.7326229919534164, "learning_rate": 5.196923845287773e-07, "loss": 0.0399, "step": 3286 }, { "epoch": 2.526801152737752, "grad_norm": 0.6978471766129466, "learning_rate": 5.164139633659586e-07, "loss": 0.0371, "step": 3288 }, { "epoch": 2.5283381364073008, "grad_norm": 0.7419075355790281, "learning_rate": 5.1314509189091e-07, "loss": 0.0418, "step": 3290 }, { "epoch": 2.5298751200768494, "grad_norm": 0.6611449090491568, "learning_rate": 5.098857805664424e-07, "loss": 0.0363, "step": 3292 }, { "epoch": 2.5314121037463977, "grad_norm": 0.6982486243686108, "learning_rate": 5.066360398247698e-07, "loss": 0.0379, "step": 3294 }, { "epoch": 2.532949087415946, "grad_norm": 0.6911500588630615, "learning_rate": 5.033958800674717e-07, "loss": 0.0367, "step": 3296 }, { "epoch": 2.5344860710854946, "grad_norm": 0.669502871148484, "learning_rate": 5.001653116654625e-07, "loss": 0.0345, "step": 3298 }, { "epoch": 2.5360230547550433, "grad_norm": 0.6572589542045425, "learning_rate": 4.969443449589576e-07, "loss": 0.0366, "step": 3300 }, { "epoch": 2.5375600384245915, "grad_norm": 0.6553120661704981, "learning_rate": 4.937329902574367e-07, "loss": 0.0382, "step": 3302 }, { "epoch": 2.5390970220941402, "grad_norm": 0.7504177803505651, "learning_rate": 4.90531257839617e-07, "loss": 0.0376, "step": 3304 }, { "epoch": 2.540634005763689, "grad_norm": 0.6822045806022221, "learning_rate": 4.873391579534186e-07, "loss": 0.0383, "step": 3306 }, { "epoch": 2.542170989433237, "grad_norm": 0.6780574733579846, "learning_rate": 4.841567008159255e-07, "loss": 0.0352, "step": 3308 }, { "epoch": 2.543707973102786, "grad_norm": 0.721214268014702, "learning_rate": 4.809838966133612e-07, "loss": 0.0396, "step": 3310 }, { "epoch": 2.5452449567723345, "grad_norm": 0.7202960873503238, "learning_rate": 4.778207555010525e-07, "loss": 0.043, "step": 3312 }, { "epoch": 2.5467819404418828, "grad_norm": 0.6736522099205745, "learning_rate": 4.74667287603396e-07, "loss": 0.0379, "step": 3314 }, { "epoch": 2.5483189241114315, "grad_norm": 0.7223414892006996, "learning_rate": 4.715235030138284e-07, "loss": 0.036, "step": 3316 }, { "epoch": 2.5498559077809797, "grad_norm": 0.6901701228966389, "learning_rate": 4.6838941179479045e-07, "loss": 0.0389, "step": 3318 }, { "epoch": 2.5513928914505284, "grad_norm": 0.7617514775980445, "learning_rate": 4.652650239776981e-07, "loss": 0.0399, "step": 3320 }, { "epoch": 2.5529298751200766, "grad_norm": 0.7419234804032083, "learning_rate": 4.6215034956290966e-07, "loss": 0.0411, "step": 3322 }, { "epoch": 2.5544668587896253, "grad_norm": 0.7045434950164664, "learning_rate": 4.590453985196933e-07, "loss": 0.0342, "step": 3324 }, { "epoch": 2.556003842459174, "grad_norm": 0.6495176322545015, "learning_rate": 4.5595018078619483e-07, "loss": 0.0369, "step": 3326 }, { "epoch": 2.5575408261287222, "grad_norm": 0.6788801032608669, "learning_rate": 4.5286470626940446e-07, "loss": 0.0351, "step": 3328 }, { "epoch": 2.559077809798271, "grad_norm": 0.7617522250263151, "learning_rate": 4.497889848451303e-07, "loss": 0.0392, "step": 3330 }, { "epoch": 2.5606147934678196, "grad_norm": 0.6989723366985957, "learning_rate": 4.4672302635796267e-07, "loss": 0.0385, "step": 3332 }, { "epoch": 2.562151777137368, "grad_norm": 0.7054197171986639, "learning_rate": 4.436668406212402e-07, "loss": 0.0385, "step": 3334 }, { "epoch": 2.5636887608069165, "grad_norm": 0.6582212133744816, "learning_rate": 4.4062043741702416e-07, "loss": 0.0357, "step": 3336 }, { "epoch": 2.565225744476465, "grad_norm": 0.7280116384325578, "learning_rate": 4.37583826496064e-07, "loss": 0.0437, "step": 3338 }, { "epoch": 2.5667627281460135, "grad_norm": 0.8381876496092886, "learning_rate": 4.3455701757776623e-07, "loss": 0.0388, "step": 3340 }, { "epoch": 2.5682997118155617, "grad_norm": 0.7357236219117014, "learning_rate": 4.315400203501649e-07, "loss": 0.0389, "step": 3342 }, { "epoch": 2.5698366954851104, "grad_norm": 0.7154920208066208, "learning_rate": 4.2853284446988595e-07, "loss": 0.0394, "step": 3344 }, { "epoch": 2.571373679154659, "grad_norm": 0.7461232486368099, "learning_rate": 4.2553549956212485e-07, "loss": 0.0378, "step": 3346 }, { "epoch": 2.5729106628242073, "grad_norm": 0.6390926012256907, "learning_rate": 4.225479952206074e-07, "loss": 0.0344, "step": 3348 }, { "epoch": 2.574447646493756, "grad_norm": 0.6265363948197101, "learning_rate": 4.195703410075615e-07, "loss": 0.0358, "step": 3350 }, { "epoch": 2.5759846301633047, "grad_norm": 0.7056231822693347, "learning_rate": 4.1660254645368936e-07, "loss": 0.0383, "step": 3352 }, { "epoch": 2.577521613832853, "grad_norm": 0.7197705899541366, "learning_rate": 4.1364462105813486e-07, "loss": 0.0382, "step": 3354 }, { "epoch": 2.5790585975024016, "grad_norm": 0.7234355293425967, "learning_rate": 4.10696574288452e-07, "loss": 0.0362, "step": 3356 }, { "epoch": 2.5805955811719503, "grad_norm": 0.6700554042034669, "learning_rate": 4.077584155805774e-07, "loss": 0.0383, "step": 3358 }, { "epoch": 2.5821325648414986, "grad_norm": 0.6989541320884164, "learning_rate": 4.048301543387956e-07, "loss": 0.0334, "step": 3360 }, { "epoch": 2.583669548511047, "grad_norm": 0.7142263192429504, "learning_rate": 4.0191179993571623e-07, "loss": 0.0408, "step": 3362 }, { "epoch": 2.5852065321805955, "grad_norm": 0.665689407268323, "learning_rate": 3.990033617122375e-07, "loss": 0.0343, "step": 3364 }, { "epoch": 2.586743515850144, "grad_norm": 0.6706336215208009, "learning_rate": 3.961048489775169e-07, "loss": 0.0393, "step": 3366 }, { "epoch": 2.5882804995196924, "grad_norm": 0.746265905426555, "learning_rate": 3.932162710089454e-07, "loss": 0.0403, "step": 3368 }, { "epoch": 2.589817483189241, "grad_norm": 0.7118457765158708, "learning_rate": 3.903376370521146e-07, "loss": 0.0381, "step": 3370 }, { "epoch": 2.59135446685879, "grad_norm": 0.6708203366211161, "learning_rate": 3.874689563207873e-07, "loss": 0.0341, "step": 3372 }, { "epoch": 2.592891450528338, "grad_norm": 0.6330517123604927, "learning_rate": 3.8461023799686917e-07, "loss": 0.0364, "step": 3374 }, { "epoch": 2.5944284341978867, "grad_norm": 0.7349779518025492, "learning_rate": 3.817614912303794e-07, "loss": 0.0382, "step": 3376 }, { "epoch": 2.5959654178674354, "grad_norm": 0.704462327359088, "learning_rate": 3.7892272513942003e-07, "loss": 0.0373, "step": 3378 }, { "epoch": 2.5975024015369836, "grad_norm": 0.6642196959244105, "learning_rate": 3.760939488101463e-07, "loss": 0.036, "step": 3380 }, { "epoch": 2.5990393852065323, "grad_norm": 0.6891895315956058, "learning_rate": 3.7327517129674055e-07, "loss": 0.0374, "step": 3382 }, { "epoch": 2.6005763688760806, "grad_norm": 0.7644451152560637, "learning_rate": 3.70466401621383e-07, "loss": 0.0405, "step": 3384 }, { "epoch": 2.6021133525456293, "grad_norm": 0.7737376511884962, "learning_rate": 3.676676487742172e-07, "loss": 0.0366, "step": 3386 }, { "epoch": 2.6036503362151775, "grad_norm": 0.7181650532484439, "learning_rate": 3.648789217133282e-07, "loss": 0.0374, "step": 3388 }, { "epoch": 2.605187319884726, "grad_norm": 0.6915226559770837, "learning_rate": 3.621002293647111e-07, "loss": 0.0368, "step": 3390 }, { "epoch": 2.606724303554275, "grad_norm": 0.7570072346163113, "learning_rate": 3.593315806222415e-07, "loss": 0.04, "step": 3392 }, { "epoch": 2.608261287223823, "grad_norm": 0.8257096784505046, "learning_rate": 3.5657298434764884e-07, "loss": 0.0409, "step": 3394 }, { "epoch": 2.609798270893372, "grad_norm": 0.7210127478796208, "learning_rate": 3.538244493704862e-07, "loss": 0.0382, "step": 3396 }, { "epoch": 2.6113352545629205, "grad_norm": 0.7234247835643788, "learning_rate": 3.510859844881033e-07, "loss": 0.0374, "step": 3398 }, { "epoch": 2.6128722382324687, "grad_norm": 0.7384845763851767, "learning_rate": 3.4835759846562047e-07, "loss": 0.0386, "step": 3400 }, { "epoch": 2.6144092219020174, "grad_norm": 0.7147474025303922, "learning_rate": 3.45639300035894e-07, "loss": 0.0346, "step": 3402 }, { "epoch": 2.6159462055715657, "grad_norm": 0.6463222792959664, "learning_rate": 3.429310978994955e-07, "loss": 0.0336, "step": 3404 }, { "epoch": 2.6174831892411143, "grad_norm": 0.7095670666778837, "learning_rate": 3.402330007246798e-07, "loss": 0.0354, "step": 3406 }, { "epoch": 2.6190201729106626, "grad_norm": 0.6506286288306146, "learning_rate": 3.3754501714735867e-07, "loss": 0.0373, "step": 3408 }, { "epoch": 2.6205571565802113, "grad_norm": 0.7120588233101057, "learning_rate": 3.34867155771074e-07, "loss": 0.0387, "step": 3410 }, { "epoch": 2.62209414024976, "grad_norm": 0.7586108600316385, "learning_rate": 3.321994251669659e-07, "loss": 0.0372, "step": 3412 }, { "epoch": 2.623631123919308, "grad_norm": 0.8046131513589455, "learning_rate": 3.295418338737517e-07, "loss": 0.0394, "step": 3414 }, { "epoch": 2.625168107588857, "grad_norm": 0.6335632790717333, "learning_rate": 3.2689439039769407e-07, "loss": 0.0344, "step": 3416 }, { "epoch": 2.6267050912584056, "grad_norm": 0.7948767979535154, "learning_rate": 3.2425710321257503e-07, "loss": 0.0413, "step": 3418 }, { "epoch": 2.628242074927954, "grad_norm": 0.6525048541754009, "learning_rate": 3.216299807596697e-07, "loss": 0.0364, "step": 3420 }, { "epoch": 2.6297790585975025, "grad_norm": 0.6986942893445027, "learning_rate": 3.190130314477178e-07, "loss": 0.0424, "step": 3422 }, { "epoch": 2.631316042267051, "grad_norm": 0.6837105014346364, "learning_rate": 3.164062636528975e-07, "loss": 0.036, "step": 3424 }, { "epoch": 2.6328530259365994, "grad_norm": 0.6890104339174508, "learning_rate": 3.138096857188001e-07, "loss": 0.0343, "step": 3426 }, { "epoch": 2.6343900096061477, "grad_norm": 0.7400251284066703, "learning_rate": 3.1122330595639864e-07, "loss": 0.0338, "step": 3428 }, { "epoch": 2.6359269932756964, "grad_norm": 0.7249635097965398, "learning_rate": 3.0864713264402697e-07, "loss": 0.0409, "step": 3430 }, { "epoch": 2.637463976945245, "grad_norm": 0.6532133989690416, "learning_rate": 3.060811740273496e-07, "loss": 0.0381, "step": 3432 }, { "epoch": 2.6390009606147933, "grad_norm": 0.7524963132197209, "learning_rate": 3.03525438319338e-07, "loss": 0.0403, "step": 3434 }, { "epoch": 2.640537944284342, "grad_norm": 0.7653173809713844, "learning_rate": 3.0097993370024114e-07, "loss": 0.039, "step": 3436 }, { "epoch": 2.6420749279538907, "grad_norm": 0.7480100060268294, "learning_rate": 2.9844466831756e-07, "loss": 0.0414, "step": 3438 }, { "epoch": 2.643611911623439, "grad_norm": 0.7160584018035305, "learning_rate": 2.9591965028602584e-07, "loss": 0.0355, "step": 3440 }, { "epoch": 2.6451488952929876, "grad_norm": 0.7617333555835748, "learning_rate": 2.9340488768756837e-07, "loss": 0.0402, "step": 3442 }, { "epoch": 2.6466858789625363, "grad_norm": 0.6637519028178263, "learning_rate": 2.909003885712919e-07, "loss": 0.0391, "step": 3444 }, { "epoch": 2.6482228626320845, "grad_norm": 0.6831014037948032, "learning_rate": 2.8840616095345085e-07, "loss": 0.0362, "step": 3446 }, { "epoch": 2.649759846301633, "grad_norm": 0.6786000704889272, "learning_rate": 2.859222128174235e-07, "loss": 0.0378, "step": 3448 }, { "epoch": 2.6512968299711814, "grad_norm": 0.8326763730243029, "learning_rate": 2.83448552113686e-07, "loss": 0.0405, "step": 3450 }, { "epoch": 2.65283381364073, "grad_norm": 0.7015006547909788, "learning_rate": 2.809851867597875e-07, "loss": 0.0367, "step": 3452 }, { "epoch": 2.6543707973102784, "grad_norm": 0.710615174193323, "learning_rate": 2.7853212464032146e-07, "loss": 0.041, "step": 3454 }, { "epoch": 2.655907780979827, "grad_norm": 0.6849647259800099, "learning_rate": 2.7608937360690814e-07, "loss": 0.0363, "step": 3456 }, { "epoch": 2.6574447646493757, "grad_norm": 0.7276694595529292, "learning_rate": 2.736569414781617e-07, "loss": 0.0383, "step": 3458 }, { "epoch": 2.658981748318924, "grad_norm": 0.774608853311179, "learning_rate": 2.7123483603966824e-07, "loss": 0.0402, "step": 3460 }, { "epoch": 2.6605187319884727, "grad_norm": 0.7083364735387814, "learning_rate": 2.6882306504396143e-07, "loss": 0.0401, "step": 3462 }, { "epoch": 2.6620557156580213, "grad_norm": 0.6714099006775984, "learning_rate": 2.664216362104964e-07, "loss": 0.0354, "step": 3464 }, { "epoch": 2.6635926993275696, "grad_norm": 0.6437816346046543, "learning_rate": 2.64030557225627e-07, "loss": 0.035, "step": 3466 }, { "epoch": 2.6651296829971183, "grad_norm": 0.7386329454344525, "learning_rate": 2.6164983574257875e-07, "loss": 0.0411, "step": 3468 }, { "epoch": 2.6666666666666665, "grad_norm": 0.7173981240012108, "learning_rate": 2.592794793814257e-07, "loss": 0.0351, "step": 3470 }, { "epoch": 2.668203650336215, "grad_norm": 0.7373052648241346, "learning_rate": 2.569194957290666e-07, "loss": 0.0367, "step": 3472 }, { "epoch": 2.6697406340057634, "grad_norm": 0.7328221093520572, "learning_rate": 2.5456989233919775e-07, "loss": 0.0395, "step": 3474 }, { "epoch": 2.671277617675312, "grad_norm": 0.6381830501668562, "learning_rate": 2.5223067673229285e-07, "loss": 0.0336, "step": 3476 }, { "epoch": 2.672814601344861, "grad_norm": 0.7536818768122339, "learning_rate": 2.4990185639557777e-07, "loss": 0.0398, "step": 3478 }, { "epoch": 2.674351585014409, "grad_norm": 0.7737529199470804, "learning_rate": 2.475834387830031e-07, "loss": 0.0379, "step": 3480 }, { "epoch": 2.6758885686839577, "grad_norm": 0.7616061727604434, "learning_rate": 2.452754313152246e-07, "loss": 0.0403, "step": 3482 }, { "epoch": 2.6774255523535064, "grad_norm": 0.7209975392281303, "learning_rate": 2.429778413795775e-07, "loss": 0.0383, "step": 3484 }, { "epoch": 2.6789625360230547, "grad_norm": 0.6777003415447909, "learning_rate": 2.4069067633005426e-07, "loss": 0.0397, "step": 3486 }, { "epoch": 2.6804995196926034, "grad_norm": 0.7469038413792495, "learning_rate": 2.3841394348727856e-07, "loss": 0.0392, "step": 3488 }, { "epoch": 2.682036503362152, "grad_norm": 0.7456099202324231, "learning_rate": 2.3614765013848365e-07, "loss": 0.0389, "step": 3490 }, { "epoch": 2.6835734870317003, "grad_norm": 0.758937959458986, "learning_rate": 2.338918035374881e-07, "loss": 0.0381, "step": 3492 }, { "epoch": 2.6851104707012485, "grad_norm": 0.7223222799308419, "learning_rate": 2.3164641090467535e-07, "loss": 0.0399, "step": 3494 }, { "epoch": 2.686647454370797, "grad_norm": 0.750470365319683, "learning_rate": 2.29411479426966e-07, "loss": 0.0389, "step": 3496 }, { "epoch": 2.688184438040346, "grad_norm": 0.8119786466865458, "learning_rate": 2.27187016257798e-07, "loss": 0.0394, "step": 3498 }, { "epoch": 2.689721421709894, "grad_norm": 0.7063154674095581, "learning_rate": 2.2497302851710354e-07, "loss": 0.0366, "step": 3500 }, { "epoch": 2.689721421709894, "eval_loss": 0.15855057537555695, "eval_runtime": 360.8492, "eval_samples_per_second": 51.282, "eval_steps_per_second": 6.413, "step": 3500 }, { "epoch": 2.691258405379443, "grad_norm": 0.7006450588051517, "learning_rate": 2.2276952329128462e-07, "loss": 0.0357, "step": 3502 }, { "epoch": 2.6927953890489915, "grad_norm": 0.6689362757218708, "learning_rate": 2.2057650763319235e-07, "loss": 0.0377, "step": 3504 }, { "epoch": 2.6943323727185398, "grad_norm": 0.7529959367992736, "learning_rate": 2.1839398856210274e-07, "loss": 0.0381, "step": 3506 }, { "epoch": 2.6958693563880884, "grad_norm": 0.7460063989500526, "learning_rate": 2.1622197306369529e-07, "loss": 0.0403, "step": 3508 }, { "epoch": 2.697406340057637, "grad_norm": 0.7548043108085488, "learning_rate": 2.1406046809003016e-07, "loss": 0.0382, "step": 3510 }, { "epoch": 2.6989433237271854, "grad_norm": 0.7421499181728671, "learning_rate": 2.1190948055952634e-07, "loss": 0.0382, "step": 3512 }, { "epoch": 2.700480307396734, "grad_norm": 0.712291075185728, "learning_rate": 2.097690173569392e-07, "loss": 0.0374, "step": 3514 }, { "epoch": 2.7020172910662823, "grad_norm": 0.6764437704157867, "learning_rate": 2.0763908533333742e-07, "loss": 0.0358, "step": 3516 }, { "epoch": 2.703554274735831, "grad_norm": 0.7710410887906691, "learning_rate": 2.0551969130608366e-07, "loss": 0.0405, "step": 3518 }, { "epoch": 2.7050912584053792, "grad_norm": 0.6628607572891497, "learning_rate": 2.0341084205881088e-07, "loss": 0.0334, "step": 3520 }, { "epoch": 2.706628242074928, "grad_norm": 0.7047102030885535, "learning_rate": 2.0131254434139894e-07, "loss": 0.0364, "step": 3522 }, { "epoch": 2.7081652257444766, "grad_norm": 0.7511681830084664, "learning_rate": 1.992248048699576e-07, "loss": 0.0423, "step": 3524 }, { "epoch": 2.709702209414025, "grad_norm": 0.6655250328949831, "learning_rate": 1.971476303268007e-07, "loss": 0.0355, "step": 3526 }, { "epoch": 2.7112391930835735, "grad_norm": 0.7160428780372823, "learning_rate": 1.950810273604274e-07, "loss": 0.0375, "step": 3528 }, { "epoch": 2.712776176753122, "grad_norm": 0.7004993169642538, "learning_rate": 1.930250025855006e-07, "loss": 0.0381, "step": 3530 }, { "epoch": 2.7143131604226705, "grad_norm": 0.6778326775797812, "learning_rate": 1.90979562582822e-07, "loss": 0.0359, "step": 3532 }, { "epoch": 2.715850144092219, "grad_norm": 0.7164337600080629, "learning_rate": 1.8894471389931838e-07, "loss": 0.0373, "step": 3534 }, { "epoch": 2.7173871277617674, "grad_norm": 0.7141990651264797, "learning_rate": 1.869204630480142e-07, "loss": 0.0375, "step": 3536 }, { "epoch": 2.718924111431316, "grad_norm": 0.7001303648061724, "learning_rate": 1.8490681650801216e-07, "loss": 0.0355, "step": 3538 }, { "epoch": 2.7204610951008643, "grad_norm": 0.7165227897430166, "learning_rate": 1.829037807244751e-07, "loss": 0.0361, "step": 3540 }, { "epoch": 2.721998078770413, "grad_norm": 0.7103603815004305, "learning_rate": 1.8091136210860293e-07, "loss": 0.0394, "step": 3542 }, { "epoch": 2.7235350624399617, "grad_norm": 0.7231011724413698, "learning_rate": 1.789295670376128e-07, "loss": 0.0408, "step": 3544 }, { "epoch": 2.72507204610951, "grad_norm": 0.7187891219293795, "learning_rate": 1.7695840185471828e-07, "loss": 0.0353, "step": 3546 }, { "epoch": 2.7266090297790586, "grad_norm": 0.7249222842840717, "learning_rate": 1.7499787286910896e-07, "loss": 0.0383, "step": 3548 }, { "epoch": 2.7281460134486073, "grad_norm": 0.6726610987757241, "learning_rate": 1.7304798635593227e-07, "loss": 0.0345, "step": 3550 }, { "epoch": 2.7296829971181555, "grad_norm": 0.7798682542487884, "learning_rate": 1.711087485562714e-07, "loss": 0.0395, "step": 3552 }, { "epoch": 2.7312199807877042, "grad_norm": 0.8489020328845095, "learning_rate": 1.6918016567712457e-07, "loss": 0.0388, "step": 3554 }, { "epoch": 2.732756964457253, "grad_norm": 0.7675735405799808, "learning_rate": 1.672622438913869e-07, "loss": 0.0413, "step": 3556 }, { "epoch": 2.734293948126801, "grad_norm": 0.7672370602760468, "learning_rate": 1.6535498933783083e-07, "loss": 0.0392, "step": 3558 }, { "epoch": 2.7358309317963494, "grad_norm": 0.706536918085323, "learning_rate": 1.634584081210853e-07, "loss": 0.0353, "step": 3560 }, { "epoch": 2.737367915465898, "grad_norm": 0.6514716793107945, "learning_rate": 1.6157250631161624e-07, "loss": 0.029, "step": 3562 }, { "epoch": 2.7389048991354468, "grad_norm": 0.8151870302671732, "learning_rate": 1.5969728994570786e-07, "loss": 0.0451, "step": 3564 }, { "epoch": 2.740441882804995, "grad_norm": 0.7201013965367818, "learning_rate": 1.5783276502544413e-07, "loss": 0.0375, "step": 3566 }, { "epoch": 2.7419788664745437, "grad_norm": 0.8279664659096065, "learning_rate": 1.5597893751868574e-07, "loss": 0.0377, "step": 3568 }, { "epoch": 2.7435158501440924, "grad_norm": 0.7556263425034218, "learning_rate": 1.541358133590562e-07, "loss": 0.0383, "step": 3570 }, { "epoch": 2.7450528338136406, "grad_norm": 0.786743386378996, "learning_rate": 1.5230339844592033e-07, "loss": 0.0389, "step": 3572 }, { "epoch": 2.7465898174831893, "grad_norm": 0.6751334601351806, "learning_rate": 1.504816986443635e-07, "loss": 0.0348, "step": 3574 }, { "epoch": 2.748126801152738, "grad_norm": 0.7230502301443176, "learning_rate": 1.4867071978517626e-07, "loss": 0.0381, "step": 3576 }, { "epoch": 2.7496637848222862, "grad_norm": 0.7424159284142868, "learning_rate": 1.4687046766483425e-07, "loss": 0.0377, "step": 3578 }, { "epoch": 2.751200768491835, "grad_norm": 0.7033065172321098, "learning_rate": 1.450809480454787e-07, "loss": 0.0392, "step": 3580 }, { "epoch": 2.752737752161383, "grad_norm": 0.7422740281311779, "learning_rate": 1.4330216665490024e-07, "loss": 0.0388, "step": 3582 }, { "epoch": 2.754274735830932, "grad_norm": 0.6749501870256193, "learning_rate": 1.4153412918651736e-07, "loss": 0.0292, "step": 3584 }, { "epoch": 2.75581171950048, "grad_norm": 0.7383402984400056, "learning_rate": 1.3977684129936095e-07, "loss": 0.0362, "step": 3586 }, { "epoch": 2.757348703170029, "grad_norm": 0.7344177105323607, "learning_rate": 1.3803030861805686e-07, "loss": 0.0369, "step": 3588 }, { "epoch": 2.7588856868395775, "grad_norm": 0.7788733949519481, "learning_rate": 1.3629453673280367e-07, "loss": 0.0386, "step": 3590 }, { "epoch": 2.7604226705091257, "grad_norm": 0.7523955526390752, "learning_rate": 1.3456953119935832e-07, "loss": 0.0402, "step": 3592 }, { "epoch": 2.7619596541786744, "grad_norm": 0.6387136592938574, "learning_rate": 1.328552975390179e-07, "loss": 0.0366, "step": 3594 }, { "epoch": 2.763496637848223, "grad_norm": 0.7765496538466939, "learning_rate": 1.3115184123860141e-07, "loss": 0.038, "step": 3596 }, { "epoch": 2.7650336215177713, "grad_norm": 0.6197630907677361, "learning_rate": 1.2945916775043285e-07, "loss": 0.0347, "step": 3598 }, { "epoch": 2.76657060518732, "grad_norm": 0.6507185122236504, "learning_rate": 1.2777728249232174e-07, "loss": 0.0366, "step": 3600 }, { "epoch": 2.7681075888568683, "grad_norm": 0.7129555271323699, "learning_rate": 1.261061908475476e-07, "loss": 0.0382, "step": 3602 }, { "epoch": 2.769644572526417, "grad_norm": 0.7063456528141899, "learning_rate": 1.244458981648443e-07, "loss": 0.0357, "step": 3604 }, { "epoch": 2.771181556195965, "grad_norm": 0.7313820214800656, "learning_rate": 1.2279640975837857e-07, "loss": 0.0361, "step": 3606 }, { "epoch": 2.772718539865514, "grad_norm": 0.7892493813976871, "learning_rate": 1.2115773090773718e-07, "loss": 0.0383, "step": 3608 }, { "epoch": 2.7742555235350626, "grad_norm": 0.7011080024188364, "learning_rate": 1.1952986685790674e-07, "loss": 0.0348, "step": 3610 }, { "epoch": 2.775792507204611, "grad_norm": 0.7212790327473237, "learning_rate": 1.1791282281925968e-07, "loss": 0.0386, "step": 3612 }, { "epoch": 2.7773294908741595, "grad_norm": 0.6824883473362624, "learning_rate": 1.163066039675369e-07, "loss": 0.0351, "step": 3614 }, { "epoch": 2.778866474543708, "grad_norm": 0.6850223544852229, "learning_rate": 1.1471121544382851e-07, "loss": 0.0356, "step": 3616 }, { "epoch": 2.7804034582132564, "grad_norm": 0.7426536823064712, "learning_rate": 1.131266623545612e-07, "loss": 0.0375, "step": 3618 }, { "epoch": 2.781940441882805, "grad_norm": 0.7258544058495259, "learning_rate": 1.1155294977148028e-07, "loss": 0.0364, "step": 3620 }, { "epoch": 2.783477425552354, "grad_norm": 0.7068237112208213, "learning_rate": 1.0999008273163257e-07, "loss": 0.0427, "step": 3622 }, { "epoch": 2.785014409221902, "grad_norm": 0.723771053564773, "learning_rate": 1.0843806623735207e-07, "loss": 0.0378, "step": 3624 }, { "epoch": 2.7865513928914503, "grad_norm": 0.8519854888198337, "learning_rate": 1.0689690525624167e-07, "loss": 0.04, "step": 3626 }, { "epoch": 2.788088376560999, "grad_norm": 0.706127807785433, "learning_rate": 1.0536660472115993e-07, "loss": 0.0393, "step": 3628 }, { "epoch": 2.7896253602305476, "grad_norm": 0.6705497949051035, "learning_rate": 1.0384716953020323e-07, "loss": 0.0333, "step": 3630 }, { "epoch": 2.791162343900096, "grad_norm": 0.6886374108827851, "learning_rate": 1.0233860454669097e-07, "loss": 0.0354, "step": 3632 }, { "epoch": 2.7926993275696446, "grad_norm": 0.6453996780069803, "learning_rate": 1.0084091459914929e-07, "loss": 0.0333, "step": 3634 }, { "epoch": 2.7942363112391932, "grad_norm": 0.7016064494453276, "learning_rate": 9.935410448129705e-08, "loss": 0.0388, "step": 3636 }, { "epoch": 2.7957732949087415, "grad_norm": 0.7099217898470163, "learning_rate": 9.787817895202876e-08, "loss": 0.0341, "step": 3638 }, { "epoch": 2.79731027857829, "grad_norm": 0.7392413254776985, "learning_rate": 9.641314273540175e-08, "loss": 0.0412, "step": 3640 }, { "epoch": 2.798847262247839, "grad_norm": 0.6659018777001534, "learning_rate": 9.495900052061629e-08, "loss": 0.036, "step": 3642 }, { "epoch": 2.800384245917387, "grad_norm": 0.7498210451825361, "learning_rate": 9.351575696200754e-08, "loss": 0.0379, "step": 3644 }, { "epoch": 2.801921229586936, "grad_norm": 0.7188638489637743, "learning_rate": 9.208341667902487e-08, "loss": 0.0349, "step": 3646 }, { "epoch": 2.803458213256484, "grad_norm": 0.6975773584570585, "learning_rate": 9.06619842562183e-08, "loss": 0.0323, "step": 3648 }, { "epoch": 2.8049951969260327, "grad_norm": 0.6554005252536124, "learning_rate": 8.925146424322644e-08, "loss": 0.0329, "step": 3650 }, { "epoch": 2.806532180595581, "grad_norm": 0.7073740246874296, "learning_rate": 8.78518611547594e-08, "loss": 0.0386, "step": 3652 }, { "epoch": 2.8080691642651296, "grad_norm": 0.6691649384368833, "learning_rate": 8.64631794705844e-08, "loss": 0.0399, "step": 3654 }, { "epoch": 2.8096061479346783, "grad_norm": 0.7533417497153267, "learning_rate": 8.508542363551296e-08, "loss": 0.0376, "step": 3656 }, { "epoch": 2.8111431316042266, "grad_norm": 0.7324347703672368, "learning_rate": 8.371859805938497e-08, "loss": 0.0364, "step": 3658 }, { "epoch": 2.8126801152737753, "grad_norm": 0.6931453128827018, "learning_rate": 8.236270711705662e-08, "loss": 0.0326, "step": 3660 }, { "epoch": 2.814217098943324, "grad_norm": 0.6612068285353259, "learning_rate": 8.101775514838372e-08, "loss": 0.0374, "step": 3662 }, { "epoch": 2.815754082612872, "grad_norm": 0.6742383780806924, "learning_rate": 7.968374645820964e-08, "loss": 0.0362, "step": 3664 }, { "epoch": 2.817291066282421, "grad_norm": 0.7414092505129248, "learning_rate": 7.836068531635249e-08, "loss": 0.0403, "step": 3666 }, { "epoch": 2.818828049951969, "grad_norm": 0.6917889501190945, "learning_rate": 7.704857595758802e-08, "loss": 0.036, "step": 3668 }, { "epoch": 2.820365033621518, "grad_norm": 0.7254060046131036, "learning_rate": 7.574742258163952e-08, "loss": 0.0403, "step": 3670 }, { "epoch": 2.821902017291066, "grad_norm": 0.7112553826453433, "learning_rate": 7.445722935316307e-08, "loss": 0.0392, "step": 3672 }, { "epoch": 2.8234390009606147, "grad_norm": 0.7231907969490486, "learning_rate": 7.317800040173311e-08, "loss": 0.0388, "step": 3674 }, { "epoch": 2.8249759846301634, "grad_norm": 0.7143348806863665, "learning_rate": 7.190973982183124e-08, "loss": 0.0356, "step": 3676 }, { "epoch": 2.8265129682997117, "grad_norm": 0.722256824360624, "learning_rate": 7.065245167283179e-08, "loss": 0.0365, "step": 3678 }, { "epoch": 2.8280499519692603, "grad_norm": 0.749657038879464, "learning_rate": 6.940613997898826e-08, "loss": 0.0377, "step": 3680 }, { "epoch": 2.829586935638809, "grad_norm": 0.7506478416183923, "learning_rate": 6.817080872942393e-08, "loss": 0.039, "step": 3682 }, { "epoch": 2.8311239193083573, "grad_norm": 0.7166949562340486, "learning_rate": 6.694646187811371e-08, "loss": 0.0381, "step": 3684 }, { "epoch": 2.832660902977906, "grad_norm": 0.6919045066011337, "learning_rate": 6.573310334387544e-08, "loss": 0.0359, "step": 3686 }, { "epoch": 2.8341978866474546, "grad_norm": 0.7652082696602207, "learning_rate": 6.453073701035644e-08, "loss": 0.0387, "step": 3688 }, { "epoch": 2.835734870317003, "grad_norm": 0.6875494379565337, "learning_rate": 6.333936672602058e-08, "loss": 0.0358, "step": 3690 }, { "epoch": 2.837271853986551, "grad_norm": 0.704153070629626, "learning_rate": 6.215899630413668e-08, "loss": 0.0399, "step": 3692 }, { "epoch": 2.8388088376561, "grad_norm": 0.7683845653293229, "learning_rate": 6.098962952276449e-08, "loss": 0.0376, "step": 3694 }, { "epoch": 2.8403458213256485, "grad_norm": 0.7444818377336334, "learning_rate": 5.983127012474498e-08, "loss": 0.0375, "step": 3696 }, { "epoch": 2.8418828049951967, "grad_norm": 0.7131478664262563, "learning_rate": 5.8683921817687943e-08, "loss": 0.0377, "step": 3698 }, { "epoch": 2.8434197886647454, "grad_norm": 0.6844062973629339, "learning_rate": 5.7547588273958336e-08, "loss": 0.0348, "step": 3700 }, { "epoch": 2.844956772334294, "grad_norm": 0.6917343381332636, "learning_rate": 5.6422273130665835e-08, "loss": 0.0378, "step": 3702 }, { "epoch": 2.8464937560038424, "grad_norm": 0.7486578311506366, "learning_rate": 5.5307979989653534e-08, "loss": 0.041, "step": 3704 }, { "epoch": 2.848030739673391, "grad_norm": 0.7065517002670538, "learning_rate": 5.420471241748592e-08, "loss": 0.0344, "step": 3706 }, { "epoch": 2.8495677233429397, "grad_norm": 0.7218309216210936, "learning_rate": 5.311247394543761e-08, "loss": 0.0396, "step": 3708 }, { "epoch": 2.851104707012488, "grad_norm": 0.7344762397348806, "learning_rate": 5.203126806948127e-08, "loss": 0.0398, "step": 3710 }, { "epoch": 2.8526416906820367, "grad_norm": 0.6978785514396233, "learning_rate": 5.0961098250277166e-08, "loss": 0.035, "step": 3712 }, { "epoch": 2.854178674351585, "grad_norm": 0.7492370044630394, "learning_rate": 4.990196791316304e-08, "loss": 0.0395, "step": 3714 }, { "epoch": 2.8557156580211336, "grad_norm": 0.7347118419552384, "learning_rate": 4.8853880448140876e-08, "loss": 0.0373, "step": 3716 }, { "epoch": 2.857252641690682, "grad_norm": 0.6710823443691567, "learning_rate": 4.781683920986801e-08, "loss": 0.0353, "step": 3718 }, { "epoch": 2.8587896253602305, "grad_norm": 0.7656067717475032, "learning_rate": 4.679084751764467e-08, "loss": 0.0405, "step": 3720 }, { "epoch": 2.860326609029779, "grad_norm": 0.6658481972086318, "learning_rate": 4.5775908655405814e-08, "loss": 0.0392, "step": 3722 }, { "epoch": 2.8618635926993274, "grad_norm": 0.7325672821180539, "learning_rate": 4.4772025871709085e-08, "loss": 0.0357, "step": 3724 }, { "epoch": 2.863400576368876, "grad_norm": 0.7131675994968449, "learning_rate": 4.377920237972238e-08, "loss": 0.0391, "step": 3726 }, { "epoch": 2.864937560038425, "grad_norm": 0.735115333347456, "learning_rate": 4.279744135721764e-08, "loss": 0.0383, "step": 3728 }, { "epoch": 2.866474543707973, "grad_norm": 0.7142932075003074, "learning_rate": 4.182674594655839e-08, "loss": 0.0347, "step": 3730 }, { "epoch": 2.8680115273775217, "grad_norm": 0.7488967955446748, "learning_rate": 4.0867119254689664e-08, "loss": 0.0387, "step": 3732 }, { "epoch": 2.86954851104707, "grad_norm": 0.7704536931618361, "learning_rate": 3.991856435312868e-08, "loss": 0.0399, "step": 3734 }, { "epoch": 2.8710854947166187, "grad_norm": 0.7243897987142581, "learning_rate": 3.898108427795355e-08, "loss": 0.0407, "step": 3736 }, { "epoch": 2.872622478386167, "grad_norm": 0.7696208552815814, "learning_rate": 3.805468202979706e-08, "loss": 0.0377, "step": 3738 }, { "epoch": 2.8741594620557156, "grad_norm": 0.732607042331649, "learning_rate": 3.7139360573832715e-08, "loss": 0.0398, "step": 3740 }, { "epoch": 2.8756964457252643, "grad_norm": 0.6798024620244276, "learning_rate": 3.6235122839767707e-08, "loss": 0.0338, "step": 3742 }, { "epoch": 2.8772334293948125, "grad_norm": 0.7560881846201208, "learning_rate": 3.534197172183323e-08, "loss": 0.0385, "step": 3744 }, { "epoch": 2.878770413064361, "grad_norm": 0.7216366213167684, "learning_rate": 3.4459910078775914e-08, "loss": 0.0404, "step": 3746 }, { "epoch": 2.88030739673391, "grad_norm": 0.7186180485084087, "learning_rate": 3.358894073384616e-08, "loss": 0.0388, "step": 3748 }, { "epoch": 2.881844380403458, "grad_norm": 0.755800097162076, "learning_rate": 3.2729066474792734e-08, "loss": 0.0355, "step": 3750 }, { "epoch": 2.883381364073007, "grad_norm": 0.7190348005662992, "learning_rate": 3.18802900538499e-08, "loss": 0.0387, "step": 3752 }, { "epoch": 2.8849183477425555, "grad_norm": 0.6881759771848993, "learning_rate": 3.104261418773241e-08, "loss": 0.0371, "step": 3754 }, { "epoch": 2.8864553314121038, "grad_norm": 0.7368031850460476, "learning_rate": 3.0216041557624196e-08, "loss": 0.0363, "step": 3756 }, { "epoch": 2.887992315081652, "grad_norm": 0.6818452373614833, "learning_rate": 2.9400574809169856e-08, "loss": 0.0385, "step": 3758 }, { "epoch": 2.8895292987512007, "grad_norm": 0.671665343657444, "learning_rate": 2.859621655246841e-08, "loss": 0.0394, "step": 3760 }, { "epoch": 2.8910662824207494, "grad_norm": 0.8081746540797051, "learning_rate": 2.7802969362062057e-08, "loss": 0.0426, "step": 3762 }, { "epoch": 2.8926032660902976, "grad_norm": 0.7953691794225282, "learning_rate": 2.702083577693071e-08, "loss": 0.0401, "step": 3764 }, { "epoch": 2.8941402497598463, "grad_norm": 0.6780256825935983, "learning_rate": 2.624981830048151e-08, "loss": 0.038, "step": 3766 }, { "epoch": 2.895677233429395, "grad_norm": 0.7601952372611993, "learning_rate": 2.5489919400542236e-08, "loss": 0.0348, "step": 3768 }, { "epoch": 2.8972142170989432, "grad_norm": 0.7521761082939742, "learning_rate": 2.4741141509353136e-08, "loss": 0.0376, "step": 3770 }, { "epoch": 2.898751200768492, "grad_norm": 0.7042258551510416, "learning_rate": 2.4003487023557978e-08, "loss": 0.0383, "step": 3772 }, { "epoch": 2.9002881844380406, "grad_norm": 0.6754060246689739, "learning_rate": 2.3276958304198235e-08, "loss": 0.0356, "step": 3774 }, { "epoch": 2.901825168107589, "grad_norm": 0.738067712348994, "learning_rate": 2.2561557676705314e-08, "loss": 0.0441, "step": 3776 }, { "epoch": 2.9033621517771375, "grad_norm": 0.7057955885412858, "learning_rate": 2.1857287430891213e-08, "loss": 0.0378, "step": 3778 }, { "epoch": 2.9048991354466858, "grad_norm": 0.6886592794650539, "learning_rate": 2.1164149820942722e-08, "loss": 0.0366, "step": 3780 }, { "epoch": 2.9064361191162345, "grad_norm": 0.7673640335969266, "learning_rate": 2.048214706541479e-08, "loss": 0.0406, "step": 3782 }, { "epoch": 2.9079731027857827, "grad_norm": 0.6678527359649526, "learning_rate": 1.9811281347221597e-08, "loss": 0.037, "step": 3784 }, { "epoch": 2.9095100864553314, "grad_norm": 0.7127890261581394, "learning_rate": 1.9151554813630734e-08, "loss": 0.0361, "step": 3786 }, { "epoch": 2.91104707012488, "grad_norm": 0.7583876230260865, "learning_rate": 1.850296957625658e-08, "loss": 0.0378, "step": 3788 }, { "epoch": 2.9125840537944283, "grad_norm": 0.6957251087608395, "learning_rate": 1.7865527711052932e-08, "loss": 0.0392, "step": 3790 }, { "epoch": 2.914121037463977, "grad_norm": 0.7245984947394084, "learning_rate": 1.7239231258306397e-08, "loss": 0.0387, "step": 3792 }, { "epoch": 2.9156580211335257, "grad_norm": 0.6799380535511241, "learning_rate": 1.662408222262979e-08, "loss": 0.0359, "step": 3794 }, { "epoch": 2.917195004803074, "grad_norm": 0.6832560309649182, "learning_rate": 1.6020082572956674e-08, "loss": 0.0379, "step": 3796 }, { "epoch": 2.9187319884726226, "grad_norm": 0.7506399618941308, "learning_rate": 1.542723424253323e-08, "loss": 0.0433, "step": 3798 }, { "epoch": 2.920268972142171, "grad_norm": 0.7012670552597634, "learning_rate": 1.4845539128913954e-08, "loss": 0.038, "step": 3800 }, { "epoch": 2.9218059558117195, "grad_norm": 0.8131855947668669, "learning_rate": 1.4274999093955077e-08, "loss": 0.038, "step": 3802 }, { "epoch": 2.9233429394812678, "grad_norm": 0.72352053716851, "learning_rate": 1.371561596380677e-08, "loss": 0.04, "step": 3804 }, { "epoch": 2.9248799231508165, "grad_norm": 0.7277652228743811, "learning_rate": 1.3167391528910831e-08, "loss": 0.0404, "step": 3806 }, { "epoch": 2.926416906820365, "grad_norm": 0.6608930767109022, "learning_rate": 1.2630327543991349e-08, "loss": 0.0333, "step": 3808 }, { "epoch": 2.9279538904899134, "grad_norm": 0.7251459149646445, "learning_rate": 1.2104425728051981e-08, "loss": 0.035, "step": 3810 }, { "epoch": 2.929490874159462, "grad_norm": 0.7280772115836247, "learning_rate": 1.1589687764367807e-08, "loss": 0.0366, "step": 3812 }, { "epoch": 2.9310278578290108, "grad_norm": 0.7583993906119615, "learning_rate": 1.1086115300482203e-08, "loss": 0.0377, "step": 3814 }, { "epoch": 2.932564841498559, "grad_norm": 0.6946315289320816, "learning_rate": 1.0593709948200635e-08, "loss": 0.0375, "step": 3816 }, { "epoch": 2.9341018251681077, "grad_norm": 0.7540614936256114, "learning_rate": 1.011247328358561e-08, "loss": 0.0419, "step": 3818 }, { "epoch": 2.9356388088376564, "grad_norm": 0.7380916570292597, "learning_rate": 9.642406846950446e-09, "loss": 0.0391, "step": 3820 }, { "epoch": 2.9371757925072046, "grad_norm": 0.7229342418450578, "learning_rate": 9.183512142857342e-09, "loss": 0.0363, "step": 3822 }, { "epoch": 2.938712776176753, "grad_norm": 0.7727279695915876, "learning_rate": 8.73579064010882e-09, "loss": 0.0395, "step": 3824 }, { "epoch": 2.9402497598463015, "grad_norm": 0.741689517658393, "learning_rate": 8.299243771746179e-09, "loss": 0.0386, "step": 3826 }, { "epoch": 2.9417867435158502, "grad_norm": 0.7231470558103498, "learning_rate": 7.873872935043269e-09, "loss": 0.041, "step": 3828 }, { "epoch": 2.9433237271853985, "grad_norm": 0.6584072678052216, "learning_rate": 7.459679491501448e-09, "loss": 0.0345, "step": 3830 }, { "epoch": 2.944860710854947, "grad_norm": 0.6835760459984861, "learning_rate": 7.0566647668476315e-09, "loss": 0.0384, "step": 3832 }, { "epoch": 2.946397694524496, "grad_norm": 0.7191660536723004, "learning_rate": 6.6648300510276925e-09, "loss": 0.0358, "step": 3834 }, { "epoch": 2.947934678194044, "grad_norm": 0.7051985526337444, "learning_rate": 6.284176598202573e-09, "loss": 0.0369, "step": 3836 }, { "epoch": 2.9494716618635928, "grad_norm": 0.6889042075604795, "learning_rate": 5.914705626746342e-09, "loss": 0.0346, "step": 3838 }, { "epoch": 2.9510086455331415, "grad_norm": 0.6797954961166619, "learning_rate": 5.556418319239975e-09, "loss": 0.0358, "step": 3840 }, { "epoch": 2.9525456292026897, "grad_norm": 0.7081235515177567, "learning_rate": 5.209315822468252e-09, "loss": 0.04, "step": 3842 }, { "epoch": 2.9540826128722384, "grad_norm": 0.7586899396756654, "learning_rate": 4.873399247417032e-09, "loss": 0.041, "step": 3844 }, { "epoch": 2.9556195965417866, "grad_norm": 0.6546459502260559, "learning_rate": 4.548669669267813e-09, "loss": 0.0379, "step": 3846 }, { "epoch": 2.9571565802113353, "grad_norm": 0.7194117012638997, "learning_rate": 4.23512812739657e-09, "loss": 0.0352, "step": 3848 }, { "epoch": 2.9586935638808836, "grad_norm": 0.6550475400228738, "learning_rate": 3.9327756253683125e-09, "loss": 0.0338, "step": 3850 }, { "epoch": 2.9602305475504322, "grad_norm": 0.6471117547944658, "learning_rate": 3.6416131309355283e-09, "loss": 0.0327, "step": 3852 }, { "epoch": 2.961767531219981, "grad_norm": 0.6643076347264795, "learning_rate": 3.361641576034302e-09, "loss": 0.0366, "step": 3854 }, { "epoch": 2.963304514889529, "grad_norm": 0.7533191809512965, "learning_rate": 3.0928618567808153e-09, "loss": 0.0401, "step": 3856 }, { "epoch": 2.964841498559078, "grad_norm": 0.7192365559237837, "learning_rate": 2.835274833469403e-09, "loss": 0.0372, "step": 3858 }, { "epoch": 2.9663784822286265, "grad_norm": 0.720379151963908, "learning_rate": 2.588881330570225e-09, "loss": 0.0362, "step": 3860 }, { "epoch": 2.967915465898175, "grad_norm": 0.7348332898518517, "learning_rate": 2.3536821367246e-09, "loss": 0.0342, "step": 3862 }, { "epoch": 2.9694524495677235, "grad_norm": 0.7451802490285939, "learning_rate": 2.1296780047446172e-09, "loss": 0.0393, "step": 3864 }, { "epoch": 2.9709894332372717, "grad_norm": 0.5897360637433469, "learning_rate": 1.9168696516092544e-09, "loss": 0.035, "step": 3866 }, { "epoch": 2.9725264169068204, "grad_norm": 0.6721100987536596, "learning_rate": 1.7152577584635952e-09, "loss": 0.0376, "step": 3868 }, { "epoch": 2.9740634005763686, "grad_norm": 0.7372791169185593, "learning_rate": 1.524842970614948e-09, "loss": 0.0393, "step": 3870 }, { "epoch": 2.9756003842459173, "grad_norm": 0.7155596616446167, "learning_rate": 1.3456258975312885e-09, "loss": 0.041, "step": 3872 }, { "epoch": 2.977137367915466, "grad_norm": 0.7243460282148372, "learning_rate": 1.1776071128412614e-09, "loss": 0.0373, "step": 3874 }, { "epoch": 2.9786743515850143, "grad_norm": 0.8216882893956271, "learning_rate": 1.0207871543287394e-09, "loss": 0.0387, "step": 3876 }, { "epoch": 2.980211335254563, "grad_norm": 0.7555366446961992, "learning_rate": 8.7516652393399e-10, "loss": 0.0387, "step": 3878 }, { "epoch": 2.9817483189241116, "grad_norm": 0.7303863102314947, "learning_rate": 7.40745687751343e-10, "loss": 0.0375, "step": 3880 }, { "epoch": 2.98328530259366, "grad_norm": 0.7352509297982905, "learning_rate": 6.175250760268591e-10, "loss": 0.0398, "step": 3882 }, { "epoch": 2.9848222862632086, "grad_norm": 0.7396812555733191, "learning_rate": 5.055050831579422e-10, "loss": 0.0454, "step": 3884 }, { "epoch": 2.986359269932757, "grad_norm": 0.6935348372155691, "learning_rate": 4.0468606769178403e-10, "loss": 0.0353, "step": 3886 }, { "epoch": 2.9878962536023055, "grad_norm": 0.7901782130270826, "learning_rate": 3.150683523238107e-10, "loss": 0.0376, "step": 3888 }, { "epoch": 2.9894332372718537, "grad_norm": 0.6621002431203192, "learning_rate": 2.366522238972935e-10, "loss": 0.0356, "step": 3890 }, { "epoch": 2.9909702209414024, "grad_norm": 0.7167270045432917, "learning_rate": 1.6943793340179482e-10, "loss": 0.0381, "step": 3892 }, { "epoch": 2.992507204610951, "grad_norm": 0.70796009257099, "learning_rate": 1.1342569597277973e-10, "loss": 0.0376, "step": 3894 }, { "epoch": 2.9940441882804993, "grad_norm": 0.725988884787138, "learning_rate": 6.861569089161578e-11, "loss": 0.0354, "step": 3896 }, { "epoch": 2.995581171950048, "grad_norm": 0.6619948029465499, "learning_rate": 3.500806158285297e-11, "loss": 0.0362, "step": 3898 }, { "epoch": 2.9971181556195967, "grad_norm": 0.7007360064163597, "learning_rate": 1.2602915616166665e-11, "loss": 0.0403, "step": 3900 }, { "epoch": 2.998655139289145, "grad_norm": 0.6392213863532241, "learning_rate": 1.4003247044147572e-12, "loss": 0.0384, "step": 3902 }, { "epoch": 2.9994236311239195, "step": 3903, "total_flos": 1225143666540544.0, "train_loss": 0.11350150189982632, "train_runtime": 22267.9689, "train_samples_per_second": 22.437, "train_steps_per_second": 0.175 } ], "logging_steps": 2, "max_steps": 3903, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1225143666540544.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }