{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 6432, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009328358208955224, "grad_norm": 141.84653593783037, "learning_rate": 3.1055900621118015e-08, "loss": 1.2317, "step": 2 }, { "epoch": 0.0018656716417910447, "grad_norm": 73.92673328115033, "learning_rate": 6.211180124223603e-08, "loss": 1.1508, "step": 4 }, { "epoch": 0.002798507462686567, "grad_norm": 25.161715198399044, "learning_rate": 9.316770186335405e-08, "loss": 1.2665, "step": 6 }, { "epoch": 0.0037313432835820895, "grad_norm": 125.15554556515958, "learning_rate": 1.2422360248447206e-07, "loss": 1.2684, "step": 8 }, { "epoch": 0.0046641791044776115, "grad_norm": 34.09560313170012, "learning_rate": 1.5527950310559006e-07, "loss": 1.1518, "step": 10 }, { "epoch": 0.005597014925373134, "grad_norm": 70.10261737049221, "learning_rate": 1.863354037267081e-07, "loss": 1.1829, "step": 12 }, { "epoch": 0.0065298507462686565, "grad_norm": 85.58058584581248, "learning_rate": 2.173913043478261e-07, "loss": 1.2359, "step": 14 }, { "epoch": 0.007462686567164179, "grad_norm": 91.20705086258509, "learning_rate": 2.484472049689441e-07, "loss": 1.1911, "step": 16 }, { "epoch": 0.008395522388059701, "grad_norm": 80.47290386920726, "learning_rate": 2.795031055900621e-07, "loss": 1.1806, "step": 18 }, { "epoch": 0.009328358208955223, "grad_norm": 42.91688398439449, "learning_rate": 3.1055900621118013e-07, "loss": 1.1706, "step": 20 }, { "epoch": 0.010261194029850746, "grad_norm": 17.19066282148705, "learning_rate": 3.416149068322982e-07, "loss": 1.1021, "step": 22 }, { "epoch": 0.011194029850746268, "grad_norm": 37.72696934607804, "learning_rate": 3.726708074534162e-07, "loss": 1.1804, "step": 24 }, { "epoch": 0.012126865671641791, "grad_norm": 43.18771357786159, "learning_rate": 4.037267080745342e-07, "loss": 1.0774, "step": 26 }, { "epoch": 0.013059701492537313, "grad_norm": 22.079075604777447, "learning_rate": 4.347826086956522e-07, "loss": 0.9575, "step": 28 }, { "epoch": 0.013992537313432836, "grad_norm": 103.94449230610218, "learning_rate": 4.658385093167702e-07, "loss": 1.0526, "step": 30 }, { "epoch": 0.014925373134328358, "grad_norm": 20.482563257817198, "learning_rate": 4.968944099378882e-07, "loss": 0.9768, "step": 32 }, { "epoch": 0.01585820895522388, "grad_norm": 92.60009615720648, "learning_rate": 5.279503105590063e-07, "loss": 0.9088, "step": 34 }, { "epoch": 0.016791044776119403, "grad_norm": 72.58664016267214, "learning_rate": 5.590062111801243e-07, "loss": 0.7171, "step": 36 }, { "epoch": 0.017723880597014924, "grad_norm": 89.42561621187095, "learning_rate": 5.900621118012423e-07, "loss": 0.6902, "step": 38 }, { "epoch": 0.018656716417910446, "grad_norm": 17.953741530444063, "learning_rate": 6.211180124223603e-07, "loss": 0.6602, "step": 40 }, { "epoch": 0.01958955223880597, "grad_norm": 192.7702382369095, "learning_rate": 6.521739130434783e-07, "loss": 0.6708, "step": 42 }, { "epoch": 0.020522388059701493, "grad_norm": 271.8037595834288, "learning_rate": 6.832298136645964e-07, "loss": 0.6101, "step": 44 }, { "epoch": 0.021455223880597014, "grad_norm": 245.143722738473, "learning_rate": 7.142857142857143e-07, "loss": 0.578, "step": 46 }, { "epoch": 0.022388059701492536, "grad_norm": 82.24029703991424, "learning_rate": 7.453416149068324e-07, "loss": 0.5357, "step": 48 }, { "epoch": 0.02332089552238806, "grad_norm": 106.27466386311123, "learning_rate": 7.763975155279503e-07, "loss": 0.5292, "step": 50 }, { "epoch": 0.024253731343283583, "grad_norm": 28.097169922015855, "learning_rate": 8.074534161490684e-07, "loss": 0.4957, "step": 52 }, { "epoch": 0.025186567164179104, "grad_norm": 32.00034184187322, "learning_rate": 8.385093167701864e-07, "loss": 0.4817, "step": 54 }, { "epoch": 0.026119402985074626, "grad_norm": 28.389364549998767, "learning_rate": 8.695652173913044e-07, "loss": 0.4472, "step": 56 }, { "epoch": 0.027052238805970148, "grad_norm": 5.448904726347897, "learning_rate": 9.006211180124224e-07, "loss": 0.4102, "step": 58 }, { "epoch": 0.027985074626865673, "grad_norm": 47.33775950791769, "learning_rate": 9.316770186335404e-07, "loss": 0.3834, "step": 60 }, { "epoch": 0.028917910447761194, "grad_norm": 35.171578880874314, "learning_rate": 9.627329192546585e-07, "loss": 0.3838, "step": 62 }, { "epoch": 0.029850746268656716, "grad_norm": 22.602001415969152, "learning_rate": 9.937888198757765e-07, "loss": 0.395, "step": 64 }, { "epoch": 0.030783582089552237, "grad_norm": 43.14849541602834, "learning_rate": 1.0248447204968944e-06, "loss": 0.373, "step": 66 }, { "epoch": 0.03171641791044776, "grad_norm": 33.28823729652902, "learning_rate": 1.0559006211180126e-06, "loss": 0.3767, "step": 68 }, { "epoch": 0.03264925373134328, "grad_norm": 35.60279059054033, "learning_rate": 1.0869565217391306e-06, "loss": 0.3665, "step": 70 }, { "epoch": 0.033582089552238806, "grad_norm": 20.477590777483467, "learning_rate": 1.1180124223602485e-06, "loss": 0.3856, "step": 72 }, { "epoch": 0.03451492537313433, "grad_norm": 4.733191255137944, "learning_rate": 1.1490683229813664e-06, "loss": 0.3376, "step": 74 }, { "epoch": 0.03544776119402985, "grad_norm": 21.095445622645272, "learning_rate": 1.1801242236024846e-06, "loss": 0.3246, "step": 76 }, { "epoch": 0.036380597014925374, "grad_norm": 33.60900782671738, "learning_rate": 1.2111801242236026e-06, "loss": 0.316, "step": 78 }, { "epoch": 0.03731343283582089, "grad_norm": 37.44174652819769, "learning_rate": 1.2422360248447205e-06, "loss": 0.3482, "step": 80 }, { "epoch": 0.03824626865671642, "grad_norm": 27.856083578813237, "learning_rate": 1.2732919254658385e-06, "loss": 0.3255, "step": 82 }, { "epoch": 0.03917910447761194, "grad_norm": 20.49023173459329, "learning_rate": 1.3043478260869566e-06, "loss": 0.3151, "step": 84 }, { "epoch": 0.04011194029850746, "grad_norm": 5.727461718902441, "learning_rate": 1.3354037267080746e-06, "loss": 0.3019, "step": 86 }, { "epoch": 0.041044776119402986, "grad_norm": 19.508756868837175, "learning_rate": 1.3664596273291927e-06, "loss": 0.3101, "step": 88 }, { "epoch": 0.04197761194029851, "grad_norm": 24.136335304817063, "learning_rate": 1.3975155279503105e-06, "loss": 0.318, "step": 90 }, { "epoch": 0.04291044776119403, "grad_norm": 4.691307551468095, "learning_rate": 1.4285714285714286e-06, "loss": 0.3065, "step": 92 }, { "epoch": 0.043843283582089554, "grad_norm": 21.636244861854642, "learning_rate": 1.4596273291925466e-06, "loss": 0.3061, "step": 94 }, { "epoch": 0.04477611940298507, "grad_norm": 15.923269773290114, "learning_rate": 1.4906832298136647e-06, "loss": 0.2792, "step": 96 }, { "epoch": 0.0457089552238806, "grad_norm": 4.6897990036747546, "learning_rate": 1.521739130434783e-06, "loss": 0.3085, "step": 98 }, { "epoch": 0.04664179104477612, "grad_norm": 7.167545200566593, "learning_rate": 1.5527950310559006e-06, "loss": 0.2911, "step": 100 }, { "epoch": 0.04757462686567164, "grad_norm": 4.515330928399463, "learning_rate": 1.5838509316770188e-06, "loss": 0.2937, "step": 102 }, { "epoch": 0.048507462686567165, "grad_norm": 4.511662569904355, "learning_rate": 1.6149068322981367e-06, "loss": 0.2898, "step": 104 }, { "epoch": 0.049440298507462684, "grad_norm": 4.617959983767074, "learning_rate": 1.645962732919255e-06, "loss": 0.2755, "step": 106 }, { "epoch": 0.05037313432835821, "grad_norm": 10.068851464677156, "learning_rate": 1.6770186335403729e-06, "loss": 0.2776, "step": 108 }, { "epoch": 0.051305970149253734, "grad_norm": 6.1624005661275785, "learning_rate": 1.7080745341614908e-06, "loss": 0.3112, "step": 110 }, { "epoch": 0.05223880597014925, "grad_norm": 4.701426834934968, "learning_rate": 1.7391304347826088e-06, "loss": 0.2775, "step": 112 }, { "epoch": 0.05317164179104478, "grad_norm": 10.590233086908592, "learning_rate": 1.770186335403727e-06, "loss": 0.2883, "step": 114 }, { "epoch": 0.054104477611940295, "grad_norm": 4.270783196100384, "learning_rate": 1.8012422360248449e-06, "loss": 0.2931, "step": 116 }, { "epoch": 0.05503731343283582, "grad_norm": 3.790903016107792, "learning_rate": 1.832298136645963e-06, "loss": 0.2963, "step": 118 }, { "epoch": 0.055970149253731345, "grad_norm": 3.7190724453202084, "learning_rate": 1.8633540372670808e-06, "loss": 0.2805, "step": 120 }, { "epoch": 0.05690298507462686, "grad_norm": 7.40396046398378, "learning_rate": 1.894409937888199e-06, "loss": 0.2754, "step": 122 }, { "epoch": 0.05783582089552239, "grad_norm": 16.803503733002056, "learning_rate": 1.925465838509317e-06, "loss": 0.2662, "step": 124 }, { "epoch": 0.058768656716417914, "grad_norm": 13.285898393084533, "learning_rate": 1.956521739130435e-06, "loss": 0.2735, "step": 126 }, { "epoch": 0.05970149253731343, "grad_norm": 9.1224986259931, "learning_rate": 1.987577639751553e-06, "loss": 0.286, "step": 128 }, { "epoch": 0.06063432835820896, "grad_norm": 5.613611256654138, "learning_rate": 2.018633540372671e-06, "loss": 0.2662, "step": 130 }, { "epoch": 0.061567164179104475, "grad_norm": 7.545064210551538, "learning_rate": 2.049689440993789e-06, "loss": 0.2461, "step": 132 }, { "epoch": 0.0625, "grad_norm": 4.173401955055914, "learning_rate": 2.0807453416149073e-06, "loss": 0.2715, "step": 134 }, { "epoch": 0.06343283582089553, "grad_norm": 8.485933023187348, "learning_rate": 2.111801242236025e-06, "loss": 0.2566, "step": 136 }, { "epoch": 0.06436567164179105, "grad_norm": 4.19842829482207, "learning_rate": 2.1428571428571427e-06, "loss": 0.2735, "step": 138 }, { "epoch": 0.06529850746268656, "grad_norm": 8.425962572484579, "learning_rate": 2.173913043478261e-06, "loss": 0.2659, "step": 140 }, { "epoch": 0.06623134328358209, "grad_norm": 3.2805416423890033, "learning_rate": 2.204968944099379e-06, "loss": 0.2462, "step": 142 }, { "epoch": 0.06716417910447761, "grad_norm": 3.3278531588236193, "learning_rate": 2.236024844720497e-06, "loss": 0.2564, "step": 144 }, { "epoch": 0.06809701492537314, "grad_norm": 4.956170528518187, "learning_rate": 2.2670807453416154e-06, "loss": 0.2721, "step": 146 }, { "epoch": 0.06902985074626866, "grad_norm": 3.722032231542065, "learning_rate": 2.298136645962733e-06, "loss": 0.2456, "step": 148 }, { "epoch": 0.06996268656716417, "grad_norm": 12.012432614210493, "learning_rate": 2.3291925465838513e-06, "loss": 0.2564, "step": 150 }, { "epoch": 0.0708955223880597, "grad_norm": 3.5922232704959334, "learning_rate": 2.3602484472049692e-06, "loss": 0.2595, "step": 152 }, { "epoch": 0.07182835820895522, "grad_norm": 3.191669357887466, "learning_rate": 2.391304347826087e-06, "loss": 0.2398, "step": 154 }, { "epoch": 0.07276119402985075, "grad_norm": 3.194123430092511, "learning_rate": 2.422360248447205e-06, "loss": 0.2318, "step": 156 }, { "epoch": 0.07369402985074627, "grad_norm": 3.083149240607351, "learning_rate": 2.453416149068323e-06, "loss": 0.2267, "step": 158 }, { "epoch": 0.07462686567164178, "grad_norm": 5.7100674979514885, "learning_rate": 2.484472049689441e-06, "loss": 0.2394, "step": 160 }, { "epoch": 0.07555970149253731, "grad_norm": 4.621211487899202, "learning_rate": 2.515527950310559e-06, "loss": 0.2642, "step": 162 }, { "epoch": 0.07649253731343283, "grad_norm": 4.216113351048156, "learning_rate": 2.546583850931677e-06, "loss": 0.232, "step": 164 }, { "epoch": 0.07742537313432836, "grad_norm": 3.3356585618193075, "learning_rate": 2.5776397515527953e-06, "loss": 0.234, "step": 166 }, { "epoch": 0.07835820895522388, "grad_norm": 5.092703909893854, "learning_rate": 2.6086956521739132e-06, "loss": 0.2259, "step": 168 }, { "epoch": 0.07929104477611941, "grad_norm": 6.6435165949486725, "learning_rate": 2.639751552795031e-06, "loss": 0.2498, "step": 170 }, { "epoch": 0.08022388059701492, "grad_norm": 3.849984947948696, "learning_rate": 2.670807453416149e-06, "loss": 0.2438, "step": 172 }, { "epoch": 0.08115671641791045, "grad_norm": 3.510096516942484, "learning_rate": 2.7018633540372675e-06, "loss": 0.225, "step": 174 }, { "epoch": 0.08208955223880597, "grad_norm": 3.1995013521028457, "learning_rate": 2.7329192546583855e-06, "loss": 0.2238, "step": 176 }, { "epoch": 0.0830223880597015, "grad_norm": 3.2687726372112986, "learning_rate": 2.7639751552795034e-06, "loss": 0.207, "step": 178 }, { "epoch": 0.08395522388059702, "grad_norm": 3.3120599930892243, "learning_rate": 2.795031055900621e-06, "loss": 0.2226, "step": 180 }, { "epoch": 0.08488805970149253, "grad_norm": 3.230016392779267, "learning_rate": 2.8260869565217393e-06, "loss": 0.2168, "step": 182 }, { "epoch": 0.08582089552238806, "grad_norm": 3.1363971094017042, "learning_rate": 2.8571428571428573e-06, "loss": 0.2232, "step": 184 }, { "epoch": 0.08675373134328358, "grad_norm": 2.757809443179066, "learning_rate": 2.888198757763975e-06, "loss": 0.2296, "step": 186 }, { "epoch": 0.08768656716417911, "grad_norm": 3.1026324097251017, "learning_rate": 2.919254658385093e-06, "loss": 0.2174, "step": 188 }, { "epoch": 0.08861940298507463, "grad_norm": 3.1563889262406772, "learning_rate": 2.9503105590062115e-06, "loss": 0.2205, "step": 190 }, { "epoch": 0.08955223880597014, "grad_norm": 3.250136524075722, "learning_rate": 2.9813664596273295e-06, "loss": 0.2409, "step": 192 }, { "epoch": 0.09048507462686567, "grad_norm": 2.9602639513067603, "learning_rate": 3.0124223602484474e-06, "loss": 0.2022, "step": 194 }, { "epoch": 0.0914179104477612, "grad_norm": 2.734325686913689, "learning_rate": 3.043478260869566e-06, "loss": 0.2198, "step": 196 }, { "epoch": 0.09235074626865672, "grad_norm": 2.556678285564254, "learning_rate": 3.0745341614906837e-06, "loss": 0.2069, "step": 198 }, { "epoch": 0.09328358208955224, "grad_norm": 2.858382687845117, "learning_rate": 3.1055900621118013e-06, "loss": 0.2091, "step": 200 }, { "epoch": 0.09421641791044776, "grad_norm": 3.011854932973755, "learning_rate": 3.1366459627329192e-06, "loss": 0.2168, "step": 202 }, { "epoch": 0.09514925373134328, "grad_norm": 3.8235132670407874, "learning_rate": 3.1677018633540376e-06, "loss": 0.2174, "step": 204 }, { "epoch": 0.0960820895522388, "grad_norm": 3.0628441148043666, "learning_rate": 3.1987577639751555e-06, "loss": 0.2191, "step": 206 }, { "epoch": 0.09701492537313433, "grad_norm": 3.090876344596246, "learning_rate": 3.2298136645962735e-06, "loss": 0.2377, "step": 208 }, { "epoch": 0.09794776119402986, "grad_norm": 2.7448364914565455, "learning_rate": 3.2608695652173914e-06, "loss": 0.2058, "step": 210 }, { "epoch": 0.09888059701492537, "grad_norm": 2.5180579026465035, "learning_rate": 3.29192546583851e-06, "loss": 0.1897, "step": 212 }, { "epoch": 0.09981343283582089, "grad_norm": 2.7339433310625654, "learning_rate": 3.3229813664596278e-06, "loss": 0.2213, "step": 214 }, { "epoch": 0.10074626865671642, "grad_norm": 2.726903211923335, "learning_rate": 3.3540372670807457e-06, "loss": 0.2142, "step": 216 }, { "epoch": 0.10167910447761194, "grad_norm": 2.866983094725867, "learning_rate": 3.3850931677018632e-06, "loss": 0.2079, "step": 218 }, { "epoch": 0.10261194029850747, "grad_norm": 2.534992808853139, "learning_rate": 3.4161490683229816e-06, "loss": 0.2095, "step": 220 }, { "epoch": 0.10354477611940298, "grad_norm": 2.7790532064716027, "learning_rate": 3.4472049689440996e-06, "loss": 0.2172, "step": 222 }, { "epoch": 0.1044776119402985, "grad_norm": 3.0318314138749014, "learning_rate": 3.4782608695652175e-06, "loss": 0.2075, "step": 224 }, { "epoch": 0.10541044776119403, "grad_norm": 2.607798535697695, "learning_rate": 3.5093167701863355e-06, "loss": 0.2012, "step": 226 }, { "epoch": 0.10634328358208955, "grad_norm": 3.2197760095787875, "learning_rate": 3.540372670807454e-06, "loss": 0.1986, "step": 228 }, { "epoch": 0.10727611940298508, "grad_norm": 2.7939815005855237, "learning_rate": 3.5714285714285718e-06, "loss": 0.2185, "step": 230 }, { "epoch": 0.10820895522388059, "grad_norm": 2.5859920654046533, "learning_rate": 3.6024844720496897e-06, "loss": 0.2124, "step": 232 }, { "epoch": 0.10914179104477612, "grad_norm": 2.503484587876454, "learning_rate": 3.633540372670808e-06, "loss": 0.219, "step": 234 }, { "epoch": 0.11007462686567164, "grad_norm": 3.2942336044988125, "learning_rate": 3.664596273291926e-06, "loss": 0.2079, "step": 236 }, { "epoch": 0.11100746268656717, "grad_norm": 2.6441126873407477, "learning_rate": 3.6956521739130436e-06, "loss": 0.2222, "step": 238 }, { "epoch": 0.11194029850746269, "grad_norm": 2.487499257722225, "learning_rate": 3.7267080745341615e-06, "loss": 0.2257, "step": 240 }, { "epoch": 0.11287313432835822, "grad_norm": 2.69554936919851, "learning_rate": 3.7577639751552795e-06, "loss": 0.1924, "step": 242 }, { "epoch": 0.11380597014925373, "grad_norm": 2.5792569401105583, "learning_rate": 3.788819875776398e-06, "loss": 0.1943, "step": 244 }, { "epoch": 0.11473880597014925, "grad_norm": 2.6356025530311453, "learning_rate": 3.819875776397516e-06, "loss": 0.2285, "step": 246 }, { "epoch": 0.11567164179104478, "grad_norm": 2.3308293097806625, "learning_rate": 3.850931677018634e-06, "loss": 0.1999, "step": 248 }, { "epoch": 0.1166044776119403, "grad_norm": 2.750925154827227, "learning_rate": 3.881987577639752e-06, "loss": 0.2221, "step": 250 }, { "epoch": 0.11753731343283583, "grad_norm": 2.5408465330033754, "learning_rate": 3.91304347826087e-06, "loss": 0.2217, "step": 252 }, { "epoch": 0.11847014925373134, "grad_norm": 2.698801962288923, "learning_rate": 3.9440993788819884e-06, "loss": 0.2335, "step": 254 }, { "epoch": 0.11940298507462686, "grad_norm": 2.549182806708308, "learning_rate": 3.975155279503106e-06, "loss": 0.2037, "step": 256 }, { "epoch": 0.12033582089552239, "grad_norm": 2.464821690021277, "learning_rate": 4.0062111801242235e-06, "loss": 0.2078, "step": 258 }, { "epoch": 0.12126865671641791, "grad_norm": 2.321049960144863, "learning_rate": 4.037267080745342e-06, "loss": 0.2054, "step": 260 }, { "epoch": 0.12220149253731344, "grad_norm": 2.493722894686357, "learning_rate": 4.06832298136646e-06, "loss": 0.2041, "step": 262 }, { "epoch": 0.12313432835820895, "grad_norm": 2.442901683209422, "learning_rate": 4.099378881987578e-06, "loss": 0.2287, "step": 264 }, { "epoch": 0.12406716417910447, "grad_norm": 4.717151474594673, "learning_rate": 4.130434782608696e-06, "loss": 0.2152, "step": 266 }, { "epoch": 0.125, "grad_norm": 5.324903092991523, "learning_rate": 4.1614906832298145e-06, "loss": 0.2199, "step": 268 }, { "epoch": 0.1259328358208955, "grad_norm": 2.518797731342692, "learning_rate": 4.192546583850932e-06, "loss": 0.2109, "step": 270 }, { "epoch": 0.12686567164179105, "grad_norm": 2.5709618938357695, "learning_rate": 4.22360248447205e-06, "loss": 0.2112, "step": 272 }, { "epoch": 0.12779850746268656, "grad_norm": 2.6970173166428264, "learning_rate": 4.254658385093168e-06, "loss": 0.2067, "step": 274 }, { "epoch": 0.1287313432835821, "grad_norm": 2.264863737900119, "learning_rate": 4.2857142857142855e-06, "loss": 0.1838, "step": 276 }, { "epoch": 0.1296641791044776, "grad_norm": 2.2780908029059295, "learning_rate": 4.316770186335404e-06, "loss": 0.2125, "step": 278 }, { "epoch": 0.13059701492537312, "grad_norm": 2.352595434996149, "learning_rate": 4.347826086956522e-06, "loss": 0.2182, "step": 280 }, { "epoch": 0.13152985074626866, "grad_norm": 2.3226612212246165, "learning_rate": 4.37888198757764e-06, "loss": 0.1962, "step": 282 }, { "epoch": 0.13246268656716417, "grad_norm": 2.1616013785625374, "learning_rate": 4.409937888198758e-06, "loss": 0.2112, "step": 284 }, { "epoch": 0.1333955223880597, "grad_norm": 2.269898646687184, "learning_rate": 4.4409937888198765e-06, "loss": 0.2355, "step": 286 }, { "epoch": 0.13432835820895522, "grad_norm": 2.2764896749881225, "learning_rate": 4.472049689440994e-06, "loss": 0.2184, "step": 288 }, { "epoch": 0.13526119402985073, "grad_norm": 2.216197208473881, "learning_rate": 4.503105590062112e-06, "loss": 0.2259, "step": 290 }, { "epoch": 0.13619402985074627, "grad_norm": 2.4094758119083055, "learning_rate": 4.534161490683231e-06, "loss": 0.2059, "step": 292 }, { "epoch": 0.13712686567164178, "grad_norm": 2.6476080672295197, "learning_rate": 4.565217391304348e-06, "loss": 0.2147, "step": 294 }, { "epoch": 0.13805970149253732, "grad_norm": 2.21698603592508, "learning_rate": 4.596273291925466e-06, "loss": 0.199, "step": 296 }, { "epoch": 0.13899253731343283, "grad_norm": 2.963295045616477, "learning_rate": 4.627329192546584e-06, "loss": 0.2067, "step": 298 }, { "epoch": 0.13992537313432835, "grad_norm": 2.0669567246443896, "learning_rate": 4.6583850931677025e-06, "loss": 0.2151, "step": 300 }, { "epoch": 0.14085820895522388, "grad_norm": 2.1816566602665297, "learning_rate": 4.68944099378882e-06, "loss": 0.205, "step": 302 }, { "epoch": 0.1417910447761194, "grad_norm": 2.4711350719567413, "learning_rate": 4.7204968944099384e-06, "loss": 0.2186, "step": 304 }, { "epoch": 0.14272388059701493, "grad_norm": 2.6305476092002102, "learning_rate": 4.751552795031056e-06, "loss": 0.2188, "step": 306 }, { "epoch": 0.14365671641791045, "grad_norm": 3.7926853735649146, "learning_rate": 4.782608695652174e-06, "loss": 0.2146, "step": 308 }, { "epoch": 0.14458955223880596, "grad_norm": 2.3247573472731755, "learning_rate": 4.813664596273293e-06, "loss": 0.189, "step": 310 }, { "epoch": 0.1455223880597015, "grad_norm": 3.506305560875989, "learning_rate": 4.84472049689441e-06, "loss": 0.204, "step": 312 }, { "epoch": 0.146455223880597, "grad_norm": 4.299984565412355, "learning_rate": 4.875776397515528e-06, "loss": 0.1872, "step": 314 }, { "epoch": 0.14738805970149255, "grad_norm": 2.9268086038939476, "learning_rate": 4.906832298136646e-06, "loss": 0.2021, "step": 316 }, { "epoch": 0.14832089552238806, "grad_norm": 2.1656703445175864, "learning_rate": 4.9378881987577645e-06, "loss": 0.2024, "step": 318 }, { "epoch": 0.14925373134328357, "grad_norm": 2.2863114880706834, "learning_rate": 4.968944099378882e-06, "loss": 0.2226, "step": 320 }, { "epoch": 0.1501865671641791, "grad_norm": 2.9771175284921316, "learning_rate": 5e-06, "loss": 0.2112, "step": 322 }, { "epoch": 0.15111940298507462, "grad_norm": 2.4893906337440925, "learning_rate": 5.031055900621118e-06, "loss": 0.2073, "step": 324 }, { "epoch": 0.15205223880597016, "grad_norm": 3.6982120041090334, "learning_rate": 5.062111801242236e-06, "loss": 0.213, "step": 326 }, { "epoch": 0.15298507462686567, "grad_norm": 2.3205143866164835, "learning_rate": 5.093167701863354e-06, "loss": 0.2004, "step": 328 }, { "epoch": 0.15391791044776118, "grad_norm": 21.074705336946614, "learning_rate": 5.124223602484473e-06, "loss": 0.209, "step": 330 }, { "epoch": 0.15485074626865672, "grad_norm": 13.885594746135343, "learning_rate": 5.155279503105591e-06, "loss": 0.1944, "step": 332 }, { "epoch": 0.15578358208955223, "grad_norm": 2.591129823426044, "learning_rate": 5.186335403726709e-06, "loss": 0.2234, "step": 334 }, { "epoch": 0.15671641791044777, "grad_norm": 3.082266049414265, "learning_rate": 5.2173913043478265e-06, "loss": 0.2056, "step": 336 }, { "epoch": 0.15764925373134328, "grad_norm": 6.539986385179368, "learning_rate": 5.248447204968945e-06, "loss": 0.2135, "step": 338 }, { "epoch": 0.15858208955223882, "grad_norm": 2.4097236824819745, "learning_rate": 5.279503105590062e-06, "loss": 0.2327, "step": 340 }, { "epoch": 0.15951492537313433, "grad_norm": 2.253452500793219, "learning_rate": 5.31055900621118e-06, "loss": 0.2199, "step": 342 }, { "epoch": 0.16044776119402984, "grad_norm": 7.406755784306411, "learning_rate": 5.341614906832298e-06, "loss": 0.2211, "step": 344 }, { "epoch": 0.16138059701492538, "grad_norm": 2.7739931927374974, "learning_rate": 5.372670807453416e-06, "loss": 0.2054, "step": 346 }, { "epoch": 0.1623134328358209, "grad_norm": 3.7795105893695244, "learning_rate": 5.403726708074535e-06, "loss": 0.2133, "step": 348 }, { "epoch": 0.16324626865671643, "grad_norm": 2.2534160135806265, "learning_rate": 5.4347826086956525e-06, "loss": 0.2147, "step": 350 }, { "epoch": 0.16417910447761194, "grad_norm": 2.1919756416116405, "learning_rate": 5.465838509316771e-06, "loss": 0.2156, "step": 352 }, { "epoch": 0.16511194029850745, "grad_norm": 2.5787391970560107, "learning_rate": 5.4968944099378884e-06, "loss": 0.2155, "step": 354 }, { "epoch": 0.166044776119403, "grad_norm": 2.807216789717174, "learning_rate": 5.527950310559007e-06, "loss": 0.209, "step": 356 }, { "epoch": 0.1669776119402985, "grad_norm": 2.596292546872093, "learning_rate": 5.559006211180124e-06, "loss": 0.2105, "step": 358 }, { "epoch": 0.16791044776119404, "grad_norm": 2.6143591049228765, "learning_rate": 5.590062111801242e-06, "loss": 0.1968, "step": 360 }, { "epoch": 0.16884328358208955, "grad_norm": 2.1877348191610486, "learning_rate": 5.621118012422361e-06, "loss": 0.2227, "step": 362 }, { "epoch": 0.16977611940298507, "grad_norm": 2.2277996288851893, "learning_rate": 5.652173913043479e-06, "loss": 0.2053, "step": 364 }, { "epoch": 0.1707089552238806, "grad_norm": 1.9344505939464096, "learning_rate": 5.683229813664597e-06, "loss": 0.1832, "step": 366 }, { "epoch": 0.17164179104477612, "grad_norm": 2.11594386840255, "learning_rate": 5.7142857142857145e-06, "loss": 0.2068, "step": 368 }, { "epoch": 0.17257462686567165, "grad_norm": 2.267187349982284, "learning_rate": 5.745341614906833e-06, "loss": 0.2328, "step": 370 }, { "epoch": 0.17350746268656717, "grad_norm": 2.3086656178819065, "learning_rate": 5.77639751552795e-06, "loss": 0.2077, "step": 372 }, { "epoch": 0.17444029850746268, "grad_norm": 2.618455337831308, "learning_rate": 5.80745341614907e-06, "loss": 0.2223, "step": 374 }, { "epoch": 0.17537313432835822, "grad_norm": 2.3160961499030477, "learning_rate": 5.838509316770186e-06, "loss": 0.2025, "step": 376 }, { "epoch": 0.17630597014925373, "grad_norm": 2.515044712544632, "learning_rate": 5.8695652173913055e-06, "loss": 0.2129, "step": 378 }, { "epoch": 0.17723880597014927, "grad_norm": 2.344979232961502, "learning_rate": 5.900621118012423e-06, "loss": 0.2018, "step": 380 }, { "epoch": 0.17817164179104478, "grad_norm": 3.0863869442400094, "learning_rate": 5.931677018633541e-06, "loss": 0.2211, "step": 382 }, { "epoch": 0.1791044776119403, "grad_norm": 2.0661061390705107, "learning_rate": 5.962732919254659e-06, "loss": 0.2115, "step": 384 }, { "epoch": 0.18003731343283583, "grad_norm": 2.2332463397440834, "learning_rate": 5.9937888198757765e-06, "loss": 0.2175, "step": 386 }, { "epoch": 0.18097014925373134, "grad_norm": 2.2404951359471794, "learning_rate": 6.024844720496895e-06, "loss": 0.219, "step": 388 }, { "epoch": 0.18190298507462688, "grad_norm": 2.0830946206969503, "learning_rate": 6.055900621118012e-06, "loss": 0.2138, "step": 390 }, { "epoch": 0.1828358208955224, "grad_norm": 2.451234201268342, "learning_rate": 6.086956521739132e-06, "loss": 0.2101, "step": 392 }, { "epoch": 0.1837686567164179, "grad_norm": 2.261356156925831, "learning_rate": 6.118012422360249e-06, "loss": 0.2258, "step": 394 }, { "epoch": 0.18470149253731344, "grad_norm": 2.2792460661051694, "learning_rate": 6.1490683229813675e-06, "loss": 0.2174, "step": 396 }, { "epoch": 0.18563432835820895, "grad_norm": 2.093405250955453, "learning_rate": 6.180124223602485e-06, "loss": 0.2079, "step": 398 }, { "epoch": 0.1865671641791045, "grad_norm": 1.8380326706729546, "learning_rate": 6.2111801242236025e-06, "loss": 0.2016, "step": 400 }, { "epoch": 0.1875, "grad_norm": 2.2798338105414873, "learning_rate": 6.242236024844721e-06, "loss": 0.2031, "step": 402 }, { "epoch": 0.1884328358208955, "grad_norm": 2.2241009433733305, "learning_rate": 6.2732919254658384e-06, "loss": 0.215, "step": 404 }, { "epoch": 0.18936567164179105, "grad_norm": 1.778064175691528, "learning_rate": 6.304347826086958e-06, "loss": 0.1918, "step": 406 }, { "epoch": 0.19029850746268656, "grad_norm": 2.2863004818403145, "learning_rate": 6.335403726708075e-06, "loss": 0.2047, "step": 408 }, { "epoch": 0.1912313432835821, "grad_norm": 2.09457967279415, "learning_rate": 6.3664596273291936e-06, "loss": 0.1976, "step": 410 }, { "epoch": 0.1921641791044776, "grad_norm": 2.0479959118495237, "learning_rate": 6.397515527950311e-06, "loss": 0.1945, "step": 412 }, { "epoch": 0.19309701492537312, "grad_norm": 1.9547549466115088, "learning_rate": 6.4285714285714295e-06, "loss": 0.1873, "step": 414 }, { "epoch": 0.19402985074626866, "grad_norm": 2.640810733476452, "learning_rate": 6.459627329192547e-06, "loss": 0.2207, "step": 416 }, { "epoch": 0.19496268656716417, "grad_norm": 2.1638071780760972, "learning_rate": 6.4906832298136645e-06, "loss": 0.2036, "step": 418 }, { "epoch": 0.1958955223880597, "grad_norm": 2.3730015296994753, "learning_rate": 6.521739130434783e-06, "loss": 0.1951, "step": 420 }, { "epoch": 0.19682835820895522, "grad_norm": 1.8550104155904206, "learning_rate": 6.5527950310559e-06, "loss": 0.2211, "step": 422 }, { "epoch": 0.19776119402985073, "grad_norm": 2.171542641520255, "learning_rate": 6.58385093167702e-06, "loss": 0.2051, "step": 424 }, { "epoch": 0.19869402985074627, "grad_norm": 2.120619792279606, "learning_rate": 6.614906832298137e-06, "loss": 0.2241, "step": 426 }, { "epoch": 0.19962686567164178, "grad_norm": 2.109162558503626, "learning_rate": 6.6459627329192555e-06, "loss": 0.2179, "step": 428 }, { "epoch": 0.20055970149253732, "grad_norm": 1.9529233895125344, "learning_rate": 6.677018633540373e-06, "loss": 0.2235, "step": 430 }, { "epoch": 0.20149253731343283, "grad_norm": 1.9975837636413338, "learning_rate": 6.7080745341614914e-06, "loss": 0.2027, "step": 432 }, { "epoch": 0.20242537313432835, "grad_norm": 1.7993809748778602, "learning_rate": 6.739130434782609e-06, "loss": 0.2098, "step": 434 }, { "epoch": 0.20335820895522388, "grad_norm": 1.9132091416861527, "learning_rate": 6.7701863354037265e-06, "loss": 0.2081, "step": 436 }, { "epoch": 0.2042910447761194, "grad_norm": 2.242483796211979, "learning_rate": 6.801242236024846e-06, "loss": 0.2234, "step": 438 }, { "epoch": 0.20522388059701493, "grad_norm": 1.8892510327716943, "learning_rate": 6.832298136645963e-06, "loss": 0.2041, "step": 440 }, { "epoch": 0.20615671641791045, "grad_norm": 1.805957622431701, "learning_rate": 6.863354037267082e-06, "loss": 0.2236, "step": 442 }, { "epoch": 0.20708955223880596, "grad_norm": 2.152721916937432, "learning_rate": 6.894409937888199e-06, "loss": 0.2186, "step": 444 }, { "epoch": 0.2080223880597015, "grad_norm": 1.9420097502059597, "learning_rate": 6.9254658385093175e-06, "loss": 0.2187, "step": 446 }, { "epoch": 0.208955223880597, "grad_norm": 1.8948379598264316, "learning_rate": 6.956521739130435e-06, "loss": 0.2223, "step": 448 }, { "epoch": 0.20988805970149255, "grad_norm": 2.0999160252470013, "learning_rate": 6.987577639751553e-06, "loss": 0.2092, "step": 450 }, { "epoch": 0.21082089552238806, "grad_norm": 2.010887578337831, "learning_rate": 7.018633540372671e-06, "loss": 0.2046, "step": 452 }, { "epoch": 0.21175373134328357, "grad_norm": 1.9643307023414667, "learning_rate": 7.04968944099379e-06, "loss": 0.21, "step": 454 }, { "epoch": 0.2126865671641791, "grad_norm": 2.019291767227252, "learning_rate": 7.080745341614908e-06, "loss": 0.2173, "step": 456 }, { "epoch": 0.21361940298507462, "grad_norm": 1.8467481935245105, "learning_rate": 7.111801242236025e-06, "loss": 0.2167, "step": 458 }, { "epoch": 0.21455223880597016, "grad_norm": 1.9523692161314665, "learning_rate": 7.1428571428571436e-06, "loss": 0.1908, "step": 460 }, { "epoch": 0.21548507462686567, "grad_norm": 2.112706188943294, "learning_rate": 7.173913043478261e-06, "loss": 0.2172, "step": 462 }, { "epoch": 0.21641791044776118, "grad_norm": 1.923923317267627, "learning_rate": 7.2049689440993795e-06, "loss": 0.2312, "step": 464 }, { "epoch": 0.21735074626865672, "grad_norm": 1.8881754294705932, "learning_rate": 7.236024844720497e-06, "loss": 0.2207, "step": 466 }, { "epoch": 0.21828358208955223, "grad_norm": 1.889002257004106, "learning_rate": 7.267080745341616e-06, "loss": 0.1983, "step": 468 }, { "epoch": 0.21921641791044777, "grad_norm": 2.1115379556851375, "learning_rate": 7.298136645962734e-06, "loss": 0.2164, "step": 470 }, { "epoch": 0.22014925373134328, "grad_norm": 1.9550224775213136, "learning_rate": 7.329192546583852e-06, "loss": 0.2272, "step": 472 }, { "epoch": 0.22108208955223882, "grad_norm": 1.8510156882099758, "learning_rate": 7.36024844720497e-06, "loss": 0.1985, "step": 474 }, { "epoch": 0.22201492537313433, "grad_norm": 1.9840100594082613, "learning_rate": 7.391304347826087e-06, "loss": 0.2409, "step": 476 }, { "epoch": 0.22294776119402984, "grad_norm": 1.9922603729585162, "learning_rate": 7.4223602484472055e-06, "loss": 0.2073, "step": 478 }, { "epoch": 0.22388059701492538, "grad_norm": 1.7991976541148942, "learning_rate": 7.453416149068323e-06, "loss": 0.2142, "step": 480 }, { "epoch": 0.2248134328358209, "grad_norm": 1.693102560201708, "learning_rate": 7.484472049689442e-06, "loss": 0.2099, "step": 482 }, { "epoch": 0.22574626865671643, "grad_norm": 1.9155034278566883, "learning_rate": 7.515527950310559e-06, "loss": 0.2167, "step": 484 }, { "epoch": 0.22667910447761194, "grad_norm": 2.1053863237987844, "learning_rate": 7.546583850931678e-06, "loss": 0.2263, "step": 486 }, { "epoch": 0.22761194029850745, "grad_norm": 1.9838518240767105, "learning_rate": 7.577639751552796e-06, "loss": 0.2073, "step": 488 }, { "epoch": 0.228544776119403, "grad_norm": 1.7627840219588304, "learning_rate": 7.608695652173914e-06, "loss": 0.2045, "step": 490 }, { "epoch": 0.2294776119402985, "grad_norm": 1.897779077585881, "learning_rate": 7.639751552795032e-06, "loss": 0.2069, "step": 492 }, { "epoch": 0.23041044776119404, "grad_norm": 1.6493063407948891, "learning_rate": 7.670807453416149e-06, "loss": 0.1924, "step": 494 }, { "epoch": 0.23134328358208955, "grad_norm": 1.8688660015125396, "learning_rate": 7.701863354037268e-06, "loss": 0.2185, "step": 496 }, { "epoch": 0.23227611940298507, "grad_norm": 2.140641877622803, "learning_rate": 7.732919254658386e-06, "loss": 0.2319, "step": 498 }, { "epoch": 0.2332089552238806, "grad_norm": 1.966693543062277, "learning_rate": 7.763975155279503e-06, "loss": 0.2035, "step": 500 }, { "epoch": 0.2332089552238806, "eval_loss": 0.19479210674762726, "eval_runtime": 321.5979, "eval_samples_per_second": 47.407, "eval_steps_per_second": 5.927, "step": 500 }, { "epoch": 0.23414179104477612, "grad_norm": 1.9178591333239856, "learning_rate": 7.795031055900621e-06, "loss": 0.2146, "step": 502 }, { "epoch": 0.23507462686567165, "grad_norm": 1.780460380240722, "learning_rate": 7.82608695652174e-06, "loss": 0.2124, "step": 504 }, { "epoch": 0.23600746268656717, "grad_norm": 1.7863391968412494, "learning_rate": 7.857142857142858e-06, "loss": 0.2091, "step": 506 }, { "epoch": 0.23694029850746268, "grad_norm": 1.7153215475435755, "learning_rate": 7.888198757763977e-06, "loss": 0.2295, "step": 508 }, { "epoch": 0.23787313432835822, "grad_norm": 1.718731451315573, "learning_rate": 7.919254658385094e-06, "loss": 0.1927, "step": 510 }, { "epoch": 0.23880597014925373, "grad_norm": 1.9590871151290359, "learning_rate": 7.950310559006212e-06, "loss": 0.221, "step": 512 }, { "epoch": 0.23973880597014927, "grad_norm": 1.837007794513856, "learning_rate": 7.98136645962733e-06, "loss": 0.2062, "step": 514 }, { "epoch": 0.24067164179104478, "grad_norm": 1.8635351879601927, "learning_rate": 8.012422360248447e-06, "loss": 0.2114, "step": 516 }, { "epoch": 0.2416044776119403, "grad_norm": 1.7952872366608108, "learning_rate": 8.043478260869566e-06, "loss": 0.2008, "step": 518 }, { "epoch": 0.24253731343283583, "grad_norm": 1.7484132593526749, "learning_rate": 8.074534161490684e-06, "loss": 0.2159, "step": 520 }, { "epoch": 0.24347014925373134, "grad_norm": 2.4417074811249675, "learning_rate": 8.105590062111803e-06, "loss": 0.1903, "step": 522 }, { "epoch": 0.24440298507462688, "grad_norm": 1.6718782667387835, "learning_rate": 8.13664596273292e-06, "loss": 0.1878, "step": 524 }, { "epoch": 0.2453358208955224, "grad_norm": 1.8620075936961356, "learning_rate": 8.167701863354038e-06, "loss": 0.2327, "step": 526 }, { "epoch": 0.2462686567164179, "grad_norm": 1.8676208814049724, "learning_rate": 8.198757763975156e-06, "loss": 0.2466, "step": 528 }, { "epoch": 0.24720149253731344, "grad_norm": 1.838820078487303, "learning_rate": 8.229813664596275e-06, "loss": 0.2144, "step": 530 }, { "epoch": 0.24813432835820895, "grad_norm": 1.816911913100488, "learning_rate": 8.260869565217392e-06, "loss": 0.2211, "step": 532 }, { "epoch": 0.2490671641791045, "grad_norm": 1.8537734772298116, "learning_rate": 8.29192546583851e-06, "loss": 0.2053, "step": 534 }, { "epoch": 0.25, "grad_norm": 1.8897849266931421, "learning_rate": 8.322981366459629e-06, "loss": 0.2064, "step": 536 }, { "epoch": 0.25093283582089554, "grad_norm": 1.8253407832401187, "learning_rate": 8.354037267080745e-06, "loss": 0.2216, "step": 538 }, { "epoch": 0.251865671641791, "grad_norm": 1.740543427209268, "learning_rate": 8.385093167701864e-06, "loss": 0.2311, "step": 540 }, { "epoch": 0.25279850746268656, "grad_norm": 1.7169455656851966, "learning_rate": 8.416149068322982e-06, "loss": 0.2254, "step": 542 }, { "epoch": 0.2537313432835821, "grad_norm": 1.9161988490932003, "learning_rate": 8.4472049689441e-06, "loss": 0.2273, "step": 544 }, { "epoch": 0.25466417910447764, "grad_norm": 1.713283854075844, "learning_rate": 8.478260869565218e-06, "loss": 0.2149, "step": 546 }, { "epoch": 0.2555970149253731, "grad_norm": 1.7178625837979558, "learning_rate": 8.509316770186336e-06, "loss": 0.2042, "step": 548 }, { "epoch": 0.25652985074626866, "grad_norm": 1.6542895832157922, "learning_rate": 8.540372670807453e-06, "loss": 0.2151, "step": 550 }, { "epoch": 0.2574626865671642, "grad_norm": 1.9465538700046936, "learning_rate": 8.571428571428571e-06, "loss": 0.2194, "step": 552 }, { "epoch": 0.2583955223880597, "grad_norm": 1.8863908174945532, "learning_rate": 8.60248447204969e-06, "loss": 0.2086, "step": 554 }, { "epoch": 0.2593283582089552, "grad_norm": 1.7837760578014812, "learning_rate": 8.633540372670808e-06, "loss": 0.209, "step": 556 }, { "epoch": 0.26026119402985076, "grad_norm": 1.6815654119881749, "learning_rate": 8.664596273291927e-06, "loss": 0.2193, "step": 558 }, { "epoch": 0.26119402985074625, "grad_norm": 1.6722447753053575, "learning_rate": 8.695652173913044e-06, "loss": 0.2095, "step": 560 }, { "epoch": 0.2621268656716418, "grad_norm": 1.7256029683148997, "learning_rate": 8.726708074534162e-06, "loss": 0.2075, "step": 562 }, { "epoch": 0.2630597014925373, "grad_norm": 1.8130764308944542, "learning_rate": 8.75776397515528e-06, "loss": 0.2272, "step": 564 }, { "epoch": 0.26399253731343286, "grad_norm": 1.7483793585991647, "learning_rate": 8.788819875776399e-06, "loss": 0.2074, "step": 566 }, { "epoch": 0.26492537313432835, "grad_norm": 1.8676174583217775, "learning_rate": 8.819875776397516e-06, "loss": 0.2206, "step": 568 }, { "epoch": 0.2658582089552239, "grad_norm": 1.704628718340115, "learning_rate": 8.850931677018634e-06, "loss": 0.2304, "step": 570 }, { "epoch": 0.2667910447761194, "grad_norm": 1.7129811521018985, "learning_rate": 8.881987577639753e-06, "loss": 0.2215, "step": 572 }, { "epoch": 0.2677238805970149, "grad_norm": 1.5844731018588687, "learning_rate": 8.91304347826087e-06, "loss": 0.2129, "step": 574 }, { "epoch": 0.26865671641791045, "grad_norm": 1.7078677167962482, "learning_rate": 8.944099378881988e-06, "loss": 0.1918, "step": 576 }, { "epoch": 0.269589552238806, "grad_norm": 1.8394746045502772, "learning_rate": 8.975155279503106e-06, "loss": 0.2075, "step": 578 }, { "epoch": 0.27052238805970147, "grad_norm": 1.8533668149894602, "learning_rate": 9.006211180124225e-06, "loss": 0.2177, "step": 580 }, { "epoch": 0.271455223880597, "grad_norm": 1.6706294870333611, "learning_rate": 9.037267080745342e-06, "loss": 0.2143, "step": 582 }, { "epoch": 0.27238805970149255, "grad_norm": 1.6135862098490785, "learning_rate": 9.068322981366461e-06, "loss": 0.1932, "step": 584 }, { "epoch": 0.2733208955223881, "grad_norm": 1.8039009332081763, "learning_rate": 9.099378881987579e-06, "loss": 0.214, "step": 586 }, { "epoch": 0.27425373134328357, "grad_norm": 1.736199962187512, "learning_rate": 9.130434782608697e-06, "loss": 0.2076, "step": 588 }, { "epoch": 0.2751865671641791, "grad_norm": 1.55239171764077, "learning_rate": 9.161490683229814e-06, "loss": 0.2205, "step": 590 }, { "epoch": 0.27611940298507465, "grad_norm": 1.6499019479698962, "learning_rate": 9.192546583850932e-06, "loss": 0.2264, "step": 592 }, { "epoch": 0.27705223880597013, "grad_norm": 1.669115952705018, "learning_rate": 9.22360248447205e-06, "loss": 0.2081, "step": 594 }, { "epoch": 0.27798507462686567, "grad_norm": 1.7349736524350674, "learning_rate": 9.254658385093168e-06, "loss": 0.2182, "step": 596 }, { "epoch": 0.2789179104477612, "grad_norm": 1.6120928347026444, "learning_rate": 9.285714285714288e-06, "loss": 0.2097, "step": 598 }, { "epoch": 0.2798507462686567, "grad_norm": 1.691517126303103, "learning_rate": 9.316770186335405e-06, "loss": 0.2222, "step": 600 }, { "epoch": 0.28078358208955223, "grad_norm": 1.6356393070257824, "learning_rate": 9.347826086956523e-06, "loss": 0.2137, "step": 602 }, { "epoch": 0.28171641791044777, "grad_norm": 1.6305519634550314, "learning_rate": 9.37888198757764e-06, "loss": 0.2055, "step": 604 }, { "epoch": 0.2826492537313433, "grad_norm": 1.6004971093968854, "learning_rate": 9.40993788819876e-06, "loss": 0.2103, "step": 606 }, { "epoch": 0.2835820895522388, "grad_norm": 1.6406649340996644, "learning_rate": 9.440993788819877e-06, "loss": 0.2062, "step": 608 }, { "epoch": 0.28451492537313433, "grad_norm": 1.7544618761014004, "learning_rate": 9.472049689440994e-06, "loss": 0.2005, "step": 610 }, { "epoch": 0.28544776119402987, "grad_norm": 1.629551994561078, "learning_rate": 9.503105590062112e-06, "loss": 0.2048, "step": 612 }, { "epoch": 0.28638059701492535, "grad_norm": 1.6262560501069863, "learning_rate": 9.53416149068323e-06, "loss": 0.1945, "step": 614 }, { "epoch": 0.2873134328358209, "grad_norm": 1.7007625240298931, "learning_rate": 9.565217391304349e-06, "loss": 0.2118, "step": 616 }, { "epoch": 0.28824626865671643, "grad_norm": 1.727038130444209, "learning_rate": 9.596273291925466e-06, "loss": 0.2124, "step": 618 }, { "epoch": 0.2891791044776119, "grad_norm": 1.5071465929059924, "learning_rate": 9.627329192546585e-06, "loss": 0.2272, "step": 620 }, { "epoch": 0.29011194029850745, "grad_norm": 1.6030429455419744, "learning_rate": 9.658385093167703e-06, "loss": 0.2079, "step": 622 }, { "epoch": 0.291044776119403, "grad_norm": 1.8698961221664092, "learning_rate": 9.68944099378882e-06, "loss": 0.22, "step": 624 }, { "epoch": 0.29197761194029853, "grad_norm": 1.6379825252507367, "learning_rate": 9.720496894409938e-06, "loss": 0.2077, "step": 626 }, { "epoch": 0.292910447761194, "grad_norm": 1.5160586852181357, "learning_rate": 9.751552795031056e-06, "loss": 0.1941, "step": 628 }, { "epoch": 0.29384328358208955, "grad_norm": 1.5030995131499334, "learning_rate": 9.782608695652175e-06, "loss": 0.2262, "step": 630 }, { "epoch": 0.2947761194029851, "grad_norm": 1.6670193534167812, "learning_rate": 9.813664596273292e-06, "loss": 0.2215, "step": 632 }, { "epoch": 0.2957089552238806, "grad_norm": 1.427100498028422, "learning_rate": 9.844720496894411e-06, "loss": 0.2165, "step": 634 }, { "epoch": 0.2966417910447761, "grad_norm": 1.501137086209554, "learning_rate": 9.875776397515529e-06, "loss": 0.2255, "step": 636 }, { "epoch": 0.29757462686567165, "grad_norm": 1.4127915167067975, "learning_rate": 9.906832298136647e-06, "loss": 0.2076, "step": 638 }, { "epoch": 0.29850746268656714, "grad_norm": 1.6065120762122185, "learning_rate": 9.937888198757764e-06, "loss": 0.2198, "step": 640 }, { "epoch": 0.2994402985074627, "grad_norm": 1.970171686642738, "learning_rate": 9.968944099378883e-06, "loss": 0.2423, "step": 642 }, { "epoch": 0.3003731343283582, "grad_norm": 1.6876725397175933, "learning_rate": 1e-05, "loss": 0.2238, "step": 644 }, { "epoch": 0.30130597014925375, "grad_norm": 1.6225950050699995, "learning_rate": 9.99999705393274e-06, "loss": 0.2146, "step": 646 }, { "epoch": 0.30223880597014924, "grad_norm": 1.623384986651765, "learning_rate": 9.999988215734431e-06, "loss": 0.222, "step": 648 }, { "epoch": 0.3031716417910448, "grad_norm": 1.562280211952703, "learning_rate": 9.999973485415487e-06, "loss": 0.1901, "step": 650 }, { "epoch": 0.3041044776119403, "grad_norm": 1.9282897270749833, "learning_rate": 9.999952862993265e-06, "loss": 0.2388, "step": 652 }, { "epoch": 0.3050373134328358, "grad_norm": 1.6714150197964353, "learning_rate": 9.99992634849207e-06, "loss": 0.2177, "step": 654 }, { "epoch": 0.30597014925373134, "grad_norm": 1.6737911088642945, "learning_rate": 9.999893941943148e-06, "loss": 0.2046, "step": 656 }, { "epoch": 0.3069029850746269, "grad_norm": 1.7820731511284762, "learning_rate": 9.999855643384686e-06, "loss": 0.238, "step": 658 }, { "epoch": 0.30783582089552236, "grad_norm": 1.6100730701090555, "learning_rate": 9.999811452861817e-06, "loss": 0.2183, "step": 660 }, { "epoch": 0.3087686567164179, "grad_norm": 1.957239334896792, "learning_rate": 9.999761370426616e-06, "loss": 0.2323, "step": 662 }, { "epoch": 0.30970149253731344, "grad_norm": 1.7306655964277795, "learning_rate": 9.9997053961381e-06, "loss": 0.2407, "step": 664 }, { "epoch": 0.310634328358209, "grad_norm": 1.6647283309579866, "learning_rate": 9.999643530062232e-06, "loss": 0.2025, "step": 666 }, { "epoch": 0.31156716417910446, "grad_norm": 1.6232985705909533, "learning_rate": 9.999575772271917e-06, "loss": 0.2206, "step": 668 }, { "epoch": 0.3125, "grad_norm": 1.5143381053873757, "learning_rate": 9.999502122847003e-06, "loss": 0.211, "step": 670 }, { "epoch": 0.31343283582089554, "grad_norm": 1.6470145022132414, "learning_rate": 9.999422581874277e-06, "loss": 0.2065, "step": 672 }, { "epoch": 0.314365671641791, "grad_norm": 1.6251569640381565, "learning_rate": 9.999337149447477e-06, "loss": 0.2226, "step": 674 }, { "epoch": 0.31529850746268656, "grad_norm": 1.7086513223853053, "learning_rate": 9.999245825667275e-06, "loss": 0.248, "step": 676 }, { "epoch": 0.3162313432835821, "grad_norm": 1.9064116916209166, "learning_rate": 9.999148610641292e-06, "loss": 0.2331, "step": 678 }, { "epoch": 0.31716417910447764, "grad_norm": 1.5271764364621954, "learning_rate": 9.999045504484089e-06, "loss": 0.1978, "step": 680 }, { "epoch": 0.3180970149253731, "grad_norm": 1.5692069998589113, "learning_rate": 9.998936507317165e-06, "loss": 0.2265, "step": 682 }, { "epoch": 0.31902985074626866, "grad_norm": 1.8154077936443869, "learning_rate": 9.99882161926897e-06, "loss": 0.2045, "step": 684 }, { "epoch": 0.3199626865671642, "grad_norm": 1.5865182527224377, "learning_rate": 9.99870084047489e-06, "loss": 0.2152, "step": 686 }, { "epoch": 0.3208955223880597, "grad_norm": 1.567653226883952, "learning_rate": 9.998574171077252e-06, "loss": 0.2195, "step": 688 }, { "epoch": 0.3218283582089552, "grad_norm": 1.4771559447707423, "learning_rate": 9.998441611225329e-06, "loss": 0.1989, "step": 690 }, { "epoch": 0.32276119402985076, "grad_norm": 1.497709323554442, "learning_rate": 9.998303161075331e-06, "loss": 0.2054, "step": 692 }, { "epoch": 0.32369402985074625, "grad_norm": 1.691709816960022, "learning_rate": 9.998158820790412e-06, "loss": 0.2233, "step": 694 }, { "epoch": 0.3246268656716418, "grad_norm": 1.7240579630780002, "learning_rate": 9.99800859054067e-06, "loss": 0.2271, "step": 696 }, { "epoch": 0.3255597014925373, "grad_norm": 1.684673934833607, "learning_rate": 9.997852470503133e-06, "loss": 0.2319, "step": 698 }, { "epoch": 0.32649253731343286, "grad_norm": 1.4252784834985877, "learning_rate": 9.997690460861782e-06, "loss": 0.1736, "step": 700 }, { "epoch": 0.32742537313432835, "grad_norm": 1.579021485076662, "learning_rate": 9.997522561807534e-06, "loss": 0.2037, "step": 702 }, { "epoch": 0.3283582089552239, "grad_norm": 1.7710895889269371, "learning_rate": 9.997348773538244e-06, "loss": 0.2196, "step": 704 }, { "epoch": 0.3292910447761194, "grad_norm": 1.6752663084766275, "learning_rate": 9.99716909625871e-06, "loss": 0.2053, "step": 706 }, { "epoch": 0.3302238805970149, "grad_norm": 1.7811532579224665, "learning_rate": 9.996983530180669e-06, "loss": 0.2034, "step": 708 }, { "epoch": 0.33115671641791045, "grad_norm": 1.6934817244622433, "learning_rate": 9.996792075522795e-06, "loss": 0.2164, "step": 710 }, { "epoch": 0.332089552238806, "grad_norm": 1.5305419790452965, "learning_rate": 9.996594732510703e-06, "loss": 0.2123, "step": 712 }, { "epoch": 0.33302238805970147, "grad_norm": 1.6529334897646666, "learning_rate": 9.996391501376948e-06, "loss": 0.2222, "step": 714 }, { "epoch": 0.333955223880597, "grad_norm": 1.4988526085463474, "learning_rate": 9.996182382361027e-06, "loss": 0.2234, "step": 716 }, { "epoch": 0.33488805970149255, "grad_norm": 1.6813897787726346, "learning_rate": 9.995967375709365e-06, "loss": 0.2125, "step": 718 }, { "epoch": 0.3358208955223881, "grad_norm": 1.5640967738166371, "learning_rate": 9.995746481675335e-06, "loss": 0.204, "step": 720 }, { "epoch": 0.33675373134328357, "grad_norm": 1.546681241407843, "learning_rate": 9.995519700519246e-06, "loss": 0.2003, "step": 722 }, { "epoch": 0.3376865671641791, "grad_norm": 1.546258984412468, "learning_rate": 9.995287032508339e-06, "loss": 0.1977, "step": 724 }, { "epoch": 0.33861940298507465, "grad_norm": 1.4010002919438977, "learning_rate": 9.9950484779168e-06, "loss": 0.209, "step": 726 }, { "epoch": 0.33955223880597013, "grad_norm": 1.4525068288568763, "learning_rate": 9.994804037025745e-06, "loss": 0.2078, "step": 728 }, { "epoch": 0.34048507462686567, "grad_norm": 1.7068819400074136, "learning_rate": 9.994553710123233e-06, "loss": 0.2198, "step": 730 }, { "epoch": 0.3414179104477612, "grad_norm": 1.4631164118298687, "learning_rate": 9.994297497504253e-06, "loss": 0.2072, "step": 732 }, { "epoch": 0.3423507462686567, "grad_norm": 1.4731855366350495, "learning_rate": 9.994035399470733e-06, "loss": 0.2012, "step": 734 }, { "epoch": 0.34328358208955223, "grad_norm": 1.527380421003424, "learning_rate": 9.993767416331541e-06, "loss": 0.2089, "step": 736 }, { "epoch": 0.34421641791044777, "grad_norm": 1.5502413727467526, "learning_rate": 9.99349354840247e-06, "loss": 0.1886, "step": 738 }, { "epoch": 0.3451492537313433, "grad_norm": 1.6735399174550274, "learning_rate": 9.993213796006256e-06, "loss": 0.2135, "step": 740 }, { "epoch": 0.3460820895522388, "grad_norm": 1.5931793229897702, "learning_rate": 9.992928159472565e-06, "loss": 0.2203, "step": 742 }, { "epoch": 0.34701492537313433, "grad_norm": 1.6171684493427314, "learning_rate": 9.992636639138e-06, "loss": 0.2178, "step": 744 }, { "epoch": 0.34794776119402987, "grad_norm": 1.5619062519995661, "learning_rate": 9.992339235346096e-06, "loss": 0.1969, "step": 746 }, { "epoch": 0.34888059701492535, "grad_norm": 1.4643409964944438, "learning_rate": 9.992035948447322e-06, "loss": 0.218, "step": 748 }, { "epoch": 0.3498134328358209, "grad_norm": 1.6713694751278418, "learning_rate": 9.99172677879908e-06, "loss": 0.2345, "step": 750 }, { "epoch": 0.35074626865671643, "grad_norm": 1.6295968073285287, "learning_rate": 9.991411726765704e-06, "loss": 0.2313, "step": 752 }, { "epoch": 0.3516791044776119, "grad_norm": 1.492206227603743, "learning_rate": 9.991090792718458e-06, "loss": 0.2029, "step": 754 }, { "epoch": 0.35261194029850745, "grad_norm": 1.4920372023115085, "learning_rate": 9.99076397703554e-06, "loss": 0.2076, "step": 756 }, { "epoch": 0.353544776119403, "grad_norm": 1.6851720821939231, "learning_rate": 9.99043128010208e-06, "loss": 0.2241, "step": 758 }, { "epoch": 0.35447761194029853, "grad_norm": 1.4685009516373944, "learning_rate": 9.990092702310134e-06, "loss": 0.195, "step": 760 }, { "epoch": 0.355410447761194, "grad_norm": 1.4794010690122867, "learning_rate": 9.989748244058695e-06, "loss": 0.2171, "step": 762 }, { "epoch": 0.35634328358208955, "grad_norm": 1.4072133779038727, "learning_rate": 9.989397905753677e-06, "loss": 0.2068, "step": 764 }, { "epoch": 0.3572761194029851, "grad_norm": 1.5090128136103944, "learning_rate": 9.989041687807934e-06, "loss": 0.2259, "step": 766 }, { "epoch": 0.3582089552238806, "grad_norm": 1.6473468177048791, "learning_rate": 9.988679590641237e-06, "loss": 0.2362, "step": 768 }, { "epoch": 0.3591417910447761, "grad_norm": 1.5371299816829591, "learning_rate": 9.988311614680294e-06, "loss": 0.2094, "step": 770 }, { "epoch": 0.36007462686567165, "grad_norm": 1.5972460189685906, "learning_rate": 9.987937760358738e-06, "loss": 0.227, "step": 772 }, { "epoch": 0.36100746268656714, "grad_norm": 1.5854668624944792, "learning_rate": 9.987558028117129e-06, "loss": 0.2291, "step": 774 }, { "epoch": 0.3619402985074627, "grad_norm": 1.495419359749338, "learning_rate": 9.987172418402953e-06, "loss": 0.2107, "step": 776 }, { "epoch": 0.3628731343283582, "grad_norm": 1.5503509875180073, "learning_rate": 9.986780931670622e-06, "loss": 0.218, "step": 778 }, { "epoch": 0.36380597014925375, "grad_norm": 1.55747472502651, "learning_rate": 9.986383568381478e-06, "loss": 0.209, "step": 780 }, { "epoch": 0.36473880597014924, "grad_norm": 1.7543414208220027, "learning_rate": 9.98598032900378e-06, "loss": 0.2129, "step": 782 }, { "epoch": 0.3656716417910448, "grad_norm": 1.4477289658010006, "learning_rate": 9.985571214012717e-06, "loss": 0.1952, "step": 784 }, { "epoch": 0.3666044776119403, "grad_norm": 1.6518496996444825, "learning_rate": 9.985156223890405e-06, "loss": 0.2185, "step": 786 }, { "epoch": 0.3675373134328358, "grad_norm": 1.4856177884308097, "learning_rate": 9.984735359125875e-06, "loss": 0.2128, "step": 788 }, { "epoch": 0.36847014925373134, "grad_norm": 1.3852777645542962, "learning_rate": 9.984308620215087e-06, "loss": 0.2061, "step": 790 }, { "epoch": 0.3694029850746269, "grad_norm": 1.6573797639788397, "learning_rate": 9.983876007660924e-06, "loss": 0.2166, "step": 792 }, { "epoch": 0.37033582089552236, "grad_norm": 1.5564426202844768, "learning_rate": 9.983437521973184e-06, "loss": 0.2092, "step": 794 }, { "epoch": 0.3712686567164179, "grad_norm": 1.686966771664201, "learning_rate": 9.982993163668593e-06, "loss": 0.2267, "step": 796 }, { "epoch": 0.37220149253731344, "grad_norm": 1.4671061197216366, "learning_rate": 9.982542933270794e-06, "loss": 0.2052, "step": 798 }, { "epoch": 0.373134328358209, "grad_norm": 1.5197313893299038, "learning_rate": 9.982086831310351e-06, "loss": 0.2247, "step": 800 }, { "epoch": 0.37406716417910446, "grad_norm": 1.5059619567477527, "learning_rate": 9.981624858324747e-06, "loss": 0.2023, "step": 802 }, { "epoch": 0.375, "grad_norm": 1.373382755780966, "learning_rate": 9.981157014858384e-06, "loss": 0.2109, "step": 804 }, { "epoch": 0.37593283582089554, "grad_norm": 1.5497484919375448, "learning_rate": 9.98068330146258e-06, "loss": 0.2157, "step": 806 }, { "epoch": 0.376865671641791, "grad_norm": 1.5397780167593171, "learning_rate": 9.98020371869557e-06, "loss": 0.24, "step": 808 }, { "epoch": 0.37779850746268656, "grad_norm": 1.669189774635985, "learning_rate": 9.97971826712251e-06, "loss": 0.2128, "step": 810 }, { "epoch": 0.3787313432835821, "grad_norm": 1.4964614682006767, "learning_rate": 9.97922694731547e-06, "loss": 0.2023, "step": 812 }, { "epoch": 0.37966417910447764, "grad_norm": 1.4398142596198227, "learning_rate": 9.978729759853432e-06, "loss": 0.2063, "step": 814 }, { "epoch": 0.3805970149253731, "grad_norm": 1.3979734121897829, "learning_rate": 9.978226705322295e-06, "loss": 0.1968, "step": 816 }, { "epoch": 0.38152985074626866, "grad_norm": 1.491927436401735, "learning_rate": 9.977717784314875e-06, "loss": 0.2059, "step": 818 }, { "epoch": 0.3824626865671642, "grad_norm": 1.4357118787242178, "learning_rate": 9.977202997430895e-06, "loss": 0.2071, "step": 820 }, { "epoch": 0.3833955223880597, "grad_norm": 1.4119862490675763, "learning_rate": 9.976682345276995e-06, "loss": 0.2048, "step": 822 }, { "epoch": 0.3843283582089552, "grad_norm": 1.4669720696498303, "learning_rate": 9.976155828466725e-06, "loss": 0.1987, "step": 824 }, { "epoch": 0.38526119402985076, "grad_norm": 1.2344430086775962, "learning_rate": 9.975623447620549e-06, "loss": 0.1924, "step": 826 }, { "epoch": 0.38619402985074625, "grad_norm": 1.5652570148884641, "learning_rate": 9.975085203365834e-06, "loss": 0.1881, "step": 828 }, { "epoch": 0.3871268656716418, "grad_norm": 1.4754813636164474, "learning_rate": 9.974541096336865e-06, "loss": 0.2192, "step": 830 }, { "epoch": 0.3880597014925373, "grad_norm": 1.626829373370367, "learning_rate": 9.973991127174833e-06, "loss": 0.2366, "step": 832 }, { "epoch": 0.38899253731343286, "grad_norm": 1.4490460319497058, "learning_rate": 9.973435296527835e-06, "loss": 0.1859, "step": 834 }, { "epoch": 0.38992537313432835, "grad_norm": 1.403520038653334, "learning_rate": 9.972873605050878e-06, "loss": 0.2007, "step": 836 }, { "epoch": 0.3908582089552239, "grad_norm": 1.5661614246246318, "learning_rate": 9.97230605340587e-06, "loss": 0.2246, "step": 838 }, { "epoch": 0.3917910447761194, "grad_norm": 1.7033190578367239, "learning_rate": 9.971732642261635e-06, "loss": 0.2309, "step": 840 }, { "epoch": 0.3927238805970149, "grad_norm": 1.5226095161639421, "learning_rate": 9.971153372293893e-06, "loss": 0.2224, "step": 842 }, { "epoch": 0.39365671641791045, "grad_norm": 1.5398485144304783, "learning_rate": 9.970568244185272e-06, "loss": 0.2314, "step": 844 }, { "epoch": 0.394589552238806, "grad_norm": 1.4511269870109804, "learning_rate": 9.969977258625303e-06, "loss": 0.2171, "step": 846 }, { "epoch": 0.39552238805970147, "grad_norm": 1.5194540698178827, "learning_rate": 9.969380416310417e-06, "loss": 0.2172, "step": 848 }, { "epoch": 0.396455223880597, "grad_norm": 1.364259972259269, "learning_rate": 9.968777717943954e-06, "loss": 0.1977, "step": 850 }, { "epoch": 0.39738805970149255, "grad_norm": 1.4228579215618578, "learning_rate": 9.968169164236145e-06, "loss": 0.2053, "step": 852 }, { "epoch": 0.3983208955223881, "grad_norm": 1.6712416458354218, "learning_rate": 9.967554755904127e-06, "loss": 0.2176, "step": 854 }, { "epoch": 0.39925373134328357, "grad_norm": 1.5038386647715758, "learning_rate": 9.966934493671938e-06, "loss": 0.2133, "step": 856 }, { "epoch": 0.4001865671641791, "grad_norm": 1.3632283565658245, "learning_rate": 9.966308378270511e-06, "loss": 0.1994, "step": 858 }, { "epoch": 0.40111940298507465, "grad_norm": 1.5274204894929744, "learning_rate": 9.965676410437675e-06, "loss": 0.2362, "step": 860 }, { "epoch": 0.40205223880597013, "grad_norm": 1.37459075908075, "learning_rate": 9.965038590918157e-06, "loss": 0.2002, "step": 862 }, { "epoch": 0.40298507462686567, "grad_norm": 1.6354562797409415, "learning_rate": 9.964394920463586e-06, "loss": 0.2458, "step": 864 }, { "epoch": 0.4039179104477612, "grad_norm": 1.374731418579664, "learning_rate": 9.963745399832476e-06, "loss": 0.2146, "step": 866 }, { "epoch": 0.4048507462686567, "grad_norm": 1.464592418209558, "learning_rate": 9.96309002979024e-06, "loss": 0.1978, "step": 868 }, { "epoch": 0.40578358208955223, "grad_norm": 1.5533291604380253, "learning_rate": 9.962428811109187e-06, "loss": 0.2219, "step": 870 }, { "epoch": 0.40671641791044777, "grad_norm": 1.3856686275767705, "learning_rate": 9.961761744568512e-06, "loss": 0.2053, "step": 872 }, { "epoch": 0.4076492537313433, "grad_norm": 1.4182301324149096, "learning_rate": 9.961088830954304e-06, "loss": 0.2218, "step": 874 }, { "epoch": 0.4085820895522388, "grad_norm": 1.5276935803141443, "learning_rate": 9.960410071059543e-06, "loss": 0.2226, "step": 876 }, { "epoch": 0.40951492537313433, "grad_norm": 2.0528206448734267, "learning_rate": 9.959725465684099e-06, "loss": 0.2063, "step": 878 }, { "epoch": 0.41044776119402987, "grad_norm": 1.4855180601859719, "learning_rate": 9.959035015634727e-06, "loss": 0.1862, "step": 880 }, { "epoch": 0.41138059701492535, "grad_norm": 1.580306023027329, "learning_rate": 9.958338721725075e-06, "loss": 0.2037, "step": 882 }, { "epoch": 0.4123134328358209, "grad_norm": 1.4439857201693262, "learning_rate": 9.957636584775671e-06, "loss": 0.2075, "step": 884 }, { "epoch": 0.41324626865671643, "grad_norm": 1.4820780271477962, "learning_rate": 9.956928605613935e-06, "loss": 0.1951, "step": 886 }, { "epoch": 0.4141791044776119, "grad_norm": 1.6662032107522697, "learning_rate": 9.956214785074169e-06, "loss": 0.2113, "step": 888 }, { "epoch": 0.41511194029850745, "grad_norm": 1.6238607804516694, "learning_rate": 9.955495123997556e-06, "loss": 0.2051, "step": 890 }, { "epoch": 0.416044776119403, "grad_norm": 2.001493600922292, "learning_rate": 9.954769623232165e-06, "loss": 0.2034, "step": 892 }, { "epoch": 0.41697761194029853, "grad_norm": 1.4987183254960474, "learning_rate": 9.954038283632945e-06, "loss": 0.207, "step": 894 }, { "epoch": 0.417910447761194, "grad_norm": 2.395033978637096, "learning_rate": 9.953301106061728e-06, "loss": 0.1816, "step": 896 }, { "epoch": 0.41884328358208955, "grad_norm": 1.3809658665451596, "learning_rate": 9.952558091387224e-06, "loss": 0.1989, "step": 898 }, { "epoch": 0.4197761194029851, "grad_norm": 1.6058795299888213, "learning_rate": 9.951809240485017e-06, "loss": 0.2303, "step": 900 }, { "epoch": 0.4207089552238806, "grad_norm": 1.9037897042068004, "learning_rate": 9.951054554237579e-06, "loss": 0.222, "step": 902 }, { "epoch": 0.4216417910447761, "grad_norm": 1.4635743652473927, "learning_rate": 9.950294033534247e-06, "loss": 0.1955, "step": 904 }, { "epoch": 0.42257462686567165, "grad_norm": 1.2713707769627034, "learning_rate": 9.949527679271244e-06, "loss": 0.1822, "step": 906 }, { "epoch": 0.42350746268656714, "grad_norm": 1.5323208784487459, "learning_rate": 9.948755492351659e-06, "loss": 0.2172, "step": 908 }, { "epoch": 0.4244402985074627, "grad_norm": 1.3384965127252164, "learning_rate": 9.94797747368546e-06, "loss": 0.2022, "step": 910 }, { "epoch": 0.4253731343283582, "grad_norm": 1.5291696362819658, "learning_rate": 9.947193624189485e-06, "loss": 0.2258, "step": 912 }, { "epoch": 0.42630597014925375, "grad_norm": 1.4982152008948342, "learning_rate": 9.946403944787441e-06, "loss": 0.2147, "step": 914 }, { "epoch": 0.42723880597014924, "grad_norm": 1.4500793974667148, "learning_rate": 9.945608436409913e-06, "loss": 0.2051, "step": 916 }, { "epoch": 0.4281716417910448, "grad_norm": 1.3079337678373602, "learning_rate": 9.944807099994343e-06, "loss": 0.2033, "step": 918 }, { "epoch": 0.4291044776119403, "grad_norm": 1.3634353746403716, "learning_rate": 9.94399993648505e-06, "loss": 0.1973, "step": 920 }, { "epoch": 0.4300373134328358, "grad_norm": 1.3547395674282443, "learning_rate": 9.943186946833217e-06, "loss": 0.2012, "step": 922 }, { "epoch": 0.43097014925373134, "grad_norm": 1.5908366403478449, "learning_rate": 9.942368131996892e-06, "loss": 0.2197, "step": 924 }, { "epoch": 0.4319029850746269, "grad_norm": 1.7410389177979753, "learning_rate": 9.94154349294099e-06, "loss": 0.2272, "step": 926 }, { "epoch": 0.43283582089552236, "grad_norm": 1.4211703252136978, "learning_rate": 9.94071303063729e-06, "loss": 0.2053, "step": 928 }, { "epoch": 0.4337686567164179, "grad_norm": 1.393805940595914, "learning_rate": 9.939876746064427e-06, "loss": 0.1996, "step": 930 }, { "epoch": 0.43470149253731344, "grad_norm": 1.4740985194701417, "learning_rate": 9.939034640207902e-06, "loss": 0.2005, "step": 932 }, { "epoch": 0.435634328358209, "grad_norm": 1.257114759796336, "learning_rate": 9.938186714060077e-06, "loss": 0.1777, "step": 934 }, { "epoch": 0.43656716417910446, "grad_norm": 1.620236651876971, "learning_rate": 9.937332968620168e-06, "loss": 0.203, "step": 936 }, { "epoch": 0.4375, "grad_norm": 1.6029399275619836, "learning_rate": 9.936473404894256e-06, "loss": 0.2099, "step": 938 }, { "epoch": 0.43843283582089554, "grad_norm": 1.499678838053157, "learning_rate": 9.935608023895269e-06, "loss": 0.21, "step": 940 }, { "epoch": 0.439365671641791, "grad_norm": 1.4916831266989266, "learning_rate": 9.934736826643e-06, "loss": 0.2083, "step": 942 }, { "epoch": 0.44029850746268656, "grad_norm": 1.3116363666177426, "learning_rate": 9.933859814164089e-06, "loss": 0.2012, "step": 944 }, { "epoch": 0.4412313432835821, "grad_norm": 1.5184645183407464, "learning_rate": 9.932976987492029e-06, "loss": 0.2019, "step": 946 }, { "epoch": 0.44216417910447764, "grad_norm": 1.4435172238370206, "learning_rate": 9.93208834766717e-06, "loss": 0.2126, "step": 948 }, { "epoch": 0.4430970149253731, "grad_norm": 1.365991785854986, "learning_rate": 9.93119389573671e-06, "loss": 0.2066, "step": 950 }, { "epoch": 0.44402985074626866, "grad_norm": 1.337192667444216, "learning_rate": 9.93029363275469e-06, "loss": 0.2159, "step": 952 }, { "epoch": 0.4449626865671642, "grad_norm": 1.593228511395827, "learning_rate": 9.92938755978201e-06, "loss": 0.2147, "step": 954 }, { "epoch": 0.4458955223880597, "grad_norm": 1.312002367778796, "learning_rate": 9.928475677886407e-06, "loss": 0.1987, "step": 956 }, { "epoch": 0.4468283582089552, "grad_norm": 1.1959083930426961, "learning_rate": 9.927557988142467e-06, "loss": 0.193, "step": 958 }, { "epoch": 0.44776119402985076, "grad_norm": 1.266448270123119, "learning_rate": 9.926634491631623e-06, "loss": 0.1809, "step": 960 }, { "epoch": 0.44869402985074625, "grad_norm": 1.4053237076936411, "learning_rate": 9.925705189442145e-06, "loss": 0.194, "step": 962 }, { "epoch": 0.4496268656716418, "grad_norm": 1.3191029750043703, "learning_rate": 9.92477008266915e-06, "loss": 0.1801, "step": 964 }, { "epoch": 0.4505597014925373, "grad_norm": 1.4297129242923046, "learning_rate": 9.923829172414594e-06, "loss": 0.1937, "step": 966 }, { "epoch": 0.45149253731343286, "grad_norm": 1.1919091109463966, "learning_rate": 9.922882459787268e-06, "loss": 0.1878, "step": 968 }, { "epoch": 0.45242537313432835, "grad_norm": 1.4273355091449529, "learning_rate": 9.921929945902805e-06, "loss": 0.1963, "step": 970 }, { "epoch": 0.4533582089552239, "grad_norm": 1.420964861407647, "learning_rate": 9.920971631883673e-06, "loss": 0.1831, "step": 972 }, { "epoch": 0.4542910447761194, "grad_norm": 1.624875105719104, "learning_rate": 9.920007518859175e-06, "loss": 0.2394, "step": 974 }, { "epoch": 0.4552238805970149, "grad_norm": 1.4956135828361246, "learning_rate": 9.919037607965447e-06, "loss": 0.2032, "step": 976 }, { "epoch": 0.45615671641791045, "grad_norm": 1.3943613205695262, "learning_rate": 9.91806190034546e-06, "loss": 0.2167, "step": 978 }, { "epoch": 0.457089552238806, "grad_norm": 1.3099283306452394, "learning_rate": 9.917080397149013e-06, "loss": 0.1824, "step": 980 }, { "epoch": 0.45802238805970147, "grad_norm": 1.3819954082344559, "learning_rate": 9.916093099532733e-06, "loss": 0.1802, "step": 982 }, { "epoch": 0.458955223880597, "grad_norm": 1.5179159283297041, "learning_rate": 9.915100008660083e-06, "loss": 0.235, "step": 984 }, { "epoch": 0.45988805970149255, "grad_norm": 1.5207608234482923, "learning_rate": 9.914101125701346e-06, "loss": 0.2228, "step": 986 }, { "epoch": 0.4608208955223881, "grad_norm": 1.2521446332959851, "learning_rate": 9.913096451833631e-06, "loss": 0.1893, "step": 988 }, { "epoch": 0.46175373134328357, "grad_norm": 1.3787201715183737, "learning_rate": 9.912085988240873e-06, "loss": 0.1915, "step": 990 }, { "epoch": 0.4626865671641791, "grad_norm": 1.3829452250332792, "learning_rate": 9.911069736113831e-06, "loss": 0.1922, "step": 992 }, { "epoch": 0.46361940298507465, "grad_norm": 1.4766445985578276, "learning_rate": 9.910047696650086e-06, "loss": 0.2006, "step": 994 }, { "epoch": 0.46455223880597013, "grad_norm": 1.4891343123368928, "learning_rate": 9.909019871054032e-06, "loss": 0.2116, "step": 996 }, { "epoch": 0.46548507462686567, "grad_norm": 2.37188044041739, "learning_rate": 9.907986260536888e-06, "loss": 0.2138, "step": 998 }, { "epoch": 0.4664179104477612, "grad_norm": 1.3287044631705238, "learning_rate": 9.906946866316688e-06, "loss": 0.2163, "step": 1000 }, { "epoch": 0.4664179104477612, "eval_loss": 0.18396539986133575, "eval_runtime": 321.6955, "eval_samples_per_second": 47.393, "eval_steps_per_second": 5.925, "step": 1000 }, { "epoch": 0.4673507462686567, "grad_norm": 1.57878750964256, "learning_rate": 9.905901689618287e-06, "loss": 0.2321, "step": 1002 }, { "epoch": 0.46828358208955223, "grad_norm": 1.3222528960853128, "learning_rate": 9.904850731673342e-06, "loss": 0.1904, "step": 1004 }, { "epoch": 0.46921641791044777, "grad_norm": 1.3134015236511882, "learning_rate": 9.903793993720333e-06, "loss": 0.2007, "step": 1006 }, { "epoch": 0.4701492537313433, "grad_norm": 1.4962402402404267, "learning_rate": 9.902731477004552e-06, "loss": 0.2105, "step": 1008 }, { "epoch": 0.4710820895522388, "grad_norm": 1.4701508149160272, "learning_rate": 9.901663182778091e-06, "loss": 0.2213, "step": 1010 }, { "epoch": 0.47201492537313433, "grad_norm": 1.5957657316140008, "learning_rate": 9.900589112299862e-06, "loss": 0.1872, "step": 1012 }, { "epoch": 0.47294776119402987, "grad_norm": 1.4250689017342073, "learning_rate": 9.899509266835575e-06, "loss": 0.2115, "step": 1014 }, { "epoch": 0.47388059701492535, "grad_norm": 1.5295044551866799, "learning_rate": 9.89842364765775e-06, "loss": 0.2004, "step": 1016 }, { "epoch": 0.4748134328358209, "grad_norm": 1.375386556379696, "learning_rate": 9.897332256045712e-06, "loss": 0.2007, "step": 1018 }, { "epoch": 0.47574626865671643, "grad_norm": 1.9168976412029832, "learning_rate": 9.896235093285583e-06, "loss": 0.1938, "step": 1020 }, { "epoch": 0.4766791044776119, "grad_norm": 1.3912354039091317, "learning_rate": 9.89513216067029e-06, "loss": 0.2267, "step": 1022 }, { "epoch": 0.47761194029850745, "grad_norm": 1.2609253736383512, "learning_rate": 9.894023459499562e-06, "loss": 0.1931, "step": 1024 }, { "epoch": 0.478544776119403, "grad_norm": 1.4088269195196246, "learning_rate": 9.892908991079917e-06, "loss": 0.2268, "step": 1026 }, { "epoch": 0.47947761194029853, "grad_norm": 1.6400729111214873, "learning_rate": 9.891788756724676e-06, "loss": 0.203, "step": 1028 }, { "epoch": 0.480410447761194, "grad_norm": 1.3105879597650345, "learning_rate": 9.890662757753955e-06, "loss": 0.187, "step": 1030 }, { "epoch": 0.48134328358208955, "grad_norm": 1.3151903558889866, "learning_rate": 9.889530995494661e-06, "loss": 0.1913, "step": 1032 }, { "epoch": 0.4822761194029851, "grad_norm": 1.3362909932257605, "learning_rate": 9.888393471280493e-06, "loss": 0.2136, "step": 1034 }, { "epoch": 0.4832089552238806, "grad_norm": 1.4057266951433458, "learning_rate": 9.88725018645194e-06, "loss": 0.1947, "step": 1036 }, { "epoch": 0.4841417910447761, "grad_norm": 1.3426572746891023, "learning_rate": 9.886101142356278e-06, "loss": 0.211, "step": 1038 }, { "epoch": 0.48507462686567165, "grad_norm": 1.3769734591394878, "learning_rate": 9.884946340347574e-06, "loss": 0.1998, "step": 1040 }, { "epoch": 0.48600746268656714, "grad_norm": 1.321678508254731, "learning_rate": 9.883785781786676e-06, "loss": 0.2082, "step": 1042 }, { "epoch": 0.4869402985074627, "grad_norm": 1.378118711944151, "learning_rate": 9.882619468041219e-06, "loss": 0.1964, "step": 1044 }, { "epoch": 0.4878731343283582, "grad_norm": 1.3887007023835891, "learning_rate": 9.881447400485617e-06, "loss": 0.2226, "step": 1046 }, { "epoch": 0.48880597014925375, "grad_norm": 1.3205681224821462, "learning_rate": 9.880269580501067e-06, "loss": 0.2064, "step": 1048 }, { "epoch": 0.48973880597014924, "grad_norm": 1.3299044190591582, "learning_rate": 9.879086009475544e-06, "loss": 0.1847, "step": 1050 }, { "epoch": 0.4906716417910448, "grad_norm": 1.4086202941072181, "learning_rate": 9.8778966888038e-06, "loss": 0.207, "step": 1052 }, { "epoch": 0.4916044776119403, "grad_norm": 1.359722655692275, "learning_rate": 9.876701619887358e-06, "loss": 0.2195, "step": 1054 }, { "epoch": 0.4925373134328358, "grad_norm": 1.521119863925835, "learning_rate": 9.875500804134525e-06, "loss": 0.193, "step": 1056 }, { "epoch": 0.49347014925373134, "grad_norm": 1.3767230351467374, "learning_rate": 9.874294242960374e-06, "loss": 0.2268, "step": 1058 }, { "epoch": 0.4944029850746269, "grad_norm": 1.3086201683104317, "learning_rate": 9.873081937786746e-06, "loss": 0.1827, "step": 1060 }, { "epoch": 0.49533582089552236, "grad_norm": 1.6868000799300134, "learning_rate": 9.871863890042256e-06, "loss": 0.2084, "step": 1062 }, { "epoch": 0.4962686567164179, "grad_norm": 1.4517644975136033, "learning_rate": 9.870640101162286e-06, "loss": 0.1959, "step": 1064 }, { "epoch": 0.49720149253731344, "grad_norm": 1.2860344337333347, "learning_rate": 9.869410572588978e-06, "loss": 0.2003, "step": 1066 }, { "epoch": 0.498134328358209, "grad_norm": 1.4260945511661838, "learning_rate": 9.868175305771243e-06, "loss": 0.2079, "step": 1068 }, { "epoch": 0.49906716417910446, "grad_norm": 1.487363730524502, "learning_rate": 9.866934302164755e-06, "loss": 0.218, "step": 1070 }, { "epoch": 0.5, "grad_norm": 1.492521015928695, "learning_rate": 9.865687563231943e-06, "loss": 0.2125, "step": 1072 }, { "epoch": 0.5009328358208955, "grad_norm": 1.402308667183317, "learning_rate": 9.864435090442e-06, "loss": 0.2089, "step": 1074 }, { "epoch": 0.5018656716417911, "grad_norm": 1.1955444297697637, "learning_rate": 9.86317688527087e-06, "loss": 0.1933, "step": 1076 }, { "epoch": 0.5027985074626866, "grad_norm": 1.289349819050055, "learning_rate": 9.86191294920126e-06, "loss": 0.1968, "step": 1078 }, { "epoch": 0.503731343283582, "grad_norm": 1.2885086835168242, "learning_rate": 9.860643283722625e-06, "loss": 0.1663, "step": 1080 }, { "epoch": 0.5046641791044776, "grad_norm": 1.4330258463642025, "learning_rate": 9.859367890331173e-06, "loss": 0.2076, "step": 1082 }, { "epoch": 0.5055970149253731, "grad_norm": 1.2719656782807578, "learning_rate": 9.85808677052986e-06, "loss": 0.1955, "step": 1084 }, { "epoch": 0.5065298507462687, "grad_norm": 1.4516382311318605, "learning_rate": 9.856799925828393e-06, "loss": 0.2156, "step": 1086 }, { "epoch": 0.5074626865671642, "grad_norm": 1.3257407906976475, "learning_rate": 9.855507357743225e-06, "loss": 0.1964, "step": 1088 }, { "epoch": 0.5083955223880597, "grad_norm": 1.5135934349749167, "learning_rate": 9.854209067797553e-06, "loss": 0.202, "step": 1090 }, { "epoch": 0.5093283582089553, "grad_norm": 1.3287017566366777, "learning_rate": 9.852905057521317e-06, "loss": 0.1931, "step": 1092 }, { "epoch": 0.5102611940298507, "grad_norm": 1.3814961249786555, "learning_rate": 9.851595328451198e-06, "loss": 0.2107, "step": 1094 }, { "epoch": 0.5111940298507462, "grad_norm": 1.3474074530170514, "learning_rate": 9.850279882130613e-06, "loss": 0.2107, "step": 1096 }, { "epoch": 0.5121268656716418, "grad_norm": 1.3419692034630626, "learning_rate": 9.848958720109724e-06, "loss": 0.2067, "step": 1098 }, { "epoch": 0.5130597014925373, "grad_norm": 1.2939110597186954, "learning_rate": 9.847631843945421e-06, "loss": 0.1906, "step": 1100 }, { "epoch": 0.5139925373134329, "grad_norm": 1.3557742630802896, "learning_rate": 9.846299255201332e-06, "loss": 0.199, "step": 1102 }, { "epoch": 0.5149253731343284, "grad_norm": 1.3747405046733814, "learning_rate": 9.844960955447813e-06, "loss": 0.2199, "step": 1104 }, { "epoch": 0.5158582089552238, "grad_norm": 1.4500635932391988, "learning_rate": 9.843616946261956e-06, "loss": 0.2052, "step": 1106 }, { "epoch": 0.5167910447761194, "grad_norm": 1.4121169414478871, "learning_rate": 9.842267229227573e-06, "loss": 0.1923, "step": 1108 }, { "epoch": 0.5177238805970149, "grad_norm": 1.3440269181422098, "learning_rate": 9.840911805935211e-06, "loss": 0.1702, "step": 1110 }, { "epoch": 0.5186567164179104, "grad_norm": 1.403115977716643, "learning_rate": 9.839550677982137e-06, "loss": 0.1992, "step": 1112 }, { "epoch": 0.519589552238806, "grad_norm": 1.0990333537752326, "learning_rate": 9.838183846972337e-06, "loss": 0.1808, "step": 1114 }, { "epoch": 0.5205223880597015, "grad_norm": 1.3679893571279134, "learning_rate": 9.836811314516526e-06, "loss": 0.1985, "step": 1116 }, { "epoch": 0.5214552238805971, "grad_norm": 1.29005411066547, "learning_rate": 9.83543308223213e-06, "loss": 0.1895, "step": 1118 }, { "epoch": 0.5223880597014925, "grad_norm": 1.3141946336352162, "learning_rate": 9.834049151743296e-06, "loss": 0.1911, "step": 1120 }, { "epoch": 0.523320895522388, "grad_norm": 1.3472585769171632, "learning_rate": 9.832659524680886e-06, "loss": 0.1829, "step": 1122 }, { "epoch": 0.5242537313432836, "grad_norm": 1.4251642211467288, "learning_rate": 9.831264202682474e-06, "loss": 0.2276, "step": 1124 }, { "epoch": 0.5251865671641791, "grad_norm": 1.3075521549092934, "learning_rate": 9.82986318739234e-06, "loss": 0.1995, "step": 1126 }, { "epoch": 0.5261194029850746, "grad_norm": 1.3264911775768808, "learning_rate": 9.828456480461486e-06, "loss": 0.2019, "step": 1128 }, { "epoch": 0.5270522388059702, "grad_norm": 1.2812570885993706, "learning_rate": 9.82704408354761e-06, "loss": 0.1906, "step": 1130 }, { "epoch": 0.5279850746268657, "grad_norm": 1.295091048232688, "learning_rate": 9.825625998315117e-06, "loss": 0.1875, "step": 1132 }, { "epoch": 0.5289179104477612, "grad_norm": 1.5095428901850168, "learning_rate": 9.824202226435116e-06, "loss": 0.1839, "step": 1134 }, { "epoch": 0.5298507462686567, "grad_norm": 1.2281376530748465, "learning_rate": 9.82277276958542e-06, "loss": 0.1828, "step": 1136 }, { "epoch": 0.5307835820895522, "grad_norm": 1.4547432072625253, "learning_rate": 9.82133762945054e-06, "loss": 0.2031, "step": 1138 }, { "epoch": 0.5317164179104478, "grad_norm": 1.2675070633970242, "learning_rate": 9.819896807721682e-06, "loss": 0.1942, "step": 1140 }, { "epoch": 0.5326492537313433, "grad_norm": 1.4592301039817077, "learning_rate": 9.818450306096752e-06, "loss": 0.2331, "step": 1142 }, { "epoch": 0.5335820895522388, "grad_norm": 1.1662531402310314, "learning_rate": 9.816998126280345e-06, "loss": 0.2083, "step": 1144 }, { "epoch": 0.5345149253731343, "grad_norm": 1.3143246465827019, "learning_rate": 9.815540269983745e-06, "loss": 0.2025, "step": 1146 }, { "epoch": 0.5354477611940298, "grad_norm": 1.4739554155865877, "learning_rate": 9.814076738924934e-06, "loss": 0.2148, "step": 1148 }, { "epoch": 0.5363805970149254, "grad_norm": 1.3248478807110968, "learning_rate": 9.812607534828576e-06, "loss": 0.2223, "step": 1150 }, { "epoch": 0.5373134328358209, "grad_norm": 1.3260805713728958, "learning_rate": 9.811132659426018e-06, "loss": 0.1778, "step": 1152 }, { "epoch": 0.5382462686567164, "grad_norm": 1.3533071643276118, "learning_rate": 9.809652114455292e-06, "loss": 0.2103, "step": 1154 }, { "epoch": 0.539179104477612, "grad_norm": 1.3850454239020296, "learning_rate": 9.808165901661117e-06, "loss": 0.1889, "step": 1156 }, { "epoch": 0.5401119402985075, "grad_norm": 1.2284191439984462, "learning_rate": 9.806674022794884e-06, "loss": 0.2, "step": 1158 }, { "epoch": 0.5410447761194029, "grad_norm": 1.6822062090059398, "learning_rate": 9.805176479614661e-06, "loss": 0.237, "step": 1160 }, { "epoch": 0.5419776119402985, "grad_norm": 1.268048215194861, "learning_rate": 9.803673273885195e-06, "loss": 0.2088, "step": 1162 }, { "epoch": 0.542910447761194, "grad_norm": 1.2288813185660052, "learning_rate": 9.802164407377905e-06, "loss": 0.2138, "step": 1164 }, { "epoch": 0.5438432835820896, "grad_norm": 1.2222635278760878, "learning_rate": 9.800649881870877e-06, "loss": 0.2091, "step": 1166 }, { "epoch": 0.5447761194029851, "grad_norm": 1.4424946066311257, "learning_rate": 9.79912969914887e-06, "loss": 0.199, "step": 1168 }, { "epoch": 0.5457089552238806, "grad_norm": 1.2926068446171315, "learning_rate": 9.797603861003311e-06, "loss": 0.1697, "step": 1170 }, { "epoch": 0.5466417910447762, "grad_norm": 1.12685431622789, "learning_rate": 9.796072369232283e-06, "loss": 0.1728, "step": 1172 }, { "epoch": 0.5475746268656716, "grad_norm": 1.3114420467168177, "learning_rate": 9.794535225640544e-06, "loss": 0.2013, "step": 1174 }, { "epoch": 0.5485074626865671, "grad_norm": 1.3661627874545268, "learning_rate": 9.7929924320395e-06, "loss": 0.1773, "step": 1176 }, { "epoch": 0.5494402985074627, "grad_norm": 1.2520733508228175, "learning_rate": 9.791443990247221e-06, "loss": 0.1973, "step": 1178 }, { "epoch": 0.5503731343283582, "grad_norm": 1.363288918506065, "learning_rate": 9.789889902088435e-06, "loss": 0.2135, "step": 1180 }, { "epoch": 0.5513059701492538, "grad_norm": 1.2144120727241154, "learning_rate": 9.78833016939452e-06, "loss": 0.194, "step": 1182 }, { "epoch": 0.5522388059701493, "grad_norm": 1.2569272956554611, "learning_rate": 9.786764794003507e-06, "loss": 0.1675, "step": 1184 }, { "epoch": 0.5531716417910447, "grad_norm": 1.3627683164721545, "learning_rate": 9.785193777760075e-06, "loss": 0.2228, "step": 1186 }, { "epoch": 0.5541044776119403, "grad_norm": 1.280885062870962, "learning_rate": 9.783617122515554e-06, "loss": 0.1742, "step": 1188 }, { "epoch": 0.5550373134328358, "grad_norm": 1.196759223910446, "learning_rate": 9.782034830127916e-06, "loss": 0.1807, "step": 1190 }, { "epoch": 0.5559701492537313, "grad_norm": 1.2585826253661034, "learning_rate": 9.780446902461778e-06, "loss": 0.1936, "step": 1192 }, { "epoch": 0.5569029850746269, "grad_norm": 1.3570128331998361, "learning_rate": 9.778853341388397e-06, "loss": 0.1917, "step": 1194 }, { "epoch": 0.5578358208955224, "grad_norm": 1.3831013550474431, "learning_rate": 9.777254148785665e-06, "loss": 0.1952, "step": 1196 }, { "epoch": 0.558768656716418, "grad_norm": 1.1801792287210577, "learning_rate": 9.775649326538115e-06, "loss": 0.1894, "step": 1198 }, { "epoch": 0.5597014925373134, "grad_norm": 1.2394044831214288, "learning_rate": 9.774038876536915e-06, "loss": 0.1878, "step": 1200 }, { "epoch": 0.5606343283582089, "grad_norm": 1.372919737346026, "learning_rate": 9.772422800679859e-06, "loss": 0.2237, "step": 1202 }, { "epoch": 0.5615671641791045, "grad_norm": 1.2246685151667145, "learning_rate": 9.770801100871377e-06, "loss": 0.1957, "step": 1204 }, { "epoch": 0.5625, "grad_norm": 1.3216940992177963, "learning_rate": 9.769173779022525e-06, "loss": 0.201, "step": 1206 }, { "epoch": 0.5634328358208955, "grad_norm": 1.4574705082260384, "learning_rate": 9.767540837050978e-06, "loss": 0.1934, "step": 1208 }, { "epoch": 0.5643656716417911, "grad_norm": 1.4024977043721756, "learning_rate": 9.765902276881043e-06, "loss": 0.2063, "step": 1210 }, { "epoch": 0.5652985074626866, "grad_norm": 1.291650160322974, "learning_rate": 9.764258100443641e-06, "loss": 0.2065, "step": 1212 }, { "epoch": 0.566231343283582, "grad_norm": 1.4338365153537704, "learning_rate": 9.762608309676315e-06, "loss": 0.2178, "step": 1214 }, { "epoch": 0.5671641791044776, "grad_norm": 1.2157239415060652, "learning_rate": 9.760952906523223e-06, "loss": 0.1972, "step": 1216 }, { "epoch": 0.5680970149253731, "grad_norm": 1.3194975368927422, "learning_rate": 9.759291892935135e-06, "loss": 0.1764, "step": 1218 }, { "epoch": 0.5690298507462687, "grad_norm": 1.2348920755812913, "learning_rate": 9.757625270869437e-06, "loss": 0.19, "step": 1220 }, { "epoch": 0.5699626865671642, "grad_norm": 1.3197574539922416, "learning_rate": 9.755953042290116e-06, "loss": 0.2096, "step": 1222 }, { "epoch": 0.5708955223880597, "grad_norm": 1.4130620736835018, "learning_rate": 9.754275209167779e-06, "loss": 0.1988, "step": 1224 }, { "epoch": 0.5718283582089553, "grad_norm": 1.2923839968760171, "learning_rate": 9.752591773479622e-06, "loss": 0.2032, "step": 1226 }, { "epoch": 0.5727611940298507, "grad_norm": 1.2126080773255015, "learning_rate": 9.750902737209456e-06, "loss": 0.2058, "step": 1228 }, { "epoch": 0.5736940298507462, "grad_norm": 1.2680649551355194, "learning_rate": 9.749208102347684e-06, "loss": 0.1674, "step": 1230 }, { "epoch": 0.5746268656716418, "grad_norm": 1.2735016462093087, "learning_rate": 9.747507870891311e-06, "loss": 0.205, "step": 1232 }, { "epoch": 0.5755597014925373, "grad_norm": 1.5332136085223254, "learning_rate": 9.745802044843935e-06, "loss": 0.2426, "step": 1234 }, { "epoch": 0.5764925373134329, "grad_norm": 1.1731323643817357, "learning_rate": 9.744090626215745e-06, "loss": 0.1828, "step": 1236 }, { "epoch": 0.5774253731343284, "grad_norm": 1.2990854061105677, "learning_rate": 9.742373617023527e-06, "loss": 0.2037, "step": 1238 }, { "epoch": 0.5783582089552238, "grad_norm": 1.25919557077718, "learning_rate": 9.740651019290648e-06, "loss": 0.1856, "step": 1240 }, { "epoch": 0.5792910447761194, "grad_norm": 1.2880285382358647, "learning_rate": 9.738922835047065e-06, "loss": 0.2159, "step": 1242 }, { "epoch": 0.5802238805970149, "grad_norm": 1.3431597710356027, "learning_rate": 9.737189066329314e-06, "loss": 0.2026, "step": 1244 }, { "epoch": 0.5811567164179104, "grad_norm": 1.237926115980232, "learning_rate": 9.735449715180518e-06, "loss": 0.1939, "step": 1246 }, { "epoch": 0.582089552238806, "grad_norm": 1.416076845947073, "learning_rate": 9.733704783650374e-06, "loss": 0.1932, "step": 1248 }, { "epoch": 0.5830223880597015, "grad_norm": 1.3086449118802606, "learning_rate": 9.731954273795155e-06, "loss": 0.2094, "step": 1250 }, { "epoch": 0.5839552238805971, "grad_norm": 1.4112186537179239, "learning_rate": 9.73019818767771e-06, "loss": 0.2065, "step": 1252 }, { "epoch": 0.5848880597014925, "grad_norm": 1.1933845249042692, "learning_rate": 9.72843652736746e-06, "loss": 0.1787, "step": 1254 }, { "epoch": 0.585820895522388, "grad_norm": 1.290817590513132, "learning_rate": 9.72666929494039e-06, "loss": 0.1909, "step": 1256 }, { "epoch": 0.5867537313432836, "grad_norm": 1.2977497377639067, "learning_rate": 9.724896492479057e-06, "loss": 0.1884, "step": 1258 }, { "epoch": 0.5876865671641791, "grad_norm": 1.2916204111336886, "learning_rate": 9.723118122072575e-06, "loss": 0.2086, "step": 1260 }, { "epoch": 0.5886194029850746, "grad_norm": 1.276746671742708, "learning_rate": 9.721334185816627e-06, "loss": 0.2035, "step": 1262 }, { "epoch": 0.5895522388059702, "grad_norm": 1.2487362167598965, "learning_rate": 9.71954468581345e-06, "loss": 0.2147, "step": 1264 }, { "epoch": 0.5904850746268657, "grad_norm": 1.217991598150849, "learning_rate": 9.717749624171842e-06, "loss": 0.1729, "step": 1266 }, { "epoch": 0.5914179104477612, "grad_norm": 1.2305434973699685, "learning_rate": 9.715949003007145e-06, "loss": 0.1803, "step": 1268 }, { "epoch": 0.5923507462686567, "grad_norm": 1.3312002886023842, "learning_rate": 9.714142824441268e-06, "loss": 0.2177, "step": 1270 }, { "epoch": 0.5932835820895522, "grad_norm": 1.3622656583786625, "learning_rate": 9.712331090602654e-06, "loss": 0.2022, "step": 1272 }, { "epoch": 0.5942164179104478, "grad_norm": 1.1678229978345787, "learning_rate": 9.7105138036263e-06, "loss": 0.185, "step": 1274 }, { "epoch": 0.5951492537313433, "grad_norm": 1.3689661767316197, "learning_rate": 9.708690965653749e-06, "loss": 0.1842, "step": 1276 }, { "epoch": 0.5960820895522388, "grad_norm": 1.4710998548894345, "learning_rate": 9.70686257883308e-06, "loss": 0.197, "step": 1278 }, { "epoch": 0.5970149253731343, "grad_norm": 1.4369031982151708, "learning_rate": 9.705028645318913e-06, "loss": 0.1866, "step": 1280 }, { "epoch": 0.5979477611940298, "grad_norm": 1.3143018413171865, "learning_rate": 9.703189167272404e-06, "loss": 0.1969, "step": 1282 }, { "epoch": 0.5988805970149254, "grad_norm": 1.3181024885453547, "learning_rate": 9.701344146861246e-06, "loss": 0.2021, "step": 1284 }, { "epoch": 0.5998134328358209, "grad_norm": 1.342992985960691, "learning_rate": 9.699493586259658e-06, "loss": 0.1839, "step": 1286 }, { "epoch": 0.6007462686567164, "grad_norm": 1.3035694458919886, "learning_rate": 9.697637487648392e-06, "loss": 0.1906, "step": 1288 }, { "epoch": 0.601679104477612, "grad_norm": 1.261927746228036, "learning_rate": 9.695775853214725e-06, "loss": 0.1895, "step": 1290 }, { "epoch": 0.6026119402985075, "grad_norm": 1.2777086230160113, "learning_rate": 9.693908685152456e-06, "loss": 0.1865, "step": 1292 }, { "epoch": 0.6035447761194029, "grad_norm": 1.2120672739654221, "learning_rate": 9.692035985661906e-06, "loss": 0.2036, "step": 1294 }, { "epoch": 0.6044776119402985, "grad_norm": 1.2677580779039375, "learning_rate": 9.690157756949914e-06, "loss": 0.1968, "step": 1296 }, { "epoch": 0.605410447761194, "grad_norm": 1.3788989498698019, "learning_rate": 9.688274001229838e-06, "loss": 0.1732, "step": 1298 }, { "epoch": 0.6063432835820896, "grad_norm": 1.285932007125996, "learning_rate": 9.686384720721543e-06, "loss": 0.2002, "step": 1300 }, { "epoch": 0.6072761194029851, "grad_norm": 1.3150008423127384, "learning_rate": 9.684489917651409e-06, "loss": 0.1835, "step": 1302 }, { "epoch": 0.6082089552238806, "grad_norm": 1.3903895527118488, "learning_rate": 9.682589594252325e-06, "loss": 0.1968, "step": 1304 }, { "epoch": 0.6091417910447762, "grad_norm": 1.2665624601541696, "learning_rate": 9.68068375276368e-06, "loss": 0.1982, "step": 1306 }, { "epoch": 0.6100746268656716, "grad_norm": 1.3297377918765387, "learning_rate": 9.678772395431371e-06, "loss": 0.219, "step": 1308 }, { "epoch": 0.6110074626865671, "grad_norm": 1.2825309928521964, "learning_rate": 9.676855524507793e-06, "loss": 0.1718, "step": 1310 }, { "epoch": 0.6119402985074627, "grad_norm": 1.2282825080891733, "learning_rate": 9.674933142251836e-06, "loss": 0.1937, "step": 1312 }, { "epoch": 0.6128731343283582, "grad_norm": 1.2632075190559702, "learning_rate": 9.67300525092889e-06, "loss": 0.1969, "step": 1314 }, { "epoch": 0.6138059701492538, "grad_norm": 1.2811311953257711, "learning_rate": 9.671071852810832e-06, "loss": 0.219, "step": 1316 }, { "epoch": 0.6147388059701493, "grad_norm": 1.2853508640105762, "learning_rate": 9.66913295017603e-06, "loss": 0.186, "step": 1318 }, { "epoch": 0.6156716417910447, "grad_norm": 1.2313760707837031, "learning_rate": 9.667188545309342e-06, "loss": 0.1822, "step": 1320 }, { "epoch": 0.6166044776119403, "grad_norm": 1.3445600658444656, "learning_rate": 9.665238640502104e-06, "loss": 0.1872, "step": 1322 }, { "epoch": 0.6175373134328358, "grad_norm": 1.8331986833418121, "learning_rate": 9.663283238052136e-06, "loss": 0.1951, "step": 1324 }, { "epoch": 0.6184701492537313, "grad_norm": 1.337579658657769, "learning_rate": 9.66132234026374e-06, "loss": 0.203, "step": 1326 }, { "epoch": 0.6194029850746269, "grad_norm": 1.2549271918408587, "learning_rate": 9.659355949447689e-06, "loss": 0.1862, "step": 1328 }, { "epoch": 0.6203358208955224, "grad_norm": 1.128214501303178, "learning_rate": 9.657384067921229e-06, "loss": 0.191, "step": 1330 }, { "epoch": 0.621268656716418, "grad_norm": 1.2270654230039046, "learning_rate": 9.65540669800808e-06, "loss": 0.1932, "step": 1332 }, { "epoch": 0.6222014925373134, "grad_norm": 1.2575667177230767, "learning_rate": 9.65342384203843e-06, "loss": 0.2024, "step": 1334 }, { "epoch": 0.6231343283582089, "grad_norm": 1.2188705632524715, "learning_rate": 9.651435502348927e-06, "loss": 0.181, "step": 1336 }, { "epoch": 0.6240671641791045, "grad_norm": 1.4700426250468017, "learning_rate": 9.649441681282682e-06, "loss": 0.2092, "step": 1338 }, { "epoch": 0.625, "grad_norm": 1.847917231034415, "learning_rate": 9.647442381189273e-06, "loss": 0.1969, "step": 1340 }, { "epoch": 0.6259328358208955, "grad_norm": 1.2257028686491196, "learning_rate": 9.645437604424726e-06, "loss": 0.1877, "step": 1342 }, { "epoch": 0.6268656716417911, "grad_norm": 1.1912266570731185, "learning_rate": 9.643427353351522e-06, "loss": 0.2148, "step": 1344 }, { "epoch": 0.6277985074626866, "grad_norm": 1.3102113230009198, "learning_rate": 9.641411630338598e-06, "loss": 0.2079, "step": 1346 }, { "epoch": 0.628731343283582, "grad_norm": 1.3472399048562653, "learning_rate": 9.639390437761334e-06, "loss": 0.2002, "step": 1348 }, { "epoch": 0.6296641791044776, "grad_norm": 1.3045561125394984, "learning_rate": 9.63736377800156e-06, "loss": 0.2032, "step": 1350 }, { "epoch": 0.6305970149253731, "grad_norm": 1.1318941901276385, "learning_rate": 9.635331653447545e-06, "loss": 0.1868, "step": 1352 }, { "epoch": 0.6315298507462687, "grad_norm": 1.1953449932562992, "learning_rate": 9.633294066493999e-06, "loss": 0.1905, "step": 1354 }, { "epoch": 0.6324626865671642, "grad_norm": 1.168253289218291, "learning_rate": 9.63125101954207e-06, "loss": 0.1937, "step": 1356 }, { "epoch": 0.6333955223880597, "grad_norm": 1.3813246317593162, "learning_rate": 9.62920251499934e-06, "loss": 0.2088, "step": 1358 }, { "epoch": 0.6343283582089553, "grad_norm": 1.1336639737226735, "learning_rate": 9.627148555279819e-06, "loss": 0.1844, "step": 1360 }, { "epoch": 0.6352611940298507, "grad_norm": 1.6281711902801774, "learning_rate": 9.625089142803953e-06, "loss": 0.197, "step": 1362 }, { "epoch": 0.6361940298507462, "grad_norm": 1.2530314862291678, "learning_rate": 9.623024279998606e-06, "loss": 0.1816, "step": 1364 }, { "epoch": 0.6371268656716418, "grad_norm": 1.2861829365395432, "learning_rate": 9.620953969297067e-06, "loss": 0.1988, "step": 1366 }, { "epoch": 0.6380597014925373, "grad_norm": 1.2670269512856949, "learning_rate": 9.618878213139048e-06, "loss": 0.1954, "step": 1368 }, { "epoch": 0.6389925373134329, "grad_norm": 1.1057473468977361, "learning_rate": 9.616797013970676e-06, "loss": 0.1819, "step": 1370 }, { "epoch": 0.6399253731343284, "grad_norm": 1.283855563211758, "learning_rate": 9.61471037424449e-06, "loss": 0.2046, "step": 1372 }, { "epoch": 0.6408582089552238, "grad_norm": 1.315178191477583, "learning_rate": 9.612618296419443e-06, "loss": 0.2048, "step": 1374 }, { "epoch": 0.6417910447761194, "grad_norm": 1.221371229104398, "learning_rate": 9.610520782960899e-06, "loss": 0.1816, "step": 1376 }, { "epoch": 0.6427238805970149, "grad_norm": 1.3181977336147686, "learning_rate": 9.608417836340619e-06, "loss": 0.1953, "step": 1378 }, { "epoch": 0.6436567164179104, "grad_norm": 1.233439829786006, "learning_rate": 9.606309459036776e-06, "loss": 0.2116, "step": 1380 }, { "epoch": 0.644589552238806, "grad_norm": 1.3413337842853288, "learning_rate": 9.604195653533937e-06, "loss": 0.2185, "step": 1382 }, { "epoch": 0.6455223880597015, "grad_norm": 1.161182680072267, "learning_rate": 9.602076422323067e-06, "loss": 0.1987, "step": 1384 }, { "epoch": 0.6464552238805971, "grad_norm": 1.3403910691944039, "learning_rate": 9.599951767901527e-06, "loss": 0.1935, "step": 1386 }, { "epoch": 0.6473880597014925, "grad_norm": 1.1928214861429798, "learning_rate": 9.597821692773064e-06, "loss": 0.189, "step": 1388 }, { "epoch": 0.648320895522388, "grad_norm": 1.3653775619687112, "learning_rate": 9.595686199447818e-06, "loss": 0.2154, "step": 1390 }, { "epoch": 0.6492537313432836, "grad_norm": 1.2711818078715147, "learning_rate": 9.59354529044231e-06, "loss": 0.1834, "step": 1392 }, { "epoch": 0.6501865671641791, "grad_norm": 1.3350085586582887, "learning_rate": 9.591398968279448e-06, "loss": 0.1764, "step": 1394 }, { "epoch": 0.6511194029850746, "grad_norm": 1.1201108159093904, "learning_rate": 9.589247235488512e-06, "loss": 0.1673, "step": 1396 }, { "epoch": 0.6520522388059702, "grad_norm": 1.2278203587530399, "learning_rate": 9.587090094605163e-06, "loss": 0.1876, "step": 1398 }, { "epoch": 0.6529850746268657, "grad_norm": 1.2782004653426244, "learning_rate": 9.584927548171435e-06, "loss": 0.1934, "step": 1400 }, { "epoch": 0.6539179104477612, "grad_norm": 1.3373655776274411, "learning_rate": 9.582759598735732e-06, "loss": 0.1962, "step": 1402 }, { "epoch": 0.6548507462686567, "grad_norm": 1.22441813569092, "learning_rate": 9.58058624885282e-06, "loss": 0.2081, "step": 1404 }, { "epoch": 0.6557835820895522, "grad_norm": 1.310275513321286, "learning_rate": 9.578407501083835e-06, "loss": 0.1876, "step": 1406 }, { "epoch": 0.6567164179104478, "grad_norm": 1.388350137552061, "learning_rate": 9.576223357996272e-06, "loss": 0.1869, "step": 1408 }, { "epoch": 0.6576492537313433, "grad_norm": 1.3083547650148935, "learning_rate": 9.574033822163984e-06, "loss": 0.1872, "step": 1410 }, { "epoch": 0.6585820895522388, "grad_norm": 1.209438419679871, "learning_rate": 9.57183889616718e-06, "loss": 0.1936, "step": 1412 }, { "epoch": 0.6595149253731343, "grad_norm": 1.3540134262961245, "learning_rate": 9.569638582592418e-06, "loss": 0.2029, "step": 1414 }, { "epoch": 0.6604477611940298, "grad_norm": 1.322276588604727, "learning_rate": 9.567432884032609e-06, "loss": 0.1893, "step": 1416 }, { "epoch": 0.6613805970149254, "grad_norm": 1.1512156500697452, "learning_rate": 9.565221803087003e-06, "loss": 0.1902, "step": 1418 }, { "epoch": 0.6623134328358209, "grad_norm": 1.2368849361213836, "learning_rate": 9.563005342361204e-06, "loss": 0.1799, "step": 1420 }, { "epoch": 0.6632462686567164, "grad_norm": 1.2929994455263356, "learning_rate": 9.560783504467143e-06, "loss": 0.1947, "step": 1422 }, { "epoch": 0.664179104477612, "grad_norm": 1.1644028068055394, "learning_rate": 9.558556292023097e-06, "loss": 0.1875, "step": 1424 }, { "epoch": 0.6651119402985075, "grad_norm": 1.3848532662506554, "learning_rate": 9.55632370765367e-06, "loss": 0.1982, "step": 1426 }, { "epoch": 0.6660447761194029, "grad_norm": 1.2337805181838923, "learning_rate": 9.554085753989803e-06, "loss": 0.2111, "step": 1428 }, { "epoch": 0.6669776119402985, "grad_norm": 1.2331433498442423, "learning_rate": 9.55184243366876e-06, "loss": 0.2032, "step": 1430 }, { "epoch": 0.667910447761194, "grad_norm": 1.2746494291609998, "learning_rate": 9.54959374933413e-06, "loss": 0.1768, "step": 1432 }, { "epoch": 0.6688432835820896, "grad_norm": 1.400845736833123, "learning_rate": 9.547339703635818e-06, "loss": 0.2261, "step": 1434 }, { "epoch": 0.6697761194029851, "grad_norm": 1.2689933104196989, "learning_rate": 9.54508029923006e-06, "loss": 0.1806, "step": 1436 }, { "epoch": 0.6707089552238806, "grad_norm": 1.2141135219059682, "learning_rate": 9.542815538779395e-06, "loss": 0.1718, "step": 1438 }, { "epoch": 0.6716417910447762, "grad_norm": 1.1811979327498543, "learning_rate": 9.540545424952678e-06, "loss": 0.2136, "step": 1440 }, { "epoch": 0.6725746268656716, "grad_norm": 1.3465680725570635, "learning_rate": 9.53826996042507e-06, "loss": 0.2031, "step": 1442 }, { "epoch": 0.6735074626865671, "grad_norm": 1.3559394534551104, "learning_rate": 9.535989147878044e-06, "loss": 0.1926, "step": 1444 }, { "epoch": 0.6744402985074627, "grad_norm": 1.3486946004743048, "learning_rate": 9.53370298999937e-06, "loss": 0.1851, "step": 1446 }, { "epoch": 0.6753731343283582, "grad_norm": 1.3398692460415387, "learning_rate": 9.531411489483115e-06, "loss": 0.206, "step": 1448 }, { "epoch": 0.6763059701492538, "grad_norm": 1.0955446750720867, "learning_rate": 9.529114649029646e-06, "loss": 0.17, "step": 1450 }, { "epoch": 0.6772388059701493, "grad_norm": 1.141603349628182, "learning_rate": 9.526812471345623e-06, "loss": 0.1907, "step": 1452 }, { "epoch": 0.6781716417910447, "grad_norm": 1.3310977144390403, "learning_rate": 9.524504959143993e-06, "loss": 0.2117, "step": 1454 }, { "epoch": 0.6791044776119403, "grad_norm": 1.1120098628415085, "learning_rate": 9.522192115143992e-06, "loss": 0.1727, "step": 1456 }, { "epoch": 0.6800373134328358, "grad_norm": 1.1861915861899635, "learning_rate": 9.519873942071134e-06, "loss": 0.197, "step": 1458 }, { "epoch": 0.6809701492537313, "grad_norm": 1.2788597711174958, "learning_rate": 9.51755044265722e-06, "loss": 0.1835, "step": 1460 }, { "epoch": 0.6819029850746269, "grad_norm": 1.447329674975208, "learning_rate": 9.515221619640323e-06, "loss": 0.2019, "step": 1462 }, { "epoch": 0.6828358208955224, "grad_norm": 1.1453947729026939, "learning_rate": 9.51288747576479e-06, "loss": 0.1874, "step": 1464 }, { "epoch": 0.683768656716418, "grad_norm": 1.303187551114296, "learning_rate": 9.51054801378124e-06, "loss": 0.1906, "step": 1466 }, { "epoch": 0.6847014925373134, "grad_norm": 1.2823609320700624, "learning_rate": 9.508203236446558e-06, "loss": 0.1852, "step": 1468 }, { "epoch": 0.6856343283582089, "grad_norm": 1.2989362328025698, "learning_rate": 9.505853146523894e-06, "loss": 0.1851, "step": 1470 }, { "epoch": 0.6865671641791045, "grad_norm": 1.2218192464227928, "learning_rate": 9.503497746782652e-06, "loss": 0.1932, "step": 1472 }, { "epoch": 0.6875, "grad_norm": 1.3136247512251407, "learning_rate": 9.501137039998504e-06, "loss": 0.2039, "step": 1474 }, { "epoch": 0.6884328358208955, "grad_norm": 1.3223029230717274, "learning_rate": 9.49877102895337e-06, "loss": 0.1952, "step": 1476 }, { "epoch": 0.6893656716417911, "grad_norm": 1.22272156891932, "learning_rate": 9.496399716435417e-06, "loss": 0.1986, "step": 1478 }, { "epoch": 0.6902985074626866, "grad_norm": 1.161806305814099, "learning_rate": 9.494023105239067e-06, "loss": 0.1736, "step": 1480 }, { "epoch": 0.691231343283582, "grad_norm": 1.1899958541831737, "learning_rate": 9.49164119816498e-06, "loss": 0.1768, "step": 1482 }, { "epoch": 0.6921641791044776, "grad_norm": 1.2904641348916563, "learning_rate": 9.489253998020062e-06, "loss": 0.2, "step": 1484 }, { "epoch": 0.6930970149253731, "grad_norm": 1.483768884420732, "learning_rate": 9.486861507617452e-06, "loss": 0.202, "step": 1486 }, { "epoch": 0.6940298507462687, "grad_norm": 1.3110823692397675, "learning_rate": 9.484463729776527e-06, "loss": 0.2078, "step": 1488 }, { "epoch": 0.6949626865671642, "grad_norm": 1.173941495674722, "learning_rate": 9.48206066732289e-06, "loss": 0.1723, "step": 1490 }, { "epoch": 0.6958955223880597, "grad_norm": 1.3124080798355797, "learning_rate": 9.479652323088377e-06, "loss": 0.2186, "step": 1492 }, { "epoch": 0.6968283582089553, "grad_norm": 1.2763222228363904, "learning_rate": 9.477238699911046e-06, "loss": 0.189, "step": 1494 }, { "epoch": 0.6977611940298507, "grad_norm": 1.4136832251346632, "learning_rate": 9.474819800635174e-06, "loss": 0.2081, "step": 1496 }, { "epoch": 0.6986940298507462, "grad_norm": 1.119137762218111, "learning_rate": 9.472395628111255e-06, "loss": 0.1868, "step": 1498 }, { "epoch": 0.6996268656716418, "grad_norm": 1.387871176288593, "learning_rate": 9.469966185196003e-06, "loss": 0.2017, "step": 1500 }, { "epoch": 0.6996268656716418, "eval_loss": 0.17079760134220123, "eval_runtime": 321.3449, "eval_samples_per_second": 47.444, "eval_steps_per_second": 5.931, "step": 1500 }, { "epoch": 0.7005597014925373, "grad_norm": 1.3658040018974007, "learning_rate": 9.467531474752336e-06, "loss": 0.1958, "step": 1502 }, { "epoch": 0.7014925373134329, "grad_norm": 1.1346929234604453, "learning_rate": 9.465091499649385e-06, "loss": 0.1744, "step": 1504 }, { "epoch": 0.7024253731343284, "grad_norm": 1.3117793008529666, "learning_rate": 9.46264626276248e-06, "loss": 0.1795, "step": 1506 }, { "epoch": 0.7033582089552238, "grad_norm": 1.1950098061171308, "learning_rate": 9.460195766973154e-06, "loss": 0.1798, "step": 1508 }, { "epoch": 0.7042910447761194, "grad_norm": 1.2239090727892925, "learning_rate": 9.45774001516914e-06, "loss": 0.1882, "step": 1510 }, { "epoch": 0.7052238805970149, "grad_norm": 1.1393660911507115, "learning_rate": 9.45527901024436e-06, "loss": 0.1848, "step": 1512 }, { "epoch": 0.7061567164179104, "grad_norm": 1.3511473443468591, "learning_rate": 9.452812755098927e-06, "loss": 0.1906, "step": 1514 }, { "epoch": 0.707089552238806, "grad_norm": 1.3167759751706674, "learning_rate": 9.450341252639144e-06, "loss": 0.201, "step": 1516 }, { "epoch": 0.7080223880597015, "grad_norm": 1.1505516796278474, "learning_rate": 9.447864505777496e-06, "loss": 0.1695, "step": 1518 }, { "epoch": 0.7089552238805971, "grad_norm": 1.2689369158761514, "learning_rate": 9.445382517432648e-06, "loss": 0.1835, "step": 1520 }, { "epoch": 0.7098880597014925, "grad_norm": 1.161677660666682, "learning_rate": 9.442895290529442e-06, "loss": 0.1953, "step": 1522 }, { "epoch": 0.710820895522388, "grad_norm": 1.3495510031393991, "learning_rate": 9.440402827998893e-06, "loss": 0.2064, "step": 1524 }, { "epoch": 0.7117537313432836, "grad_norm": 1.1457330368321792, "learning_rate": 9.437905132778185e-06, "loss": 0.1906, "step": 1526 }, { "epoch": 0.7126865671641791, "grad_norm": 1.1854030525063404, "learning_rate": 9.43540220781067e-06, "loss": 0.1998, "step": 1528 }, { "epoch": 0.7136194029850746, "grad_norm": 1.478371017865085, "learning_rate": 9.432894056045862e-06, "loss": 0.1963, "step": 1530 }, { "epoch": 0.7145522388059702, "grad_norm": 1.3181432461815878, "learning_rate": 9.430380680439435e-06, "loss": 0.1785, "step": 1532 }, { "epoch": 0.7154850746268657, "grad_norm": 1.213950807873333, "learning_rate": 9.42786208395322e-06, "loss": 0.1968, "step": 1534 }, { "epoch": 0.7164179104477612, "grad_norm": 1.2198669604077816, "learning_rate": 9.425338269555193e-06, "loss": 0.1993, "step": 1536 }, { "epoch": 0.7173507462686567, "grad_norm": 1.194092739592872, "learning_rate": 9.422809240219491e-06, "loss": 0.1776, "step": 1538 }, { "epoch": 0.7182835820895522, "grad_norm": 1.3171151245106296, "learning_rate": 9.42027499892639e-06, "loss": 0.1845, "step": 1540 }, { "epoch": 0.7192164179104478, "grad_norm": 1.2333889462195131, "learning_rate": 9.417735548662302e-06, "loss": 0.199, "step": 1542 }, { "epoch": 0.7201492537313433, "grad_norm": 1.2342130003172749, "learning_rate": 9.41519089241979e-06, "loss": 0.1717, "step": 1544 }, { "epoch": 0.7210820895522388, "grad_norm": 1.297550445847169, "learning_rate": 9.412641033197543e-06, "loss": 0.1805, "step": 1546 }, { "epoch": 0.7220149253731343, "grad_norm": 1.2155591431130077, "learning_rate": 9.410085974000383e-06, "loss": 0.182, "step": 1548 }, { "epoch": 0.7229477611940298, "grad_norm": 1.3587429095966317, "learning_rate": 9.407525717839262e-06, "loss": 0.1907, "step": 1550 }, { "epoch": 0.7238805970149254, "grad_norm": 1.1368429620830403, "learning_rate": 9.404960267731251e-06, "loss": 0.1866, "step": 1552 }, { "epoch": 0.7248134328358209, "grad_norm": 1.234511344856253, "learning_rate": 9.40238962669955e-06, "loss": 0.1955, "step": 1554 }, { "epoch": 0.7257462686567164, "grad_norm": 1.312778530731583, "learning_rate": 9.399813797773472e-06, "loss": 0.2072, "step": 1556 }, { "epoch": 0.726679104477612, "grad_norm": 1.219616393554641, "learning_rate": 9.397232783988439e-06, "loss": 0.1805, "step": 1558 }, { "epoch": 0.7276119402985075, "grad_norm": 1.3768136621236455, "learning_rate": 9.39464658838599e-06, "loss": 0.1866, "step": 1560 }, { "epoch": 0.7285447761194029, "grad_norm": 1.3079769241521353, "learning_rate": 9.392055214013765e-06, "loss": 0.1859, "step": 1562 }, { "epoch": 0.7294776119402985, "grad_norm": 1.1939382831199772, "learning_rate": 9.389458663925512e-06, "loss": 0.193, "step": 1564 }, { "epoch": 0.730410447761194, "grad_norm": 1.267302678741195, "learning_rate": 9.386856941181076e-06, "loss": 0.2013, "step": 1566 }, { "epoch": 0.7313432835820896, "grad_norm": 1.230885714676747, "learning_rate": 9.384250048846394e-06, "loss": 0.1854, "step": 1568 }, { "epoch": 0.7322761194029851, "grad_norm": 1.3628808283440035, "learning_rate": 9.381637989993497e-06, "loss": 0.1856, "step": 1570 }, { "epoch": 0.7332089552238806, "grad_norm": 1.0809491914264386, "learning_rate": 9.37902076770051e-06, "loss": 0.1799, "step": 1572 }, { "epoch": 0.7341417910447762, "grad_norm": 1.3320904641678766, "learning_rate": 9.376398385051635e-06, "loss": 0.205, "step": 1574 }, { "epoch": 0.7350746268656716, "grad_norm": 1.306969225551526, "learning_rate": 9.373770845137162e-06, "loss": 0.1858, "step": 1576 }, { "epoch": 0.7360074626865671, "grad_norm": 1.4033966518233125, "learning_rate": 9.371138151053449e-06, "loss": 0.2023, "step": 1578 }, { "epoch": 0.7369402985074627, "grad_norm": 1.4139340955951025, "learning_rate": 9.368500305902939e-06, "loss": 0.2044, "step": 1580 }, { "epoch": 0.7378731343283582, "grad_norm": 1.2176456956703794, "learning_rate": 9.365857312794136e-06, "loss": 0.1791, "step": 1582 }, { "epoch": 0.7388059701492538, "grad_norm": 1.3088867965970863, "learning_rate": 9.363209174841617e-06, "loss": 0.1955, "step": 1584 }, { "epoch": 0.7397388059701493, "grad_norm": 1.201175692333774, "learning_rate": 9.360555895166015e-06, "loss": 0.1763, "step": 1586 }, { "epoch": 0.7406716417910447, "grad_norm": 1.2783969974039087, "learning_rate": 9.35789747689403e-06, "loss": 0.2124, "step": 1588 }, { "epoch": 0.7416044776119403, "grad_norm": 1.3368519874428064, "learning_rate": 9.35523392315841e-06, "loss": 0.2098, "step": 1590 }, { "epoch": 0.7425373134328358, "grad_norm": 1.2456330261819875, "learning_rate": 9.352565237097964e-06, "loss": 0.1909, "step": 1592 }, { "epoch": 0.7434701492537313, "grad_norm": 1.2614455060038814, "learning_rate": 9.34989142185754e-06, "loss": 0.1925, "step": 1594 }, { "epoch": 0.7444029850746269, "grad_norm": 1.2118219426430827, "learning_rate": 9.347212480588033e-06, "loss": 0.1708, "step": 1596 }, { "epoch": 0.7453358208955224, "grad_norm": 1.3061401074852776, "learning_rate": 9.34452841644638e-06, "loss": 0.1881, "step": 1598 }, { "epoch": 0.746268656716418, "grad_norm": 1.232290471152712, "learning_rate": 9.341839232595555e-06, "loss": 0.1878, "step": 1600 }, { "epoch": 0.7472014925373134, "grad_norm": 1.125088295370148, "learning_rate": 9.339144932204564e-06, "loss": 0.1681, "step": 1602 }, { "epoch": 0.7481343283582089, "grad_norm": 1.2799067080626618, "learning_rate": 9.336445518448442e-06, "loss": 0.1947, "step": 1604 }, { "epoch": 0.7490671641791045, "grad_norm": 1.3566521061134003, "learning_rate": 9.333740994508254e-06, "loss": 0.2069, "step": 1606 }, { "epoch": 0.75, "grad_norm": 1.218444067900376, "learning_rate": 9.331031363571082e-06, "loss": 0.1902, "step": 1608 }, { "epoch": 0.7509328358208955, "grad_norm": 1.1900834243120095, "learning_rate": 9.328316628830029e-06, "loss": 0.1902, "step": 1610 }, { "epoch": 0.7518656716417911, "grad_norm": 1.0790885063660123, "learning_rate": 9.325596793484209e-06, "loss": 0.1805, "step": 1612 }, { "epoch": 0.7527985074626866, "grad_norm": 1.2336042126748985, "learning_rate": 9.322871860738751e-06, "loss": 0.199, "step": 1614 }, { "epoch": 0.753731343283582, "grad_norm": 1.2482618141822903, "learning_rate": 9.320141833804788e-06, "loss": 0.1721, "step": 1616 }, { "epoch": 0.7546641791044776, "grad_norm": 1.196552649254962, "learning_rate": 9.317406715899458e-06, "loss": 0.1809, "step": 1618 }, { "epoch": 0.7555970149253731, "grad_norm": 1.375870755525294, "learning_rate": 9.3146665102459e-06, "loss": 0.199, "step": 1620 }, { "epoch": 0.7565298507462687, "grad_norm": 1.1480319653135727, "learning_rate": 9.31192122007324e-06, "loss": 0.1821, "step": 1622 }, { "epoch": 0.7574626865671642, "grad_norm": 1.2050690124236734, "learning_rate": 9.309170848616606e-06, "loss": 0.205, "step": 1624 }, { "epoch": 0.7583955223880597, "grad_norm": 1.1574618619887107, "learning_rate": 9.30641539911711e-06, "loss": 0.1933, "step": 1626 }, { "epoch": 0.7593283582089553, "grad_norm": 1.2836571870338427, "learning_rate": 9.303654874821846e-06, "loss": 0.2007, "step": 1628 }, { "epoch": 0.7602611940298507, "grad_norm": 1.2027530695095416, "learning_rate": 9.300889278983892e-06, "loss": 0.1877, "step": 1630 }, { "epoch": 0.7611940298507462, "grad_norm": 1.3636582387626675, "learning_rate": 9.298118614862298e-06, "loss": 0.207, "step": 1632 }, { "epoch": 0.7621268656716418, "grad_norm": 1.2630348650248526, "learning_rate": 9.295342885722092e-06, "loss": 0.1758, "step": 1634 }, { "epoch": 0.7630597014925373, "grad_norm": 1.329910260112892, "learning_rate": 9.292562094834265e-06, "loss": 0.204, "step": 1636 }, { "epoch": 0.7639925373134329, "grad_norm": 1.2353843989957862, "learning_rate": 9.289776245475777e-06, "loss": 0.1685, "step": 1638 }, { "epoch": 0.7649253731343284, "grad_norm": 1.2170648714970906, "learning_rate": 9.28698534092955e-06, "loss": 0.1836, "step": 1640 }, { "epoch": 0.7658582089552238, "grad_norm": 1.2038791996871292, "learning_rate": 9.284189384484458e-06, "loss": 0.172, "step": 1642 }, { "epoch": 0.7667910447761194, "grad_norm": 1.4119170877924636, "learning_rate": 9.281388379435332e-06, "loss": 0.198, "step": 1644 }, { "epoch": 0.7677238805970149, "grad_norm": 1.224958739306716, "learning_rate": 9.278582329082953e-06, "loss": 0.1713, "step": 1646 }, { "epoch": 0.7686567164179104, "grad_norm": 1.3635758240736457, "learning_rate": 9.275771236734046e-06, "loss": 0.2176, "step": 1648 }, { "epoch": 0.769589552238806, "grad_norm": 0.9957691545425625, "learning_rate": 9.272955105701276e-06, "loss": 0.1783, "step": 1650 }, { "epoch": 0.7705223880597015, "grad_norm": 1.2198158814021398, "learning_rate": 9.270133939303248e-06, "loss": 0.1986, "step": 1652 }, { "epoch": 0.7714552238805971, "grad_norm": 1.230290778094303, "learning_rate": 9.267307740864502e-06, "loss": 0.1805, "step": 1654 }, { "epoch": 0.7723880597014925, "grad_norm": 1.109344258482454, "learning_rate": 9.264476513715506e-06, "loss": 0.2014, "step": 1656 }, { "epoch": 0.773320895522388, "grad_norm": 1.1926452649616313, "learning_rate": 9.261640261192654e-06, "loss": 0.196, "step": 1658 }, { "epoch": 0.7742537313432836, "grad_norm": 1.1300301799492858, "learning_rate": 9.25879898663826e-06, "loss": 0.1729, "step": 1660 }, { "epoch": 0.7751865671641791, "grad_norm": 1.3360029416525159, "learning_rate": 9.255952693400562e-06, "loss": 0.1895, "step": 1662 }, { "epoch": 0.7761194029850746, "grad_norm": 1.2039456738595933, "learning_rate": 9.253101384833708e-06, "loss": 0.1984, "step": 1664 }, { "epoch": 0.7770522388059702, "grad_norm": 1.2546247288328776, "learning_rate": 9.250245064297752e-06, "loss": 0.1844, "step": 1666 }, { "epoch": 0.7779850746268657, "grad_norm": 1.4380813911346488, "learning_rate": 9.247383735158666e-06, "loss": 0.2154, "step": 1668 }, { "epoch": 0.7789179104477612, "grad_norm": 1.2627577095705367, "learning_rate": 9.24451740078831e-06, "loss": 0.2264, "step": 1670 }, { "epoch": 0.7798507462686567, "grad_norm": 1.2660475048501807, "learning_rate": 9.241646064564457e-06, "loss": 0.2043, "step": 1672 }, { "epoch": 0.7807835820895522, "grad_norm": 1.3320766055488809, "learning_rate": 9.238769729870763e-06, "loss": 0.1896, "step": 1674 }, { "epoch": 0.7817164179104478, "grad_norm": 1.190899383423424, "learning_rate": 9.235888400096776e-06, "loss": 0.1795, "step": 1676 }, { "epoch": 0.7826492537313433, "grad_norm": 1.1817997896917685, "learning_rate": 9.233002078637936e-06, "loss": 0.1846, "step": 1678 }, { "epoch": 0.7835820895522388, "grad_norm": 1.2912143789992765, "learning_rate": 9.230110768895561e-06, "loss": 0.1932, "step": 1680 }, { "epoch": 0.7845149253731343, "grad_norm": 1.2830565916667247, "learning_rate": 9.22721447427685e-06, "loss": 0.1809, "step": 1682 }, { "epoch": 0.7854477611940298, "grad_norm": 1.3494526085655905, "learning_rate": 9.224313198194869e-06, "loss": 0.1985, "step": 1684 }, { "epoch": 0.7863805970149254, "grad_norm": 1.199511283191625, "learning_rate": 9.221406944068565e-06, "loss": 0.1848, "step": 1686 }, { "epoch": 0.7873134328358209, "grad_norm": 1.37703204084647, "learning_rate": 9.218495715322744e-06, "loss": 0.178, "step": 1688 }, { "epoch": 0.7882462686567164, "grad_norm": 1.2290476289857377, "learning_rate": 9.215579515388076e-06, "loss": 0.1943, "step": 1690 }, { "epoch": 0.789179104477612, "grad_norm": 1.1614568458128005, "learning_rate": 9.212658347701091e-06, "loss": 0.1774, "step": 1692 }, { "epoch": 0.7901119402985075, "grad_norm": 1.3331907668724727, "learning_rate": 9.20973221570417e-06, "loss": 0.1844, "step": 1694 }, { "epoch": 0.7910447761194029, "grad_norm": 1.5566470906966665, "learning_rate": 9.206801122845547e-06, "loss": 0.1801, "step": 1696 }, { "epoch": 0.7919776119402985, "grad_norm": 1.1677212305334963, "learning_rate": 9.203865072579298e-06, "loss": 0.1805, "step": 1698 }, { "epoch": 0.792910447761194, "grad_norm": 1.322229911642287, "learning_rate": 9.200924068365348e-06, "loss": 0.2056, "step": 1700 }, { "epoch": 0.7938432835820896, "grad_norm": 1.3506144940700227, "learning_rate": 9.197978113669452e-06, "loss": 0.2177, "step": 1702 }, { "epoch": 0.7947761194029851, "grad_norm": 1.326459607368318, "learning_rate": 9.195027211963203e-06, "loss": 0.209, "step": 1704 }, { "epoch": 0.7957089552238806, "grad_norm": 1.2530860336286305, "learning_rate": 9.192071366724024e-06, "loss": 0.1893, "step": 1706 }, { "epoch": 0.7966417910447762, "grad_norm": 1.2392387668165896, "learning_rate": 9.189110581435164e-06, "loss": 0.1777, "step": 1708 }, { "epoch": 0.7975746268656716, "grad_norm": 1.1855817934355852, "learning_rate": 9.186144859585686e-06, "loss": 0.1905, "step": 1710 }, { "epoch": 0.7985074626865671, "grad_norm": 1.4964445044086616, "learning_rate": 9.183174204670483e-06, "loss": 0.2097, "step": 1712 }, { "epoch": 0.7994402985074627, "grad_norm": 1.1862211193269043, "learning_rate": 9.18019862019025e-06, "loss": 0.1754, "step": 1714 }, { "epoch": 0.8003731343283582, "grad_norm": 1.1506340547267884, "learning_rate": 9.1772181096515e-06, "loss": 0.1665, "step": 1716 }, { "epoch": 0.8013059701492538, "grad_norm": 1.3124081769202471, "learning_rate": 9.174232676566544e-06, "loss": 0.2067, "step": 1718 }, { "epoch": 0.8022388059701493, "grad_norm": 1.2628198489069242, "learning_rate": 9.171242324453498e-06, "loss": 0.174, "step": 1720 }, { "epoch": 0.8031716417910447, "grad_norm": 1.284276587995049, "learning_rate": 9.16824705683627e-06, "loss": 0.1996, "step": 1722 }, { "epoch": 0.8041044776119403, "grad_norm": 1.364415486198836, "learning_rate": 9.165246877244569e-06, "loss": 0.172, "step": 1724 }, { "epoch": 0.8050373134328358, "grad_norm": 1.4860780735334287, "learning_rate": 9.162241789213884e-06, "loss": 0.1906, "step": 1726 }, { "epoch": 0.8059701492537313, "grad_norm": 1.338318032226887, "learning_rate": 9.159231796285494e-06, "loss": 0.2016, "step": 1728 }, { "epoch": 0.8069029850746269, "grad_norm": 1.4399260318313443, "learning_rate": 9.156216902006452e-06, "loss": 0.179, "step": 1730 }, { "epoch": 0.8078358208955224, "grad_norm": 1.1486949764480001, "learning_rate": 9.153197109929595e-06, "loss": 0.1564, "step": 1732 }, { "epoch": 0.808768656716418, "grad_norm": 1.2549897111890425, "learning_rate": 9.150172423613524e-06, "loss": 0.1774, "step": 1734 }, { "epoch": 0.8097014925373134, "grad_norm": 1.408045646649574, "learning_rate": 9.147142846622611e-06, "loss": 0.1998, "step": 1736 }, { "epoch": 0.8106343283582089, "grad_norm": 1.214919294688259, "learning_rate": 9.144108382526992e-06, "loss": 0.1749, "step": 1738 }, { "epoch": 0.8115671641791045, "grad_norm": 1.2346241992467217, "learning_rate": 9.141069034902563e-06, "loss": 0.2004, "step": 1740 }, { "epoch": 0.8125, "grad_norm": 1.058175981214563, "learning_rate": 9.13802480733097e-06, "loss": 0.1525, "step": 1742 }, { "epoch": 0.8134328358208955, "grad_norm": 1.276358048373882, "learning_rate": 9.134975703399612e-06, "loss": 0.1803, "step": 1744 }, { "epoch": 0.8143656716417911, "grad_norm": 1.2198555824626236, "learning_rate": 9.131921726701636e-06, "loss": 0.1898, "step": 1746 }, { "epoch": 0.8152985074626866, "grad_norm": 1.501693819108629, "learning_rate": 9.128862880835934e-06, "loss": 0.177, "step": 1748 }, { "epoch": 0.816231343283582, "grad_norm": 1.2048950586310152, "learning_rate": 9.125799169407129e-06, "loss": 0.1835, "step": 1750 }, { "epoch": 0.8171641791044776, "grad_norm": 1.4797708319961769, "learning_rate": 9.122730596025579e-06, "loss": 0.186, "step": 1752 }, { "epoch": 0.8180970149253731, "grad_norm": 1.1569485141394653, "learning_rate": 9.119657164307376e-06, "loss": 0.17, "step": 1754 }, { "epoch": 0.8190298507462687, "grad_norm": 1.4888950199609203, "learning_rate": 9.116578877874335e-06, "loss": 0.1906, "step": 1756 }, { "epoch": 0.8199626865671642, "grad_norm": 1.3036336679974885, "learning_rate": 9.11349574035399e-06, "loss": 0.187, "step": 1758 }, { "epoch": 0.8208955223880597, "grad_norm": 1.2686515314533011, "learning_rate": 9.110407755379596e-06, "loss": 0.1958, "step": 1760 }, { "epoch": 0.8218283582089553, "grad_norm": 1.6421245532393385, "learning_rate": 9.107314926590114e-06, "loss": 0.169, "step": 1762 }, { "epoch": 0.8227611940298507, "grad_norm": 1.1615706530510448, "learning_rate": 9.104217257630219e-06, "loss": 0.1807, "step": 1764 }, { "epoch": 0.8236940298507462, "grad_norm": 1.26218124385913, "learning_rate": 9.101114752150287e-06, "loss": 0.1843, "step": 1766 }, { "epoch": 0.8246268656716418, "grad_norm": 1.250901164260205, "learning_rate": 9.098007413806392e-06, "loss": 0.1791, "step": 1768 }, { "epoch": 0.8255597014925373, "grad_norm": 1.0727777935748994, "learning_rate": 9.094895246260307e-06, "loss": 0.1776, "step": 1770 }, { "epoch": 0.8264925373134329, "grad_norm": 1.259600792241708, "learning_rate": 9.091778253179494e-06, "loss": 0.1918, "step": 1772 }, { "epoch": 0.8274253731343284, "grad_norm": 1.3518465115589136, "learning_rate": 9.088656438237103e-06, "loss": 0.1877, "step": 1774 }, { "epoch": 0.8283582089552238, "grad_norm": 1.2552564556451946, "learning_rate": 9.085529805111961e-06, "loss": 0.1805, "step": 1776 }, { "epoch": 0.8292910447761194, "grad_norm": 1.1862224492667095, "learning_rate": 9.082398357488579e-06, "loss": 0.1753, "step": 1778 }, { "epoch": 0.8302238805970149, "grad_norm": 1.1687326291642723, "learning_rate": 9.07926209905714e-06, "loss": 0.192, "step": 1780 }, { "epoch": 0.8311567164179104, "grad_norm": 1.0957503874315015, "learning_rate": 9.076121033513492e-06, "loss": 0.1916, "step": 1782 }, { "epoch": 0.832089552238806, "grad_norm": 1.238953371691249, "learning_rate": 9.072975164559155e-06, "loss": 0.188, "step": 1784 }, { "epoch": 0.8330223880597015, "grad_norm": 1.4933361492925055, "learning_rate": 9.0698244959013e-06, "loss": 0.1824, "step": 1786 }, { "epoch": 0.8339552238805971, "grad_norm": 1.0855474149879474, "learning_rate": 9.066669031252767e-06, "loss": 0.1738, "step": 1788 }, { "epoch": 0.8348880597014925, "grad_norm": 1.2649611847849547, "learning_rate": 9.063508774332036e-06, "loss": 0.1726, "step": 1790 }, { "epoch": 0.835820895522388, "grad_norm": 1.3122381326355543, "learning_rate": 9.06034372886324e-06, "loss": 0.1911, "step": 1792 }, { "epoch": 0.8367537313432836, "grad_norm": 1.2698887757755766, "learning_rate": 9.057173898576152e-06, "loss": 0.1929, "step": 1794 }, { "epoch": 0.8376865671641791, "grad_norm": 1.2910438945516445, "learning_rate": 9.053999287206188e-06, "loss": 0.187, "step": 1796 }, { "epoch": 0.8386194029850746, "grad_norm": 1.2712881618613445, "learning_rate": 9.050819898494393e-06, "loss": 0.2031, "step": 1798 }, { "epoch": 0.8395522388059702, "grad_norm": 1.2219658687786867, "learning_rate": 9.047635736187446e-06, "loss": 0.1959, "step": 1800 }, { "epoch": 0.8404850746268657, "grad_norm": 1.1457094612347019, "learning_rate": 9.04444680403765e-06, "loss": 0.1767, "step": 1802 }, { "epoch": 0.8414179104477612, "grad_norm": 1.344224861118119, "learning_rate": 9.041253105802927e-06, "loss": 0.2012, "step": 1804 }, { "epoch": 0.8423507462686567, "grad_norm": 1.153726215152984, "learning_rate": 9.038054645246816e-06, "loss": 0.2033, "step": 1806 }, { "epoch": 0.8432835820895522, "grad_norm": 2.172858885456257, "learning_rate": 9.03485142613847e-06, "loss": 0.2036, "step": 1808 }, { "epoch": 0.8442164179104478, "grad_norm": 1.2968213763126533, "learning_rate": 9.03164345225265e-06, "loss": 0.1879, "step": 1810 }, { "epoch": 0.8451492537313433, "grad_norm": 1.3833444029052697, "learning_rate": 9.028430727369716e-06, "loss": 0.1927, "step": 1812 }, { "epoch": 0.8460820895522388, "grad_norm": 1.3267440671572195, "learning_rate": 9.025213255275634e-06, "loss": 0.1997, "step": 1814 }, { "epoch": 0.8470149253731343, "grad_norm": 1.3234776781784925, "learning_rate": 9.021991039761952e-06, "loss": 0.1923, "step": 1816 }, { "epoch": 0.8479477611940298, "grad_norm": 1.1345481672620257, "learning_rate": 9.018764084625824e-06, "loss": 0.1833, "step": 1818 }, { "epoch": 0.8488805970149254, "grad_norm": 1.2275801230504422, "learning_rate": 9.015532393669975e-06, "loss": 0.2184, "step": 1820 }, { "epoch": 0.8498134328358209, "grad_norm": 1.1418718541841877, "learning_rate": 9.012295970702719e-06, "loss": 0.166, "step": 1822 }, { "epoch": 0.8507462686567164, "grad_norm": 1.1257828058705048, "learning_rate": 9.009054819537943e-06, "loss": 0.1925, "step": 1824 }, { "epoch": 0.851679104477612, "grad_norm": 1.2497935091095564, "learning_rate": 9.005808943995107e-06, "loss": 0.2053, "step": 1826 }, { "epoch": 0.8526119402985075, "grad_norm": 1.2997044089186343, "learning_rate": 9.002558347899238e-06, "loss": 0.1858, "step": 1828 }, { "epoch": 0.8535447761194029, "grad_norm": 1.22478317982349, "learning_rate": 8.999303035080927e-06, "loss": 0.1855, "step": 1830 }, { "epoch": 0.8544776119402985, "grad_norm": 1.217770720466568, "learning_rate": 8.99604300937632e-06, "loss": 0.1825, "step": 1832 }, { "epoch": 0.855410447761194, "grad_norm": 1.3002126450464224, "learning_rate": 8.99277827462712e-06, "loss": 0.1961, "step": 1834 }, { "epoch": 0.8563432835820896, "grad_norm": 1.4383914865756529, "learning_rate": 8.98950883468058e-06, "loss": 0.1946, "step": 1836 }, { "epoch": 0.8572761194029851, "grad_norm": 1.1353579224278034, "learning_rate": 8.986234693389492e-06, "loss": 0.1718, "step": 1838 }, { "epoch": 0.8582089552238806, "grad_norm": 1.1590788686455826, "learning_rate": 8.982955854612197e-06, "loss": 0.1807, "step": 1840 }, { "epoch": 0.8591417910447762, "grad_norm": 1.19161317404919, "learning_rate": 8.979672322212565e-06, "loss": 0.1846, "step": 1842 }, { "epoch": 0.8600746268656716, "grad_norm": 1.230929479032492, "learning_rate": 8.976384100059996e-06, "loss": 0.1839, "step": 1844 }, { "epoch": 0.8610074626865671, "grad_norm": 1.2273399420318605, "learning_rate": 8.973091192029424e-06, "loss": 0.1703, "step": 1846 }, { "epoch": 0.8619402985074627, "grad_norm": 1.2623762400287248, "learning_rate": 8.969793602001295e-06, "loss": 0.1928, "step": 1848 }, { "epoch": 0.8628731343283582, "grad_norm": 1.2114189437578842, "learning_rate": 8.966491333861585e-06, "loss": 0.2019, "step": 1850 }, { "epoch": 0.8638059701492538, "grad_norm": 1.234365487073312, "learning_rate": 8.96318439150177e-06, "loss": 0.2083, "step": 1852 }, { "epoch": 0.8647388059701493, "grad_norm": 1.1268023588165434, "learning_rate": 8.959872778818842e-06, "loss": 0.1802, "step": 1854 }, { "epoch": 0.8656716417910447, "grad_norm": 1.1047768811521825, "learning_rate": 8.956556499715293e-06, "loss": 0.1556, "step": 1856 }, { "epoch": 0.8666044776119403, "grad_norm": 1.2853101919352707, "learning_rate": 8.953235558099116e-06, "loss": 0.1996, "step": 1858 }, { "epoch": 0.8675373134328358, "grad_norm": 1.0491374239410929, "learning_rate": 8.9499099578838e-06, "loss": 0.1678, "step": 1860 }, { "epoch": 0.8684701492537313, "grad_norm": 1.1943500524679493, "learning_rate": 8.94657970298832e-06, "loss": 0.1725, "step": 1862 }, { "epoch": 0.8694029850746269, "grad_norm": 1.2729108977994532, "learning_rate": 8.943244797337138e-06, "loss": 0.1957, "step": 1864 }, { "epoch": 0.8703358208955224, "grad_norm": 1.4708930606205595, "learning_rate": 8.939905244860197e-06, "loss": 0.1969, "step": 1866 }, { "epoch": 0.871268656716418, "grad_norm": 1.2054236478211537, "learning_rate": 8.936561049492913e-06, "loss": 0.2079, "step": 1868 }, { "epoch": 0.8722014925373134, "grad_norm": 1.158524430953715, "learning_rate": 8.933212215176181e-06, "loss": 0.1746, "step": 1870 }, { "epoch": 0.8731343283582089, "grad_norm": 1.1455885414104536, "learning_rate": 8.929858745856353e-06, "loss": 0.182, "step": 1872 }, { "epoch": 0.8740671641791045, "grad_norm": 1.3397135787222707, "learning_rate": 8.92650064548525e-06, "loss": 0.189, "step": 1874 }, { "epoch": 0.875, "grad_norm": 1.1191318015925886, "learning_rate": 8.923137918020147e-06, "loss": 0.1633, "step": 1876 }, { "epoch": 0.8759328358208955, "grad_norm": 1.2274352089475598, "learning_rate": 8.919770567423772e-06, "loss": 0.1844, "step": 1878 }, { "epoch": 0.8768656716417911, "grad_norm": 1.2472104437638762, "learning_rate": 8.916398597664299e-06, "loss": 0.1922, "step": 1880 }, { "epoch": 0.8777985074626866, "grad_norm": 1.286605431682373, "learning_rate": 8.913022012715355e-06, "loss": 0.168, "step": 1882 }, { "epoch": 0.878731343283582, "grad_norm": 1.3154722505682137, "learning_rate": 8.909640816555992e-06, "loss": 0.1866, "step": 1884 }, { "epoch": 0.8796641791044776, "grad_norm": 1.223951896171603, "learning_rate": 8.906255013170707e-06, "loss": 0.1913, "step": 1886 }, { "epoch": 0.8805970149253731, "grad_norm": 1.1693079070240726, "learning_rate": 8.902864606549417e-06, "loss": 0.1767, "step": 1888 }, { "epoch": 0.8815298507462687, "grad_norm": 1.2568555877047018, "learning_rate": 8.899469600687472e-06, "loss": 0.1875, "step": 1890 }, { "epoch": 0.8824626865671642, "grad_norm": 1.248160786151591, "learning_rate": 8.896069999585636e-06, "loss": 0.2135, "step": 1892 }, { "epoch": 0.8833955223880597, "grad_norm": 1.1431255675657805, "learning_rate": 8.892665807250093e-06, "loss": 0.186, "step": 1894 }, { "epoch": 0.8843283582089553, "grad_norm": 1.1294114890786784, "learning_rate": 8.889257027692433e-06, "loss": 0.1843, "step": 1896 }, { "epoch": 0.8852611940298507, "grad_norm": 1.1933808037745908, "learning_rate": 8.885843664929654e-06, "loss": 0.1767, "step": 1898 }, { "epoch": 0.8861940298507462, "grad_norm": 1.2792703435555342, "learning_rate": 8.882425722984156e-06, "loss": 0.1991, "step": 1900 }, { "epoch": 0.8871268656716418, "grad_norm": 1.208330774950831, "learning_rate": 8.879003205883729e-06, "loss": 0.1798, "step": 1902 }, { "epoch": 0.8880597014925373, "grad_norm": 1.2296659816391375, "learning_rate": 8.875576117661565e-06, "loss": 0.1755, "step": 1904 }, { "epoch": 0.8889925373134329, "grad_norm": 1.240600067039028, "learning_rate": 8.872144462356234e-06, "loss": 0.1863, "step": 1906 }, { "epoch": 0.8899253731343284, "grad_norm": 1.0705498227682964, "learning_rate": 8.868708244011692e-06, "loss": 0.154, "step": 1908 }, { "epoch": 0.8908582089552238, "grad_norm": 1.2268381212547212, "learning_rate": 8.86526746667727e-06, "loss": 0.1687, "step": 1910 }, { "epoch": 0.8917910447761194, "grad_norm": 1.1337934021380822, "learning_rate": 8.861822134407671e-06, "loss": 0.1803, "step": 1912 }, { "epoch": 0.8927238805970149, "grad_norm": 1.195600681526038, "learning_rate": 8.858372251262972e-06, "loss": 0.1648, "step": 1914 }, { "epoch": 0.8936567164179104, "grad_norm": 1.2273955675181596, "learning_rate": 8.854917821308606e-06, "loss": 0.1572, "step": 1916 }, { "epoch": 0.894589552238806, "grad_norm": 1.17826861051738, "learning_rate": 8.851458848615364e-06, "loss": 0.1839, "step": 1918 }, { "epoch": 0.8955223880597015, "grad_norm": 1.122509041686198, "learning_rate": 8.847995337259394e-06, "loss": 0.1697, "step": 1920 }, { "epoch": 0.8964552238805971, "grad_norm": 1.257363825091277, "learning_rate": 8.844527291322192e-06, "loss": 0.178, "step": 1922 }, { "epoch": 0.8973880597014925, "grad_norm": 1.3004284149438885, "learning_rate": 8.841054714890596e-06, "loss": 0.1768, "step": 1924 }, { "epoch": 0.898320895522388, "grad_norm": 1.1392906799925309, "learning_rate": 8.837577612056782e-06, "loss": 0.1729, "step": 1926 }, { "epoch": 0.8992537313432836, "grad_norm": 1.080460792775906, "learning_rate": 8.834095986918265e-06, "loss": 0.1603, "step": 1928 }, { "epoch": 0.9001865671641791, "grad_norm": 1.120023878209664, "learning_rate": 8.830609843577882e-06, "loss": 0.1685, "step": 1930 }, { "epoch": 0.9011194029850746, "grad_norm": 1.110558451361426, "learning_rate": 8.8271191861438e-06, "loss": 0.1509, "step": 1932 }, { "epoch": 0.9020522388059702, "grad_norm": 1.2917406041428532, "learning_rate": 8.823624018729503e-06, "loss": 0.1895, "step": 1934 }, { "epoch": 0.9029850746268657, "grad_norm": 1.230968075446837, "learning_rate": 8.820124345453791e-06, "loss": 0.1789, "step": 1936 }, { "epoch": 0.9039179104477612, "grad_norm": 1.367855006225693, "learning_rate": 8.816620170440774e-06, "loss": 0.1989, "step": 1938 }, { "epoch": 0.9048507462686567, "grad_norm": 1.0454936621086743, "learning_rate": 8.813111497819861e-06, "loss": 0.1873, "step": 1940 }, { "epoch": 0.9057835820895522, "grad_norm": 1.3314003474956535, "learning_rate": 8.809598331725772e-06, "loss": 0.167, "step": 1942 }, { "epoch": 0.9067164179104478, "grad_norm": 1.2678674135148693, "learning_rate": 8.806080676298516e-06, "loss": 0.1854, "step": 1944 }, { "epoch": 0.9076492537313433, "grad_norm": 1.2768172203081714, "learning_rate": 8.80255853568339e-06, "loss": 0.1908, "step": 1946 }, { "epoch": 0.9085820895522388, "grad_norm": 1.1818274374168733, "learning_rate": 8.79903191403098e-06, "loss": 0.1622, "step": 1948 }, { "epoch": 0.9095149253731343, "grad_norm": 1.2225620547007063, "learning_rate": 8.795500815497154e-06, "loss": 0.2031, "step": 1950 }, { "epoch": 0.9104477611940298, "grad_norm": 1.1257924983539633, "learning_rate": 8.79196524424305e-06, "loss": 0.1653, "step": 1952 }, { "epoch": 0.9113805970149254, "grad_norm": 1.1028340032016384, "learning_rate": 8.788425204435082e-06, "loss": 0.1701, "step": 1954 }, { "epoch": 0.9123134328358209, "grad_norm": 1.1342103768564358, "learning_rate": 8.78488070024493e-06, "loss": 0.1692, "step": 1956 }, { "epoch": 0.9132462686567164, "grad_norm": 1.1866863060409858, "learning_rate": 8.781331735849532e-06, "loss": 0.1956, "step": 1958 }, { "epoch": 0.914179104477612, "grad_norm": 1.2104677377772022, "learning_rate": 8.77777831543108e-06, "loss": 0.1748, "step": 1960 }, { "epoch": 0.9151119402985075, "grad_norm": 1.2750638621133845, "learning_rate": 8.774220443177024e-06, "loss": 0.1852, "step": 1962 }, { "epoch": 0.9160447761194029, "grad_norm": 1.2456050181530693, "learning_rate": 8.770658123280056e-06, "loss": 0.2078, "step": 1964 }, { "epoch": 0.9169776119402985, "grad_norm": 1.1513211751202366, "learning_rate": 8.76709135993811e-06, "loss": 0.1623, "step": 1966 }, { "epoch": 0.917910447761194, "grad_norm": 1.214606556576558, "learning_rate": 8.763520157354352e-06, "loss": 0.1907, "step": 1968 }, { "epoch": 0.9188432835820896, "grad_norm": 1.2558675406655826, "learning_rate": 8.759944519737186e-06, "loss": 0.1706, "step": 1970 }, { "epoch": 0.9197761194029851, "grad_norm": 1.2009512916282046, "learning_rate": 8.756364451300241e-06, "loss": 0.1796, "step": 1972 }, { "epoch": 0.9207089552238806, "grad_norm": 1.2209617116507063, "learning_rate": 8.752779956262363e-06, "loss": 0.2001, "step": 1974 }, { "epoch": 0.9216417910447762, "grad_norm": 1.1823752723455987, "learning_rate": 8.749191038847619e-06, "loss": 0.1969, "step": 1976 }, { "epoch": 0.9225746268656716, "grad_norm": 1.3125507510378662, "learning_rate": 8.745597703285286e-06, "loss": 0.1952, "step": 1978 }, { "epoch": 0.9235074626865671, "grad_norm": 1.1560177166373937, "learning_rate": 8.741999953809847e-06, "loss": 0.167, "step": 1980 }, { "epoch": 0.9244402985074627, "grad_norm": 1.3829461046588156, "learning_rate": 8.738397794660986e-06, "loss": 0.1846, "step": 1982 }, { "epoch": 0.9253731343283582, "grad_norm": 1.3775977676280338, "learning_rate": 8.734791230083586e-06, "loss": 0.1827, "step": 1984 }, { "epoch": 0.9263059701492538, "grad_norm": 1.1430102314378263, "learning_rate": 8.73118026432772e-06, "loss": 0.1956, "step": 1986 }, { "epoch": 0.9272388059701493, "grad_norm": 1.3056522149119958, "learning_rate": 8.727564901648645e-06, "loss": 0.1953, "step": 1988 }, { "epoch": 0.9281716417910447, "grad_norm": 1.3386146049280576, "learning_rate": 8.723945146306801e-06, "loss": 0.1956, "step": 1990 }, { "epoch": 0.9291044776119403, "grad_norm": 1.2677928673430818, "learning_rate": 8.720321002567807e-06, "loss": 0.1858, "step": 1992 }, { "epoch": 0.9300373134328358, "grad_norm": 1.0720634605057584, "learning_rate": 8.71669247470245e-06, "loss": 0.1558, "step": 1994 }, { "epoch": 0.9309701492537313, "grad_norm": 1.1807522950979514, "learning_rate": 8.71305956698669e-06, "loss": 0.1576, "step": 1996 }, { "epoch": 0.9319029850746269, "grad_norm": 1.094363383366357, "learning_rate": 8.709422283701634e-06, "loss": 0.1784, "step": 1998 }, { "epoch": 0.9328358208955224, "grad_norm": 1.1430589260003636, "learning_rate": 8.705780629133565e-06, "loss": 0.1812, "step": 2000 }, { "epoch": 0.9328358208955224, "eval_loss": 0.16347220540046692, "eval_runtime": 322.8513, "eval_samples_per_second": 47.223, "eval_steps_per_second": 5.904, "step": 2000 }, { "epoch": 0.933768656716418, "grad_norm": 1.3181785803134747, "learning_rate": 8.702134607573898e-06, "loss": 0.2088, "step": 2002 }, { "epoch": 0.9347014925373134, "grad_norm": 1.114139466989517, "learning_rate": 8.698484223319206e-06, "loss": 0.1651, "step": 2004 }, { "epoch": 0.9356343283582089, "grad_norm": 1.2600477290467267, "learning_rate": 8.694829480671202e-06, "loss": 0.1851, "step": 2006 }, { "epoch": 0.9365671641791045, "grad_norm": 1.2547571579332766, "learning_rate": 8.69117038393673e-06, "loss": 0.194, "step": 2008 }, { "epoch": 0.9375, "grad_norm": 1.2823745810043528, "learning_rate": 8.68750693742777e-06, "loss": 0.1666, "step": 2010 }, { "epoch": 0.9384328358208955, "grad_norm": 1.2448797285533804, "learning_rate": 8.683839145461425e-06, "loss": 0.1928, "step": 2012 }, { "epoch": 0.9393656716417911, "grad_norm": 1.273332521268346, "learning_rate": 8.680167012359922e-06, "loss": 0.2133, "step": 2014 }, { "epoch": 0.9402985074626866, "grad_norm": 1.4296756949185516, "learning_rate": 8.676490542450597e-06, "loss": 0.1799, "step": 2016 }, { "epoch": 0.941231343283582, "grad_norm": 1.080809866903534, "learning_rate": 8.672809740065904e-06, "loss": 0.1853, "step": 2018 }, { "epoch": 0.9421641791044776, "grad_norm": 1.1436099260958916, "learning_rate": 8.6691246095434e-06, "loss": 0.1734, "step": 2020 }, { "epoch": 0.9430970149253731, "grad_norm": 1.3313725691681668, "learning_rate": 8.665435155225741e-06, "loss": 0.1813, "step": 2022 }, { "epoch": 0.9440298507462687, "grad_norm": 1.3612550524272353, "learning_rate": 8.661741381460677e-06, "loss": 0.2019, "step": 2024 }, { "epoch": 0.9449626865671642, "grad_norm": 1.1994714663237427, "learning_rate": 8.658043292601055e-06, "loss": 0.1926, "step": 2026 }, { "epoch": 0.9458955223880597, "grad_norm": 1.245193221917958, "learning_rate": 8.6543408930048e-06, "loss": 0.1758, "step": 2028 }, { "epoch": 0.9468283582089553, "grad_norm": 1.1664629995318132, "learning_rate": 8.650634187034918e-06, "loss": 0.1662, "step": 2030 }, { "epoch": 0.9477611940298507, "grad_norm": 1.3327368080660713, "learning_rate": 8.646923179059494e-06, "loss": 0.2028, "step": 2032 }, { "epoch": 0.9486940298507462, "grad_norm": 1.1966686306458933, "learning_rate": 8.643207873451678e-06, "loss": 0.1791, "step": 2034 }, { "epoch": 0.9496268656716418, "grad_norm": 1.2009028963320556, "learning_rate": 8.639488274589685e-06, "loss": 0.1904, "step": 2036 }, { "epoch": 0.9505597014925373, "grad_norm": 1.0929753841351104, "learning_rate": 8.635764386856794e-06, "loss": 0.1712, "step": 2038 }, { "epoch": 0.9514925373134329, "grad_norm": 1.1409877331710687, "learning_rate": 8.632036214641328e-06, "loss": 0.1917, "step": 2040 }, { "epoch": 0.9524253731343284, "grad_norm": 1.0857445185367467, "learning_rate": 8.628303762336671e-06, "loss": 0.1781, "step": 2042 }, { "epoch": 0.9533582089552238, "grad_norm": 1.2711429588160323, "learning_rate": 8.624567034341245e-06, "loss": 0.1763, "step": 2044 }, { "epoch": 0.9542910447761194, "grad_norm": 1.411093941004422, "learning_rate": 8.620826035058509e-06, "loss": 0.1752, "step": 2046 }, { "epoch": 0.9552238805970149, "grad_norm": 6.390236743573979, "learning_rate": 8.617080768896958e-06, "loss": 0.1729, "step": 2048 }, { "epoch": 0.9561567164179104, "grad_norm": 3.544159567320888, "learning_rate": 8.613331240270114e-06, "loss": 0.1703, "step": 2050 }, { "epoch": 0.957089552238806, "grad_norm": 1.442466623942355, "learning_rate": 8.609577453596521e-06, "loss": 0.1795, "step": 2052 }, { "epoch": 0.9580223880597015, "grad_norm": 0.9843433823848003, "learning_rate": 8.605819413299744e-06, "loss": 0.1608, "step": 2054 }, { "epoch": 0.9589552238805971, "grad_norm": 1.110051693804096, "learning_rate": 8.602057123808359e-06, "loss": 0.1827, "step": 2056 }, { "epoch": 0.9598880597014925, "grad_norm": 1.1433136186836552, "learning_rate": 8.59829058955595e-06, "loss": 0.1734, "step": 2058 }, { "epoch": 0.960820895522388, "grad_norm": 1.2138425480613468, "learning_rate": 8.594519814981098e-06, "loss": 0.1741, "step": 2060 }, { "epoch": 0.9617537313432836, "grad_norm": 1.157195368563672, "learning_rate": 8.590744804527388e-06, "loss": 0.1718, "step": 2062 }, { "epoch": 0.9626865671641791, "grad_norm": 1.4532329436175, "learning_rate": 8.586965562643397e-06, "loss": 0.168, "step": 2064 }, { "epoch": 0.9636194029850746, "grad_norm": 1.0872687701407575, "learning_rate": 8.583182093782682e-06, "loss": 0.1721, "step": 2066 }, { "epoch": 0.9645522388059702, "grad_norm": 1.3160180813549756, "learning_rate": 8.579394402403784e-06, "loss": 0.1656, "step": 2068 }, { "epoch": 0.9654850746268657, "grad_norm": 1.1101848042615787, "learning_rate": 8.575602492970221e-06, "loss": 0.1738, "step": 2070 }, { "epoch": 0.9664179104477612, "grad_norm": 1.193049436941456, "learning_rate": 8.571806369950482e-06, "loss": 0.1746, "step": 2072 }, { "epoch": 0.9673507462686567, "grad_norm": 1.1249109855183856, "learning_rate": 8.56800603781802e-06, "loss": 0.173, "step": 2074 }, { "epoch": 0.9682835820895522, "grad_norm": 1.185244259646379, "learning_rate": 8.564201501051247e-06, "loss": 0.1695, "step": 2076 }, { "epoch": 0.9692164179104478, "grad_norm": 1.1947301263077779, "learning_rate": 8.560392764133535e-06, "loss": 0.1921, "step": 2078 }, { "epoch": 0.9701492537313433, "grad_norm": 1.3941382122868435, "learning_rate": 8.556579831553198e-06, "loss": 0.1733, "step": 2080 }, { "epoch": 0.9710820895522388, "grad_norm": 1.2007817139353856, "learning_rate": 8.5527627078035e-06, "loss": 0.1823, "step": 2082 }, { "epoch": 0.9720149253731343, "grad_norm": 1.1294016590691875, "learning_rate": 8.548941397382647e-06, "loss": 0.1669, "step": 2084 }, { "epoch": 0.9729477611940298, "grad_norm": 1.2388902899558032, "learning_rate": 8.545115904793765e-06, "loss": 0.1945, "step": 2086 }, { "epoch": 0.9738805970149254, "grad_norm": 1.2592255783170496, "learning_rate": 8.541286234544923e-06, "loss": 0.1641, "step": 2088 }, { "epoch": 0.9748134328358209, "grad_norm": 1.199692607254671, "learning_rate": 8.537452391149108e-06, "loss": 0.1849, "step": 2090 }, { "epoch": 0.9757462686567164, "grad_norm": 1.0192323762345974, "learning_rate": 8.533614379124221e-06, "loss": 0.1571, "step": 2092 }, { "epoch": 0.976679104477612, "grad_norm": 1.199340760780135, "learning_rate": 8.529772202993083e-06, "loss": 0.1641, "step": 2094 }, { "epoch": 0.9776119402985075, "grad_norm": 1.2855292891688954, "learning_rate": 8.525925867283414e-06, "loss": 0.1956, "step": 2096 }, { "epoch": 0.9785447761194029, "grad_norm": 1.1278401946322196, "learning_rate": 8.52207537652784e-06, "loss": 0.1789, "step": 2098 }, { "epoch": 0.9794776119402985, "grad_norm": 1.161086795188338, "learning_rate": 8.518220735263884e-06, "loss": 0.1751, "step": 2100 }, { "epoch": 0.980410447761194, "grad_norm": 1.1844957643011036, "learning_rate": 8.514361948033958e-06, "loss": 0.1889, "step": 2102 }, { "epoch": 0.9813432835820896, "grad_norm": 1.1280350555176875, "learning_rate": 8.510499019385362e-06, "loss": 0.1935, "step": 2104 }, { "epoch": 0.9822761194029851, "grad_norm": 1.2231857221982496, "learning_rate": 8.506631953870272e-06, "loss": 0.1785, "step": 2106 }, { "epoch": 0.9832089552238806, "grad_norm": 1.1940335590482019, "learning_rate": 8.502760756045747e-06, "loss": 0.1988, "step": 2108 }, { "epoch": 0.9841417910447762, "grad_norm": 1.0584137234671658, "learning_rate": 8.498885430473707e-06, "loss": 0.1618, "step": 2110 }, { "epoch": 0.9850746268656716, "grad_norm": 1.070914953967085, "learning_rate": 8.495005981720941e-06, "loss": 0.189, "step": 2112 }, { "epoch": 0.9860074626865671, "grad_norm": 1.1323344114228981, "learning_rate": 8.491122414359095e-06, "loss": 0.1833, "step": 2114 }, { "epoch": 0.9869402985074627, "grad_norm": 1.13383803765909, "learning_rate": 8.487234732964669e-06, "loss": 0.162, "step": 2116 }, { "epoch": 0.9878731343283582, "grad_norm": 1.11065834174197, "learning_rate": 8.483342942119013e-06, "loss": 0.1788, "step": 2118 }, { "epoch": 0.9888059701492538, "grad_norm": 1.1897547761209817, "learning_rate": 8.479447046408318e-06, "loss": 0.1809, "step": 2120 }, { "epoch": 0.9897388059701493, "grad_norm": 1.1087895452610104, "learning_rate": 8.475547050423611e-06, "loss": 0.1474, "step": 2122 }, { "epoch": 0.9906716417910447, "grad_norm": 1.1557766829891358, "learning_rate": 8.471642958760752e-06, "loss": 0.1767, "step": 2124 }, { "epoch": 0.9916044776119403, "grad_norm": 1.2341066515607721, "learning_rate": 8.46773477602043e-06, "loss": 0.1952, "step": 2126 }, { "epoch": 0.9925373134328358, "grad_norm": 1.2539767202851184, "learning_rate": 8.463822506808151e-06, "loss": 0.1934, "step": 2128 }, { "epoch": 0.9934701492537313, "grad_norm": 1.245565336178777, "learning_rate": 8.45990615573424e-06, "loss": 0.1792, "step": 2130 }, { "epoch": 0.9944029850746269, "grad_norm": 1.1128968338688787, "learning_rate": 8.455985727413825e-06, "loss": 0.1809, "step": 2132 }, { "epoch": 0.9953358208955224, "grad_norm": 1.0831619753015387, "learning_rate": 8.45206122646685e-06, "loss": 0.1783, "step": 2134 }, { "epoch": 0.996268656716418, "grad_norm": 1.0810998238140512, "learning_rate": 8.44813265751805e-06, "loss": 0.1703, "step": 2136 }, { "epoch": 0.9972014925373134, "grad_norm": 1.1242809365566895, "learning_rate": 8.444200025196958e-06, "loss": 0.1697, "step": 2138 }, { "epoch": 0.9981343283582089, "grad_norm": 1.2553514854044407, "learning_rate": 8.440263334137892e-06, "loss": 0.1915, "step": 2140 }, { "epoch": 0.9990671641791045, "grad_norm": 1.1214403124297583, "learning_rate": 8.436322588979955e-06, "loss": 0.172, "step": 2142 }, { "epoch": 1.0, "grad_norm": 1.1407138244992432, "learning_rate": 8.432377794367028e-06, "loss": 0.1769, "step": 2144 }, { "epoch": 1.0009328358208955, "grad_norm": 0.9459632198413176, "learning_rate": 8.428428954947762e-06, "loss": 0.1104, "step": 2146 }, { "epoch": 1.001865671641791, "grad_norm": 0.9878390373694901, "learning_rate": 8.424476075375578e-06, "loss": 0.119, "step": 2148 }, { "epoch": 1.0027985074626866, "grad_norm": 0.892096781938565, "learning_rate": 8.420519160308651e-06, "loss": 0.1056, "step": 2150 }, { "epoch": 1.0037313432835822, "grad_norm": 2.202199497780502, "learning_rate": 8.41655821440992e-06, "loss": 0.1152, "step": 2152 }, { "epoch": 1.0046641791044777, "grad_norm": 1.2163148100669832, "learning_rate": 8.41259324234707e-06, "loss": 0.1196, "step": 2154 }, { "epoch": 1.0055970149253732, "grad_norm": 0.895501258908727, "learning_rate": 8.40862424879253e-06, "loss": 0.0988, "step": 2156 }, { "epoch": 1.0065298507462686, "grad_norm": 1.1499497762307112, "learning_rate": 8.40465123842347e-06, "loss": 0.1125, "step": 2158 }, { "epoch": 1.007462686567164, "grad_norm": 1.132993408683864, "learning_rate": 8.400674215921786e-06, "loss": 0.1016, "step": 2160 }, { "epoch": 1.0083955223880596, "grad_norm": 1.2184853847812682, "learning_rate": 8.396693185974118e-06, "loss": 0.1085, "step": 2162 }, { "epoch": 1.0093283582089552, "grad_norm": 1.0410662413518412, "learning_rate": 8.392708153271813e-06, "loss": 0.1025, "step": 2164 }, { "epoch": 1.0102611940298507, "grad_norm": 1.1724708193257103, "learning_rate": 8.388719122510943e-06, "loss": 0.1102, "step": 2166 }, { "epoch": 1.0111940298507462, "grad_norm": 1.2169251119295637, "learning_rate": 8.384726098392286e-06, "loss": 0.1158, "step": 2168 }, { "epoch": 1.0121268656716418, "grad_norm": 1.2153613709875557, "learning_rate": 8.380729085621331e-06, "loss": 0.128, "step": 2170 }, { "epoch": 1.0130597014925373, "grad_norm": 1.052983482028831, "learning_rate": 8.376728088908268e-06, "loss": 0.1031, "step": 2172 }, { "epoch": 1.0139925373134329, "grad_norm": 1.3234904654899597, "learning_rate": 8.372723112967974e-06, "loss": 0.1236, "step": 2174 }, { "epoch": 1.0149253731343284, "grad_norm": 1.1117963691600927, "learning_rate": 8.368714162520024e-06, "loss": 0.1132, "step": 2176 }, { "epoch": 1.015858208955224, "grad_norm": 1.0474400456234658, "learning_rate": 8.364701242288673e-06, "loss": 0.0981, "step": 2178 }, { "epoch": 1.0167910447761195, "grad_norm": 1.0809304922906753, "learning_rate": 8.360684357002853e-06, "loss": 0.1068, "step": 2180 }, { "epoch": 1.017723880597015, "grad_norm": 1.0628024771301163, "learning_rate": 8.356663511396169e-06, "loss": 0.1074, "step": 2182 }, { "epoch": 1.0186567164179103, "grad_norm": 1.1334774517081387, "learning_rate": 8.352638710206895e-06, "loss": 0.0995, "step": 2184 }, { "epoch": 1.0195895522388059, "grad_norm": 1.0908983028803378, "learning_rate": 8.348609958177964e-06, "loss": 0.1142, "step": 2186 }, { "epoch": 1.0205223880597014, "grad_norm": 1.0708305141240686, "learning_rate": 8.34457726005697e-06, "loss": 0.0977, "step": 2188 }, { "epoch": 1.021455223880597, "grad_norm": 1.2141354780980165, "learning_rate": 8.340540620596145e-06, "loss": 0.1178, "step": 2190 }, { "epoch": 1.0223880597014925, "grad_norm": 1.1602233732700098, "learning_rate": 8.33650004455238e-06, "loss": 0.1125, "step": 2192 }, { "epoch": 1.023320895522388, "grad_norm": 1.18008855873969, "learning_rate": 8.332455536687196e-06, "loss": 0.1167, "step": 2194 }, { "epoch": 1.0242537313432836, "grad_norm": 0.9893201186088032, "learning_rate": 8.328407101766752e-06, "loss": 0.0977, "step": 2196 }, { "epoch": 1.025186567164179, "grad_norm": 1.0727376140142242, "learning_rate": 8.324354744561829e-06, "loss": 0.1068, "step": 2198 }, { "epoch": 1.0261194029850746, "grad_norm": 1.0449511698044345, "learning_rate": 8.320298469847836e-06, "loss": 0.0984, "step": 2200 }, { "epoch": 1.0270522388059702, "grad_norm": 1.135678129051182, "learning_rate": 8.316238282404795e-06, "loss": 0.1099, "step": 2202 }, { "epoch": 1.0279850746268657, "grad_norm": 1.0466436844213056, "learning_rate": 8.312174187017343e-06, "loss": 0.0996, "step": 2204 }, { "epoch": 1.0289179104477613, "grad_norm": 0.987253100313577, "learning_rate": 8.308106188474716e-06, "loss": 0.0955, "step": 2206 }, { "epoch": 1.0298507462686568, "grad_norm": 1.1928717958128783, "learning_rate": 8.304034291570757e-06, "loss": 0.1131, "step": 2208 }, { "epoch": 1.0307835820895523, "grad_norm": 1.023793525835979, "learning_rate": 8.299958501103892e-06, "loss": 0.1136, "step": 2210 }, { "epoch": 1.0317164179104477, "grad_norm": 1.115450382287538, "learning_rate": 8.29587882187715e-06, "loss": 0.119, "step": 2212 }, { "epoch": 1.0326492537313432, "grad_norm": 1.1787560514025666, "learning_rate": 8.29179525869813e-06, "loss": 0.1096, "step": 2214 }, { "epoch": 1.0335820895522387, "grad_norm": 1.0840680636375406, "learning_rate": 8.287707816379014e-06, "loss": 0.0961, "step": 2216 }, { "epoch": 1.0345149253731343, "grad_norm": 1.1011647055667009, "learning_rate": 8.283616499736552e-06, "loss": 0.1077, "step": 2218 }, { "epoch": 1.0354477611940298, "grad_norm": 1.1717321795377318, "learning_rate": 8.279521313592067e-06, "loss": 0.1111, "step": 2220 }, { "epoch": 1.0363805970149254, "grad_norm": 1.055449976091991, "learning_rate": 8.27542226277143e-06, "loss": 0.1098, "step": 2222 }, { "epoch": 1.037313432835821, "grad_norm": 1.2746463725407722, "learning_rate": 8.271319352105078e-06, "loss": 0.1215, "step": 2224 }, { "epoch": 1.0382462686567164, "grad_norm": 1.3505954916785239, "learning_rate": 8.267212586427986e-06, "loss": 0.1144, "step": 2226 }, { "epoch": 1.039179104477612, "grad_norm": 0.9954729702087639, "learning_rate": 8.263101970579684e-06, "loss": 0.1108, "step": 2228 }, { "epoch": 1.0401119402985075, "grad_norm": 1.0559102102306708, "learning_rate": 8.258987509404227e-06, "loss": 0.1027, "step": 2230 }, { "epoch": 1.041044776119403, "grad_norm": 1.108109593911891, "learning_rate": 8.254869207750207e-06, "loss": 0.1291, "step": 2232 }, { "epoch": 1.0419776119402986, "grad_norm": 1.2817400208141994, "learning_rate": 8.250747070470743e-06, "loss": 0.1183, "step": 2234 }, { "epoch": 1.0429104477611941, "grad_norm": 1.0460244622141617, "learning_rate": 8.246621102423474e-06, "loss": 0.104, "step": 2236 }, { "epoch": 1.0438432835820897, "grad_norm": 0.99446670791871, "learning_rate": 8.242491308470548e-06, "loss": 0.1041, "step": 2238 }, { "epoch": 1.044776119402985, "grad_norm": 1.0129009049855824, "learning_rate": 8.23835769347863e-06, "loss": 0.1, "step": 2240 }, { "epoch": 1.0457089552238805, "grad_norm": 1.168358638609721, "learning_rate": 8.234220262318876e-06, "loss": 0.1164, "step": 2242 }, { "epoch": 1.046641791044776, "grad_norm": 1.1503486614148268, "learning_rate": 8.230079019866955e-06, "loss": 0.1083, "step": 2244 }, { "epoch": 1.0475746268656716, "grad_norm": 1.2380001019780191, "learning_rate": 8.225933971003011e-06, "loss": 0.1131, "step": 2246 }, { "epoch": 1.0485074626865671, "grad_norm": 1.0564780902929154, "learning_rate": 8.221785120611687e-06, "loss": 0.1111, "step": 2248 }, { "epoch": 1.0494402985074627, "grad_norm": 1.1887774356368606, "learning_rate": 8.217632473582096e-06, "loss": 0.1161, "step": 2250 }, { "epoch": 1.0503731343283582, "grad_norm": 1.138674016396561, "learning_rate": 8.213476034807827e-06, "loss": 0.1185, "step": 2252 }, { "epoch": 1.0513059701492538, "grad_norm": 1.0465094457705495, "learning_rate": 8.209315809186946e-06, "loss": 0.1095, "step": 2254 }, { "epoch": 1.0522388059701493, "grad_norm": 0.9444953157852821, "learning_rate": 8.205151801621971e-06, "loss": 0.103, "step": 2256 }, { "epoch": 1.0531716417910448, "grad_norm": 1.2998634455200997, "learning_rate": 8.20098401701988e-06, "loss": 0.1138, "step": 2258 }, { "epoch": 1.0541044776119404, "grad_norm": 0.9270483377144897, "learning_rate": 8.196812460292105e-06, "loss": 0.1084, "step": 2260 }, { "epoch": 1.055037313432836, "grad_norm": 1.1314657621167346, "learning_rate": 8.192637136354516e-06, "loss": 0.1075, "step": 2262 }, { "epoch": 1.0559701492537314, "grad_norm": 0.9303235925194865, "learning_rate": 8.188458050127433e-06, "loss": 0.0977, "step": 2264 }, { "epoch": 1.0569029850746268, "grad_norm": 1.2776777339806886, "learning_rate": 8.184275206535598e-06, "loss": 0.1187, "step": 2266 }, { "epoch": 1.0578358208955223, "grad_norm": 1.147933625291267, "learning_rate": 8.18008861050819e-06, "loss": 0.1201, "step": 2268 }, { "epoch": 1.0587686567164178, "grad_norm": 1.2369565673691505, "learning_rate": 8.175898266978805e-06, "loss": 0.1093, "step": 2270 }, { "epoch": 1.0597014925373134, "grad_norm": 1.1020580611564963, "learning_rate": 8.171704180885457e-06, "loss": 0.1142, "step": 2272 }, { "epoch": 1.060634328358209, "grad_norm": 0.9910602776066643, "learning_rate": 8.167506357170572e-06, "loss": 0.1057, "step": 2274 }, { "epoch": 1.0615671641791045, "grad_norm": 1.1072240546374714, "learning_rate": 8.163304800780975e-06, "loss": 0.1116, "step": 2276 }, { "epoch": 1.0625, "grad_norm": 1.2689376618750916, "learning_rate": 8.159099516667894e-06, "loss": 0.1141, "step": 2278 }, { "epoch": 1.0634328358208955, "grad_norm": 1.1840258899877902, "learning_rate": 8.15489050978695e-06, "loss": 0.1034, "step": 2280 }, { "epoch": 1.064365671641791, "grad_norm": 1.1334988296378041, "learning_rate": 8.150677785098149e-06, "loss": 0.1082, "step": 2282 }, { "epoch": 1.0652985074626866, "grad_norm": 1.0671351987050588, "learning_rate": 8.146461347565878e-06, "loss": 0.1073, "step": 2284 }, { "epoch": 1.0662313432835822, "grad_norm": 1.1808573891023184, "learning_rate": 8.142241202158904e-06, "loss": 0.1213, "step": 2286 }, { "epoch": 1.0671641791044777, "grad_norm": 1.0003403838040124, "learning_rate": 8.138017353850357e-06, "loss": 0.1007, "step": 2288 }, { "epoch": 1.0680970149253732, "grad_norm": 1.3889633705059334, "learning_rate": 8.133789807617734e-06, "loss": 0.1087, "step": 2290 }, { "epoch": 1.0690298507462686, "grad_norm": 1.1253435264958411, "learning_rate": 8.12955856844289e-06, "loss": 0.1125, "step": 2292 }, { "epoch": 1.069962686567164, "grad_norm": 1.0014264480234976, "learning_rate": 8.125323641312029e-06, "loss": 0.1031, "step": 2294 }, { "epoch": 1.0708955223880596, "grad_norm": 1.1620509798088041, "learning_rate": 8.121085031215705e-06, "loss": 0.1125, "step": 2296 }, { "epoch": 1.0718283582089552, "grad_norm": 0.9974077503117971, "learning_rate": 8.116842743148811e-06, "loss": 0.1073, "step": 2298 }, { "epoch": 1.0727611940298507, "grad_norm": 1.168722766350595, "learning_rate": 8.11259678211057e-06, "loss": 0.1096, "step": 2300 }, { "epoch": 1.0736940298507462, "grad_norm": 0.9697203942724552, "learning_rate": 8.108347153104543e-06, "loss": 0.106, "step": 2302 }, { "epoch": 1.0746268656716418, "grad_norm": 1.168531378243434, "learning_rate": 8.1040938611386e-06, "loss": 0.1153, "step": 2304 }, { "epoch": 1.0755597014925373, "grad_norm": 1.0871744617031431, "learning_rate": 8.099836911224938e-06, "loss": 0.1135, "step": 2306 }, { "epoch": 1.0764925373134329, "grad_norm": 1.0082360140508957, "learning_rate": 8.095576308380061e-06, "loss": 0.0853, "step": 2308 }, { "epoch": 1.0774253731343284, "grad_norm": 1.0715152510214832, "learning_rate": 8.091312057624779e-06, "loss": 0.1017, "step": 2310 }, { "epoch": 1.078358208955224, "grad_norm": 1.2738804788022269, "learning_rate": 8.087044163984197e-06, "loss": 0.1223, "step": 2312 }, { "epoch": 1.0792910447761195, "grad_norm": 1.0180530610338325, "learning_rate": 8.082772632487718e-06, "loss": 0.1042, "step": 2314 }, { "epoch": 1.080223880597015, "grad_norm": 1.3188969587079598, "learning_rate": 8.07849746816903e-06, "loss": 0.1206, "step": 2316 }, { "epoch": 1.0811567164179103, "grad_norm": 1.2444359131901108, "learning_rate": 8.074218676066102e-06, "loss": 0.1201, "step": 2318 }, { "epoch": 1.0820895522388059, "grad_norm": 0.9898485831166415, "learning_rate": 8.069936261221174e-06, "loss": 0.1051, "step": 2320 }, { "epoch": 1.0830223880597014, "grad_norm": 1.1243256433437463, "learning_rate": 8.065650228680762e-06, "loss": 0.1106, "step": 2322 }, { "epoch": 1.083955223880597, "grad_norm": 1.097399022210265, "learning_rate": 8.061360583495643e-06, "loss": 0.1193, "step": 2324 }, { "epoch": 1.0848880597014925, "grad_norm": 1.1995774170744764, "learning_rate": 8.057067330720847e-06, "loss": 0.1146, "step": 2326 }, { "epoch": 1.085820895522388, "grad_norm": 1.1476316055198292, "learning_rate": 8.052770475415661e-06, "loss": 0.1093, "step": 2328 }, { "epoch": 1.0867537313432836, "grad_norm": 1.1039623227771003, "learning_rate": 8.048470022643615e-06, "loss": 0.103, "step": 2330 }, { "epoch": 1.087686567164179, "grad_norm": 1.074020177179, "learning_rate": 8.044165977472476e-06, "loss": 0.1128, "step": 2332 }, { "epoch": 1.0886194029850746, "grad_norm": 1.0352572215204474, "learning_rate": 8.03985834497425e-06, "loss": 0.1033, "step": 2334 }, { "epoch": 1.0895522388059702, "grad_norm": 1.1609993128679654, "learning_rate": 8.035547130225164e-06, "loss": 0.1078, "step": 2336 }, { "epoch": 1.0904850746268657, "grad_norm": 1.0138191128815934, "learning_rate": 8.03123233830567e-06, "loss": 0.1008, "step": 2338 }, { "epoch": 1.0914179104477613, "grad_norm": 1.2026968052357823, "learning_rate": 8.026913974300437e-06, "loss": 0.1234, "step": 2340 }, { "epoch": 1.0923507462686568, "grad_norm": 1.2400742158660756, "learning_rate": 8.022592043298339e-06, "loss": 0.1109, "step": 2342 }, { "epoch": 1.0932835820895523, "grad_norm": 1.1441136454147998, "learning_rate": 8.018266550392457e-06, "loss": 0.1024, "step": 2344 }, { "epoch": 1.0942164179104477, "grad_norm": 1.0247183710109928, "learning_rate": 8.013937500680068e-06, "loss": 0.1032, "step": 2346 }, { "epoch": 1.0951492537313432, "grad_norm": 1.1698469519200057, "learning_rate": 8.00960489926264e-06, "loss": 0.114, "step": 2348 }, { "epoch": 1.0960820895522387, "grad_norm": 1.224534109351433, "learning_rate": 8.005268751245827e-06, "loss": 0.1112, "step": 2350 }, { "epoch": 1.0970149253731343, "grad_norm": 1.1399058033450726, "learning_rate": 8.000929061739463e-06, "loss": 0.1054, "step": 2352 }, { "epoch": 1.0979477611940298, "grad_norm": 1.1108704714855089, "learning_rate": 7.996585835857557e-06, "loss": 0.107, "step": 2354 }, { "epoch": 1.0988805970149254, "grad_norm": 1.1461037156809597, "learning_rate": 7.99223907871828e-06, "loss": 0.1228, "step": 2356 }, { "epoch": 1.099813432835821, "grad_norm": 1.1485206383984528, "learning_rate": 7.987888795443968e-06, "loss": 0.1134, "step": 2358 }, { "epoch": 1.1007462686567164, "grad_norm": 1.0878504488993384, "learning_rate": 7.983534991161113e-06, "loss": 0.1016, "step": 2360 }, { "epoch": 1.101679104477612, "grad_norm": 1.0434582793621598, "learning_rate": 7.979177671000353e-06, "loss": 0.1011, "step": 2362 }, { "epoch": 1.1026119402985075, "grad_norm": 1.0737922103362503, "learning_rate": 7.974816840096475e-06, "loss": 0.1075, "step": 2364 }, { "epoch": 1.103544776119403, "grad_norm": 1.0963592729977907, "learning_rate": 7.970452503588397e-06, "loss": 0.1031, "step": 2366 }, { "epoch": 1.1044776119402986, "grad_norm": 1.1858737460624225, "learning_rate": 7.96608466661917e-06, "loss": 0.1082, "step": 2368 }, { "epoch": 1.1054104477611941, "grad_norm": 1.0911358486899008, "learning_rate": 7.961713334335974e-06, "loss": 0.1053, "step": 2370 }, { "epoch": 1.1063432835820897, "grad_norm": 1.207881509084649, "learning_rate": 7.9573385118901e-06, "loss": 0.1118, "step": 2372 }, { "epoch": 1.107276119402985, "grad_norm": 1.130680420377781, "learning_rate": 7.952960204436959e-06, "loss": 0.1134, "step": 2374 }, { "epoch": 1.1082089552238805, "grad_norm": 1.0973475419262104, "learning_rate": 7.948578417136066e-06, "loss": 0.1058, "step": 2376 }, { "epoch": 1.109141791044776, "grad_norm": 1.0364567984508013, "learning_rate": 7.944193155151037e-06, "loss": 0.0973, "step": 2378 }, { "epoch": 1.1100746268656716, "grad_norm": 1.1250270395725783, "learning_rate": 7.939804423649582e-06, "loss": 0.111, "step": 2380 }, { "epoch": 1.1110074626865671, "grad_norm": 0.9985489805969399, "learning_rate": 7.935412227803502e-06, "loss": 0.0974, "step": 2382 }, { "epoch": 1.1119402985074627, "grad_norm": 1.074147661983881, "learning_rate": 7.931016572788676e-06, "loss": 0.1102, "step": 2384 }, { "epoch": 1.1128731343283582, "grad_norm": 1.0724172371093192, "learning_rate": 7.926617463785067e-06, "loss": 0.1123, "step": 2386 }, { "epoch": 1.1138059701492538, "grad_norm": 0.979687108625746, "learning_rate": 7.922214905976698e-06, "loss": 0.1117, "step": 2388 }, { "epoch": 1.1147388059701493, "grad_norm": 1.1046165980707205, "learning_rate": 7.917808904551662e-06, "loss": 0.1126, "step": 2390 }, { "epoch": 1.1156716417910448, "grad_norm": 1.0569140912233863, "learning_rate": 7.913399464702114e-06, "loss": 0.1043, "step": 2392 }, { "epoch": 1.1166044776119404, "grad_norm": 1.0191588533732199, "learning_rate": 7.908986591624253e-06, "loss": 0.1073, "step": 2394 }, { "epoch": 1.117537313432836, "grad_norm": 1.0089995502261069, "learning_rate": 7.90457029051833e-06, "loss": 0.0837, "step": 2396 }, { "epoch": 1.1184701492537314, "grad_norm": 1.2013326703430722, "learning_rate": 7.900150566588628e-06, "loss": 0.0964, "step": 2398 }, { "epoch": 1.1194029850746268, "grad_norm": 1.0496919708263854, "learning_rate": 7.895727425043476e-06, "loss": 0.0996, "step": 2400 }, { "epoch": 1.1203358208955223, "grad_norm": 1.2574298870825513, "learning_rate": 7.891300871095217e-06, "loss": 0.1089, "step": 2402 }, { "epoch": 1.1212686567164178, "grad_norm": 1.165238281283654, "learning_rate": 7.886870909960223e-06, "loss": 0.1061, "step": 2404 }, { "epoch": 1.1222014925373134, "grad_norm": 1.6844756064011217, "learning_rate": 7.88243754685888e-06, "loss": 0.1102, "step": 2406 }, { "epoch": 1.123134328358209, "grad_norm": 1.2887532192435922, "learning_rate": 7.87800078701558e-06, "loss": 0.1193, "step": 2408 }, { "epoch": 1.1240671641791045, "grad_norm": 1.1222346223872233, "learning_rate": 7.873560635658724e-06, "loss": 0.1042, "step": 2410 }, { "epoch": 1.125, "grad_norm": 1.0562199674654056, "learning_rate": 7.869117098020705e-06, "loss": 0.1079, "step": 2412 }, { "epoch": 1.1259328358208955, "grad_norm": 1.1049798570520013, "learning_rate": 7.864670179337904e-06, "loss": 0.1165, "step": 2414 }, { "epoch": 1.126865671641791, "grad_norm": 1.0545886326050513, "learning_rate": 7.860219884850693e-06, "loss": 0.1158, "step": 2416 }, { "epoch": 1.1277985074626866, "grad_norm": 0.9662435273081368, "learning_rate": 7.855766219803417e-06, "loss": 0.1009, "step": 2418 }, { "epoch": 1.1287313432835822, "grad_norm": 1.0122617670388458, "learning_rate": 7.851309189444396e-06, "loss": 0.097, "step": 2420 }, { "epoch": 1.1296641791044777, "grad_norm": 1.099721695389685, "learning_rate": 7.846848799025914e-06, "loss": 0.1084, "step": 2422 }, { "epoch": 1.1305970149253732, "grad_norm": 1.1184073996904251, "learning_rate": 7.842385053804214e-06, "loss": 0.1168, "step": 2424 }, { "epoch": 1.1315298507462686, "grad_norm": 1.139028773906029, "learning_rate": 7.837917959039495e-06, "loss": 0.1004, "step": 2426 }, { "epoch": 1.132462686567164, "grad_norm": 0.951833695412807, "learning_rate": 7.8334475199959e-06, "loss": 0.0978, "step": 2428 }, { "epoch": 1.1333955223880596, "grad_norm": 1.1579955713610255, "learning_rate": 7.828973741941517e-06, "loss": 0.1174, "step": 2430 }, { "epoch": 1.1343283582089552, "grad_norm": 1.0384681471468071, "learning_rate": 7.824496630148366e-06, "loss": 0.1166, "step": 2432 }, { "epoch": 1.1352611940298507, "grad_norm": 0.9963534166553635, "learning_rate": 7.820016189892391e-06, "loss": 0.0918, "step": 2434 }, { "epoch": 1.1361940298507462, "grad_norm": 1.1217727403934883, "learning_rate": 7.815532426453471e-06, "loss": 0.1028, "step": 2436 }, { "epoch": 1.1371268656716418, "grad_norm": 1.102379764515102, "learning_rate": 7.811045345115389e-06, "loss": 0.1128, "step": 2438 }, { "epoch": 1.1380597014925373, "grad_norm": 1.083350367180205, "learning_rate": 7.806554951165843e-06, "loss": 0.1056, "step": 2440 }, { "epoch": 1.1389925373134329, "grad_norm": 1.1312869810368222, "learning_rate": 7.802061249896435e-06, "loss": 0.0998, "step": 2442 }, { "epoch": 1.1399253731343284, "grad_norm": 1.0542921025787402, "learning_rate": 7.797564246602663e-06, "loss": 0.1123, "step": 2444 }, { "epoch": 1.140858208955224, "grad_norm": 1.1293486873929137, "learning_rate": 7.793063946583913e-06, "loss": 0.1039, "step": 2446 }, { "epoch": 1.1417910447761195, "grad_norm": 1.0213954185821845, "learning_rate": 7.788560355143467e-06, "loss": 0.1044, "step": 2448 }, { "epoch": 1.142723880597015, "grad_norm": 1.1584096907929313, "learning_rate": 7.784053477588474e-06, "loss": 0.1131, "step": 2450 }, { "epoch": 1.1436567164179103, "grad_norm": 1.1663773610337964, "learning_rate": 7.779543319229958e-06, "loss": 0.0995, "step": 2452 }, { "epoch": 1.1445895522388059, "grad_norm": 1.0619588002763802, "learning_rate": 7.775029885382815e-06, "loss": 0.0978, "step": 2454 }, { "epoch": 1.1455223880597014, "grad_norm": 0.977509405468944, "learning_rate": 7.770513181365794e-06, "loss": 0.102, "step": 2456 }, { "epoch": 1.146455223880597, "grad_norm": 1.080332616778311, "learning_rate": 7.765993212501502e-06, "loss": 0.1025, "step": 2458 }, { "epoch": 1.1473880597014925, "grad_norm": 1.0311526545077554, "learning_rate": 7.76146998411639e-06, "loss": 0.0899, "step": 2460 }, { "epoch": 1.148320895522388, "grad_norm": 0.9292428023774125, "learning_rate": 7.756943501540754e-06, "loss": 0.0828, "step": 2462 }, { "epoch": 1.1492537313432836, "grad_norm": 1.1061606243051447, "learning_rate": 7.752413770108723e-06, "loss": 0.1123, "step": 2464 }, { "epoch": 1.150186567164179, "grad_norm": 1.1553296981024972, "learning_rate": 7.747880795158254e-06, "loss": 0.118, "step": 2466 }, { "epoch": 1.1511194029850746, "grad_norm": 1.0859655084117374, "learning_rate": 7.743344582031125e-06, "loss": 0.1064, "step": 2468 }, { "epoch": 1.1520522388059702, "grad_norm": 1.0836789340684423, "learning_rate": 7.738805136072934e-06, "loss": 0.1071, "step": 2470 }, { "epoch": 1.1529850746268657, "grad_norm": 1.2663875819869606, "learning_rate": 7.734262462633084e-06, "loss": 0.1121, "step": 2472 }, { "epoch": 1.1539179104477613, "grad_norm": 1.356204379617711, "learning_rate": 7.729716567064787e-06, "loss": 0.1121, "step": 2474 }, { "epoch": 1.1548507462686568, "grad_norm": 1.1718719608276413, "learning_rate": 7.725167454725045e-06, "loss": 0.1134, "step": 2476 }, { "epoch": 1.1557835820895521, "grad_norm": 1.2494780252324966, "learning_rate": 7.720615130974654e-06, "loss": 0.1117, "step": 2478 }, { "epoch": 1.1567164179104479, "grad_norm": 1.1986325443330386, "learning_rate": 7.716059601178199e-06, "loss": 0.1141, "step": 2480 }, { "epoch": 1.1576492537313432, "grad_norm": 1.1556776271888145, "learning_rate": 7.711500870704036e-06, "loss": 0.1088, "step": 2482 }, { "epoch": 1.1585820895522387, "grad_norm": 0.949080655819924, "learning_rate": 7.706938944924296e-06, "loss": 0.0925, "step": 2484 }, { "epoch": 1.1595149253731343, "grad_norm": 1.001514149832634, "learning_rate": 7.702373829214873e-06, "loss": 0.1037, "step": 2486 }, { "epoch": 1.1604477611940298, "grad_norm": 1.0333838776551239, "learning_rate": 7.697805528955426e-06, "loss": 0.1149, "step": 2488 }, { "epoch": 1.1613805970149254, "grad_norm": 1.2101372951488252, "learning_rate": 7.693234049529363e-06, "loss": 0.1123, "step": 2490 }, { "epoch": 1.162313432835821, "grad_norm": 1.1237638649243655, "learning_rate": 7.688659396323834e-06, "loss": 0.1016, "step": 2492 }, { "epoch": 1.1632462686567164, "grad_norm": 1.3066181649848494, "learning_rate": 7.684081574729738e-06, "loss": 0.1041, "step": 2494 }, { "epoch": 1.164179104477612, "grad_norm": 1.0566116364025127, "learning_rate": 7.6795005901417e-06, "loss": 0.1085, "step": 2496 }, { "epoch": 1.1651119402985075, "grad_norm": 1.1430341121791319, "learning_rate": 7.674916447958076e-06, "loss": 0.1236, "step": 2498 }, { "epoch": 1.166044776119403, "grad_norm": 1.102188707092688, "learning_rate": 7.670329153580944e-06, "loss": 0.1083, "step": 2500 }, { "epoch": 1.166044776119403, "eval_loss": 0.1655130684375763, "eval_runtime": 320.1478, "eval_samples_per_second": 47.622, "eval_steps_per_second": 5.954, "step": 2500 }, { "epoch": 1.1669776119402986, "grad_norm": 1.075540598962944, "learning_rate": 7.665738712416094e-06, "loss": 0.1042, "step": 2502 }, { "epoch": 1.1679104477611941, "grad_norm": 1.154083558254164, "learning_rate": 7.661145129873026e-06, "loss": 0.107, "step": 2504 }, { "epoch": 1.1688432835820897, "grad_norm": 1.1267362871800328, "learning_rate": 7.656548411364939e-06, "loss": 0.116, "step": 2506 }, { "epoch": 1.169776119402985, "grad_norm": 1.1551330400864188, "learning_rate": 7.651948562308734e-06, "loss": 0.1145, "step": 2508 }, { "epoch": 1.1707089552238805, "grad_norm": 1.1647719703950468, "learning_rate": 7.647345588124993e-06, "loss": 0.1191, "step": 2510 }, { "epoch": 1.171641791044776, "grad_norm": 1.0514550342139921, "learning_rate": 7.642739494237986e-06, "loss": 0.116, "step": 2512 }, { "epoch": 1.1725746268656716, "grad_norm": 1.1039036169281318, "learning_rate": 7.638130286075658e-06, "loss": 0.1026, "step": 2514 }, { "epoch": 1.1735074626865671, "grad_norm": 1.1065986684739848, "learning_rate": 7.633517969069626e-06, "loss": 0.1217, "step": 2516 }, { "epoch": 1.1744402985074627, "grad_norm": 1.0843201188394793, "learning_rate": 7.628902548655164e-06, "loss": 0.1008, "step": 2518 }, { "epoch": 1.1753731343283582, "grad_norm": 1.1248288378872542, "learning_rate": 7.624284030271211e-06, "loss": 0.0976, "step": 2520 }, { "epoch": 1.1763059701492538, "grad_norm": 1.1004513739490172, "learning_rate": 7.619662419360353e-06, "loss": 0.1079, "step": 2522 }, { "epoch": 1.1772388059701493, "grad_norm": 0.998667555753008, "learning_rate": 7.615037721368818e-06, "loss": 0.1027, "step": 2524 }, { "epoch": 1.1781716417910448, "grad_norm": 1.1921378641046936, "learning_rate": 7.610409941746479e-06, "loss": 0.1158, "step": 2526 }, { "epoch": 1.1791044776119404, "grad_norm": 1.1471542350970025, "learning_rate": 7.6057790859468316e-06, "loss": 0.1099, "step": 2528 }, { "epoch": 1.180037313432836, "grad_norm": 1.0553094374178313, "learning_rate": 7.601145159427004e-06, "loss": 0.1006, "step": 2530 }, { "epoch": 1.1809701492537314, "grad_norm": 1.1869004833375891, "learning_rate": 7.5965081676477385e-06, "loss": 0.1178, "step": 2532 }, { "epoch": 1.1819029850746268, "grad_norm": 1.2649904595325534, "learning_rate": 7.591868116073391e-06, "loss": 0.1218, "step": 2534 }, { "epoch": 1.1828358208955223, "grad_norm": 1.0987236321719478, "learning_rate": 7.587225010171921e-06, "loss": 0.1178, "step": 2536 }, { "epoch": 1.1837686567164178, "grad_norm": 0.9932961409517518, "learning_rate": 7.582578855414895e-06, "loss": 0.0971, "step": 2538 }, { "epoch": 1.1847014925373134, "grad_norm": 1.0939650321271175, "learning_rate": 7.577929657277462e-06, "loss": 0.1085, "step": 2540 }, { "epoch": 1.185634328358209, "grad_norm": 1.0200331172950676, "learning_rate": 7.573277421238363e-06, "loss": 0.104, "step": 2542 }, { "epoch": 1.1865671641791045, "grad_norm": 1.5142382214048933, "learning_rate": 7.56862215277992e-06, "loss": 0.1103, "step": 2544 }, { "epoch": 1.1875, "grad_norm": 1.1075958125142682, "learning_rate": 7.5639638573880245e-06, "loss": 0.1119, "step": 2546 }, { "epoch": 1.1884328358208955, "grad_norm": 1.0490123312729123, "learning_rate": 7.559302540552138e-06, "loss": 0.1028, "step": 2548 }, { "epoch": 1.189365671641791, "grad_norm": 1.2515364153015263, "learning_rate": 7.554638207765281e-06, "loss": 0.1043, "step": 2550 }, { "epoch": 1.1902985074626866, "grad_norm": 1.096987817928207, "learning_rate": 7.5499708645240295e-06, "loss": 0.1064, "step": 2552 }, { "epoch": 1.1912313432835822, "grad_norm": 1.138866245588538, "learning_rate": 7.545300516328508e-06, "loss": 0.1116, "step": 2554 }, { "epoch": 1.1921641791044777, "grad_norm": 1.1139702774600553, "learning_rate": 7.540627168682377e-06, "loss": 0.1063, "step": 2556 }, { "epoch": 1.1930970149253732, "grad_norm": 1.1942392031376736, "learning_rate": 7.535950827092837e-06, "loss": 0.1124, "step": 2558 }, { "epoch": 1.1940298507462686, "grad_norm": 1.107898636344152, "learning_rate": 7.531271497070616e-06, "loss": 0.1029, "step": 2560 }, { "epoch": 1.194962686567164, "grad_norm": 1.0487706571983737, "learning_rate": 7.5265891841299575e-06, "loss": 0.0976, "step": 2562 }, { "epoch": 1.1958955223880596, "grad_norm": 1.1247688058216387, "learning_rate": 7.521903893788631e-06, "loss": 0.1118, "step": 2564 }, { "epoch": 1.1968283582089552, "grad_norm": 1.113193806479794, "learning_rate": 7.517215631567905e-06, "loss": 0.0991, "step": 2566 }, { "epoch": 1.1977611940298507, "grad_norm": 1.164127094358999, "learning_rate": 7.512524402992556e-06, "loss": 0.1112, "step": 2568 }, { "epoch": 1.1986940298507462, "grad_norm": 1.2170553977796374, "learning_rate": 7.507830213590852e-06, "loss": 0.1065, "step": 2570 }, { "epoch": 1.1996268656716418, "grad_norm": 0.9797528206892345, "learning_rate": 7.503133068894554e-06, "loss": 0.0887, "step": 2572 }, { "epoch": 1.2005597014925373, "grad_norm": 1.3268016153678397, "learning_rate": 7.4984329744389024e-06, "loss": 0.1069, "step": 2574 }, { "epoch": 1.2014925373134329, "grad_norm": 1.0387475686650887, "learning_rate": 7.493729935762615e-06, "loss": 0.101, "step": 2576 }, { "epoch": 1.2024253731343284, "grad_norm": 1.2411385386713536, "learning_rate": 7.489023958407878e-06, "loss": 0.1154, "step": 2578 }, { "epoch": 1.203358208955224, "grad_norm": 1.070976967240602, "learning_rate": 7.484315047920345e-06, "loss": 0.0968, "step": 2580 }, { "epoch": 1.2042910447761195, "grad_norm": 1.0279259296417749, "learning_rate": 7.479603209849121e-06, "loss": 0.101, "step": 2582 }, { "epoch": 1.205223880597015, "grad_norm": 1.1949054937326204, "learning_rate": 7.474888449746761e-06, "loss": 0.1042, "step": 2584 }, { "epoch": 1.2061567164179103, "grad_norm": 1.0934519694318268, "learning_rate": 7.470170773169268e-06, "loss": 0.1092, "step": 2586 }, { "epoch": 1.2070895522388059, "grad_norm": 1.2455261734049656, "learning_rate": 7.465450185676079e-06, "loss": 0.1141, "step": 2588 }, { "epoch": 1.2080223880597014, "grad_norm": 1.133881071885915, "learning_rate": 7.460726692830057e-06, "loss": 0.1131, "step": 2590 }, { "epoch": 1.208955223880597, "grad_norm": 1.1078991969471512, "learning_rate": 7.456000300197498e-06, "loss": 0.1106, "step": 2592 }, { "epoch": 1.2098880597014925, "grad_norm": 1.0919539546133117, "learning_rate": 7.4512710133481084e-06, "loss": 0.1101, "step": 2594 }, { "epoch": 1.210820895522388, "grad_norm": 1.042048448194764, "learning_rate": 7.446538837855006e-06, "loss": 0.1048, "step": 2596 }, { "epoch": 1.2117537313432836, "grad_norm": 1.146176533937742, "learning_rate": 7.4418037792947165e-06, "loss": 0.1099, "step": 2598 }, { "epoch": 1.212686567164179, "grad_norm": 1.1797348432147379, "learning_rate": 7.437065843247158e-06, "loss": 0.1036, "step": 2600 }, { "epoch": 1.2136194029850746, "grad_norm": 0.994822974236348, "learning_rate": 7.432325035295641e-06, "loss": 0.1041, "step": 2602 }, { "epoch": 1.2145522388059702, "grad_norm": 1.1477646285232344, "learning_rate": 7.427581361026863e-06, "loss": 0.1056, "step": 2604 }, { "epoch": 1.2154850746268657, "grad_norm": 1.0941356228947834, "learning_rate": 7.422834826030898e-06, "loss": 0.094, "step": 2606 }, { "epoch": 1.2164179104477613, "grad_norm": 1.151960814484258, "learning_rate": 7.418085435901189e-06, "loss": 0.101, "step": 2608 }, { "epoch": 1.2173507462686568, "grad_norm": 1.0873573706860375, "learning_rate": 7.413333196234545e-06, "loss": 0.0928, "step": 2610 }, { "epoch": 1.2182835820895521, "grad_norm": 1.237995409805719, "learning_rate": 7.408578112631135e-06, "loss": 0.1203, "step": 2612 }, { "epoch": 1.2192164179104479, "grad_norm": 1.0716183891826505, "learning_rate": 7.4038201906944755e-06, "loss": 0.0928, "step": 2614 }, { "epoch": 1.2201492537313432, "grad_norm": 1.1216445774836254, "learning_rate": 7.399059436031428e-06, "loss": 0.1069, "step": 2616 }, { "epoch": 1.2210820895522387, "grad_norm": 1.1611530997551531, "learning_rate": 7.3942958542522e-06, "loss": 0.1117, "step": 2618 }, { "epoch": 1.2220149253731343, "grad_norm": 1.2020861032231644, "learning_rate": 7.389529450970318e-06, "loss": 0.1165, "step": 2620 }, { "epoch": 1.2229477611940298, "grad_norm": 1.0324165413720954, "learning_rate": 7.384760231802643e-06, "loss": 0.1127, "step": 2622 }, { "epoch": 1.2238805970149254, "grad_norm": 1.1018413802488094, "learning_rate": 7.37998820236935e-06, "loss": 0.1063, "step": 2624 }, { "epoch": 1.224813432835821, "grad_norm": 1.0537352628084051, "learning_rate": 7.375213368293928e-06, "loss": 0.1049, "step": 2626 }, { "epoch": 1.2257462686567164, "grad_norm": 1.019181409127792, "learning_rate": 7.3704357352031705e-06, "loss": 0.1033, "step": 2628 }, { "epoch": 1.226679104477612, "grad_norm": 0.9836523233476584, "learning_rate": 7.365655308727167e-06, "loss": 0.1031, "step": 2630 }, { "epoch": 1.2276119402985075, "grad_norm": 1.1346228610688998, "learning_rate": 7.360872094499303e-06, "loss": 0.1076, "step": 2632 }, { "epoch": 1.228544776119403, "grad_norm": 1.0928536395888118, "learning_rate": 7.356086098156243e-06, "loss": 0.1058, "step": 2634 }, { "epoch": 1.2294776119402986, "grad_norm": 1.1029370545035155, "learning_rate": 7.351297325337936e-06, "loss": 0.1021, "step": 2636 }, { "epoch": 1.2304104477611941, "grad_norm": 1.2199129344325508, "learning_rate": 7.346505781687604e-06, "loss": 0.1189, "step": 2638 }, { "epoch": 1.2313432835820897, "grad_norm": 1.2933072752649069, "learning_rate": 7.341711472851726e-06, "loss": 0.1139, "step": 2640 }, { "epoch": 1.232276119402985, "grad_norm": 1.1456399722142987, "learning_rate": 7.336914404480046e-06, "loss": 0.1042, "step": 2642 }, { "epoch": 1.2332089552238805, "grad_norm": 1.1477447368664087, "learning_rate": 7.33211458222556e-06, "loss": 0.119, "step": 2644 }, { "epoch": 1.234141791044776, "grad_norm": 1.1902219020235507, "learning_rate": 7.327312011744505e-06, "loss": 0.112, "step": 2646 }, { "epoch": 1.2350746268656716, "grad_norm": 1.009572556191043, "learning_rate": 7.322506698696361e-06, "loss": 0.1087, "step": 2648 }, { "epoch": 1.2360074626865671, "grad_norm": 1.1156522327158893, "learning_rate": 7.3176986487438385e-06, "loss": 0.1022, "step": 2650 }, { "epoch": 1.2369402985074627, "grad_norm": 1.1797053137137803, "learning_rate": 7.312887867552873e-06, "loss": 0.1133, "step": 2652 }, { "epoch": 1.2378731343283582, "grad_norm": 1.1513873160913002, "learning_rate": 7.308074360792617e-06, "loss": 0.1229, "step": 2654 }, { "epoch": 1.2388059701492538, "grad_norm": 1.2014653170170158, "learning_rate": 7.303258134135437e-06, "loss": 0.1176, "step": 2656 }, { "epoch": 1.2397388059701493, "grad_norm": 1.1123959416781903, "learning_rate": 7.298439193256905e-06, "loss": 0.1087, "step": 2658 }, { "epoch": 1.2406716417910448, "grad_norm": 1.0567589662153143, "learning_rate": 7.293617543835789e-06, "loss": 0.1053, "step": 2660 }, { "epoch": 1.2416044776119404, "grad_norm": 1.1060400319996038, "learning_rate": 7.288793191554051e-06, "loss": 0.1133, "step": 2662 }, { "epoch": 1.242537313432836, "grad_norm": 1.1364081871037819, "learning_rate": 7.28396614209684e-06, "loss": 0.1041, "step": 2664 }, { "epoch": 1.2434701492537314, "grad_norm": 1.088808255285531, "learning_rate": 7.279136401152477e-06, "loss": 0.1022, "step": 2666 }, { "epoch": 1.2444029850746268, "grad_norm": 1.2489080995838342, "learning_rate": 7.27430397441246e-06, "loss": 0.1049, "step": 2668 }, { "epoch": 1.2453358208955223, "grad_norm": 1.0974154866574723, "learning_rate": 7.269468867571453e-06, "loss": 0.1075, "step": 2670 }, { "epoch": 1.2462686567164178, "grad_norm": 1.1336477031064254, "learning_rate": 7.2646310863272725e-06, "loss": 0.1133, "step": 2672 }, { "epoch": 1.2472014925373134, "grad_norm": 1.3686960576490796, "learning_rate": 7.259790636380892e-06, "loss": 0.1132, "step": 2674 }, { "epoch": 1.248134328358209, "grad_norm": 1.1912324914569272, "learning_rate": 7.254947523436427e-06, "loss": 0.109, "step": 2676 }, { "epoch": 1.2490671641791045, "grad_norm": 1.1268319243851983, "learning_rate": 7.250101753201134e-06, "loss": 0.1038, "step": 2678 }, { "epoch": 1.25, "grad_norm": 1.11821848462912, "learning_rate": 7.2452533313853976e-06, "loss": 0.1189, "step": 2680 }, { "epoch": 1.2509328358208955, "grad_norm": 1.0555462132424303, "learning_rate": 7.240402263702729e-06, "loss": 0.1162, "step": 2682 }, { "epoch": 1.251865671641791, "grad_norm": 1.0732946400255794, "learning_rate": 7.235548555869755e-06, "loss": 0.0932, "step": 2684 }, { "epoch": 1.2527985074626866, "grad_norm": 1.2294857763959475, "learning_rate": 7.230692213606218e-06, "loss": 0.1053, "step": 2686 }, { "epoch": 1.2537313432835822, "grad_norm": 1.2830161270144966, "learning_rate": 7.225833242634961e-06, "loss": 0.1021, "step": 2688 }, { "epoch": 1.2546641791044777, "grad_norm": 1.0594775319709628, "learning_rate": 7.2209716486819255e-06, "loss": 0.1071, "step": 2690 }, { "epoch": 1.2555970149253732, "grad_norm": 1.0805250136382083, "learning_rate": 7.216107437476148e-06, "loss": 0.1091, "step": 2692 }, { "epoch": 1.2565298507462686, "grad_norm": 1.2708581553975034, "learning_rate": 7.211240614749741e-06, "loss": 0.1226, "step": 2694 }, { "epoch": 1.2574626865671643, "grad_norm": 1.1388558899202947, "learning_rate": 7.206371186237904e-06, "loss": 0.109, "step": 2696 }, { "epoch": 1.2583955223880596, "grad_norm": 0.9995337654837192, "learning_rate": 7.201499157678899e-06, "loss": 0.1023, "step": 2698 }, { "epoch": 1.2593283582089552, "grad_norm": 1.1355489085753432, "learning_rate": 7.196624534814056e-06, "loss": 0.1129, "step": 2700 }, { "epoch": 1.2602611940298507, "grad_norm": 0.9846278711768532, "learning_rate": 7.191747323387764e-06, "loss": 0.0993, "step": 2702 }, { "epoch": 1.2611940298507462, "grad_norm": 1.1396072697603832, "learning_rate": 7.18686752914746e-06, "loss": 0.1117, "step": 2704 }, { "epoch": 1.2621268656716418, "grad_norm": 1.0983419636059346, "learning_rate": 7.1819851578436205e-06, "loss": 0.1078, "step": 2706 }, { "epoch": 1.2630597014925373, "grad_norm": 1.180833243987259, "learning_rate": 7.177100215229769e-06, "loss": 0.102, "step": 2708 }, { "epoch": 1.2639925373134329, "grad_norm": 1.0810310626293518, "learning_rate": 7.172212707062449e-06, "loss": 0.1168, "step": 2710 }, { "epoch": 1.2649253731343284, "grad_norm": 1.0619275048995278, "learning_rate": 7.167322639101235e-06, "loss": 0.1016, "step": 2712 }, { "epoch": 1.265858208955224, "grad_norm": 1.129749324815062, "learning_rate": 7.162430017108711e-06, "loss": 0.1057, "step": 2714 }, { "epoch": 1.2667910447761195, "grad_norm": 1.2492622710805221, "learning_rate": 7.157534846850478e-06, "loss": 0.1132, "step": 2716 }, { "epoch": 1.267723880597015, "grad_norm": 1.1738874513300692, "learning_rate": 7.152637134095133e-06, "loss": 0.1123, "step": 2718 }, { "epoch": 1.2686567164179103, "grad_norm": 1.175424747582013, "learning_rate": 7.147736884614274e-06, "loss": 0.1119, "step": 2720 }, { "epoch": 1.269589552238806, "grad_norm": 1.0632777721964533, "learning_rate": 7.142834104182489e-06, "loss": 0.1076, "step": 2722 }, { "epoch": 1.2705223880597014, "grad_norm": 1.1108504379493909, "learning_rate": 7.137928798577342e-06, "loss": 0.1184, "step": 2724 }, { "epoch": 1.271455223880597, "grad_norm": 1.0981340847743442, "learning_rate": 7.133020973579381e-06, "loss": 0.1045, "step": 2726 }, { "epoch": 1.2723880597014925, "grad_norm": 1.1097340538573355, "learning_rate": 7.128110634972117e-06, "loss": 0.0993, "step": 2728 }, { "epoch": 1.273320895522388, "grad_norm": 1.143718838435375, "learning_rate": 7.1231977885420256e-06, "loss": 0.1077, "step": 2730 }, { "epoch": 1.2742537313432836, "grad_norm": 1.0623841355541739, "learning_rate": 7.118282440078535e-06, "loss": 0.1041, "step": 2732 }, { "epoch": 1.275186567164179, "grad_norm": 1.0461047747851933, "learning_rate": 7.1133645953740285e-06, "loss": 0.0984, "step": 2734 }, { "epoch": 1.2761194029850746, "grad_norm": 1.1239903002286689, "learning_rate": 7.108444260223825e-06, "loss": 0.1137, "step": 2736 }, { "epoch": 1.2770522388059702, "grad_norm": 1.076943725702056, "learning_rate": 7.1035214404261775e-06, "loss": 0.1095, "step": 2738 }, { "epoch": 1.2779850746268657, "grad_norm": 1.0737252233536958, "learning_rate": 7.098596141782271e-06, "loss": 0.1132, "step": 2740 }, { "epoch": 1.2789179104477613, "grad_norm": 1.085984333930034, "learning_rate": 7.093668370096211e-06, "loss": 0.1091, "step": 2742 }, { "epoch": 1.2798507462686568, "grad_norm": 1.1152908964169612, "learning_rate": 7.088738131175014e-06, "loss": 0.1033, "step": 2744 }, { "epoch": 1.2807835820895521, "grad_norm": 1.0239521359044592, "learning_rate": 7.083805430828608e-06, "loss": 0.0963, "step": 2746 }, { "epoch": 1.2817164179104479, "grad_norm": 1.1278740790491524, "learning_rate": 7.078870274869818e-06, "loss": 0.1027, "step": 2748 }, { "epoch": 1.2826492537313432, "grad_norm": 1.1006667101736127, "learning_rate": 7.073932669114367e-06, "loss": 0.1049, "step": 2750 }, { "epoch": 1.2835820895522387, "grad_norm": 1.0385121901630985, "learning_rate": 7.068992619380859e-06, "loss": 0.1126, "step": 2752 }, { "epoch": 1.2845149253731343, "grad_norm": 1.1816975709935262, "learning_rate": 7.064050131490785e-06, "loss": 0.0966, "step": 2754 }, { "epoch": 1.2854477611940298, "grad_norm": 1.0149462286666746, "learning_rate": 7.0591052112685055e-06, "loss": 0.095, "step": 2756 }, { "epoch": 1.2863805970149254, "grad_norm": 1.1164943007592327, "learning_rate": 7.0541578645412445e-06, "loss": 0.1126, "step": 2758 }, { "epoch": 1.287313432835821, "grad_norm": 1.1310798223158123, "learning_rate": 7.049208097139091e-06, "loss": 0.1107, "step": 2760 }, { "epoch": 1.2882462686567164, "grad_norm": 1.021054986400497, "learning_rate": 7.044255914894984e-06, "loss": 0.0971, "step": 2762 }, { "epoch": 1.289179104477612, "grad_norm": 1.0478562360912111, "learning_rate": 7.039301323644708e-06, "loss": 0.1075, "step": 2764 }, { "epoch": 1.2901119402985075, "grad_norm": 1.0362026319650983, "learning_rate": 7.034344329226885e-06, "loss": 0.1045, "step": 2766 }, { "epoch": 1.291044776119403, "grad_norm": 1.0367226653554475, "learning_rate": 7.029384937482973e-06, "loss": 0.1093, "step": 2768 }, { "epoch": 1.2919776119402986, "grad_norm": 1.027171436486602, "learning_rate": 7.024423154257251e-06, "loss": 0.1082, "step": 2770 }, { "epoch": 1.292910447761194, "grad_norm": 1.2466632109908478, "learning_rate": 7.019458985396817e-06, "loss": 0.1102, "step": 2772 }, { "epoch": 1.2938432835820897, "grad_norm": 0.9880084667022045, "learning_rate": 7.0144924367515855e-06, "loss": 0.1074, "step": 2774 }, { "epoch": 1.294776119402985, "grad_norm": 1.1544989497995124, "learning_rate": 7.009523514174266e-06, "loss": 0.1028, "step": 2776 }, { "epoch": 1.2957089552238805, "grad_norm": 1.1720849319609559, "learning_rate": 7.004552223520372e-06, "loss": 0.1113, "step": 2778 }, { "epoch": 1.296641791044776, "grad_norm": 1.0462960415683409, "learning_rate": 6.999578570648209e-06, "loss": 0.0946, "step": 2780 }, { "epoch": 1.2975746268656716, "grad_norm": 0.9598856162552585, "learning_rate": 6.994602561418861e-06, "loss": 0.0938, "step": 2782 }, { "epoch": 1.2985074626865671, "grad_norm": 1.022033946406963, "learning_rate": 6.98962420169619e-06, "loss": 0.1024, "step": 2784 }, { "epoch": 1.2994402985074627, "grad_norm": 0.953687394561481, "learning_rate": 6.984643497346832e-06, "loss": 0.0925, "step": 2786 }, { "epoch": 1.3003731343283582, "grad_norm": 1.1287244050322118, "learning_rate": 6.979660454240181e-06, "loss": 0.1081, "step": 2788 }, { "epoch": 1.3013059701492538, "grad_norm": 1.1709327211576885, "learning_rate": 6.974675078248387e-06, "loss": 0.1035, "step": 2790 }, { "epoch": 1.3022388059701493, "grad_norm": 0.944588713647086, "learning_rate": 6.969687375246355e-06, "loss": 0.0959, "step": 2792 }, { "epoch": 1.3031716417910448, "grad_norm": 1.1472249598687618, "learning_rate": 6.9646973511117285e-06, "loss": 0.1188, "step": 2794 }, { "epoch": 1.3041044776119404, "grad_norm": 1.0929867903515014, "learning_rate": 6.959705011724884e-06, "loss": 0.1185, "step": 2796 }, { "epoch": 1.3050373134328357, "grad_norm": 1.2660405928601028, "learning_rate": 6.954710362968929e-06, "loss": 0.1176, "step": 2798 }, { "epoch": 1.3059701492537314, "grad_norm": 1.2582437720859714, "learning_rate": 6.9497134107296925e-06, "loss": 0.1118, "step": 2800 }, { "epoch": 1.3069029850746268, "grad_norm": 1.0001790588845447, "learning_rate": 6.944714160895717e-06, "loss": 0.1059, "step": 2802 }, { "epoch": 1.3078358208955223, "grad_norm": 1.089422532633587, "learning_rate": 6.939712619358252e-06, "loss": 0.1151, "step": 2804 }, { "epoch": 1.3087686567164178, "grad_norm": 1.1839127783632801, "learning_rate": 6.934708792011251e-06, "loss": 0.1115, "step": 2806 }, { "epoch": 1.3097014925373134, "grad_norm": 1.0133909049794272, "learning_rate": 6.92970268475136e-06, "loss": 0.1097, "step": 2808 }, { "epoch": 1.310634328358209, "grad_norm": 1.0099610884578727, "learning_rate": 6.924694303477904e-06, "loss": 0.1071, "step": 2810 }, { "epoch": 1.3115671641791045, "grad_norm": 1.0665431964123495, "learning_rate": 6.9196836540929e-06, "loss": 0.1166, "step": 2812 }, { "epoch": 1.3125, "grad_norm": 1.12726023448818, "learning_rate": 6.914670742501032e-06, "loss": 0.1117, "step": 2814 }, { "epoch": 1.3134328358208955, "grad_norm": 1.143118704139726, "learning_rate": 6.909655574609645e-06, "loss": 0.0947, "step": 2816 }, { "epoch": 1.314365671641791, "grad_norm": 1.0182486712078969, "learning_rate": 6.904638156328754e-06, "loss": 0.1013, "step": 2818 }, { "epoch": 1.3152985074626866, "grad_norm": 1.058792333177847, "learning_rate": 6.899618493571015e-06, "loss": 0.0897, "step": 2820 }, { "epoch": 1.3162313432835822, "grad_norm": 1.0291529853253425, "learning_rate": 6.894596592251735e-06, "loss": 0.1134, "step": 2822 }, { "epoch": 1.3171641791044777, "grad_norm": 1.1282168182539591, "learning_rate": 6.889572458288859e-06, "loss": 0.1042, "step": 2824 }, { "epoch": 1.3180970149253732, "grad_norm": 1.1033786820010723, "learning_rate": 6.88454609760296e-06, "loss": 0.1089, "step": 2826 }, { "epoch": 1.3190298507462686, "grad_norm": 1.2289680381551906, "learning_rate": 6.879517516117238e-06, "loss": 0.1071, "step": 2828 }, { "epoch": 1.3199626865671643, "grad_norm": 1.058572972863236, "learning_rate": 6.874486719757507e-06, "loss": 0.1047, "step": 2830 }, { "epoch": 1.3208955223880596, "grad_norm": 1.0880290932451049, "learning_rate": 6.869453714452194e-06, "loss": 0.1096, "step": 2832 }, { "epoch": 1.3218283582089552, "grad_norm": 1.0614679694534637, "learning_rate": 6.8644185061323284e-06, "loss": 0.1104, "step": 2834 }, { "epoch": 1.3227611940298507, "grad_norm": 1.1761386044220195, "learning_rate": 6.859381100731534e-06, "loss": 0.1085, "step": 2836 }, { "epoch": 1.3236940298507462, "grad_norm": 1.1925865365449022, "learning_rate": 6.854341504186025e-06, "loss": 0.1216, "step": 2838 }, { "epoch": 1.3246268656716418, "grad_norm": 1.238843754128432, "learning_rate": 6.849299722434599e-06, "loss": 0.1262, "step": 2840 }, { "epoch": 1.3255597014925373, "grad_norm": 1.017241405899312, "learning_rate": 6.844255761418625e-06, "loss": 0.1001, "step": 2842 }, { "epoch": 1.3264925373134329, "grad_norm": 1.019967528903183, "learning_rate": 6.839209627082043e-06, "loss": 0.097, "step": 2844 }, { "epoch": 1.3274253731343284, "grad_norm": 0.9335114561733073, "learning_rate": 6.834161325371354e-06, "loss": 0.1146, "step": 2846 }, { "epoch": 1.328358208955224, "grad_norm": 1.0111655177954493, "learning_rate": 6.829110862235614e-06, "loss": 0.0997, "step": 2848 }, { "epoch": 1.3292910447761195, "grad_norm": 1.05426667874679, "learning_rate": 6.824058243626421e-06, "loss": 0.0946, "step": 2850 }, { "epoch": 1.330223880597015, "grad_norm": 1.0388519317641882, "learning_rate": 6.819003475497921e-06, "loss": 0.0989, "step": 2852 }, { "epoch": 1.3311567164179103, "grad_norm": 1.2966889294733428, "learning_rate": 6.813946563806785e-06, "loss": 0.1201, "step": 2854 }, { "epoch": 1.332089552238806, "grad_norm": 0.9525414046629391, "learning_rate": 6.808887514512215e-06, "loss": 0.1066, "step": 2856 }, { "epoch": 1.3330223880597014, "grad_norm": 1.1334560470038777, "learning_rate": 6.803826333575931e-06, "loss": 0.1008, "step": 2858 }, { "epoch": 1.333955223880597, "grad_norm": 1.327554826453119, "learning_rate": 6.798763026962167e-06, "loss": 0.1175, "step": 2860 }, { "epoch": 1.3348880597014925, "grad_norm": 1.0639321575109582, "learning_rate": 6.793697600637655e-06, "loss": 0.0968, "step": 2862 }, { "epoch": 1.335820895522388, "grad_norm": 1.100781208211592, "learning_rate": 6.788630060571634e-06, "loss": 0.1205, "step": 2864 }, { "epoch": 1.3367537313432836, "grad_norm": 1.1156886283722336, "learning_rate": 6.783560412735828e-06, "loss": 0.1055, "step": 2866 }, { "epoch": 1.337686567164179, "grad_norm": 1.0748808306586775, "learning_rate": 6.7784886631044456e-06, "loss": 0.1029, "step": 2868 }, { "epoch": 1.3386194029850746, "grad_norm": 1.133633135201276, "learning_rate": 6.773414817654174e-06, "loss": 0.1074, "step": 2870 }, { "epoch": 1.3395522388059702, "grad_norm": 1.1334666060853869, "learning_rate": 6.7683388823641686e-06, "loss": 0.1016, "step": 2872 }, { "epoch": 1.3404850746268657, "grad_norm": 1.6191482296264594, "learning_rate": 6.763260863216048e-06, "loss": 0.1028, "step": 2874 }, { "epoch": 1.3414179104477613, "grad_norm": 1.2987734323409736, "learning_rate": 6.758180766193887e-06, "loss": 0.1257, "step": 2876 }, { "epoch": 1.3423507462686568, "grad_norm": 1.149422135906203, "learning_rate": 6.75309859728421e-06, "loss": 0.12, "step": 2878 }, { "epoch": 1.3432835820895521, "grad_norm": 1.2063697935527302, "learning_rate": 6.748014362475979e-06, "loss": 0.1204, "step": 2880 }, { "epoch": 1.3442164179104479, "grad_norm": 1.064773396784815, "learning_rate": 6.742928067760595e-06, "loss": 0.0977, "step": 2882 }, { "epoch": 1.3451492537313432, "grad_norm": 1.0736709924243466, "learning_rate": 6.737839719131882e-06, "loss": 0.1056, "step": 2884 }, { "epoch": 1.3460820895522387, "grad_norm": 1.1412493298954451, "learning_rate": 6.732749322586091e-06, "loss": 0.1101, "step": 2886 }, { "epoch": 1.3470149253731343, "grad_norm": 1.185933919889182, "learning_rate": 6.727656884121878e-06, "loss": 0.113, "step": 2888 }, { "epoch": 1.3479477611940298, "grad_norm": 1.2160809535817252, "learning_rate": 6.722562409740312e-06, "loss": 0.1141, "step": 2890 }, { "epoch": 1.3488805970149254, "grad_norm": 1.1575740233480094, "learning_rate": 6.71746590544486e-06, "loss": 0.1094, "step": 2892 }, { "epoch": 1.349813432835821, "grad_norm": 1.1035081404168656, "learning_rate": 6.712367377241375e-06, "loss": 0.0995, "step": 2894 }, { "epoch": 1.3507462686567164, "grad_norm": 1.1442566521045665, "learning_rate": 6.707266831138104e-06, "loss": 0.1171, "step": 2896 }, { "epoch": 1.351679104477612, "grad_norm": 1.167856879029797, "learning_rate": 6.702164273145667e-06, "loss": 0.1141, "step": 2898 }, { "epoch": 1.3526119402985075, "grad_norm": 1.0597246120572674, "learning_rate": 6.6970597092770535e-06, "loss": 0.1067, "step": 2900 }, { "epoch": 1.353544776119403, "grad_norm": 1.2212308368679017, "learning_rate": 6.6919531455476214e-06, "loss": 0.1119, "step": 2902 }, { "epoch": 1.3544776119402986, "grad_norm": 1.1023494393486082, "learning_rate": 6.6868445879750824e-06, "loss": 0.1078, "step": 2904 }, { "epoch": 1.355410447761194, "grad_norm": 1.0895226818200237, "learning_rate": 6.681734042579496e-06, "loss": 0.0952, "step": 2906 }, { "epoch": 1.3563432835820897, "grad_norm": 1.1142486667881042, "learning_rate": 6.6766215153832705e-06, "loss": 0.1128, "step": 2908 }, { "epoch": 1.357276119402985, "grad_norm": 1.2709531893368116, "learning_rate": 6.671507012411141e-06, "loss": 0.1117, "step": 2910 }, { "epoch": 1.3582089552238805, "grad_norm": 0.9951773324160267, "learning_rate": 6.6663905396901784e-06, "loss": 0.1023, "step": 2912 }, { "epoch": 1.359141791044776, "grad_norm": 1.200134402201827, "learning_rate": 6.661272103249771e-06, "loss": 0.1, "step": 2914 }, { "epoch": 1.3600746268656716, "grad_norm": 1.2228286735043175, "learning_rate": 6.6561517091216195e-06, "loss": 0.1152, "step": 2916 }, { "epoch": 1.3610074626865671, "grad_norm": 1.0505205938945563, "learning_rate": 6.651029363339739e-06, "loss": 0.1039, "step": 2918 }, { "epoch": 1.3619402985074627, "grad_norm": 1.09838183594541, "learning_rate": 6.645905071940436e-06, "loss": 0.1042, "step": 2920 }, { "epoch": 1.3628731343283582, "grad_norm": 1.053570553754682, "learning_rate": 6.6407788409623145e-06, "loss": 0.114, "step": 2922 }, { "epoch": 1.3638059701492538, "grad_norm": 1.167119974675872, "learning_rate": 6.6356506764462645e-06, "loss": 0.1018, "step": 2924 }, { "epoch": 1.3647388059701493, "grad_norm": 1.1074407954655574, "learning_rate": 6.630520584435449e-06, "loss": 0.0988, "step": 2926 }, { "epoch": 1.3656716417910448, "grad_norm": 1.0604558516149023, "learning_rate": 6.625388570975309e-06, "loss": 0.1031, "step": 2928 }, { "epoch": 1.3666044776119404, "grad_norm": 1.2005477049834712, "learning_rate": 6.620254642113549e-06, "loss": 0.1132, "step": 2930 }, { "epoch": 1.3675373134328357, "grad_norm": 1.090950226405584, "learning_rate": 6.615118803900126e-06, "loss": 0.1008, "step": 2932 }, { "epoch": 1.3684701492537314, "grad_norm": 1.0355616161327785, "learning_rate": 6.60998106238725e-06, "loss": 0.1087, "step": 2934 }, { "epoch": 1.3694029850746268, "grad_norm": 0.9729656732764957, "learning_rate": 6.604841423629377e-06, "loss": 0.1, "step": 2936 }, { "epoch": 1.3703358208955223, "grad_norm": 1.0893405931843285, "learning_rate": 6.599699893683191e-06, "loss": 0.1069, "step": 2938 }, { "epoch": 1.3712686567164178, "grad_norm": 0.936558636879193, "learning_rate": 6.594556478607613e-06, "loss": 0.0841, "step": 2940 }, { "epoch": 1.3722014925373134, "grad_norm": 1.2615085771691428, "learning_rate": 6.589411184463778e-06, "loss": 0.1139, "step": 2942 }, { "epoch": 1.373134328358209, "grad_norm": 1.1737956310426265, "learning_rate": 6.5842640173150455e-06, "loss": 0.1112, "step": 2944 }, { "epoch": 1.3740671641791045, "grad_norm": 1.171751150916603, "learning_rate": 6.5791149832269685e-06, "loss": 0.1102, "step": 2946 }, { "epoch": 1.375, "grad_norm": 0.9175008113335006, "learning_rate": 6.57396408826731e-06, "loss": 0.0972, "step": 2948 }, { "epoch": 1.3759328358208955, "grad_norm": 1.129787325635826, "learning_rate": 6.568811338506026e-06, "loss": 0.1045, "step": 2950 }, { "epoch": 1.376865671641791, "grad_norm": 1.0242504438143376, "learning_rate": 6.5636567400152505e-06, "loss": 0.086, "step": 2952 }, { "epoch": 1.3777985074626866, "grad_norm": 1.1018081705158753, "learning_rate": 6.558500298869305e-06, "loss": 0.1069, "step": 2954 }, { "epoch": 1.3787313432835822, "grad_norm": 1.0342368508145685, "learning_rate": 6.553342021144676e-06, "loss": 0.1081, "step": 2956 }, { "epoch": 1.3796641791044777, "grad_norm": 1.2000108789191752, "learning_rate": 6.548181912920018e-06, "loss": 0.1029, "step": 2958 }, { "epoch": 1.3805970149253732, "grad_norm": 1.0503527069580005, "learning_rate": 6.543019980276142e-06, "loss": 0.1036, "step": 2960 }, { "epoch": 1.3815298507462686, "grad_norm": 1.276892042282831, "learning_rate": 6.537856229296007e-06, "loss": 0.1193, "step": 2962 }, { "epoch": 1.3824626865671643, "grad_norm": 1.1400080476797572, "learning_rate": 6.5326906660647175e-06, "loss": 0.1012, "step": 2964 }, { "epoch": 1.3833955223880596, "grad_norm": 1.0266181415924787, "learning_rate": 6.5275232966695105e-06, "loss": 0.0935, "step": 2966 }, { "epoch": 1.3843283582089552, "grad_norm": 1.0976238328333847, "learning_rate": 6.522354127199751e-06, "loss": 0.0952, "step": 2968 }, { "epoch": 1.3852611940298507, "grad_norm": 1.180970128662032, "learning_rate": 6.517183163746934e-06, "loss": 0.0983, "step": 2970 }, { "epoch": 1.3861940298507462, "grad_norm": 1.1471694862180046, "learning_rate": 6.512010412404658e-06, "loss": 0.1144, "step": 2972 }, { "epoch": 1.3871268656716418, "grad_norm": 1.051434549088811, "learning_rate": 6.50683587926863e-06, "loss": 0.1093, "step": 2974 }, { "epoch": 1.3880597014925373, "grad_norm": 1.1114955765102228, "learning_rate": 6.5016595704366646e-06, "loss": 0.1056, "step": 2976 }, { "epoch": 1.3889925373134329, "grad_norm": 1.3090825218198192, "learning_rate": 6.496481492008657e-06, "loss": 0.1112, "step": 2978 }, { "epoch": 1.3899253731343284, "grad_norm": 1.1156719571047116, "learning_rate": 6.4913016500866e-06, "loss": 0.1027, "step": 2980 }, { "epoch": 1.390858208955224, "grad_norm": 1.0253311597503183, "learning_rate": 6.486120050774556e-06, "loss": 0.0968, "step": 2982 }, { "epoch": 1.3917910447761195, "grad_norm": 0.9091733131896456, "learning_rate": 6.480936700178659e-06, "loss": 0.0809, "step": 2984 }, { "epoch": 1.392723880597015, "grad_norm": 1.1235099566691318, "learning_rate": 6.475751604407114e-06, "loss": 0.1138, "step": 2986 }, { "epoch": 1.3936567164179103, "grad_norm": 1.0815530777150875, "learning_rate": 6.470564769570173e-06, "loss": 0.1067, "step": 2988 }, { "epoch": 1.394589552238806, "grad_norm": 1.1033766441590853, "learning_rate": 6.465376201780142e-06, "loss": 0.0999, "step": 2990 }, { "epoch": 1.3955223880597014, "grad_norm": 1.0591842863522525, "learning_rate": 6.460185907151372e-06, "loss": 0.1021, "step": 2992 }, { "epoch": 1.396455223880597, "grad_norm": 1.1913988809406653, "learning_rate": 6.454993891800242e-06, "loss": 0.1178, "step": 2994 }, { "epoch": 1.3973880597014925, "grad_norm": 1.0816031459872881, "learning_rate": 6.449800161845167e-06, "loss": 0.1048, "step": 2996 }, { "epoch": 1.398320895522388, "grad_norm": 1.1869785108921906, "learning_rate": 6.444604723406574e-06, "loss": 0.1168, "step": 2998 }, { "epoch": 1.3992537313432836, "grad_norm": 1.091045630125453, "learning_rate": 6.439407582606907e-06, "loss": 0.1022, "step": 3000 }, { "epoch": 1.3992537313432836, "eval_loss": 0.1596611738204956, "eval_runtime": 321.8346, "eval_samples_per_second": 47.372, "eval_steps_per_second": 5.922, "step": 3000 }, { "epoch": 1.400186567164179, "grad_norm": 1.167629978025765, "learning_rate": 6.4342087455706215e-06, "loss": 0.1102, "step": 3002 }, { "epoch": 1.4011194029850746, "grad_norm": 1.0434504127192574, "learning_rate": 6.429008218424161e-06, "loss": 0.1037, "step": 3004 }, { "epoch": 1.4020522388059702, "grad_norm": 0.9665951864069205, "learning_rate": 6.423806007295972e-06, "loss": 0.0992, "step": 3006 }, { "epoch": 1.4029850746268657, "grad_norm": 1.1641206738320007, "learning_rate": 6.418602118316476e-06, "loss": 0.108, "step": 3008 }, { "epoch": 1.4039179104477613, "grad_norm": 1.140916583792148, "learning_rate": 6.413396557618078e-06, "loss": 0.1059, "step": 3010 }, { "epoch": 1.4048507462686568, "grad_norm": 1.2090330859544889, "learning_rate": 6.408189331335151e-06, "loss": 0.1073, "step": 3012 }, { "epoch": 1.4057835820895521, "grad_norm": 0.9926869496974697, "learning_rate": 6.402980445604028e-06, "loss": 0.0986, "step": 3014 }, { "epoch": 1.4067164179104479, "grad_norm": 1.2452033859861542, "learning_rate": 6.397769906563003e-06, "loss": 0.1122, "step": 3016 }, { "epoch": 1.4076492537313432, "grad_norm": 1.0659859150777007, "learning_rate": 6.3925577203523136e-06, "loss": 0.0957, "step": 3018 }, { "epoch": 1.4085820895522387, "grad_norm": 1.156239094073771, "learning_rate": 6.38734389311414e-06, "loss": 0.0949, "step": 3020 }, { "epoch": 1.4095149253731343, "grad_norm": 1.3945291971907214, "learning_rate": 6.382128430992599e-06, "loss": 0.1089, "step": 3022 }, { "epoch": 1.4104477611940298, "grad_norm": 0.924967408265427, "learning_rate": 6.376911340133729e-06, "loss": 0.1007, "step": 3024 }, { "epoch": 1.4113805970149254, "grad_norm": 1.1830645716989097, "learning_rate": 6.371692626685491e-06, "loss": 0.1001, "step": 3026 }, { "epoch": 1.412313432835821, "grad_norm": 1.1160910803912631, "learning_rate": 6.366472296797758e-06, "loss": 0.0969, "step": 3028 }, { "epoch": 1.4132462686567164, "grad_norm": 1.0376037844861716, "learning_rate": 6.361250356622306e-06, "loss": 0.0944, "step": 3030 }, { "epoch": 1.414179104477612, "grad_norm": 1.2014647340273814, "learning_rate": 6.3560268123128085e-06, "loss": 0.1082, "step": 3032 }, { "epoch": 1.4151119402985075, "grad_norm": 1.0873051273313719, "learning_rate": 6.350801670024836e-06, "loss": 0.1038, "step": 3034 }, { "epoch": 1.416044776119403, "grad_norm": 1.0862240190195362, "learning_rate": 6.34557493591583e-06, "loss": 0.1103, "step": 3036 }, { "epoch": 1.4169776119402986, "grad_norm": 1.0072115852608963, "learning_rate": 6.34034661614512e-06, "loss": 0.0973, "step": 3038 }, { "epoch": 1.417910447761194, "grad_norm": 1.2026295177525366, "learning_rate": 6.335116716873895e-06, "loss": 0.0974, "step": 3040 }, { "epoch": 1.4188432835820897, "grad_norm": 1.1104122744891574, "learning_rate": 6.3298852442652115e-06, "loss": 0.1052, "step": 3042 }, { "epoch": 1.419776119402985, "grad_norm": 1.1996186070036516, "learning_rate": 6.3246522044839764e-06, "loss": 0.1118, "step": 3044 }, { "epoch": 1.4207089552238805, "grad_norm": 1.0113422644591767, "learning_rate": 6.319417603696944e-06, "loss": 0.0983, "step": 3046 }, { "epoch": 1.421641791044776, "grad_norm": 1.0623284572580665, "learning_rate": 6.314181448072711e-06, "loss": 0.1056, "step": 3048 }, { "epoch": 1.4225746268656716, "grad_norm": 1.1536987469950621, "learning_rate": 6.308943743781703e-06, "loss": 0.1068, "step": 3050 }, { "epoch": 1.4235074626865671, "grad_norm": 1.0826482438350977, "learning_rate": 6.303704496996168e-06, "loss": 0.0978, "step": 3052 }, { "epoch": 1.4244402985074627, "grad_norm": 1.1627640702871243, "learning_rate": 6.2984637138901815e-06, "loss": 0.1013, "step": 3054 }, { "epoch": 1.4253731343283582, "grad_norm": 1.021338061669078, "learning_rate": 6.29322140063962e-06, "loss": 0.0941, "step": 3056 }, { "epoch": 1.4263059701492538, "grad_norm": 1.117057545089706, "learning_rate": 6.287977563422165e-06, "loss": 0.1098, "step": 3058 }, { "epoch": 1.4272388059701493, "grad_norm": 1.0784406129417174, "learning_rate": 6.282732208417298e-06, "loss": 0.0954, "step": 3060 }, { "epoch": 1.4281716417910448, "grad_norm": 1.2570229258811754, "learning_rate": 6.277485341806286e-06, "loss": 0.1186, "step": 3062 }, { "epoch": 1.4291044776119404, "grad_norm": 0.8979995868785656, "learning_rate": 6.272236969772178e-06, "loss": 0.0858, "step": 3064 }, { "epoch": 1.4300373134328357, "grad_norm": 1.0380953904580874, "learning_rate": 6.266987098499795e-06, "loss": 0.0969, "step": 3066 }, { "epoch": 1.4309701492537314, "grad_norm": 1.0570972769273697, "learning_rate": 6.261735734175729e-06, "loss": 0.0968, "step": 3068 }, { "epoch": 1.4319029850746268, "grad_norm": 1.2057515763740294, "learning_rate": 6.256482882988326e-06, "loss": 0.1078, "step": 3070 }, { "epoch": 1.4328358208955223, "grad_norm": 1.0861212408518004, "learning_rate": 6.2512285511276905e-06, "loss": 0.1149, "step": 3072 }, { "epoch": 1.4337686567164178, "grad_norm": 1.0315469642547823, "learning_rate": 6.2459727447856665e-06, "loss": 0.1026, "step": 3074 }, { "epoch": 1.4347014925373134, "grad_norm": 0.9997333106764426, "learning_rate": 6.2407154701558395e-06, "loss": 0.0904, "step": 3076 }, { "epoch": 1.435634328358209, "grad_norm": 1.0774685188886144, "learning_rate": 6.235456733433519e-06, "loss": 0.1055, "step": 3078 }, { "epoch": 1.4365671641791045, "grad_norm": 1.1451288509748911, "learning_rate": 6.230196540815748e-06, "loss": 0.0992, "step": 3080 }, { "epoch": 1.4375, "grad_norm": 1.111035491471034, "learning_rate": 6.224934898501274e-06, "loss": 0.0914, "step": 3082 }, { "epoch": 1.4384328358208955, "grad_norm": 1.0707168760494123, "learning_rate": 6.219671812690559e-06, "loss": 0.1018, "step": 3084 }, { "epoch": 1.439365671641791, "grad_norm": 1.1819138824851976, "learning_rate": 6.214407289585766e-06, "loss": 0.1055, "step": 3086 }, { "epoch": 1.4402985074626866, "grad_norm": 1.1181879319957275, "learning_rate": 6.209141335390752e-06, "loss": 0.0987, "step": 3088 }, { "epoch": 1.4412313432835822, "grad_norm": 1.1834721372704728, "learning_rate": 6.203873956311055e-06, "loss": 0.1104, "step": 3090 }, { "epoch": 1.4421641791044777, "grad_norm": 1.2329644084025368, "learning_rate": 6.1986051585539e-06, "loss": 0.1168, "step": 3092 }, { "epoch": 1.4430970149253732, "grad_norm": 1.0996131554547866, "learning_rate": 6.193334948328178e-06, "loss": 0.1051, "step": 3094 }, { "epoch": 1.4440298507462686, "grad_norm": 1.0958634987738785, "learning_rate": 6.188063331844447e-06, "loss": 0.1038, "step": 3096 }, { "epoch": 1.4449626865671643, "grad_norm": 1.0963858523235155, "learning_rate": 6.182790315314922e-06, "loss": 0.1118, "step": 3098 }, { "epoch": 1.4458955223880596, "grad_norm": 1.0722540459615448, "learning_rate": 6.1775159049534675e-06, "loss": 0.1021, "step": 3100 }, { "epoch": 1.4468283582089552, "grad_norm": 1.1090454875456877, "learning_rate": 6.172240106975591e-06, "loss": 0.0976, "step": 3102 }, { "epoch": 1.4477611940298507, "grad_norm": 1.2055469445945555, "learning_rate": 6.1669629275984325e-06, "loss": 0.1064, "step": 3104 }, { "epoch": 1.4486940298507462, "grad_norm": 0.9929386352393755, "learning_rate": 6.161684373040765e-06, "loss": 0.1002, "step": 3106 }, { "epoch": 1.4496268656716418, "grad_norm": 1.2056230123395077, "learning_rate": 6.156404449522978e-06, "loss": 0.0982, "step": 3108 }, { "epoch": 1.4505597014925373, "grad_norm": 1.0424913535896907, "learning_rate": 6.151123163267074e-06, "loss": 0.0981, "step": 3110 }, { "epoch": 1.4514925373134329, "grad_norm": 1.084678518523754, "learning_rate": 6.145840520496666e-06, "loss": 0.102, "step": 3112 }, { "epoch": 1.4524253731343284, "grad_norm": 1.1979722607056424, "learning_rate": 6.140556527436962e-06, "loss": 0.1125, "step": 3114 }, { "epoch": 1.453358208955224, "grad_norm": 1.0233861227717225, "learning_rate": 6.135271190314758e-06, "loss": 0.0984, "step": 3116 }, { "epoch": 1.4542910447761195, "grad_norm": 1.0598274718309244, "learning_rate": 6.12998451535844e-06, "loss": 0.1053, "step": 3118 }, { "epoch": 1.455223880597015, "grad_norm": 1.2751975157364583, "learning_rate": 6.124696508797968e-06, "loss": 0.1208, "step": 3120 }, { "epoch": 1.4561567164179103, "grad_norm": 1.1140232283756788, "learning_rate": 6.1194071768648715e-06, "loss": 0.1092, "step": 3122 }, { "epoch": 1.457089552238806, "grad_norm": 1.1303080881263854, "learning_rate": 6.114116525792239e-06, "loss": 0.102, "step": 3124 }, { "epoch": 1.4580223880597014, "grad_norm": 1.0568236281312444, "learning_rate": 6.10882456181472e-06, "loss": 0.0853, "step": 3126 }, { "epoch": 1.458955223880597, "grad_norm": 1.1699321756566419, "learning_rate": 6.1035312911685056e-06, "loss": 0.1021, "step": 3128 }, { "epoch": 1.4598880597014925, "grad_norm": 1.050421298590088, "learning_rate": 6.098236720091326e-06, "loss": 0.1086, "step": 3130 }, { "epoch": 1.460820895522388, "grad_norm": 1.0807380921286178, "learning_rate": 6.09294085482245e-06, "loss": 0.0992, "step": 3132 }, { "epoch": 1.4617537313432836, "grad_norm": 1.0076965320461435, "learning_rate": 6.087643701602666e-06, "loss": 0.1064, "step": 3134 }, { "epoch": 1.462686567164179, "grad_norm": 1.1523421688557507, "learning_rate": 6.082345266674279e-06, "loss": 0.1021, "step": 3136 }, { "epoch": 1.4636194029850746, "grad_norm": 1.1650279713737783, "learning_rate": 6.0770455562811125e-06, "loss": 0.0953, "step": 3138 }, { "epoch": 1.4645522388059702, "grad_norm": 1.0788801296383042, "learning_rate": 6.071744576668486e-06, "loss": 0.0983, "step": 3140 }, { "epoch": 1.4654850746268657, "grad_norm": 1.32353220879985, "learning_rate": 6.066442334083214e-06, "loss": 0.1146, "step": 3142 }, { "epoch": 1.4664179104477613, "grad_norm": 1.1069120841967914, "learning_rate": 6.061138834773604e-06, "loss": 0.1064, "step": 3144 }, { "epoch": 1.4673507462686568, "grad_norm": 1.181184951475493, "learning_rate": 6.055834084989443e-06, "loss": 0.1095, "step": 3146 }, { "epoch": 1.4682835820895521, "grad_norm": 1.084969053350031, "learning_rate": 6.050528090981989e-06, "loss": 0.0958, "step": 3148 }, { "epoch": 1.4692164179104479, "grad_norm": 1.0351168200214633, "learning_rate": 6.045220859003969e-06, "loss": 0.1047, "step": 3150 }, { "epoch": 1.4701492537313432, "grad_norm": 1.0579711036430157, "learning_rate": 6.039912395309568e-06, "loss": 0.0998, "step": 3152 }, { "epoch": 1.4710820895522387, "grad_norm": 1.0861897419858866, "learning_rate": 6.034602706154422e-06, "loss": 0.1013, "step": 3154 }, { "epoch": 1.4720149253731343, "grad_norm": 1.2551293733340556, "learning_rate": 6.029291797795614e-06, "loss": 0.1087, "step": 3156 }, { "epoch": 1.4729477611940298, "grad_norm": 1.1006371029081106, "learning_rate": 6.023979676491656e-06, "loss": 0.1118, "step": 3158 }, { "epoch": 1.4738805970149254, "grad_norm": 1.0751643558376185, "learning_rate": 6.0186663485025e-06, "loss": 0.104, "step": 3160 }, { "epoch": 1.474813432835821, "grad_norm": 0.9540738113065265, "learning_rate": 6.01335182008951e-06, "loss": 0.1002, "step": 3162 }, { "epoch": 1.4757462686567164, "grad_norm": 1.0918722694265637, "learning_rate": 6.008036097515475e-06, "loss": 0.0991, "step": 3164 }, { "epoch": 1.476679104477612, "grad_norm": 1.044335258752036, "learning_rate": 6.00271918704458e-06, "loss": 0.1067, "step": 3166 }, { "epoch": 1.4776119402985075, "grad_norm": 1.0930920800274009, "learning_rate": 5.997401094942417e-06, "loss": 0.1024, "step": 3168 }, { "epoch": 1.478544776119403, "grad_norm": 1.2264099772494796, "learning_rate": 5.992081827475971e-06, "loss": 0.0995, "step": 3170 }, { "epoch": 1.4794776119402986, "grad_norm": 0.998036190610848, "learning_rate": 5.986761390913609e-06, "loss": 0.1044, "step": 3172 }, { "epoch": 1.480410447761194, "grad_norm": 1.0852227182318062, "learning_rate": 5.981439791525073e-06, "loss": 0.1028, "step": 3174 }, { "epoch": 1.4813432835820897, "grad_norm": 1.2220006546674524, "learning_rate": 5.976117035581483e-06, "loss": 0.1089, "step": 3176 }, { "epoch": 1.482276119402985, "grad_norm": 1.0873966123567707, "learning_rate": 5.970793129355318e-06, "loss": 0.1259, "step": 3178 }, { "epoch": 1.4832089552238805, "grad_norm": 1.1664256253594125, "learning_rate": 5.96546807912041e-06, "loss": 0.1028, "step": 3180 }, { "epoch": 1.484141791044776, "grad_norm": 1.2629487749264883, "learning_rate": 5.960141891151943e-06, "loss": 0.1131, "step": 3182 }, { "epoch": 1.4850746268656716, "grad_norm": 1.2184221517476623, "learning_rate": 5.954814571726438e-06, "loss": 0.1063, "step": 3184 }, { "epoch": 1.4860074626865671, "grad_norm": 1.0362946938412179, "learning_rate": 5.949486127121754e-06, "loss": 0.1007, "step": 3186 }, { "epoch": 1.4869402985074627, "grad_norm": 1.1232622115653055, "learning_rate": 5.944156563617073e-06, "loss": 0.1141, "step": 3188 }, { "epoch": 1.4878731343283582, "grad_norm": 1.0805062286617, "learning_rate": 5.938825887492895e-06, "loss": 0.1025, "step": 3190 }, { "epoch": 1.4888059701492538, "grad_norm": 1.011890034243429, "learning_rate": 5.933494105031032e-06, "loss": 0.1017, "step": 3192 }, { "epoch": 1.4897388059701493, "grad_norm": 1.0717306271050195, "learning_rate": 5.928161222514601e-06, "loss": 0.1019, "step": 3194 }, { "epoch": 1.4906716417910448, "grad_norm": 1.0027699542798725, "learning_rate": 5.9228272462280156e-06, "loss": 0.0884, "step": 3196 }, { "epoch": 1.4916044776119404, "grad_norm": 1.062085236264922, "learning_rate": 5.917492182456975e-06, "loss": 0.1003, "step": 3198 }, { "epoch": 1.4925373134328357, "grad_norm": 1.1869465393510803, "learning_rate": 5.91215603748846e-06, "loss": 0.0983, "step": 3200 }, { "epoch": 1.4934701492537314, "grad_norm": 1.3363242185288464, "learning_rate": 5.906818817610731e-06, "loss": 0.1119, "step": 3202 }, { "epoch": 1.4944029850746268, "grad_norm": 1.0686221045699777, "learning_rate": 5.90148052911331e-06, "loss": 0.0951, "step": 3204 }, { "epoch": 1.4953358208955223, "grad_norm": 1.1884349749059908, "learning_rate": 5.896141178286979e-06, "loss": 0.1014, "step": 3206 }, { "epoch": 1.4962686567164178, "grad_norm": 1.1669360727986386, "learning_rate": 5.890800771423775e-06, "loss": 0.1064, "step": 3208 }, { "epoch": 1.4972014925373134, "grad_norm": 1.2576434549103657, "learning_rate": 5.8854593148169745e-06, "loss": 0.1143, "step": 3210 }, { "epoch": 1.498134328358209, "grad_norm": 1.2180390148379803, "learning_rate": 5.8801168147610956e-06, "loss": 0.1006, "step": 3212 }, { "epoch": 1.4990671641791045, "grad_norm": 1.1235632170576928, "learning_rate": 5.874773277551883e-06, "loss": 0.1015, "step": 3214 }, { "epoch": 1.5, "grad_norm": 1.0484635859530513, "learning_rate": 5.869428709486304e-06, "loss": 0.0988, "step": 3216 }, { "epoch": 1.5009328358208955, "grad_norm": 1.137790156251054, "learning_rate": 5.864083116862544e-06, "loss": 0.1062, "step": 3218 }, { "epoch": 1.501865671641791, "grad_norm": 1.2017926673666146, "learning_rate": 5.858736505979989e-06, "loss": 0.1123, "step": 3220 }, { "epoch": 1.5027985074626866, "grad_norm": 1.0922020832754598, "learning_rate": 5.853388883139235e-06, "loss": 0.0881, "step": 3222 }, { "epoch": 1.5037313432835822, "grad_norm": 1.1279459607000017, "learning_rate": 5.84804025464206e-06, "loss": 0.0969, "step": 3224 }, { "epoch": 1.5046641791044775, "grad_norm": 1.2073940441348827, "learning_rate": 5.842690626791433e-06, "loss": 0.0992, "step": 3226 }, { "epoch": 1.5055970149253732, "grad_norm": 1.1226079923533225, "learning_rate": 5.837340005891499e-06, "loss": 0.0984, "step": 3228 }, { "epoch": 1.5065298507462686, "grad_norm": 1.11157292850359, "learning_rate": 5.831988398247576e-06, "loss": 0.1052, "step": 3230 }, { "epoch": 1.5074626865671643, "grad_norm": 1.1794443229952236, "learning_rate": 5.8266358101661365e-06, "loss": 0.1152, "step": 3232 }, { "epoch": 1.5083955223880596, "grad_norm": 1.139563854183085, "learning_rate": 5.8212822479548214e-06, "loss": 0.0998, "step": 3234 }, { "epoch": 1.5093283582089554, "grad_norm": 1.0415626306406014, "learning_rate": 5.815927717922408e-06, "loss": 0.1101, "step": 3236 }, { "epoch": 1.5102611940298507, "grad_norm": 1.0083882017568637, "learning_rate": 5.810572226378821e-06, "loss": 0.1106, "step": 3238 }, { "epoch": 1.5111940298507462, "grad_norm": 1.1104204064017498, "learning_rate": 5.8052157796351134e-06, "loss": 0.1026, "step": 3240 }, { "epoch": 1.5121268656716418, "grad_norm": 1.1481507362693006, "learning_rate": 5.799858384003469e-06, "loss": 0.1115, "step": 3242 }, { "epoch": 1.5130597014925373, "grad_norm": 0.9662005312457365, "learning_rate": 5.7945000457971844e-06, "loss": 0.0957, "step": 3244 }, { "epoch": 1.5139925373134329, "grad_norm": 1.1052757084273848, "learning_rate": 5.789140771330669e-06, "loss": 0.1011, "step": 3246 }, { "epoch": 1.5149253731343284, "grad_norm": 1.1491614020411802, "learning_rate": 5.7837805669194395e-06, "loss": 0.1091, "step": 3248 }, { "epoch": 1.515858208955224, "grad_norm": 1.117043878180387, "learning_rate": 5.778419438880103e-06, "loss": 0.0983, "step": 3250 }, { "epoch": 1.5167910447761193, "grad_norm": 1.0781656608943595, "learning_rate": 5.773057393530355e-06, "loss": 0.1053, "step": 3252 }, { "epoch": 1.517723880597015, "grad_norm": 1.1065480630337796, "learning_rate": 5.767694437188976e-06, "loss": 0.1059, "step": 3254 }, { "epoch": 1.5186567164179103, "grad_norm": 1.0864972792796228, "learning_rate": 5.762330576175821e-06, "loss": 0.1017, "step": 3256 }, { "epoch": 1.519589552238806, "grad_norm": 1.268340624743877, "learning_rate": 5.756965816811801e-06, "loss": 0.1055, "step": 3258 }, { "epoch": 1.5205223880597014, "grad_norm": 1.1517888164790135, "learning_rate": 5.7516001654189e-06, "loss": 0.0993, "step": 3260 }, { "epoch": 1.5214552238805972, "grad_norm": 1.1637222664241322, "learning_rate": 5.746233628320142e-06, "loss": 0.1165, "step": 3262 }, { "epoch": 1.5223880597014925, "grad_norm": 1.0074675729873184, "learning_rate": 5.7408662118395984e-06, "loss": 0.092, "step": 3264 }, { "epoch": 1.523320895522388, "grad_norm": 1.1399869102667521, "learning_rate": 5.73549792230238e-06, "loss": 0.1033, "step": 3266 }, { "epoch": 1.5242537313432836, "grad_norm": 1.056324688109504, "learning_rate": 5.730128766034621e-06, "loss": 0.0918, "step": 3268 }, { "epoch": 1.525186567164179, "grad_norm": 1.043679352211642, "learning_rate": 5.7247587493634805e-06, "loss": 0.0954, "step": 3270 }, { "epoch": 1.5261194029850746, "grad_norm": 1.0367796118583745, "learning_rate": 5.7193878786171305e-06, "loss": 0.0994, "step": 3272 }, { "epoch": 1.5270522388059702, "grad_norm": 1.1565588467797716, "learning_rate": 5.714016160124749e-06, "loss": 0.104, "step": 3274 }, { "epoch": 1.5279850746268657, "grad_norm": 0.9296793416169334, "learning_rate": 5.7086436002165165e-06, "loss": 0.0897, "step": 3276 }, { "epoch": 1.528917910447761, "grad_norm": 1.1012574949131864, "learning_rate": 5.7032702052235975e-06, "loss": 0.0971, "step": 3278 }, { "epoch": 1.5298507462686568, "grad_norm": 1.2098175910072202, "learning_rate": 5.69789598147815e-06, "loss": 0.0976, "step": 3280 }, { "epoch": 1.5307835820895521, "grad_norm": 1.1484092564782926, "learning_rate": 5.692520935313302e-06, "loss": 0.0984, "step": 3282 }, { "epoch": 1.5317164179104479, "grad_norm": 1.0200699887269549, "learning_rate": 5.687145073063149e-06, "loss": 0.0981, "step": 3284 }, { "epoch": 1.5326492537313432, "grad_norm": 1.0917603599612902, "learning_rate": 5.681768401062757e-06, "loss": 0.1004, "step": 3286 }, { "epoch": 1.533582089552239, "grad_norm": 1.1681225973732419, "learning_rate": 5.676390925648139e-06, "loss": 0.1096, "step": 3288 }, { "epoch": 1.5345149253731343, "grad_norm": 1.092452207603256, "learning_rate": 5.671012653156255e-06, "loss": 0.0984, "step": 3290 }, { "epoch": 1.5354477611940298, "grad_norm": 1.07350850768635, "learning_rate": 5.6656335899250085e-06, "loss": 0.1049, "step": 3292 }, { "epoch": 1.5363805970149254, "grad_norm": 1.0525563606331456, "learning_rate": 5.66025374229323e-06, "loss": 0.1053, "step": 3294 }, { "epoch": 1.537313432835821, "grad_norm": 1.0615799064262887, "learning_rate": 5.654873116600679e-06, "loss": 0.1093, "step": 3296 }, { "epoch": 1.5382462686567164, "grad_norm": 1.0535580579374406, "learning_rate": 5.649491719188029e-06, "loss": 0.1033, "step": 3298 }, { "epoch": 1.539179104477612, "grad_norm": 1.0074381294048076, "learning_rate": 5.644109556396861e-06, "loss": 0.0981, "step": 3300 }, { "epoch": 1.5401119402985075, "grad_norm": 0.9747654247984665, "learning_rate": 5.638726634569664e-06, "loss": 0.1035, "step": 3302 }, { "epoch": 1.5410447761194028, "grad_norm": 1.1555841433941612, "learning_rate": 5.633342960049816e-06, "loss": 0.1022, "step": 3304 }, { "epoch": 1.5419776119402986, "grad_norm": 1.1248105887886655, "learning_rate": 5.627958539181584e-06, "loss": 0.1134, "step": 3306 }, { "epoch": 1.542910447761194, "grad_norm": 1.149391834527609, "learning_rate": 5.6225733783101165e-06, "loss": 0.1004, "step": 3308 }, { "epoch": 1.5438432835820897, "grad_norm": 1.1425752238571576, "learning_rate": 5.6171874837814275e-06, "loss": 0.1092, "step": 3310 }, { "epoch": 1.544776119402985, "grad_norm": 1.0480678051107188, "learning_rate": 5.611800861942404e-06, "loss": 0.0949, "step": 3312 }, { "epoch": 1.5457089552238807, "grad_norm": 0.9883015763089346, "learning_rate": 5.606413519140784e-06, "loss": 0.1021, "step": 3314 }, { "epoch": 1.546641791044776, "grad_norm": 1.1055000788459297, "learning_rate": 5.6010254617251595e-06, "loss": 0.1009, "step": 3316 }, { "epoch": 1.5475746268656716, "grad_norm": 1.2151562055781369, "learning_rate": 5.595636696044959e-06, "loss": 0.1095, "step": 3318 }, { "epoch": 1.5485074626865671, "grad_norm": 1.141262195565456, "learning_rate": 5.590247228450451e-06, "loss": 0.1051, "step": 3320 }, { "epoch": 1.5494402985074627, "grad_norm": 1.1707184818521053, "learning_rate": 5.5848570652927304e-06, "loss": 0.1038, "step": 3322 }, { "epoch": 1.5503731343283582, "grad_norm": 1.1289310804178943, "learning_rate": 5.579466212923708e-06, "loss": 0.1074, "step": 3324 }, { "epoch": 1.5513059701492538, "grad_norm": 1.1064526845112488, "learning_rate": 5.574074677696109e-06, "loss": 0.1103, "step": 3326 }, { "epoch": 1.5522388059701493, "grad_norm": 1.1525760613403542, "learning_rate": 5.568682465963466e-06, "loss": 0.1085, "step": 3328 }, { "epoch": 1.5531716417910446, "grad_norm": 1.022601642611851, "learning_rate": 5.563289584080105e-06, "loss": 0.1001, "step": 3330 }, { "epoch": 1.5541044776119404, "grad_norm": 1.0444993190614293, "learning_rate": 5.557896038401143e-06, "loss": 0.1108, "step": 3332 }, { "epoch": 1.5550373134328357, "grad_norm": 0.9707826476325895, "learning_rate": 5.55250183528248e-06, "loss": 0.1008, "step": 3334 }, { "epoch": 1.5559701492537314, "grad_norm": 1.0644857641645487, "learning_rate": 5.547106981080789e-06, "loss": 0.1106, "step": 3336 }, { "epoch": 1.5569029850746268, "grad_norm": 1.192244889603234, "learning_rate": 5.541711482153513e-06, "loss": 0.1007, "step": 3338 }, { "epoch": 1.5578358208955225, "grad_norm": 1.043812796454789, "learning_rate": 5.53631534485885e-06, "loss": 0.0974, "step": 3340 }, { "epoch": 1.5587686567164178, "grad_norm": 1.045026903729856, "learning_rate": 5.530918575555757e-06, "loss": 0.1036, "step": 3342 }, { "epoch": 1.5597014925373134, "grad_norm": 1.1390461471821403, "learning_rate": 5.525521180603931e-06, "loss": 0.1062, "step": 3344 }, { "epoch": 1.560634328358209, "grad_norm": 1.0997737486755967, "learning_rate": 5.520123166363807e-06, "loss": 0.1084, "step": 3346 }, { "epoch": 1.5615671641791045, "grad_norm": 1.2744198144447323, "learning_rate": 5.514724539196549e-06, "loss": 0.1084, "step": 3348 }, { "epoch": 1.5625, "grad_norm": 1.0003129851012083, "learning_rate": 5.5093253054640476e-06, "loss": 0.0908, "step": 3350 }, { "epoch": 1.5634328358208955, "grad_norm": 1.0235800290570736, "learning_rate": 5.503925471528901e-06, "loss": 0.0977, "step": 3352 }, { "epoch": 1.564365671641791, "grad_norm": 1.049002776986393, "learning_rate": 5.498525043754422e-06, "loss": 0.0978, "step": 3354 }, { "epoch": 1.5652985074626866, "grad_norm": 0.9128697744784077, "learning_rate": 5.493124028504619e-06, "loss": 0.0881, "step": 3356 }, { "epoch": 1.5662313432835822, "grad_norm": 1.167538735990302, "learning_rate": 5.487722432144194e-06, "loss": 0.114, "step": 3358 }, { "epoch": 1.5671641791044775, "grad_norm": 0.9955146544597703, "learning_rate": 5.482320261038533e-06, "loss": 0.1005, "step": 3360 }, { "epoch": 1.5680970149253732, "grad_norm": 1.1289787193151126, "learning_rate": 5.4769175215537e-06, "loss": 0.1119, "step": 3362 }, { "epoch": 1.5690298507462686, "grad_norm": 1.0188726391534335, "learning_rate": 5.471514220056427e-06, "loss": 0.0982, "step": 3364 }, { "epoch": 1.5699626865671643, "grad_norm": 1.005685925577855, "learning_rate": 5.466110362914113e-06, "loss": 0.087, "step": 3366 }, { "epoch": 1.5708955223880596, "grad_norm": 1.1018413806786018, "learning_rate": 5.460705956494807e-06, "loss": 0.1007, "step": 3368 }, { "epoch": 1.5718283582089554, "grad_norm": 0.924044230062411, "learning_rate": 5.455301007167206e-06, "loss": 0.0842, "step": 3370 }, { "epoch": 1.5727611940298507, "grad_norm": 1.1158962490512863, "learning_rate": 5.4498955213006495e-06, "loss": 0.1061, "step": 3372 }, { "epoch": 1.5736940298507462, "grad_norm": 1.1702039603986962, "learning_rate": 5.444489505265107e-06, "loss": 0.102, "step": 3374 }, { "epoch": 1.5746268656716418, "grad_norm": 1.1595261199874245, "learning_rate": 5.439082965431172e-06, "loss": 0.0992, "step": 3376 }, { "epoch": 1.5755597014925373, "grad_norm": 1.0477115551122573, "learning_rate": 5.433675908170057e-06, "loss": 0.0916, "step": 3378 }, { "epoch": 1.5764925373134329, "grad_norm": 1.2276392020633364, "learning_rate": 5.428268339853585e-06, "loss": 0.1141, "step": 3380 }, { "epoch": 1.5774253731343284, "grad_norm": 1.15294443230766, "learning_rate": 5.422860266854178e-06, "loss": 0.1128, "step": 3382 }, { "epoch": 1.578358208955224, "grad_norm": 1.049236986120805, "learning_rate": 5.4174516955448565e-06, "loss": 0.0906, "step": 3384 }, { "epoch": 1.5792910447761193, "grad_norm": 1.0291531356928958, "learning_rate": 5.412042632299227e-06, "loss": 0.1003, "step": 3386 }, { "epoch": 1.580223880597015, "grad_norm": 1.0130861660463677, "learning_rate": 5.406633083491473e-06, "loss": 0.0991, "step": 3388 }, { "epoch": 1.5811567164179103, "grad_norm": 1.1448712948800965, "learning_rate": 5.401223055496351e-06, "loss": 0.1034, "step": 3390 }, { "epoch": 1.582089552238806, "grad_norm": 1.0408097479050962, "learning_rate": 5.395812554689188e-06, "loss": 0.0892, "step": 3392 }, { "epoch": 1.5830223880597014, "grad_norm": 0.956142775119575, "learning_rate": 5.390401587445861e-06, "loss": 0.0855, "step": 3394 }, { "epoch": 1.5839552238805972, "grad_norm": 1.1077760930238951, "learning_rate": 5.3849901601428004e-06, "loss": 0.1005, "step": 3396 }, { "epoch": 1.5848880597014925, "grad_norm": 1.0197330666390934, "learning_rate": 5.379578279156976e-06, "loss": 0.0964, "step": 3398 }, { "epoch": 1.585820895522388, "grad_norm": 1.258298892927742, "learning_rate": 5.374165950865897e-06, "loss": 0.1066, "step": 3400 }, { "epoch": 1.5867537313432836, "grad_norm": 1.2488739250729246, "learning_rate": 5.368753181647594e-06, "loss": 0.1193, "step": 3402 }, { "epoch": 1.587686567164179, "grad_norm": 1.168478047666357, "learning_rate": 5.363339977880619e-06, "loss": 0.0902, "step": 3404 }, { "epoch": 1.5886194029850746, "grad_norm": 1.0629366502594004, "learning_rate": 5.357926345944041e-06, "loss": 0.1089, "step": 3406 }, { "epoch": 1.5895522388059702, "grad_norm": 1.0796016027378421, "learning_rate": 5.352512292217427e-06, "loss": 0.0872, "step": 3408 }, { "epoch": 1.5904850746268657, "grad_norm": 1.1211043140083328, "learning_rate": 5.347097823080842e-06, "loss": 0.1019, "step": 3410 }, { "epoch": 1.591417910447761, "grad_norm": 1.0494066254235275, "learning_rate": 5.341682944914846e-06, "loss": 0.0857, "step": 3412 }, { "epoch": 1.5923507462686568, "grad_norm": 1.1468631793377873, "learning_rate": 5.3362676641004755e-06, "loss": 0.1036, "step": 3414 }, { "epoch": 1.5932835820895521, "grad_norm": 1.15286795423784, "learning_rate": 5.33085198701924e-06, "loss": 0.1128, "step": 3416 }, { "epoch": 1.5942164179104479, "grad_norm": 1.16743739856925, "learning_rate": 5.325435920053124e-06, "loss": 0.1097, "step": 3418 }, { "epoch": 1.5951492537313432, "grad_norm": 1.0253152956157596, "learning_rate": 5.320019469584562e-06, "loss": 0.1031, "step": 3420 }, { "epoch": 1.596082089552239, "grad_norm": 1.0452070457188003, "learning_rate": 5.314602641996448e-06, "loss": 0.1024, "step": 3422 }, { "epoch": 1.5970149253731343, "grad_norm": 0.9971837975286477, "learning_rate": 5.309185443672117e-06, "loss": 0.0997, "step": 3424 }, { "epoch": 1.5979477611940298, "grad_norm": 1.0506023823825037, "learning_rate": 5.303767880995339e-06, "loss": 0.1017, "step": 3426 }, { "epoch": 1.5988805970149254, "grad_norm": 1.0226928003736417, "learning_rate": 5.29834996035032e-06, "loss": 0.0933, "step": 3428 }, { "epoch": 1.599813432835821, "grad_norm": 1.1240088184633903, "learning_rate": 5.29293168812168e-06, "loss": 0.0872, "step": 3430 }, { "epoch": 1.6007462686567164, "grad_norm": 1.0934312298167896, "learning_rate": 5.287513070694458e-06, "loss": 0.1021, "step": 3432 }, { "epoch": 1.601679104477612, "grad_norm": 0.9864960877735491, "learning_rate": 5.282094114454097e-06, "loss": 0.0998, "step": 3434 }, { "epoch": 1.6026119402985075, "grad_norm": 1.1116746178129506, "learning_rate": 5.276674825786441e-06, "loss": 0.0856, "step": 3436 }, { "epoch": 1.6035447761194028, "grad_norm": 1.0861045520010988, "learning_rate": 5.271255211077729e-06, "loss": 0.1041, "step": 3438 }, { "epoch": 1.6044776119402986, "grad_norm": 1.0302177348773862, "learning_rate": 5.265835276714578e-06, "loss": 0.0933, "step": 3440 }, { "epoch": 1.605410447761194, "grad_norm": 1.0534096090894276, "learning_rate": 5.260415029083983e-06, "loss": 0.0909, "step": 3442 }, { "epoch": 1.6063432835820897, "grad_norm": 1.1515857861458405, "learning_rate": 5.254994474573314e-06, "loss": 0.1073, "step": 3444 }, { "epoch": 1.607276119402985, "grad_norm": 0.9893069640958156, "learning_rate": 5.249573619570294e-06, "loss": 0.1032, "step": 3446 }, { "epoch": 1.6082089552238807, "grad_norm": 1.182569131687945, "learning_rate": 5.244152470463006e-06, "loss": 0.102, "step": 3448 }, { "epoch": 1.609141791044776, "grad_norm": 1.1328479359944652, "learning_rate": 5.238731033639879e-06, "loss": 0.0936, "step": 3450 }, { "epoch": 1.6100746268656716, "grad_norm": 1.0894456674430686, "learning_rate": 5.233309315489679e-06, "loss": 0.0943, "step": 3452 }, { "epoch": 1.6110074626865671, "grad_norm": 1.1145375697778408, "learning_rate": 5.227887322401504e-06, "loss": 0.0897, "step": 3454 }, { "epoch": 1.6119402985074627, "grad_norm": 0.9720405214741787, "learning_rate": 5.222465060764778e-06, "loss": 0.0937, "step": 3456 }, { "epoch": 1.6128731343283582, "grad_norm": 1.0977796171823857, "learning_rate": 5.217042536969238e-06, "loss": 0.1063, "step": 3458 }, { "epoch": 1.6138059701492538, "grad_norm": 1.2000981446555192, "learning_rate": 5.211619757404933e-06, "loss": 0.1066, "step": 3460 }, { "epoch": 1.6147388059701493, "grad_norm": 1.0854262662012177, "learning_rate": 5.2061967284622125e-06, "loss": 0.087, "step": 3462 }, { "epoch": 1.6156716417910446, "grad_norm": 1.2664030492175264, "learning_rate": 5.200773456531721e-06, "loss": 0.1032, "step": 3464 }, { "epoch": 1.6166044776119404, "grad_norm": 1.0918233072376418, "learning_rate": 5.195349948004386e-06, "loss": 0.1109, "step": 3466 }, { "epoch": 1.6175373134328357, "grad_norm": 1.099476486459014, "learning_rate": 5.189926209271415e-06, "loss": 0.1024, "step": 3468 }, { "epoch": 1.6184701492537314, "grad_norm": 1.1647877506586748, "learning_rate": 5.184502246724292e-06, "loss": 0.1102, "step": 3470 }, { "epoch": 1.6194029850746268, "grad_norm": 1.0775912321365924, "learning_rate": 5.179078066754757e-06, "loss": 0.0963, "step": 3472 }, { "epoch": 1.6203358208955225, "grad_norm": 1.0958814174313056, "learning_rate": 5.173653675754807e-06, "loss": 0.0939, "step": 3474 }, { "epoch": 1.6212686567164178, "grad_norm": 1.1992241323664228, "learning_rate": 5.168229080116697e-06, "loss": 0.1117, "step": 3476 }, { "epoch": 1.6222014925373134, "grad_norm": 1.1865337510695115, "learning_rate": 5.162804286232911e-06, "loss": 0.1068, "step": 3478 }, { "epoch": 1.623134328358209, "grad_norm": 1.2189765487255413, "learning_rate": 5.157379300496175e-06, "loss": 0.1043, "step": 3480 }, { "epoch": 1.6240671641791045, "grad_norm": 1.047892442751099, "learning_rate": 5.151954129299437e-06, "loss": 0.1025, "step": 3482 }, { "epoch": 1.625, "grad_norm": 1.0966949663578955, "learning_rate": 5.146528779035864e-06, "loss": 0.0953, "step": 3484 }, { "epoch": 1.6259328358208955, "grad_norm": 1.0023631321949278, "learning_rate": 5.141103256098836e-06, "loss": 0.0958, "step": 3486 }, { "epoch": 1.626865671641791, "grad_norm": 1.0285913271463087, "learning_rate": 5.135677566881935e-06, "loss": 0.1007, "step": 3488 }, { "epoch": 1.6277985074626866, "grad_norm": 1.15379657912386, "learning_rate": 5.130251717778939e-06, "loss": 0.1009, "step": 3490 }, { "epoch": 1.6287313432835822, "grad_norm": 1.1407035220321515, "learning_rate": 5.1248257151838145e-06, "loss": 0.0983, "step": 3492 }, { "epoch": 1.6296641791044775, "grad_norm": 1.0209284136928085, "learning_rate": 5.119399565490707e-06, "loss": 0.0897, "step": 3494 }, { "epoch": 1.6305970149253732, "grad_norm": 1.34014449868634, "learning_rate": 5.113973275093942e-06, "loss": 0.1121, "step": 3496 }, { "epoch": 1.6315298507462686, "grad_norm": 1.1291582885169507, "learning_rate": 5.108546850388002e-06, "loss": 0.1027, "step": 3498 }, { "epoch": 1.6324626865671643, "grad_norm": 1.1120430135012636, "learning_rate": 5.103120297767532e-06, "loss": 0.1031, "step": 3500 }, { "epoch": 1.6324626865671643, "eval_loss": 0.1550646424293518, "eval_runtime": 323.0994, "eval_samples_per_second": 47.187, "eval_steps_per_second": 5.899, "step": 3500 }, { "epoch": 1.6333955223880596, "grad_norm": 1.224966669924812, "learning_rate": 5.09769362362733e-06, "loss": 0.1048, "step": 3502 }, { "epoch": 1.6343283582089554, "grad_norm": 1.1310448493746208, "learning_rate": 5.092266834362334e-06, "loss": 0.096, "step": 3504 }, { "epoch": 1.6352611940298507, "grad_norm": 1.1448619844530687, "learning_rate": 5.086839936367617e-06, "loss": 0.1043, "step": 3506 }, { "epoch": 1.6361940298507462, "grad_norm": 1.198784693852069, "learning_rate": 5.081412936038384e-06, "loss": 0.1002, "step": 3508 }, { "epoch": 1.6371268656716418, "grad_norm": 1.0888414013644547, "learning_rate": 5.075985839769955e-06, "loss": 0.1031, "step": 3510 }, { "epoch": 1.6380597014925373, "grad_norm": 0.9863609356730051, "learning_rate": 5.070558653957769e-06, "loss": 0.0842, "step": 3512 }, { "epoch": 1.6389925373134329, "grad_norm": 1.043429135253945, "learning_rate": 5.065131384997367e-06, "loss": 0.0972, "step": 3514 }, { "epoch": 1.6399253731343284, "grad_norm": 1.1815139017622, "learning_rate": 5.059704039284388e-06, "loss": 0.1047, "step": 3516 }, { "epoch": 1.640858208955224, "grad_norm": 1.0360845535742687, "learning_rate": 5.054276623214563e-06, "loss": 0.1016, "step": 3518 }, { "epoch": 1.6417910447761193, "grad_norm": 1.0442361030967533, "learning_rate": 5.048849143183705e-06, "loss": 0.0904, "step": 3520 }, { "epoch": 1.642723880597015, "grad_norm": 1.1365748005301108, "learning_rate": 5.043421605587703e-06, "loss": 0.0972, "step": 3522 }, { "epoch": 1.6436567164179103, "grad_norm": 0.9719007649517358, "learning_rate": 5.037994016822512e-06, "loss": 0.0847, "step": 3524 }, { "epoch": 1.644589552238806, "grad_norm": 1.1364057839703934, "learning_rate": 5.032566383284149e-06, "loss": 0.1034, "step": 3526 }, { "epoch": 1.6455223880597014, "grad_norm": 1.0413573510632972, "learning_rate": 5.027138711368684e-06, "loss": 0.101, "step": 3528 }, { "epoch": 1.6464552238805972, "grad_norm": 1.0427605352714602, "learning_rate": 5.021711007472233e-06, "loss": 0.095, "step": 3530 }, { "epoch": 1.6473880597014925, "grad_norm": 1.108928417323732, "learning_rate": 5.0162832779909455e-06, "loss": 0.1074, "step": 3532 }, { "epoch": 1.648320895522388, "grad_norm": 1.0270133226481224, "learning_rate": 5.010855529321005e-06, "loss": 0.097, "step": 3534 }, { "epoch": 1.6492537313432836, "grad_norm": 1.0433919115523331, "learning_rate": 5.005427767858616e-06, "loss": 0.1045, "step": 3536 }, { "epoch": 1.650186567164179, "grad_norm": 1.1209653834373903, "learning_rate": 5e-06, "loss": 0.1108, "step": 3538 }, { "epoch": 1.6511194029850746, "grad_norm": 1.0097951219224293, "learning_rate": 4.994572232141385e-06, "loss": 0.0936, "step": 3540 }, { "epoch": 1.6520522388059702, "grad_norm": 1.0897908829200658, "learning_rate": 4.989144470678997e-06, "loss": 0.0926, "step": 3542 }, { "epoch": 1.6529850746268657, "grad_norm": 1.1995842264569205, "learning_rate": 4.983716722009055e-06, "loss": 0.092, "step": 3544 }, { "epoch": 1.653917910447761, "grad_norm": 1.0936272317090328, "learning_rate": 4.978288992527768e-06, "loss": 0.0873, "step": 3546 }, { "epoch": 1.6548507462686568, "grad_norm": 1.0817715121855342, "learning_rate": 4.972861288631317e-06, "loss": 0.0916, "step": 3548 }, { "epoch": 1.6557835820895521, "grad_norm": 1.093440730722074, "learning_rate": 4.967433616715852e-06, "loss": 0.103, "step": 3550 }, { "epoch": 1.6567164179104479, "grad_norm": 1.1828630308619648, "learning_rate": 4.96200598317749e-06, "loss": 0.1033, "step": 3552 }, { "epoch": 1.6576492537313432, "grad_norm": 1.1365119868172056, "learning_rate": 4.956578394412298e-06, "loss": 0.0997, "step": 3554 }, { "epoch": 1.658582089552239, "grad_norm": 1.0447886853566, "learning_rate": 4.9511508568162956e-06, "loss": 0.0896, "step": 3556 }, { "epoch": 1.6595149253731343, "grad_norm": 1.1474184723616883, "learning_rate": 4.945723376785438e-06, "loss": 0.0898, "step": 3558 }, { "epoch": 1.6604477611940298, "grad_norm": 1.0744057040945956, "learning_rate": 4.940295960715613e-06, "loss": 0.1005, "step": 3560 }, { "epoch": 1.6613805970149254, "grad_norm": 1.2387998828204012, "learning_rate": 4.934868615002636e-06, "loss": 0.0905, "step": 3562 }, { "epoch": 1.662313432835821, "grad_norm": 1.0716275535612976, "learning_rate": 4.9294413460422335e-06, "loss": 0.1025, "step": 3564 }, { "epoch": 1.6632462686567164, "grad_norm": 1.0095439089550609, "learning_rate": 4.924014160230045e-06, "loss": 0.0967, "step": 3566 }, { "epoch": 1.664179104477612, "grad_norm": 1.0765502678395185, "learning_rate": 4.918587063961619e-06, "loss": 0.0995, "step": 3568 }, { "epoch": 1.6651119402985075, "grad_norm": 1.1770395394430098, "learning_rate": 4.913160063632384e-06, "loss": 0.0892, "step": 3570 }, { "epoch": 1.6660447761194028, "grad_norm": 1.1159996883856258, "learning_rate": 4.907733165637668e-06, "loss": 0.0911, "step": 3572 }, { "epoch": 1.6669776119402986, "grad_norm": 1.124433917985698, "learning_rate": 4.9023063763726715e-06, "loss": 0.1074, "step": 3574 }, { "epoch": 1.667910447761194, "grad_norm": 1.0132124225805013, "learning_rate": 4.896879702232468e-06, "loss": 0.0896, "step": 3576 }, { "epoch": 1.6688432835820897, "grad_norm": 1.0480685826509903, "learning_rate": 4.891453149611999e-06, "loss": 0.1047, "step": 3578 }, { "epoch": 1.669776119402985, "grad_norm": 1.1364901285389009, "learning_rate": 4.8860267249060596e-06, "loss": 0.1058, "step": 3580 }, { "epoch": 1.6707089552238807, "grad_norm": 1.1375200711907278, "learning_rate": 4.880600434509295e-06, "loss": 0.1058, "step": 3582 }, { "epoch": 1.671641791044776, "grad_norm": 1.1747892292580644, "learning_rate": 4.875174284816188e-06, "loss": 0.0959, "step": 3584 }, { "epoch": 1.6725746268656716, "grad_norm": 1.1273492075578033, "learning_rate": 4.869748282221063e-06, "loss": 0.1024, "step": 3586 }, { "epoch": 1.6735074626865671, "grad_norm": 1.0817554210784017, "learning_rate": 4.864322433118066e-06, "loss": 0.097, "step": 3588 }, { "epoch": 1.6744402985074627, "grad_norm": 1.1379715879944226, "learning_rate": 4.858896743901165e-06, "loss": 0.0986, "step": 3590 }, { "epoch": 1.6753731343283582, "grad_norm": 0.9924153620454816, "learning_rate": 4.853471220964137e-06, "loss": 0.0836, "step": 3592 }, { "epoch": 1.6763059701492538, "grad_norm": 1.1209912319460038, "learning_rate": 4.8480458707005654e-06, "loss": 0.0932, "step": 3594 }, { "epoch": 1.6772388059701493, "grad_norm": 1.0467206766192003, "learning_rate": 4.842620699503825e-06, "loss": 0.0876, "step": 3596 }, { "epoch": 1.6781716417910446, "grad_norm": 1.2005846968793807, "learning_rate": 4.837195713767089e-06, "loss": 0.1105, "step": 3598 }, { "epoch": 1.6791044776119404, "grad_norm": 1.0396833765069797, "learning_rate": 4.8317709198833056e-06, "loss": 0.0892, "step": 3600 }, { "epoch": 1.6800373134328357, "grad_norm": 1.1505955141214124, "learning_rate": 4.826346324245194e-06, "loss": 0.0968, "step": 3602 }, { "epoch": 1.6809701492537314, "grad_norm": 1.229195385370137, "learning_rate": 4.820921933245246e-06, "loss": 0.0976, "step": 3604 }, { "epoch": 1.6819029850746268, "grad_norm": 1.1901564600940944, "learning_rate": 4.815497753275711e-06, "loss": 0.1028, "step": 3606 }, { "epoch": 1.6828358208955225, "grad_norm": 1.1259894388193936, "learning_rate": 4.810073790728585e-06, "loss": 0.1114, "step": 3608 }, { "epoch": 1.6837686567164178, "grad_norm": 1.1101771261250724, "learning_rate": 4.804650051995615e-06, "loss": 0.0891, "step": 3610 }, { "epoch": 1.6847014925373134, "grad_norm": 1.0582792259069187, "learning_rate": 4.79922654346828e-06, "loss": 0.0914, "step": 3612 }, { "epoch": 1.685634328358209, "grad_norm": 1.1124880761448122, "learning_rate": 4.793803271537788e-06, "loss": 0.0919, "step": 3614 }, { "epoch": 1.6865671641791045, "grad_norm": 1.1358075305748105, "learning_rate": 4.7883802425950685e-06, "loss": 0.1039, "step": 3616 }, { "epoch": 1.6875, "grad_norm": 1.055685153236721, "learning_rate": 4.782957463030763e-06, "loss": 0.0966, "step": 3618 }, { "epoch": 1.6884328358208955, "grad_norm": 1.2018144133814113, "learning_rate": 4.777534939235225e-06, "loss": 0.1052, "step": 3620 }, { "epoch": 1.689365671641791, "grad_norm": 1.0712499743212405, "learning_rate": 4.772112677598498e-06, "loss": 0.0954, "step": 3622 }, { "epoch": 1.6902985074626866, "grad_norm": 1.0524792489629278, "learning_rate": 4.766690684510323e-06, "loss": 0.0835, "step": 3624 }, { "epoch": 1.6912313432835822, "grad_norm": 1.050570153381814, "learning_rate": 4.761268966360123e-06, "loss": 0.098, "step": 3626 }, { "epoch": 1.6921641791044775, "grad_norm": 1.0323616470423458, "learning_rate": 4.7558475295369945e-06, "loss": 0.098, "step": 3628 }, { "epoch": 1.6930970149253732, "grad_norm": 1.1051931761480092, "learning_rate": 4.7504263804297064e-06, "loss": 0.0996, "step": 3630 }, { "epoch": 1.6940298507462686, "grad_norm": 1.0970026265986519, "learning_rate": 4.745005525426688e-06, "loss": 0.0964, "step": 3632 }, { "epoch": 1.6949626865671643, "grad_norm": 1.1631049642066742, "learning_rate": 4.739584970916018e-06, "loss": 0.1112, "step": 3634 }, { "epoch": 1.6958955223880596, "grad_norm": 1.0453504812979213, "learning_rate": 4.734164723285424e-06, "loss": 0.0854, "step": 3636 }, { "epoch": 1.6968283582089554, "grad_norm": 1.2805705908477534, "learning_rate": 4.728744788922272e-06, "loss": 0.1026, "step": 3638 }, { "epoch": 1.6977611940298507, "grad_norm": 1.1706183399424146, "learning_rate": 4.723325174213559e-06, "loss": 0.1001, "step": 3640 }, { "epoch": 1.6986940298507462, "grad_norm": 1.0045472923324372, "learning_rate": 4.7179058855459045e-06, "loss": 0.0982, "step": 3642 }, { "epoch": 1.6996268656716418, "grad_norm": 1.1097841587253405, "learning_rate": 4.712486929305544e-06, "loss": 0.0963, "step": 3644 }, { "epoch": 1.7005597014925373, "grad_norm": 1.1361392651171767, "learning_rate": 4.707068311878322e-06, "loss": 0.1001, "step": 3646 }, { "epoch": 1.7014925373134329, "grad_norm": 1.1215581216229769, "learning_rate": 4.701650039649682e-06, "loss": 0.0982, "step": 3648 }, { "epoch": 1.7024253731343284, "grad_norm": 1.157461079632266, "learning_rate": 4.69623211900466e-06, "loss": 0.106, "step": 3650 }, { "epoch": 1.703358208955224, "grad_norm": 1.0029338314369465, "learning_rate": 4.690814556327885e-06, "loss": 0.088, "step": 3652 }, { "epoch": 1.7042910447761193, "grad_norm": 1.1530178820591044, "learning_rate": 4.685397358003554e-06, "loss": 0.1075, "step": 3654 }, { "epoch": 1.705223880597015, "grad_norm": 1.1422854763505046, "learning_rate": 4.6799805304154396e-06, "loss": 0.103, "step": 3656 }, { "epoch": 1.7061567164179103, "grad_norm": 1.2071149858502441, "learning_rate": 4.6745640799468786e-06, "loss": 0.1028, "step": 3658 }, { "epoch": 1.707089552238806, "grad_norm": 1.0088615671201564, "learning_rate": 4.669148012980761e-06, "loss": 0.0935, "step": 3660 }, { "epoch": 1.7080223880597014, "grad_norm": 1.1276835081509065, "learning_rate": 4.663732335899527e-06, "loss": 0.0919, "step": 3662 }, { "epoch": 1.7089552238805972, "grad_norm": 1.2096235399066795, "learning_rate": 4.658317055085154e-06, "loss": 0.1068, "step": 3664 }, { "epoch": 1.7098880597014925, "grad_norm": 1.1279369930636538, "learning_rate": 4.652902176919159e-06, "loss": 0.0935, "step": 3666 }, { "epoch": 1.710820895522388, "grad_norm": 1.1103063243139837, "learning_rate": 4.647487707782575e-06, "loss": 0.1016, "step": 3668 }, { "epoch": 1.7117537313432836, "grad_norm": 1.1597134084926009, "learning_rate": 4.642073654055959e-06, "loss": 0.0963, "step": 3670 }, { "epoch": 1.712686567164179, "grad_norm": 1.1020343847839857, "learning_rate": 4.636660022119382e-06, "loss": 0.0965, "step": 3672 }, { "epoch": 1.7136194029850746, "grad_norm": 1.0684766114260358, "learning_rate": 4.631246818352408e-06, "loss": 0.0843, "step": 3674 }, { "epoch": 1.7145522388059702, "grad_norm": 1.0985281014336183, "learning_rate": 4.625834049134105e-06, "loss": 0.1061, "step": 3676 }, { "epoch": 1.7154850746268657, "grad_norm": 1.1518640444108639, "learning_rate": 4.620421720843025e-06, "loss": 0.0882, "step": 3678 }, { "epoch": 1.716417910447761, "grad_norm": 1.2363234410164508, "learning_rate": 4.615009839857202e-06, "loss": 0.0967, "step": 3680 }, { "epoch": 1.7173507462686568, "grad_norm": 1.1890607734505911, "learning_rate": 4.60959841255414e-06, "loss": 0.093, "step": 3682 }, { "epoch": 1.7182835820895521, "grad_norm": 1.1660943897921863, "learning_rate": 4.604187445310814e-06, "loss": 0.1015, "step": 3684 }, { "epoch": 1.7192164179104479, "grad_norm": 1.0751777453698839, "learning_rate": 4.59877694450365e-06, "loss": 0.0905, "step": 3686 }, { "epoch": 1.7201492537313432, "grad_norm": 1.1920409481024197, "learning_rate": 4.59336691650853e-06, "loss": 0.1011, "step": 3688 }, { "epoch": 1.721082089552239, "grad_norm": 1.175634938951435, "learning_rate": 4.587957367700776e-06, "loss": 0.094, "step": 3690 }, { "epoch": 1.7220149253731343, "grad_norm": 1.3001011587599143, "learning_rate": 4.5825483044551435e-06, "loss": 0.1074, "step": 3692 }, { "epoch": 1.7229477611940298, "grad_norm": 1.1559607773710021, "learning_rate": 4.5771397331458224e-06, "loss": 0.1085, "step": 3694 }, { "epoch": 1.7238805970149254, "grad_norm": 1.0917943572687148, "learning_rate": 4.571731660146416e-06, "loss": 0.097, "step": 3696 }, { "epoch": 1.724813432835821, "grad_norm": 1.177871791179173, "learning_rate": 4.566324091829945e-06, "loss": 0.0915, "step": 3698 }, { "epoch": 1.7257462686567164, "grad_norm": 1.0381343767107474, "learning_rate": 4.5609170345688305e-06, "loss": 0.1003, "step": 3700 }, { "epoch": 1.726679104477612, "grad_norm": 1.095792541674332, "learning_rate": 4.555510494734893e-06, "loss": 0.0914, "step": 3702 }, { "epoch": 1.7276119402985075, "grad_norm": 1.2223452882085306, "learning_rate": 4.550104478699351e-06, "loss": 0.0967, "step": 3704 }, { "epoch": 1.7285447761194028, "grad_norm": 0.976454960376214, "learning_rate": 4.544698992832795e-06, "loss": 0.0921, "step": 3706 }, { "epoch": 1.7294776119402986, "grad_norm": 0.9841552149965179, "learning_rate": 4.539294043505195e-06, "loss": 0.0984, "step": 3708 }, { "epoch": 1.730410447761194, "grad_norm": 1.0250549535432676, "learning_rate": 4.533889637085888e-06, "loss": 0.0904, "step": 3710 }, { "epoch": 1.7313432835820897, "grad_norm": 1.0340199334224152, "learning_rate": 4.528485779943573e-06, "loss": 0.0917, "step": 3712 }, { "epoch": 1.732276119402985, "grad_norm": 1.1186188527488128, "learning_rate": 4.523082478446301e-06, "loss": 0.1121, "step": 3714 }, { "epoch": 1.7332089552238807, "grad_norm": 1.0236389887859945, "learning_rate": 4.517679738961468e-06, "loss": 0.0918, "step": 3716 }, { "epoch": 1.734141791044776, "grad_norm": 1.1577395975820644, "learning_rate": 4.512277567855809e-06, "loss": 0.0954, "step": 3718 }, { "epoch": 1.7350746268656716, "grad_norm": 1.2585587926998412, "learning_rate": 4.506875971495383e-06, "loss": 0.099, "step": 3720 }, { "epoch": 1.7360074626865671, "grad_norm": 1.0662256276154634, "learning_rate": 4.5014749562455805e-06, "loss": 0.0899, "step": 3722 }, { "epoch": 1.7369402985074627, "grad_norm": 1.0936922063729748, "learning_rate": 4.4960745284711e-06, "loss": 0.0976, "step": 3724 }, { "epoch": 1.7378731343283582, "grad_norm": 1.2055073240106777, "learning_rate": 4.490674694535955e-06, "loss": 0.1114, "step": 3726 }, { "epoch": 1.7388059701492538, "grad_norm": 1.0419708398365015, "learning_rate": 4.485275460803452e-06, "loss": 0.0907, "step": 3728 }, { "epoch": 1.7397388059701493, "grad_norm": 1.0903435419954495, "learning_rate": 4.479876833636196e-06, "loss": 0.097, "step": 3730 }, { "epoch": 1.7406716417910446, "grad_norm": 1.065172655996034, "learning_rate": 4.474478819396072e-06, "loss": 0.0954, "step": 3732 }, { "epoch": 1.7416044776119404, "grad_norm": 1.1264507441495129, "learning_rate": 4.469081424444243e-06, "loss": 0.0983, "step": 3734 }, { "epoch": 1.7425373134328357, "grad_norm": 1.0737139692551787, "learning_rate": 4.463684655141151e-06, "loss": 0.0969, "step": 3736 }, { "epoch": 1.7434701492537314, "grad_norm": 1.0271970616815274, "learning_rate": 4.45828851784649e-06, "loss": 0.0958, "step": 3738 }, { "epoch": 1.7444029850746268, "grad_norm": 0.9816538224728948, "learning_rate": 4.452893018919213e-06, "loss": 0.0902, "step": 3740 }, { "epoch": 1.7453358208955225, "grad_norm": 1.0339703432308618, "learning_rate": 4.447498164717522e-06, "loss": 0.0982, "step": 3742 }, { "epoch": 1.7462686567164178, "grad_norm": 1.1895943462090308, "learning_rate": 4.442103961598858e-06, "loss": 0.0974, "step": 3744 }, { "epoch": 1.7472014925373134, "grad_norm": 1.0522698584404027, "learning_rate": 4.436710415919896e-06, "loss": 0.088, "step": 3746 }, { "epoch": 1.748134328358209, "grad_norm": 1.0222227241266983, "learning_rate": 4.431317534036535e-06, "loss": 0.0886, "step": 3748 }, { "epoch": 1.7490671641791045, "grad_norm": 1.0845559270059455, "learning_rate": 4.425925322303893e-06, "loss": 0.0871, "step": 3750 }, { "epoch": 1.75, "grad_norm": 1.1453172318035394, "learning_rate": 4.420533787076295e-06, "loss": 0.0984, "step": 3752 }, { "epoch": 1.7509328358208955, "grad_norm": 1.0478874821424775, "learning_rate": 4.41514293470727e-06, "loss": 0.0838, "step": 3754 }, { "epoch": 1.751865671641791, "grad_norm": 1.0826437025539664, "learning_rate": 4.4097527715495495e-06, "loss": 0.0945, "step": 3756 }, { "epoch": 1.7527985074626866, "grad_norm": 1.0851384876664898, "learning_rate": 4.4043633039550425e-06, "loss": 0.0983, "step": 3758 }, { "epoch": 1.7537313432835822, "grad_norm": 1.0879014786594696, "learning_rate": 4.398974538274843e-06, "loss": 0.0906, "step": 3760 }, { "epoch": 1.7546641791044775, "grad_norm": 1.1075558969803165, "learning_rate": 4.393586480859217e-06, "loss": 0.0935, "step": 3762 }, { "epoch": 1.7555970149253732, "grad_norm": 1.2069947270613044, "learning_rate": 4.388199138057599e-06, "loss": 0.0998, "step": 3764 }, { "epoch": 1.7565298507462686, "grad_norm": 1.0467782837242663, "learning_rate": 4.382812516218573e-06, "loss": 0.0877, "step": 3766 }, { "epoch": 1.7574626865671643, "grad_norm": 1.0449747101766405, "learning_rate": 4.377426621689885e-06, "loss": 0.0977, "step": 3768 }, { "epoch": 1.7583955223880596, "grad_norm": 1.128941413012485, "learning_rate": 4.3720414608184175e-06, "loss": 0.0998, "step": 3770 }, { "epoch": 1.7593283582089554, "grad_norm": 1.1647396071651746, "learning_rate": 4.366657039950186e-06, "loss": 0.0976, "step": 3772 }, { "epoch": 1.7602611940298507, "grad_norm": 1.0162492358020478, "learning_rate": 4.361273365430338e-06, "loss": 0.0988, "step": 3774 }, { "epoch": 1.7611940298507462, "grad_norm": 1.1246697581947454, "learning_rate": 4.355890443603139e-06, "loss": 0.0913, "step": 3776 }, { "epoch": 1.7621268656716418, "grad_norm": 1.0394859969226904, "learning_rate": 4.350508280811973e-06, "loss": 0.0947, "step": 3778 }, { "epoch": 1.7630597014925373, "grad_norm": 1.2960785308474594, "learning_rate": 4.345126883399323e-06, "loss": 0.0949, "step": 3780 }, { "epoch": 1.7639925373134329, "grad_norm": 1.3798023038263878, "learning_rate": 4.339746257706771e-06, "loss": 0.1075, "step": 3782 }, { "epoch": 1.7649253731343284, "grad_norm": 1.0076708894235762, "learning_rate": 4.334366410074995e-06, "loss": 0.0885, "step": 3784 }, { "epoch": 1.765858208955224, "grad_norm": 1.2526434203532881, "learning_rate": 4.328987346843746e-06, "loss": 0.1009, "step": 3786 }, { "epoch": 1.7667910447761193, "grad_norm": 1.0884714717105828, "learning_rate": 4.3236090743518635e-06, "loss": 0.1029, "step": 3788 }, { "epoch": 1.767723880597015, "grad_norm": 1.0296748006116867, "learning_rate": 4.3182315989372446e-06, "loss": 0.0943, "step": 3790 }, { "epoch": 1.7686567164179103, "grad_norm": 1.1289222001502612, "learning_rate": 4.312854926936852e-06, "loss": 0.1076, "step": 3792 }, { "epoch": 1.769589552238806, "grad_norm": 1.0725243856227535, "learning_rate": 4.307479064686701e-06, "loss": 0.0915, "step": 3794 }, { "epoch": 1.7705223880597014, "grad_norm": 1.0079013994375832, "learning_rate": 4.30210401852185e-06, "loss": 0.0847, "step": 3796 }, { "epoch": 1.7714552238805972, "grad_norm": 1.0014496534946675, "learning_rate": 4.296729794776402e-06, "loss": 0.09, "step": 3798 }, { "epoch": 1.7723880597014925, "grad_norm": 1.2232482621343395, "learning_rate": 4.291356399783484e-06, "loss": 0.0905, "step": 3800 }, { "epoch": 1.773320895522388, "grad_norm": 1.1905264283336212, "learning_rate": 4.2859838398752515e-06, "loss": 0.1013, "step": 3802 }, { "epoch": 1.7742537313432836, "grad_norm": 1.0466086761704225, "learning_rate": 4.280612121382872e-06, "loss": 0.0871, "step": 3804 }, { "epoch": 1.775186567164179, "grad_norm": 1.0983217922469228, "learning_rate": 4.275241250636522e-06, "loss": 0.0955, "step": 3806 }, { "epoch": 1.7761194029850746, "grad_norm": 1.2015898573258283, "learning_rate": 4.269871233965381e-06, "loss": 0.1014, "step": 3808 }, { "epoch": 1.7770522388059702, "grad_norm": 1.0921263552324518, "learning_rate": 4.264502077697622e-06, "loss": 0.0911, "step": 3810 }, { "epoch": 1.7779850746268657, "grad_norm": 1.1822366150622516, "learning_rate": 4.259133788160402e-06, "loss": 0.0997, "step": 3812 }, { "epoch": 1.778917910447761, "grad_norm": 1.1358230989160354, "learning_rate": 4.25376637167986e-06, "loss": 0.0932, "step": 3814 }, { "epoch": 1.7798507462686568, "grad_norm": 1.1591007066127716, "learning_rate": 4.248399834581103e-06, "loss": 0.0936, "step": 3816 }, { "epoch": 1.7807835820895521, "grad_norm": 1.1903194907889803, "learning_rate": 4.243034183188199e-06, "loss": 0.0873, "step": 3818 }, { "epoch": 1.7817164179104479, "grad_norm": 1.0647104993059966, "learning_rate": 4.2376694238241815e-06, "loss": 0.0896, "step": 3820 }, { "epoch": 1.7826492537313432, "grad_norm": 1.2340235768626262, "learning_rate": 4.2323055628110245e-06, "loss": 0.1072, "step": 3822 }, { "epoch": 1.783582089552239, "grad_norm": 1.2193822544577344, "learning_rate": 4.226942606469647e-06, "loss": 0.1066, "step": 3824 }, { "epoch": 1.7845149253731343, "grad_norm": 1.1293602901076107, "learning_rate": 4.2215805611199e-06, "loss": 0.1022, "step": 3826 }, { "epoch": 1.7854477611940298, "grad_norm": 1.3256242635540962, "learning_rate": 4.216219433080561e-06, "loss": 0.1224, "step": 3828 }, { "epoch": 1.7863805970149254, "grad_norm": 1.1404796972513265, "learning_rate": 4.210859228669331e-06, "loss": 0.098, "step": 3830 }, { "epoch": 1.787313432835821, "grad_norm": 0.9573676891903881, "learning_rate": 4.205499954202817e-06, "loss": 0.0835, "step": 3832 }, { "epoch": 1.7882462686567164, "grad_norm": 1.1540044340356037, "learning_rate": 4.200141615996532e-06, "loss": 0.1081, "step": 3834 }, { "epoch": 1.789179104477612, "grad_norm": 1.2157590612321367, "learning_rate": 4.194784220364888e-06, "loss": 0.0883, "step": 3836 }, { "epoch": 1.7901119402985075, "grad_norm": 1.0340845426370617, "learning_rate": 4.189427773621179e-06, "loss": 0.0864, "step": 3838 }, { "epoch": 1.7910447761194028, "grad_norm": 1.1161150048576776, "learning_rate": 4.184072282077593e-06, "loss": 0.0966, "step": 3840 }, { "epoch": 1.7919776119402986, "grad_norm": 1.216937865414375, "learning_rate": 4.17871775204518e-06, "loss": 0.1094, "step": 3842 }, { "epoch": 1.792910447761194, "grad_norm": 1.0620932308260993, "learning_rate": 4.173364189833864e-06, "loss": 0.0854, "step": 3844 }, { "epoch": 1.7938432835820897, "grad_norm": 1.1270696297336795, "learning_rate": 4.168011601752427e-06, "loss": 0.0928, "step": 3846 }, { "epoch": 1.794776119402985, "grad_norm": 1.2350514563389072, "learning_rate": 4.162659994108502e-06, "loss": 0.1011, "step": 3848 }, { "epoch": 1.7957089552238807, "grad_norm": 1.101956567541425, "learning_rate": 4.1573093732085675e-06, "loss": 0.0963, "step": 3850 }, { "epoch": 1.796641791044776, "grad_norm": 1.0861244375436816, "learning_rate": 4.151959745357941e-06, "loss": 0.0923, "step": 3852 }, { "epoch": 1.7975746268656716, "grad_norm": 0.9689295079280268, "learning_rate": 4.146611116860767e-06, "loss": 0.0833, "step": 3854 }, { "epoch": 1.7985074626865671, "grad_norm": 1.1511239719980124, "learning_rate": 4.1412634940200116e-06, "loss": 0.0875, "step": 3856 }, { "epoch": 1.7994402985074627, "grad_norm": 1.1621238931880757, "learning_rate": 4.135916883137458e-06, "loss": 0.0991, "step": 3858 }, { "epoch": 1.8003731343283582, "grad_norm": 0.9613995980719443, "learning_rate": 4.130571290513696e-06, "loss": 0.0855, "step": 3860 }, { "epoch": 1.8013059701492538, "grad_norm": 1.1518276454691472, "learning_rate": 4.125226722448119e-06, "loss": 0.0931, "step": 3862 }, { "epoch": 1.8022388059701493, "grad_norm": 1.041908099670943, "learning_rate": 4.119883185238905e-06, "loss": 0.0871, "step": 3864 }, { "epoch": 1.8031716417910446, "grad_norm": 1.0574759751416205, "learning_rate": 4.114540685183026e-06, "loss": 0.091, "step": 3866 }, { "epoch": 1.8041044776119404, "grad_norm": 1.0966981760812202, "learning_rate": 4.109199228576227e-06, "loss": 0.0965, "step": 3868 }, { "epoch": 1.8050373134328357, "grad_norm": 1.0356744160990077, "learning_rate": 4.103858821713021e-06, "loss": 0.092, "step": 3870 }, { "epoch": 1.8059701492537314, "grad_norm": 1.1157359606890926, "learning_rate": 4.0985194708866905e-06, "loss": 0.1017, "step": 3872 }, { "epoch": 1.8069029850746268, "grad_norm": 1.2367449459949518, "learning_rate": 4.093181182389271e-06, "loss": 0.0908, "step": 3874 }, { "epoch": 1.8078358208955225, "grad_norm": 0.9933622219151041, "learning_rate": 4.087843962511541e-06, "loss": 0.0932, "step": 3876 }, { "epoch": 1.8087686567164178, "grad_norm": 1.1402345985026843, "learning_rate": 4.082507817543028e-06, "loss": 0.1026, "step": 3878 }, { "epoch": 1.8097014925373134, "grad_norm": 1.1988662208434808, "learning_rate": 4.077172753771986e-06, "loss": 0.102, "step": 3880 }, { "epoch": 1.810634328358209, "grad_norm": 1.3120916733676826, "learning_rate": 4.071838777485398e-06, "loss": 0.1039, "step": 3882 }, { "epoch": 1.8115671641791045, "grad_norm": 1.110205721427487, "learning_rate": 4.066505894968969e-06, "loss": 0.0972, "step": 3884 }, { "epoch": 1.8125, "grad_norm": 1.1716763276015492, "learning_rate": 4.061174112507106e-06, "loss": 0.0928, "step": 3886 }, { "epoch": 1.8134328358208955, "grad_norm": 1.3046999821806964, "learning_rate": 4.05584343638293e-06, "loss": 0.1009, "step": 3888 }, { "epoch": 1.814365671641791, "grad_norm": 1.0396490607292204, "learning_rate": 4.050513872878249e-06, "loss": 0.0933, "step": 3890 }, { "epoch": 1.8152985074626866, "grad_norm": 1.139036715278458, "learning_rate": 4.045185428273563e-06, "loss": 0.1071, "step": 3892 }, { "epoch": 1.8162313432835822, "grad_norm": 1.1192745314307477, "learning_rate": 4.03985810884806e-06, "loss": 0.0942, "step": 3894 }, { "epoch": 1.8171641791044775, "grad_norm": 1.0951331224480367, "learning_rate": 4.034531920879591e-06, "loss": 0.0869, "step": 3896 }, { "epoch": 1.8180970149253732, "grad_norm": 1.015898359547634, "learning_rate": 4.029206870644684e-06, "loss": 0.0892, "step": 3898 }, { "epoch": 1.8190298507462686, "grad_norm": 1.0193538642768085, "learning_rate": 4.0238829644185175e-06, "loss": 0.088, "step": 3900 }, { "epoch": 1.8199626865671643, "grad_norm": 1.175930383539463, "learning_rate": 4.018560208474927e-06, "loss": 0.0958, "step": 3902 }, { "epoch": 1.8208955223880596, "grad_norm": 0.9565088342567477, "learning_rate": 4.013238609086393e-06, "loss": 0.0787, "step": 3904 }, { "epoch": 1.8218283582089554, "grad_norm": 1.2742170396051626, "learning_rate": 4.007918172524031e-06, "loss": 0.0915, "step": 3906 }, { "epoch": 1.8227611940298507, "grad_norm": 1.1683685051254658, "learning_rate": 4.002598905057584e-06, "loss": 0.0899, "step": 3908 }, { "epoch": 1.8236940298507462, "grad_norm": 1.2504392164854112, "learning_rate": 3.997280812955423e-06, "loss": 0.1017, "step": 3910 }, { "epoch": 1.8246268656716418, "grad_norm": 1.2136286482853063, "learning_rate": 3.991963902484527e-06, "loss": 0.0958, "step": 3912 }, { "epoch": 1.8255597014925373, "grad_norm": 1.1056367344685532, "learning_rate": 3.986648179910491e-06, "loss": 0.1015, "step": 3914 }, { "epoch": 1.8264925373134329, "grad_norm": 1.0051688535853402, "learning_rate": 3.981333651497502e-06, "loss": 0.0941, "step": 3916 }, { "epoch": 1.8274253731343284, "grad_norm": 1.0874480949477714, "learning_rate": 3.976020323508345e-06, "loss": 0.0866, "step": 3918 }, { "epoch": 1.828358208955224, "grad_norm": 1.0076486745719422, "learning_rate": 3.97070820220439e-06, "loss": 0.0947, "step": 3920 }, { "epoch": 1.8292910447761193, "grad_norm": 1.2041628641455155, "learning_rate": 3.96539729384558e-06, "loss": 0.0983, "step": 3922 }, { "epoch": 1.830223880597015, "grad_norm": 1.0824329943543542, "learning_rate": 3.9600876046904326e-06, "loss": 0.0903, "step": 3924 }, { "epoch": 1.8311567164179103, "grad_norm": 1.0689057768272712, "learning_rate": 3.954779140996032e-06, "loss": 0.0905, "step": 3926 }, { "epoch": 1.832089552238806, "grad_norm": 1.0168855559064294, "learning_rate": 3.949471909018012e-06, "loss": 0.0918, "step": 3928 }, { "epoch": 1.8330223880597014, "grad_norm": 1.0242565992788855, "learning_rate": 3.944165915010559e-06, "loss": 0.0888, "step": 3930 }, { "epoch": 1.8339552238805972, "grad_norm": 1.1466337824307269, "learning_rate": 3.938861165226398e-06, "loss": 0.0996, "step": 3932 }, { "epoch": 1.8348880597014925, "grad_norm": 1.1709483615899532, "learning_rate": 3.933557665916787e-06, "loss": 0.095, "step": 3934 }, { "epoch": 1.835820895522388, "grad_norm": 1.06104474397146, "learning_rate": 3.928255423331516e-06, "loss": 0.0961, "step": 3936 }, { "epoch": 1.8367537313432836, "grad_norm": 1.173836946104865, "learning_rate": 3.922954443718889e-06, "loss": 0.0999, "step": 3938 }, { "epoch": 1.837686567164179, "grad_norm": 1.046950695546202, "learning_rate": 3.917654733325722e-06, "loss": 0.0877, "step": 3940 }, { "epoch": 1.8386194029850746, "grad_norm": 1.0207179115483678, "learning_rate": 3.912356298397338e-06, "loss": 0.0924, "step": 3942 }, { "epoch": 1.8395522388059702, "grad_norm": 1.256927733061973, "learning_rate": 3.907059145177551e-06, "loss": 0.1014, "step": 3944 }, { "epoch": 1.8404850746268657, "grad_norm": 1.111798269664183, "learning_rate": 3.901763279908675e-06, "loss": 0.0916, "step": 3946 }, { "epoch": 1.841417910447761, "grad_norm": 1.0668214975294112, "learning_rate": 3.896468708831497e-06, "loss": 0.0807, "step": 3948 }, { "epoch": 1.8423507462686568, "grad_norm": 1.2604215090545203, "learning_rate": 3.891175438185281e-06, "loss": 0.104, "step": 3950 }, { "epoch": 1.8432835820895521, "grad_norm": 0.912519566826213, "learning_rate": 3.885883474207763e-06, "loss": 0.0787, "step": 3952 }, { "epoch": 1.8442164179104479, "grad_norm": 1.0991981974832183, "learning_rate": 3.880592823135129e-06, "loss": 0.0977, "step": 3954 }, { "epoch": 1.8451492537313432, "grad_norm": 1.0354663871727174, "learning_rate": 3.875303491202033e-06, "loss": 0.0959, "step": 3956 }, { "epoch": 1.846082089552239, "grad_norm": 1.083104908415455, "learning_rate": 3.8700154846415614e-06, "loss": 0.0885, "step": 3958 }, { "epoch": 1.8470149253731343, "grad_norm": 1.0915617448120623, "learning_rate": 3.864728809685244e-06, "loss": 0.0973, "step": 3960 }, { "epoch": 1.8479477611940298, "grad_norm": 1.0462169645645247, "learning_rate": 3.859443472563041e-06, "loss": 0.095, "step": 3962 }, { "epoch": 1.8488805970149254, "grad_norm": 1.1079536096696272, "learning_rate": 3.854159479503335e-06, "loss": 0.1003, "step": 3964 }, { "epoch": 1.849813432835821, "grad_norm": 1.175094326106707, "learning_rate": 3.848876836732926e-06, "loss": 0.0926, "step": 3966 }, { "epoch": 1.8507462686567164, "grad_norm": 0.9792823022046725, "learning_rate": 3.843595550477023e-06, "loss": 0.0928, "step": 3968 }, { "epoch": 1.851679104477612, "grad_norm": 1.3560895712988963, "learning_rate": 3.838315626959236e-06, "loss": 0.1075, "step": 3970 }, { "epoch": 1.8526119402985075, "grad_norm": 1.045236010976236, "learning_rate": 3.83303707240157e-06, "loss": 0.0908, "step": 3972 }, { "epoch": 1.8535447761194028, "grad_norm": 1.1881488868066707, "learning_rate": 3.827759893024412e-06, "loss": 0.1055, "step": 3974 }, { "epoch": 1.8544776119402986, "grad_norm": 1.0705731080187553, "learning_rate": 3.822484095046533e-06, "loss": 0.095, "step": 3976 }, { "epoch": 1.855410447761194, "grad_norm": 1.1368155490767597, "learning_rate": 3.817209684685079e-06, "loss": 0.0891, "step": 3978 }, { "epoch": 1.8563432835820897, "grad_norm": 1.010088803368523, "learning_rate": 3.811936668155554e-06, "loss": 0.0848, "step": 3980 }, { "epoch": 1.857276119402985, "grad_norm": 1.1623239069133264, "learning_rate": 3.8066650516718236e-06, "loss": 0.1044, "step": 3982 }, { "epoch": 1.8582089552238807, "grad_norm": 0.9709903194165235, "learning_rate": 3.8013948414461017e-06, "loss": 0.0927, "step": 3984 }, { "epoch": 1.859141791044776, "grad_norm": 1.1693040269647939, "learning_rate": 3.7961260436889454e-06, "loss": 0.0899, "step": 3986 }, { "epoch": 1.8600746268656716, "grad_norm": 1.1244432868402252, "learning_rate": 3.790858664609249e-06, "loss": 0.1002, "step": 3988 }, { "epoch": 1.8610074626865671, "grad_norm": 1.112164749972976, "learning_rate": 3.7855927104142354e-06, "loss": 0.1083, "step": 3990 }, { "epoch": 1.8619402985074627, "grad_norm": 0.9622138242433877, "learning_rate": 3.7803281873094426e-06, "loss": 0.0804, "step": 3992 }, { "epoch": 1.8628731343283582, "grad_norm": 1.1146764918503846, "learning_rate": 3.7750651014987283e-06, "loss": 0.1018, "step": 3994 }, { "epoch": 1.8638059701492538, "grad_norm": 1.1310688782322793, "learning_rate": 3.7698034591842536e-06, "loss": 0.1015, "step": 3996 }, { "epoch": 1.8647388059701493, "grad_norm": 1.0677654354420405, "learning_rate": 3.764543266566482e-06, "loss": 0.0942, "step": 3998 }, { "epoch": 1.8656716417910446, "grad_norm": 1.0633403351629696, "learning_rate": 3.7592845298441626e-06, "loss": 0.0986, "step": 4000 }, { "epoch": 1.8656716417910446, "eval_loss": 0.1479332447052002, "eval_runtime": 320.9933, "eval_samples_per_second": 47.496, "eval_steps_per_second": 5.938, "step": 4000 }, { "epoch": 1.8666044776119404, "grad_norm": 1.2843284382909446, "learning_rate": 3.7540272552143343e-06, "loss": 0.1109, "step": 4002 }, { "epoch": 1.8675373134328357, "grad_norm": 0.986376625934349, "learning_rate": 3.7487714488723116e-06, "loss": 0.0844, "step": 4004 }, { "epoch": 1.8684701492537314, "grad_norm": 1.117605149767856, "learning_rate": 3.743517117011676e-06, "loss": 0.091, "step": 4006 }, { "epoch": 1.8694029850746268, "grad_norm": 1.1111481927243665, "learning_rate": 3.7382642658242716e-06, "loss": 0.0998, "step": 4008 }, { "epoch": 1.8703358208955225, "grad_norm": 1.0894489967394465, "learning_rate": 3.7330129015002066e-06, "loss": 0.0895, "step": 4010 }, { "epoch": 1.8712686567164178, "grad_norm": 1.1092242785657853, "learning_rate": 3.727763030227824e-06, "loss": 0.0949, "step": 4012 }, { "epoch": 1.8722014925373134, "grad_norm": 1.0182803071041748, "learning_rate": 3.7225146581937155e-06, "loss": 0.0843, "step": 4014 }, { "epoch": 1.873134328358209, "grad_norm": 1.0810906313746105, "learning_rate": 3.7172677915827037e-06, "loss": 0.0852, "step": 4016 }, { "epoch": 1.8740671641791045, "grad_norm": 1.306685098425301, "learning_rate": 3.7120224365778356e-06, "loss": 0.0991, "step": 4018 }, { "epoch": 1.875, "grad_norm": 1.14440249356662, "learning_rate": 3.7067785993603822e-06, "loss": 0.097, "step": 4020 }, { "epoch": 1.8759328358208955, "grad_norm": 1.0610042957590597, "learning_rate": 3.7015362861098197e-06, "loss": 0.0921, "step": 4022 }, { "epoch": 1.876865671641791, "grad_norm": 1.1835383394388608, "learning_rate": 3.6962955030038332e-06, "loss": 0.1, "step": 4024 }, { "epoch": 1.8777985074626866, "grad_norm": 1.0480652020524597, "learning_rate": 3.6910562562183006e-06, "loss": 0.0893, "step": 4026 }, { "epoch": 1.8787313432835822, "grad_norm": 1.0398608770884399, "learning_rate": 3.6858185519272906e-06, "loss": 0.1003, "step": 4028 }, { "epoch": 1.8796641791044775, "grad_norm": 1.0971184446881803, "learning_rate": 3.680582396303056e-06, "loss": 0.0898, "step": 4030 }, { "epoch": 1.8805970149253732, "grad_norm": 1.114461790561332, "learning_rate": 3.6753477955160244e-06, "loss": 0.0971, "step": 4032 }, { "epoch": 1.8815298507462686, "grad_norm": 1.1316670021453272, "learning_rate": 3.6701147557347893e-06, "loss": 0.1072, "step": 4034 }, { "epoch": 1.8824626865671643, "grad_norm": 1.0613309947998661, "learning_rate": 3.664883283126106e-06, "loss": 0.0942, "step": 4036 }, { "epoch": 1.8833955223880596, "grad_norm": 1.0959102353877053, "learning_rate": 3.659653383854881e-06, "loss": 0.0806, "step": 4038 }, { "epoch": 1.8843283582089554, "grad_norm": 1.1569978972917845, "learning_rate": 3.65442506408417e-06, "loss": 0.0949, "step": 4040 }, { "epoch": 1.8852611940298507, "grad_norm": 1.028704446421795, "learning_rate": 3.6491983299751665e-06, "loss": 0.0909, "step": 4042 }, { "epoch": 1.8861940298507462, "grad_norm": 0.9567363078679719, "learning_rate": 3.6439731876871928e-06, "loss": 0.0797, "step": 4044 }, { "epoch": 1.8871268656716418, "grad_norm": 0.9911284399542191, "learning_rate": 3.638749643377697e-06, "loss": 0.0924, "step": 4046 }, { "epoch": 1.8880597014925373, "grad_norm": 1.054706019306054, "learning_rate": 3.6335277032022446e-06, "loss": 0.0916, "step": 4048 }, { "epoch": 1.8889925373134329, "grad_norm": 1.0909782656433502, "learning_rate": 3.62830737331451e-06, "loss": 0.0922, "step": 4050 }, { "epoch": 1.8899253731343284, "grad_norm": 1.101096307195021, "learning_rate": 3.6230886598662717e-06, "loss": 0.0967, "step": 4052 }, { "epoch": 1.890858208955224, "grad_norm": 1.107710143260698, "learning_rate": 3.6178715690074016e-06, "loss": 0.0888, "step": 4054 }, { "epoch": 1.8917910447761193, "grad_norm": 1.11409851460641, "learning_rate": 3.6126561068858613e-06, "loss": 0.1021, "step": 4056 }, { "epoch": 1.892723880597015, "grad_norm": 1.0608903619283372, "learning_rate": 3.607442279647689e-06, "loss": 0.0942, "step": 4058 }, { "epoch": 1.8936567164179103, "grad_norm": 1.1493025908294738, "learning_rate": 3.6022300934369976e-06, "loss": 0.092, "step": 4060 }, { "epoch": 1.894589552238806, "grad_norm": 1.150870057694778, "learning_rate": 3.597019554395973e-06, "loss": 0.093, "step": 4062 }, { "epoch": 1.8955223880597014, "grad_norm": 1.1491826921399277, "learning_rate": 3.591810668664851e-06, "loss": 0.0906, "step": 4064 }, { "epoch": 1.8964552238805972, "grad_norm": 1.2290903828259188, "learning_rate": 3.586603442381923e-06, "loss": 0.1042, "step": 4066 }, { "epoch": 1.8973880597014925, "grad_norm": 1.1276710304056294, "learning_rate": 3.581397881683525e-06, "loss": 0.0965, "step": 4068 }, { "epoch": 1.898320895522388, "grad_norm": 1.0896855342468688, "learning_rate": 3.576193992704029e-06, "loss": 0.096, "step": 4070 }, { "epoch": 1.8992537313432836, "grad_norm": 1.1574983867200623, "learning_rate": 3.5709917815758388e-06, "loss": 0.0984, "step": 4072 }, { "epoch": 1.900186567164179, "grad_norm": 1.0613423439242609, "learning_rate": 3.5657912544293805e-06, "loss": 0.0898, "step": 4074 }, { "epoch": 1.9011194029850746, "grad_norm": 0.9955766320361427, "learning_rate": 3.5605924173930946e-06, "loss": 0.0934, "step": 4076 }, { "epoch": 1.9020522388059702, "grad_norm": 1.0428301262283546, "learning_rate": 3.5553952765934293e-06, "loss": 0.0921, "step": 4078 }, { "epoch": 1.9029850746268657, "grad_norm": 0.9710617575807672, "learning_rate": 3.5501998381548355e-06, "loss": 0.0915, "step": 4080 }, { "epoch": 1.903917910447761, "grad_norm": 1.0446733803222894, "learning_rate": 3.5450061081997584e-06, "loss": 0.0928, "step": 4082 }, { "epoch": 1.9048507462686568, "grad_norm": 1.115389154911288, "learning_rate": 3.539814092848629e-06, "loss": 0.0992, "step": 4084 }, { "epoch": 1.9057835820895521, "grad_norm": 1.2372692872616267, "learning_rate": 3.5346237982198586e-06, "loss": 0.1004, "step": 4086 }, { "epoch": 1.9067164179104479, "grad_norm": 0.955157355604134, "learning_rate": 3.5294352304298283e-06, "loss": 0.0848, "step": 4088 }, { "epoch": 1.9076492537313432, "grad_norm": 1.016343699030702, "learning_rate": 3.5242483955928887e-06, "loss": 0.0922, "step": 4090 }, { "epoch": 1.908582089552239, "grad_norm": 1.1036563736652032, "learning_rate": 3.51906329982134e-06, "loss": 0.0867, "step": 4092 }, { "epoch": 1.9095149253731343, "grad_norm": 1.0554652654580032, "learning_rate": 3.5138799492254462e-06, "loss": 0.0987, "step": 4094 }, { "epoch": 1.9104477611940298, "grad_norm": 0.9648272954788495, "learning_rate": 3.508698349913402e-06, "loss": 0.0885, "step": 4096 }, { "epoch": 1.9113805970149254, "grad_norm": 1.1979815887164187, "learning_rate": 3.5035185079913435e-06, "loss": 0.1009, "step": 4098 }, { "epoch": 1.912313432835821, "grad_norm": 1.1883993129279289, "learning_rate": 3.4983404295633384e-06, "loss": 0.0932, "step": 4100 }, { "epoch": 1.9132462686567164, "grad_norm": 1.1208800962615217, "learning_rate": 3.4931641207313703e-06, "loss": 0.0911, "step": 4102 }, { "epoch": 1.914179104477612, "grad_norm": 1.0802055023930612, "learning_rate": 3.487989587595344e-06, "loss": 0.0986, "step": 4104 }, { "epoch": 1.9151119402985075, "grad_norm": 0.9722483391976391, "learning_rate": 3.4828168362530668e-06, "loss": 0.0859, "step": 4106 }, { "epoch": 1.9160447761194028, "grad_norm": 1.0594462306629138, "learning_rate": 3.4776458728002495e-06, "loss": 0.104, "step": 4108 }, { "epoch": 1.9169776119402986, "grad_norm": 1.1985253301270509, "learning_rate": 3.472476703330493e-06, "loss": 0.0987, "step": 4110 }, { "epoch": 1.917910447761194, "grad_norm": 1.200043841460018, "learning_rate": 3.4673093339352837e-06, "loss": 0.0989, "step": 4112 }, { "epoch": 1.9188432835820897, "grad_norm": 1.0469389734961136, "learning_rate": 3.462143770703994e-06, "loss": 0.0832, "step": 4114 }, { "epoch": 1.919776119402985, "grad_norm": 1.0074316960370335, "learning_rate": 3.456980019723859e-06, "loss": 0.0919, "step": 4116 }, { "epoch": 1.9207089552238807, "grad_norm": 1.2522119103566154, "learning_rate": 3.451818087079982e-06, "loss": 0.097, "step": 4118 }, { "epoch": 1.921641791044776, "grad_norm": 1.0120829091491326, "learning_rate": 3.446657978855325e-06, "loss": 0.0868, "step": 4120 }, { "epoch": 1.9225746268656716, "grad_norm": 0.9249915836071871, "learning_rate": 3.4414997011306977e-06, "loss": 0.0785, "step": 4122 }, { "epoch": 1.9235074626865671, "grad_norm": 1.0473723138567228, "learning_rate": 3.4363432599847503e-06, "loss": 0.0872, "step": 4124 }, { "epoch": 1.9244402985074627, "grad_norm": 1.2071183639902356, "learning_rate": 3.4311886614939753e-06, "loss": 0.0944, "step": 4126 }, { "epoch": 1.9253731343283582, "grad_norm": 1.194995477333287, "learning_rate": 3.4260359117326914e-06, "loss": 0.1082, "step": 4128 }, { "epoch": 1.9263059701492538, "grad_norm": 1.0506765524204482, "learning_rate": 3.4208850167730336e-06, "loss": 0.0974, "step": 4130 }, { "epoch": 1.9272388059701493, "grad_norm": 1.0985602926409648, "learning_rate": 3.4157359826849575e-06, "loss": 0.0886, "step": 4132 }, { "epoch": 1.9281716417910446, "grad_norm": 1.1252582551299382, "learning_rate": 3.410588815536221e-06, "loss": 0.1009, "step": 4134 }, { "epoch": 1.9291044776119404, "grad_norm": 1.0293911711875088, "learning_rate": 3.4054435213923883e-06, "loss": 0.0877, "step": 4136 }, { "epoch": 1.9300373134328357, "grad_norm": 1.0450353348596726, "learning_rate": 3.4003001063168094e-06, "loss": 0.0933, "step": 4138 }, { "epoch": 1.9309701492537314, "grad_norm": 1.1653632467428734, "learning_rate": 3.3951585763706246e-06, "loss": 0.1013, "step": 4140 }, { "epoch": 1.9319029850746268, "grad_norm": 0.9559735541014069, "learning_rate": 3.3900189376127514e-06, "loss": 0.0759, "step": 4142 }, { "epoch": 1.9328358208955225, "grad_norm": 1.131094568676856, "learning_rate": 3.384881196099874e-06, "loss": 0.0922, "step": 4144 }, { "epoch": 1.9337686567164178, "grad_norm": 1.0411053229375848, "learning_rate": 3.3797453578864527e-06, "loss": 0.0887, "step": 4146 }, { "epoch": 1.9347014925373134, "grad_norm": 0.8612233825344169, "learning_rate": 3.374611429024691e-06, "loss": 0.0812, "step": 4148 }, { "epoch": 1.935634328358209, "grad_norm": 1.0897526188813582, "learning_rate": 3.3694794155645526e-06, "loss": 0.085, "step": 4150 }, { "epoch": 1.9365671641791045, "grad_norm": 1.0118318142102836, "learning_rate": 3.3643493235537376e-06, "loss": 0.0852, "step": 4152 }, { "epoch": 1.9375, "grad_norm": 1.0838281994563612, "learning_rate": 3.3592211590376855e-06, "loss": 0.0917, "step": 4154 }, { "epoch": 1.9384328358208955, "grad_norm": 1.0714800298171627, "learning_rate": 3.3540949280595642e-06, "loss": 0.0851, "step": 4156 }, { "epoch": 1.939365671641791, "grad_norm": 1.1281338058664754, "learning_rate": 3.3489706366602616e-06, "loss": 0.0871, "step": 4158 }, { "epoch": 1.9402985074626866, "grad_norm": 1.0997136060702164, "learning_rate": 3.3438482908783813e-06, "loss": 0.0869, "step": 4160 }, { "epoch": 1.9412313432835822, "grad_norm": 1.1232739089985604, "learning_rate": 3.338727896750232e-06, "loss": 0.0987, "step": 4162 }, { "epoch": 1.9421641791044775, "grad_norm": 1.1326358819834272, "learning_rate": 3.3336094603098245e-06, "loss": 0.1001, "step": 4164 }, { "epoch": 1.9430970149253732, "grad_norm": 1.1810449957725686, "learning_rate": 3.3284929875888603e-06, "loss": 0.0953, "step": 4166 }, { "epoch": 1.9440298507462686, "grad_norm": 1.2311762096066052, "learning_rate": 3.3233784846167316e-06, "loss": 0.0925, "step": 4168 }, { "epoch": 1.9449626865671643, "grad_norm": 1.367796457436548, "learning_rate": 3.3182659574205046e-06, "loss": 0.09, "step": 4170 }, { "epoch": 1.9458955223880596, "grad_norm": 1.1648194707236783, "learning_rate": 3.3131554120249192e-06, "loss": 0.0976, "step": 4172 }, { "epoch": 1.9468283582089554, "grad_norm": 1.083277531550915, "learning_rate": 3.3080468544523815e-06, "loss": 0.0785, "step": 4174 }, { "epoch": 1.9477611940298507, "grad_norm": 1.161086879050719, "learning_rate": 3.302940290722947e-06, "loss": 0.089, "step": 4176 }, { "epoch": 1.9486940298507462, "grad_norm": 1.0555247408080717, "learning_rate": 3.297835726854334e-06, "loss": 0.0845, "step": 4178 }, { "epoch": 1.9496268656716418, "grad_norm": 1.066838921984526, "learning_rate": 3.292733168861898e-06, "loss": 0.0851, "step": 4180 }, { "epoch": 1.9505597014925373, "grad_norm": 1.0663973156900242, "learning_rate": 3.287632622758627e-06, "loss": 0.0901, "step": 4182 }, { "epoch": 1.9514925373134329, "grad_norm": 1.1245648284576333, "learning_rate": 3.282534094555143e-06, "loss": 0.0884, "step": 4184 }, { "epoch": 1.9524253731343284, "grad_norm": 1.0210392896418412, "learning_rate": 3.277437590259689e-06, "loss": 0.0881, "step": 4186 }, { "epoch": 1.953358208955224, "grad_norm": 1.1678333482548227, "learning_rate": 3.2723431158781227e-06, "loss": 0.0788, "step": 4188 }, { "epoch": 1.9542910447761193, "grad_norm": 1.1816153238571083, "learning_rate": 3.267250677413911e-06, "loss": 0.1061, "step": 4190 }, { "epoch": 1.955223880597015, "grad_norm": 0.993285255139248, "learning_rate": 3.2621602808681196e-06, "loss": 0.0913, "step": 4192 }, { "epoch": 1.9561567164179103, "grad_norm": 1.1651125851263444, "learning_rate": 3.2570719322394083e-06, "loss": 0.1102, "step": 4194 }, { "epoch": 1.957089552238806, "grad_norm": 1.0280286979928808, "learning_rate": 3.251985637524021e-06, "loss": 0.0893, "step": 4196 }, { "epoch": 1.9580223880597014, "grad_norm": 1.0394304503897773, "learning_rate": 3.246901402715792e-06, "loss": 0.0909, "step": 4198 }, { "epoch": 1.9589552238805972, "grad_norm": 0.9390550085205801, "learning_rate": 3.241819233806114e-06, "loss": 0.09, "step": 4200 }, { "epoch": 1.9598880597014925, "grad_norm": 1.71556465795359, "learning_rate": 3.236739136783953e-06, "loss": 0.0998, "step": 4202 }, { "epoch": 1.960820895522388, "grad_norm": 1.0906502471270598, "learning_rate": 3.231661117635833e-06, "loss": 0.0825, "step": 4204 }, { "epoch": 1.9617537313432836, "grad_norm": 1.2003142159748783, "learning_rate": 3.2265851823458296e-06, "loss": 0.0877, "step": 4206 }, { "epoch": 1.962686567164179, "grad_norm": 1.1831936079852015, "learning_rate": 3.2215113368955553e-06, "loss": 0.0839, "step": 4208 }, { "epoch": 1.9636194029850746, "grad_norm": 1.1415283450250866, "learning_rate": 3.216439587264173e-06, "loss": 0.0905, "step": 4210 }, { "epoch": 1.9645522388059702, "grad_norm": 1.1583312350572683, "learning_rate": 3.2113699394283676e-06, "loss": 0.0831, "step": 4212 }, { "epoch": 1.9654850746268657, "grad_norm": 1.1417172475658672, "learning_rate": 3.2063023993623467e-06, "loss": 0.09, "step": 4214 }, { "epoch": 1.966417910447761, "grad_norm": 1.1579221054987296, "learning_rate": 3.201236973037836e-06, "loss": 0.0955, "step": 4216 }, { "epoch": 1.9673507462686568, "grad_norm": 1.1487602747709493, "learning_rate": 3.1961736664240696e-06, "loss": 0.0936, "step": 4218 }, { "epoch": 1.9682835820895521, "grad_norm": 1.0966829425924962, "learning_rate": 3.191112485487786e-06, "loss": 0.0857, "step": 4220 }, { "epoch": 1.9692164179104479, "grad_norm": 1.044076184991009, "learning_rate": 3.1860534361932166e-06, "loss": 0.0923, "step": 4222 }, { "epoch": 1.9701492537313432, "grad_norm": 1.1742499968427176, "learning_rate": 3.180996524502081e-06, "loss": 0.0909, "step": 4224 }, { "epoch": 1.971082089552239, "grad_norm": 1.115165124779012, "learning_rate": 3.1759417563735807e-06, "loss": 0.099, "step": 4226 }, { "epoch": 1.9720149253731343, "grad_norm": 1.1591610644243253, "learning_rate": 3.170889137764387e-06, "loss": 0.0862, "step": 4228 }, { "epoch": 1.9729477611940298, "grad_norm": 1.1859870681815352, "learning_rate": 3.165838674628647e-06, "loss": 0.0898, "step": 4230 }, { "epoch": 1.9738805970149254, "grad_norm": 1.1611480042631972, "learning_rate": 3.160790372917958e-06, "loss": 0.1001, "step": 4232 }, { "epoch": 1.974813432835821, "grad_norm": 1.1998716109462533, "learning_rate": 3.155744238581377e-06, "loss": 0.0926, "step": 4234 }, { "epoch": 1.9757462686567164, "grad_norm": 1.0956793067934576, "learning_rate": 3.1507002775654028e-06, "loss": 0.0725, "step": 4236 }, { "epoch": 1.976679104477612, "grad_norm": 1.0270837121270664, "learning_rate": 3.1456584958139746e-06, "loss": 0.0984, "step": 4238 }, { "epoch": 1.9776119402985075, "grad_norm": 1.1032505586198182, "learning_rate": 3.140618899268466e-06, "loss": 0.0875, "step": 4240 }, { "epoch": 1.9785447761194028, "grad_norm": 1.1000888042768668, "learning_rate": 3.135581493867672e-06, "loss": 0.0972, "step": 4242 }, { "epoch": 1.9794776119402986, "grad_norm": 1.1081542114323382, "learning_rate": 3.1305462855478076e-06, "loss": 0.0938, "step": 4244 }, { "epoch": 1.980410447761194, "grad_norm": 0.9755341035686999, "learning_rate": 3.125513280242495e-06, "loss": 0.0845, "step": 4246 }, { "epoch": 1.9813432835820897, "grad_norm": 1.0705233148737094, "learning_rate": 3.1204824838827643e-06, "loss": 0.088, "step": 4248 }, { "epoch": 1.982276119402985, "grad_norm": 1.1021245765758034, "learning_rate": 3.115453902397041e-06, "loss": 0.0885, "step": 4250 }, { "epoch": 1.9832089552238807, "grad_norm": 1.0992848410651421, "learning_rate": 3.1104275417111424e-06, "loss": 0.0991, "step": 4252 }, { "epoch": 1.984141791044776, "grad_norm": 1.0808414107147213, "learning_rate": 3.1054034077482665e-06, "loss": 0.0913, "step": 4254 }, { "epoch": 1.9850746268656716, "grad_norm": 0.9997517252230752, "learning_rate": 3.1003815064289866e-06, "loss": 0.0994, "step": 4256 }, { "epoch": 1.9860074626865671, "grad_norm": 1.1732116264185022, "learning_rate": 3.0953618436712497e-06, "loss": 0.091, "step": 4258 }, { "epoch": 1.9869402985074627, "grad_norm": 1.0301903400189767, "learning_rate": 3.090344425390355e-06, "loss": 0.0781, "step": 4260 }, { "epoch": 1.9878731343283582, "grad_norm": 1.109281160627923, "learning_rate": 3.0853292574989702e-06, "loss": 0.0788, "step": 4262 }, { "epoch": 1.9888059701492538, "grad_norm": 1.0738749164576116, "learning_rate": 3.080316345907102e-06, "loss": 0.0941, "step": 4264 }, { "epoch": 1.9897388059701493, "grad_norm": 1.0130124275734014, "learning_rate": 3.0753056965220975e-06, "loss": 0.085, "step": 4266 }, { "epoch": 1.9906716417910446, "grad_norm": 1.1281242101339484, "learning_rate": 3.0702973152486437e-06, "loss": 0.0807, "step": 4268 }, { "epoch": 1.9916044776119404, "grad_norm": 1.0620107524527567, "learning_rate": 3.065291207988749e-06, "loss": 0.079, "step": 4270 }, { "epoch": 1.9925373134328357, "grad_norm": 1.1022741961619187, "learning_rate": 3.0602873806417483e-06, "loss": 0.0898, "step": 4272 }, { "epoch": 1.9934701492537314, "grad_norm": 1.056330071950859, "learning_rate": 3.0552858391042843e-06, "loss": 0.0836, "step": 4274 }, { "epoch": 1.9944029850746268, "grad_norm": 1.0060430451565396, "learning_rate": 3.050286589270309e-06, "loss": 0.098, "step": 4276 }, { "epoch": 1.9953358208955225, "grad_norm": 1.169081076025697, "learning_rate": 3.0452896370310737e-06, "loss": 0.0933, "step": 4278 }, { "epoch": 1.9962686567164178, "grad_norm": 1.088215574341822, "learning_rate": 3.0402949882751167e-06, "loss": 0.085, "step": 4280 }, { "epoch": 1.9972014925373134, "grad_norm": 1.0244345964501775, "learning_rate": 3.035302648888273e-06, "loss": 0.0822, "step": 4282 }, { "epoch": 1.998134328358209, "grad_norm": 1.059627502264395, "learning_rate": 3.030312624753645e-06, "loss": 0.0901, "step": 4284 }, { "epoch": 1.9990671641791045, "grad_norm": 1.0628016884605678, "learning_rate": 3.025324921751614e-06, "loss": 0.0932, "step": 4286 }, { "epoch": 2.0, "grad_norm": 0.9871489671834679, "learning_rate": 3.0203395457598215e-06, "loss": 0.0849, "step": 4288 }, { "epoch": 2.0009328358208953, "grad_norm": 0.6931360568996546, "learning_rate": 3.0153565026531708e-06, "loss": 0.0431, "step": 4290 }, { "epoch": 2.001865671641791, "grad_norm": 0.7377454433761019, "learning_rate": 3.0103757983038105e-06, "loss": 0.0471, "step": 4292 }, { "epoch": 2.0027985074626864, "grad_norm": 0.7304073103295363, "learning_rate": 3.0053974385811403e-06, "loss": 0.0399, "step": 4294 }, { "epoch": 2.003731343283582, "grad_norm": 0.7003735271050772, "learning_rate": 3.0004214293517925e-06, "loss": 0.0416, "step": 4296 }, { "epoch": 2.0046641791044775, "grad_norm": 0.6983732862250597, "learning_rate": 2.9954477764796284e-06, "loss": 0.0358, "step": 4298 }, { "epoch": 2.0055970149253732, "grad_norm": 0.8109537118564591, "learning_rate": 2.990476485825736e-06, "loss": 0.0406, "step": 4300 }, { "epoch": 2.0065298507462686, "grad_norm": 0.8516665775571368, "learning_rate": 2.9855075632484166e-06, "loss": 0.0401, "step": 4302 }, { "epoch": 2.0074626865671643, "grad_norm": 0.9435027292265361, "learning_rate": 2.980541014603183e-06, "loss": 0.0385, "step": 4304 }, { "epoch": 2.0083955223880596, "grad_norm": 0.7255463921775398, "learning_rate": 2.9755768457427514e-06, "loss": 0.0361, "step": 4306 }, { "epoch": 2.0093283582089554, "grad_norm": 0.8805614327917599, "learning_rate": 2.9706150625170295e-06, "loss": 0.0291, "step": 4308 }, { "epoch": 2.0102611940298507, "grad_norm": 1.0058807213756584, "learning_rate": 2.9656556707731176e-06, "loss": 0.0345, "step": 4310 }, { "epoch": 2.0111940298507465, "grad_norm": 1.1415558500838885, "learning_rate": 2.9606986763552936e-06, "loss": 0.0428, "step": 4312 }, { "epoch": 2.012126865671642, "grad_norm": 1.0303970432207215, "learning_rate": 2.955744085105017e-06, "loss": 0.0379, "step": 4314 }, { "epoch": 2.013059701492537, "grad_norm": 0.9995586908068365, "learning_rate": 2.95079190286091e-06, "loss": 0.0402, "step": 4316 }, { "epoch": 2.013992537313433, "grad_norm": 0.922271067089082, "learning_rate": 2.9458421354587567e-06, "loss": 0.0334, "step": 4318 }, { "epoch": 2.014925373134328, "grad_norm": 0.8676276239826717, "learning_rate": 2.9408947887314966e-06, "loss": 0.0323, "step": 4320 }, { "epoch": 2.015858208955224, "grad_norm": 0.9690871345533748, "learning_rate": 2.9359498685092156e-06, "loss": 0.041, "step": 4322 }, { "epoch": 2.0167910447761193, "grad_norm": 0.911334213441479, "learning_rate": 2.931007380619141e-06, "loss": 0.0329, "step": 4324 }, { "epoch": 2.017723880597015, "grad_norm": 0.8072976599069374, "learning_rate": 2.9260673308856345e-06, "loss": 0.0337, "step": 4326 }, { "epoch": 2.0186567164179103, "grad_norm": 0.7563023773058475, "learning_rate": 2.921129725130183e-06, "loss": 0.027, "step": 4328 }, { "epoch": 2.019589552238806, "grad_norm": 0.8423642891811632, "learning_rate": 2.9161945691713944e-06, "loss": 0.0368, "step": 4330 }, { "epoch": 2.0205223880597014, "grad_norm": 0.9040389842217097, "learning_rate": 2.9112618688249874e-06, "loss": 0.0338, "step": 4332 }, { "epoch": 2.021455223880597, "grad_norm": 0.9899962998214459, "learning_rate": 2.9063316299037904e-06, "loss": 0.0386, "step": 4334 }, { "epoch": 2.0223880597014925, "grad_norm": 0.9324323804527187, "learning_rate": 2.90140385821773e-06, "loss": 0.0317, "step": 4336 }, { "epoch": 2.0233208955223883, "grad_norm": 0.9223844323445922, "learning_rate": 2.8964785595738254e-06, "loss": 0.0373, "step": 4338 }, { "epoch": 2.0242537313432836, "grad_norm": 0.9481764801025405, "learning_rate": 2.8915557397761774e-06, "loss": 0.037, "step": 4340 }, { "epoch": 2.025186567164179, "grad_norm": 0.9152698440272568, "learning_rate": 2.8866354046259736e-06, "loss": 0.0361, "step": 4342 }, { "epoch": 2.0261194029850746, "grad_norm": 0.7787048213131694, "learning_rate": 2.8817175599214653e-06, "loss": 0.0337, "step": 4344 }, { "epoch": 2.02705223880597, "grad_norm": 1.0474553599024774, "learning_rate": 2.8768022114579757e-06, "loss": 0.0352, "step": 4346 }, { "epoch": 2.0279850746268657, "grad_norm": 1.1321614377422469, "learning_rate": 2.871889365027885e-06, "loss": 0.0415, "step": 4348 }, { "epoch": 2.028917910447761, "grad_norm": 0.7472035276071027, "learning_rate": 2.86697902642062e-06, "loss": 0.0315, "step": 4350 }, { "epoch": 2.029850746268657, "grad_norm": 0.9410204403680684, "learning_rate": 2.8620712014226594e-06, "loss": 0.0358, "step": 4352 }, { "epoch": 2.030783582089552, "grad_norm": 0.9404385467072122, "learning_rate": 2.8571658958175126e-06, "loss": 0.0346, "step": 4354 }, { "epoch": 2.031716417910448, "grad_norm": 0.9364766899127865, "learning_rate": 2.852263115385725e-06, "loss": 0.0334, "step": 4356 }, { "epoch": 2.032649253731343, "grad_norm": 0.9143157879297902, "learning_rate": 2.847362865904868e-06, "loss": 0.0345, "step": 4358 }, { "epoch": 2.033582089552239, "grad_norm": 0.9643445484521032, "learning_rate": 2.842465153149525e-06, "loss": 0.0359, "step": 4360 }, { "epoch": 2.0345149253731343, "grad_norm": 0.9383657334350942, "learning_rate": 2.8375699828912895e-06, "loss": 0.0343, "step": 4362 }, { "epoch": 2.03544776119403, "grad_norm": 1.0326063733701467, "learning_rate": 2.832677360898768e-06, "loss": 0.0365, "step": 4364 }, { "epoch": 2.0363805970149254, "grad_norm": 1.0008038885283892, "learning_rate": 2.8277872929375515e-06, "loss": 0.0332, "step": 4366 }, { "epoch": 2.0373134328358207, "grad_norm": 1.0490424912019125, "learning_rate": 2.822899784770232e-06, "loss": 0.0349, "step": 4368 }, { "epoch": 2.0382462686567164, "grad_norm": 0.8746090013949105, "learning_rate": 2.8180148421563803e-06, "loss": 0.0314, "step": 4370 }, { "epoch": 2.0391791044776117, "grad_norm": 0.9742008039517758, "learning_rate": 2.813132470852543e-06, "loss": 0.0359, "step": 4372 }, { "epoch": 2.0401119402985075, "grad_norm": 0.8831437330245768, "learning_rate": 2.8082526766122377e-06, "loss": 0.0349, "step": 4374 }, { "epoch": 2.041044776119403, "grad_norm": 0.8352574338089715, "learning_rate": 2.803375465185944e-06, "loss": 0.0377, "step": 4376 }, { "epoch": 2.0419776119402986, "grad_norm": 0.9386137569142765, "learning_rate": 2.7985008423211037e-06, "loss": 0.0362, "step": 4378 }, { "epoch": 2.042910447761194, "grad_norm": 0.722804351685375, "learning_rate": 2.7936288137620976e-06, "loss": 0.0319, "step": 4380 }, { "epoch": 2.0438432835820897, "grad_norm": 0.859401137020611, "learning_rate": 2.7887593852502604e-06, "loss": 0.0331, "step": 4382 }, { "epoch": 2.044776119402985, "grad_norm": 0.8811036462247247, "learning_rate": 2.783892562523854e-06, "loss": 0.0358, "step": 4384 }, { "epoch": 2.0457089552238807, "grad_norm": 0.9824317430351688, "learning_rate": 2.7790283513180736e-06, "loss": 0.0377, "step": 4386 }, { "epoch": 2.046641791044776, "grad_norm": 0.9387833059761242, "learning_rate": 2.774166757365041e-06, "loss": 0.0351, "step": 4388 }, { "epoch": 2.047574626865672, "grad_norm": 0.9062659882647344, "learning_rate": 2.769307786393785e-06, "loss": 0.0366, "step": 4390 }, { "epoch": 2.048507462686567, "grad_norm": 1.0864150502672876, "learning_rate": 2.7644514441302466e-06, "loss": 0.0372, "step": 4392 }, { "epoch": 2.049440298507463, "grad_norm": 1.3917399137418618, "learning_rate": 2.7595977362972747e-06, "loss": 0.0355, "step": 4394 }, { "epoch": 2.050373134328358, "grad_norm": 0.9938119244375839, "learning_rate": 2.754746668614604e-06, "loss": 0.0358, "step": 4396 }, { "epoch": 2.0513059701492535, "grad_norm": 0.7966865598676414, "learning_rate": 2.7498982467988668e-06, "loss": 0.0317, "step": 4398 }, { "epoch": 2.0522388059701493, "grad_norm": 0.9056389639839669, "learning_rate": 2.745052476563574e-06, "loss": 0.0332, "step": 4400 }, { "epoch": 2.0531716417910446, "grad_norm": 0.8418617565436706, "learning_rate": 2.7402093636191085e-06, "loss": 0.0353, "step": 4402 }, { "epoch": 2.0541044776119404, "grad_norm": 0.8412614340932287, "learning_rate": 2.735368913672729e-06, "loss": 0.0368, "step": 4404 }, { "epoch": 2.0550373134328357, "grad_norm": 0.9346669550850343, "learning_rate": 2.7305311324285506e-06, "loss": 0.0307, "step": 4406 }, { "epoch": 2.0559701492537314, "grad_norm": 1.0988111290308489, "learning_rate": 2.7256960255875396e-06, "loss": 0.0322, "step": 4408 }, { "epoch": 2.0569029850746268, "grad_norm": 0.844308544081491, "learning_rate": 2.720863598847524e-06, "loss": 0.0315, "step": 4410 }, { "epoch": 2.0578358208955225, "grad_norm": 0.9981226105492794, "learning_rate": 2.7160338579031627e-06, "loss": 0.0307, "step": 4412 }, { "epoch": 2.058768656716418, "grad_norm": 0.9761698042116492, "learning_rate": 2.711206808445949e-06, "loss": 0.036, "step": 4414 }, { "epoch": 2.0597014925373136, "grad_norm": 0.8809001352579211, "learning_rate": 2.7063824561642134e-06, "loss": 0.035, "step": 4416 }, { "epoch": 2.060634328358209, "grad_norm": 1.0174452558911835, "learning_rate": 2.7015608067430965e-06, "loss": 0.0368, "step": 4418 }, { "epoch": 2.0615671641791047, "grad_norm": 0.839889447971411, "learning_rate": 2.696741865864564e-06, "loss": 0.0331, "step": 4420 }, { "epoch": 2.0625, "grad_norm": 1.002150312971576, "learning_rate": 2.691925639207385e-06, "loss": 0.0289, "step": 4422 }, { "epoch": 2.0634328358208953, "grad_norm": 0.9063546968160399, "learning_rate": 2.6871121324471305e-06, "loss": 0.0318, "step": 4424 }, { "epoch": 2.064365671641791, "grad_norm": 1.11419944914278, "learning_rate": 2.682301351256163e-06, "loss": 0.035, "step": 4426 }, { "epoch": 2.0652985074626864, "grad_norm": 1.1294151793692213, "learning_rate": 2.6774933013036396e-06, "loss": 0.0379, "step": 4428 }, { "epoch": 2.066231343283582, "grad_norm": 1.002134123185201, "learning_rate": 2.6726879882554968e-06, "loss": 0.0345, "step": 4430 }, { "epoch": 2.0671641791044775, "grad_norm": 0.9959881270577299, "learning_rate": 2.6678854177744416e-06, "loss": 0.0296, "step": 4432 }, { "epoch": 2.0680970149253732, "grad_norm": 0.9493261354779687, "learning_rate": 2.6630855955199566e-06, "loss": 0.0278, "step": 4434 }, { "epoch": 2.0690298507462686, "grad_norm": 0.87872995705695, "learning_rate": 2.6582885271482757e-06, "loss": 0.0332, "step": 4436 }, { "epoch": 2.0699626865671643, "grad_norm": 0.9175728195839566, "learning_rate": 2.653494218312397e-06, "loss": 0.0319, "step": 4438 }, { "epoch": 2.0708955223880596, "grad_norm": 0.8619144692708061, "learning_rate": 2.6487026746620637e-06, "loss": 0.0316, "step": 4440 }, { "epoch": 2.0718283582089554, "grad_norm": 1.000841756088507, "learning_rate": 2.643913901843759e-06, "loss": 0.0387, "step": 4442 }, { "epoch": 2.0727611940298507, "grad_norm": 0.9160370473851499, "learning_rate": 2.639127905500699e-06, "loss": 0.0335, "step": 4444 }, { "epoch": 2.0736940298507465, "grad_norm": 0.8524799209467483, "learning_rate": 2.6343446912728348e-06, "loss": 0.0284, "step": 4446 }, { "epoch": 2.074626865671642, "grad_norm": 0.9454922612542884, "learning_rate": 2.6295642647968307e-06, "loss": 0.0336, "step": 4448 }, { "epoch": 2.075559701492537, "grad_norm": 0.9648230986119682, "learning_rate": 2.624786631706071e-06, "loss": 0.0359, "step": 4450 }, { "epoch": 2.076492537313433, "grad_norm": 0.9570495850111927, "learning_rate": 2.6200117976306506e-06, "loss": 0.0343, "step": 4452 }, { "epoch": 2.077425373134328, "grad_norm": 0.863764434660277, "learning_rate": 2.615239768197357e-06, "loss": 0.0376, "step": 4454 }, { "epoch": 2.078358208955224, "grad_norm": 0.9767886015359025, "learning_rate": 2.610470549029684e-06, "loss": 0.0365, "step": 4456 }, { "epoch": 2.0792910447761193, "grad_norm": 0.9108636703213165, "learning_rate": 2.605704145747804e-06, "loss": 0.0301, "step": 4458 }, { "epoch": 2.080223880597015, "grad_norm": 0.8516455251630574, "learning_rate": 2.600940563968571e-06, "loss": 0.034, "step": 4460 }, { "epoch": 2.0811567164179103, "grad_norm": 1.0982161468443037, "learning_rate": 2.596179809305526e-06, "loss": 0.0377, "step": 4462 }, { "epoch": 2.082089552238806, "grad_norm": 0.9093692567373183, "learning_rate": 2.5914218873688678e-06, "loss": 0.037, "step": 4464 }, { "epoch": 2.0830223880597014, "grad_norm": 1.0092770381855278, "learning_rate": 2.5866668037654557e-06, "loss": 0.0385, "step": 4466 }, { "epoch": 2.083955223880597, "grad_norm": 0.9622699089394329, "learning_rate": 2.581914564098813e-06, "loss": 0.0325, "step": 4468 }, { "epoch": 2.0848880597014925, "grad_norm": 1.0438505108352771, "learning_rate": 2.577165173969103e-06, "loss": 0.031, "step": 4470 }, { "epoch": 2.0858208955223883, "grad_norm": 0.9997481642261107, "learning_rate": 2.5724186389731364e-06, "loss": 0.0366, "step": 4472 }, { "epoch": 2.0867537313432836, "grad_norm": 1.0884582418232287, "learning_rate": 2.5676749647043602e-06, "loss": 0.0355, "step": 4474 }, { "epoch": 2.0876865671641793, "grad_norm": 0.9273682507257243, "learning_rate": 2.5629341567528453e-06, "loss": 0.0372, "step": 4476 }, { "epoch": 2.0886194029850746, "grad_norm": 0.8771097952377875, "learning_rate": 2.5581962207052856e-06, "loss": 0.0316, "step": 4478 }, { "epoch": 2.08955223880597, "grad_norm": 0.8606340193338733, "learning_rate": 2.553461162144994e-06, "loss": 0.0315, "step": 4480 }, { "epoch": 2.0904850746268657, "grad_norm": 0.945052662282763, "learning_rate": 2.5487289866518937e-06, "loss": 0.0402, "step": 4482 }, { "epoch": 2.091417910447761, "grad_norm": 0.937247860729769, "learning_rate": 2.543999699802503e-06, "loss": 0.0336, "step": 4484 }, { "epoch": 2.092350746268657, "grad_norm": 0.8243058279419139, "learning_rate": 2.5392733071699443e-06, "loss": 0.0306, "step": 4486 }, { "epoch": 2.093283582089552, "grad_norm": 1.001169897669878, "learning_rate": 2.5345498143239233e-06, "loss": 0.0356, "step": 4488 }, { "epoch": 2.094216417910448, "grad_norm": 0.8378463772807123, "learning_rate": 2.5298292268307333e-06, "loss": 0.0364, "step": 4490 }, { "epoch": 2.095149253731343, "grad_norm": 0.9773868622932619, "learning_rate": 2.52511155025324e-06, "loss": 0.0314, "step": 4492 }, { "epoch": 2.096082089552239, "grad_norm": 0.9018453297555646, "learning_rate": 2.520396790150881e-06, "loss": 0.0333, "step": 4494 }, { "epoch": 2.0970149253731343, "grad_norm": 1.0558594419317222, "learning_rate": 2.5156849520796558e-06, "loss": 0.0361, "step": 4496 }, { "epoch": 2.09794776119403, "grad_norm": 0.9667061096277667, "learning_rate": 2.510976041592123e-06, "loss": 0.0341, "step": 4498 }, { "epoch": 2.0988805970149254, "grad_norm": 0.8584431894215813, "learning_rate": 2.5062700642373868e-06, "loss": 0.0296, "step": 4500 }, { "epoch": 2.0988805970149254, "eval_loss": 0.1816190481185913, "eval_runtime": 322.5454, "eval_samples_per_second": 47.268, "eval_steps_per_second": 5.909, "step": 4500 }, { "epoch": 2.0998134328358207, "grad_norm": 0.8906044834057271, "learning_rate": 2.501567025561098e-06, "loss": 0.0348, "step": 4502 }, { "epoch": 2.1007462686567164, "grad_norm": 0.9724157585934898, "learning_rate": 2.4968669311054473e-06, "loss": 0.0345, "step": 4504 }, { "epoch": 2.1016791044776117, "grad_norm": 1.081134027325536, "learning_rate": 2.4921697864091478e-06, "loss": 0.0387, "step": 4506 }, { "epoch": 2.1026119402985075, "grad_norm": 1.0419985812048587, "learning_rate": 2.4874755970074448e-06, "loss": 0.0337, "step": 4508 }, { "epoch": 2.103544776119403, "grad_norm": 0.9031019447209006, "learning_rate": 2.4827843684320967e-06, "loss": 0.0375, "step": 4510 }, { "epoch": 2.1044776119402986, "grad_norm": 0.8739561592400861, "learning_rate": 2.4780961062113683e-06, "loss": 0.0312, "step": 4512 }, { "epoch": 2.105410447761194, "grad_norm": 1.0421421043853216, "learning_rate": 2.473410815870042e-06, "loss": 0.0371, "step": 4514 }, { "epoch": 2.1063432835820897, "grad_norm": 0.8601097626529265, "learning_rate": 2.4687285029293866e-06, "loss": 0.0363, "step": 4516 }, { "epoch": 2.107276119402985, "grad_norm": 0.8626554964214478, "learning_rate": 2.4640491729071635e-06, "loss": 0.0327, "step": 4518 }, { "epoch": 2.1082089552238807, "grad_norm": 0.8135345781725284, "learning_rate": 2.4593728313176246e-06, "loss": 0.0304, "step": 4520 }, { "epoch": 2.109141791044776, "grad_norm": 1.0102193837120443, "learning_rate": 2.454699483671493e-06, "loss": 0.0334, "step": 4522 }, { "epoch": 2.110074626865672, "grad_norm": 0.9757241923191624, "learning_rate": 2.450029135475969e-06, "loss": 0.0377, "step": 4524 }, { "epoch": 2.111007462686567, "grad_norm": 0.8986898680617315, "learning_rate": 2.4453617922347194e-06, "loss": 0.0303, "step": 4526 }, { "epoch": 2.111940298507463, "grad_norm": 1.0113102214648557, "learning_rate": 2.440697459447864e-06, "loss": 0.0395, "step": 4528 }, { "epoch": 2.112873134328358, "grad_norm": 0.9163503674162015, "learning_rate": 2.4360361426119767e-06, "loss": 0.0345, "step": 4530 }, { "epoch": 2.1138059701492535, "grad_norm": 0.8831550411456144, "learning_rate": 2.4313778472200824e-06, "loss": 0.0324, "step": 4532 }, { "epoch": 2.1147388059701493, "grad_norm": 0.8697203486215254, "learning_rate": 2.4267225787616376e-06, "loss": 0.0322, "step": 4534 }, { "epoch": 2.1156716417910446, "grad_norm": 0.8707276440239047, "learning_rate": 2.4220703427225384e-06, "loss": 0.0326, "step": 4536 }, { "epoch": 2.1166044776119404, "grad_norm": 0.9452643458255807, "learning_rate": 2.4174211445851066e-06, "loss": 0.0342, "step": 4538 }, { "epoch": 2.1175373134328357, "grad_norm": 0.9742990319517244, "learning_rate": 2.4127749898280783e-06, "loss": 0.0384, "step": 4540 }, { "epoch": 2.1184701492537314, "grad_norm": 1.0034601356655437, "learning_rate": 2.4081318839266117e-06, "loss": 0.036, "step": 4542 }, { "epoch": 2.1194029850746268, "grad_norm": 0.9056355918535672, "learning_rate": 2.4034918323522628e-06, "loss": 0.029, "step": 4544 }, { "epoch": 2.1203358208955225, "grad_norm": 0.8015820442039946, "learning_rate": 2.398854840572998e-06, "loss": 0.0321, "step": 4546 }, { "epoch": 2.121268656716418, "grad_norm": 0.8309431070007485, "learning_rate": 2.3942209140531693e-06, "loss": 0.0311, "step": 4548 }, { "epoch": 2.1222014925373136, "grad_norm": 0.8648653780893618, "learning_rate": 2.389590058253523e-06, "loss": 0.033, "step": 4550 }, { "epoch": 2.123134328358209, "grad_norm": 1.1200010143374086, "learning_rate": 2.384962278631182e-06, "loss": 0.0387, "step": 4552 }, { "epoch": 2.1240671641791047, "grad_norm": 0.9618596793829509, "learning_rate": 2.3803375806396474e-06, "loss": 0.0373, "step": 4554 }, { "epoch": 2.125, "grad_norm": 1.004063616906792, "learning_rate": 2.3757159697287895e-06, "loss": 0.0376, "step": 4556 }, { "epoch": 2.1259328358208953, "grad_norm": 0.8760913390235822, "learning_rate": 2.371097451344836e-06, "loss": 0.0312, "step": 4558 }, { "epoch": 2.126865671641791, "grad_norm": 1.064785225545377, "learning_rate": 2.366482030930376e-06, "loss": 0.0344, "step": 4560 }, { "epoch": 2.1277985074626864, "grad_norm": 0.940320101450485, "learning_rate": 2.3618697139243437e-06, "loss": 0.0347, "step": 4562 }, { "epoch": 2.128731343283582, "grad_norm": 0.8600791222658538, "learning_rate": 2.357260505762015e-06, "loss": 0.0331, "step": 4564 }, { "epoch": 2.1296641791044775, "grad_norm": 1.0088948594978933, "learning_rate": 2.3526544118750077e-06, "loss": 0.0354, "step": 4566 }, { "epoch": 2.1305970149253732, "grad_norm": 0.9604993555548367, "learning_rate": 2.348051437691268e-06, "loss": 0.0341, "step": 4568 }, { "epoch": 2.1315298507462686, "grad_norm": 1.0054616972058592, "learning_rate": 2.343451588635061e-06, "loss": 0.0373, "step": 4570 }, { "epoch": 2.1324626865671643, "grad_norm": 1.0641537750139058, "learning_rate": 2.3388548701269763e-06, "loss": 0.033, "step": 4572 }, { "epoch": 2.1333955223880596, "grad_norm": 1.0002127636434242, "learning_rate": 2.3342612875839095e-06, "loss": 0.0362, "step": 4574 }, { "epoch": 2.1343283582089554, "grad_norm": 0.9797640995927954, "learning_rate": 2.3296708464190567e-06, "loss": 0.0332, "step": 4576 }, { "epoch": 2.1352611940298507, "grad_norm": 0.915471639453477, "learning_rate": 2.325083552041925e-06, "loss": 0.0334, "step": 4578 }, { "epoch": 2.1361940298507465, "grad_norm": 1.1342225227237468, "learning_rate": 2.3204994098583026e-06, "loss": 0.0357, "step": 4580 }, { "epoch": 2.137126865671642, "grad_norm": 1.012524808599099, "learning_rate": 2.3159184252702636e-06, "loss": 0.04, "step": 4582 }, { "epoch": 2.138059701492537, "grad_norm": 0.8438797093214802, "learning_rate": 2.3113406036761676e-06, "loss": 0.0295, "step": 4584 }, { "epoch": 2.138992537313433, "grad_norm": 0.9949292984726634, "learning_rate": 2.306765950470639e-06, "loss": 0.0367, "step": 4586 }, { "epoch": 2.139925373134328, "grad_norm": 0.9869746962652547, "learning_rate": 2.302194471044573e-06, "loss": 0.0329, "step": 4588 }, { "epoch": 2.140858208955224, "grad_norm": 0.96929114570457, "learning_rate": 2.2976261707851272e-06, "loss": 0.0352, "step": 4590 }, { "epoch": 2.1417910447761193, "grad_norm": 1.0021017263034497, "learning_rate": 2.293061055075707e-06, "loss": 0.0318, "step": 4592 }, { "epoch": 2.142723880597015, "grad_norm": 1.0421455590386683, "learning_rate": 2.288499129295966e-06, "loss": 0.0327, "step": 4594 }, { "epoch": 2.1436567164179103, "grad_norm": 0.8489890614711563, "learning_rate": 2.2839403988218016e-06, "loss": 0.0319, "step": 4596 }, { "epoch": 2.144589552238806, "grad_norm": 0.7705589657767264, "learning_rate": 2.279384869025347e-06, "loss": 0.0327, "step": 4598 }, { "epoch": 2.1455223880597014, "grad_norm": 1.1150761153080226, "learning_rate": 2.2748325452749567e-06, "loss": 0.0369, "step": 4600 }, { "epoch": 2.146455223880597, "grad_norm": 1.0975772203589185, "learning_rate": 2.270283432935216e-06, "loss": 0.0356, "step": 4602 }, { "epoch": 2.1473880597014925, "grad_norm": 0.9610466206421365, "learning_rate": 2.265737537366916e-06, "loss": 0.0312, "step": 4604 }, { "epoch": 2.1483208955223883, "grad_norm": 0.9811749020127127, "learning_rate": 2.261194863927068e-06, "loss": 0.0349, "step": 4606 }, { "epoch": 2.1492537313432836, "grad_norm": 1.0291519371503837, "learning_rate": 2.2566554179688756e-06, "loss": 0.0299, "step": 4608 }, { "epoch": 2.1501865671641793, "grad_norm": 1.0796810043556722, "learning_rate": 2.252119204841747e-06, "loss": 0.0305, "step": 4610 }, { "epoch": 2.1511194029850746, "grad_norm": 0.9349459452646798, "learning_rate": 2.2475862298912784e-06, "loss": 0.0306, "step": 4612 }, { "epoch": 2.15205223880597, "grad_norm": 1.0097370464745856, "learning_rate": 2.243056498459248e-06, "loss": 0.0356, "step": 4614 }, { "epoch": 2.1529850746268657, "grad_norm": 1.0695894313307166, "learning_rate": 2.2385300158836116e-06, "loss": 0.0365, "step": 4616 }, { "epoch": 2.153917910447761, "grad_norm": 0.909446348523728, "learning_rate": 2.2340067874984995e-06, "loss": 0.0313, "step": 4618 }, { "epoch": 2.154850746268657, "grad_norm": 1.463227945494659, "learning_rate": 2.2294868186342085e-06, "loss": 0.0344, "step": 4620 }, { "epoch": 2.155783582089552, "grad_norm": 0.885503083261715, "learning_rate": 2.2249701146171864e-06, "loss": 0.0321, "step": 4622 }, { "epoch": 2.156716417910448, "grad_norm": 0.9832054843436491, "learning_rate": 2.2204566807700433e-06, "loss": 0.0347, "step": 4624 }, { "epoch": 2.157649253731343, "grad_norm": 0.8772604963675062, "learning_rate": 2.2159465224115295e-06, "loss": 0.032, "step": 4626 }, { "epoch": 2.158582089552239, "grad_norm": 1.1186244209607723, "learning_rate": 2.2114396448565328e-06, "loss": 0.0375, "step": 4628 }, { "epoch": 2.1595149253731343, "grad_norm": 0.9442783503559156, "learning_rate": 2.2069360534160865e-06, "loss": 0.0363, "step": 4630 }, { "epoch": 2.16044776119403, "grad_norm": 0.9397474301368692, "learning_rate": 2.20243575339734e-06, "loss": 0.0309, "step": 4632 }, { "epoch": 2.1613805970149254, "grad_norm": 0.8914471569658011, "learning_rate": 2.1979387501035666e-06, "loss": 0.0302, "step": 4634 }, { "epoch": 2.1623134328358207, "grad_norm": 0.8553458524065944, "learning_rate": 2.1934450488341584e-06, "loss": 0.0305, "step": 4636 }, { "epoch": 2.1632462686567164, "grad_norm": 0.9603054838866182, "learning_rate": 2.1889546548846117e-06, "loss": 0.0317, "step": 4638 }, { "epoch": 2.1641791044776117, "grad_norm": 0.980620474309731, "learning_rate": 2.1844675735465285e-06, "loss": 0.0368, "step": 4640 }, { "epoch": 2.1651119402985075, "grad_norm": 0.8389929699446843, "learning_rate": 2.1799838101076086e-06, "loss": 0.0297, "step": 4642 }, { "epoch": 2.166044776119403, "grad_norm": 1.0237442187823946, "learning_rate": 2.1755033698516374e-06, "loss": 0.0353, "step": 4644 }, { "epoch": 2.1669776119402986, "grad_norm": 0.9671345740512663, "learning_rate": 2.171026258058484e-06, "loss": 0.0308, "step": 4646 }, { "epoch": 2.167910447761194, "grad_norm": 0.9021871451964945, "learning_rate": 2.1665524800041015e-06, "loss": 0.0353, "step": 4648 }, { "epoch": 2.1688432835820897, "grad_norm": 0.917686541925775, "learning_rate": 2.1620820409605067e-06, "loss": 0.0334, "step": 4650 }, { "epoch": 2.169776119402985, "grad_norm": 0.8653719393610244, "learning_rate": 2.1576149461957867e-06, "loss": 0.0322, "step": 4652 }, { "epoch": 2.1707089552238807, "grad_norm": 0.8406907699111601, "learning_rate": 2.153151200974088e-06, "loss": 0.0354, "step": 4654 }, { "epoch": 2.171641791044776, "grad_norm": 0.830325595315677, "learning_rate": 2.1486908105556047e-06, "loss": 0.0329, "step": 4656 }, { "epoch": 2.172574626865672, "grad_norm": 0.8915313413102008, "learning_rate": 2.1442337801965844e-06, "loss": 0.031, "step": 4658 }, { "epoch": 2.173507462686567, "grad_norm": 1.0455817370734015, "learning_rate": 2.139780115149308e-06, "loss": 0.0366, "step": 4660 }, { "epoch": 2.174440298507463, "grad_norm": 0.8336915273217668, "learning_rate": 2.135329820662096e-06, "loss": 0.031, "step": 4662 }, { "epoch": 2.175373134328358, "grad_norm": 1.034412884123277, "learning_rate": 2.130882901979297e-06, "loss": 0.0311, "step": 4664 }, { "epoch": 2.1763059701492535, "grad_norm": 1.014121104563138, "learning_rate": 2.1264393643412778e-06, "loss": 0.0292, "step": 4666 }, { "epoch": 2.1772388059701493, "grad_norm": 0.9249121927535365, "learning_rate": 2.1219992129844207e-06, "loss": 0.0336, "step": 4668 }, { "epoch": 2.1781716417910446, "grad_norm": 0.9316547180306575, "learning_rate": 2.1175624531411215e-06, "loss": 0.0331, "step": 4670 }, { "epoch": 2.1791044776119404, "grad_norm": 1.042818841209333, "learning_rate": 2.1131290900397792e-06, "loss": 0.0372, "step": 4672 }, { "epoch": 2.1800373134328357, "grad_norm": 0.8447600194276534, "learning_rate": 2.108699128904784e-06, "loss": 0.0299, "step": 4674 }, { "epoch": 2.1809701492537314, "grad_norm": 0.7899589890991614, "learning_rate": 2.104272574956526e-06, "loss": 0.0316, "step": 4676 }, { "epoch": 2.1819029850746268, "grad_norm": 0.898035918160858, "learning_rate": 2.0998494334113733e-06, "loss": 0.0355, "step": 4678 }, { "epoch": 2.1828358208955225, "grad_norm": 0.9252003811177696, "learning_rate": 2.0954297094816708e-06, "loss": 0.0335, "step": 4680 }, { "epoch": 2.183768656716418, "grad_norm": 0.8512045478373349, "learning_rate": 2.091013408375747e-06, "loss": 0.0315, "step": 4682 }, { "epoch": 2.1847014925373136, "grad_norm": 1.0448281959161687, "learning_rate": 2.0866005352978875e-06, "loss": 0.0355, "step": 4684 }, { "epoch": 2.185634328358209, "grad_norm": 0.9289854915666329, "learning_rate": 2.082191095448338e-06, "loss": 0.0325, "step": 4686 }, { "epoch": 2.1865671641791047, "grad_norm": 0.9352449594094461, "learning_rate": 2.077785094023305e-06, "loss": 0.0302, "step": 4688 }, { "epoch": 2.1875, "grad_norm": 1.0548731176048733, "learning_rate": 2.0733825362149356e-06, "loss": 0.0371, "step": 4690 }, { "epoch": 2.1884328358208953, "grad_norm": 1.1335185831863699, "learning_rate": 2.0689834272113234e-06, "loss": 0.036, "step": 4692 }, { "epoch": 2.189365671641791, "grad_norm": 0.9700440826454366, "learning_rate": 2.0645877721964996e-06, "loss": 0.033, "step": 4694 }, { "epoch": 2.1902985074626864, "grad_norm": 1.0383039451974871, "learning_rate": 2.0601955763504207e-06, "loss": 0.0351, "step": 4696 }, { "epoch": 2.191231343283582, "grad_norm": 0.9495829448601033, "learning_rate": 2.0558068448489647e-06, "loss": 0.0317, "step": 4698 }, { "epoch": 2.1921641791044775, "grad_norm": 1.1212627856210597, "learning_rate": 2.051421582863937e-06, "loss": 0.0363, "step": 4700 }, { "epoch": 2.1930970149253732, "grad_norm": 1.0209101068037678, "learning_rate": 2.047039795563043e-06, "loss": 0.0335, "step": 4702 }, { "epoch": 2.1940298507462686, "grad_norm": 0.9650803299518893, "learning_rate": 2.0426614881099013e-06, "loss": 0.0292, "step": 4704 }, { "epoch": 2.1949626865671643, "grad_norm": 0.9853328481980234, "learning_rate": 2.0382866656640288e-06, "loss": 0.0306, "step": 4706 }, { "epoch": 2.1958955223880596, "grad_norm": 0.7923048271755836, "learning_rate": 2.0339153333808304e-06, "loss": 0.0303, "step": 4708 }, { "epoch": 2.1968283582089554, "grad_norm": 0.965579537679953, "learning_rate": 2.029547496411605e-06, "loss": 0.0382, "step": 4710 }, { "epoch": 2.1977611940298507, "grad_norm": 1.1500344236104285, "learning_rate": 2.025183159903526e-06, "loss": 0.0366, "step": 4712 }, { "epoch": 2.1986940298507465, "grad_norm": 1.06417345608817, "learning_rate": 2.0208223289996466e-06, "loss": 0.0371, "step": 4714 }, { "epoch": 2.199626865671642, "grad_norm": 1.110285550269103, "learning_rate": 2.016465008838889e-06, "loss": 0.0332, "step": 4716 }, { "epoch": 2.200559701492537, "grad_norm": 0.9693828931537851, "learning_rate": 2.012111204556035e-06, "loss": 0.0329, "step": 4718 }, { "epoch": 2.201492537313433, "grad_norm": 1.0073905057103043, "learning_rate": 2.0077609212817224e-06, "loss": 0.0347, "step": 4720 }, { "epoch": 2.202425373134328, "grad_norm": 0.9489285384383775, "learning_rate": 2.0034141641424437e-06, "loss": 0.0326, "step": 4722 }, { "epoch": 2.203358208955224, "grad_norm": 0.9206842935899366, "learning_rate": 1.999070938260537e-06, "loss": 0.0342, "step": 4724 }, { "epoch": 2.2042910447761193, "grad_norm": 0.8781498563869979, "learning_rate": 1.994731248754173e-06, "loss": 0.0314, "step": 4726 }, { "epoch": 2.205223880597015, "grad_norm": 0.9872802161438039, "learning_rate": 1.9903951007373617e-06, "loss": 0.0329, "step": 4728 }, { "epoch": 2.2061567164179103, "grad_norm": 0.9455452744440873, "learning_rate": 1.9860624993199345e-06, "loss": 0.035, "step": 4730 }, { "epoch": 2.207089552238806, "grad_norm": 0.904213166976788, "learning_rate": 1.9817334496075447e-06, "loss": 0.0317, "step": 4732 }, { "epoch": 2.2080223880597014, "grad_norm": 0.9319034379101149, "learning_rate": 1.9774079567016613e-06, "loss": 0.0308, "step": 4734 }, { "epoch": 2.208955223880597, "grad_norm": 0.9522537506099564, "learning_rate": 1.9730860256995643e-06, "loss": 0.0319, "step": 4736 }, { "epoch": 2.2098880597014925, "grad_norm": 1.003207526746631, "learning_rate": 1.9687676616943303e-06, "loss": 0.0303, "step": 4738 }, { "epoch": 2.2108208955223883, "grad_norm": 0.9941387313771194, "learning_rate": 1.964452869774838e-06, "loss": 0.0341, "step": 4740 }, { "epoch": 2.2117537313432836, "grad_norm": 1.078428961396446, "learning_rate": 1.960141655025751e-06, "loss": 0.0328, "step": 4742 }, { "epoch": 2.2126865671641793, "grad_norm": 1.047698707396123, "learning_rate": 1.9558340225275236e-06, "loss": 0.0388, "step": 4744 }, { "epoch": 2.2136194029850746, "grad_norm": 0.9245180530064719, "learning_rate": 1.9515299773563862e-06, "loss": 0.0367, "step": 4746 }, { "epoch": 2.21455223880597, "grad_norm": 0.7666841210019265, "learning_rate": 1.947229524584341e-06, "loss": 0.0317, "step": 4748 }, { "epoch": 2.2154850746268657, "grad_norm": 1.0904125425418785, "learning_rate": 1.942932669279154e-06, "loss": 0.0318, "step": 4750 }, { "epoch": 2.216417910447761, "grad_norm": 0.9895065655721109, "learning_rate": 1.9386394165043596e-06, "loss": 0.0347, "step": 4752 }, { "epoch": 2.217350746268657, "grad_norm": 0.9353829693815657, "learning_rate": 1.9343497713192387e-06, "loss": 0.0302, "step": 4754 }, { "epoch": 2.218283582089552, "grad_norm": 0.9003688886804847, "learning_rate": 1.930063738778827e-06, "loss": 0.0294, "step": 4756 }, { "epoch": 2.219216417910448, "grad_norm": 1.0649572955034126, "learning_rate": 1.925781323933901e-06, "loss": 0.0343, "step": 4758 }, { "epoch": 2.220149253731343, "grad_norm": 0.9021986373914942, "learning_rate": 1.9215025318309704e-06, "loss": 0.0327, "step": 4760 }, { "epoch": 2.221082089552239, "grad_norm": 0.9654614343606513, "learning_rate": 1.9172273675122833e-06, "loss": 0.034, "step": 4762 }, { "epoch": 2.2220149253731343, "grad_norm": 0.9753988770300552, "learning_rate": 1.9129558360158057e-06, "loss": 0.0358, "step": 4764 }, { "epoch": 2.22294776119403, "grad_norm": 1.0302665303404692, "learning_rate": 1.9086879423752218e-06, "loss": 0.0378, "step": 4766 }, { "epoch": 2.2238805970149254, "grad_norm": 0.9546071003694403, "learning_rate": 1.9044236916199404e-06, "loss": 0.0331, "step": 4768 }, { "epoch": 2.2248134328358207, "grad_norm": 0.9541014652633691, "learning_rate": 1.9001630887750643e-06, "loss": 0.0331, "step": 4770 }, { "epoch": 2.2257462686567164, "grad_norm": 0.8919137335366932, "learning_rate": 1.8959061388614013e-06, "loss": 0.0295, "step": 4772 }, { "epoch": 2.2266791044776117, "grad_norm": 1.0383502957226958, "learning_rate": 1.8916528468954598e-06, "loss": 0.028, "step": 4774 }, { "epoch": 2.2276119402985075, "grad_norm": 0.9602279618187743, "learning_rate": 1.8874032178894291e-06, "loss": 0.0302, "step": 4776 }, { "epoch": 2.228544776119403, "grad_norm": 0.9150970720932146, "learning_rate": 1.8831572568511891e-06, "loss": 0.0316, "step": 4778 }, { "epoch": 2.2294776119402986, "grad_norm": 0.9796102398418659, "learning_rate": 1.8789149687842955e-06, "loss": 0.0318, "step": 4780 }, { "epoch": 2.230410447761194, "grad_norm": 0.9452647015501673, "learning_rate": 1.8746763586879729e-06, "loss": 0.0324, "step": 4782 }, { "epoch": 2.2313432835820897, "grad_norm": 1.056537633599161, "learning_rate": 1.8704414315571117e-06, "loss": 0.0381, "step": 4784 }, { "epoch": 2.232276119402985, "grad_norm": 0.9104889867814913, "learning_rate": 1.8662101923822668e-06, "loss": 0.0321, "step": 4786 }, { "epoch": 2.2332089552238807, "grad_norm": 1.1917555284634107, "learning_rate": 1.861982646149645e-06, "loss": 0.0332, "step": 4788 }, { "epoch": 2.234141791044776, "grad_norm": 0.9347915254367777, "learning_rate": 1.8577587978410967e-06, "loss": 0.03, "step": 4790 }, { "epoch": 2.235074626865672, "grad_norm": 0.9894703553497254, "learning_rate": 1.8535386524341225e-06, "loss": 0.0291, "step": 4792 }, { "epoch": 2.236007462686567, "grad_norm": 0.9633339225718022, "learning_rate": 1.8493222149018524e-06, "loss": 0.0321, "step": 4794 }, { "epoch": 2.236940298507463, "grad_norm": 1.0344433383500846, "learning_rate": 1.8451094902130506e-06, "loss": 0.0357, "step": 4796 }, { "epoch": 2.237873134328358, "grad_norm": 1.1072755366728024, "learning_rate": 1.840900483332107e-06, "loss": 0.0367, "step": 4798 }, { "epoch": 2.2388059701492535, "grad_norm": 0.9811169471003288, "learning_rate": 1.8366951992190275e-06, "loss": 0.0301, "step": 4800 }, { "epoch": 2.2397388059701493, "grad_norm": 0.8912312323349124, "learning_rate": 1.8324936428294293e-06, "loss": 0.0294, "step": 4802 }, { "epoch": 2.2406716417910446, "grad_norm": 0.9284678038980523, "learning_rate": 1.828295819114544e-06, "loss": 0.0315, "step": 4804 }, { "epoch": 2.2416044776119404, "grad_norm": 0.9793525696039931, "learning_rate": 1.8241017330211958e-06, "loss": 0.03, "step": 4806 }, { "epoch": 2.2425373134328357, "grad_norm": 0.8786912497482439, "learning_rate": 1.8199113894918103e-06, "loss": 0.0302, "step": 4808 }, { "epoch": 2.2434701492537314, "grad_norm": 0.8317446223359907, "learning_rate": 1.8157247934644035e-06, "loss": 0.0278, "step": 4810 }, { "epoch": 2.2444029850746268, "grad_norm": 1.0260812557611483, "learning_rate": 1.8115419498725684e-06, "loss": 0.0302, "step": 4812 }, { "epoch": 2.2453358208955225, "grad_norm": 0.8198119601969011, "learning_rate": 1.8073628636454848e-06, "loss": 0.0291, "step": 4814 }, { "epoch": 2.246268656716418, "grad_norm": 0.8436443298067087, "learning_rate": 1.8031875397078984e-06, "loss": 0.0274, "step": 4816 }, { "epoch": 2.2472014925373136, "grad_norm": 1.1190571985048972, "learning_rate": 1.799015982980119e-06, "loss": 0.0316, "step": 4818 }, { "epoch": 2.248134328358209, "grad_norm": 0.9082824287317579, "learning_rate": 1.7948481983780292e-06, "loss": 0.0325, "step": 4820 }, { "epoch": 2.2490671641791047, "grad_norm": 1.0549276508580878, "learning_rate": 1.7906841908130545e-06, "loss": 0.0345, "step": 4822 }, { "epoch": 2.25, "grad_norm": 1.019585336150271, "learning_rate": 1.7865239651921723e-06, "loss": 0.0339, "step": 4824 }, { "epoch": 2.2509328358208958, "grad_norm": 1.0619989537882322, "learning_rate": 1.7823675264179068e-06, "loss": 0.0303, "step": 4826 }, { "epoch": 2.251865671641791, "grad_norm": 0.9252518561016344, "learning_rate": 1.7782148793883147e-06, "loss": 0.0315, "step": 4828 }, { "epoch": 2.2527985074626864, "grad_norm": 0.9736992699499954, "learning_rate": 1.7740660289969886e-06, "loss": 0.0304, "step": 4830 }, { "epoch": 2.253731343283582, "grad_norm": 1.0516237752505884, "learning_rate": 1.769920980133047e-06, "loss": 0.0344, "step": 4832 }, { "epoch": 2.2546641791044775, "grad_norm": 0.9351502996935023, "learning_rate": 1.7657797376811252e-06, "loss": 0.0324, "step": 4834 }, { "epoch": 2.2555970149253732, "grad_norm": 0.8430799506091149, "learning_rate": 1.7616423065213729e-06, "loss": 0.031, "step": 4836 }, { "epoch": 2.2565298507462686, "grad_norm": 0.8875048034060027, "learning_rate": 1.7575086915294525e-06, "loss": 0.0327, "step": 4838 }, { "epoch": 2.2574626865671643, "grad_norm": 0.9534089379751329, "learning_rate": 1.7533788975765281e-06, "loss": 0.0343, "step": 4840 }, { "epoch": 2.2583955223880596, "grad_norm": 1.0787212009961646, "learning_rate": 1.7492529295292577e-06, "loss": 0.0326, "step": 4842 }, { "epoch": 2.2593283582089554, "grad_norm": 0.941164075013674, "learning_rate": 1.745130792249795e-06, "loss": 0.0326, "step": 4844 }, { "epoch": 2.2602611940298507, "grad_norm": 0.8528604588181112, "learning_rate": 1.741012490595777e-06, "loss": 0.0286, "step": 4846 }, { "epoch": 2.2611940298507465, "grad_norm": 0.9063789272252968, "learning_rate": 1.7368980294203185e-06, "loss": 0.0333, "step": 4848 }, { "epoch": 2.262126865671642, "grad_norm": 0.8720875843908757, "learning_rate": 1.732787413572014e-06, "loss": 0.0303, "step": 4850 }, { "epoch": 2.263059701492537, "grad_norm": 0.9966272096294451, "learning_rate": 1.7286806478949247e-06, "loss": 0.0316, "step": 4852 }, { "epoch": 2.263992537313433, "grad_norm": 0.9455107563473335, "learning_rate": 1.724577737228571e-06, "loss": 0.0284, "step": 4854 }, { "epoch": 2.264925373134328, "grad_norm": 1.122251605217344, "learning_rate": 1.720478686407936e-06, "loss": 0.0332, "step": 4856 }, { "epoch": 2.265858208955224, "grad_norm": 0.852383887426748, "learning_rate": 1.7163835002634483e-06, "loss": 0.0307, "step": 4858 }, { "epoch": 2.2667910447761193, "grad_norm": 0.9025202081516984, "learning_rate": 1.7122921836209866e-06, "loss": 0.0285, "step": 4860 }, { "epoch": 2.267723880597015, "grad_norm": 1.0153947109788386, "learning_rate": 1.7082047413018715e-06, "loss": 0.0346, "step": 4862 }, { "epoch": 2.2686567164179103, "grad_norm": 0.9393692050654928, "learning_rate": 1.7041211781228506e-06, "loss": 0.0337, "step": 4864 }, { "epoch": 2.269589552238806, "grad_norm": 1.078006744869332, "learning_rate": 1.7000414988961083e-06, "loss": 0.0357, "step": 4866 }, { "epoch": 2.2705223880597014, "grad_norm": 0.9746984688936382, "learning_rate": 1.6959657084292463e-06, "loss": 0.0285, "step": 4868 }, { "epoch": 2.271455223880597, "grad_norm": 1.1149398638348036, "learning_rate": 1.6918938115252847e-06, "loss": 0.0342, "step": 4870 }, { "epoch": 2.2723880597014925, "grad_norm": 0.9420355662172784, "learning_rate": 1.6878258129826575e-06, "loss": 0.0371, "step": 4872 }, { "epoch": 2.2733208955223883, "grad_norm": 0.8833812328634052, "learning_rate": 1.6837617175952058e-06, "loss": 0.0305, "step": 4874 }, { "epoch": 2.2742537313432836, "grad_norm": 0.9162630957322199, "learning_rate": 1.6797015301521653e-06, "loss": 0.0322, "step": 4876 }, { "epoch": 2.2751865671641793, "grad_norm": 0.944306854206795, "learning_rate": 1.6756452554381736e-06, "loss": 0.028, "step": 4878 }, { "epoch": 2.2761194029850746, "grad_norm": 0.8812094451894885, "learning_rate": 1.6715928982332503e-06, "loss": 0.0294, "step": 4880 }, { "epoch": 2.27705223880597, "grad_norm": 0.921622264454937, "learning_rate": 1.6675444633128041e-06, "loss": 0.0341, "step": 4882 }, { "epoch": 2.2779850746268657, "grad_norm": 1.0337252983568204, "learning_rate": 1.6634999554476211e-06, "loss": 0.0366, "step": 4884 }, { "epoch": 2.278917910447761, "grad_norm": 1.0296969246910936, "learning_rate": 1.6594593794038565e-06, "loss": 0.03, "step": 4886 }, { "epoch": 2.279850746268657, "grad_norm": 0.9209078112203684, "learning_rate": 1.6554227399430328e-06, "loss": 0.0306, "step": 4888 }, { "epoch": 2.280783582089552, "grad_norm": 1.0601700915448702, "learning_rate": 1.651390041822037e-06, "loss": 0.0329, "step": 4890 }, { "epoch": 2.281716417910448, "grad_norm": 0.9373147261111446, "learning_rate": 1.6473612897931063e-06, "loss": 0.0309, "step": 4892 }, { "epoch": 2.282649253731343, "grad_norm": 1.0350624982733692, "learning_rate": 1.6433364886038316e-06, "loss": 0.0325, "step": 4894 }, { "epoch": 2.283582089552239, "grad_norm": 1.005571213343325, "learning_rate": 1.6393156429971491e-06, "loss": 0.0336, "step": 4896 }, { "epoch": 2.2845149253731343, "grad_norm": 0.9550712495760202, "learning_rate": 1.6352987577113295e-06, "loss": 0.0333, "step": 4898 }, { "epoch": 2.28544776119403, "grad_norm": 0.9359834378755599, "learning_rate": 1.6312858374799773e-06, "loss": 0.0318, "step": 4900 }, { "epoch": 2.2863805970149254, "grad_norm": 1.0374437239494685, "learning_rate": 1.6272768870320265e-06, "loss": 0.0373, "step": 4902 }, { "epoch": 2.2873134328358207, "grad_norm": 0.8784400737974469, "learning_rate": 1.6232719110917344e-06, "loss": 0.0318, "step": 4904 }, { "epoch": 2.2882462686567164, "grad_norm": 0.887693364695598, "learning_rate": 1.6192709143786695e-06, "loss": 0.0332, "step": 4906 }, { "epoch": 2.2891791044776117, "grad_norm": 0.9029182379544906, "learning_rate": 1.6152739016077162e-06, "loss": 0.0326, "step": 4908 }, { "epoch": 2.2901119402985075, "grad_norm": 1.11264434807744, "learning_rate": 1.6112808774890592e-06, "loss": 0.0349, "step": 4910 }, { "epoch": 2.291044776119403, "grad_norm": 0.9429615283258698, "learning_rate": 1.6072918467281874e-06, "loss": 0.0302, "step": 4912 }, { "epoch": 2.2919776119402986, "grad_norm": 0.99134599303643, "learning_rate": 1.603306814025883e-06, "loss": 0.0334, "step": 4914 }, { "epoch": 2.292910447761194, "grad_norm": 1.0255295343792188, "learning_rate": 1.5993257840782127e-06, "loss": 0.029, "step": 4916 }, { "epoch": 2.2938432835820897, "grad_norm": 1.101418351960286, "learning_rate": 1.595348761576533e-06, "loss": 0.0315, "step": 4918 }, { "epoch": 2.294776119402985, "grad_norm": 0.9079827607045441, "learning_rate": 1.5913757512074724e-06, "loss": 0.0307, "step": 4920 }, { "epoch": 2.2957089552238807, "grad_norm": 0.9558019184078752, "learning_rate": 1.5874067576529306e-06, "loss": 0.029, "step": 4922 }, { "epoch": 2.296641791044776, "grad_norm": 0.916647014585152, "learning_rate": 1.5834417855900796e-06, "loss": 0.0355, "step": 4924 }, { "epoch": 2.297574626865672, "grad_norm": 1.0450965539339738, "learning_rate": 1.5794808396913503e-06, "loss": 0.0323, "step": 4926 }, { "epoch": 2.298507462686567, "grad_norm": 0.9195856408768837, "learning_rate": 1.5755239246244235e-06, "loss": 0.0271, "step": 4928 }, { "epoch": 2.299440298507463, "grad_norm": 1.0044292930015237, "learning_rate": 1.5715710450522393e-06, "loss": 0.0327, "step": 4930 }, { "epoch": 2.300373134328358, "grad_norm": 1.0332966948680593, "learning_rate": 1.5676222056329744e-06, "loss": 0.0316, "step": 4932 }, { "epoch": 2.3013059701492535, "grad_norm": 0.9990698485315225, "learning_rate": 1.5636774110200447e-06, "loss": 0.0317, "step": 4934 }, { "epoch": 2.3022388059701493, "grad_norm": 0.9005564633661388, "learning_rate": 1.5597366658621093e-06, "loss": 0.0292, "step": 4936 }, { "epoch": 2.3031716417910446, "grad_norm": 1.0235086592087248, "learning_rate": 1.5557999748030445e-06, "loss": 0.03, "step": 4938 }, { "epoch": 2.3041044776119404, "grad_norm": 1.0407345019915932, "learning_rate": 1.5518673424819508e-06, "loss": 0.031, "step": 4940 }, { "epoch": 2.3050373134328357, "grad_norm": 1.1173523357373325, "learning_rate": 1.5479387735331524e-06, "loss": 0.033, "step": 4942 }, { "epoch": 2.3059701492537314, "grad_norm": 0.9244988906969726, "learning_rate": 1.5440142725861763e-06, "loss": 0.032, "step": 4944 }, { "epoch": 2.3069029850746268, "grad_norm": 1.0233139804844182, "learning_rate": 1.5400938442657625e-06, "loss": 0.0303, "step": 4946 }, { "epoch": 2.3078358208955225, "grad_norm": 0.799951804777741, "learning_rate": 1.53617749319185e-06, "loss": 0.0305, "step": 4948 }, { "epoch": 2.308768656716418, "grad_norm": 1.000997552437667, "learning_rate": 1.5322652239795717e-06, "loss": 0.0352, "step": 4950 }, { "epoch": 2.3097014925373136, "grad_norm": 0.8318375059715735, "learning_rate": 1.5283570412392478e-06, "loss": 0.0312, "step": 4952 }, { "epoch": 2.310634328358209, "grad_norm": 0.9785791023544128, "learning_rate": 1.5244529495763893e-06, "loss": 0.0309, "step": 4954 }, { "epoch": 2.3115671641791042, "grad_norm": 0.8964024452247994, "learning_rate": 1.5205529535916834e-06, "loss": 0.0284, "step": 4956 }, { "epoch": 2.3125, "grad_norm": 0.945822136524235, "learning_rate": 1.5166570578809869e-06, "loss": 0.0307, "step": 4958 }, { "epoch": 2.3134328358208958, "grad_norm": 0.8345305166463103, "learning_rate": 1.5127652670353321e-06, "loss": 0.0284, "step": 4960 }, { "epoch": 2.314365671641791, "grad_norm": 0.9264264389198162, "learning_rate": 1.5088775856409066e-06, "loss": 0.0278, "step": 4962 }, { "epoch": 2.3152985074626864, "grad_norm": 0.9547188914742342, "learning_rate": 1.5049940182790602e-06, "loss": 0.0304, "step": 4964 }, { "epoch": 2.316231343283582, "grad_norm": 1.1407054779735974, "learning_rate": 1.5011145695262947e-06, "loss": 0.0337, "step": 4966 }, { "epoch": 2.3171641791044775, "grad_norm": 0.8996820314811704, "learning_rate": 1.4972392439542533e-06, "loss": 0.0274, "step": 4968 }, { "epoch": 2.3180970149253732, "grad_norm": 1.0059265196229077, "learning_rate": 1.4933680461297279e-06, "loss": 0.0317, "step": 4970 }, { "epoch": 2.3190298507462686, "grad_norm": 0.9141775842258535, "learning_rate": 1.4895009806146404e-06, "loss": 0.0269, "step": 4972 }, { "epoch": 2.3199626865671643, "grad_norm": 0.9204833484695144, "learning_rate": 1.4856380519660429e-06, "loss": 0.0334, "step": 4974 }, { "epoch": 2.3208955223880596, "grad_norm": 0.9472225911308894, "learning_rate": 1.4817792647361168e-06, "loss": 0.0317, "step": 4976 }, { "epoch": 2.3218283582089554, "grad_norm": 0.8292477805867834, "learning_rate": 1.477924623472161e-06, "loss": 0.0307, "step": 4978 }, { "epoch": 2.3227611940298507, "grad_norm": 1.1163000585725409, "learning_rate": 1.4740741327165869e-06, "loss": 0.0354, "step": 4980 }, { "epoch": 2.3236940298507465, "grad_norm": 0.9550471993279462, "learning_rate": 1.4702277970069184e-06, "loss": 0.031, "step": 4982 }, { "epoch": 2.324626865671642, "grad_norm": 0.9561926828491114, "learning_rate": 1.4663856208757797e-06, "loss": 0.0335, "step": 4984 }, { "epoch": 2.325559701492537, "grad_norm": 0.9241964509065581, "learning_rate": 1.4625476088508917e-06, "loss": 0.0288, "step": 4986 }, { "epoch": 2.326492537313433, "grad_norm": 0.8550279123404502, "learning_rate": 1.458713765455077e-06, "loss": 0.0307, "step": 4988 }, { "epoch": 2.327425373134328, "grad_norm": 0.9570729328656946, "learning_rate": 1.4548840952062365e-06, "loss": 0.0324, "step": 4990 }, { "epoch": 2.328358208955224, "grad_norm": 0.8785046354623663, "learning_rate": 1.4510586026173557e-06, "loss": 0.0316, "step": 4992 }, { "epoch": 2.3292910447761193, "grad_norm": 1.041722775563538, "learning_rate": 1.4472372921965005e-06, "loss": 0.0298, "step": 4994 }, { "epoch": 2.330223880597015, "grad_norm": 0.9351483285560535, "learning_rate": 1.443420168446803e-06, "loss": 0.0308, "step": 4996 }, { "epoch": 2.3311567164179103, "grad_norm": 1.058348585781574, "learning_rate": 1.4396072358664665e-06, "loss": 0.0338, "step": 4998 }, { "epoch": 2.332089552238806, "grad_norm": 0.9652039258901834, "learning_rate": 1.4357984989487545e-06, "loss": 0.0293, "step": 5000 }, { "epoch": 2.332089552238806, "eval_loss": 0.18245889246463776, "eval_runtime": 322.1455, "eval_samples_per_second": 47.326, "eval_steps_per_second": 5.917, "step": 5000 }, { "epoch": 2.3330223880597014, "grad_norm": 1.1851156887139767, "learning_rate": 1.4319939621819835e-06, "loss": 0.0362, "step": 5002 }, { "epoch": 2.333955223880597, "grad_norm": 0.8529028593835726, "learning_rate": 1.4281936300495198e-06, "loss": 0.0346, "step": 5004 }, { "epoch": 2.3348880597014925, "grad_norm": 0.9594188285524563, "learning_rate": 1.4243975070297817e-06, "loss": 0.0335, "step": 5006 }, { "epoch": 2.3358208955223883, "grad_norm": 0.9074486334593885, "learning_rate": 1.4206055975962179e-06, "loss": 0.0297, "step": 5008 }, { "epoch": 2.3367537313432836, "grad_norm": 0.9751995519315027, "learning_rate": 1.4168179062173193e-06, "loss": 0.0306, "step": 5010 }, { "epoch": 2.3376865671641793, "grad_norm": 0.9616759833614095, "learning_rate": 1.413034437356604e-06, "loss": 0.0284, "step": 5012 }, { "epoch": 2.3386194029850746, "grad_norm": 1.1592255195505259, "learning_rate": 1.4092551954726113e-06, "loss": 0.0339, "step": 5014 }, { "epoch": 2.33955223880597, "grad_norm": 1.2948160643221482, "learning_rate": 1.4054801850189038e-06, "loss": 0.0339, "step": 5016 }, { "epoch": 2.3404850746268657, "grad_norm": 0.8674935561887429, "learning_rate": 1.4017094104440527e-06, "loss": 0.0322, "step": 5018 }, { "epoch": 2.341417910447761, "grad_norm": 1.0367099183574866, "learning_rate": 1.397942876191642e-06, "loss": 0.0341, "step": 5020 }, { "epoch": 2.342350746268657, "grad_norm": 0.809587220441153, "learning_rate": 1.3941805867002578e-06, "loss": 0.0295, "step": 5022 }, { "epoch": 2.343283582089552, "grad_norm": 1.1115723698500566, "learning_rate": 1.3904225464034821e-06, "loss": 0.0323, "step": 5024 }, { "epoch": 2.344216417910448, "grad_norm": 0.9093107346126881, "learning_rate": 1.386668759729889e-06, "loss": 0.0302, "step": 5026 }, { "epoch": 2.345149253731343, "grad_norm": 0.9181842054624373, "learning_rate": 1.3829192311030438e-06, "loss": 0.0316, "step": 5028 }, { "epoch": 2.346082089552239, "grad_norm": 0.9669415717064661, "learning_rate": 1.3791739649414926e-06, "loss": 0.0295, "step": 5030 }, { "epoch": 2.3470149253731343, "grad_norm": 0.9549079919101966, "learning_rate": 1.3754329656587556e-06, "loss": 0.0334, "step": 5032 }, { "epoch": 2.34794776119403, "grad_norm": 0.8704922978244095, "learning_rate": 1.3716962376633296e-06, "loss": 0.0294, "step": 5034 }, { "epoch": 2.3488805970149254, "grad_norm": 0.860650439580406, "learning_rate": 1.367963785358674e-06, "loss": 0.031, "step": 5036 }, { "epoch": 2.3498134328358207, "grad_norm": 0.8669502064663793, "learning_rate": 1.3642356131432078e-06, "loss": 0.0302, "step": 5038 }, { "epoch": 2.3507462686567164, "grad_norm": 0.8529582341937184, "learning_rate": 1.3605117254103157e-06, "loss": 0.0288, "step": 5040 }, { "epoch": 2.3516791044776117, "grad_norm": 1.0080665155964503, "learning_rate": 1.3567921265483241e-06, "loss": 0.0298, "step": 5042 }, { "epoch": 2.3526119402985075, "grad_norm": 1.0272770219623781, "learning_rate": 1.3530768209405064e-06, "loss": 0.0308, "step": 5044 }, { "epoch": 2.353544776119403, "grad_norm": 0.9141275593896965, "learning_rate": 1.3493658129650827e-06, "loss": 0.0298, "step": 5046 }, { "epoch": 2.3544776119402986, "grad_norm": 0.8505700202865444, "learning_rate": 1.3456591069952008e-06, "loss": 0.0291, "step": 5048 }, { "epoch": 2.355410447761194, "grad_norm": 0.8724278486813672, "learning_rate": 1.341956707398945e-06, "loss": 0.0312, "step": 5050 }, { "epoch": 2.3563432835820897, "grad_norm": 1.0522051991743648, "learning_rate": 1.3382586185393232e-06, "loss": 0.0322, "step": 5052 }, { "epoch": 2.357276119402985, "grad_norm": 1.010415268619055, "learning_rate": 1.334564844774262e-06, "loss": 0.0361, "step": 5054 }, { "epoch": 2.3582089552238807, "grad_norm": 0.876681552657445, "learning_rate": 1.330875390456602e-06, "loss": 0.0269, "step": 5056 }, { "epoch": 2.359141791044776, "grad_norm": 1.016793734088532, "learning_rate": 1.327190259934098e-06, "loss": 0.0288, "step": 5058 }, { "epoch": 2.360074626865672, "grad_norm": 1.1488278864813115, "learning_rate": 1.3235094575494044e-06, "loss": 0.0326, "step": 5060 }, { "epoch": 2.361007462686567, "grad_norm": 0.8092207099884563, "learning_rate": 1.3198329876400795e-06, "loss": 0.0267, "step": 5062 }, { "epoch": 2.361940298507463, "grad_norm": 1.0147049454179322, "learning_rate": 1.3161608545385756e-06, "loss": 0.0327, "step": 5064 }, { "epoch": 2.362873134328358, "grad_norm": 1.3147640207279299, "learning_rate": 1.3124930625722304e-06, "loss": 0.0307, "step": 5066 }, { "epoch": 2.3638059701492535, "grad_norm": 1.0315398673305383, "learning_rate": 1.3088296160632714e-06, "loss": 0.033, "step": 5068 }, { "epoch": 2.3647388059701493, "grad_norm": 0.9598345702077918, "learning_rate": 1.3051705193287995e-06, "loss": 0.0363, "step": 5070 }, { "epoch": 2.3656716417910446, "grad_norm": 0.909327421024035, "learning_rate": 1.301515776680794e-06, "loss": 0.0305, "step": 5072 }, { "epoch": 2.3666044776119404, "grad_norm": 1.005616156281911, "learning_rate": 1.2978653924261037e-06, "loss": 0.0326, "step": 5074 }, { "epoch": 2.3675373134328357, "grad_norm": 1.0697155919210022, "learning_rate": 1.294219370866438e-06, "loss": 0.0311, "step": 5076 }, { "epoch": 2.3684701492537314, "grad_norm": 0.9354398582066434, "learning_rate": 1.2905777162983657e-06, "loss": 0.0287, "step": 5078 }, { "epoch": 2.3694029850746268, "grad_norm": 0.973132698739944, "learning_rate": 1.2869404330133117e-06, "loss": 0.0326, "step": 5080 }, { "epoch": 2.3703358208955225, "grad_norm": 0.897005600047769, "learning_rate": 1.2833075252975501e-06, "loss": 0.0328, "step": 5082 }, { "epoch": 2.371268656716418, "grad_norm": 0.7750168931572203, "learning_rate": 1.2796789974321938e-06, "loss": 0.0256, "step": 5084 }, { "epoch": 2.3722014925373136, "grad_norm": 1.0031754188776967, "learning_rate": 1.276054853693201e-06, "loss": 0.0329, "step": 5086 }, { "epoch": 2.373134328358209, "grad_norm": 0.7937335711902828, "learning_rate": 1.2724350983513583e-06, "loss": 0.0261, "step": 5088 }, { "epoch": 2.3740671641791042, "grad_norm": 1.0261279258957414, "learning_rate": 1.268819735672282e-06, "loss": 0.036, "step": 5090 }, { "epoch": 2.375, "grad_norm": 1.168216731934682, "learning_rate": 1.265208769916414e-06, "loss": 0.0359, "step": 5092 }, { "epoch": 2.3759328358208958, "grad_norm": 0.9558805913389741, "learning_rate": 1.2616022053390143e-06, "loss": 0.0291, "step": 5094 }, { "epoch": 2.376865671641791, "grad_norm": 0.9325047029076223, "learning_rate": 1.2580000461901532e-06, "loss": 0.0296, "step": 5096 }, { "epoch": 2.3777985074626864, "grad_norm": 0.9211146961961714, "learning_rate": 1.254402296714715e-06, "loss": 0.0294, "step": 5098 }, { "epoch": 2.378731343283582, "grad_norm": 0.9409998763279245, "learning_rate": 1.2508089611523816e-06, "loss": 0.0328, "step": 5100 }, { "epoch": 2.3796641791044775, "grad_norm": 0.9300544977203749, "learning_rate": 1.247220043737637e-06, "loss": 0.0264, "step": 5102 }, { "epoch": 2.3805970149253732, "grad_norm": 1.0399909193526722, "learning_rate": 1.2436355486997604e-06, "loss": 0.0325, "step": 5104 }, { "epoch": 2.3815298507462686, "grad_norm": 0.8802218937605405, "learning_rate": 1.2400554802628155e-06, "loss": 0.0261, "step": 5106 }, { "epoch": 2.3824626865671643, "grad_norm": 1.0890395468692708, "learning_rate": 1.2364798426456499e-06, "loss": 0.0338, "step": 5108 }, { "epoch": 2.3833955223880596, "grad_norm": 0.9340708937253251, "learning_rate": 1.2329086400618934e-06, "loss": 0.0278, "step": 5110 }, { "epoch": 2.3843283582089554, "grad_norm": 0.891003892039185, "learning_rate": 1.229341876719945e-06, "loss": 0.026, "step": 5112 }, { "epoch": 2.3852611940298507, "grad_norm": 1.049112253731958, "learning_rate": 1.2257795568229759e-06, "loss": 0.0305, "step": 5114 }, { "epoch": 2.3861940298507465, "grad_norm": 0.9150152383861456, "learning_rate": 1.2222216845689205e-06, "loss": 0.0251, "step": 5116 }, { "epoch": 2.387126865671642, "grad_norm": 0.970598936477048, "learning_rate": 1.2186682641504694e-06, "loss": 0.0304, "step": 5118 }, { "epoch": 2.388059701492537, "grad_norm": 0.8440222362537796, "learning_rate": 1.2151192997550708e-06, "loss": 0.0305, "step": 5120 }, { "epoch": 2.388992537313433, "grad_norm": 1.1241825153824638, "learning_rate": 1.2115747955649177e-06, "loss": 0.0321, "step": 5122 }, { "epoch": 2.389925373134328, "grad_norm": 0.8960130220118138, "learning_rate": 1.20803475575695e-06, "loss": 0.0284, "step": 5124 }, { "epoch": 2.390858208955224, "grad_norm": 0.9201909754595783, "learning_rate": 1.2044991845028482e-06, "loss": 0.0302, "step": 5126 }, { "epoch": 2.3917910447761193, "grad_norm": 1.0146207135534862, "learning_rate": 1.2009680859690215e-06, "loss": 0.0341, "step": 5128 }, { "epoch": 2.392723880597015, "grad_norm": 1.0357233507057502, "learning_rate": 1.1974414643166116e-06, "loss": 0.032, "step": 5130 }, { "epoch": 2.3936567164179103, "grad_norm": 1.1342502952525662, "learning_rate": 1.1939193237014862e-06, "loss": 0.0289, "step": 5132 }, { "epoch": 2.394589552238806, "grad_norm": 0.8186531199171573, "learning_rate": 1.1904016682742286e-06, "loss": 0.031, "step": 5134 }, { "epoch": 2.3955223880597014, "grad_norm": 1.138168214225384, "learning_rate": 1.1868885021801392e-06, "loss": 0.0312, "step": 5136 }, { "epoch": 2.396455223880597, "grad_norm": 1.083624852759322, "learning_rate": 1.1833798295592291e-06, "loss": 0.0314, "step": 5138 }, { "epoch": 2.3973880597014925, "grad_norm": 0.9628914631831336, "learning_rate": 1.1798756545462114e-06, "loss": 0.0317, "step": 5140 }, { "epoch": 2.3983208955223883, "grad_norm": 0.8970788976866345, "learning_rate": 1.1763759812704984e-06, "loss": 0.0296, "step": 5142 }, { "epoch": 2.3992537313432836, "grad_norm": 1.035008794468747, "learning_rate": 1.1728808138562008e-06, "loss": 0.0307, "step": 5144 }, { "epoch": 2.4001865671641793, "grad_norm": 0.8289167809473592, "learning_rate": 1.1693901564221193e-06, "loss": 0.0296, "step": 5146 }, { "epoch": 2.4011194029850746, "grad_norm": 0.8649727360028945, "learning_rate": 1.1659040130817361e-06, "loss": 0.0305, "step": 5148 }, { "epoch": 2.40205223880597, "grad_norm": 0.9736790378086797, "learning_rate": 1.1624223879432183e-06, "loss": 0.032, "step": 5150 }, { "epoch": 2.4029850746268657, "grad_norm": 0.821620381528247, "learning_rate": 1.1589452851094063e-06, "loss": 0.0281, "step": 5152 }, { "epoch": 2.403917910447761, "grad_norm": 0.7649796672832583, "learning_rate": 1.1554727086778077e-06, "loss": 0.0284, "step": 5154 }, { "epoch": 2.404850746268657, "grad_norm": 1.0293759177657253, "learning_rate": 1.1520046627406061e-06, "loss": 0.0307, "step": 5156 }, { "epoch": 2.405783582089552, "grad_norm": 0.9986647214916511, "learning_rate": 1.1485411513846379e-06, "loss": 0.0323, "step": 5158 }, { "epoch": 2.406716417910448, "grad_norm": 1.0208320988485347, "learning_rate": 1.1450821786913957e-06, "loss": 0.0288, "step": 5160 }, { "epoch": 2.407649253731343, "grad_norm": 0.8048689222465932, "learning_rate": 1.1416277487370293e-06, "loss": 0.0234, "step": 5162 }, { "epoch": 2.408582089552239, "grad_norm": 0.839697880086066, "learning_rate": 1.1381778655923293e-06, "loss": 0.0263, "step": 5164 }, { "epoch": 2.4095149253731343, "grad_norm": 1.1554814591563138, "learning_rate": 1.1347325333227315e-06, "loss": 0.0353, "step": 5166 }, { "epoch": 2.41044776119403, "grad_norm": 1.1131093653428739, "learning_rate": 1.1312917559883101e-06, "loss": 0.0334, "step": 5168 }, { "epoch": 2.4113805970149254, "grad_norm": 1.0319878579150472, "learning_rate": 1.1278555376437666e-06, "loss": 0.0309, "step": 5170 }, { "epoch": 2.4123134328358207, "grad_norm": 0.9848055830331043, "learning_rate": 1.1244238823384363e-06, "loss": 0.0315, "step": 5172 }, { "epoch": 2.4132462686567164, "grad_norm": 1.0071179852409573, "learning_rate": 1.1209967941162726e-06, "loss": 0.0329, "step": 5174 }, { "epoch": 2.4141791044776117, "grad_norm": 0.8809510188750823, "learning_rate": 1.117574277015847e-06, "loss": 0.0309, "step": 5176 }, { "epoch": 2.4151119402985075, "grad_norm": 0.971103027505273, "learning_rate": 1.114156335070347e-06, "loss": 0.0331, "step": 5178 }, { "epoch": 2.416044776119403, "grad_norm": 1.0973511216365415, "learning_rate": 1.1107429723075685e-06, "loss": 0.0306, "step": 5180 }, { "epoch": 2.4169776119402986, "grad_norm": 1.0082100658114197, "learning_rate": 1.1073341927499082e-06, "loss": 0.0335, "step": 5182 }, { "epoch": 2.417910447761194, "grad_norm": 0.9427629531340962, "learning_rate": 1.1039300004143655e-06, "loss": 0.0346, "step": 5184 }, { "epoch": 2.4188432835820897, "grad_norm": 0.8141044026232971, "learning_rate": 1.1005303993125299e-06, "loss": 0.0252, "step": 5186 }, { "epoch": 2.419776119402985, "grad_norm": 0.947017846185009, "learning_rate": 1.097135393450584e-06, "loss": 0.033, "step": 5188 }, { "epoch": 2.4207089552238807, "grad_norm": 1.3696165446597555, "learning_rate": 1.093744986829296e-06, "loss": 0.0347, "step": 5190 }, { "epoch": 2.421641791044776, "grad_norm": 0.9307203687285788, "learning_rate": 1.0903591834440096e-06, "loss": 0.0296, "step": 5192 }, { "epoch": 2.422574626865672, "grad_norm": 0.9779965790293061, "learning_rate": 1.0869779872846465e-06, "loss": 0.0294, "step": 5194 }, { "epoch": 2.423507462686567, "grad_norm": 0.8606747677304812, "learning_rate": 1.0836014023357e-06, "loss": 0.0279, "step": 5196 }, { "epoch": 2.424440298507463, "grad_norm": 0.813404834932556, "learning_rate": 1.0802294325762303e-06, "loss": 0.0281, "step": 5198 }, { "epoch": 2.425373134328358, "grad_norm": 0.9607300866710246, "learning_rate": 1.0768620819798543e-06, "loss": 0.0295, "step": 5200 }, { "epoch": 2.4263059701492535, "grad_norm": 0.8114237433585384, "learning_rate": 1.0734993545147514e-06, "loss": 0.0275, "step": 5202 }, { "epoch": 2.4272388059701493, "grad_norm": 0.9803779968633127, "learning_rate": 1.0701412541436484e-06, "loss": 0.0337, "step": 5204 }, { "epoch": 2.4281716417910446, "grad_norm": 0.8865864969325976, "learning_rate": 1.066787784823819e-06, "loss": 0.0342, "step": 5206 }, { "epoch": 2.4291044776119404, "grad_norm": 1.2507290047495112, "learning_rate": 1.063438950507087e-06, "loss": 0.0315, "step": 5208 }, { "epoch": 2.4300373134328357, "grad_norm": 0.8700821742585412, "learning_rate": 1.0600947551398055e-06, "loss": 0.0322, "step": 5210 }, { "epoch": 2.4309701492537314, "grad_norm": 1.0494241299606317, "learning_rate": 1.0567552026628635e-06, "loss": 0.0328, "step": 5212 }, { "epoch": 2.4319029850746268, "grad_norm": 1.0655792543369247, "learning_rate": 1.0534202970116825e-06, "loss": 0.0296, "step": 5214 }, { "epoch": 2.4328358208955225, "grad_norm": 0.8772367123572078, "learning_rate": 1.0500900421162013e-06, "loss": 0.0272, "step": 5216 }, { "epoch": 2.433768656716418, "grad_norm": 0.9924973823945795, "learning_rate": 1.0467644419008843e-06, "loss": 0.0288, "step": 5218 }, { "epoch": 2.4347014925373136, "grad_norm": 0.901073820435966, "learning_rate": 1.0434435002847088e-06, "loss": 0.0312, "step": 5220 }, { "epoch": 2.435634328358209, "grad_norm": 0.8865583609137004, "learning_rate": 1.0401272211811598e-06, "loss": 0.0316, "step": 5222 }, { "epoch": 2.4365671641791042, "grad_norm": 0.9166280232072145, "learning_rate": 1.0368156084982318e-06, "loss": 0.0316, "step": 5224 }, { "epoch": 2.4375, "grad_norm": 0.8817845348476032, "learning_rate": 1.0335086661384175e-06, "loss": 0.0295, "step": 5226 }, { "epoch": 2.4384328358208958, "grad_norm": 1.0149708192946463, "learning_rate": 1.0302063979987053e-06, "loss": 0.03, "step": 5228 }, { "epoch": 2.439365671641791, "grad_norm": 0.9425035280111684, "learning_rate": 1.0269088079705775e-06, "loss": 0.0308, "step": 5230 }, { "epoch": 2.4402985074626864, "grad_norm": 1.01515710324735, "learning_rate": 1.0236158999400054e-06, "loss": 0.0304, "step": 5232 }, { "epoch": 2.441231343283582, "grad_norm": 0.9825655743348506, "learning_rate": 1.0203276777874365e-06, "loss": 0.0347, "step": 5234 }, { "epoch": 2.4421641791044775, "grad_norm": 0.9109880566827612, "learning_rate": 1.0170441453878038e-06, "loss": 0.0287, "step": 5236 }, { "epoch": 2.4430970149253732, "grad_norm": 0.950996314380109, "learning_rate": 1.0137653066105073e-06, "loss": 0.0305, "step": 5238 }, { "epoch": 2.4440298507462686, "grad_norm": 0.9666491371474853, "learning_rate": 1.0104911653194205e-06, "loss": 0.0303, "step": 5240 }, { "epoch": 2.4449626865671643, "grad_norm": 0.918975788396854, "learning_rate": 1.0072217253728806e-06, "loss": 0.028, "step": 5242 }, { "epoch": 2.4458955223880596, "grad_norm": 1.0443591962307095, "learning_rate": 1.0039569906236819e-06, "loss": 0.03, "step": 5244 }, { "epoch": 2.4468283582089554, "grad_norm": 1.0129350368733916, "learning_rate": 1.0006969649190746e-06, "loss": 0.0308, "step": 5246 }, { "epoch": 2.4477611940298507, "grad_norm": 1.024034540470462, "learning_rate": 9.974416521007635e-07, "loss": 0.0346, "step": 5248 }, { "epoch": 2.4486940298507465, "grad_norm": 1.04027731110459, "learning_rate": 9.94191056004894e-07, "loss": 0.0325, "step": 5250 }, { "epoch": 2.449626865671642, "grad_norm": 0.9032430034695119, "learning_rate": 9.909451804620579e-07, "loss": 0.0308, "step": 5252 }, { "epoch": 2.450559701492537, "grad_norm": 0.918163109867578, "learning_rate": 9.877040292972823e-07, "loss": 0.0295, "step": 5254 }, { "epoch": 2.451492537313433, "grad_norm": 0.9116229447610709, "learning_rate": 9.844676063300268e-07, "loss": 0.0284, "step": 5256 }, { "epoch": 2.452425373134328, "grad_norm": 0.8915138996779104, "learning_rate": 9.81235915374178e-07, "loss": 0.0318, "step": 5258 }, { "epoch": 2.453358208955224, "grad_norm": 1.0826412085183617, "learning_rate": 9.780089602380477e-07, "loss": 0.0305, "step": 5260 }, { "epoch": 2.4542910447761193, "grad_norm": 0.922620217878607, "learning_rate": 9.747867447243692e-07, "loss": 0.0269, "step": 5262 }, { "epoch": 2.455223880597015, "grad_norm": 0.9087907682602725, "learning_rate": 9.715692726302845e-07, "loss": 0.0346, "step": 5264 }, { "epoch": 2.4561567164179103, "grad_norm": 0.9586238741385978, "learning_rate": 9.683565477473517e-07, "loss": 0.0298, "step": 5266 }, { "epoch": 2.457089552238806, "grad_norm": 1.0223064558524098, "learning_rate": 9.651485738615308e-07, "loss": 0.0318, "step": 5268 }, { "epoch": 2.4580223880597014, "grad_norm": 0.9752253770560073, "learning_rate": 9.61945354753185e-07, "loss": 0.0388, "step": 5270 }, { "epoch": 2.458955223880597, "grad_norm": 1.0970714216579576, "learning_rate": 9.58746894197075e-07, "loss": 0.033, "step": 5272 }, { "epoch": 2.4598880597014925, "grad_norm": 0.8692948176360848, "learning_rate": 9.555531959623505e-07, "loss": 0.0312, "step": 5274 }, { "epoch": 2.4608208955223883, "grad_norm": 1.2234499178707334, "learning_rate": 9.523642638125541e-07, "loss": 0.0307, "step": 5276 }, { "epoch": 2.4617537313432836, "grad_norm": 1.032870140912907, "learning_rate": 9.491801015056079e-07, "loss": 0.028, "step": 5278 }, { "epoch": 2.4626865671641793, "grad_norm": 1.0229643703041138, "learning_rate": 9.460007127938131e-07, "loss": 0.0343, "step": 5280 }, { "epoch": 2.4636194029850746, "grad_norm": 0.9697729207859996, "learning_rate": 9.428261014238482e-07, "loss": 0.0295, "step": 5282 }, { "epoch": 2.46455223880597, "grad_norm": 0.8585679577937615, "learning_rate": 9.396562711367618e-07, "loss": 0.0287, "step": 5284 }, { "epoch": 2.4654850746268657, "grad_norm": 0.8600373390020909, "learning_rate": 9.364912256679648e-07, "loss": 0.0287, "step": 5286 }, { "epoch": 2.466417910447761, "grad_norm": 0.7654761908933982, "learning_rate": 9.333309687472342e-07, "loss": 0.0238, "step": 5288 }, { "epoch": 2.467350746268657, "grad_norm": 1.0261690678902156, "learning_rate": 9.301755040987009e-07, "loss": 0.0316, "step": 5290 }, { "epoch": 2.468283582089552, "grad_norm": 1.0608751660597768, "learning_rate": 9.270248354408467e-07, "loss": 0.0347, "step": 5292 }, { "epoch": 2.469216417910448, "grad_norm": 1.021052839418142, "learning_rate": 9.238789664865095e-07, "loss": 0.0304, "step": 5294 }, { "epoch": 2.470149253731343, "grad_norm": 1.172851673632966, "learning_rate": 9.207379009428624e-07, "loss": 0.0351, "step": 5296 }, { "epoch": 2.471082089552239, "grad_norm": 1.0208080543389906, "learning_rate": 9.17601642511422e-07, "loss": 0.0348, "step": 5298 }, { "epoch": 2.4720149253731343, "grad_norm": 0.9561375880016924, "learning_rate": 9.144701948880407e-07, "loss": 0.0264, "step": 5300 }, { "epoch": 2.47294776119403, "grad_norm": 0.8711993131825574, "learning_rate": 9.113435617628985e-07, "loss": 0.0297, "step": 5302 }, { "epoch": 2.4738805970149254, "grad_norm": 1.0468283824085718, "learning_rate": 9.082217468205057e-07, "loss": 0.032, "step": 5304 }, { "epoch": 2.4748134328358207, "grad_norm": 0.8070596299062749, "learning_rate": 9.05104753739694e-07, "loss": 0.0299, "step": 5306 }, { "epoch": 2.4757462686567164, "grad_norm": 0.8130945780791511, "learning_rate": 9.019925861936101e-07, "loss": 0.0241, "step": 5308 }, { "epoch": 2.4766791044776117, "grad_norm": 1.0637012776335473, "learning_rate": 8.988852478497156e-07, "loss": 0.031, "step": 5310 }, { "epoch": 2.4776119402985075, "grad_norm": 0.9987356483323814, "learning_rate": 8.957827423697823e-07, "loss": 0.0391, "step": 5312 }, { "epoch": 2.478544776119403, "grad_norm": 0.8320233706324502, "learning_rate": 8.926850734098874e-07, "loss": 0.0266, "step": 5314 }, { "epoch": 2.4794776119402986, "grad_norm": 0.9298389858750122, "learning_rate": 8.895922446204053e-07, "loss": 0.0332, "step": 5316 }, { "epoch": 2.480410447761194, "grad_norm": 0.950930613061798, "learning_rate": 8.865042596460111e-07, "loss": 0.0321, "step": 5318 }, { "epoch": 2.4813432835820897, "grad_norm": 0.9617986783790913, "learning_rate": 8.834211221256661e-07, "loss": 0.0339, "step": 5320 }, { "epoch": 2.482276119402985, "grad_norm": 1.2039081267856504, "learning_rate": 8.803428356926242e-07, "loss": 0.0312, "step": 5322 }, { "epoch": 2.4832089552238807, "grad_norm": 0.9853941051886902, "learning_rate": 8.772694039744228e-07, "loss": 0.0281, "step": 5324 }, { "epoch": 2.484141791044776, "grad_norm": 0.8819923200252056, "learning_rate": 8.742008305928728e-07, "loss": 0.0265, "step": 5326 }, { "epoch": 2.485074626865672, "grad_norm": 0.9926797819846446, "learning_rate": 8.711371191640677e-07, "loss": 0.0273, "step": 5328 }, { "epoch": 2.486007462686567, "grad_norm": 1.024681171097431, "learning_rate": 8.680782732983645e-07, "loss": 0.0347, "step": 5330 }, { "epoch": 2.486940298507463, "grad_norm": 1.0012248853144015, "learning_rate": 8.650242966003897e-07, "loss": 0.0349, "step": 5332 }, { "epoch": 2.487873134328358, "grad_norm": 1.0895809997598143, "learning_rate": 8.619751926690317e-07, "loss": 0.0314, "step": 5334 }, { "epoch": 2.4888059701492535, "grad_norm": 0.9253355336146758, "learning_rate": 8.589309650974387e-07, "loss": 0.0309, "step": 5336 }, { "epoch": 2.4897388059701493, "grad_norm": 0.8271357690929486, "learning_rate": 8.558916174730076e-07, "loss": 0.0288, "step": 5338 }, { "epoch": 2.4906716417910446, "grad_norm": 0.9870974780023992, "learning_rate": 8.528571533773894e-07, "loss": 0.0284, "step": 5340 }, { "epoch": 2.4916044776119404, "grad_norm": 1.0610406782504171, "learning_rate": 8.498275763864782e-07, "loss": 0.0293, "step": 5342 }, { "epoch": 2.4925373134328357, "grad_norm": 0.7604632823561313, "learning_rate": 8.46802890070405e-07, "loss": 0.0228, "step": 5344 }, { "epoch": 2.4934701492537314, "grad_norm": 1.1049817644836104, "learning_rate": 8.43783097993548e-07, "loss": 0.0299, "step": 5346 }, { "epoch": 2.4944029850746268, "grad_norm": 1.0011593857407741, "learning_rate": 8.407682037145076e-07, "loss": 0.0242, "step": 5348 }, { "epoch": 2.4953358208955225, "grad_norm": 1.014442412297917, "learning_rate": 8.37758210786116e-07, "loss": 0.0344, "step": 5350 }, { "epoch": 2.496268656716418, "grad_norm": 0.8394720810240413, "learning_rate": 8.347531227554323e-07, "loss": 0.0271, "step": 5352 }, { "epoch": 2.4972014925373136, "grad_norm": 0.9904793674093569, "learning_rate": 8.3175294316373e-07, "loss": 0.0367, "step": 5354 }, { "epoch": 2.498134328358209, "grad_norm": 1.0671548012590597, "learning_rate": 8.287576755465032e-07, "loss": 0.0291, "step": 5356 }, { "epoch": 2.4990671641791042, "grad_norm": 0.9743440646929675, "learning_rate": 8.257673234334568e-07, "loss": 0.0363, "step": 5358 }, { "epoch": 2.5, "grad_norm": 0.929426697584014, "learning_rate": 8.227818903485013e-07, "loss": 0.029, "step": 5360 }, { "epoch": 2.5009328358208958, "grad_norm": 0.9015513886617461, "learning_rate": 8.198013798097498e-07, "loss": 0.0306, "step": 5362 }, { "epoch": 2.501865671641791, "grad_norm": 0.7719495924948875, "learning_rate": 8.168257953295178e-07, "loss": 0.0265, "step": 5364 }, { "epoch": 2.5027985074626864, "grad_norm": 0.9986872200315227, "learning_rate": 8.138551404143147e-07, "loss": 0.0294, "step": 5366 }, { "epoch": 2.503731343283582, "grad_norm": 0.9005280842153865, "learning_rate": 8.108894185648381e-07, "loss": 0.0252, "step": 5368 }, { "epoch": 2.5046641791044775, "grad_norm": 0.9499952978506555, "learning_rate": 8.079286332759762e-07, "loss": 0.0296, "step": 5370 }, { "epoch": 2.5055970149253732, "grad_norm": 0.9765392005856854, "learning_rate": 8.049727880367969e-07, "loss": 0.0286, "step": 5372 }, { "epoch": 2.5065298507462686, "grad_norm": 0.9917205237607287, "learning_rate": 8.02021886330549e-07, "loss": 0.0316, "step": 5374 }, { "epoch": 2.5074626865671643, "grad_norm": 1.012228541643098, "learning_rate": 7.99075931634653e-07, "loss": 0.025, "step": 5376 }, { "epoch": 2.5083955223880596, "grad_norm": 1.079233208839584, "learning_rate": 7.961349274207014e-07, "loss": 0.0313, "step": 5378 }, { "epoch": 2.5093283582089554, "grad_norm": 0.8942220063497852, "learning_rate": 7.931988771544547e-07, "loss": 0.0287, "step": 5380 }, { "epoch": 2.5102611940298507, "grad_norm": 0.8068043510615183, "learning_rate": 7.902677842958318e-07, "loss": 0.024, "step": 5382 }, { "epoch": 2.5111940298507465, "grad_norm": 0.9965812102549537, "learning_rate": 7.873416522989108e-07, "loss": 0.0284, "step": 5384 }, { "epoch": 2.512126865671642, "grad_norm": 0.841081254464499, "learning_rate": 7.844204846119247e-07, "loss": 0.0294, "step": 5386 }, { "epoch": 2.513059701492537, "grad_norm": 0.9501727571983167, "learning_rate": 7.81504284677258e-07, "loss": 0.0255, "step": 5388 }, { "epoch": 2.513992537313433, "grad_norm": 1.0016727854947534, "learning_rate": 7.785930559314364e-07, "loss": 0.0289, "step": 5390 }, { "epoch": 2.5149253731343286, "grad_norm": 0.8721408729745113, "learning_rate": 7.756868018051323e-07, "loss": 0.0286, "step": 5392 }, { "epoch": 2.515858208955224, "grad_norm": 1.0936890407002346, "learning_rate": 7.727855257231537e-07, "loss": 0.0303, "step": 5394 }, { "epoch": 2.5167910447761193, "grad_norm": 1.014948867589655, "learning_rate": 7.698892311044387e-07, "loss": 0.0293, "step": 5396 }, { "epoch": 2.517723880597015, "grad_norm": 0.9958364273519411, "learning_rate": 7.669979213620643e-07, "loss": 0.0299, "step": 5398 }, { "epoch": 2.5186567164179103, "grad_norm": 1.1617242124661515, "learning_rate": 7.641115999032251e-07, "loss": 0.032, "step": 5400 }, { "epoch": 2.519589552238806, "grad_norm": 0.957098893021318, "learning_rate": 7.61230270129239e-07, "loss": 0.0266, "step": 5402 }, { "epoch": 2.5205223880597014, "grad_norm": 0.9879727516732639, "learning_rate": 7.583539354355445e-07, "loss": 0.0297, "step": 5404 }, { "epoch": 2.521455223880597, "grad_norm": 0.7791493964795234, "learning_rate": 7.554825992116898e-07, "loss": 0.0278, "step": 5406 }, { "epoch": 2.5223880597014925, "grad_norm": 0.9508455925438611, "learning_rate": 7.526162648413354e-07, "loss": 0.0311, "step": 5408 }, { "epoch": 2.523320895522388, "grad_norm": 0.9375269950337454, "learning_rate": 7.497549357022488e-07, "loss": 0.032, "step": 5410 }, { "epoch": 2.5242537313432836, "grad_norm": 0.8181613386415205, "learning_rate": 7.468986151662955e-07, "loss": 0.0248, "step": 5412 }, { "epoch": 2.5251865671641793, "grad_norm": 0.8449332338645831, "learning_rate": 7.440473065994391e-07, "loss": 0.0249, "step": 5414 }, { "epoch": 2.5261194029850746, "grad_norm": 0.8932020081455512, "learning_rate": 7.412010133617415e-07, "loss": 0.027, "step": 5416 }, { "epoch": 2.52705223880597, "grad_norm": 1.2082616553576144, "learning_rate": 7.383597388073482e-07, "loss": 0.0321, "step": 5418 }, { "epoch": 2.5279850746268657, "grad_norm": 0.9138818756713926, "learning_rate": 7.355234862844945e-07, "loss": 0.0311, "step": 5420 }, { "epoch": 2.528917910447761, "grad_norm": 1.0202618923997713, "learning_rate": 7.32692259135499e-07, "loss": 0.0362, "step": 5422 }, { "epoch": 2.529850746268657, "grad_norm": 1.0665120247254412, "learning_rate": 7.298660606967523e-07, "loss": 0.0315, "step": 5424 }, { "epoch": 2.530783582089552, "grad_norm": 1.1352439636722527, "learning_rate": 7.270448942987263e-07, "loss": 0.0337, "step": 5426 }, { "epoch": 2.531716417910448, "grad_norm": 0.865442600561773, "learning_rate": 7.242287632659556e-07, "loss": 0.026, "step": 5428 }, { "epoch": 2.532649253731343, "grad_norm": 0.9410527537956525, "learning_rate": 7.214176709170484e-07, "loss": 0.0284, "step": 5430 }, { "epoch": 2.533582089552239, "grad_norm": 0.8269563301903696, "learning_rate": 7.186116205646687e-07, "loss": 0.0248, "step": 5432 }, { "epoch": 2.5345149253731343, "grad_norm": 1.374109206927514, "learning_rate": 7.158106155155437e-07, "loss": 0.0284, "step": 5434 }, { "epoch": 2.53544776119403, "grad_norm": 0.9714396595595592, "learning_rate": 7.130146590704512e-07, "loss": 0.033, "step": 5436 }, { "epoch": 2.5363805970149254, "grad_norm": 1.076768884681306, "learning_rate": 7.10223754524223e-07, "loss": 0.0331, "step": 5438 }, { "epoch": 2.5373134328358207, "grad_norm": 0.925026652018418, "learning_rate": 7.074379051657366e-07, "loss": 0.0291, "step": 5440 }, { "epoch": 2.5382462686567164, "grad_norm": 0.7852779727368535, "learning_rate": 7.046571142779096e-07, "loss": 0.0257, "step": 5442 }, { "epoch": 2.539179104477612, "grad_norm": 1.054140753661845, "learning_rate": 7.018813851377032e-07, "loss": 0.0307, "step": 5444 }, { "epoch": 2.5401119402985075, "grad_norm": 0.855612003871619, "learning_rate": 6.991107210161102e-07, "loss": 0.0267, "step": 5446 }, { "epoch": 2.541044776119403, "grad_norm": 0.9754413111180732, "learning_rate": 6.96345125178155e-07, "loss": 0.0282, "step": 5448 }, { "epoch": 2.5419776119402986, "grad_norm": 0.9831546455408445, "learning_rate": 6.935846008828906e-07, "loss": 0.033, "step": 5450 }, { "epoch": 2.542910447761194, "grad_norm": 0.9872681523455986, "learning_rate": 6.908291513833948e-07, "loss": 0.0271, "step": 5452 }, { "epoch": 2.5438432835820897, "grad_norm": 1.0047813949957318, "learning_rate": 6.880787799267608e-07, "loss": 0.0293, "step": 5454 }, { "epoch": 2.544776119402985, "grad_norm": 1.1256701558985338, "learning_rate": 6.853334897541031e-07, "loss": 0.0302, "step": 5456 }, { "epoch": 2.5457089552238807, "grad_norm": 0.9508103757308844, "learning_rate": 6.825932841005434e-07, "loss": 0.0265, "step": 5458 }, { "epoch": 2.546641791044776, "grad_norm": 0.9345261118581804, "learning_rate": 6.79858166195212e-07, "loss": 0.0352, "step": 5460 }, { "epoch": 2.5475746268656714, "grad_norm": 0.9236243033974604, "learning_rate": 6.771281392612505e-07, "loss": 0.0231, "step": 5462 }, { "epoch": 2.548507462686567, "grad_norm": 0.8445919648298371, "learning_rate": 6.744032065157929e-07, "loss": 0.0267, "step": 5464 }, { "epoch": 2.549440298507463, "grad_norm": 0.7960534332376328, "learning_rate": 6.716833711699727e-07, "loss": 0.0238, "step": 5466 }, { "epoch": 2.550373134328358, "grad_norm": 1.0426106071680399, "learning_rate": 6.689686364289194e-07, "loss": 0.0286, "step": 5468 }, { "epoch": 2.5513059701492535, "grad_norm": 1.012093584788351, "learning_rate": 6.662590054917467e-07, "loss": 0.0324, "step": 5470 }, { "epoch": 2.5522388059701493, "grad_norm": 0.8610382191725122, "learning_rate": 6.635544815515576e-07, "loss": 0.0254, "step": 5472 }, { "epoch": 2.5531716417910446, "grad_norm": 0.9000569602375049, "learning_rate": 6.608550677954379e-07, "loss": 0.0335, "step": 5474 }, { "epoch": 2.5541044776119404, "grad_norm": 0.9811954480600797, "learning_rate": 6.581607674044466e-07, "loss": 0.0237, "step": 5476 }, { "epoch": 2.5550373134328357, "grad_norm": 0.8697194881029953, "learning_rate": 6.554715835536224e-07, "loss": 0.0292, "step": 5478 }, { "epoch": 2.5559701492537314, "grad_norm": 0.9550648238319489, "learning_rate": 6.527875194119687e-07, "loss": 0.0296, "step": 5480 }, { "epoch": 2.5569029850746268, "grad_norm": 1.0465671636648464, "learning_rate": 6.501085781424621e-07, "loss": 0.0303, "step": 5482 }, { "epoch": 2.5578358208955225, "grad_norm": 1.0142530036315984, "learning_rate": 6.474347629020367e-07, "loss": 0.0337, "step": 5484 }, { "epoch": 2.558768656716418, "grad_norm": 1.0136757509818954, "learning_rate": 6.447660768415897e-07, "loss": 0.0348, "step": 5486 }, { "epoch": 2.5597014925373136, "grad_norm": 0.9556311189905142, "learning_rate": 6.421025231059713e-07, "loss": 0.0315, "step": 5488 }, { "epoch": 2.560634328358209, "grad_norm": 1.147131138974972, "learning_rate": 6.394441048339867e-07, "loss": 0.0344, "step": 5490 }, { "epoch": 2.5615671641791042, "grad_norm": 0.9801129122588047, "learning_rate": 6.367908251583854e-07, "loss": 0.032, "step": 5492 }, { "epoch": 2.5625, "grad_norm": 1.0133405335811954, "learning_rate": 6.341426872058648e-07, "loss": 0.0305, "step": 5494 }, { "epoch": 2.5634328358208958, "grad_norm": 0.8627633534135905, "learning_rate": 6.314996940970624e-07, "loss": 0.026, "step": 5496 }, { "epoch": 2.564365671641791, "grad_norm": 1.0032084741951597, "learning_rate": 6.28861848946552e-07, "loss": 0.0281, "step": 5498 }, { "epoch": 2.5652985074626864, "grad_norm": 0.9378214505319038, "learning_rate": 6.262291548628397e-07, "loss": 0.0324, "step": 5500 }, { "epoch": 2.5652985074626864, "eval_loss": 0.18188245594501495, "eval_runtime": 320.8699, "eval_samples_per_second": 47.515, "eval_steps_per_second": 5.94, "step": 5500 }, { "epoch": 2.566231343283582, "grad_norm": 0.8900669468176743, "learning_rate": 6.236016149483647e-07, "loss": 0.0273, "step": 5502 }, { "epoch": 2.5671641791044775, "grad_norm": 0.9006635215935282, "learning_rate": 6.209792322994912e-07, "loss": 0.0307, "step": 5504 }, { "epoch": 2.5680970149253732, "grad_norm": 0.7531383759987479, "learning_rate": 6.183620100065035e-07, "loss": 0.0291, "step": 5506 }, { "epoch": 2.5690298507462686, "grad_norm": 0.9617374260328377, "learning_rate": 6.157499511536091e-07, "loss": 0.0279, "step": 5508 }, { "epoch": 2.5699626865671643, "grad_norm": 0.9928579984722113, "learning_rate": 6.131430588189275e-07, "loss": 0.0296, "step": 5510 }, { "epoch": 2.5708955223880596, "grad_norm": 0.8946572444097028, "learning_rate": 6.105413360744883e-07, "loss": 0.023, "step": 5512 }, { "epoch": 2.5718283582089554, "grad_norm": 1.0747235948124607, "learning_rate": 6.079447859862353e-07, "loss": 0.0307, "step": 5514 }, { "epoch": 2.5727611940298507, "grad_norm": 0.8143405644267114, "learning_rate": 6.05353411614012e-07, "loss": 0.0266, "step": 5516 }, { "epoch": 2.5736940298507465, "grad_norm": 0.9067389240466966, "learning_rate": 6.027672160115622e-07, "loss": 0.0255, "step": 5518 }, { "epoch": 2.574626865671642, "grad_norm": 0.8681536868285394, "learning_rate": 6.001862022265298e-07, "loss": 0.0328, "step": 5520 }, { "epoch": 2.575559701492537, "grad_norm": 0.9659977467940498, "learning_rate": 5.976103733004501e-07, "loss": 0.027, "step": 5522 }, { "epoch": 2.576492537313433, "grad_norm": 1.0675965024576906, "learning_rate": 5.95039732268749e-07, "loss": 0.0324, "step": 5524 }, { "epoch": 2.5774253731343286, "grad_norm": 0.9736658548517676, "learning_rate": 5.924742821607404e-07, "loss": 0.0283, "step": 5526 }, { "epoch": 2.578358208955224, "grad_norm": 0.9451643634696305, "learning_rate": 5.899140259996183e-07, "loss": 0.025, "step": 5528 }, { "epoch": 2.5792910447761193, "grad_norm": 1.0005795398514292, "learning_rate": 5.873589668024593e-07, "loss": 0.0268, "step": 5530 }, { "epoch": 2.580223880597015, "grad_norm": 0.9091404318354299, "learning_rate": 5.848091075802121e-07, "loss": 0.0264, "step": 5532 }, { "epoch": 2.5811567164179103, "grad_norm": 1.0100657012475414, "learning_rate": 5.82264451337699e-07, "loss": 0.0313, "step": 5534 }, { "epoch": 2.582089552238806, "grad_norm": 1.1476126000293594, "learning_rate": 5.797250010736122e-07, "loss": 0.0335, "step": 5536 }, { "epoch": 2.5830223880597014, "grad_norm": 0.9087013565256767, "learning_rate": 5.771907597805098e-07, "loss": 0.0271, "step": 5538 }, { "epoch": 2.583955223880597, "grad_norm": 0.9520747941296617, "learning_rate": 5.746617304448071e-07, "loss": 0.0287, "step": 5540 }, { "epoch": 2.5848880597014925, "grad_norm": 0.9385766823424762, "learning_rate": 5.721379160467827e-07, "loss": 0.0298, "step": 5542 }, { "epoch": 2.585820895522388, "grad_norm": 0.8307023237191612, "learning_rate": 5.696193195605654e-07, "loss": 0.0259, "step": 5544 }, { "epoch": 2.5867537313432836, "grad_norm": 0.9211507876184712, "learning_rate": 5.671059439541383e-07, "loss": 0.0305, "step": 5546 }, { "epoch": 2.5876865671641793, "grad_norm": 0.846567556390467, "learning_rate": 5.645977921893308e-07, "loss": 0.0273, "step": 5548 }, { "epoch": 2.5886194029850746, "grad_norm": 0.994620831670817, "learning_rate": 5.620948672218169e-07, "loss": 0.0301, "step": 5550 }, { "epoch": 2.58955223880597, "grad_norm": 0.9796801084651251, "learning_rate": 5.59597172001109e-07, "loss": 0.0272, "step": 5552 }, { "epoch": 2.5904850746268657, "grad_norm": 0.8391949064190108, "learning_rate": 5.57104709470559e-07, "loss": 0.0256, "step": 5554 }, { "epoch": 2.591417910447761, "grad_norm": 0.8616962242798509, "learning_rate": 5.546174825673528e-07, "loss": 0.0253, "step": 5556 }, { "epoch": 2.592350746268657, "grad_norm": 1.0133282243772208, "learning_rate": 5.521354942225043e-07, "loss": 0.0295, "step": 5558 }, { "epoch": 2.593283582089552, "grad_norm": 0.9944881514774758, "learning_rate": 5.496587473608572e-07, "loss": 0.0315, "step": 5560 }, { "epoch": 2.594216417910448, "grad_norm": 0.9006894177917104, "learning_rate": 5.471872449010752e-07, "loss": 0.0287, "step": 5562 }, { "epoch": 2.595149253731343, "grad_norm": 0.9840676939981822, "learning_rate": 5.44720989755641e-07, "loss": 0.0325, "step": 5564 }, { "epoch": 2.596082089552239, "grad_norm": 1.0492334400722652, "learning_rate": 5.422599848308602e-07, "loss": 0.0334, "step": 5566 }, { "epoch": 2.5970149253731343, "grad_norm": 1.0202829525957806, "learning_rate": 5.398042330268461e-07, "loss": 0.0345, "step": 5568 }, { "epoch": 2.59794776119403, "grad_norm": 0.8092232677751454, "learning_rate": 5.373537372375209e-07, "loss": 0.0278, "step": 5570 }, { "epoch": 2.5988805970149254, "grad_norm": 1.1415943323729512, "learning_rate": 5.349085003506166e-07, "loss": 0.0319, "step": 5572 }, { "epoch": 2.5998134328358207, "grad_norm": 1.0558290186703099, "learning_rate": 5.324685252476647e-07, "loss": 0.0352, "step": 5574 }, { "epoch": 2.6007462686567164, "grad_norm": 0.9278735390408485, "learning_rate": 5.300338148039979e-07, "loss": 0.0279, "step": 5576 }, { "epoch": 2.601679104477612, "grad_norm": 0.8822870846249138, "learning_rate": 5.276043718887464e-07, "loss": 0.0278, "step": 5578 }, { "epoch": 2.6026119402985075, "grad_norm": 0.9867527231063329, "learning_rate": 5.251801993648281e-07, "loss": 0.0294, "step": 5580 }, { "epoch": 2.603544776119403, "grad_norm": 0.936868363790127, "learning_rate": 5.227613000889558e-07, "loss": 0.0275, "step": 5582 }, { "epoch": 2.6044776119402986, "grad_norm": 0.9319660667976284, "learning_rate": 5.203476769116239e-07, "loss": 0.0253, "step": 5584 }, { "epoch": 2.605410447761194, "grad_norm": 0.9204835657360955, "learning_rate": 5.179393326771104e-07, "loss": 0.0284, "step": 5586 }, { "epoch": 2.6063432835820897, "grad_norm": 0.7635562770824101, "learning_rate": 5.15536270223474e-07, "loss": 0.0264, "step": 5588 }, { "epoch": 2.607276119402985, "grad_norm": 0.7592415906330424, "learning_rate": 5.131384923825489e-07, "loss": 0.0256, "step": 5590 }, { "epoch": 2.6082089552238807, "grad_norm": 0.8305660864451233, "learning_rate": 5.107460019799387e-07, "loss": 0.0241, "step": 5592 }, { "epoch": 2.609141791044776, "grad_norm": 0.9241178513054383, "learning_rate": 5.083588018350211e-07, "loss": 0.0271, "step": 5594 }, { "epoch": 2.6100746268656714, "grad_norm": 0.897370132959456, "learning_rate": 5.059768947609345e-07, "loss": 0.0275, "step": 5596 }, { "epoch": 2.611007462686567, "grad_norm": 0.9744646780096524, "learning_rate": 5.036002835645837e-07, "loss": 0.0295, "step": 5598 }, { "epoch": 2.611940298507463, "grad_norm": 1.0062160171716794, "learning_rate": 5.012289710466317e-07, "loss": 0.025, "step": 5600 }, { "epoch": 2.612873134328358, "grad_norm": 1.055927155622184, "learning_rate": 4.988629600014966e-07, "loss": 0.0274, "step": 5602 }, { "epoch": 2.6138059701492535, "grad_norm": 0.837625553907277, "learning_rate": 4.96502253217348e-07, "loss": 0.0288, "step": 5604 }, { "epoch": 2.6147388059701493, "grad_norm": 0.9584577611531877, "learning_rate": 4.941468534761074e-07, "loss": 0.0285, "step": 5606 }, { "epoch": 2.6156716417910446, "grad_norm": 0.9342871100256482, "learning_rate": 4.917967635534421e-07, "loss": 0.0299, "step": 5608 }, { "epoch": 2.6166044776119404, "grad_norm": 0.9819128727842276, "learning_rate": 4.894519862187596e-07, "loss": 0.0271, "step": 5610 }, { "epoch": 2.6175373134328357, "grad_norm": 0.7754444347557119, "learning_rate": 4.87112524235211e-07, "loss": 0.0241, "step": 5612 }, { "epoch": 2.6184701492537314, "grad_norm": 0.9015445051832793, "learning_rate": 4.847783803596789e-07, "loss": 0.0251, "step": 5614 }, { "epoch": 2.6194029850746268, "grad_norm": 0.8595223243471014, "learning_rate": 4.824495573427818e-07, "loss": 0.0255, "step": 5616 }, { "epoch": 2.6203358208955225, "grad_norm": 0.9052102415236957, "learning_rate": 4.801260579288669e-07, "loss": 0.0274, "step": 5618 }, { "epoch": 2.621268656716418, "grad_norm": 1.219194280438804, "learning_rate": 4.778078848560108e-07, "loss": 0.0253, "step": 5620 }, { "epoch": 2.6222014925373136, "grad_norm": 0.9275939908539204, "learning_rate": 4.7549504085600773e-07, "loss": 0.0309, "step": 5622 }, { "epoch": 2.623134328358209, "grad_norm": 1.0371372383715778, "learning_rate": 4.731875286543786e-07, "loss": 0.0308, "step": 5624 }, { "epoch": 2.6240671641791042, "grad_norm": 0.886916317382346, "learning_rate": 4.7088535097035483e-07, "loss": 0.0253, "step": 5626 }, { "epoch": 2.625, "grad_norm": 0.9554220624448035, "learning_rate": 4.685885105168864e-07, "loss": 0.0282, "step": 5628 }, { "epoch": 2.6259328358208958, "grad_norm": 0.8886167057050335, "learning_rate": 4.66297010000632e-07, "loss": 0.0328, "step": 5630 }, { "epoch": 2.626865671641791, "grad_norm": 0.9304815536749925, "learning_rate": 4.6401085212195607e-07, "loss": 0.0263, "step": 5632 }, { "epoch": 2.6277985074626864, "grad_norm": 1.075769767251724, "learning_rate": 4.6173003957493026e-07, "loss": 0.0288, "step": 5634 }, { "epoch": 2.628731343283582, "grad_norm": 0.9417260104752829, "learning_rate": 4.594545750473245e-07, "loss": 0.0243, "step": 5636 }, { "epoch": 2.6296641791044775, "grad_norm": 0.9669856754306663, "learning_rate": 4.5718446122060666e-07, "loss": 0.0254, "step": 5638 }, { "epoch": 2.6305970149253732, "grad_norm": 1.0714829385244788, "learning_rate": 4.5491970076994074e-07, "loss": 0.031, "step": 5640 }, { "epoch": 2.6315298507462686, "grad_norm": 1.2512403663477403, "learning_rate": 4.526602963641824e-07, "loss": 0.0312, "step": 5642 }, { "epoch": 2.6324626865671643, "grad_norm": 0.9466518590437843, "learning_rate": 4.504062506658724e-07, "loss": 0.0261, "step": 5644 }, { "epoch": 2.6333955223880596, "grad_norm": 0.8880752182018787, "learning_rate": 4.481575663312415e-07, "loss": 0.0278, "step": 5646 }, { "epoch": 2.6343283582089554, "grad_norm": 0.9454381484970772, "learning_rate": 4.4591424601019674e-07, "loss": 0.0278, "step": 5648 }, { "epoch": 2.6352611940298507, "grad_norm": 0.8350721440652583, "learning_rate": 4.436762923463295e-07, "loss": 0.0243, "step": 5650 }, { "epoch": 2.6361940298507465, "grad_norm": 1.0080469270429602, "learning_rate": 4.414437079769046e-07, "loss": 0.0304, "step": 5652 }, { "epoch": 2.637126865671642, "grad_norm": 1.0557127886720743, "learning_rate": 4.392164955328582e-07, "loss": 0.0331, "step": 5654 }, { "epoch": 2.638059701492537, "grad_norm": 0.9324378923778945, "learning_rate": 4.369946576387979e-07, "loss": 0.03, "step": 5656 }, { "epoch": 2.638992537313433, "grad_norm": 0.9421771251974606, "learning_rate": 4.347781969129977e-07, "loss": 0.0275, "step": 5658 }, { "epoch": 2.6399253731343286, "grad_norm": 0.8141898581331491, "learning_rate": 4.325671159673933e-07, "loss": 0.0275, "step": 5660 }, { "epoch": 2.640858208955224, "grad_norm": 0.9178761025153416, "learning_rate": 4.303614174075826e-07, "loss": 0.0324, "step": 5662 }, { "epoch": 2.6417910447761193, "grad_norm": 0.9662115133527521, "learning_rate": 4.281611038328215e-07, "loss": 0.0267, "step": 5664 }, { "epoch": 2.642723880597015, "grad_norm": 0.8822810881757057, "learning_rate": 4.2596617783601744e-07, "loss": 0.0276, "step": 5666 }, { "epoch": 2.6436567164179103, "grad_norm": 1.030215439678188, "learning_rate": 4.2377664200372927e-07, "loss": 0.0288, "step": 5668 }, { "epoch": 2.644589552238806, "grad_norm": 1.0104314700113746, "learning_rate": 4.2159249891616626e-07, "loss": 0.03, "step": 5670 }, { "epoch": 2.6455223880597014, "grad_norm": 1.0056294457362005, "learning_rate": 4.194137511471824e-07, "loss": 0.0286, "step": 5672 }, { "epoch": 2.646455223880597, "grad_norm": 1.0073607104422846, "learning_rate": 4.1724040126427e-07, "loss": 0.0274, "step": 5674 }, { "epoch": 2.6473880597014925, "grad_norm": 0.9770224547966474, "learning_rate": 4.150724518285659e-07, "loss": 0.0271, "step": 5676 }, { "epoch": 2.648320895522388, "grad_norm": 0.9416539715365718, "learning_rate": 4.1290990539483767e-07, "loss": 0.0288, "step": 5678 }, { "epoch": 2.6492537313432836, "grad_norm": 0.8329335425143416, "learning_rate": 4.107527645114889e-07, "loss": 0.0256, "step": 5680 }, { "epoch": 2.6501865671641793, "grad_norm": 1.1138931507107372, "learning_rate": 4.0860103172055354e-07, "loss": 0.0374, "step": 5682 }, { "epoch": 2.6511194029850746, "grad_norm": 0.9365299677312837, "learning_rate": 4.064547095576904e-07, "loss": 0.0302, "step": 5684 }, { "epoch": 2.65205223880597, "grad_norm": 0.9889780175992751, "learning_rate": 4.0431380055218297e-07, "loss": 0.0279, "step": 5686 }, { "epoch": 2.6529850746268657, "grad_norm": 1.1592467281919736, "learning_rate": 4.02178307226937e-07, "loss": 0.0284, "step": 5688 }, { "epoch": 2.653917910447761, "grad_norm": 1.028022907074052, "learning_rate": 4.0004823209847386e-07, "loss": 0.0292, "step": 5690 }, { "epoch": 2.654850746268657, "grad_norm": 0.9199005290867535, "learning_rate": 3.9792357767693244e-07, "loss": 0.0274, "step": 5692 }, { "epoch": 2.655783582089552, "grad_norm": 0.9100899857863779, "learning_rate": 3.958043464660638e-07, "loss": 0.0257, "step": 5694 }, { "epoch": 2.656716417910448, "grad_norm": 1.0666340677971675, "learning_rate": 3.9369054096322414e-07, "loss": 0.0299, "step": 5696 }, { "epoch": 2.657649253731343, "grad_norm": 0.8878129132710791, "learning_rate": 3.9158216365938193e-07, "loss": 0.0186, "step": 5698 }, { "epoch": 2.658582089552239, "grad_norm": 1.0480066675487099, "learning_rate": 3.8947921703910374e-07, "loss": 0.0294, "step": 5700 }, { "epoch": 2.6595149253731343, "grad_norm": 0.841133672918763, "learning_rate": 3.873817035805572e-07, "loss": 0.0292, "step": 5702 }, { "epoch": 2.66044776119403, "grad_norm": 0.9827002199151351, "learning_rate": 3.8528962575551167e-07, "loss": 0.0295, "step": 5704 }, { "epoch": 2.6613805970149254, "grad_norm": 0.7956684171826566, "learning_rate": 3.8320298602932626e-07, "loss": 0.0257, "step": 5706 }, { "epoch": 2.6623134328358207, "grad_norm": 0.8299638245728805, "learning_rate": 3.811217868609535e-07, "loss": 0.0231, "step": 5708 }, { "epoch": 2.6632462686567164, "grad_norm": 0.8740361430812925, "learning_rate": 3.790460307029348e-07, "loss": 0.0271, "step": 5710 }, { "epoch": 2.664179104477612, "grad_norm": 1.035152612295088, "learning_rate": 3.7697572000139624e-07, "loss": 0.0289, "step": 5712 }, { "epoch": 2.6651119402985075, "grad_norm": 1.040699866935444, "learning_rate": 3.7491085719604805e-07, "loss": 0.029, "step": 5714 }, { "epoch": 2.666044776119403, "grad_norm": 0.857941292118624, "learning_rate": 3.728514447201814e-07, "loss": 0.0288, "step": 5716 }, { "epoch": 2.6669776119402986, "grad_norm": 0.8676768629391651, "learning_rate": 3.707974850006624e-07, "loss": 0.0285, "step": 5718 }, { "epoch": 2.667910447761194, "grad_norm": 0.8954866462200547, "learning_rate": 3.6874898045793086e-07, "loss": 0.0248, "step": 5720 }, { "epoch": 2.6688432835820897, "grad_norm": 0.8895165404260262, "learning_rate": 3.667059335060014e-07, "loss": 0.0288, "step": 5722 }, { "epoch": 2.669776119402985, "grad_norm": 1.0408571334135006, "learning_rate": 3.646683465524564e-07, "loss": 0.0308, "step": 5724 }, { "epoch": 2.6707089552238807, "grad_norm": 1.0023749200849796, "learning_rate": 3.6263622199844085e-07, "loss": 0.0276, "step": 5726 }, { "epoch": 2.671641791044776, "grad_norm": 0.9015517833503137, "learning_rate": 3.6060956223866683e-07, "loss": 0.0275, "step": 5728 }, { "epoch": 2.6725746268656714, "grad_norm": 0.9523846371790706, "learning_rate": 3.5858836966140345e-07, "loss": 0.0289, "step": 5730 }, { "epoch": 2.673507462686567, "grad_norm": 0.9897606811173157, "learning_rate": 3.565726466484798e-07, "loss": 0.028, "step": 5732 }, { "epoch": 2.674440298507463, "grad_norm": 0.8536692203456191, "learning_rate": 3.5456239557527585e-07, "loss": 0.0275, "step": 5734 }, { "epoch": 2.675373134328358, "grad_norm": 1.026573977594397, "learning_rate": 3.5255761881072823e-07, "loss": 0.0274, "step": 5736 }, { "epoch": 2.6763059701492535, "grad_norm": 1.136749269539635, "learning_rate": 3.505583187173178e-07, "loss": 0.0312, "step": 5738 }, { "epoch": 2.6772388059701493, "grad_norm": 0.9466562456545305, "learning_rate": 3.485644976510755e-07, "loss": 0.03, "step": 5740 }, { "epoch": 2.6781716417910446, "grad_norm": 1.0678576679932985, "learning_rate": 3.465761579615712e-07, "loss": 0.0326, "step": 5742 }, { "epoch": 2.6791044776119404, "grad_norm": 0.8681817771137303, "learning_rate": 3.445933019919195e-07, "loss": 0.0245, "step": 5744 }, { "epoch": 2.6800373134328357, "grad_norm": 0.8061913529013403, "learning_rate": 3.42615932078772e-07, "loss": 0.0266, "step": 5746 }, { "epoch": 2.6809701492537314, "grad_norm": 0.9215643445592973, "learning_rate": 3.406440505523123e-07, "loss": 0.0292, "step": 5748 }, { "epoch": 2.6819029850746268, "grad_norm": 0.9952871439157782, "learning_rate": 3.386776597362612e-07, "loss": 0.0301, "step": 5750 }, { "epoch": 2.6828358208955225, "grad_norm": 0.7926829149673023, "learning_rate": 3.367167619478651e-07, "loss": 0.0235, "step": 5752 }, { "epoch": 2.683768656716418, "grad_norm": 0.9451852918480704, "learning_rate": 3.347613594978971e-07, "loss": 0.0255, "step": 5754 }, { "epoch": 2.6847014925373136, "grad_norm": 0.8259137510974414, "learning_rate": 3.3281145469065913e-07, "loss": 0.0265, "step": 5756 }, { "epoch": 2.685634328358209, "grad_norm": 0.8248583370521401, "learning_rate": 3.3086704982397077e-07, "loss": 0.0256, "step": 5758 }, { "epoch": 2.6865671641791042, "grad_norm": 1.002464695024011, "learning_rate": 3.289281471891692e-07, "loss": 0.0275, "step": 5760 }, { "epoch": 2.6875, "grad_norm": 0.9291674809789717, "learning_rate": 3.269947490711117e-07, "loss": 0.0255, "step": 5762 }, { "epoch": 2.6884328358208958, "grad_norm": 1.0062847946924403, "learning_rate": 3.2506685774816527e-07, "loss": 0.031, "step": 5764 }, { "epoch": 2.689365671641791, "grad_norm": 0.8845161324267664, "learning_rate": 3.231444754922086e-07, "loss": 0.0283, "step": 5766 }, { "epoch": 2.6902985074626864, "grad_norm": 1.1501011277586621, "learning_rate": 3.2122760456863023e-07, "loss": 0.0277, "step": 5768 }, { "epoch": 2.691231343283582, "grad_norm": 0.923500915565738, "learning_rate": 3.1931624723632147e-07, "loss": 0.0281, "step": 5770 }, { "epoch": 2.6921641791044775, "grad_norm": 1.0121932562794196, "learning_rate": 3.174104057476768e-07, "loss": 0.0287, "step": 5772 }, { "epoch": 2.6930970149253732, "grad_norm": 0.8451398328503532, "learning_rate": 3.1551008234859236e-07, "loss": 0.0256, "step": 5774 }, { "epoch": 2.6940298507462686, "grad_norm": 0.8868264662556339, "learning_rate": 3.136152792784586e-07, "loss": 0.0311, "step": 5776 }, { "epoch": 2.6949626865671643, "grad_norm": 1.132609349725213, "learning_rate": 3.1172599877016316e-07, "loss": 0.0304, "step": 5778 }, { "epoch": 2.6958955223880596, "grad_norm": 0.9300362845643634, "learning_rate": 3.098422430500864e-07, "loss": 0.0344, "step": 5780 }, { "epoch": 2.6968283582089554, "grad_norm": 0.9961563897889548, "learning_rate": 3.0796401433809465e-07, "loss": 0.028, "step": 5782 }, { "epoch": 2.6977611940298507, "grad_norm": 0.9169652584714555, "learning_rate": 3.060913148475453e-07, "loss": 0.0272, "step": 5784 }, { "epoch": 2.6986940298507465, "grad_norm": 0.8041017569702099, "learning_rate": 3.0422414678527526e-07, "loss": 0.0266, "step": 5786 }, { "epoch": 2.699626865671642, "grad_norm": 0.7222524826917436, "learning_rate": 3.0236251235160827e-07, "loss": 0.0225, "step": 5788 }, { "epoch": 2.700559701492537, "grad_norm": 0.9999139168019376, "learning_rate": 3.005064137403424e-07, "loss": 0.0284, "step": 5790 }, { "epoch": 2.701492537313433, "grad_norm": 0.9201166365049777, "learning_rate": 2.986558531387557e-07, "loss": 0.0243, "step": 5792 }, { "epoch": 2.7024253731343286, "grad_norm": 1.0336207789711631, "learning_rate": 2.9681083272759645e-07, "loss": 0.0296, "step": 5794 }, { "epoch": 2.703358208955224, "grad_norm": 0.9861828294752073, "learning_rate": 2.949713546810884e-07, "loss": 0.0277, "step": 5796 }, { "epoch": 2.7042910447761193, "grad_norm": 0.8417513599421644, "learning_rate": 2.931374211669219e-07, "loss": 0.0263, "step": 5798 }, { "epoch": 2.705223880597015, "grad_norm": 1.1332062360436599, "learning_rate": 2.913090343462516e-07, "loss": 0.0313, "step": 5800 }, { "epoch": 2.7061567164179103, "grad_norm": 0.9197858394528459, "learning_rate": 2.8948619637370056e-07, "loss": 0.0331, "step": 5802 }, { "epoch": 2.707089552238806, "grad_norm": 1.2269001374096953, "learning_rate": 2.876689093973484e-07, "loss": 0.0295, "step": 5804 }, { "epoch": 2.7080223880597014, "grad_norm": 0.9099957401660649, "learning_rate": 2.8585717555873307e-07, "loss": 0.031, "step": 5806 }, { "epoch": 2.708955223880597, "grad_norm": 0.8969373627355374, "learning_rate": 2.8405099699285456e-07, "loss": 0.0293, "step": 5808 }, { "epoch": 2.7098880597014925, "grad_norm": 1.072333792921784, "learning_rate": 2.8225037582816027e-07, "loss": 0.0332, "step": 5810 }, { "epoch": 2.710820895522388, "grad_norm": 0.8388134088824518, "learning_rate": 2.804553141865496e-07, "loss": 0.029, "step": 5812 }, { "epoch": 2.7117537313432836, "grad_norm": 1.0705158139568056, "learning_rate": 2.786658141833737e-07, "loss": 0.0342, "step": 5814 }, { "epoch": 2.7126865671641793, "grad_norm": 0.8876563936002391, "learning_rate": 2.768818779274263e-07, "loss": 0.0275, "step": 5816 }, { "epoch": 2.7136194029850746, "grad_norm": 0.8903356890804164, "learning_rate": 2.7510350752094404e-07, "loss": 0.0258, "step": 5818 }, { "epoch": 2.71455223880597, "grad_norm": 0.8764898589140518, "learning_rate": 2.7333070505961014e-07, "loss": 0.0258, "step": 5820 }, { "epoch": 2.7154850746268657, "grad_norm": 0.9665390589158259, "learning_rate": 2.7156347263254057e-07, "loss": 0.0286, "step": 5822 }, { "epoch": 2.716417910447761, "grad_norm": 0.8531868203598048, "learning_rate": 2.6980181232228953e-07, "loss": 0.0289, "step": 5824 }, { "epoch": 2.717350746268657, "grad_norm": 1.1495247812983556, "learning_rate": 2.680457262048458e-07, "loss": 0.0306, "step": 5826 }, { "epoch": 2.718283582089552, "grad_norm": 1.123839315238744, "learning_rate": 2.662952163496274e-07, "loss": 0.0303, "step": 5828 }, { "epoch": 2.719216417910448, "grad_norm": 0.7964731534176683, "learning_rate": 2.645502848194831e-07, "loss": 0.0225, "step": 5830 }, { "epoch": 2.720149253731343, "grad_norm": 1.069790484465966, "learning_rate": 2.628109336706874e-07, "loss": 0.029, "step": 5832 }, { "epoch": 2.721082089552239, "grad_norm": 1.021373389027856, "learning_rate": 2.61077164952937e-07, "loss": 0.0249, "step": 5834 }, { "epoch": 2.7220149253731343, "grad_norm": 0.937542896119991, "learning_rate": 2.593489807093536e-07, "loss": 0.0304, "step": 5836 }, { "epoch": 2.72294776119403, "grad_norm": 0.9098111222144343, "learning_rate": 2.5762638297647416e-07, "loss": 0.026, "step": 5838 }, { "epoch": 2.7238805970149254, "grad_norm": 0.9242261549967771, "learning_rate": 2.559093737842561e-07, "loss": 0.0307, "step": 5840 }, { "epoch": 2.7248134328358207, "grad_norm": 0.9992330153262171, "learning_rate": 2.541979551560669e-07, "loss": 0.0279, "step": 5842 }, { "epoch": 2.7257462686567164, "grad_norm": 0.8755327544571123, "learning_rate": 2.524921291086907e-07, "loss": 0.0226, "step": 5844 }, { "epoch": 2.726679104477612, "grad_norm": 1.0727272877606369, "learning_rate": 2.5079189765231716e-07, "loss": 0.0301, "step": 5846 }, { "epoch": 2.7276119402985075, "grad_norm": 0.9367220868135965, "learning_rate": 2.4909726279054527e-07, "loss": 0.028, "step": 5848 }, { "epoch": 2.728544776119403, "grad_norm": 1.083830164979458, "learning_rate": 2.4740822652037865e-07, "loss": 0.0301, "step": 5850 }, { "epoch": 2.7294776119402986, "grad_norm": 1.1236279986244866, "learning_rate": 2.4572479083222243e-07, "loss": 0.0291, "step": 5852 }, { "epoch": 2.730410447761194, "grad_norm": 0.7827245639762057, "learning_rate": 2.4404695770988364e-07, "loss": 0.0218, "step": 5854 }, { "epoch": 2.7313432835820897, "grad_norm": 0.9827612668764716, "learning_rate": 2.42374729130565e-07, "loss": 0.0281, "step": 5856 }, { "epoch": 2.732276119402985, "grad_norm": 1.0807720694649268, "learning_rate": 2.4070810706486536e-07, "loss": 0.0291, "step": 5858 }, { "epoch": 2.7332089552238807, "grad_norm": 0.9403172056503358, "learning_rate": 2.39047093476778e-07, "loss": 0.0278, "step": 5860 }, { "epoch": 2.734141791044776, "grad_norm": 0.963183647905619, "learning_rate": 2.373916903236856e-07, "loss": 0.0305, "step": 5862 }, { "epoch": 2.7350746268656714, "grad_norm": 0.9178054620047662, "learning_rate": 2.357418995563593e-07, "loss": 0.0292, "step": 5864 }, { "epoch": 2.736007462686567, "grad_norm": 0.8718140353571424, "learning_rate": 2.340977231189584e-07, "loss": 0.0305, "step": 5866 }, { "epoch": 2.736940298507463, "grad_norm": 0.9119854800983569, "learning_rate": 2.3245916294902306e-07, "loss": 0.0284, "step": 5868 }, { "epoch": 2.737873134328358, "grad_norm": 0.8672871707764913, "learning_rate": 2.3082622097747643e-07, "loss": 0.0238, "step": 5870 }, { "epoch": 2.7388059701492535, "grad_norm": 1.0926101102506087, "learning_rate": 2.2919889912862313e-07, "loss": 0.0267, "step": 5872 }, { "epoch": 2.7397388059701493, "grad_norm": 0.9671659616597236, "learning_rate": 2.2757719932014199e-07, "loss": 0.0278, "step": 5874 }, { "epoch": 2.7406716417910446, "grad_norm": 0.8876914643257722, "learning_rate": 2.259611234630865e-07, "loss": 0.0271, "step": 5876 }, { "epoch": 2.7416044776119404, "grad_norm": 0.9926268794456025, "learning_rate": 2.243506734618861e-07, "loss": 0.0314, "step": 5878 }, { "epoch": 2.7425373134328357, "grad_norm": 1.036129350284489, "learning_rate": 2.2274585121433712e-07, "loss": 0.031, "step": 5880 }, { "epoch": 2.7434701492537314, "grad_norm": 0.9216796468280406, "learning_rate": 2.211466586116051e-07, "loss": 0.0262, "step": 5882 }, { "epoch": 2.7444029850746268, "grad_norm": 1.105867185044268, "learning_rate": 2.1955309753822262e-07, "loss": 0.0273, "step": 5884 }, { "epoch": 2.7453358208955225, "grad_norm": 0.9475958664820686, "learning_rate": 2.1796516987208361e-07, "loss": 0.0283, "step": 5886 }, { "epoch": 2.746268656716418, "grad_norm": 0.95920474645987, "learning_rate": 2.1638287748444675e-07, "loss": 0.0292, "step": 5888 }, { "epoch": 2.7472014925373136, "grad_norm": 0.8738107936054988, "learning_rate": 2.148062222399261e-07, "loss": 0.0245, "step": 5890 }, { "epoch": 2.748134328358209, "grad_norm": 0.9399184144653854, "learning_rate": 2.1323520599649484e-07, "loss": 0.0279, "step": 5892 }, { "epoch": 2.7490671641791042, "grad_norm": 0.8915402456411565, "learning_rate": 2.1166983060548097e-07, "loss": 0.0312, "step": 5894 }, { "epoch": 2.75, "grad_norm": 0.9847475058229502, "learning_rate": 2.101100979115661e-07, "loss": 0.0339, "step": 5896 }, { "epoch": 2.7509328358208958, "grad_norm": 1.1775427811502297, "learning_rate": 2.0855600975277945e-07, "loss": 0.0284, "step": 5898 }, { "epoch": 2.751865671641791, "grad_norm": 0.8642503660649238, "learning_rate": 2.0700756796050213e-07, "loss": 0.0273, "step": 5900 }, { "epoch": 2.7527985074626864, "grad_norm": 0.9802969737371192, "learning_rate": 2.0546477435945733e-07, "loss": 0.0266, "step": 5902 }, { "epoch": 2.753731343283582, "grad_norm": 0.9502542802301004, "learning_rate": 2.0392763076771626e-07, "loss": 0.028, "step": 5904 }, { "epoch": 2.7546641791044775, "grad_norm": 0.8383762876328574, "learning_rate": 2.0239613899669052e-07, "loss": 0.0235, "step": 5906 }, { "epoch": 2.7555970149253732, "grad_norm": 0.9390110842420307, "learning_rate": 2.0087030085113034e-07, "loss": 0.0282, "step": 5908 }, { "epoch": 2.7565298507462686, "grad_norm": 1.0505544986084194, "learning_rate": 1.9935011812912408e-07, "loss": 0.0306, "step": 5910 }, { "epoch": 2.7574626865671643, "grad_norm": 0.8970754307861333, "learning_rate": 1.978355926220965e-07, "loss": 0.0243, "step": 5912 }, { "epoch": 2.7583955223880596, "grad_norm": 0.9528293896094844, "learning_rate": 1.9632672611480607e-07, "loss": 0.0283, "step": 5914 }, { "epoch": 2.7593283582089554, "grad_norm": 1.0440224674452707, "learning_rate": 1.948235203853399e-07, "loss": 0.0268, "step": 5916 }, { "epoch": 2.7602611940298507, "grad_norm": 0.882354315586926, "learning_rate": 1.933259772051177e-07, "loss": 0.0241, "step": 5918 }, { "epoch": 2.7611940298507465, "grad_norm": 0.9615225185117673, "learning_rate": 1.918340983388839e-07, "loss": 0.0265, "step": 5920 }, { "epoch": 2.762126865671642, "grad_norm": 0.8003798033632283, "learning_rate": 1.9034788554470718e-07, "loss": 0.0238, "step": 5922 }, { "epoch": 2.763059701492537, "grad_norm": 1.2083306229143707, "learning_rate": 1.888673405739838e-07, "loss": 0.03, "step": 5924 }, { "epoch": 2.763992537313433, "grad_norm": 1.196171660226283, "learning_rate": 1.873924651714265e-07, "loss": 0.0301, "step": 5926 }, { "epoch": 2.7649253731343286, "grad_norm": 0.8162603525073673, "learning_rate": 1.859232610750672e-07, "loss": 0.0264, "step": 5928 }, { "epoch": 2.765858208955224, "grad_norm": 0.9070875234636459, "learning_rate": 1.844597300162565e-07, "loss": 0.0289, "step": 5930 }, { "epoch": 2.7667910447761193, "grad_norm": 0.917794530772954, "learning_rate": 1.8300187371965762e-07, "loss": 0.0293, "step": 5932 }, { "epoch": 2.767723880597015, "grad_norm": 0.984817465761431, "learning_rate": 1.8154969390324905e-07, "loss": 0.0274, "step": 5934 }, { "epoch": 2.7686567164179103, "grad_norm": 0.9713453699788799, "learning_rate": 1.8010319227831808e-07, "loss": 0.0258, "step": 5936 }, { "epoch": 2.769589552238806, "grad_norm": 0.9728200254285655, "learning_rate": 1.7866237054946168e-07, "loss": 0.0269, "step": 5938 }, { "epoch": 2.7705223880597014, "grad_norm": 1.1102464322971721, "learning_rate": 1.772272304145811e-07, "loss": 0.0291, "step": 5940 }, { "epoch": 2.771455223880597, "grad_norm": 0.9314621692224927, "learning_rate": 1.7579777356488637e-07, "loss": 0.0233, "step": 5942 }, { "epoch": 2.7723880597014925, "grad_norm": 0.9343419601411406, "learning_rate": 1.7437400168488604e-07, "loss": 0.0267, "step": 5944 }, { "epoch": 2.773320895522388, "grad_norm": 0.88850369106571, "learning_rate": 1.7295591645239195e-07, "loss": 0.0245, "step": 5946 }, { "epoch": 2.7742537313432836, "grad_norm": 0.9150195040244912, "learning_rate": 1.7154351953851456e-07, "loss": 0.0287, "step": 5948 }, { "epoch": 2.7751865671641793, "grad_norm": 0.8471315847351008, "learning_rate": 1.7013681260765912e-07, "loss": 0.0247, "step": 5950 }, { "epoch": 2.7761194029850746, "grad_norm": 0.8248877102500325, "learning_rate": 1.6873579731752797e-07, "loss": 0.0267, "step": 5952 }, { "epoch": 2.77705223880597, "grad_norm": 0.8266015763287755, "learning_rate": 1.6734047531911436e-07, "loss": 0.0245, "step": 5954 }, { "epoch": 2.7779850746268657, "grad_norm": 0.8942446725228442, "learning_rate": 1.6595084825670403e-07, "loss": 0.028, "step": 5956 }, { "epoch": 2.778917910447761, "grad_norm": 1.0361421543357303, "learning_rate": 1.6456691776787103e-07, "loss": 0.0263, "step": 5958 }, { "epoch": 2.779850746268657, "grad_norm": 0.95816275670667, "learning_rate": 1.6318868548347578e-07, "loss": 0.0318, "step": 5960 }, { "epoch": 2.780783582089552, "grad_norm": 1.0419289710700441, "learning_rate": 1.618161530276635e-07, "loss": 0.0292, "step": 5962 }, { "epoch": 2.781716417910448, "grad_norm": 0.9671095594028484, "learning_rate": 1.604493220178649e-07, "loss": 0.0289, "step": 5964 }, { "epoch": 2.782649253731343, "grad_norm": 0.8723906868587412, "learning_rate": 1.590881940647898e-07, "loss": 0.028, "step": 5966 }, { "epoch": 2.783582089552239, "grad_norm": 1.0100792425284035, "learning_rate": 1.5773277077242744e-07, "loss": 0.0301, "step": 5968 }, { "epoch": 2.7845149253731343, "grad_norm": 1.094048809300156, "learning_rate": 1.5638305373804618e-07, "loss": 0.0313, "step": 5970 }, { "epoch": 2.78544776119403, "grad_norm": 0.986166343643483, "learning_rate": 1.550390445521882e-07, "loss": 0.0252, "step": 5972 }, { "epoch": 2.7863805970149254, "grad_norm": 0.9034236201858159, "learning_rate": 1.537007447986699e-07, "loss": 0.0246, "step": 5974 }, { "epoch": 2.7873134328358207, "grad_norm": 0.8889417799041324, "learning_rate": 1.5236815605457977e-07, "loss": 0.0324, "step": 5976 }, { "epoch": 2.7882462686567164, "grad_norm": 0.7915261215048099, "learning_rate": 1.5104127989027661e-07, "loss": 0.0239, "step": 5978 }, { "epoch": 2.789179104477612, "grad_norm": 0.8899753000706484, "learning_rate": 1.4972011786938688e-07, "loss": 0.0262, "step": 5980 }, { "epoch": 2.7901119402985075, "grad_norm": 1.0494806109561634, "learning_rate": 1.4840467154880412e-07, "loss": 0.0304, "step": 5982 }, { "epoch": 2.791044776119403, "grad_norm": 0.9028726174284042, "learning_rate": 1.4709494247868384e-07, "loss": 0.0274, "step": 5984 }, { "epoch": 2.7919776119402986, "grad_norm": 0.9484390079992623, "learning_rate": 1.4579093220244755e-07, "loss": 0.0275, "step": 5986 }, { "epoch": 2.792910447761194, "grad_norm": 0.9266575933813405, "learning_rate": 1.4449264225677607e-07, "loss": 0.0258, "step": 5988 }, { "epoch": 2.7938432835820897, "grad_norm": 0.820131848565527, "learning_rate": 1.432000741716083e-07, "loss": 0.022, "step": 5990 }, { "epoch": 2.794776119402985, "grad_norm": 0.7670878758198434, "learning_rate": 1.4191322947014198e-07, "loss": 0.0224, "step": 5992 }, { "epoch": 2.7957089552238807, "grad_norm": 0.8111190526491343, "learning_rate": 1.40632109668829e-07, "loss": 0.0267, "step": 5994 }, { "epoch": 2.796641791044776, "grad_norm": 1.054620489439934, "learning_rate": 1.3935671627737568e-07, "loss": 0.031, "step": 5996 }, { "epoch": 2.7975746268656714, "grad_norm": 0.9122734685692475, "learning_rate": 1.3808705079873974e-07, "loss": 0.0286, "step": 5998 }, { "epoch": 2.798507462686567, "grad_norm": 1.011995085628338, "learning_rate": 1.368231147291299e-07, "loss": 0.0253, "step": 6000 }, { "epoch": 2.798507462686567, "eval_loss": 0.18329007923603058, "eval_runtime": 323.0954, "eval_samples_per_second": 47.187, "eval_steps_per_second": 5.899, "step": 6000 }, { "epoch": 2.799440298507463, "grad_norm": 0.9221866409015929, "learning_rate": 1.3556490955800084e-07, "loss": 0.0239, "step": 6002 }, { "epoch": 2.800373134328358, "grad_norm": 0.8265850904760484, "learning_rate": 1.3431243676805706e-07, "loss": 0.0235, "step": 6004 }, { "epoch": 2.8013059701492535, "grad_norm": 0.916524786966227, "learning_rate": 1.3306569783524515e-07, "loss": 0.0278, "step": 6006 }, { "epoch": 2.8022388059701493, "grad_norm": 1.0627597059350238, "learning_rate": 1.31824694228756e-07, "loss": 0.0268, "step": 6008 }, { "epoch": 2.8031716417910446, "grad_norm": 0.9854456242392609, "learning_rate": 1.3058942741102255e-07, "loss": 0.0296, "step": 6010 }, { "epoch": 2.8041044776119404, "grad_norm": 0.8733962291870478, "learning_rate": 1.293598988377154e-07, "loss": 0.0271, "step": 6012 }, { "epoch": 2.8050373134328357, "grad_norm": 0.9020372572074464, "learning_rate": 1.2813610995774383e-07, "loss": 0.0267, "step": 6014 }, { "epoch": 2.8059701492537314, "grad_norm": 1.0309149190481628, "learning_rate": 1.2691806221325488e-07, "loss": 0.0271, "step": 6016 }, { "epoch": 2.8069029850746268, "grad_norm": 0.7397732761136856, "learning_rate": 1.257057570396275e-07, "loss": 0.0249, "step": 6018 }, { "epoch": 2.8078358208955225, "grad_norm": 0.9746213437526945, "learning_rate": 1.244991958654751e-07, "loss": 0.0255, "step": 6020 }, { "epoch": 2.808768656716418, "grad_norm": 0.9092447986844799, "learning_rate": 1.2329838011264305e-07, "loss": 0.0297, "step": 6022 }, { "epoch": 2.8097014925373136, "grad_norm": 0.8983926055526076, "learning_rate": 1.2210331119620333e-07, "loss": 0.0252, "step": 6024 }, { "epoch": 2.810634328358209, "grad_norm": 0.9677539682770142, "learning_rate": 1.2091399052445774e-07, "loss": 0.027, "step": 6026 }, { "epoch": 2.8115671641791042, "grad_norm": 1.0472270528820966, "learning_rate": 1.197304194989335e-07, "loss": 0.0279, "step": 6028 }, { "epoch": 2.8125, "grad_norm": 0.891285413190194, "learning_rate": 1.185525995143838e-07, "loss": 0.0245, "step": 6030 }, { "epoch": 2.8134328358208958, "grad_norm": 1.0446696188843714, "learning_rate": 1.1738053195878174e-07, "loss": 0.0285, "step": 6032 }, { "epoch": 2.814365671641791, "grad_norm": 0.8281663266840147, "learning_rate": 1.1621421821332469e-07, "loss": 0.0237, "step": 6034 }, { "epoch": 2.8152985074626864, "grad_norm": 0.985949627033281, "learning_rate": 1.150536596524271e-07, "loss": 0.0281, "step": 6036 }, { "epoch": 2.816231343283582, "grad_norm": 0.9968398571881046, "learning_rate": 1.1389885764372221e-07, "loss": 0.0278, "step": 6038 }, { "epoch": 2.8171641791044775, "grad_norm": 0.9660425784764326, "learning_rate": 1.1274981354806147e-07, "loss": 0.0328, "step": 6040 }, { "epoch": 2.8180970149253732, "grad_norm": 0.9324463921158841, "learning_rate": 1.1160652871950839e-07, "loss": 0.0269, "step": 6042 }, { "epoch": 2.8190298507462686, "grad_norm": 0.9029724221313022, "learning_rate": 1.1046900450533971e-07, "loss": 0.021, "step": 6044 }, { "epoch": 2.8199626865671643, "grad_norm": 1.0833243574375553, "learning_rate": 1.0933724224604536e-07, "loss": 0.0308, "step": 6046 }, { "epoch": 2.8208955223880596, "grad_norm": 1.1845420995114608, "learning_rate": 1.0821124327532462e-07, "loss": 0.0319, "step": 6048 }, { "epoch": 2.8218283582089554, "grad_norm": 1.060331874970539, "learning_rate": 1.070910089200844e-07, "loss": 0.0265, "step": 6050 }, { "epoch": 2.8227611940298507, "grad_norm": 1.0277983078485242, "learning_rate": 1.0597654050043982e-07, "loss": 0.0306, "step": 6052 }, { "epoch": 2.8236940298507465, "grad_norm": 0.7963096751323487, "learning_rate": 1.0486783932970924e-07, "loss": 0.0237, "step": 6054 }, { "epoch": 2.824626865671642, "grad_norm": 0.9308811945112528, "learning_rate": 1.0376490671441752e-07, "loss": 0.0286, "step": 6056 }, { "epoch": 2.825559701492537, "grad_norm": 1.0045664589682328, "learning_rate": 1.0266774395428947e-07, "loss": 0.0244, "step": 6058 }, { "epoch": 2.826492537313433, "grad_norm": 0.9398241600917389, "learning_rate": 1.0157635234224971e-07, "loss": 0.0282, "step": 6060 }, { "epoch": 2.8274253731343286, "grad_norm": 0.9638086300700636, "learning_rate": 1.0049073316442559e-07, "loss": 0.0265, "step": 6062 }, { "epoch": 2.828358208955224, "grad_norm": 1.06633468461235, "learning_rate": 9.941088770013929e-08, "loss": 0.0297, "step": 6064 }, { "epoch": 2.8292910447761193, "grad_norm": 0.9251580272353037, "learning_rate": 9.833681722190901e-08, "loss": 0.0285, "step": 6066 }, { "epoch": 2.830223880597015, "grad_norm": 0.9621616842150614, "learning_rate": 9.726852299544953e-08, "loss": 0.0264, "step": 6068 }, { "epoch": 2.8311567164179103, "grad_norm": 0.8017067867848854, "learning_rate": 9.620600627966659e-08, "loss": 0.0235, "step": 6070 }, { "epoch": 2.832089552238806, "grad_norm": 2.2184701837589484, "learning_rate": 9.514926832665861e-08, "loss": 0.0285, "step": 6072 }, { "epoch": 2.8330223880597014, "grad_norm": 0.7971259494348162, "learning_rate": 9.409831038171501e-08, "loss": 0.0264, "step": 6074 }, { "epoch": 2.833955223880597, "grad_norm": 1.082444967355677, "learning_rate": 9.305313368331126e-08, "loss": 0.0302, "step": 6076 }, { "epoch": 2.8348880597014925, "grad_norm": 0.9740324947287216, "learning_rate": 9.201373946311266e-08, "loss": 0.0277, "step": 6078 }, { "epoch": 2.835820895522388, "grad_norm": 0.9693233271849888, "learning_rate": 9.098012894596886e-08, "loss": 0.0284, "step": 6080 }, { "epoch": 2.8367537313432836, "grad_norm": 0.8613165274822283, "learning_rate": 8.995230334991556e-08, "loss": 0.0279, "step": 6082 }, { "epoch": 2.8376865671641793, "grad_norm": 0.9483829784452807, "learning_rate": 8.893026388616832e-08, "loss": 0.0261, "step": 6084 }, { "epoch": 2.8386194029850746, "grad_norm": 1.0098060586216793, "learning_rate": 8.791401175912706e-08, "loss": 0.0303, "step": 6086 }, { "epoch": 2.83955223880597, "grad_norm": 1.1054269837399973, "learning_rate": 8.690354816637048e-08, "loss": 0.0308, "step": 6088 }, { "epoch": 2.8404850746268657, "grad_norm": 1.068783945533865, "learning_rate": 8.58988742986555e-08, "loss": 0.0261, "step": 6090 }, { "epoch": 2.841417910447761, "grad_norm": 0.8568263460183595, "learning_rate": 8.489999133991789e-08, "loss": 0.0255, "step": 6092 }, { "epoch": 2.842350746268657, "grad_norm": 1.0088164392874945, "learning_rate": 8.390690046726768e-08, "loss": 0.0279, "step": 6094 }, { "epoch": 2.843283582089552, "grad_norm": 0.8665430326713278, "learning_rate": 8.291960285098877e-08, "loss": 0.0244, "step": 6096 }, { "epoch": 2.844216417910448, "grad_norm": 0.9040495958153504, "learning_rate": 8.193809965454102e-08, "loss": 0.027, "step": 6098 }, { "epoch": 2.845149253731343, "grad_norm": 0.8918358245233772, "learning_rate": 8.096239203455313e-08, "loss": 0.0247, "step": 6100 }, { "epoch": 2.846082089552239, "grad_norm": 1.0048720812225236, "learning_rate": 7.999248114082536e-08, "loss": 0.0248, "step": 6102 }, { "epoch": 2.8470149253731343, "grad_norm": 0.8524361002017556, "learning_rate": 7.902836811632786e-08, "loss": 0.0266, "step": 6104 }, { "epoch": 2.84794776119403, "grad_norm": 0.855140402063462, "learning_rate": 7.807005409719515e-08, "loss": 0.0239, "step": 6106 }, { "epoch": 2.8488805970149254, "grad_norm": 1.0686415048668705, "learning_rate": 7.711754021273276e-08, "loss": 0.0291, "step": 6108 }, { "epoch": 2.8498134328358207, "grad_norm": 0.8725245590138367, "learning_rate": 7.617082758540673e-08, "loss": 0.0227, "step": 6110 }, { "epoch": 2.8507462686567164, "grad_norm": 0.8834451617455856, "learning_rate": 7.522991733084905e-08, "loss": 0.0255, "step": 6112 }, { "epoch": 2.851679104477612, "grad_norm": 0.9748544937737061, "learning_rate": 7.429481055785503e-08, "loss": 0.0267, "step": 6114 }, { "epoch": 2.8526119402985075, "grad_norm": 0.8702482515646909, "learning_rate": 7.336550836837819e-08, "loss": 0.0275, "step": 6116 }, { "epoch": 2.853544776119403, "grad_norm": 0.7926249763968709, "learning_rate": 7.244201185753364e-08, "loss": 0.0235, "step": 6118 }, { "epoch": 2.8544776119402986, "grad_norm": 1.0912402619676589, "learning_rate": 7.152432211359472e-08, "loss": 0.027, "step": 6120 }, { "epoch": 2.855410447761194, "grad_norm": 1.0075662816982829, "learning_rate": 7.061244021799141e-08, "loss": 0.0271, "step": 6122 }, { "epoch": 2.8563432835820897, "grad_norm": 0.9768209172507724, "learning_rate": 6.970636724531021e-08, "loss": 0.0281, "step": 6124 }, { "epoch": 2.857276119402985, "grad_norm": 0.8339877883558028, "learning_rate": 6.880610426329149e-08, "loss": 0.0257, "step": 6126 }, { "epoch": 2.8582089552238807, "grad_norm": 1.224494729962025, "learning_rate": 6.791165233282992e-08, "loss": 0.033, "step": 6128 }, { "epoch": 2.859141791044776, "grad_norm": 0.8309826211366992, "learning_rate": 6.702301250797128e-08, "loss": 0.0261, "step": 6130 }, { "epoch": 2.8600746268656714, "grad_norm": 0.9156184970303739, "learning_rate": 6.614018583591287e-08, "loss": 0.0285, "step": 6132 }, { "epoch": 2.861007462686567, "grad_norm": 1.063081447354269, "learning_rate": 6.526317335700083e-08, "loss": 0.0311, "step": 6134 }, { "epoch": 2.861940298507463, "grad_norm": 0.9681128399998323, "learning_rate": 6.439197610473125e-08, "loss": 0.0295, "step": 6136 }, { "epoch": 2.862873134328358, "grad_norm": 0.8339319778717733, "learning_rate": 6.352659510574565e-08, "loss": 0.0255, "step": 6138 }, { "epoch": 2.8638059701492535, "grad_norm": 0.9510334700077668, "learning_rate": 6.266703137983221e-08, "loss": 0.0222, "step": 6140 }, { "epoch": 2.8647388059701493, "grad_norm": 1.1004684213983682, "learning_rate": 6.181328593992508e-08, "loss": 0.0278, "step": 6142 }, { "epoch": 2.8656716417910446, "grad_norm": 1.0347689464621461, "learning_rate": 6.096535979209894e-08, "loss": 0.0315, "step": 6144 }, { "epoch": 2.8666044776119404, "grad_norm": 0.9194327140627475, "learning_rate": 6.012325393557505e-08, "loss": 0.0253, "step": 6146 }, { "epoch": 2.8675373134328357, "grad_norm": 0.9195629780953789, "learning_rate": 5.928696936271128e-08, "loss": 0.0259, "step": 6148 }, { "epoch": 2.8684701492537314, "grad_norm": 0.867690935896647, "learning_rate": 5.845650705900985e-08, "loss": 0.0275, "step": 6150 }, { "epoch": 2.8694029850746268, "grad_norm": 0.9709391500841479, "learning_rate": 5.763186800310849e-08, "loss": 0.028, "step": 6152 }, { "epoch": 2.8703358208955225, "grad_norm": 0.920743512258816, "learning_rate": 5.681305316678487e-08, "loss": 0.0276, "step": 6154 }, { "epoch": 2.871268656716418, "grad_norm": 1.0514477761920524, "learning_rate": 5.600006351495213e-08, "loss": 0.0299, "step": 6156 }, { "epoch": 2.8722014925373136, "grad_norm": 0.9521379752950706, "learning_rate": 5.519290000565891e-08, "loss": 0.025, "step": 6158 }, { "epoch": 2.873134328358209, "grad_norm": 1.0000354064391062, "learning_rate": 5.4391563590089345e-08, "loss": 0.0277, "step": 6160 }, { "epoch": 2.8740671641791042, "grad_norm": 0.9998472317484027, "learning_rate": 5.359605521255862e-08, "loss": 0.0271, "step": 6162 }, { "epoch": 2.875, "grad_norm": 1.0154098658204735, "learning_rate": 5.2806375810515173e-08, "loss": 0.0288, "step": 6164 }, { "epoch": 2.8759328358208958, "grad_norm": 0.8651533114830908, "learning_rate": 5.202252631454019e-08, "loss": 0.0275, "step": 6166 }, { "epoch": 2.876865671641791, "grad_norm": 0.9536255093285815, "learning_rate": 5.1244507648341436e-08, "loss": 0.0242, "step": 6168 }, { "epoch": 2.8777985074626864, "grad_norm": 0.9571731096409607, "learning_rate": 5.0472320728757184e-08, "loss": 0.0262, "step": 6170 }, { "epoch": 2.878731343283582, "grad_norm": 0.9541813407147736, "learning_rate": 4.970596646575399e-08, "loss": 0.0319, "step": 6172 }, { "epoch": 2.8796641791044775, "grad_norm": 1.0303653115831537, "learning_rate": 4.894544576242333e-08, "loss": 0.0267, "step": 6174 }, { "epoch": 2.8805970149253732, "grad_norm": 0.864462744748314, "learning_rate": 4.8190759514983866e-08, "loss": 0.027, "step": 6176 }, { "epoch": 2.8815298507462686, "grad_norm": 1.014102706273404, "learning_rate": 4.744190861277864e-08, "loss": 0.0248, "step": 6178 }, { "epoch": 2.8824626865671643, "grad_norm": 1.2006667032261047, "learning_rate": 4.669889393827287e-08, "loss": 0.0316, "step": 6180 }, { "epoch": 2.8833955223880596, "grad_norm": 0.8940806054512117, "learning_rate": 4.5961716367055044e-08, "loss": 0.0304, "step": 6182 }, { "epoch": 2.8843283582089554, "grad_norm": 0.8938382426244617, "learning_rate": 4.523037676783581e-08, "loss": 0.0237, "step": 6184 }, { "epoch": 2.8852611940298507, "grad_norm": 0.9887231931165578, "learning_rate": 4.4504876002444683e-08, "loss": 0.0255, "step": 6186 }, { "epoch": 2.8861940298507465, "grad_norm": 0.8738112893845056, "learning_rate": 4.3785214925831655e-08, "loss": 0.0288, "step": 6188 }, { "epoch": 2.887126865671642, "grad_norm": 1.024335918443421, "learning_rate": 4.3071394386064444e-08, "loss": 0.0288, "step": 6190 }, { "epoch": 2.888059701492537, "grad_norm": 1.1255002002907022, "learning_rate": 4.2363415224329076e-08, "loss": 0.0279, "step": 6192 }, { "epoch": 2.888992537313433, "grad_norm": 0.8826371849514132, "learning_rate": 4.16612782749265e-08, "loss": 0.0241, "step": 6194 }, { "epoch": 2.8899253731343286, "grad_norm": 1.0465092663367443, "learning_rate": 4.096498436527374e-08, "loss": 0.0316, "step": 6196 }, { "epoch": 2.890858208955224, "grad_norm": 1.0830772730773754, "learning_rate": 4.027453431590278e-08, "loss": 0.0275, "step": 6198 }, { "epoch": 2.8917910447761193, "grad_norm": 0.9642422179559949, "learning_rate": 3.9589928940457766e-08, "loss": 0.0303, "step": 6200 }, { "epoch": 2.892723880597015, "grad_norm": 0.9520275482238393, "learning_rate": 3.891116904569725e-08, "loss": 0.0278, "step": 6202 }, { "epoch": 2.8936567164179103, "grad_norm": 1.0370943740202896, "learning_rate": 3.823825543148918e-08, "loss": 0.0259, "step": 6204 }, { "epoch": 2.894589552238806, "grad_norm": 1.1011707446846262, "learning_rate": 3.7571188890813685e-08, "loss": 0.03, "step": 6206 }, { "epoch": 2.8955223880597014, "grad_norm": 1.0173726680913833, "learning_rate": 3.690997020975973e-08, "loss": 0.03, "step": 6208 }, { "epoch": 2.896455223880597, "grad_norm": 1.058358228923588, "learning_rate": 3.6254600167524576e-08, "loss": 0.0273, "step": 6210 }, { "epoch": 2.8973880597014925, "grad_norm": 0.9030915878853645, "learning_rate": 3.56050795364149e-08, "loss": 0.0283, "step": 6212 }, { "epoch": 2.898320895522388, "grad_norm": 0.9657589253456974, "learning_rate": 3.496140908184287e-08, "loss": 0.0248, "step": 6214 }, { "epoch": 2.8992537313432836, "grad_norm": 0.8917914008530972, "learning_rate": 3.432358956232673e-08, "loss": 0.0289, "step": 6216 }, { "epoch": 2.9001865671641793, "grad_norm": 1.0958380997281072, "learning_rate": 3.3691621729490254e-08, "loss": 0.0289, "step": 6218 }, { "epoch": 2.9011194029850746, "grad_norm": 0.9187748425779532, "learning_rate": 3.3065506328062155e-08, "loss": 0.0283, "step": 6220 }, { "epoch": 2.90205223880597, "grad_norm": 0.9134599893174407, "learning_rate": 3.2445244095872796e-08, "loss": 0.0255, "step": 6222 }, { "epoch": 2.9029850746268657, "grad_norm": 1.0351918552003565, "learning_rate": 3.183083576385637e-08, "loss": 0.0279, "step": 6224 }, { "epoch": 2.903917910447761, "grad_norm": 0.9153608814644609, "learning_rate": 3.1222282056047605e-08, "loss": 0.0287, "step": 6226 }, { "epoch": 2.904850746268657, "grad_norm": 0.8807198315508802, "learning_rate": 3.0619583689582845e-08, "loss": 0.0251, "step": 6228 }, { "epoch": 2.905783582089552, "grad_norm": 1.0102393493914026, "learning_rate": 3.002274137469841e-08, "loss": 0.0282, "step": 6230 }, { "epoch": 2.906716417910448, "grad_norm": 1.079497308767095, "learning_rate": 2.9431755814729456e-08, "loss": 0.0318, "step": 6232 }, { "epoch": 2.907649253731343, "grad_norm": 0.8345846685268756, "learning_rate": 2.8846627706108354e-08, "loss": 0.0238, "step": 6234 }, { "epoch": 2.908582089552239, "grad_norm": 0.970867167127175, "learning_rate": 2.826735773836631e-08, "loss": 0.0235, "step": 6236 }, { "epoch": 2.9095149253731343, "grad_norm": 0.9857067058881129, "learning_rate": 2.7693946594130604e-08, "loss": 0.0287, "step": 6238 }, { "epoch": 2.91044776119403, "grad_norm": 0.9140368482341609, "learning_rate": 2.712639494912461e-08, "loss": 0.0283, "step": 6240 }, { "epoch": 2.9113805970149254, "grad_norm": 1.025972374574341, "learning_rate": 2.6564703472166663e-08, "loss": 0.0271, "step": 6242 }, { "epoch": 2.9123134328358207, "grad_norm": 0.7858423153629281, "learning_rate": 2.6008872825168397e-08, "loss": 0.0258, "step": 6244 }, { "epoch": 2.9132462686567164, "grad_norm": 0.8850157140085816, "learning_rate": 2.5458903663135304e-08, "loss": 0.0268, "step": 6246 }, { "epoch": 2.914179104477612, "grad_norm": 0.9543044924610737, "learning_rate": 2.4914796634166738e-08, "loss": 0.0293, "step": 6248 }, { "epoch": 2.9151119402985075, "grad_norm": 0.924077198749757, "learning_rate": 2.4376552379453135e-08, "loss": 0.026, "step": 6250 }, { "epoch": 2.916044776119403, "grad_norm": 0.83228980205813, "learning_rate": 2.384417153327545e-08, "loss": 0.0234, "step": 6252 }, { "epoch": 2.9169776119402986, "grad_norm": 0.8620306027234695, "learning_rate": 2.331765472300629e-08, "loss": 0.0262, "step": 6254 }, { "epoch": 2.917910447761194, "grad_norm": 0.9031245188087478, "learning_rate": 2.2797002569105998e-08, "loss": 0.0267, "step": 6256 }, { "epoch": 2.9188432835820897, "grad_norm": 0.8827627269674895, "learning_rate": 2.2282215685126007e-08, "loss": 0.0258, "step": 6258 }, { "epoch": 2.919776119402985, "grad_norm": 0.952802571636724, "learning_rate": 2.1773294677704947e-08, "loss": 0.0282, "step": 6260 }, { "epoch": 2.9207089552238807, "grad_norm": 1.0057131059426605, "learning_rate": 2.1270240146568644e-08, "loss": 0.0282, "step": 6262 }, { "epoch": 2.921641791044776, "grad_norm": 0.9906956285539033, "learning_rate": 2.077305268453067e-08, "loss": 0.0272, "step": 6264 }, { "epoch": 2.9225746268656714, "grad_norm": 0.9684122942210909, "learning_rate": 2.028173287748958e-08, "loss": 0.0323, "step": 6266 }, { "epoch": 2.923507462686567, "grad_norm": 0.847667725451742, "learning_rate": 1.9796281304430564e-08, "loss": 0.0299, "step": 6268 }, { "epoch": 2.924440298507463, "grad_norm": 0.7925191142475678, "learning_rate": 1.9316698537421573e-08, "loss": 0.0245, "step": 6270 }, { "epoch": 2.925373134328358, "grad_norm": 1.1385562354438568, "learning_rate": 1.88429851416172e-08, "loss": 0.0275, "step": 6272 }, { "epoch": 2.9263059701492535, "grad_norm": 0.8590721493865495, "learning_rate": 1.8375141675253116e-08, "loss": 0.027, "step": 6274 }, { "epoch": 2.9272388059701493, "grad_norm": 0.8298284936217379, "learning_rate": 1.7913168689648876e-08, "loss": 0.0278, "step": 6276 }, { "epoch": 2.9281716417910446, "grad_norm": 0.8712924189202683, "learning_rate": 1.7457066729206773e-08, "loss": 0.0263, "step": 6278 }, { "epoch": 2.9291044776119404, "grad_norm": 0.8347084759818374, "learning_rate": 1.7006836331407982e-08, "loss": 0.0237, "step": 6280 }, { "epoch": 2.9300373134328357, "grad_norm": 1.0056064708676893, "learning_rate": 1.6562478026816987e-08, "loss": 0.0255, "step": 6282 }, { "epoch": 2.9309701492537314, "grad_norm": 1.0253670648975486, "learning_rate": 1.6123992339077688e-08, "loss": 0.0265, "step": 6284 }, { "epoch": 2.9319029850746268, "grad_norm": 0.9581708770481409, "learning_rate": 1.569137978491342e-08, "loss": 0.0295, "step": 6286 }, { "epoch": 2.9328358208955225, "grad_norm": 0.9625395716354909, "learning_rate": 1.526464087412638e-08, "loss": 0.0279, "step": 6288 }, { "epoch": 2.933768656716418, "grad_norm": 0.9632669348201957, "learning_rate": 1.4843776109597085e-08, "loss": 0.0303, "step": 6290 }, { "epoch": 2.9347014925373136, "grad_norm": 1.036470359074804, "learning_rate": 1.4428785987283811e-08, "loss": 0.0291, "step": 6292 }, { "epoch": 2.935634328358209, "grad_norm": 0.9357231035398019, "learning_rate": 1.4019670996222035e-08, "loss": 0.0291, "step": 6294 }, { "epoch": 2.9365671641791042, "grad_norm": 0.9355580162680617, "learning_rate": 1.361643161852444e-08, "loss": 0.034, "step": 6296 }, { "epoch": 2.9375, "grad_norm": 0.9047004989253603, "learning_rate": 1.3219068329378692e-08, "loss": 0.0269, "step": 6298 }, { "epoch": 2.9384328358208958, "grad_norm": 1.023104774114, "learning_rate": 1.2827581597048555e-08, "loss": 0.0298, "step": 6300 }, { "epoch": 2.939365671641791, "grad_norm": 1.0934483715480892, "learning_rate": 1.2441971882871661e-08, "loss": 0.0254, "step": 6302 }, { "epoch": 2.9402985074626864, "grad_norm": 1.0238497968664415, "learning_rate": 1.2062239641262296e-08, "loss": 0.027, "step": 6304 }, { "epoch": 2.941231343283582, "grad_norm": 0.8175109564063253, "learning_rate": 1.1688385319706397e-08, "loss": 0.0279, "step": 6306 }, { "epoch": 2.9421641791044775, "grad_norm": 0.983306906139509, "learning_rate": 1.1320409358763774e-08, "loss": 0.0266, "step": 6308 }, { "epoch": 2.9430970149253732, "grad_norm": 0.8155616372891258, "learning_rate": 1.095831219206811e-08, "loss": 0.0264, "step": 6310 }, { "epoch": 2.9440298507462686, "grad_norm": 1.05483652059521, "learning_rate": 1.060209424632308e-08, "loss": 0.0286, "step": 6312 }, { "epoch": 2.9449626865671643, "grad_norm": 0.9180941499613573, "learning_rate": 1.025175594130623e-08, "loss": 0.0308, "step": 6314 }, { "epoch": 2.9458955223880596, "grad_norm": 1.057002712330282, "learning_rate": 9.907297689866202e-09, "loss": 0.0283, "step": 6316 }, { "epoch": 2.9468283582089554, "grad_norm": 1.1253102564493433, "learning_rate": 9.568719897921075e-09, "loss": 0.0303, "step": 6318 }, { "epoch": 2.9477611940298507, "grad_norm": 0.9288836836839905, "learning_rate": 9.236022964460023e-09, "loss": 0.0242, "step": 6320 }, { "epoch": 2.9486940298507465, "grad_norm": 0.9637668923295082, "learning_rate": 8.90920728154221e-09, "loss": 0.028, "step": 6322 }, { "epoch": 2.949626865671642, "grad_norm": 0.9433711127474065, "learning_rate": 8.58827323429623e-09, "loss": 0.027, "step": 6324 }, { "epoch": 2.950559701492537, "grad_norm": 0.8158995413991843, "learning_rate": 8.273221200919557e-09, "loss": 0.0247, "step": 6326 }, { "epoch": 2.951492537313433, "grad_norm": 0.9681643099605649, "learning_rate": 7.964051552677432e-09, "loss": 0.0273, "step": 6328 }, { "epoch": 2.9524253731343286, "grad_norm": 1.2526866086239252, "learning_rate": 7.660764653903973e-09, "loss": 0.0281, "step": 6330 }, { "epoch": 2.953358208955224, "grad_norm": 1.094048312685975, "learning_rate": 7.3633608620005125e-09, "loss": 0.029, "step": 6332 }, { "epoch": 2.9542910447761193, "grad_norm": 1.083859619444637, "learning_rate": 7.071840527436147e-09, "loss": 0.0257, "step": 6334 }, { "epoch": 2.955223880597015, "grad_norm": 1.0005781231009265, "learning_rate": 6.786203993745521e-09, "loss": 0.0273, "step": 6336 }, { "epoch": 2.9561567164179103, "grad_norm": 0.9873159201675941, "learning_rate": 6.506451597531049e-09, "loss": 0.0285, "step": 6338 }, { "epoch": 2.957089552238806, "grad_norm": 0.8175862094658, "learning_rate": 6.232583668460135e-09, "loss": 0.0266, "step": 6340 }, { "epoch": 2.9580223880597014, "grad_norm": 0.9556419677870821, "learning_rate": 5.9646005292662845e-09, "loss": 0.0316, "step": 6342 }, { "epoch": 2.958955223880597, "grad_norm": 0.9863955999860845, "learning_rate": 5.702502495747997e-09, "loss": 0.028, "step": 6344 }, { "epoch": 2.9598880597014925, "grad_norm": 0.932823706531385, "learning_rate": 5.446289876768207e-09, "loss": 0.0294, "step": 6346 }, { "epoch": 2.960820895522388, "grad_norm": 0.8680199001481473, "learning_rate": 5.195962974255953e-09, "loss": 0.0256, "step": 6348 }, { "epoch": 2.9617537313432836, "grad_norm": 0.8537228433311188, "learning_rate": 4.951522083201376e-09, "loss": 0.0252, "step": 6350 }, { "epoch": 2.9626865671641793, "grad_norm": 0.9302734526930525, "learning_rate": 4.712967491661835e-09, "loss": 0.0328, "step": 6352 }, { "epoch": 2.9636194029850746, "grad_norm": 0.9259714911734612, "learning_rate": 4.48029948075579e-09, "loss": 0.0229, "step": 6354 }, { "epoch": 2.96455223880597, "grad_norm": 0.9973336943890746, "learning_rate": 4.2535183246655844e-09, "loss": 0.0281, "step": 6356 }, { "epoch": 2.9654850746268657, "grad_norm": 1.0207690677170809, "learning_rate": 4.0326242906363335e-09, "loss": 0.0266, "step": 6358 }, { "epoch": 2.966417910447761, "grad_norm": 0.9171689885593927, "learning_rate": 3.817617638975369e-09, "loss": 0.0277, "step": 6360 }, { "epoch": 2.967350746268657, "grad_norm": 0.8686180948538196, "learning_rate": 3.6084986230522366e-09, "loss": 0.0271, "step": 6362 }, { "epoch": 2.968283582089552, "grad_norm": 0.7886370663731553, "learning_rate": 3.4052674892987026e-09, "loss": 0.0238, "step": 6364 }, { "epoch": 2.969216417910448, "grad_norm": 1.0177966708767086, "learning_rate": 3.2079244772070804e-09, "loss": 0.0275, "step": 6366 }, { "epoch": 2.970149253731343, "grad_norm": 0.9196618009315258, "learning_rate": 3.016469819332457e-09, "loss": 0.0266, "step": 6368 }, { "epoch": 2.971082089552239, "grad_norm": 0.7971439936377686, "learning_rate": 2.8309037412904695e-09, "loss": 0.0239, "step": 6370 }, { "epoch": 2.9720149253731343, "grad_norm": 0.8620747348937791, "learning_rate": 2.6512264617556405e-09, "loss": 0.0253, "step": 6372 }, { "epoch": 2.97294776119403, "grad_norm": 1.1855536878453208, "learning_rate": 2.4774381924663747e-09, "loss": 0.0279, "step": 6374 }, { "epoch": 2.9738805970149254, "grad_norm": 0.9928672856584764, "learning_rate": 2.3095391382182974e-09, "loss": 0.0296, "step": 6376 }, { "epoch": 2.9748134328358207, "grad_norm": 0.9231333294247348, "learning_rate": 2.1475294968681393e-09, "loss": 0.0253, "step": 6378 }, { "epoch": 2.9757462686567164, "grad_norm": 0.9637582200766458, "learning_rate": 1.9914094593326273e-09, "loss": 0.0263, "step": 6380 }, { "epoch": 2.976679104477612, "grad_norm": 1.118299323214901, "learning_rate": 1.8411792095884839e-09, "loss": 0.0259, "step": 6382 }, { "epoch": 2.9776119402985075, "grad_norm": 0.8457213495206082, "learning_rate": 1.6968389246702078e-09, "loss": 0.0287, "step": 6384 }, { "epoch": 2.978544776119403, "grad_norm": 0.8760950110479465, "learning_rate": 1.5583887746722926e-09, "loss": 0.0259, "step": 6386 }, { "epoch": 2.9794776119402986, "grad_norm": 0.9701160754069891, "learning_rate": 1.4258289227486732e-09, "loss": 0.0277, "step": 6388 }, { "epoch": 2.980410447761194, "grad_norm": 0.9854784089831389, "learning_rate": 1.2991595251110601e-09, "loss": 0.0314, "step": 6390 }, { "epoch": 2.9813432835820897, "grad_norm": 0.9870630151905555, "learning_rate": 1.1783807310300489e-09, "loss": 0.0269, "step": 6392 }, { "epoch": 2.982276119402985, "grad_norm": 0.8952968586690281, "learning_rate": 1.0634926828351212e-09, "loss": 0.0284, "step": 6394 }, { "epoch": 2.9832089552238807, "grad_norm": 0.7978447765777141, "learning_rate": 9.544955159129788e-10, "loss": 0.0252, "step": 6396 }, { "epoch": 2.984141791044776, "grad_norm": 0.9911363159165204, "learning_rate": 8.513893587086542e-10, "loss": 0.0259, "step": 6398 }, { "epoch": 2.9850746268656714, "grad_norm": 0.8169854362980064, "learning_rate": 7.541743327255102e-10, "loss": 0.0227, "step": 6400 }, { "epoch": 2.986007462686567, "grad_norm": 0.7691147539159162, "learning_rate": 6.628505525246853e-10, "loss": 0.0237, "step": 6402 }, { "epoch": 2.986940298507463, "grad_norm": 1.1580080517708256, "learning_rate": 5.77418125723983e-10, "loss": 0.0342, "step": 6404 }, { "epoch": 2.987873134328358, "grad_norm": 0.8169325748785082, "learning_rate": 4.978771529989824e-10, "loss": 0.0264, "step": 6406 }, { "epoch": 2.9888059701492535, "grad_norm": 0.9819887823547255, "learning_rate": 4.242277280841478e-10, "loss": 0.0286, "step": 6408 }, { "epoch": 2.9897388059701493, "grad_norm": 1.0286085389310848, "learning_rate": 3.56469937768944e-10, "loss": 0.0287, "step": 6410 }, { "epoch": 2.9906716417910446, "grad_norm": 0.8886770760953744, "learning_rate": 2.9460386190116594e-10, "loss": 0.03, "step": 6412 }, { "epoch": 2.9916044776119404, "grad_norm": 0.8067009248418416, "learning_rate": 2.386295733852739e-10, "loss": 0.0273, "step": 6414 }, { "epoch": 2.9925373134328357, "grad_norm": 0.9329454148658527, "learning_rate": 1.8854713818350356e-10, "loss": 0.0263, "step": 6416 }, { "epoch": 2.9934701492537314, "grad_norm": 0.9177801864653198, "learning_rate": 1.4435661531420065e-10, "loss": 0.0282, "step": 6418 }, { "epoch": 2.9944029850746268, "grad_norm": 0.831949946740059, "learning_rate": 1.0605805685237613e-10, "loss": 0.0244, "step": 6420 }, { "epoch": 2.9953358208955225, "grad_norm": 0.9174951548532697, "learning_rate": 7.365150792970621e-11, "loss": 0.0258, "step": 6422 }, { "epoch": 2.996268656716418, "grad_norm": 1.0174082789220957, "learning_rate": 4.7137006735642475e-11, "loss": 0.0254, "step": 6424 }, { "epoch": 2.9972014925373136, "grad_norm": 0.8765238929135598, "learning_rate": 2.651458451519151e-11, "loss": 0.0243, "step": 6426 }, { "epoch": 2.998134328358209, "grad_norm": 1.0169919835877, "learning_rate": 1.178426557058021e-11, "loss": 0.0315, "step": 6428 }, { "epoch": 2.9990671641791042, "grad_norm": 0.882103163342718, "learning_rate": 2.94606726070068e-12, "loss": 0.028, "step": 6430 }, { "epoch": 3.0, "grad_norm": 1.3865468012985325, "learning_rate": 0.0, "loss": 0.026, "step": 6432 }, { "epoch": 3.0, "step": 6432, "total_flos": 601546217226240.0, "train_loss": 0.11825174548714167, "train_runtime": 22853.2367, "train_samples_per_second": 18.011, "train_steps_per_second": 0.281 } ], "logging_steps": 2, "max_steps": 6432, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 601546217226240.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }