diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22650 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 6432, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009328358208955224, + "grad_norm": 141.84653593783037, + "learning_rate": 3.1055900621118015e-08, + "loss": 1.2317, + "step": 2 + }, + { + "epoch": 0.0018656716417910447, + "grad_norm": 73.92673328115033, + "learning_rate": 6.211180124223603e-08, + "loss": 1.1508, + "step": 4 + }, + { + "epoch": 0.002798507462686567, + "grad_norm": 25.161715198399044, + "learning_rate": 9.316770186335405e-08, + "loss": 1.2665, + "step": 6 + }, + { + "epoch": 0.0037313432835820895, + "grad_norm": 125.15554556515958, + "learning_rate": 1.2422360248447206e-07, + "loss": 1.2684, + "step": 8 + }, + { + "epoch": 0.0046641791044776115, + "grad_norm": 34.09560313170012, + "learning_rate": 1.5527950310559006e-07, + "loss": 1.1518, + "step": 10 + }, + { + "epoch": 0.005597014925373134, + "grad_norm": 70.10261737049221, + "learning_rate": 1.863354037267081e-07, + "loss": 1.1829, + "step": 12 + }, + { + "epoch": 0.0065298507462686565, + "grad_norm": 85.58058584581248, + "learning_rate": 2.173913043478261e-07, + "loss": 1.2359, + "step": 14 + }, + { + "epoch": 0.007462686567164179, + "grad_norm": 91.20705086258509, + "learning_rate": 2.484472049689441e-07, + "loss": 1.1911, + "step": 16 + }, + { + "epoch": 0.008395522388059701, + "grad_norm": 80.47290386920726, + "learning_rate": 2.795031055900621e-07, + "loss": 1.1806, + "step": 18 + }, + { + "epoch": 0.009328358208955223, + "grad_norm": 42.91688398439449, + "learning_rate": 3.1055900621118013e-07, + "loss": 1.1706, + "step": 20 + }, + { + "epoch": 0.010261194029850746, + "grad_norm": 17.19066282148705, + "learning_rate": 3.416149068322982e-07, + "loss": 1.1021, + "step": 22 + }, + { + "epoch": 0.011194029850746268, + "grad_norm": 37.72696934607804, + "learning_rate": 3.726708074534162e-07, + "loss": 1.1804, + "step": 24 + }, + { + "epoch": 0.012126865671641791, + "grad_norm": 43.18771357786159, + "learning_rate": 4.037267080745342e-07, + "loss": 1.0774, + "step": 26 + }, + { + "epoch": 0.013059701492537313, + "grad_norm": 22.079075604777447, + "learning_rate": 4.347826086956522e-07, + "loss": 0.9575, + "step": 28 + }, + { + "epoch": 0.013992537313432836, + "grad_norm": 103.94449230610218, + "learning_rate": 4.658385093167702e-07, + "loss": 1.0526, + "step": 30 + }, + { + "epoch": 0.014925373134328358, + "grad_norm": 20.482563257817198, + "learning_rate": 4.968944099378882e-07, + "loss": 0.9768, + "step": 32 + }, + { + "epoch": 0.01585820895522388, + "grad_norm": 92.60009615720648, + "learning_rate": 5.279503105590063e-07, + "loss": 0.9088, + "step": 34 + }, + { + "epoch": 0.016791044776119403, + "grad_norm": 72.58664016267214, + "learning_rate": 5.590062111801243e-07, + "loss": 0.7171, + "step": 36 + }, + { + "epoch": 0.017723880597014924, + "grad_norm": 89.42561621187095, + "learning_rate": 5.900621118012423e-07, + "loss": 0.6902, + "step": 38 + }, + { + "epoch": 0.018656716417910446, + "grad_norm": 17.953741530444063, + "learning_rate": 6.211180124223603e-07, + "loss": 0.6602, + "step": 40 + }, + { + "epoch": 0.01958955223880597, + "grad_norm": 192.7702382369095, + "learning_rate": 6.521739130434783e-07, + "loss": 0.6708, + "step": 42 + }, + { + "epoch": 0.020522388059701493, + "grad_norm": 271.8037595834288, + "learning_rate": 6.832298136645964e-07, + "loss": 0.6101, + "step": 44 + }, + { + "epoch": 0.021455223880597014, + "grad_norm": 245.143722738473, + "learning_rate": 7.142857142857143e-07, + "loss": 0.578, + "step": 46 + }, + { + "epoch": 0.022388059701492536, + "grad_norm": 82.24029703991424, + "learning_rate": 7.453416149068324e-07, + "loss": 0.5357, + "step": 48 + }, + { + "epoch": 0.02332089552238806, + "grad_norm": 106.27466386311123, + "learning_rate": 7.763975155279503e-07, + "loss": 0.5292, + "step": 50 + }, + { + "epoch": 0.024253731343283583, + "grad_norm": 28.097169922015855, + "learning_rate": 8.074534161490684e-07, + "loss": 0.4957, + "step": 52 + }, + { + "epoch": 0.025186567164179104, + "grad_norm": 32.00034184187322, + "learning_rate": 8.385093167701864e-07, + "loss": 0.4817, + "step": 54 + }, + { + "epoch": 0.026119402985074626, + "grad_norm": 28.389364549998767, + "learning_rate": 8.695652173913044e-07, + "loss": 0.4472, + "step": 56 + }, + { + "epoch": 0.027052238805970148, + "grad_norm": 5.448904726347897, + "learning_rate": 9.006211180124224e-07, + "loss": 0.4102, + "step": 58 + }, + { + "epoch": 0.027985074626865673, + "grad_norm": 47.33775950791769, + "learning_rate": 9.316770186335404e-07, + "loss": 0.3834, + "step": 60 + }, + { + "epoch": 0.028917910447761194, + "grad_norm": 35.171578880874314, + "learning_rate": 9.627329192546585e-07, + "loss": 0.3838, + "step": 62 + }, + { + "epoch": 0.029850746268656716, + "grad_norm": 22.602001415969152, + "learning_rate": 9.937888198757765e-07, + "loss": 0.395, + "step": 64 + }, + { + "epoch": 0.030783582089552237, + "grad_norm": 43.14849541602834, + "learning_rate": 1.0248447204968944e-06, + "loss": 0.373, + "step": 66 + }, + { + "epoch": 0.03171641791044776, + "grad_norm": 33.28823729652902, + "learning_rate": 1.0559006211180126e-06, + "loss": 0.3767, + "step": 68 + }, + { + "epoch": 0.03264925373134328, + "grad_norm": 35.60279059054033, + "learning_rate": 1.0869565217391306e-06, + "loss": 0.3665, + "step": 70 + }, + { + "epoch": 0.033582089552238806, + "grad_norm": 20.477590777483467, + "learning_rate": 1.1180124223602485e-06, + "loss": 0.3856, + "step": 72 + }, + { + "epoch": 0.03451492537313433, + "grad_norm": 4.733191255137944, + "learning_rate": 1.1490683229813664e-06, + "loss": 0.3376, + "step": 74 + }, + { + "epoch": 0.03544776119402985, + "grad_norm": 21.095445622645272, + "learning_rate": 1.1801242236024846e-06, + "loss": 0.3246, + "step": 76 + }, + { + "epoch": 0.036380597014925374, + "grad_norm": 33.60900782671738, + "learning_rate": 1.2111801242236026e-06, + "loss": 0.316, + "step": 78 + }, + { + "epoch": 0.03731343283582089, + "grad_norm": 37.44174652819769, + "learning_rate": 1.2422360248447205e-06, + "loss": 0.3482, + "step": 80 + }, + { + "epoch": 0.03824626865671642, + "grad_norm": 27.856083578813237, + "learning_rate": 1.2732919254658385e-06, + "loss": 0.3255, + "step": 82 + }, + { + "epoch": 0.03917910447761194, + "grad_norm": 20.49023173459329, + "learning_rate": 1.3043478260869566e-06, + "loss": 0.3151, + "step": 84 + }, + { + "epoch": 0.04011194029850746, + "grad_norm": 5.727461718902441, + "learning_rate": 1.3354037267080746e-06, + "loss": 0.3019, + "step": 86 + }, + { + "epoch": 0.041044776119402986, + "grad_norm": 19.508756868837175, + "learning_rate": 1.3664596273291927e-06, + "loss": 0.3101, + "step": 88 + }, + { + "epoch": 0.04197761194029851, + "grad_norm": 24.136335304817063, + "learning_rate": 1.3975155279503105e-06, + "loss": 0.318, + "step": 90 + }, + { + "epoch": 0.04291044776119403, + "grad_norm": 4.691307551468095, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.3065, + "step": 92 + }, + { + "epoch": 0.043843283582089554, + "grad_norm": 21.636244861854642, + "learning_rate": 1.4596273291925466e-06, + "loss": 0.3061, + "step": 94 + }, + { + "epoch": 0.04477611940298507, + "grad_norm": 15.923269773290114, + "learning_rate": 1.4906832298136647e-06, + "loss": 0.2792, + "step": 96 + }, + { + "epoch": 0.0457089552238806, + "grad_norm": 4.6897990036747546, + "learning_rate": 1.521739130434783e-06, + "loss": 0.3085, + "step": 98 + }, + { + "epoch": 0.04664179104477612, + "grad_norm": 7.167545200566593, + "learning_rate": 1.5527950310559006e-06, + "loss": 0.2911, + "step": 100 + }, + { + "epoch": 0.04757462686567164, + "grad_norm": 4.515330928399463, + "learning_rate": 1.5838509316770188e-06, + "loss": 0.2937, + "step": 102 + }, + { + "epoch": 0.048507462686567165, + "grad_norm": 4.511662569904355, + "learning_rate": 1.6149068322981367e-06, + "loss": 0.2898, + "step": 104 + }, + { + "epoch": 0.049440298507462684, + "grad_norm": 4.617959983767074, + "learning_rate": 1.645962732919255e-06, + "loss": 0.2755, + "step": 106 + }, + { + "epoch": 0.05037313432835821, + "grad_norm": 10.068851464677156, + "learning_rate": 1.6770186335403729e-06, + "loss": 0.2776, + "step": 108 + }, + { + "epoch": 0.051305970149253734, + "grad_norm": 6.1624005661275785, + "learning_rate": 1.7080745341614908e-06, + "loss": 0.3112, + "step": 110 + }, + { + "epoch": 0.05223880597014925, + "grad_norm": 4.701426834934968, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.2775, + "step": 112 + }, + { + "epoch": 0.05317164179104478, + "grad_norm": 10.590233086908592, + "learning_rate": 1.770186335403727e-06, + "loss": 0.2883, + "step": 114 + }, + { + "epoch": 0.054104477611940295, + "grad_norm": 4.270783196100384, + "learning_rate": 1.8012422360248449e-06, + "loss": 0.2931, + "step": 116 + }, + { + "epoch": 0.05503731343283582, + "grad_norm": 3.790903016107792, + "learning_rate": 1.832298136645963e-06, + "loss": 0.2963, + "step": 118 + }, + { + "epoch": 0.055970149253731345, + "grad_norm": 3.7190724453202084, + "learning_rate": 1.8633540372670808e-06, + "loss": 0.2805, + "step": 120 + }, + { + "epoch": 0.05690298507462686, + "grad_norm": 7.40396046398378, + "learning_rate": 1.894409937888199e-06, + "loss": 0.2754, + "step": 122 + }, + { + "epoch": 0.05783582089552239, + "grad_norm": 16.803503733002056, + "learning_rate": 1.925465838509317e-06, + "loss": 0.2662, + "step": 124 + }, + { + "epoch": 0.058768656716417914, + "grad_norm": 13.285898393084533, + "learning_rate": 1.956521739130435e-06, + "loss": 0.2735, + "step": 126 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 9.1224986259931, + "learning_rate": 1.987577639751553e-06, + "loss": 0.286, + "step": 128 + }, + { + "epoch": 0.06063432835820896, + "grad_norm": 5.613611256654138, + "learning_rate": 2.018633540372671e-06, + "loss": 0.2662, + "step": 130 + }, + { + "epoch": 0.061567164179104475, + "grad_norm": 7.545064210551538, + "learning_rate": 2.049689440993789e-06, + "loss": 0.2461, + "step": 132 + }, + { + "epoch": 0.0625, + "grad_norm": 4.173401955055914, + "learning_rate": 2.0807453416149073e-06, + "loss": 0.2715, + "step": 134 + }, + { + "epoch": 0.06343283582089553, + "grad_norm": 8.485933023187348, + "learning_rate": 2.111801242236025e-06, + "loss": 0.2566, + "step": 136 + }, + { + "epoch": 0.06436567164179105, + "grad_norm": 4.19842829482207, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.2735, + "step": 138 + }, + { + "epoch": 0.06529850746268656, + "grad_norm": 8.425962572484579, + "learning_rate": 2.173913043478261e-06, + "loss": 0.2659, + "step": 140 + }, + { + "epoch": 0.06623134328358209, + "grad_norm": 3.2805416423890033, + "learning_rate": 2.204968944099379e-06, + "loss": 0.2462, + "step": 142 + }, + { + "epoch": 0.06716417910447761, + "grad_norm": 3.3278531588236193, + "learning_rate": 2.236024844720497e-06, + "loss": 0.2564, + "step": 144 + }, + { + "epoch": 0.06809701492537314, + "grad_norm": 4.956170528518187, + "learning_rate": 2.2670807453416154e-06, + "loss": 0.2721, + "step": 146 + }, + { + "epoch": 0.06902985074626866, + "grad_norm": 3.722032231542065, + "learning_rate": 2.298136645962733e-06, + "loss": 0.2456, + "step": 148 + }, + { + "epoch": 0.06996268656716417, + "grad_norm": 12.012432614210493, + "learning_rate": 2.3291925465838513e-06, + "loss": 0.2564, + "step": 150 + }, + { + "epoch": 0.0708955223880597, + "grad_norm": 3.5922232704959334, + "learning_rate": 2.3602484472049692e-06, + "loss": 0.2595, + "step": 152 + }, + { + "epoch": 0.07182835820895522, + "grad_norm": 3.191669357887466, + "learning_rate": 2.391304347826087e-06, + "loss": 0.2398, + "step": 154 + }, + { + "epoch": 0.07276119402985075, + "grad_norm": 3.194123430092511, + "learning_rate": 2.422360248447205e-06, + "loss": 0.2318, + "step": 156 + }, + { + "epoch": 0.07369402985074627, + "grad_norm": 3.083149240607351, + "learning_rate": 2.453416149068323e-06, + "loss": 0.2267, + "step": 158 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 5.7100674979514885, + "learning_rate": 2.484472049689441e-06, + "loss": 0.2394, + "step": 160 + }, + { + "epoch": 0.07555970149253731, + "grad_norm": 4.621211487899202, + "learning_rate": 2.515527950310559e-06, + "loss": 0.2642, + "step": 162 + }, + { + "epoch": 0.07649253731343283, + "grad_norm": 4.216113351048156, + "learning_rate": 2.546583850931677e-06, + "loss": 0.232, + "step": 164 + }, + { + "epoch": 0.07742537313432836, + "grad_norm": 3.3356585618193075, + "learning_rate": 2.5776397515527953e-06, + "loss": 0.234, + "step": 166 + }, + { + "epoch": 0.07835820895522388, + "grad_norm": 5.092703909893854, + "learning_rate": 2.6086956521739132e-06, + "loss": 0.2259, + "step": 168 + }, + { + "epoch": 0.07929104477611941, + "grad_norm": 6.6435165949486725, + "learning_rate": 2.639751552795031e-06, + "loss": 0.2498, + "step": 170 + }, + { + "epoch": 0.08022388059701492, + "grad_norm": 3.849984947948696, + "learning_rate": 2.670807453416149e-06, + "loss": 0.2438, + "step": 172 + }, + { + "epoch": 0.08115671641791045, + "grad_norm": 3.510096516942484, + "learning_rate": 2.7018633540372675e-06, + "loss": 0.225, + "step": 174 + }, + { + "epoch": 0.08208955223880597, + "grad_norm": 3.1995013521028457, + "learning_rate": 2.7329192546583855e-06, + "loss": 0.2238, + "step": 176 + }, + { + "epoch": 0.0830223880597015, + "grad_norm": 3.2687726372112986, + "learning_rate": 2.7639751552795034e-06, + "loss": 0.207, + "step": 178 + }, + { + "epoch": 0.08395522388059702, + "grad_norm": 3.3120599930892243, + "learning_rate": 2.795031055900621e-06, + "loss": 0.2226, + "step": 180 + }, + { + "epoch": 0.08488805970149253, + "grad_norm": 3.230016392779267, + "learning_rate": 2.8260869565217393e-06, + "loss": 0.2168, + "step": 182 + }, + { + "epoch": 0.08582089552238806, + "grad_norm": 3.1363971094017042, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.2232, + "step": 184 + }, + { + "epoch": 0.08675373134328358, + "grad_norm": 2.757809443179066, + "learning_rate": 2.888198757763975e-06, + "loss": 0.2296, + "step": 186 + }, + { + "epoch": 0.08768656716417911, + "grad_norm": 3.1026324097251017, + "learning_rate": 2.919254658385093e-06, + "loss": 0.2174, + "step": 188 + }, + { + "epoch": 0.08861940298507463, + "grad_norm": 3.1563889262406772, + "learning_rate": 2.9503105590062115e-06, + "loss": 0.2205, + "step": 190 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 3.250136524075722, + "learning_rate": 2.9813664596273295e-06, + "loss": 0.2409, + "step": 192 + }, + { + "epoch": 0.09048507462686567, + "grad_norm": 2.9602639513067603, + "learning_rate": 3.0124223602484474e-06, + "loss": 0.2022, + "step": 194 + }, + { + "epoch": 0.0914179104477612, + "grad_norm": 2.734325686913689, + "learning_rate": 3.043478260869566e-06, + "loss": 0.2198, + "step": 196 + }, + { + "epoch": 0.09235074626865672, + "grad_norm": 2.556678285564254, + "learning_rate": 3.0745341614906837e-06, + "loss": 0.2069, + "step": 198 + }, + { + "epoch": 0.09328358208955224, + "grad_norm": 2.858382687845117, + "learning_rate": 3.1055900621118013e-06, + "loss": 0.2091, + "step": 200 + }, + { + "epoch": 0.09421641791044776, + "grad_norm": 3.011854932973755, + "learning_rate": 3.1366459627329192e-06, + "loss": 0.2168, + "step": 202 + }, + { + "epoch": 0.09514925373134328, + "grad_norm": 3.8235132670407874, + "learning_rate": 3.1677018633540376e-06, + "loss": 0.2174, + "step": 204 + }, + { + "epoch": 0.0960820895522388, + "grad_norm": 3.0628441148043666, + "learning_rate": 3.1987577639751555e-06, + "loss": 0.2191, + "step": 206 + }, + { + "epoch": 0.09701492537313433, + "grad_norm": 3.090876344596246, + "learning_rate": 3.2298136645962735e-06, + "loss": 0.2377, + "step": 208 + }, + { + "epoch": 0.09794776119402986, + "grad_norm": 2.7448364914565455, + "learning_rate": 3.2608695652173914e-06, + "loss": 0.2058, + "step": 210 + }, + { + "epoch": 0.09888059701492537, + "grad_norm": 2.5180579026465035, + "learning_rate": 3.29192546583851e-06, + "loss": 0.1897, + "step": 212 + }, + { + "epoch": 0.09981343283582089, + "grad_norm": 2.7339433310625654, + "learning_rate": 3.3229813664596278e-06, + "loss": 0.2213, + "step": 214 + }, + { + "epoch": 0.10074626865671642, + "grad_norm": 2.726903211923335, + "learning_rate": 3.3540372670807457e-06, + "loss": 0.2142, + "step": 216 + }, + { + "epoch": 0.10167910447761194, + "grad_norm": 2.866983094725867, + "learning_rate": 3.3850931677018632e-06, + "loss": 0.2079, + "step": 218 + }, + { + "epoch": 0.10261194029850747, + "grad_norm": 2.534992808853139, + "learning_rate": 3.4161490683229816e-06, + "loss": 0.2095, + "step": 220 + }, + { + "epoch": 0.10354477611940298, + "grad_norm": 2.7790532064716027, + "learning_rate": 3.4472049689440996e-06, + "loss": 0.2172, + "step": 222 + }, + { + "epoch": 0.1044776119402985, + "grad_norm": 3.0318314138749014, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.2075, + "step": 224 + }, + { + "epoch": 0.10541044776119403, + "grad_norm": 2.607798535697695, + "learning_rate": 3.5093167701863355e-06, + "loss": 0.2012, + "step": 226 + }, + { + "epoch": 0.10634328358208955, + "grad_norm": 3.2197760095787875, + "learning_rate": 3.540372670807454e-06, + "loss": 0.1986, + "step": 228 + }, + { + "epoch": 0.10727611940298508, + "grad_norm": 2.7939815005855237, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.2185, + "step": 230 + }, + { + "epoch": 0.10820895522388059, + "grad_norm": 2.5859920654046533, + "learning_rate": 3.6024844720496897e-06, + "loss": 0.2124, + "step": 232 + }, + { + "epoch": 0.10914179104477612, + "grad_norm": 2.503484587876454, + "learning_rate": 3.633540372670808e-06, + "loss": 0.219, + "step": 234 + }, + { + "epoch": 0.11007462686567164, + "grad_norm": 3.2942336044988125, + "learning_rate": 3.664596273291926e-06, + "loss": 0.2079, + "step": 236 + }, + { + "epoch": 0.11100746268656717, + "grad_norm": 2.6441126873407477, + "learning_rate": 3.6956521739130436e-06, + "loss": 0.2222, + "step": 238 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 2.487499257722225, + "learning_rate": 3.7267080745341615e-06, + "loss": 0.2257, + "step": 240 + }, + { + "epoch": 0.11287313432835822, + "grad_norm": 2.69554936919851, + "learning_rate": 3.7577639751552795e-06, + "loss": 0.1924, + "step": 242 + }, + { + "epoch": 0.11380597014925373, + "grad_norm": 2.5792569401105583, + "learning_rate": 3.788819875776398e-06, + "loss": 0.1943, + "step": 244 + }, + { + "epoch": 0.11473880597014925, + "grad_norm": 2.6356025530311453, + "learning_rate": 3.819875776397516e-06, + "loss": 0.2285, + "step": 246 + }, + { + "epoch": 0.11567164179104478, + "grad_norm": 2.3308293097806625, + "learning_rate": 3.850931677018634e-06, + "loss": 0.1999, + "step": 248 + }, + { + "epoch": 0.1166044776119403, + "grad_norm": 2.750925154827227, + "learning_rate": 3.881987577639752e-06, + "loss": 0.2221, + "step": 250 + }, + { + "epoch": 0.11753731343283583, + "grad_norm": 2.5408465330033754, + "learning_rate": 3.91304347826087e-06, + "loss": 0.2217, + "step": 252 + }, + { + "epoch": 0.11847014925373134, + "grad_norm": 2.698801962288923, + "learning_rate": 3.9440993788819884e-06, + "loss": 0.2335, + "step": 254 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 2.549182806708308, + "learning_rate": 3.975155279503106e-06, + "loss": 0.2037, + "step": 256 + }, + { + "epoch": 0.12033582089552239, + "grad_norm": 2.464821690021277, + "learning_rate": 4.0062111801242235e-06, + "loss": 0.2078, + "step": 258 + }, + { + "epoch": 0.12126865671641791, + "grad_norm": 2.321049960144863, + "learning_rate": 4.037267080745342e-06, + "loss": 0.2054, + "step": 260 + }, + { + "epoch": 0.12220149253731344, + "grad_norm": 2.493722894686357, + "learning_rate": 4.06832298136646e-06, + "loss": 0.2041, + "step": 262 + }, + { + "epoch": 0.12313432835820895, + "grad_norm": 2.442901683209422, + "learning_rate": 4.099378881987578e-06, + "loss": 0.2287, + "step": 264 + }, + { + "epoch": 0.12406716417910447, + "grad_norm": 4.717151474594673, + "learning_rate": 4.130434782608696e-06, + "loss": 0.2152, + "step": 266 + }, + { + "epoch": 0.125, + "grad_norm": 5.324903092991523, + "learning_rate": 4.1614906832298145e-06, + "loss": 0.2199, + "step": 268 + }, + { + "epoch": 0.1259328358208955, + "grad_norm": 2.518797731342692, + "learning_rate": 4.192546583850932e-06, + "loss": 0.2109, + "step": 270 + }, + { + "epoch": 0.12686567164179105, + "grad_norm": 2.5709618938357695, + "learning_rate": 4.22360248447205e-06, + "loss": 0.2112, + "step": 272 + }, + { + "epoch": 0.12779850746268656, + "grad_norm": 2.6970173166428264, + "learning_rate": 4.254658385093168e-06, + "loss": 0.2067, + "step": 274 + }, + { + "epoch": 0.1287313432835821, + "grad_norm": 2.264863737900119, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.1838, + "step": 276 + }, + { + "epoch": 0.1296641791044776, + "grad_norm": 2.2780908029059295, + "learning_rate": 4.316770186335404e-06, + "loss": 0.2125, + "step": 278 + }, + { + "epoch": 0.13059701492537312, + "grad_norm": 2.352595434996149, + "learning_rate": 4.347826086956522e-06, + "loss": 0.2182, + "step": 280 + }, + { + "epoch": 0.13152985074626866, + "grad_norm": 2.3226612212246165, + "learning_rate": 4.37888198757764e-06, + "loss": 0.1962, + "step": 282 + }, + { + "epoch": 0.13246268656716417, + "grad_norm": 2.1616013785625374, + "learning_rate": 4.409937888198758e-06, + "loss": 0.2112, + "step": 284 + }, + { + "epoch": 0.1333955223880597, + "grad_norm": 2.269898646687184, + "learning_rate": 4.4409937888198765e-06, + "loss": 0.2355, + "step": 286 + }, + { + "epoch": 0.13432835820895522, + "grad_norm": 2.2764896749881225, + "learning_rate": 4.472049689440994e-06, + "loss": 0.2184, + "step": 288 + }, + { + "epoch": 0.13526119402985073, + "grad_norm": 2.216197208473881, + "learning_rate": 4.503105590062112e-06, + "loss": 0.2259, + "step": 290 + }, + { + "epoch": 0.13619402985074627, + "grad_norm": 2.4094758119083055, + "learning_rate": 4.534161490683231e-06, + "loss": 0.2059, + "step": 292 + }, + { + "epoch": 0.13712686567164178, + "grad_norm": 2.6476080672295197, + "learning_rate": 4.565217391304348e-06, + "loss": 0.2147, + "step": 294 + }, + { + "epoch": 0.13805970149253732, + "grad_norm": 2.21698603592508, + "learning_rate": 4.596273291925466e-06, + "loss": 0.199, + "step": 296 + }, + { + "epoch": 0.13899253731343283, + "grad_norm": 2.963295045616477, + "learning_rate": 4.627329192546584e-06, + "loss": 0.2067, + "step": 298 + }, + { + "epoch": 0.13992537313432835, + "grad_norm": 2.0669567246443896, + "learning_rate": 4.6583850931677025e-06, + "loss": 0.2151, + "step": 300 + }, + { + "epoch": 0.14085820895522388, + "grad_norm": 2.1816566602665297, + "learning_rate": 4.68944099378882e-06, + "loss": 0.205, + "step": 302 + }, + { + "epoch": 0.1417910447761194, + "grad_norm": 2.4711350719567413, + "learning_rate": 4.7204968944099384e-06, + "loss": 0.2186, + "step": 304 + }, + { + "epoch": 0.14272388059701493, + "grad_norm": 2.6305476092002102, + "learning_rate": 4.751552795031056e-06, + "loss": 0.2188, + "step": 306 + }, + { + "epoch": 0.14365671641791045, + "grad_norm": 3.7926853735649146, + "learning_rate": 4.782608695652174e-06, + "loss": 0.2146, + "step": 308 + }, + { + "epoch": 0.14458955223880596, + "grad_norm": 2.3247573472731755, + "learning_rate": 4.813664596273293e-06, + "loss": 0.189, + "step": 310 + }, + { + "epoch": 0.1455223880597015, + "grad_norm": 3.506305560875989, + "learning_rate": 4.84472049689441e-06, + "loss": 0.204, + "step": 312 + }, + { + "epoch": 0.146455223880597, + "grad_norm": 4.299984565412355, + "learning_rate": 4.875776397515528e-06, + "loss": 0.1872, + "step": 314 + }, + { + "epoch": 0.14738805970149255, + "grad_norm": 2.9268086038939476, + "learning_rate": 4.906832298136646e-06, + "loss": 0.2021, + "step": 316 + }, + { + "epoch": 0.14832089552238806, + "grad_norm": 2.1656703445175864, + "learning_rate": 4.9378881987577645e-06, + "loss": 0.2024, + "step": 318 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 2.2863114880706834, + "learning_rate": 4.968944099378882e-06, + "loss": 0.2226, + "step": 320 + }, + { + "epoch": 0.1501865671641791, + "grad_norm": 2.9771175284921316, + "learning_rate": 5e-06, + "loss": 0.2112, + "step": 322 + }, + { + "epoch": 0.15111940298507462, + "grad_norm": 2.4893906337440925, + "learning_rate": 5.031055900621118e-06, + "loss": 0.2073, + "step": 324 + }, + { + "epoch": 0.15205223880597016, + "grad_norm": 3.6982120041090334, + "learning_rate": 5.062111801242236e-06, + "loss": 0.213, + "step": 326 + }, + { + "epoch": 0.15298507462686567, + "grad_norm": 2.3205143866164835, + "learning_rate": 5.093167701863354e-06, + "loss": 0.2004, + "step": 328 + }, + { + "epoch": 0.15391791044776118, + "grad_norm": 21.074705336946614, + "learning_rate": 5.124223602484473e-06, + "loss": 0.209, + "step": 330 + }, + { + "epoch": 0.15485074626865672, + "grad_norm": 13.885594746135343, + "learning_rate": 5.155279503105591e-06, + "loss": 0.1944, + "step": 332 + }, + { + "epoch": 0.15578358208955223, + "grad_norm": 2.591129823426044, + "learning_rate": 5.186335403726709e-06, + "loss": 0.2234, + "step": 334 + }, + { + "epoch": 0.15671641791044777, + "grad_norm": 3.082266049414265, + "learning_rate": 5.2173913043478265e-06, + "loss": 0.2056, + "step": 336 + }, + { + "epoch": 0.15764925373134328, + "grad_norm": 6.539986385179368, + "learning_rate": 5.248447204968945e-06, + "loss": 0.2135, + "step": 338 + }, + { + "epoch": 0.15858208955223882, + "grad_norm": 2.4097236824819745, + "learning_rate": 5.279503105590062e-06, + "loss": 0.2327, + "step": 340 + }, + { + "epoch": 0.15951492537313433, + "grad_norm": 2.253452500793219, + "learning_rate": 5.31055900621118e-06, + "loss": 0.2199, + "step": 342 + }, + { + "epoch": 0.16044776119402984, + "grad_norm": 7.406755784306411, + "learning_rate": 5.341614906832298e-06, + "loss": 0.2211, + "step": 344 + }, + { + "epoch": 0.16138059701492538, + "grad_norm": 2.7739931927374974, + "learning_rate": 5.372670807453416e-06, + "loss": 0.2054, + "step": 346 + }, + { + "epoch": 0.1623134328358209, + "grad_norm": 3.7795105893695244, + "learning_rate": 5.403726708074535e-06, + "loss": 0.2133, + "step": 348 + }, + { + "epoch": 0.16324626865671643, + "grad_norm": 2.2534160135806265, + "learning_rate": 5.4347826086956525e-06, + "loss": 0.2147, + "step": 350 + }, + { + "epoch": 0.16417910447761194, + "grad_norm": 2.1919756416116405, + "learning_rate": 5.465838509316771e-06, + "loss": 0.2156, + "step": 352 + }, + { + "epoch": 0.16511194029850745, + "grad_norm": 2.5787391970560107, + "learning_rate": 5.4968944099378884e-06, + "loss": 0.2155, + "step": 354 + }, + { + "epoch": 0.166044776119403, + "grad_norm": 2.807216789717174, + "learning_rate": 5.527950310559007e-06, + "loss": 0.209, + "step": 356 + }, + { + "epoch": 0.1669776119402985, + "grad_norm": 2.596292546872093, + "learning_rate": 5.559006211180124e-06, + "loss": 0.2105, + "step": 358 + }, + { + "epoch": 0.16791044776119404, + "grad_norm": 2.6143591049228765, + "learning_rate": 5.590062111801242e-06, + "loss": 0.1968, + "step": 360 + }, + { + "epoch": 0.16884328358208955, + "grad_norm": 2.1877348191610486, + "learning_rate": 5.621118012422361e-06, + "loss": 0.2227, + "step": 362 + }, + { + "epoch": 0.16977611940298507, + "grad_norm": 2.2277996288851893, + "learning_rate": 5.652173913043479e-06, + "loss": 0.2053, + "step": 364 + }, + { + "epoch": 0.1707089552238806, + "grad_norm": 1.9344505939464096, + "learning_rate": 5.683229813664597e-06, + "loss": 0.1832, + "step": 366 + }, + { + "epoch": 0.17164179104477612, + "grad_norm": 2.11594386840255, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.2068, + "step": 368 + }, + { + "epoch": 0.17257462686567165, + "grad_norm": 2.267187349982284, + "learning_rate": 5.745341614906833e-06, + "loss": 0.2328, + "step": 370 + }, + { + "epoch": 0.17350746268656717, + "grad_norm": 2.3086656178819065, + "learning_rate": 5.77639751552795e-06, + "loss": 0.2077, + "step": 372 + }, + { + "epoch": 0.17444029850746268, + "grad_norm": 2.618455337831308, + "learning_rate": 5.80745341614907e-06, + "loss": 0.2223, + "step": 374 + }, + { + "epoch": 0.17537313432835822, + "grad_norm": 2.3160961499030477, + "learning_rate": 5.838509316770186e-06, + "loss": 0.2025, + "step": 376 + }, + { + "epoch": 0.17630597014925373, + "grad_norm": 2.515044712544632, + "learning_rate": 5.8695652173913055e-06, + "loss": 0.2129, + "step": 378 + }, + { + "epoch": 0.17723880597014927, + "grad_norm": 2.344979232961502, + "learning_rate": 5.900621118012423e-06, + "loss": 0.2018, + "step": 380 + }, + { + "epoch": 0.17817164179104478, + "grad_norm": 3.0863869442400094, + "learning_rate": 5.931677018633541e-06, + "loss": 0.2211, + "step": 382 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 2.0661061390705107, + "learning_rate": 5.962732919254659e-06, + "loss": 0.2115, + "step": 384 + }, + { + "epoch": 0.18003731343283583, + "grad_norm": 2.2332463397440834, + "learning_rate": 5.9937888198757765e-06, + "loss": 0.2175, + "step": 386 + }, + { + "epoch": 0.18097014925373134, + "grad_norm": 2.2404951359471794, + "learning_rate": 6.024844720496895e-06, + "loss": 0.219, + "step": 388 + }, + { + "epoch": 0.18190298507462688, + "grad_norm": 2.0830946206969503, + "learning_rate": 6.055900621118012e-06, + "loss": 0.2138, + "step": 390 + }, + { + "epoch": 0.1828358208955224, + "grad_norm": 2.451234201268342, + "learning_rate": 6.086956521739132e-06, + "loss": 0.2101, + "step": 392 + }, + { + "epoch": 0.1837686567164179, + "grad_norm": 2.261356156925831, + "learning_rate": 6.118012422360249e-06, + "loss": 0.2258, + "step": 394 + }, + { + "epoch": 0.18470149253731344, + "grad_norm": 2.2792460661051694, + "learning_rate": 6.1490683229813675e-06, + "loss": 0.2174, + "step": 396 + }, + { + "epoch": 0.18563432835820895, + "grad_norm": 2.093405250955453, + "learning_rate": 6.180124223602485e-06, + "loss": 0.2079, + "step": 398 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 1.8380326706729546, + "learning_rate": 6.2111801242236025e-06, + "loss": 0.2016, + "step": 400 + }, + { + "epoch": 0.1875, + "grad_norm": 2.2798338105414873, + "learning_rate": 6.242236024844721e-06, + "loss": 0.2031, + "step": 402 + }, + { + "epoch": 0.1884328358208955, + "grad_norm": 2.2241009433733305, + "learning_rate": 6.2732919254658384e-06, + "loss": 0.215, + "step": 404 + }, + { + "epoch": 0.18936567164179105, + "grad_norm": 1.778064175691528, + "learning_rate": 6.304347826086958e-06, + "loss": 0.1918, + "step": 406 + }, + { + "epoch": 0.19029850746268656, + "grad_norm": 2.2863004818403145, + "learning_rate": 6.335403726708075e-06, + "loss": 0.2047, + "step": 408 + }, + { + "epoch": 0.1912313432835821, + "grad_norm": 2.09457967279415, + "learning_rate": 6.3664596273291936e-06, + "loss": 0.1976, + "step": 410 + }, + { + "epoch": 0.1921641791044776, + "grad_norm": 2.0479959118495237, + "learning_rate": 6.397515527950311e-06, + "loss": 0.1945, + "step": 412 + }, + { + "epoch": 0.19309701492537312, + "grad_norm": 1.9547549466115088, + "learning_rate": 6.4285714285714295e-06, + "loss": 0.1873, + "step": 414 + }, + { + "epoch": 0.19402985074626866, + "grad_norm": 2.640810733476452, + "learning_rate": 6.459627329192547e-06, + "loss": 0.2207, + "step": 416 + }, + { + "epoch": 0.19496268656716417, + "grad_norm": 2.1638071780760972, + "learning_rate": 6.4906832298136645e-06, + "loss": 0.2036, + "step": 418 + }, + { + "epoch": 0.1958955223880597, + "grad_norm": 2.3730015296994753, + "learning_rate": 6.521739130434783e-06, + "loss": 0.1951, + "step": 420 + }, + { + "epoch": 0.19682835820895522, + "grad_norm": 1.8550104155904206, + "learning_rate": 6.5527950310559e-06, + "loss": 0.2211, + "step": 422 + }, + { + "epoch": 0.19776119402985073, + "grad_norm": 2.171542641520255, + "learning_rate": 6.58385093167702e-06, + "loss": 0.2051, + "step": 424 + }, + { + "epoch": 0.19869402985074627, + "grad_norm": 2.120619792279606, + "learning_rate": 6.614906832298137e-06, + "loss": 0.2241, + "step": 426 + }, + { + "epoch": 0.19962686567164178, + "grad_norm": 2.109162558503626, + "learning_rate": 6.6459627329192555e-06, + "loss": 0.2179, + "step": 428 + }, + { + "epoch": 0.20055970149253732, + "grad_norm": 1.9529233895125344, + "learning_rate": 6.677018633540373e-06, + "loss": 0.2235, + "step": 430 + }, + { + "epoch": 0.20149253731343283, + "grad_norm": 1.9975837636413338, + "learning_rate": 6.7080745341614914e-06, + "loss": 0.2027, + "step": 432 + }, + { + "epoch": 0.20242537313432835, + "grad_norm": 1.7993809748778602, + "learning_rate": 6.739130434782609e-06, + "loss": 0.2098, + "step": 434 + }, + { + "epoch": 0.20335820895522388, + "grad_norm": 1.9132091416861527, + "learning_rate": 6.7701863354037265e-06, + "loss": 0.2081, + "step": 436 + }, + { + "epoch": 0.2042910447761194, + "grad_norm": 2.242483796211979, + "learning_rate": 6.801242236024846e-06, + "loss": 0.2234, + "step": 438 + }, + { + "epoch": 0.20522388059701493, + "grad_norm": 1.8892510327716943, + "learning_rate": 6.832298136645963e-06, + "loss": 0.2041, + "step": 440 + }, + { + "epoch": 0.20615671641791045, + "grad_norm": 1.805957622431701, + "learning_rate": 6.863354037267082e-06, + "loss": 0.2236, + "step": 442 + }, + { + "epoch": 0.20708955223880596, + "grad_norm": 2.152721916937432, + "learning_rate": 6.894409937888199e-06, + "loss": 0.2186, + "step": 444 + }, + { + "epoch": 0.2080223880597015, + "grad_norm": 1.9420097502059597, + "learning_rate": 6.9254658385093175e-06, + "loss": 0.2187, + "step": 446 + }, + { + "epoch": 0.208955223880597, + "grad_norm": 1.8948379598264316, + "learning_rate": 6.956521739130435e-06, + "loss": 0.2223, + "step": 448 + }, + { + "epoch": 0.20988805970149255, + "grad_norm": 2.0999160252470013, + "learning_rate": 6.987577639751553e-06, + "loss": 0.2092, + "step": 450 + }, + { + "epoch": 0.21082089552238806, + "grad_norm": 2.010887578337831, + "learning_rate": 7.018633540372671e-06, + "loss": 0.2046, + "step": 452 + }, + { + "epoch": 0.21175373134328357, + "grad_norm": 1.9643307023414667, + "learning_rate": 7.04968944099379e-06, + "loss": 0.21, + "step": 454 + }, + { + "epoch": 0.2126865671641791, + "grad_norm": 2.019291767227252, + "learning_rate": 7.080745341614908e-06, + "loss": 0.2173, + "step": 456 + }, + { + "epoch": 0.21361940298507462, + "grad_norm": 1.8467481935245105, + "learning_rate": 7.111801242236025e-06, + "loss": 0.2167, + "step": 458 + }, + { + "epoch": 0.21455223880597016, + "grad_norm": 1.9523692161314665, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.1908, + "step": 460 + }, + { + "epoch": 0.21548507462686567, + "grad_norm": 2.112706188943294, + "learning_rate": 7.173913043478261e-06, + "loss": 0.2172, + "step": 462 + }, + { + "epoch": 0.21641791044776118, + "grad_norm": 1.923923317267627, + "learning_rate": 7.2049689440993795e-06, + "loss": 0.2312, + "step": 464 + }, + { + "epoch": 0.21735074626865672, + "grad_norm": 1.8881754294705932, + "learning_rate": 7.236024844720497e-06, + "loss": 0.2207, + "step": 466 + }, + { + "epoch": 0.21828358208955223, + "grad_norm": 1.889002257004106, + "learning_rate": 7.267080745341616e-06, + "loss": 0.1983, + "step": 468 + }, + { + "epoch": 0.21921641791044777, + "grad_norm": 2.1115379556851375, + "learning_rate": 7.298136645962734e-06, + "loss": 0.2164, + "step": 470 + }, + { + "epoch": 0.22014925373134328, + "grad_norm": 1.9550224775213136, + "learning_rate": 7.329192546583852e-06, + "loss": 0.2272, + "step": 472 + }, + { + "epoch": 0.22108208955223882, + "grad_norm": 1.8510156882099758, + "learning_rate": 7.36024844720497e-06, + "loss": 0.1985, + "step": 474 + }, + { + "epoch": 0.22201492537313433, + "grad_norm": 1.9840100594082613, + "learning_rate": 7.391304347826087e-06, + "loss": 0.2409, + "step": 476 + }, + { + "epoch": 0.22294776119402984, + "grad_norm": 1.9922603729585162, + "learning_rate": 7.4223602484472055e-06, + "loss": 0.2073, + "step": 478 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 1.7991976541148942, + "learning_rate": 7.453416149068323e-06, + "loss": 0.2142, + "step": 480 + }, + { + "epoch": 0.2248134328358209, + "grad_norm": 1.693102560201708, + "learning_rate": 7.484472049689442e-06, + "loss": 0.2099, + "step": 482 + }, + { + "epoch": 0.22574626865671643, + "grad_norm": 1.9155034278566883, + "learning_rate": 7.515527950310559e-06, + "loss": 0.2167, + "step": 484 + }, + { + "epoch": 0.22667910447761194, + "grad_norm": 2.1053863237987844, + "learning_rate": 7.546583850931678e-06, + "loss": 0.2263, + "step": 486 + }, + { + "epoch": 0.22761194029850745, + "grad_norm": 1.9838518240767105, + "learning_rate": 7.577639751552796e-06, + "loss": 0.2073, + "step": 488 + }, + { + "epoch": 0.228544776119403, + "grad_norm": 1.7627840219588304, + "learning_rate": 7.608695652173914e-06, + "loss": 0.2045, + "step": 490 + }, + { + "epoch": 0.2294776119402985, + "grad_norm": 1.897779077585881, + "learning_rate": 7.639751552795032e-06, + "loss": 0.2069, + "step": 492 + }, + { + "epoch": 0.23041044776119404, + "grad_norm": 1.6493063407948891, + "learning_rate": 7.670807453416149e-06, + "loss": 0.1924, + "step": 494 + }, + { + "epoch": 0.23134328358208955, + "grad_norm": 1.8688660015125396, + "learning_rate": 7.701863354037268e-06, + "loss": 0.2185, + "step": 496 + }, + { + "epoch": 0.23227611940298507, + "grad_norm": 2.140641877622803, + "learning_rate": 7.732919254658386e-06, + "loss": 0.2319, + "step": 498 + }, + { + "epoch": 0.2332089552238806, + "grad_norm": 1.966693543062277, + "learning_rate": 7.763975155279503e-06, + "loss": 0.2035, + "step": 500 + }, + { + "epoch": 0.2332089552238806, + "eval_loss": 0.19479210674762726, + "eval_runtime": 321.5979, + "eval_samples_per_second": 47.407, + "eval_steps_per_second": 5.927, + "step": 500 + }, + { + "epoch": 0.23414179104477612, + "grad_norm": 1.9178591333239856, + "learning_rate": 7.795031055900621e-06, + "loss": 0.2146, + "step": 502 + }, + { + "epoch": 0.23507462686567165, + "grad_norm": 1.780460380240722, + "learning_rate": 7.82608695652174e-06, + "loss": 0.2124, + "step": 504 + }, + { + "epoch": 0.23600746268656717, + "grad_norm": 1.7863391968412494, + "learning_rate": 7.857142857142858e-06, + "loss": 0.2091, + "step": 506 + }, + { + "epoch": 0.23694029850746268, + "grad_norm": 1.7153215475435755, + "learning_rate": 7.888198757763977e-06, + "loss": 0.2295, + "step": 508 + }, + { + "epoch": 0.23787313432835822, + "grad_norm": 1.718731451315573, + "learning_rate": 7.919254658385094e-06, + "loss": 0.1927, + "step": 510 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 1.9590871151290359, + "learning_rate": 7.950310559006212e-06, + "loss": 0.221, + "step": 512 + }, + { + "epoch": 0.23973880597014927, + "grad_norm": 1.837007794513856, + "learning_rate": 7.98136645962733e-06, + "loss": 0.2062, + "step": 514 + }, + { + "epoch": 0.24067164179104478, + "grad_norm": 1.8635351879601927, + "learning_rate": 8.012422360248447e-06, + "loss": 0.2114, + "step": 516 + }, + { + "epoch": 0.2416044776119403, + "grad_norm": 1.7952872366608108, + "learning_rate": 8.043478260869566e-06, + "loss": 0.2008, + "step": 518 + }, + { + "epoch": 0.24253731343283583, + "grad_norm": 1.7484132593526749, + "learning_rate": 8.074534161490684e-06, + "loss": 0.2159, + "step": 520 + }, + { + "epoch": 0.24347014925373134, + "grad_norm": 2.4417074811249675, + "learning_rate": 8.105590062111803e-06, + "loss": 0.1903, + "step": 522 + }, + { + "epoch": 0.24440298507462688, + "grad_norm": 1.6718782667387835, + "learning_rate": 8.13664596273292e-06, + "loss": 0.1878, + "step": 524 + }, + { + "epoch": 0.2453358208955224, + "grad_norm": 1.8620075936961356, + "learning_rate": 8.167701863354038e-06, + "loss": 0.2327, + "step": 526 + }, + { + "epoch": 0.2462686567164179, + "grad_norm": 1.8676208814049724, + "learning_rate": 8.198757763975156e-06, + "loss": 0.2466, + "step": 528 + }, + { + "epoch": 0.24720149253731344, + "grad_norm": 1.838820078487303, + "learning_rate": 8.229813664596275e-06, + "loss": 0.2144, + "step": 530 + }, + { + "epoch": 0.24813432835820895, + "grad_norm": 1.816911913100488, + "learning_rate": 8.260869565217392e-06, + "loss": 0.2211, + "step": 532 + }, + { + "epoch": 0.2490671641791045, + "grad_norm": 1.8537734772298116, + "learning_rate": 8.29192546583851e-06, + "loss": 0.2053, + "step": 534 + }, + { + "epoch": 0.25, + "grad_norm": 1.8897849266931421, + "learning_rate": 8.322981366459629e-06, + "loss": 0.2064, + "step": 536 + }, + { + "epoch": 0.25093283582089554, + "grad_norm": 1.8253407832401187, + "learning_rate": 8.354037267080745e-06, + "loss": 0.2216, + "step": 538 + }, + { + "epoch": 0.251865671641791, + "grad_norm": 1.740543427209268, + "learning_rate": 8.385093167701864e-06, + "loss": 0.2311, + "step": 540 + }, + { + "epoch": 0.25279850746268656, + "grad_norm": 1.7169455656851966, + "learning_rate": 8.416149068322982e-06, + "loss": 0.2254, + "step": 542 + }, + { + "epoch": 0.2537313432835821, + "grad_norm": 1.9161988490932003, + "learning_rate": 8.4472049689441e-06, + "loss": 0.2273, + "step": 544 + }, + { + "epoch": 0.25466417910447764, + "grad_norm": 1.713283854075844, + "learning_rate": 8.478260869565218e-06, + "loss": 0.2149, + "step": 546 + }, + { + "epoch": 0.2555970149253731, + "grad_norm": 1.7178625837979558, + "learning_rate": 8.509316770186336e-06, + "loss": 0.2042, + "step": 548 + }, + { + "epoch": 0.25652985074626866, + "grad_norm": 1.6542895832157922, + "learning_rate": 8.540372670807453e-06, + "loss": 0.2151, + "step": 550 + }, + { + "epoch": 0.2574626865671642, + "grad_norm": 1.9465538700046936, + "learning_rate": 8.571428571428571e-06, + "loss": 0.2194, + "step": 552 + }, + { + "epoch": 0.2583955223880597, + "grad_norm": 1.8863908174945532, + "learning_rate": 8.60248447204969e-06, + "loss": 0.2086, + "step": 554 + }, + { + "epoch": 0.2593283582089552, + "grad_norm": 1.7837760578014812, + "learning_rate": 8.633540372670808e-06, + "loss": 0.209, + "step": 556 + }, + { + "epoch": 0.26026119402985076, + "grad_norm": 1.6815654119881749, + "learning_rate": 8.664596273291927e-06, + "loss": 0.2193, + "step": 558 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 1.6722447753053575, + "learning_rate": 8.695652173913044e-06, + "loss": 0.2095, + "step": 560 + }, + { + "epoch": 0.2621268656716418, + "grad_norm": 1.7256029683148997, + "learning_rate": 8.726708074534162e-06, + "loss": 0.2075, + "step": 562 + }, + { + "epoch": 0.2630597014925373, + "grad_norm": 1.8130764308944542, + "learning_rate": 8.75776397515528e-06, + "loss": 0.2272, + "step": 564 + }, + { + "epoch": 0.26399253731343286, + "grad_norm": 1.7483793585991647, + "learning_rate": 8.788819875776399e-06, + "loss": 0.2074, + "step": 566 + }, + { + "epoch": 0.26492537313432835, + "grad_norm": 1.8676174583217775, + "learning_rate": 8.819875776397516e-06, + "loss": 0.2206, + "step": 568 + }, + { + "epoch": 0.2658582089552239, + "grad_norm": 1.704628718340115, + "learning_rate": 8.850931677018634e-06, + "loss": 0.2304, + "step": 570 + }, + { + "epoch": 0.2667910447761194, + "grad_norm": 1.7129811521018985, + "learning_rate": 8.881987577639753e-06, + "loss": 0.2215, + "step": 572 + }, + { + "epoch": 0.2677238805970149, + "grad_norm": 1.5844731018588687, + "learning_rate": 8.91304347826087e-06, + "loss": 0.2129, + "step": 574 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 1.7078677167962482, + "learning_rate": 8.944099378881988e-06, + "loss": 0.1918, + "step": 576 + }, + { + "epoch": 0.269589552238806, + "grad_norm": 1.8394746045502772, + "learning_rate": 8.975155279503106e-06, + "loss": 0.2075, + "step": 578 + }, + { + "epoch": 0.27052238805970147, + "grad_norm": 1.8533668149894602, + "learning_rate": 9.006211180124225e-06, + "loss": 0.2177, + "step": 580 + }, + { + "epoch": 0.271455223880597, + "grad_norm": 1.6706294870333611, + "learning_rate": 9.037267080745342e-06, + "loss": 0.2143, + "step": 582 + }, + { + "epoch": 0.27238805970149255, + "grad_norm": 1.6135862098490785, + "learning_rate": 9.068322981366461e-06, + "loss": 0.1932, + "step": 584 + }, + { + "epoch": 0.2733208955223881, + "grad_norm": 1.8039009332081763, + "learning_rate": 9.099378881987579e-06, + "loss": 0.214, + "step": 586 + }, + { + "epoch": 0.27425373134328357, + "grad_norm": 1.736199962187512, + "learning_rate": 9.130434782608697e-06, + "loss": 0.2076, + "step": 588 + }, + { + "epoch": 0.2751865671641791, + "grad_norm": 1.55239171764077, + "learning_rate": 9.161490683229814e-06, + "loss": 0.2205, + "step": 590 + }, + { + "epoch": 0.27611940298507465, + "grad_norm": 1.6499019479698962, + "learning_rate": 9.192546583850932e-06, + "loss": 0.2264, + "step": 592 + }, + { + "epoch": 0.27705223880597013, + "grad_norm": 1.669115952705018, + "learning_rate": 9.22360248447205e-06, + "loss": 0.2081, + "step": 594 + }, + { + "epoch": 0.27798507462686567, + "grad_norm": 1.7349736524350674, + "learning_rate": 9.254658385093168e-06, + "loss": 0.2182, + "step": 596 + }, + { + "epoch": 0.2789179104477612, + "grad_norm": 1.6120928347026444, + "learning_rate": 9.285714285714288e-06, + "loss": 0.2097, + "step": 598 + }, + { + "epoch": 0.2798507462686567, + "grad_norm": 1.691517126303103, + "learning_rate": 9.316770186335405e-06, + "loss": 0.2222, + "step": 600 + }, + { + "epoch": 0.28078358208955223, + "grad_norm": 1.6356393070257824, + "learning_rate": 9.347826086956523e-06, + "loss": 0.2137, + "step": 602 + }, + { + "epoch": 0.28171641791044777, + "grad_norm": 1.6305519634550314, + "learning_rate": 9.37888198757764e-06, + "loss": 0.2055, + "step": 604 + }, + { + "epoch": 0.2826492537313433, + "grad_norm": 1.6004971093968854, + "learning_rate": 9.40993788819876e-06, + "loss": 0.2103, + "step": 606 + }, + { + "epoch": 0.2835820895522388, + "grad_norm": 1.6406649340996644, + "learning_rate": 9.440993788819877e-06, + "loss": 0.2062, + "step": 608 + }, + { + "epoch": 0.28451492537313433, + "grad_norm": 1.7544618761014004, + "learning_rate": 9.472049689440994e-06, + "loss": 0.2005, + "step": 610 + }, + { + "epoch": 0.28544776119402987, + "grad_norm": 1.629551994561078, + "learning_rate": 9.503105590062112e-06, + "loss": 0.2048, + "step": 612 + }, + { + "epoch": 0.28638059701492535, + "grad_norm": 1.6262560501069863, + "learning_rate": 9.53416149068323e-06, + "loss": 0.1945, + "step": 614 + }, + { + "epoch": 0.2873134328358209, + "grad_norm": 1.7007625240298931, + "learning_rate": 9.565217391304349e-06, + "loss": 0.2118, + "step": 616 + }, + { + "epoch": 0.28824626865671643, + "grad_norm": 1.727038130444209, + "learning_rate": 9.596273291925466e-06, + "loss": 0.2124, + "step": 618 + }, + { + "epoch": 0.2891791044776119, + "grad_norm": 1.5071465929059924, + "learning_rate": 9.627329192546585e-06, + "loss": 0.2272, + "step": 620 + }, + { + "epoch": 0.29011194029850745, + "grad_norm": 1.6030429455419744, + "learning_rate": 9.658385093167703e-06, + "loss": 0.2079, + "step": 622 + }, + { + "epoch": 0.291044776119403, + "grad_norm": 1.8698961221664092, + "learning_rate": 9.68944099378882e-06, + "loss": 0.22, + "step": 624 + }, + { + "epoch": 0.29197761194029853, + "grad_norm": 1.6379825252507367, + "learning_rate": 9.720496894409938e-06, + "loss": 0.2077, + "step": 626 + }, + { + "epoch": 0.292910447761194, + "grad_norm": 1.5160586852181357, + "learning_rate": 9.751552795031056e-06, + "loss": 0.1941, + "step": 628 + }, + { + "epoch": 0.29384328358208955, + "grad_norm": 1.5030995131499334, + "learning_rate": 9.782608695652175e-06, + "loss": 0.2262, + "step": 630 + }, + { + "epoch": 0.2947761194029851, + "grad_norm": 1.6670193534167812, + "learning_rate": 9.813664596273292e-06, + "loss": 0.2215, + "step": 632 + }, + { + "epoch": 0.2957089552238806, + "grad_norm": 1.427100498028422, + "learning_rate": 9.844720496894411e-06, + "loss": 0.2165, + "step": 634 + }, + { + "epoch": 0.2966417910447761, + "grad_norm": 1.501137086209554, + "learning_rate": 9.875776397515529e-06, + "loss": 0.2255, + "step": 636 + }, + { + "epoch": 0.29757462686567165, + "grad_norm": 1.4127915167067975, + "learning_rate": 9.906832298136647e-06, + "loss": 0.2076, + "step": 638 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 1.6065120762122185, + "learning_rate": 9.937888198757764e-06, + "loss": 0.2198, + "step": 640 + }, + { + "epoch": 0.2994402985074627, + "grad_norm": 1.970171686642738, + "learning_rate": 9.968944099378883e-06, + "loss": 0.2423, + "step": 642 + }, + { + "epoch": 0.3003731343283582, + "grad_norm": 1.6876725397175933, + "learning_rate": 1e-05, + "loss": 0.2238, + "step": 644 + }, + { + "epoch": 0.30130597014925375, + "grad_norm": 1.6225950050699995, + "learning_rate": 9.99999705393274e-06, + "loss": 0.2146, + "step": 646 + }, + { + "epoch": 0.30223880597014924, + "grad_norm": 1.623384986651765, + "learning_rate": 9.999988215734431e-06, + "loss": 0.222, + "step": 648 + }, + { + "epoch": 0.3031716417910448, + "grad_norm": 1.562280211952703, + "learning_rate": 9.999973485415487e-06, + "loss": 0.1901, + "step": 650 + }, + { + "epoch": 0.3041044776119403, + "grad_norm": 1.9282897270749833, + "learning_rate": 9.999952862993265e-06, + "loss": 0.2388, + "step": 652 + }, + { + "epoch": 0.3050373134328358, + "grad_norm": 1.6714150197964353, + "learning_rate": 9.99992634849207e-06, + "loss": 0.2177, + "step": 654 + }, + { + "epoch": 0.30597014925373134, + "grad_norm": 1.6737911088642945, + "learning_rate": 9.999893941943148e-06, + "loss": 0.2046, + "step": 656 + }, + { + "epoch": 0.3069029850746269, + "grad_norm": 1.7820731511284762, + "learning_rate": 9.999855643384686e-06, + "loss": 0.238, + "step": 658 + }, + { + "epoch": 0.30783582089552236, + "grad_norm": 1.6100730701090555, + "learning_rate": 9.999811452861817e-06, + "loss": 0.2183, + "step": 660 + }, + { + "epoch": 0.3087686567164179, + "grad_norm": 1.957239334896792, + "learning_rate": 9.999761370426616e-06, + "loss": 0.2323, + "step": 662 + }, + { + "epoch": 0.30970149253731344, + "grad_norm": 1.7306655964277795, + "learning_rate": 9.9997053961381e-06, + "loss": 0.2407, + "step": 664 + }, + { + "epoch": 0.310634328358209, + "grad_norm": 1.6647283309579866, + "learning_rate": 9.999643530062232e-06, + "loss": 0.2025, + "step": 666 + }, + { + "epoch": 0.31156716417910446, + "grad_norm": 1.6232985705909533, + "learning_rate": 9.999575772271917e-06, + "loss": 0.2206, + "step": 668 + }, + { + "epoch": 0.3125, + "grad_norm": 1.5143381053873757, + "learning_rate": 9.999502122847003e-06, + "loss": 0.211, + "step": 670 + }, + { + "epoch": 0.31343283582089554, + "grad_norm": 1.6470145022132414, + "learning_rate": 9.999422581874277e-06, + "loss": 0.2065, + "step": 672 + }, + { + "epoch": 0.314365671641791, + "grad_norm": 1.6251569640381565, + "learning_rate": 9.999337149447477e-06, + "loss": 0.2226, + "step": 674 + }, + { + "epoch": 0.31529850746268656, + "grad_norm": 1.7086513223853053, + "learning_rate": 9.999245825667275e-06, + "loss": 0.248, + "step": 676 + }, + { + "epoch": 0.3162313432835821, + "grad_norm": 1.9064116916209166, + "learning_rate": 9.999148610641292e-06, + "loss": 0.2331, + "step": 678 + }, + { + "epoch": 0.31716417910447764, + "grad_norm": 1.5271764364621954, + "learning_rate": 9.999045504484089e-06, + "loss": 0.1978, + "step": 680 + }, + { + "epoch": 0.3180970149253731, + "grad_norm": 1.5692069998589113, + "learning_rate": 9.998936507317165e-06, + "loss": 0.2265, + "step": 682 + }, + { + "epoch": 0.31902985074626866, + "grad_norm": 1.8154077936443869, + "learning_rate": 9.99882161926897e-06, + "loss": 0.2045, + "step": 684 + }, + { + "epoch": 0.3199626865671642, + "grad_norm": 1.5865182527224377, + "learning_rate": 9.99870084047489e-06, + "loss": 0.2152, + "step": 686 + }, + { + "epoch": 0.3208955223880597, + "grad_norm": 1.567653226883952, + "learning_rate": 9.998574171077252e-06, + "loss": 0.2195, + "step": 688 + }, + { + "epoch": 0.3218283582089552, + "grad_norm": 1.4771559447707423, + "learning_rate": 9.998441611225329e-06, + "loss": 0.1989, + "step": 690 + }, + { + "epoch": 0.32276119402985076, + "grad_norm": 1.497709323554442, + "learning_rate": 9.998303161075331e-06, + "loss": 0.2054, + "step": 692 + }, + { + "epoch": 0.32369402985074625, + "grad_norm": 1.691709816960022, + "learning_rate": 9.998158820790412e-06, + "loss": 0.2233, + "step": 694 + }, + { + "epoch": 0.3246268656716418, + "grad_norm": 1.7240579630780002, + "learning_rate": 9.99800859054067e-06, + "loss": 0.2271, + "step": 696 + }, + { + "epoch": 0.3255597014925373, + "grad_norm": 1.684673934833607, + "learning_rate": 9.997852470503133e-06, + "loss": 0.2319, + "step": 698 + }, + { + "epoch": 0.32649253731343286, + "grad_norm": 1.4252784834985877, + "learning_rate": 9.997690460861782e-06, + "loss": 0.1736, + "step": 700 + }, + { + "epoch": 0.32742537313432835, + "grad_norm": 1.579021485076662, + "learning_rate": 9.997522561807534e-06, + "loss": 0.2037, + "step": 702 + }, + { + "epoch": 0.3283582089552239, + "grad_norm": 1.7710895889269371, + "learning_rate": 9.997348773538244e-06, + "loss": 0.2196, + "step": 704 + }, + { + "epoch": 0.3292910447761194, + "grad_norm": 1.6752663084766275, + "learning_rate": 9.99716909625871e-06, + "loss": 0.2053, + "step": 706 + }, + { + "epoch": 0.3302238805970149, + "grad_norm": 1.7811532579224665, + "learning_rate": 9.996983530180669e-06, + "loss": 0.2034, + "step": 708 + }, + { + "epoch": 0.33115671641791045, + "grad_norm": 1.6934817244622433, + "learning_rate": 9.996792075522795e-06, + "loss": 0.2164, + "step": 710 + }, + { + "epoch": 0.332089552238806, + "grad_norm": 1.5305419790452965, + "learning_rate": 9.996594732510703e-06, + "loss": 0.2123, + "step": 712 + }, + { + "epoch": 0.33302238805970147, + "grad_norm": 1.6529334897646666, + "learning_rate": 9.996391501376948e-06, + "loss": 0.2222, + "step": 714 + }, + { + "epoch": 0.333955223880597, + "grad_norm": 1.4988526085463474, + "learning_rate": 9.996182382361027e-06, + "loss": 0.2234, + "step": 716 + }, + { + "epoch": 0.33488805970149255, + "grad_norm": 1.6813897787726346, + "learning_rate": 9.995967375709365e-06, + "loss": 0.2125, + "step": 718 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 1.5640967738166371, + "learning_rate": 9.995746481675335e-06, + "loss": 0.204, + "step": 720 + }, + { + "epoch": 0.33675373134328357, + "grad_norm": 1.546681241407843, + "learning_rate": 9.995519700519246e-06, + "loss": 0.2003, + "step": 722 + }, + { + "epoch": 0.3376865671641791, + "grad_norm": 1.546258984412468, + "learning_rate": 9.995287032508339e-06, + "loss": 0.1977, + "step": 724 + }, + { + "epoch": 0.33861940298507465, + "grad_norm": 1.4010002919438977, + "learning_rate": 9.9950484779168e-06, + "loss": 0.209, + "step": 726 + }, + { + "epoch": 0.33955223880597013, + "grad_norm": 1.4525068288568763, + "learning_rate": 9.994804037025745e-06, + "loss": 0.2078, + "step": 728 + }, + { + "epoch": 0.34048507462686567, + "grad_norm": 1.7068819400074136, + "learning_rate": 9.994553710123233e-06, + "loss": 0.2198, + "step": 730 + }, + { + "epoch": 0.3414179104477612, + "grad_norm": 1.4631164118298687, + "learning_rate": 9.994297497504253e-06, + "loss": 0.2072, + "step": 732 + }, + { + "epoch": 0.3423507462686567, + "grad_norm": 1.4731855366350495, + "learning_rate": 9.994035399470733e-06, + "loss": 0.2012, + "step": 734 + }, + { + "epoch": 0.34328358208955223, + "grad_norm": 1.527380421003424, + "learning_rate": 9.993767416331541e-06, + "loss": 0.2089, + "step": 736 + }, + { + "epoch": 0.34421641791044777, + "grad_norm": 1.5502413727467526, + "learning_rate": 9.99349354840247e-06, + "loss": 0.1886, + "step": 738 + }, + { + "epoch": 0.3451492537313433, + "grad_norm": 1.6735399174550274, + "learning_rate": 9.993213796006256e-06, + "loss": 0.2135, + "step": 740 + }, + { + "epoch": 0.3460820895522388, + "grad_norm": 1.5931793229897702, + "learning_rate": 9.992928159472565e-06, + "loss": 0.2203, + "step": 742 + }, + { + "epoch": 0.34701492537313433, + "grad_norm": 1.6171684493427314, + "learning_rate": 9.992636639138e-06, + "loss": 0.2178, + "step": 744 + }, + { + "epoch": 0.34794776119402987, + "grad_norm": 1.5619062519995661, + "learning_rate": 9.992339235346096e-06, + "loss": 0.1969, + "step": 746 + }, + { + "epoch": 0.34888059701492535, + "grad_norm": 1.4643409964944438, + "learning_rate": 9.992035948447322e-06, + "loss": 0.218, + "step": 748 + }, + { + "epoch": 0.3498134328358209, + "grad_norm": 1.6713694751278418, + "learning_rate": 9.99172677879908e-06, + "loss": 0.2345, + "step": 750 + }, + { + "epoch": 0.35074626865671643, + "grad_norm": 1.6295968073285287, + "learning_rate": 9.991411726765704e-06, + "loss": 0.2313, + "step": 752 + }, + { + "epoch": 0.3516791044776119, + "grad_norm": 1.492206227603743, + "learning_rate": 9.991090792718458e-06, + "loss": 0.2029, + "step": 754 + }, + { + "epoch": 0.35261194029850745, + "grad_norm": 1.4920372023115085, + "learning_rate": 9.99076397703554e-06, + "loss": 0.2076, + "step": 756 + }, + { + "epoch": 0.353544776119403, + "grad_norm": 1.6851720821939231, + "learning_rate": 9.99043128010208e-06, + "loss": 0.2241, + "step": 758 + }, + { + "epoch": 0.35447761194029853, + "grad_norm": 1.4685009516373944, + "learning_rate": 9.990092702310134e-06, + "loss": 0.195, + "step": 760 + }, + { + "epoch": 0.355410447761194, + "grad_norm": 1.4794010690122867, + "learning_rate": 9.989748244058695e-06, + "loss": 0.2171, + "step": 762 + }, + { + "epoch": 0.35634328358208955, + "grad_norm": 1.4072133779038727, + "learning_rate": 9.989397905753677e-06, + "loss": 0.2068, + "step": 764 + }, + { + "epoch": 0.3572761194029851, + "grad_norm": 1.5090128136103944, + "learning_rate": 9.989041687807934e-06, + "loss": 0.2259, + "step": 766 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 1.6473468177048791, + "learning_rate": 9.988679590641237e-06, + "loss": 0.2362, + "step": 768 + }, + { + "epoch": 0.3591417910447761, + "grad_norm": 1.5371299816829591, + "learning_rate": 9.988311614680294e-06, + "loss": 0.2094, + "step": 770 + }, + { + "epoch": 0.36007462686567165, + "grad_norm": 1.5972460189685906, + "learning_rate": 9.987937760358738e-06, + "loss": 0.227, + "step": 772 + }, + { + "epoch": 0.36100746268656714, + "grad_norm": 1.5854668624944792, + "learning_rate": 9.987558028117129e-06, + "loss": 0.2291, + "step": 774 + }, + { + "epoch": 0.3619402985074627, + "grad_norm": 1.495419359749338, + "learning_rate": 9.987172418402953e-06, + "loss": 0.2107, + "step": 776 + }, + { + "epoch": 0.3628731343283582, + "grad_norm": 1.5503509875180073, + "learning_rate": 9.986780931670622e-06, + "loss": 0.218, + "step": 778 + }, + { + "epoch": 0.36380597014925375, + "grad_norm": 1.55747472502651, + "learning_rate": 9.986383568381478e-06, + "loss": 0.209, + "step": 780 + }, + { + "epoch": 0.36473880597014924, + "grad_norm": 1.7543414208220027, + "learning_rate": 9.98598032900378e-06, + "loss": 0.2129, + "step": 782 + }, + { + "epoch": 0.3656716417910448, + "grad_norm": 1.4477289658010006, + "learning_rate": 9.985571214012717e-06, + "loss": 0.1952, + "step": 784 + }, + { + "epoch": 0.3666044776119403, + "grad_norm": 1.6518496996444825, + "learning_rate": 9.985156223890405e-06, + "loss": 0.2185, + "step": 786 + }, + { + "epoch": 0.3675373134328358, + "grad_norm": 1.4856177884308097, + "learning_rate": 9.984735359125875e-06, + "loss": 0.2128, + "step": 788 + }, + { + "epoch": 0.36847014925373134, + "grad_norm": 1.3852777645542962, + "learning_rate": 9.984308620215087e-06, + "loss": 0.2061, + "step": 790 + }, + { + "epoch": 0.3694029850746269, + "grad_norm": 1.6573797639788397, + "learning_rate": 9.983876007660924e-06, + "loss": 0.2166, + "step": 792 + }, + { + "epoch": 0.37033582089552236, + "grad_norm": 1.5564426202844768, + "learning_rate": 9.983437521973184e-06, + "loss": 0.2092, + "step": 794 + }, + { + "epoch": 0.3712686567164179, + "grad_norm": 1.686966771664201, + "learning_rate": 9.982993163668593e-06, + "loss": 0.2267, + "step": 796 + }, + { + "epoch": 0.37220149253731344, + "grad_norm": 1.4671061197216366, + "learning_rate": 9.982542933270794e-06, + "loss": 0.2052, + "step": 798 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 1.5197313893299038, + "learning_rate": 9.982086831310351e-06, + "loss": 0.2247, + "step": 800 + }, + { + "epoch": 0.37406716417910446, + "grad_norm": 1.5059619567477527, + "learning_rate": 9.981624858324747e-06, + "loss": 0.2023, + "step": 802 + }, + { + "epoch": 0.375, + "grad_norm": 1.373382755780966, + "learning_rate": 9.981157014858384e-06, + "loss": 0.2109, + "step": 804 + }, + { + "epoch": 0.37593283582089554, + "grad_norm": 1.5497484919375448, + "learning_rate": 9.98068330146258e-06, + "loss": 0.2157, + "step": 806 + }, + { + "epoch": 0.376865671641791, + "grad_norm": 1.5397780167593171, + "learning_rate": 9.98020371869557e-06, + "loss": 0.24, + "step": 808 + }, + { + "epoch": 0.37779850746268656, + "grad_norm": 1.669189774635985, + "learning_rate": 9.97971826712251e-06, + "loss": 0.2128, + "step": 810 + }, + { + "epoch": 0.3787313432835821, + "grad_norm": 1.4964614682006767, + "learning_rate": 9.97922694731547e-06, + "loss": 0.2023, + "step": 812 + }, + { + "epoch": 0.37966417910447764, + "grad_norm": 1.4398142596198227, + "learning_rate": 9.978729759853432e-06, + "loss": 0.2063, + "step": 814 + }, + { + "epoch": 0.3805970149253731, + "grad_norm": 1.3979734121897829, + "learning_rate": 9.978226705322295e-06, + "loss": 0.1968, + "step": 816 + }, + { + "epoch": 0.38152985074626866, + "grad_norm": 1.491927436401735, + "learning_rate": 9.977717784314875e-06, + "loss": 0.2059, + "step": 818 + }, + { + "epoch": 0.3824626865671642, + "grad_norm": 1.4357118787242178, + "learning_rate": 9.977202997430895e-06, + "loss": 0.2071, + "step": 820 + }, + { + "epoch": 0.3833955223880597, + "grad_norm": 1.4119862490675763, + "learning_rate": 9.976682345276995e-06, + "loss": 0.2048, + "step": 822 + }, + { + "epoch": 0.3843283582089552, + "grad_norm": 1.4669720696498303, + "learning_rate": 9.976155828466725e-06, + "loss": 0.1987, + "step": 824 + }, + { + "epoch": 0.38526119402985076, + "grad_norm": 1.2344430086775962, + "learning_rate": 9.975623447620549e-06, + "loss": 0.1924, + "step": 826 + }, + { + "epoch": 0.38619402985074625, + "grad_norm": 1.5652570148884641, + "learning_rate": 9.975085203365834e-06, + "loss": 0.1881, + "step": 828 + }, + { + "epoch": 0.3871268656716418, + "grad_norm": 1.4754813636164474, + "learning_rate": 9.974541096336865e-06, + "loss": 0.2192, + "step": 830 + }, + { + "epoch": 0.3880597014925373, + "grad_norm": 1.626829373370367, + "learning_rate": 9.973991127174833e-06, + "loss": 0.2366, + "step": 832 + }, + { + "epoch": 0.38899253731343286, + "grad_norm": 1.4490460319497058, + "learning_rate": 9.973435296527835e-06, + "loss": 0.1859, + "step": 834 + }, + { + "epoch": 0.38992537313432835, + "grad_norm": 1.403520038653334, + "learning_rate": 9.972873605050878e-06, + "loss": 0.2007, + "step": 836 + }, + { + "epoch": 0.3908582089552239, + "grad_norm": 1.5661614246246318, + "learning_rate": 9.97230605340587e-06, + "loss": 0.2246, + "step": 838 + }, + { + "epoch": 0.3917910447761194, + "grad_norm": 1.7033190578367239, + "learning_rate": 9.971732642261635e-06, + "loss": 0.2309, + "step": 840 + }, + { + "epoch": 0.3927238805970149, + "grad_norm": 1.5226095161639421, + "learning_rate": 9.971153372293893e-06, + "loss": 0.2224, + "step": 842 + }, + { + "epoch": 0.39365671641791045, + "grad_norm": 1.5398485144304783, + "learning_rate": 9.970568244185272e-06, + "loss": 0.2314, + "step": 844 + }, + { + "epoch": 0.394589552238806, + "grad_norm": 1.4511269870109804, + "learning_rate": 9.969977258625303e-06, + "loss": 0.2171, + "step": 846 + }, + { + "epoch": 0.39552238805970147, + "grad_norm": 1.5194540698178827, + "learning_rate": 9.969380416310417e-06, + "loss": 0.2172, + "step": 848 + }, + { + "epoch": 0.396455223880597, + "grad_norm": 1.364259972259269, + "learning_rate": 9.968777717943954e-06, + "loss": 0.1977, + "step": 850 + }, + { + "epoch": 0.39738805970149255, + "grad_norm": 1.4228579215618578, + "learning_rate": 9.968169164236145e-06, + "loss": 0.2053, + "step": 852 + }, + { + "epoch": 0.3983208955223881, + "grad_norm": 1.6712416458354218, + "learning_rate": 9.967554755904127e-06, + "loss": 0.2176, + "step": 854 + }, + { + "epoch": 0.39925373134328357, + "grad_norm": 1.5038386647715758, + "learning_rate": 9.966934493671938e-06, + "loss": 0.2133, + "step": 856 + }, + { + "epoch": 0.4001865671641791, + "grad_norm": 1.3632283565658245, + "learning_rate": 9.966308378270511e-06, + "loss": 0.1994, + "step": 858 + }, + { + "epoch": 0.40111940298507465, + "grad_norm": 1.5274204894929744, + "learning_rate": 9.965676410437675e-06, + "loss": 0.2362, + "step": 860 + }, + { + "epoch": 0.40205223880597013, + "grad_norm": 1.37459075908075, + "learning_rate": 9.965038590918157e-06, + "loss": 0.2002, + "step": 862 + }, + { + "epoch": 0.40298507462686567, + "grad_norm": 1.6354562797409415, + "learning_rate": 9.964394920463586e-06, + "loss": 0.2458, + "step": 864 + }, + { + "epoch": 0.4039179104477612, + "grad_norm": 1.374731418579664, + "learning_rate": 9.963745399832476e-06, + "loss": 0.2146, + "step": 866 + }, + { + "epoch": 0.4048507462686567, + "grad_norm": 1.464592418209558, + "learning_rate": 9.96309002979024e-06, + "loss": 0.1978, + "step": 868 + }, + { + "epoch": 0.40578358208955223, + "grad_norm": 1.5533291604380253, + "learning_rate": 9.962428811109187e-06, + "loss": 0.2219, + "step": 870 + }, + { + "epoch": 0.40671641791044777, + "grad_norm": 1.3856686275767705, + "learning_rate": 9.961761744568512e-06, + "loss": 0.2053, + "step": 872 + }, + { + "epoch": 0.4076492537313433, + "grad_norm": 1.4182301324149096, + "learning_rate": 9.961088830954304e-06, + "loss": 0.2218, + "step": 874 + }, + { + "epoch": 0.4085820895522388, + "grad_norm": 1.5276935803141443, + "learning_rate": 9.960410071059543e-06, + "loss": 0.2226, + "step": 876 + }, + { + "epoch": 0.40951492537313433, + "grad_norm": 2.0528206448734267, + "learning_rate": 9.959725465684099e-06, + "loss": 0.2063, + "step": 878 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 1.4855180601859719, + "learning_rate": 9.959035015634727e-06, + "loss": 0.1862, + "step": 880 + }, + { + "epoch": 0.41138059701492535, + "grad_norm": 1.580306023027329, + "learning_rate": 9.958338721725075e-06, + "loss": 0.2037, + "step": 882 + }, + { + "epoch": 0.4123134328358209, + "grad_norm": 1.4439857201693262, + "learning_rate": 9.957636584775671e-06, + "loss": 0.2075, + "step": 884 + }, + { + "epoch": 0.41324626865671643, + "grad_norm": 1.4820780271477962, + "learning_rate": 9.956928605613935e-06, + "loss": 0.1951, + "step": 886 + }, + { + "epoch": 0.4141791044776119, + "grad_norm": 1.6662032107522697, + "learning_rate": 9.956214785074169e-06, + "loss": 0.2113, + "step": 888 + }, + { + "epoch": 0.41511194029850745, + "grad_norm": 1.6238607804516694, + "learning_rate": 9.955495123997556e-06, + "loss": 0.2051, + "step": 890 + }, + { + "epoch": 0.416044776119403, + "grad_norm": 2.001493600922292, + "learning_rate": 9.954769623232165e-06, + "loss": 0.2034, + "step": 892 + }, + { + "epoch": 0.41697761194029853, + "grad_norm": 1.4987183254960474, + "learning_rate": 9.954038283632945e-06, + "loss": 0.207, + "step": 894 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 2.395033978637096, + "learning_rate": 9.953301106061728e-06, + "loss": 0.1816, + "step": 896 + }, + { + "epoch": 0.41884328358208955, + "grad_norm": 1.3809658665451596, + "learning_rate": 9.952558091387224e-06, + "loss": 0.1989, + "step": 898 + }, + { + "epoch": 0.4197761194029851, + "grad_norm": 1.6058795299888213, + "learning_rate": 9.951809240485017e-06, + "loss": 0.2303, + "step": 900 + }, + { + "epoch": 0.4207089552238806, + "grad_norm": 1.9037897042068004, + "learning_rate": 9.951054554237579e-06, + "loss": 0.222, + "step": 902 + }, + { + "epoch": 0.4216417910447761, + "grad_norm": 1.4635743652473927, + "learning_rate": 9.950294033534247e-06, + "loss": 0.1955, + "step": 904 + }, + { + "epoch": 0.42257462686567165, + "grad_norm": 1.2713707769627034, + "learning_rate": 9.949527679271244e-06, + "loss": 0.1822, + "step": 906 + }, + { + "epoch": 0.42350746268656714, + "grad_norm": 1.5323208784487459, + "learning_rate": 9.948755492351659e-06, + "loss": 0.2172, + "step": 908 + }, + { + "epoch": 0.4244402985074627, + "grad_norm": 1.3384965127252164, + "learning_rate": 9.94797747368546e-06, + "loss": 0.2022, + "step": 910 + }, + { + "epoch": 0.4253731343283582, + "grad_norm": 1.5291696362819658, + "learning_rate": 9.947193624189485e-06, + "loss": 0.2258, + "step": 912 + }, + { + "epoch": 0.42630597014925375, + "grad_norm": 1.4982152008948342, + "learning_rate": 9.946403944787441e-06, + "loss": 0.2147, + "step": 914 + }, + { + "epoch": 0.42723880597014924, + "grad_norm": 1.4500793974667148, + "learning_rate": 9.945608436409913e-06, + "loss": 0.2051, + "step": 916 + }, + { + "epoch": 0.4281716417910448, + "grad_norm": 1.3079337678373602, + "learning_rate": 9.944807099994343e-06, + "loss": 0.2033, + "step": 918 + }, + { + "epoch": 0.4291044776119403, + "grad_norm": 1.3634353746403716, + "learning_rate": 9.94399993648505e-06, + "loss": 0.1973, + "step": 920 + }, + { + "epoch": 0.4300373134328358, + "grad_norm": 1.3547395674282443, + "learning_rate": 9.943186946833217e-06, + "loss": 0.2012, + "step": 922 + }, + { + "epoch": 0.43097014925373134, + "grad_norm": 1.5908366403478449, + "learning_rate": 9.942368131996892e-06, + "loss": 0.2197, + "step": 924 + }, + { + "epoch": 0.4319029850746269, + "grad_norm": 1.7410389177979753, + "learning_rate": 9.94154349294099e-06, + "loss": 0.2272, + "step": 926 + }, + { + "epoch": 0.43283582089552236, + "grad_norm": 1.4211703252136978, + "learning_rate": 9.94071303063729e-06, + "loss": 0.2053, + "step": 928 + }, + { + "epoch": 0.4337686567164179, + "grad_norm": 1.393805940595914, + "learning_rate": 9.939876746064427e-06, + "loss": 0.1996, + "step": 930 + }, + { + "epoch": 0.43470149253731344, + "grad_norm": 1.4740985194701417, + "learning_rate": 9.939034640207902e-06, + "loss": 0.2005, + "step": 932 + }, + { + "epoch": 0.435634328358209, + "grad_norm": 1.257114759796336, + "learning_rate": 9.938186714060077e-06, + "loss": 0.1777, + "step": 934 + }, + { + "epoch": 0.43656716417910446, + "grad_norm": 1.620236651876971, + "learning_rate": 9.937332968620168e-06, + "loss": 0.203, + "step": 936 + }, + { + "epoch": 0.4375, + "grad_norm": 1.6029399275619836, + "learning_rate": 9.936473404894256e-06, + "loss": 0.2099, + "step": 938 + }, + { + "epoch": 0.43843283582089554, + "grad_norm": 1.499678838053157, + "learning_rate": 9.935608023895269e-06, + "loss": 0.21, + "step": 940 + }, + { + "epoch": 0.439365671641791, + "grad_norm": 1.4916831266989266, + "learning_rate": 9.934736826643e-06, + "loss": 0.2083, + "step": 942 + }, + { + "epoch": 0.44029850746268656, + "grad_norm": 1.3116363666177426, + "learning_rate": 9.933859814164089e-06, + "loss": 0.2012, + "step": 944 + }, + { + "epoch": 0.4412313432835821, + "grad_norm": 1.5184645183407464, + "learning_rate": 9.932976987492029e-06, + "loss": 0.2019, + "step": 946 + }, + { + "epoch": 0.44216417910447764, + "grad_norm": 1.4435172238370206, + "learning_rate": 9.93208834766717e-06, + "loss": 0.2126, + "step": 948 + }, + { + "epoch": 0.4430970149253731, + "grad_norm": 1.365991785854986, + "learning_rate": 9.93119389573671e-06, + "loss": 0.2066, + "step": 950 + }, + { + "epoch": 0.44402985074626866, + "grad_norm": 1.337192667444216, + "learning_rate": 9.93029363275469e-06, + "loss": 0.2159, + "step": 952 + }, + { + "epoch": 0.4449626865671642, + "grad_norm": 1.593228511395827, + "learning_rate": 9.92938755978201e-06, + "loss": 0.2147, + "step": 954 + }, + { + "epoch": 0.4458955223880597, + "grad_norm": 1.312002367778796, + "learning_rate": 9.928475677886407e-06, + "loss": 0.1987, + "step": 956 + }, + { + "epoch": 0.4468283582089552, + "grad_norm": 1.1959083930426961, + "learning_rate": 9.927557988142467e-06, + "loss": 0.193, + "step": 958 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 1.266448270123119, + "learning_rate": 9.926634491631623e-06, + "loss": 0.1809, + "step": 960 + }, + { + "epoch": 0.44869402985074625, + "grad_norm": 1.4053237076936411, + "learning_rate": 9.925705189442145e-06, + "loss": 0.194, + "step": 962 + }, + { + "epoch": 0.4496268656716418, + "grad_norm": 1.3191029750043703, + "learning_rate": 9.92477008266915e-06, + "loss": 0.1801, + "step": 964 + }, + { + "epoch": 0.4505597014925373, + "grad_norm": 1.4297129242923046, + "learning_rate": 9.923829172414594e-06, + "loss": 0.1937, + "step": 966 + }, + { + "epoch": 0.45149253731343286, + "grad_norm": 1.1919091109463966, + "learning_rate": 9.922882459787268e-06, + "loss": 0.1878, + "step": 968 + }, + { + "epoch": 0.45242537313432835, + "grad_norm": 1.4273355091449529, + "learning_rate": 9.921929945902805e-06, + "loss": 0.1963, + "step": 970 + }, + { + "epoch": 0.4533582089552239, + "grad_norm": 1.420964861407647, + "learning_rate": 9.920971631883673e-06, + "loss": 0.1831, + "step": 972 + }, + { + "epoch": 0.4542910447761194, + "grad_norm": 1.624875105719104, + "learning_rate": 9.920007518859175e-06, + "loss": 0.2394, + "step": 974 + }, + { + "epoch": 0.4552238805970149, + "grad_norm": 1.4956135828361246, + "learning_rate": 9.919037607965447e-06, + "loss": 0.2032, + "step": 976 + }, + { + "epoch": 0.45615671641791045, + "grad_norm": 1.3943613205695262, + "learning_rate": 9.91806190034546e-06, + "loss": 0.2167, + "step": 978 + }, + { + "epoch": 0.457089552238806, + "grad_norm": 1.3099283306452394, + "learning_rate": 9.917080397149013e-06, + "loss": 0.1824, + "step": 980 + }, + { + "epoch": 0.45802238805970147, + "grad_norm": 1.3819954082344559, + "learning_rate": 9.916093099532733e-06, + "loss": 0.1802, + "step": 982 + }, + { + "epoch": 0.458955223880597, + "grad_norm": 1.5179159283297041, + "learning_rate": 9.915100008660083e-06, + "loss": 0.235, + "step": 984 + }, + { + "epoch": 0.45988805970149255, + "grad_norm": 1.5207608234482923, + "learning_rate": 9.914101125701346e-06, + "loss": 0.2228, + "step": 986 + }, + { + "epoch": 0.4608208955223881, + "grad_norm": 1.2521446332959851, + "learning_rate": 9.913096451833631e-06, + "loss": 0.1893, + "step": 988 + }, + { + "epoch": 0.46175373134328357, + "grad_norm": 1.3787201715183737, + "learning_rate": 9.912085988240873e-06, + "loss": 0.1915, + "step": 990 + }, + { + "epoch": 0.4626865671641791, + "grad_norm": 1.3829452250332792, + "learning_rate": 9.911069736113831e-06, + "loss": 0.1922, + "step": 992 + }, + { + "epoch": 0.46361940298507465, + "grad_norm": 1.4766445985578276, + "learning_rate": 9.910047696650086e-06, + "loss": 0.2006, + "step": 994 + }, + { + "epoch": 0.46455223880597013, + "grad_norm": 1.4891343123368928, + "learning_rate": 9.909019871054032e-06, + "loss": 0.2116, + "step": 996 + }, + { + "epoch": 0.46548507462686567, + "grad_norm": 2.37188044041739, + "learning_rate": 9.907986260536888e-06, + "loss": 0.2138, + "step": 998 + }, + { + "epoch": 0.4664179104477612, + "grad_norm": 1.3287044631705238, + "learning_rate": 9.906946866316688e-06, + "loss": 0.2163, + "step": 1000 + }, + { + "epoch": 0.4664179104477612, + "eval_loss": 0.18396539986133575, + "eval_runtime": 321.6955, + "eval_samples_per_second": 47.393, + "eval_steps_per_second": 5.925, + "step": 1000 + }, + { + "epoch": 0.4673507462686567, + "grad_norm": 1.57878750964256, + "learning_rate": 9.905901689618287e-06, + "loss": 0.2321, + "step": 1002 + }, + { + "epoch": 0.46828358208955223, + "grad_norm": 1.3222528960853128, + "learning_rate": 9.904850731673342e-06, + "loss": 0.1904, + "step": 1004 + }, + { + "epoch": 0.46921641791044777, + "grad_norm": 1.3134015236511882, + "learning_rate": 9.903793993720333e-06, + "loss": 0.2007, + "step": 1006 + }, + { + "epoch": 0.4701492537313433, + "grad_norm": 1.4962402402404267, + "learning_rate": 9.902731477004552e-06, + "loss": 0.2105, + "step": 1008 + }, + { + "epoch": 0.4710820895522388, + "grad_norm": 1.4701508149160272, + "learning_rate": 9.901663182778091e-06, + "loss": 0.2213, + "step": 1010 + }, + { + "epoch": 0.47201492537313433, + "grad_norm": 1.5957657316140008, + "learning_rate": 9.900589112299862e-06, + "loss": 0.1872, + "step": 1012 + }, + { + "epoch": 0.47294776119402987, + "grad_norm": 1.4250689017342073, + "learning_rate": 9.899509266835575e-06, + "loss": 0.2115, + "step": 1014 + }, + { + "epoch": 0.47388059701492535, + "grad_norm": 1.5295044551866799, + "learning_rate": 9.89842364765775e-06, + "loss": 0.2004, + "step": 1016 + }, + { + "epoch": 0.4748134328358209, + "grad_norm": 1.375386556379696, + "learning_rate": 9.897332256045712e-06, + "loss": 0.2007, + "step": 1018 + }, + { + "epoch": 0.47574626865671643, + "grad_norm": 1.9168976412029832, + "learning_rate": 9.896235093285583e-06, + "loss": 0.1938, + "step": 1020 + }, + { + "epoch": 0.4766791044776119, + "grad_norm": 1.3912354039091317, + "learning_rate": 9.89513216067029e-06, + "loss": 0.2267, + "step": 1022 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 1.2609253736383512, + "learning_rate": 9.894023459499562e-06, + "loss": 0.1931, + "step": 1024 + }, + { + "epoch": 0.478544776119403, + "grad_norm": 1.4088269195196246, + "learning_rate": 9.892908991079917e-06, + "loss": 0.2268, + "step": 1026 + }, + { + "epoch": 0.47947761194029853, + "grad_norm": 1.6400729111214873, + "learning_rate": 9.891788756724676e-06, + "loss": 0.203, + "step": 1028 + }, + { + "epoch": 0.480410447761194, + "grad_norm": 1.3105879597650345, + "learning_rate": 9.890662757753955e-06, + "loss": 0.187, + "step": 1030 + }, + { + "epoch": 0.48134328358208955, + "grad_norm": 1.3151903558889866, + "learning_rate": 9.889530995494661e-06, + "loss": 0.1913, + "step": 1032 + }, + { + "epoch": 0.4822761194029851, + "grad_norm": 1.3362909932257605, + "learning_rate": 9.888393471280493e-06, + "loss": 0.2136, + "step": 1034 + }, + { + "epoch": 0.4832089552238806, + "grad_norm": 1.4057266951433458, + "learning_rate": 9.88725018645194e-06, + "loss": 0.1947, + "step": 1036 + }, + { + "epoch": 0.4841417910447761, + "grad_norm": 1.3426572746891023, + "learning_rate": 9.886101142356278e-06, + "loss": 0.211, + "step": 1038 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 1.3769734591394878, + "learning_rate": 9.884946340347574e-06, + "loss": 0.1998, + "step": 1040 + }, + { + "epoch": 0.48600746268656714, + "grad_norm": 1.321678508254731, + "learning_rate": 9.883785781786676e-06, + "loss": 0.2082, + "step": 1042 + }, + { + "epoch": 0.4869402985074627, + "grad_norm": 1.378118711944151, + "learning_rate": 9.882619468041219e-06, + "loss": 0.1964, + "step": 1044 + }, + { + "epoch": 0.4878731343283582, + "grad_norm": 1.3887007023835891, + "learning_rate": 9.881447400485617e-06, + "loss": 0.2226, + "step": 1046 + }, + { + "epoch": 0.48880597014925375, + "grad_norm": 1.3205681224821462, + "learning_rate": 9.880269580501067e-06, + "loss": 0.2064, + "step": 1048 + }, + { + "epoch": 0.48973880597014924, + "grad_norm": 1.3299044190591582, + "learning_rate": 9.879086009475544e-06, + "loss": 0.1847, + "step": 1050 + }, + { + "epoch": 0.4906716417910448, + "grad_norm": 1.4086202941072181, + "learning_rate": 9.8778966888038e-06, + "loss": 0.207, + "step": 1052 + }, + { + "epoch": 0.4916044776119403, + "grad_norm": 1.359722655692275, + "learning_rate": 9.876701619887358e-06, + "loss": 0.2195, + "step": 1054 + }, + { + "epoch": 0.4925373134328358, + "grad_norm": 1.521119863925835, + "learning_rate": 9.875500804134525e-06, + "loss": 0.193, + "step": 1056 + }, + { + "epoch": 0.49347014925373134, + "grad_norm": 1.3767230351467374, + "learning_rate": 9.874294242960374e-06, + "loss": 0.2268, + "step": 1058 + }, + { + "epoch": 0.4944029850746269, + "grad_norm": 1.3086201683104317, + "learning_rate": 9.873081937786746e-06, + "loss": 0.1827, + "step": 1060 + }, + { + "epoch": 0.49533582089552236, + "grad_norm": 1.6868000799300134, + "learning_rate": 9.871863890042256e-06, + "loss": 0.2084, + "step": 1062 + }, + { + "epoch": 0.4962686567164179, + "grad_norm": 1.4517644975136033, + "learning_rate": 9.870640101162286e-06, + "loss": 0.1959, + "step": 1064 + }, + { + "epoch": 0.49720149253731344, + "grad_norm": 1.2860344337333347, + "learning_rate": 9.869410572588978e-06, + "loss": 0.2003, + "step": 1066 + }, + { + "epoch": 0.498134328358209, + "grad_norm": 1.4260945511661838, + "learning_rate": 9.868175305771243e-06, + "loss": 0.2079, + "step": 1068 + }, + { + "epoch": 0.49906716417910446, + "grad_norm": 1.487363730524502, + "learning_rate": 9.866934302164755e-06, + "loss": 0.218, + "step": 1070 + }, + { + "epoch": 0.5, + "grad_norm": 1.492521015928695, + "learning_rate": 9.865687563231943e-06, + "loss": 0.2125, + "step": 1072 + }, + { + "epoch": 0.5009328358208955, + "grad_norm": 1.402308667183317, + "learning_rate": 9.864435090442e-06, + "loss": 0.2089, + "step": 1074 + }, + { + "epoch": 0.5018656716417911, + "grad_norm": 1.1955444297697637, + "learning_rate": 9.86317688527087e-06, + "loss": 0.1933, + "step": 1076 + }, + { + "epoch": 0.5027985074626866, + "grad_norm": 1.289349819050055, + "learning_rate": 9.86191294920126e-06, + "loss": 0.1968, + "step": 1078 + }, + { + "epoch": 0.503731343283582, + "grad_norm": 1.2885086835168242, + "learning_rate": 9.860643283722625e-06, + "loss": 0.1663, + "step": 1080 + }, + { + "epoch": 0.5046641791044776, + "grad_norm": 1.4330258463642025, + "learning_rate": 9.859367890331173e-06, + "loss": 0.2076, + "step": 1082 + }, + { + "epoch": 0.5055970149253731, + "grad_norm": 1.2719656782807578, + "learning_rate": 9.85808677052986e-06, + "loss": 0.1955, + "step": 1084 + }, + { + "epoch": 0.5065298507462687, + "grad_norm": 1.4516382311318605, + "learning_rate": 9.856799925828393e-06, + "loss": 0.2156, + "step": 1086 + }, + { + "epoch": 0.5074626865671642, + "grad_norm": 1.3257407906976475, + "learning_rate": 9.855507357743225e-06, + "loss": 0.1964, + "step": 1088 + }, + { + "epoch": 0.5083955223880597, + "grad_norm": 1.5135934349749167, + "learning_rate": 9.854209067797553e-06, + "loss": 0.202, + "step": 1090 + }, + { + "epoch": 0.5093283582089553, + "grad_norm": 1.3287017566366777, + "learning_rate": 9.852905057521317e-06, + "loss": 0.1931, + "step": 1092 + }, + { + "epoch": 0.5102611940298507, + "grad_norm": 1.3814961249786555, + "learning_rate": 9.851595328451198e-06, + "loss": 0.2107, + "step": 1094 + }, + { + "epoch": 0.5111940298507462, + "grad_norm": 1.3474074530170514, + "learning_rate": 9.850279882130613e-06, + "loss": 0.2107, + "step": 1096 + }, + { + "epoch": 0.5121268656716418, + "grad_norm": 1.3419692034630626, + "learning_rate": 9.848958720109724e-06, + "loss": 0.2067, + "step": 1098 + }, + { + "epoch": 0.5130597014925373, + "grad_norm": 1.2939110597186954, + "learning_rate": 9.847631843945421e-06, + "loss": 0.1906, + "step": 1100 + }, + { + "epoch": 0.5139925373134329, + "grad_norm": 1.3557742630802896, + "learning_rate": 9.846299255201332e-06, + "loss": 0.199, + "step": 1102 + }, + { + "epoch": 0.5149253731343284, + "grad_norm": 1.3747405046733814, + "learning_rate": 9.844960955447813e-06, + "loss": 0.2199, + "step": 1104 + }, + { + "epoch": 0.5158582089552238, + "grad_norm": 1.4500635932391988, + "learning_rate": 9.843616946261956e-06, + "loss": 0.2052, + "step": 1106 + }, + { + "epoch": 0.5167910447761194, + "grad_norm": 1.4121169414478871, + "learning_rate": 9.842267229227573e-06, + "loss": 0.1923, + "step": 1108 + }, + { + "epoch": 0.5177238805970149, + "grad_norm": 1.3440269181422098, + "learning_rate": 9.840911805935211e-06, + "loss": 0.1702, + "step": 1110 + }, + { + "epoch": 0.5186567164179104, + "grad_norm": 1.403115977716643, + "learning_rate": 9.839550677982137e-06, + "loss": 0.1992, + "step": 1112 + }, + { + "epoch": 0.519589552238806, + "grad_norm": 1.0990333537752326, + "learning_rate": 9.838183846972337e-06, + "loss": 0.1808, + "step": 1114 + }, + { + "epoch": 0.5205223880597015, + "grad_norm": 1.3679893571279134, + "learning_rate": 9.836811314516526e-06, + "loss": 0.1985, + "step": 1116 + }, + { + "epoch": 0.5214552238805971, + "grad_norm": 1.29005411066547, + "learning_rate": 9.83543308223213e-06, + "loss": 0.1895, + "step": 1118 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 1.3141946336352162, + "learning_rate": 9.834049151743296e-06, + "loss": 0.1911, + "step": 1120 + }, + { + "epoch": 0.523320895522388, + "grad_norm": 1.3472585769171632, + "learning_rate": 9.832659524680886e-06, + "loss": 0.1829, + "step": 1122 + }, + { + "epoch": 0.5242537313432836, + "grad_norm": 1.4251642211467288, + "learning_rate": 9.831264202682474e-06, + "loss": 0.2276, + "step": 1124 + }, + { + "epoch": 0.5251865671641791, + "grad_norm": 1.3075521549092934, + "learning_rate": 9.82986318739234e-06, + "loss": 0.1995, + "step": 1126 + }, + { + "epoch": 0.5261194029850746, + "grad_norm": 1.3264911775768808, + "learning_rate": 9.828456480461486e-06, + "loss": 0.2019, + "step": 1128 + }, + { + "epoch": 0.5270522388059702, + "grad_norm": 1.2812570885993706, + "learning_rate": 9.82704408354761e-06, + "loss": 0.1906, + "step": 1130 + }, + { + "epoch": 0.5279850746268657, + "grad_norm": 1.295091048232688, + "learning_rate": 9.825625998315117e-06, + "loss": 0.1875, + "step": 1132 + }, + { + "epoch": 0.5289179104477612, + "grad_norm": 1.5095428901850168, + "learning_rate": 9.824202226435116e-06, + "loss": 0.1839, + "step": 1134 + }, + { + "epoch": 0.5298507462686567, + "grad_norm": 1.2281376530748465, + "learning_rate": 9.82277276958542e-06, + "loss": 0.1828, + "step": 1136 + }, + { + "epoch": 0.5307835820895522, + "grad_norm": 1.4547432072625253, + "learning_rate": 9.82133762945054e-06, + "loss": 0.2031, + "step": 1138 + }, + { + "epoch": 0.5317164179104478, + "grad_norm": 1.2675070633970242, + "learning_rate": 9.819896807721682e-06, + "loss": 0.1942, + "step": 1140 + }, + { + "epoch": 0.5326492537313433, + "grad_norm": 1.4592301039817077, + "learning_rate": 9.818450306096752e-06, + "loss": 0.2331, + "step": 1142 + }, + { + "epoch": 0.5335820895522388, + "grad_norm": 1.1662531402310314, + "learning_rate": 9.816998126280345e-06, + "loss": 0.2083, + "step": 1144 + }, + { + "epoch": 0.5345149253731343, + "grad_norm": 1.3143246465827019, + "learning_rate": 9.815540269983745e-06, + "loss": 0.2025, + "step": 1146 + }, + { + "epoch": 0.5354477611940298, + "grad_norm": 1.4739554155865877, + "learning_rate": 9.814076738924934e-06, + "loss": 0.2148, + "step": 1148 + }, + { + "epoch": 0.5363805970149254, + "grad_norm": 1.3248478807110968, + "learning_rate": 9.812607534828576e-06, + "loss": 0.2223, + "step": 1150 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 1.3260805713728958, + "learning_rate": 9.811132659426018e-06, + "loss": 0.1778, + "step": 1152 + }, + { + "epoch": 0.5382462686567164, + "grad_norm": 1.3533071643276118, + "learning_rate": 9.809652114455292e-06, + "loss": 0.2103, + "step": 1154 + }, + { + "epoch": 0.539179104477612, + "grad_norm": 1.3850454239020296, + "learning_rate": 9.808165901661117e-06, + "loss": 0.1889, + "step": 1156 + }, + { + "epoch": 0.5401119402985075, + "grad_norm": 1.2284191439984462, + "learning_rate": 9.806674022794884e-06, + "loss": 0.2, + "step": 1158 + }, + { + "epoch": 0.5410447761194029, + "grad_norm": 1.6822062090059398, + "learning_rate": 9.805176479614661e-06, + "loss": 0.237, + "step": 1160 + }, + { + "epoch": 0.5419776119402985, + "grad_norm": 1.268048215194861, + "learning_rate": 9.803673273885195e-06, + "loss": 0.2088, + "step": 1162 + }, + { + "epoch": 0.542910447761194, + "grad_norm": 1.2288813185660052, + "learning_rate": 9.802164407377905e-06, + "loss": 0.2138, + "step": 1164 + }, + { + "epoch": 0.5438432835820896, + "grad_norm": 1.2222635278760878, + "learning_rate": 9.800649881870877e-06, + "loss": 0.2091, + "step": 1166 + }, + { + "epoch": 0.5447761194029851, + "grad_norm": 1.4424946066311257, + "learning_rate": 9.79912969914887e-06, + "loss": 0.199, + "step": 1168 + }, + { + "epoch": 0.5457089552238806, + "grad_norm": 1.2926068446171315, + "learning_rate": 9.797603861003311e-06, + "loss": 0.1697, + "step": 1170 + }, + { + "epoch": 0.5466417910447762, + "grad_norm": 1.12685431622789, + "learning_rate": 9.796072369232283e-06, + "loss": 0.1728, + "step": 1172 + }, + { + "epoch": 0.5475746268656716, + "grad_norm": 1.3114420467168177, + "learning_rate": 9.794535225640544e-06, + "loss": 0.2013, + "step": 1174 + }, + { + "epoch": 0.5485074626865671, + "grad_norm": 1.3661627874545268, + "learning_rate": 9.7929924320395e-06, + "loss": 0.1773, + "step": 1176 + }, + { + "epoch": 0.5494402985074627, + "grad_norm": 1.2520733508228175, + "learning_rate": 9.791443990247221e-06, + "loss": 0.1973, + "step": 1178 + }, + { + "epoch": 0.5503731343283582, + "grad_norm": 1.363288918506065, + "learning_rate": 9.789889902088435e-06, + "loss": 0.2135, + "step": 1180 + }, + { + "epoch": 0.5513059701492538, + "grad_norm": 1.2144120727241154, + "learning_rate": 9.78833016939452e-06, + "loss": 0.194, + "step": 1182 + }, + { + "epoch": 0.5522388059701493, + "grad_norm": 1.2569272956554611, + "learning_rate": 9.786764794003507e-06, + "loss": 0.1675, + "step": 1184 + }, + { + "epoch": 0.5531716417910447, + "grad_norm": 1.3627683164721545, + "learning_rate": 9.785193777760075e-06, + "loss": 0.2228, + "step": 1186 + }, + { + "epoch": 0.5541044776119403, + "grad_norm": 1.280885062870962, + "learning_rate": 9.783617122515554e-06, + "loss": 0.1742, + "step": 1188 + }, + { + "epoch": 0.5550373134328358, + "grad_norm": 1.196759223910446, + "learning_rate": 9.782034830127916e-06, + "loss": 0.1807, + "step": 1190 + }, + { + "epoch": 0.5559701492537313, + "grad_norm": 1.2585826253661034, + "learning_rate": 9.780446902461778e-06, + "loss": 0.1936, + "step": 1192 + }, + { + "epoch": 0.5569029850746269, + "grad_norm": 1.3570128331998361, + "learning_rate": 9.778853341388397e-06, + "loss": 0.1917, + "step": 1194 + }, + { + "epoch": 0.5578358208955224, + "grad_norm": 1.3831013550474431, + "learning_rate": 9.777254148785665e-06, + "loss": 0.1952, + "step": 1196 + }, + { + "epoch": 0.558768656716418, + "grad_norm": 1.1801792287210577, + "learning_rate": 9.775649326538115e-06, + "loss": 0.1894, + "step": 1198 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 1.2394044831214288, + "learning_rate": 9.774038876536915e-06, + "loss": 0.1878, + "step": 1200 + }, + { + "epoch": 0.5606343283582089, + "grad_norm": 1.372919737346026, + "learning_rate": 9.772422800679859e-06, + "loss": 0.2237, + "step": 1202 + }, + { + "epoch": 0.5615671641791045, + "grad_norm": 1.2246685151667145, + "learning_rate": 9.770801100871377e-06, + "loss": 0.1957, + "step": 1204 + }, + { + "epoch": 0.5625, + "grad_norm": 1.3216940992177963, + "learning_rate": 9.769173779022525e-06, + "loss": 0.201, + "step": 1206 + }, + { + "epoch": 0.5634328358208955, + "grad_norm": 1.4574705082260384, + "learning_rate": 9.767540837050978e-06, + "loss": 0.1934, + "step": 1208 + }, + { + "epoch": 0.5643656716417911, + "grad_norm": 1.4024977043721756, + "learning_rate": 9.765902276881043e-06, + "loss": 0.2063, + "step": 1210 + }, + { + "epoch": 0.5652985074626866, + "grad_norm": 1.291650160322974, + "learning_rate": 9.764258100443641e-06, + "loss": 0.2065, + "step": 1212 + }, + { + "epoch": 0.566231343283582, + "grad_norm": 1.4338365153537704, + "learning_rate": 9.762608309676315e-06, + "loss": 0.2178, + "step": 1214 + }, + { + "epoch": 0.5671641791044776, + "grad_norm": 1.2157239415060652, + "learning_rate": 9.760952906523223e-06, + "loss": 0.1972, + "step": 1216 + }, + { + "epoch": 0.5680970149253731, + "grad_norm": 1.3194975368927422, + "learning_rate": 9.759291892935135e-06, + "loss": 0.1764, + "step": 1218 + }, + { + "epoch": 0.5690298507462687, + "grad_norm": 1.2348920755812913, + "learning_rate": 9.757625270869437e-06, + "loss": 0.19, + "step": 1220 + }, + { + "epoch": 0.5699626865671642, + "grad_norm": 1.3197574539922416, + "learning_rate": 9.755953042290116e-06, + "loss": 0.2096, + "step": 1222 + }, + { + "epoch": 0.5708955223880597, + "grad_norm": 1.4130620736835018, + "learning_rate": 9.754275209167779e-06, + "loss": 0.1988, + "step": 1224 + }, + { + "epoch": 0.5718283582089553, + "grad_norm": 1.2923839968760171, + "learning_rate": 9.752591773479622e-06, + "loss": 0.2032, + "step": 1226 + }, + { + "epoch": 0.5727611940298507, + "grad_norm": 1.2126080773255015, + "learning_rate": 9.750902737209456e-06, + "loss": 0.2058, + "step": 1228 + }, + { + "epoch": 0.5736940298507462, + "grad_norm": 1.2680649551355194, + "learning_rate": 9.749208102347684e-06, + "loss": 0.1674, + "step": 1230 + }, + { + "epoch": 0.5746268656716418, + "grad_norm": 1.2735016462093087, + "learning_rate": 9.747507870891311e-06, + "loss": 0.205, + "step": 1232 + }, + { + "epoch": 0.5755597014925373, + "grad_norm": 1.5332136085223254, + "learning_rate": 9.745802044843935e-06, + "loss": 0.2426, + "step": 1234 + }, + { + "epoch": 0.5764925373134329, + "grad_norm": 1.1731323643817357, + "learning_rate": 9.744090626215745e-06, + "loss": 0.1828, + "step": 1236 + }, + { + "epoch": 0.5774253731343284, + "grad_norm": 1.2990854061105677, + "learning_rate": 9.742373617023527e-06, + "loss": 0.2037, + "step": 1238 + }, + { + "epoch": 0.5783582089552238, + "grad_norm": 1.25919557077718, + "learning_rate": 9.740651019290648e-06, + "loss": 0.1856, + "step": 1240 + }, + { + "epoch": 0.5792910447761194, + "grad_norm": 1.2880285382358647, + "learning_rate": 9.738922835047065e-06, + "loss": 0.2159, + "step": 1242 + }, + { + "epoch": 0.5802238805970149, + "grad_norm": 1.3431597710356027, + "learning_rate": 9.737189066329314e-06, + "loss": 0.2026, + "step": 1244 + }, + { + "epoch": 0.5811567164179104, + "grad_norm": 1.237926115980232, + "learning_rate": 9.735449715180518e-06, + "loss": 0.1939, + "step": 1246 + }, + { + "epoch": 0.582089552238806, + "grad_norm": 1.416076845947073, + "learning_rate": 9.733704783650374e-06, + "loss": 0.1932, + "step": 1248 + }, + { + "epoch": 0.5830223880597015, + "grad_norm": 1.3086449118802606, + "learning_rate": 9.731954273795155e-06, + "loss": 0.2094, + "step": 1250 + }, + { + "epoch": 0.5839552238805971, + "grad_norm": 1.4112186537179239, + "learning_rate": 9.73019818767771e-06, + "loss": 0.2065, + "step": 1252 + }, + { + "epoch": 0.5848880597014925, + "grad_norm": 1.1933845249042692, + "learning_rate": 9.72843652736746e-06, + "loss": 0.1787, + "step": 1254 + }, + { + "epoch": 0.585820895522388, + "grad_norm": 1.290817590513132, + "learning_rate": 9.72666929494039e-06, + "loss": 0.1909, + "step": 1256 + }, + { + "epoch": 0.5867537313432836, + "grad_norm": 1.2977497377639067, + "learning_rate": 9.724896492479057e-06, + "loss": 0.1884, + "step": 1258 + }, + { + "epoch": 0.5876865671641791, + "grad_norm": 1.2916204111336886, + "learning_rate": 9.723118122072575e-06, + "loss": 0.2086, + "step": 1260 + }, + { + "epoch": 0.5886194029850746, + "grad_norm": 1.276746671742708, + "learning_rate": 9.721334185816627e-06, + "loss": 0.2035, + "step": 1262 + }, + { + "epoch": 0.5895522388059702, + "grad_norm": 1.2487362167598965, + "learning_rate": 9.71954468581345e-06, + "loss": 0.2147, + "step": 1264 + }, + { + "epoch": 0.5904850746268657, + "grad_norm": 1.217991598150849, + "learning_rate": 9.717749624171842e-06, + "loss": 0.1729, + "step": 1266 + }, + { + "epoch": 0.5914179104477612, + "grad_norm": 1.2305434973699685, + "learning_rate": 9.715949003007145e-06, + "loss": 0.1803, + "step": 1268 + }, + { + "epoch": 0.5923507462686567, + "grad_norm": 1.3312002886023842, + "learning_rate": 9.714142824441268e-06, + "loss": 0.2177, + "step": 1270 + }, + { + "epoch": 0.5932835820895522, + "grad_norm": 1.3622656583786625, + "learning_rate": 9.712331090602654e-06, + "loss": 0.2022, + "step": 1272 + }, + { + "epoch": 0.5942164179104478, + "grad_norm": 1.1678229978345787, + "learning_rate": 9.7105138036263e-06, + "loss": 0.185, + "step": 1274 + }, + { + "epoch": 0.5951492537313433, + "grad_norm": 1.3689661767316197, + "learning_rate": 9.708690965653749e-06, + "loss": 0.1842, + "step": 1276 + }, + { + "epoch": 0.5960820895522388, + "grad_norm": 1.4710998548894345, + "learning_rate": 9.70686257883308e-06, + "loss": 0.197, + "step": 1278 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 1.4369031982151708, + "learning_rate": 9.705028645318913e-06, + "loss": 0.1866, + "step": 1280 + }, + { + "epoch": 0.5979477611940298, + "grad_norm": 1.3143018413171865, + "learning_rate": 9.703189167272404e-06, + "loss": 0.1969, + "step": 1282 + }, + { + "epoch": 0.5988805970149254, + "grad_norm": 1.3181024885453547, + "learning_rate": 9.701344146861246e-06, + "loss": 0.2021, + "step": 1284 + }, + { + "epoch": 0.5998134328358209, + "grad_norm": 1.342992985960691, + "learning_rate": 9.699493586259658e-06, + "loss": 0.1839, + "step": 1286 + }, + { + "epoch": 0.6007462686567164, + "grad_norm": 1.3035694458919886, + "learning_rate": 9.697637487648392e-06, + "loss": 0.1906, + "step": 1288 + }, + { + "epoch": 0.601679104477612, + "grad_norm": 1.261927746228036, + "learning_rate": 9.695775853214725e-06, + "loss": 0.1895, + "step": 1290 + }, + { + "epoch": 0.6026119402985075, + "grad_norm": 1.2777086230160113, + "learning_rate": 9.693908685152456e-06, + "loss": 0.1865, + "step": 1292 + }, + { + "epoch": 0.6035447761194029, + "grad_norm": 1.2120672739654221, + "learning_rate": 9.692035985661906e-06, + "loss": 0.2036, + "step": 1294 + }, + { + "epoch": 0.6044776119402985, + "grad_norm": 1.2677580779039375, + "learning_rate": 9.690157756949914e-06, + "loss": 0.1968, + "step": 1296 + }, + { + "epoch": 0.605410447761194, + "grad_norm": 1.3788989498698019, + "learning_rate": 9.688274001229838e-06, + "loss": 0.1732, + "step": 1298 + }, + { + "epoch": 0.6063432835820896, + "grad_norm": 1.285932007125996, + "learning_rate": 9.686384720721543e-06, + "loss": 0.2002, + "step": 1300 + }, + { + "epoch": 0.6072761194029851, + "grad_norm": 1.3150008423127384, + "learning_rate": 9.684489917651409e-06, + "loss": 0.1835, + "step": 1302 + }, + { + "epoch": 0.6082089552238806, + "grad_norm": 1.3903895527118488, + "learning_rate": 9.682589594252325e-06, + "loss": 0.1968, + "step": 1304 + }, + { + "epoch": 0.6091417910447762, + "grad_norm": 1.2665624601541696, + "learning_rate": 9.68068375276368e-06, + "loss": 0.1982, + "step": 1306 + }, + { + "epoch": 0.6100746268656716, + "grad_norm": 1.3297377918765387, + "learning_rate": 9.678772395431371e-06, + "loss": 0.219, + "step": 1308 + }, + { + "epoch": 0.6110074626865671, + "grad_norm": 1.2825309928521964, + "learning_rate": 9.676855524507793e-06, + "loss": 0.1718, + "step": 1310 + }, + { + "epoch": 0.6119402985074627, + "grad_norm": 1.2282825080891733, + "learning_rate": 9.674933142251836e-06, + "loss": 0.1937, + "step": 1312 + }, + { + "epoch": 0.6128731343283582, + "grad_norm": 1.2632075190559702, + "learning_rate": 9.67300525092889e-06, + "loss": 0.1969, + "step": 1314 + }, + { + "epoch": 0.6138059701492538, + "grad_norm": 1.2811311953257711, + "learning_rate": 9.671071852810832e-06, + "loss": 0.219, + "step": 1316 + }, + { + "epoch": 0.6147388059701493, + "grad_norm": 1.2853508640105762, + "learning_rate": 9.66913295017603e-06, + "loss": 0.186, + "step": 1318 + }, + { + "epoch": 0.6156716417910447, + "grad_norm": 1.2313760707837031, + "learning_rate": 9.667188545309342e-06, + "loss": 0.1822, + "step": 1320 + }, + { + "epoch": 0.6166044776119403, + "grad_norm": 1.3445600658444656, + "learning_rate": 9.665238640502104e-06, + "loss": 0.1872, + "step": 1322 + }, + { + "epoch": 0.6175373134328358, + "grad_norm": 1.8331986833418121, + "learning_rate": 9.663283238052136e-06, + "loss": 0.1951, + "step": 1324 + }, + { + "epoch": 0.6184701492537313, + "grad_norm": 1.337579658657769, + "learning_rate": 9.66132234026374e-06, + "loss": 0.203, + "step": 1326 + }, + { + "epoch": 0.6194029850746269, + "grad_norm": 1.2549271918408587, + "learning_rate": 9.659355949447689e-06, + "loss": 0.1862, + "step": 1328 + }, + { + "epoch": 0.6203358208955224, + "grad_norm": 1.128214501303178, + "learning_rate": 9.657384067921229e-06, + "loss": 0.191, + "step": 1330 + }, + { + "epoch": 0.621268656716418, + "grad_norm": 1.2270654230039046, + "learning_rate": 9.65540669800808e-06, + "loss": 0.1932, + "step": 1332 + }, + { + "epoch": 0.6222014925373134, + "grad_norm": 1.2575667177230767, + "learning_rate": 9.65342384203843e-06, + "loss": 0.2024, + "step": 1334 + }, + { + "epoch": 0.6231343283582089, + "grad_norm": 1.2188705632524715, + "learning_rate": 9.651435502348927e-06, + "loss": 0.181, + "step": 1336 + }, + { + "epoch": 0.6240671641791045, + "grad_norm": 1.4700426250468017, + "learning_rate": 9.649441681282682e-06, + "loss": 0.2092, + "step": 1338 + }, + { + "epoch": 0.625, + "grad_norm": 1.847917231034415, + "learning_rate": 9.647442381189273e-06, + "loss": 0.1969, + "step": 1340 + }, + { + "epoch": 0.6259328358208955, + "grad_norm": 1.2257028686491196, + "learning_rate": 9.645437604424726e-06, + "loss": 0.1877, + "step": 1342 + }, + { + "epoch": 0.6268656716417911, + "grad_norm": 1.1912266570731185, + "learning_rate": 9.643427353351522e-06, + "loss": 0.2148, + "step": 1344 + }, + { + "epoch": 0.6277985074626866, + "grad_norm": 1.3102113230009198, + "learning_rate": 9.641411630338598e-06, + "loss": 0.2079, + "step": 1346 + }, + { + "epoch": 0.628731343283582, + "grad_norm": 1.3472399048562653, + "learning_rate": 9.639390437761334e-06, + "loss": 0.2002, + "step": 1348 + }, + { + "epoch": 0.6296641791044776, + "grad_norm": 1.3045561125394984, + "learning_rate": 9.63736377800156e-06, + "loss": 0.2032, + "step": 1350 + }, + { + "epoch": 0.6305970149253731, + "grad_norm": 1.1318941901276385, + "learning_rate": 9.635331653447545e-06, + "loss": 0.1868, + "step": 1352 + }, + { + "epoch": 0.6315298507462687, + "grad_norm": 1.1953449932562992, + "learning_rate": 9.633294066493999e-06, + "loss": 0.1905, + "step": 1354 + }, + { + "epoch": 0.6324626865671642, + "grad_norm": 1.168253289218291, + "learning_rate": 9.63125101954207e-06, + "loss": 0.1937, + "step": 1356 + }, + { + "epoch": 0.6333955223880597, + "grad_norm": 1.3813246317593162, + "learning_rate": 9.62920251499934e-06, + "loss": 0.2088, + "step": 1358 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 1.1336639737226735, + "learning_rate": 9.627148555279819e-06, + "loss": 0.1844, + "step": 1360 + }, + { + "epoch": 0.6352611940298507, + "grad_norm": 1.6281711902801774, + "learning_rate": 9.625089142803953e-06, + "loss": 0.197, + "step": 1362 + }, + { + "epoch": 0.6361940298507462, + "grad_norm": 1.2530314862291678, + "learning_rate": 9.623024279998606e-06, + "loss": 0.1816, + "step": 1364 + }, + { + "epoch": 0.6371268656716418, + "grad_norm": 1.2861829365395432, + "learning_rate": 9.620953969297067e-06, + "loss": 0.1988, + "step": 1366 + }, + { + "epoch": 0.6380597014925373, + "grad_norm": 1.2670269512856949, + "learning_rate": 9.618878213139048e-06, + "loss": 0.1954, + "step": 1368 + }, + { + "epoch": 0.6389925373134329, + "grad_norm": 1.1057473468977361, + "learning_rate": 9.616797013970676e-06, + "loss": 0.1819, + "step": 1370 + }, + { + "epoch": 0.6399253731343284, + "grad_norm": 1.283855563211758, + "learning_rate": 9.61471037424449e-06, + "loss": 0.2046, + "step": 1372 + }, + { + "epoch": 0.6408582089552238, + "grad_norm": 1.315178191477583, + "learning_rate": 9.612618296419443e-06, + "loss": 0.2048, + "step": 1374 + }, + { + "epoch": 0.6417910447761194, + "grad_norm": 1.221371229104398, + "learning_rate": 9.610520782960899e-06, + "loss": 0.1816, + "step": 1376 + }, + { + "epoch": 0.6427238805970149, + "grad_norm": 1.3181977336147686, + "learning_rate": 9.608417836340619e-06, + "loss": 0.1953, + "step": 1378 + }, + { + "epoch": 0.6436567164179104, + "grad_norm": 1.233439829786006, + "learning_rate": 9.606309459036776e-06, + "loss": 0.2116, + "step": 1380 + }, + { + "epoch": 0.644589552238806, + "grad_norm": 1.3413337842853288, + "learning_rate": 9.604195653533937e-06, + "loss": 0.2185, + "step": 1382 + }, + { + "epoch": 0.6455223880597015, + "grad_norm": 1.161182680072267, + "learning_rate": 9.602076422323067e-06, + "loss": 0.1987, + "step": 1384 + }, + { + "epoch": 0.6464552238805971, + "grad_norm": 1.3403910691944039, + "learning_rate": 9.599951767901527e-06, + "loss": 0.1935, + "step": 1386 + }, + { + "epoch": 0.6473880597014925, + "grad_norm": 1.1928214861429798, + "learning_rate": 9.597821692773064e-06, + "loss": 0.189, + "step": 1388 + }, + { + "epoch": 0.648320895522388, + "grad_norm": 1.3653775619687112, + "learning_rate": 9.595686199447818e-06, + "loss": 0.2154, + "step": 1390 + }, + { + "epoch": 0.6492537313432836, + "grad_norm": 1.2711818078715147, + "learning_rate": 9.59354529044231e-06, + "loss": 0.1834, + "step": 1392 + }, + { + "epoch": 0.6501865671641791, + "grad_norm": 1.3350085586582887, + "learning_rate": 9.591398968279448e-06, + "loss": 0.1764, + "step": 1394 + }, + { + "epoch": 0.6511194029850746, + "grad_norm": 1.1201108159093904, + "learning_rate": 9.589247235488512e-06, + "loss": 0.1673, + "step": 1396 + }, + { + "epoch": 0.6520522388059702, + "grad_norm": 1.2278203587530399, + "learning_rate": 9.587090094605163e-06, + "loss": 0.1876, + "step": 1398 + }, + { + "epoch": 0.6529850746268657, + "grad_norm": 1.2782004653426244, + "learning_rate": 9.584927548171435e-06, + "loss": 0.1934, + "step": 1400 + }, + { + "epoch": 0.6539179104477612, + "grad_norm": 1.3373655776274411, + "learning_rate": 9.582759598735732e-06, + "loss": 0.1962, + "step": 1402 + }, + { + "epoch": 0.6548507462686567, + "grad_norm": 1.22441813569092, + "learning_rate": 9.58058624885282e-06, + "loss": 0.2081, + "step": 1404 + }, + { + "epoch": 0.6557835820895522, + "grad_norm": 1.310275513321286, + "learning_rate": 9.578407501083835e-06, + "loss": 0.1876, + "step": 1406 + }, + { + "epoch": 0.6567164179104478, + "grad_norm": 1.388350137552061, + "learning_rate": 9.576223357996272e-06, + "loss": 0.1869, + "step": 1408 + }, + { + "epoch": 0.6576492537313433, + "grad_norm": 1.3083547650148935, + "learning_rate": 9.574033822163984e-06, + "loss": 0.1872, + "step": 1410 + }, + { + "epoch": 0.6585820895522388, + "grad_norm": 1.209438419679871, + "learning_rate": 9.57183889616718e-06, + "loss": 0.1936, + "step": 1412 + }, + { + "epoch": 0.6595149253731343, + "grad_norm": 1.3540134262961245, + "learning_rate": 9.569638582592418e-06, + "loss": 0.2029, + "step": 1414 + }, + { + "epoch": 0.6604477611940298, + "grad_norm": 1.322276588604727, + "learning_rate": 9.567432884032609e-06, + "loss": 0.1893, + "step": 1416 + }, + { + "epoch": 0.6613805970149254, + "grad_norm": 1.1512156500697452, + "learning_rate": 9.565221803087003e-06, + "loss": 0.1902, + "step": 1418 + }, + { + "epoch": 0.6623134328358209, + "grad_norm": 1.2368849361213836, + "learning_rate": 9.563005342361204e-06, + "loss": 0.1799, + "step": 1420 + }, + { + "epoch": 0.6632462686567164, + "grad_norm": 1.2929994455263356, + "learning_rate": 9.560783504467143e-06, + "loss": 0.1947, + "step": 1422 + }, + { + "epoch": 0.664179104477612, + "grad_norm": 1.1644028068055394, + "learning_rate": 9.558556292023097e-06, + "loss": 0.1875, + "step": 1424 + }, + { + "epoch": 0.6651119402985075, + "grad_norm": 1.3848532662506554, + "learning_rate": 9.55632370765367e-06, + "loss": 0.1982, + "step": 1426 + }, + { + "epoch": 0.6660447761194029, + "grad_norm": 1.2337805181838923, + "learning_rate": 9.554085753989803e-06, + "loss": 0.2111, + "step": 1428 + }, + { + "epoch": 0.6669776119402985, + "grad_norm": 1.2331433498442423, + "learning_rate": 9.55184243366876e-06, + "loss": 0.2032, + "step": 1430 + }, + { + "epoch": 0.667910447761194, + "grad_norm": 1.2746494291609998, + "learning_rate": 9.54959374933413e-06, + "loss": 0.1768, + "step": 1432 + }, + { + "epoch": 0.6688432835820896, + "grad_norm": 1.400845736833123, + "learning_rate": 9.547339703635818e-06, + "loss": 0.2261, + "step": 1434 + }, + { + "epoch": 0.6697761194029851, + "grad_norm": 1.2689933104196989, + "learning_rate": 9.54508029923006e-06, + "loss": 0.1806, + "step": 1436 + }, + { + "epoch": 0.6707089552238806, + "grad_norm": 1.2141135219059682, + "learning_rate": 9.542815538779395e-06, + "loss": 0.1718, + "step": 1438 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 1.1811979327498543, + "learning_rate": 9.540545424952678e-06, + "loss": 0.2136, + "step": 1440 + }, + { + "epoch": 0.6725746268656716, + "grad_norm": 1.3465680725570635, + "learning_rate": 9.53826996042507e-06, + "loss": 0.2031, + "step": 1442 + }, + { + "epoch": 0.6735074626865671, + "grad_norm": 1.3559394534551104, + "learning_rate": 9.535989147878044e-06, + "loss": 0.1926, + "step": 1444 + }, + { + "epoch": 0.6744402985074627, + "grad_norm": 1.3486946004743048, + "learning_rate": 9.53370298999937e-06, + "loss": 0.1851, + "step": 1446 + }, + { + "epoch": 0.6753731343283582, + "grad_norm": 1.3398692460415387, + "learning_rate": 9.531411489483115e-06, + "loss": 0.206, + "step": 1448 + }, + { + "epoch": 0.6763059701492538, + "grad_norm": 1.0955446750720867, + "learning_rate": 9.529114649029646e-06, + "loss": 0.17, + "step": 1450 + }, + { + "epoch": 0.6772388059701493, + "grad_norm": 1.141603349628182, + "learning_rate": 9.526812471345623e-06, + "loss": 0.1907, + "step": 1452 + }, + { + "epoch": 0.6781716417910447, + "grad_norm": 1.3310977144390403, + "learning_rate": 9.524504959143993e-06, + "loss": 0.2117, + "step": 1454 + }, + { + "epoch": 0.6791044776119403, + "grad_norm": 1.1120098628415085, + "learning_rate": 9.522192115143992e-06, + "loss": 0.1727, + "step": 1456 + }, + { + "epoch": 0.6800373134328358, + "grad_norm": 1.1861915861899635, + "learning_rate": 9.519873942071134e-06, + "loss": 0.197, + "step": 1458 + }, + { + "epoch": 0.6809701492537313, + "grad_norm": 1.2788597711174958, + "learning_rate": 9.51755044265722e-06, + "loss": 0.1835, + "step": 1460 + }, + { + "epoch": 0.6819029850746269, + "grad_norm": 1.447329674975208, + "learning_rate": 9.515221619640323e-06, + "loss": 0.2019, + "step": 1462 + }, + { + "epoch": 0.6828358208955224, + "grad_norm": 1.1453947729026939, + "learning_rate": 9.51288747576479e-06, + "loss": 0.1874, + "step": 1464 + }, + { + "epoch": 0.683768656716418, + "grad_norm": 1.303187551114296, + "learning_rate": 9.51054801378124e-06, + "loss": 0.1906, + "step": 1466 + }, + { + "epoch": 0.6847014925373134, + "grad_norm": 1.2823609320700624, + "learning_rate": 9.508203236446558e-06, + "loss": 0.1852, + "step": 1468 + }, + { + "epoch": 0.6856343283582089, + "grad_norm": 1.2989362328025698, + "learning_rate": 9.505853146523894e-06, + "loss": 0.1851, + "step": 1470 + }, + { + "epoch": 0.6865671641791045, + "grad_norm": 1.2218192464227928, + "learning_rate": 9.503497746782652e-06, + "loss": 0.1932, + "step": 1472 + }, + { + "epoch": 0.6875, + "grad_norm": 1.3136247512251407, + "learning_rate": 9.501137039998504e-06, + "loss": 0.2039, + "step": 1474 + }, + { + "epoch": 0.6884328358208955, + "grad_norm": 1.3223029230717274, + "learning_rate": 9.49877102895337e-06, + "loss": 0.1952, + "step": 1476 + }, + { + "epoch": 0.6893656716417911, + "grad_norm": 1.22272156891932, + "learning_rate": 9.496399716435417e-06, + "loss": 0.1986, + "step": 1478 + }, + { + "epoch": 0.6902985074626866, + "grad_norm": 1.161806305814099, + "learning_rate": 9.494023105239067e-06, + "loss": 0.1736, + "step": 1480 + }, + { + "epoch": 0.691231343283582, + "grad_norm": 1.1899958541831737, + "learning_rate": 9.49164119816498e-06, + "loss": 0.1768, + "step": 1482 + }, + { + "epoch": 0.6921641791044776, + "grad_norm": 1.2904641348916563, + "learning_rate": 9.489253998020062e-06, + "loss": 0.2, + "step": 1484 + }, + { + "epoch": 0.6930970149253731, + "grad_norm": 1.483768884420732, + "learning_rate": 9.486861507617452e-06, + "loss": 0.202, + "step": 1486 + }, + { + "epoch": 0.6940298507462687, + "grad_norm": 1.3110823692397675, + "learning_rate": 9.484463729776527e-06, + "loss": 0.2078, + "step": 1488 + }, + { + "epoch": 0.6949626865671642, + "grad_norm": 1.173941495674722, + "learning_rate": 9.48206066732289e-06, + "loss": 0.1723, + "step": 1490 + }, + { + "epoch": 0.6958955223880597, + "grad_norm": 1.3124080798355797, + "learning_rate": 9.479652323088377e-06, + "loss": 0.2186, + "step": 1492 + }, + { + "epoch": 0.6968283582089553, + "grad_norm": 1.2763222228363904, + "learning_rate": 9.477238699911046e-06, + "loss": 0.189, + "step": 1494 + }, + { + "epoch": 0.6977611940298507, + "grad_norm": 1.4136832251346632, + "learning_rate": 9.474819800635174e-06, + "loss": 0.2081, + "step": 1496 + }, + { + "epoch": 0.6986940298507462, + "grad_norm": 1.119137762218111, + "learning_rate": 9.472395628111255e-06, + "loss": 0.1868, + "step": 1498 + }, + { + "epoch": 0.6996268656716418, + "grad_norm": 1.387871176288593, + "learning_rate": 9.469966185196003e-06, + "loss": 0.2017, + "step": 1500 + }, + { + "epoch": 0.6996268656716418, + "eval_loss": 0.17079760134220123, + "eval_runtime": 321.3449, + "eval_samples_per_second": 47.444, + "eval_steps_per_second": 5.931, + "step": 1500 + }, + { + "epoch": 0.7005597014925373, + "grad_norm": 1.3658040018974007, + "learning_rate": 9.467531474752336e-06, + "loss": 0.1958, + "step": 1502 + }, + { + "epoch": 0.7014925373134329, + "grad_norm": 1.1346929234604453, + "learning_rate": 9.465091499649385e-06, + "loss": 0.1744, + "step": 1504 + }, + { + "epoch": 0.7024253731343284, + "grad_norm": 1.3117793008529666, + "learning_rate": 9.46264626276248e-06, + "loss": 0.1795, + "step": 1506 + }, + { + "epoch": 0.7033582089552238, + "grad_norm": 1.1950098061171308, + "learning_rate": 9.460195766973154e-06, + "loss": 0.1798, + "step": 1508 + }, + { + "epoch": 0.7042910447761194, + "grad_norm": 1.2239090727892925, + "learning_rate": 9.45774001516914e-06, + "loss": 0.1882, + "step": 1510 + }, + { + "epoch": 0.7052238805970149, + "grad_norm": 1.1393660911507115, + "learning_rate": 9.45527901024436e-06, + "loss": 0.1848, + "step": 1512 + }, + { + "epoch": 0.7061567164179104, + "grad_norm": 1.3511473443468591, + "learning_rate": 9.452812755098927e-06, + "loss": 0.1906, + "step": 1514 + }, + { + "epoch": 0.707089552238806, + "grad_norm": 1.3167759751706674, + "learning_rate": 9.450341252639144e-06, + "loss": 0.201, + "step": 1516 + }, + { + "epoch": 0.7080223880597015, + "grad_norm": 1.1505516796278474, + "learning_rate": 9.447864505777496e-06, + "loss": 0.1695, + "step": 1518 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 1.2689369158761514, + "learning_rate": 9.445382517432648e-06, + "loss": 0.1835, + "step": 1520 + }, + { + "epoch": 0.7098880597014925, + "grad_norm": 1.161677660666682, + "learning_rate": 9.442895290529442e-06, + "loss": 0.1953, + "step": 1522 + }, + { + "epoch": 0.710820895522388, + "grad_norm": 1.3495510031393991, + "learning_rate": 9.440402827998893e-06, + "loss": 0.2064, + "step": 1524 + }, + { + "epoch": 0.7117537313432836, + "grad_norm": 1.1457330368321792, + "learning_rate": 9.437905132778185e-06, + "loss": 0.1906, + "step": 1526 + }, + { + "epoch": 0.7126865671641791, + "grad_norm": 1.1854030525063404, + "learning_rate": 9.43540220781067e-06, + "loss": 0.1998, + "step": 1528 + }, + { + "epoch": 0.7136194029850746, + "grad_norm": 1.478371017865085, + "learning_rate": 9.432894056045862e-06, + "loss": 0.1963, + "step": 1530 + }, + { + "epoch": 0.7145522388059702, + "grad_norm": 1.3181432461815878, + "learning_rate": 9.430380680439435e-06, + "loss": 0.1785, + "step": 1532 + }, + { + "epoch": 0.7154850746268657, + "grad_norm": 1.213950807873333, + "learning_rate": 9.42786208395322e-06, + "loss": 0.1968, + "step": 1534 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 1.2198669604077816, + "learning_rate": 9.425338269555193e-06, + "loss": 0.1993, + "step": 1536 + }, + { + "epoch": 0.7173507462686567, + "grad_norm": 1.194092739592872, + "learning_rate": 9.422809240219491e-06, + "loss": 0.1776, + "step": 1538 + }, + { + "epoch": 0.7182835820895522, + "grad_norm": 1.3171151245106296, + "learning_rate": 9.42027499892639e-06, + "loss": 0.1845, + "step": 1540 + }, + { + "epoch": 0.7192164179104478, + "grad_norm": 1.2333889462195131, + "learning_rate": 9.417735548662302e-06, + "loss": 0.199, + "step": 1542 + }, + { + "epoch": 0.7201492537313433, + "grad_norm": 1.2342130003172749, + "learning_rate": 9.41519089241979e-06, + "loss": 0.1717, + "step": 1544 + }, + { + "epoch": 0.7210820895522388, + "grad_norm": 1.297550445847169, + "learning_rate": 9.412641033197543e-06, + "loss": 0.1805, + "step": 1546 + }, + { + "epoch": 0.7220149253731343, + "grad_norm": 1.2155591431130077, + "learning_rate": 9.410085974000383e-06, + "loss": 0.182, + "step": 1548 + }, + { + "epoch": 0.7229477611940298, + "grad_norm": 1.3587429095966317, + "learning_rate": 9.407525717839262e-06, + "loss": 0.1907, + "step": 1550 + }, + { + "epoch": 0.7238805970149254, + "grad_norm": 1.1368429620830403, + "learning_rate": 9.404960267731251e-06, + "loss": 0.1866, + "step": 1552 + }, + { + "epoch": 0.7248134328358209, + "grad_norm": 1.234511344856253, + "learning_rate": 9.40238962669955e-06, + "loss": 0.1955, + "step": 1554 + }, + { + "epoch": 0.7257462686567164, + "grad_norm": 1.312778530731583, + "learning_rate": 9.399813797773472e-06, + "loss": 0.2072, + "step": 1556 + }, + { + "epoch": 0.726679104477612, + "grad_norm": 1.219616393554641, + "learning_rate": 9.397232783988439e-06, + "loss": 0.1805, + "step": 1558 + }, + { + "epoch": 0.7276119402985075, + "grad_norm": 1.3768136621236455, + "learning_rate": 9.39464658838599e-06, + "loss": 0.1866, + "step": 1560 + }, + { + "epoch": 0.7285447761194029, + "grad_norm": 1.3079769241521353, + "learning_rate": 9.392055214013765e-06, + "loss": 0.1859, + "step": 1562 + }, + { + "epoch": 0.7294776119402985, + "grad_norm": 1.1939382831199772, + "learning_rate": 9.389458663925512e-06, + "loss": 0.193, + "step": 1564 + }, + { + "epoch": 0.730410447761194, + "grad_norm": 1.267302678741195, + "learning_rate": 9.386856941181076e-06, + "loss": 0.2013, + "step": 1566 + }, + { + "epoch": 0.7313432835820896, + "grad_norm": 1.230885714676747, + "learning_rate": 9.384250048846394e-06, + "loss": 0.1854, + "step": 1568 + }, + { + "epoch": 0.7322761194029851, + "grad_norm": 1.3628808283440035, + "learning_rate": 9.381637989993497e-06, + "loss": 0.1856, + "step": 1570 + }, + { + "epoch": 0.7332089552238806, + "grad_norm": 1.0809491914264386, + "learning_rate": 9.37902076770051e-06, + "loss": 0.1799, + "step": 1572 + }, + { + "epoch": 0.7341417910447762, + "grad_norm": 1.3320904641678766, + "learning_rate": 9.376398385051635e-06, + "loss": 0.205, + "step": 1574 + }, + { + "epoch": 0.7350746268656716, + "grad_norm": 1.306969225551526, + "learning_rate": 9.373770845137162e-06, + "loss": 0.1858, + "step": 1576 + }, + { + "epoch": 0.7360074626865671, + "grad_norm": 1.4033966518233125, + "learning_rate": 9.371138151053449e-06, + "loss": 0.2023, + "step": 1578 + }, + { + "epoch": 0.7369402985074627, + "grad_norm": 1.4139340955951025, + "learning_rate": 9.368500305902939e-06, + "loss": 0.2044, + "step": 1580 + }, + { + "epoch": 0.7378731343283582, + "grad_norm": 1.2176456956703794, + "learning_rate": 9.365857312794136e-06, + "loss": 0.1791, + "step": 1582 + }, + { + "epoch": 0.7388059701492538, + "grad_norm": 1.3088867965970863, + "learning_rate": 9.363209174841617e-06, + "loss": 0.1955, + "step": 1584 + }, + { + "epoch": 0.7397388059701493, + "grad_norm": 1.201175692333774, + "learning_rate": 9.360555895166015e-06, + "loss": 0.1763, + "step": 1586 + }, + { + "epoch": 0.7406716417910447, + "grad_norm": 1.2783969974039087, + "learning_rate": 9.35789747689403e-06, + "loss": 0.2124, + "step": 1588 + }, + { + "epoch": 0.7416044776119403, + "grad_norm": 1.3368519874428064, + "learning_rate": 9.35523392315841e-06, + "loss": 0.2098, + "step": 1590 + }, + { + "epoch": 0.7425373134328358, + "grad_norm": 1.2456330261819875, + "learning_rate": 9.352565237097964e-06, + "loss": 0.1909, + "step": 1592 + }, + { + "epoch": 0.7434701492537313, + "grad_norm": 1.2614455060038814, + "learning_rate": 9.34989142185754e-06, + "loss": 0.1925, + "step": 1594 + }, + { + "epoch": 0.7444029850746269, + "grad_norm": 1.2118219426430827, + "learning_rate": 9.347212480588033e-06, + "loss": 0.1708, + "step": 1596 + }, + { + "epoch": 0.7453358208955224, + "grad_norm": 1.3061401074852776, + "learning_rate": 9.34452841644638e-06, + "loss": 0.1881, + "step": 1598 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 1.232290471152712, + "learning_rate": 9.341839232595555e-06, + "loss": 0.1878, + "step": 1600 + }, + { + "epoch": 0.7472014925373134, + "grad_norm": 1.125088295370148, + "learning_rate": 9.339144932204564e-06, + "loss": 0.1681, + "step": 1602 + }, + { + "epoch": 0.7481343283582089, + "grad_norm": 1.2799067080626618, + "learning_rate": 9.336445518448442e-06, + "loss": 0.1947, + "step": 1604 + }, + { + "epoch": 0.7490671641791045, + "grad_norm": 1.3566521061134003, + "learning_rate": 9.333740994508254e-06, + "loss": 0.2069, + "step": 1606 + }, + { + "epoch": 0.75, + "grad_norm": 1.218444067900376, + "learning_rate": 9.331031363571082e-06, + "loss": 0.1902, + "step": 1608 + }, + { + "epoch": 0.7509328358208955, + "grad_norm": 1.1900834243120095, + "learning_rate": 9.328316628830029e-06, + "loss": 0.1902, + "step": 1610 + }, + { + "epoch": 0.7518656716417911, + "grad_norm": 1.0790885063660123, + "learning_rate": 9.325596793484209e-06, + "loss": 0.1805, + "step": 1612 + }, + { + "epoch": 0.7527985074626866, + "grad_norm": 1.2336042126748985, + "learning_rate": 9.322871860738751e-06, + "loss": 0.199, + "step": 1614 + }, + { + "epoch": 0.753731343283582, + "grad_norm": 1.2482618141822903, + "learning_rate": 9.320141833804788e-06, + "loss": 0.1721, + "step": 1616 + }, + { + "epoch": 0.7546641791044776, + "grad_norm": 1.196552649254962, + "learning_rate": 9.317406715899458e-06, + "loss": 0.1809, + "step": 1618 + }, + { + "epoch": 0.7555970149253731, + "grad_norm": 1.375870755525294, + "learning_rate": 9.3146665102459e-06, + "loss": 0.199, + "step": 1620 + }, + { + "epoch": 0.7565298507462687, + "grad_norm": 1.1480319653135727, + "learning_rate": 9.31192122007324e-06, + "loss": 0.1821, + "step": 1622 + }, + { + "epoch": 0.7574626865671642, + "grad_norm": 1.2050690124236734, + "learning_rate": 9.309170848616606e-06, + "loss": 0.205, + "step": 1624 + }, + { + "epoch": 0.7583955223880597, + "grad_norm": 1.1574618619887107, + "learning_rate": 9.30641539911711e-06, + "loss": 0.1933, + "step": 1626 + }, + { + "epoch": 0.7593283582089553, + "grad_norm": 1.2836571870338427, + "learning_rate": 9.303654874821846e-06, + "loss": 0.2007, + "step": 1628 + }, + { + "epoch": 0.7602611940298507, + "grad_norm": 1.2027530695095416, + "learning_rate": 9.300889278983892e-06, + "loss": 0.1877, + "step": 1630 + }, + { + "epoch": 0.7611940298507462, + "grad_norm": 1.3636582387626675, + "learning_rate": 9.298118614862298e-06, + "loss": 0.207, + "step": 1632 + }, + { + "epoch": 0.7621268656716418, + "grad_norm": 1.2630348650248526, + "learning_rate": 9.295342885722092e-06, + "loss": 0.1758, + "step": 1634 + }, + { + "epoch": 0.7630597014925373, + "grad_norm": 1.329910260112892, + "learning_rate": 9.292562094834265e-06, + "loss": 0.204, + "step": 1636 + }, + { + "epoch": 0.7639925373134329, + "grad_norm": 1.2353843989957862, + "learning_rate": 9.289776245475777e-06, + "loss": 0.1685, + "step": 1638 + }, + { + "epoch": 0.7649253731343284, + "grad_norm": 1.2170648714970906, + "learning_rate": 9.28698534092955e-06, + "loss": 0.1836, + "step": 1640 + }, + { + "epoch": 0.7658582089552238, + "grad_norm": 1.2038791996871292, + "learning_rate": 9.284189384484458e-06, + "loss": 0.172, + "step": 1642 + }, + { + "epoch": 0.7667910447761194, + "grad_norm": 1.4119170877924636, + "learning_rate": 9.281388379435332e-06, + "loss": 0.198, + "step": 1644 + }, + { + "epoch": 0.7677238805970149, + "grad_norm": 1.224958739306716, + "learning_rate": 9.278582329082953e-06, + "loss": 0.1713, + "step": 1646 + }, + { + "epoch": 0.7686567164179104, + "grad_norm": 1.3635758240736457, + "learning_rate": 9.275771236734046e-06, + "loss": 0.2176, + "step": 1648 + }, + { + "epoch": 0.769589552238806, + "grad_norm": 0.9957691545425625, + "learning_rate": 9.272955105701276e-06, + "loss": 0.1783, + "step": 1650 + }, + { + "epoch": 0.7705223880597015, + "grad_norm": 1.2198158814021398, + "learning_rate": 9.270133939303248e-06, + "loss": 0.1986, + "step": 1652 + }, + { + "epoch": 0.7714552238805971, + "grad_norm": 1.230290778094303, + "learning_rate": 9.267307740864502e-06, + "loss": 0.1805, + "step": 1654 + }, + { + "epoch": 0.7723880597014925, + "grad_norm": 1.109344258482454, + "learning_rate": 9.264476513715506e-06, + "loss": 0.2014, + "step": 1656 + }, + { + "epoch": 0.773320895522388, + "grad_norm": 1.1926452649616313, + "learning_rate": 9.261640261192654e-06, + "loss": 0.196, + "step": 1658 + }, + { + "epoch": 0.7742537313432836, + "grad_norm": 1.1300301799492858, + "learning_rate": 9.25879898663826e-06, + "loss": 0.1729, + "step": 1660 + }, + { + "epoch": 0.7751865671641791, + "grad_norm": 1.3360029416525159, + "learning_rate": 9.255952693400562e-06, + "loss": 0.1895, + "step": 1662 + }, + { + "epoch": 0.7761194029850746, + "grad_norm": 1.2039456738595933, + "learning_rate": 9.253101384833708e-06, + "loss": 0.1984, + "step": 1664 + }, + { + "epoch": 0.7770522388059702, + "grad_norm": 1.2546247288328776, + "learning_rate": 9.250245064297752e-06, + "loss": 0.1844, + "step": 1666 + }, + { + "epoch": 0.7779850746268657, + "grad_norm": 1.4380813911346488, + "learning_rate": 9.247383735158666e-06, + "loss": 0.2154, + "step": 1668 + }, + { + "epoch": 0.7789179104477612, + "grad_norm": 1.2627577095705367, + "learning_rate": 9.24451740078831e-06, + "loss": 0.2264, + "step": 1670 + }, + { + "epoch": 0.7798507462686567, + "grad_norm": 1.2660475048501807, + "learning_rate": 9.241646064564457e-06, + "loss": 0.2043, + "step": 1672 + }, + { + "epoch": 0.7807835820895522, + "grad_norm": 1.3320766055488809, + "learning_rate": 9.238769729870763e-06, + "loss": 0.1896, + "step": 1674 + }, + { + "epoch": 0.7817164179104478, + "grad_norm": 1.190899383423424, + "learning_rate": 9.235888400096776e-06, + "loss": 0.1795, + "step": 1676 + }, + { + "epoch": 0.7826492537313433, + "grad_norm": 1.1817997896917685, + "learning_rate": 9.233002078637936e-06, + "loss": 0.1846, + "step": 1678 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 1.2912143789992765, + "learning_rate": 9.230110768895561e-06, + "loss": 0.1932, + "step": 1680 + }, + { + "epoch": 0.7845149253731343, + "grad_norm": 1.2830565916667247, + "learning_rate": 9.22721447427685e-06, + "loss": 0.1809, + "step": 1682 + }, + { + "epoch": 0.7854477611940298, + "grad_norm": 1.3494526085655905, + "learning_rate": 9.224313198194869e-06, + "loss": 0.1985, + "step": 1684 + }, + { + "epoch": 0.7863805970149254, + "grad_norm": 1.199511283191625, + "learning_rate": 9.221406944068565e-06, + "loss": 0.1848, + "step": 1686 + }, + { + "epoch": 0.7873134328358209, + "grad_norm": 1.37703204084647, + "learning_rate": 9.218495715322744e-06, + "loss": 0.178, + "step": 1688 + }, + { + "epoch": 0.7882462686567164, + "grad_norm": 1.2290476289857377, + "learning_rate": 9.215579515388076e-06, + "loss": 0.1943, + "step": 1690 + }, + { + "epoch": 0.789179104477612, + "grad_norm": 1.1614568458128005, + "learning_rate": 9.212658347701091e-06, + "loss": 0.1774, + "step": 1692 + }, + { + "epoch": 0.7901119402985075, + "grad_norm": 1.3331907668724727, + "learning_rate": 9.20973221570417e-06, + "loss": 0.1844, + "step": 1694 + }, + { + "epoch": 0.7910447761194029, + "grad_norm": 1.5566470906966665, + "learning_rate": 9.206801122845547e-06, + "loss": 0.1801, + "step": 1696 + }, + { + "epoch": 0.7919776119402985, + "grad_norm": 1.1677212305334963, + "learning_rate": 9.203865072579298e-06, + "loss": 0.1805, + "step": 1698 + }, + { + "epoch": 0.792910447761194, + "grad_norm": 1.322229911642287, + "learning_rate": 9.200924068365348e-06, + "loss": 0.2056, + "step": 1700 + }, + { + "epoch": 0.7938432835820896, + "grad_norm": 1.3506144940700227, + "learning_rate": 9.197978113669452e-06, + "loss": 0.2177, + "step": 1702 + }, + { + "epoch": 0.7947761194029851, + "grad_norm": 1.326459607368318, + "learning_rate": 9.195027211963203e-06, + "loss": 0.209, + "step": 1704 + }, + { + "epoch": 0.7957089552238806, + "grad_norm": 1.2530860336286305, + "learning_rate": 9.192071366724024e-06, + "loss": 0.1893, + "step": 1706 + }, + { + "epoch": 0.7966417910447762, + "grad_norm": 1.2392387668165896, + "learning_rate": 9.189110581435164e-06, + "loss": 0.1777, + "step": 1708 + }, + { + "epoch": 0.7975746268656716, + "grad_norm": 1.1855817934355852, + "learning_rate": 9.186144859585686e-06, + "loss": 0.1905, + "step": 1710 + }, + { + "epoch": 0.7985074626865671, + "grad_norm": 1.4964445044086616, + "learning_rate": 9.183174204670483e-06, + "loss": 0.2097, + "step": 1712 + }, + { + "epoch": 0.7994402985074627, + "grad_norm": 1.1862211193269043, + "learning_rate": 9.18019862019025e-06, + "loss": 0.1754, + "step": 1714 + }, + { + "epoch": 0.8003731343283582, + "grad_norm": 1.1506340547267884, + "learning_rate": 9.1772181096515e-06, + "loss": 0.1665, + "step": 1716 + }, + { + "epoch": 0.8013059701492538, + "grad_norm": 1.3124081769202471, + "learning_rate": 9.174232676566544e-06, + "loss": 0.2067, + "step": 1718 + }, + { + "epoch": 0.8022388059701493, + "grad_norm": 1.2628198489069242, + "learning_rate": 9.171242324453498e-06, + "loss": 0.174, + "step": 1720 + }, + { + "epoch": 0.8031716417910447, + "grad_norm": 1.284276587995049, + "learning_rate": 9.16824705683627e-06, + "loss": 0.1996, + "step": 1722 + }, + { + "epoch": 0.8041044776119403, + "grad_norm": 1.364415486198836, + "learning_rate": 9.165246877244569e-06, + "loss": 0.172, + "step": 1724 + }, + { + "epoch": 0.8050373134328358, + "grad_norm": 1.4860780735334287, + "learning_rate": 9.162241789213884e-06, + "loss": 0.1906, + "step": 1726 + }, + { + "epoch": 0.8059701492537313, + "grad_norm": 1.338318032226887, + "learning_rate": 9.159231796285494e-06, + "loss": 0.2016, + "step": 1728 + }, + { + "epoch": 0.8069029850746269, + "grad_norm": 1.4399260318313443, + "learning_rate": 9.156216902006452e-06, + "loss": 0.179, + "step": 1730 + }, + { + "epoch": 0.8078358208955224, + "grad_norm": 1.1486949764480001, + "learning_rate": 9.153197109929595e-06, + "loss": 0.1564, + "step": 1732 + }, + { + "epoch": 0.808768656716418, + "grad_norm": 1.2549897111890425, + "learning_rate": 9.150172423613524e-06, + "loss": 0.1774, + "step": 1734 + }, + { + "epoch": 0.8097014925373134, + "grad_norm": 1.408045646649574, + "learning_rate": 9.147142846622611e-06, + "loss": 0.1998, + "step": 1736 + }, + { + "epoch": 0.8106343283582089, + "grad_norm": 1.214919294688259, + "learning_rate": 9.144108382526992e-06, + "loss": 0.1749, + "step": 1738 + }, + { + "epoch": 0.8115671641791045, + "grad_norm": 1.2346241992467217, + "learning_rate": 9.141069034902563e-06, + "loss": 0.2004, + "step": 1740 + }, + { + "epoch": 0.8125, + "grad_norm": 1.058175981214563, + "learning_rate": 9.13802480733097e-06, + "loss": 0.1525, + "step": 1742 + }, + { + "epoch": 0.8134328358208955, + "grad_norm": 1.276358048373882, + "learning_rate": 9.134975703399612e-06, + "loss": 0.1803, + "step": 1744 + }, + { + "epoch": 0.8143656716417911, + "grad_norm": 1.2198555824626236, + "learning_rate": 9.131921726701636e-06, + "loss": 0.1898, + "step": 1746 + }, + { + "epoch": 0.8152985074626866, + "grad_norm": 1.501693819108629, + "learning_rate": 9.128862880835934e-06, + "loss": 0.177, + "step": 1748 + }, + { + "epoch": 0.816231343283582, + "grad_norm": 1.2048950586310152, + "learning_rate": 9.125799169407129e-06, + "loss": 0.1835, + "step": 1750 + }, + { + "epoch": 0.8171641791044776, + "grad_norm": 1.4797708319961769, + "learning_rate": 9.122730596025579e-06, + "loss": 0.186, + "step": 1752 + }, + { + "epoch": 0.8180970149253731, + "grad_norm": 1.1569485141394653, + "learning_rate": 9.119657164307376e-06, + "loss": 0.17, + "step": 1754 + }, + { + "epoch": 0.8190298507462687, + "grad_norm": 1.4888950199609203, + "learning_rate": 9.116578877874335e-06, + "loss": 0.1906, + "step": 1756 + }, + { + "epoch": 0.8199626865671642, + "grad_norm": 1.3036336679974885, + "learning_rate": 9.11349574035399e-06, + "loss": 0.187, + "step": 1758 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 1.2686515314533011, + "learning_rate": 9.110407755379596e-06, + "loss": 0.1958, + "step": 1760 + }, + { + "epoch": 0.8218283582089553, + "grad_norm": 1.6421245532393385, + "learning_rate": 9.107314926590114e-06, + "loss": 0.169, + "step": 1762 + }, + { + "epoch": 0.8227611940298507, + "grad_norm": 1.1615706530510448, + "learning_rate": 9.104217257630219e-06, + "loss": 0.1807, + "step": 1764 + }, + { + "epoch": 0.8236940298507462, + "grad_norm": 1.26218124385913, + "learning_rate": 9.101114752150287e-06, + "loss": 0.1843, + "step": 1766 + }, + { + "epoch": 0.8246268656716418, + "grad_norm": 1.250901164260205, + "learning_rate": 9.098007413806392e-06, + "loss": 0.1791, + "step": 1768 + }, + { + "epoch": 0.8255597014925373, + "grad_norm": 1.0727777935748994, + "learning_rate": 9.094895246260307e-06, + "loss": 0.1776, + "step": 1770 + }, + { + "epoch": 0.8264925373134329, + "grad_norm": 1.259600792241708, + "learning_rate": 9.091778253179494e-06, + "loss": 0.1918, + "step": 1772 + }, + { + "epoch": 0.8274253731343284, + "grad_norm": 1.3518465115589136, + "learning_rate": 9.088656438237103e-06, + "loss": 0.1877, + "step": 1774 + }, + { + "epoch": 0.8283582089552238, + "grad_norm": 1.2552564556451946, + "learning_rate": 9.085529805111961e-06, + "loss": 0.1805, + "step": 1776 + }, + { + "epoch": 0.8292910447761194, + "grad_norm": 1.1862224492667095, + "learning_rate": 9.082398357488579e-06, + "loss": 0.1753, + "step": 1778 + }, + { + "epoch": 0.8302238805970149, + "grad_norm": 1.1687326291642723, + "learning_rate": 9.07926209905714e-06, + "loss": 0.192, + "step": 1780 + }, + { + "epoch": 0.8311567164179104, + "grad_norm": 1.0957503874315015, + "learning_rate": 9.076121033513492e-06, + "loss": 0.1916, + "step": 1782 + }, + { + "epoch": 0.832089552238806, + "grad_norm": 1.238953371691249, + "learning_rate": 9.072975164559155e-06, + "loss": 0.188, + "step": 1784 + }, + { + "epoch": 0.8330223880597015, + "grad_norm": 1.4933361492925055, + "learning_rate": 9.0698244959013e-06, + "loss": 0.1824, + "step": 1786 + }, + { + "epoch": 0.8339552238805971, + "grad_norm": 1.0855474149879474, + "learning_rate": 9.066669031252767e-06, + "loss": 0.1738, + "step": 1788 + }, + { + "epoch": 0.8348880597014925, + "grad_norm": 1.2649611847849547, + "learning_rate": 9.063508774332036e-06, + "loss": 0.1726, + "step": 1790 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 1.3122381326355543, + "learning_rate": 9.06034372886324e-06, + "loss": 0.1911, + "step": 1792 + }, + { + "epoch": 0.8367537313432836, + "grad_norm": 1.2698887757755766, + "learning_rate": 9.057173898576152e-06, + "loss": 0.1929, + "step": 1794 + }, + { + "epoch": 0.8376865671641791, + "grad_norm": 1.2910438945516445, + "learning_rate": 9.053999287206188e-06, + "loss": 0.187, + "step": 1796 + }, + { + "epoch": 0.8386194029850746, + "grad_norm": 1.2712881618613445, + "learning_rate": 9.050819898494393e-06, + "loss": 0.2031, + "step": 1798 + }, + { + "epoch": 0.8395522388059702, + "grad_norm": 1.2219658687786867, + "learning_rate": 9.047635736187446e-06, + "loss": 0.1959, + "step": 1800 + }, + { + "epoch": 0.8404850746268657, + "grad_norm": 1.1457094612347019, + "learning_rate": 9.04444680403765e-06, + "loss": 0.1767, + "step": 1802 + }, + { + "epoch": 0.8414179104477612, + "grad_norm": 1.344224861118119, + "learning_rate": 9.041253105802927e-06, + "loss": 0.2012, + "step": 1804 + }, + { + "epoch": 0.8423507462686567, + "grad_norm": 1.153726215152984, + "learning_rate": 9.038054645246816e-06, + "loss": 0.2033, + "step": 1806 + }, + { + "epoch": 0.8432835820895522, + "grad_norm": 2.172858885456257, + "learning_rate": 9.03485142613847e-06, + "loss": 0.2036, + "step": 1808 + }, + { + "epoch": 0.8442164179104478, + "grad_norm": 1.2968213763126533, + "learning_rate": 9.03164345225265e-06, + "loss": 0.1879, + "step": 1810 + }, + { + "epoch": 0.8451492537313433, + "grad_norm": 1.3833444029052697, + "learning_rate": 9.028430727369716e-06, + "loss": 0.1927, + "step": 1812 + }, + { + "epoch": 0.8460820895522388, + "grad_norm": 1.3267440671572195, + "learning_rate": 9.025213255275634e-06, + "loss": 0.1997, + "step": 1814 + }, + { + "epoch": 0.8470149253731343, + "grad_norm": 1.3234776781784925, + "learning_rate": 9.021991039761952e-06, + "loss": 0.1923, + "step": 1816 + }, + { + "epoch": 0.8479477611940298, + "grad_norm": 1.1345481672620257, + "learning_rate": 9.018764084625824e-06, + "loss": 0.1833, + "step": 1818 + }, + { + "epoch": 0.8488805970149254, + "grad_norm": 1.2275801230504422, + "learning_rate": 9.015532393669975e-06, + "loss": 0.2184, + "step": 1820 + }, + { + "epoch": 0.8498134328358209, + "grad_norm": 1.1418718541841877, + "learning_rate": 9.012295970702719e-06, + "loss": 0.166, + "step": 1822 + }, + { + "epoch": 0.8507462686567164, + "grad_norm": 1.1257828058705048, + "learning_rate": 9.009054819537943e-06, + "loss": 0.1925, + "step": 1824 + }, + { + "epoch": 0.851679104477612, + "grad_norm": 1.2497935091095564, + "learning_rate": 9.005808943995107e-06, + "loss": 0.2053, + "step": 1826 + }, + { + "epoch": 0.8526119402985075, + "grad_norm": 1.2997044089186343, + "learning_rate": 9.002558347899238e-06, + "loss": 0.1858, + "step": 1828 + }, + { + "epoch": 0.8535447761194029, + "grad_norm": 1.22478317982349, + "learning_rate": 8.999303035080927e-06, + "loss": 0.1855, + "step": 1830 + }, + { + "epoch": 0.8544776119402985, + "grad_norm": 1.217770720466568, + "learning_rate": 8.99604300937632e-06, + "loss": 0.1825, + "step": 1832 + }, + { + "epoch": 0.855410447761194, + "grad_norm": 1.3002126450464224, + "learning_rate": 8.99277827462712e-06, + "loss": 0.1961, + "step": 1834 + }, + { + "epoch": 0.8563432835820896, + "grad_norm": 1.4383914865756529, + "learning_rate": 8.98950883468058e-06, + "loss": 0.1946, + "step": 1836 + }, + { + "epoch": 0.8572761194029851, + "grad_norm": 1.1353579224278034, + "learning_rate": 8.986234693389492e-06, + "loss": 0.1718, + "step": 1838 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 1.1590788686455826, + "learning_rate": 8.982955854612197e-06, + "loss": 0.1807, + "step": 1840 + }, + { + "epoch": 0.8591417910447762, + "grad_norm": 1.19161317404919, + "learning_rate": 8.979672322212565e-06, + "loss": 0.1846, + "step": 1842 + }, + { + "epoch": 0.8600746268656716, + "grad_norm": 1.230929479032492, + "learning_rate": 8.976384100059996e-06, + "loss": 0.1839, + "step": 1844 + }, + { + "epoch": 0.8610074626865671, + "grad_norm": 1.2273399420318605, + "learning_rate": 8.973091192029424e-06, + "loss": 0.1703, + "step": 1846 + }, + { + "epoch": 0.8619402985074627, + "grad_norm": 1.2623762400287248, + "learning_rate": 8.969793602001295e-06, + "loss": 0.1928, + "step": 1848 + }, + { + "epoch": 0.8628731343283582, + "grad_norm": 1.2114189437578842, + "learning_rate": 8.966491333861585e-06, + "loss": 0.2019, + "step": 1850 + }, + { + "epoch": 0.8638059701492538, + "grad_norm": 1.234365487073312, + "learning_rate": 8.96318439150177e-06, + "loss": 0.2083, + "step": 1852 + }, + { + "epoch": 0.8647388059701493, + "grad_norm": 1.1268023588165434, + "learning_rate": 8.959872778818842e-06, + "loss": 0.1802, + "step": 1854 + }, + { + "epoch": 0.8656716417910447, + "grad_norm": 1.1047768811521825, + "learning_rate": 8.956556499715293e-06, + "loss": 0.1556, + "step": 1856 + }, + { + "epoch": 0.8666044776119403, + "grad_norm": 1.2853101919352707, + "learning_rate": 8.953235558099116e-06, + "loss": 0.1996, + "step": 1858 + }, + { + "epoch": 0.8675373134328358, + "grad_norm": 1.0491374239410929, + "learning_rate": 8.9499099578838e-06, + "loss": 0.1678, + "step": 1860 + }, + { + "epoch": 0.8684701492537313, + "grad_norm": 1.1943500524679493, + "learning_rate": 8.94657970298832e-06, + "loss": 0.1725, + "step": 1862 + }, + { + "epoch": 0.8694029850746269, + "grad_norm": 1.2729108977994532, + "learning_rate": 8.943244797337138e-06, + "loss": 0.1957, + "step": 1864 + }, + { + "epoch": 0.8703358208955224, + "grad_norm": 1.4708930606205595, + "learning_rate": 8.939905244860197e-06, + "loss": 0.1969, + "step": 1866 + }, + { + "epoch": 0.871268656716418, + "grad_norm": 1.2054236478211537, + "learning_rate": 8.936561049492913e-06, + "loss": 0.2079, + "step": 1868 + }, + { + "epoch": 0.8722014925373134, + "grad_norm": 1.158524430953715, + "learning_rate": 8.933212215176181e-06, + "loss": 0.1746, + "step": 1870 + }, + { + "epoch": 0.8731343283582089, + "grad_norm": 1.1455885414104536, + "learning_rate": 8.929858745856353e-06, + "loss": 0.182, + "step": 1872 + }, + { + "epoch": 0.8740671641791045, + "grad_norm": 1.3397135787222707, + "learning_rate": 8.92650064548525e-06, + "loss": 0.189, + "step": 1874 + }, + { + "epoch": 0.875, + "grad_norm": 1.1191318015925886, + "learning_rate": 8.923137918020147e-06, + "loss": 0.1633, + "step": 1876 + }, + { + "epoch": 0.8759328358208955, + "grad_norm": 1.2274352089475598, + "learning_rate": 8.919770567423772e-06, + "loss": 0.1844, + "step": 1878 + }, + { + "epoch": 0.8768656716417911, + "grad_norm": 1.2472104437638762, + "learning_rate": 8.916398597664299e-06, + "loss": 0.1922, + "step": 1880 + }, + { + "epoch": 0.8777985074626866, + "grad_norm": 1.286605431682373, + "learning_rate": 8.913022012715355e-06, + "loss": 0.168, + "step": 1882 + }, + { + "epoch": 0.878731343283582, + "grad_norm": 1.3154722505682137, + "learning_rate": 8.909640816555992e-06, + "loss": 0.1866, + "step": 1884 + }, + { + "epoch": 0.8796641791044776, + "grad_norm": 1.223951896171603, + "learning_rate": 8.906255013170707e-06, + "loss": 0.1913, + "step": 1886 + }, + { + "epoch": 0.8805970149253731, + "grad_norm": 1.1693079070240726, + "learning_rate": 8.902864606549417e-06, + "loss": 0.1767, + "step": 1888 + }, + { + "epoch": 0.8815298507462687, + "grad_norm": 1.2568555877047018, + "learning_rate": 8.899469600687472e-06, + "loss": 0.1875, + "step": 1890 + }, + { + "epoch": 0.8824626865671642, + "grad_norm": 1.248160786151591, + "learning_rate": 8.896069999585636e-06, + "loss": 0.2135, + "step": 1892 + }, + { + "epoch": 0.8833955223880597, + "grad_norm": 1.1431255675657805, + "learning_rate": 8.892665807250093e-06, + "loss": 0.186, + "step": 1894 + }, + { + "epoch": 0.8843283582089553, + "grad_norm": 1.1294114890786784, + "learning_rate": 8.889257027692433e-06, + "loss": 0.1843, + "step": 1896 + }, + { + "epoch": 0.8852611940298507, + "grad_norm": 1.1933808037745908, + "learning_rate": 8.885843664929654e-06, + "loss": 0.1767, + "step": 1898 + }, + { + "epoch": 0.8861940298507462, + "grad_norm": 1.2792703435555342, + "learning_rate": 8.882425722984156e-06, + "loss": 0.1991, + "step": 1900 + }, + { + "epoch": 0.8871268656716418, + "grad_norm": 1.208330774950831, + "learning_rate": 8.879003205883729e-06, + "loss": 0.1798, + "step": 1902 + }, + { + "epoch": 0.8880597014925373, + "grad_norm": 1.2296659816391375, + "learning_rate": 8.875576117661565e-06, + "loss": 0.1755, + "step": 1904 + }, + { + "epoch": 0.8889925373134329, + "grad_norm": 1.240600067039028, + "learning_rate": 8.872144462356234e-06, + "loss": 0.1863, + "step": 1906 + }, + { + "epoch": 0.8899253731343284, + "grad_norm": 1.0705498227682964, + "learning_rate": 8.868708244011692e-06, + "loss": 0.154, + "step": 1908 + }, + { + "epoch": 0.8908582089552238, + "grad_norm": 1.2268381212547212, + "learning_rate": 8.86526746667727e-06, + "loss": 0.1687, + "step": 1910 + }, + { + "epoch": 0.8917910447761194, + "grad_norm": 1.1337934021380822, + "learning_rate": 8.861822134407671e-06, + "loss": 0.1803, + "step": 1912 + }, + { + "epoch": 0.8927238805970149, + "grad_norm": 1.195600681526038, + "learning_rate": 8.858372251262972e-06, + "loss": 0.1648, + "step": 1914 + }, + { + "epoch": 0.8936567164179104, + "grad_norm": 1.2273955675181596, + "learning_rate": 8.854917821308606e-06, + "loss": 0.1572, + "step": 1916 + }, + { + "epoch": 0.894589552238806, + "grad_norm": 1.17826861051738, + "learning_rate": 8.851458848615364e-06, + "loss": 0.1839, + "step": 1918 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 1.122509041686198, + "learning_rate": 8.847995337259394e-06, + "loss": 0.1697, + "step": 1920 + }, + { + "epoch": 0.8964552238805971, + "grad_norm": 1.257363825091277, + "learning_rate": 8.844527291322192e-06, + "loss": 0.178, + "step": 1922 + }, + { + "epoch": 0.8973880597014925, + "grad_norm": 1.3004284149438885, + "learning_rate": 8.841054714890596e-06, + "loss": 0.1768, + "step": 1924 + }, + { + "epoch": 0.898320895522388, + "grad_norm": 1.1392906799925309, + "learning_rate": 8.837577612056782e-06, + "loss": 0.1729, + "step": 1926 + }, + { + "epoch": 0.8992537313432836, + "grad_norm": 1.080460792775906, + "learning_rate": 8.834095986918265e-06, + "loss": 0.1603, + "step": 1928 + }, + { + "epoch": 0.9001865671641791, + "grad_norm": 1.120023878209664, + "learning_rate": 8.830609843577882e-06, + "loss": 0.1685, + "step": 1930 + }, + { + "epoch": 0.9011194029850746, + "grad_norm": 1.110558451361426, + "learning_rate": 8.8271191861438e-06, + "loss": 0.1509, + "step": 1932 + }, + { + "epoch": 0.9020522388059702, + "grad_norm": 1.2917406041428532, + "learning_rate": 8.823624018729503e-06, + "loss": 0.1895, + "step": 1934 + }, + { + "epoch": 0.9029850746268657, + "grad_norm": 1.230968075446837, + "learning_rate": 8.820124345453791e-06, + "loss": 0.1789, + "step": 1936 + }, + { + "epoch": 0.9039179104477612, + "grad_norm": 1.367855006225693, + "learning_rate": 8.816620170440774e-06, + "loss": 0.1989, + "step": 1938 + }, + { + "epoch": 0.9048507462686567, + "grad_norm": 1.0454936621086743, + "learning_rate": 8.813111497819861e-06, + "loss": 0.1873, + "step": 1940 + }, + { + "epoch": 0.9057835820895522, + "grad_norm": 1.3314003474956535, + "learning_rate": 8.809598331725772e-06, + "loss": 0.167, + "step": 1942 + }, + { + "epoch": 0.9067164179104478, + "grad_norm": 1.2678674135148693, + "learning_rate": 8.806080676298516e-06, + "loss": 0.1854, + "step": 1944 + }, + { + "epoch": 0.9076492537313433, + "grad_norm": 1.2768172203081714, + "learning_rate": 8.80255853568339e-06, + "loss": 0.1908, + "step": 1946 + }, + { + "epoch": 0.9085820895522388, + "grad_norm": 1.1818274374168733, + "learning_rate": 8.79903191403098e-06, + "loss": 0.1622, + "step": 1948 + }, + { + "epoch": 0.9095149253731343, + "grad_norm": 1.2225620547007063, + "learning_rate": 8.795500815497154e-06, + "loss": 0.2031, + "step": 1950 + }, + { + "epoch": 0.9104477611940298, + "grad_norm": 1.1257924983539633, + "learning_rate": 8.79196524424305e-06, + "loss": 0.1653, + "step": 1952 + }, + { + "epoch": 0.9113805970149254, + "grad_norm": 1.1028340032016384, + "learning_rate": 8.788425204435082e-06, + "loss": 0.1701, + "step": 1954 + }, + { + "epoch": 0.9123134328358209, + "grad_norm": 1.1342103768564358, + "learning_rate": 8.78488070024493e-06, + "loss": 0.1692, + "step": 1956 + }, + { + "epoch": 0.9132462686567164, + "grad_norm": 1.1866863060409858, + "learning_rate": 8.781331735849532e-06, + "loss": 0.1956, + "step": 1958 + }, + { + "epoch": 0.914179104477612, + "grad_norm": 1.2104677377772022, + "learning_rate": 8.77777831543108e-06, + "loss": 0.1748, + "step": 1960 + }, + { + "epoch": 0.9151119402985075, + "grad_norm": 1.2750638621133845, + "learning_rate": 8.774220443177024e-06, + "loss": 0.1852, + "step": 1962 + }, + { + "epoch": 0.9160447761194029, + "grad_norm": 1.2456050181530693, + "learning_rate": 8.770658123280056e-06, + "loss": 0.2078, + "step": 1964 + }, + { + "epoch": 0.9169776119402985, + "grad_norm": 1.1513211751202366, + "learning_rate": 8.76709135993811e-06, + "loss": 0.1623, + "step": 1966 + }, + { + "epoch": 0.917910447761194, + "grad_norm": 1.214606556576558, + "learning_rate": 8.763520157354352e-06, + "loss": 0.1907, + "step": 1968 + }, + { + "epoch": 0.9188432835820896, + "grad_norm": 1.2558675406655826, + "learning_rate": 8.759944519737186e-06, + "loss": 0.1706, + "step": 1970 + }, + { + "epoch": 0.9197761194029851, + "grad_norm": 1.2009512916282046, + "learning_rate": 8.756364451300241e-06, + "loss": 0.1796, + "step": 1972 + }, + { + "epoch": 0.9207089552238806, + "grad_norm": 1.2209617116507063, + "learning_rate": 8.752779956262363e-06, + "loss": 0.2001, + "step": 1974 + }, + { + "epoch": 0.9216417910447762, + "grad_norm": 1.1823752723455987, + "learning_rate": 8.749191038847619e-06, + "loss": 0.1969, + "step": 1976 + }, + { + "epoch": 0.9225746268656716, + "grad_norm": 1.3125507510378662, + "learning_rate": 8.745597703285286e-06, + "loss": 0.1952, + "step": 1978 + }, + { + "epoch": 0.9235074626865671, + "grad_norm": 1.1560177166373937, + "learning_rate": 8.741999953809847e-06, + "loss": 0.167, + "step": 1980 + }, + { + "epoch": 0.9244402985074627, + "grad_norm": 1.3829461046588156, + "learning_rate": 8.738397794660986e-06, + "loss": 0.1846, + "step": 1982 + }, + { + "epoch": 0.9253731343283582, + "grad_norm": 1.3775977676280338, + "learning_rate": 8.734791230083586e-06, + "loss": 0.1827, + "step": 1984 + }, + { + "epoch": 0.9263059701492538, + "grad_norm": 1.1430102314378263, + "learning_rate": 8.73118026432772e-06, + "loss": 0.1956, + "step": 1986 + }, + { + "epoch": 0.9272388059701493, + "grad_norm": 1.3056522149119958, + "learning_rate": 8.727564901648645e-06, + "loss": 0.1953, + "step": 1988 + }, + { + "epoch": 0.9281716417910447, + "grad_norm": 1.3386146049280576, + "learning_rate": 8.723945146306801e-06, + "loss": 0.1956, + "step": 1990 + }, + { + "epoch": 0.9291044776119403, + "grad_norm": 1.2677928673430818, + "learning_rate": 8.720321002567807e-06, + "loss": 0.1858, + "step": 1992 + }, + { + "epoch": 0.9300373134328358, + "grad_norm": 1.0720634605057584, + "learning_rate": 8.71669247470245e-06, + "loss": 0.1558, + "step": 1994 + }, + { + "epoch": 0.9309701492537313, + "grad_norm": 1.1807522950979514, + "learning_rate": 8.71305956698669e-06, + "loss": 0.1576, + "step": 1996 + }, + { + "epoch": 0.9319029850746269, + "grad_norm": 1.094363383366357, + "learning_rate": 8.709422283701634e-06, + "loss": 0.1784, + "step": 1998 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 1.1430589260003636, + "learning_rate": 8.705780629133565e-06, + "loss": 0.1812, + "step": 2000 + }, + { + "epoch": 0.9328358208955224, + "eval_loss": 0.16347220540046692, + "eval_runtime": 322.8513, + "eval_samples_per_second": 47.223, + "eval_steps_per_second": 5.904, + "step": 2000 + }, + { + "epoch": 0.933768656716418, + "grad_norm": 1.3181785803134747, + "learning_rate": 8.702134607573898e-06, + "loss": 0.2088, + "step": 2002 + }, + { + "epoch": 0.9347014925373134, + "grad_norm": 1.114139466989517, + "learning_rate": 8.698484223319206e-06, + "loss": 0.1651, + "step": 2004 + }, + { + "epoch": 0.9356343283582089, + "grad_norm": 1.2600477290467267, + "learning_rate": 8.694829480671202e-06, + "loss": 0.1851, + "step": 2006 + }, + { + "epoch": 0.9365671641791045, + "grad_norm": 1.2547571579332766, + "learning_rate": 8.69117038393673e-06, + "loss": 0.194, + "step": 2008 + }, + { + "epoch": 0.9375, + "grad_norm": 1.2823745810043528, + "learning_rate": 8.68750693742777e-06, + "loss": 0.1666, + "step": 2010 + }, + { + "epoch": 0.9384328358208955, + "grad_norm": 1.2448797285533804, + "learning_rate": 8.683839145461425e-06, + "loss": 0.1928, + "step": 2012 + }, + { + "epoch": 0.9393656716417911, + "grad_norm": 1.273332521268346, + "learning_rate": 8.680167012359922e-06, + "loss": 0.2133, + "step": 2014 + }, + { + "epoch": 0.9402985074626866, + "grad_norm": 1.4296756949185516, + "learning_rate": 8.676490542450597e-06, + "loss": 0.1799, + "step": 2016 + }, + { + "epoch": 0.941231343283582, + "grad_norm": 1.080809866903534, + "learning_rate": 8.672809740065904e-06, + "loss": 0.1853, + "step": 2018 + }, + { + "epoch": 0.9421641791044776, + "grad_norm": 1.1436099260958916, + "learning_rate": 8.6691246095434e-06, + "loss": 0.1734, + "step": 2020 + }, + { + "epoch": 0.9430970149253731, + "grad_norm": 1.3313725691681668, + "learning_rate": 8.665435155225741e-06, + "loss": 0.1813, + "step": 2022 + }, + { + "epoch": 0.9440298507462687, + "grad_norm": 1.3612550524272353, + "learning_rate": 8.661741381460677e-06, + "loss": 0.2019, + "step": 2024 + }, + { + "epoch": 0.9449626865671642, + "grad_norm": 1.1994714663237427, + "learning_rate": 8.658043292601055e-06, + "loss": 0.1926, + "step": 2026 + }, + { + "epoch": 0.9458955223880597, + "grad_norm": 1.245193221917958, + "learning_rate": 8.6543408930048e-06, + "loss": 0.1758, + "step": 2028 + }, + { + "epoch": 0.9468283582089553, + "grad_norm": 1.1664629995318132, + "learning_rate": 8.650634187034918e-06, + "loss": 0.1662, + "step": 2030 + }, + { + "epoch": 0.9477611940298507, + "grad_norm": 1.3327368080660713, + "learning_rate": 8.646923179059494e-06, + "loss": 0.2028, + "step": 2032 + }, + { + "epoch": 0.9486940298507462, + "grad_norm": 1.1966686306458933, + "learning_rate": 8.643207873451678e-06, + "loss": 0.1791, + "step": 2034 + }, + { + "epoch": 0.9496268656716418, + "grad_norm": 1.2009028963320556, + "learning_rate": 8.639488274589685e-06, + "loss": 0.1904, + "step": 2036 + }, + { + "epoch": 0.9505597014925373, + "grad_norm": 1.0929753841351104, + "learning_rate": 8.635764386856794e-06, + "loss": 0.1712, + "step": 2038 + }, + { + "epoch": 0.9514925373134329, + "grad_norm": 1.1409877331710687, + "learning_rate": 8.632036214641328e-06, + "loss": 0.1917, + "step": 2040 + }, + { + "epoch": 0.9524253731343284, + "grad_norm": 1.0857445185367467, + "learning_rate": 8.628303762336671e-06, + "loss": 0.1781, + "step": 2042 + }, + { + "epoch": 0.9533582089552238, + "grad_norm": 1.2711429588160323, + "learning_rate": 8.624567034341245e-06, + "loss": 0.1763, + "step": 2044 + }, + { + "epoch": 0.9542910447761194, + "grad_norm": 1.411093941004422, + "learning_rate": 8.620826035058509e-06, + "loss": 0.1752, + "step": 2046 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 6.390236743573979, + "learning_rate": 8.617080768896958e-06, + "loss": 0.1729, + "step": 2048 + }, + { + "epoch": 0.9561567164179104, + "grad_norm": 3.544159567320888, + "learning_rate": 8.613331240270114e-06, + "loss": 0.1703, + "step": 2050 + }, + { + "epoch": 0.957089552238806, + "grad_norm": 1.442466623942355, + "learning_rate": 8.609577453596521e-06, + "loss": 0.1795, + "step": 2052 + }, + { + "epoch": 0.9580223880597015, + "grad_norm": 0.9843433823848003, + "learning_rate": 8.605819413299744e-06, + "loss": 0.1608, + "step": 2054 + }, + { + "epoch": 0.9589552238805971, + "grad_norm": 1.110051693804096, + "learning_rate": 8.602057123808359e-06, + "loss": 0.1827, + "step": 2056 + }, + { + "epoch": 0.9598880597014925, + "grad_norm": 1.1433136186836552, + "learning_rate": 8.59829058955595e-06, + "loss": 0.1734, + "step": 2058 + }, + { + "epoch": 0.960820895522388, + "grad_norm": 1.2138425480613468, + "learning_rate": 8.594519814981098e-06, + "loss": 0.1741, + "step": 2060 + }, + { + "epoch": 0.9617537313432836, + "grad_norm": 1.157195368563672, + "learning_rate": 8.590744804527388e-06, + "loss": 0.1718, + "step": 2062 + }, + { + "epoch": 0.9626865671641791, + "grad_norm": 1.4532329436175, + "learning_rate": 8.586965562643397e-06, + "loss": 0.168, + "step": 2064 + }, + { + "epoch": 0.9636194029850746, + "grad_norm": 1.0872687701407575, + "learning_rate": 8.583182093782682e-06, + "loss": 0.1721, + "step": 2066 + }, + { + "epoch": 0.9645522388059702, + "grad_norm": 1.3160180813549756, + "learning_rate": 8.579394402403784e-06, + "loss": 0.1656, + "step": 2068 + }, + { + "epoch": 0.9654850746268657, + "grad_norm": 1.1101848042615787, + "learning_rate": 8.575602492970221e-06, + "loss": 0.1738, + "step": 2070 + }, + { + "epoch": 0.9664179104477612, + "grad_norm": 1.193049436941456, + "learning_rate": 8.571806369950482e-06, + "loss": 0.1746, + "step": 2072 + }, + { + "epoch": 0.9673507462686567, + "grad_norm": 1.1249109855183856, + "learning_rate": 8.56800603781802e-06, + "loss": 0.173, + "step": 2074 + }, + { + "epoch": 0.9682835820895522, + "grad_norm": 1.185244259646379, + "learning_rate": 8.564201501051247e-06, + "loss": 0.1695, + "step": 2076 + }, + { + "epoch": 0.9692164179104478, + "grad_norm": 1.1947301263077779, + "learning_rate": 8.560392764133535e-06, + "loss": 0.1921, + "step": 2078 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 1.3941382122868435, + "learning_rate": 8.556579831553198e-06, + "loss": 0.1733, + "step": 2080 + }, + { + "epoch": 0.9710820895522388, + "grad_norm": 1.2007817139353856, + "learning_rate": 8.5527627078035e-06, + "loss": 0.1823, + "step": 2082 + }, + { + "epoch": 0.9720149253731343, + "grad_norm": 1.1294016590691875, + "learning_rate": 8.548941397382647e-06, + "loss": 0.1669, + "step": 2084 + }, + { + "epoch": 0.9729477611940298, + "grad_norm": 1.2388902899558032, + "learning_rate": 8.545115904793765e-06, + "loss": 0.1945, + "step": 2086 + }, + { + "epoch": 0.9738805970149254, + "grad_norm": 1.2592255783170496, + "learning_rate": 8.541286234544923e-06, + "loss": 0.1641, + "step": 2088 + }, + { + "epoch": 0.9748134328358209, + "grad_norm": 1.199692607254671, + "learning_rate": 8.537452391149108e-06, + "loss": 0.1849, + "step": 2090 + }, + { + "epoch": 0.9757462686567164, + "grad_norm": 1.0192323762345974, + "learning_rate": 8.533614379124221e-06, + "loss": 0.1571, + "step": 2092 + }, + { + "epoch": 0.976679104477612, + "grad_norm": 1.199340760780135, + "learning_rate": 8.529772202993083e-06, + "loss": 0.1641, + "step": 2094 + }, + { + "epoch": 0.9776119402985075, + "grad_norm": 1.2855292891688954, + "learning_rate": 8.525925867283414e-06, + "loss": 0.1956, + "step": 2096 + }, + { + "epoch": 0.9785447761194029, + "grad_norm": 1.1278401946322196, + "learning_rate": 8.52207537652784e-06, + "loss": 0.1789, + "step": 2098 + }, + { + "epoch": 0.9794776119402985, + "grad_norm": 1.161086795188338, + "learning_rate": 8.518220735263884e-06, + "loss": 0.1751, + "step": 2100 + }, + { + "epoch": 0.980410447761194, + "grad_norm": 1.1844957643011036, + "learning_rate": 8.514361948033958e-06, + "loss": 0.1889, + "step": 2102 + }, + { + "epoch": 0.9813432835820896, + "grad_norm": 1.1280350555176875, + "learning_rate": 8.510499019385362e-06, + "loss": 0.1935, + "step": 2104 + }, + { + "epoch": 0.9822761194029851, + "grad_norm": 1.2231857221982496, + "learning_rate": 8.506631953870272e-06, + "loss": 0.1785, + "step": 2106 + }, + { + "epoch": 0.9832089552238806, + "grad_norm": 1.1940335590482019, + "learning_rate": 8.502760756045747e-06, + "loss": 0.1988, + "step": 2108 + }, + { + "epoch": 0.9841417910447762, + "grad_norm": 1.0584137234671658, + "learning_rate": 8.498885430473707e-06, + "loss": 0.1618, + "step": 2110 + }, + { + "epoch": 0.9850746268656716, + "grad_norm": 1.070914953967085, + "learning_rate": 8.495005981720941e-06, + "loss": 0.189, + "step": 2112 + }, + { + "epoch": 0.9860074626865671, + "grad_norm": 1.1323344114228981, + "learning_rate": 8.491122414359095e-06, + "loss": 0.1833, + "step": 2114 + }, + { + "epoch": 0.9869402985074627, + "grad_norm": 1.13383803765909, + "learning_rate": 8.487234732964669e-06, + "loss": 0.162, + "step": 2116 + }, + { + "epoch": 0.9878731343283582, + "grad_norm": 1.11065834174197, + "learning_rate": 8.483342942119013e-06, + "loss": 0.1788, + "step": 2118 + }, + { + "epoch": 0.9888059701492538, + "grad_norm": 1.1897547761209817, + "learning_rate": 8.479447046408318e-06, + "loss": 0.1809, + "step": 2120 + }, + { + "epoch": 0.9897388059701493, + "grad_norm": 1.1087895452610104, + "learning_rate": 8.475547050423611e-06, + "loss": 0.1474, + "step": 2122 + }, + { + "epoch": 0.9906716417910447, + "grad_norm": 1.1557766829891358, + "learning_rate": 8.471642958760752e-06, + "loss": 0.1767, + "step": 2124 + }, + { + "epoch": 0.9916044776119403, + "grad_norm": 1.2341066515607721, + "learning_rate": 8.46773477602043e-06, + "loss": 0.1952, + "step": 2126 + }, + { + "epoch": 0.9925373134328358, + "grad_norm": 1.2539767202851184, + "learning_rate": 8.463822506808151e-06, + "loss": 0.1934, + "step": 2128 + }, + { + "epoch": 0.9934701492537313, + "grad_norm": 1.245565336178777, + "learning_rate": 8.45990615573424e-06, + "loss": 0.1792, + "step": 2130 + }, + { + "epoch": 0.9944029850746269, + "grad_norm": 1.1128968338688787, + "learning_rate": 8.455985727413825e-06, + "loss": 0.1809, + "step": 2132 + }, + { + "epoch": 0.9953358208955224, + "grad_norm": 1.0831619753015387, + "learning_rate": 8.45206122646685e-06, + "loss": 0.1783, + "step": 2134 + }, + { + "epoch": 0.996268656716418, + "grad_norm": 1.0810998238140512, + "learning_rate": 8.44813265751805e-06, + "loss": 0.1703, + "step": 2136 + }, + { + "epoch": 0.9972014925373134, + "grad_norm": 1.1242809365566895, + "learning_rate": 8.444200025196958e-06, + "loss": 0.1697, + "step": 2138 + }, + { + "epoch": 0.9981343283582089, + "grad_norm": 1.2553514854044407, + "learning_rate": 8.440263334137892e-06, + "loss": 0.1915, + "step": 2140 + }, + { + "epoch": 0.9990671641791045, + "grad_norm": 1.1214403124297583, + "learning_rate": 8.436322588979955e-06, + "loss": 0.172, + "step": 2142 + }, + { + "epoch": 1.0, + "grad_norm": 1.1407138244992432, + "learning_rate": 8.432377794367028e-06, + "loss": 0.1769, + "step": 2144 + }, + { + "epoch": 1.0009328358208955, + "grad_norm": 0.9459632198413176, + "learning_rate": 8.428428954947762e-06, + "loss": 0.1104, + "step": 2146 + }, + { + "epoch": 1.001865671641791, + "grad_norm": 0.9878390373694901, + "learning_rate": 8.424476075375578e-06, + "loss": 0.119, + "step": 2148 + }, + { + "epoch": 1.0027985074626866, + "grad_norm": 0.892096781938565, + "learning_rate": 8.420519160308651e-06, + "loss": 0.1056, + "step": 2150 + }, + { + "epoch": 1.0037313432835822, + "grad_norm": 2.202199497780502, + "learning_rate": 8.41655821440992e-06, + "loss": 0.1152, + "step": 2152 + }, + { + "epoch": 1.0046641791044777, + "grad_norm": 1.2163148100669832, + "learning_rate": 8.41259324234707e-06, + "loss": 0.1196, + "step": 2154 + }, + { + "epoch": 1.0055970149253732, + "grad_norm": 0.895501258908727, + "learning_rate": 8.40862424879253e-06, + "loss": 0.0988, + "step": 2156 + }, + { + "epoch": 1.0065298507462686, + "grad_norm": 1.1499497762307112, + "learning_rate": 8.40465123842347e-06, + "loss": 0.1125, + "step": 2158 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 1.132993408683864, + "learning_rate": 8.400674215921786e-06, + "loss": 0.1016, + "step": 2160 + }, + { + "epoch": 1.0083955223880596, + "grad_norm": 1.2184853847812682, + "learning_rate": 8.396693185974118e-06, + "loss": 0.1085, + "step": 2162 + }, + { + "epoch": 1.0093283582089552, + "grad_norm": 1.0410662413518412, + "learning_rate": 8.392708153271813e-06, + "loss": 0.1025, + "step": 2164 + }, + { + "epoch": 1.0102611940298507, + "grad_norm": 1.1724708193257103, + "learning_rate": 8.388719122510943e-06, + "loss": 0.1102, + "step": 2166 + }, + { + "epoch": 1.0111940298507462, + "grad_norm": 1.2169251119295637, + "learning_rate": 8.384726098392286e-06, + "loss": 0.1158, + "step": 2168 + }, + { + "epoch": 1.0121268656716418, + "grad_norm": 1.2153613709875557, + "learning_rate": 8.380729085621331e-06, + "loss": 0.128, + "step": 2170 + }, + { + "epoch": 1.0130597014925373, + "grad_norm": 1.052983482028831, + "learning_rate": 8.376728088908268e-06, + "loss": 0.1031, + "step": 2172 + }, + { + "epoch": 1.0139925373134329, + "grad_norm": 1.3234904654899597, + "learning_rate": 8.372723112967974e-06, + "loss": 0.1236, + "step": 2174 + }, + { + "epoch": 1.0149253731343284, + "grad_norm": 1.1117963691600927, + "learning_rate": 8.368714162520024e-06, + "loss": 0.1132, + "step": 2176 + }, + { + "epoch": 1.015858208955224, + "grad_norm": 1.0474400456234658, + "learning_rate": 8.364701242288673e-06, + "loss": 0.0981, + "step": 2178 + }, + { + "epoch": 1.0167910447761195, + "grad_norm": 1.0809304922906753, + "learning_rate": 8.360684357002853e-06, + "loss": 0.1068, + "step": 2180 + }, + { + "epoch": 1.017723880597015, + "grad_norm": 1.0628024771301163, + "learning_rate": 8.356663511396169e-06, + "loss": 0.1074, + "step": 2182 + }, + { + "epoch": 1.0186567164179103, + "grad_norm": 1.1334774517081387, + "learning_rate": 8.352638710206895e-06, + "loss": 0.0995, + "step": 2184 + }, + { + "epoch": 1.0195895522388059, + "grad_norm": 1.0908983028803378, + "learning_rate": 8.348609958177964e-06, + "loss": 0.1142, + "step": 2186 + }, + { + "epoch": 1.0205223880597014, + "grad_norm": 1.0708305141240686, + "learning_rate": 8.34457726005697e-06, + "loss": 0.0977, + "step": 2188 + }, + { + "epoch": 1.021455223880597, + "grad_norm": 1.2141354780980165, + "learning_rate": 8.340540620596145e-06, + "loss": 0.1178, + "step": 2190 + }, + { + "epoch": 1.0223880597014925, + "grad_norm": 1.1602233732700098, + "learning_rate": 8.33650004455238e-06, + "loss": 0.1125, + "step": 2192 + }, + { + "epoch": 1.023320895522388, + "grad_norm": 1.18008855873969, + "learning_rate": 8.332455536687196e-06, + "loss": 0.1167, + "step": 2194 + }, + { + "epoch": 1.0242537313432836, + "grad_norm": 0.9893201186088032, + "learning_rate": 8.328407101766752e-06, + "loss": 0.0977, + "step": 2196 + }, + { + "epoch": 1.025186567164179, + "grad_norm": 1.0727376140142242, + "learning_rate": 8.324354744561829e-06, + "loss": 0.1068, + "step": 2198 + }, + { + "epoch": 1.0261194029850746, + "grad_norm": 1.0449511698044345, + "learning_rate": 8.320298469847836e-06, + "loss": 0.0984, + "step": 2200 + }, + { + "epoch": 1.0270522388059702, + "grad_norm": 1.135678129051182, + "learning_rate": 8.316238282404795e-06, + "loss": 0.1099, + "step": 2202 + }, + { + "epoch": 1.0279850746268657, + "grad_norm": 1.0466436844213056, + "learning_rate": 8.312174187017343e-06, + "loss": 0.0996, + "step": 2204 + }, + { + "epoch": 1.0289179104477613, + "grad_norm": 0.987253100313577, + "learning_rate": 8.308106188474716e-06, + "loss": 0.0955, + "step": 2206 + }, + { + "epoch": 1.0298507462686568, + "grad_norm": 1.1928717958128783, + "learning_rate": 8.304034291570757e-06, + "loss": 0.1131, + "step": 2208 + }, + { + "epoch": 1.0307835820895523, + "grad_norm": 1.023793525835979, + "learning_rate": 8.299958501103892e-06, + "loss": 0.1136, + "step": 2210 + }, + { + "epoch": 1.0317164179104477, + "grad_norm": 1.115450382287538, + "learning_rate": 8.29587882187715e-06, + "loss": 0.119, + "step": 2212 + }, + { + "epoch": 1.0326492537313432, + "grad_norm": 1.1787560514025666, + "learning_rate": 8.29179525869813e-06, + "loss": 0.1096, + "step": 2214 + }, + { + "epoch": 1.0335820895522387, + "grad_norm": 1.0840680636375406, + "learning_rate": 8.287707816379014e-06, + "loss": 0.0961, + "step": 2216 + }, + { + "epoch": 1.0345149253731343, + "grad_norm": 1.1011647055667009, + "learning_rate": 8.283616499736552e-06, + "loss": 0.1077, + "step": 2218 + }, + { + "epoch": 1.0354477611940298, + "grad_norm": 1.1717321795377318, + "learning_rate": 8.279521313592067e-06, + "loss": 0.1111, + "step": 2220 + }, + { + "epoch": 1.0363805970149254, + "grad_norm": 1.055449976091991, + "learning_rate": 8.27542226277143e-06, + "loss": 0.1098, + "step": 2222 + }, + { + "epoch": 1.037313432835821, + "grad_norm": 1.2746463725407722, + "learning_rate": 8.271319352105078e-06, + "loss": 0.1215, + "step": 2224 + }, + { + "epoch": 1.0382462686567164, + "grad_norm": 1.3505954916785239, + "learning_rate": 8.267212586427986e-06, + "loss": 0.1144, + "step": 2226 + }, + { + "epoch": 1.039179104477612, + "grad_norm": 0.9954729702087639, + "learning_rate": 8.263101970579684e-06, + "loss": 0.1108, + "step": 2228 + }, + { + "epoch": 1.0401119402985075, + "grad_norm": 1.0559102102306708, + "learning_rate": 8.258987509404227e-06, + "loss": 0.1027, + "step": 2230 + }, + { + "epoch": 1.041044776119403, + "grad_norm": 1.108109593911891, + "learning_rate": 8.254869207750207e-06, + "loss": 0.1291, + "step": 2232 + }, + { + "epoch": 1.0419776119402986, + "grad_norm": 1.2817400208141994, + "learning_rate": 8.250747070470743e-06, + "loss": 0.1183, + "step": 2234 + }, + { + "epoch": 1.0429104477611941, + "grad_norm": 1.0460244622141617, + "learning_rate": 8.246621102423474e-06, + "loss": 0.104, + "step": 2236 + }, + { + "epoch": 1.0438432835820897, + "grad_norm": 0.99446670791871, + "learning_rate": 8.242491308470548e-06, + "loss": 0.1041, + "step": 2238 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 1.0129009049855824, + "learning_rate": 8.23835769347863e-06, + "loss": 0.1, + "step": 2240 + }, + { + "epoch": 1.0457089552238805, + "grad_norm": 1.168358638609721, + "learning_rate": 8.234220262318876e-06, + "loss": 0.1164, + "step": 2242 + }, + { + "epoch": 1.046641791044776, + "grad_norm": 1.1503486614148268, + "learning_rate": 8.230079019866955e-06, + "loss": 0.1083, + "step": 2244 + }, + { + "epoch": 1.0475746268656716, + "grad_norm": 1.2380001019780191, + "learning_rate": 8.225933971003011e-06, + "loss": 0.1131, + "step": 2246 + }, + { + "epoch": 1.0485074626865671, + "grad_norm": 1.0564780902929154, + "learning_rate": 8.221785120611687e-06, + "loss": 0.1111, + "step": 2248 + }, + { + "epoch": 1.0494402985074627, + "grad_norm": 1.1887774356368606, + "learning_rate": 8.217632473582096e-06, + "loss": 0.1161, + "step": 2250 + }, + { + "epoch": 1.0503731343283582, + "grad_norm": 1.138674016396561, + "learning_rate": 8.213476034807827e-06, + "loss": 0.1185, + "step": 2252 + }, + { + "epoch": 1.0513059701492538, + "grad_norm": 1.0465094457705495, + "learning_rate": 8.209315809186946e-06, + "loss": 0.1095, + "step": 2254 + }, + { + "epoch": 1.0522388059701493, + "grad_norm": 0.9444953157852821, + "learning_rate": 8.205151801621971e-06, + "loss": 0.103, + "step": 2256 + }, + { + "epoch": 1.0531716417910448, + "grad_norm": 1.2998634455200997, + "learning_rate": 8.20098401701988e-06, + "loss": 0.1138, + "step": 2258 + }, + { + "epoch": 1.0541044776119404, + "grad_norm": 0.9270483377144897, + "learning_rate": 8.196812460292105e-06, + "loss": 0.1084, + "step": 2260 + }, + { + "epoch": 1.055037313432836, + "grad_norm": 1.1314657621167346, + "learning_rate": 8.192637136354516e-06, + "loss": 0.1075, + "step": 2262 + }, + { + "epoch": 1.0559701492537314, + "grad_norm": 0.9303235925194865, + "learning_rate": 8.188458050127433e-06, + "loss": 0.0977, + "step": 2264 + }, + { + "epoch": 1.0569029850746268, + "grad_norm": 1.2776777339806886, + "learning_rate": 8.184275206535598e-06, + "loss": 0.1187, + "step": 2266 + }, + { + "epoch": 1.0578358208955223, + "grad_norm": 1.147933625291267, + "learning_rate": 8.18008861050819e-06, + "loss": 0.1201, + "step": 2268 + }, + { + "epoch": 1.0587686567164178, + "grad_norm": 1.2369565673691505, + "learning_rate": 8.175898266978805e-06, + "loss": 0.1093, + "step": 2270 + }, + { + "epoch": 1.0597014925373134, + "grad_norm": 1.1020580611564963, + "learning_rate": 8.171704180885457e-06, + "loss": 0.1142, + "step": 2272 + }, + { + "epoch": 1.060634328358209, + "grad_norm": 0.9910602776066643, + "learning_rate": 8.167506357170572e-06, + "loss": 0.1057, + "step": 2274 + }, + { + "epoch": 1.0615671641791045, + "grad_norm": 1.1072240546374714, + "learning_rate": 8.163304800780975e-06, + "loss": 0.1116, + "step": 2276 + }, + { + "epoch": 1.0625, + "grad_norm": 1.2689376618750916, + "learning_rate": 8.159099516667894e-06, + "loss": 0.1141, + "step": 2278 + }, + { + "epoch": 1.0634328358208955, + "grad_norm": 1.1840258899877902, + "learning_rate": 8.15489050978695e-06, + "loss": 0.1034, + "step": 2280 + }, + { + "epoch": 1.064365671641791, + "grad_norm": 1.1334988296378041, + "learning_rate": 8.150677785098149e-06, + "loss": 0.1082, + "step": 2282 + }, + { + "epoch": 1.0652985074626866, + "grad_norm": 1.0671351987050588, + "learning_rate": 8.146461347565878e-06, + "loss": 0.1073, + "step": 2284 + }, + { + "epoch": 1.0662313432835822, + "grad_norm": 1.1808573891023184, + "learning_rate": 8.142241202158904e-06, + "loss": 0.1213, + "step": 2286 + }, + { + "epoch": 1.0671641791044777, + "grad_norm": 1.0003403838040124, + "learning_rate": 8.138017353850357e-06, + "loss": 0.1007, + "step": 2288 + }, + { + "epoch": 1.0680970149253732, + "grad_norm": 1.3889633705059334, + "learning_rate": 8.133789807617734e-06, + "loss": 0.1087, + "step": 2290 + }, + { + "epoch": 1.0690298507462686, + "grad_norm": 1.1253435264958411, + "learning_rate": 8.12955856844289e-06, + "loss": 0.1125, + "step": 2292 + }, + { + "epoch": 1.069962686567164, + "grad_norm": 1.0014264480234976, + "learning_rate": 8.125323641312029e-06, + "loss": 0.1031, + "step": 2294 + }, + { + "epoch": 1.0708955223880596, + "grad_norm": 1.1620509798088041, + "learning_rate": 8.121085031215705e-06, + "loss": 0.1125, + "step": 2296 + }, + { + "epoch": 1.0718283582089552, + "grad_norm": 0.9974077503117971, + "learning_rate": 8.116842743148811e-06, + "loss": 0.1073, + "step": 2298 + }, + { + "epoch": 1.0727611940298507, + "grad_norm": 1.168722766350595, + "learning_rate": 8.11259678211057e-06, + "loss": 0.1096, + "step": 2300 + }, + { + "epoch": 1.0736940298507462, + "grad_norm": 0.9697203942724552, + "learning_rate": 8.108347153104543e-06, + "loss": 0.106, + "step": 2302 + }, + { + "epoch": 1.0746268656716418, + "grad_norm": 1.168531378243434, + "learning_rate": 8.1040938611386e-06, + "loss": 0.1153, + "step": 2304 + }, + { + "epoch": 1.0755597014925373, + "grad_norm": 1.0871744617031431, + "learning_rate": 8.099836911224938e-06, + "loss": 0.1135, + "step": 2306 + }, + { + "epoch": 1.0764925373134329, + "grad_norm": 1.0082360140508957, + "learning_rate": 8.095576308380061e-06, + "loss": 0.0853, + "step": 2308 + }, + { + "epoch": 1.0774253731343284, + "grad_norm": 1.0715152510214832, + "learning_rate": 8.091312057624779e-06, + "loss": 0.1017, + "step": 2310 + }, + { + "epoch": 1.078358208955224, + "grad_norm": 1.2738804788022269, + "learning_rate": 8.087044163984197e-06, + "loss": 0.1223, + "step": 2312 + }, + { + "epoch": 1.0792910447761195, + "grad_norm": 1.0180530610338325, + "learning_rate": 8.082772632487718e-06, + "loss": 0.1042, + "step": 2314 + }, + { + "epoch": 1.080223880597015, + "grad_norm": 1.3188969587079598, + "learning_rate": 8.07849746816903e-06, + "loss": 0.1206, + "step": 2316 + }, + { + "epoch": 1.0811567164179103, + "grad_norm": 1.2444359131901108, + "learning_rate": 8.074218676066102e-06, + "loss": 0.1201, + "step": 2318 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.9898485831166415, + "learning_rate": 8.069936261221174e-06, + "loss": 0.1051, + "step": 2320 + }, + { + "epoch": 1.0830223880597014, + "grad_norm": 1.1243256433437463, + "learning_rate": 8.065650228680762e-06, + "loss": 0.1106, + "step": 2322 + }, + { + "epoch": 1.083955223880597, + "grad_norm": 1.097399022210265, + "learning_rate": 8.061360583495643e-06, + "loss": 0.1193, + "step": 2324 + }, + { + "epoch": 1.0848880597014925, + "grad_norm": 1.1995774170744764, + "learning_rate": 8.057067330720847e-06, + "loss": 0.1146, + "step": 2326 + }, + { + "epoch": 1.085820895522388, + "grad_norm": 1.1476316055198292, + "learning_rate": 8.052770475415661e-06, + "loss": 0.1093, + "step": 2328 + }, + { + "epoch": 1.0867537313432836, + "grad_norm": 1.1039623227771003, + "learning_rate": 8.048470022643615e-06, + "loss": 0.103, + "step": 2330 + }, + { + "epoch": 1.087686567164179, + "grad_norm": 1.074020177179, + "learning_rate": 8.044165977472476e-06, + "loss": 0.1128, + "step": 2332 + }, + { + "epoch": 1.0886194029850746, + "grad_norm": 1.0352572215204474, + "learning_rate": 8.03985834497425e-06, + "loss": 0.1033, + "step": 2334 + }, + { + "epoch": 1.0895522388059702, + "grad_norm": 1.1609993128679654, + "learning_rate": 8.035547130225164e-06, + "loss": 0.1078, + "step": 2336 + }, + { + "epoch": 1.0904850746268657, + "grad_norm": 1.0138191128815934, + "learning_rate": 8.03123233830567e-06, + "loss": 0.1008, + "step": 2338 + }, + { + "epoch": 1.0914179104477613, + "grad_norm": 1.2026968052357823, + "learning_rate": 8.026913974300437e-06, + "loss": 0.1234, + "step": 2340 + }, + { + "epoch": 1.0923507462686568, + "grad_norm": 1.2400742158660756, + "learning_rate": 8.022592043298339e-06, + "loss": 0.1109, + "step": 2342 + }, + { + "epoch": 1.0932835820895523, + "grad_norm": 1.1441136454147998, + "learning_rate": 8.018266550392457e-06, + "loss": 0.1024, + "step": 2344 + }, + { + "epoch": 1.0942164179104477, + "grad_norm": 1.0247183710109928, + "learning_rate": 8.013937500680068e-06, + "loss": 0.1032, + "step": 2346 + }, + { + "epoch": 1.0951492537313432, + "grad_norm": 1.1698469519200057, + "learning_rate": 8.00960489926264e-06, + "loss": 0.114, + "step": 2348 + }, + { + "epoch": 1.0960820895522387, + "grad_norm": 1.224534109351433, + "learning_rate": 8.005268751245827e-06, + "loss": 0.1112, + "step": 2350 + }, + { + "epoch": 1.0970149253731343, + "grad_norm": 1.1399058033450726, + "learning_rate": 8.000929061739463e-06, + "loss": 0.1054, + "step": 2352 + }, + { + "epoch": 1.0979477611940298, + "grad_norm": 1.1108704714855089, + "learning_rate": 7.996585835857557e-06, + "loss": 0.107, + "step": 2354 + }, + { + "epoch": 1.0988805970149254, + "grad_norm": 1.1461037156809597, + "learning_rate": 7.99223907871828e-06, + "loss": 0.1228, + "step": 2356 + }, + { + "epoch": 1.099813432835821, + "grad_norm": 1.1485206383984528, + "learning_rate": 7.987888795443968e-06, + "loss": 0.1134, + "step": 2358 + }, + { + "epoch": 1.1007462686567164, + "grad_norm": 1.0878504488993384, + "learning_rate": 7.983534991161113e-06, + "loss": 0.1016, + "step": 2360 + }, + { + "epoch": 1.101679104477612, + "grad_norm": 1.0434582793621598, + "learning_rate": 7.979177671000353e-06, + "loss": 0.1011, + "step": 2362 + }, + { + "epoch": 1.1026119402985075, + "grad_norm": 1.0737922103362503, + "learning_rate": 7.974816840096475e-06, + "loss": 0.1075, + "step": 2364 + }, + { + "epoch": 1.103544776119403, + "grad_norm": 1.0963592729977907, + "learning_rate": 7.970452503588397e-06, + "loss": 0.1031, + "step": 2366 + }, + { + "epoch": 1.1044776119402986, + "grad_norm": 1.1858737460624225, + "learning_rate": 7.96608466661917e-06, + "loss": 0.1082, + "step": 2368 + }, + { + "epoch": 1.1054104477611941, + "grad_norm": 1.0911358486899008, + "learning_rate": 7.961713334335974e-06, + "loss": 0.1053, + "step": 2370 + }, + { + "epoch": 1.1063432835820897, + "grad_norm": 1.207881509084649, + "learning_rate": 7.9573385118901e-06, + "loss": 0.1118, + "step": 2372 + }, + { + "epoch": 1.107276119402985, + "grad_norm": 1.130680420377781, + "learning_rate": 7.952960204436959e-06, + "loss": 0.1134, + "step": 2374 + }, + { + "epoch": 1.1082089552238805, + "grad_norm": 1.0973475419262104, + "learning_rate": 7.948578417136066e-06, + "loss": 0.1058, + "step": 2376 + }, + { + "epoch": 1.109141791044776, + "grad_norm": 1.0364567984508013, + "learning_rate": 7.944193155151037e-06, + "loss": 0.0973, + "step": 2378 + }, + { + "epoch": 1.1100746268656716, + "grad_norm": 1.1250270395725783, + "learning_rate": 7.939804423649582e-06, + "loss": 0.111, + "step": 2380 + }, + { + "epoch": 1.1110074626865671, + "grad_norm": 0.9985489805969399, + "learning_rate": 7.935412227803502e-06, + "loss": 0.0974, + "step": 2382 + }, + { + "epoch": 1.1119402985074627, + "grad_norm": 1.074147661983881, + "learning_rate": 7.931016572788676e-06, + "loss": 0.1102, + "step": 2384 + }, + { + "epoch": 1.1128731343283582, + "grad_norm": 1.0724172371093192, + "learning_rate": 7.926617463785067e-06, + "loss": 0.1123, + "step": 2386 + }, + { + "epoch": 1.1138059701492538, + "grad_norm": 0.979687108625746, + "learning_rate": 7.922214905976698e-06, + "loss": 0.1117, + "step": 2388 + }, + { + "epoch": 1.1147388059701493, + "grad_norm": 1.1046165980707205, + "learning_rate": 7.917808904551662e-06, + "loss": 0.1126, + "step": 2390 + }, + { + "epoch": 1.1156716417910448, + "grad_norm": 1.0569140912233863, + "learning_rate": 7.913399464702114e-06, + "loss": 0.1043, + "step": 2392 + }, + { + "epoch": 1.1166044776119404, + "grad_norm": 1.0191588533732199, + "learning_rate": 7.908986591624253e-06, + "loss": 0.1073, + "step": 2394 + }, + { + "epoch": 1.117537313432836, + "grad_norm": 1.0089995502261069, + "learning_rate": 7.90457029051833e-06, + "loss": 0.0837, + "step": 2396 + }, + { + "epoch": 1.1184701492537314, + "grad_norm": 1.2013326703430722, + "learning_rate": 7.900150566588628e-06, + "loss": 0.0964, + "step": 2398 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 1.0496919708263854, + "learning_rate": 7.895727425043476e-06, + "loss": 0.0996, + "step": 2400 + }, + { + "epoch": 1.1203358208955223, + "grad_norm": 1.2574298870825513, + "learning_rate": 7.891300871095217e-06, + "loss": 0.1089, + "step": 2402 + }, + { + "epoch": 1.1212686567164178, + "grad_norm": 1.165238281283654, + "learning_rate": 7.886870909960223e-06, + "loss": 0.1061, + "step": 2404 + }, + { + "epoch": 1.1222014925373134, + "grad_norm": 1.6844756064011217, + "learning_rate": 7.88243754685888e-06, + "loss": 0.1102, + "step": 2406 + }, + { + "epoch": 1.123134328358209, + "grad_norm": 1.2887532192435922, + "learning_rate": 7.87800078701558e-06, + "loss": 0.1193, + "step": 2408 + }, + { + "epoch": 1.1240671641791045, + "grad_norm": 1.1222346223872233, + "learning_rate": 7.873560635658724e-06, + "loss": 0.1042, + "step": 2410 + }, + { + "epoch": 1.125, + "grad_norm": 1.0562199674654056, + "learning_rate": 7.869117098020705e-06, + "loss": 0.1079, + "step": 2412 + }, + { + "epoch": 1.1259328358208955, + "grad_norm": 1.1049798570520013, + "learning_rate": 7.864670179337904e-06, + "loss": 0.1165, + "step": 2414 + }, + { + "epoch": 1.126865671641791, + "grad_norm": 1.0545886326050513, + "learning_rate": 7.860219884850693e-06, + "loss": 0.1158, + "step": 2416 + }, + { + "epoch": 1.1277985074626866, + "grad_norm": 0.9662435273081368, + "learning_rate": 7.855766219803417e-06, + "loss": 0.1009, + "step": 2418 + }, + { + "epoch": 1.1287313432835822, + "grad_norm": 1.0122617670388458, + "learning_rate": 7.851309189444396e-06, + "loss": 0.097, + "step": 2420 + }, + { + "epoch": 1.1296641791044777, + "grad_norm": 1.099721695389685, + "learning_rate": 7.846848799025914e-06, + "loss": 0.1084, + "step": 2422 + }, + { + "epoch": 1.1305970149253732, + "grad_norm": 1.1184073996904251, + "learning_rate": 7.842385053804214e-06, + "loss": 0.1168, + "step": 2424 + }, + { + "epoch": 1.1315298507462686, + "grad_norm": 1.139028773906029, + "learning_rate": 7.837917959039495e-06, + "loss": 0.1004, + "step": 2426 + }, + { + "epoch": 1.132462686567164, + "grad_norm": 0.951833695412807, + "learning_rate": 7.8334475199959e-06, + "loss": 0.0978, + "step": 2428 + }, + { + "epoch": 1.1333955223880596, + "grad_norm": 1.1579955713610255, + "learning_rate": 7.828973741941517e-06, + "loss": 0.1174, + "step": 2430 + }, + { + "epoch": 1.1343283582089552, + "grad_norm": 1.0384681471468071, + "learning_rate": 7.824496630148366e-06, + "loss": 0.1166, + "step": 2432 + }, + { + "epoch": 1.1352611940298507, + "grad_norm": 0.9963534166553635, + "learning_rate": 7.820016189892391e-06, + "loss": 0.0918, + "step": 2434 + }, + { + "epoch": 1.1361940298507462, + "grad_norm": 1.1217727403934883, + "learning_rate": 7.815532426453471e-06, + "loss": 0.1028, + "step": 2436 + }, + { + "epoch": 1.1371268656716418, + "grad_norm": 1.102379764515102, + "learning_rate": 7.811045345115389e-06, + "loss": 0.1128, + "step": 2438 + }, + { + "epoch": 1.1380597014925373, + "grad_norm": 1.083350367180205, + "learning_rate": 7.806554951165843e-06, + "loss": 0.1056, + "step": 2440 + }, + { + "epoch": 1.1389925373134329, + "grad_norm": 1.1312869810368222, + "learning_rate": 7.802061249896435e-06, + "loss": 0.0998, + "step": 2442 + }, + { + "epoch": 1.1399253731343284, + "grad_norm": 1.0542921025787402, + "learning_rate": 7.797564246602663e-06, + "loss": 0.1123, + "step": 2444 + }, + { + "epoch": 1.140858208955224, + "grad_norm": 1.1293486873929137, + "learning_rate": 7.793063946583913e-06, + "loss": 0.1039, + "step": 2446 + }, + { + "epoch": 1.1417910447761195, + "grad_norm": 1.0213954185821845, + "learning_rate": 7.788560355143467e-06, + "loss": 0.1044, + "step": 2448 + }, + { + "epoch": 1.142723880597015, + "grad_norm": 1.1584096907929313, + "learning_rate": 7.784053477588474e-06, + "loss": 0.1131, + "step": 2450 + }, + { + "epoch": 1.1436567164179103, + "grad_norm": 1.1663773610337964, + "learning_rate": 7.779543319229958e-06, + "loss": 0.0995, + "step": 2452 + }, + { + "epoch": 1.1445895522388059, + "grad_norm": 1.0619588002763802, + "learning_rate": 7.775029885382815e-06, + "loss": 0.0978, + "step": 2454 + }, + { + "epoch": 1.1455223880597014, + "grad_norm": 0.977509405468944, + "learning_rate": 7.770513181365794e-06, + "loss": 0.102, + "step": 2456 + }, + { + "epoch": 1.146455223880597, + "grad_norm": 1.080332616778311, + "learning_rate": 7.765993212501502e-06, + "loss": 0.1025, + "step": 2458 + }, + { + "epoch": 1.1473880597014925, + "grad_norm": 1.0311526545077554, + "learning_rate": 7.76146998411639e-06, + "loss": 0.0899, + "step": 2460 + }, + { + "epoch": 1.148320895522388, + "grad_norm": 0.9292428023774125, + "learning_rate": 7.756943501540754e-06, + "loss": 0.0828, + "step": 2462 + }, + { + "epoch": 1.1492537313432836, + "grad_norm": 1.1061606243051447, + "learning_rate": 7.752413770108723e-06, + "loss": 0.1123, + "step": 2464 + }, + { + "epoch": 1.150186567164179, + "grad_norm": 1.1553296981024972, + "learning_rate": 7.747880795158254e-06, + "loss": 0.118, + "step": 2466 + }, + { + "epoch": 1.1511194029850746, + "grad_norm": 1.0859655084117374, + "learning_rate": 7.743344582031125e-06, + "loss": 0.1064, + "step": 2468 + }, + { + "epoch": 1.1520522388059702, + "grad_norm": 1.0836789340684423, + "learning_rate": 7.738805136072934e-06, + "loss": 0.1071, + "step": 2470 + }, + { + "epoch": 1.1529850746268657, + "grad_norm": 1.2663875819869606, + "learning_rate": 7.734262462633084e-06, + "loss": 0.1121, + "step": 2472 + }, + { + "epoch": 1.1539179104477613, + "grad_norm": 1.356204379617711, + "learning_rate": 7.729716567064787e-06, + "loss": 0.1121, + "step": 2474 + }, + { + "epoch": 1.1548507462686568, + "grad_norm": 1.1718719608276413, + "learning_rate": 7.725167454725045e-06, + "loss": 0.1134, + "step": 2476 + }, + { + "epoch": 1.1557835820895521, + "grad_norm": 1.2494780252324966, + "learning_rate": 7.720615130974654e-06, + "loss": 0.1117, + "step": 2478 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 1.1986325443330386, + "learning_rate": 7.716059601178199e-06, + "loss": 0.1141, + "step": 2480 + }, + { + "epoch": 1.1576492537313432, + "grad_norm": 1.1556776271888145, + "learning_rate": 7.711500870704036e-06, + "loss": 0.1088, + "step": 2482 + }, + { + "epoch": 1.1585820895522387, + "grad_norm": 0.949080655819924, + "learning_rate": 7.706938944924296e-06, + "loss": 0.0925, + "step": 2484 + }, + { + "epoch": 1.1595149253731343, + "grad_norm": 1.001514149832634, + "learning_rate": 7.702373829214873e-06, + "loss": 0.1037, + "step": 2486 + }, + { + "epoch": 1.1604477611940298, + "grad_norm": 1.0333838776551239, + "learning_rate": 7.697805528955426e-06, + "loss": 0.1149, + "step": 2488 + }, + { + "epoch": 1.1613805970149254, + "grad_norm": 1.2101372951488252, + "learning_rate": 7.693234049529363e-06, + "loss": 0.1123, + "step": 2490 + }, + { + "epoch": 1.162313432835821, + "grad_norm": 1.1237638649243655, + "learning_rate": 7.688659396323834e-06, + "loss": 0.1016, + "step": 2492 + }, + { + "epoch": 1.1632462686567164, + "grad_norm": 1.3066181649848494, + "learning_rate": 7.684081574729738e-06, + "loss": 0.1041, + "step": 2494 + }, + { + "epoch": 1.164179104477612, + "grad_norm": 1.0566116364025127, + "learning_rate": 7.6795005901417e-06, + "loss": 0.1085, + "step": 2496 + }, + { + "epoch": 1.1651119402985075, + "grad_norm": 1.1430341121791319, + "learning_rate": 7.674916447958076e-06, + "loss": 0.1236, + "step": 2498 + }, + { + "epoch": 1.166044776119403, + "grad_norm": 1.102188707092688, + "learning_rate": 7.670329153580944e-06, + "loss": 0.1083, + "step": 2500 + }, + { + "epoch": 1.166044776119403, + "eval_loss": 0.1655130684375763, + "eval_runtime": 320.1478, + "eval_samples_per_second": 47.622, + "eval_steps_per_second": 5.954, + "step": 2500 + }, + { + "epoch": 1.1669776119402986, + "grad_norm": 1.075540598962944, + "learning_rate": 7.665738712416094e-06, + "loss": 0.1042, + "step": 2502 + }, + { + "epoch": 1.1679104477611941, + "grad_norm": 1.154083558254164, + "learning_rate": 7.661145129873026e-06, + "loss": 0.107, + "step": 2504 + }, + { + "epoch": 1.1688432835820897, + "grad_norm": 1.1267362871800328, + "learning_rate": 7.656548411364939e-06, + "loss": 0.116, + "step": 2506 + }, + { + "epoch": 1.169776119402985, + "grad_norm": 1.1551330400864188, + "learning_rate": 7.651948562308734e-06, + "loss": 0.1145, + "step": 2508 + }, + { + "epoch": 1.1707089552238805, + "grad_norm": 1.1647719703950468, + "learning_rate": 7.647345588124993e-06, + "loss": 0.1191, + "step": 2510 + }, + { + "epoch": 1.171641791044776, + "grad_norm": 1.0514550342139921, + "learning_rate": 7.642739494237986e-06, + "loss": 0.116, + "step": 2512 + }, + { + "epoch": 1.1725746268656716, + "grad_norm": 1.1039036169281318, + "learning_rate": 7.638130286075658e-06, + "loss": 0.1026, + "step": 2514 + }, + { + "epoch": 1.1735074626865671, + "grad_norm": 1.1065986684739848, + "learning_rate": 7.633517969069626e-06, + "loss": 0.1217, + "step": 2516 + }, + { + "epoch": 1.1744402985074627, + "grad_norm": 1.0843201188394793, + "learning_rate": 7.628902548655164e-06, + "loss": 0.1008, + "step": 2518 + }, + { + "epoch": 1.1753731343283582, + "grad_norm": 1.1248288378872542, + "learning_rate": 7.624284030271211e-06, + "loss": 0.0976, + "step": 2520 + }, + { + "epoch": 1.1763059701492538, + "grad_norm": 1.1004513739490172, + "learning_rate": 7.619662419360353e-06, + "loss": 0.1079, + "step": 2522 + }, + { + "epoch": 1.1772388059701493, + "grad_norm": 0.998667555753008, + "learning_rate": 7.615037721368818e-06, + "loss": 0.1027, + "step": 2524 + }, + { + "epoch": 1.1781716417910448, + "grad_norm": 1.1921378641046936, + "learning_rate": 7.610409941746479e-06, + "loss": 0.1158, + "step": 2526 + }, + { + "epoch": 1.1791044776119404, + "grad_norm": 1.1471542350970025, + "learning_rate": 7.6057790859468316e-06, + "loss": 0.1099, + "step": 2528 + }, + { + "epoch": 1.180037313432836, + "grad_norm": 1.0553094374178313, + "learning_rate": 7.601145159427004e-06, + "loss": 0.1006, + "step": 2530 + }, + { + "epoch": 1.1809701492537314, + "grad_norm": 1.1869004833375891, + "learning_rate": 7.5965081676477385e-06, + "loss": 0.1178, + "step": 2532 + }, + { + "epoch": 1.1819029850746268, + "grad_norm": 1.2649904595325534, + "learning_rate": 7.591868116073391e-06, + "loss": 0.1218, + "step": 2534 + }, + { + "epoch": 1.1828358208955223, + "grad_norm": 1.0987236321719478, + "learning_rate": 7.587225010171921e-06, + "loss": 0.1178, + "step": 2536 + }, + { + "epoch": 1.1837686567164178, + "grad_norm": 0.9932961409517518, + "learning_rate": 7.582578855414895e-06, + "loss": 0.0971, + "step": 2538 + }, + { + "epoch": 1.1847014925373134, + "grad_norm": 1.0939650321271175, + "learning_rate": 7.577929657277462e-06, + "loss": 0.1085, + "step": 2540 + }, + { + "epoch": 1.185634328358209, + "grad_norm": 1.0200331172950676, + "learning_rate": 7.573277421238363e-06, + "loss": 0.104, + "step": 2542 + }, + { + "epoch": 1.1865671641791045, + "grad_norm": 1.5142382214048933, + "learning_rate": 7.56862215277992e-06, + "loss": 0.1103, + "step": 2544 + }, + { + "epoch": 1.1875, + "grad_norm": 1.1075958125142682, + "learning_rate": 7.5639638573880245e-06, + "loss": 0.1119, + "step": 2546 + }, + { + "epoch": 1.1884328358208955, + "grad_norm": 1.0490123312729123, + "learning_rate": 7.559302540552138e-06, + "loss": 0.1028, + "step": 2548 + }, + { + "epoch": 1.189365671641791, + "grad_norm": 1.2515364153015263, + "learning_rate": 7.554638207765281e-06, + "loss": 0.1043, + "step": 2550 + }, + { + "epoch": 1.1902985074626866, + "grad_norm": 1.096987817928207, + "learning_rate": 7.5499708645240295e-06, + "loss": 0.1064, + "step": 2552 + }, + { + "epoch": 1.1912313432835822, + "grad_norm": 1.138866245588538, + "learning_rate": 7.545300516328508e-06, + "loss": 0.1116, + "step": 2554 + }, + { + "epoch": 1.1921641791044777, + "grad_norm": 1.1139702774600553, + "learning_rate": 7.540627168682377e-06, + "loss": 0.1063, + "step": 2556 + }, + { + "epoch": 1.1930970149253732, + "grad_norm": 1.1942392031376736, + "learning_rate": 7.535950827092837e-06, + "loss": 0.1124, + "step": 2558 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 1.107898636344152, + "learning_rate": 7.531271497070616e-06, + "loss": 0.1029, + "step": 2560 + }, + { + "epoch": 1.194962686567164, + "grad_norm": 1.0487706571983737, + "learning_rate": 7.5265891841299575e-06, + "loss": 0.0976, + "step": 2562 + }, + { + "epoch": 1.1958955223880596, + "grad_norm": 1.1247688058216387, + "learning_rate": 7.521903893788631e-06, + "loss": 0.1118, + "step": 2564 + }, + { + "epoch": 1.1968283582089552, + "grad_norm": 1.113193806479794, + "learning_rate": 7.517215631567905e-06, + "loss": 0.0991, + "step": 2566 + }, + { + "epoch": 1.1977611940298507, + "grad_norm": 1.164127094358999, + "learning_rate": 7.512524402992556e-06, + "loss": 0.1112, + "step": 2568 + }, + { + "epoch": 1.1986940298507462, + "grad_norm": 1.2170553977796374, + "learning_rate": 7.507830213590852e-06, + "loss": 0.1065, + "step": 2570 + }, + { + "epoch": 1.1996268656716418, + "grad_norm": 0.9797528206892345, + "learning_rate": 7.503133068894554e-06, + "loss": 0.0887, + "step": 2572 + }, + { + "epoch": 1.2005597014925373, + "grad_norm": 1.3268016153678397, + "learning_rate": 7.4984329744389024e-06, + "loss": 0.1069, + "step": 2574 + }, + { + "epoch": 1.2014925373134329, + "grad_norm": 1.0387475686650887, + "learning_rate": 7.493729935762615e-06, + "loss": 0.101, + "step": 2576 + }, + { + "epoch": 1.2024253731343284, + "grad_norm": 1.2411385386713536, + "learning_rate": 7.489023958407878e-06, + "loss": 0.1154, + "step": 2578 + }, + { + "epoch": 1.203358208955224, + "grad_norm": 1.070976967240602, + "learning_rate": 7.484315047920345e-06, + "loss": 0.0968, + "step": 2580 + }, + { + "epoch": 1.2042910447761195, + "grad_norm": 1.0279259296417749, + "learning_rate": 7.479603209849121e-06, + "loss": 0.101, + "step": 2582 + }, + { + "epoch": 1.205223880597015, + "grad_norm": 1.1949054937326204, + "learning_rate": 7.474888449746761e-06, + "loss": 0.1042, + "step": 2584 + }, + { + "epoch": 1.2061567164179103, + "grad_norm": 1.0934519694318268, + "learning_rate": 7.470170773169268e-06, + "loss": 0.1092, + "step": 2586 + }, + { + "epoch": 1.2070895522388059, + "grad_norm": 1.2455261734049656, + "learning_rate": 7.465450185676079e-06, + "loss": 0.1141, + "step": 2588 + }, + { + "epoch": 1.2080223880597014, + "grad_norm": 1.133881071885915, + "learning_rate": 7.460726692830057e-06, + "loss": 0.1131, + "step": 2590 + }, + { + "epoch": 1.208955223880597, + "grad_norm": 1.1078991969471512, + "learning_rate": 7.456000300197498e-06, + "loss": 0.1106, + "step": 2592 + }, + { + "epoch": 1.2098880597014925, + "grad_norm": 1.0919539546133117, + "learning_rate": 7.4512710133481084e-06, + "loss": 0.1101, + "step": 2594 + }, + { + "epoch": 1.210820895522388, + "grad_norm": 1.042048448194764, + "learning_rate": 7.446538837855006e-06, + "loss": 0.1048, + "step": 2596 + }, + { + "epoch": 1.2117537313432836, + "grad_norm": 1.146176533937742, + "learning_rate": 7.4418037792947165e-06, + "loss": 0.1099, + "step": 2598 + }, + { + "epoch": 1.212686567164179, + "grad_norm": 1.1797348432147379, + "learning_rate": 7.437065843247158e-06, + "loss": 0.1036, + "step": 2600 + }, + { + "epoch": 1.2136194029850746, + "grad_norm": 0.994822974236348, + "learning_rate": 7.432325035295641e-06, + "loss": 0.1041, + "step": 2602 + }, + { + "epoch": 1.2145522388059702, + "grad_norm": 1.1477646285232344, + "learning_rate": 7.427581361026863e-06, + "loss": 0.1056, + "step": 2604 + }, + { + "epoch": 1.2154850746268657, + "grad_norm": 1.0941356228947834, + "learning_rate": 7.422834826030898e-06, + "loss": 0.094, + "step": 2606 + }, + { + "epoch": 1.2164179104477613, + "grad_norm": 1.151960814484258, + "learning_rate": 7.418085435901189e-06, + "loss": 0.101, + "step": 2608 + }, + { + "epoch": 1.2173507462686568, + "grad_norm": 1.0873573706860375, + "learning_rate": 7.413333196234545e-06, + "loss": 0.0928, + "step": 2610 + }, + { + "epoch": 1.2182835820895521, + "grad_norm": 1.237995409805719, + "learning_rate": 7.408578112631135e-06, + "loss": 0.1203, + "step": 2612 + }, + { + "epoch": 1.2192164179104479, + "grad_norm": 1.0716183891826505, + "learning_rate": 7.4038201906944755e-06, + "loss": 0.0928, + "step": 2614 + }, + { + "epoch": 1.2201492537313432, + "grad_norm": 1.1216445774836254, + "learning_rate": 7.399059436031428e-06, + "loss": 0.1069, + "step": 2616 + }, + { + "epoch": 1.2210820895522387, + "grad_norm": 1.1611530997551531, + "learning_rate": 7.3942958542522e-06, + "loss": 0.1117, + "step": 2618 + }, + { + "epoch": 1.2220149253731343, + "grad_norm": 1.2020861032231644, + "learning_rate": 7.389529450970318e-06, + "loss": 0.1165, + "step": 2620 + }, + { + "epoch": 1.2229477611940298, + "grad_norm": 1.0324165413720954, + "learning_rate": 7.384760231802643e-06, + "loss": 0.1127, + "step": 2622 + }, + { + "epoch": 1.2238805970149254, + "grad_norm": 1.1018413802488094, + "learning_rate": 7.37998820236935e-06, + "loss": 0.1063, + "step": 2624 + }, + { + "epoch": 1.224813432835821, + "grad_norm": 1.0537352628084051, + "learning_rate": 7.375213368293928e-06, + "loss": 0.1049, + "step": 2626 + }, + { + "epoch": 1.2257462686567164, + "grad_norm": 1.019181409127792, + "learning_rate": 7.3704357352031705e-06, + "loss": 0.1033, + "step": 2628 + }, + { + "epoch": 1.226679104477612, + "grad_norm": 0.9836523233476584, + "learning_rate": 7.365655308727167e-06, + "loss": 0.1031, + "step": 2630 + }, + { + "epoch": 1.2276119402985075, + "grad_norm": 1.1346228610688998, + "learning_rate": 7.360872094499303e-06, + "loss": 0.1076, + "step": 2632 + }, + { + "epoch": 1.228544776119403, + "grad_norm": 1.0928536395888118, + "learning_rate": 7.356086098156243e-06, + "loss": 0.1058, + "step": 2634 + }, + { + "epoch": 1.2294776119402986, + "grad_norm": 1.1029370545035155, + "learning_rate": 7.351297325337936e-06, + "loss": 0.1021, + "step": 2636 + }, + { + "epoch": 1.2304104477611941, + "grad_norm": 1.2199129344325508, + "learning_rate": 7.346505781687604e-06, + "loss": 0.1189, + "step": 2638 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 1.2933072752649069, + "learning_rate": 7.341711472851726e-06, + "loss": 0.1139, + "step": 2640 + }, + { + "epoch": 1.232276119402985, + "grad_norm": 1.1456399722142987, + "learning_rate": 7.336914404480046e-06, + "loss": 0.1042, + "step": 2642 + }, + { + "epoch": 1.2332089552238805, + "grad_norm": 1.1477447368664087, + "learning_rate": 7.33211458222556e-06, + "loss": 0.119, + "step": 2644 + }, + { + "epoch": 1.234141791044776, + "grad_norm": 1.1902219020235507, + "learning_rate": 7.327312011744505e-06, + "loss": 0.112, + "step": 2646 + }, + { + "epoch": 1.2350746268656716, + "grad_norm": 1.009572556191043, + "learning_rate": 7.322506698696361e-06, + "loss": 0.1087, + "step": 2648 + }, + { + "epoch": 1.2360074626865671, + "grad_norm": 1.1156522327158893, + "learning_rate": 7.3176986487438385e-06, + "loss": 0.1022, + "step": 2650 + }, + { + "epoch": 1.2369402985074627, + "grad_norm": 1.1797053137137803, + "learning_rate": 7.312887867552873e-06, + "loss": 0.1133, + "step": 2652 + }, + { + "epoch": 1.2378731343283582, + "grad_norm": 1.1513873160913002, + "learning_rate": 7.308074360792617e-06, + "loss": 0.1229, + "step": 2654 + }, + { + "epoch": 1.2388059701492538, + "grad_norm": 1.2014653170170158, + "learning_rate": 7.303258134135437e-06, + "loss": 0.1176, + "step": 2656 + }, + { + "epoch": 1.2397388059701493, + "grad_norm": 1.1123959416781903, + "learning_rate": 7.298439193256905e-06, + "loss": 0.1087, + "step": 2658 + }, + { + "epoch": 1.2406716417910448, + "grad_norm": 1.0567589662153143, + "learning_rate": 7.293617543835789e-06, + "loss": 0.1053, + "step": 2660 + }, + { + "epoch": 1.2416044776119404, + "grad_norm": 1.1060400319996038, + "learning_rate": 7.288793191554051e-06, + "loss": 0.1133, + "step": 2662 + }, + { + "epoch": 1.242537313432836, + "grad_norm": 1.1364081871037819, + "learning_rate": 7.28396614209684e-06, + "loss": 0.1041, + "step": 2664 + }, + { + "epoch": 1.2434701492537314, + "grad_norm": 1.088808255285531, + "learning_rate": 7.279136401152477e-06, + "loss": 0.1022, + "step": 2666 + }, + { + "epoch": 1.2444029850746268, + "grad_norm": 1.2489080995838342, + "learning_rate": 7.27430397441246e-06, + "loss": 0.1049, + "step": 2668 + }, + { + "epoch": 1.2453358208955223, + "grad_norm": 1.0974154866574723, + "learning_rate": 7.269468867571453e-06, + "loss": 0.1075, + "step": 2670 + }, + { + "epoch": 1.2462686567164178, + "grad_norm": 1.1336477031064254, + "learning_rate": 7.2646310863272725e-06, + "loss": 0.1133, + "step": 2672 + }, + { + "epoch": 1.2472014925373134, + "grad_norm": 1.3686960576490796, + "learning_rate": 7.259790636380892e-06, + "loss": 0.1132, + "step": 2674 + }, + { + "epoch": 1.248134328358209, + "grad_norm": 1.1912324914569272, + "learning_rate": 7.254947523436427e-06, + "loss": 0.109, + "step": 2676 + }, + { + "epoch": 1.2490671641791045, + "grad_norm": 1.1268319243851983, + "learning_rate": 7.250101753201134e-06, + "loss": 0.1038, + "step": 2678 + }, + { + "epoch": 1.25, + "grad_norm": 1.11821848462912, + "learning_rate": 7.2452533313853976e-06, + "loss": 0.1189, + "step": 2680 + }, + { + "epoch": 1.2509328358208955, + "grad_norm": 1.0555462132424303, + "learning_rate": 7.240402263702729e-06, + "loss": 0.1162, + "step": 2682 + }, + { + "epoch": 1.251865671641791, + "grad_norm": 1.0732946400255794, + "learning_rate": 7.235548555869755e-06, + "loss": 0.0932, + "step": 2684 + }, + { + "epoch": 1.2527985074626866, + "grad_norm": 1.2294857763959475, + "learning_rate": 7.230692213606218e-06, + "loss": 0.1053, + "step": 2686 + }, + { + "epoch": 1.2537313432835822, + "grad_norm": 1.2830161270144966, + "learning_rate": 7.225833242634961e-06, + "loss": 0.1021, + "step": 2688 + }, + { + "epoch": 1.2546641791044777, + "grad_norm": 1.0594775319709628, + "learning_rate": 7.2209716486819255e-06, + "loss": 0.1071, + "step": 2690 + }, + { + "epoch": 1.2555970149253732, + "grad_norm": 1.0805250136382083, + "learning_rate": 7.216107437476148e-06, + "loss": 0.1091, + "step": 2692 + }, + { + "epoch": 1.2565298507462686, + "grad_norm": 1.2708581553975034, + "learning_rate": 7.211240614749741e-06, + "loss": 0.1226, + "step": 2694 + }, + { + "epoch": 1.2574626865671643, + "grad_norm": 1.1388558899202947, + "learning_rate": 7.206371186237904e-06, + "loss": 0.109, + "step": 2696 + }, + { + "epoch": 1.2583955223880596, + "grad_norm": 0.9995337654837192, + "learning_rate": 7.201499157678899e-06, + "loss": 0.1023, + "step": 2698 + }, + { + "epoch": 1.2593283582089552, + "grad_norm": 1.1355489085753432, + "learning_rate": 7.196624534814056e-06, + "loss": 0.1129, + "step": 2700 + }, + { + "epoch": 1.2602611940298507, + "grad_norm": 0.9846278711768532, + "learning_rate": 7.191747323387764e-06, + "loss": 0.0993, + "step": 2702 + }, + { + "epoch": 1.2611940298507462, + "grad_norm": 1.1396072697603832, + "learning_rate": 7.18686752914746e-06, + "loss": 0.1117, + "step": 2704 + }, + { + "epoch": 1.2621268656716418, + "grad_norm": 1.0983419636059346, + "learning_rate": 7.1819851578436205e-06, + "loss": 0.1078, + "step": 2706 + }, + { + "epoch": 1.2630597014925373, + "grad_norm": 1.180833243987259, + "learning_rate": 7.177100215229769e-06, + "loss": 0.102, + "step": 2708 + }, + { + "epoch": 1.2639925373134329, + "grad_norm": 1.0810310626293518, + "learning_rate": 7.172212707062449e-06, + "loss": 0.1168, + "step": 2710 + }, + { + "epoch": 1.2649253731343284, + "grad_norm": 1.0619275048995278, + "learning_rate": 7.167322639101235e-06, + "loss": 0.1016, + "step": 2712 + }, + { + "epoch": 1.265858208955224, + "grad_norm": 1.129749324815062, + "learning_rate": 7.162430017108711e-06, + "loss": 0.1057, + "step": 2714 + }, + { + "epoch": 1.2667910447761195, + "grad_norm": 1.2492622710805221, + "learning_rate": 7.157534846850478e-06, + "loss": 0.1132, + "step": 2716 + }, + { + "epoch": 1.267723880597015, + "grad_norm": 1.1738874513300692, + "learning_rate": 7.152637134095133e-06, + "loss": 0.1123, + "step": 2718 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 1.175424747582013, + "learning_rate": 7.147736884614274e-06, + "loss": 0.1119, + "step": 2720 + }, + { + "epoch": 1.269589552238806, + "grad_norm": 1.0632777721964533, + "learning_rate": 7.142834104182489e-06, + "loss": 0.1076, + "step": 2722 + }, + { + "epoch": 1.2705223880597014, + "grad_norm": 1.1108504379493909, + "learning_rate": 7.137928798577342e-06, + "loss": 0.1184, + "step": 2724 + }, + { + "epoch": 1.271455223880597, + "grad_norm": 1.0981340847743442, + "learning_rate": 7.133020973579381e-06, + "loss": 0.1045, + "step": 2726 + }, + { + "epoch": 1.2723880597014925, + "grad_norm": 1.1097340538573355, + "learning_rate": 7.128110634972117e-06, + "loss": 0.0993, + "step": 2728 + }, + { + "epoch": 1.273320895522388, + "grad_norm": 1.143718838435375, + "learning_rate": 7.1231977885420256e-06, + "loss": 0.1077, + "step": 2730 + }, + { + "epoch": 1.2742537313432836, + "grad_norm": 1.0623841355541739, + "learning_rate": 7.118282440078535e-06, + "loss": 0.1041, + "step": 2732 + }, + { + "epoch": 1.275186567164179, + "grad_norm": 1.0461047747851933, + "learning_rate": 7.1133645953740285e-06, + "loss": 0.0984, + "step": 2734 + }, + { + "epoch": 1.2761194029850746, + "grad_norm": 1.1239903002286689, + "learning_rate": 7.108444260223825e-06, + "loss": 0.1137, + "step": 2736 + }, + { + "epoch": 1.2770522388059702, + "grad_norm": 1.076943725702056, + "learning_rate": 7.1035214404261775e-06, + "loss": 0.1095, + "step": 2738 + }, + { + "epoch": 1.2779850746268657, + "grad_norm": 1.0737252233536958, + "learning_rate": 7.098596141782271e-06, + "loss": 0.1132, + "step": 2740 + }, + { + "epoch": 1.2789179104477613, + "grad_norm": 1.085984333930034, + "learning_rate": 7.093668370096211e-06, + "loss": 0.1091, + "step": 2742 + }, + { + "epoch": 1.2798507462686568, + "grad_norm": 1.1152908964169612, + "learning_rate": 7.088738131175014e-06, + "loss": 0.1033, + "step": 2744 + }, + { + "epoch": 1.2807835820895521, + "grad_norm": 1.0239521359044592, + "learning_rate": 7.083805430828608e-06, + "loss": 0.0963, + "step": 2746 + }, + { + "epoch": 1.2817164179104479, + "grad_norm": 1.1278740790491524, + "learning_rate": 7.078870274869818e-06, + "loss": 0.1027, + "step": 2748 + }, + { + "epoch": 1.2826492537313432, + "grad_norm": 1.1006667101736127, + "learning_rate": 7.073932669114367e-06, + "loss": 0.1049, + "step": 2750 + }, + { + "epoch": 1.2835820895522387, + "grad_norm": 1.0385121901630985, + "learning_rate": 7.068992619380859e-06, + "loss": 0.1126, + "step": 2752 + }, + { + "epoch": 1.2845149253731343, + "grad_norm": 1.1816975709935262, + "learning_rate": 7.064050131490785e-06, + "loss": 0.0966, + "step": 2754 + }, + { + "epoch": 1.2854477611940298, + "grad_norm": 1.0149462286666746, + "learning_rate": 7.0591052112685055e-06, + "loss": 0.095, + "step": 2756 + }, + { + "epoch": 1.2863805970149254, + "grad_norm": 1.1164943007592327, + "learning_rate": 7.0541578645412445e-06, + "loss": 0.1126, + "step": 2758 + }, + { + "epoch": 1.287313432835821, + "grad_norm": 1.1310798223158123, + "learning_rate": 7.049208097139091e-06, + "loss": 0.1107, + "step": 2760 + }, + { + "epoch": 1.2882462686567164, + "grad_norm": 1.021054986400497, + "learning_rate": 7.044255914894984e-06, + "loss": 0.0971, + "step": 2762 + }, + { + "epoch": 1.289179104477612, + "grad_norm": 1.0478562360912111, + "learning_rate": 7.039301323644708e-06, + "loss": 0.1075, + "step": 2764 + }, + { + "epoch": 1.2901119402985075, + "grad_norm": 1.0362026319650983, + "learning_rate": 7.034344329226885e-06, + "loss": 0.1045, + "step": 2766 + }, + { + "epoch": 1.291044776119403, + "grad_norm": 1.0367226653554475, + "learning_rate": 7.029384937482973e-06, + "loss": 0.1093, + "step": 2768 + }, + { + "epoch": 1.2919776119402986, + "grad_norm": 1.027171436486602, + "learning_rate": 7.024423154257251e-06, + "loss": 0.1082, + "step": 2770 + }, + { + "epoch": 1.292910447761194, + "grad_norm": 1.2466632109908478, + "learning_rate": 7.019458985396817e-06, + "loss": 0.1102, + "step": 2772 + }, + { + "epoch": 1.2938432835820897, + "grad_norm": 0.9880084667022045, + "learning_rate": 7.0144924367515855e-06, + "loss": 0.1074, + "step": 2774 + }, + { + "epoch": 1.294776119402985, + "grad_norm": 1.1544989497995124, + "learning_rate": 7.009523514174266e-06, + "loss": 0.1028, + "step": 2776 + }, + { + "epoch": 1.2957089552238805, + "grad_norm": 1.1720849319609559, + "learning_rate": 7.004552223520372e-06, + "loss": 0.1113, + "step": 2778 + }, + { + "epoch": 1.296641791044776, + "grad_norm": 1.0462960415683409, + "learning_rate": 6.999578570648209e-06, + "loss": 0.0946, + "step": 2780 + }, + { + "epoch": 1.2975746268656716, + "grad_norm": 0.9598856162552585, + "learning_rate": 6.994602561418861e-06, + "loss": 0.0938, + "step": 2782 + }, + { + "epoch": 1.2985074626865671, + "grad_norm": 1.022033946406963, + "learning_rate": 6.98962420169619e-06, + "loss": 0.1024, + "step": 2784 + }, + { + "epoch": 1.2994402985074627, + "grad_norm": 0.953687394561481, + "learning_rate": 6.984643497346832e-06, + "loss": 0.0925, + "step": 2786 + }, + { + "epoch": 1.3003731343283582, + "grad_norm": 1.1287244050322118, + "learning_rate": 6.979660454240181e-06, + "loss": 0.1081, + "step": 2788 + }, + { + "epoch": 1.3013059701492538, + "grad_norm": 1.1709327211576885, + "learning_rate": 6.974675078248387e-06, + "loss": 0.1035, + "step": 2790 + }, + { + "epoch": 1.3022388059701493, + "grad_norm": 0.944588713647086, + "learning_rate": 6.969687375246355e-06, + "loss": 0.0959, + "step": 2792 + }, + { + "epoch": 1.3031716417910448, + "grad_norm": 1.1472249598687618, + "learning_rate": 6.9646973511117285e-06, + "loss": 0.1188, + "step": 2794 + }, + { + "epoch": 1.3041044776119404, + "grad_norm": 1.0929867903515014, + "learning_rate": 6.959705011724884e-06, + "loss": 0.1185, + "step": 2796 + }, + { + "epoch": 1.3050373134328357, + "grad_norm": 1.2660405928601028, + "learning_rate": 6.954710362968929e-06, + "loss": 0.1176, + "step": 2798 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 1.2582437720859714, + "learning_rate": 6.9497134107296925e-06, + "loss": 0.1118, + "step": 2800 + }, + { + "epoch": 1.3069029850746268, + "grad_norm": 1.0001790588845447, + "learning_rate": 6.944714160895717e-06, + "loss": 0.1059, + "step": 2802 + }, + { + "epoch": 1.3078358208955223, + "grad_norm": 1.089422532633587, + "learning_rate": 6.939712619358252e-06, + "loss": 0.1151, + "step": 2804 + }, + { + "epoch": 1.3087686567164178, + "grad_norm": 1.1839127783632801, + "learning_rate": 6.934708792011251e-06, + "loss": 0.1115, + "step": 2806 + }, + { + "epoch": 1.3097014925373134, + "grad_norm": 1.0133909049794272, + "learning_rate": 6.92970268475136e-06, + "loss": 0.1097, + "step": 2808 + }, + { + "epoch": 1.310634328358209, + "grad_norm": 1.0099610884578727, + "learning_rate": 6.924694303477904e-06, + "loss": 0.1071, + "step": 2810 + }, + { + "epoch": 1.3115671641791045, + "grad_norm": 1.0665431964123495, + "learning_rate": 6.9196836540929e-06, + "loss": 0.1166, + "step": 2812 + }, + { + "epoch": 1.3125, + "grad_norm": 1.12726023448818, + "learning_rate": 6.914670742501032e-06, + "loss": 0.1117, + "step": 2814 + }, + { + "epoch": 1.3134328358208955, + "grad_norm": 1.143118704139726, + "learning_rate": 6.909655574609645e-06, + "loss": 0.0947, + "step": 2816 + }, + { + "epoch": 1.314365671641791, + "grad_norm": 1.0182486712078969, + "learning_rate": 6.904638156328754e-06, + "loss": 0.1013, + "step": 2818 + }, + { + "epoch": 1.3152985074626866, + "grad_norm": 1.058792333177847, + "learning_rate": 6.899618493571015e-06, + "loss": 0.0897, + "step": 2820 + }, + { + "epoch": 1.3162313432835822, + "grad_norm": 1.0291529853253425, + "learning_rate": 6.894596592251735e-06, + "loss": 0.1134, + "step": 2822 + }, + { + "epoch": 1.3171641791044777, + "grad_norm": 1.1282168182539591, + "learning_rate": 6.889572458288859e-06, + "loss": 0.1042, + "step": 2824 + }, + { + "epoch": 1.3180970149253732, + "grad_norm": 1.1033786820010723, + "learning_rate": 6.88454609760296e-06, + "loss": 0.1089, + "step": 2826 + }, + { + "epoch": 1.3190298507462686, + "grad_norm": 1.2289680381551906, + "learning_rate": 6.879517516117238e-06, + "loss": 0.1071, + "step": 2828 + }, + { + "epoch": 1.3199626865671643, + "grad_norm": 1.058572972863236, + "learning_rate": 6.874486719757507e-06, + "loss": 0.1047, + "step": 2830 + }, + { + "epoch": 1.3208955223880596, + "grad_norm": 1.0880290932451049, + "learning_rate": 6.869453714452194e-06, + "loss": 0.1096, + "step": 2832 + }, + { + "epoch": 1.3218283582089552, + "grad_norm": 1.0614679694534637, + "learning_rate": 6.8644185061323284e-06, + "loss": 0.1104, + "step": 2834 + }, + { + "epoch": 1.3227611940298507, + "grad_norm": 1.1761386044220195, + "learning_rate": 6.859381100731534e-06, + "loss": 0.1085, + "step": 2836 + }, + { + "epoch": 1.3236940298507462, + "grad_norm": 1.1925865365449022, + "learning_rate": 6.854341504186025e-06, + "loss": 0.1216, + "step": 2838 + }, + { + "epoch": 1.3246268656716418, + "grad_norm": 1.238843754128432, + "learning_rate": 6.849299722434599e-06, + "loss": 0.1262, + "step": 2840 + }, + { + "epoch": 1.3255597014925373, + "grad_norm": 1.017241405899312, + "learning_rate": 6.844255761418625e-06, + "loss": 0.1001, + "step": 2842 + }, + { + "epoch": 1.3264925373134329, + "grad_norm": 1.019967528903183, + "learning_rate": 6.839209627082043e-06, + "loss": 0.097, + "step": 2844 + }, + { + "epoch": 1.3274253731343284, + "grad_norm": 0.9335114561733073, + "learning_rate": 6.834161325371354e-06, + "loss": 0.1146, + "step": 2846 + }, + { + "epoch": 1.328358208955224, + "grad_norm": 1.0111655177954493, + "learning_rate": 6.829110862235614e-06, + "loss": 0.0997, + "step": 2848 + }, + { + "epoch": 1.3292910447761195, + "grad_norm": 1.05426667874679, + "learning_rate": 6.824058243626421e-06, + "loss": 0.0946, + "step": 2850 + }, + { + "epoch": 1.330223880597015, + "grad_norm": 1.0388519317641882, + "learning_rate": 6.819003475497921e-06, + "loss": 0.0989, + "step": 2852 + }, + { + "epoch": 1.3311567164179103, + "grad_norm": 1.2966889294733428, + "learning_rate": 6.813946563806785e-06, + "loss": 0.1201, + "step": 2854 + }, + { + "epoch": 1.332089552238806, + "grad_norm": 0.9525414046629391, + "learning_rate": 6.808887514512215e-06, + "loss": 0.1066, + "step": 2856 + }, + { + "epoch": 1.3330223880597014, + "grad_norm": 1.1334560470038777, + "learning_rate": 6.803826333575931e-06, + "loss": 0.1008, + "step": 2858 + }, + { + "epoch": 1.333955223880597, + "grad_norm": 1.327554826453119, + "learning_rate": 6.798763026962167e-06, + "loss": 0.1175, + "step": 2860 + }, + { + "epoch": 1.3348880597014925, + "grad_norm": 1.0639321575109582, + "learning_rate": 6.793697600637655e-06, + "loss": 0.0968, + "step": 2862 + }, + { + "epoch": 1.335820895522388, + "grad_norm": 1.100781208211592, + "learning_rate": 6.788630060571634e-06, + "loss": 0.1205, + "step": 2864 + }, + { + "epoch": 1.3367537313432836, + "grad_norm": 1.1156886283722336, + "learning_rate": 6.783560412735828e-06, + "loss": 0.1055, + "step": 2866 + }, + { + "epoch": 1.337686567164179, + "grad_norm": 1.0748808306586775, + "learning_rate": 6.7784886631044456e-06, + "loss": 0.1029, + "step": 2868 + }, + { + "epoch": 1.3386194029850746, + "grad_norm": 1.133633135201276, + "learning_rate": 6.773414817654174e-06, + "loss": 0.1074, + "step": 2870 + }, + { + "epoch": 1.3395522388059702, + "grad_norm": 1.1334666060853869, + "learning_rate": 6.7683388823641686e-06, + "loss": 0.1016, + "step": 2872 + }, + { + "epoch": 1.3404850746268657, + "grad_norm": 1.6191482296264594, + "learning_rate": 6.763260863216048e-06, + "loss": 0.1028, + "step": 2874 + }, + { + "epoch": 1.3414179104477613, + "grad_norm": 1.2987734323409736, + "learning_rate": 6.758180766193887e-06, + "loss": 0.1257, + "step": 2876 + }, + { + "epoch": 1.3423507462686568, + "grad_norm": 1.149422135906203, + "learning_rate": 6.75309859728421e-06, + "loss": 0.12, + "step": 2878 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 1.2063697935527302, + "learning_rate": 6.748014362475979e-06, + "loss": 0.1204, + "step": 2880 + }, + { + "epoch": 1.3442164179104479, + "grad_norm": 1.064773396784815, + "learning_rate": 6.742928067760595e-06, + "loss": 0.0977, + "step": 2882 + }, + { + "epoch": 1.3451492537313432, + "grad_norm": 1.0736709924243466, + "learning_rate": 6.737839719131882e-06, + "loss": 0.1056, + "step": 2884 + }, + { + "epoch": 1.3460820895522387, + "grad_norm": 1.1412493298954451, + "learning_rate": 6.732749322586091e-06, + "loss": 0.1101, + "step": 2886 + }, + { + "epoch": 1.3470149253731343, + "grad_norm": 1.185933919889182, + "learning_rate": 6.727656884121878e-06, + "loss": 0.113, + "step": 2888 + }, + { + "epoch": 1.3479477611940298, + "grad_norm": 1.2160809535817252, + "learning_rate": 6.722562409740312e-06, + "loss": 0.1141, + "step": 2890 + }, + { + "epoch": 1.3488805970149254, + "grad_norm": 1.1575740233480094, + "learning_rate": 6.71746590544486e-06, + "loss": 0.1094, + "step": 2892 + }, + { + "epoch": 1.349813432835821, + "grad_norm": 1.1035081404168656, + "learning_rate": 6.712367377241375e-06, + "loss": 0.0995, + "step": 2894 + }, + { + "epoch": 1.3507462686567164, + "grad_norm": 1.1442566521045665, + "learning_rate": 6.707266831138104e-06, + "loss": 0.1171, + "step": 2896 + }, + { + "epoch": 1.351679104477612, + "grad_norm": 1.167856879029797, + "learning_rate": 6.702164273145667e-06, + "loss": 0.1141, + "step": 2898 + }, + { + "epoch": 1.3526119402985075, + "grad_norm": 1.0597246120572674, + "learning_rate": 6.6970597092770535e-06, + "loss": 0.1067, + "step": 2900 + }, + { + "epoch": 1.353544776119403, + "grad_norm": 1.2212308368679017, + "learning_rate": 6.6919531455476214e-06, + "loss": 0.1119, + "step": 2902 + }, + { + "epoch": 1.3544776119402986, + "grad_norm": 1.1023494393486082, + "learning_rate": 6.6868445879750824e-06, + "loss": 0.1078, + "step": 2904 + }, + { + "epoch": 1.355410447761194, + "grad_norm": 1.0895226818200237, + "learning_rate": 6.681734042579496e-06, + "loss": 0.0952, + "step": 2906 + }, + { + "epoch": 1.3563432835820897, + "grad_norm": 1.1142486667881042, + "learning_rate": 6.6766215153832705e-06, + "loss": 0.1128, + "step": 2908 + }, + { + "epoch": 1.357276119402985, + "grad_norm": 1.2709531893368116, + "learning_rate": 6.671507012411141e-06, + "loss": 0.1117, + "step": 2910 + }, + { + "epoch": 1.3582089552238805, + "grad_norm": 0.9951773324160267, + "learning_rate": 6.6663905396901784e-06, + "loss": 0.1023, + "step": 2912 + }, + { + "epoch": 1.359141791044776, + "grad_norm": 1.200134402201827, + "learning_rate": 6.661272103249771e-06, + "loss": 0.1, + "step": 2914 + }, + { + "epoch": 1.3600746268656716, + "grad_norm": 1.2228286735043175, + "learning_rate": 6.6561517091216195e-06, + "loss": 0.1152, + "step": 2916 + }, + { + "epoch": 1.3610074626865671, + "grad_norm": 1.0505205938945563, + "learning_rate": 6.651029363339739e-06, + "loss": 0.1039, + "step": 2918 + }, + { + "epoch": 1.3619402985074627, + "grad_norm": 1.09838183594541, + "learning_rate": 6.645905071940436e-06, + "loss": 0.1042, + "step": 2920 + }, + { + "epoch": 1.3628731343283582, + "grad_norm": 1.053570553754682, + "learning_rate": 6.6407788409623145e-06, + "loss": 0.114, + "step": 2922 + }, + { + "epoch": 1.3638059701492538, + "grad_norm": 1.167119974675872, + "learning_rate": 6.6356506764462645e-06, + "loss": 0.1018, + "step": 2924 + }, + { + "epoch": 1.3647388059701493, + "grad_norm": 1.1074407954655574, + "learning_rate": 6.630520584435449e-06, + "loss": 0.0988, + "step": 2926 + }, + { + "epoch": 1.3656716417910448, + "grad_norm": 1.0604558516149023, + "learning_rate": 6.625388570975309e-06, + "loss": 0.1031, + "step": 2928 + }, + { + "epoch": 1.3666044776119404, + "grad_norm": 1.2005477049834712, + "learning_rate": 6.620254642113549e-06, + "loss": 0.1132, + "step": 2930 + }, + { + "epoch": 1.3675373134328357, + "grad_norm": 1.090950226405584, + "learning_rate": 6.615118803900126e-06, + "loss": 0.1008, + "step": 2932 + }, + { + "epoch": 1.3684701492537314, + "grad_norm": 1.0355616161327785, + "learning_rate": 6.60998106238725e-06, + "loss": 0.1087, + "step": 2934 + }, + { + "epoch": 1.3694029850746268, + "grad_norm": 0.9729656732764957, + "learning_rate": 6.604841423629377e-06, + "loss": 0.1, + "step": 2936 + }, + { + "epoch": 1.3703358208955223, + "grad_norm": 1.0893405931843285, + "learning_rate": 6.599699893683191e-06, + "loss": 0.1069, + "step": 2938 + }, + { + "epoch": 1.3712686567164178, + "grad_norm": 0.936558636879193, + "learning_rate": 6.594556478607613e-06, + "loss": 0.0841, + "step": 2940 + }, + { + "epoch": 1.3722014925373134, + "grad_norm": 1.2615085771691428, + "learning_rate": 6.589411184463778e-06, + "loss": 0.1139, + "step": 2942 + }, + { + "epoch": 1.373134328358209, + "grad_norm": 1.1737956310426265, + "learning_rate": 6.5842640173150455e-06, + "loss": 0.1112, + "step": 2944 + }, + { + "epoch": 1.3740671641791045, + "grad_norm": 1.171751150916603, + "learning_rate": 6.5791149832269685e-06, + "loss": 0.1102, + "step": 2946 + }, + { + "epoch": 1.375, + "grad_norm": 0.9175008113335006, + "learning_rate": 6.57396408826731e-06, + "loss": 0.0972, + "step": 2948 + }, + { + "epoch": 1.3759328358208955, + "grad_norm": 1.129787325635826, + "learning_rate": 6.568811338506026e-06, + "loss": 0.1045, + "step": 2950 + }, + { + "epoch": 1.376865671641791, + "grad_norm": 1.0242504438143376, + "learning_rate": 6.5636567400152505e-06, + "loss": 0.086, + "step": 2952 + }, + { + "epoch": 1.3777985074626866, + "grad_norm": 1.1018081705158753, + "learning_rate": 6.558500298869305e-06, + "loss": 0.1069, + "step": 2954 + }, + { + "epoch": 1.3787313432835822, + "grad_norm": 1.0342368508145685, + "learning_rate": 6.553342021144676e-06, + "loss": 0.1081, + "step": 2956 + }, + { + "epoch": 1.3796641791044777, + "grad_norm": 1.2000108789191752, + "learning_rate": 6.548181912920018e-06, + "loss": 0.1029, + "step": 2958 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 1.0503527069580005, + "learning_rate": 6.543019980276142e-06, + "loss": 0.1036, + "step": 2960 + }, + { + "epoch": 1.3815298507462686, + "grad_norm": 1.276892042282831, + "learning_rate": 6.537856229296007e-06, + "loss": 0.1193, + "step": 2962 + }, + { + "epoch": 1.3824626865671643, + "grad_norm": 1.1400080476797572, + "learning_rate": 6.5326906660647175e-06, + "loss": 0.1012, + "step": 2964 + }, + { + "epoch": 1.3833955223880596, + "grad_norm": 1.0266181415924787, + "learning_rate": 6.5275232966695105e-06, + "loss": 0.0935, + "step": 2966 + }, + { + "epoch": 1.3843283582089552, + "grad_norm": 1.0976238328333847, + "learning_rate": 6.522354127199751e-06, + "loss": 0.0952, + "step": 2968 + }, + { + "epoch": 1.3852611940298507, + "grad_norm": 1.180970128662032, + "learning_rate": 6.517183163746934e-06, + "loss": 0.0983, + "step": 2970 + }, + { + "epoch": 1.3861940298507462, + "grad_norm": 1.1471694862180046, + "learning_rate": 6.512010412404658e-06, + "loss": 0.1144, + "step": 2972 + }, + { + "epoch": 1.3871268656716418, + "grad_norm": 1.051434549088811, + "learning_rate": 6.50683587926863e-06, + "loss": 0.1093, + "step": 2974 + }, + { + "epoch": 1.3880597014925373, + "grad_norm": 1.1114955765102228, + "learning_rate": 6.5016595704366646e-06, + "loss": 0.1056, + "step": 2976 + }, + { + "epoch": 1.3889925373134329, + "grad_norm": 1.3090825218198192, + "learning_rate": 6.496481492008657e-06, + "loss": 0.1112, + "step": 2978 + }, + { + "epoch": 1.3899253731343284, + "grad_norm": 1.1156719571047116, + "learning_rate": 6.4913016500866e-06, + "loss": 0.1027, + "step": 2980 + }, + { + "epoch": 1.390858208955224, + "grad_norm": 1.0253311597503183, + "learning_rate": 6.486120050774556e-06, + "loss": 0.0968, + "step": 2982 + }, + { + "epoch": 1.3917910447761195, + "grad_norm": 0.9091733131896456, + "learning_rate": 6.480936700178659e-06, + "loss": 0.0809, + "step": 2984 + }, + { + "epoch": 1.392723880597015, + "grad_norm": 1.1235099566691318, + "learning_rate": 6.475751604407114e-06, + "loss": 0.1138, + "step": 2986 + }, + { + "epoch": 1.3936567164179103, + "grad_norm": 1.0815530777150875, + "learning_rate": 6.470564769570173e-06, + "loss": 0.1067, + "step": 2988 + }, + { + "epoch": 1.394589552238806, + "grad_norm": 1.1033766441590853, + "learning_rate": 6.465376201780142e-06, + "loss": 0.0999, + "step": 2990 + }, + { + "epoch": 1.3955223880597014, + "grad_norm": 1.0591842863522525, + "learning_rate": 6.460185907151372e-06, + "loss": 0.1021, + "step": 2992 + }, + { + "epoch": 1.396455223880597, + "grad_norm": 1.1913988809406653, + "learning_rate": 6.454993891800242e-06, + "loss": 0.1178, + "step": 2994 + }, + { + "epoch": 1.3973880597014925, + "grad_norm": 1.0816031459872881, + "learning_rate": 6.449800161845167e-06, + "loss": 0.1048, + "step": 2996 + }, + { + "epoch": 1.398320895522388, + "grad_norm": 1.1869785108921906, + "learning_rate": 6.444604723406574e-06, + "loss": 0.1168, + "step": 2998 + }, + { + "epoch": 1.3992537313432836, + "grad_norm": 1.091045630125453, + "learning_rate": 6.439407582606907e-06, + "loss": 0.1022, + "step": 3000 + }, + { + "epoch": 1.3992537313432836, + "eval_loss": 0.1596611738204956, + "eval_runtime": 321.8346, + "eval_samples_per_second": 47.372, + "eval_steps_per_second": 5.922, + "step": 3000 + }, + { + "epoch": 1.400186567164179, + "grad_norm": 1.167629978025765, + "learning_rate": 6.4342087455706215e-06, + "loss": 0.1102, + "step": 3002 + }, + { + "epoch": 1.4011194029850746, + "grad_norm": 1.0434504127192574, + "learning_rate": 6.429008218424161e-06, + "loss": 0.1037, + "step": 3004 + }, + { + "epoch": 1.4020522388059702, + "grad_norm": 0.9665951864069205, + "learning_rate": 6.423806007295972e-06, + "loss": 0.0992, + "step": 3006 + }, + { + "epoch": 1.4029850746268657, + "grad_norm": 1.1641206738320007, + "learning_rate": 6.418602118316476e-06, + "loss": 0.108, + "step": 3008 + }, + { + "epoch": 1.4039179104477613, + "grad_norm": 1.140916583792148, + "learning_rate": 6.413396557618078e-06, + "loss": 0.1059, + "step": 3010 + }, + { + "epoch": 1.4048507462686568, + "grad_norm": 1.2090330859544889, + "learning_rate": 6.408189331335151e-06, + "loss": 0.1073, + "step": 3012 + }, + { + "epoch": 1.4057835820895521, + "grad_norm": 0.9926869496974697, + "learning_rate": 6.402980445604028e-06, + "loss": 0.0986, + "step": 3014 + }, + { + "epoch": 1.4067164179104479, + "grad_norm": 1.2452033859861542, + "learning_rate": 6.397769906563003e-06, + "loss": 0.1122, + "step": 3016 + }, + { + "epoch": 1.4076492537313432, + "grad_norm": 1.0659859150777007, + "learning_rate": 6.3925577203523136e-06, + "loss": 0.0957, + "step": 3018 + }, + { + "epoch": 1.4085820895522387, + "grad_norm": 1.156239094073771, + "learning_rate": 6.38734389311414e-06, + "loss": 0.0949, + "step": 3020 + }, + { + "epoch": 1.4095149253731343, + "grad_norm": 1.3945291971907214, + "learning_rate": 6.382128430992599e-06, + "loss": 0.1089, + "step": 3022 + }, + { + "epoch": 1.4104477611940298, + "grad_norm": 0.924967408265427, + "learning_rate": 6.376911340133729e-06, + "loss": 0.1007, + "step": 3024 + }, + { + "epoch": 1.4113805970149254, + "grad_norm": 1.1830645716989097, + "learning_rate": 6.371692626685491e-06, + "loss": 0.1001, + "step": 3026 + }, + { + "epoch": 1.412313432835821, + "grad_norm": 1.1160910803912631, + "learning_rate": 6.366472296797758e-06, + "loss": 0.0969, + "step": 3028 + }, + { + "epoch": 1.4132462686567164, + "grad_norm": 1.0376037844861716, + "learning_rate": 6.361250356622306e-06, + "loss": 0.0944, + "step": 3030 + }, + { + "epoch": 1.414179104477612, + "grad_norm": 1.2014647340273814, + "learning_rate": 6.3560268123128085e-06, + "loss": 0.1082, + "step": 3032 + }, + { + "epoch": 1.4151119402985075, + "grad_norm": 1.0873051273313719, + "learning_rate": 6.350801670024836e-06, + "loss": 0.1038, + "step": 3034 + }, + { + "epoch": 1.416044776119403, + "grad_norm": 1.0862240190195362, + "learning_rate": 6.34557493591583e-06, + "loss": 0.1103, + "step": 3036 + }, + { + "epoch": 1.4169776119402986, + "grad_norm": 1.0072115852608963, + "learning_rate": 6.34034661614512e-06, + "loss": 0.0973, + "step": 3038 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 1.2026295177525366, + "learning_rate": 6.335116716873895e-06, + "loss": 0.0974, + "step": 3040 + }, + { + "epoch": 1.4188432835820897, + "grad_norm": 1.1104122744891574, + "learning_rate": 6.3298852442652115e-06, + "loss": 0.1052, + "step": 3042 + }, + { + "epoch": 1.419776119402985, + "grad_norm": 1.1996186070036516, + "learning_rate": 6.3246522044839764e-06, + "loss": 0.1118, + "step": 3044 + }, + { + "epoch": 1.4207089552238805, + "grad_norm": 1.0113422644591767, + "learning_rate": 6.319417603696944e-06, + "loss": 0.0983, + "step": 3046 + }, + { + "epoch": 1.421641791044776, + "grad_norm": 1.0623284572580665, + "learning_rate": 6.314181448072711e-06, + "loss": 0.1056, + "step": 3048 + }, + { + "epoch": 1.4225746268656716, + "grad_norm": 1.1536987469950621, + "learning_rate": 6.308943743781703e-06, + "loss": 0.1068, + "step": 3050 + }, + { + "epoch": 1.4235074626865671, + "grad_norm": 1.0826482438350977, + "learning_rate": 6.303704496996168e-06, + "loss": 0.0978, + "step": 3052 + }, + { + "epoch": 1.4244402985074627, + "grad_norm": 1.1627640702871243, + "learning_rate": 6.2984637138901815e-06, + "loss": 0.1013, + "step": 3054 + }, + { + "epoch": 1.4253731343283582, + "grad_norm": 1.021338061669078, + "learning_rate": 6.29322140063962e-06, + "loss": 0.0941, + "step": 3056 + }, + { + "epoch": 1.4263059701492538, + "grad_norm": 1.117057545089706, + "learning_rate": 6.287977563422165e-06, + "loss": 0.1098, + "step": 3058 + }, + { + "epoch": 1.4272388059701493, + "grad_norm": 1.0784406129417174, + "learning_rate": 6.282732208417298e-06, + "loss": 0.0954, + "step": 3060 + }, + { + "epoch": 1.4281716417910448, + "grad_norm": 1.2570229258811754, + "learning_rate": 6.277485341806286e-06, + "loss": 0.1186, + "step": 3062 + }, + { + "epoch": 1.4291044776119404, + "grad_norm": 0.8979995868785656, + "learning_rate": 6.272236969772178e-06, + "loss": 0.0858, + "step": 3064 + }, + { + "epoch": 1.4300373134328357, + "grad_norm": 1.0380953904580874, + "learning_rate": 6.266987098499795e-06, + "loss": 0.0969, + "step": 3066 + }, + { + "epoch": 1.4309701492537314, + "grad_norm": 1.0570972769273697, + "learning_rate": 6.261735734175729e-06, + "loss": 0.0968, + "step": 3068 + }, + { + "epoch": 1.4319029850746268, + "grad_norm": 1.2057515763740294, + "learning_rate": 6.256482882988326e-06, + "loss": 0.1078, + "step": 3070 + }, + { + "epoch": 1.4328358208955223, + "grad_norm": 1.0861212408518004, + "learning_rate": 6.2512285511276905e-06, + "loss": 0.1149, + "step": 3072 + }, + { + "epoch": 1.4337686567164178, + "grad_norm": 1.0315469642547823, + "learning_rate": 6.2459727447856665e-06, + "loss": 0.1026, + "step": 3074 + }, + { + "epoch": 1.4347014925373134, + "grad_norm": 0.9997333106764426, + "learning_rate": 6.2407154701558395e-06, + "loss": 0.0904, + "step": 3076 + }, + { + "epoch": 1.435634328358209, + "grad_norm": 1.0774685188886144, + "learning_rate": 6.235456733433519e-06, + "loss": 0.1055, + "step": 3078 + }, + { + "epoch": 1.4365671641791045, + "grad_norm": 1.1451288509748911, + "learning_rate": 6.230196540815748e-06, + "loss": 0.0992, + "step": 3080 + }, + { + "epoch": 1.4375, + "grad_norm": 1.111035491471034, + "learning_rate": 6.224934898501274e-06, + "loss": 0.0914, + "step": 3082 + }, + { + "epoch": 1.4384328358208955, + "grad_norm": 1.0707168760494123, + "learning_rate": 6.219671812690559e-06, + "loss": 0.1018, + "step": 3084 + }, + { + "epoch": 1.439365671641791, + "grad_norm": 1.1819138824851976, + "learning_rate": 6.214407289585766e-06, + "loss": 0.1055, + "step": 3086 + }, + { + "epoch": 1.4402985074626866, + "grad_norm": 1.1181879319957275, + "learning_rate": 6.209141335390752e-06, + "loss": 0.0987, + "step": 3088 + }, + { + "epoch": 1.4412313432835822, + "grad_norm": 1.1834721372704728, + "learning_rate": 6.203873956311055e-06, + "loss": 0.1104, + "step": 3090 + }, + { + "epoch": 1.4421641791044777, + "grad_norm": 1.2329644084025368, + "learning_rate": 6.1986051585539e-06, + "loss": 0.1168, + "step": 3092 + }, + { + "epoch": 1.4430970149253732, + "grad_norm": 1.0996131554547866, + "learning_rate": 6.193334948328178e-06, + "loss": 0.1051, + "step": 3094 + }, + { + "epoch": 1.4440298507462686, + "grad_norm": 1.0958634987738785, + "learning_rate": 6.188063331844447e-06, + "loss": 0.1038, + "step": 3096 + }, + { + "epoch": 1.4449626865671643, + "grad_norm": 1.0963858523235155, + "learning_rate": 6.182790315314922e-06, + "loss": 0.1118, + "step": 3098 + }, + { + "epoch": 1.4458955223880596, + "grad_norm": 1.0722540459615448, + "learning_rate": 6.1775159049534675e-06, + "loss": 0.1021, + "step": 3100 + }, + { + "epoch": 1.4468283582089552, + "grad_norm": 1.1090454875456877, + "learning_rate": 6.172240106975591e-06, + "loss": 0.0976, + "step": 3102 + }, + { + "epoch": 1.4477611940298507, + "grad_norm": 1.2055469445945555, + "learning_rate": 6.1669629275984325e-06, + "loss": 0.1064, + "step": 3104 + }, + { + "epoch": 1.4486940298507462, + "grad_norm": 0.9929386352393755, + "learning_rate": 6.161684373040765e-06, + "loss": 0.1002, + "step": 3106 + }, + { + "epoch": 1.4496268656716418, + "grad_norm": 1.2056230123395077, + "learning_rate": 6.156404449522978e-06, + "loss": 0.0982, + "step": 3108 + }, + { + "epoch": 1.4505597014925373, + "grad_norm": 1.0424913535896907, + "learning_rate": 6.151123163267074e-06, + "loss": 0.0981, + "step": 3110 + }, + { + "epoch": 1.4514925373134329, + "grad_norm": 1.084678518523754, + "learning_rate": 6.145840520496666e-06, + "loss": 0.102, + "step": 3112 + }, + { + "epoch": 1.4524253731343284, + "grad_norm": 1.1979722607056424, + "learning_rate": 6.140556527436962e-06, + "loss": 0.1125, + "step": 3114 + }, + { + "epoch": 1.453358208955224, + "grad_norm": 1.0233861227717225, + "learning_rate": 6.135271190314758e-06, + "loss": 0.0984, + "step": 3116 + }, + { + "epoch": 1.4542910447761195, + "grad_norm": 1.0598274718309244, + "learning_rate": 6.12998451535844e-06, + "loss": 0.1053, + "step": 3118 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 1.2751975157364583, + "learning_rate": 6.124696508797968e-06, + "loss": 0.1208, + "step": 3120 + }, + { + "epoch": 1.4561567164179103, + "grad_norm": 1.1140232283756788, + "learning_rate": 6.1194071768648715e-06, + "loss": 0.1092, + "step": 3122 + }, + { + "epoch": 1.457089552238806, + "grad_norm": 1.1303080881263854, + "learning_rate": 6.114116525792239e-06, + "loss": 0.102, + "step": 3124 + }, + { + "epoch": 1.4580223880597014, + "grad_norm": 1.0568236281312444, + "learning_rate": 6.10882456181472e-06, + "loss": 0.0853, + "step": 3126 + }, + { + "epoch": 1.458955223880597, + "grad_norm": 1.1699321756566419, + "learning_rate": 6.1035312911685056e-06, + "loss": 0.1021, + "step": 3128 + }, + { + "epoch": 1.4598880597014925, + "grad_norm": 1.050421298590088, + "learning_rate": 6.098236720091326e-06, + "loss": 0.1086, + "step": 3130 + }, + { + "epoch": 1.460820895522388, + "grad_norm": 1.0807380921286178, + "learning_rate": 6.09294085482245e-06, + "loss": 0.0992, + "step": 3132 + }, + { + "epoch": 1.4617537313432836, + "grad_norm": 1.0076965320461435, + "learning_rate": 6.087643701602666e-06, + "loss": 0.1064, + "step": 3134 + }, + { + "epoch": 1.462686567164179, + "grad_norm": 1.1523421688557507, + "learning_rate": 6.082345266674279e-06, + "loss": 0.1021, + "step": 3136 + }, + { + "epoch": 1.4636194029850746, + "grad_norm": 1.1650279713737783, + "learning_rate": 6.0770455562811125e-06, + "loss": 0.0953, + "step": 3138 + }, + { + "epoch": 1.4645522388059702, + "grad_norm": 1.0788801296383042, + "learning_rate": 6.071744576668486e-06, + "loss": 0.0983, + "step": 3140 + }, + { + "epoch": 1.4654850746268657, + "grad_norm": 1.32353220879985, + "learning_rate": 6.066442334083214e-06, + "loss": 0.1146, + "step": 3142 + }, + { + "epoch": 1.4664179104477613, + "grad_norm": 1.1069120841967914, + "learning_rate": 6.061138834773604e-06, + "loss": 0.1064, + "step": 3144 + }, + { + "epoch": 1.4673507462686568, + "grad_norm": 1.181184951475493, + "learning_rate": 6.055834084989443e-06, + "loss": 0.1095, + "step": 3146 + }, + { + "epoch": 1.4682835820895521, + "grad_norm": 1.084969053350031, + "learning_rate": 6.050528090981989e-06, + "loss": 0.0958, + "step": 3148 + }, + { + "epoch": 1.4692164179104479, + "grad_norm": 1.0351168200214633, + "learning_rate": 6.045220859003969e-06, + "loss": 0.1047, + "step": 3150 + }, + { + "epoch": 1.4701492537313432, + "grad_norm": 1.0579711036430157, + "learning_rate": 6.039912395309568e-06, + "loss": 0.0998, + "step": 3152 + }, + { + "epoch": 1.4710820895522387, + "grad_norm": 1.0861897419858866, + "learning_rate": 6.034602706154422e-06, + "loss": 0.1013, + "step": 3154 + }, + { + "epoch": 1.4720149253731343, + "grad_norm": 1.2551293733340556, + "learning_rate": 6.029291797795614e-06, + "loss": 0.1087, + "step": 3156 + }, + { + "epoch": 1.4729477611940298, + "grad_norm": 1.1006371029081106, + "learning_rate": 6.023979676491656e-06, + "loss": 0.1118, + "step": 3158 + }, + { + "epoch": 1.4738805970149254, + "grad_norm": 1.0751643558376185, + "learning_rate": 6.0186663485025e-06, + "loss": 0.104, + "step": 3160 + }, + { + "epoch": 1.474813432835821, + "grad_norm": 0.9540738113065265, + "learning_rate": 6.01335182008951e-06, + "loss": 0.1002, + "step": 3162 + }, + { + "epoch": 1.4757462686567164, + "grad_norm": 1.0918722694265637, + "learning_rate": 6.008036097515475e-06, + "loss": 0.0991, + "step": 3164 + }, + { + "epoch": 1.476679104477612, + "grad_norm": 1.044335258752036, + "learning_rate": 6.00271918704458e-06, + "loss": 0.1067, + "step": 3166 + }, + { + "epoch": 1.4776119402985075, + "grad_norm": 1.0930920800274009, + "learning_rate": 5.997401094942417e-06, + "loss": 0.1024, + "step": 3168 + }, + { + "epoch": 1.478544776119403, + "grad_norm": 1.2264099772494796, + "learning_rate": 5.992081827475971e-06, + "loss": 0.0995, + "step": 3170 + }, + { + "epoch": 1.4794776119402986, + "grad_norm": 0.998036190610848, + "learning_rate": 5.986761390913609e-06, + "loss": 0.1044, + "step": 3172 + }, + { + "epoch": 1.480410447761194, + "grad_norm": 1.0852227182318062, + "learning_rate": 5.981439791525073e-06, + "loss": 0.1028, + "step": 3174 + }, + { + "epoch": 1.4813432835820897, + "grad_norm": 1.2220006546674524, + "learning_rate": 5.976117035581483e-06, + "loss": 0.1089, + "step": 3176 + }, + { + "epoch": 1.482276119402985, + "grad_norm": 1.0873966123567707, + "learning_rate": 5.970793129355318e-06, + "loss": 0.1259, + "step": 3178 + }, + { + "epoch": 1.4832089552238805, + "grad_norm": 1.1664256253594125, + "learning_rate": 5.96546807912041e-06, + "loss": 0.1028, + "step": 3180 + }, + { + "epoch": 1.484141791044776, + "grad_norm": 1.2629487749264883, + "learning_rate": 5.960141891151943e-06, + "loss": 0.1131, + "step": 3182 + }, + { + "epoch": 1.4850746268656716, + "grad_norm": 1.2184221517476623, + "learning_rate": 5.954814571726438e-06, + "loss": 0.1063, + "step": 3184 + }, + { + "epoch": 1.4860074626865671, + "grad_norm": 1.0362946938412179, + "learning_rate": 5.949486127121754e-06, + "loss": 0.1007, + "step": 3186 + }, + { + "epoch": 1.4869402985074627, + "grad_norm": 1.1232622115653055, + "learning_rate": 5.944156563617073e-06, + "loss": 0.1141, + "step": 3188 + }, + { + "epoch": 1.4878731343283582, + "grad_norm": 1.0805062286617, + "learning_rate": 5.938825887492895e-06, + "loss": 0.1025, + "step": 3190 + }, + { + "epoch": 1.4888059701492538, + "grad_norm": 1.011890034243429, + "learning_rate": 5.933494105031032e-06, + "loss": 0.1017, + "step": 3192 + }, + { + "epoch": 1.4897388059701493, + "grad_norm": 1.0717306271050195, + "learning_rate": 5.928161222514601e-06, + "loss": 0.1019, + "step": 3194 + }, + { + "epoch": 1.4906716417910448, + "grad_norm": 1.0027699542798725, + "learning_rate": 5.9228272462280156e-06, + "loss": 0.0884, + "step": 3196 + }, + { + "epoch": 1.4916044776119404, + "grad_norm": 1.062085236264922, + "learning_rate": 5.917492182456975e-06, + "loss": 0.1003, + "step": 3198 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 1.1869465393510803, + "learning_rate": 5.91215603748846e-06, + "loss": 0.0983, + "step": 3200 + }, + { + "epoch": 1.4934701492537314, + "grad_norm": 1.3363242185288464, + "learning_rate": 5.906818817610731e-06, + "loss": 0.1119, + "step": 3202 + }, + { + "epoch": 1.4944029850746268, + "grad_norm": 1.0686221045699777, + "learning_rate": 5.90148052911331e-06, + "loss": 0.0951, + "step": 3204 + }, + { + "epoch": 1.4953358208955223, + "grad_norm": 1.1884349749059908, + "learning_rate": 5.896141178286979e-06, + "loss": 0.1014, + "step": 3206 + }, + { + "epoch": 1.4962686567164178, + "grad_norm": 1.1669360727986386, + "learning_rate": 5.890800771423775e-06, + "loss": 0.1064, + "step": 3208 + }, + { + "epoch": 1.4972014925373134, + "grad_norm": 1.2576434549103657, + "learning_rate": 5.8854593148169745e-06, + "loss": 0.1143, + "step": 3210 + }, + { + "epoch": 1.498134328358209, + "grad_norm": 1.2180390148379803, + "learning_rate": 5.8801168147610956e-06, + "loss": 0.1006, + "step": 3212 + }, + { + "epoch": 1.4990671641791045, + "grad_norm": 1.1235632170576928, + "learning_rate": 5.874773277551883e-06, + "loss": 0.1015, + "step": 3214 + }, + { + "epoch": 1.5, + "grad_norm": 1.0484635859530513, + "learning_rate": 5.869428709486304e-06, + "loss": 0.0988, + "step": 3216 + }, + { + "epoch": 1.5009328358208955, + "grad_norm": 1.137790156251054, + "learning_rate": 5.864083116862544e-06, + "loss": 0.1062, + "step": 3218 + }, + { + "epoch": 1.501865671641791, + "grad_norm": 1.2017926673666146, + "learning_rate": 5.858736505979989e-06, + "loss": 0.1123, + "step": 3220 + }, + { + "epoch": 1.5027985074626866, + "grad_norm": 1.0922020832754598, + "learning_rate": 5.853388883139235e-06, + "loss": 0.0881, + "step": 3222 + }, + { + "epoch": 1.5037313432835822, + "grad_norm": 1.1279459607000017, + "learning_rate": 5.84804025464206e-06, + "loss": 0.0969, + "step": 3224 + }, + { + "epoch": 1.5046641791044775, + "grad_norm": 1.2073940441348827, + "learning_rate": 5.842690626791433e-06, + "loss": 0.0992, + "step": 3226 + }, + { + "epoch": 1.5055970149253732, + "grad_norm": 1.1226079923533225, + "learning_rate": 5.837340005891499e-06, + "loss": 0.0984, + "step": 3228 + }, + { + "epoch": 1.5065298507462686, + "grad_norm": 1.11157292850359, + "learning_rate": 5.831988398247576e-06, + "loss": 0.1052, + "step": 3230 + }, + { + "epoch": 1.5074626865671643, + "grad_norm": 1.1794443229952236, + "learning_rate": 5.8266358101661365e-06, + "loss": 0.1152, + "step": 3232 + }, + { + "epoch": 1.5083955223880596, + "grad_norm": 1.139563854183085, + "learning_rate": 5.8212822479548214e-06, + "loss": 0.0998, + "step": 3234 + }, + { + "epoch": 1.5093283582089554, + "grad_norm": 1.0415626306406014, + "learning_rate": 5.815927717922408e-06, + "loss": 0.1101, + "step": 3236 + }, + { + "epoch": 1.5102611940298507, + "grad_norm": 1.0083882017568637, + "learning_rate": 5.810572226378821e-06, + "loss": 0.1106, + "step": 3238 + }, + { + "epoch": 1.5111940298507462, + "grad_norm": 1.1104204064017498, + "learning_rate": 5.8052157796351134e-06, + "loss": 0.1026, + "step": 3240 + }, + { + "epoch": 1.5121268656716418, + "grad_norm": 1.1481507362693006, + "learning_rate": 5.799858384003469e-06, + "loss": 0.1115, + "step": 3242 + }, + { + "epoch": 1.5130597014925373, + "grad_norm": 0.9662005312457365, + "learning_rate": 5.7945000457971844e-06, + "loss": 0.0957, + "step": 3244 + }, + { + "epoch": 1.5139925373134329, + "grad_norm": 1.1052757084273848, + "learning_rate": 5.789140771330669e-06, + "loss": 0.1011, + "step": 3246 + }, + { + "epoch": 1.5149253731343284, + "grad_norm": 1.1491614020411802, + "learning_rate": 5.7837805669194395e-06, + "loss": 0.1091, + "step": 3248 + }, + { + "epoch": 1.515858208955224, + "grad_norm": 1.117043878180387, + "learning_rate": 5.778419438880103e-06, + "loss": 0.0983, + "step": 3250 + }, + { + "epoch": 1.5167910447761193, + "grad_norm": 1.0781656608943595, + "learning_rate": 5.773057393530355e-06, + "loss": 0.1053, + "step": 3252 + }, + { + "epoch": 1.517723880597015, + "grad_norm": 1.1065480630337796, + "learning_rate": 5.767694437188976e-06, + "loss": 0.1059, + "step": 3254 + }, + { + "epoch": 1.5186567164179103, + "grad_norm": 1.0864972792796228, + "learning_rate": 5.762330576175821e-06, + "loss": 0.1017, + "step": 3256 + }, + { + "epoch": 1.519589552238806, + "grad_norm": 1.268340624743877, + "learning_rate": 5.756965816811801e-06, + "loss": 0.1055, + "step": 3258 + }, + { + "epoch": 1.5205223880597014, + "grad_norm": 1.1517888164790135, + "learning_rate": 5.7516001654189e-06, + "loss": 0.0993, + "step": 3260 + }, + { + "epoch": 1.5214552238805972, + "grad_norm": 1.1637222664241322, + "learning_rate": 5.746233628320142e-06, + "loss": 0.1165, + "step": 3262 + }, + { + "epoch": 1.5223880597014925, + "grad_norm": 1.0074675729873184, + "learning_rate": 5.7408662118395984e-06, + "loss": 0.092, + "step": 3264 + }, + { + "epoch": 1.523320895522388, + "grad_norm": 1.1399869102667521, + "learning_rate": 5.73549792230238e-06, + "loss": 0.1033, + "step": 3266 + }, + { + "epoch": 1.5242537313432836, + "grad_norm": 1.056324688109504, + "learning_rate": 5.730128766034621e-06, + "loss": 0.0918, + "step": 3268 + }, + { + "epoch": 1.525186567164179, + "grad_norm": 1.043679352211642, + "learning_rate": 5.7247587493634805e-06, + "loss": 0.0954, + "step": 3270 + }, + { + "epoch": 1.5261194029850746, + "grad_norm": 1.0367796118583745, + "learning_rate": 5.7193878786171305e-06, + "loss": 0.0994, + "step": 3272 + }, + { + "epoch": 1.5270522388059702, + "grad_norm": 1.1565588467797716, + "learning_rate": 5.714016160124749e-06, + "loss": 0.104, + "step": 3274 + }, + { + "epoch": 1.5279850746268657, + "grad_norm": 0.9296793416169334, + "learning_rate": 5.7086436002165165e-06, + "loss": 0.0897, + "step": 3276 + }, + { + "epoch": 1.528917910447761, + "grad_norm": 1.1012574949131864, + "learning_rate": 5.7032702052235975e-06, + "loss": 0.0971, + "step": 3278 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 1.2098175910072202, + "learning_rate": 5.69789598147815e-06, + "loss": 0.0976, + "step": 3280 + }, + { + "epoch": 1.5307835820895521, + "grad_norm": 1.1484092564782926, + "learning_rate": 5.692520935313302e-06, + "loss": 0.0984, + "step": 3282 + }, + { + "epoch": 1.5317164179104479, + "grad_norm": 1.0200699887269549, + "learning_rate": 5.687145073063149e-06, + "loss": 0.0981, + "step": 3284 + }, + { + "epoch": 1.5326492537313432, + "grad_norm": 1.0917603599612902, + "learning_rate": 5.681768401062757e-06, + "loss": 0.1004, + "step": 3286 + }, + { + "epoch": 1.533582089552239, + "grad_norm": 1.1681225973732419, + "learning_rate": 5.676390925648139e-06, + "loss": 0.1096, + "step": 3288 + }, + { + "epoch": 1.5345149253731343, + "grad_norm": 1.092452207603256, + "learning_rate": 5.671012653156255e-06, + "loss": 0.0984, + "step": 3290 + }, + { + "epoch": 1.5354477611940298, + "grad_norm": 1.07350850768635, + "learning_rate": 5.6656335899250085e-06, + "loss": 0.1049, + "step": 3292 + }, + { + "epoch": 1.5363805970149254, + "grad_norm": 1.0525563606331456, + "learning_rate": 5.66025374229323e-06, + "loss": 0.1053, + "step": 3294 + }, + { + "epoch": 1.537313432835821, + "grad_norm": 1.0615799064262887, + "learning_rate": 5.654873116600679e-06, + "loss": 0.1093, + "step": 3296 + }, + { + "epoch": 1.5382462686567164, + "grad_norm": 1.0535580579374406, + "learning_rate": 5.649491719188029e-06, + "loss": 0.1033, + "step": 3298 + }, + { + "epoch": 1.539179104477612, + "grad_norm": 1.0074381294048076, + "learning_rate": 5.644109556396861e-06, + "loss": 0.0981, + "step": 3300 + }, + { + "epoch": 1.5401119402985075, + "grad_norm": 0.9747654247984665, + "learning_rate": 5.638726634569664e-06, + "loss": 0.1035, + "step": 3302 + }, + { + "epoch": 1.5410447761194028, + "grad_norm": 1.1555841433941612, + "learning_rate": 5.633342960049816e-06, + "loss": 0.1022, + "step": 3304 + }, + { + "epoch": 1.5419776119402986, + "grad_norm": 1.1248105887886655, + "learning_rate": 5.627958539181584e-06, + "loss": 0.1134, + "step": 3306 + }, + { + "epoch": 1.542910447761194, + "grad_norm": 1.149391834527609, + "learning_rate": 5.6225733783101165e-06, + "loss": 0.1004, + "step": 3308 + }, + { + "epoch": 1.5438432835820897, + "grad_norm": 1.1425752238571576, + "learning_rate": 5.6171874837814275e-06, + "loss": 0.1092, + "step": 3310 + }, + { + "epoch": 1.544776119402985, + "grad_norm": 1.0480678051107188, + "learning_rate": 5.611800861942404e-06, + "loss": 0.0949, + "step": 3312 + }, + { + "epoch": 1.5457089552238807, + "grad_norm": 0.9883015763089346, + "learning_rate": 5.606413519140784e-06, + "loss": 0.1021, + "step": 3314 + }, + { + "epoch": 1.546641791044776, + "grad_norm": 1.1055000788459297, + "learning_rate": 5.6010254617251595e-06, + "loss": 0.1009, + "step": 3316 + }, + { + "epoch": 1.5475746268656716, + "grad_norm": 1.2151562055781369, + "learning_rate": 5.595636696044959e-06, + "loss": 0.1095, + "step": 3318 + }, + { + "epoch": 1.5485074626865671, + "grad_norm": 1.141262195565456, + "learning_rate": 5.590247228450451e-06, + "loss": 0.1051, + "step": 3320 + }, + { + "epoch": 1.5494402985074627, + "grad_norm": 1.1707184818521053, + "learning_rate": 5.5848570652927304e-06, + "loss": 0.1038, + "step": 3322 + }, + { + "epoch": 1.5503731343283582, + "grad_norm": 1.1289310804178943, + "learning_rate": 5.579466212923708e-06, + "loss": 0.1074, + "step": 3324 + }, + { + "epoch": 1.5513059701492538, + "grad_norm": 1.1064526845112488, + "learning_rate": 5.574074677696109e-06, + "loss": 0.1103, + "step": 3326 + }, + { + "epoch": 1.5522388059701493, + "grad_norm": 1.1525760613403542, + "learning_rate": 5.568682465963466e-06, + "loss": 0.1085, + "step": 3328 + }, + { + "epoch": 1.5531716417910446, + "grad_norm": 1.022601642611851, + "learning_rate": 5.563289584080105e-06, + "loss": 0.1001, + "step": 3330 + }, + { + "epoch": 1.5541044776119404, + "grad_norm": 1.0444993190614293, + "learning_rate": 5.557896038401143e-06, + "loss": 0.1108, + "step": 3332 + }, + { + "epoch": 1.5550373134328357, + "grad_norm": 0.9707826476325895, + "learning_rate": 5.55250183528248e-06, + "loss": 0.1008, + "step": 3334 + }, + { + "epoch": 1.5559701492537314, + "grad_norm": 1.0644857641645487, + "learning_rate": 5.547106981080789e-06, + "loss": 0.1106, + "step": 3336 + }, + { + "epoch": 1.5569029850746268, + "grad_norm": 1.192244889603234, + "learning_rate": 5.541711482153513e-06, + "loss": 0.1007, + "step": 3338 + }, + { + "epoch": 1.5578358208955225, + "grad_norm": 1.043812796454789, + "learning_rate": 5.53631534485885e-06, + "loss": 0.0974, + "step": 3340 + }, + { + "epoch": 1.5587686567164178, + "grad_norm": 1.045026903729856, + "learning_rate": 5.530918575555757e-06, + "loss": 0.1036, + "step": 3342 + }, + { + "epoch": 1.5597014925373134, + "grad_norm": 1.1390461471821403, + "learning_rate": 5.525521180603931e-06, + "loss": 0.1062, + "step": 3344 + }, + { + "epoch": 1.560634328358209, + "grad_norm": 1.0997737486755967, + "learning_rate": 5.520123166363807e-06, + "loss": 0.1084, + "step": 3346 + }, + { + "epoch": 1.5615671641791045, + "grad_norm": 1.2744198144447323, + "learning_rate": 5.514724539196549e-06, + "loss": 0.1084, + "step": 3348 + }, + { + "epoch": 1.5625, + "grad_norm": 1.0003129851012083, + "learning_rate": 5.5093253054640476e-06, + "loss": 0.0908, + "step": 3350 + }, + { + "epoch": 1.5634328358208955, + "grad_norm": 1.0235800290570736, + "learning_rate": 5.503925471528901e-06, + "loss": 0.0977, + "step": 3352 + }, + { + "epoch": 1.564365671641791, + "grad_norm": 1.049002776986393, + "learning_rate": 5.498525043754422e-06, + "loss": 0.0978, + "step": 3354 + }, + { + "epoch": 1.5652985074626866, + "grad_norm": 0.9128697744784077, + "learning_rate": 5.493124028504619e-06, + "loss": 0.0881, + "step": 3356 + }, + { + "epoch": 1.5662313432835822, + "grad_norm": 1.167538735990302, + "learning_rate": 5.487722432144194e-06, + "loss": 0.114, + "step": 3358 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.9955146544597703, + "learning_rate": 5.482320261038533e-06, + "loss": 0.1005, + "step": 3360 + }, + { + "epoch": 1.5680970149253732, + "grad_norm": 1.1289787193151126, + "learning_rate": 5.4769175215537e-06, + "loss": 0.1119, + "step": 3362 + }, + { + "epoch": 1.5690298507462686, + "grad_norm": 1.0188726391534335, + "learning_rate": 5.471514220056427e-06, + "loss": 0.0982, + "step": 3364 + }, + { + "epoch": 1.5699626865671643, + "grad_norm": 1.005685925577855, + "learning_rate": 5.466110362914113e-06, + "loss": 0.087, + "step": 3366 + }, + { + "epoch": 1.5708955223880596, + "grad_norm": 1.1018413806786018, + "learning_rate": 5.460705956494807e-06, + "loss": 0.1007, + "step": 3368 + }, + { + "epoch": 1.5718283582089554, + "grad_norm": 0.924044230062411, + "learning_rate": 5.455301007167206e-06, + "loss": 0.0842, + "step": 3370 + }, + { + "epoch": 1.5727611940298507, + "grad_norm": 1.1158962490512863, + "learning_rate": 5.4498955213006495e-06, + "loss": 0.1061, + "step": 3372 + }, + { + "epoch": 1.5736940298507462, + "grad_norm": 1.1702039603986962, + "learning_rate": 5.444489505265107e-06, + "loss": 0.102, + "step": 3374 + }, + { + "epoch": 1.5746268656716418, + "grad_norm": 1.1595261199874245, + "learning_rate": 5.439082965431172e-06, + "loss": 0.0992, + "step": 3376 + }, + { + "epoch": 1.5755597014925373, + "grad_norm": 1.0477115551122573, + "learning_rate": 5.433675908170057e-06, + "loss": 0.0916, + "step": 3378 + }, + { + "epoch": 1.5764925373134329, + "grad_norm": 1.2276392020633364, + "learning_rate": 5.428268339853585e-06, + "loss": 0.1141, + "step": 3380 + }, + { + "epoch": 1.5774253731343284, + "grad_norm": 1.15294443230766, + "learning_rate": 5.422860266854178e-06, + "loss": 0.1128, + "step": 3382 + }, + { + "epoch": 1.578358208955224, + "grad_norm": 1.049236986120805, + "learning_rate": 5.4174516955448565e-06, + "loss": 0.0906, + "step": 3384 + }, + { + "epoch": 1.5792910447761193, + "grad_norm": 1.0291531356928958, + "learning_rate": 5.412042632299227e-06, + "loss": 0.1003, + "step": 3386 + }, + { + "epoch": 1.580223880597015, + "grad_norm": 1.0130861660463677, + "learning_rate": 5.406633083491473e-06, + "loss": 0.0991, + "step": 3388 + }, + { + "epoch": 1.5811567164179103, + "grad_norm": 1.1448712948800965, + "learning_rate": 5.401223055496351e-06, + "loss": 0.1034, + "step": 3390 + }, + { + "epoch": 1.582089552238806, + "grad_norm": 1.0408097479050962, + "learning_rate": 5.395812554689188e-06, + "loss": 0.0892, + "step": 3392 + }, + { + "epoch": 1.5830223880597014, + "grad_norm": 0.956142775119575, + "learning_rate": 5.390401587445861e-06, + "loss": 0.0855, + "step": 3394 + }, + { + "epoch": 1.5839552238805972, + "grad_norm": 1.1077760930238951, + "learning_rate": 5.3849901601428004e-06, + "loss": 0.1005, + "step": 3396 + }, + { + "epoch": 1.5848880597014925, + "grad_norm": 1.0197330666390934, + "learning_rate": 5.379578279156976e-06, + "loss": 0.0964, + "step": 3398 + }, + { + "epoch": 1.585820895522388, + "grad_norm": 1.258298892927742, + "learning_rate": 5.374165950865897e-06, + "loss": 0.1066, + "step": 3400 + }, + { + "epoch": 1.5867537313432836, + "grad_norm": 1.2488739250729246, + "learning_rate": 5.368753181647594e-06, + "loss": 0.1193, + "step": 3402 + }, + { + "epoch": 1.587686567164179, + "grad_norm": 1.168478047666357, + "learning_rate": 5.363339977880619e-06, + "loss": 0.0902, + "step": 3404 + }, + { + "epoch": 1.5886194029850746, + "grad_norm": 1.0629366502594004, + "learning_rate": 5.357926345944041e-06, + "loss": 0.1089, + "step": 3406 + }, + { + "epoch": 1.5895522388059702, + "grad_norm": 1.0796016027378421, + "learning_rate": 5.352512292217427e-06, + "loss": 0.0872, + "step": 3408 + }, + { + "epoch": 1.5904850746268657, + "grad_norm": 1.1211043140083328, + "learning_rate": 5.347097823080842e-06, + "loss": 0.1019, + "step": 3410 + }, + { + "epoch": 1.591417910447761, + "grad_norm": 1.0494066254235275, + "learning_rate": 5.341682944914846e-06, + "loss": 0.0857, + "step": 3412 + }, + { + "epoch": 1.5923507462686568, + "grad_norm": 1.1468631793377873, + "learning_rate": 5.3362676641004755e-06, + "loss": 0.1036, + "step": 3414 + }, + { + "epoch": 1.5932835820895521, + "grad_norm": 1.15286795423784, + "learning_rate": 5.33085198701924e-06, + "loss": 0.1128, + "step": 3416 + }, + { + "epoch": 1.5942164179104479, + "grad_norm": 1.16743739856925, + "learning_rate": 5.325435920053124e-06, + "loss": 0.1097, + "step": 3418 + }, + { + "epoch": 1.5951492537313432, + "grad_norm": 1.0253152956157596, + "learning_rate": 5.320019469584562e-06, + "loss": 0.1031, + "step": 3420 + }, + { + "epoch": 1.596082089552239, + "grad_norm": 1.0452070457188003, + "learning_rate": 5.314602641996448e-06, + "loss": 0.1024, + "step": 3422 + }, + { + "epoch": 1.5970149253731343, + "grad_norm": 0.9971837975286477, + "learning_rate": 5.309185443672117e-06, + "loss": 0.0997, + "step": 3424 + }, + { + "epoch": 1.5979477611940298, + "grad_norm": 1.0506023823825037, + "learning_rate": 5.303767880995339e-06, + "loss": 0.1017, + "step": 3426 + }, + { + "epoch": 1.5988805970149254, + "grad_norm": 1.0226928003736417, + "learning_rate": 5.29834996035032e-06, + "loss": 0.0933, + "step": 3428 + }, + { + "epoch": 1.599813432835821, + "grad_norm": 1.1240088184633903, + "learning_rate": 5.29293168812168e-06, + "loss": 0.0872, + "step": 3430 + }, + { + "epoch": 1.6007462686567164, + "grad_norm": 1.0934312298167896, + "learning_rate": 5.287513070694458e-06, + "loss": 0.1021, + "step": 3432 + }, + { + "epoch": 1.601679104477612, + "grad_norm": 0.9864960877735491, + "learning_rate": 5.282094114454097e-06, + "loss": 0.0998, + "step": 3434 + }, + { + "epoch": 1.6026119402985075, + "grad_norm": 1.1116746178129506, + "learning_rate": 5.276674825786441e-06, + "loss": 0.0856, + "step": 3436 + }, + { + "epoch": 1.6035447761194028, + "grad_norm": 1.0861045520010988, + "learning_rate": 5.271255211077729e-06, + "loss": 0.1041, + "step": 3438 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 1.0302177348773862, + "learning_rate": 5.265835276714578e-06, + "loss": 0.0933, + "step": 3440 + }, + { + "epoch": 1.605410447761194, + "grad_norm": 1.0534096090894276, + "learning_rate": 5.260415029083983e-06, + "loss": 0.0909, + "step": 3442 + }, + { + "epoch": 1.6063432835820897, + "grad_norm": 1.1515857861458405, + "learning_rate": 5.254994474573314e-06, + "loss": 0.1073, + "step": 3444 + }, + { + "epoch": 1.607276119402985, + "grad_norm": 0.9893069640958156, + "learning_rate": 5.249573619570294e-06, + "loss": 0.1032, + "step": 3446 + }, + { + "epoch": 1.6082089552238807, + "grad_norm": 1.182569131687945, + "learning_rate": 5.244152470463006e-06, + "loss": 0.102, + "step": 3448 + }, + { + "epoch": 1.609141791044776, + "grad_norm": 1.1328479359944652, + "learning_rate": 5.238731033639879e-06, + "loss": 0.0936, + "step": 3450 + }, + { + "epoch": 1.6100746268656716, + "grad_norm": 1.0894456674430686, + "learning_rate": 5.233309315489679e-06, + "loss": 0.0943, + "step": 3452 + }, + { + "epoch": 1.6110074626865671, + "grad_norm": 1.1145375697778408, + "learning_rate": 5.227887322401504e-06, + "loss": 0.0897, + "step": 3454 + }, + { + "epoch": 1.6119402985074627, + "grad_norm": 0.9720405214741787, + "learning_rate": 5.222465060764778e-06, + "loss": 0.0937, + "step": 3456 + }, + { + "epoch": 1.6128731343283582, + "grad_norm": 1.0977796171823857, + "learning_rate": 5.217042536969238e-06, + "loss": 0.1063, + "step": 3458 + }, + { + "epoch": 1.6138059701492538, + "grad_norm": 1.2000981446555192, + "learning_rate": 5.211619757404933e-06, + "loss": 0.1066, + "step": 3460 + }, + { + "epoch": 1.6147388059701493, + "grad_norm": 1.0854262662012177, + "learning_rate": 5.2061967284622125e-06, + "loss": 0.087, + "step": 3462 + }, + { + "epoch": 1.6156716417910446, + "grad_norm": 1.2664030492175264, + "learning_rate": 5.200773456531721e-06, + "loss": 0.1032, + "step": 3464 + }, + { + "epoch": 1.6166044776119404, + "grad_norm": 1.0918233072376418, + "learning_rate": 5.195349948004386e-06, + "loss": 0.1109, + "step": 3466 + }, + { + "epoch": 1.6175373134328357, + "grad_norm": 1.099476486459014, + "learning_rate": 5.189926209271415e-06, + "loss": 0.1024, + "step": 3468 + }, + { + "epoch": 1.6184701492537314, + "grad_norm": 1.1647877506586748, + "learning_rate": 5.184502246724292e-06, + "loss": 0.1102, + "step": 3470 + }, + { + "epoch": 1.6194029850746268, + "grad_norm": 1.0775912321365924, + "learning_rate": 5.179078066754757e-06, + "loss": 0.0963, + "step": 3472 + }, + { + "epoch": 1.6203358208955225, + "grad_norm": 1.0958814174313056, + "learning_rate": 5.173653675754807e-06, + "loss": 0.0939, + "step": 3474 + }, + { + "epoch": 1.6212686567164178, + "grad_norm": 1.1992241323664228, + "learning_rate": 5.168229080116697e-06, + "loss": 0.1117, + "step": 3476 + }, + { + "epoch": 1.6222014925373134, + "grad_norm": 1.1865337510695115, + "learning_rate": 5.162804286232911e-06, + "loss": 0.1068, + "step": 3478 + }, + { + "epoch": 1.623134328358209, + "grad_norm": 1.2189765487255413, + "learning_rate": 5.157379300496175e-06, + "loss": 0.1043, + "step": 3480 + }, + { + "epoch": 1.6240671641791045, + "grad_norm": 1.047892442751099, + "learning_rate": 5.151954129299437e-06, + "loss": 0.1025, + "step": 3482 + }, + { + "epoch": 1.625, + "grad_norm": 1.0966949663578955, + "learning_rate": 5.146528779035864e-06, + "loss": 0.0953, + "step": 3484 + }, + { + "epoch": 1.6259328358208955, + "grad_norm": 1.0023631321949278, + "learning_rate": 5.141103256098836e-06, + "loss": 0.0958, + "step": 3486 + }, + { + "epoch": 1.626865671641791, + "grad_norm": 1.0285913271463087, + "learning_rate": 5.135677566881935e-06, + "loss": 0.1007, + "step": 3488 + }, + { + "epoch": 1.6277985074626866, + "grad_norm": 1.15379657912386, + "learning_rate": 5.130251717778939e-06, + "loss": 0.1009, + "step": 3490 + }, + { + "epoch": 1.6287313432835822, + "grad_norm": 1.1407035220321515, + "learning_rate": 5.1248257151838145e-06, + "loss": 0.0983, + "step": 3492 + }, + { + "epoch": 1.6296641791044775, + "grad_norm": 1.0209284136928085, + "learning_rate": 5.119399565490707e-06, + "loss": 0.0897, + "step": 3494 + }, + { + "epoch": 1.6305970149253732, + "grad_norm": 1.34014449868634, + "learning_rate": 5.113973275093942e-06, + "loss": 0.1121, + "step": 3496 + }, + { + "epoch": 1.6315298507462686, + "grad_norm": 1.1291582885169507, + "learning_rate": 5.108546850388002e-06, + "loss": 0.1027, + "step": 3498 + }, + { + "epoch": 1.6324626865671643, + "grad_norm": 1.1120430135012636, + "learning_rate": 5.103120297767532e-06, + "loss": 0.1031, + "step": 3500 + }, + { + "epoch": 1.6324626865671643, + "eval_loss": 0.1550646424293518, + "eval_runtime": 323.0994, + "eval_samples_per_second": 47.187, + "eval_steps_per_second": 5.899, + "step": 3500 + }, + { + "epoch": 1.6333955223880596, + "grad_norm": 1.224966669924812, + "learning_rate": 5.09769362362733e-06, + "loss": 0.1048, + "step": 3502 + }, + { + "epoch": 1.6343283582089554, + "grad_norm": 1.1310448493746208, + "learning_rate": 5.092266834362334e-06, + "loss": 0.096, + "step": 3504 + }, + { + "epoch": 1.6352611940298507, + "grad_norm": 1.1448619844530687, + "learning_rate": 5.086839936367617e-06, + "loss": 0.1043, + "step": 3506 + }, + { + "epoch": 1.6361940298507462, + "grad_norm": 1.198784693852069, + "learning_rate": 5.081412936038384e-06, + "loss": 0.1002, + "step": 3508 + }, + { + "epoch": 1.6371268656716418, + "grad_norm": 1.0888414013644547, + "learning_rate": 5.075985839769955e-06, + "loss": 0.1031, + "step": 3510 + }, + { + "epoch": 1.6380597014925373, + "grad_norm": 0.9863609356730051, + "learning_rate": 5.070558653957769e-06, + "loss": 0.0842, + "step": 3512 + }, + { + "epoch": 1.6389925373134329, + "grad_norm": 1.043429135253945, + "learning_rate": 5.065131384997367e-06, + "loss": 0.0972, + "step": 3514 + }, + { + "epoch": 1.6399253731343284, + "grad_norm": 1.1815139017622, + "learning_rate": 5.059704039284388e-06, + "loss": 0.1047, + "step": 3516 + }, + { + "epoch": 1.640858208955224, + "grad_norm": 1.0360845535742687, + "learning_rate": 5.054276623214563e-06, + "loss": 0.1016, + "step": 3518 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 1.0442361030967533, + "learning_rate": 5.048849143183705e-06, + "loss": 0.0904, + "step": 3520 + }, + { + "epoch": 1.642723880597015, + "grad_norm": 1.1365748005301108, + "learning_rate": 5.043421605587703e-06, + "loss": 0.0972, + "step": 3522 + }, + { + "epoch": 1.6436567164179103, + "grad_norm": 0.9719007649517358, + "learning_rate": 5.037994016822512e-06, + "loss": 0.0847, + "step": 3524 + }, + { + "epoch": 1.644589552238806, + "grad_norm": 1.1364057839703934, + "learning_rate": 5.032566383284149e-06, + "loss": 0.1034, + "step": 3526 + }, + { + "epoch": 1.6455223880597014, + "grad_norm": 1.0413573510632972, + "learning_rate": 5.027138711368684e-06, + "loss": 0.101, + "step": 3528 + }, + { + "epoch": 1.6464552238805972, + "grad_norm": 1.0427605352714602, + "learning_rate": 5.021711007472233e-06, + "loss": 0.095, + "step": 3530 + }, + { + "epoch": 1.6473880597014925, + "grad_norm": 1.108928417323732, + "learning_rate": 5.0162832779909455e-06, + "loss": 0.1074, + "step": 3532 + }, + { + "epoch": 1.648320895522388, + "grad_norm": 1.0270133226481224, + "learning_rate": 5.010855529321005e-06, + "loss": 0.097, + "step": 3534 + }, + { + "epoch": 1.6492537313432836, + "grad_norm": 1.0433919115523331, + "learning_rate": 5.005427767858616e-06, + "loss": 0.1045, + "step": 3536 + }, + { + "epoch": 1.650186567164179, + "grad_norm": 1.1209653834373903, + "learning_rate": 5e-06, + "loss": 0.1108, + "step": 3538 + }, + { + "epoch": 1.6511194029850746, + "grad_norm": 1.0097951219224293, + "learning_rate": 4.994572232141385e-06, + "loss": 0.0936, + "step": 3540 + }, + { + "epoch": 1.6520522388059702, + "grad_norm": 1.0897908829200658, + "learning_rate": 4.989144470678997e-06, + "loss": 0.0926, + "step": 3542 + }, + { + "epoch": 1.6529850746268657, + "grad_norm": 1.1995842264569205, + "learning_rate": 4.983716722009055e-06, + "loss": 0.092, + "step": 3544 + }, + { + "epoch": 1.653917910447761, + "grad_norm": 1.0936272317090328, + "learning_rate": 4.978288992527768e-06, + "loss": 0.0873, + "step": 3546 + }, + { + "epoch": 1.6548507462686568, + "grad_norm": 1.0817715121855342, + "learning_rate": 4.972861288631317e-06, + "loss": 0.0916, + "step": 3548 + }, + { + "epoch": 1.6557835820895521, + "grad_norm": 1.093440730722074, + "learning_rate": 4.967433616715852e-06, + "loss": 0.103, + "step": 3550 + }, + { + "epoch": 1.6567164179104479, + "grad_norm": 1.1828630308619648, + "learning_rate": 4.96200598317749e-06, + "loss": 0.1033, + "step": 3552 + }, + { + "epoch": 1.6576492537313432, + "grad_norm": 1.1365119868172056, + "learning_rate": 4.956578394412298e-06, + "loss": 0.0997, + "step": 3554 + }, + { + "epoch": 1.658582089552239, + "grad_norm": 1.0447886853566, + "learning_rate": 4.9511508568162956e-06, + "loss": 0.0896, + "step": 3556 + }, + { + "epoch": 1.6595149253731343, + "grad_norm": 1.1474184723616883, + "learning_rate": 4.945723376785438e-06, + "loss": 0.0898, + "step": 3558 + }, + { + "epoch": 1.6604477611940298, + "grad_norm": 1.0744057040945956, + "learning_rate": 4.940295960715613e-06, + "loss": 0.1005, + "step": 3560 + }, + { + "epoch": 1.6613805970149254, + "grad_norm": 1.2387998828204012, + "learning_rate": 4.934868615002636e-06, + "loss": 0.0905, + "step": 3562 + }, + { + "epoch": 1.662313432835821, + "grad_norm": 1.0716275535612976, + "learning_rate": 4.9294413460422335e-06, + "loss": 0.1025, + "step": 3564 + }, + { + "epoch": 1.6632462686567164, + "grad_norm": 1.0095439089550609, + "learning_rate": 4.924014160230045e-06, + "loss": 0.0967, + "step": 3566 + }, + { + "epoch": 1.664179104477612, + "grad_norm": 1.0765502678395185, + "learning_rate": 4.918587063961619e-06, + "loss": 0.0995, + "step": 3568 + }, + { + "epoch": 1.6651119402985075, + "grad_norm": 1.1770395394430098, + "learning_rate": 4.913160063632384e-06, + "loss": 0.0892, + "step": 3570 + }, + { + "epoch": 1.6660447761194028, + "grad_norm": 1.1159996883856258, + "learning_rate": 4.907733165637668e-06, + "loss": 0.0911, + "step": 3572 + }, + { + "epoch": 1.6669776119402986, + "grad_norm": 1.124433917985698, + "learning_rate": 4.9023063763726715e-06, + "loss": 0.1074, + "step": 3574 + }, + { + "epoch": 1.667910447761194, + "grad_norm": 1.0132124225805013, + "learning_rate": 4.896879702232468e-06, + "loss": 0.0896, + "step": 3576 + }, + { + "epoch": 1.6688432835820897, + "grad_norm": 1.0480685826509903, + "learning_rate": 4.891453149611999e-06, + "loss": 0.1047, + "step": 3578 + }, + { + "epoch": 1.669776119402985, + "grad_norm": 1.1364901285389009, + "learning_rate": 4.8860267249060596e-06, + "loss": 0.1058, + "step": 3580 + }, + { + "epoch": 1.6707089552238807, + "grad_norm": 1.1375200711907278, + "learning_rate": 4.880600434509295e-06, + "loss": 0.1058, + "step": 3582 + }, + { + "epoch": 1.671641791044776, + "grad_norm": 1.1747892292580644, + "learning_rate": 4.875174284816188e-06, + "loss": 0.0959, + "step": 3584 + }, + { + "epoch": 1.6725746268656716, + "grad_norm": 1.1273492075578033, + "learning_rate": 4.869748282221063e-06, + "loss": 0.1024, + "step": 3586 + }, + { + "epoch": 1.6735074626865671, + "grad_norm": 1.0817554210784017, + "learning_rate": 4.864322433118066e-06, + "loss": 0.097, + "step": 3588 + }, + { + "epoch": 1.6744402985074627, + "grad_norm": 1.1379715879944226, + "learning_rate": 4.858896743901165e-06, + "loss": 0.0986, + "step": 3590 + }, + { + "epoch": 1.6753731343283582, + "grad_norm": 0.9924153620454816, + "learning_rate": 4.853471220964137e-06, + "loss": 0.0836, + "step": 3592 + }, + { + "epoch": 1.6763059701492538, + "grad_norm": 1.1209912319460038, + "learning_rate": 4.8480458707005654e-06, + "loss": 0.0932, + "step": 3594 + }, + { + "epoch": 1.6772388059701493, + "grad_norm": 1.0467206766192003, + "learning_rate": 4.842620699503825e-06, + "loss": 0.0876, + "step": 3596 + }, + { + "epoch": 1.6781716417910446, + "grad_norm": 1.2005846968793807, + "learning_rate": 4.837195713767089e-06, + "loss": 0.1105, + "step": 3598 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 1.0396833765069797, + "learning_rate": 4.8317709198833056e-06, + "loss": 0.0892, + "step": 3600 + }, + { + "epoch": 1.6800373134328357, + "grad_norm": 1.1505955141214124, + "learning_rate": 4.826346324245194e-06, + "loss": 0.0968, + "step": 3602 + }, + { + "epoch": 1.6809701492537314, + "grad_norm": 1.229195385370137, + "learning_rate": 4.820921933245246e-06, + "loss": 0.0976, + "step": 3604 + }, + { + "epoch": 1.6819029850746268, + "grad_norm": 1.1901564600940944, + "learning_rate": 4.815497753275711e-06, + "loss": 0.1028, + "step": 3606 + }, + { + "epoch": 1.6828358208955225, + "grad_norm": 1.1259894388193936, + "learning_rate": 4.810073790728585e-06, + "loss": 0.1114, + "step": 3608 + }, + { + "epoch": 1.6837686567164178, + "grad_norm": 1.1101771261250724, + "learning_rate": 4.804650051995615e-06, + "loss": 0.0891, + "step": 3610 + }, + { + "epoch": 1.6847014925373134, + "grad_norm": 1.0582792259069187, + "learning_rate": 4.79922654346828e-06, + "loss": 0.0914, + "step": 3612 + }, + { + "epoch": 1.685634328358209, + "grad_norm": 1.1124880761448122, + "learning_rate": 4.793803271537788e-06, + "loss": 0.0919, + "step": 3614 + }, + { + "epoch": 1.6865671641791045, + "grad_norm": 1.1358075305748105, + "learning_rate": 4.7883802425950685e-06, + "loss": 0.1039, + "step": 3616 + }, + { + "epoch": 1.6875, + "grad_norm": 1.055685153236721, + "learning_rate": 4.782957463030763e-06, + "loss": 0.0966, + "step": 3618 + }, + { + "epoch": 1.6884328358208955, + "grad_norm": 1.2018144133814113, + "learning_rate": 4.777534939235225e-06, + "loss": 0.1052, + "step": 3620 + }, + { + "epoch": 1.689365671641791, + "grad_norm": 1.0712499743212405, + "learning_rate": 4.772112677598498e-06, + "loss": 0.0954, + "step": 3622 + }, + { + "epoch": 1.6902985074626866, + "grad_norm": 1.0524792489629278, + "learning_rate": 4.766690684510323e-06, + "loss": 0.0835, + "step": 3624 + }, + { + "epoch": 1.6912313432835822, + "grad_norm": 1.050570153381814, + "learning_rate": 4.761268966360123e-06, + "loss": 0.098, + "step": 3626 + }, + { + "epoch": 1.6921641791044775, + "grad_norm": 1.0323616470423458, + "learning_rate": 4.7558475295369945e-06, + "loss": 0.098, + "step": 3628 + }, + { + "epoch": 1.6930970149253732, + "grad_norm": 1.1051931761480092, + "learning_rate": 4.7504263804297064e-06, + "loss": 0.0996, + "step": 3630 + }, + { + "epoch": 1.6940298507462686, + "grad_norm": 1.0970026265986519, + "learning_rate": 4.745005525426688e-06, + "loss": 0.0964, + "step": 3632 + }, + { + "epoch": 1.6949626865671643, + "grad_norm": 1.1631049642066742, + "learning_rate": 4.739584970916018e-06, + "loss": 0.1112, + "step": 3634 + }, + { + "epoch": 1.6958955223880596, + "grad_norm": 1.0453504812979213, + "learning_rate": 4.734164723285424e-06, + "loss": 0.0854, + "step": 3636 + }, + { + "epoch": 1.6968283582089554, + "grad_norm": 1.2805705908477534, + "learning_rate": 4.728744788922272e-06, + "loss": 0.1026, + "step": 3638 + }, + { + "epoch": 1.6977611940298507, + "grad_norm": 1.1706183399424146, + "learning_rate": 4.723325174213559e-06, + "loss": 0.1001, + "step": 3640 + }, + { + "epoch": 1.6986940298507462, + "grad_norm": 1.0045472923324372, + "learning_rate": 4.7179058855459045e-06, + "loss": 0.0982, + "step": 3642 + }, + { + "epoch": 1.6996268656716418, + "grad_norm": 1.1097841587253405, + "learning_rate": 4.712486929305544e-06, + "loss": 0.0963, + "step": 3644 + }, + { + "epoch": 1.7005597014925373, + "grad_norm": 1.1361392651171767, + "learning_rate": 4.707068311878322e-06, + "loss": 0.1001, + "step": 3646 + }, + { + "epoch": 1.7014925373134329, + "grad_norm": 1.1215581216229769, + "learning_rate": 4.701650039649682e-06, + "loss": 0.0982, + "step": 3648 + }, + { + "epoch": 1.7024253731343284, + "grad_norm": 1.157461079632266, + "learning_rate": 4.69623211900466e-06, + "loss": 0.106, + "step": 3650 + }, + { + "epoch": 1.703358208955224, + "grad_norm": 1.0029338314369465, + "learning_rate": 4.690814556327885e-06, + "loss": 0.088, + "step": 3652 + }, + { + "epoch": 1.7042910447761193, + "grad_norm": 1.1530178820591044, + "learning_rate": 4.685397358003554e-06, + "loss": 0.1075, + "step": 3654 + }, + { + "epoch": 1.705223880597015, + "grad_norm": 1.1422854763505046, + "learning_rate": 4.6799805304154396e-06, + "loss": 0.103, + "step": 3656 + }, + { + "epoch": 1.7061567164179103, + "grad_norm": 1.2071149858502441, + "learning_rate": 4.6745640799468786e-06, + "loss": 0.1028, + "step": 3658 + }, + { + "epoch": 1.707089552238806, + "grad_norm": 1.0088615671201564, + "learning_rate": 4.669148012980761e-06, + "loss": 0.0935, + "step": 3660 + }, + { + "epoch": 1.7080223880597014, + "grad_norm": 1.1276835081509065, + "learning_rate": 4.663732335899527e-06, + "loss": 0.0919, + "step": 3662 + }, + { + "epoch": 1.7089552238805972, + "grad_norm": 1.2096235399066795, + "learning_rate": 4.658317055085154e-06, + "loss": 0.1068, + "step": 3664 + }, + { + "epoch": 1.7098880597014925, + "grad_norm": 1.1279369930636538, + "learning_rate": 4.652902176919159e-06, + "loss": 0.0935, + "step": 3666 + }, + { + "epoch": 1.710820895522388, + "grad_norm": 1.1103063243139837, + "learning_rate": 4.647487707782575e-06, + "loss": 0.1016, + "step": 3668 + }, + { + "epoch": 1.7117537313432836, + "grad_norm": 1.1597134084926009, + "learning_rate": 4.642073654055959e-06, + "loss": 0.0963, + "step": 3670 + }, + { + "epoch": 1.712686567164179, + "grad_norm": 1.1020343847839857, + "learning_rate": 4.636660022119382e-06, + "loss": 0.0965, + "step": 3672 + }, + { + "epoch": 1.7136194029850746, + "grad_norm": 1.0684766114260358, + "learning_rate": 4.631246818352408e-06, + "loss": 0.0843, + "step": 3674 + }, + { + "epoch": 1.7145522388059702, + "grad_norm": 1.0985281014336183, + "learning_rate": 4.625834049134105e-06, + "loss": 0.1061, + "step": 3676 + }, + { + "epoch": 1.7154850746268657, + "grad_norm": 1.1518640444108639, + "learning_rate": 4.620421720843025e-06, + "loss": 0.0882, + "step": 3678 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 1.2363234410164508, + "learning_rate": 4.615009839857202e-06, + "loss": 0.0967, + "step": 3680 + }, + { + "epoch": 1.7173507462686568, + "grad_norm": 1.1890607734505911, + "learning_rate": 4.60959841255414e-06, + "loss": 0.093, + "step": 3682 + }, + { + "epoch": 1.7182835820895521, + "grad_norm": 1.1660943897921863, + "learning_rate": 4.604187445310814e-06, + "loss": 0.1015, + "step": 3684 + }, + { + "epoch": 1.7192164179104479, + "grad_norm": 1.0751777453698839, + "learning_rate": 4.59877694450365e-06, + "loss": 0.0905, + "step": 3686 + }, + { + "epoch": 1.7201492537313432, + "grad_norm": 1.1920409481024197, + "learning_rate": 4.59336691650853e-06, + "loss": 0.1011, + "step": 3688 + }, + { + "epoch": 1.721082089552239, + "grad_norm": 1.175634938951435, + "learning_rate": 4.587957367700776e-06, + "loss": 0.094, + "step": 3690 + }, + { + "epoch": 1.7220149253731343, + "grad_norm": 1.3001011587599143, + "learning_rate": 4.5825483044551435e-06, + "loss": 0.1074, + "step": 3692 + }, + { + "epoch": 1.7229477611940298, + "grad_norm": 1.1559607773710021, + "learning_rate": 4.5771397331458224e-06, + "loss": 0.1085, + "step": 3694 + }, + { + "epoch": 1.7238805970149254, + "grad_norm": 1.0917943572687148, + "learning_rate": 4.571731660146416e-06, + "loss": 0.097, + "step": 3696 + }, + { + "epoch": 1.724813432835821, + "grad_norm": 1.177871791179173, + "learning_rate": 4.566324091829945e-06, + "loss": 0.0915, + "step": 3698 + }, + { + "epoch": 1.7257462686567164, + "grad_norm": 1.0381343767107474, + "learning_rate": 4.5609170345688305e-06, + "loss": 0.1003, + "step": 3700 + }, + { + "epoch": 1.726679104477612, + "grad_norm": 1.095792541674332, + "learning_rate": 4.555510494734893e-06, + "loss": 0.0914, + "step": 3702 + }, + { + "epoch": 1.7276119402985075, + "grad_norm": 1.2223452882085306, + "learning_rate": 4.550104478699351e-06, + "loss": 0.0967, + "step": 3704 + }, + { + "epoch": 1.7285447761194028, + "grad_norm": 0.976454960376214, + "learning_rate": 4.544698992832795e-06, + "loss": 0.0921, + "step": 3706 + }, + { + "epoch": 1.7294776119402986, + "grad_norm": 0.9841552149965179, + "learning_rate": 4.539294043505195e-06, + "loss": 0.0984, + "step": 3708 + }, + { + "epoch": 1.730410447761194, + "grad_norm": 1.0250549535432676, + "learning_rate": 4.533889637085888e-06, + "loss": 0.0904, + "step": 3710 + }, + { + "epoch": 1.7313432835820897, + "grad_norm": 1.0340199334224152, + "learning_rate": 4.528485779943573e-06, + "loss": 0.0917, + "step": 3712 + }, + { + "epoch": 1.732276119402985, + "grad_norm": 1.1186188527488128, + "learning_rate": 4.523082478446301e-06, + "loss": 0.1121, + "step": 3714 + }, + { + "epoch": 1.7332089552238807, + "grad_norm": 1.0236389887859945, + "learning_rate": 4.517679738961468e-06, + "loss": 0.0918, + "step": 3716 + }, + { + "epoch": 1.734141791044776, + "grad_norm": 1.1577395975820644, + "learning_rate": 4.512277567855809e-06, + "loss": 0.0954, + "step": 3718 + }, + { + "epoch": 1.7350746268656716, + "grad_norm": 1.2585587926998412, + "learning_rate": 4.506875971495383e-06, + "loss": 0.099, + "step": 3720 + }, + { + "epoch": 1.7360074626865671, + "grad_norm": 1.0662256276154634, + "learning_rate": 4.5014749562455805e-06, + "loss": 0.0899, + "step": 3722 + }, + { + "epoch": 1.7369402985074627, + "grad_norm": 1.0936922063729748, + "learning_rate": 4.4960745284711e-06, + "loss": 0.0976, + "step": 3724 + }, + { + "epoch": 1.7378731343283582, + "grad_norm": 1.2055073240106777, + "learning_rate": 4.490674694535955e-06, + "loss": 0.1114, + "step": 3726 + }, + { + "epoch": 1.7388059701492538, + "grad_norm": 1.0419708398365015, + "learning_rate": 4.485275460803452e-06, + "loss": 0.0907, + "step": 3728 + }, + { + "epoch": 1.7397388059701493, + "grad_norm": 1.0903435419954495, + "learning_rate": 4.479876833636196e-06, + "loss": 0.097, + "step": 3730 + }, + { + "epoch": 1.7406716417910446, + "grad_norm": 1.065172655996034, + "learning_rate": 4.474478819396072e-06, + "loss": 0.0954, + "step": 3732 + }, + { + "epoch": 1.7416044776119404, + "grad_norm": 1.1264507441495129, + "learning_rate": 4.469081424444243e-06, + "loss": 0.0983, + "step": 3734 + }, + { + "epoch": 1.7425373134328357, + "grad_norm": 1.0737139692551787, + "learning_rate": 4.463684655141151e-06, + "loss": 0.0969, + "step": 3736 + }, + { + "epoch": 1.7434701492537314, + "grad_norm": 1.0271970616815274, + "learning_rate": 4.45828851784649e-06, + "loss": 0.0958, + "step": 3738 + }, + { + "epoch": 1.7444029850746268, + "grad_norm": 0.9816538224728948, + "learning_rate": 4.452893018919213e-06, + "loss": 0.0902, + "step": 3740 + }, + { + "epoch": 1.7453358208955225, + "grad_norm": 1.0339703432308618, + "learning_rate": 4.447498164717522e-06, + "loss": 0.0982, + "step": 3742 + }, + { + "epoch": 1.7462686567164178, + "grad_norm": 1.1895943462090308, + "learning_rate": 4.442103961598858e-06, + "loss": 0.0974, + "step": 3744 + }, + { + "epoch": 1.7472014925373134, + "grad_norm": 1.0522698584404027, + "learning_rate": 4.436710415919896e-06, + "loss": 0.088, + "step": 3746 + }, + { + "epoch": 1.748134328358209, + "grad_norm": 1.0222227241266983, + "learning_rate": 4.431317534036535e-06, + "loss": 0.0886, + "step": 3748 + }, + { + "epoch": 1.7490671641791045, + "grad_norm": 1.0845559270059455, + "learning_rate": 4.425925322303893e-06, + "loss": 0.0871, + "step": 3750 + }, + { + "epoch": 1.75, + "grad_norm": 1.1453172318035394, + "learning_rate": 4.420533787076295e-06, + "loss": 0.0984, + "step": 3752 + }, + { + "epoch": 1.7509328358208955, + "grad_norm": 1.0478874821424775, + "learning_rate": 4.41514293470727e-06, + "loss": 0.0838, + "step": 3754 + }, + { + "epoch": 1.751865671641791, + "grad_norm": 1.0826437025539664, + "learning_rate": 4.4097527715495495e-06, + "loss": 0.0945, + "step": 3756 + }, + { + "epoch": 1.7527985074626866, + "grad_norm": 1.0851384876664898, + "learning_rate": 4.4043633039550425e-06, + "loss": 0.0983, + "step": 3758 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 1.0879014786594696, + "learning_rate": 4.398974538274843e-06, + "loss": 0.0906, + "step": 3760 + }, + { + "epoch": 1.7546641791044775, + "grad_norm": 1.1075558969803165, + "learning_rate": 4.393586480859217e-06, + "loss": 0.0935, + "step": 3762 + }, + { + "epoch": 1.7555970149253732, + "grad_norm": 1.2069947270613044, + "learning_rate": 4.388199138057599e-06, + "loss": 0.0998, + "step": 3764 + }, + { + "epoch": 1.7565298507462686, + "grad_norm": 1.0467782837242663, + "learning_rate": 4.382812516218573e-06, + "loss": 0.0877, + "step": 3766 + }, + { + "epoch": 1.7574626865671643, + "grad_norm": 1.0449747101766405, + "learning_rate": 4.377426621689885e-06, + "loss": 0.0977, + "step": 3768 + }, + { + "epoch": 1.7583955223880596, + "grad_norm": 1.128941413012485, + "learning_rate": 4.3720414608184175e-06, + "loss": 0.0998, + "step": 3770 + }, + { + "epoch": 1.7593283582089554, + "grad_norm": 1.1647396071651746, + "learning_rate": 4.366657039950186e-06, + "loss": 0.0976, + "step": 3772 + }, + { + "epoch": 1.7602611940298507, + "grad_norm": 1.0162492358020478, + "learning_rate": 4.361273365430338e-06, + "loss": 0.0988, + "step": 3774 + }, + { + "epoch": 1.7611940298507462, + "grad_norm": 1.1246697581947454, + "learning_rate": 4.355890443603139e-06, + "loss": 0.0913, + "step": 3776 + }, + { + "epoch": 1.7621268656716418, + "grad_norm": 1.0394859969226904, + "learning_rate": 4.350508280811973e-06, + "loss": 0.0947, + "step": 3778 + }, + { + "epoch": 1.7630597014925373, + "grad_norm": 1.2960785308474594, + "learning_rate": 4.345126883399323e-06, + "loss": 0.0949, + "step": 3780 + }, + { + "epoch": 1.7639925373134329, + "grad_norm": 1.3798023038263878, + "learning_rate": 4.339746257706771e-06, + "loss": 0.1075, + "step": 3782 + }, + { + "epoch": 1.7649253731343284, + "grad_norm": 1.0076708894235762, + "learning_rate": 4.334366410074995e-06, + "loss": 0.0885, + "step": 3784 + }, + { + "epoch": 1.765858208955224, + "grad_norm": 1.2526434203532881, + "learning_rate": 4.328987346843746e-06, + "loss": 0.1009, + "step": 3786 + }, + { + "epoch": 1.7667910447761193, + "grad_norm": 1.0884714717105828, + "learning_rate": 4.3236090743518635e-06, + "loss": 0.1029, + "step": 3788 + }, + { + "epoch": 1.767723880597015, + "grad_norm": 1.0296748006116867, + "learning_rate": 4.3182315989372446e-06, + "loss": 0.0943, + "step": 3790 + }, + { + "epoch": 1.7686567164179103, + "grad_norm": 1.1289222001502612, + "learning_rate": 4.312854926936852e-06, + "loss": 0.1076, + "step": 3792 + }, + { + "epoch": 1.769589552238806, + "grad_norm": 1.0725243856227535, + "learning_rate": 4.307479064686701e-06, + "loss": 0.0915, + "step": 3794 + }, + { + "epoch": 1.7705223880597014, + "grad_norm": 1.0079013994375832, + "learning_rate": 4.30210401852185e-06, + "loss": 0.0847, + "step": 3796 + }, + { + "epoch": 1.7714552238805972, + "grad_norm": 1.0014496534946675, + "learning_rate": 4.296729794776402e-06, + "loss": 0.09, + "step": 3798 + }, + { + "epoch": 1.7723880597014925, + "grad_norm": 1.2232482621343395, + "learning_rate": 4.291356399783484e-06, + "loss": 0.0905, + "step": 3800 + }, + { + "epoch": 1.773320895522388, + "grad_norm": 1.1905264283336212, + "learning_rate": 4.2859838398752515e-06, + "loss": 0.1013, + "step": 3802 + }, + { + "epoch": 1.7742537313432836, + "grad_norm": 1.0466086761704225, + "learning_rate": 4.280612121382872e-06, + "loss": 0.0871, + "step": 3804 + }, + { + "epoch": 1.775186567164179, + "grad_norm": 1.0983217922469228, + "learning_rate": 4.275241250636522e-06, + "loss": 0.0955, + "step": 3806 + }, + { + "epoch": 1.7761194029850746, + "grad_norm": 1.2015898573258283, + "learning_rate": 4.269871233965381e-06, + "loss": 0.1014, + "step": 3808 + }, + { + "epoch": 1.7770522388059702, + "grad_norm": 1.0921263552324518, + "learning_rate": 4.264502077697622e-06, + "loss": 0.0911, + "step": 3810 + }, + { + "epoch": 1.7779850746268657, + "grad_norm": 1.1822366150622516, + "learning_rate": 4.259133788160402e-06, + "loss": 0.0997, + "step": 3812 + }, + { + "epoch": 1.778917910447761, + "grad_norm": 1.1358230989160354, + "learning_rate": 4.25376637167986e-06, + "loss": 0.0932, + "step": 3814 + }, + { + "epoch": 1.7798507462686568, + "grad_norm": 1.1591007066127716, + "learning_rate": 4.248399834581103e-06, + "loss": 0.0936, + "step": 3816 + }, + { + "epoch": 1.7807835820895521, + "grad_norm": 1.1903194907889803, + "learning_rate": 4.243034183188199e-06, + "loss": 0.0873, + "step": 3818 + }, + { + "epoch": 1.7817164179104479, + "grad_norm": 1.0647104993059966, + "learning_rate": 4.2376694238241815e-06, + "loss": 0.0896, + "step": 3820 + }, + { + "epoch": 1.7826492537313432, + "grad_norm": 1.2340235768626262, + "learning_rate": 4.2323055628110245e-06, + "loss": 0.1072, + "step": 3822 + }, + { + "epoch": 1.783582089552239, + "grad_norm": 1.2193822544577344, + "learning_rate": 4.226942606469647e-06, + "loss": 0.1066, + "step": 3824 + }, + { + "epoch": 1.7845149253731343, + "grad_norm": 1.1293602901076107, + "learning_rate": 4.2215805611199e-06, + "loss": 0.1022, + "step": 3826 + }, + { + "epoch": 1.7854477611940298, + "grad_norm": 1.3256242635540962, + "learning_rate": 4.216219433080561e-06, + "loss": 0.1224, + "step": 3828 + }, + { + "epoch": 1.7863805970149254, + "grad_norm": 1.1404796972513265, + "learning_rate": 4.210859228669331e-06, + "loss": 0.098, + "step": 3830 + }, + { + "epoch": 1.787313432835821, + "grad_norm": 0.9573676891903881, + "learning_rate": 4.205499954202817e-06, + "loss": 0.0835, + "step": 3832 + }, + { + "epoch": 1.7882462686567164, + "grad_norm": 1.1540044340356037, + "learning_rate": 4.200141615996532e-06, + "loss": 0.1081, + "step": 3834 + }, + { + "epoch": 1.789179104477612, + "grad_norm": 1.2157590612321367, + "learning_rate": 4.194784220364888e-06, + "loss": 0.0883, + "step": 3836 + }, + { + "epoch": 1.7901119402985075, + "grad_norm": 1.0340845426370617, + "learning_rate": 4.189427773621179e-06, + "loss": 0.0864, + "step": 3838 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 1.1161150048576776, + "learning_rate": 4.184072282077593e-06, + "loss": 0.0966, + "step": 3840 + }, + { + "epoch": 1.7919776119402986, + "grad_norm": 1.216937865414375, + "learning_rate": 4.17871775204518e-06, + "loss": 0.1094, + "step": 3842 + }, + { + "epoch": 1.792910447761194, + "grad_norm": 1.0620932308260993, + "learning_rate": 4.173364189833864e-06, + "loss": 0.0854, + "step": 3844 + }, + { + "epoch": 1.7938432835820897, + "grad_norm": 1.1270696297336795, + "learning_rate": 4.168011601752427e-06, + "loss": 0.0928, + "step": 3846 + }, + { + "epoch": 1.794776119402985, + "grad_norm": 1.2350514563389072, + "learning_rate": 4.162659994108502e-06, + "loss": 0.1011, + "step": 3848 + }, + { + "epoch": 1.7957089552238807, + "grad_norm": 1.101956567541425, + "learning_rate": 4.1573093732085675e-06, + "loss": 0.0963, + "step": 3850 + }, + { + "epoch": 1.796641791044776, + "grad_norm": 1.0861244375436816, + "learning_rate": 4.151959745357941e-06, + "loss": 0.0923, + "step": 3852 + }, + { + "epoch": 1.7975746268656716, + "grad_norm": 0.9689295079280268, + "learning_rate": 4.146611116860767e-06, + "loss": 0.0833, + "step": 3854 + }, + { + "epoch": 1.7985074626865671, + "grad_norm": 1.1511239719980124, + "learning_rate": 4.1412634940200116e-06, + "loss": 0.0875, + "step": 3856 + }, + { + "epoch": 1.7994402985074627, + "grad_norm": 1.1621238931880757, + "learning_rate": 4.135916883137458e-06, + "loss": 0.0991, + "step": 3858 + }, + { + "epoch": 1.8003731343283582, + "grad_norm": 0.9613995980719443, + "learning_rate": 4.130571290513696e-06, + "loss": 0.0855, + "step": 3860 + }, + { + "epoch": 1.8013059701492538, + "grad_norm": 1.1518276454691472, + "learning_rate": 4.125226722448119e-06, + "loss": 0.0931, + "step": 3862 + }, + { + "epoch": 1.8022388059701493, + "grad_norm": 1.041908099670943, + "learning_rate": 4.119883185238905e-06, + "loss": 0.0871, + "step": 3864 + }, + { + "epoch": 1.8031716417910446, + "grad_norm": 1.0574759751416205, + "learning_rate": 4.114540685183026e-06, + "loss": 0.091, + "step": 3866 + }, + { + "epoch": 1.8041044776119404, + "grad_norm": 1.0966981760812202, + "learning_rate": 4.109199228576227e-06, + "loss": 0.0965, + "step": 3868 + }, + { + "epoch": 1.8050373134328357, + "grad_norm": 1.0356744160990077, + "learning_rate": 4.103858821713021e-06, + "loss": 0.092, + "step": 3870 + }, + { + "epoch": 1.8059701492537314, + "grad_norm": 1.1157359606890926, + "learning_rate": 4.0985194708866905e-06, + "loss": 0.1017, + "step": 3872 + }, + { + "epoch": 1.8069029850746268, + "grad_norm": 1.2367449459949518, + "learning_rate": 4.093181182389271e-06, + "loss": 0.0908, + "step": 3874 + }, + { + "epoch": 1.8078358208955225, + "grad_norm": 0.9933622219151041, + "learning_rate": 4.087843962511541e-06, + "loss": 0.0932, + "step": 3876 + }, + { + "epoch": 1.8087686567164178, + "grad_norm": 1.1402345985026843, + "learning_rate": 4.082507817543028e-06, + "loss": 0.1026, + "step": 3878 + }, + { + "epoch": 1.8097014925373134, + "grad_norm": 1.1988662208434808, + "learning_rate": 4.077172753771986e-06, + "loss": 0.102, + "step": 3880 + }, + { + "epoch": 1.810634328358209, + "grad_norm": 1.3120916733676826, + "learning_rate": 4.071838777485398e-06, + "loss": 0.1039, + "step": 3882 + }, + { + "epoch": 1.8115671641791045, + "grad_norm": 1.110205721427487, + "learning_rate": 4.066505894968969e-06, + "loss": 0.0972, + "step": 3884 + }, + { + "epoch": 1.8125, + "grad_norm": 1.1716763276015492, + "learning_rate": 4.061174112507106e-06, + "loss": 0.0928, + "step": 3886 + }, + { + "epoch": 1.8134328358208955, + "grad_norm": 1.3046999821806964, + "learning_rate": 4.05584343638293e-06, + "loss": 0.1009, + "step": 3888 + }, + { + "epoch": 1.814365671641791, + "grad_norm": 1.0396490607292204, + "learning_rate": 4.050513872878249e-06, + "loss": 0.0933, + "step": 3890 + }, + { + "epoch": 1.8152985074626866, + "grad_norm": 1.139036715278458, + "learning_rate": 4.045185428273563e-06, + "loss": 0.1071, + "step": 3892 + }, + { + "epoch": 1.8162313432835822, + "grad_norm": 1.1192745314307477, + "learning_rate": 4.03985810884806e-06, + "loss": 0.0942, + "step": 3894 + }, + { + "epoch": 1.8171641791044775, + "grad_norm": 1.0951331224480367, + "learning_rate": 4.034531920879591e-06, + "loss": 0.0869, + "step": 3896 + }, + { + "epoch": 1.8180970149253732, + "grad_norm": 1.015898359547634, + "learning_rate": 4.029206870644684e-06, + "loss": 0.0892, + "step": 3898 + }, + { + "epoch": 1.8190298507462686, + "grad_norm": 1.0193538642768085, + "learning_rate": 4.0238829644185175e-06, + "loss": 0.088, + "step": 3900 + }, + { + "epoch": 1.8199626865671643, + "grad_norm": 1.175930383539463, + "learning_rate": 4.018560208474927e-06, + "loss": 0.0958, + "step": 3902 + }, + { + "epoch": 1.8208955223880596, + "grad_norm": 0.9565088342567477, + "learning_rate": 4.013238609086393e-06, + "loss": 0.0787, + "step": 3904 + }, + { + "epoch": 1.8218283582089554, + "grad_norm": 1.2742170396051626, + "learning_rate": 4.007918172524031e-06, + "loss": 0.0915, + "step": 3906 + }, + { + "epoch": 1.8227611940298507, + "grad_norm": 1.1683685051254658, + "learning_rate": 4.002598905057584e-06, + "loss": 0.0899, + "step": 3908 + }, + { + "epoch": 1.8236940298507462, + "grad_norm": 1.2504392164854112, + "learning_rate": 3.997280812955423e-06, + "loss": 0.1017, + "step": 3910 + }, + { + "epoch": 1.8246268656716418, + "grad_norm": 1.2136286482853063, + "learning_rate": 3.991963902484527e-06, + "loss": 0.0958, + "step": 3912 + }, + { + "epoch": 1.8255597014925373, + "grad_norm": 1.1056367344685532, + "learning_rate": 3.986648179910491e-06, + "loss": 0.1015, + "step": 3914 + }, + { + "epoch": 1.8264925373134329, + "grad_norm": 1.0051688535853402, + "learning_rate": 3.981333651497502e-06, + "loss": 0.0941, + "step": 3916 + }, + { + "epoch": 1.8274253731343284, + "grad_norm": 1.0874480949477714, + "learning_rate": 3.976020323508345e-06, + "loss": 0.0866, + "step": 3918 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 1.0076486745719422, + "learning_rate": 3.97070820220439e-06, + "loss": 0.0947, + "step": 3920 + }, + { + "epoch": 1.8292910447761193, + "grad_norm": 1.2041628641455155, + "learning_rate": 3.96539729384558e-06, + "loss": 0.0983, + "step": 3922 + }, + { + "epoch": 1.830223880597015, + "grad_norm": 1.0824329943543542, + "learning_rate": 3.9600876046904326e-06, + "loss": 0.0903, + "step": 3924 + }, + { + "epoch": 1.8311567164179103, + "grad_norm": 1.0689057768272712, + "learning_rate": 3.954779140996032e-06, + "loss": 0.0905, + "step": 3926 + }, + { + "epoch": 1.832089552238806, + "grad_norm": 1.0168855559064294, + "learning_rate": 3.949471909018012e-06, + "loss": 0.0918, + "step": 3928 + }, + { + "epoch": 1.8330223880597014, + "grad_norm": 1.0242565992788855, + "learning_rate": 3.944165915010559e-06, + "loss": 0.0888, + "step": 3930 + }, + { + "epoch": 1.8339552238805972, + "grad_norm": 1.1466337824307269, + "learning_rate": 3.938861165226398e-06, + "loss": 0.0996, + "step": 3932 + }, + { + "epoch": 1.8348880597014925, + "grad_norm": 1.1709483615899532, + "learning_rate": 3.933557665916787e-06, + "loss": 0.095, + "step": 3934 + }, + { + "epoch": 1.835820895522388, + "grad_norm": 1.06104474397146, + "learning_rate": 3.928255423331516e-06, + "loss": 0.0961, + "step": 3936 + }, + { + "epoch": 1.8367537313432836, + "grad_norm": 1.173836946104865, + "learning_rate": 3.922954443718889e-06, + "loss": 0.0999, + "step": 3938 + }, + { + "epoch": 1.837686567164179, + "grad_norm": 1.046950695546202, + "learning_rate": 3.917654733325722e-06, + "loss": 0.0877, + "step": 3940 + }, + { + "epoch": 1.8386194029850746, + "grad_norm": 1.0207179115483678, + "learning_rate": 3.912356298397338e-06, + "loss": 0.0924, + "step": 3942 + }, + { + "epoch": 1.8395522388059702, + "grad_norm": 1.256927733061973, + "learning_rate": 3.907059145177551e-06, + "loss": 0.1014, + "step": 3944 + }, + { + "epoch": 1.8404850746268657, + "grad_norm": 1.111798269664183, + "learning_rate": 3.901763279908675e-06, + "loss": 0.0916, + "step": 3946 + }, + { + "epoch": 1.841417910447761, + "grad_norm": 1.0668214975294112, + "learning_rate": 3.896468708831497e-06, + "loss": 0.0807, + "step": 3948 + }, + { + "epoch": 1.8423507462686568, + "grad_norm": 1.2604215090545203, + "learning_rate": 3.891175438185281e-06, + "loss": 0.104, + "step": 3950 + }, + { + "epoch": 1.8432835820895521, + "grad_norm": 0.912519566826213, + "learning_rate": 3.885883474207763e-06, + "loss": 0.0787, + "step": 3952 + }, + { + "epoch": 1.8442164179104479, + "grad_norm": 1.0991981974832183, + "learning_rate": 3.880592823135129e-06, + "loss": 0.0977, + "step": 3954 + }, + { + "epoch": 1.8451492537313432, + "grad_norm": 1.0354663871727174, + "learning_rate": 3.875303491202033e-06, + "loss": 0.0959, + "step": 3956 + }, + { + "epoch": 1.846082089552239, + "grad_norm": 1.083104908415455, + "learning_rate": 3.8700154846415614e-06, + "loss": 0.0885, + "step": 3958 + }, + { + "epoch": 1.8470149253731343, + "grad_norm": 1.0915617448120623, + "learning_rate": 3.864728809685244e-06, + "loss": 0.0973, + "step": 3960 + }, + { + "epoch": 1.8479477611940298, + "grad_norm": 1.0462169645645247, + "learning_rate": 3.859443472563041e-06, + "loss": 0.095, + "step": 3962 + }, + { + "epoch": 1.8488805970149254, + "grad_norm": 1.1079536096696272, + "learning_rate": 3.854159479503335e-06, + "loss": 0.1003, + "step": 3964 + }, + { + "epoch": 1.849813432835821, + "grad_norm": 1.175094326106707, + "learning_rate": 3.848876836732926e-06, + "loss": 0.0926, + "step": 3966 + }, + { + "epoch": 1.8507462686567164, + "grad_norm": 0.9792823022046725, + "learning_rate": 3.843595550477023e-06, + "loss": 0.0928, + "step": 3968 + }, + { + "epoch": 1.851679104477612, + "grad_norm": 1.3560895712988963, + "learning_rate": 3.838315626959236e-06, + "loss": 0.1075, + "step": 3970 + }, + { + "epoch": 1.8526119402985075, + "grad_norm": 1.045236010976236, + "learning_rate": 3.83303707240157e-06, + "loss": 0.0908, + "step": 3972 + }, + { + "epoch": 1.8535447761194028, + "grad_norm": 1.1881488868066707, + "learning_rate": 3.827759893024412e-06, + "loss": 0.1055, + "step": 3974 + }, + { + "epoch": 1.8544776119402986, + "grad_norm": 1.0705731080187553, + "learning_rate": 3.822484095046533e-06, + "loss": 0.095, + "step": 3976 + }, + { + "epoch": 1.855410447761194, + "grad_norm": 1.1368155490767597, + "learning_rate": 3.817209684685079e-06, + "loss": 0.0891, + "step": 3978 + }, + { + "epoch": 1.8563432835820897, + "grad_norm": 1.010088803368523, + "learning_rate": 3.811936668155554e-06, + "loss": 0.0848, + "step": 3980 + }, + { + "epoch": 1.857276119402985, + "grad_norm": 1.1623239069133264, + "learning_rate": 3.8066650516718236e-06, + "loss": 0.1044, + "step": 3982 + }, + { + "epoch": 1.8582089552238807, + "grad_norm": 0.9709903194165235, + "learning_rate": 3.8013948414461017e-06, + "loss": 0.0927, + "step": 3984 + }, + { + "epoch": 1.859141791044776, + "grad_norm": 1.1693040269647939, + "learning_rate": 3.7961260436889454e-06, + "loss": 0.0899, + "step": 3986 + }, + { + "epoch": 1.8600746268656716, + "grad_norm": 1.1244432868402252, + "learning_rate": 3.790858664609249e-06, + "loss": 0.1002, + "step": 3988 + }, + { + "epoch": 1.8610074626865671, + "grad_norm": 1.112164749972976, + "learning_rate": 3.7855927104142354e-06, + "loss": 0.1083, + "step": 3990 + }, + { + "epoch": 1.8619402985074627, + "grad_norm": 0.9622138242433877, + "learning_rate": 3.7803281873094426e-06, + "loss": 0.0804, + "step": 3992 + }, + { + "epoch": 1.8628731343283582, + "grad_norm": 1.1146764918503846, + "learning_rate": 3.7750651014987283e-06, + "loss": 0.1018, + "step": 3994 + }, + { + "epoch": 1.8638059701492538, + "grad_norm": 1.1310688782322793, + "learning_rate": 3.7698034591842536e-06, + "loss": 0.1015, + "step": 3996 + }, + { + "epoch": 1.8647388059701493, + "grad_norm": 1.0677654354420405, + "learning_rate": 3.764543266566482e-06, + "loss": 0.0942, + "step": 3998 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 1.0633403351629696, + "learning_rate": 3.7592845298441626e-06, + "loss": 0.0986, + "step": 4000 + }, + { + "epoch": 1.8656716417910446, + "eval_loss": 0.1479332447052002, + "eval_runtime": 320.9933, + "eval_samples_per_second": 47.496, + "eval_steps_per_second": 5.938, + "step": 4000 + }, + { + "epoch": 1.8666044776119404, + "grad_norm": 1.2843284382909446, + "learning_rate": 3.7540272552143343e-06, + "loss": 0.1109, + "step": 4002 + }, + { + "epoch": 1.8675373134328357, + "grad_norm": 0.986376625934349, + "learning_rate": 3.7487714488723116e-06, + "loss": 0.0844, + "step": 4004 + }, + { + "epoch": 1.8684701492537314, + "grad_norm": 1.117605149767856, + "learning_rate": 3.743517117011676e-06, + "loss": 0.091, + "step": 4006 + }, + { + "epoch": 1.8694029850746268, + "grad_norm": 1.1111481927243665, + "learning_rate": 3.7382642658242716e-06, + "loss": 0.0998, + "step": 4008 + }, + { + "epoch": 1.8703358208955225, + "grad_norm": 1.0894489967394465, + "learning_rate": 3.7330129015002066e-06, + "loss": 0.0895, + "step": 4010 + }, + { + "epoch": 1.8712686567164178, + "grad_norm": 1.1092242785657853, + "learning_rate": 3.727763030227824e-06, + "loss": 0.0949, + "step": 4012 + }, + { + "epoch": 1.8722014925373134, + "grad_norm": 1.0182803071041748, + "learning_rate": 3.7225146581937155e-06, + "loss": 0.0843, + "step": 4014 + }, + { + "epoch": 1.873134328358209, + "grad_norm": 1.0810906313746105, + "learning_rate": 3.7172677915827037e-06, + "loss": 0.0852, + "step": 4016 + }, + { + "epoch": 1.8740671641791045, + "grad_norm": 1.306685098425301, + "learning_rate": 3.7120224365778356e-06, + "loss": 0.0991, + "step": 4018 + }, + { + "epoch": 1.875, + "grad_norm": 1.14440249356662, + "learning_rate": 3.7067785993603822e-06, + "loss": 0.097, + "step": 4020 + }, + { + "epoch": 1.8759328358208955, + "grad_norm": 1.0610042957590597, + "learning_rate": 3.7015362861098197e-06, + "loss": 0.0921, + "step": 4022 + }, + { + "epoch": 1.876865671641791, + "grad_norm": 1.1835383394388608, + "learning_rate": 3.6962955030038332e-06, + "loss": 0.1, + "step": 4024 + }, + { + "epoch": 1.8777985074626866, + "grad_norm": 1.0480652020524597, + "learning_rate": 3.6910562562183006e-06, + "loss": 0.0893, + "step": 4026 + }, + { + "epoch": 1.8787313432835822, + "grad_norm": 1.0398608770884399, + "learning_rate": 3.6858185519272906e-06, + "loss": 0.1003, + "step": 4028 + }, + { + "epoch": 1.8796641791044775, + "grad_norm": 1.0971184446881803, + "learning_rate": 3.680582396303056e-06, + "loss": 0.0898, + "step": 4030 + }, + { + "epoch": 1.8805970149253732, + "grad_norm": 1.114461790561332, + "learning_rate": 3.6753477955160244e-06, + "loss": 0.0971, + "step": 4032 + }, + { + "epoch": 1.8815298507462686, + "grad_norm": 1.1316670021453272, + "learning_rate": 3.6701147557347893e-06, + "loss": 0.1072, + "step": 4034 + }, + { + "epoch": 1.8824626865671643, + "grad_norm": 1.0613309947998661, + "learning_rate": 3.664883283126106e-06, + "loss": 0.0942, + "step": 4036 + }, + { + "epoch": 1.8833955223880596, + "grad_norm": 1.0959102353877053, + "learning_rate": 3.659653383854881e-06, + "loss": 0.0806, + "step": 4038 + }, + { + "epoch": 1.8843283582089554, + "grad_norm": 1.1569978972917845, + "learning_rate": 3.65442506408417e-06, + "loss": 0.0949, + "step": 4040 + }, + { + "epoch": 1.8852611940298507, + "grad_norm": 1.028704446421795, + "learning_rate": 3.6491983299751665e-06, + "loss": 0.0909, + "step": 4042 + }, + { + "epoch": 1.8861940298507462, + "grad_norm": 0.9567363078679719, + "learning_rate": 3.6439731876871928e-06, + "loss": 0.0797, + "step": 4044 + }, + { + "epoch": 1.8871268656716418, + "grad_norm": 0.9911284399542191, + "learning_rate": 3.638749643377697e-06, + "loss": 0.0924, + "step": 4046 + }, + { + "epoch": 1.8880597014925373, + "grad_norm": 1.054706019306054, + "learning_rate": 3.6335277032022446e-06, + "loss": 0.0916, + "step": 4048 + }, + { + "epoch": 1.8889925373134329, + "grad_norm": 1.0909782656433502, + "learning_rate": 3.62830737331451e-06, + "loss": 0.0922, + "step": 4050 + }, + { + "epoch": 1.8899253731343284, + "grad_norm": 1.101096307195021, + "learning_rate": 3.6230886598662717e-06, + "loss": 0.0967, + "step": 4052 + }, + { + "epoch": 1.890858208955224, + "grad_norm": 1.107710143260698, + "learning_rate": 3.6178715690074016e-06, + "loss": 0.0888, + "step": 4054 + }, + { + "epoch": 1.8917910447761193, + "grad_norm": 1.11409851460641, + "learning_rate": 3.6126561068858613e-06, + "loss": 0.1021, + "step": 4056 + }, + { + "epoch": 1.892723880597015, + "grad_norm": 1.0608903619283372, + "learning_rate": 3.607442279647689e-06, + "loss": 0.0942, + "step": 4058 + }, + { + "epoch": 1.8936567164179103, + "grad_norm": 1.1493025908294738, + "learning_rate": 3.6022300934369976e-06, + "loss": 0.092, + "step": 4060 + }, + { + "epoch": 1.894589552238806, + "grad_norm": 1.150870057694778, + "learning_rate": 3.597019554395973e-06, + "loss": 0.093, + "step": 4062 + }, + { + "epoch": 1.8955223880597014, + "grad_norm": 1.1491826921399277, + "learning_rate": 3.591810668664851e-06, + "loss": 0.0906, + "step": 4064 + }, + { + "epoch": 1.8964552238805972, + "grad_norm": 1.2290903828259188, + "learning_rate": 3.586603442381923e-06, + "loss": 0.1042, + "step": 4066 + }, + { + "epoch": 1.8973880597014925, + "grad_norm": 1.1276710304056294, + "learning_rate": 3.581397881683525e-06, + "loss": 0.0965, + "step": 4068 + }, + { + "epoch": 1.898320895522388, + "grad_norm": 1.0896855342468688, + "learning_rate": 3.576193992704029e-06, + "loss": 0.096, + "step": 4070 + }, + { + "epoch": 1.8992537313432836, + "grad_norm": 1.1574983867200623, + "learning_rate": 3.5709917815758388e-06, + "loss": 0.0984, + "step": 4072 + }, + { + "epoch": 1.900186567164179, + "grad_norm": 1.0613423439242609, + "learning_rate": 3.5657912544293805e-06, + "loss": 0.0898, + "step": 4074 + }, + { + "epoch": 1.9011194029850746, + "grad_norm": 0.9955766320361427, + "learning_rate": 3.5605924173930946e-06, + "loss": 0.0934, + "step": 4076 + }, + { + "epoch": 1.9020522388059702, + "grad_norm": 1.0428301262283546, + "learning_rate": 3.5553952765934293e-06, + "loss": 0.0921, + "step": 4078 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.9710617575807672, + "learning_rate": 3.5501998381548355e-06, + "loss": 0.0915, + "step": 4080 + }, + { + "epoch": 1.903917910447761, + "grad_norm": 1.0446733803222894, + "learning_rate": 3.5450061081997584e-06, + "loss": 0.0928, + "step": 4082 + }, + { + "epoch": 1.9048507462686568, + "grad_norm": 1.115389154911288, + "learning_rate": 3.539814092848629e-06, + "loss": 0.0992, + "step": 4084 + }, + { + "epoch": 1.9057835820895521, + "grad_norm": 1.2372692872616267, + "learning_rate": 3.5346237982198586e-06, + "loss": 0.1004, + "step": 4086 + }, + { + "epoch": 1.9067164179104479, + "grad_norm": 0.955157355604134, + "learning_rate": 3.5294352304298283e-06, + "loss": 0.0848, + "step": 4088 + }, + { + "epoch": 1.9076492537313432, + "grad_norm": 1.016343699030702, + "learning_rate": 3.5242483955928887e-06, + "loss": 0.0922, + "step": 4090 + }, + { + "epoch": 1.908582089552239, + "grad_norm": 1.1036563736652032, + "learning_rate": 3.51906329982134e-06, + "loss": 0.0867, + "step": 4092 + }, + { + "epoch": 1.9095149253731343, + "grad_norm": 1.0554652654580032, + "learning_rate": 3.5138799492254462e-06, + "loss": 0.0987, + "step": 4094 + }, + { + "epoch": 1.9104477611940298, + "grad_norm": 0.9648272954788495, + "learning_rate": 3.508698349913402e-06, + "loss": 0.0885, + "step": 4096 + }, + { + "epoch": 1.9113805970149254, + "grad_norm": 1.1979815887164187, + "learning_rate": 3.5035185079913435e-06, + "loss": 0.1009, + "step": 4098 + }, + { + "epoch": 1.912313432835821, + "grad_norm": 1.1883993129279289, + "learning_rate": 3.4983404295633384e-06, + "loss": 0.0932, + "step": 4100 + }, + { + "epoch": 1.9132462686567164, + "grad_norm": 1.1208800962615217, + "learning_rate": 3.4931641207313703e-06, + "loss": 0.0911, + "step": 4102 + }, + { + "epoch": 1.914179104477612, + "grad_norm": 1.0802055023930612, + "learning_rate": 3.487989587595344e-06, + "loss": 0.0986, + "step": 4104 + }, + { + "epoch": 1.9151119402985075, + "grad_norm": 0.9722483391976391, + "learning_rate": 3.4828168362530668e-06, + "loss": 0.0859, + "step": 4106 + }, + { + "epoch": 1.9160447761194028, + "grad_norm": 1.0594462306629138, + "learning_rate": 3.4776458728002495e-06, + "loss": 0.104, + "step": 4108 + }, + { + "epoch": 1.9169776119402986, + "grad_norm": 1.1985253301270509, + "learning_rate": 3.472476703330493e-06, + "loss": 0.0987, + "step": 4110 + }, + { + "epoch": 1.917910447761194, + "grad_norm": 1.200043841460018, + "learning_rate": 3.4673093339352837e-06, + "loss": 0.0989, + "step": 4112 + }, + { + "epoch": 1.9188432835820897, + "grad_norm": 1.0469389734961136, + "learning_rate": 3.462143770703994e-06, + "loss": 0.0832, + "step": 4114 + }, + { + "epoch": 1.919776119402985, + "grad_norm": 1.0074316960370335, + "learning_rate": 3.456980019723859e-06, + "loss": 0.0919, + "step": 4116 + }, + { + "epoch": 1.9207089552238807, + "grad_norm": 1.2522119103566154, + "learning_rate": 3.451818087079982e-06, + "loss": 0.097, + "step": 4118 + }, + { + "epoch": 1.921641791044776, + "grad_norm": 1.0120829091491326, + "learning_rate": 3.446657978855325e-06, + "loss": 0.0868, + "step": 4120 + }, + { + "epoch": 1.9225746268656716, + "grad_norm": 0.9249915836071871, + "learning_rate": 3.4414997011306977e-06, + "loss": 0.0785, + "step": 4122 + }, + { + "epoch": 1.9235074626865671, + "grad_norm": 1.0473723138567228, + "learning_rate": 3.4363432599847503e-06, + "loss": 0.0872, + "step": 4124 + }, + { + "epoch": 1.9244402985074627, + "grad_norm": 1.2071183639902356, + "learning_rate": 3.4311886614939753e-06, + "loss": 0.0944, + "step": 4126 + }, + { + "epoch": 1.9253731343283582, + "grad_norm": 1.194995477333287, + "learning_rate": 3.4260359117326914e-06, + "loss": 0.1082, + "step": 4128 + }, + { + "epoch": 1.9263059701492538, + "grad_norm": 1.0506765524204482, + "learning_rate": 3.4208850167730336e-06, + "loss": 0.0974, + "step": 4130 + }, + { + "epoch": 1.9272388059701493, + "grad_norm": 1.0985602926409648, + "learning_rate": 3.4157359826849575e-06, + "loss": 0.0886, + "step": 4132 + }, + { + "epoch": 1.9281716417910446, + "grad_norm": 1.1252582551299382, + "learning_rate": 3.410588815536221e-06, + "loss": 0.1009, + "step": 4134 + }, + { + "epoch": 1.9291044776119404, + "grad_norm": 1.0293911711875088, + "learning_rate": 3.4054435213923883e-06, + "loss": 0.0877, + "step": 4136 + }, + { + "epoch": 1.9300373134328357, + "grad_norm": 1.0450353348596726, + "learning_rate": 3.4003001063168094e-06, + "loss": 0.0933, + "step": 4138 + }, + { + "epoch": 1.9309701492537314, + "grad_norm": 1.1653632467428734, + "learning_rate": 3.3951585763706246e-06, + "loss": 0.1013, + "step": 4140 + }, + { + "epoch": 1.9319029850746268, + "grad_norm": 0.9559735541014069, + "learning_rate": 3.3900189376127514e-06, + "loss": 0.0759, + "step": 4142 + }, + { + "epoch": 1.9328358208955225, + "grad_norm": 1.131094568676856, + "learning_rate": 3.384881196099874e-06, + "loss": 0.0922, + "step": 4144 + }, + { + "epoch": 1.9337686567164178, + "grad_norm": 1.0411053229375848, + "learning_rate": 3.3797453578864527e-06, + "loss": 0.0887, + "step": 4146 + }, + { + "epoch": 1.9347014925373134, + "grad_norm": 0.8612233825344169, + "learning_rate": 3.374611429024691e-06, + "loss": 0.0812, + "step": 4148 + }, + { + "epoch": 1.935634328358209, + "grad_norm": 1.0897526188813582, + "learning_rate": 3.3694794155645526e-06, + "loss": 0.085, + "step": 4150 + }, + { + "epoch": 1.9365671641791045, + "grad_norm": 1.0118318142102836, + "learning_rate": 3.3643493235537376e-06, + "loss": 0.0852, + "step": 4152 + }, + { + "epoch": 1.9375, + "grad_norm": 1.0838281994563612, + "learning_rate": 3.3592211590376855e-06, + "loss": 0.0917, + "step": 4154 + }, + { + "epoch": 1.9384328358208955, + "grad_norm": 1.0714800298171627, + "learning_rate": 3.3540949280595642e-06, + "loss": 0.0851, + "step": 4156 + }, + { + "epoch": 1.939365671641791, + "grad_norm": 1.1281338058664754, + "learning_rate": 3.3489706366602616e-06, + "loss": 0.0871, + "step": 4158 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 1.0997136060702164, + "learning_rate": 3.3438482908783813e-06, + "loss": 0.0869, + "step": 4160 + }, + { + "epoch": 1.9412313432835822, + "grad_norm": 1.1232739089985604, + "learning_rate": 3.338727896750232e-06, + "loss": 0.0987, + "step": 4162 + }, + { + "epoch": 1.9421641791044775, + "grad_norm": 1.1326358819834272, + "learning_rate": 3.3336094603098245e-06, + "loss": 0.1001, + "step": 4164 + }, + { + "epoch": 1.9430970149253732, + "grad_norm": 1.1810449957725686, + "learning_rate": 3.3284929875888603e-06, + "loss": 0.0953, + "step": 4166 + }, + { + "epoch": 1.9440298507462686, + "grad_norm": 1.2311762096066052, + "learning_rate": 3.3233784846167316e-06, + "loss": 0.0925, + "step": 4168 + }, + { + "epoch": 1.9449626865671643, + "grad_norm": 1.367796457436548, + "learning_rate": 3.3182659574205046e-06, + "loss": 0.09, + "step": 4170 + }, + { + "epoch": 1.9458955223880596, + "grad_norm": 1.1648194707236783, + "learning_rate": 3.3131554120249192e-06, + "loss": 0.0976, + "step": 4172 + }, + { + "epoch": 1.9468283582089554, + "grad_norm": 1.083277531550915, + "learning_rate": 3.3080468544523815e-06, + "loss": 0.0785, + "step": 4174 + }, + { + "epoch": 1.9477611940298507, + "grad_norm": 1.161086879050719, + "learning_rate": 3.302940290722947e-06, + "loss": 0.089, + "step": 4176 + }, + { + "epoch": 1.9486940298507462, + "grad_norm": 1.0555247408080717, + "learning_rate": 3.297835726854334e-06, + "loss": 0.0845, + "step": 4178 + }, + { + "epoch": 1.9496268656716418, + "grad_norm": 1.066838921984526, + "learning_rate": 3.292733168861898e-06, + "loss": 0.0851, + "step": 4180 + }, + { + "epoch": 1.9505597014925373, + "grad_norm": 1.0663973156900242, + "learning_rate": 3.287632622758627e-06, + "loss": 0.0901, + "step": 4182 + }, + { + "epoch": 1.9514925373134329, + "grad_norm": 1.1245648284576333, + "learning_rate": 3.282534094555143e-06, + "loss": 0.0884, + "step": 4184 + }, + { + "epoch": 1.9524253731343284, + "grad_norm": 1.0210392896418412, + "learning_rate": 3.277437590259689e-06, + "loss": 0.0881, + "step": 4186 + }, + { + "epoch": 1.953358208955224, + "grad_norm": 1.1678333482548227, + "learning_rate": 3.2723431158781227e-06, + "loss": 0.0788, + "step": 4188 + }, + { + "epoch": 1.9542910447761193, + "grad_norm": 1.1816153238571083, + "learning_rate": 3.267250677413911e-06, + "loss": 0.1061, + "step": 4190 + }, + { + "epoch": 1.955223880597015, + "grad_norm": 0.993285255139248, + "learning_rate": 3.2621602808681196e-06, + "loss": 0.0913, + "step": 4192 + }, + { + "epoch": 1.9561567164179103, + "grad_norm": 1.1651125851263444, + "learning_rate": 3.2570719322394083e-06, + "loss": 0.1102, + "step": 4194 + }, + { + "epoch": 1.957089552238806, + "grad_norm": 1.0280286979928808, + "learning_rate": 3.251985637524021e-06, + "loss": 0.0893, + "step": 4196 + }, + { + "epoch": 1.9580223880597014, + "grad_norm": 1.0394304503897773, + "learning_rate": 3.246901402715792e-06, + "loss": 0.0909, + "step": 4198 + }, + { + "epoch": 1.9589552238805972, + "grad_norm": 0.9390550085205801, + "learning_rate": 3.241819233806114e-06, + "loss": 0.09, + "step": 4200 + }, + { + "epoch": 1.9598880597014925, + "grad_norm": 1.71556465795359, + "learning_rate": 3.236739136783953e-06, + "loss": 0.0998, + "step": 4202 + }, + { + "epoch": 1.960820895522388, + "grad_norm": 1.0906502471270598, + "learning_rate": 3.231661117635833e-06, + "loss": 0.0825, + "step": 4204 + }, + { + "epoch": 1.9617537313432836, + "grad_norm": 1.2003142159748783, + "learning_rate": 3.2265851823458296e-06, + "loss": 0.0877, + "step": 4206 + }, + { + "epoch": 1.962686567164179, + "grad_norm": 1.1831936079852015, + "learning_rate": 3.2215113368955553e-06, + "loss": 0.0839, + "step": 4208 + }, + { + "epoch": 1.9636194029850746, + "grad_norm": 1.1415283450250866, + "learning_rate": 3.216439587264173e-06, + "loss": 0.0905, + "step": 4210 + }, + { + "epoch": 1.9645522388059702, + "grad_norm": 1.1583312350572683, + "learning_rate": 3.2113699394283676e-06, + "loss": 0.0831, + "step": 4212 + }, + { + "epoch": 1.9654850746268657, + "grad_norm": 1.1417172475658672, + "learning_rate": 3.2063023993623467e-06, + "loss": 0.09, + "step": 4214 + }, + { + "epoch": 1.966417910447761, + "grad_norm": 1.1579221054987296, + "learning_rate": 3.201236973037836e-06, + "loss": 0.0955, + "step": 4216 + }, + { + "epoch": 1.9673507462686568, + "grad_norm": 1.1487602747709493, + "learning_rate": 3.1961736664240696e-06, + "loss": 0.0936, + "step": 4218 + }, + { + "epoch": 1.9682835820895521, + "grad_norm": 1.0966829425924962, + "learning_rate": 3.191112485487786e-06, + "loss": 0.0857, + "step": 4220 + }, + { + "epoch": 1.9692164179104479, + "grad_norm": 1.044076184991009, + "learning_rate": 3.1860534361932166e-06, + "loss": 0.0923, + "step": 4222 + }, + { + "epoch": 1.9701492537313432, + "grad_norm": 1.1742499968427176, + "learning_rate": 3.180996524502081e-06, + "loss": 0.0909, + "step": 4224 + }, + { + "epoch": 1.971082089552239, + "grad_norm": 1.115165124779012, + "learning_rate": 3.1759417563735807e-06, + "loss": 0.099, + "step": 4226 + }, + { + "epoch": 1.9720149253731343, + "grad_norm": 1.1591610644243253, + "learning_rate": 3.170889137764387e-06, + "loss": 0.0862, + "step": 4228 + }, + { + "epoch": 1.9729477611940298, + "grad_norm": 1.1859870681815352, + "learning_rate": 3.165838674628647e-06, + "loss": 0.0898, + "step": 4230 + }, + { + "epoch": 1.9738805970149254, + "grad_norm": 1.1611480042631972, + "learning_rate": 3.160790372917958e-06, + "loss": 0.1001, + "step": 4232 + }, + { + "epoch": 1.974813432835821, + "grad_norm": 1.1998716109462533, + "learning_rate": 3.155744238581377e-06, + "loss": 0.0926, + "step": 4234 + }, + { + "epoch": 1.9757462686567164, + "grad_norm": 1.0956793067934576, + "learning_rate": 3.1507002775654028e-06, + "loss": 0.0725, + "step": 4236 + }, + { + "epoch": 1.976679104477612, + "grad_norm": 1.0270837121270664, + "learning_rate": 3.1456584958139746e-06, + "loss": 0.0984, + "step": 4238 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 1.1032505586198182, + "learning_rate": 3.140618899268466e-06, + "loss": 0.0875, + "step": 4240 + }, + { + "epoch": 1.9785447761194028, + "grad_norm": 1.1000888042768668, + "learning_rate": 3.135581493867672e-06, + "loss": 0.0972, + "step": 4242 + }, + { + "epoch": 1.9794776119402986, + "grad_norm": 1.1081542114323382, + "learning_rate": 3.1305462855478076e-06, + "loss": 0.0938, + "step": 4244 + }, + { + "epoch": 1.980410447761194, + "grad_norm": 0.9755341035686999, + "learning_rate": 3.125513280242495e-06, + "loss": 0.0845, + "step": 4246 + }, + { + "epoch": 1.9813432835820897, + "grad_norm": 1.0705233148737094, + "learning_rate": 3.1204824838827643e-06, + "loss": 0.088, + "step": 4248 + }, + { + "epoch": 1.982276119402985, + "grad_norm": 1.1021245765758034, + "learning_rate": 3.115453902397041e-06, + "loss": 0.0885, + "step": 4250 + }, + { + "epoch": 1.9832089552238807, + "grad_norm": 1.0992848410651421, + "learning_rate": 3.1104275417111424e-06, + "loss": 0.0991, + "step": 4252 + }, + { + "epoch": 1.984141791044776, + "grad_norm": 1.0808414107147213, + "learning_rate": 3.1054034077482665e-06, + "loss": 0.0913, + "step": 4254 + }, + { + "epoch": 1.9850746268656716, + "grad_norm": 0.9997517252230752, + "learning_rate": 3.1003815064289866e-06, + "loss": 0.0994, + "step": 4256 + }, + { + "epoch": 1.9860074626865671, + "grad_norm": 1.1732116264185022, + "learning_rate": 3.0953618436712497e-06, + "loss": 0.091, + "step": 4258 + }, + { + "epoch": 1.9869402985074627, + "grad_norm": 1.0301903400189767, + "learning_rate": 3.090344425390355e-06, + "loss": 0.0781, + "step": 4260 + }, + { + "epoch": 1.9878731343283582, + "grad_norm": 1.109281160627923, + "learning_rate": 3.0853292574989702e-06, + "loss": 0.0788, + "step": 4262 + }, + { + "epoch": 1.9888059701492538, + "grad_norm": 1.0738749164576116, + "learning_rate": 3.080316345907102e-06, + "loss": 0.0941, + "step": 4264 + }, + { + "epoch": 1.9897388059701493, + "grad_norm": 1.0130124275734014, + "learning_rate": 3.0753056965220975e-06, + "loss": 0.085, + "step": 4266 + }, + { + "epoch": 1.9906716417910446, + "grad_norm": 1.1281242101339484, + "learning_rate": 3.0702973152486437e-06, + "loss": 0.0807, + "step": 4268 + }, + { + "epoch": 1.9916044776119404, + "grad_norm": 1.0620107524527567, + "learning_rate": 3.065291207988749e-06, + "loss": 0.079, + "step": 4270 + }, + { + "epoch": 1.9925373134328357, + "grad_norm": 1.1022741961619187, + "learning_rate": 3.0602873806417483e-06, + "loss": 0.0898, + "step": 4272 + }, + { + "epoch": 1.9934701492537314, + "grad_norm": 1.056330071950859, + "learning_rate": 3.0552858391042843e-06, + "loss": 0.0836, + "step": 4274 + }, + { + "epoch": 1.9944029850746268, + "grad_norm": 1.0060430451565396, + "learning_rate": 3.050286589270309e-06, + "loss": 0.098, + "step": 4276 + }, + { + "epoch": 1.9953358208955225, + "grad_norm": 1.169081076025697, + "learning_rate": 3.0452896370310737e-06, + "loss": 0.0933, + "step": 4278 + }, + { + "epoch": 1.9962686567164178, + "grad_norm": 1.088215574341822, + "learning_rate": 3.0402949882751167e-06, + "loss": 0.085, + "step": 4280 + }, + { + "epoch": 1.9972014925373134, + "grad_norm": 1.0244345964501775, + "learning_rate": 3.035302648888273e-06, + "loss": 0.0822, + "step": 4282 + }, + { + "epoch": 1.998134328358209, + "grad_norm": 1.059627502264395, + "learning_rate": 3.030312624753645e-06, + "loss": 0.0901, + "step": 4284 + }, + { + "epoch": 1.9990671641791045, + "grad_norm": 1.0628016884605678, + "learning_rate": 3.025324921751614e-06, + "loss": 0.0932, + "step": 4286 + }, + { + "epoch": 2.0, + "grad_norm": 0.9871489671834679, + "learning_rate": 3.0203395457598215e-06, + "loss": 0.0849, + "step": 4288 + }, + { + "epoch": 2.0009328358208953, + "grad_norm": 0.6931360568996546, + "learning_rate": 3.0153565026531708e-06, + "loss": 0.0431, + "step": 4290 + }, + { + "epoch": 2.001865671641791, + "grad_norm": 0.7377454433761019, + "learning_rate": 3.0103757983038105e-06, + "loss": 0.0471, + "step": 4292 + }, + { + "epoch": 2.0027985074626864, + "grad_norm": 0.7304073103295363, + "learning_rate": 3.0053974385811403e-06, + "loss": 0.0399, + "step": 4294 + }, + { + "epoch": 2.003731343283582, + "grad_norm": 0.7003735271050772, + "learning_rate": 3.0004214293517925e-06, + "loss": 0.0416, + "step": 4296 + }, + { + "epoch": 2.0046641791044775, + "grad_norm": 0.6983732862250597, + "learning_rate": 2.9954477764796284e-06, + "loss": 0.0358, + "step": 4298 + }, + { + "epoch": 2.0055970149253732, + "grad_norm": 0.8109537118564591, + "learning_rate": 2.990476485825736e-06, + "loss": 0.0406, + "step": 4300 + }, + { + "epoch": 2.0065298507462686, + "grad_norm": 0.8516665775571368, + "learning_rate": 2.9855075632484166e-06, + "loss": 0.0401, + "step": 4302 + }, + { + "epoch": 2.0074626865671643, + "grad_norm": 0.9435027292265361, + "learning_rate": 2.980541014603183e-06, + "loss": 0.0385, + "step": 4304 + }, + { + "epoch": 2.0083955223880596, + "grad_norm": 0.7255463921775398, + "learning_rate": 2.9755768457427514e-06, + "loss": 0.0361, + "step": 4306 + }, + { + "epoch": 2.0093283582089554, + "grad_norm": 0.8805614327917599, + "learning_rate": 2.9706150625170295e-06, + "loss": 0.0291, + "step": 4308 + }, + { + "epoch": 2.0102611940298507, + "grad_norm": 1.0058807213756584, + "learning_rate": 2.9656556707731176e-06, + "loss": 0.0345, + "step": 4310 + }, + { + "epoch": 2.0111940298507465, + "grad_norm": 1.1415558500838885, + "learning_rate": 2.9606986763552936e-06, + "loss": 0.0428, + "step": 4312 + }, + { + "epoch": 2.012126865671642, + "grad_norm": 1.0303970432207215, + "learning_rate": 2.955744085105017e-06, + "loss": 0.0379, + "step": 4314 + }, + { + "epoch": 2.013059701492537, + "grad_norm": 0.9995586908068365, + "learning_rate": 2.95079190286091e-06, + "loss": 0.0402, + "step": 4316 + }, + { + "epoch": 2.013992537313433, + "grad_norm": 0.922271067089082, + "learning_rate": 2.9458421354587567e-06, + "loss": 0.0334, + "step": 4318 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.8676276239826717, + "learning_rate": 2.9408947887314966e-06, + "loss": 0.0323, + "step": 4320 + }, + { + "epoch": 2.015858208955224, + "grad_norm": 0.9690871345533748, + "learning_rate": 2.9359498685092156e-06, + "loss": 0.041, + "step": 4322 + }, + { + "epoch": 2.0167910447761193, + "grad_norm": 0.911334213441479, + "learning_rate": 2.931007380619141e-06, + "loss": 0.0329, + "step": 4324 + }, + { + "epoch": 2.017723880597015, + "grad_norm": 0.8072976599069374, + "learning_rate": 2.9260673308856345e-06, + "loss": 0.0337, + "step": 4326 + }, + { + "epoch": 2.0186567164179103, + "grad_norm": 0.7563023773058475, + "learning_rate": 2.921129725130183e-06, + "loss": 0.027, + "step": 4328 + }, + { + "epoch": 2.019589552238806, + "grad_norm": 0.8423642891811632, + "learning_rate": 2.9161945691713944e-06, + "loss": 0.0368, + "step": 4330 + }, + { + "epoch": 2.0205223880597014, + "grad_norm": 0.9040389842217097, + "learning_rate": 2.9112618688249874e-06, + "loss": 0.0338, + "step": 4332 + }, + { + "epoch": 2.021455223880597, + "grad_norm": 0.9899962998214459, + "learning_rate": 2.9063316299037904e-06, + "loss": 0.0386, + "step": 4334 + }, + { + "epoch": 2.0223880597014925, + "grad_norm": 0.9324323804527187, + "learning_rate": 2.90140385821773e-06, + "loss": 0.0317, + "step": 4336 + }, + { + "epoch": 2.0233208955223883, + "grad_norm": 0.9223844323445922, + "learning_rate": 2.8964785595738254e-06, + "loss": 0.0373, + "step": 4338 + }, + { + "epoch": 2.0242537313432836, + "grad_norm": 0.9481764801025405, + "learning_rate": 2.8915557397761774e-06, + "loss": 0.037, + "step": 4340 + }, + { + "epoch": 2.025186567164179, + "grad_norm": 0.9152698440272568, + "learning_rate": 2.8866354046259736e-06, + "loss": 0.0361, + "step": 4342 + }, + { + "epoch": 2.0261194029850746, + "grad_norm": 0.7787048213131694, + "learning_rate": 2.8817175599214653e-06, + "loss": 0.0337, + "step": 4344 + }, + { + "epoch": 2.02705223880597, + "grad_norm": 1.0474553599024774, + "learning_rate": 2.8768022114579757e-06, + "loss": 0.0352, + "step": 4346 + }, + { + "epoch": 2.0279850746268657, + "grad_norm": 1.1321614377422469, + "learning_rate": 2.871889365027885e-06, + "loss": 0.0415, + "step": 4348 + }, + { + "epoch": 2.028917910447761, + "grad_norm": 0.7472035276071027, + "learning_rate": 2.86697902642062e-06, + "loss": 0.0315, + "step": 4350 + }, + { + "epoch": 2.029850746268657, + "grad_norm": 0.9410204403680684, + "learning_rate": 2.8620712014226594e-06, + "loss": 0.0358, + "step": 4352 + }, + { + "epoch": 2.030783582089552, + "grad_norm": 0.9404385467072122, + "learning_rate": 2.8571658958175126e-06, + "loss": 0.0346, + "step": 4354 + }, + { + "epoch": 2.031716417910448, + "grad_norm": 0.9364766899127865, + "learning_rate": 2.852263115385725e-06, + "loss": 0.0334, + "step": 4356 + }, + { + "epoch": 2.032649253731343, + "grad_norm": 0.9143157879297902, + "learning_rate": 2.847362865904868e-06, + "loss": 0.0345, + "step": 4358 + }, + { + "epoch": 2.033582089552239, + "grad_norm": 0.9643445484521032, + "learning_rate": 2.842465153149525e-06, + "loss": 0.0359, + "step": 4360 + }, + { + "epoch": 2.0345149253731343, + "grad_norm": 0.9383657334350942, + "learning_rate": 2.8375699828912895e-06, + "loss": 0.0343, + "step": 4362 + }, + { + "epoch": 2.03544776119403, + "grad_norm": 1.0326063733701467, + "learning_rate": 2.832677360898768e-06, + "loss": 0.0365, + "step": 4364 + }, + { + "epoch": 2.0363805970149254, + "grad_norm": 1.0008038885283892, + "learning_rate": 2.8277872929375515e-06, + "loss": 0.0332, + "step": 4366 + }, + { + "epoch": 2.0373134328358207, + "grad_norm": 1.0490424912019125, + "learning_rate": 2.822899784770232e-06, + "loss": 0.0349, + "step": 4368 + }, + { + "epoch": 2.0382462686567164, + "grad_norm": 0.8746090013949105, + "learning_rate": 2.8180148421563803e-06, + "loss": 0.0314, + "step": 4370 + }, + { + "epoch": 2.0391791044776117, + "grad_norm": 0.9742008039517758, + "learning_rate": 2.813132470852543e-06, + "loss": 0.0359, + "step": 4372 + }, + { + "epoch": 2.0401119402985075, + "grad_norm": 0.8831437330245768, + "learning_rate": 2.8082526766122377e-06, + "loss": 0.0349, + "step": 4374 + }, + { + "epoch": 2.041044776119403, + "grad_norm": 0.8352574338089715, + "learning_rate": 2.803375465185944e-06, + "loss": 0.0377, + "step": 4376 + }, + { + "epoch": 2.0419776119402986, + "grad_norm": 0.9386137569142765, + "learning_rate": 2.7985008423211037e-06, + "loss": 0.0362, + "step": 4378 + }, + { + "epoch": 2.042910447761194, + "grad_norm": 0.722804351685375, + "learning_rate": 2.7936288137620976e-06, + "loss": 0.0319, + "step": 4380 + }, + { + "epoch": 2.0438432835820897, + "grad_norm": 0.859401137020611, + "learning_rate": 2.7887593852502604e-06, + "loss": 0.0331, + "step": 4382 + }, + { + "epoch": 2.044776119402985, + "grad_norm": 0.8811036462247247, + "learning_rate": 2.783892562523854e-06, + "loss": 0.0358, + "step": 4384 + }, + { + "epoch": 2.0457089552238807, + "grad_norm": 0.9824317430351688, + "learning_rate": 2.7790283513180736e-06, + "loss": 0.0377, + "step": 4386 + }, + { + "epoch": 2.046641791044776, + "grad_norm": 0.9387833059761242, + "learning_rate": 2.774166757365041e-06, + "loss": 0.0351, + "step": 4388 + }, + { + "epoch": 2.047574626865672, + "grad_norm": 0.9062659882647344, + "learning_rate": 2.769307786393785e-06, + "loss": 0.0366, + "step": 4390 + }, + { + "epoch": 2.048507462686567, + "grad_norm": 1.0864150502672876, + "learning_rate": 2.7644514441302466e-06, + "loss": 0.0372, + "step": 4392 + }, + { + "epoch": 2.049440298507463, + "grad_norm": 1.3917399137418618, + "learning_rate": 2.7595977362972747e-06, + "loss": 0.0355, + "step": 4394 + }, + { + "epoch": 2.050373134328358, + "grad_norm": 0.9938119244375839, + "learning_rate": 2.754746668614604e-06, + "loss": 0.0358, + "step": 4396 + }, + { + "epoch": 2.0513059701492535, + "grad_norm": 0.7966865598676414, + "learning_rate": 2.7498982467988668e-06, + "loss": 0.0317, + "step": 4398 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.9056389639839669, + "learning_rate": 2.745052476563574e-06, + "loss": 0.0332, + "step": 4400 + }, + { + "epoch": 2.0531716417910446, + "grad_norm": 0.8418617565436706, + "learning_rate": 2.7402093636191085e-06, + "loss": 0.0353, + "step": 4402 + }, + { + "epoch": 2.0541044776119404, + "grad_norm": 0.8412614340932287, + "learning_rate": 2.735368913672729e-06, + "loss": 0.0368, + "step": 4404 + }, + { + "epoch": 2.0550373134328357, + "grad_norm": 0.9346669550850343, + "learning_rate": 2.7305311324285506e-06, + "loss": 0.0307, + "step": 4406 + }, + { + "epoch": 2.0559701492537314, + "grad_norm": 1.0988111290308489, + "learning_rate": 2.7256960255875396e-06, + "loss": 0.0322, + "step": 4408 + }, + { + "epoch": 2.0569029850746268, + "grad_norm": 0.844308544081491, + "learning_rate": 2.720863598847524e-06, + "loss": 0.0315, + "step": 4410 + }, + { + "epoch": 2.0578358208955225, + "grad_norm": 0.9981226105492794, + "learning_rate": 2.7160338579031627e-06, + "loss": 0.0307, + "step": 4412 + }, + { + "epoch": 2.058768656716418, + "grad_norm": 0.9761698042116492, + "learning_rate": 2.711206808445949e-06, + "loss": 0.036, + "step": 4414 + }, + { + "epoch": 2.0597014925373136, + "grad_norm": 0.8809001352579211, + "learning_rate": 2.7063824561642134e-06, + "loss": 0.035, + "step": 4416 + }, + { + "epoch": 2.060634328358209, + "grad_norm": 1.0174452558911835, + "learning_rate": 2.7015608067430965e-06, + "loss": 0.0368, + "step": 4418 + }, + { + "epoch": 2.0615671641791047, + "grad_norm": 0.839889447971411, + "learning_rate": 2.696741865864564e-06, + "loss": 0.0331, + "step": 4420 + }, + { + "epoch": 2.0625, + "grad_norm": 1.002150312971576, + "learning_rate": 2.691925639207385e-06, + "loss": 0.0289, + "step": 4422 + }, + { + "epoch": 2.0634328358208953, + "grad_norm": 0.9063546968160399, + "learning_rate": 2.6871121324471305e-06, + "loss": 0.0318, + "step": 4424 + }, + { + "epoch": 2.064365671641791, + "grad_norm": 1.11419944914278, + "learning_rate": 2.682301351256163e-06, + "loss": 0.035, + "step": 4426 + }, + { + "epoch": 2.0652985074626864, + "grad_norm": 1.1294151793692213, + "learning_rate": 2.6774933013036396e-06, + "loss": 0.0379, + "step": 4428 + }, + { + "epoch": 2.066231343283582, + "grad_norm": 1.002134123185201, + "learning_rate": 2.6726879882554968e-06, + "loss": 0.0345, + "step": 4430 + }, + { + "epoch": 2.0671641791044775, + "grad_norm": 0.9959881270577299, + "learning_rate": 2.6678854177744416e-06, + "loss": 0.0296, + "step": 4432 + }, + { + "epoch": 2.0680970149253732, + "grad_norm": 0.9493261354779687, + "learning_rate": 2.6630855955199566e-06, + "loss": 0.0278, + "step": 4434 + }, + { + "epoch": 2.0690298507462686, + "grad_norm": 0.87872995705695, + "learning_rate": 2.6582885271482757e-06, + "loss": 0.0332, + "step": 4436 + }, + { + "epoch": 2.0699626865671643, + "grad_norm": 0.9175728195839566, + "learning_rate": 2.653494218312397e-06, + "loss": 0.0319, + "step": 4438 + }, + { + "epoch": 2.0708955223880596, + "grad_norm": 0.8619144692708061, + "learning_rate": 2.6487026746620637e-06, + "loss": 0.0316, + "step": 4440 + }, + { + "epoch": 2.0718283582089554, + "grad_norm": 1.000841756088507, + "learning_rate": 2.643913901843759e-06, + "loss": 0.0387, + "step": 4442 + }, + { + "epoch": 2.0727611940298507, + "grad_norm": 0.9160370473851499, + "learning_rate": 2.639127905500699e-06, + "loss": 0.0335, + "step": 4444 + }, + { + "epoch": 2.0736940298507465, + "grad_norm": 0.8524799209467483, + "learning_rate": 2.6343446912728348e-06, + "loss": 0.0284, + "step": 4446 + }, + { + "epoch": 2.074626865671642, + "grad_norm": 0.9454922612542884, + "learning_rate": 2.6295642647968307e-06, + "loss": 0.0336, + "step": 4448 + }, + { + "epoch": 2.075559701492537, + "grad_norm": 0.9648230986119682, + "learning_rate": 2.624786631706071e-06, + "loss": 0.0359, + "step": 4450 + }, + { + "epoch": 2.076492537313433, + "grad_norm": 0.9570495850111927, + "learning_rate": 2.6200117976306506e-06, + "loss": 0.0343, + "step": 4452 + }, + { + "epoch": 2.077425373134328, + "grad_norm": 0.863764434660277, + "learning_rate": 2.615239768197357e-06, + "loss": 0.0376, + "step": 4454 + }, + { + "epoch": 2.078358208955224, + "grad_norm": 0.9767886015359025, + "learning_rate": 2.610470549029684e-06, + "loss": 0.0365, + "step": 4456 + }, + { + "epoch": 2.0792910447761193, + "grad_norm": 0.9108636703213165, + "learning_rate": 2.605704145747804e-06, + "loss": 0.0301, + "step": 4458 + }, + { + "epoch": 2.080223880597015, + "grad_norm": 0.8516455251630574, + "learning_rate": 2.600940563968571e-06, + "loss": 0.034, + "step": 4460 + }, + { + "epoch": 2.0811567164179103, + "grad_norm": 1.0982161468443037, + "learning_rate": 2.596179809305526e-06, + "loss": 0.0377, + "step": 4462 + }, + { + "epoch": 2.082089552238806, + "grad_norm": 0.9093692567373183, + "learning_rate": 2.5914218873688678e-06, + "loss": 0.037, + "step": 4464 + }, + { + "epoch": 2.0830223880597014, + "grad_norm": 1.0092770381855278, + "learning_rate": 2.5866668037654557e-06, + "loss": 0.0385, + "step": 4466 + }, + { + "epoch": 2.083955223880597, + "grad_norm": 0.9622699089394329, + "learning_rate": 2.581914564098813e-06, + "loss": 0.0325, + "step": 4468 + }, + { + "epoch": 2.0848880597014925, + "grad_norm": 1.0438505108352771, + "learning_rate": 2.577165173969103e-06, + "loss": 0.031, + "step": 4470 + }, + { + "epoch": 2.0858208955223883, + "grad_norm": 0.9997481642261107, + "learning_rate": 2.5724186389731364e-06, + "loss": 0.0366, + "step": 4472 + }, + { + "epoch": 2.0867537313432836, + "grad_norm": 1.0884582418232287, + "learning_rate": 2.5676749647043602e-06, + "loss": 0.0355, + "step": 4474 + }, + { + "epoch": 2.0876865671641793, + "grad_norm": 0.9273682507257243, + "learning_rate": 2.5629341567528453e-06, + "loss": 0.0372, + "step": 4476 + }, + { + "epoch": 2.0886194029850746, + "grad_norm": 0.8771097952377875, + "learning_rate": 2.5581962207052856e-06, + "loss": 0.0316, + "step": 4478 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.8606340193338733, + "learning_rate": 2.553461162144994e-06, + "loss": 0.0315, + "step": 4480 + }, + { + "epoch": 2.0904850746268657, + "grad_norm": 0.945052662282763, + "learning_rate": 2.5487289866518937e-06, + "loss": 0.0402, + "step": 4482 + }, + { + "epoch": 2.091417910447761, + "grad_norm": 0.937247860729769, + "learning_rate": 2.543999699802503e-06, + "loss": 0.0336, + "step": 4484 + }, + { + "epoch": 2.092350746268657, + "grad_norm": 0.8243058279419139, + "learning_rate": 2.5392733071699443e-06, + "loss": 0.0306, + "step": 4486 + }, + { + "epoch": 2.093283582089552, + "grad_norm": 1.001169897669878, + "learning_rate": 2.5345498143239233e-06, + "loss": 0.0356, + "step": 4488 + }, + { + "epoch": 2.094216417910448, + "grad_norm": 0.8378463772807123, + "learning_rate": 2.5298292268307333e-06, + "loss": 0.0364, + "step": 4490 + }, + { + "epoch": 2.095149253731343, + "grad_norm": 0.9773868622932619, + "learning_rate": 2.52511155025324e-06, + "loss": 0.0314, + "step": 4492 + }, + { + "epoch": 2.096082089552239, + "grad_norm": 0.9018453297555646, + "learning_rate": 2.520396790150881e-06, + "loss": 0.0333, + "step": 4494 + }, + { + "epoch": 2.0970149253731343, + "grad_norm": 1.0558594419317222, + "learning_rate": 2.5156849520796558e-06, + "loss": 0.0361, + "step": 4496 + }, + { + "epoch": 2.09794776119403, + "grad_norm": 0.9667061096277667, + "learning_rate": 2.510976041592123e-06, + "loss": 0.0341, + "step": 4498 + }, + { + "epoch": 2.0988805970149254, + "grad_norm": 0.8584431894215813, + "learning_rate": 2.5062700642373868e-06, + "loss": 0.0296, + "step": 4500 + }, + { + "epoch": 2.0988805970149254, + "eval_loss": 0.1816190481185913, + "eval_runtime": 322.5454, + "eval_samples_per_second": 47.268, + "eval_steps_per_second": 5.909, + "step": 4500 + }, + { + "epoch": 2.0998134328358207, + "grad_norm": 0.8906044834057271, + "learning_rate": 2.501567025561098e-06, + "loss": 0.0348, + "step": 4502 + }, + { + "epoch": 2.1007462686567164, + "grad_norm": 0.9724157585934898, + "learning_rate": 2.4968669311054473e-06, + "loss": 0.0345, + "step": 4504 + }, + { + "epoch": 2.1016791044776117, + "grad_norm": 1.081134027325536, + "learning_rate": 2.4921697864091478e-06, + "loss": 0.0387, + "step": 4506 + }, + { + "epoch": 2.1026119402985075, + "grad_norm": 1.0419985812048587, + "learning_rate": 2.4874755970074448e-06, + "loss": 0.0337, + "step": 4508 + }, + { + "epoch": 2.103544776119403, + "grad_norm": 0.9031019447209006, + "learning_rate": 2.4827843684320967e-06, + "loss": 0.0375, + "step": 4510 + }, + { + "epoch": 2.1044776119402986, + "grad_norm": 0.8739561592400861, + "learning_rate": 2.4780961062113683e-06, + "loss": 0.0312, + "step": 4512 + }, + { + "epoch": 2.105410447761194, + "grad_norm": 1.0421421043853216, + "learning_rate": 2.473410815870042e-06, + "loss": 0.0371, + "step": 4514 + }, + { + "epoch": 2.1063432835820897, + "grad_norm": 0.8601097626529265, + "learning_rate": 2.4687285029293866e-06, + "loss": 0.0363, + "step": 4516 + }, + { + "epoch": 2.107276119402985, + "grad_norm": 0.8626554964214478, + "learning_rate": 2.4640491729071635e-06, + "loss": 0.0327, + "step": 4518 + }, + { + "epoch": 2.1082089552238807, + "grad_norm": 0.8135345781725284, + "learning_rate": 2.4593728313176246e-06, + "loss": 0.0304, + "step": 4520 + }, + { + "epoch": 2.109141791044776, + "grad_norm": 1.0102193837120443, + "learning_rate": 2.454699483671493e-06, + "loss": 0.0334, + "step": 4522 + }, + { + "epoch": 2.110074626865672, + "grad_norm": 0.9757241923191624, + "learning_rate": 2.450029135475969e-06, + "loss": 0.0377, + "step": 4524 + }, + { + "epoch": 2.111007462686567, + "grad_norm": 0.8986898680617315, + "learning_rate": 2.4453617922347194e-06, + "loss": 0.0303, + "step": 4526 + }, + { + "epoch": 2.111940298507463, + "grad_norm": 1.0113102214648557, + "learning_rate": 2.440697459447864e-06, + "loss": 0.0395, + "step": 4528 + }, + { + "epoch": 2.112873134328358, + "grad_norm": 0.9163503674162015, + "learning_rate": 2.4360361426119767e-06, + "loss": 0.0345, + "step": 4530 + }, + { + "epoch": 2.1138059701492535, + "grad_norm": 0.8831550411456144, + "learning_rate": 2.4313778472200824e-06, + "loss": 0.0324, + "step": 4532 + }, + { + "epoch": 2.1147388059701493, + "grad_norm": 0.8697203486215254, + "learning_rate": 2.4267225787616376e-06, + "loss": 0.0322, + "step": 4534 + }, + { + "epoch": 2.1156716417910446, + "grad_norm": 0.8707276440239047, + "learning_rate": 2.4220703427225384e-06, + "loss": 0.0326, + "step": 4536 + }, + { + "epoch": 2.1166044776119404, + "grad_norm": 0.9452643458255807, + "learning_rate": 2.4174211445851066e-06, + "loss": 0.0342, + "step": 4538 + }, + { + "epoch": 2.1175373134328357, + "grad_norm": 0.9742990319517244, + "learning_rate": 2.4127749898280783e-06, + "loss": 0.0384, + "step": 4540 + }, + { + "epoch": 2.1184701492537314, + "grad_norm": 1.0034601356655437, + "learning_rate": 2.4081318839266117e-06, + "loss": 0.036, + "step": 4542 + }, + { + "epoch": 2.1194029850746268, + "grad_norm": 0.9056355918535672, + "learning_rate": 2.4034918323522628e-06, + "loss": 0.029, + "step": 4544 + }, + { + "epoch": 2.1203358208955225, + "grad_norm": 0.8015820442039946, + "learning_rate": 2.398854840572998e-06, + "loss": 0.0321, + "step": 4546 + }, + { + "epoch": 2.121268656716418, + "grad_norm": 0.8309431070007485, + "learning_rate": 2.3942209140531693e-06, + "loss": 0.0311, + "step": 4548 + }, + { + "epoch": 2.1222014925373136, + "grad_norm": 0.8648653780893618, + "learning_rate": 2.389590058253523e-06, + "loss": 0.033, + "step": 4550 + }, + { + "epoch": 2.123134328358209, + "grad_norm": 1.1200010143374086, + "learning_rate": 2.384962278631182e-06, + "loss": 0.0387, + "step": 4552 + }, + { + "epoch": 2.1240671641791047, + "grad_norm": 0.9618596793829509, + "learning_rate": 2.3803375806396474e-06, + "loss": 0.0373, + "step": 4554 + }, + { + "epoch": 2.125, + "grad_norm": 1.004063616906792, + "learning_rate": 2.3757159697287895e-06, + "loss": 0.0376, + "step": 4556 + }, + { + "epoch": 2.1259328358208953, + "grad_norm": 0.8760913390235822, + "learning_rate": 2.371097451344836e-06, + "loss": 0.0312, + "step": 4558 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 1.064785225545377, + "learning_rate": 2.366482030930376e-06, + "loss": 0.0344, + "step": 4560 + }, + { + "epoch": 2.1277985074626864, + "grad_norm": 0.940320101450485, + "learning_rate": 2.3618697139243437e-06, + "loss": 0.0347, + "step": 4562 + }, + { + "epoch": 2.128731343283582, + "grad_norm": 0.8600791222658538, + "learning_rate": 2.357260505762015e-06, + "loss": 0.0331, + "step": 4564 + }, + { + "epoch": 2.1296641791044775, + "grad_norm": 1.0088948594978933, + "learning_rate": 2.3526544118750077e-06, + "loss": 0.0354, + "step": 4566 + }, + { + "epoch": 2.1305970149253732, + "grad_norm": 0.9604993555548367, + "learning_rate": 2.348051437691268e-06, + "loss": 0.0341, + "step": 4568 + }, + { + "epoch": 2.1315298507462686, + "grad_norm": 1.0054616972058592, + "learning_rate": 2.343451588635061e-06, + "loss": 0.0373, + "step": 4570 + }, + { + "epoch": 2.1324626865671643, + "grad_norm": 1.0641537750139058, + "learning_rate": 2.3388548701269763e-06, + "loss": 0.033, + "step": 4572 + }, + { + "epoch": 2.1333955223880596, + "grad_norm": 1.0002127636434242, + "learning_rate": 2.3342612875839095e-06, + "loss": 0.0362, + "step": 4574 + }, + { + "epoch": 2.1343283582089554, + "grad_norm": 0.9797640995927954, + "learning_rate": 2.3296708464190567e-06, + "loss": 0.0332, + "step": 4576 + }, + { + "epoch": 2.1352611940298507, + "grad_norm": 0.915471639453477, + "learning_rate": 2.325083552041925e-06, + "loss": 0.0334, + "step": 4578 + }, + { + "epoch": 2.1361940298507465, + "grad_norm": 1.1342225227237468, + "learning_rate": 2.3204994098583026e-06, + "loss": 0.0357, + "step": 4580 + }, + { + "epoch": 2.137126865671642, + "grad_norm": 1.012524808599099, + "learning_rate": 2.3159184252702636e-06, + "loss": 0.04, + "step": 4582 + }, + { + "epoch": 2.138059701492537, + "grad_norm": 0.8438797093214802, + "learning_rate": 2.3113406036761676e-06, + "loss": 0.0295, + "step": 4584 + }, + { + "epoch": 2.138992537313433, + "grad_norm": 0.9949292984726634, + "learning_rate": 2.306765950470639e-06, + "loss": 0.0367, + "step": 4586 + }, + { + "epoch": 2.139925373134328, + "grad_norm": 0.9869746962652547, + "learning_rate": 2.302194471044573e-06, + "loss": 0.0329, + "step": 4588 + }, + { + "epoch": 2.140858208955224, + "grad_norm": 0.96929114570457, + "learning_rate": 2.2976261707851272e-06, + "loss": 0.0352, + "step": 4590 + }, + { + "epoch": 2.1417910447761193, + "grad_norm": 1.0021017263034497, + "learning_rate": 2.293061055075707e-06, + "loss": 0.0318, + "step": 4592 + }, + { + "epoch": 2.142723880597015, + "grad_norm": 1.0421455590386683, + "learning_rate": 2.288499129295966e-06, + "loss": 0.0327, + "step": 4594 + }, + { + "epoch": 2.1436567164179103, + "grad_norm": 0.8489890614711563, + "learning_rate": 2.2839403988218016e-06, + "loss": 0.0319, + "step": 4596 + }, + { + "epoch": 2.144589552238806, + "grad_norm": 0.7705589657767264, + "learning_rate": 2.279384869025347e-06, + "loss": 0.0327, + "step": 4598 + }, + { + "epoch": 2.1455223880597014, + "grad_norm": 1.1150761153080226, + "learning_rate": 2.2748325452749567e-06, + "loss": 0.0369, + "step": 4600 + }, + { + "epoch": 2.146455223880597, + "grad_norm": 1.0975772203589185, + "learning_rate": 2.270283432935216e-06, + "loss": 0.0356, + "step": 4602 + }, + { + "epoch": 2.1473880597014925, + "grad_norm": 0.9610466206421365, + "learning_rate": 2.265737537366916e-06, + "loss": 0.0312, + "step": 4604 + }, + { + "epoch": 2.1483208955223883, + "grad_norm": 0.9811749020127127, + "learning_rate": 2.261194863927068e-06, + "loss": 0.0349, + "step": 4606 + }, + { + "epoch": 2.1492537313432836, + "grad_norm": 1.0291519371503837, + "learning_rate": 2.2566554179688756e-06, + "loss": 0.0299, + "step": 4608 + }, + { + "epoch": 2.1501865671641793, + "grad_norm": 1.0796810043556722, + "learning_rate": 2.252119204841747e-06, + "loss": 0.0305, + "step": 4610 + }, + { + "epoch": 2.1511194029850746, + "grad_norm": 0.9349459452646798, + "learning_rate": 2.2475862298912784e-06, + "loss": 0.0306, + "step": 4612 + }, + { + "epoch": 2.15205223880597, + "grad_norm": 1.0097370464745856, + "learning_rate": 2.243056498459248e-06, + "loss": 0.0356, + "step": 4614 + }, + { + "epoch": 2.1529850746268657, + "grad_norm": 1.0695894313307166, + "learning_rate": 2.2385300158836116e-06, + "loss": 0.0365, + "step": 4616 + }, + { + "epoch": 2.153917910447761, + "grad_norm": 0.909446348523728, + "learning_rate": 2.2340067874984995e-06, + "loss": 0.0313, + "step": 4618 + }, + { + "epoch": 2.154850746268657, + "grad_norm": 1.463227945494659, + "learning_rate": 2.2294868186342085e-06, + "loss": 0.0344, + "step": 4620 + }, + { + "epoch": 2.155783582089552, + "grad_norm": 0.885503083261715, + "learning_rate": 2.2249701146171864e-06, + "loss": 0.0321, + "step": 4622 + }, + { + "epoch": 2.156716417910448, + "grad_norm": 0.9832054843436491, + "learning_rate": 2.2204566807700433e-06, + "loss": 0.0347, + "step": 4624 + }, + { + "epoch": 2.157649253731343, + "grad_norm": 0.8772604963675062, + "learning_rate": 2.2159465224115295e-06, + "loss": 0.032, + "step": 4626 + }, + { + "epoch": 2.158582089552239, + "grad_norm": 1.1186244209607723, + "learning_rate": 2.2114396448565328e-06, + "loss": 0.0375, + "step": 4628 + }, + { + "epoch": 2.1595149253731343, + "grad_norm": 0.9442783503559156, + "learning_rate": 2.2069360534160865e-06, + "loss": 0.0363, + "step": 4630 + }, + { + "epoch": 2.16044776119403, + "grad_norm": 0.9397474301368692, + "learning_rate": 2.20243575339734e-06, + "loss": 0.0309, + "step": 4632 + }, + { + "epoch": 2.1613805970149254, + "grad_norm": 0.8914471569658011, + "learning_rate": 2.1979387501035666e-06, + "loss": 0.0302, + "step": 4634 + }, + { + "epoch": 2.1623134328358207, + "grad_norm": 0.8553458524065944, + "learning_rate": 2.1934450488341584e-06, + "loss": 0.0305, + "step": 4636 + }, + { + "epoch": 2.1632462686567164, + "grad_norm": 0.9603054838866182, + "learning_rate": 2.1889546548846117e-06, + "loss": 0.0317, + "step": 4638 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.980620474309731, + "learning_rate": 2.1844675735465285e-06, + "loss": 0.0368, + "step": 4640 + }, + { + "epoch": 2.1651119402985075, + "grad_norm": 0.8389929699446843, + "learning_rate": 2.1799838101076086e-06, + "loss": 0.0297, + "step": 4642 + }, + { + "epoch": 2.166044776119403, + "grad_norm": 1.0237442187823946, + "learning_rate": 2.1755033698516374e-06, + "loss": 0.0353, + "step": 4644 + }, + { + "epoch": 2.1669776119402986, + "grad_norm": 0.9671345740512663, + "learning_rate": 2.171026258058484e-06, + "loss": 0.0308, + "step": 4646 + }, + { + "epoch": 2.167910447761194, + "grad_norm": 0.9021871451964945, + "learning_rate": 2.1665524800041015e-06, + "loss": 0.0353, + "step": 4648 + }, + { + "epoch": 2.1688432835820897, + "grad_norm": 0.917686541925775, + "learning_rate": 2.1620820409605067e-06, + "loss": 0.0334, + "step": 4650 + }, + { + "epoch": 2.169776119402985, + "grad_norm": 0.8653719393610244, + "learning_rate": 2.1576149461957867e-06, + "loss": 0.0322, + "step": 4652 + }, + { + "epoch": 2.1707089552238807, + "grad_norm": 0.8406907699111601, + "learning_rate": 2.153151200974088e-06, + "loss": 0.0354, + "step": 4654 + }, + { + "epoch": 2.171641791044776, + "grad_norm": 0.830325595315677, + "learning_rate": 2.1486908105556047e-06, + "loss": 0.0329, + "step": 4656 + }, + { + "epoch": 2.172574626865672, + "grad_norm": 0.8915313413102008, + "learning_rate": 2.1442337801965844e-06, + "loss": 0.031, + "step": 4658 + }, + { + "epoch": 2.173507462686567, + "grad_norm": 1.0455817370734015, + "learning_rate": 2.139780115149308e-06, + "loss": 0.0366, + "step": 4660 + }, + { + "epoch": 2.174440298507463, + "grad_norm": 0.8336915273217668, + "learning_rate": 2.135329820662096e-06, + "loss": 0.031, + "step": 4662 + }, + { + "epoch": 2.175373134328358, + "grad_norm": 1.034412884123277, + "learning_rate": 2.130882901979297e-06, + "loss": 0.0311, + "step": 4664 + }, + { + "epoch": 2.1763059701492535, + "grad_norm": 1.014121104563138, + "learning_rate": 2.1264393643412778e-06, + "loss": 0.0292, + "step": 4666 + }, + { + "epoch": 2.1772388059701493, + "grad_norm": 0.9249121927535365, + "learning_rate": 2.1219992129844207e-06, + "loss": 0.0336, + "step": 4668 + }, + { + "epoch": 2.1781716417910446, + "grad_norm": 0.9316547180306575, + "learning_rate": 2.1175624531411215e-06, + "loss": 0.0331, + "step": 4670 + }, + { + "epoch": 2.1791044776119404, + "grad_norm": 1.042818841209333, + "learning_rate": 2.1131290900397792e-06, + "loss": 0.0372, + "step": 4672 + }, + { + "epoch": 2.1800373134328357, + "grad_norm": 0.8447600194276534, + "learning_rate": 2.108699128904784e-06, + "loss": 0.0299, + "step": 4674 + }, + { + "epoch": 2.1809701492537314, + "grad_norm": 0.7899589890991614, + "learning_rate": 2.104272574956526e-06, + "loss": 0.0316, + "step": 4676 + }, + { + "epoch": 2.1819029850746268, + "grad_norm": 0.898035918160858, + "learning_rate": 2.0998494334113733e-06, + "loss": 0.0355, + "step": 4678 + }, + { + "epoch": 2.1828358208955225, + "grad_norm": 0.9252003811177696, + "learning_rate": 2.0954297094816708e-06, + "loss": 0.0335, + "step": 4680 + }, + { + "epoch": 2.183768656716418, + "grad_norm": 0.8512045478373349, + "learning_rate": 2.091013408375747e-06, + "loss": 0.0315, + "step": 4682 + }, + { + "epoch": 2.1847014925373136, + "grad_norm": 1.0448281959161687, + "learning_rate": 2.0866005352978875e-06, + "loss": 0.0355, + "step": 4684 + }, + { + "epoch": 2.185634328358209, + "grad_norm": 0.9289854915666329, + "learning_rate": 2.082191095448338e-06, + "loss": 0.0325, + "step": 4686 + }, + { + "epoch": 2.1865671641791047, + "grad_norm": 0.9352449594094461, + "learning_rate": 2.077785094023305e-06, + "loss": 0.0302, + "step": 4688 + }, + { + "epoch": 2.1875, + "grad_norm": 1.0548731176048733, + "learning_rate": 2.0733825362149356e-06, + "loss": 0.0371, + "step": 4690 + }, + { + "epoch": 2.1884328358208953, + "grad_norm": 1.1335185831863699, + "learning_rate": 2.0689834272113234e-06, + "loss": 0.036, + "step": 4692 + }, + { + "epoch": 2.189365671641791, + "grad_norm": 0.9700440826454366, + "learning_rate": 2.0645877721964996e-06, + "loss": 0.033, + "step": 4694 + }, + { + "epoch": 2.1902985074626864, + "grad_norm": 1.0383039451974871, + "learning_rate": 2.0601955763504207e-06, + "loss": 0.0351, + "step": 4696 + }, + { + "epoch": 2.191231343283582, + "grad_norm": 0.9495829448601033, + "learning_rate": 2.0558068448489647e-06, + "loss": 0.0317, + "step": 4698 + }, + { + "epoch": 2.1921641791044775, + "grad_norm": 1.1212627856210597, + "learning_rate": 2.051421582863937e-06, + "loss": 0.0363, + "step": 4700 + }, + { + "epoch": 2.1930970149253732, + "grad_norm": 1.0209101068037678, + "learning_rate": 2.047039795563043e-06, + "loss": 0.0335, + "step": 4702 + }, + { + "epoch": 2.1940298507462686, + "grad_norm": 0.9650803299518893, + "learning_rate": 2.0426614881099013e-06, + "loss": 0.0292, + "step": 4704 + }, + { + "epoch": 2.1949626865671643, + "grad_norm": 0.9853328481980234, + "learning_rate": 2.0382866656640288e-06, + "loss": 0.0306, + "step": 4706 + }, + { + "epoch": 2.1958955223880596, + "grad_norm": 0.7923048271755836, + "learning_rate": 2.0339153333808304e-06, + "loss": 0.0303, + "step": 4708 + }, + { + "epoch": 2.1968283582089554, + "grad_norm": 0.965579537679953, + "learning_rate": 2.029547496411605e-06, + "loss": 0.0382, + "step": 4710 + }, + { + "epoch": 2.1977611940298507, + "grad_norm": 1.1500344236104285, + "learning_rate": 2.025183159903526e-06, + "loss": 0.0366, + "step": 4712 + }, + { + "epoch": 2.1986940298507465, + "grad_norm": 1.06417345608817, + "learning_rate": 2.0208223289996466e-06, + "loss": 0.0371, + "step": 4714 + }, + { + "epoch": 2.199626865671642, + "grad_norm": 1.110285550269103, + "learning_rate": 2.016465008838889e-06, + "loss": 0.0332, + "step": 4716 + }, + { + "epoch": 2.200559701492537, + "grad_norm": 0.9693828931537851, + "learning_rate": 2.012111204556035e-06, + "loss": 0.0329, + "step": 4718 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 1.0073905057103043, + "learning_rate": 2.0077609212817224e-06, + "loss": 0.0347, + "step": 4720 + }, + { + "epoch": 2.202425373134328, + "grad_norm": 0.9489285384383775, + "learning_rate": 2.0034141641424437e-06, + "loss": 0.0326, + "step": 4722 + }, + { + "epoch": 2.203358208955224, + "grad_norm": 0.9206842935899366, + "learning_rate": 1.999070938260537e-06, + "loss": 0.0342, + "step": 4724 + }, + { + "epoch": 2.2042910447761193, + "grad_norm": 0.8781498563869979, + "learning_rate": 1.994731248754173e-06, + "loss": 0.0314, + "step": 4726 + }, + { + "epoch": 2.205223880597015, + "grad_norm": 0.9872802161438039, + "learning_rate": 1.9903951007373617e-06, + "loss": 0.0329, + "step": 4728 + }, + { + "epoch": 2.2061567164179103, + "grad_norm": 0.9455452744440873, + "learning_rate": 1.9860624993199345e-06, + "loss": 0.035, + "step": 4730 + }, + { + "epoch": 2.207089552238806, + "grad_norm": 0.904213166976788, + "learning_rate": 1.9817334496075447e-06, + "loss": 0.0317, + "step": 4732 + }, + { + "epoch": 2.2080223880597014, + "grad_norm": 0.9319034379101149, + "learning_rate": 1.9774079567016613e-06, + "loss": 0.0308, + "step": 4734 + }, + { + "epoch": 2.208955223880597, + "grad_norm": 0.9522537506099564, + "learning_rate": 1.9730860256995643e-06, + "loss": 0.0319, + "step": 4736 + }, + { + "epoch": 2.2098880597014925, + "grad_norm": 1.003207526746631, + "learning_rate": 1.9687676616943303e-06, + "loss": 0.0303, + "step": 4738 + }, + { + "epoch": 2.2108208955223883, + "grad_norm": 0.9941387313771194, + "learning_rate": 1.964452869774838e-06, + "loss": 0.0341, + "step": 4740 + }, + { + "epoch": 2.2117537313432836, + "grad_norm": 1.078428961396446, + "learning_rate": 1.960141655025751e-06, + "loss": 0.0328, + "step": 4742 + }, + { + "epoch": 2.2126865671641793, + "grad_norm": 1.047698707396123, + "learning_rate": 1.9558340225275236e-06, + "loss": 0.0388, + "step": 4744 + }, + { + "epoch": 2.2136194029850746, + "grad_norm": 0.9245180530064719, + "learning_rate": 1.9515299773563862e-06, + "loss": 0.0367, + "step": 4746 + }, + { + "epoch": 2.21455223880597, + "grad_norm": 0.7666841210019265, + "learning_rate": 1.947229524584341e-06, + "loss": 0.0317, + "step": 4748 + }, + { + "epoch": 2.2154850746268657, + "grad_norm": 1.0904125425418785, + "learning_rate": 1.942932669279154e-06, + "loss": 0.0318, + "step": 4750 + }, + { + "epoch": 2.216417910447761, + "grad_norm": 0.9895065655721109, + "learning_rate": 1.9386394165043596e-06, + "loss": 0.0347, + "step": 4752 + }, + { + "epoch": 2.217350746268657, + "grad_norm": 0.9353829693815657, + "learning_rate": 1.9343497713192387e-06, + "loss": 0.0302, + "step": 4754 + }, + { + "epoch": 2.218283582089552, + "grad_norm": 0.9003688886804847, + "learning_rate": 1.930063738778827e-06, + "loss": 0.0294, + "step": 4756 + }, + { + "epoch": 2.219216417910448, + "grad_norm": 1.0649572955034126, + "learning_rate": 1.925781323933901e-06, + "loss": 0.0343, + "step": 4758 + }, + { + "epoch": 2.220149253731343, + "grad_norm": 0.9021986373914942, + "learning_rate": 1.9215025318309704e-06, + "loss": 0.0327, + "step": 4760 + }, + { + "epoch": 2.221082089552239, + "grad_norm": 0.9654614343606513, + "learning_rate": 1.9172273675122833e-06, + "loss": 0.034, + "step": 4762 + }, + { + "epoch": 2.2220149253731343, + "grad_norm": 0.9753988770300552, + "learning_rate": 1.9129558360158057e-06, + "loss": 0.0358, + "step": 4764 + }, + { + "epoch": 2.22294776119403, + "grad_norm": 1.0302665303404692, + "learning_rate": 1.9086879423752218e-06, + "loss": 0.0378, + "step": 4766 + }, + { + "epoch": 2.2238805970149254, + "grad_norm": 0.9546071003694403, + "learning_rate": 1.9044236916199404e-06, + "loss": 0.0331, + "step": 4768 + }, + { + "epoch": 2.2248134328358207, + "grad_norm": 0.9541014652633691, + "learning_rate": 1.9001630887750643e-06, + "loss": 0.0331, + "step": 4770 + }, + { + "epoch": 2.2257462686567164, + "grad_norm": 0.8919137335366932, + "learning_rate": 1.8959061388614013e-06, + "loss": 0.0295, + "step": 4772 + }, + { + "epoch": 2.2266791044776117, + "grad_norm": 1.0383502957226958, + "learning_rate": 1.8916528468954598e-06, + "loss": 0.028, + "step": 4774 + }, + { + "epoch": 2.2276119402985075, + "grad_norm": 0.9602279618187743, + "learning_rate": 1.8874032178894291e-06, + "loss": 0.0302, + "step": 4776 + }, + { + "epoch": 2.228544776119403, + "grad_norm": 0.9150970720932146, + "learning_rate": 1.8831572568511891e-06, + "loss": 0.0316, + "step": 4778 + }, + { + "epoch": 2.2294776119402986, + "grad_norm": 0.9796102398418659, + "learning_rate": 1.8789149687842955e-06, + "loss": 0.0318, + "step": 4780 + }, + { + "epoch": 2.230410447761194, + "grad_norm": 0.9452647015501673, + "learning_rate": 1.8746763586879729e-06, + "loss": 0.0324, + "step": 4782 + }, + { + "epoch": 2.2313432835820897, + "grad_norm": 1.056537633599161, + "learning_rate": 1.8704414315571117e-06, + "loss": 0.0381, + "step": 4784 + }, + { + "epoch": 2.232276119402985, + "grad_norm": 0.9104889867814913, + "learning_rate": 1.8662101923822668e-06, + "loss": 0.0321, + "step": 4786 + }, + { + "epoch": 2.2332089552238807, + "grad_norm": 1.1917555284634107, + "learning_rate": 1.861982646149645e-06, + "loss": 0.0332, + "step": 4788 + }, + { + "epoch": 2.234141791044776, + "grad_norm": 0.9347915254367777, + "learning_rate": 1.8577587978410967e-06, + "loss": 0.03, + "step": 4790 + }, + { + "epoch": 2.235074626865672, + "grad_norm": 0.9894703553497254, + "learning_rate": 1.8535386524341225e-06, + "loss": 0.0291, + "step": 4792 + }, + { + "epoch": 2.236007462686567, + "grad_norm": 0.9633339225718022, + "learning_rate": 1.8493222149018524e-06, + "loss": 0.0321, + "step": 4794 + }, + { + "epoch": 2.236940298507463, + "grad_norm": 1.0344433383500846, + "learning_rate": 1.8451094902130506e-06, + "loss": 0.0357, + "step": 4796 + }, + { + "epoch": 2.237873134328358, + "grad_norm": 1.1072755366728024, + "learning_rate": 1.840900483332107e-06, + "loss": 0.0367, + "step": 4798 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.9811169471003288, + "learning_rate": 1.8366951992190275e-06, + "loss": 0.0301, + "step": 4800 + }, + { + "epoch": 2.2397388059701493, + "grad_norm": 0.8912312323349124, + "learning_rate": 1.8324936428294293e-06, + "loss": 0.0294, + "step": 4802 + }, + { + "epoch": 2.2406716417910446, + "grad_norm": 0.9284678038980523, + "learning_rate": 1.828295819114544e-06, + "loss": 0.0315, + "step": 4804 + }, + { + "epoch": 2.2416044776119404, + "grad_norm": 0.9793525696039931, + "learning_rate": 1.8241017330211958e-06, + "loss": 0.03, + "step": 4806 + }, + { + "epoch": 2.2425373134328357, + "grad_norm": 0.8786912497482439, + "learning_rate": 1.8199113894918103e-06, + "loss": 0.0302, + "step": 4808 + }, + { + "epoch": 2.2434701492537314, + "grad_norm": 0.8317446223359907, + "learning_rate": 1.8157247934644035e-06, + "loss": 0.0278, + "step": 4810 + }, + { + "epoch": 2.2444029850746268, + "grad_norm": 1.0260812557611483, + "learning_rate": 1.8115419498725684e-06, + "loss": 0.0302, + "step": 4812 + }, + { + "epoch": 2.2453358208955225, + "grad_norm": 0.8198119601969011, + "learning_rate": 1.8073628636454848e-06, + "loss": 0.0291, + "step": 4814 + }, + { + "epoch": 2.246268656716418, + "grad_norm": 0.8436443298067087, + "learning_rate": 1.8031875397078984e-06, + "loss": 0.0274, + "step": 4816 + }, + { + "epoch": 2.2472014925373136, + "grad_norm": 1.1190571985048972, + "learning_rate": 1.799015982980119e-06, + "loss": 0.0316, + "step": 4818 + }, + { + "epoch": 2.248134328358209, + "grad_norm": 0.9082824287317579, + "learning_rate": 1.7948481983780292e-06, + "loss": 0.0325, + "step": 4820 + }, + { + "epoch": 2.2490671641791047, + "grad_norm": 1.0549276508580878, + "learning_rate": 1.7906841908130545e-06, + "loss": 0.0345, + "step": 4822 + }, + { + "epoch": 2.25, + "grad_norm": 1.019585336150271, + "learning_rate": 1.7865239651921723e-06, + "loss": 0.0339, + "step": 4824 + }, + { + "epoch": 2.2509328358208958, + "grad_norm": 1.0619989537882322, + "learning_rate": 1.7823675264179068e-06, + "loss": 0.0303, + "step": 4826 + }, + { + "epoch": 2.251865671641791, + "grad_norm": 0.9252518561016344, + "learning_rate": 1.7782148793883147e-06, + "loss": 0.0315, + "step": 4828 + }, + { + "epoch": 2.2527985074626864, + "grad_norm": 0.9736992699499954, + "learning_rate": 1.7740660289969886e-06, + "loss": 0.0304, + "step": 4830 + }, + { + "epoch": 2.253731343283582, + "grad_norm": 1.0516237752505884, + "learning_rate": 1.769920980133047e-06, + "loss": 0.0344, + "step": 4832 + }, + { + "epoch": 2.2546641791044775, + "grad_norm": 0.9351502996935023, + "learning_rate": 1.7657797376811252e-06, + "loss": 0.0324, + "step": 4834 + }, + { + "epoch": 2.2555970149253732, + "grad_norm": 0.8430799506091149, + "learning_rate": 1.7616423065213729e-06, + "loss": 0.031, + "step": 4836 + }, + { + "epoch": 2.2565298507462686, + "grad_norm": 0.8875048034060027, + "learning_rate": 1.7575086915294525e-06, + "loss": 0.0327, + "step": 4838 + }, + { + "epoch": 2.2574626865671643, + "grad_norm": 0.9534089379751329, + "learning_rate": 1.7533788975765281e-06, + "loss": 0.0343, + "step": 4840 + }, + { + "epoch": 2.2583955223880596, + "grad_norm": 1.0787212009961646, + "learning_rate": 1.7492529295292577e-06, + "loss": 0.0326, + "step": 4842 + }, + { + "epoch": 2.2593283582089554, + "grad_norm": 0.941164075013674, + "learning_rate": 1.745130792249795e-06, + "loss": 0.0326, + "step": 4844 + }, + { + "epoch": 2.2602611940298507, + "grad_norm": 0.8528604588181112, + "learning_rate": 1.741012490595777e-06, + "loss": 0.0286, + "step": 4846 + }, + { + "epoch": 2.2611940298507465, + "grad_norm": 0.9063789272252968, + "learning_rate": 1.7368980294203185e-06, + "loss": 0.0333, + "step": 4848 + }, + { + "epoch": 2.262126865671642, + "grad_norm": 0.8720875843908757, + "learning_rate": 1.732787413572014e-06, + "loss": 0.0303, + "step": 4850 + }, + { + "epoch": 2.263059701492537, + "grad_norm": 0.9966272096294451, + "learning_rate": 1.7286806478949247e-06, + "loss": 0.0316, + "step": 4852 + }, + { + "epoch": 2.263992537313433, + "grad_norm": 0.9455107563473335, + "learning_rate": 1.724577737228571e-06, + "loss": 0.0284, + "step": 4854 + }, + { + "epoch": 2.264925373134328, + "grad_norm": 1.122251605217344, + "learning_rate": 1.720478686407936e-06, + "loss": 0.0332, + "step": 4856 + }, + { + "epoch": 2.265858208955224, + "grad_norm": 0.852383887426748, + "learning_rate": 1.7163835002634483e-06, + "loss": 0.0307, + "step": 4858 + }, + { + "epoch": 2.2667910447761193, + "grad_norm": 0.9025202081516984, + "learning_rate": 1.7122921836209866e-06, + "loss": 0.0285, + "step": 4860 + }, + { + "epoch": 2.267723880597015, + "grad_norm": 1.0153947109788386, + "learning_rate": 1.7082047413018715e-06, + "loss": 0.0346, + "step": 4862 + }, + { + "epoch": 2.2686567164179103, + "grad_norm": 0.9393692050654928, + "learning_rate": 1.7041211781228506e-06, + "loss": 0.0337, + "step": 4864 + }, + { + "epoch": 2.269589552238806, + "grad_norm": 1.078006744869332, + "learning_rate": 1.7000414988961083e-06, + "loss": 0.0357, + "step": 4866 + }, + { + "epoch": 2.2705223880597014, + "grad_norm": 0.9746984688936382, + "learning_rate": 1.6959657084292463e-06, + "loss": 0.0285, + "step": 4868 + }, + { + "epoch": 2.271455223880597, + "grad_norm": 1.1149398638348036, + "learning_rate": 1.6918938115252847e-06, + "loss": 0.0342, + "step": 4870 + }, + { + "epoch": 2.2723880597014925, + "grad_norm": 0.9420355662172784, + "learning_rate": 1.6878258129826575e-06, + "loss": 0.0371, + "step": 4872 + }, + { + "epoch": 2.2733208955223883, + "grad_norm": 0.8833812328634052, + "learning_rate": 1.6837617175952058e-06, + "loss": 0.0305, + "step": 4874 + }, + { + "epoch": 2.2742537313432836, + "grad_norm": 0.9162630957322199, + "learning_rate": 1.6797015301521653e-06, + "loss": 0.0322, + "step": 4876 + }, + { + "epoch": 2.2751865671641793, + "grad_norm": 0.944306854206795, + "learning_rate": 1.6756452554381736e-06, + "loss": 0.028, + "step": 4878 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.8812094451894885, + "learning_rate": 1.6715928982332503e-06, + "loss": 0.0294, + "step": 4880 + }, + { + "epoch": 2.27705223880597, + "grad_norm": 0.921622264454937, + "learning_rate": 1.6675444633128041e-06, + "loss": 0.0341, + "step": 4882 + }, + { + "epoch": 2.2779850746268657, + "grad_norm": 1.0337252983568204, + "learning_rate": 1.6634999554476211e-06, + "loss": 0.0366, + "step": 4884 + }, + { + "epoch": 2.278917910447761, + "grad_norm": 1.0296969246910936, + "learning_rate": 1.6594593794038565e-06, + "loss": 0.03, + "step": 4886 + }, + { + "epoch": 2.279850746268657, + "grad_norm": 0.9209078112203684, + "learning_rate": 1.6554227399430328e-06, + "loss": 0.0306, + "step": 4888 + }, + { + "epoch": 2.280783582089552, + "grad_norm": 1.0601700915448702, + "learning_rate": 1.651390041822037e-06, + "loss": 0.0329, + "step": 4890 + }, + { + "epoch": 2.281716417910448, + "grad_norm": 0.9373147261111446, + "learning_rate": 1.6473612897931063e-06, + "loss": 0.0309, + "step": 4892 + }, + { + "epoch": 2.282649253731343, + "grad_norm": 1.0350624982733692, + "learning_rate": 1.6433364886038316e-06, + "loss": 0.0325, + "step": 4894 + }, + { + "epoch": 2.283582089552239, + "grad_norm": 1.005571213343325, + "learning_rate": 1.6393156429971491e-06, + "loss": 0.0336, + "step": 4896 + }, + { + "epoch": 2.2845149253731343, + "grad_norm": 0.9550712495760202, + "learning_rate": 1.6352987577113295e-06, + "loss": 0.0333, + "step": 4898 + }, + { + "epoch": 2.28544776119403, + "grad_norm": 0.9359834378755599, + "learning_rate": 1.6312858374799773e-06, + "loss": 0.0318, + "step": 4900 + }, + { + "epoch": 2.2863805970149254, + "grad_norm": 1.0374437239494685, + "learning_rate": 1.6272768870320265e-06, + "loss": 0.0373, + "step": 4902 + }, + { + "epoch": 2.2873134328358207, + "grad_norm": 0.8784400737974469, + "learning_rate": 1.6232719110917344e-06, + "loss": 0.0318, + "step": 4904 + }, + { + "epoch": 2.2882462686567164, + "grad_norm": 0.887693364695598, + "learning_rate": 1.6192709143786695e-06, + "loss": 0.0332, + "step": 4906 + }, + { + "epoch": 2.2891791044776117, + "grad_norm": 0.9029182379544906, + "learning_rate": 1.6152739016077162e-06, + "loss": 0.0326, + "step": 4908 + }, + { + "epoch": 2.2901119402985075, + "grad_norm": 1.11264434807744, + "learning_rate": 1.6112808774890592e-06, + "loss": 0.0349, + "step": 4910 + }, + { + "epoch": 2.291044776119403, + "grad_norm": 0.9429615283258698, + "learning_rate": 1.6072918467281874e-06, + "loss": 0.0302, + "step": 4912 + }, + { + "epoch": 2.2919776119402986, + "grad_norm": 0.99134599303643, + "learning_rate": 1.603306814025883e-06, + "loss": 0.0334, + "step": 4914 + }, + { + "epoch": 2.292910447761194, + "grad_norm": 1.0255295343792188, + "learning_rate": 1.5993257840782127e-06, + "loss": 0.029, + "step": 4916 + }, + { + "epoch": 2.2938432835820897, + "grad_norm": 1.101418351960286, + "learning_rate": 1.595348761576533e-06, + "loss": 0.0315, + "step": 4918 + }, + { + "epoch": 2.294776119402985, + "grad_norm": 0.9079827607045441, + "learning_rate": 1.5913757512074724e-06, + "loss": 0.0307, + "step": 4920 + }, + { + "epoch": 2.2957089552238807, + "grad_norm": 0.9558019184078752, + "learning_rate": 1.5874067576529306e-06, + "loss": 0.029, + "step": 4922 + }, + { + "epoch": 2.296641791044776, + "grad_norm": 0.916647014585152, + "learning_rate": 1.5834417855900796e-06, + "loss": 0.0355, + "step": 4924 + }, + { + "epoch": 2.297574626865672, + "grad_norm": 1.0450965539339738, + "learning_rate": 1.5794808396913503e-06, + "loss": 0.0323, + "step": 4926 + }, + { + "epoch": 2.298507462686567, + "grad_norm": 0.9195856408768837, + "learning_rate": 1.5755239246244235e-06, + "loss": 0.0271, + "step": 4928 + }, + { + "epoch": 2.299440298507463, + "grad_norm": 1.0044292930015237, + "learning_rate": 1.5715710450522393e-06, + "loss": 0.0327, + "step": 4930 + }, + { + "epoch": 2.300373134328358, + "grad_norm": 1.0332966948680593, + "learning_rate": 1.5676222056329744e-06, + "loss": 0.0316, + "step": 4932 + }, + { + "epoch": 2.3013059701492535, + "grad_norm": 0.9990698485315225, + "learning_rate": 1.5636774110200447e-06, + "loss": 0.0317, + "step": 4934 + }, + { + "epoch": 2.3022388059701493, + "grad_norm": 0.9005564633661388, + "learning_rate": 1.5597366658621093e-06, + "loss": 0.0292, + "step": 4936 + }, + { + "epoch": 2.3031716417910446, + "grad_norm": 1.0235086592087248, + "learning_rate": 1.5557999748030445e-06, + "loss": 0.03, + "step": 4938 + }, + { + "epoch": 2.3041044776119404, + "grad_norm": 1.0407345019915932, + "learning_rate": 1.5518673424819508e-06, + "loss": 0.031, + "step": 4940 + }, + { + "epoch": 2.3050373134328357, + "grad_norm": 1.1173523357373325, + "learning_rate": 1.5479387735331524e-06, + "loss": 0.033, + "step": 4942 + }, + { + "epoch": 2.3059701492537314, + "grad_norm": 0.9244988906969726, + "learning_rate": 1.5440142725861763e-06, + "loss": 0.032, + "step": 4944 + }, + { + "epoch": 2.3069029850746268, + "grad_norm": 1.0233139804844182, + "learning_rate": 1.5400938442657625e-06, + "loss": 0.0303, + "step": 4946 + }, + { + "epoch": 2.3078358208955225, + "grad_norm": 0.799951804777741, + "learning_rate": 1.53617749319185e-06, + "loss": 0.0305, + "step": 4948 + }, + { + "epoch": 2.308768656716418, + "grad_norm": 1.000997552437667, + "learning_rate": 1.5322652239795717e-06, + "loss": 0.0352, + "step": 4950 + }, + { + "epoch": 2.3097014925373136, + "grad_norm": 0.8318375059715735, + "learning_rate": 1.5283570412392478e-06, + "loss": 0.0312, + "step": 4952 + }, + { + "epoch": 2.310634328358209, + "grad_norm": 0.9785791023544128, + "learning_rate": 1.5244529495763893e-06, + "loss": 0.0309, + "step": 4954 + }, + { + "epoch": 2.3115671641791042, + "grad_norm": 0.8964024452247994, + "learning_rate": 1.5205529535916834e-06, + "loss": 0.0284, + "step": 4956 + }, + { + "epoch": 2.3125, + "grad_norm": 0.945822136524235, + "learning_rate": 1.5166570578809869e-06, + "loss": 0.0307, + "step": 4958 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.8345305166463103, + "learning_rate": 1.5127652670353321e-06, + "loss": 0.0284, + "step": 4960 + }, + { + "epoch": 2.314365671641791, + "grad_norm": 0.9264264389198162, + "learning_rate": 1.5088775856409066e-06, + "loss": 0.0278, + "step": 4962 + }, + { + "epoch": 2.3152985074626864, + "grad_norm": 0.9547188914742342, + "learning_rate": 1.5049940182790602e-06, + "loss": 0.0304, + "step": 4964 + }, + { + "epoch": 2.316231343283582, + "grad_norm": 1.1407054779735974, + "learning_rate": 1.5011145695262947e-06, + "loss": 0.0337, + "step": 4966 + }, + { + "epoch": 2.3171641791044775, + "grad_norm": 0.8996820314811704, + "learning_rate": 1.4972392439542533e-06, + "loss": 0.0274, + "step": 4968 + }, + { + "epoch": 2.3180970149253732, + "grad_norm": 1.0059265196229077, + "learning_rate": 1.4933680461297279e-06, + "loss": 0.0317, + "step": 4970 + }, + { + "epoch": 2.3190298507462686, + "grad_norm": 0.9141775842258535, + "learning_rate": 1.4895009806146404e-06, + "loss": 0.0269, + "step": 4972 + }, + { + "epoch": 2.3199626865671643, + "grad_norm": 0.9204833484695144, + "learning_rate": 1.4856380519660429e-06, + "loss": 0.0334, + "step": 4974 + }, + { + "epoch": 2.3208955223880596, + "grad_norm": 0.9472225911308894, + "learning_rate": 1.4817792647361168e-06, + "loss": 0.0317, + "step": 4976 + }, + { + "epoch": 2.3218283582089554, + "grad_norm": 0.8292477805867834, + "learning_rate": 1.477924623472161e-06, + "loss": 0.0307, + "step": 4978 + }, + { + "epoch": 2.3227611940298507, + "grad_norm": 1.1163000585725409, + "learning_rate": 1.4740741327165869e-06, + "loss": 0.0354, + "step": 4980 + }, + { + "epoch": 2.3236940298507465, + "grad_norm": 0.9550471993279462, + "learning_rate": 1.4702277970069184e-06, + "loss": 0.031, + "step": 4982 + }, + { + "epoch": 2.324626865671642, + "grad_norm": 0.9561926828491114, + "learning_rate": 1.4663856208757797e-06, + "loss": 0.0335, + "step": 4984 + }, + { + "epoch": 2.325559701492537, + "grad_norm": 0.9241964509065581, + "learning_rate": 1.4625476088508917e-06, + "loss": 0.0288, + "step": 4986 + }, + { + "epoch": 2.326492537313433, + "grad_norm": 0.8550279123404502, + "learning_rate": 1.458713765455077e-06, + "loss": 0.0307, + "step": 4988 + }, + { + "epoch": 2.327425373134328, + "grad_norm": 0.9570729328656946, + "learning_rate": 1.4548840952062365e-06, + "loss": 0.0324, + "step": 4990 + }, + { + "epoch": 2.328358208955224, + "grad_norm": 0.8785046354623663, + "learning_rate": 1.4510586026173557e-06, + "loss": 0.0316, + "step": 4992 + }, + { + "epoch": 2.3292910447761193, + "grad_norm": 1.041722775563538, + "learning_rate": 1.4472372921965005e-06, + "loss": 0.0298, + "step": 4994 + }, + { + "epoch": 2.330223880597015, + "grad_norm": 0.9351483285560535, + "learning_rate": 1.443420168446803e-06, + "loss": 0.0308, + "step": 4996 + }, + { + "epoch": 2.3311567164179103, + "grad_norm": 1.058348585781574, + "learning_rate": 1.4396072358664665e-06, + "loss": 0.0338, + "step": 4998 + }, + { + "epoch": 2.332089552238806, + "grad_norm": 0.9652039258901834, + "learning_rate": 1.4357984989487545e-06, + "loss": 0.0293, + "step": 5000 + }, + { + "epoch": 2.332089552238806, + "eval_loss": 0.18245889246463776, + "eval_runtime": 322.1455, + "eval_samples_per_second": 47.326, + "eval_steps_per_second": 5.917, + "step": 5000 + }, + { + "epoch": 2.3330223880597014, + "grad_norm": 1.1851156887139767, + "learning_rate": 1.4319939621819835e-06, + "loss": 0.0362, + "step": 5002 + }, + { + "epoch": 2.333955223880597, + "grad_norm": 0.8529028593835726, + "learning_rate": 1.4281936300495198e-06, + "loss": 0.0346, + "step": 5004 + }, + { + "epoch": 2.3348880597014925, + "grad_norm": 0.9594188285524563, + "learning_rate": 1.4243975070297817e-06, + "loss": 0.0335, + "step": 5006 + }, + { + "epoch": 2.3358208955223883, + "grad_norm": 0.9074486334593885, + "learning_rate": 1.4206055975962179e-06, + "loss": 0.0297, + "step": 5008 + }, + { + "epoch": 2.3367537313432836, + "grad_norm": 0.9751995519315027, + "learning_rate": 1.4168179062173193e-06, + "loss": 0.0306, + "step": 5010 + }, + { + "epoch": 2.3376865671641793, + "grad_norm": 0.9616759833614095, + "learning_rate": 1.413034437356604e-06, + "loss": 0.0284, + "step": 5012 + }, + { + "epoch": 2.3386194029850746, + "grad_norm": 1.1592255195505259, + "learning_rate": 1.4092551954726113e-06, + "loss": 0.0339, + "step": 5014 + }, + { + "epoch": 2.33955223880597, + "grad_norm": 1.2948160643221482, + "learning_rate": 1.4054801850189038e-06, + "loss": 0.0339, + "step": 5016 + }, + { + "epoch": 2.3404850746268657, + "grad_norm": 0.8674935561887429, + "learning_rate": 1.4017094104440527e-06, + "loss": 0.0322, + "step": 5018 + }, + { + "epoch": 2.341417910447761, + "grad_norm": 1.0367099183574866, + "learning_rate": 1.397942876191642e-06, + "loss": 0.0341, + "step": 5020 + }, + { + "epoch": 2.342350746268657, + "grad_norm": 0.809587220441153, + "learning_rate": 1.3941805867002578e-06, + "loss": 0.0295, + "step": 5022 + }, + { + "epoch": 2.343283582089552, + "grad_norm": 1.1115723698500566, + "learning_rate": 1.3904225464034821e-06, + "loss": 0.0323, + "step": 5024 + }, + { + "epoch": 2.344216417910448, + "grad_norm": 0.9093107346126881, + "learning_rate": 1.386668759729889e-06, + "loss": 0.0302, + "step": 5026 + }, + { + "epoch": 2.345149253731343, + "grad_norm": 0.9181842054624373, + "learning_rate": 1.3829192311030438e-06, + "loss": 0.0316, + "step": 5028 + }, + { + "epoch": 2.346082089552239, + "grad_norm": 0.9669415717064661, + "learning_rate": 1.3791739649414926e-06, + "loss": 0.0295, + "step": 5030 + }, + { + "epoch": 2.3470149253731343, + "grad_norm": 0.9549079919101966, + "learning_rate": 1.3754329656587556e-06, + "loss": 0.0334, + "step": 5032 + }, + { + "epoch": 2.34794776119403, + "grad_norm": 0.8704922978244095, + "learning_rate": 1.3716962376633296e-06, + "loss": 0.0294, + "step": 5034 + }, + { + "epoch": 2.3488805970149254, + "grad_norm": 0.860650439580406, + "learning_rate": 1.367963785358674e-06, + "loss": 0.031, + "step": 5036 + }, + { + "epoch": 2.3498134328358207, + "grad_norm": 0.8669502064663793, + "learning_rate": 1.3642356131432078e-06, + "loss": 0.0302, + "step": 5038 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.8529582341937184, + "learning_rate": 1.3605117254103157e-06, + "loss": 0.0288, + "step": 5040 + }, + { + "epoch": 2.3516791044776117, + "grad_norm": 1.0080665155964503, + "learning_rate": 1.3567921265483241e-06, + "loss": 0.0298, + "step": 5042 + }, + { + "epoch": 2.3526119402985075, + "grad_norm": 1.0272770219623781, + "learning_rate": 1.3530768209405064e-06, + "loss": 0.0308, + "step": 5044 + }, + { + "epoch": 2.353544776119403, + "grad_norm": 0.9141275593896965, + "learning_rate": 1.3493658129650827e-06, + "loss": 0.0298, + "step": 5046 + }, + { + "epoch": 2.3544776119402986, + "grad_norm": 0.8505700202865444, + "learning_rate": 1.3456591069952008e-06, + "loss": 0.0291, + "step": 5048 + }, + { + "epoch": 2.355410447761194, + "grad_norm": 0.8724278486813672, + "learning_rate": 1.341956707398945e-06, + "loss": 0.0312, + "step": 5050 + }, + { + "epoch": 2.3563432835820897, + "grad_norm": 1.0522051991743648, + "learning_rate": 1.3382586185393232e-06, + "loss": 0.0322, + "step": 5052 + }, + { + "epoch": 2.357276119402985, + "grad_norm": 1.010415268619055, + "learning_rate": 1.334564844774262e-06, + "loss": 0.0361, + "step": 5054 + }, + { + "epoch": 2.3582089552238807, + "grad_norm": 0.876681552657445, + "learning_rate": 1.330875390456602e-06, + "loss": 0.0269, + "step": 5056 + }, + { + "epoch": 2.359141791044776, + "grad_norm": 1.016793734088532, + "learning_rate": 1.327190259934098e-06, + "loss": 0.0288, + "step": 5058 + }, + { + "epoch": 2.360074626865672, + "grad_norm": 1.1488278864813115, + "learning_rate": 1.3235094575494044e-06, + "loss": 0.0326, + "step": 5060 + }, + { + "epoch": 2.361007462686567, + "grad_norm": 0.8092207099884563, + "learning_rate": 1.3198329876400795e-06, + "loss": 0.0267, + "step": 5062 + }, + { + "epoch": 2.361940298507463, + "grad_norm": 1.0147049454179322, + "learning_rate": 1.3161608545385756e-06, + "loss": 0.0327, + "step": 5064 + }, + { + "epoch": 2.362873134328358, + "grad_norm": 1.3147640207279299, + "learning_rate": 1.3124930625722304e-06, + "loss": 0.0307, + "step": 5066 + }, + { + "epoch": 2.3638059701492535, + "grad_norm": 1.0315398673305383, + "learning_rate": 1.3088296160632714e-06, + "loss": 0.033, + "step": 5068 + }, + { + "epoch": 2.3647388059701493, + "grad_norm": 0.9598345702077918, + "learning_rate": 1.3051705193287995e-06, + "loss": 0.0363, + "step": 5070 + }, + { + "epoch": 2.3656716417910446, + "grad_norm": 0.909327421024035, + "learning_rate": 1.301515776680794e-06, + "loss": 0.0305, + "step": 5072 + }, + { + "epoch": 2.3666044776119404, + "grad_norm": 1.005616156281911, + "learning_rate": 1.2978653924261037e-06, + "loss": 0.0326, + "step": 5074 + }, + { + "epoch": 2.3675373134328357, + "grad_norm": 1.0697155919210022, + "learning_rate": 1.294219370866438e-06, + "loss": 0.0311, + "step": 5076 + }, + { + "epoch": 2.3684701492537314, + "grad_norm": 0.9354398582066434, + "learning_rate": 1.2905777162983657e-06, + "loss": 0.0287, + "step": 5078 + }, + { + "epoch": 2.3694029850746268, + "grad_norm": 0.973132698739944, + "learning_rate": 1.2869404330133117e-06, + "loss": 0.0326, + "step": 5080 + }, + { + "epoch": 2.3703358208955225, + "grad_norm": 0.897005600047769, + "learning_rate": 1.2833075252975501e-06, + "loss": 0.0328, + "step": 5082 + }, + { + "epoch": 2.371268656716418, + "grad_norm": 0.7750168931572203, + "learning_rate": 1.2796789974321938e-06, + "loss": 0.0256, + "step": 5084 + }, + { + "epoch": 2.3722014925373136, + "grad_norm": 1.0031754188776967, + "learning_rate": 1.276054853693201e-06, + "loss": 0.0329, + "step": 5086 + }, + { + "epoch": 2.373134328358209, + "grad_norm": 0.7937335711902828, + "learning_rate": 1.2724350983513583e-06, + "loss": 0.0261, + "step": 5088 + }, + { + "epoch": 2.3740671641791042, + "grad_norm": 1.0261279258957414, + "learning_rate": 1.268819735672282e-06, + "loss": 0.036, + "step": 5090 + }, + { + "epoch": 2.375, + "grad_norm": 1.168216731934682, + "learning_rate": 1.265208769916414e-06, + "loss": 0.0359, + "step": 5092 + }, + { + "epoch": 2.3759328358208958, + "grad_norm": 0.9558805913389741, + "learning_rate": 1.2616022053390143e-06, + "loss": 0.0291, + "step": 5094 + }, + { + "epoch": 2.376865671641791, + "grad_norm": 0.9325047029076223, + "learning_rate": 1.2580000461901532e-06, + "loss": 0.0296, + "step": 5096 + }, + { + "epoch": 2.3777985074626864, + "grad_norm": 0.9211146961961714, + "learning_rate": 1.254402296714715e-06, + "loss": 0.0294, + "step": 5098 + }, + { + "epoch": 2.378731343283582, + "grad_norm": 0.9409998763279245, + "learning_rate": 1.2508089611523816e-06, + "loss": 0.0328, + "step": 5100 + }, + { + "epoch": 2.3796641791044775, + "grad_norm": 0.9300544977203749, + "learning_rate": 1.247220043737637e-06, + "loss": 0.0264, + "step": 5102 + }, + { + "epoch": 2.3805970149253732, + "grad_norm": 1.0399909193526722, + "learning_rate": 1.2436355486997604e-06, + "loss": 0.0325, + "step": 5104 + }, + { + "epoch": 2.3815298507462686, + "grad_norm": 0.8802218937605405, + "learning_rate": 1.2400554802628155e-06, + "loss": 0.0261, + "step": 5106 + }, + { + "epoch": 2.3824626865671643, + "grad_norm": 1.0890395468692708, + "learning_rate": 1.2364798426456499e-06, + "loss": 0.0338, + "step": 5108 + }, + { + "epoch": 2.3833955223880596, + "grad_norm": 0.9340708937253251, + "learning_rate": 1.2329086400618934e-06, + "loss": 0.0278, + "step": 5110 + }, + { + "epoch": 2.3843283582089554, + "grad_norm": 0.891003892039185, + "learning_rate": 1.229341876719945e-06, + "loss": 0.026, + "step": 5112 + }, + { + "epoch": 2.3852611940298507, + "grad_norm": 1.049112253731958, + "learning_rate": 1.2257795568229759e-06, + "loss": 0.0305, + "step": 5114 + }, + { + "epoch": 2.3861940298507465, + "grad_norm": 0.9150152383861456, + "learning_rate": 1.2222216845689205e-06, + "loss": 0.0251, + "step": 5116 + }, + { + "epoch": 2.387126865671642, + "grad_norm": 0.970598936477048, + "learning_rate": 1.2186682641504694e-06, + "loss": 0.0304, + "step": 5118 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.8440222362537796, + "learning_rate": 1.2151192997550708e-06, + "loss": 0.0305, + "step": 5120 + }, + { + "epoch": 2.388992537313433, + "grad_norm": 1.1241825153824638, + "learning_rate": 1.2115747955649177e-06, + "loss": 0.0321, + "step": 5122 + }, + { + "epoch": 2.389925373134328, + "grad_norm": 0.8960130220118138, + "learning_rate": 1.20803475575695e-06, + "loss": 0.0284, + "step": 5124 + }, + { + "epoch": 2.390858208955224, + "grad_norm": 0.9201909754595783, + "learning_rate": 1.2044991845028482e-06, + "loss": 0.0302, + "step": 5126 + }, + { + "epoch": 2.3917910447761193, + "grad_norm": 1.0146207135534862, + "learning_rate": 1.2009680859690215e-06, + "loss": 0.0341, + "step": 5128 + }, + { + "epoch": 2.392723880597015, + "grad_norm": 1.0357233507057502, + "learning_rate": 1.1974414643166116e-06, + "loss": 0.032, + "step": 5130 + }, + { + "epoch": 2.3936567164179103, + "grad_norm": 1.1342502952525662, + "learning_rate": 1.1939193237014862e-06, + "loss": 0.0289, + "step": 5132 + }, + { + "epoch": 2.394589552238806, + "grad_norm": 0.8186531199171573, + "learning_rate": 1.1904016682742286e-06, + "loss": 0.031, + "step": 5134 + }, + { + "epoch": 2.3955223880597014, + "grad_norm": 1.138168214225384, + "learning_rate": 1.1868885021801392e-06, + "loss": 0.0312, + "step": 5136 + }, + { + "epoch": 2.396455223880597, + "grad_norm": 1.083624852759322, + "learning_rate": 1.1833798295592291e-06, + "loss": 0.0314, + "step": 5138 + }, + { + "epoch": 2.3973880597014925, + "grad_norm": 0.9628914631831336, + "learning_rate": 1.1798756545462114e-06, + "loss": 0.0317, + "step": 5140 + }, + { + "epoch": 2.3983208955223883, + "grad_norm": 0.8970788976866345, + "learning_rate": 1.1763759812704984e-06, + "loss": 0.0296, + "step": 5142 + }, + { + "epoch": 2.3992537313432836, + "grad_norm": 1.035008794468747, + "learning_rate": 1.1728808138562008e-06, + "loss": 0.0307, + "step": 5144 + }, + { + "epoch": 2.4001865671641793, + "grad_norm": 0.8289167809473592, + "learning_rate": 1.1693901564221193e-06, + "loss": 0.0296, + "step": 5146 + }, + { + "epoch": 2.4011194029850746, + "grad_norm": 0.8649727360028945, + "learning_rate": 1.1659040130817361e-06, + "loss": 0.0305, + "step": 5148 + }, + { + "epoch": 2.40205223880597, + "grad_norm": 0.9736790378086797, + "learning_rate": 1.1624223879432183e-06, + "loss": 0.032, + "step": 5150 + }, + { + "epoch": 2.4029850746268657, + "grad_norm": 0.821620381528247, + "learning_rate": 1.1589452851094063e-06, + "loss": 0.0281, + "step": 5152 + }, + { + "epoch": 2.403917910447761, + "grad_norm": 0.7649796672832583, + "learning_rate": 1.1554727086778077e-06, + "loss": 0.0284, + "step": 5154 + }, + { + "epoch": 2.404850746268657, + "grad_norm": 1.0293759177657253, + "learning_rate": 1.1520046627406061e-06, + "loss": 0.0307, + "step": 5156 + }, + { + "epoch": 2.405783582089552, + "grad_norm": 0.9986647214916511, + "learning_rate": 1.1485411513846379e-06, + "loss": 0.0323, + "step": 5158 + }, + { + "epoch": 2.406716417910448, + "grad_norm": 1.0208320988485347, + "learning_rate": 1.1450821786913957e-06, + "loss": 0.0288, + "step": 5160 + }, + { + "epoch": 2.407649253731343, + "grad_norm": 0.8048689222465932, + "learning_rate": 1.1416277487370293e-06, + "loss": 0.0234, + "step": 5162 + }, + { + "epoch": 2.408582089552239, + "grad_norm": 0.839697880086066, + "learning_rate": 1.1381778655923293e-06, + "loss": 0.0263, + "step": 5164 + }, + { + "epoch": 2.4095149253731343, + "grad_norm": 1.1554814591563138, + "learning_rate": 1.1347325333227315e-06, + "loss": 0.0353, + "step": 5166 + }, + { + "epoch": 2.41044776119403, + "grad_norm": 1.1131093653428739, + "learning_rate": 1.1312917559883101e-06, + "loss": 0.0334, + "step": 5168 + }, + { + "epoch": 2.4113805970149254, + "grad_norm": 1.0319878579150472, + "learning_rate": 1.1278555376437666e-06, + "loss": 0.0309, + "step": 5170 + }, + { + "epoch": 2.4123134328358207, + "grad_norm": 0.9848055830331043, + "learning_rate": 1.1244238823384363e-06, + "loss": 0.0315, + "step": 5172 + }, + { + "epoch": 2.4132462686567164, + "grad_norm": 1.0071179852409573, + "learning_rate": 1.1209967941162726e-06, + "loss": 0.0329, + "step": 5174 + }, + { + "epoch": 2.4141791044776117, + "grad_norm": 0.8809510188750823, + "learning_rate": 1.117574277015847e-06, + "loss": 0.0309, + "step": 5176 + }, + { + "epoch": 2.4151119402985075, + "grad_norm": 0.971103027505273, + "learning_rate": 1.114156335070347e-06, + "loss": 0.0331, + "step": 5178 + }, + { + "epoch": 2.416044776119403, + "grad_norm": 1.0973511216365415, + "learning_rate": 1.1107429723075685e-06, + "loss": 0.0306, + "step": 5180 + }, + { + "epoch": 2.4169776119402986, + "grad_norm": 1.0082100658114197, + "learning_rate": 1.1073341927499082e-06, + "loss": 0.0335, + "step": 5182 + }, + { + "epoch": 2.417910447761194, + "grad_norm": 0.9427629531340962, + "learning_rate": 1.1039300004143655e-06, + "loss": 0.0346, + "step": 5184 + }, + { + "epoch": 2.4188432835820897, + "grad_norm": 0.8141044026232971, + "learning_rate": 1.1005303993125299e-06, + "loss": 0.0252, + "step": 5186 + }, + { + "epoch": 2.419776119402985, + "grad_norm": 0.947017846185009, + "learning_rate": 1.097135393450584e-06, + "loss": 0.033, + "step": 5188 + }, + { + "epoch": 2.4207089552238807, + "grad_norm": 1.3696165446597555, + "learning_rate": 1.093744986829296e-06, + "loss": 0.0347, + "step": 5190 + }, + { + "epoch": 2.421641791044776, + "grad_norm": 0.9307203687285788, + "learning_rate": 1.0903591834440096e-06, + "loss": 0.0296, + "step": 5192 + }, + { + "epoch": 2.422574626865672, + "grad_norm": 0.9779965790293061, + "learning_rate": 1.0869779872846465e-06, + "loss": 0.0294, + "step": 5194 + }, + { + "epoch": 2.423507462686567, + "grad_norm": 0.8606747677304812, + "learning_rate": 1.0836014023357e-06, + "loss": 0.0279, + "step": 5196 + }, + { + "epoch": 2.424440298507463, + "grad_norm": 0.813404834932556, + "learning_rate": 1.0802294325762303e-06, + "loss": 0.0281, + "step": 5198 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.9607300866710246, + "learning_rate": 1.0768620819798543e-06, + "loss": 0.0295, + "step": 5200 + }, + { + "epoch": 2.4263059701492535, + "grad_norm": 0.8114237433585384, + "learning_rate": 1.0734993545147514e-06, + "loss": 0.0275, + "step": 5202 + }, + { + "epoch": 2.4272388059701493, + "grad_norm": 0.9803779968633127, + "learning_rate": 1.0701412541436484e-06, + "loss": 0.0337, + "step": 5204 + }, + { + "epoch": 2.4281716417910446, + "grad_norm": 0.8865864969325976, + "learning_rate": 1.066787784823819e-06, + "loss": 0.0342, + "step": 5206 + }, + { + "epoch": 2.4291044776119404, + "grad_norm": 1.2507290047495112, + "learning_rate": 1.063438950507087e-06, + "loss": 0.0315, + "step": 5208 + }, + { + "epoch": 2.4300373134328357, + "grad_norm": 0.8700821742585412, + "learning_rate": 1.0600947551398055e-06, + "loss": 0.0322, + "step": 5210 + }, + { + "epoch": 2.4309701492537314, + "grad_norm": 1.0494241299606317, + "learning_rate": 1.0567552026628635e-06, + "loss": 0.0328, + "step": 5212 + }, + { + "epoch": 2.4319029850746268, + "grad_norm": 1.0655792543369247, + "learning_rate": 1.0534202970116825e-06, + "loss": 0.0296, + "step": 5214 + }, + { + "epoch": 2.4328358208955225, + "grad_norm": 0.8772367123572078, + "learning_rate": 1.0500900421162013e-06, + "loss": 0.0272, + "step": 5216 + }, + { + "epoch": 2.433768656716418, + "grad_norm": 0.9924973823945795, + "learning_rate": 1.0467644419008843e-06, + "loss": 0.0288, + "step": 5218 + }, + { + "epoch": 2.4347014925373136, + "grad_norm": 0.901073820435966, + "learning_rate": 1.0434435002847088e-06, + "loss": 0.0312, + "step": 5220 + }, + { + "epoch": 2.435634328358209, + "grad_norm": 0.8865583609137004, + "learning_rate": 1.0401272211811598e-06, + "loss": 0.0316, + "step": 5222 + }, + { + "epoch": 2.4365671641791042, + "grad_norm": 0.9166280232072145, + "learning_rate": 1.0368156084982318e-06, + "loss": 0.0316, + "step": 5224 + }, + { + "epoch": 2.4375, + "grad_norm": 0.8817845348476032, + "learning_rate": 1.0335086661384175e-06, + "loss": 0.0295, + "step": 5226 + }, + { + "epoch": 2.4384328358208958, + "grad_norm": 1.0149708192946463, + "learning_rate": 1.0302063979987053e-06, + "loss": 0.03, + "step": 5228 + }, + { + "epoch": 2.439365671641791, + "grad_norm": 0.9425035280111684, + "learning_rate": 1.0269088079705775e-06, + "loss": 0.0308, + "step": 5230 + }, + { + "epoch": 2.4402985074626864, + "grad_norm": 1.01515710324735, + "learning_rate": 1.0236158999400054e-06, + "loss": 0.0304, + "step": 5232 + }, + { + "epoch": 2.441231343283582, + "grad_norm": 0.9825655743348506, + "learning_rate": 1.0203276777874365e-06, + "loss": 0.0347, + "step": 5234 + }, + { + "epoch": 2.4421641791044775, + "grad_norm": 0.9109880566827612, + "learning_rate": 1.0170441453878038e-06, + "loss": 0.0287, + "step": 5236 + }, + { + "epoch": 2.4430970149253732, + "grad_norm": 0.950996314380109, + "learning_rate": 1.0137653066105073e-06, + "loss": 0.0305, + "step": 5238 + }, + { + "epoch": 2.4440298507462686, + "grad_norm": 0.9666491371474853, + "learning_rate": 1.0104911653194205e-06, + "loss": 0.0303, + "step": 5240 + }, + { + "epoch": 2.4449626865671643, + "grad_norm": 0.918975788396854, + "learning_rate": 1.0072217253728806e-06, + "loss": 0.028, + "step": 5242 + }, + { + "epoch": 2.4458955223880596, + "grad_norm": 1.0443591962307095, + "learning_rate": 1.0039569906236819e-06, + "loss": 0.03, + "step": 5244 + }, + { + "epoch": 2.4468283582089554, + "grad_norm": 1.0129350368733916, + "learning_rate": 1.0006969649190746e-06, + "loss": 0.0308, + "step": 5246 + }, + { + "epoch": 2.4477611940298507, + "grad_norm": 1.024034540470462, + "learning_rate": 9.974416521007635e-07, + "loss": 0.0346, + "step": 5248 + }, + { + "epoch": 2.4486940298507465, + "grad_norm": 1.04027731110459, + "learning_rate": 9.94191056004894e-07, + "loss": 0.0325, + "step": 5250 + }, + { + "epoch": 2.449626865671642, + "grad_norm": 0.9032430034695119, + "learning_rate": 9.909451804620579e-07, + "loss": 0.0308, + "step": 5252 + }, + { + "epoch": 2.450559701492537, + "grad_norm": 0.918163109867578, + "learning_rate": 9.877040292972823e-07, + "loss": 0.0295, + "step": 5254 + }, + { + "epoch": 2.451492537313433, + "grad_norm": 0.9116229447610709, + "learning_rate": 9.844676063300268e-07, + "loss": 0.0284, + "step": 5256 + }, + { + "epoch": 2.452425373134328, + "grad_norm": 0.8915138996779104, + "learning_rate": 9.81235915374178e-07, + "loss": 0.0318, + "step": 5258 + }, + { + "epoch": 2.453358208955224, + "grad_norm": 1.0826412085183617, + "learning_rate": 9.780089602380477e-07, + "loss": 0.0305, + "step": 5260 + }, + { + "epoch": 2.4542910447761193, + "grad_norm": 0.922620217878607, + "learning_rate": 9.747867447243692e-07, + "loss": 0.0269, + "step": 5262 + }, + { + "epoch": 2.455223880597015, + "grad_norm": 0.9087907682602725, + "learning_rate": 9.715692726302845e-07, + "loss": 0.0346, + "step": 5264 + }, + { + "epoch": 2.4561567164179103, + "grad_norm": 0.9586238741385978, + "learning_rate": 9.683565477473517e-07, + "loss": 0.0298, + "step": 5266 + }, + { + "epoch": 2.457089552238806, + "grad_norm": 1.0223064558524098, + "learning_rate": 9.651485738615308e-07, + "loss": 0.0318, + "step": 5268 + }, + { + "epoch": 2.4580223880597014, + "grad_norm": 0.9752253770560073, + "learning_rate": 9.61945354753185e-07, + "loss": 0.0388, + "step": 5270 + }, + { + "epoch": 2.458955223880597, + "grad_norm": 1.0970714216579576, + "learning_rate": 9.58746894197075e-07, + "loss": 0.033, + "step": 5272 + }, + { + "epoch": 2.4598880597014925, + "grad_norm": 0.8692948176360848, + "learning_rate": 9.555531959623505e-07, + "loss": 0.0312, + "step": 5274 + }, + { + "epoch": 2.4608208955223883, + "grad_norm": 1.2234499178707334, + "learning_rate": 9.523642638125541e-07, + "loss": 0.0307, + "step": 5276 + }, + { + "epoch": 2.4617537313432836, + "grad_norm": 1.032870140912907, + "learning_rate": 9.491801015056079e-07, + "loss": 0.028, + "step": 5278 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 1.0229643703041138, + "learning_rate": 9.460007127938131e-07, + "loss": 0.0343, + "step": 5280 + }, + { + "epoch": 2.4636194029850746, + "grad_norm": 0.9697729207859996, + "learning_rate": 9.428261014238482e-07, + "loss": 0.0295, + "step": 5282 + }, + { + "epoch": 2.46455223880597, + "grad_norm": 0.8585679577937615, + "learning_rate": 9.396562711367618e-07, + "loss": 0.0287, + "step": 5284 + }, + { + "epoch": 2.4654850746268657, + "grad_norm": 0.8600373390020909, + "learning_rate": 9.364912256679648e-07, + "loss": 0.0287, + "step": 5286 + }, + { + "epoch": 2.466417910447761, + "grad_norm": 0.7654761908933982, + "learning_rate": 9.333309687472342e-07, + "loss": 0.0238, + "step": 5288 + }, + { + "epoch": 2.467350746268657, + "grad_norm": 1.0261690678902156, + "learning_rate": 9.301755040987009e-07, + "loss": 0.0316, + "step": 5290 + }, + { + "epoch": 2.468283582089552, + "grad_norm": 1.0608751660597768, + "learning_rate": 9.270248354408467e-07, + "loss": 0.0347, + "step": 5292 + }, + { + "epoch": 2.469216417910448, + "grad_norm": 1.021052839418142, + "learning_rate": 9.238789664865095e-07, + "loss": 0.0304, + "step": 5294 + }, + { + "epoch": 2.470149253731343, + "grad_norm": 1.172851673632966, + "learning_rate": 9.207379009428624e-07, + "loss": 0.0351, + "step": 5296 + }, + { + "epoch": 2.471082089552239, + "grad_norm": 1.0208080543389906, + "learning_rate": 9.17601642511422e-07, + "loss": 0.0348, + "step": 5298 + }, + { + "epoch": 2.4720149253731343, + "grad_norm": 0.9561375880016924, + "learning_rate": 9.144701948880407e-07, + "loss": 0.0264, + "step": 5300 + }, + { + "epoch": 2.47294776119403, + "grad_norm": 0.8711993131825574, + "learning_rate": 9.113435617628985e-07, + "loss": 0.0297, + "step": 5302 + }, + { + "epoch": 2.4738805970149254, + "grad_norm": 1.0468283824085718, + "learning_rate": 9.082217468205057e-07, + "loss": 0.032, + "step": 5304 + }, + { + "epoch": 2.4748134328358207, + "grad_norm": 0.8070596299062749, + "learning_rate": 9.05104753739694e-07, + "loss": 0.0299, + "step": 5306 + }, + { + "epoch": 2.4757462686567164, + "grad_norm": 0.8130945780791511, + "learning_rate": 9.019925861936101e-07, + "loss": 0.0241, + "step": 5308 + }, + { + "epoch": 2.4766791044776117, + "grad_norm": 1.0637012776335473, + "learning_rate": 8.988852478497156e-07, + "loss": 0.031, + "step": 5310 + }, + { + "epoch": 2.4776119402985075, + "grad_norm": 0.9987356483323814, + "learning_rate": 8.957827423697823e-07, + "loss": 0.0391, + "step": 5312 + }, + { + "epoch": 2.478544776119403, + "grad_norm": 0.8320233706324502, + "learning_rate": 8.926850734098874e-07, + "loss": 0.0266, + "step": 5314 + }, + { + "epoch": 2.4794776119402986, + "grad_norm": 0.9298389858750122, + "learning_rate": 8.895922446204053e-07, + "loss": 0.0332, + "step": 5316 + }, + { + "epoch": 2.480410447761194, + "grad_norm": 0.950930613061798, + "learning_rate": 8.865042596460111e-07, + "loss": 0.0321, + "step": 5318 + }, + { + "epoch": 2.4813432835820897, + "grad_norm": 0.9617986783790913, + "learning_rate": 8.834211221256661e-07, + "loss": 0.0339, + "step": 5320 + }, + { + "epoch": 2.482276119402985, + "grad_norm": 1.2039081267856504, + "learning_rate": 8.803428356926242e-07, + "loss": 0.0312, + "step": 5322 + }, + { + "epoch": 2.4832089552238807, + "grad_norm": 0.9853941051886902, + "learning_rate": 8.772694039744228e-07, + "loss": 0.0281, + "step": 5324 + }, + { + "epoch": 2.484141791044776, + "grad_norm": 0.8819923200252056, + "learning_rate": 8.742008305928728e-07, + "loss": 0.0265, + "step": 5326 + }, + { + "epoch": 2.485074626865672, + "grad_norm": 0.9926797819846446, + "learning_rate": 8.711371191640677e-07, + "loss": 0.0273, + "step": 5328 + }, + { + "epoch": 2.486007462686567, + "grad_norm": 1.024681171097431, + "learning_rate": 8.680782732983645e-07, + "loss": 0.0347, + "step": 5330 + }, + { + "epoch": 2.486940298507463, + "grad_norm": 1.0012248853144015, + "learning_rate": 8.650242966003897e-07, + "loss": 0.0349, + "step": 5332 + }, + { + "epoch": 2.487873134328358, + "grad_norm": 1.0895809997598143, + "learning_rate": 8.619751926690317e-07, + "loss": 0.0314, + "step": 5334 + }, + { + "epoch": 2.4888059701492535, + "grad_norm": 0.9253355336146758, + "learning_rate": 8.589309650974387e-07, + "loss": 0.0309, + "step": 5336 + }, + { + "epoch": 2.4897388059701493, + "grad_norm": 0.8271357690929486, + "learning_rate": 8.558916174730076e-07, + "loss": 0.0288, + "step": 5338 + }, + { + "epoch": 2.4906716417910446, + "grad_norm": 0.9870974780023992, + "learning_rate": 8.528571533773894e-07, + "loss": 0.0284, + "step": 5340 + }, + { + "epoch": 2.4916044776119404, + "grad_norm": 1.0610406782504171, + "learning_rate": 8.498275763864782e-07, + "loss": 0.0293, + "step": 5342 + }, + { + "epoch": 2.4925373134328357, + "grad_norm": 0.7604632823561313, + "learning_rate": 8.46802890070405e-07, + "loss": 0.0228, + "step": 5344 + }, + { + "epoch": 2.4934701492537314, + "grad_norm": 1.1049817644836104, + "learning_rate": 8.43783097993548e-07, + "loss": 0.0299, + "step": 5346 + }, + { + "epoch": 2.4944029850746268, + "grad_norm": 1.0011593857407741, + "learning_rate": 8.407682037145076e-07, + "loss": 0.0242, + "step": 5348 + }, + { + "epoch": 2.4953358208955225, + "grad_norm": 1.014442412297917, + "learning_rate": 8.37758210786116e-07, + "loss": 0.0344, + "step": 5350 + }, + { + "epoch": 2.496268656716418, + "grad_norm": 0.8394720810240413, + "learning_rate": 8.347531227554323e-07, + "loss": 0.0271, + "step": 5352 + }, + { + "epoch": 2.4972014925373136, + "grad_norm": 0.9904793674093569, + "learning_rate": 8.3175294316373e-07, + "loss": 0.0367, + "step": 5354 + }, + { + "epoch": 2.498134328358209, + "grad_norm": 1.0671548012590597, + "learning_rate": 8.287576755465032e-07, + "loss": 0.0291, + "step": 5356 + }, + { + "epoch": 2.4990671641791042, + "grad_norm": 0.9743440646929675, + "learning_rate": 8.257673234334568e-07, + "loss": 0.0363, + "step": 5358 + }, + { + "epoch": 2.5, + "grad_norm": 0.929426697584014, + "learning_rate": 8.227818903485013e-07, + "loss": 0.029, + "step": 5360 + }, + { + "epoch": 2.5009328358208958, + "grad_norm": 0.9015513886617461, + "learning_rate": 8.198013798097498e-07, + "loss": 0.0306, + "step": 5362 + }, + { + "epoch": 2.501865671641791, + "grad_norm": 0.7719495924948875, + "learning_rate": 8.168257953295178e-07, + "loss": 0.0265, + "step": 5364 + }, + { + "epoch": 2.5027985074626864, + "grad_norm": 0.9986872200315227, + "learning_rate": 8.138551404143147e-07, + "loss": 0.0294, + "step": 5366 + }, + { + "epoch": 2.503731343283582, + "grad_norm": 0.9005280842153865, + "learning_rate": 8.108894185648381e-07, + "loss": 0.0252, + "step": 5368 + }, + { + "epoch": 2.5046641791044775, + "grad_norm": 0.9499952978506555, + "learning_rate": 8.079286332759762e-07, + "loss": 0.0296, + "step": 5370 + }, + { + "epoch": 2.5055970149253732, + "grad_norm": 0.9765392005856854, + "learning_rate": 8.049727880367969e-07, + "loss": 0.0286, + "step": 5372 + }, + { + "epoch": 2.5065298507462686, + "grad_norm": 0.9917205237607287, + "learning_rate": 8.02021886330549e-07, + "loss": 0.0316, + "step": 5374 + }, + { + "epoch": 2.5074626865671643, + "grad_norm": 1.012228541643098, + "learning_rate": 7.99075931634653e-07, + "loss": 0.025, + "step": 5376 + }, + { + "epoch": 2.5083955223880596, + "grad_norm": 1.079233208839584, + "learning_rate": 7.961349274207014e-07, + "loss": 0.0313, + "step": 5378 + }, + { + "epoch": 2.5093283582089554, + "grad_norm": 0.8942220063497852, + "learning_rate": 7.931988771544547e-07, + "loss": 0.0287, + "step": 5380 + }, + { + "epoch": 2.5102611940298507, + "grad_norm": 0.8068043510615183, + "learning_rate": 7.902677842958318e-07, + "loss": 0.024, + "step": 5382 + }, + { + "epoch": 2.5111940298507465, + "grad_norm": 0.9965812102549537, + "learning_rate": 7.873416522989108e-07, + "loss": 0.0284, + "step": 5384 + }, + { + "epoch": 2.512126865671642, + "grad_norm": 0.841081254464499, + "learning_rate": 7.844204846119247e-07, + "loss": 0.0294, + "step": 5386 + }, + { + "epoch": 2.513059701492537, + "grad_norm": 0.9501727571983167, + "learning_rate": 7.81504284677258e-07, + "loss": 0.0255, + "step": 5388 + }, + { + "epoch": 2.513992537313433, + "grad_norm": 1.0016727854947534, + "learning_rate": 7.785930559314364e-07, + "loss": 0.0289, + "step": 5390 + }, + { + "epoch": 2.5149253731343286, + "grad_norm": 0.8721408729745113, + "learning_rate": 7.756868018051323e-07, + "loss": 0.0286, + "step": 5392 + }, + { + "epoch": 2.515858208955224, + "grad_norm": 1.0936890407002346, + "learning_rate": 7.727855257231537e-07, + "loss": 0.0303, + "step": 5394 + }, + { + "epoch": 2.5167910447761193, + "grad_norm": 1.014948867589655, + "learning_rate": 7.698892311044387e-07, + "loss": 0.0293, + "step": 5396 + }, + { + "epoch": 2.517723880597015, + "grad_norm": 0.9958364273519411, + "learning_rate": 7.669979213620643e-07, + "loss": 0.0299, + "step": 5398 + }, + { + "epoch": 2.5186567164179103, + "grad_norm": 1.1617242124661515, + "learning_rate": 7.641115999032251e-07, + "loss": 0.032, + "step": 5400 + }, + { + "epoch": 2.519589552238806, + "grad_norm": 0.957098893021318, + "learning_rate": 7.61230270129239e-07, + "loss": 0.0266, + "step": 5402 + }, + { + "epoch": 2.5205223880597014, + "grad_norm": 0.9879727516732639, + "learning_rate": 7.583539354355445e-07, + "loss": 0.0297, + "step": 5404 + }, + { + "epoch": 2.521455223880597, + "grad_norm": 0.7791493964795234, + "learning_rate": 7.554825992116898e-07, + "loss": 0.0278, + "step": 5406 + }, + { + "epoch": 2.5223880597014925, + "grad_norm": 0.9508455925438611, + "learning_rate": 7.526162648413354e-07, + "loss": 0.0311, + "step": 5408 + }, + { + "epoch": 2.523320895522388, + "grad_norm": 0.9375269950337454, + "learning_rate": 7.497549357022488e-07, + "loss": 0.032, + "step": 5410 + }, + { + "epoch": 2.5242537313432836, + "grad_norm": 0.8181613386415205, + "learning_rate": 7.468986151662955e-07, + "loss": 0.0248, + "step": 5412 + }, + { + "epoch": 2.5251865671641793, + "grad_norm": 0.8449332338645831, + "learning_rate": 7.440473065994391e-07, + "loss": 0.0249, + "step": 5414 + }, + { + "epoch": 2.5261194029850746, + "grad_norm": 0.8932020081455512, + "learning_rate": 7.412010133617415e-07, + "loss": 0.027, + "step": 5416 + }, + { + "epoch": 2.52705223880597, + "grad_norm": 1.2082616553576144, + "learning_rate": 7.383597388073482e-07, + "loss": 0.0321, + "step": 5418 + }, + { + "epoch": 2.5279850746268657, + "grad_norm": 0.9138818756713926, + "learning_rate": 7.355234862844945e-07, + "loss": 0.0311, + "step": 5420 + }, + { + "epoch": 2.528917910447761, + "grad_norm": 1.0202618923997713, + "learning_rate": 7.32692259135499e-07, + "loss": 0.0362, + "step": 5422 + }, + { + "epoch": 2.529850746268657, + "grad_norm": 1.0665120247254412, + "learning_rate": 7.298660606967523e-07, + "loss": 0.0315, + "step": 5424 + }, + { + "epoch": 2.530783582089552, + "grad_norm": 1.1352439636722527, + "learning_rate": 7.270448942987263e-07, + "loss": 0.0337, + "step": 5426 + }, + { + "epoch": 2.531716417910448, + "grad_norm": 0.865442600561773, + "learning_rate": 7.242287632659556e-07, + "loss": 0.026, + "step": 5428 + }, + { + "epoch": 2.532649253731343, + "grad_norm": 0.9410527537956525, + "learning_rate": 7.214176709170484e-07, + "loss": 0.0284, + "step": 5430 + }, + { + "epoch": 2.533582089552239, + "grad_norm": 0.8269563301903696, + "learning_rate": 7.186116205646687e-07, + "loss": 0.0248, + "step": 5432 + }, + { + "epoch": 2.5345149253731343, + "grad_norm": 1.374109206927514, + "learning_rate": 7.158106155155437e-07, + "loss": 0.0284, + "step": 5434 + }, + { + "epoch": 2.53544776119403, + "grad_norm": 0.9714396595595592, + "learning_rate": 7.130146590704512e-07, + "loss": 0.033, + "step": 5436 + }, + { + "epoch": 2.5363805970149254, + "grad_norm": 1.076768884681306, + "learning_rate": 7.10223754524223e-07, + "loss": 0.0331, + "step": 5438 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.925026652018418, + "learning_rate": 7.074379051657366e-07, + "loss": 0.0291, + "step": 5440 + }, + { + "epoch": 2.5382462686567164, + "grad_norm": 0.7852779727368535, + "learning_rate": 7.046571142779096e-07, + "loss": 0.0257, + "step": 5442 + }, + { + "epoch": 2.539179104477612, + "grad_norm": 1.054140753661845, + "learning_rate": 7.018813851377032e-07, + "loss": 0.0307, + "step": 5444 + }, + { + "epoch": 2.5401119402985075, + "grad_norm": 0.855612003871619, + "learning_rate": 6.991107210161102e-07, + "loss": 0.0267, + "step": 5446 + }, + { + "epoch": 2.541044776119403, + "grad_norm": 0.9754413111180732, + "learning_rate": 6.96345125178155e-07, + "loss": 0.0282, + "step": 5448 + }, + { + "epoch": 2.5419776119402986, + "grad_norm": 0.9831546455408445, + "learning_rate": 6.935846008828906e-07, + "loss": 0.033, + "step": 5450 + }, + { + "epoch": 2.542910447761194, + "grad_norm": 0.9872681523455986, + "learning_rate": 6.908291513833948e-07, + "loss": 0.0271, + "step": 5452 + }, + { + "epoch": 2.5438432835820897, + "grad_norm": 1.0047813949957318, + "learning_rate": 6.880787799267608e-07, + "loss": 0.0293, + "step": 5454 + }, + { + "epoch": 2.544776119402985, + "grad_norm": 1.1256701558985338, + "learning_rate": 6.853334897541031e-07, + "loss": 0.0302, + "step": 5456 + }, + { + "epoch": 2.5457089552238807, + "grad_norm": 0.9508103757308844, + "learning_rate": 6.825932841005434e-07, + "loss": 0.0265, + "step": 5458 + }, + { + "epoch": 2.546641791044776, + "grad_norm": 0.9345261118581804, + "learning_rate": 6.79858166195212e-07, + "loss": 0.0352, + "step": 5460 + }, + { + "epoch": 2.5475746268656714, + "grad_norm": 0.9236243033974604, + "learning_rate": 6.771281392612505e-07, + "loss": 0.0231, + "step": 5462 + }, + { + "epoch": 2.548507462686567, + "grad_norm": 0.8445919648298371, + "learning_rate": 6.744032065157929e-07, + "loss": 0.0267, + "step": 5464 + }, + { + "epoch": 2.549440298507463, + "grad_norm": 0.7960534332376328, + "learning_rate": 6.716833711699727e-07, + "loss": 0.0238, + "step": 5466 + }, + { + "epoch": 2.550373134328358, + "grad_norm": 1.0426106071680399, + "learning_rate": 6.689686364289194e-07, + "loss": 0.0286, + "step": 5468 + }, + { + "epoch": 2.5513059701492535, + "grad_norm": 1.012093584788351, + "learning_rate": 6.662590054917467e-07, + "loss": 0.0324, + "step": 5470 + }, + { + "epoch": 2.5522388059701493, + "grad_norm": 0.8610382191725122, + "learning_rate": 6.635544815515576e-07, + "loss": 0.0254, + "step": 5472 + }, + { + "epoch": 2.5531716417910446, + "grad_norm": 0.9000569602375049, + "learning_rate": 6.608550677954379e-07, + "loss": 0.0335, + "step": 5474 + }, + { + "epoch": 2.5541044776119404, + "grad_norm": 0.9811954480600797, + "learning_rate": 6.581607674044466e-07, + "loss": 0.0237, + "step": 5476 + }, + { + "epoch": 2.5550373134328357, + "grad_norm": 0.8697194881029953, + "learning_rate": 6.554715835536224e-07, + "loss": 0.0292, + "step": 5478 + }, + { + "epoch": 2.5559701492537314, + "grad_norm": 0.9550648238319489, + "learning_rate": 6.527875194119687e-07, + "loss": 0.0296, + "step": 5480 + }, + { + "epoch": 2.5569029850746268, + "grad_norm": 1.0465671636648464, + "learning_rate": 6.501085781424621e-07, + "loss": 0.0303, + "step": 5482 + }, + { + "epoch": 2.5578358208955225, + "grad_norm": 1.0142530036315984, + "learning_rate": 6.474347629020367e-07, + "loss": 0.0337, + "step": 5484 + }, + { + "epoch": 2.558768656716418, + "grad_norm": 1.0136757509818954, + "learning_rate": 6.447660768415897e-07, + "loss": 0.0348, + "step": 5486 + }, + { + "epoch": 2.5597014925373136, + "grad_norm": 0.9556311189905142, + "learning_rate": 6.421025231059713e-07, + "loss": 0.0315, + "step": 5488 + }, + { + "epoch": 2.560634328358209, + "grad_norm": 1.147131138974972, + "learning_rate": 6.394441048339867e-07, + "loss": 0.0344, + "step": 5490 + }, + { + "epoch": 2.5615671641791042, + "grad_norm": 0.9801129122588047, + "learning_rate": 6.367908251583854e-07, + "loss": 0.032, + "step": 5492 + }, + { + "epoch": 2.5625, + "grad_norm": 1.0133405335811954, + "learning_rate": 6.341426872058648e-07, + "loss": 0.0305, + "step": 5494 + }, + { + "epoch": 2.5634328358208958, + "grad_norm": 0.8627633534135905, + "learning_rate": 6.314996940970624e-07, + "loss": 0.026, + "step": 5496 + }, + { + "epoch": 2.564365671641791, + "grad_norm": 1.0032084741951597, + "learning_rate": 6.28861848946552e-07, + "loss": 0.0281, + "step": 5498 + }, + { + "epoch": 2.5652985074626864, + "grad_norm": 0.9378214505319038, + "learning_rate": 6.262291548628397e-07, + "loss": 0.0324, + "step": 5500 + }, + { + "epoch": 2.5652985074626864, + "eval_loss": 0.18188245594501495, + "eval_runtime": 320.8699, + "eval_samples_per_second": 47.515, + "eval_steps_per_second": 5.94, + "step": 5500 + }, + { + "epoch": 2.566231343283582, + "grad_norm": 0.8900669468176743, + "learning_rate": 6.236016149483647e-07, + "loss": 0.0273, + "step": 5502 + }, + { + "epoch": 2.5671641791044775, + "grad_norm": 0.9006635215935282, + "learning_rate": 6.209792322994912e-07, + "loss": 0.0307, + "step": 5504 + }, + { + "epoch": 2.5680970149253732, + "grad_norm": 0.7531383759987479, + "learning_rate": 6.183620100065035e-07, + "loss": 0.0291, + "step": 5506 + }, + { + "epoch": 2.5690298507462686, + "grad_norm": 0.9617374260328377, + "learning_rate": 6.157499511536091e-07, + "loss": 0.0279, + "step": 5508 + }, + { + "epoch": 2.5699626865671643, + "grad_norm": 0.9928579984722113, + "learning_rate": 6.131430588189275e-07, + "loss": 0.0296, + "step": 5510 + }, + { + "epoch": 2.5708955223880596, + "grad_norm": 0.8946572444097028, + "learning_rate": 6.105413360744883e-07, + "loss": 0.023, + "step": 5512 + }, + { + "epoch": 2.5718283582089554, + "grad_norm": 1.0747235948124607, + "learning_rate": 6.079447859862353e-07, + "loss": 0.0307, + "step": 5514 + }, + { + "epoch": 2.5727611940298507, + "grad_norm": 0.8143405644267114, + "learning_rate": 6.05353411614012e-07, + "loss": 0.0266, + "step": 5516 + }, + { + "epoch": 2.5736940298507465, + "grad_norm": 0.9067389240466966, + "learning_rate": 6.027672160115622e-07, + "loss": 0.0255, + "step": 5518 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.8681536868285394, + "learning_rate": 6.001862022265298e-07, + "loss": 0.0328, + "step": 5520 + }, + { + "epoch": 2.575559701492537, + "grad_norm": 0.9659977467940498, + "learning_rate": 5.976103733004501e-07, + "loss": 0.027, + "step": 5522 + }, + { + "epoch": 2.576492537313433, + "grad_norm": 1.0675965024576906, + "learning_rate": 5.95039732268749e-07, + "loss": 0.0324, + "step": 5524 + }, + { + "epoch": 2.5774253731343286, + "grad_norm": 0.9736658548517676, + "learning_rate": 5.924742821607404e-07, + "loss": 0.0283, + "step": 5526 + }, + { + "epoch": 2.578358208955224, + "grad_norm": 0.9451643634696305, + "learning_rate": 5.899140259996183e-07, + "loss": 0.025, + "step": 5528 + }, + { + "epoch": 2.5792910447761193, + "grad_norm": 1.0005795398514292, + "learning_rate": 5.873589668024593e-07, + "loss": 0.0268, + "step": 5530 + }, + { + "epoch": 2.580223880597015, + "grad_norm": 0.9091404318354299, + "learning_rate": 5.848091075802121e-07, + "loss": 0.0264, + "step": 5532 + }, + { + "epoch": 2.5811567164179103, + "grad_norm": 1.0100657012475414, + "learning_rate": 5.82264451337699e-07, + "loss": 0.0313, + "step": 5534 + }, + { + "epoch": 2.582089552238806, + "grad_norm": 1.1476126000293594, + "learning_rate": 5.797250010736122e-07, + "loss": 0.0335, + "step": 5536 + }, + { + "epoch": 2.5830223880597014, + "grad_norm": 0.9087013565256767, + "learning_rate": 5.771907597805098e-07, + "loss": 0.0271, + "step": 5538 + }, + { + "epoch": 2.583955223880597, + "grad_norm": 0.9520747941296617, + "learning_rate": 5.746617304448071e-07, + "loss": 0.0287, + "step": 5540 + }, + { + "epoch": 2.5848880597014925, + "grad_norm": 0.9385766823424762, + "learning_rate": 5.721379160467827e-07, + "loss": 0.0298, + "step": 5542 + }, + { + "epoch": 2.585820895522388, + "grad_norm": 0.8307023237191612, + "learning_rate": 5.696193195605654e-07, + "loss": 0.0259, + "step": 5544 + }, + { + "epoch": 2.5867537313432836, + "grad_norm": 0.9211507876184712, + "learning_rate": 5.671059439541383e-07, + "loss": 0.0305, + "step": 5546 + }, + { + "epoch": 2.5876865671641793, + "grad_norm": 0.846567556390467, + "learning_rate": 5.645977921893308e-07, + "loss": 0.0273, + "step": 5548 + }, + { + "epoch": 2.5886194029850746, + "grad_norm": 0.994620831670817, + "learning_rate": 5.620948672218169e-07, + "loss": 0.0301, + "step": 5550 + }, + { + "epoch": 2.58955223880597, + "grad_norm": 0.9796801084651251, + "learning_rate": 5.59597172001109e-07, + "loss": 0.0272, + "step": 5552 + }, + { + "epoch": 2.5904850746268657, + "grad_norm": 0.8391949064190108, + "learning_rate": 5.57104709470559e-07, + "loss": 0.0256, + "step": 5554 + }, + { + "epoch": 2.591417910447761, + "grad_norm": 0.8616962242798509, + "learning_rate": 5.546174825673528e-07, + "loss": 0.0253, + "step": 5556 + }, + { + "epoch": 2.592350746268657, + "grad_norm": 1.0133282243772208, + "learning_rate": 5.521354942225043e-07, + "loss": 0.0295, + "step": 5558 + }, + { + "epoch": 2.593283582089552, + "grad_norm": 0.9944881514774758, + "learning_rate": 5.496587473608572e-07, + "loss": 0.0315, + "step": 5560 + }, + { + "epoch": 2.594216417910448, + "grad_norm": 0.9006894177917104, + "learning_rate": 5.471872449010752e-07, + "loss": 0.0287, + "step": 5562 + }, + { + "epoch": 2.595149253731343, + "grad_norm": 0.9840676939981822, + "learning_rate": 5.44720989755641e-07, + "loss": 0.0325, + "step": 5564 + }, + { + "epoch": 2.596082089552239, + "grad_norm": 1.0492334400722652, + "learning_rate": 5.422599848308602e-07, + "loss": 0.0334, + "step": 5566 + }, + { + "epoch": 2.5970149253731343, + "grad_norm": 1.0202829525957806, + "learning_rate": 5.398042330268461e-07, + "loss": 0.0345, + "step": 5568 + }, + { + "epoch": 2.59794776119403, + "grad_norm": 0.8092232677751454, + "learning_rate": 5.373537372375209e-07, + "loss": 0.0278, + "step": 5570 + }, + { + "epoch": 2.5988805970149254, + "grad_norm": 1.1415943323729512, + "learning_rate": 5.349085003506166e-07, + "loss": 0.0319, + "step": 5572 + }, + { + "epoch": 2.5998134328358207, + "grad_norm": 1.0558290186703099, + "learning_rate": 5.324685252476647e-07, + "loss": 0.0352, + "step": 5574 + }, + { + "epoch": 2.6007462686567164, + "grad_norm": 0.9278735390408485, + "learning_rate": 5.300338148039979e-07, + "loss": 0.0279, + "step": 5576 + }, + { + "epoch": 2.601679104477612, + "grad_norm": 0.8822870846249138, + "learning_rate": 5.276043718887464e-07, + "loss": 0.0278, + "step": 5578 + }, + { + "epoch": 2.6026119402985075, + "grad_norm": 0.9867527231063329, + "learning_rate": 5.251801993648281e-07, + "loss": 0.0294, + "step": 5580 + }, + { + "epoch": 2.603544776119403, + "grad_norm": 0.936868363790127, + "learning_rate": 5.227613000889558e-07, + "loss": 0.0275, + "step": 5582 + }, + { + "epoch": 2.6044776119402986, + "grad_norm": 0.9319660667976284, + "learning_rate": 5.203476769116239e-07, + "loss": 0.0253, + "step": 5584 + }, + { + "epoch": 2.605410447761194, + "grad_norm": 0.9204835657360955, + "learning_rate": 5.179393326771104e-07, + "loss": 0.0284, + "step": 5586 + }, + { + "epoch": 2.6063432835820897, + "grad_norm": 0.7635562770824101, + "learning_rate": 5.15536270223474e-07, + "loss": 0.0264, + "step": 5588 + }, + { + "epoch": 2.607276119402985, + "grad_norm": 0.7592415906330424, + "learning_rate": 5.131384923825489e-07, + "loss": 0.0256, + "step": 5590 + }, + { + "epoch": 2.6082089552238807, + "grad_norm": 0.8305660864451233, + "learning_rate": 5.107460019799387e-07, + "loss": 0.0241, + "step": 5592 + }, + { + "epoch": 2.609141791044776, + "grad_norm": 0.9241178513054383, + "learning_rate": 5.083588018350211e-07, + "loss": 0.0271, + "step": 5594 + }, + { + "epoch": 2.6100746268656714, + "grad_norm": 0.897370132959456, + "learning_rate": 5.059768947609345e-07, + "loss": 0.0275, + "step": 5596 + }, + { + "epoch": 2.611007462686567, + "grad_norm": 0.9744646780096524, + "learning_rate": 5.036002835645837e-07, + "loss": 0.0295, + "step": 5598 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 1.0062160171716794, + "learning_rate": 5.012289710466317e-07, + "loss": 0.025, + "step": 5600 + }, + { + "epoch": 2.612873134328358, + "grad_norm": 1.055927155622184, + "learning_rate": 4.988629600014966e-07, + "loss": 0.0274, + "step": 5602 + }, + { + "epoch": 2.6138059701492535, + "grad_norm": 0.837625553907277, + "learning_rate": 4.96502253217348e-07, + "loss": 0.0288, + "step": 5604 + }, + { + "epoch": 2.6147388059701493, + "grad_norm": 0.9584577611531877, + "learning_rate": 4.941468534761074e-07, + "loss": 0.0285, + "step": 5606 + }, + { + "epoch": 2.6156716417910446, + "grad_norm": 0.9342871100256482, + "learning_rate": 4.917967635534421e-07, + "loss": 0.0299, + "step": 5608 + }, + { + "epoch": 2.6166044776119404, + "grad_norm": 0.9819128727842276, + "learning_rate": 4.894519862187596e-07, + "loss": 0.0271, + "step": 5610 + }, + { + "epoch": 2.6175373134328357, + "grad_norm": 0.7754444347557119, + "learning_rate": 4.87112524235211e-07, + "loss": 0.0241, + "step": 5612 + }, + { + "epoch": 2.6184701492537314, + "grad_norm": 0.9015445051832793, + "learning_rate": 4.847783803596789e-07, + "loss": 0.0251, + "step": 5614 + }, + { + "epoch": 2.6194029850746268, + "grad_norm": 0.8595223243471014, + "learning_rate": 4.824495573427818e-07, + "loss": 0.0255, + "step": 5616 + }, + { + "epoch": 2.6203358208955225, + "grad_norm": 0.9052102415236957, + "learning_rate": 4.801260579288669e-07, + "loss": 0.0274, + "step": 5618 + }, + { + "epoch": 2.621268656716418, + "grad_norm": 1.219194280438804, + "learning_rate": 4.778078848560108e-07, + "loss": 0.0253, + "step": 5620 + }, + { + "epoch": 2.6222014925373136, + "grad_norm": 0.9275939908539204, + "learning_rate": 4.7549504085600773e-07, + "loss": 0.0309, + "step": 5622 + }, + { + "epoch": 2.623134328358209, + "grad_norm": 1.0371372383715778, + "learning_rate": 4.731875286543786e-07, + "loss": 0.0308, + "step": 5624 + }, + { + "epoch": 2.6240671641791042, + "grad_norm": 0.886916317382346, + "learning_rate": 4.7088535097035483e-07, + "loss": 0.0253, + "step": 5626 + }, + { + "epoch": 2.625, + "grad_norm": 0.9554220624448035, + "learning_rate": 4.685885105168864e-07, + "loss": 0.0282, + "step": 5628 + }, + { + "epoch": 2.6259328358208958, + "grad_norm": 0.8886167057050335, + "learning_rate": 4.66297010000632e-07, + "loss": 0.0328, + "step": 5630 + }, + { + "epoch": 2.626865671641791, + "grad_norm": 0.9304815536749925, + "learning_rate": 4.6401085212195607e-07, + "loss": 0.0263, + "step": 5632 + }, + { + "epoch": 2.6277985074626864, + "grad_norm": 1.075769767251724, + "learning_rate": 4.6173003957493026e-07, + "loss": 0.0288, + "step": 5634 + }, + { + "epoch": 2.628731343283582, + "grad_norm": 0.9417260104752829, + "learning_rate": 4.594545750473245e-07, + "loss": 0.0243, + "step": 5636 + }, + { + "epoch": 2.6296641791044775, + "grad_norm": 0.9669856754306663, + "learning_rate": 4.5718446122060666e-07, + "loss": 0.0254, + "step": 5638 + }, + { + "epoch": 2.6305970149253732, + "grad_norm": 1.0714829385244788, + "learning_rate": 4.5491970076994074e-07, + "loss": 0.031, + "step": 5640 + }, + { + "epoch": 2.6315298507462686, + "grad_norm": 1.2512403663477403, + "learning_rate": 4.526602963641824e-07, + "loss": 0.0312, + "step": 5642 + }, + { + "epoch": 2.6324626865671643, + "grad_norm": 0.9466518590437843, + "learning_rate": 4.504062506658724e-07, + "loss": 0.0261, + "step": 5644 + }, + { + "epoch": 2.6333955223880596, + "grad_norm": 0.8880752182018787, + "learning_rate": 4.481575663312415e-07, + "loss": 0.0278, + "step": 5646 + }, + { + "epoch": 2.6343283582089554, + "grad_norm": 0.9454381484970772, + "learning_rate": 4.4591424601019674e-07, + "loss": 0.0278, + "step": 5648 + }, + { + "epoch": 2.6352611940298507, + "grad_norm": 0.8350721440652583, + "learning_rate": 4.436762923463295e-07, + "loss": 0.0243, + "step": 5650 + }, + { + "epoch": 2.6361940298507465, + "grad_norm": 1.0080469270429602, + "learning_rate": 4.414437079769046e-07, + "loss": 0.0304, + "step": 5652 + }, + { + "epoch": 2.637126865671642, + "grad_norm": 1.0557127886720743, + "learning_rate": 4.392164955328582e-07, + "loss": 0.0331, + "step": 5654 + }, + { + "epoch": 2.638059701492537, + "grad_norm": 0.9324378923778945, + "learning_rate": 4.369946576387979e-07, + "loss": 0.03, + "step": 5656 + }, + { + "epoch": 2.638992537313433, + "grad_norm": 0.9421771251974606, + "learning_rate": 4.347781969129977e-07, + "loss": 0.0275, + "step": 5658 + }, + { + "epoch": 2.6399253731343286, + "grad_norm": 0.8141898581331491, + "learning_rate": 4.325671159673933e-07, + "loss": 0.0275, + "step": 5660 + }, + { + "epoch": 2.640858208955224, + "grad_norm": 0.9178761025153416, + "learning_rate": 4.303614174075826e-07, + "loss": 0.0324, + "step": 5662 + }, + { + "epoch": 2.6417910447761193, + "grad_norm": 0.9662115133527521, + "learning_rate": 4.281611038328215e-07, + "loss": 0.0267, + "step": 5664 + }, + { + "epoch": 2.642723880597015, + "grad_norm": 0.8822810881757057, + "learning_rate": 4.2596617783601744e-07, + "loss": 0.0276, + "step": 5666 + }, + { + "epoch": 2.6436567164179103, + "grad_norm": 1.030215439678188, + "learning_rate": 4.2377664200372927e-07, + "loss": 0.0288, + "step": 5668 + }, + { + "epoch": 2.644589552238806, + "grad_norm": 1.0104314700113746, + "learning_rate": 4.2159249891616626e-07, + "loss": 0.03, + "step": 5670 + }, + { + "epoch": 2.6455223880597014, + "grad_norm": 1.0056294457362005, + "learning_rate": 4.194137511471824e-07, + "loss": 0.0286, + "step": 5672 + }, + { + "epoch": 2.646455223880597, + "grad_norm": 1.0073607104422846, + "learning_rate": 4.1724040126427e-07, + "loss": 0.0274, + "step": 5674 + }, + { + "epoch": 2.6473880597014925, + "grad_norm": 0.9770224547966474, + "learning_rate": 4.150724518285659e-07, + "loss": 0.0271, + "step": 5676 + }, + { + "epoch": 2.648320895522388, + "grad_norm": 0.9416539715365718, + "learning_rate": 4.1290990539483767e-07, + "loss": 0.0288, + "step": 5678 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.8329335425143416, + "learning_rate": 4.107527645114889e-07, + "loss": 0.0256, + "step": 5680 + }, + { + "epoch": 2.6501865671641793, + "grad_norm": 1.1138931507107372, + "learning_rate": 4.0860103172055354e-07, + "loss": 0.0374, + "step": 5682 + }, + { + "epoch": 2.6511194029850746, + "grad_norm": 0.9365299677312837, + "learning_rate": 4.064547095576904e-07, + "loss": 0.0302, + "step": 5684 + }, + { + "epoch": 2.65205223880597, + "grad_norm": 0.9889780175992751, + "learning_rate": 4.0431380055218297e-07, + "loss": 0.0279, + "step": 5686 + }, + { + "epoch": 2.6529850746268657, + "grad_norm": 1.1592467281919736, + "learning_rate": 4.02178307226937e-07, + "loss": 0.0284, + "step": 5688 + }, + { + "epoch": 2.653917910447761, + "grad_norm": 1.028022907074052, + "learning_rate": 4.0004823209847386e-07, + "loss": 0.0292, + "step": 5690 + }, + { + "epoch": 2.654850746268657, + "grad_norm": 0.9199005290867535, + "learning_rate": 3.9792357767693244e-07, + "loss": 0.0274, + "step": 5692 + }, + { + "epoch": 2.655783582089552, + "grad_norm": 0.9100899857863779, + "learning_rate": 3.958043464660638e-07, + "loss": 0.0257, + "step": 5694 + }, + { + "epoch": 2.656716417910448, + "grad_norm": 1.0666340677971675, + "learning_rate": 3.9369054096322414e-07, + "loss": 0.0299, + "step": 5696 + }, + { + "epoch": 2.657649253731343, + "grad_norm": 0.8878129132710791, + "learning_rate": 3.9158216365938193e-07, + "loss": 0.0186, + "step": 5698 + }, + { + "epoch": 2.658582089552239, + "grad_norm": 1.0480066675487099, + "learning_rate": 3.8947921703910374e-07, + "loss": 0.0294, + "step": 5700 + }, + { + "epoch": 2.6595149253731343, + "grad_norm": 0.841133672918763, + "learning_rate": 3.873817035805572e-07, + "loss": 0.0292, + "step": 5702 + }, + { + "epoch": 2.66044776119403, + "grad_norm": 0.9827002199151351, + "learning_rate": 3.8528962575551167e-07, + "loss": 0.0295, + "step": 5704 + }, + { + "epoch": 2.6613805970149254, + "grad_norm": 0.7956684171826566, + "learning_rate": 3.8320298602932626e-07, + "loss": 0.0257, + "step": 5706 + }, + { + "epoch": 2.6623134328358207, + "grad_norm": 0.8299638245728805, + "learning_rate": 3.811217868609535e-07, + "loss": 0.0231, + "step": 5708 + }, + { + "epoch": 2.6632462686567164, + "grad_norm": 0.8740361430812925, + "learning_rate": 3.790460307029348e-07, + "loss": 0.0271, + "step": 5710 + }, + { + "epoch": 2.664179104477612, + "grad_norm": 1.035152612295088, + "learning_rate": 3.7697572000139624e-07, + "loss": 0.0289, + "step": 5712 + }, + { + "epoch": 2.6651119402985075, + "grad_norm": 1.040699866935444, + "learning_rate": 3.7491085719604805e-07, + "loss": 0.029, + "step": 5714 + }, + { + "epoch": 2.666044776119403, + "grad_norm": 0.857941292118624, + "learning_rate": 3.728514447201814e-07, + "loss": 0.0288, + "step": 5716 + }, + { + "epoch": 2.6669776119402986, + "grad_norm": 0.8676768629391651, + "learning_rate": 3.707974850006624e-07, + "loss": 0.0285, + "step": 5718 + }, + { + "epoch": 2.667910447761194, + "grad_norm": 0.8954866462200547, + "learning_rate": 3.6874898045793086e-07, + "loss": 0.0248, + "step": 5720 + }, + { + "epoch": 2.6688432835820897, + "grad_norm": 0.8895165404260262, + "learning_rate": 3.667059335060014e-07, + "loss": 0.0288, + "step": 5722 + }, + { + "epoch": 2.669776119402985, + "grad_norm": 1.0408571334135006, + "learning_rate": 3.646683465524564e-07, + "loss": 0.0308, + "step": 5724 + }, + { + "epoch": 2.6707089552238807, + "grad_norm": 1.0023749200849796, + "learning_rate": 3.6263622199844085e-07, + "loss": 0.0276, + "step": 5726 + }, + { + "epoch": 2.671641791044776, + "grad_norm": 0.9015517833503137, + "learning_rate": 3.6060956223866683e-07, + "loss": 0.0275, + "step": 5728 + }, + { + "epoch": 2.6725746268656714, + "grad_norm": 0.9523846371790706, + "learning_rate": 3.5858836966140345e-07, + "loss": 0.0289, + "step": 5730 + }, + { + "epoch": 2.673507462686567, + "grad_norm": 0.9897606811173157, + "learning_rate": 3.565726466484798e-07, + "loss": 0.028, + "step": 5732 + }, + { + "epoch": 2.674440298507463, + "grad_norm": 0.8536692203456191, + "learning_rate": 3.5456239557527585e-07, + "loss": 0.0275, + "step": 5734 + }, + { + "epoch": 2.675373134328358, + "grad_norm": 1.026573977594397, + "learning_rate": 3.5255761881072823e-07, + "loss": 0.0274, + "step": 5736 + }, + { + "epoch": 2.6763059701492535, + "grad_norm": 1.136749269539635, + "learning_rate": 3.505583187173178e-07, + "loss": 0.0312, + "step": 5738 + }, + { + "epoch": 2.6772388059701493, + "grad_norm": 0.9466562456545305, + "learning_rate": 3.485644976510755e-07, + "loss": 0.03, + "step": 5740 + }, + { + "epoch": 2.6781716417910446, + "grad_norm": 1.0678576679932985, + "learning_rate": 3.465761579615712e-07, + "loss": 0.0326, + "step": 5742 + }, + { + "epoch": 2.6791044776119404, + "grad_norm": 0.8681817771137303, + "learning_rate": 3.445933019919195e-07, + "loss": 0.0245, + "step": 5744 + }, + { + "epoch": 2.6800373134328357, + "grad_norm": 0.8061913529013403, + "learning_rate": 3.42615932078772e-07, + "loss": 0.0266, + "step": 5746 + }, + { + "epoch": 2.6809701492537314, + "grad_norm": 0.9215643445592973, + "learning_rate": 3.406440505523123e-07, + "loss": 0.0292, + "step": 5748 + }, + { + "epoch": 2.6819029850746268, + "grad_norm": 0.9952871439157782, + "learning_rate": 3.386776597362612e-07, + "loss": 0.0301, + "step": 5750 + }, + { + "epoch": 2.6828358208955225, + "grad_norm": 0.7926829149673023, + "learning_rate": 3.367167619478651e-07, + "loss": 0.0235, + "step": 5752 + }, + { + "epoch": 2.683768656716418, + "grad_norm": 0.9451852918480704, + "learning_rate": 3.347613594978971e-07, + "loss": 0.0255, + "step": 5754 + }, + { + "epoch": 2.6847014925373136, + "grad_norm": 0.8259137510974414, + "learning_rate": 3.3281145469065913e-07, + "loss": 0.0265, + "step": 5756 + }, + { + "epoch": 2.685634328358209, + "grad_norm": 0.8248583370521401, + "learning_rate": 3.3086704982397077e-07, + "loss": 0.0256, + "step": 5758 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 1.002464695024011, + "learning_rate": 3.289281471891692e-07, + "loss": 0.0275, + "step": 5760 + }, + { + "epoch": 2.6875, + "grad_norm": 0.9291674809789717, + "learning_rate": 3.269947490711117e-07, + "loss": 0.0255, + "step": 5762 + }, + { + "epoch": 2.6884328358208958, + "grad_norm": 1.0062847946924403, + "learning_rate": 3.2506685774816527e-07, + "loss": 0.031, + "step": 5764 + }, + { + "epoch": 2.689365671641791, + "grad_norm": 0.8845161324267664, + "learning_rate": 3.231444754922086e-07, + "loss": 0.0283, + "step": 5766 + }, + { + "epoch": 2.6902985074626864, + "grad_norm": 1.1501011277586621, + "learning_rate": 3.2122760456863023e-07, + "loss": 0.0277, + "step": 5768 + }, + { + "epoch": 2.691231343283582, + "grad_norm": 0.923500915565738, + "learning_rate": 3.1931624723632147e-07, + "loss": 0.0281, + "step": 5770 + }, + { + "epoch": 2.6921641791044775, + "grad_norm": 1.0121932562794196, + "learning_rate": 3.174104057476768e-07, + "loss": 0.0287, + "step": 5772 + }, + { + "epoch": 2.6930970149253732, + "grad_norm": 0.8451398328503532, + "learning_rate": 3.1551008234859236e-07, + "loss": 0.0256, + "step": 5774 + }, + { + "epoch": 2.6940298507462686, + "grad_norm": 0.8868264662556339, + "learning_rate": 3.136152792784586e-07, + "loss": 0.0311, + "step": 5776 + }, + { + "epoch": 2.6949626865671643, + "grad_norm": 1.132609349725213, + "learning_rate": 3.1172599877016316e-07, + "loss": 0.0304, + "step": 5778 + }, + { + "epoch": 2.6958955223880596, + "grad_norm": 0.9300362845643634, + "learning_rate": 3.098422430500864e-07, + "loss": 0.0344, + "step": 5780 + }, + { + "epoch": 2.6968283582089554, + "grad_norm": 0.9961563897889548, + "learning_rate": 3.0796401433809465e-07, + "loss": 0.028, + "step": 5782 + }, + { + "epoch": 2.6977611940298507, + "grad_norm": 0.9169652584714555, + "learning_rate": 3.060913148475453e-07, + "loss": 0.0272, + "step": 5784 + }, + { + "epoch": 2.6986940298507465, + "grad_norm": 0.8041017569702099, + "learning_rate": 3.0422414678527526e-07, + "loss": 0.0266, + "step": 5786 + }, + { + "epoch": 2.699626865671642, + "grad_norm": 0.7222524826917436, + "learning_rate": 3.0236251235160827e-07, + "loss": 0.0225, + "step": 5788 + }, + { + "epoch": 2.700559701492537, + "grad_norm": 0.9999139168019376, + "learning_rate": 3.005064137403424e-07, + "loss": 0.0284, + "step": 5790 + }, + { + "epoch": 2.701492537313433, + "grad_norm": 0.9201166365049777, + "learning_rate": 2.986558531387557e-07, + "loss": 0.0243, + "step": 5792 + }, + { + "epoch": 2.7024253731343286, + "grad_norm": 1.0336207789711631, + "learning_rate": 2.9681083272759645e-07, + "loss": 0.0296, + "step": 5794 + }, + { + "epoch": 2.703358208955224, + "grad_norm": 0.9861828294752073, + "learning_rate": 2.949713546810884e-07, + "loss": 0.0277, + "step": 5796 + }, + { + "epoch": 2.7042910447761193, + "grad_norm": 0.8417513599421644, + "learning_rate": 2.931374211669219e-07, + "loss": 0.0263, + "step": 5798 + }, + { + "epoch": 2.705223880597015, + "grad_norm": 1.1332062360436599, + "learning_rate": 2.913090343462516e-07, + "loss": 0.0313, + "step": 5800 + }, + { + "epoch": 2.7061567164179103, + "grad_norm": 0.9197858394528459, + "learning_rate": 2.8948619637370056e-07, + "loss": 0.0331, + "step": 5802 + }, + { + "epoch": 2.707089552238806, + "grad_norm": 1.2269001374096953, + "learning_rate": 2.876689093973484e-07, + "loss": 0.0295, + "step": 5804 + }, + { + "epoch": 2.7080223880597014, + "grad_norm": 0.9099957401660649, + "learning_rate": 2.8585717555873307e-07, + "loss": 0.031, + "step": 5806 + }, + { + "epoch": 2.708955223880597, + "grad_norm": 0.8969373627355374, + "learning_rate": 2.8405099699285456e-07, + "loss": 0.0293, + "step": 5808 + }, + { + "epoch": 2.7098880597014925, + "grad_norm": 1.072333792921784, + "learning_rate": 2.8225037582816027e-07, + "loss": 0.0332, + "step": 5810 + }, + { + "epoch": 2.710820895522388, + "grad_norm": 0.8388134088824518, + "learning_rate": 2.804553141865496e-07, + "loss": 0.029, + "step": 5812 + }, + { + "epoch": 2.7117537313432836, + "grad_norm": 1.0705158139568056, + "learning_rate": 2.786658141833737e-07, + "loss": 0.0342, + "step": 5814 + }, + { + "epoch": 2.7126865671641793, + "grad_norm": 0.8876563936002391, + "learning_rate": 2.768818779274263e-07, + "loss": 0.0275, + "step": 5816 + }, + { + "epoch": 2.7136194029850746, + "grad_norm": 0.8903356890804164, + "learning_rate": 2.7510350752094404e-07, + "loss": 0.0258, + "step": 5818 + }, + { + "epoch": 2.71455223880597, + "grad_norm": 0.8764898589140518, + "learning_rate": 2.7333070505961014e-07, + "loss": 0.0258, + "step": 5820 + }, + { + "epoch": 2.7154850746268657, + "grad_norm": 0.9665390589158259, + "learning_rate": 2.7156347263254057e-07, + "loss": 0.0286, + "step": 5822 + }, + { + "epoch": 2.716417910447761, + "grad_norm": 0.8531868203598048, + "learning_rate": 2.6980181232228953e-07, + "loss": 0.0289, + "step": 5824 + }, + { + "epoch": 2.717350746268657, + "grad_norm": 1.1495247812983556, + "learning_rate": 2.680457262048458e-07, + "loss": 0.0306, + "step": 5826 + }, + { + "epoch": 2.718283582089552, + "grad_norm": 1.123839315238744, + "learning_rate": 2.662952163496274e-07, + "loss": 0.0303, + "step": 5828 + }, + { + "epoch": 2.719216417910448, + "grad_norm": 0.7964731534176683, + "learning_rate": 2.645502848194831e-07, + "loss": 0.0225, + "step": 5830 + }, + { + "epoch": 2.720149253731343, + "grad_norm": 1.069790484465966, + "learning_rate": 2.628109336706874e-07, + "loss": 0.029, + "step": 5832 + }, + { + "epoch": 2.721082089552239, + "grad_norm": 1.021373389027856, + "learning_rate": 2.61077164952937e-07, + "loss": 0.0249, + "step": 5834 + }, + { + "epoch": 2.7220149253731343, + "grad_norm": 0.937542896119991, + "learning_rate": 2.593489807093536e-07, + "loss": 0.0304, + "step": 5836 + }, + { + "epoch": 2.72294776119403, + "grad_norm": 0.9098111222144343, + "learning_rate": 2.5762638297647416e-07, + "loss": 0.026, + "step": 5838 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.9242261549967771, + "learning_rate": 2.559093737842561e-07, + "loss": 0.0307, + "step": 5840 + }, + { + "epoch": 2.7248134328358207, + "grad_norm": 0.9992330153262171, + "learning_rate": 2.541979551560669e-07, + "loss": 0.0279, + "step": 5842 + }, + { + "epoch": 2.7257462686567164, + "grad_norm": 0.8755327544571123, + "learning_rate": 2.524921291086907e-07, + "loss": 0.0226, + "step": 5844 + }, + { + "epoch": 2.726679104477612, + "grad_norm": 1.0727272877606369, + "learning_rate": 2.5079189765231716e-07, + "loss": 0.0301, + "step": 5846 + }, + { + "epoch": 2.7276119402985075, + "grad_norm": 0.9367220868135965, + "learning_rate": 2.4909726279054527e-07, + "loss": 0.028, + "step": 5848 + }, + { + "epoch": 2.728544776119403, + "grad_norm": 1.083830164979458, + "learning_rate": 2.4740822652037865e-07, + "loss": 0.0301, + "step": 5850 + }, + { + "epoch": 2.7294776119402986, + "grad_norm": 1.1236279986244866, + "learning_rate": 2.4572479083222243e-07, + "loss": 0.0291, + "step": 5852 + }, + { + "epoch": 2.730410447761194, + "grad_norm": 0.7827245639762057, + "learning_rate": 2.4404695770988364e-07, + "loss": 0.0218, + "step": 5854 + }, + { + "epoch": 2.7313432835820897, + "grad_norm": 0.9827612668764716, + "learning_rate": 2.42374729130565e-07, + "loss": 0.0281, + "step": 5856 + }, + { + "epoch": 2.732276119402985, + "grad_norm": 1.0807720694649268, + "learning_rate": 2.4070810706486536e-07, + "loss": 0.0291, + "step": 5858 + }, + { + "epoch": 2.7332089552238807, + "grad_norm": 0.9403172056503358, + "learning_rate": 2.39047093476778e-07, + "loss": 0.0278, + "step": 5860 + }, + { + "epoch": 2.734141791044776, + "grad_norm": 0.963183647905619, + "learning_rate": 2.373916903236856e-07, + "loss": 0.0305, + "step": 5862 + }, + { + "epoch": 2.7350746268656714, + "grad_norm": 0.9178054620047662, + "learning_rate": 2.357418995563593e-07, + "loss": 0.0292, + "step": 5864 + }, + { + "epoch": 2.736007462686567, + "grad_norm": 0.8718140353571424, + "learning_rate": 2.340977231189584e-07, + "loss": 0.0305, + "step": 5866 + }, + { + "epoch": 2.736940298507463, + "grad_norm": 0.9119854800983569, + "learning_rate": 2.3245916294902306e-07, + "loss": 0.0284, + "step": 5868 + }, + { + "epoch": 2.737873134328358, + "grad_norm": 0.8672871707764913, + "learning_rate": 2.3082622097747643e-07, + "loss": 0.0238, + "step": 5870 + }, + { + "epoch": 2.7388059701492535, + "grad_norm": 1.0926101102506087, + "learning_rate": 2.2919889912862313e-07, + "loss": 0.0267, + "step": 5872 + }, + { + "epoch": 2.7397388059701493, + "grad_norm": 0.9671659616597236, + "learning_rate": 2.2757719932014199e-07, + "loss": 0.0278, + "step": 5874 + }, + { + "epoch": 2.7406716417910446, + "grad_norm": 0.8876914643257722, + "learning_rate": 2.259611234630865e-07, + "loss": 0.0271, + "step": 5876 + }, + { + "epoch": 2.7416044776119404, + "grad_norm": 0.9926268794456025, + "learning_rate": 2.243506734618861e-07, + "loss": 0.0314, + "step": 5878 + }, + { + "epoch": 2.7425373134328357, + "grad_norm": 1.036129350284489, + "learning_rate": 2.2274585121433712e-07, + "loss": 0.031, + "step": 5880 + }, + { + "epoch": 2.7434701492537314, + "grad_norm": 0.9216796468280406, + "learning_rate": 2.211466586116051e-07, + "loss": 0.0262, + "step": 5882 + }, + { + "epoch": 2.7444029850746268, + "grad_norm": 1.105867185044268, + "learning_rate": 2.1955309753822262e-07, + "loss": 0.0273, + "step": 5884 + }, + { + "epoch": 2.7453358208955225, + "grad_norm": 0.9475958664820686, + "learning_rate": 2.1796516987208361e-07, + "loss": 0.0283, + "step": 5886 + }, + { + "epoch": 2.746268656716418, + "grad_norm": 0.95920474645987, + "learning_rate": 2.1638287748444675e-07, + "loss": 0.0292, + "step": 5888 + }, + { + "epoch": 2.7472014925373136, + "grad_norm": 0.8738107936054988, + "learning_rate": 2.148062222399261e-07, + "loss": 0.0245, + "step": 5890 + }, + { + "epoch": 2.748134328358209, + "grad_norm": 0.9399184144653854, + "learning_rate": 2.1323520599649484e-07, + "loss": 0.0279, + "step": 5892 + }, + { + "epoch": 2.7490671641791042, + "grad_norm": 0.8915402456411565, + "learning_rate": 2.1166983060548097e-07, + "loss": 0.0312, + "step": 5894 + }, + { + "epoch": 2.75, + "grad_norm": 0.9847475058229502, + "learning_rate": 2.101100979115661e-07, + "loss": 0.0339, + "step": 5896 + }, + { + "epoch": 2.7509328358208958, + "grad_norm": 1.1775427811502297, + "learning_rate": 2.0855600975277945e-07, + "loss": 0.0284, + "step": 5898 + }, + { + "epoch": 2.751865671641791, + "grad_norm": 0.8642503660649238, + "learning_rate": 2.0700756796050213e-07, + "loss": 0.0273, + "step": 5900 + }, + { + "epoch": 2.7527985074626864, + "grad_norm": 0.9802969737371192, + "learning_rate": 2.0546477435945733e-07, + "loss": 0.0266, + "step": 5902 + }, + { + "epoch": 2.753731343283582, + "grad_norm": 0.9502542802301004, + "learning_rate": 2.0392763076771626e-07, + "loss": 0.028, + "step": 5904 + }, + { + "epoch": 2.7546641791044775, + "grad_norm": 0.8383762876328574, + "learning_rate": 2.0239613899669052e-07, + "loss": 0.0235, + "step": 5906 + }, + { + "epoch": 2.7555970149253732, + "grad_norm": 0.9390110842420307, + "learning_rate": 2.0087030085113034e-07, + "loss": 0.0282, + "step": 5908 + }, + { + "epoch": 2.7565298507462686, + "grad_norm": 1.0505544986084194, + "learning_rate": 1.9935011812912408e-07, + "loss": 0.0306, + "step": 5910 + }, + { + "epoch": 2.7574626865671643, + "grad_norm": 0.8970754307861333, + "learning_rate": 1.978355926220965e-07, + "loss": 0.0243, + "step": 5912 + }, + { + "epoch": 2.7583955223880596, + "grad_norm": 0.9528293896094844, + "learning_rate": 1.9632672611480607e-07, + "loss": 0.0283, + "step": 5914 + }, + { + "epoch": 2.7593283582089554, + "grad_norm": 1.0440224674452707, + "learning_rate": 1.948235203853399e-07, + "loss": 0.0268, + "step": 5916 + }, + { + "epoch": 2.7602611940298507, + "grad_norm": 0.882354315586926, + "learning_rate": 1.933259772051177e-07, + "loss": 0.0241, + "step": 5918 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.9615225185117673, + "learning_rate": 1.918340983388839e-07, + "loss": 0.0265, + "step": 5920 + }, + { + "epoch": 2.762126865671642, + "grad_norm": 0.8003798033632283, + "learning_rate": 1.9034788554470718e-07, + "loss": 0.0238, + "step": 5922 + }, + { + "epoch": 2.763059701492537, + "grad_norm": 1.2083306229143707, + "learning_rate": 1.888673405739838e-07, + "loss": 0.03, + "step": 5924 + }, + { + "epoch": 2.763992537313433, + "grad_norm": 1.196171660226283, + "learning_rate": 1.873924651714265e-07, + "loss": 0.0301, + "step": 5926 + }, + { + "epoch": 2.7649253731343286, + "grad_norm": 0.8162603525073673, + "learning_rate": 1.859232610750672e-07, + "loss": 0.0264, + "step": 5928 + }, + { + "epoch": 2.765858208955224, + "grad_norm": 0.9070875234636459, + "learning_rate": 1.844597300162565e-07, + "loss": 0.0289, + "step": 5930 + }, + { + "epoch": 2.7667910447761193, + "grad_norm": 0.917794530772954, + "learning_rate": 1.8300187371965762e-07, + "loss": 0.0293, + "step": 5932 + }, + { + "epoch": 2.767723880597015, + "grad_norm": 0.984817465761431, + "learning_rate": 1.8154969390324905e-07, + "loss": 0.0274, + "step": 5934 + }, + { + "epoch": 2.7686567164179103, + "grad_norm": 0.9713453699788799, + "learning_rate": 1.8010319227831808e-07, + "loss": 0.0258, + "step": 5936 + }, + { + "epoch": 2.769589552238806, + "grad_norm": 0.9728200254285655, + "learning_rate": 1.7866237054946168e-07, + "loss": 0.0269, + "step": 5938 + }, + { + "epoch": 2.7705223880597014, + "grad_norm": 1.1102464322971721, + "learning_rate": 1.772272304145811e-07, + "loss": 0.0291, + "step": 5940 + }, + { + "epoch": 2.771455223880597, + "grad_norm": 0.9314621692224927, + "learning_rate": 1.7579777356488637e-07, + "loss": 0.0233, + "step": 5942 + }, + { + "epoch": 2.7723880597014925, + "grad_norm": 0.9343419601411406, + "learning_rate": 1.7437400168488604e-07, + "loss": 0.0267, + "step": 5944 + }, + { + "epoch": 2.773320895522388, + "grad_norm": 0.88850369106571, + "learning_rate": 1.7295591645239195e-07, + "loss": 0.0245, + "step": 5946 + }, + { + "epoch": 2.7742537313432836, + "grad_norm": 0.9150195040244912, + "learning_rate": 1.7154351953851456e-07, + "loss": 0.0287, + "step": 5948 + }, + { + "epoch": 2.7751865671641793, + "grad_norm": 0.8471315847351008, + "learning_rate": 1.7013681260765912e-07, + "loss": 0.0247, + "step": 5950 + }, + { + "epoch": 2.7761194029850746, + "grad_norm": 0.8248877102500325, + "learning_rate": 1.6873579731752797e-07, + "loss": 0.0267, + "step": 5952 + }, + { + "epoch": 2.77705223880597, + "grad_norm": 0.8266015763287755, + "learning_rate": 1.6734047531911436e-07, + "loss": 0.0245, + "step": 5954 + }, + { + "epoch": 2.7779850746268657, + "grad_norm": 0.8942446725228442, + "learning_rate": 1.6595084825670403e-07, + "loss": 0.028, + "step": 5956 + }, + { + "epoch": 2.778917910447761, + "grad_norm": 1.0361421543357303, + "learning_rate": 1.6456691776787103e-07, + "loss": 0.0263, + "step": 5958 + }, + { + "epoch": 2.779850746268657, + "grad_norm": 0.95816275670667, + "learning_rate": 1.6318868548347578e-07, + "loss": 0.0318, + "step": 5960 + }, + { + "epoch": 2.780783582089552, + "grad_norm": 1.0419289710700441, + "learning_rate": 1.618161530276635e-07, + "loss": 0.0292, + "step": 5962 + }, + { + "epoch": 2.781716417910448, + "grad_norm": 0.9671095594028484, + "learning_rate": 1.604493220178649e-07, + "loss": 0.0289, + "step": 5964 + }, + { + "epoch": 2.782649253731343, + "grad_norm": 0.8723906868587412, + "learning_rate": 1.590881940647898e-07, + "loss": 0.028, + "step": 5966 + }, + { + "epoch": 2.783582089552239, + "grad_norm": 1.0100792425284035, + "learning_rate": 1.5773277077242744e-07, + "loss": 0.0301, + "step": 5968 + }, + { + "epoch": 2.7845149253731343, + "grad_norm": 1.094048809300156, + "learning_rate": 1.5638305373804618e-07, + "loss": 0.0313, + "step": 5970 + }, + { + "epoch": 2.78544776119403, + "grad_norm": 0.986166343643483, + "learning_rate": 1.550390445521882e-07, + "loss": 0.0252, + "step": 5972 + }, + { + "epoch": 2.7863805970149254, + "grad_norm": 0.9034236201858159, + "learning_rate": 1.537007447986699e-07, + "loss": 0.0246, + "step": 5974 + }, + { + "epoch": 2.7873134328358207, + "grad_norm": 0.8889417799041324, + "learning_rate": 1.5236815605457977e-07, + "loss": 0.0324, + "step": 5976 + }, + { + "epoch": 2.7882462686567164, + "grad_norm": 0.7915261215048099, + "learning_rate": 1.5104127989027661e-07, + "loss": 0.0239, + "step": 5978 + }, + { + "epoch": 2.789179104477612, + "grad_norm": 0.8899753000706484, + "learning_rate": 1.4972011786938688e-07, + "loss": 0.0262, + "step": 5980 + }, + { + "epoch": 2.7901119402985075, + "grad_norm": 1.0494806109561634, + "learning_rate": 1.4840467154880412e-07, + "loss": 0.0304, + "step": 5982 + }, + { + "epoch": 2.791044776119403, + "grad_norm": 0.9028726174284042, + "learning_rate": 1.4709494247868384e-07, + "loss": 0.0274, + "step": 5984 + }, + { + "epoch": 2.7919776119402986, + "grad_norm": 0.9484390079992623, + "learning_rate": 1.4579093220244755e-07, + "loss": 0.0275, + "step": 5986 + }, + { + "epoch": 2.792910447761194, + "grad_norm": 0.9266575933813405, + "learning_rate": 1.4449264225677607e-07, + "loss": 0.0258, + "step": 5988 + }, + { + "epoch": 2.7938432835820897, + "grad_norm": 0.820131848565527, + "learning_rate": 1.432000741716083e-07, + "loss": 0.022, + "step": 5990 + }, + { + "epoch": 2.794776119402985, + "grad_norm": 0.7670878758198434, + "learning_rate": 1.4191322947014198e-07, + "loss": 0.0224, + "step": 5992 + }, + { + "epoch": 2.7957089552238807, + "grad_norm": 0.8111190526491343, + "learning_rate": 1.40632109668829e-07, + "loss": 0.0267, + "step": 5994 + }, + { + "epoch": 2.796641791044776, + "grad_norm": 1.054620489439934, + "learning_rate": 1.3935671627737568e-07, + "loss": 0.031, + "step": 5996 + }, + { + "epoch": 2.7975746268656714, + "grad_norm": 0.9122734685692475, + "learning_rate": 1.3808705079873974e-07, + "loss": 0.0286, + "step": 5998 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 1.011995085628338, + "learning_rate": 1.368231147291299e-07, + "loss": 0.0253, + "step": 6000 + }, + { + "epoch": 2.798507462686567, + "eval_loss": 0.18329007923603058, + "eval_runtime": 323.0954, + "eval_samples_per_second": 47.187, + "eval_steps_per_second": 5.899, + "step": 6000 + }, + { + "epoch": 2.799440298507463, + "grad_norm": 0.9221866409015929, + "learning_rate": 1.3556490955800084e-07, + "loss": 0.0239, + "step": 6002 + }, + { + "epoch": 2.800373134328358, + "grad_norm": 0.8265850904760484, + "learning_rate": 1.3431243676805706e-07, + "loss": 0.0235, + "step": 6004 + }, + { + "epoch": 2.8013059701492535, + "grad_norm": 0.916524786966227, + "learning_rate": 1.3306569783524515e-07, + "loss": 0.0278, + "step": 6006 + }, + { + "epoch": 2.8022388059701493, + "grad_norm": 1.0627597059350238, + "learning_rate": 1.31824694228756e-07, + "loss": 0.0268, + "step": 6008 + }, + { + "epoch": 2.8031716417910446, + "grad_norm": 0.9854456242392609, + "learning_rate": 1.3058942741102255e-07, + "loss": 0.0296, + "step": 6010 + }, + { + "epoch": 2.8041044776119404, + "grad_norm": 0.8733962291870478, + "learning_rate": 1.293598988377154e-07, + "loss": 0.0271, + "step": 6012 + }, + { + "epoch": 2.8050373134328357, + "grad_norm": 0.9020372572074464, + "learning_rate": 1.2813610995774383e-07, + "loss": 0.0267, + "step": 6014 + }, + { + "epoch": 2.8059701492537314, + "grad_norm": 1.0309149190481628, + "learning_rate": 1.2691806221325488e-07, + "loss": 0.0271, + "step": 6016 + }, + { + "epoch": 2.8069029850746268, + "grad_norm": 0.7397732761136856, + "learning_rate": 1.257057570396275e-07, + "loss": 0.0249, + "step": 6018 + }, + { + "epoch": 2.8078358208955225, + "grad_norm": 0.9746213437526945, + "learning_rate": 1.244991958654751e-07, + "loss": 0.0255, + "step": 6020 + }, + { + "epoch": 2.808768656716418, + "grad_norm": 0.9092447986844799, + "learning_rate": 1.2329838011264305e-07, + "loss": 0.0297, + "step": 6022 + }, + { + "epoch": 2.8097014925373136, + "grad_norm": 0.8983926055526076, + "learning_rate": 1.2210331119620333e-07, + "loss": 0.0252, + "step": 6024 + }, + { + "epoch": 2.810634328358209, + "grad_norm": 0.9677539682770142, + "learning_rate": 1.2091399052445774e-07, + "loss": 0.027, + "step": 6026 + }, + { + "epoch": 2.8115671641791042, + "grad_norm": 1.0472270528820966, + "learning_rate": 1.197304194989335e-07, + "loss": 0.0279, + "step": 6028 + }, + { + "epoch": 2.8125, + "grad_norm": 0.891285413190194, + "learning_rate": 1.185525995143838e-07, + "loss": 0.0245, + "step": 6030 + }, + { + "epoch": 2.8134328358208958, + "grad_norm": 1.0446696188843714, + "learning_rate": 1.1738053195878174e-07, + "loss": 0.0285, + "step": 6032 + }, + { + "epoch": 2.814365671641791, + "grad_norm": 0.8281663266840147, + "learning_rate": 1.1621421821332469e-07, + "loss": 0.0237, + "step": 6034 + }, + { + "epoch": 2.8152985074626864, + "grad_norm": 0.985949627033281, + "learning_rate": 1.150536596524271e-07, + "loss": 0.0281, + "step": 6036 + }, + { + "epoch": 2.816231343283582, + "grad_norm": 0.9968398571881046, + "learning_rate": 1.1389885764372221e-07, + "loss": 0.0278, + "step": 6038 + }, + { + "epoch": 2.8171641791044775, + "grad_norm": 0.9660425784764326, + "learning_rate": 1.1274981354806147e-07, + "loss": 0.0328, + "step": 6040 + }, + { + "epoch": 2.8180970149253732, + "grad_norm": 0.9324463921158841, + "learning_rate": 1.1160652871950839e-07, + "loss": 0.0269, + "step": 6042 + }, + { + "epoch": 2.8190298507462686, + "grad_norm": 0.9029724221313022, + "learning_rate": 1.1046900450533971e-07, + "loss": 0.021, + "step": 6044 + }, + { + "epoch": 2.8199626865671643, + "grad_norm": 1.0833243574375553, + "learning_rate": 1.0933724224604536e-07, + "loss": 0.0308, + "step": 6046 + }, + { + "epoch": 2.8208955223880596, + "grad_norm": 1.1845420995114608, + "learning_rate": 1.0821124327532462e-07, + "loss": 0.0319, + "step": 6048 + }, + { + "epoch": 2.8218283582089554, + "grad_norm": 1.060331874970539, + "learning_rate": 1.070910089200844e-07, + "loss": 0.0265, + "step": 6050 + }, + { + "epoch": 2.8227611940298507, + "grad_norm": 1.0277983078485242, + "learning_rate": 1.0597654050043982e-07, + "loss": 0.0306, + "step": 6052 + }, + { + "epoch": 2.8236940298507465, + "grad_norm": 0.7963096751323487, + "learning_rate": 1.0486783932970924e-07, + "loss": 0.0237, + "step": 6054 + }, + { + "epoch": 2.824626865671642, + "grad_norm": 0.9308811945112528, + "learning_rate": 1.0376490671441752e-07, + "loss": 0.0286, + "step": 6056 + }, + { + "epoch": 2.825559701492537, + "grad_norm": 1.0045664589682328, + "learning_rate": 1.0266774395428947e-07, + "loss": 0.0244, + "step": 6058 + }, + { + "epoch": 2.826492537313433, + "grad_norm": 0.9398241600917389, + "learning_rate": 1.0157635234224971e-07, + "loss": 0.0282, + "step": 6060 + }, + { + "epoch": 2.8274253731343286, + "grad_norm": 0.9638086300700636, + "learning_rate": 1.0049073316442559e-07, + "loss": 0.0265, + "step": 6062 + }, + { + "epoch": 2.828358208955224, + "grad_norm": 1.06633468461235, + "learning_rate": 9.941088770013929e-08, + "loss": 0.0297, + "step": 6064 + }, + { + "epoch": 2.8292910447761193, + "grad_norm": 0.9251580272353037, + "learning_rate": 9.833681722190901e-08, + "loss": 0.0285, + "step": 6066 + }, + { + "epoch": 2.830223880597015, + "grad_norm": 0.9621616842150614, + "learning_rate": 9.726852299544953e-08, + "loss": 0.0264, + "step": 6068 + }, + { + "epoch": 2.8311567164179103, + "grad_norm": 0.8017067867848854, + "learning_rate": 9.620600627966659e-08, + "loss": 0.0235, + "step": 6070 + }, + { + "epoch": 2.832089552238806, + "grad_norm": 2.2184701837589484, + "learning_rate": 9.514926832665861e-08, + "loss": 0.0285, + "step": 6072 + }, + { + "epoch": 2.8330223880597014, + "grad_norm": 0.7971259494348162, + "learning_rate": 9.409831038171501e-08, + "loss": 0.0264, + "step": 6074 + }, + { + "epoch": 2.833955223880597, + "grad_norm": 1.082444967355677, + "learning_rate": 9.305313368331126e-08, + "loss": 0.0302, + "step": 6076 + }, + { + "epoch": 2.8348880597014925, + "grad_norm": 0.9740324947287216, + "learning_rate": 9.201373946311266e-08, + "loss": 0.0277, + "step": 6078 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.9693233271849888, + "learning_rate": 9.098012894596886e-08, + "loss": 0.0284, + "step": 6080 + }, + { + "epoch": 2.8367537313432836, + "grad_norm": 0.8613165274822283, + "learning_rate": 8.995230334991556e-08, + "loss": 0.0279, + "step": 6082 + }, + { + "epoch": 2.8376865671641793, + "grad_norm": 0.9483829784452807, + "learning_rate": 8.893026388616832e-08, + "loss": 0.0261, + "step": 6084 + }, + { + "epoch": 2.8386194029850746, + "grad_norm": 1.0098060586216793, + "learning_rate": 8.791401175912706e-08, + "loss": 0.0303, + "step": 6086 + }, + { + "epoch": 2.83955223880597, + "grad_norm": 1.1054269837399973, + "learning_rate": 8.690354816637048e-08, + "loss": 0.0308, + "step": 6088 + }, + { + "epoch": 2.8404850746268657, + "grad_norm": 1.068783945533865, + "learning_rate": 8.58988742986555e-08, + "loss": 0.0261, + "step": 6090 + }, + { + "epoch": 2.841417910447761, + "grad_norm": 0.8568263460183595, + "learning_rate": 8.489999133991789e-08, + "loss": 0.0255, + "step": 6092 + }, + { + "epoch": 2.842350746268657, + "grad_norm": 1.0088164392874945, + "learning_rate": 8.390690046726768e-08, + "loss": 0.0279, + "step": 6094 + }, + { + "epoch": 2.843283582089552, + "grad_norm": 0.8665430326713278, + "learning_rate": 8.291960285098877e-08, + "loss": 0.0244, + "step": 6096 + }, + { + "epoch": 2.844216417910448, + "grad_norm": 0.9040495958153504, + "learning_rate": 8.193809965454102e-08, + "loss": 0.027, + "step": 6098 + }, + { + "epoch": 2.845149253731343, + "grad_norm": 0.8918358245233772, + "learning_rate": 8.096239203455313e-08, + "loss": 0.0247, + "step": 6100 + }, + { + "epoch": 2.846082089552239, + "grad_norm": 1.0048720812225236, + "learning_rate": 7.999248114082536e-08, + "loss": 0.0248, + "step": 6102 + }, + { + "epoch": 2.8470149253731343, + "grad_norm": 0.8524361002017556, + "learning_rate": 7.902836811632786e-08, + "loss": 0.0266, + "step": 6104 + }, + { + "epoch": 2.84794776119403, + "grad_norm": 0.855140402063462, + "learning_rate": 7.807005409719515e-08, + "loss": 0.0239, + "step": 6106 + }, + { + "epoch": 2.8488805970149254, + "grad_norm": 1.0686415048668705, + "learning_rate": 7.711754021273276e-08, + "loss": 0.0291, + "step": 6108 + }, + { + "epoch": 2.8498134328358207, + "grad_norm": 0.8725245590138367, + "learning_rate": 7.617082758540673e-08, + "loss": 0.0227, + "step": 6110 + }, + { + "epoch": 2.8507462686567164, + "grad_norm": 0.8834451617455856, + "learning_rate": 7.522991733084905e-08, + "loss": 0.0255, + "step": 6112 + }, + { + "epoch": 2.851679104477612, + "grad_norm": 0.9748544937737061, + "learning_rate": 7.429481055785503e-08, + "loss": 0.0267, + "step": 6114 + }, + { + "epoch": 2.8526119402985075, + "grad_norm": 0.8702482515646909, + "learning_rate": 7.336550836837819e-08, + "loss": 0.0275, + "step": 6116 + }, + { + "epoch": 2.853544776119403, + "grad_norm": 0.7926249763968709, + "learning_rate": 7.244201185753364e-08, + "loss": 0.0235, + "step": 6118 + }, + { + "epoch": 2.8544776119402986, + "grad_norm": 1.0912402619676589, + "learning_rate": 7.152432211359472e-08, + "loss": 0.027, + "step": 6120 + }, + { + "epoch": 2.855410447761194, + "grad_norm": 1.0075662816982829, + "learning_rate": 7.061244021799141e-08, + "loss": 0.0271, + "step": 6122 + }, + { + "epoch": 2.8563432835820897, + "grad_norm": 0.9768209172507724, + "learning_rate": 6.970636724531021e-08, + "loss": 0.0281, + "step": 6124 + }, + { + "epoch": 2.857276119402985, + "grad_norm": 0.8339877883558028, + "learning_rate": 6.880610426329149e-08, + "loss": 0.0257, + "step": 6126 + }, + { + "epoch": 2.8582089552238807, + "grad_norm": 1.224494729962025, + "learning_rate": 6.791165233282992e-08, + "loss": 0.033, + "step": 6128 + }, + { + "epoch": 2.859141791044776, + "grad_norm": 0.8309826211366992, + "learning_rate": 6.702301250797128e-08, + "loss": 0.0261, + "step": 6130 + }, + { + "epoch": 2.8600746268656714, + "grad_norm": 0.9156184970303739, + "learning_rate": 6.614018583591287e-08, + "loss": 0.0285, + "step": 6132 + }, + { + "epoch": 2.861007462686567, + "grad_norm": 1.063081447354269, + "learning_rate": 6.526317335700083e-08, + "loss": 0.0311, + "step": 6134 + }, + { + "epoch": 2.861940298507463, + "grad_norm": 0.9681128399998323, + "learning_rate": 6.439197610473125e-08, + "loss": 0.0295, + "step": 6136 + }, + { + "epoch": 2.862873134328358, + "grad_norm": 0.8339319778717733, + "learning_rate": 6.352659510574565e-08, + "loss": 0.0255, + "step": 6138 + }, + { + "epoch": 2.8638059701492535, + "grad_norm": 0.9510334700077668, + "learning_rate": 6.266703137983221e-08, + "loss": 0.0222, + "step": 6140 + }, + { + "epoch": 2.8647388059701493, + "grad_norm": 1.1004684213983682, + "learning_rate": 6.181328593992508e-08, + "loss": 0.0278, + "step": 6142 + }, + { + "epoch": 2.8656716417910446, + "grad_norm": 1.0347689464621461, + "learning_rate": 6.096535979209894e-08, + "loss": 0.0315, + "step": 6144 + }, + { + "epoch": 2.8666044776119404, + "grad_norm": 0.9194327140627475, + "learning_rate": 6.012325393557505e-08, + "loss": 0.0253, + "step": 6146 + }, + { + "epoch": 2.8675373134328357, + "grad_norm": 0.9195629780953789, + "learning_rate": 5.928696936271128e-08, + "loss": 0.0259, + "step": 6148 + }, + { + "epoch": 2.8684701492537314, + "grad_norm": 0.867690935896647, + "learning_rate": 5.845650705900985e-08, + "loss": 0.0275, + "step": 6150 + }, + { + "epoch": 2.8694029850746268, + "grad_norm": 0.9709391500841479, + "learning_rate": 5.763186800310849e-08, + "loss": 0.028, + "step": 6152 + }, + { + "epoch": 2.8703358208955225, + "grad_norm": 0.920743512258816, + "learning_rate": 5.681305316678487e-08, + "loss": 0.0276, + "step": 6154 + }, + { + "epoch": 2.871268656716418, + "grad_norm": 1.0514477761920524, + "learning_rate": 5.600006351495213e-08, + "loss": 0.0299, + "step": 6156 + }, + { + "epoch": 2.8722014925373136, + "grad_norm": 0.9521379752950706, + "learning_rate": 5.519290000565891e-08, + "loss": 0.025, + "step": 6158 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 1.0000354064391062, + "learning_rate": 5.4391563590089345e-08, + "loss": 0.0277, + "step": 6160 + }, + { + "epoch": 2.8740671641791042, + "grad_norm": 0.9998472317484027, + "learning_rate": 5.359605521255862e-08, + "loss": 0.0271, + "step": 6162 + }, + { + "epoch": 2.875, + "grad_norm": 1.0154098658204735, + "learning_rate": 5.2806375810515173e-08, + "loss": 0.0288, + "step": 6164 + }, + { + "epoch": 2.8759328358208958, + "grad_norm": 0.8651533114830908, + "learning_rate": 5.202252631454019e-08, + "loss": 0.0275, + "step": 6166 + }, + { + "epoch": 2.876865671641791, + "grad_norm": 0.9536255093285815, + "learning_rate": 5.1244507648341436e-08, + "loss": 0.0242, + "step": 6168 + }, + { + "epoch": 2.8777985074626864, + "grad_norm": 0.9571731096409607, + "learning_rate": 5.0472320728757184e-08, + "loss": 0.0262, + "step": 6170 + }, + { + "epoch": 2.878731343283582, + "grad_norm": 0.9541813407147736, + "learning_rate": 4.970596646575399e-08, + "loss": 0.0319, + "step": 6172 + }, + { + "epoch": 2.8796641791044775, + "grad_norm": 1.0303653115831537, + "learning_rate": 4.894544576242333e-08, + "loss": 0.0267, + "step": 6174 + }, + { + "epoch": 2.8805970149253732, + "grad_norm": 0.864462744748314, + "learning_rate": 4.8190759514983866e-08, + "loss": 0.027, + "step": 6176 + }, + { + "epoch": 2.8815298507462686, + "grad_norm": 1.014102706273404, + "learning_rate": 4.744190861277864e-08, + "loss": 0.0248, + "step": 6178 + }, + { + "epoch": 2.8824626865671643, + "grad_norm": 1.2006667032261047, + "learning_rate": 4.669889393827287e-08, + "loss": 0.0316, + "step": 6180 + }, + { + "epoch": 2.8833955223880596, + "grad_norm": 0.8940806054512117, + "learning_rate": 4.5961716367055044e-08, + "loss": 0.0304, + "step": 6182 + }, + { + "epoch": 2.8843283582089554, + "grad_norm": 0.8938382426244617, + "learning_rate": 4.523037676783581e-08, + "loss": 0.0237, + "step": 6184 + }, + { + "epoch": 2.8852611940298507, + "grad_norm": 0.9887231931165578, + "learning_rate": 4.4504876002444683e-08, + "loss": 0.0255, + "step": 6186 + }, + { + "epoch": 2.8861940298507465, + "grad_norm": 0.8738112893845056, + "learning_rate": 4.3785214925831655e-08, + "loss": 0.0288, + "step": 6188 + }, + { + "epoch": 2.887126865671642, + "grad_norm": 1.024335918443421, + "learning_rate": 4.3071394386064444e-08, + "loss": 0.0288, + "step": 6190 + }, + { + "epoch": 2.888059701492537, + "grad_norm": 1.1255002002907022, + "learning_rate": 4.2363415224329076e-08, + "loss": 0.0279, + "step": 6192 + }, + { + "epoch": 2.888992537313433, + "grad_norm": 0.8826371849514132, + "learning_rate": 4.16612782749265e-08, + "loss": 0.0241, + "step": 6194 + }, + { + "epoch": 2.8899253731343286, + "grad_norm": 1.0465092663367443, + "learning_rate": 4.096498436527374e-08, + "loss": 0.0316, + "step": 6196 + }, + { + "epoch": 2.890858208955224, + "grad_norm": 1.0830772730773754, + "learning_rate": 4.027453431590278e-08, + "loss": 0.0275, + "step": 6198 + }, + { + "epoch": 2.8917910447761193, + "grad_norm": 0.9642422179559949, + "learning_rate": 3.9589928940457766e-08, + "loss": 0.0303, + "step": 6200 + }, + { + "epoch": 2.892723880597015, + "grad_norm": 0.9520275482238393, + "learning_rate": 3.891116904569725e-08, + "loss": 0.0278, + "step": 6202 + }, + { + "epoch": 2.8936567164179103, + "grad_norm": 1.0370943740202896, + "learning_rate": 3.823825543148918e-08, + "loss": 0.0259, + "step": 6204 + }, + { + "epoch": 2.894589552238806, + "grad_norm": 1.1011707446846262, + "learning_rate": 3.7571188890813685e-08, + "loss": 0.03, + "step": 6206 + }, + { + "epoch": 2.8955223880597014, + "grad_norm": 1.0173726680913833, + "learning_rate": 3.690997020975973e-08, + "loss": 0.03, + "step": 6208 + }, + { + "epoch": 2.896455223880597, + "grad_norm": 1.058358228923588, + "learning_rate": 3.6254600167524576e-08, + "loss": 0.0273, + "step": 6210 + }, + { + "epoch": 2.8973880597014925, + "grad_norm": 0.9030915878853645, + "learning_rate": 3.56050795364149e-08, + "loss": 0.0283, + "step": 6212 + }, + { + "epoch": 2.898320895522388, + "grad_norm": 0.9657589253456974, + "learning_rate": 3.496140908184287e-08, + "loss": 0.0248, + "step": 6214 + }, + { + "epoch": 2.8992537313432836, + "grad_norm": 0.8917914008530972, + "learning_rate": 3.432358956232673e-08, + "loss": 0.0289, + "step": 6216 + }, + { + "epoch": 2.9001865671641793, + "grad_norm": 1.0958380997281072, + "learning_rate": 3.3691621729490254e-08, + "loss": 0.0289, + "step": 6218 + }, + { + "epoch": 2.9011194029850746, + "grad_norm": 0.9187748425779532, + "learning_rate": 3.3065506328062155e-08, + "loss": 0.0283, + "step": 6220 + }, + { + "epoch": 2.90205223880597, + "grad_norm": 0.9134599893174407, + "learning_rate": 3.2445244095872796e-08, + "loss": 0.0255, + "step": 6222 + }, + { + "epoch": 2.9029850746268657, + "grad_norm": 1.0351918552003565, + "learning_rate": 3.183083576385637e-08, + "loss": 0.0279, + "step": 6224 + }, + { + "epoch": 2.903917910447761, + "grad_norm": 0.9153608814644609, + "learning_rate": 3.1222282056047605e-08, + "loss": 0.0287, + "step": 6226 + }, + { + "epoch": 2.904850746268657, + "grad_norm": 0.8807198315508802, + "learning_rate": 3.0619583689582845e-08, + "loss": 0.0251, + "step": 6228 + }, + { + "epoch": 2.905783582089552, + "grad_norm": 1.0102393493914026, + "learning_rate": 3.002274137469841e-08, + "loss": 0.0282, + "step": 6230 + }, + { + "epoch": 2.906716417910448, + "grad_norm": 1.079497308767095, + "learning_rate": 2.9431755814729456e-08, + "loss": 0.0318, + "step": 6232 + }, + { + "epoch": 2.907649253731343, + "grad_norm": 0.8345846685268756, + "learning_rate": 2.8846627706108354e-08, + "loss": 0.0238, + "step": 6234 + }, + { + "epoch": 2.908582089552239, + "grad_norm": 0.970867167127175, + "learning_rate": 2.826735773836631e-08, + "loss": 0.0235, + "step": 6236 + }, + { + "epoch": 2.9095149253731343, + "grad_norm": 0.9857067058881129, + "learning_rate": 2.7693946594130604e-08, + "loss": 0.0287, + "step": 6238 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.9140368482341609, + "learning_rate": 2.712639494912461e-08, + "loss": 0.0283, + "step": 6240 + }, + { + "epoch": 2.9113805970149254, + "grad_norm": 1.025972374574341, + "learning_rate": 2.6564703472166663e-08, + "loss": 0.0271, + "step": 6242 + }, + { + "epoch": 2.9123134328358207, + "grad_norm": 0.7858423153629281, + "learning_rate": 2.6008872825168397e-08, + "loss": 0.0258, + "step": 6244 + }, + { + "epoch": 2.9132462686567164, + "grad_norm": 0.8850157140085816, + "learning_rate": 2.5458903663135304e-08, + "loss": 0.0268, + "step": 6246 + }, + { + "epoch": 2.914179104477612, + "grad_norm": 0.9543044924610737, + "learning_rate": 2.4914796634166738e-08, + "loss": 0.0293, + "step": 6248 + }, + { + "epoch": 2.9151119402985075, + "grad_norm": 0.924077198749757, + "learning_rate": 2.4376552379453135e-08, + "loss": 0.026, + "step": 6250 + }, + { + "epoch": 2.916044776119403, + "grad_norm": 0.83228980205813, + "learning_rate": 2.384417153327545e-08, + "loss": 0.0234, + "step": 6252 + }, + { + "epoch": 2.9169776119402986, + "grad_norm": 0.8620306027234695, + "learning_rate": 2.331765472300629e-08, + "loss": 0.0262, + "step": 6254 + }, + { + "epoch": 2.917910447761194, + "grad_norm": 0.9031245188087478, + "learning_rate": 2.2797002569105998e-08, + "loss": 0.0267, + "step": 6256 + }, + { + "epoch": 2.9188432835820897, + "grad_norm": 0.8827627269674895, + "learning_rate": 2.2282215685126007e-08, + "loss": 0.0258, + "step": 6258 + }, + { + "epoch": 2.919776119402985, + "grad_norm": 0.952802571636724, + "learning_rate": 2.1773294677704947e-08, + "loss": 0.0282, + "step": 6260 + }, + { + "epoch": 2.9207089552238807, + "grad_norm": 1.0057131059426605, + "learning_rate": 2.1270240146568644e-08, + "loss": 0.0282, + "step": 6262 + }, + { + "epoch": 2.921641791044776, + "grad_norm": 0.9906956285539033, + "learning_rate": 2.077305268453067e-08, + "loss": 0.0272, + "step": 6264 + }, + { + "epoch": 2.9225746268656714, + "grad_norm": 0.9684122942210909, + "learning_rate": 2.028173287748958e-08, + "loss": 0.0323, + "step": 6266 + }, + { + "epoch": 2.923507462686567, + "grad_norm": 0.847667725451742, + "learning_rate": 1.9796281304430564e-08, + "loss": 0.0299, + "step": 6268 + }, + { + "epoch": 2.924440298507463, + "grad_norm": 0.7925191142475678, + "learning_rate": 1.9316698537421573e-08, + "loss": 0.0245, + "step": 6270 + }, + { + "epoch": 2.925373134328358, + "grad_norm": 1.1385562354438568, + "learning_rate": 1.88429851416172e-08, + "loss": 0.0275, + "step": 6272 + }, + { + "epoch": 2.9263059701492535, + "grad_norm": 0.8590721493865495, + "learning_rate": 1.8375141675253116e-08, + "loss": 0.027, + "step": 6274 + }, + { + "epoch": 2.9272388059701493, + "grad_norm": 0.8298284936217379, + "learning_rate": 1.7913168689648876e-08, + "loss": 0.0278, + "step": 6276 + }, + { + "epoch": 2.9281716417910446, + "grad_norm": 0.8712924189202683, + "learning_rate": 1.7457066729206773e-08, + "loss": 0.0263, + "step": 6278 + }, + { + "epoch": 2.9291044776119404, + "grad_norm": 0.8347084759818374, + "learning_rate": 1.7006836331407982e-08, + "loss": 0.0237, + "step": 6280 + }, + { + "epoch": 2.9300373134328357, + "grad_norm": 1.0056064708676893, + "learning_rate": 1.6562478026816987e-08, + "loss": 0.0255, + "step": 6282 + }, + { + "epoch": 2.9309701492537314, + "grad_norm": 1.0253670648975486, + "learning_rate": 1.6123992339077688e-08, + "loss": 0.0265, + "step": 6284 + }, + { + "epoch": 2.9319029850746268, + "grad_norm": 0.9581708770481409, + "learning_rate": 1.569137978491342e-08, + "loss": 0.0295, + "step": 6286 + }, + { + "epoch": 2.9328358208955225, + "grad_norm": 0.9625395716354909, + "learning_rate": 1.526464087412638e-08, + "loss": 0.0279, + "step": 6288 + }, + { + "epoch": 2.933768656716418, + "grad_norm": 0.9632669348201957, + "learning_rate": 1.4843776109597085e-08, + "loss": 0.0303, + "step": 6290 + }, + { + "epoch": 2.9347014925373136, + "grad_norm": 1.036470359074804, + "learning_rate": 1.4428785987283811e-08, + "loss": 0.0291, + "step": 6292 + }, + { + "epoch": 2.935634328358209, + "grad_norm": 0.9357231035398019, + "learning_rate": 1.4019670996222035e-08, + "loss": 0.0291, + "step": 6294 + }, + { + "epoch": 2.9365671641791042, + "grad_norm": 0.9355580162680617, + "learning_rate": 1.361643161852444e-08, + "loss": 0.034, + "step": 6296 + }, + { + "epoch": 2.9375, + "grad_norm": 0.9047004989253603, + "learning_rate": 1.3219068329378692e-08, + "loss": 0.0269, + "step": 6298 + }, + { + "epoch": 2.9384328358208958, + "grad_norm": 1.023104774114, + "learning_rate": 1.2827581597048555e-08, + "loss": 0.0298, + "step": 6300 + }, + { + "epoch": 2.939365671641791, + "grad_norm": 1.0934483715480892, + "learning_rate": 1.2441971882871661e-08, + "loss": 0.0254, + "step": 6302 + }, + { + "epoch": 2.9402985074626864, + "grad_norm": 1.0238497968664415, + "learning_rate": 1.2062239641262296e-08, + "loss": 0.027, + "step": 6304 + }, + { + "epoch": 2.941231343283582, + "grad_norm": 0.8175109564063253, + "learning_rate": 1.1688385319706397e-08, + "loss": 0.0279, + "step": 6306 + }, + { + "epoch": 2.9421641791044775, + "grad_norm": 0.983306906139509, + "learning_rate": 1.1320409358763774e-08, + "loss": 0.0266, + "step": 6308 + }, + { + "epoch": 2.9430970149253732, + "grad_norm": 0.8155616372891258, + "learning_rate": 1.095831219206811e-08, + "loss": 0.0264, + "step": 6310 + }, + { + "epoch": 2.9440298507462686, + "grad_norm": 1.05483652059521, + "learning_rate": 1.060209424632308e-08, + "loss": 0.0286, + "step": 6312 + }, + { + "epoch": 2.9449626865671643, + "grad_norm": 0.9180941499613573, + "learning_rate": 1.025175594130623e-08, + "loss": 0.0308, + "step": 6314 + }, + { + "epoch": 2.9458955223880596, + "grad_norm": 1.057002712330282, + "learning_rate": 9.907297689866202e-09, + "loss": 0.0283, + "step": 6316 + }, + { + "epoch": 2.9468283582089554, + "grad_norm": 1.1253102564493433, + "learning_rate": 9.568719897921075e-09, + "loss": 0.0303, + "step": 6318 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.9288836836839905, + "learning_rate": 9.236022964460023e-09, + "loss": 0.0242, + "step": 6320 + }, + { + "epoch": 2.9486940298507465, + "grad_norm": 0.9637668923295082, + "learning_rate": 8.90920728154221e-09, + "loss": 0.028, + "step": 6322 + }, + { + "epoch": 2.949626865671642, + "grad_norm": 0.9433711127474065, + "learning_rate": 8.58827323429623e-09, + "loss": 0.027, + "step": 6324 + }, + { + "epoch": 2.950559701492537, + "grad_norm": 0.8158995413991843, + "learning_rate": 8.273221200919557e-09, + "loss": 0.0247, + "step": 6326 + }, + { + "epoch": 2.951492537313433, + "grad_norm": 0.9681643099605649, + "learning_rate": 7.964051552677432e-09, + "loss": 0.0273, + "step": 6328 + }, + { + "epoch": 2.9524253731343286, + "grad_norm": 1.2526866086239252, + "learning_rate": 7.660764653903973e-09, + "loss": 0.0281, + "step": 6330 + }, + { + "epoch": 2.953358208955224, + "grad_norm": 1.094048312685975, + "learning_rate": 7.3633608620005125e-09, + "loss": 0.029, + "step": 6332 + }, + { + "epoch": 2.9542910447761193, + "grad_norm": 1.083859619444637, + "learning_rate": 7.071840527436147e-09, + "loss": 0.0257, + "step": 6334 + }, + { + "epoch": 2.955223880597015, + "grad_norm": 1.0005781231009265, + "learning_rate": 6.786203993745521e-09, + "loss": 0.0273, + "step": 6336 + }, + { + "epoch": 2.9561567164179103, + "grad_norm": 0.9873159201675941, + "learning_rate": 6.506451597531049e-09, + "loss": 0.0285, + "step": 6338 + }, + { + "epoch": 2.957089552238806, + "grad_norm": 0.8175862094658, + "learning_rate": 6.232583668460135e-09, + "loss": 0.0266, + "step": 6340 + }, + { + "epoch": 2.9580223880597014, + "grad_norm": 0.9556419677870821, + "learning_rate": 5.9646005292662845e-09, + "loss": 0.0316, + "step": 6342 + }, + { + "epoch": 2.958955223880597, + "grad_norm": 0.9863955999860845, + "learning_rate": 5.702502495747997e-09, + "loss": 0.028, + "step": 6344 + }, + { + "epoch": 2.9598880597014925, + "grad_norm": 0.932823706531385, + "learning_rate": 5.446289876768207e-09, + "loss": 0.0294, + "step": 6346 + }, + { + "epoch": 2.960820895522388, + "grad_norm": 0.8680199001481473, + "learning_rate": 5.195962974255953e-09, + "loss": 0.0256, + "step": 6348 + }, + { + "epoch": 2.9617537313432836, + "grad_norm": 0.8537228433311188, + "learning_rate": 4.951522083201376e-09, + "loss": 0.0252, + "step": 6350 + }, + { + "epoch": 2.9626865671641793, + "grad_norm": 0.9302734526930525, + "learning_rate": 4.712967491661835e-09, + "loss": 0.0328, + "step": 6352 + }, + { + "epoch": 2.9636194029850746, + "grad_norm": 0.9259714911734612, + "learning_rate": 4.48029948075579e-09, + "loss": 0.0229, + "step": 6354 + }, + { + "epoch": 2.96455223880597, + "grad_norm": 0.9973336943890746, + "learning_rate": 4.2535183246655844e-09, + "loss": 0.0281, + "step": 6356 + }, + { + "epoch": 2.9654850746268657, + "grad_norm": 1.0207690677170809, + "learning_rate": 4.0326242906363335e-09, + "loss": 0.0266, + "step": 6358 + }, + { + "epoch": 2.966417910447761, + "grad_norm": 0.9171689885593927, + "learning_rate": 3.817617638975369e-09, + "loss": 0.0277, + "step": 6360 + }, + { + "epoch": 2.967350746268657, + "grad_norm": 0.8686180948538196, + "learning_rate": 3.6084986230522366e-09, + "loss": 0.0271, + "step": 6362 + }, + { + "epoch": 2.968283582089552, + "grad_norm": 0.7886370663731553, + "learning_rate": 3.4052674892987026e-09, + "loss": 0.0238, + "step": 6364 + }, + { + "epoch": 2.969216417910448, + "grad_norm": 1.0177966708767086, + "learning_rate": 3.2079244772070804e-09, + "loss": 0.0275, + "step": 6366 + }, + { + "epoch": 2.970149253731343, + "grad_norm": 0.9196618009315258, + "learning_rate": 3.016469819332457e-09, + "loss": 0.0266, + "step": 6368 + }, + { + "epoch": 2.971082089552239, + "grad_norm": 0.7971439936377686, + "learning_rate": 2.8309037412904695e-09, + "loss": 0.0239, + "step": 6370 + }, + { + "epoch": 2.9720149253731343, + "grad_norm": 0.8620747348937791, + "learning_rate": 2.6512264617556405e-09, + "loss": 0.0253, + "step": 6372 + }, + { + "epoch": 2.97294776119403, + "grad_norm": 1.1855536878453208, + "learning_rate": 2.4774381924663747e-09, + "loss": 0.0279, + "step": 6374 + }, + { + "epoch": 2.9738805970149254, + "grad_norm": 0.9928672856584764, + "learning_rate": 2.3095391382182974e-09, + "loss": 0.0296, + "step": 6376 + }, + { + "epoch": 2.9748134328358207, + "grad_norm": 0.9231333294247348, + "learning_rate": 2.1475294968681393e-09, + "loss": 0.0253, + "step": 6378 + }, + { + "epoch": 2.9757462686567164, + "grad_norm": 0.9637582200766458, + "learning_rate": 1.9914094593326273e-09, + "loss": 0.0263, + "step": 6380 + }, + { + "epoch": 2.976679104477612, + "grad_norm": 1.118299323214901, + "learning_rate": 1.8411792095884839e-09, + "loss": 0.0259, + "step": 6382 + }, + { + "epoch": 2.9776119402985075, + "grad_norm": 0.8457213495206082, + "learning_rate": 1.6968389246702078e-09, + "loss": 0.0287, + "step": 6384 + }, + { + "epoch": 2.978544776119403, + "grad_norm": 0.8760950110479465, + "learning_rate": 1.5583887746722926e-09, + "loss": 0.0259, + "step": 6386 + }, + { + "epoch": 2.9794776119402986, + "grad_norm": 0.9701160754069891, + "learning_rate": 1.4258289227486732e-09, + "loss": 0.0277, + "step": 6388 + }, + { + "epoch": 2.980410447761194, + "grad_norm": 0.9854784089831389, + "learning_rate": 1.2991595251110601e-09, + "loss": 0.0314, + "step": 6390 + }, + { + "epoch": 2.9813432835820897, + "grad_norm": 0.9870630151905555, + "learning_rate": 1.1783807310300489e-09, + "loss": 0.0269, + "step": 6392 + }, + { + "epoch": 2.982276119402985, + "grad_norm": 0.8952968586690281, + "learning_rate": 1.0634926828351212e-09, + "loss": 0.0284, + "step": 6394 + }, + { + "epoch": 2.9832089552238807, + "grad_norm": 0.7978447765777141, + "learning_rate": 9.544955159129788e-10, + "loss": 0.0252, + "step": 6396 + }, + { + "epoch": 2.984141791044776, + "grad_norm": 0.9911363159165204, + "learning_rate": 8.513893587086542e-10, + "loss": 0.0259, + "step": 6398 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.8169854362980064, + "learning_rate": 7.541743327255102e-10, + "loss": 0.0227, + "step": 6400 + }, + { + "epoch": 2.986007462686567, + "grad_norm": 0.7691147539159162, + "learning_rate": 6.628505525246853e-10, + "loss": 0.0237, + "step": 6402 + }, + { + "epoch": 2.986940298507463, + "grad_norm": 1.1580080517708256, + "learning_rate": 5.77418125723983e-10, + "loss": 0.0342, + "step": 6404 + }, + { + "epoch": 2.987873134328358, + "grad_norm": 0.8169325748785082, + "learning_rate": 4.978771529989824e-10, + "loss": 0.0264, + "step": 6406 + }, + { + "epoch": 2.9888059701492535, + "grad_norm": 0.9819887823547255, + "learning_rate": 4.242277280841478e-10, + "loss": 0.0286, + "step": 6408 + }, + { + "epoch": 2.9897388059701493, + "grad_norm": 1.0286085389310848, + "learning_rate": 3.56469937768944e-10, + "loss": 0.0287, + "step": 6410 + }, + { + "epoch": 2.9906716417910446, + "grad_norm": 0.8886770760953744, + "learning_rate": 2.9460386190116594e-10, + "loss": 0.03, + "step": 6412 + }, + { + "epoch": 2.9916044776119404, + "grad_norm": 0.8067009248418416, + "learning_rate": 2.386295733852739e-10, + "loss": 0.0273, + "step": 6414 + }, + { + "epoch": 2.9925373134328357, + "grad_norm": 0.9329454148658527, + "learning_rate": 1.8854713818350356e-10, + "loss": 0.0263, + "step": 6416 + }, + { + "epoch": 2.9934701492537314, + "grad_norm": 0.9177801864653198, + "learning_rate": 1.4435661531420065e-10, + "loss": 0.0282, + "step": 6418 + }, + { + "epoch": 2.9944029850746268, + "grad_norm": 0.831949946740059, + "learning_rate": 1.0605805685237613e-10, + "loss": 0.0244, + "step": 6420 + }, + { + "epoch": 2.9953358208955225, + "grad_norm": 0.9174951548532697, + "learning_rate": 7.365150792970621e-11, + "loss": 0.0258, + "step": 6422 + }, + { + "epoch": 2.996268656716418, + "grad_norm": 1.0174082789220957, + "learning_rate": 4.7137006735642475e-11, + "loss": 0.0254, + "step": 6424 + }, + { + "epoch": 2.9972014925373136, + "grad_norm": 0.8765238929135598, + "learning_rate": 2.651458451519151e-11, + "loss": 0.0243, + "step": 6426 + }, + { + "epoch": 2.998134328358209, + "grad_norm": 1.0169919835877, + "learning_rate": 1.178426557058021e-11, + "loss": 0.0315, + "step": 6428 + }, + { + "epoch": 2.9990671641791042, + "grad_norm": 0.882103163342718, + "learning_rate": 2.94606726070068e-12, + "loss": 0.028, + "step": 6430 + }, + { + "epoch": 3.0, + "grad_norm": 1.3865468012985325, + "learning_rate": 0.0, + "loss": 0.026, + "step": 6432 + }, + { + "epoch": 3.0, + "step": 6432, + "total_flos": 601546217226240.0, + "train_loss": 0.11825174548714167, + "train_runtime": 22853.2367, + "train_samples_per_second": 18.011, + "train_steps_per_second": 0.281 + } + ], + "logging_steps": 2, + "max_steps": 6432, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 601546217226240.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}