{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 6498, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006156213928434013, "grad_norm": 3.09375, "learning_rate": 2.5641025641025643e-08, "loss": 1.2494438886642456, "step": 2 }, { "epoch": 0.0012312427856868025, "grad_norm": 3.75, "learning_rate": 7.692307692307694e-08, "loss": 1.9450684785842896, "step": 4 }, { "epoch": 0.001846864178530204, "grad_norm": 4.40625, "learning_rate": 1.282051282051282e-07, "loss": 1.6189125776290894, "step": 6 }, { "epoch": 0.002462485571373605, "grad_norm": 5.8125, "learning_rate": 1.7948717948717948e-07, "loss": 1.9541618824005127, "step": 8 }, { "epoch": 0.0030781069642170067, "grad_norm": 7.40625, "learning_rate": 2.307692307692308e-07, "loss": 2.2807464599609375, "step": 10 }, { "epoch": 0.003693728357060408, "grad_norm": 6.3125, "learning_rate": 2.820512820512821e-07, "loss": 1.4778738021850586, "step": 12 }, { "epoch": 0.004309349749903809, "grad_norm": 22.0, "learning_rate": 3.3333333333333335e-07, "loss": 2.5894782543182373, "step": 14 }, { "epoch": 0.00492497114274721, "grad_norm": 6.78125, "learning_rate": 3.846153846153847e-07, "loss": 1.8322064876556396, "step": 16 }, { "epoch": 0.005540592535590611, "grad_norm": 25.0, "learning_rate": 4.358974358974359e-07, "loss": 1.6440393924713135, "step": 18 }, { "epoch": 0.0061562139284340135, "grad_norm": 6.0625, "learning_rate": 4.871794871794872e-07, "loss": 1.502462387084961, "step": 20 }, { "epoch": 0.006771835321277415, "grad_norm": 5.5, "learning_rate": 5.384615384615386e-07, "loss": 1.8018434047698975, "step": 22 }, { "epoch": 0.007387456714120816, "grad_norm": 4.03125, "learning_rate": 5.897435897435898e-07, "loss": 1.8351409435272217, "step": 24 }, { "epoch": 0.008003078106964217, "grad_norm": 4.65625, "learning_rate": 6.41025641025641e-07, "loss": 1.6938329935073853, "step": 26 }, { "epoch": 0.008618699499807618, "grad_norm": 5.125, "learning_rate": 6.923076923076924e-07, "loss": 1.592248558998108, "step": 28 }, { "epoch": 0.00923432089265102, "grad_norm": 3.796875, "learning_rate": 7.435897435897436e-07, "loss": 1.804097294807434, "step": 30 }, { "epoch": 0.00984994228549442, "grad_norm": 6.65625, "learning_rate": 7.948717948717949e-07, "loss": 1.8579797744750977, "step": 32 }, { "epoch": 0.010465563678337822, "grad_norm": 4.1875, "learning_rate": 8.461538461538463e-07, "loss": 2.209737777709961, "step": 34 }, { "epoch": 0.011081185071181223, "grad_norm": 4.875, "learning_rate": 8.974358974358975e-07, "loss": 1.4839755296707153, "step": 36 }, { "epoch": 0.011696806464024625, "grad_norm": 12.6875, "learning_rate": 9.487179487179487e-07, "loss": 1.8842666149139404, "step": 38 }, { "epoch": 0.012312427856868027, "grad_norm": 5.5, "learning_rate": 1.0000000000000002e-06, "loss": 1.1992511749267578, "step": 40 }, { "epoch": 0.012928049249711427, "grad_norm": 8.3125, "learning_rate": 1.0512820512820514e-06, "loss": 1.2397454977035522, "step": 42 }, { "epoch": 0.01354367064255483, "grad_norm": 15.25, "learning_rate": 1.1025641025641026e-06, "loss": 2.012077569961548, "step": 44 }, { "epoch": 0.01415929203539823, "grad_norm": 3.0, "learning_rate": 1.153846153846154e-06, "loss": 1.4503027200698853, "step": 46 }, { "epoch": 0.014774913428241632, "grad_norm": 9.125, "learning_rate": 1.2051282051282053e-06, "loss": 1.7287070751190186, "step": 48 }, { "epoch": 0.015390534821085032, "grad_norm": 4.59375, "learning_rate": 1.2564102564102565e-06, "loss": 1.3663043975830078, "step": 50 }, { "epoch": 0.016006156213928435, "grad_norm": 6.25, "learning_rate": 1.307692307692308e-06, "loss": 1.6556223630905151, "step": 52 }, { "epoch": 0.016621777606771835, "grad_norm": 5.71875, "learning_rate": 1.358974358974359e-06, "loss": 1.983393907546997, "step": 54 }, { "epoch": 0.017237398999615235, "grad_norm": 9.75, "learning_rate": 1.4102564102564104e-06, "loss": 1.5182173252105713, "step": 56 }, { "epoch": 0.01785302039245864, "grad_norm": 4.8125, "learning_rate": 1.4615384615384618e-06, "loss": 1.9359819889068604, "step": 58 }, { "epoch": 0.01846864178530204, "grad_norm": 3.453125, "learning_rate": 1.5128205128205128e-06, "loss": 1.8004614114761353, "step": 60 }, { "epoch": 0.01908426317814544, "grad_norm": 5.78125, "learning_rate": 1.5641025641025642e-06, "loss": 1.513994812965393, "step": 62 }, { "epoch": 0.01969988457098884, "grad_norm": 2.71875, "learning_rate": 1.6153846153846157e-06, "loss": 1.17235267162323, "step": 64 }, { "epoch": 0.020315505963832244, "grad_norm": 13.4375, "learning_rate": 1.6666666666666667e-06, "loss": 1.682073712348938, "step": 66 }, { "epoch": 0.020931127356675645, "grad_norm": 2.640625, "learning_rate": 1.717948717948718e-06, "loss": 1.2705780267715454, "step": 68 }, { "epoch": 0.021546748749519045, "grad_norm": 6.3125, "learning_rate": 1.7692307692307695e-06, "loss": 2.2972326278686523, "step": 70 }, { "epoch": 0.022162370142362445, "grad_norm": 10.6875, "learning_rate": 1.8205128205128205e-06, "loss": 1.8854734897613525, "step": 72 }, { "epoch": 0.02277799153520585, "grad_norm": 5.71875, "learning_rate": 1.871794871794872e-06, "loss": 1.6820818185806274, "step": 74 }, { "epoch": 0.02339361292804925, "grad_norm": 11.0, "learning_rate": 1.9230769230769234e-06, "loss": 2.026367664337158, "step": 76 }, { "epoch": 0.02400923432089265, "grad_norm": 184.0, "learning_rate": 1.9743589743589744e-06, "loss": 1.9582500457763672, "step": 78 }, { "epoch": 0.024624855713736054, "grad_norm": 2.375, "learning_rate": 2.025641025641026e-06, "loss": 1.233195185661316, "step": 80 }, { "epoch": 0.025240477106579454, "grad_norm": 15.875, "learning_rate": 2.0769230769230773e-06, "loss": 1.5411051511764526, "step": 82 }, { "epoch": 0.025856098499422855, "grad_norm": 5.96875, "learning_rate": 2.1282051282051283e-06, "loss": 1.4217170476913452, "step": 84 }, { "epoch": 0.026471719892266255, "grad_norm": 5.03125, "learning_rate": 2.1794871794871797e-06, "loss": 1.4688079357147217, "step": 86 }, { "epoch": 0.02708734128510966, "grad_norm": 7.875, "learning_rate": 2.230769230769231e-06, "loss": 1.6037238836288452, "step": 88 }, { "epoch": 0.02770296267795306, "grad_norm": 13.9375, "learning_rate": 2.282051282051282e-06, "loss": 2.0153872966766357, "step": 90 }, { "epoch": 0.02831858407079646, "grad_norm": 4.96875, "learning_rate": 2.3333333333333336e-06, "loss": 1.4056789875030518, "step": 92 }, { "epoch": 0.02893420546363986, "grad_norm": 5.0625, "learning_rate": 2.384615384615385e-06, "loss": 1.8487026691436768, "step": 94 }, { "epoch": 0.029549826856483264, "grad_norm": 2.703125, "learning_rate": 2.435897435897436e-06, "loss": 1.5239802598953247, "step": 96 }, { "epoch": 0.030165448249326664, "grad_norm": 6.3125, "learning_rate": 2.4871794871794875e-06, "loss": 1.6807596683502197, "step": 98 }, { "epoch": 0.030781069642170065, "grad_norm": 12.9375, "learning_rate": 2.5384615384615385e-06, "loss": 1.1042921543121338, "step": 100 }, { "epoch": 0.03139669103501347, "grad_norm": 7.34375, "learning_rate": 2.5897435897435903e-06, "loss": 1.5535751581192017, "step": 102 }, { "epoch": 0.03201231242785687, "grad_norm": 6.28125, "learning_rate": 2.6410256410256413e-06, "loss": 1.560031771659851, "step": 104 }, { "epoch": 0.03262793382070027, "grad_norm": 8.6875, "learning_rate": 2.6923076923076923e-06, "loss": 1.714495301246643, "step": 106 }, { "epoch": 0.03324355521354367, "grad_norm": 2.890625, "learning_rate": 2.743589743589744e-06, "loss": 1.4674392938613892, "step": 108 }, { "epoch": 0.03385917660638707, "grad_norm": 2.859375, "learning_rate": 2.794871794871795e-06, "loss": 1.6619113683700562, "step": 110 }, { "epoch": 0.03447479799923047, "grad_norm": 21.625, "learning_rate": 2.846153846153846e-06, "loss": 1.9322985410690308, "step": 112 }, { "epoch": 0.03509041939207388, "grad_norm": 8.5625, "learning_rate": 2.897435897435898e-06, "loss": 1.994782567024231, "step": 114 }, { "epoch": 0.03570604078491728, "grad_norm": 2.8125, "learning_rate": 2.948717948717949e-06, "loss": 1.4228806495666504, "step": 116 }, { "epoch": 0.03632166217776068, "grad_norm": 14.0, "learning_rate": 3e-06, "loss": 1.8283569812774658, "step": 118 }, { "epoch": 0.03693728357060408, "grad_norm": 5.0, "learning_rate": 3.051282051282052e-06, "loss": 1.550577163696289, "step": 120 }, { "epoch": 0.03755290496344748, "grad_norm": 4.4375, "learning_rate": 3.102564102564103e-06, "loss": 1.4163514375686646, "step": 122 }, { "epoch": 0.03816852635629088, "grad_norm": 2.4375, "learning_rate": 3.153846153846154e-06, "loss": 1.0851366519927979, "step": 124 }, { "epoch": 0.03878414774913428, "grad_norm": 7.96875, "learning_rate": 3.205128205128206e-06, "loss": 1.3987257480621338, "step": 126 }, { "epoch": 0.03939976914197768, "grad_norm": 10.75, "learning_rate": 3.256410256410257e-06, "loss": 1.721047282218933, "step": 128 }, { "epoch": 0.04001539053482109, "grad_norm": 5.375, "learning_rate": 3.307692307692308e-06, "loss": 1.260140299797058, "step": 130 }, { "epoch": 0.04063101192766449, "grad_norm": 5.78125, "learning_rate": 3.358974358974359e-06, "loss": 1.6700891256332397, "step": 132 }, { "epoch": 0.04124663332050789, "grad_norm": 5.375, "learning_rate": 3.4102564102564107e-06, "loss": 1.6653907299041748, "step": 134 }, { "epoch": 0.04186225471335129, "grad_norm": 19.5, "learning_rate": 3.4615384615384617e-06, "loss": 0.9548298120498657, "step": 136 }, { "epoch": 0.04247787610619469, "grad_norm": 18.75, "learning_rate": 3.5128205128205127e-06, "loss": 1.584119439125061, "step": 138 }, { "epoch": 0.04309349749903809, "grad_norm": 9.8125, "learning_rate": 3.5641025641025646e-06, "loss": 1.7926617860794067, "step": 140 }, { "epoch": 0.04370911889188149, "grad_norm": 6.03125, "learning_rate": 3.6153846153846156e-06, "loss": 1.367248296737671, "step": 142 }, { "epoch": 0.04432474028472489, "grad_norm": 2.375, "learning_rate": 3.6666666666666666e-06, "loss": 1.4224942922592163, "step": 144 }, { "epoch": 0.0449403616775683, "grad_norm": 5.59375, "learning_rate": 3.7179487179487184e-06, "loss": 1.492388367652893, "step": 146 }, { "epoch": 0.0455559830704117, "grad_norm": 11.125, "learning_rate": 3.7692307692307694e-06, "loss": 1.9079093933105469, "step": 148 }, { "epoch": 0.0461716044632551, "grad_norm": 7.03125, "learning_rate": 3.8205128205128204e-06, "loss": 1.3468117713928223, "step": 150 }, { "epoch": 0.0467872258560985, "grad_norm": 12.8125, "learning_rate": 3.871794871794872e-06, "loss": 1.4502298831939697, "step": 152 }, { "epoch": 0.0474028472489419, "grad_norm": 9.6875, "learning_rate": 3.923076923076923e-06, "loss": 1.6394906044006348, "step": 154 }, { "epoch": 0.0480184686417853, "grad_norm": 5.9375, "learning_rate": 3.974358974358974e-06, "loss": 1.235146164894104, "step": 156 }, { "epoch": 0.0486340900346287, "grad_norm": 3.546875, "learning_rate": 4.025641025641026e-06, "loss": 1.3920354843139648, "step": 158 }, { "epoch": 0.04924971142747211, "grad_norm": 9.1875, "learning_rate": 4.076923076923077e-06, "loss": 1.631940245628357, "step": 160 }, { "epoch": 0.04986533282031551, "grad_norm": 2.96875, "learning_rate": 4.128205128205128e-06, "loss": 1.311639666557312, "step": 162 }, { "epoch": 0.05048095421315891, "grad_norm": 5.375, "learning_rate": 4.17948717948718e-06, "loss": 1.2662135362625122, "step": 164 }, { "epoch": 0.05109657560600231, "grad_norm": 13.75, "learning_rate": 4.230769230769231e-06, "loss": 1.3407542705535889, "step": 166 }, { "epoch": 0.05171219699884571, "grad_norm": 3.296875, "learning_rate": 4.282051282051282e-06, "loss": 1.7766584157943726, "step": 168 }, { "epoch": 0.05232781839168911, "grad_norm": 4.03125, "learning_rate": 4.333333333333334e-06, "loss": 1.4202091693878174, "step": 170 }, { "epoch": 0.05294343978453251, "grad_norm": 11.375, "learning_rate": 4.384615384615385e-06, "loss": 1.832815170288086, "step": 172 }, { "epoch": 0.05355906117737591, "grad_norm": 6.65625, "learning_rate": 4.435897435897436e-06, "loss": 1.2218002080917358, "step": 174 }, { "epoch": 0.05417468257021932, "grad_norm": 6.65625, "learning_rate": 4.487179487179488e-06, "loss": 1.5034165382385254, "step": 176 }, { "epoch": 0.05479030396306272, "grad_norm": 9.75, "learning_rate": 4.538461538461539e-06, "loss": 1.6898847818374634, "step": 178 }, { "epoch": 0.05540592535590612, "grad_norm": 15.3125, "learning_rate": 4.58974358974359e-06, "loss": 1.3020204305648804, "step": 180 }, { "epoch": 0.05602154674874952, "grad_norm": 7.4375, "learning_rate": 4.641025641025642e-06, "loss": 1.4048819541931152, "step": 182 }, { "epoch": 0.05663716814159292, "grad_norm": 4.59375, "learning_rate": 4.692307692307693e-06, "loss": 0.9551922082901001, "step": 184 }, { "epoch": 0.05725278953443632, "grad_norm": 5.59375, "learning_rate": 4.743589743589744e-06, "loss": 1.7097375392913818, "step": 186 }, { "epoch": 0.05786841092727972, "grad_norm": 11.8125, "learning_rate": 4.7948717948717955e-06, "loss": 1.3811229467391968, "step": 188 }, { "epoch": 0.05848403232012313, "grad_norm": 6.46875, "learning_rate": 4.8461538461538465e-06, "loss": 1.6789026260375977, "step": 190 }, { "epoch": 0.05909965371296653, "grad_norm": 4.3125, "learning_rate": 4.8974358974358975e-06, "loss": 1.3382556438446045, "step": 192 }, { "epoch": 0.05971527510580993, "grad_norm": 1.9453125, "learning_rate": 4.948717948717949e-06, "loss": 1.215793251991272, "step": 194 }, { "epoch": 0.06033089649865333, "grad_norm": 4.90625, "learning_rate": 5e-06, "loss": 1.4898529052734375, "step": 196 }, { "epoch": 0.06094651789149673, "grad_norm": 3.78125, "learning_rate": 4.999999006277585e-06, "loss": 1.278192162513733, "step": 198 }, { "epoch": 0.06156213928434013, "grad_norm": 2.9375, "learning_rate": 4.9999960251113246e-06, "loss": 1.3816601037979126, "step": 200 }, { "epoch": 0.06217776067718353, "grad_norm": 8.625, "learning_rate": 4.999991056504183e-06, "loss": 1.6115779876708984, "step": 202 }, { "epoch": 0.06279338207002694, "grad_norm": 5.46875, "learning_rate": 4.9999841004610975e-06, "loss": 1.047288417816162, "step": 204 }, { "epoch": 0.06340900346287033, "grad_norm": 5.8125, "learning_rate": 4.999975156988978e-06, "loss": 1.6929690837860107, "step": 206 }, { "epoch": 0.06402462485571374, "grad_norm": 9.625, "learning_rate": 4.999964226096716e-06, "loss": 1.6581546068191528, "step": 208 }, { "epoch": 0.06464024624855713, "grad_norm": 6.96875, "learning_rate": 4.999951307795171e-06, "loss": 1.750710129737854, "step": 210 }, { "epoch": 0.06525586764140054, "grad_norm": 8.5, "learning_rate": 4.999936402097182e-06, "loss": 1.7493253946304321, "step": 212 }, { "epoch": 0.06587148903424395, "grad_norm": 10.6875, "learning_rate": 4.999919509017559e-06, "loss": 1.6592433452606201, "step": 214 }, { "epoch": 0.06648711042708734, "grad_norm": 10.875, "learning_rate": 4.99990062857309e-06, "loss": 1.333007574081421, "step": 216 }, { "epoch": 0.06710273181993075, "grad_norm": 7.84375, "learning_rate": 4.999879760782537e-06, "loss": 1.5750339031219482, "step": 218 }, { "epoch": 0.06771835321277414, "grad_norm": 4.0625, "learning_rate": 4.999856905666636e-06, "loss": 1.4182538986206055, "step": 220 }, { "epoch": 0.06833397460561755, "grad_norm": 5.3125, "learning_rate": 4.9998320632481e-06, "loss": 1.384655237197876, "step": 222 }, { "epoch": 0.06894959599846094, "grad_norm": 3.125, "learning_rate": 4.999805233551616e-06, "loss": 1.140051007270813, "step": 224 }, { "epoch": 0.06956521739130435, "grad_norm": 2.109375, "learning_rate": 4.999776416603842e-06, "loss": 1.4289488792419434, "step": 226 }, { "epoch": 0.07018083878414776, "grad_norm": 10.0, "learning_rate": 4.999745612433418e-06, "loss": 1.4386173486709595, "step": 228 }, { "epoch": 0.07079646017699115, "grad_norm": 18.5, "learning_rate": 4.999712821070951e-06, "loss": 1.656036615371704, "step": 230 }, { "epoch": 0.07141208156983456, "grad_norm": 12.5625, "learning_rate": 4.99967804254903e-06, "loss": 1.302478313446045, "step": 232 }, { "epoch": 0.07202770296267795, "grad_norm": 7.09375, "learning_rate": 4.999641276902213e-06, "loss": 1.3145049810409546, "step": 234 }, { "epoch": 0.07264332435552136, "grad_norm": 3.734375, "learning_rate": 4.999602524167036e-06, "loss": 1.347992181777954, "step": 236 }, { "epoch": 0.07325894574836475, "grad_norm": 4.5, "learning_rate": 4.999561784382009e-06, "loss": 1.5541445016860962, "step": 238 }, { "epoch": 0.07387456714120816, "grad_norm": 6.1875, "learning_rate": 4.999519057587613e-06, "loss": 1.3125475645065308, "step": 240 }, { "epoch": 0.07449018853405155, "grad_norm": 3.265625, "learning_rate": 4.999474343826309e-06, "loss": 1.1761116981506348, "step": 242 }, { "epoch": 0.07510580992689496, "grad_norm": 4.5625, "learning_rate": 4.999427643142531e-06, "loss": 1.3364779949188232, "step": 244 }, { "epoch": 0.07572143131973837, "grad_norm": 6.5625, "learning_rate": 4.999378955582684e-06, "loss": 1.5807875394821167, "step": 246 }, { "epoch": 0.07633705271258176, "grad_norm": 5.40625, "learning_rate": 4.9993282811951514e-06, "loss": 1.500673532485962, "step": 248 }, { "epoch": 0.07695267410542517, "grad_norm": 3.765625, "learning_rate": 4.99927562003029e-06, "loss": 1.0658483505249023, "step": 250 }, { "epoch": 0.07756829549826856, "grad_norm": 8.125, "learning_rate": 4.999220972140427e-06, "loss": 1.478998064994812, "step": 252 }, { "epoch": 0.07818391689111197, "grad_norm": 4.84375, "learning_rate": 4.999164337579873e-06, "loss": 1.6777269840240479, "step": 254 }, { "epoch": 0.07879953828395536, "grad_norm": 9.0625, "learning_rate": 4.999105716404901e-06, "loss": 1.8494199514389038, "step": 256 }, { "epoch": 0.07941515967679877, "grad_norm": 4.875, "learning_rate": 4.999045108673769e-06, "loss": 1.463538646697998, "step": 258 }, { "epoch": 0.08003078106964218, "grad_norm": 1.890625, "learning_rate": 4.998982514446702e-06, "loss": 1.5650489330291748, "step": 260 }, { "epoch": 0.08064640246248557, "grad_norm": 10.0, "learning_rate": 4.9989179337859e-06, "loss": 1.5145618915557861, "step": 262 }, { "epoch": 0.08126202385532898, "grad_norm": 6.78125, "learning_rate": 4.998851366755541e-06, "loss": 1.5924221277236938, "step": 264 }, { "epoch": 0.08187764524817237, "grad_norm": 4.75, "learning_rate": 4.998782813421773e-06, "loss": 1.2749375104904175, "step": 266 }, { "epoch": 0.08249326664101578, "grad_norm": 5.0625, "learning_rate": 4.998712273852719e-06, "loss": 1.400267243385315, "step": 268 }, { "epoch": 0.08310888803385917, "grad_norm": 2.109375, "learning_rate": 4.998639748118476e-06, "loss": 1.3081163167953491, "step": 270 }, { "epoch": 0.08372450942670258, "grad_norm": 5.25, "learning_rate": 4.998565236291114e-06, "loss": 1.604859709739685, "step": 272 }, { "epoch": 0.08434013081954599, "grad_norm": 5.03125, "learning_rate": 4.9984887384446755e-06, "loss": 1.1373594999313354, "step": 274 }, { "epoch": 0.08495575221238938, "grad_norm": 2.59375, "learning_rate": 4.998410254655181e-06, "loss": 1.4989488124847412, "step": 276 }, { "epoch": 0.08557137360523279, "grad_norm": 4.84375, "learning_rate": 4.998329785000621e-06, "loss": 1.4071130752563477, "step": 278 }, { "epoch": 0.08618699499807618, "grad_norm": 14.0, "learning_rate": 4.998247329560959e-06, "loss": 1.5088939666748047, "step": 280 }, { "epoch": 0.08680261639091959, "grad_norm": 6.34375, "learning_rate": 4.9981628884181335e-06, "loss": 1.5704026222229004, "step": 282 }, { "epoch": 0.08741823778376298, "grad_norm": 17.625, "learning_rate": 4.9980764616560555e-06, "loss": 1.4247150421142578, "step": 284 }, { "epoch": 0.08803385917660639, "grad_norm": 25.5, "learning_rate": 4.997988049360608e-06, "loss": 2.021291494369507, "step": 286 }, { "epoch": 0.08864948056944978, "grad_norm": 12.8125, "learning_rate": 4.99789765161965e-06, "loss": 1.0340229272842407, "step": 288 }, { "epoch": 0.08926510196229319, "grad_norm": 14.4375, "learning_rate": 4.9978052685230105e-06, "loss": 1.5912001132965088, "step": 290 }, { "epoch": 0.0898807233551366, "grad_norm": 5.75, "learning_rate": 4.997710900162494e-06, "loss": 1.0831013917922974, "step": 292 }, { "epoch": 0.09049634474797999, "grad_norm": 10.75, "learning_rate": 4.997614546631875e-06, "loss": 1.4437825679779053, "step": 294 }, { "epoch": 0.0911119661408234, "grad_norm": 5.40625, "learning_rate": 4.997516208026902e-06, "loss": 1.5366003513336182, "step": 296 }, { "epoch": 0.09172758753366679, "grad_norm": 40.75, "learning_rate": 4.997415884445299e-06, "loss": 1.1476376056671143, "step": 298 }, { "epoch": 0.0923432089265102, "grad_norm": 8.625, "learning_rate": 4.997313575986756e-06, "loss": 1.4211304187774658, "step": 300 }, { "epoch": 0.09295883031935359, "grad_norm": 6.84375, "learning_rate": 4.997209282752943e-06, "loss": 1.0276066064834595, "step": 302 }, { "epoch": 0.093574451712197, "grad_norm": 18.375, "learning_rate": 4.997103004847496e-06, "loss": 1.928350806236267, "step": 304 }, { "epoch": 0.0941900731050404, "grad_norm": 5.90625, "learning_rate": 4.996994742376025e-06, "loss": 1.7541956901550293, "step": 306 }, { "epoch": 0.0948056944978838, "grad_norm": 2.28125, "learning_rate": 4.996884495446116e-06, "loss": 0.8440700769424438, "step": 308 }, { "epoch": 0.0954213158907272, "grad_norm": 18.375, "learning_rate": 4.996772264167321e-06, "loss": 2.0790932178497314, "step": 310 }, { "epoch": 0.0960369372835706, "grad_norm": 9.1875, "learning_rate": 4.996658048651169e-06, "loss": 1.836275577545166, "step": 312 }, { "epoch": 0.09665255867641401, "grad_norm": 7.0625, "learning_rate": 4.996541849011156e-06, "loss": 1.8032808303833008, "step": 314 }, { "epoch": 0.0972681800692574, "grad_norm": 5.5625, "learning_rate": 4.996423665362754e-06, "loss": 0.6839714050292969, "step": 316 }, { "epoch": 0.09788380146210081, "grad_norm": 3.21875, "learning_rate": 4.9963034978234035e-06, "loss": 1.235926866531372, "step": 318 }, { "epoch": 0.09849942285494422, "grad_norm": 4.25, "learning_rate": 4.99618134651252e-06, "loss": 0.9765989184379578, "step": 320 }, { "epoch": 0.09911504424778761, "grad_norm": 2.515625, "learning_rate": 4.996057211551485e-06, "loss": 1.3052048683166504, "step": 322 }, { "epoch": 0.09973066564063102, "grad_norm": 258.0, "learning_rate": 4.995931093063656e-06, "loss": 1.5037379264831543, "step": 324 }, { "epoch": 0.10034628703347441, "grad_norm": 6.375, "learning_rate": 4.99580299117436e-06, "loss": 1.4417295455932617, "step": 326 }, { "epoch": 0.10096190842631782, "grad_norm": 14.5625, "learning_rate": 4.995672906010893e-06, "loss": 1.6018487215042114, "step": 328 }, { "epoch": 0.10157752981916121, "grad_norm": 4.96875, "learning_rate": 4.9955408377025245e-06, "loss": 1.2220993041992188, "step": 330 }, { "epoch": 0.10219315121200462, "grad_norm": 5.125, "learning_rate": 4.995406786380496e-06, "loss": 1.2403991222381592, "step": 332 }, { "epoch": 0.10280877260484801, "grad_norm": 7.5, "learning_rate": 4.995270752178013e-06, "loss": 1.5319416522979736, "step": 334 }, { "epoch": 0.10342439399769142, "grad_norm": 2.3125, "learning_rate": 4.995132735230258e-06, "loss": 1.2290902137756348, "step": 336 }, { "epoch": 0.10404001539053483, "grad_norm": 9.75, "learning_rate": 4.994992735674382e-06, "loss": 1.1177040338516235, "step": 338 }, { "epoch": 0.10465563678337822, "grad_norm": 12.625, "learning_rate": 4.994850753649506e-06, "loss": 1.6134198904037476, "step": 340 }, { "epoch": 0.10527125817622163, "grad_norm": 29.125, "learning_rate": 4.99470678929672e-06, "loss": 1.218366026878357, "step": 342 }, { "epoch": 0.10588687956906502, "grad_norm": 7.375, "learning_rate": 4.9945608427590834e-06, "loss": 1.4785023927688599, "step": 344 }, { "epoch": 0.10650250096190843, "grad_norm": 2.890625, "learning_rate": 4.994412914181627e-06, "loss": 1.3170098066329956, "step": 346 }, { "epoch": 0.10711812235475182, "grad_norm": 5.4375, "learning_rate": 4.994263003711351e-06, "loss": 1.3708548545837402, "step": 348 }, { "epoch": 0.10773374374759523, "grad_norm": 6.21875, "learning_rate": 4.994111111497227e-06, "loss": 0.997755229473114, "step": 350 }, { "epoch": 0.10834936514043864, "grad_norm": 1.6953125, "learning_rate": 4.993957237690191e-06, "loss": 1.177932858467102, "step": 352 }, { "epoch": 0.10896498653328203, "grad_norm": 8.625, "learning_rate": 4.993801382443152e-06, "loss": 1.588719367980957, "step": 354 }, { "epoch": 0.10958060792612544, "grad_norm": 6.0625, "learning_rate": 4.993643545910986e-06, "loss": 1.4122413396835327, "step": 356 }, { "epoch": 0.11019622931896883, "grad_norm": 5.53125, "learning_rate": 4.99348372825054e-06, "loss": 1.3857849836349487, "step": 358 }, { "epoch": 0.11081185071181224, "grad_norm": 5.4375, "learning_rate": 4.993321929620627e-06, "loss": 1.5077461004257202, "step": 360 }, { "epoch": 0.11142747210465563, "grad_norm": 8.875, "learning_rate": 4.9931581501820315e-06, "loss": 1.5170936584472656, "step": 362 }, { "epoch": 0.11204309349749904, "grad_norm": 4.78125, "learning_rate": 4.992992390097503e-06, "loss": 1.5608993768692017, "step": 364 }, { "epoch": 0.11265871489034245, "grad_norm": 11.25, "learning_rate": 4.992824649531762e-06, "loss": 0.5916320085525513, "step": 366 }, { "epoch": 0.11327433628318584, "grad_norm": 11.1875, "learning_rate": 4.992654928651496e-06, "loss": 1.656692624092102, "step": 368 }, { "epoch": 0.11388995767602925, "grad_norm": 25.75, "learning_rate": 4.99248322762536e-06, "loss": 1.4191769361495972, "step": 370 }, { "epoch": 0.11450557906887264, "grad_norm": 2.125, "learning_rate": 4.992309546623978e-06, "loss": 1.0085556507110596, "step": 372 }, { "epoch": 0.11512120046171605, "grad_norm": 11.125, "learning_rate": 4.99213388581994e-06, "loss": 1.6765049695968628, "step": 374 }, { "epoch": 0.11573682185455944, "grad_norm": 3.328125, "learning_rate": 4.991956245387805e-06, "loss": 1.2738518714904785, "step": 376 }, { "epoch": 0.11635244324740285, "grad_norm": 5.46875, "learning_rate": 4.991776625504097e-06, "loss": 1.433395504951477, "step": 378 }, { "epoch": 0.11696806464024626, "grad_norm": 3.09375, "learning_rate": 4.991595026347309e-06, "loss": 1.188396692276001, "step": 380 }, { "epoch": 0.11758368603308965, "grad_norm": 2.0625, "learning_rate": 4.9914114480979e-06, "loss": 1.204866647720337, "step": 382 }, { "epoch": 0.11819930742593306, "grad_norm": 10.125, "learning_rate": 4.991225890938296e-06, "loss": 1.6785403490066528, "step": 384 }, { "epoch": 0.11881492881877645, "grad_norm": 7.21875, "learning_rate": 4.991038355052889e-06, "loss": 1.2951123714447021, "step": 386 }, { "epoch": 0.11943055021161986, "grad_norm": 5.46875, "learning_rate": 4.9908488406280375e-06, "loss": 1.6818344593048096, "step": 388 }, { "epoch": 0.12004617160446325, "grad_norm": 6.1875, "learning_rate": 4.990657347852067e-06, "loss": 0.9846329092979431, "step": 390 }, { "epoch": 0.12066179299730666, "grad_norm": 3.578125, "learning_rate": 4.990463876915268e-06, "loss": 1.4272600412368774, "step": 392 }, { "epoch": 0.12127741439015005, "grad_norm": 13.5, "learning_rate": 4.9902684280098964e-06, "loss": 1.4261621236801147, "step": 394 }, { "epoch": 0.12189303578299346, "grad_norm": 3.75, "learning_rate": 4.990071001330174e-06, "loss": 1.3102799654006958, "step": 396 }, { "epoch": 0.12250865717583687, "grad_norm": 5.96875, "learning_rate": 4.989871597072289e-06, "loss": 1.2802685499191284, "step": 398 }, { "epoch": 0.12312427856868026, "grad_norm": 4.375, "learning_rate": 4.989670215434393e-06, "loss": 1.4422342777252197, "step": 400 }, { "epoch": 0.12373989996152367, "grad_norm": 5.71875, "learning_rate": 4.989466856616604e-06, "loss": 1.5702648162841797, "step": 402 }, { "epoch": 0.12435552135436706, "grad_norm": 7.875, "learning_rate": 4.989261520821004e-06, "loss": 1.0299426317214966, "step": 404 }, { "epoch": 0.12497114274721047, "grad_norm": 7.53125, "learning_rate": 4.98905420825164e-06, "loss": 1.2051361799240112, "step": 406 }, { "epoch": 0.12558676414005387, "grad_norm": 9.6875, "learning_rate": 4.988844919114523e-06, "loss": 1.3921747207641602, "step": 408 }, { "epoch": 0.12620238553289725, "grad_norm": 2.875, "learning_rate": 4.988633653617628e-06, "loss": 1.3282568454742432, "step": 410 }, { "epoch": 0.12681800692574066, "grad_norm": 6.46875, "learning_rate": 4.9884204119708946e-06, "loss": 1.248948335647583, "step": 412 }, { "epoch": 0.12743362831858407, "grad_norm": 5.8125, "learning_rate": 4.988205194386225e-06, "loss": 1.5414966344833374, "step": 414 }, { "epoch": 0.12804924971142748, "grad_norm": 12.875, "learning_rate": 4.987988001077487e-06, "loss": 1.5431689023971558, "step": 416 }, { "epoch": 0.12866487110427088, "grad_norm": 14.8125, "learning_rate": 4.98776883226051e-06, "loss": 0.8197199106216431, "step": 418 }, { "epoch": 0.12928049249711426, "grad_norm": 17.0, "learning_rate": 4.987547688153087e-06, "loss": 1.4242963790893555, "step": 420 }, { "epoch": 0.12989611388995767, "grad_norm": 33.25, "learning_rate": 4.987324568974974e-06, "loss": 1.5151054859161377, "step": 422 }, { "epoch": 0.13051173528280108, "grad_norm": 8.125, "learning_rate": 4.987099474947889e-06, "loss": 1.0335174798965454, "step": 424 }, { "epoch": 0.13112735667564449, "grad_norm": 6.6875, "learning_rate": 4.986872406295513e-06, "loss": 1.4056828022003174, "step": 426 }, { "epoch": 0.1317429780684879, "grad_norm": 11.0625, "learning_rate": 4.9866433632434895e-06, "loss": 1.2344582080841064, "step": 428 }, { "epoch": 0.13235859946133127, "grad_norm": 5.3125, "learning_rate": 4.986412346019423e-06, "loss": 1.3656508922576904, "step": 430 }, { "epoch": 0.13297422085417468, "grad_norm": 10.6875, "learning_rate": 4.9861793548528835e-06, "loss": 1.4779958724975586, "step": 432 }, { "epoch": 0.1335898422470181, "grad_norm": 27.25, "learning_rate": 4.985944389975396e-06, "loss": 1.868300199508667, "step": 434 }, { "epoch": 0.1342054636398615, "grad_norm": 13.6875, "learning_rate": 4.98570745162045e-06, "loss": 1.876098394393921, "step": 436 }, { "epoch": 0.13482108503270487, "grad_norm": 4.84375, "learning_rate": 4.985468540023501e-06, "loss": 1.430997371673584, "step": 438 }, { "epoch": 0.13543670642554828, "grad_norm": 14.4375, "learning_rate": 4.985227655421956e-06, "loss": 0.9740759134292603, "step": 440 }, { "epoch": 0.1360523278183917, "grad_norm": 9.6875, "learning_rate": 4.984984798055189e-06, "loss": 1.6053884029388428, "step": 442 }, { "epoch": 0.1366679492112351, "grad_norm": 8.5, "learning_rate": 4.984739968164534e-06, "loss": 1.3193613290786743, "step": 444 }, { "epoch": 0.1372835706040785, "grad_norm": 6.46875, "learning_rate": 4.9844931659932825e-06, "loss": 1.4540421962738037, "step": 446 }, { "epoch": 0.13789919199692188, "grad_norm": 18.875, "learning_rate": 4.984244391786688e-06, "loss": 2.0545427799224854, "step": 448 }, { "epoch": 0.1385148133897653, "grad_norm": 5.40625, "learning_rate": 4.983993645791962e-06, "loss": 1.3011374473571777, "step": 450 }, { "epoch": 0.1391304347826087, "grad_norm": 8.6875, "learning_rate": 4.9837409282582795e-06, "loss": 1.555488109588623, "step": 452 }, { "epoch": 0.1397460561754521, "grad_norm": 9.625, "learning_rate": 4.983486239436768e-06, "loss": 1.208080530166626, "step": 454 }, { "epoch": 0.1403616775682955, "grad_norm": 7.34375, "learning_rate": 4.983229579580519e-06, "loss": 1.504934310913086, "step": 456 }, { "epoch": 0.1409772989611389, "grad_norm": 23.75, "learning_rate": 4.982970948944581e-06, "loss": 1.4824737310409546, "step": 458 }, { "epoch": 0.1415929203539823, "grad_norm": 6.3125, "learning_rate": 4.98271034778596e-06, "loss": 1.8614106178283691, "step": 460 }, { "epoch": 0.1422085417468257, "grad_norm": 10.0625, "learning_rate": 4.982447776363625e-06, "loss": 1.7203400135040283, "step": 462 }, { "epoch": 0.1428241631396691, "grad_norm": 5.71875, "learning_rate": 4.982183234938495e-06, "loss": 1.408145785331726, "step": 464 }, { "epoch": 0.1434397845325125, "grad_norm": 6.625, "learning_rate": 4.9819167237734515e-06, "loss": 1.392209768295288, "step": 466 }, { "epoch": 0.1440554059253559, "grad_norm": 9.5625, "learning_rate": 4.981648243133334e-06, "loss": 1.7749637365341187, "step": 468 }, { "epoch": 0.1446710273181993, "grad_norm": 3.96875, "learning_rate": 4.9813777932849365e-06, "loss": 1.681519865989685, "step": 470 }, { "epoch": 0.14528664871104272, "grad_norm": 5.875, "learning_rate": 4.981105374497012e-06, "loss": 1.4043043851852417, "step": 472 }, { "epoch": 0.14590227010388612, "grad_norm": 6.75, "learning_rate": 4.9808309870402685e-06, "loss": 1.5208261013031006, "step": 474 }, { "epoch": 0.1465178914967295, "grad_norm": 6.4375, "learning_rate": 4.980554631187371e-06, "loss": 1.7243291139602661, "step": 476 }, { "epoch": 0.1471335128895729, "grad_norm": 2.34375, "learning_rate": 4.980276307212941e-06, "loss": 1.0319477319717407, "step": 478 }, { "epoch": 0.14774913428241632, "grad_norm": 6.84375, "learning_rate": 4.9799960153935555e-06, "loss": 1.298151969909668, "step": 480 }, { "epoch": 0.14836475567525972, "grad_norm": 4.75, "learning_rate": 4.9797137560077456e-06, "loss": 1.3441214561462402, "step": 482 }, { "epoch": 0.1489803770681031, "grad_norm": 5.84375, "learning_rate": 4.979429529335999e-06, "loss": 1.2724394798278809, "step": 484 }, { "epoch": 0.1495959984609465, "grad_norm": 7.875, "learning_rate": 4.97914333566076e-06, "loss": 1.5625994205474854, "step": 486 }, { "epoch": 0.15021161985378992, "grad_norm": 7.125, "learning_rate": 4.978855175266423e-06, "loss": 1.6065152883529663, "step": 488 }, { "epoch": 0.15082724124663333, "grad_norm": 11.625, "learning_rate": 4.978565048439341e-06, "loss": 1.4982917308807373, "step": 490 }, { "epoch": 0.15144286263947673, "grad_norm": 5.9375, "learning_rate": 4.9782729554678185e-06, "loss": 1.2153326272964478, "step": 492 }, { "epoch": 0.1520584840323201, "grad_norm": 9.125, "learning_rate": 4.977978896642117e-06, "loss": 0.9097068309783936, "step": 494 }, { "epoch": 0.15267410542516352, "grad_norm": 7.3125, "learning_rate": 4.9776828722544465e-06, "loss": 1.5312962532043457, "step": 496 }, { "epoch": 0.15328972681800693, "grad_norm": 7.75, "learning_rate": 4.977384882598976e-06, "loss": 1.705657958984375, "step": 498 }, { "epoch": 0.15390534821085033, "grad_norm": 8.5625, "learning_rate": 4.9770849279718215e-06, "loss": 1.2420040369033813, "step": 500 }, { "epoch": 0.15452096960369374, "grad_norm": 8.125, "learning_rate": 4.9767830086710565e-06, "loss": 1.4940606355667114, "step": 502 }, { "epoch": 0.15513659099653712, "grad_norm": 7.0625, "learning_rate": 4.976479124996705e-06, "loss": 1.698551058769226, "step": 504 }, { "epoch": 0.15575221238938053, "grad_norm": 4.65625, "learning_rate": 4.976173277250742e-06, "loss": 1.4665323495864868, "step": 506 }, { "epoch": 0.15636783378222394, "grad_norm": 6.1875, "learning_rate": 4.975865465737096e-06, "loss": 1.350402593612671, "step": 508 }, { "epoch": 0.15698345517506734, "grad_norm": 6.0, "learning_rate": 4.9755556907616455e-06, "loss": 1.5278868675231934, "step": 510 }, { "epoch": 0.15759907656791072, "grad_norm": 11.6875, "learning_rate": 4.9752439526322224e-06, "loss": 1.794555902481079, "step": 512 }, { "epoch": 0.15821469796075413, "grad_norm": 7.71875, "learning_rate": 4.974930251658606e-06, "loss": 1.5282689332962036, "step": 514 }, { "epoch": 0.15883031935359754, "grad_norm": 11.75, "learning_rate": 4.97461458815253e-06, "loss": 1.35675048828125, "step": 516 }, { "epoch": 0.15944594074644095, "grad_norm": 3.734375, "learning_rate": 4.9742969624276735e-06, "loss": 1.4859691858291626, "step": 518 }, { "epoch": 0.16006156213928435, "grad_norm": 6.78125, "learning_rate": 4.9739773747996715e-06, "loss": 1.4229437112808228, "step": 520 }, { "epoch": 0.16067718353212773, "grad_norm": 16.125, "learning_rate": 4.973655825586102e-06, "loss": 1.8158589601516724, "step": 522 }, { "epoch": 0.16129280492497114, "grad_norm": 12.375, "learning_rate": 4.973332315106499e-06, "loss": 1.4468257427215576, "step": 524 }, { "epoch": 0.16190842631781455, "grad_norm": 9.375, "learning_rate": 4.97300684368234e-06, "loss": 1.7638283967971802, "step": 526 }, { "epoch": 0.16252404771065795, "grad_norm": 4.3125, "learning_rate": 4.972679411637053e-06, "loss": 1.5149681568145752, "step": 528 }, { "epoch": 0.16313966910350133, "grad_norm": 6.96875, "learning_rate": 4.972350019296017e-06, "loss": 0.9841006994247437, "step": 530 }, { "epoch": 0.16375529049634474, "grad_norm": 2.671875, "learning_rate": 4.972018666986554e-06, "loss": 1.3767181634902954, "step": 532 }, { "epoch": 0.16437091188918815, "grad_norm": 8.75, "learning_rate": 4.971685355037938e-06, "loss": 1.373497724533081, "step": 534 }, { "epoch": 0.16498653328203156, "grad_norm": 20.0, "learning_rate": 4.971350083781387e-06, "loss": 1.4224358797073364, "step": 536 }, { "epoch": 0.16560215467487496, "grad_norm": 9.5625, "learning_rate": 4.971012853550069e-06, "loss": 1.730084776878357, "step": 538 }, { "epoch": 0.16621777606771834, "grad_norm": 6.78125, "learning_rate": 4.970673664679097e-06, "loss": 1.3526341915130615, "step": 540 }, { "epoch": 0.16683339746056175, "grad_norm": 7.75, "learning_rate": 4.9703325175055285e-06, "loss": 1.5470267534255981, "step": 542 }, { "epoch": 0.16744901885340516, "grad_norm": 7.4375, "learning_rate": 4.969989412368371e-06, "loss": 1.348077654838562, "step": 544 }, { "epoch": 0.16806464024624856, "grad_norm": 4.5, "learning_rate": 4.969644349608576e-06, "loss": 0.9325212836265564, "step": 546 }, { "epoch": 0.16868026163909197, "grad_norm": 7.34375, "learning_rate": 4.969297329569039e-06, "loss": 1.2758643627166748, "step": 548 }, { "epoch": 0.16929588303193535, "grad_norm": 4.75, "learning_rate": 4.968948352594604e-06, "loss": 1.388833999633789, "step": 550 }, { "epoch": 0.16991150442477876, "grad_norm": 11.8125, "learning_rate": 4.968597419032053e-06, "loss": 1.6071833372116089, "step": 552 }, { "epoch": 0.17052712581762217, "grad_norm": 10.1875, "learning_rate": 4.96824452923012e-06, "loss": 1.414900779724121, "step": 554 }, { "epoch": 0.17114274721046557, "grad_norm": 20.875, "learning_rate": 4.967889683539479e-06, "loss": 1.6497011184692383, "step": 556 }, { "epoch": 0.17175836860330895, "grad_norm": 6.3125, "learning_rate": 4.9675328823127465e-06, "loss": 1.6876095533370972, "step": 558 }, { "epoch": 0.17237398999615236, "grad_norm": 8.25, "learning_rate": 4.967174125904486e-06, "loss": 1.365379810333252, "step": 560 }, { "epoch": 0.17298961138899577, "grad_norm": 3.515625, "learning_rate": 4.9668134146712e-06, "loss": 0.9860933423042297, "step": 562 }, { "epoch": 0.17360523278183917, "grad_norm": 11.5, "learning_rate": 4.966450748971336e-06, "loss": 1.3949452638626099, "step": 564 }, { "epoch": 0.17422085417468258, "grad_norm": 8.625, "learning_rate": 4.966086129165283e-06, "loss": 0.9252087473869324, "step": 566 }, { "epoch": 0.17483647556752596, "grad_norm": 5.15625, "learning_rate": 4.9657195556153725e-06, "loss": 1.239025592803955, "step": 568 }, { "epoch": 0.17545209696036937, "grad_norm": 13.0, "learning_rate": 4.965351028685876e-06, "loss": 1.5434101819992065, "step": 570 }, { "epoch": 0.17606771835321278, "grad_norm": 5.28125, "learning_rate": 4.964980548743009e-06, "loss": 1.3591309785842896, "step": 572 }, { "epoch": 0.17668333974605618, "grad_norm": 6.8125, "learning_rate": 4.964608116154922e-06, "loss": 1.2411075830459595, "step": 574 }, { "epoch": 0.17729896113889956, "grad_norm": 8.4375, "learning_rate": 4.9642337312917125e-06, "loss": 1.6506497859954834, "step": 576 }, { "epoch": 0.17791458253174297, "grad_norm": 5.46875, "learning_rate": 4.963857394525414e-06, "loss": 0.9767615795135498, "step": 578 }, { "epoch": 0.17853020392458638, "grad_norm": 6.875, "learning_rate": 4.963479106230001e-06, "loss": 0.9908381104469299, "step": 580 }, { "epoch": 0.17914582531742979, "grad_norm": 43.75, "learning_rate": 4.963098866781387e-06, "loss": 1.3052326440811157, "step": 582 }, { "epoch": 0.1797614467102732, "grad_norm": 15.25, "learning_rate": 4.9627166765574255e-06, "loss": 1.7691961526870728, "step": 584 }, { "epoch": 0.18037706810311657, "grad_norm": 67.0, "learning_rate": 4.962332535937906e-06, "loss": 1.5837366580963135, "step": 586 }, { "epoch": 0.18099268949595998, "grad_norm": 4.09375, "learning_rate": 4.961946445304559e-06, "loss": 1.6570383310317993, "step": 588 }, { "epoch": 0.1816083108888034, "grad_norm": 10.875, "learning_rate": 4.961558405041048e-06, "loss": 1.3879214525222778, "step": 590 }, { "epoch": 0.1822239322816468, "grad_norm": 4.0625, "learning_rate": 4.961168415532983e-06, "loss": 1.4827848672866821, "step": 592 }, { "epoch": 0.1828395536744902, "grad_norm": 6.625, "learning_rate": 4.9607764771679e-06, "loss": 1.458146095275879, "step": 594 }, { "epoch": 0.18345517506733358, "grad_norm": 13.375, "learning_rate": 4.960382590335281e-06, "loss": 1.4375436305999756, "step": 596 }, { "epoch": 0.184070796460177, "grad_norm": 5.65625, "learning_rate": 4.959986755426538e-06, "loss": 0.9506613612174988, "step": 598 }, { "epoch": 0.1846864178530204, "grad_norm": 11.375, "learning_rate": 4.95958897283502e-06, "loss": 0.8702154159545898, "step": 600 }, { "epoch": 0.1853020392458638, "grad_norm": 21.0, "learning_rate": 4.959189242956015e-06, "loss": 1.3946590423583984, "step": 602 }, { "epoch": 0.18591766063870718, "grad_norm": 7.3125, "learning_rate": 4.958787566186743e-06, "loss": 1.4666697978973389, "step": 604 }, { "epoch": 0.1865332820315506, "grad_norm": 8.0, "learning_rate": 4.958383942926358e-06, "loss": 1.4276539087295532, "step": 606 }, { "epoch": 0.187148903424394, "grad_norm": 4.25, "learning_rate": 4.95797837357595e-06, "loss": 1.5988221168518066, "step": 608 }, { "epoch": 0.1877645248172374, "grad_norm": 4.84375, "learning_rate": 4.957570858538543e-06, "loss": 1.6277081966400146, "step": 610 }, { "epoch": 0.1883801462100808, "grad_norm": 42.25, "learning_rate": 4.957161398219092e-06, "loss": 0.6431888341903687, "step": 612 }, { "epoch": 0.1889957676029242, "grad_norm": 23.25, "learning_rate": 4.956749993024489e-06, "loss": 1.298551321029663, "step": 614 }, { "epoch": 0.1896113889957676, "grad_norm": 7.78125, "learning_rate": 4.956336643363556e-06, "loss": 1.4090662002563477, "step": 616 }, { "epoch": 0.190227010388611, "grad_norm": 9.625, "learning_rate": 4.955921349647047e-06, "loss": 1.3029241561889648, "step": 618 }, { "epoch": 0.1908426317814544, "grad_norm": 17.375, "learning_rate": 4.95550411228765e-06, "loss": 0.9873400926589966, "step": 620 }, { "epoch": 0.1914582531742978, "grad_norm": 7.5625, "learning_rate": 4.955084931699982e-06, "loss": 1.5364199876785278, "step": 622 }, { "epoch": 0.1920738745671412, "grad_norm": 16.875, "learning_rate": 4.954663808300593e-06, "loss": 1.638574242591858, "step": 624 }, { "epoch": 0.1926894959599846, "grad_norm": 13.1875, "learning_rate": 4.954240742507961e-06, "loss": 1.0367610454559326, "step": 626 }, { "epoch": 0.19330511735282802, "grad_norm": 12.25, "learning_rate": 4.9538157347424985e-06, "loss": 1.7227222919464111, "step": 628 }, { "epoch": 0.19392073874567142, "grad_norm": 8.5625, "learning_rate": 4.953388785426544e-06, "loss": 1.8985217809677124, "step": 630 }, { "epoch": 0.1945363601385148, "grad_norm": 8.6875, "learning_rate": 4.952959894984365e-06, "loss": 1.3614797592163086, "step": 632 }, { "epoch": 0.1951519815313582, "grad_norm": 4.375, "learning_rate": 4.952529063842163e-06, "loss": 1.24729323387146, "step": 634 }, { "epoch": 0.19576760292420162, "grad_norm": 6.8125, "learning_rate": 4.952096292428062e-06, "loss": 1.5064369440078735, "step": 636 }, { "epoch": 0.19638322431704502, "grad_norm": 6.53125, "learning_rate": 4.951661581172117e-06, "loss": 1.2572686672210693, "step": 638 }, { "epoch": 0.19699884570988843, "grad_norm": 9.125, "learning_rate": 4.951224930506311e-06, "loss": 1.4917693138122559, "step": 640 }, { "epoch": 0.1976144671027318, "grad_norm": 6.15625, "learning_rate": 4.950786340864553e-06, "loss": 1.6578432321548462, "step": 642 }, { "epoch": 0.19823008849557522, "grad_norm": 12.6875, "learning_rate": 4.95034581268268e-06, "loss": 1.349355697631836, "step": 644 }, { "epoch": 0.19884570988841863, "grad_norm": 6.125, "learning_rate": 4.9499033463984535e-06, "loss": 1.4282751083374023, "step": 646 }, { "epoch": 0.19946133128126203, "grad_norm": 20.0, "learning_rate": 4.9494589424515636e-06, "loss": 1.1630656719207764, "step": 648 }, { "epoch": 0.2000769526741054, "grad_norm": 5.65625, "learning_rate": 4.949012601283624e-06, "loss": 1.5436525344848633, "step": 650 }, { "epoch": 0.20069257406694882, "grad_norm": 10.375, "learning_rate": 4.948564323338174e-06, "loss": 1.4936554431915283, "step": 652 }, { "epoch": 0.20130819545979223, "grad_norm": 4.875, "learning_rate": 4.948114109060677e-06, "loss": 1.0556228160858154, "step": 654 }, { "epoch": 0.20192381685263563, "grad_norm": 10.125, "learning_rate": 4.947661958898521e-06, "loss": 1.6761348247528076, "step": 656 }, { "epoch": 0.20253943824547904, "grad_norm": 7.3125, "learning_rate": 4.947207873301018e-06, "loss": 1.4784631729125977, "step": 658 }, { "epoch": 0.20315505963832242, "grad_norm": 4.8125, "learning_rate": 4.946751852719403e-06, "loss": 1.4860063791275024, "step": 660 }, { "epoch": 0.20377068103116583, "grad_norm": 5.75, "learning_rate": 4.946293897606833e-06, "loss": 1.3539552688598633, "step": 662 }, { "epoch": 0.20438630242400924, "grad_norm": 7.34375, "learning_rate": 4.945834008418391e-06, "loss": 1.3625863790512085, "step": 664 }, { "epoch": 0.20500192381685264, "grad_norm": 3.578125, "learning_rate": 4.945372185611076e-06, "loss": 0.7780288457870483, "step": 666 }, { "epoch": 0.20561754520969602, "grad_norm": 7.84375, "learning_rate": 4.9449084296438135e-06, "loss": 1.1947097778320312, "step": 668 }, { "epoch": 0.20623316660253943, "grad_norm": 3.921875, "learning_rate": 4.944442740977447e-06, "loss": 1.4437352418899536, "step": 670 }, { "epoch": 0.20684878799538284, "grad_norm": 5.8125, "learning_rate": 4.943975120074743e-06, "loss": 1.557558298110962, "step": 672 }, { "epoch": 0.20746440938822625, "grad_norm": 15.8125, "learning_rate": 4.943505567400387e-06, "loss": 1.3439304828643799, "step": 674 }, { "epoch": 0.20808003078106965, "grad_norm": 38.25, "learning_rate": 4.943034083420983e-06, "loss": 1.2202385663986206, "step": 676 }, { "epoch": 0.20869565217391303, "grad_norm": 5.15625, "learning_rate": 4.942560668605055e-06, "loss": 1.6866698265075684, "step": 678 }, { "epoch": 0.20931127356675644, "grad_norm": 9.0625, "learning_rate": 4.942085323423048e-06, "loss": 1.5172860622406006, "step": 680 }, { "epoch": 0.20992689495959985, "grad_norm": 7.0625, "learning_rate": 4.941608048347321e-06, "loss": 1.3118488788604736, "step": 682 }, { "epoch": 0.21054251635244325, "grad_norm": 7.28125, "learning_rate": 4.941128843852152e-06, "loss": 1.3376363515853882, "step": 684 }, { "epoch": 0.21115813774528666, "grad_norm": 10.625, "learning_rate": 4.940647710413741e-06, "loss": 1.5807679891586304, "step": 686 }, { "epoch": 0.21177375913813004, "grad_norm": 7.46875, "learning_rate": 4.940164648510197e-06, "loss": 1.8470110893249512, "step": 688 }, { "epoch": 0.21238938053097345, "grad_norm": 7.6875, "learning_rate": 4.939679658621552e-06, "loss": 1.506554126739502, "step": 690 }, { "epoch": 0.21300500192381686, "grad_norm": 11.3125, "learning_rate": 4.9391927412297525e-06, "loss": 1.1695079803466797, "step": 692 }, { "epoch": 0.21362062331666026, "grad_norm": 12.0625, "learning_rate": 4.938703896818655e-06, "loss": 1.4596199989318848, "step": 694 }, { "epoch": 0.21423624470950364, "grad_norm": 5.34375, "learning_rate": 4.938213125874039e-06, "loss": 1.5376107692718506, "step": 696 }, { "epoch": 0.21485186610234705, "grad_norm": 4.15625, "learning_rate": 4.937720428883594e-06, "loss": 1.3674694299697876, "step": 698 }, { "epoch": 0.21546748749519046, "grad_norm": 5.0, "learning_rate": 4.937225806336921e-06, "loss": 1.2678442001342773, "step": 700 }, { "epoch": 0.21608310888803386, "grad_norm": 5.375, "learning_rate": 4.93672925872554e-06, "loss": 1.3211055994033813, "step": 702 }, { "epoch": 0.21669873028087727, "grad_norm": 2.609375, "learning_rate": 4.936230786542883e-06, "loss": 1.1183050870895386, "step": 704 }, { "epoch": 0.21731435167372065, "grad_norm": 10.375, "learning_rate": 4.935730390284289e-06, "loss": 1.6955947875976562, "step": 706 }, { "epoch": 0.21792997306656406, "grad_norm": 4.75, "learning_rate": 4.935228070447017e-06, "loss": 1.372933030128479, "step": 708 }, { "epoch": 0.21854559445940747, "grad_norm": 12.0, "learning_rate": 4.934723827530231e-06, "loss": 1.433283805847168, "step": 710 }, { "epoch": 0.21916121585225087, "grad_norm": 5.96875, "learning_rate": 4.934217662035008e-06, "loss": 1.4605717658996582, "step": 712 }, { "epoch": 0.21977683724509428, "grad_norm": 10.0, "learning_rate": 4.9337095744643385e-06, "loss": 1.422659158706665, "step": 714 }, { "epoch": 0.22039245863793766, "grad_norm": 5.0, "learning_rate": 4.933199565323119e-06, "loss": 1.2856898307800293, "step": 716 }, { "epoch": 0.22100808003078107, "grad_norm": 11.375, "learning_rate": 4.932687635118157e-06, "loss": 1.190561294555664, "step": 718 }, { "epoch": 0.22162370142362448, "grad_norm": 6.0625, "learning_rate": 4.9321737843581685e-06, "loss": 1.4947484731674194, "step": 720 }, { "epoch": 0.22223932281646788, "grad_norm": 10.0, "learning_rate": 4.931658013553781e-06, "loss": 1.70393705368042, "step": 722 }, { "epoch": 0.22285494420931126, "grad_norm": 5.0625, "learning_rate": 4.931140323217524e-06, "loss": 0.7426916360855103, "step": 724 }, { "epoch": 0.22347056560215467, "grad_norm": 7.8125, "learning_rate": 4.93062071386384e-06, "loss": 1.5670533180236816, "step": 726 }, { "epoch": 0.22408618699499808, "grad_norm": 6.15625, "learning_rate": 4.930099186009077e-06, "loss": 1.7750446796417236, "step": 728 }, { "epoch": 0.22470180838784148, "grad_norm": 53.25, "learning_rate": 4.929575740171488e-06, "loss": 1.3493258953094482, "step": 730 }, { "epoch": 0.2253174297806849, "grad_norm": 5.125, "learning_rate": 4.929050376871231e-06, "loss": 1.225829839706421, "step": 732 }, { "epoch": 0.22593305117352827, "grad_norm": 2.109375, "learning_rate": 4.928523096630376e-06, "loss": 1.150283694267273, "step": 734 }, { "epoch": 0.22654867256637168, "grad_norm": 7.09375, "learning_rate": 4.9279938999728886e-06, "loss": 1.4825005531311035, "step": 736 }, { "epoch": 0.22716429395921509, "grad_norm": 6.3125, "learning_rate": 4.927462787424646e-06, "loss": 1.4402482509613037, "step": 738 }, { "epoch": 0.2277799153520585, "grad_norm": 12.0625, "learning_rate": 4.926929759513426e-06, "loss": 1.7994085550308228, "step": 740 }, { "epoch": 0.22839553674490187, "grad_norm": 10.5625, "learning_rate": 4.926394816768909e-06, "loss": 1.7221037149429321, "step": 742 }, { "epoch": 0.22901115813774528, "grad_norm": 4.34375, "learning_rate": 4.925857959722682e-06, "loss": 0.974231481552124, "step": 744 }, { "epoch": 0.2296267795305887, "grad_norm": 7.4375, "learning_rate": 4.92531918890823e-06, "loss": 1.3925503492355347, "step": 746 }, { "epoch": 0.2302424009234321, "grad_norm": 12.625, "learning_rate": 4.924778504860943e-06, "loss": 1.2160550355911255, "step": 748 }, { "epoch": 0.2308580223162755, "grad_norm": 4.125, "learning_rate": 4.92423590811811e-06, "loss": 1.2490298748016357, "step": 750 }, { "epoch": 0.23147364370911888, "grad_norm": 12.1875, "learning_rate": 4.923691399218921e-06, "loss": 1.4309897422790527, "step": 752 }, { "epoch": 0.2320892651019623, "grad_norm": 14.75, "learning_rate": 4.9231449787044695e-06, "loss": 1.4852542877197266, "step": 754 }, { "epoch": 0.2327048864948057, "grad_norm": 5.375, "learning_rate": 4.922596647117742e-06, "loss": 1.4291030168533325, "step": 756 }, { "epoch": 0.2333205078876491, "grad_norm": 8.25, "learning_rate": 4.92204640500363e-06, "loss": 1.3657782077789307, "step": 758 }, { "epoch": 0.2339361292804925, "grad_norm": 5.6875, "learning_rate": 4.9214942529089215e-06, "loss": 1.50307297706604, "step": 760 }, { "epoch": 0.2345517506733359, "grad_norm": 5.125, "learning_rate": 4.920940191382302e-06, "loss": 1.1985366344451904, "step": 762 }, { "epoch": 0.2351673720661793, "grad_norm": 6.96875, "learning_rate": 4.920384220974355e-06, "loss": 1.7522002458572388, "step": 764 }, { "epoch": 0.2357829934590227, "grad_norm": 9.9375, "learning_rate": 4.919826342237559e-06, "loss": 1.6885839700698853, "step": 766 }, { "epoch": 0.2363986148518661, "grad_norm": 12.4375, "learning_rate": 4.919266555726293e-06, "loss": 1.5652323961257935, "step": 768 }, { "epoch": 0.2370142362447095, "grad_norm": 6.21875, "learning_rate": 4.918704861996829e-06, "loss": 1.2147345542907715, "step": 770 }, { "epoch": 0.2376298576375529, "grad_norm": 6.0625, "learning_rate": 4.918141261607335e-06, "loss": 1.2249404191970825, "step": 772 }, { "epoch": 0.2382454790303963, "grad_norm": 10.8125, "learning_rate": 4.917575755117872e-06, "loss": 1.6967936754226685, "step": 774 }, { "epoch": 0.23886110042323971, "grad_norm": 5.78125, "learning_rate": 4.917008343090397e-06, "loss": 1.049770474433899, "step": 776 }, { "epoch": 0.23947672181608312, "grad_norm": 7.25, "learning_rate": 4.91643902608876e-06, "loss": 1.4362744092941284, "step": 778 }, { "epoch": 0.2400923432089265, "grad_norm": 7.3125, "learning_rate": 4.915867804678704e-06, "loss": 1.4061223268508911, "step": 780 }, { "epoch": 0.2407079646017699, "grad_norm": 8.0625, "learning_rate": 4.915294679427865e-06, "loss": 1.106104850769043, "step": 782 }, { "epoch": 0.24132358599461332, "grad_norm": 2.1875, "learning_rate": 4.91471965090577e-06, "loss": 1.4797524213790894, "step": 784 }, { "epoch": 0.24193920738745672, "grad_norm": 11.1875, "learning_rate": 4.914142719683839e-06, "loss": 1.674882173538208, "step": 786 }, { "epoch": 0.2425548287803001, "grad_norm": 6.0625, "learning_rate": 4.913563886335379e-06, "loss": 1.3974217176437378, "step": 788 }, { "epoch": 0.2431704501731435, "grad_norm": 3.53125, "learning_rate": 4.9129831514355915e-06, "loss": 1.4340710639953613, "step": 790 }, { "epoch": 0.24378607156598692, "grad_norm": 9.6875, "learning_rate": 4.912400515561565e-06, "loss": 1.3193720579147339, "step": 792 }, { "epoch": 0.24440169295883032, "grad_norm": 5.15625, "learning_rate": 4.911815979292278e-06, "loss": 1.2771092653274536, "step": 794 }, { "epoch": 0.24501731435167373, "grad_norm": 6.625, "learning_rate": 4.911229543208598e-06, "loss": 1.6375248432159424, "step": 796 }, { "epoch": 0.2456329357445171, "grad_norm": 5.34375, "learning_rate": 4.9106412078932785e-06, "loss": 1.5073238611221313, "step": 798 }, { "epoch": 0.24624855713736052, "grad_norm": 6.53125, "learning_rate": 4.9100509739309635e-06, "loss": 1.5905678272247314, "step": 800 }, { "epoch": 0.24686417853020393, "grad_norm": 6.09375, "learning_rate": 4.909458841908179e-06, "loss": 1.6703548431396484, "step": 802 }, { "epoch": 0.24747979992304733, "grad_norm": 7.5625, "learning_rate": 4.908864812413341e-06, "loss": 1.626749038696289, "step": 804 }, { "epoch": 0.24809542131589074, "grad_norm": 9.375, "learning_rate": 4.908268886036751e-06, "loss": 1.822136402130127, "step": 806 }, { "epoch": 0.24871104270873412, "grad_norm": 9.0625, "learning_rate": 4.907671063370592e-06, "loss": 1.4152077436447144, "step": 808 }, { "epoch": 0.24932666410157753, "grad_norm": 18.75, "learning_rate": 4.907071345008938e-06, "loss": 1.0158774852752686, "step": 810 }, { "epoch": 0.24994228549442093, "grad_norm": 15.9375, "learning_rate": 4.906469731547738e-06, "loss": 1.6534260511398315, "step": 812 }, { "epoch": 0.2505579068872643, "grad_norm": 3.53125, "learning_rate": 4.905866223584831e-06, "loss": 1.1389552354812622, "step": 814 }, { "epoch": 0.25117352828010775, "grad_norm": 4.625, "learning_rate": 4.905260821719936e-06, "loss": 1.2016785144805908, "step": 816 }, { "epoch": 0.25178914967295113, "grad_norm": 16.625, "learning_rate": 4.904653526554655e-06, "loss": 1.0394606590270996, "step": 818 }, { "epoch": 0.2524047710657945, "grad_norm": 4.5, "learning_rate": 4.9040443386924694e-06, "loss": 1.1572508811950684, "step": 820 }, { "epoch": 0.25302039245863794, "grad_norm": 8.5, "learning_rate": 4.903433258738744e-06, "loss": 1.594638705253601, "step": 822 }, { "epoch": 0.2536360138514813, "grad_norm": 3.59375, "learning_rate": 4.9028202873007216e-06, "loss": 1.3011562824249268, "step": 824 }, { "epoch": 0.25425163524432476, "grad_norm": 6.84375, "learning_rate": 4.902205424987528e-06, "loss": 1.2228734493255615, "step": 826 }, { "epoch": 0.25486725663716814, "grad_norm": 9.125, "learning_rate": 4.901588672410163e-06, "loss": 0.702058732509613, "step": 828 }, { "epoch": 0.2554828780300115, "grad_norm": 7.40625, "learning_rate": 4.900970030181509e-06, "loss": 1.5469894409179688, "step": 830 }, { "epoch": 0.25609849942285495, "grad_norm": 2.578125, "learning_rate": 4.900349498916324e-06, "loss": 1.3061200380325317, "step": 832 }, { "epoch": 0.25671412081569833, "grad_norm": 9.4375, "learning_rate": 4.899727079231244e-06, "loss": 1.4491020441055298, "step": 834 }, { "epoch": 0.25732974220854177, "grad_norm": 9.75, "learning_rate": 4.899102771744781e-06, "loss": 1.0271233320236206, "step": 836 }, { "epoch": 0.25794536360138515, "grad_norm": 40.5, "learning_rate": 4.898476577077325e-06, "loss": 0.7111718654632568, "step": 838 }, { "epoch": 0.2585609849942285, "grad_norm": 9.875, "learning_rate": 4.897848495851137e-06, "loss": 1.659692645072937, "step": 840 }, { "epoch": 0.25917660638707196, "grad_norm": 6.0625, "learning_rate": 4.897218528690357e-06, "loss": 1.3913519382476807, "step": 842 }, { "epoch": 0.25979222777991534, "grad_norm": 6.09375, "learning_rate": 4.896586676220998e-06, "loss": 1.4254862070083618, "step": 844 }, { "epoch": 0.2604078491727588, "grad_norm": 12.5, "learning_rate": 4.895952939070946e-06, "loss": 1.488398790359497, "step": 846 }, { "epoch": 0.26102347056560216, "grad_norm": 5.78125, "learning_rate": 4.8953173178699575e-06, "loss": 1.8268243074417114, "step": 848 }, { "epoch": 0.26163909195844554, "grad_norm": 2.4375, "learning_rate": 4.894679813249666e-06, "loss": 1.1114075183868408, "step": 850 }, { "epoch": 0.26225471335128897, "grad_norm": 13.6875, "learning_rate": 4.8940404258435725e-06, "loss": 1.3839600086212158, "step": 852 }, { "epoch": 0.26287033474413235, "grad_norm": 4.46875, "learning_rate": 4.893399156287052e-06, "loss": 1.1785236597061157, "step": 854 }, { "epoch": 0.2634859561369758, "grad_norm": 10.9375, "learning_rate": 4.892756005217347e-06, "loss": 1.7065340280532837, "step": 856 }, { "epoch": 0.26410157752981916, "grad_norm": 8.25, "learning_rate": 4.892110973273573e-06, "loss": 1.713850498199463, "step": 858 }, { "epoch": 0.26471719892266254, "grad_norm": 3.1875, "learning_rate": 4.891464061096711e-06, "loss": 1.5599521398544312, "step": 860 }, { "epoch": 0.265332820315506, "grad_norm": 11.375, "learning_rate": 4.890815269329613e-06, "loss": 1.5649224519729614, "step": 862 }, { "epoch": 0.26594844170834936, "grad_norm": 11.125, "learning_rate": 4.890164598616997e-06, "loss": 1.5326846837997437, "step": 864 }, { "epoch": 0.2665640631011928, "grad_norm": 9.0, "learning_rate": 4.88951204960545e-06, "loss": 0.8126274347305298, "step": 866 }, { "epoch": 0.2671796844940362, "grad_norm": 2.9375, "learning_rate": 4.888857622943426e-06, "loss": 1.1992944478988647, "step": 868 }, { "epoch": 0.26779530588687955, "grad_norm": 14.25, "learning_rate": 4.88820131928124e-06, "loss": 1.210860013961792, "step": 870 }, { "epoch": 0.268410927279723, "grad_norm": 5.84375, "learning_rate": 4.887543139271078e-06, "loss": 1.1094002723693848, "step": 872 }, { "epoch": 0.26902654867256637, "grad_norm": 4.1875, "learning_rate": 4.886883083566988e-06, "loss": 1.062155842781067, "step": 874 }, { "epoch": 0.26964217006540975, "grad_norm": 5.0625, "learning_rate": 4.88622115282488e-06, "loss": 0.9888610243797302, "step": 876 }, { "epoch": 0.2702577914582532, "grad_norm": 17.0, "learning_rate": 4.885557347702533e-06, "loss": 1.4706164598464966, "step": 878 }, { "epoch": 0.27087341285109656, "grad_norm": 5.375, "learning_rate": 4.884891668859583e-06, "loss": 1.4921408891677856, "step": 880 }, { "epoch": 0.27148903424394, "grad_norm": 11.6875, "learning_rate": 4.88422411695753e-06, "loss": 0.9078661799430847, "step": 882 }, { "epoch": 0.2721046556367834, "grad_norm": 7.53125, "learning_rate": 4.883554692659736e-06, "loss": 1.3483184576034546, "step": 884 }, { "epoch": 0.27272027702962676, "grad_norm": 3.40625, "learning_rate": 4.882883396631421e-06, "loss": 1.2152024507522583, "step": 886 }, { "epoch": 0.2733358984224702, "grad_norm": 7.46875, "learning_rate": 4.88221022953967e-06, "loss": 1.4284247159957886, "step": 888 }, { "epoch": 0.27395151981531357, "grad_norm": 6.34375, "learning_rate": 4.881535192053423e-06, "loss": 1.623680830001831, "step": 890 }, { "epoch": 0.274567141208157, "grad_norm": 8.0625, "learning_rate": 4.880858284843477e-06, "loss": 1.4158018827438354, "step": 892 }, { "epoch": 0.2751827626010004, "grad_norm": 5.96875, "learning_rate": 4.8801795085824945e-06, "loss": 1.7160724401474, "step": 894 }, { "epoch": 0.27579838399384377, "grad_norm": 8.25, "learning_rate": 4.879498863944988e-06, "loss": 1.4056710004806519, "step": 896 }, { "epoch": 0.2764140053866872, "grad_norm": 11.0625, "learning_rate": 4.87881635160733e-06, "loss": 1.3139357566833496, "step": 898 }, { "epoch": 0.2770296267795306, "grad_norm": 7.375, "learning_rate": 4.878131972247747e-06, "loss": 1.2047233581542969, "step": 900 }, { "epoch": 0.277645248172374, "grad_norm": 7.46875, "learning_rate": 4.8774457265463245e-06, "loss": 1.2656543254852295, "step": 902 }, { "epoch": 0.2782608695652174, "grad_norm": 4.59375, "learning_rate": 4.8767576151849985e-06, "loss": 1.1122605800628662, "step": 904 }, { "epoch": 0.2788764909580608, "grad_norm": 5.21875, "learning_rate": 4.876067638847561e-06, "loss": 1.4599850177764893, "step": 906 }, { "epoch": 0.2794921123509042, "grad_norm": 4.09375, "learning_rate": 4.875375798219658e-06, "loss": 1.1917310953140259, "step": 908 }, { "epoch": 0.2801077337437476, "grad_norm": 5.4375, "learning_rate": 4.874682093988786e-06, "loss": 1.4234414100646973, "step": 910 }, { "epoch": 0.280723355136591, "grad_norm": 7.59375, "learning_rate": 4.873986526844294e-06, "loss": 0.9316292405128479, "step": 912 }, { "epoch": 0.2813389765294344, "grad_norm": 9.0625, "learning_rate": 4.873289097477384e-06, "loss": 0.9619709849357605, "step": 914 }, { "epoch": 0.2819545979222778, "grad_norm": 3.671875, "learning_rate": 4.872589806581106e-06, "loss": 0.9365772604942322, "step": 916 }, { "epoch": 0.2825702193151212, "grad_norm": 8.25, "learning_rate": 4.871888654850362e-06, "loss": 1.7384915351867676, "step": 918 }, { "epoch": 0.2831858407079646, "grad_norm": 7.96875, "learning_rate": 4.871185642981901e-06, "loss": 1.2540651559829712, "step": 920 }, { "epoch": 0.283801462100808, "grad_norm": 5.625, "learning_rate": 4.870480771674324e-06, "loss": 1.3530112504959106, "step": 922 }, { "epoch": 0.2844170834936514, "grad_norm": 5.9375, "learning_rate": 4.869774041628075e-06, "loss": 0.942622184753418, "step": 924 }, { "epoch": 0.2850327048864948, "grad_norm": 9.6875, "learning_rate": 4.869065453545447e-06, "loss": 1.549060583114624, "step": 926 }, { "epoch": 0.2856483262793382, "grad_norm": 17.75, "learning_rate": 4.868355008130583e-06, "loss": 1.539589285850525, "step": 928 }, { "epoch": 0.2862639476721816, "grad_norm": 8.9375, "learning_rate": 4.867642706089466e-06, "loss": 1.3125108480453491, "step": 930 }, { "epoch": 0.286879569065025, "grad_norm": 7.84375, "learning_rate": 4.866928548129927e-06, "loss": 1.1507761478424072, "step": 932 }, { "epoch": 0.2874951904578684, "grad_norm": 8.875, "learning_rate": 4.866212534961641e-06, "loss": 1.196966528892517, "step": 934 }, { "epoch": 0.2881108118507118, "grad_norm": 11.4375, "learning_rate": 4.865494667296126e-06, "loss": 1.3384065628051758, "step": 936 }, { "epoch": 0.28872643324355524, "grad_norm": 5.4375, "learning_rate": 4.864774945846744e-06, "loss": 1.2341539859771729, "step": 938 }, { "epoch": 0.2893420546363986, "grad_norm": 9.75, "learning_rate": 4.864053371328697e-06, "loss": 1.3900071382522583, "step": 940 }, { "epoch": 0.289957676029242, "grad_norm": 2.578125, "learning_rate": 4.8633299444590324e-06, "loss": 1.0347307920455933, "step": 942 }, { "epoch": 0.29057329742208543, "grad_norm": 4.46875, "learning_rate": 4.862604665956633e-06, "loss": 0.9689977169036865, "step": 944 }, { "epoch": 0.2911889188149288, "grad_norm": 9.1875, "learning_rate": 4.8618775365422246e-06, "loss": 1.257887363433838, "step": 946 }, { "epoch": 0.29180454020777224, "grad_norm": 10.1875, "learning_rate": 4.861148556938372e-06, "loss": 1.772337794303894, "step": 948 }, { "epoch": 0.2924201616006156, "grad_norm": 7.6875, "learning_rate": 4.860417727869481e-06, "loss": 1.334553837776184, "step": 950 }, { "epoch": 0.293035782993459, "grad_norm": 34.25, "learning_rate": 4.85968505006179e-06, "loss": 1.46852445602417, "step": 952 }, { "epoch": 0.29365140438630244, "grad_norm": 4.0625, "learning_rate": 4.858950524243379e-06, "loss": 1.1797504425048828, "step": 954 }, { "epoch": 0.2942670257791458, "grad_norm": 4.78125, "learning_rate": 4.858214151144161e-06, "loss": 1.172776460647583, "step": 956 }, { "epoch": 0.29488264717198925, "grad_norm": 5.5625, "learning_rate": 4.857475931495888e-06, "loss": 1.4012408256530762, "step": 958 }, { "epoch": 0.29549826856483263, "grad_norm": 13.375, "learning_rate": 4.8567358660321465e-06, "loss": 1.6080305576324463, "step": 960 }, { "epoch": 0.296113889957676, "grad_norm": 7.96875, "learning_rate": 4.8559939554883526e-06, "loss": 1.382482886314392, "step": 962 }, { "epoch": 0.29672951135051945, "grad_norm": 8.25, "learning_rate": 4.855250200601762e-06, "loss": 1.3735034465789795, "step": 964 }, { "epoch": 0.2973451327433628, "grad_norm": 8.5625, "learning_rate": 4.854504602111461e-06, "loss": 1.5223190784454346, "step": 966 }, { "epoch": 0.2979607541362062, "grad_norm": 7.03125, "learning_rate": 4.853757160758367e-06, "loss": 1.059695839881897, "step": 968 }, { "epoch": 0.29857637552904964, "grad_norm": 6.5, "learning_rate": 4.853007877285226e-06, "loss": 1.1238740682601929, "step": 970 }, { "epoch": 0.299191996921893, "grad_norm": 6.84375, "learning_rate": 4.852256752436623e-06, "loss": 0.9654524326324463, "step": 972 }, { "epoch": 0.29980761831473646, "grad_norm": 5.75, "learning_rate": 4.851503786958965e-06, "loss": 1.393660545349121, "step": 974 }, { "epoch": 0.30042323970757984, "grad_norm": 7.03125, "learning_rate": 4.85074898160049e-06, "loss": 1.5199719667434692, "step": 976 }, { "epoch": 0.3010388611004232, "grad_norm": 8.25, "learning_rate": 4.849992337111267e-06, "loss": 1.1787185668945312, "step": 978 }, { "epoch": 0.30165448249326665, "grad_norm": 4.8125, "learning_rate": 4.849233854243189e-06, "loss": 1.5050737857818604, "step": 980 }, { "epoch": 0.30227010388611003, "grad_norm": 9.3125, "learning_rate": 4.848473533749979e-06, "loss": 1.610541820526123, "step": 982 }, { "epoch": 0.30288572527895347, "grad_norm": 4.0625, "learning_rate": 4.847711376387182e-06, "loss": 1.2235430479049683, "step": 984 }, { "epoch": 0.30350134667179685, "grad_norm": 9.0, "learning_rate": 4.846947382912173e-06, "loss": 1.4376128911972046, "step": 986 }, { "epoch": 0.3041169680646402, "grad_norm": 7.4375, "learning_rate": 4.846181554084147e-06, "loss": 1.3893898725509644, "step": 988 }, { "epoch": 0.30473258945748366, "grad_norm": 8.1875, "learning_rate": 4.845413890664129e-06, "loss": 1.1377290487289429, "step": 990 }, { "epoch": 0.30534821085032704, "grad_norm": 5.96875, "learning_rate": 4.844644393414961e-06, "loss": 1.4446502923965454, "step": 992 }, { "epoch": 0.3059638322431705, "grad_norm": 12.25, "learning_rate": 4.84387306310131e-06, "loss": 1.6131486892700195, "step": 994 }, { "epoch": 0.30657945363601385, "grad_norm": 6.3125, "learning_rate": 4.843099900489664e-06, "loss": 1.3349088430404663, "step": 996 }, { "epoch": 0.30719507502885723, "grad_norm": 20.125, "learning_rate": 4.842324906348333e-06, "loss": 1.4115350246429443, "step": 998 }, { "epoch": 0.30781069642170067, "grad_norm": 16.25, "learning_rate": 4.841548081447445e-06, "loss": 1.2290444374084473, "step": 1000 }, { "epoch": 0.30842631781454405, "grad_norm": 7.625, "learning_rate": 4.840769426558948e-06, "loss": 1.7017638683319092, "step": 1002 }, { "epoch": 0.3090419392073875, "grad_norm": 7.09375, "learning_rate": 4.839988942456609e-06, "loss": 1.12124764919281, "step": 1004 }, { "epoch": 0.30965756060023086, "grad_norm": 57.5, "learning_rate": 4.839206629916015e-06, "loss": 1.723775863647461, "step": 1006 }, { "epoch": 0.31027318199307424, "grad_norm": 2.328125, "learning_rate": 4.838422489714564e-06, "loss": 1.3458318710327148, "step": 1008 }, { "epoch": 0.3108888033859177, "grad_norm": 5.3125, "learning_rate": 4.837636522631475e-06, "loss": 1.2959532737731934, "step": 1010 }, { "epoch": 0.31150442477876106, "grad_norm": 5.75, "learning_rate": 4.8368487294477815e-06, "loss": 1.2677240371704102, "step": 1012 }, { "epoch": 0.31212004617160444, "grad_norm": 6.5625, "learning_rate": 4.836059110946332e-06, "loss": 1.1831039190292358, "step": 1014 }, { "epoch": 0.3127356675644479, "grad_norm": 16.625, "learning_rate": 4.835267667911786e-06, "loss": 1.4186724424362183, "step": 1016 }, { "epoch": 0.31335128895729125, "grad_norm": 20.0, "learning_rate": 4.83447440113062e-06, "loss": 1.467540979385376, "step": 1018 }, { "epoch": 0.3139669103501347, "grad_norm": 4.6875, "learning_rate": 4.833679311391121e-06, "loss": 1.560141921043396, "step": 1020 }, { "epoch": 0.31458253174297807, "grad_norm": 5.9375, "learning_rate": 4.832882399483385e-06, "loss": 1.241237998008728, "step": 1022 }, { "epoch": 0.31519815313582145, "grad_norm": 5.0625, "learning_rate": 4.832083666199324e-06, "loss": 1.0696933269500732, "step": 1024 }, { "epoch": 0.3158137745286649, "grad_norm": 10.4375, "learning_rate": 4.8312831123326565e-06, "loss": 0.7319807410240173, "step": 1026 }, { "epoch": 0.31642939592150826, "grad_norm": 5.15625, "learning_rate": 4.83048073867891e-06, "loss": 1.38838791847229, "step": 1028 }, { "epoch": 0.3170450173143517, "grad_norm": 6.375, "learning_rate": 4.829676546035422e-06, "loss": 1.09077787399292, "step": 1030 }, { "epoch": 0.3176606387071951, "grad_norm": 18.25, "learning_rate": 4.828870535201336e-06, "loss": 1.5093919038772583, "step": 1032 }, { "epoch": 0.31827626010003846, "grad_norm": 5.6875, "learning_rate": 4.828062706977605e-06, "loss": 1.4667083024978638, "step": 1034 }, { "epoch": 0.3188918814928819, "grad_norm": 17.0, "learning_rate": 4.827253062166985e-06, "loss": 1.5847958326339722, "step": 1036 }, { "epoch": 0.31950750288572527, "grad_norm": 12.125, "learning_rate": 4.826441601574035e-06, "loss": 1.0991942882537842, "step": 1038 }, { "epoch": 0.3201231242785687, "grad_norm": 4.59375, "learning_rate": 4.825628326005126e-06, "loss": 1.5162996053695679, "step": 1040 }, { "epoch": 0.3207387456714121, "grad_norm": 6.9375, "learning_rate": 4.824813236268425e-06, "loss": 1.5189971923828125, "step": 1042 }, { "epoch": 0.32135436706425546, "grad_norm": 8.3125, "learning_rate": 4.823996333173908e-06, "loss": 1.4960123300552368, "step": 1044 }, { "epoch": 0.3219699884570989, "grad_norm": 7.875, "learning_rate": 4.823177617533348e-06, "loss": 1.262855052947998, "step": 1046 }, { "epoch": 0.3225856098499423, "grad_norm": 4.3125, "learning_rate": 4.822357090160321e-06, "loss": 1.3328428268432617, "step": 1048 }, { "epoch": 0.3232012312427857, "grad_norm": 14.375, "learning_rate": 4.821534751870205e-06, "loss": 1.2429380416870117, "step": 1050 }, { "epoch": 0.3238168526356291, "grad_norm": 7.53125, "learning_rate": 4.8207106034801735e-06, "loss": 1.1245135068893433, "step": 1052 }, { "epoch": 0.3244324740284725, "grad_norm": 14.0, "learning_rate": 4.819884645809203e-06, "loss": 0.9234024882316589, "step": 1054 }, { "epoch": 0.3250480954213159, "grad_norm": 7.40625, "learning_rate": 4.819056879678066e-06, "loss": 1.314733862876892, "step": 1056 }, { "epoch": 0.3256637168141593, "grad_norm": 4.25, "learning_rate": 4.818227305909332e-06, "loss": 1.3780370950698853, "step": 1058 }, { "epoch": 0.32627933820700267, "grad_norm": 13.4375, "learning_rate": 4.817395925327367e-06, "loss": 1.2600730657577515, "step": 1060 }, { "epoch": 0.3268949595998461, "grad_norm": 6.0, "learning_rate": 4.8165627387583316e-06, "loss": 1.6714415550231934, "step": 1062 }, { "epoch": 0.3275105809926895, "grad_norm": 8.5, "learning_rate": 4.815727747030184e-06, "loss": 1.5505750179290771, "step": 1064 }, { "epoch": 0.3281262023855329, "grad_norm": 16.375, "learning_rate": 4.814890950972672e-06, "loss": 1.1855148077011108, "step": 1066 }, { "epoch": 0.3287418237783763, "grad_norm": 9.0625, "learning_rate": 4.814052351417341e-06, "loss": 0.9353585243225098, "step": 1068 }, { "epoch": 0.3293574451712197, "grad_norm": 6.90625, "learning_rate": 4.813211949197525e-06, "loss": 1.228911280632019, "step": 1070 }, { "epoch": 0.3299730665640631, "grad_norm": 8.25, "learning_rate": 4.81236974514835e-06, "loss": 1.7351125478744507, "step": 1072 }, { "epoch": 0.3305886879569065, "grad_norm": 16.5, "learning_rate": 4.811525740106734e-06, "loss": 1.7127091884613037, "step": 1074 }, { "epoch": 0.3312043093497499, "grad_norm": 19.625, "learning_rate": 4.810679934911382e-06, "loss": 1.5812722444534302, "step": 1076 }, { "epoch": 0.3318199307425933, "grad_norm": 4.9375, "learning_rate": 4.8098323304027915e-06, "loss": 1.3842276334762573, "step": 1078 }, { "epoch": 0.3324355521354367, "grad_norm": 2.578125, "learning_rate": 4.808982927423246e-06, "loss": 1.4643383026123047, "step": 1080 }, { "epoch": 0.3330511735282801, "grad_norm": 7.875, "learning_rate": 4.808131726816814e-06, "loss": 1.3700556755065918, "step": 1082 }, { "epoch": 0.3336667949211235, "grad_norm": 32.5, "learning_rate": 4.807278729429356e-06, "loss": 1.2572020292282104, "step": 1084 }, { "epoch": 0.33428241631396693, "grad_norm": 11.625, "learning_rate": 4.8064239361085115e-06, "loss": 1.3481972217559814, "step": 1086 }, { "epoch": 0.3348980377068103, "grad_norm": 13.875, "learning_rate": 4.80556734770371e-06, "loss": 1.007904291152954, "step": 1088 }, { "epoch": 0.3355136590996537, "grad_norm": 2.546875, "learning_rate": 4.804708965066162e-06, "loss": 1.2376952171325684, "step": 1090 }, { "epoch": 0.33612928049249713, "grad_norm": 18.75, "learning_rate": 4.803848789048861e-06, "loss": 1.1707139015197754, "step": 1092 }, { "epoch": 0.3367449018853405, "grad_norm": 2.515625, "learning_rate": 4.802986820506583e-06, "loss": 0.4984327554702759, "step": 1094 }, { "epoch": 0.33736052327818394, "grad_norm": 8.0625, "learning_rate": 4.802123060295887e-06, "loss": 1.3485133647918701, "step": 1096 }, { "epoch": 0.3379761446710273, "grad_norm": 5.40625, "learning_rate": 4.801257509275109e-06, "loss": 1.1582012176513672, "step": 1098 }, { "epoch": 0.3385917660638707, "grad_norm": 6.15625, "learning_rate": 4.8003901683043675e-06, "loss": 1.3930922746658325, "step": 1100 }, { "epoch": 0.33920738745671414, "grad_norm": 6.25, "learning_rate": 4.799521038245559e-06, "loss": 1.6170856952667236, "step": 1102 }, { "epoch": 0.3398230088495575, "grad_norm": 12.375, "learning_rate": 4.798650119962357e-06, "loss": 1.0825083255767822, "step": 1104 }, { "epoch": 0.3404386302424009, "grad_norm": 13.375, "learning_rate": 4.797777414320213e-06, "loss": 1.1367437839508057, "step": 1106 }, { "epoch": 0.34105425163524433, "grad_norm": 5.03125, "learning_rate": 4.796902922186353e-06, "loss": 1.1999157667160034, "step": 1108 }, { "epoch": 0.3416698730280877, "grad_norm": 8.4375, "learning_rate": 4.7960266444297794e-06, "loss": 1.4281408786773682, "step": 1110 }, { "epoch": 0.34228549442093115, "grad_norm": 7.03125, "learning_rate": 4.79514858192127e-06, "loss": 1.3028841018676758, "step": 1112 }, { "epoch": 0.3429011158137745, "grad_norm": 15.9375, "learning_rate": 4.794268735533377e-06, "loss": 1.4932379722595215, "step": 1114 }, { "epoch": 0.3435167372066179, "grad_norm": 6.90625, "learning_rate": 4.7933871061404204e-06, "loss": 1.7872411012649536, "step": 1116 }, { "epoch": 0.34413235859946134, "grad_norm": 8.625, "learning_rate": 4.792503694618495e-06, "loss": 1.3513400554656982, "step": 1118 }, { "epoch": 0.3447479799923047, "grad_norm": 11.5625, "learning_rate": 4.791618501845469e-06, "loss": 1.5094581842422485, "step": 1120 }, { "epoch": 0.34536360138514816, "grad_norm": 3.515625, "learning_rate": 4.790731528700977e-06, "loss": 1.0170141458511353, "step": 1122 }, { "epoch": 0.34597922277799154, "grad_norm": 10.375, "learning_rate": 4.789842776066425e-06, "loss": 1.5296138525009155, "step": 1124 }, { "epoch": 0.3465948441708349, "grad_norm": 8.3125, "learning_rate": 4.788952244824984e-06, "loss": 1.539614200592041, "step": 1126 }, { "epoch": 0.34721046556367835, "grad_norm": 4.875, "learning_rate": 4.788059935861597e-06, "loss": 1.3610502481460571, "step": 1128 }, { "epoch": 0.34782608695652173, "grad_norm": 9.6875, "learning_rate": 4.78716585006297e-06, "loss": 1.7436330318450928, "step": 1130 }, { "epoch": 0.34844170834936516, "grad_norm": 16.25, "learning_rate": 4.786269988317579e-06, "loss": 1.3264261484146118, "step": 1132 }, { "epoch": 0.34905732974220854, "grad_norm": 11.3125, "learning_rate": 4.785372351515659e-06, "loss": 1.6471713781356812, "step": 1134 }, { "epoch": 0.3496729511350519, "grad_norm": 7.34375, "learning_rate": 4.784472940549213e-06, "loss": 1.5563104152679443, "step": 1136 }, { "epoch": 0.35028857252789536, "grad_norm": 2.703125, "learning_rate": 4.7835717563120044e-06, "loss": 1.2945740222930908, "step": 1138 }, { "epoch": 0.35090419392073874, "grad_norm": 7.3125, "learning_rate": 4.782668799699563e-06, "loss": 1.3756704330444336, "step": 1140 }, { "epoch": 0.3515198153135822, "grad_norm": 2.078125, "learning_rate": 4.781764071609173e-06, "loss": 1.108988642692566, "step": 1142 }, { "epoch": 0.35213543670642555, "grad_norm": 5.21875, "learning_rate": 4.7808575729398865e-06, "loss": 1.2461122274398804, "step": 1144 }, { "epoch": 0.35275105809926893, "grad_norm": 6.15625, "learning_rate": 4.779949304592511e-06, "loss": 1.4991414546966553, "step": 1146 }, { "epoch": 0.35336667949211237, "grad_norm": 7.6875, "learning_rate": 4.779039267469612e-06, "loss": 1.4341750144958496, "step": 1148 }, { "epoch": 0.35398230088495575, "grad_norm": 14.0, "learning_rate": 4.778127462475513e-06, "loss": 1.4996871948242188, "step": 1150 }, { "epoch": 0.3545979222777991, "grad_norm": 13.9375, "learning_rate": 4.777213890516299e-06, "loss": 1.656551718711853, "step": 1152 }, { "epoch": 0.35521354367064256, "grad_norm": 23.75, "learning_rate": 4.776298552499803e-06, "loss": 1.1717579364776611, "step": 1154 }, { "epoch": 0.35582916506348594, "grad_norm": 8.0, "learning_rate": 4.775381449335617e-06, "loss": 1.55185866355896, "step": 1156 }, { "epoch": 0.3564447864563294, "grad_norm": 19.75, "learning_rate": 4.77446258193509e-06, "loss": 1.2072546482086182, "step": 1158 }, { "epoch": 0.35706040784917276, "grad_norm": 7.5, "learning_rate": 4.773541951211318e-06, "loss": 1.4020482301712036, "step": 1160 }, { "epoch": 0.35767602924201614, "grad_norm": 22.0, "learning_rate": 4.772619558079154e-06, "loss": 1.4429304599761963, "step": 1162 }, { "epoch": 0.35829165063485957, "grad_norm": 13.25, "learning_rate": 4.771695403455201e-06, "loss": 1.5435585975646973, "step": 1164 }, { "epoch": 0.35890727202770295, "grad_norm": 11.0625, "learning_rate": 4.770769488257812e-06, "loss": 1.156057596206665, "step": 1166 }, { "epoch": 0.3595228934205464, "grad_norm": 5.65625, "learning_rate": 4.769841813407088e-06, "loss": 1.3467957973480225, "step": 1168 }, { "epoch": 0.36013851481338977, "grad_norm": 26.375, "learning_rate": 4.768912379824882e-06, "loss": 1.4512295722961426, "step": 1170 }, { "epoch": 0.36075413620623314, "grad_norm": 3.296875, "learning_rate": 4.767981188434791e-06, "loss": 1.6179447174072266, "step": 1172 }, { "epoch": 0.3613697575990766, "grad_norm": 12.9375, "learning_rate": 4.767048240162164e-06, "loss": 1.697928547859192, "step": 1174 }, { "epoch": 0.36198537899191996, "grad_norm": 7.4375, "learning_rate": 4.7661135359340915e-06, "loss": 1.2352161407470703, "step": 1176 }, { "epoch": 0.3626010003847634, "grad_norm": 9.0625, "learning_rate": 4.7651770766794085e-06, "loss": 1.5190646648406982, "step": 1178 }, { "epoch": 0.3632166217776068, "grad_norm": 12.75, "learning_rate": 4.764238863328696e-06, "loss": 1.2285685539245605, "step": 1180 }, { "epoch": 0.36383224317045015, "grad_norm": 20.5, "learning_rate": 4.763298896814279e-06, "loss": 1.5371445417404175, "step": 1182 }, { "epoch": 0.3644478645632936, "grad_norm": 5.78125, "learning_rate": 4.762357178070221e-06, "loss": 1.3828462362289429, "step": 1184 }, { "epoch": 0.36506348595613697, "grad_norm": 7.875, "learning_rate": 4.761413708032332e-06, "loss": 1.4076499938964844, "step": 1186 }, { "epoch": 0.3656791073489804, "grad_norm": 6.53125, "learning_rate": 4.760468487638158e-06, "loss": 1.313516616821289, "step": 1188 }, { "epoch": 0.3662947287418238, "grad_norm": 6.5625, "learning_rate": 4.759521517826985e-06, "loss": 1.4816573858261108, "step": 1190 }, { "epoch": 0.36691035013466716, "grad_norm": 6.46875, "learning_rate": 4.7585727995398376e-06, "loss": 1.158787727355957, "step": 1192 }, { "epoch": 0.3675259715275106, "grad_norm": 4.40625, "learning_rate": 4.75762233371948e-06, "loss": 1.4099538326263428, "step": 1194 }, { "epoch": 0.368141592920354, "grad_norm": 8.3125, "learning_rate": 4.756670121310411e-06, "loss": 1.3869171142578125, "step": 1196 }, { "epoch": 0.36875721431319736, "grad_norm": 11.875, "learning_rate": 4.7557161632588655e-06, "loss": 1.889589786529541, "step": 1198 }, { "epoch": 0.3693728357060408, "grad_norm": 15.5, "learning_rate": 4.754760460512813e-06, "loss": 1.5803718566894531, "step": 1200 }, { "epoch": 0.36998845709888417, "grad_norm": 2.171875, "learning_rate": 4.753803014021956e-06, "loss": 1.3832993507385254, "step": 1202 }, { "epoch": 0.3706040784917276, "grad_norm": 7.84375, "learning_rate": 4.75284382473773e-06, "loss": 1.3015557527542114, "step": 1204 }, { "epoch": 0.371219699884571, "grad_norm": 10.8125, "learning_rate": 4.751882893613305e-06, "loss": 1.5658292770385742, "step": 1206 }, { "epoch": 0.37183532127741437, "grad_norm": 6.625, "learning_rate": 4.75092022160358e-06, "loss": 1.6867892742156982, "step": 1208 }, { "epoch": 0.3724509426702578, "grad_norm": 7.65625, "learning_rate": 4.7499558096651796e-06, "loss": 1.3453547954559326, "step": 1210 }, { "epoch": 0.3730665640631012, "grad_norm": 9.125, "learning_rate": 4.748989658756467e-06, "loss": 1.4894853830337524, "step": 1212 }, { "epoch": 0.3736821854559446, "grad_norm": 6.0625, "learning_rate": 4.748021769837524e-06, "loss": 1.0069468021392822, "step": 1214 }, { "epoch": 0.374297806848788, "grad_norm": 9.75, "learning_rate": 4.747052143870166e-06, "loss": 1.6321216821670532, "step": 1216 }, { "epoch": 0.3749134282416314, "grad_norm": 7.25, "learning_rate": 4.746080781817929e-06, "loss": 1.0080413818359375, "step": 1218 }, { "epoch": 0.3755290496344748, "grad_norm": 2.34375, "learning_rate": 4.745107684646081e-06, "loss": 1.0679363012313843, "step": 1220 }, { "epoch": 0.3761446710273182, "grad_norm": 5.1875, "learning_rate": 4.744132853321608e-06, "loss": 1.5123836994171143, "step": 1222 }, { "epoch": 0.3767602924201616, "grad_norm": 11.375, "learning_rate": 4.743156288813223e-06, "loss": 1.4279792308807373, "step": 1224 }, { "epoch": 0.377375913813005, "grad_norm": 3.765625, "learning_rate": 4.742177992091359e-06, "loss": 1.2972075939178467, "step": 1226 }, { "epoch": 0.3779915352058484, "grad_norm": 6.78125, "learning_rate": 4.7411979641281724e-06, "loss": 1.2801681756973267, "step": 1228 }, { "epoch": 0.3786071565986918, "grad_norm": 5.21875, "learning_rate": 4.7402162058975375e-06, "loss": 1.3559138774871826, "step": 1230 }, { "epoch": 0.3792227779915352, "grad_norm": 9.375, "learning_rate": 4.7392327183750516e-06, "loss": 1.3966338634490967, "step": 1232 }, { "epoch": 0.37983839938437863, "grad_norm": 5.4375, "learning_rate": 4.738247502538027e-06, "loss": 1.4354243278503418, "step": 1234 }, { "epoch": 0.380454020777222, "grad_norm": 3.953125, "learning_rate": 4.737260559365494e-06, "loss": 1.4101216793060303, "step": 1236 }, { "epoch": 0.3810696421700654, "grad_norm": 5.59375, "learning_rate": 4.736271889838201e-06, "loss": 1.9176892042160034, "step": 1238 }, { "epoch": 0.3816852635629088, "grad_norm": 6.4375, "learning_rate": 4.735281494938612e-06, "loss": 1.3109514713287354, "step": 1240 }, { "epoch": 0.3823008849557522, "grad_norm": 5.1875, "learning_rate": 4.734289375650903e-06, "loss": 1.9060190916061401, "step": 1242 }, { "epoch": 0.3829165063485956, "grad_norm": 5.15625, "learning_rate": 4.733295532960966e-06, "loss": 1.3563264608383179, "step": 1244 }, { "epoch": 0.383532127741439, "grad_norm": 10.3125, "learning_rate": 4.732299967856405e-06, "loss": 1.6917022466659546, "step": 1246 }, { "epoch": 0.3841477491342824, "grad_norm": 7.625, "learning_rate": 4.731302681326535e-06, "loss": 1.2438175678253174, "step": 1248 }, { "epoch": 0.38476337052712584, "grad_norm": 6.25, "learning_rate": 4.730303674362382e-06, "loss": 1.493094801902771, "step": 1250 }, { "epoch": 0.3853789919199692, "grad_norm": 52.25, "learning_rate": 4.729302947956681e-06, "loss": 1.2970855236053467, "step": 1252 }, { "epoch": 0.3859946133128126, "grad_norm": 13.9375, "learning_rate": 4.7283005031038775e-06, "loss": 1.180719256401062, "step": 1254 }, { "epoch": 0.38661023470565603, "grad_norm": 6.53125, "learning_rate": 4.727296340800123e-06, "loss": 1.365455150604248, "step": 1256 }, { "epoch": 0.3872258560984994, "grad_norm": 5.65625, "learning_rate": 4.726290462043275e-06, "loss": 1.53617262840271, "step": 1258 }, { "epoch": 0.38784147749134285, "grad_norm": 8.125, "learning_rate": 4.725282867832899e-06, "loss": 1.7439792156219482, "step": 1260 }, { "epoch": 0.3884570988841862, "grad_norm": 8.5625, "learning_rate": 4.724273559170264e-06, "loss": 1.4724012613296509, "step": 1262 }, { "epoch": 0.3890727202770296, "grad_norm": 39.5, "learning_rate": 4.723262537058342e-06, "loss": 1.6723387241363525, "step": 1264 }, { "epoch": 0.38968834166987304, "grad_norm": 7.625, "learning_rate": 4.722249802501807e-06, "loss": 1.2530168294906616, "step": 1266 }, { "epoch": 0.3903039630627164, "grad_norm": 2.75, "learning_rate": 4.72123535650704e-06, "loss": 1.153831124305725, "step": 1268 }, { "epoch": 0.39091958445555985, "grad_norm": 4.75, "learning_rate": 4.720219200082116e-06, "loss": 1.2530832290649414, "step": 1270 }, { "epoch": 0.39153520584840323, "grad_norm": 77.0, "learning_rate": 4.719201334236811e-06, "loss": 1.822005033493042, "step": 1272 }, { "epoch": 0.3921508272412466, "grad_norm": 6.40625, "learning_rate": 4.718181759982604e-06, "loss": 1.5550251007080078, "step": 1274 }, { "epoch": 0.39276644863409005, "grad_norm": 9.375, "learning_rate": 4.7171604783326674e-06, "loss": 1.3926851749420166, "step": 1276 }, { "epoch": 0.39338207002693343, "grad_norm": 30.875, "learning_rate": 4.716137490301872e-06, "loss": 1.1721335649490356, "step": 1278 }, { "epoch": 0.39399769141977686, "grad_norm": 3.09375, "learning_rate": 4.715112796906784e-06, "loss": 1.2481324672698975, "step": 1280 }, { "epoch": 0.39461331281262024, "grad_norm": 10.4375, "learning_rate": 4.714086399165664e-06, "loss": 0.8791213631629944, "step": 1282 }, { "epoch": 0.3952289342054636, "grad_norm": 5.09375, "learning_rate": 4.713058298098467e-06, "loss": 1.1425285339355469, "step": 1284 }, { "epoch": 0.39584455559830706, "grad_norm": 4.6875, "learning_rate": 4.712028494726838e-06, "loss": 1.3065851926803589, "step": 1286 }, { "epoch": 0.39646017699115044, "grad_norm": 6.3125, "learning_rate": 4.7109969900741185e-06, "loss": 1.2983765602111816, "step": 1288 }, { "epoch": 0.3970757983839938, "grad_norm": 5.8125, "learning_rate": 4.709963785165336e-06, "loss": 1.4547548294067383, "step": 1290 }, { "epoch": 0.39769141977683725, "grad_norm": 7.0, "learning_rate": 4.708928881027209e-06, "loss": 1.8190933465957642, "step": 1292 }, { "epoch": 0.39830704116968063, "grad_norm": 8.375, "learning_rate": 4.707892278688148e-06, "loss": 1.7181881666183472, "step": 1294 }, { "epoch": 0.39892266256252407, "grad_norm": 6.9375, "learning_rate": 4.706853979178244e-06, "loss": 1.382314920425415, "step": 1296 }, { "epoch": 0.39953828395536745, "grad_norm": 5.9375, "learning_rate": 4.705813983529282e-06, "loss": 1.3636555671691895, "step": 1298 }, { "epoch": 0.4001539053482108, "grad_norm": 6.96875, "learning_rate": 4.704772292774726e-06, "loss": 1.2776868343353271, "step": 1300 }, { "epoch": 0.40076952674105426, "grad_norm": 5.5625, "learning_rate": 4.703728907949729e-06, "loss": 1.383784532546997, "step": 1302 }, { "epoch": 0.40138514813389764, "grad_norm": 8.75, "learning_rate": 4.702683830091127e-06, "loss": 1.3588171005249023, "step": 1304 }, { "epoch": 0.4020007695267411, "grad_norm": 15.5, "learning_rate": 4.701637060237434e-06, "loss": 1.5341674089431763, "step": 1306 }, { "epoch": 0.40261639091958445, "grad_norm": 7.75, "learning_rate": 4.700588599428851e-06, "loss": 1.5815376043319702, "step": 1308 }, { "epoch": 0.40323201231242783, "grad_norm": 3.953125, "learning_rate": 4.699538448707258e-06, "loss": 1.2344942092895508, "step": 1310 }, { "epoch": 0.40384763370527127, "grad_norm": 5.03125, "learning_rate": 4.698486609116212e-06, "loss": 1.4477823972702026, "step": 1312 }, { "epoch": 0.40446325509811465, "grad_norm": 7.5, "learning_rate": 4.697433081700949e-06, "loss": 1.0380557775497437, "step": 1314 }, { "epoch": 0.4050788764909581, "grad_norm": 3.046875, "learning_rate": 4.6963778675083815e-06, "loss": 1.322392225265503, "step": 1316 }, { "epoch": 0.40569449788380146, "grad_norm": 23.0, "learning_rate": 4.695320967587104e-06, "loss": 1.6188472509384155, "step": 1318 }, { "epoch": 0.40631011927664484, "grad_norm": 4.53125, "learning_rate": 4.694262382987377e-06, "loss": 1.4975945949554443, "step": 1320 }, { "epoch": 0.4069257406694883, "grad_norm": 17.0, "learning_rate": 4.693202114761143e-06, "loss": 1.310897946357727, "step": 1322 }, { "epoch": 0.40754136206233166, "grad_norm": 3.21875, "learning_rate": 4.692140163962012e-06, "loss": 1.1289759874343872, "step": 1324 }, { "epoch": 0.4081569834551751, "grad_norm": 6.46875, "learning_rate": 4.69107653164527e-06, "loss": 1.5048915147781372, "step": 1326 }, { "epoch": 0.4087726048480185, "grad_norm": 3.25, "learning_rate": 4.6900112188678715e-06, "loss": 1.0909032821655273, "step": 1328 }, { "epoch": 0.40938822624086185, "grad_norm": 9.625, "learning_rate": 4.688944226688442e-06, "loss": 1.2539284229278564, "step": 1330 }, { "epoch": 0.4100038476337053, "grad_norm": 5.28125, "learning_rate": 4.687875556167275e-06, "loss": 1.3737448453903198, "step": 1332 }, { "epoch": 0.41061946902654867, "grad_norm": 8.0625, "learning_rate": 4.686805208366333e-06, "loss": 1.6758344173431396, "step": 1334 }, { "epoch": 0.41123509041939205, "grad_norm": 18.75, "learning_rate": 4.685733184349245e-06, "loss": 1.5155155658721924, "step": 1336 }, { "epoch": 0.4118507118122355, "grad_norm": 6.40625, "learning_rate": 4.684659485181303e-06, "loss": 1.2852325439453125, "step": 1338 }, { "epoch": 0.41246633320507886, "grad_norm": 9.25, "learning_rate": 4.683584111929469e-06, "loss": 1.2048956155776978, "step": 1340 }, { "epoch": 0.4130819545979223, "grad_norm": 8.125, "learning_rate": 4.682507065662363e-06, "loss": 1.2220932245254517, "step": 1342 }, { "epoch": 0.4136975759907657, "grad_norm": 13.3125, "learning_rate": 4.681428347450271e-06, "loss": 1.357061505317688, "step": 1344 }, { "epoch": 0.41431319738360906, "grad_norm": 7.25, "learning_rate": 4.68034795836514e-06, "loss": 1.5958317518234253, "step": 1346 }, { "epoch": 0.4149288187764525, "grad_norm": 7.15625, "learning_rate": 4.679265899480577e-06, "loss": 1.5246134996414185, "step": 1348 }, { "epoch": 0.41554444016929587, "grad_norm": 6.25, "learning_rate": 4.678182171871847e-06, "loss": 1.5932855606079102, "step": 1350 }, { "epoch": 0.4161600615621393, "grad_norm": 6.03125, "learning_rate": 4.677096776615875e-06, "loss": 1.343849539756775, "step": 1352 }, { "epoch": 0.4167756829549827, "grad_norm": 4.625, "learning_rate": 4.676009714791242e-06, "loss": 1.0770719051361084, "step": 1354 }, { "epoch": 0.41739130434782606, "grad_norm": 3.53125, "learning_rate": 4.6749209874781864e-06, "loss": 1.2976691722869873, "step": 1356 }, { "epoch": 0.4180069257406695, "grad_norm": 9.125, "learning_rate": 4.6738305957586e-06, "loss": 1.6580630540847778, "step": 1358 }, { "epoch": 0.4186225471335129, "grad_norm": 51.25, "learning_rate": 4.672738540716032e-06, "loss": 1.1791507005691528, "step": 1360 }, { "epoch": 0.4192381685263563, "grad_norm": 7.375, "learning_rate": 4.671644823435681e-06, "loss": 1.7933979034423828, "step": 1362 }, { "epoch": 0.4198537899191997, "grad_norm": 7.59375, "learning_rate": 4.670549445004395e-06, "loss": 1.090211272239685, "step": 1364 }, { "epoch": 0.4204694113120431, "grad_norm": 4.75, "learning_rate": 4.669452406510681e-06, "loss": 1.3155393600463867, "step": 1366 }, { "epoch": 0.4210850327048865, "grad_norm": 7.71875, "learning_rate": 4.6683537090446875e-06, "loss": 1.668478012084961, "step": 1368 }, { "epoch": 0.4217006540977299, "grad_norm": 11.4375, "learning_rate": 4.667253353698216e-06, "loss": 1.3040167093276978, "step": 1370 }, { "epoch": 0.4223162754905733, "grad_norm": 4.40625, "learning_rate": 4.666151341564713e-06, "loss": 1.1923213005065918, "step": 1372 }, { "epoch": 0.4229318968834167, "grad_norm": 25.625, "learning_rate": 4.665047673739275e-06, "loss": 1.5164082050323486, "step": 1374 }, { "epoch": 0.4235475182762601, "grad_norm": 6.0625, "learning_rate": 4.66394235131864e-06, "loss": 1.4245156049728394, "step": 1376 }, { "epoch": 0.4241631396691035, "grad_norm": 10.1875, "learning_rate": 4.662835375401191e-06, "loss": 1.396084189414978, "step": 1378 }, { "epoch": 0.4247787610619469, "grad_norm": 4.875, "learning_rate": 4.661726747086957e-06, "loss": 1.2419757843017578, "step": 1380 }, { "epoch": 0.4253943824547903, "grad_norm": 8.6875, "learning_rate": 4.660616467477604e-06, "loss": 1.0193966627120972, "step": 1382 }, { "epoch": 0.4260100038476337, "grad_norm": 7.5625, "learning_rate": 4.659504537676444e-06, "loss": 1.6151647567749023, "step": 1384 }, { "epoch": 0.4266256252404771, "grad_norm": 3.75, "learning_rate": 4.658390958788426e-06, "loss": 1.2757296562194824, "step": 1386 }, { "epoch": 0.4272412466333205, "grad_norm": 8.0625, "learning_rate": 4.6572757319201366e-06, "loss": 1.3928544521331787, "step": 1388 }, { "epoch": 0.4278568680261639, "grad_norm": 2.796875, "learning_rate": 4.656158858179805e-06, "loss": 1.0542001724243164, "step": 1390 }, { "epoch": 0.4284724894190073, "grad_norm": 7.0, "learning_rate": 4.655040338677292e-06, "loss": 1.1852567195892334, "step": 1392 }, { "epoch": 0.4290881108118507, "grad_norm": 6.5, "learning_rate": 4.6539201745240925e-06, "loss": 1.287937879562378, "step": 1394 }, { "epoch": 0.4297037322046941, "grad_norm": 12.9375, "learning_rate": 4.652798366833344e-06, "loss": 1.372180461883545, "step": 1396 }, { "epoch": 0.43031935359753754, "grad_norm": 7.46875, "learning_rate": 4.651674916719809e-06, "loss": 1.5380756855010986, "step": 1398 }, { "epoch": 0.4309349749903809, "grad_norm": 3.359375, "learning_rate": 4.650549825299886e-06, "loss": 1.2015230655670166, "step": 1400 }, { "epoch": 0.4315505963832243, "grad_norm": 5.25, "learning_rate": 4.649423093691603e-06, "loss": 1.5411839485168457, "step": 1402 }, { "epoch": 0.43216621777606773, "grad_norm": 6.21875, "learning_rate": 4.648294723014618e-06, "loss": 1.373792290687561, "step": 1404 }, { "epoch": 0.4327818391689111, "grad_norm": 1.4296875, "learning_rate": 4.647164714390219e-06, "loss": 1.1725441217422485, "step": 1406 }, { "epoch": 0.43339746056175454, "grad_norm": 7.0, "learning_rate": 4.6460330689413214e-06, "loss": 1.2541426420211792, "step": 1408 }, { "epoch": 0.4340130819545979, "grad_norm": 34.0, "learning_rate": 4.644899787792465e-06, "loss": 1.3803458213806152, "step": 1410 }, { "epoch": 0.4346287033474413, "grad_norm": 7.03125, "learning_rate": 4.643764872069819e-06, "loss": 1.414416790008545, "step": 1412 }, { "epoch": 0.43524432474028474, "grad_norm": 5.09375, "learning_rate": 4.642628322901171e-06, "loss": 1.1327471733093262, "step": 1414 }, { "epoch": 0.4358599461331281, "grad_norm": 28.0, "learning_rate": 4.64149014141594e-06, "loss": 1.6805217266082764, "step": 1416 }, { "epoch": 0.43647556752597155, "grad_norm": 7.96875, "learning_rate": 4.640350328745159e-06, "loss": 1.5550297498703003, "step": 1418 }, { "epoch": 0.43709118891881493, "grad_norm": 6.8125, "learning_rate": 4.6392088860214865e-06, "loss": 1.574475884437561, "step": 1420 }, { "epoch": 0.4377068103116583, "grad_norm": 13.5, "learning_rate": 4.638065814379201e-06, "loss": 1.6193119287490845, "step": 1422 }, { "epoch": 0.43832243170450175, "grad_norm": 6.0625, "learning_rate": 4.636921114954196e-06, "loss": 1.1327303647994995, "step": 1424 }, { "epoch": 0.4389380530973451, "grad_norm": 5.65625, "learning_rate": 4.635774788883986e-06, "loss": 0.9087588787078857, "step": 1426 }, { "epoch": 0.43955367449018856, "grad_norm": 9.1875, "learning_rate": 4.634626837307702e-06, "loss": 1.2922405004501343, "step": 1428 }, { "epoch": 0.44016929588303194, "grad_norm": 4.09375, "learning_rate": 4.633477261366087e-06, "loss": 1.099453330039978, "step": 1430 }, { "epoch": 0.4407849172758753, "grad_norm": 6.875, "learning_rate": 4.632326062201502e-06, "loss": 1.2831623554229736, "step": 1432 }, { "epoch": 0.44140053866871876, "grad_norm": 11.5625, "learning_rate": 4.631173240957919e-06, "loss": 1.104215145111084, "step": 1434 }, { "epoch": 0.44201616006156214, "grad_norm": 8.1875, "learning_rate": 4.630018798780923e-06, "loss": 1.4881500005722046, "step": 1436 }, { "epoch": 0.4426317814544055, "grad_norm": 6.90625, "learning_rate": 4.628862736817707e-06, "loss": 1.2838746309280396, "step": 1438 }, { "epoch": 0.44324740284724895, "grad_norm": 6.3125, "learning_rate": 4.627705056217079e-06, "loss": 1.402595043182373, "step": 1440 }, { "epoch": 0.44386302424009233, "grad_norm": 5.40625, "learning_rate": 4.626545758129449e-06, "loss": 1.4191675186157227, "step": 1442 }, { "epoch": 0.44447864563293576, "grad_norm": 3.375, "learning_rate": 4.62538484370684e-06, "loss": 1.5125514268875122, "step": 1444 }, { "epoch": 0.44509426702577914, "grad_norm": 6.25, "learning_rate": 4.624222314102876e-06, "loss": 1.2079944610595703, "step": 1446 }, { "epoch": 0.4457098884186225, "grad_norm": 4.78125, "learning_rate": 4.623058170472792e-06, "loss": 1.1275867223739624, "step": 1448 }, { "epoch": 0.44632550981146596, "grad_norm": 55.75, "learning_rate": 4.62189241397342e-06, "loss": 1.4862993955612183, "step": 1450 }, { "epoch": 0.44694113120430934, "grad_norm": 7.5, "learning_rate": 4.6207250457632e-06, "loss": 1.3993580341339111, "step": 1452 }, { "epoch": 0.4475567525971528, "grad_norm": 3.5625, "learning_rate": 4.619556067002173e-06, "loss": 1.0116881132125854, "step": 1454 }, { "epoch": 0.44817237398999615, "grad_norm": 7.875, "learning_rate": 4.6183854788519785e-06, "loss": 1.0309436321258545, "step": 1456 }, { "epoch": 0.44878799538283953, "grad_norm": 12.5, "learning_rate": 4.6172132824758565e-06, "loss": 1.2677288055419922, "step": 1458 }, { "epoch": 0.44940361677568297, "grad_norm": 18.0, "learning_rate": 4.616039479038644e-06, "loss": 1.6138582229614258, "step": 1460 }, { "epoch": 0.45001923816852635, "grad_norm": 9.0625, "learning_rate": 4.614864069706777e-06, "loss": 1.326206922531128, "step": 1462 }, { "epoch": 0.4506348595613698, "grad_norm": 11.75, "learning_rate": 4.613687055648285e-06, "loss": 1.2700221538543701, "step": 1464 }, { "epoch": 0.45125048095421316, "grad_norm": 13.125, "learning_rate": 4.6125084380327935e-06, "loss": 1.2647291421890259, "step": 1466 }, { "epoch": 0.45186610234705654, "grad_norm": 6.25, "learning_rate": 4.611328218031521e-06, "loss": 1.2361282110214233, "step": 1468 }, { "epoch": 0.4524817237399, "grad_norm": 10.3125, "learning_rate": 4.6101463968172795e-06, "loss": 1.4439221620559692, "step": 1470 }, { "epoch": 0.45309734513274336, "grad_norm": 10.5625, "learning_rate": 4.608962975564471e-06, "loss": 1.5618476867675781, "step": 1472 }, { "epoch": 0.4537129665255868, "grad_norm": 13.25, "learning_rate": 4.6077779554490875e-06, "loss": 1.4153897762298584, "step": 1474 }, { "epoch": 0.45432858791843017, "grad_norm": 12.125, "learning_rate": 4.606591337648709e-06, "loss": 1.6324784755706787, "step": 1476 }, { "epoch": 0.45494420931127355, "grad_norm": 10.75, "learning_rate": 4.605403123342506e-06, "loss": 1.528695821762085, "step": 1478 }, { "epoch": 0.455559830704117, "grad_norm": 9.125, "learning_rate": 4.604213313711232e-06, "loss": 1.4585542678833008, "step": 1480 }, { "epoch": 0.45617545209696037, "grad_norm": 7.65625, "learning_rate": 4.60302190993723e-06, "loss": 1.4700865745544434, "step": 1482 }, { "epoch": 0.45679107348980375, "grad_norm": 11.75, "learning_rate": 4.601828913204421e-06, "loss": 1.332728385925293, "step": 1484 }, { "epoch": 0.4574066948826472, "grad_norm": 11.875, "learning_rate": 4.600634324698317e-06, "loss": 1.1675676107406616, "step": 1486 }, { "epoch": 0.45802231627549056, "grad_norm": 18.75, "learning_rate": 4.599438145606003e-06, "loss": 0.9165405631065369, "step": 1488 }, { "epoch": 0.458637937668334, "grad_norm": 3.859375, "learning_rate": 4.5982403771161525e-06, "loss": 0.7547895908355713, "step": 1490 }, { "epoch": 0.4592535590611774, "grad_norm": 16.625, "learning_rate": 4.597041020419012e-06, "loss": 1.3234155178070068, "step": 1492 }, { "epoch": 0.45986918045402075, "grad_norm": 16.875, "learning_rate": 4.595840076706411e-06, "loss": 1.7954037189483643, "step": 1494 }, { "epoch": 0.4604848018468642, "grad_norm": 16.0, "learning_rate": 4.5946375471717545e-06, "loss": 1.4703285694122314, "step": 1496 }, { "epoch": 0.46110042323970757, "grad_norm": 8.25, "learning_rate": 4.593433433010021e-06, "loss": 1.2443583011627197, "step": 1498 }, { "epoch": 0.461716044632551, "grad_norm": 8.875, "learning_rate": 4.592227735417768e-06, "loss": 0.90807044506073, "step": 1500 }, { "epoch": 0.4623316660253944, "grad_norm": 14.875, "learning_rate": 4.591020455593123e-06, "loss": 1.6143991947174072, "step": 1502 }, { "epoch": 0.46294728741823776, "grad_norm": 5.71875, "learning_rate": 4.589811594735785e-06, "loss": 1.3481473922729492, "step": 1504 }, { "epoch": 0.4635629088110812, "grad_norm": 13.25, "learning_rate": 4.588601154047031e-06, "loss": 1.9968032836914062, "step": 1506 }, { "epoch": 0.4641785302039246, "grad_norm": 8.3125, "learning_rate": 4.5873891347296995e-06, "loss": 1.3899567127227783, "step": 1508 }, { "epoch": 0.464794151596768, "grad_norm": 4.8125, "learning_rate": 4.586175537988204e-06, "loss": 1.2550766468048096, "step": 1510 }, { "epoch": 0.4654097729896114, "grad_norm": 6.8125, "learning_rate": 4.584960365028519e-06, "loss": 1.5146225690841675, "step": 1512 }, { "epoch": 0.46602539438245477, "grad_norm": 7.34375, "learning_rate": 4.58374361705819e-06, "loss": 0.8701425194740295, "step": 1514 }, { "epoch": 0.4666410157752982, "grad_norm": 12.1875, "learning_rate": 4.58252529528633e-06, "loss": 0.8581297993659973, "step": 1516 }, { "epoch": 0.4672566371681416, "grad_norm": 8.3125, "learning_rate": 4.58130540092361e-06, "loss": 1.3546233177185059, "step": 1518 }, { "epoch": 0.467872258560985, "grad_norm": 22.125, "learning_rate": 4.5800839351822665e-06, "loss": 1.392436146736145, "step": 1520 }, { "epoch": 0.4684878799538284, "grad_norm": 7.125, "learning_rate": 4.578860899276097e-06, "loss": 1.556898832321167, "step": 1522 }, { "epoch": 0.4691035013466718, "grad_norm": 9.3125, "learning_rate": 4.577636294420462e-06, "loss": 1.5069950819015503, "step": 1524 }, { "epoch": 0.4697191227395152, "grad_norm": 5.28125, "learning_rate": 4.5764101218322765e-06, "loss": 1.4646862745285034, "step": 1526 }, { "epoch": 0.4703347441323586, "grad_norm": 1.984375, "learning_rate": 4.575182382730016e-06, "loss": 1.3124823570251465, "step": 1528 }, { "epoch": 0.470950365525202, "grad_norm": 4.4375, "learning_rate": 4.573953078333712e-06, "loss": 0.9778600931167603, "step": 1530 }, { "epoch": 0.4715659869180454, "grad_norm": 5.40625, "learning_rate": 4.572722209864955e-06, "loss": 1.3515981435775757, "step": 1532 }, { "epoch": 0.4721816083108888, "grad_norm": 10.0625, "learning_rate": 4.571489778546883e-06, "loss": 1.7041574716567993, "step": 1534 }, { "epoch": 0.4727972297037322, "grad_norm": 15.8125, "learning_rate": 4.57025578560419e-06, "loss": 1.6992871761322021, "step": 1536 }, { "epoch": 0.4734128510965756, "grad_norm": 6.65625, "learning_rate": 4.569020232263127e-06, "loss": 1.851335048675537, "step": 1538 }, { "epoch": 0.474028472489419, "grad_norm": 44.5, "learning_rate": 4.567783119751487e-06, "loss": 1.4812127351760864, "step": 1540 }, { "epoch": 0.4746440938822624, "grad_norm": 5.4375, "learning_rate": 4.566544449298618e-06, "loss": 1.3944592475891113, "step": 1542 }, { "epoch": 0.4752597152751058, "grad_norm": 6.1875, "learning_rate": 4.565304222135414e-06, "loss": 1.2519789934158325, "step": 1544 }, { "epoch": 0.47587533666794923, "grad_norm": 1.609375, "learning_rate": 4.5640624394943164e-06, "loss": 1.0768710374832153, "step": 1546 }, { "epoch": 0.4764909580607926, "grad_norm": 11.3125, "learning_rate": 4.562819102609314e-06, "loss": 1.4202693700790405, "step": 1548 }, { "epoch": 0.477106579453636, "grad_norm": 7.53125, "learning_rate": 4.5615742127159365e-06, "loss": 1.2447857856750488, "step": 1550 }, { "epoch": 0.47772220084647943, "grad_norm": 3.359375, "learning_rate": 4.560327771051262e-06, "loss": 1.0244370698928833, "step": 1552 }, { "epoch": 0.4783378222393228, "grad_norm": 5.65625, "learning_rate": 4.5590797788539035e-06, "loss": 1.51302170753479, "step": 1554 }, { "epoch": 0.47895344363216624, "grad_norm": 10.9375, "learning_rate": 4.55783023736402e-06, "loss": 1.6049150228500366, "step": 1556 }, { "epoch": 0.4795690650250096, "grad_norm": 8.625, "learning_rate": 4.556579147823311e-06, "loss": 1.128237247467041, "step": 1558 }, { "epoch": 0.480184686417853, "grad_norm": 9.5625, "learning_rate": 4.55532651147501e-06, "loss": 1.1766349077224731, "step": 1560 }, { "epoch": 0.48080030781069644, "grad_norm": 7.875, "learning_rate": 4.554072329563891e-06, "loss": 1.5755650997161865, "step": 1562 }, { "epoch": 0.4814159292035398, "grad_norm": 11.75, "learning_rate": 4.552816603336262e-06, "loss": 1.2936866283416748, "step": 1564 }, { "epoch": 0.48203155059638325, "grad_norm": 2.796875, "learning_rate": 4.551559334039966e-06, "loss": 1.2639069557189941, "step": 1566 }, { "epoch": 0.48264717198922663, "grad_norm": 9.625, "learning_rate": 4.550300522924383e-06, "loss": 1.2861605882644653, "step": 1568 }, { "epoch": 0.48326279338207, "grad_norm": 8.0, "learning_rate": 4.549040171240416e-06, "loss": 1.325995922088623, "step": 1570 }, { "epoch": 0.48387841477491345, "grad_norm": 8.6875, "learning_rate": 4.54777828024051e-06, "loss": 1.4679720401763916, "step": 1572 }, { "epoch": 0.4844940361677568, "grad_norm": 5.9375, "learning_rate": 4.546514851178631e-06, "loss": 1.2052301168441772, "step": 1574 }, { "epoch": 0.4851096575606002, "grad_norm": 1.6328125, "learning_rate": 4.545249885310278e-06, "loss": 1.1089370250701904, "step": 1576 }, { "epoch": 0.48572527895344364, "grad_norm": 5.71875, "learning_rate": 4.543983383892477e-06, "loss": 1.4348812103271484, "step": 1578 }, { "epoch": 0.486340900346287, "grad_norm": 3.28125, "learning_rate": 4.542715348183776e-06, "loss": 1.1429481506347656, "step": 1580 }, { "epoch": 0.48695652173913045, "grad_norm": 8.125, "learning_rate": 4.541445779444252e-06, "loss": 1.3008825778961182, "step": 1582 }, { "epoch": 0.48757214313197383, "grad_norm": 5.40625, "learning_rate": 4.540174678935506e-06, "loss": 1.3641581535339355, "step": 1584 }, { "epoch": 0.4881877645248172, "grad_norm": 4.40625, "learning_rate": 4.538902047920657e-06, "loss": 1.3861111402511597, "step": 1586 }, { "epoch": 0.48880338591766065, "grad_norm": 8.5625, "learning_rate": 4.537627887664346e-06, "loss": 1.1937060356140137, "step": 1588 }, { "epoch": 0.48941900731050403, "grad_norm": 7.3125, "learning_rate": 4.536352199432737e-06, "loss": 1.2857470512390137, "step": 1590 }, { "epoch": 0.49003462870334746, "grad_norm": 11.875, "learning_rate": 4.535074984493508e-06, "loss": 1.3743950128555298, "step": 1592 }, { "epoch": 0.49065025009619084, "grad_norm": 7.09375, "learning_rate": 4.533796244115858e-06, "loss": 1.0143524408340454, "step": 1594 }, { "epoch": 0.4912658714890342, "grad_norm": 3.234375, "learning_rate": 4.532515979570498e-06, "loss": 0.5565921664237976, "step": 1596 }, { "epoch": 0.49188149288187766, "grad_norm": 7.03125, "learning_rate": 4.5312341921296565e-06, "loss": 1.3684699535369873, "step": 1598 }, { "epoch": 0.49249711427472104, "grad_norm": 16.25, "learning_rate": 4.5299508830670745e-06, "loss": 1.6425446271896362, "step": 1600 }, { "epoch": 0.4931127356675645, "grad_norm": 23.0, "learning_rate": 4.528666053658005e-06, "loss": 1.9633190631866455, "step": 1602 }, { "epoch": 0.49372835706040785, "grad_norm": 7.78125, "learning_rate": 4.5273797051792114e-06, "loss": 1.4236207008361816, "step": 1604 }, { "epoch": 0.49434397845325123, "grad_norm": 9.0, "learning_rate": 4.526091838908968e-06, "loss": 1.4245315790176392, "step": 1606 }, { "epoch": 0.49495959984609467, "grad_norm": 4.0625, "learning_rate": 4.524802456127054e-06, "loss": 1.3921014070510864, "step": 1608 }, { "epoch": 0.49557522123893805, "grad_norm": 16.0, "learning_rate": 4.523511558114762e-06, "loss": 1.4808584451675415, "step": 1610 }, { "epoch": 0.4961908426317815, "grad_norm": 4.875, "learning_rate": 4.522219146154883e-06, "loss": 1.4472503662109375, "step": 1612 }, { "epoch": 0.49680646402462486, "grad_norm": 3.484375, "learning_rate": 4.520925221531716e-06, "loss": 1.0601378679275513, "step": 1614 }, { "epoch": 0.49742208541746824, "grad_norm": 6.8125, "learning_rate": 4.519629785531063e-06, "loss": 1.198182463645935, "step": 1616 }, { "epoch": 0.4980377068103117, "grad_norm": 3.390625, "learning_rate": 4.518332839440231e-06, "loss": 1.1204509735107422, "step": 1618 }, { "epoch": 0.49865332820315506, "grad_norm": 16.625, "learning_rate": 4.517034384548019e-06, "loss": 1.464831829071045, "step": 1620 }, { "epoch": 0.49926894959599843, "grad_norm": 4.78125, "learning_rate": 4.515734422144734e-06, "loss": 1.3328585624694824, "step": 1622 }, { "epoch": 0.49988457098884187, "grad_norm": 4.71875, "learning_rate": 4.514432953522178e-06, "loss": 1.0713456869125366, "step": 1624 }, { "epoch": 0.5005001923816853, "grad_norm": 6.0, "learning_rate": 4.513129979973648e-06, "loss": 1.3135950565338135, "step": 1626 }, { "epoch": 0.5011158137745286, "grad_norm": 7.6875, "learning_rate": 4.51182550279394e-06, "loss": 1.5405608415603638, "step": 1628 }, { "epoch": 0.5017314351673721, "grad_norm": 7.9375, "learning_rate": 4.5105195232793405e-06, "loss": 1.6978867053985596, "step": 1630 }, { "epoch": 0.5023470565602155, "grad_norm": 2.96875, "learning_rate": 4.509212042727632e-06, "loss": 0.8523469567298889, "step": 1632 }, { "epoch": 0.5029626779530588, "grad_norm": 7.125, "learning_rate": 4.5079030624380845e-06, "loss": 1.5460319519042969, "step": 1634 }, { "epoch": 0.5035782993459023, "grad_norm": 12.4375, "learning_rate": 4.5065925837114645e-06, "loss": 1.2278990745544434, "step": 1636 }, { "epoch": 0.5041939207387457, "grad_norm": 8.8125, "learning_rate": 4.5052806078500225e-06, "loss": 1.4185572862625122, "step": 1638 }, { "epoch": 0.504809542131589, "grad_norm": 6.0625, "learning_rate": 4.503967136157498e-06, "loss": 1.1045608520507812, "step": 1640 }, { "epoch": 0.5054251635244325, "grad_norm": 18.125, "learning_rate": 4.502652169939117e-06, "loss": 1.389453649520874, "step": 1642 }, { "epoch": 0.5060407849172759, "grad_norm": 12.9375, "learning_rate": 4.501335710501592e-06, "loss": 1.7360384464263916, "step": 1644 }, { "epoch": 0.5066564063101193, "grad_norm": 4.28125, "learning_rate": 4.500017759153118e-06, "loss": 0.5589945316314697, "step": 1646 }, { "epoch": 0.5072720277029626, "grad_norm": 5.8125, "learning_rate": 4.498698317203373e-06, "loss": 1.0301040410995483, "step": 1648 }, { "epoch": 0.5078876490958061, "grad_norm": 5.625, "learning_rate": 4.497377385963514e-06, "loss": 1.1001111268997192, "step": 1650 }, { "epoch": 0.5085032704886495, "grad_norm": 3.6875, "learning_rate": 4.496054966746183e-06, "loss": 1.3102152347564697, "step": 1652 }, { "epoch": 0.5091188918814928, "grad_norm": 4.84375, "learning_rate": 4.494731060865496e-06, "loss": 1.5156065225601196, "step": 1654 }, { "epoch": 0.5097345132743363, "grad_norm": 3.859375, "learning_rate": 4.493405669637048e-06, "loss": 1.0802805423736572, "step": 1656 }, { "epoch": 0.5103501346671797, "grad_norm": 8.5625, "learning_rate": 4.49207879437791e-06, "loss": 1.690042495727539, "step": 1658 }, { "epoch": 0.510965756060023, "grad_norm": 5.34375, "learning_rate": 4.490750436406628e-06, "loss": 1.1154403686523438, "step": 1660 }, { "epoch": 0.5115813774528665, "grad_norm": 13.8125, "learning_rate": 4.489420597043221e-06, "loss": 0.5433551669120789, "step": 1662 }, { "epoch": 0.5121969988457099, "grad_norm": 3.96875, "learning_rate": 4.48808927760918e-06, "loss": 1.4064877033233643, "step": 1664 }, { "epoch": 0.5128126202385533, "grad_norm": 8.5625, "learning_rate": 4.486756479427467e-06, "loss": 1.0362461805343628, "step": 1666 }, { "epoch": 0.5134282416313967, "grad_norm": 6.375, "learning_rate": 4.485422203822515e-06, "loss": 1.279341220855713, "step": 1668 }, { "epoch": 0.5140438630242401, "grad_norm": 3.6875, "learning_rate": 4.484086452120221e-06, "loss": 0.9909439086914062, "step": 1670 }, { "epoch": 0.5146594844170835, "grad_norm": 19.875, "learning_rate": 4.482749225647952e-06, "loss": 1.3306505680084229, "step": 1672 }, { "epoch": 0.5152751058099269, "grad_norm": 5.5625, "learning_rate": 4.481410525734541e-06, "loss": 1.437589406967163, "step": 1674 }, { "epoch": 0.5158907272027703, "grad_norm": 10.5625, "learning_rate": 4.480070353710283e-06, "loss": 1.6570290327072144, "step": 1676 }, { "epoch": 0.5165063485956137, "grad_norm": 5.875, "learning_rate": 4.478728710906938e-06, "loss": 1.4835489988327026, "step": 1678 }, { "epoch": 0.517121969988457, "grad_norm": 8.125, "learning_rate": 4.4773855986577255e-06, "loss": 1.515626311302185, "step": 1680 }, { "epoch": 0.5177375913813005, "grad_norm": 8.375, "learning_rate": 4.476041018297327e-06, "loss": 1.5490896701812744, "step": 1682 }, { "epoch": 0.5183532127741439, "grad_norm": 26.75, "learning_rate": 4.474694971161882e-06, "loss": 1.4817088842391968, "step": 1684 }, { "epoch": 0.5189688341669872, "grad_norm": 7.0, "learning_rate": 4.473347458588987e-06, "loss": 1.4445440769195557, "step": 1686 }, { "epoch": 0.5195844555598307, "grad_norm": 10.625, "learning_rate": 4.471998481917698e-06, "loss": 1.2884917259216309, "step": 1688 }, { "epoch": 0.5202000769526741, "grad_norm": 5.71875, "learning_rate": 4.47064804248852e-06, "loss": 1.2980061769485474, "step": 1690 }, { "epoch": 0.5208156983455176, "grad_norm": 3.53125, "learning_rate": 4.4692961416434156e-06, "loss": 1.294404149055481, "step": 1692 }, { "epoch": 0.5214313197383609, "grad_norm": 5.0625, "learning_rate": 4.467942780725801e-06, "loss": 1.3953012228012085, "step": 1694 }, { "epoch": 0.5220469411312043, "grad_norm": 7.90625, "learning_rate": 4.46658796108054e-06, "loss": 1.4300891160964966, "step": 1696 }, { "epoch": 0.5226625625240477, "grad_norm": 6.65625, "learning_rate": 4.465231684053947e-06, "loss": 1.6064602136611938, "step": 1698 }, { "epoch": 0.5232781839168911, "grad_norm": 15.625, "learning_rate": 4.463873950993786e-06, "loss": 1.0882776975631714, "step": 1700 }, { "epoch": 0.5238938053097345, "grad_norm": 6.1875, "learning_rate": 4.462514763249265e-06, "loss": 1.3740888833999634, "step": 1702 }, { "epoch": 0.5245094267025779, "grad_norm": 2.421875, "learning_rate": 4.46115412217104e-06, "loss": 1.3197846412658691, "step": 1704 }, { "epoch": 0.5251250480954213, "grad_norm": 4.71875, "learning_rate": 4.459792029111211e-06, "loss": 1.0752228498458862, "step": 1706 }, { "epoch": 0.5257406694882647, "grad_norm": 2.171875, "learning_rate": 4.45842848542332e-06, "loss": 1.2539163827896118, "step": 1708 }, { "epoch": 0.5263562908811081, "grad_norm": 17.375, "learning_rate": 4.457063492462352e-06, "loss": 1.708069920539856, "step": 1710 }, { "epoch": 0.5269719122739516, "grad_norm": 12.8125, "learning_rate": 4.4556970515847305e-06, "loss": 1.1730300188064575, "step": 1712 }, { "epoch": 0.5275875336667949, "grad_norm": 4.65625, "learning_rate": 4.454329164148317e-06, "loss": 1.1715214252471924, "step": 1714 }, { "epoch": 0.5282031550596383, "grad_norm": 10.5625, "learning_rate": 4.452959831512414e-06, "loss": 1.459154486656189, "step": 1716 }, { "epoch": 0.5288187764524818, "grad_norm": 2.875, "learning_rate": 4.451589055037757e-06, "loss": 1.113134503364563, "step": 1718 }, { "epoch": 0.5294343978453251, "grad_norm": 6.1875, "learning_rate": 4.4502168360865175e-06, "loss": 0.9972245097160339, "step": 1720 }, { "epoch": 0.5300500192381685, "grad_norm": 7.84375, "learning_rate": 4.448843176022299e-06, "loss": 1.3292319774627686, "step": 1722 }, { "epoch": 0.530665640631012, "grad_norm": 7.78125, "learning_rate": 4.44746807621014e-06, "loss": 1.3687920570373535, "step": 1724 }, { "epoch": 0.5312812620238553, "grad_norm": 5.84375, "learning_rate": 4.44609153801651e-06, "loss": 1.1176486015319824, "step": 1726 }, { "epoch": 0.5318968834166987, "grad_norm": 6.53125, "learning_rate": 4.4447135628093e-06, "loss": 1.3572094440460205, "step": 1728 }, { "epoch": 0.5325125048095422, "grad_norm": 7.40625, "learning_rate": 4.44333415195784e-06, "loss": 1.3096528053283691, "step": 1730 }, { "epoch": 0.5331281262023856, "grad_norm": 9.0, "learning_rate": 4.441953306832879e-06, "loss": 1.611821174621582, "step": 1732 }, { "epoch": 0.5337437475952289, "grad_norm": 5.71875, "learning_rate": 4.440571028806594e-06, "loss": 1.4841302633285522, "step": 1734 }, { "epoch": 0.5343593689880723, "grad_norm": 10.8125, "learning_rate": 4.439187319252586e-06, "loss": 1.7697718143463135, "step": 1736 }, { "epoch": 0.5349749903809158, "grad_norm": 13.8125, "learning_rate": 4.437802179545879e-06, "loss": 1.5109689235687256, "step": 1738 }, { "epoch": 0.5355906117737591, "grad_norm": 3.5, "learning_rate": 4.436415611062916e-06, "loss": 1.1347957849502563, "step": 1740 }, { "epoch": 0.5362062331666025, "grad_norm": 26.125, "learning_rate": 4.435027615181563e-06, "loss": 1.394513487815857, "step": 1742 }, { "epoch": 0.536821854559446, "grad_norm": 2.921875, "learning_rate": 4.4336381932811e-06, "loss": 1.1014000177383423, "step": 1744 }, { "epoch": 0.5374374759522893, "grad_norm": 26.375, "learning_rate": 4.43224734674223e-06, "loss": 1.4138752222061157, "step": 1746 }, { "epoch": 0.5380530973451327, "grad_norm": 4.71875, "learning_rate": 4.4308550769470645e-06, "loss": 1.3574260473251343, "step": 1748 }, { "epoch": 0.5386687187379762, "grad_norm": 9.5, "learning_rate": 4.429461385279136e-06, "loss": 1.3197760581970215, "step": 1750 }, { "epoch": 0.5392843401308195, "grad_norm": 5.375, "learning_rate": 4.428066273123387e-06, "loss": 1.2813074588775635, "step": 1752 }, { "epoch": 0.5398999615236629, "grad_norm": 3.234375, "learning_rate": 4.4266697418661705e-06, "loss": 1.2030224800109863, "step": 1754 }, { "epoch": 0.5405155829165064, "grad_norm": 4.65625, "learning_rate": 4.425271792895252e-06, "loss": 1.0384302139282227, "step": 1756 }, { "epoch": 0.5411312043093498, "grad_norm": 10.3125, "learning_rate": 4.423872427599804e-06, "loss": 1.326210618019104, "step": 1758 }, { "epoch": 0.5417468257021931, "grad_norm": 6.96875, "learning_rate": 4.422471647370406e-06, "loss": 1.1778022050857544, "step": 1760 }, { "epoch": 0.5423624470950366, "grad_norm": 6.78125, "learning_rate": 4.421069453599049e-06, "loss": 1.474500060081482, "step": 1762 }, { "epoch": 0.54297806848788, "grad_norm": 8.3125, "learning_rate": 4.41966584767912e-06, "loss": 1.3914190530776978, "step": 1764 }, { "epoch": 0.5435936898807233, "grad_norm": 11.6875, "learning_rate": 4.418260831005415e-06, "loss": 1.714091181755066, "step": 1766 }, { "epoch": 0.5442093112735668, "grad_norm": 8.5, "learning_rate": 4.4168544049741304e-06, "loss": 1.2844468355178833, "step": 1768 }, { "epoch": 0.5448249326664102, "grad_norm": 7.1875, "learning_rate": 4.415446570982864e-06, "loss": 1.6642802953720093, "step": 1770 }, { "epoch": 0.5454405540592535, "grad_norm": 4.90625, "learning_rate": 4.414037330430611e-06, "loss": 1.248421549797058, "step": 1772 }, { "epoch": 0.546056175452097, "grad_norm": 13.4375, "learning_rate": 4.412626684717768e-06, "loss": 1.2882202863693237, "step": 1774 }, { "epoch": 0.5466717968449404, "grad_norm": 6.8125, "learning_rate": 4.4112146352461216e-06, "loss": 1.4664686918258667, "step": 1776 }, { "epoch": 0.5472874182377838, "grad_norm": 7.78125, "learning_rate": 4.409801183418858e-06, "loss": 1.3767423629760742, "step": 1778 }, { "epoch": 0.5479030396306271, "grad_norm": 4.71875, "learning_rate": 4.408386330640559e-06, "loss": 1.5045976638793945, "step": 1780 }, { "epoch": 0.5485186610234706, "grad_norm": 15.25, "learning_rate": 4.40697007831719e-06, "loss": 1.7616403102874756, "step": 1782 }, { "epoch": 0.549134282416314, "grad_norm": 9.1875, "learning_rate": 4.4055524278561175e-06, "loss": 1.5928730964660645, "step": 1784 }, { "epoch": 0.5497499038091573, "grad_norm": 8.1875, "learning_rate": 4.40413338066609e-06, "loss": 1.1425762176513672, "step": 1786 }, { "epoch": 0.5503655252020008, "grad_norm": 9.25, "learning_rate": 4.402712938157249e-06, "loss": 1.3747527599334717, "step": 1788 }, { "epoch": 0.5509811465948442, "grad_norm": 9.4375, "learning_rate": 4.401291101741116e-06, "loss": 1.4497945308685303, "step": 1790 }, { "epoch": 0.5515967679876875, "grad_norm": 7.15625, "learning_rate": 4.399867872830607e-06, "loss": 1.469414472579956, "step": 1792 }, { "epoch": 0.552212389380531, "grad_norm": 9.0, "learning_rate": 4.398443252840011e-06, "loss": 1.0230164527893066, "step": 1794 }, { "epoch": 0.5528280107733744, "grad_norm": 2.59375, "learning_rate": 4.397017243185008e-06, "loss": 1.3341656923294067, "step": 1796 }, { "epoch": 0.5534436321662177, "grad_norm": 6.5625, "learning_rate": 4.395589845282656e-06, "loss": 1.4266735315322876, "step": 1798 }, { "epoch": 0.5540592535590612, "grad_norm": 5.96875, "learning_rate": 4.3941610605513905e-06, "loss": 1.1735732555389404, "step": 1800 }, { "epoch": 0.5546748749519046, "grad_norm": 6.65625, "learning_rate": 4.392730890411029e-06, "loss": 1.1153186559677124, "step": 1802 }, { "epoch": 0.555290496344748, "grad_norm": 10.375, "learning_rate": 4.391299336282761e-06, "loss": 1.5208882093429565, "step": 1804 }, { "epoch": 0.5559061177375914, "grad_norm": 7.875, "learning_rate": 4.389866399589157e-06, "loss": 0.9873666763305664, "step": 1806 }, { "epoch": 0.5565217391304348, "grad_norm": 11.0, "learning_rate": 4.388432081754155e-06, "loss": 1.7669349908828735, "step": 1808 }, { "epoch": 0.5571373605232782, "grad_norm": 3.71875, "learning_rate": 4.386996384203072e-06, "loss": 1.1138397455215454, "step": 1810 }, { "epoch": 0.5577529819161215, "grad_norm": 9.4375, "learning_rate": 4.3855593083625904e-06, "loss": 1.7766603231430054, "step": 1812 }, { "epoch": 0.558368603308965, "grad_norm": 15.0625, "learning_rate": 4.384120855660765e-06, "loss": 1.8187227249145508, "step": 1814 }, { "epoch": 0.5589842247018084, "grad_norm": 8.875, "learning_rate": 4.382681027527021e-06, "loss": 1.538252830505371, "step": 1816 }, { "epoch": 0.5595998460946517, "grad_norm": 8.75, "learning_rate": 4.381239825392144e-06, "loss": 1.093695044517517, "step": 1818 }, { "epoch": 0.5602154674874952, "grad_norm": 3.9375, "learning_rate": 4.379797250688292e-06, "loss": 1.2730540037155151, "step": 1820 }, { "epoch": 0.5608310888803386, "grad_norm": 8.3125, "learning_rate": 4.378353304848982e-06, "loss": 1.4347320795059204, "step": 1822 }, { "epoch": 0.561446710273182, "grad_norm": 4.71875, "learning_rate": 4.376907989309097e-06, "loss": 1.4746718406677246, "step": 1824 }, { "epoch": 0.5620623316660254, "grad_norm": 12.75, "learning_rate": 4.375461305504879e-06, "loss": 1.1002285480499268, "step": 1826 }, { "epoch": 0.5626779530588688, "grad_norm": 17.625, "learning_rate": 4.374013254873929e-06, "loss": 1.2012693881988525, "step": 1828 }, { "epoch": 0.5632935744517122, "grad_norm": 6.28125, "learning_rate": 4.372563838855207e-06, "loss": 1.7264009714126587, "step": 1830 }, { "epoch": 0.5639091958445556, "grad_norm": 18.5, "learning_rate": 4.3711130588890315e-06, "loss": 1.62944757938385, "step": 1832 }, { "epoch": 0.564524817237399, "grad_norm": 11.1875, "learning_rate": 4.369660916417076e-06, "loss": 1.4099031686782837, "step": 1834 }, { "epoch": 0.5651404386302424, "grad_norm": 4.46875, "learning_rate": 4.3682074128823645e-06, "loss": 1.4762779474258423, "step": 1836 }, { "epoch": 0.5657560600230858, "grad_norm": 34.5, "learning_rate": 4.3667525497292776e-06, "loss": 1.6303919553756714, "step": 1838 }, { "epoch": 0.5663716814159292, "grad_norm": 6.65625, "learning_rate": 4.365296328403546e-06, "loss": 1.3313677310943604, "step": 1840 }, { "epoch": 0.5669873028087726, "grad_norm": 5.75, "learning_rate": 4.363838750352247e-06, "loss": 1.5207209587097168, "step": 1842 }, { "epoch": 0.567602924201616, "grad_norm": 9.1875, "learning_rate": 4.362379817023811e-06, "loss": 1.0308501720428467, "step": 1844 }, { "epoch": 0.5682185455944594, "grad_norm": 3.15625, "learning_rate": 4.3609195298680115e-06, "loss": 1.0928391218185425, "step": 1846 }, { "epoch": 0.5688341669873028, "grad_norm": 10.0, "learning_rate": 4.3594578903359695e-06, "loss": 1.54830801486969, "step": 1848 }, { "epoch": 0.5694497883801463, "grad_norm": 5.625, "learning_rate": 4.357994899880149e-06, "loss": 1.4137128591537476, "step": 1850 }, { "epoch": 0.5700654097729896, "grad_norm": 5.8125, "learning_rate": 4.356530559954356e-06, "loss": 1.3005894422531128, "step": 1852 }, { "epoch": 0.570681031165833, "grad_norm": 10.875, "learning_rate": 4.355064872013737e-06, "loss": 1.2449814081192017, "step": 1854 }, { "epoch": 0.5712966525586765, "grad_norm": 6.90625, "learning_rate": 4.353597837514779e-06, "loss": 0.9704883098602295, "step": 1856 }, { "epoch": 0.5719122739515198, "grad_norm": 6.59375, "learning_rate": 4.3521294579153096e-06, "loss": 1.3039305210113525, "step": 1858 }, { "epoch": 0.5725278953443632, "grad_norm": 8.125, "learning_rate": 4.350659734674488e-06, "loss": 1.180039882659912, "step": 1860 }, { "epoch": 0.5731435167372066, "grad_norm": 5.1875, "learning_rate": 4.3491886692528115e-06, "loss": 1.2943824529647827, "step": 1862 }, { "epoch": 0.57375913813005, "grad_norm": 6.125, "learning_rate": 4.347716263112112e-06, "loss": 1.1885600090026855, "step": 1864 }, { "epoch": 0.5743747595228934, "grad_norm": 4.71875, "learning_rate": 4.346242517715551e-06, "loss": 1.2636488676071167, "step": 1866 }, { "epoch": 0.5749903809157368, "grad_norm": 4.65625, "learning_rate": 4.344767434527623e-06, "loss": 1.3801287412643433, "step": 1868 }, { "epoch": 0.5756060023085803, "grad_norm": 4.03125, "learning_rate": 4.343291015014152e-06, "loss": 1.2369672060012817, "step": 1870 }, { "epoch": 0.5762216237014236, "grad_norm": 18.0, "learning_rate": 4.341813260642288e-06, "loss": 0.8730614185333252, "step": 1872 }, { "epoch": 0.576837245094267, "grad_norm": 5.375, "learning_rate": 4.340334172880508e-06, "loss": 1.412117838859558, "step": 1874 }, { "epoch": 0.5774528664871105, "grad_norm": 3.109375, "learning_rate": 4.338853753198618e-06, "loss": 1.1440041065216064, "step": 1876 }, { "epoch": 0.5780684878799538, "grad_norm": 8.625, "learning_rate": 4.33737200306774e-06, "loss": 1.1688601970672607, "step": 1878 }, { "epoch": 0.5786841092727972, "grad_norm": 17.0, "learning_rate": 4.3358889239603245e-06, "loss": 1.5140353441238403, "step": 1880 }, { "epoch": 0.5792997306656407, "grad_norm": 5.0625, "learning_rate": 4.33440451735014e-06, "loss": 1.4316186904907227, "step": 1882 }, { "epoch": 0.579915352058484, "grad_norm": 26.125, "learning_rate": 4.332918784712276e-06, "loss": 1.2686431407928467, "step": 1884 }, { "epoch": 0.5805309734513274, "grad_norm": 4.96875, "learning_rate": 4.331431727523136e-06, "loss": 1.2316128015518188, "step": 1886 }, { "epoch": 0.5811465948441709, "grad_norm": 18.375, "learning_rate": 4.3299433472604445e-06, "loss": 1.5370372533798218, "step": 1888 }, { "epoch": 0.5817622162370142, "grad_norm": 6.59375, "learning_rate": 4.3284536454032356e-06, "loss": 1.5379806756973267, "step": 1890 }, { "epoch": 0.5823778376298576, "grad_norm": 9.5625, "learning_rate": 4.326962623431862e-06, "loss": 1.0020148754119873, "step": 1892 }, { "epoch": 0.582993459022701, "grad_norm": 5.375, "learning_rate": 4.325470282827984e-06, "loss": 1.5552363395690918, "step": 1894 }, { "epoch": 0.5836090804155445, "grad_norm": 7.34375, "learning_rate": 4.323976625074574e-06, "loss": 1.3486781120300293, "step": 1896 }, { "epoch": 0.5842247018083878, "grad_norm": 7.25, "learning_rate": 4.3224816516559145e-06, "loss": 1.2770202159881592, "step": 1898 }, { "epoch": 0.5848403232012312, "grad_norm": 7.6875, "learning_rate": 4.320985364057593e-06, "loss": 1.44130277633667, "step": 1900 }, { "epoch": 0.5854559445940747, "grad_norm": 12.375, "learning_rate": 4.3194877637665035e-06, "loss": 1.6760141849517822, "step": 1902 }, { "epoch": 0.586071565986918, "grad_norm": 6.21875, "learning_rate": 4.317988852270845e-06, "loss": 1.637210726737976, "step": 1904 }, { "epoch": 0.5866871873797614, "grad_norm": 13.25, "learning_rate": 4.3164886310601224e-06, "loss": 1.471998929977417, "step": 1906 }, { "epoch": 0.5873028087726049, "grad_norm": 10.1875, "learning_rate": 4.3149871016251365e-06, "loss": 1.327508568763733, "step": 1908 }, { "epoch": 0.5879184301654482, "grad_norm": 4.65625, "learning_rate": 4.31348426545799e-06, "loss": 1.1353042125701904, "step": 1910 }, { "epoch": 0.5885340515582916, "grad_norm": 8.75, "learning_rate": 4.311980124052087e-06, "loss": 1.2621204853057861, "step": 1912 }, { "epoch": 0.5891496729511351, "grad_norm": 5.09375, "learning_rate": 4.310474678902126e-06, "loss": 1.4534045457839966, "step": 1914 }, { "epoch": 0.5897652943439785, "grad_norm": 1.546875, "learning_rate": 4.3089679315041e-06, "loss": 1.2747136354446411, "step": 1916 }, { "epoch": 0.5903809157368218, "grad_norm": 5.90625, "learning_rate": 4.307459883355299e-06, "loss": 1.384915828704834, "step": 1918 }, { "epoch": 0.5909965371296653, "grad_norm": 15.6875, "learning_rate": 4.305950535954305e-06, "loss": 0.9512431621551514, "step": 1920 }, { "epoch": 0.5916121585225087, "grad_norm": 9.9375, "learning_rate": 4.30443989080099e-06, "loss": 1.3275504112243652, "step": 1922 }, { "epoch": 0.592227779915352, "grad_norm": 8.5, "learning_rate": 4.3029279493965135e-06, "loss": 1.2675621509552002, "step": 1924 }, { "epoch": 0.5928434013081955, "grad_norm": 5.03125, "learning_rate": 4.301414713243328e-06, "loss": 1.1729919910430908, "step": 1926 }, { "epoch": 0.5934590227010389, "grad_norm": 9.5, "learning_rate": 4.299900183845171e-06, "loss": 1.3558121919631958, "step": 1928 }, { "epoch": 0.5940746440938822, "grad_norm": 6.1875, "learning_rate": 4.2983843627070625e-06, "loss": 1.3227899074554443, "step": 1930 }, { "epoch": 0.5946902654867257, "grad_norm": 12.5, "learning_rate": 4.2968672513353075e-06, "loss": 1.5395879745483398, "step": 1932 }, { "epoch": 0.5953058868795691, "grad_norm": 13.75, "learning_rate": 4.295348851237494e-06, "loss": 1.1591770648956299, "step": 1934 }, { "epoch": 0.5959215082724124, "grad_norm": 3.9375, "learning_rate": 4.293829163922491e-06, "loss": 1.4132144451141357, "step": 1936 }, { "epoch": 0.5965371296652558, "grad_norm": 6.21875, "learning_rate": 4.2923081909004475e-06, "loss": 1.6093425750732422, "step": 1938 }, { "epoch": 0.5971527510580993, "grad_norm": 7.15625, "learning_rate": 4.290785933682785e-06, "loss": 1.4642449617385864, "step": 1940 }, { "epoch": 0.5977683724509427, "grad_norm": 6.0, "learning_rate": 4.289262393782206e-06, "loss": 1.2696763277053833, "step": 1942 }, { "epoch": 0.598383993843786, "grad_norm": 36.0, "learning_rate": 4.287737572712687e-06, "loss": 1.4310896396636963, "step": 1944 }, { "epoch": 0.5989996152366295, "grad_norm": 2.703125, "learning_rate": 4.2862114719894754e-06, "loss": 1.1763420104980469, "step": 1946 }, { "epoch": 0.5996152366294729, "grad_norm": 9.375, "learning_rate": 4.284684093129093e-06, "loss": 0.9990206956863403, "step": 1948 }, { "epoch": 0.6002308580223162, "grad_norm": 17.0, "learning_rate": 4.283155437649331e-06, "loss": 1.5044660568237305, "step": 1950 }, { "epoch": 0.6008464794151597, "grad_norm": 9.6875, "learning_rate": 4.281625507069247e-06, "loss": 1.063808798789978, "step": 1952 }, { "epoch": 0.6014621008080031, "grad_norm": 5.65625, "learning_rate": 4.280094302909168e-06, "loss": 1.3041024208068848, "step": 1954 }, { "epoch": 0.6020777222008464, "grad_norm": 6.90625, "learning_rate": 4.2785618266906844e-06, "loss": 1.295802116394043, "step": 1956 }, { "epoch": 0.6026933435936899, "grad_norm": 3.125, "learning_rate": 4.277028079936656e-06, "loss": 1.0778144598007202, "step": 1958 }, { "epoch": 0.6033089649865333, "grad_norm": 56.0, "learning_rate": 4.2754930641711974e-06, "loss": 1.7069801092147827, "step": 1960 }, { "epoch": 0.6039245863793767, "grad_norm": 8.5, "learning_rate": 4.27395678091969e-06, "loss": 1.3479996919631958, "step": 1962 }, { "epoch": 0.6045402077722201, "grad_norm": 10.75, "learning_rate": 4.272419231708773e-06, "loss": 1.3411500453948975, "step": 1964 }, { "epoch": 0.6051558291650635, "grad_norm": 13.0, "learning_rate": 4.270880418066342e-06, "loss": 1.144321322441101, "step": 1966 }, { "epoch": 0.6057714505579069, "grad_norm": 8.875, "learning_rate": 4.2693403415215525e-06, "loss": 0.7488104104995728, "step": 1968 }, { "epoch": 0.6063870719507503, "grad_norm": 12.375, "learning_rate": 4.267799003604812e-06, "loss": 1.4354453086853027, "step": 1970 }, { "epoch": 0.6070026933435937, "grad_norm": 4.8125, "learning_rate": 4.266256405847784e-06, "loss": 1.4058129787445068, "step": 1972 }, { "epoch": 0.6076183147364371, "grad_norm": 2.640625, "learning_rate": 4.264712549783381e-06, "loss": 0.9840666055679321, "step": 1974 }, { "epoch": 0.6082339361292805, "grad_norm": 7.625, "learning_rate": 4.263167436945767e-06, "loss": 1.3485808372497559, "step": 1976 }, { "epoch": 0.6088495575221239, "grad_norm": 14.875, "learning_rate": 4.261621068870355e-06, "loss": 0.8841054439544678, "step": 1978 }, { "epoch": 0.6094651789149673, "grad_norm": 39.5, "learning_rate": 4.260073447093808e-06, "loss": 1.401408076286316, "step": 1980 }, { "epoch": 0.6100808003078106, "grad_norm": 6.46875, "learning_rate": 4.258524573154031e-06, "loss": 1.2707502841949463, "step": 1982 }, { "epoch": 0.6106964217006541, "grad_norm": 10.5, "learning_rate": 4.256974448590174e-06, "loss": 1.7859660387039185, "step": 1984 }, { "epoch": 0.6113120430934975, "grad_norm": 8.6875, "learning_rate": 4.255423074942632e-06, "loss": 1.7836737632751465, "step": 1986 }, { "epoch": 0.611927664486341, "grad_norm": 7.75, "learning_rate": 4.25387045375304e-06, "loss": 1.2297873497009277, "step": 1988 }, { "epoch": 0.6125432858791843, "grad_norm": 6.5625, "learning_rate": 4.252316586564269e-06, "loss": 1.423917531967163, "step": 1990 }, { "epoch": 0.6131589072720277, "grad_norm": 3.546875, "learning_rate": 4.250761474920437e-06, "loss": 1.277387261390686, "step": 1992 }, { "epoch": 0.6137745286648711, "grad_norm": 6.53125, "learning_rate": 4.249205120366888e-06, "loss": 1.3519625663757324, "step": 1994 }, { "epoch": 0.6143901500577145, "grad_norm": 7.625, "learning_rate": 4.2476475244502105e-06, "loss": 1.0225456953048706, "step": 1996 }, { "epoch": 0.6150057714505579, "grad_norm": 5.0, "learning_rate": 4.246088688718221e-06, "loss": 1.5308113098144531, "step": 1998 }, { "epoch": 0.6156213928434013, "grad_norm": 11.0, "learning_rate": 4.244528614719968e-06, "loss": 1.7866837978363037, "step": 2000 }, { "epoch": 0.6162370142362447, "grad_norm": 13.875, "learning_rate": 4.242967304005734e-06, "loss": 1.0780322551727295, "step": 2002 }, { "epoch": 0.6168526356290881, "grad_norm": 7.78125, "learning_rate": 4.241404758127029e-06, "loss": 1.3198527097702026, "step": 2004 }, { "epoch": 0.6174682570219315, "grad_norm": 8.4375, "learning_rate": 4.239840978636588e-06, "loss": 1.4212534427642822, "step": 2006 }, { "epoch": 0.618083878414775, "grad_norm": 18.5, "learning_rate": 4.238275967088375e-06, "loss": 1.2522720098495483, "step": 2008 }, { "epoch": 0.6186994998076183, "grad_norm": 4.4375, "learning_rate": 4.2367097250375744e-06, "loss": 1.012687087059021, "step": 2010 }, { "epoch": 0.6193151212004617, "grad_norm": 4.53125, "learning_rate": 4.2351422540406005e-06, "loss": 1.2856965065002441, "step": 2012 }, { "epoch": 0.6199307425933052, "grad_norm": 5.625, "learning_rate": 4.23357355565508e-06, "loss": 1.5035991668701172, "step": 2014 }, { "epoch": 0.6205463639861485, "grad_norm": 4.4375, "learning_rate": 4.232003631439868e-06, "loss": 1.4920026063919067, "step": 2016 }, { "epoch": 0.6211619853789919, "grad_norm": 4.90625, "learning_rate": 4.2304324829550306e-06, "loss": 1.1564490795135498, "step": 2018 }, { "epoch": 0.6217776067718354, "grad_norm": 5.5, "learning_rate": 4.228860111761852e-06, "loss": 1.3548294305801392, "step": 2020 }, { "epoch": 0.6223932281646787, "grad_norm": 19.75, "learning_rate": 4.2272865194228355e-06, "loss": 1.7174577713012695, "step": 2022 }, { "epoch": 0.6230088495575221, "grad_norm": 7.375, "learning_rate": 4.225711707501694e-06, "loss": 0.9778303503990173, "step": 2024 }, { "epoch": 0.6236244709503656, "grad_norm": 4.59375, "learning_rate": 4.224135677563354e-06, "loss": 1.320866584777832, "step": 2026 }, { "epoch": 0.6242400923432089, "grad_norm": 5.34375, "learning_rate": 4.22255843117395e-06, "loss": 1.6332377195358276, "step": 2028 }, { "epoch": 0.6248557137360523, "grad_norm": 10.1875, "learning_rate": 4.220979969900828e-06, "loss": 1.790048360824585, "step": 2030 }, { "epoch": 0.6254713351288957, "grad_norm": 10.6875, "learning_rate": 4.219400295312542e-06, "loss": 1.036562204360962, "step": 2032 }, { "epoch": 0.6260869565217392, "grad_norm": 52.75, "learning_rate": 4.217819408978848e-06, "loss": 1.2087514400482178, "step": 2034 }, { "epoch": 0.6267025779145825, "grad_norm": 11.9375, "learning_rate": 4.216237312470709e-06, "loss": 1.2638986110687256, "step": 2036 }, { "epoch": 0.6273181993074259, "grad_norm": 7.9375, "learning_rate": 4.214654007360289e-06, "loss": 1.248311161994934, "step": 2038 }, { "epoch": 0.6279338207002694, "grad_norm": 12.4375, "learning_rate": 4.213069495220955e-06, "loss": 1.0299032926559448, "step": 2040 }, { "epoch": 0.6285494420931127, "grad_norm": 5.0625, "learning_rate": 4.211483777627272e-06, "loss": 0.9252605438232422, "step": 2042 }, { "epoch": 0.6291650634859561, "grad_norm": 3.609375, "learning_rate": 4.2098968561550025e-06, "loss": 1.118775486946106, "step": 2044 }, { "epoch": 0.6297806848787996, "grad_norm": 41.25, "learning_rate": 4.208308732381106e-06, "loss": 1.6422300338745117, "step": 2046 }, { "epoch": 0.6303963062716429, "grad_norm": 7.28125, "learning_rate": 4.206719407883737e-06, "loss": 1.361843228340149, "step": 2048 }, { "epoch": 0.6310119276644863, "grad_norm": 5.875, "learning_rate": 4.205128884242243e-06, "loss": 1.3892865180969238, "step": 2050 }, { "epoch": 0.6316275490573298, "grad_norm": 27.125, "learning_rate": 4.203537163037163e-06, "loss": 1.3998996019363403, "step": 2052 }, { "epoch": 0.6322431704501732, "grad_norm": 2.71875, "learning_rate": 4.201944245850224e-06, "loss": 1.0297900438308716, "step": 2054 }, { "epoch": 0.6328587918430165, "grad_norm": 6.78125, "learning_rate": 4.200350134264347e-06, "loss": 1.2010695934295654, "step": 2056 }, { "epoch": 0.63347441323586, "grad_norm": 9.5, "learning_rate": 4.198754829863635e-06, "loss": 1.0788944959640503, "step": 2058 }, { "epoch": 0.6340900346287034, "grad_norm": 10.875, "learning_rate": 4.197158334233376e-06, "loss": 1.6973694562911987, "step": 2060 }, { "epoch": 0.6347056560215467, "grad_norm": 8.5, "learning_rate": 4.195560648960046e-06, "loss": 1.5456275939941406, "step": 2062 }, { "epoch": 0.6353212774143902, "grad_norm": 8.0625, "learning_rate": 4.1939617756313e-06, "loss": 1.54824960231781, "step": 2064 }, { "epoch": 0.6359368988072336, "grad_norm": 3.453125, "learning_rate": 4.192361715835973e-06, "loss": 1.167006254196167, "step": 2066 }, { "epoch": 0.6365525202000769, "grad_norm": 14.0625, "learning_rate": 4.190760471164081e-06, "loss": 1.243059754371643, "step": 2068 }, { "epoch": 0.6371681415929203, "grad_norm": 4.15625, "learning_rate": 4.189158043206818e-06, "loss": 1.1235977411270142, "step": 2070 }, { "epoch": 0.6377837629857638, "grad_norm": 5.53125, "learning_rate": 4.187554433556552e-06, "loss": 1.3937187194824219, "step": 2072 }, { "epoch": 0.6383993843786071, "grad_norm": 9.375, "learning_rate": 4.185949643806824e-06, "loss": 1.7005552053451538, "step": 2074 }, { "epoch": 0.6390150057714505, "grad_norm": 5.71875, "learning_rate": 4.184343675552351e-06, "loss": 1.7017693519592285, "step": 2076 }, { "epoch": 0.639630627164294, "grad_norm": 23.125, "learning_rate": 4.18273653038902e-06, "loss": 1.5986844301223755, "step": 2078 }, { "epoch": 0.6402462485571374, "grad_norm": 5.84375, "learning_rate": 4.1811282099138865e-06, "loss": 1.2742788791656494, "step": 2080 }, { "epoch": 0.6408618699499807, "grad_norm": 8.375, "learning_rate": 4.179518715725175e-06, "loss": 1.2799839973449707, "step": 2082 }, { "epoch": 0.6414774913428242, "grad_norm": 5.46875, "learning_rate": 4.177908049422276e-06, "loss": 1.5190964937210083, "step": 2084 }, { "epoch": 0.6420931127356676, "grad_norm": 8.6875, "learning_rate": 4.176296212605744e-06, "loss": 1.379951000213623, "step": 2086 }, { "epoch": 0.6427087341285109, "grad_norm": 8.25, "learning_rate": 4.174683206877298e-06, "loss": 1.459817886352539, "step": 2088 }, { "epoch": 0.6433243555213544, "grad_norm": 4.8125, "learning_rate": 4.173069033839818e-06, "loss": 1.223576307296753, "step": 2090 }, { "epoch": 0.6439399769141978, "grad_norm": 5.84375, "learning_rate": 4.171453695097344e-06, "loss": 1.3411672115325928, "step": 2092 }, { "epoch": 0.6445555983070411, "grad_norm": 14.25, "learning_rate": 4.169837192255073e-06, "loss": 1.7932548522949219, "step": 2094 }, { "epoch": 0.6451712196998846, "grad_norm": 8.125, "learning_rate": 4.168219526919361e-06, "loss": 1.269776701927185, "step": 2096 }, { "epoch": 0.645786841092728, "grad_norm": 4.90625, "learning_rate": 4.16660070069772e-06, "loss": 1.080666184425354, "step": 2098 }, { "epoch": 0.6464024624855714, "grad_norm": 11.625, "learning_rate": 4.164980715198812e-06, "loss": 1.3051639795303345, "step": 2100 }, { "epoch": 0.6470180838784148, "grad_norm": 14.4375, "learning_rate": 4.1633595720324525e-06, "loss": 1.7639557123184204, "step": 2102 }, { "epoch": 0.6476337052712582, "grad_norm": 8.3125, "learning_rate": 4.161737272809609e-06, "loss": 1.2831711769104004, "step": 2104 }, { "epoch": 0.6482493266641016, "grad_norm": 8.375, "learning_rate": 4.160113819142398e-06, "loss": 1.4829983711242676, "step": 2106 }, { "epoch": 0.648864948056945, "grad_norm": 8.5625, "learning_rate": 4.158489212644078e-06, "loss": 1.9377659559249878, "step": 2108 }, { "epoch": 0.6494805694497884, "grad_norm": 11.0, "learning_rate": 4.1568634549290585e-06, "loss": 1.1541517972946167, "step": 2110 }, { "epoch": 0.6500961908426318, "grad_norm": 22.625, "learning_rate": 4.155236547612893e-06, "loss": 1.2363619804382324, "step": 2112 }, { "epoch": 0.6507118122354751, "grad_norm": 21.625, "learning_rate": 4.153608492312273e-06, "loss": 1.6787185668945312, "step": 2114 }, { "epoch": 0.6513274336283186, "grad_norm": 10.5, "learning_rate": 4.151979290645037e-06, "loss": 1.7561014890670776, "step": 2116 }, { "epoch": 0.651943055021162, "grad_norm": 2.96875, "learning_rate": 4.150348944230157e-06, "loss": 1.3189289569854736, "step": 2118 }, { "epoch": 0.6525586764140053, "grad_norm": 5.5, "learning_rate": 4.148717454687744e-06, "loss": 1.1471681594848633, "step": 2120 }, { "epoch": 0.6531742978068488, "grad_norm": 4.90625, "learning_rate": 4.147084823639048e-06, "loss": 1.2217907905578613, "step": 2122 }, { "epoch": 0.6537899191996922, "grad_norm": 6.0, "learning_rate": 4.1454510527064495e-06, "loss": 1.4032762050628662, "step": 2124 }, { "epoch": 0.6544055405925356, "grad_norm": 6.78125, "learning_rate": 4.143816143513463e-06, "loss": 1.2402032613754272, "step": 2126 }, { "epoch": 0.655021161985379, "grad_norm": 5.90625, "learning_rate": 4.1421800976847355e-06, "loss": 1.4848300218582153, "step": 2128 }, { "epoch": 0.6556367833782224, "grad_norm": 4.625, "learning_rate": 4.140542916846042e-06, "loss": 1.312753677368164, "step": 2130 }, { "epoch": 0.6562524047710658, "grad_norm": 11.875, "learning_rate": 4.138904602624286e-06, "loss": 1.695920705795288, "step": 2132 }, { "epoch": 0.6568680261639092, "grad_norm": 3.9375, "learning_rate": 4.137265156647496e-06, "loss": 1.3408093452453613, "step": 2134 }, { "epoch": 0.6574836475567526, "grad_norm": 3.78125, "learning_rate": 4.135624580544829e-06, "loss": 1.3905094861984253, "step": 2136 }, { "epoch": 0.658099268949596, "grad_norm": 4.53125, "learning_rate": 4.13398287594656e-06, "loss": 1.229650616645813, "step": 2138 }, { "epoch": 0.6587148903424394, "grad_norm": 11.625, "learning_rate": 4.132340044484086e-06, "loss": 1.1174551248550415, "step": 2140 }, { "epoch": 0.6593305117352828, "grad_norm": 4.90625, "learning_rate": 4.130696087789929e-06, "loss": 1.40761399269104, "step": 2142 }, { "epoch": 0.6599461331281262, "grad_norm": 7.125, "learning_rate": 4.129051007497723e-06, "loss": 1.3790582418441772, "step": 2144 }, { "epoch": 0.6605617545209697, "grad_norm": 11.0625, "learning_rate": 4.127404805242224e-06, "loss": 1.678246259689331, "step": 2146 }, { "epoch": 0.661177375913813, "grad_norm": 6.84375, "learning_rate": 4.1257574826592975e-06, "loss": 1.2218875885009766, "step": 2148 }, { "epoch": 0.6617929973066564, "grad_norm": 13.375, "learning_rate": 4.124109041385925e-06, "loss": 1.3439253568649292, "step": 2150 }, { "epoch": 0.6624086186994999, "grad_norm": 7.9375, "learning_rate": 4.122459483060201e-06, "loss": 1.3088505268096924, "step": 2152 }, { "epoch": 0.6630242400923432, "grad_norm": 18.0, "learning_rate": 4.1208088093213275e-06, "loss": 1.2833305597305298, "step": 2154 }, { "epoch": 0.6636398614851866, "grad_norm": 8.0625, "learning_rate": 4.119157021809616e-06, "loss": 1.722816824913025, "step": 2156 }, { "epoch": 0.66425548287803, "grad_norm": 3.453125, "learning_rate": 4.1175041221664855e-06, "loss": 1.346421718597412, "step": 2158 }, { "epoch": 0.6648711042708734, "grad_norm": 8.6875, "learning_rate": 4.11585011203446e-06, "loss": 1.4551198482513428, "step": 2160 }, { "epoch": 0.6654867256637168, "grad_norm": 4.4375, "learning_rate": 4.114194993057163e-06, "loss": 1.3853923082351685, "step": 2162 }, { "epoch": 0.6661023470565602, "grad_norm": 9.0, "learning_rate": 4.112538766879328e-06, "loss": 1.7009466886520386, "step": 2164 }, { "epoch": 0.6667179684494036, "grad_norm": 6.96875, "learning_rate": 4.110881435146782e-06, "loss": 1.2280793190002441, "step": 2166 }, { "epoch": 0.667333589842247, "grad_norm": 12.4375, "learning_rate": 4.109222999506452e-06, "loss": 1.4504725933074951, "step": 2168 }, { "epoch": 0.6679492112350904, "grad_norm": 11.375, "learning_rate": 4.107563461606362e-06, "loss": 1.5431996583938599, "step": 2170 }, { "epoch": 0.6685648326279339, "grad_norm": 5.28125, "learning_rate": 4.105902823095634e-06, "loss": 1.3769593238830566, "step": 2172 }, { "epoch": 0.6691804540207772, "grad_norm": 19.0, "learning_rate": 4.104241085624482e-06, "loss": 1.5673210620880127, "step": 2174 }, { "epoch": 0.6697960754136206, "grad_norm": 17.75, "learning_rate": 4.102578250844209e-06, "loss": 1.2605724334716797, "step": 2176 }, { "epoch": 0.6704116968064641, "grad_norm": 6.4375, "learning_rate": 4.100914320407213e-06, "loss": 1.0668892860412598, "step": 2178 }, { "epoch": 0.6710273181993074, "grad_norm": 12.6875, "learning_rate": 4.099249295966981e-06, "loss": 1.7679181098937988, "step": 2180 }, { "epoch": 0.6716429395921508, "grad_norm": 9.8125, "learning_rate": 4.0975831791780815e-06, "loss": 1.7594194412231445, "step": 2182 }, { "epoch": 0.6722585609849943, "grad_norm": 5.4375, "learning_rate": 4.0959159716961735e-06, "loss": 1.4177989959716797, "step": 2184 }, { "epoch": 0.6728741823778376, "grad_norm": 7.25, "learning_rate": 4.094247675177999e-06, "loss": 1.2712677717208862, "step": 2186 }, { "epoch": 0.673489803770681, "grad_norm": 4.90625, "learning_rate": 4.0925782912813815e-06, "loss": 1.2475125789642334, "step": 2188 }, { "epoch": 0.6741054251635245, "grad_norm": 10.6875, "learning_rate": 4.0909078216652245e-06, "loss": 1.1614550352096558, "step": 2190 }, { "epoch": 0.6747210465563679, "grad_norm": 6.84375, "learning_rate": 4.089236267989512e-06, "loss": 1.6680519580841064, "step": 2192 }, { "epoch": 0.6753366679492112, "grad_norm": 4.25, "learning_rate": 4.0875636319153025e-06, "loss": 1.6825952529907227, "step": 2194 }, { "epoch": 0.6759522893420546, "grad_norm": 10.125, "learning_rate": 4.085889915104735e-06, "loss": 0.9246540069580078, "step": 2196 }, { "epoch": 0.6765679107348981, "grad_norm": 4.34375, "learning_rate": 4.084215119221016e-06, "loss": 0.7624496221542358, "step": 2198 }, { "epoch": 0.6771835321277414, "grad_norm": 20.5, "learning_rate": 4.0825392459284305e-06, "loss": 1.0339101552963257, "step": 2200 }, { "epoch": 0.6777991535205848, "grad_norm": 7.3125, "learning_rate": 4.08086229689233e-06, "loss": 1.7177927494049072, "step": 2202 }, { "epoch": 0.6784147749134283, "grad_norm": 4.09375, "learning_rate": 4.079184273779138e-06, "loss": 0.9616536498069763, "step": 2204 }, { "epoch": 0.6790303963062716, "grad_norm": 8.625, "learning_rate": 4.07750517825634e-06, "loss": 0.8436191082000732, "step": 2206 }, { "epoch": 0.679646017699115, "grad_norm": 7.15625, "learning_rate": 4.075825011992495e-06, "loss": 1.8311975002288818, "step": 2208 }, { "epoch": 0.6802616390919585, "grad_norm": 4.75, "learning_rate": 4.074143776657219e-06, "loss": 1.3282256126403809, "step": 2210 }, { "epoch": 0.6808772604848018, "grad_norm": 4.375, "learning_rate": 4.072461473921196e-06, "loss": 0.8911463022232056, "step": 2212 }, { "epoch": 0.6814928818776452, "grad_norm": 3.484375, "learning_rate": 4.070778105456164e-06, "loss": 1.3046122789382935, "step": 2214 }, { "epoch": 0.6821085032704887, "grad_norm": 10.4375, "learning_rate": 4.0690936729349275e-06, "loss": 1.3092163801193237, "step": 2216 }, { "epoch": 0.6827241246633321, "grad_norm": 4.78125, "learning_rate": 4.067408178031342e-06, "loss": 1.1632335186004639, "step": 2218 }, { "epoch": 0.6833397460561754, "grad_norm": 2.28125, "learning_rate": 4.0657216224203255e-06, "loss": 1.214794635772705, "step": 2220 }, { "epoch": 0.6839553674490189, "grad_norm": 5.3125, "learning_rate": 4.064034007777843e-06, "loss": 1.2219361066818237, "step": 2222 }, { "epoch": 0.6845709888418623, "grad_norm": 4.375, "learning_rate": 4.062345335780915e-06, "loss": 1.2587751150131226, "step": 2224 }, { "epoch": 0.6851866102347056, "grad_norm": 7.34375, "learning_rate": 4.0606556081076145e-06, "loss": 1.2284146547317505, "step": 2226 }, { "epoch": 0.685802231627549, "grad_norm": 8.8125, "learning_rate": 4.058964826437059e-06, "loss": 1.3370146751403809, "step": 2228 }, { "epoch": 0.6864178530203925, "grad_norm": 12.9375, "learning_rate": 4.057272992449419e-06, "loss": 1.803123116493225, "step": 2230 }, { "epoch": 0.6870334744132358, "grad_norm": 20.5, "learning_rate": 4.0555801078259085e-06, "loss": 1.3987561464309692, "step": 2232 }, { "epoch": 0.6876490958060792, "grad_norm": 19.375, "learning_rate": 4.0538861742487815e-06, "loss": 1.7791513204574585, "step": 2234 }, { "epoch": 0.6882647171989227, "grad_norm": 8.4375, "learning_rate": 4.052191193401339e-06, "loss": 1.7824699878692627, "step": 2236 }, { "epoch": 0.6888803385917661, "grad_norm": 7.03125, "learning_rate": 4.050495166967922e-06, "loss": 1.3275808095932007, "step": 2238 }, { "epoch": 0.6894959599846094, "grad_norm": 5.4375, "learning_rate": 4.04879809663391e-06, "loss": 1.5000485181808472, "step": 2240 }, { "epoch": 0.6901115813774529, "grad_norm": 5.65625, "learning_rate": 4.04709998408572e-06, "loss": 1.3786470890045166, "step": 2242 }, { "epoch": 0.6907272027702963, "grad_norm": 4.25, "learning_rate": 4.045400831010804e-06, "loss": 1.2372782230377197, "step": 2244 }, { "epoch": 0.6913428241631396, "grad_norm": 7.125, "learning_rate": 4.043700639097649e-06, "loss": 1.7251200675964355, "step": 2246 }, { "epoch": 0.6919584455559831, "grad_norm": 8.75, "learning_rate": 4.0419994100357725e-06, "loss": 1.489999771118164, "step": 2248 }, { "epoch": 0.6925740669488265, "grad_norm": 4.65625, "learning_rate": 4.0402971455157255e-06, "loss": 1.1909527778625488, "step": 2250 }, { "epoch": 0.6931896883416698, "grad_norm": 12.1875, "learning_rate": 4.038593847229088e-06, "loss": 1.633176326751709, "step": 2252 }, { "epoch": 0.6938053097345133, "grad_norm": 9.125, "learning_rate": 4.036889516868461e-06, "loss": 0.6954819560050964, "step": 2254 }, { "epoch": 0.6944209311273567, "grad_norm": 2.40625, "learning_rate": 4.035184156127478e-06, "loss": 1.3334051370620728, "step": 2256 }, { "epoch": 0.6950365525202, "grad_norm": 8.25, "learning_rate": 4.0334777667007966e-06, "loss": 1.3189637660980225, "step": 2258 }, { "epoch": 0.6956521739130435, "grad_norm": 7.15625, "learning_rate": 4.031770350284091e-06, "loss": 1.2316111326217651, "step": 2260 }, { "epoch": 0.6962677953058869, "grad_norm": 11.1875, "learning_rate": 4.03006190857406e-06, "loss": 1.5647934675216675, "step": 2262 }, { "epoch": 0.6968834166987303, "grad_norm": 8.9375, "learning_rate": 4.028352443268422e-06, "loss": 1.5584977865219116, "step": 2264 }, { "epoch": 0.6974990380915737, "grad_norm": 2.734375, "learning_rate": 4.026641956065908e-06, "loss": 1.2405673265457153, "step": 2266 }, { "epoch": 0.6981146594844171, "grad_norm": 10.9375, "learning_rate": 4.02493044866627e-06, "loss": 1.471353530883789, "step": 2268 }, { "epoch": 0.6987302808772605, "grad_norm": 7.125, "learning_rate": 4.023217922770272e-06, "loss": 1.5434600114822388, "step": 2270 }, { "epoch": 0.6993459022701038, "grad_norm": 6.0625, "learning_rate": 4.021504380079686e-06, "loss": 1.395086646080017, "step": 2272 }, { "epoch": 0.6999615236629473, "grad_norm": 8.375, "learning_rate": 4.0197898222973e-06, "loss": 1.3828070163726807, "step": 2274 }, { "epoch": 0.7005771450557907, "grad_norm": 2.828125, "learning_rate": 4.018074251126908e-06, "loss": 1.3579838275909424, "step": 2276 }, { "epoch": 0.701192766448634, "grad_norm": 13.1875, "learning_rate": 4.016357668273313e-06, "loss": 1.3393607139587402, "step": 2278 }, { "epoch": 0.7018083878414775, "grad_norm": 5.34375, "learning_rate": 4.014640075442318e-06, "loss": 1.1327935457229614, "step": 2280 }, { "epoch": 0.7024240092343209, "grad_norm": 5.75, "learning_rate": 4.012921474340738e-06, "loss": 1.2218248844146729, "step": 2282 }, { "epoch": 0.7030396306271643, "grad_norm": 8.0, "learning_rate": 4.011201866676383e-06, "loss": 0.902611494064331, "step": 2284 }, { "epoch": 0.7036552520200077, "grad_norm": 6.21875, "learning_rate": 4.009481254158066e-06, "loss": 1.4177908897399902, "step": 2286 }, { "epoch": 0.7042708734128511, "grad_norm": 3.6875, "learning_rate": 4.007759638495599e-06, "loss": 1.1517624855041504, "step": 2288 }, { "epoch": 0.7048864948056945, "grad_norm": 4.65625, "learning_rate": 4.006037021399789e-06, "loss": 1.2392499446868896, "step": 2290 }, { "epoch": 0.7055021161985379, "grad_norm": 3.75, "learning_rate": 4.004313404582439e-06, "loss": 1.0375635623931885, "step": 2292 }, { "epoch": 0.7061177375913813, "grad_norm": 7.21875, "learning_rate": 4.002588789756349e-06, "loss": 1.2339847087860107, "step": 2294 }, { "epoch": 0.7067333589842247, "grad_norm": 6.59375, "learning_rate": 4.000863178635301e-06, "loss": 1.436223030090332, "step": 2296 }, { "epoch": 0.7073489803770681, "grad_norm": 7.71875, "learning_rate": 3.9991365729340805e-06, "loss": 1.2261691093444824, "step": 2298 }, { "epoch": 0.7079646017699115, "grad_norm": 5.1875, "learning_rate": 3.997408974368449e-06, "loss": 1.1778173446655273, "step": 2300 }, { "epoch": 0.7085802231627549, "grad_norm": 6.25, "learning_rate": 3.995680384655162e-06, "loss": 1.211751103401184, "step": 2302 }, { "epoch": 0.7091958445555983, "grad_norm": 18.0, "learning_rate": 3.993950805511959e-06, "loss": 1.642501950263977, "step": 2304 }, { "epoch": 0.7098114659484417, "grad_norm": 15.8125, "learning_rate": 3.992220238657559e-06, "loss": 1.3975833654403687, "step": 2306 }, { "epoch": 0.7104270873412851, "grad_norm": 2.203125, "learning_rate": 3.990488685811667e-06, "loss": 1.1813315153121948, "step": 2308 }, { "epoch": 0.7110427087341286, "grad_norm": 4.8125, "learning_rate": 3.9887561486949655e-06, "loss": 1.1163524389266968, "step": 2310 }, { "epoch": 0.7116583301269719, "grad_norm": 6.3125, "learning_rate": 3.987022629029115e-06, "loss": 1.407476782798767, "step": 2312 }, { "epoch": 0.7122739515198153, "grad_norm": 4.8125, "learning_rate": 3.985288128536753e-06, "loss": 1.3963619470596313, "step": 2314 }, { "epoch": 0.7128895729126588, "grad_norm": 4.8125, "learning_rate": 3.983552648941492e-06, "loss": 1.281212568283081, "step": 2316 }, { "epoch": 0.7135051943055021, "grad_norm": 6.0625, "learning_rate": 3.981816191967917e-06, "loss": 1.3652002811431885, "step": 2318 }, { "epoch": 0.7141208156983455, "grad_norm": 11.5625, "learning_rate": 3.980078759341582e-06, "loss": 1.2505300045013428, "step": 2320 }, { "epoch": 0.714736437091189, "grad_norm": 6.90625, "learning_rate": 3.978340352789017e-06, "loss": 0.7349134683609009, "step": 2322 }, { "epoch": 0.7153520584840323, "grad_norm": 6.71875, "learning_rate": 3.976600974037711e-06, "loss": 1.4582650661468506, "step": 2324 }, { "epoch": 0.7159676798768757, "grad_norm": 14.5, "learning_rate": 3.974860624816126e-06, "loss": 1.3325326442718506, "step": 2326 }, { "epoch": 0.7165833012697191, "grad_norm": 33.25, "learning_rate": 3.973119306853687e-06, "loss": 1.6099287271499634, "step": 2328 }, { "epoch": 0.7171989226625626, "grad_norm": 7.5625, "learning_rate": 3.971377021880779e-06, "loss": 1.5254729986190796, "step": 2330 }, { "epoch": 0.7178145440554059, "grad_norm": 12.0625, "learning_rate": 3.96963377162875e-06, "loss": 1.2081190347671509, "step": 2332 }, { "epoch": 0.7184301654482493, "grad_norm": 6.8125, "learning_rate": 3.967889557829907e-06, "loss": 1.290348768234253, "step": 2334 }, { "epoch": 0.7190457868410928, "grad_norm": 9.375, "learning_rate": 3.966144382217514e-06, "loss": 1.1293039321899414, "step": 2336 }, { "epoch": 0.7196614082339361, "grad_norm": 6.78125, "learning_rate": 3.964398246525791e-06, "loss": 1.4362313747406006, "step": 2338 }, { "epoch": 0.7202770296267795, "grad_norm": 6.8125, "learning_rate": 3.962651152489914e-06, "loss": 1.1427749395370483, "step": 2340 }, { "epoch": 0.720892651019623, "grad_norm": 18.0, "learning_rate": 3.960903101846009e-06, "loss": 1.441338062286377, "step": 2342 }, { "epoch": 0.7215082724124663, "grad_norm": 16.5, "learning_rate": 3.959154096331149e-06, "loss": 1.6988006830215454, "step": 2344 }, { "epoch": 0.7221238938053097, "grad_norm": 6.28125, "learning_rate": 3.957404137683366e-06, "loss": 1.1266911029815674, "step": 2346 }, { "epoch": 0.7227395151981532, "grad_norm": 5.40625, "learning_rate": 3.955653227641629e-06, "loss": 1.1880162954330444, "step": 2348 }, { "epoch": 0.7233551365909965, "grad_norm": 6.6875, "learning_rate": 3.953901367945858e-06, "loss": 1.274377703666687, "step": 2350 }, { "epoch": 0.7239707579838399, "grad_norm": 4.65625, "learning_rate": 3.952148560336916e-06, "loss": 1.4765647649765015, "step": 2352 }, { "epoch": 0.7245863793766834, "grad_norm": 4.0625, "learning_rate": 3.950394806556607e-06, "loss": 1.2762149572372437, "step": 2354 }, { "epoch": 0.7252020007695268, "grad_norm": 9.8125, "learning_rate": 3.948640108347673e-06, "loss": 1.3984206914901733, "step": 2356 }, { "epoch": 0.7258176221623701, "grad_norm": 3.34375, "learning_rate": 3.9468844674537995e-06, "loss": 1.2570399045944214, "step": 2358 }, { "epoch": 0.7264332435552135, "grad_norm": 9.1875, "learning_rate": 3.945127885619605e-06, "loss": 1.921069860458374, "step": 2360 }, { "epoch": 0.727048864948057, "grad_norm": 6.21875, "learning_rate": 3.943370364590646e-06, "loss": 1.4534980058670044, "step": 2362 }, { "epoch": 0.7276644863409003, "grad_norm": 13.5625, "learning_rate": 3.941611906113409e-06, "loss": 1.3451404571533203, "step": 2364 }, { "epoch": 0.7282801077337437, "grad_norm": 4.65625, "learning_rate": 3.939852511935313e-06, "loss": 1.1856927871704102, "step": 2366 }, { "epoch": 0.7288957291265872, "grad_norm": 9.5, "learning_rate": 3.938092183804709e-06, "loss": 1.5575098991394043, "step": 2368 }, { "epoch": 0.7295113505194305, "grad_norm": 5.875, "learning_rate": 3.936330923470874e-06, "loss": 1.4615497589111328, "step": 2370 }, { "epoch": 0.7301269719122739, "grad_norm": 11.5, "learning_rate": 3.934568732684011e-06, "loss": 1.7046332359313965, "step": 2372 }, { "epoch": 0.7307425933051174, "grad_norm": 5.28125, "learning_rate": 3.932805613195249e-06, "loss": 1.2465029954910278, "step": 2374 }, { "epoch": 0.7313582146979608, "grad_norm": 9.125, "learning_rate": 3.9310415667566405e-06, "loss": 1.5415079593658447, "step": 2376 }, { "epoch": 0.7319738360908041, "grad_norm": 21.25, "learning_rate": 3.929276595121157e-06, "loss": 1.7812328338623047, "step": 2378 }, { "epoch": 0.7325894574836476, "grad_norm": 8.1875, "learning_rate": 3.927510700042689e-06, "loss": 1.452361822128296, "step": 2380 }, { "epoch": 0.733205078876491, "grad_norm": 12.5625, "learning_rate": 3.9257438832760485e-06, "loss": 1.593746304512024, "step": 2382 }, { "epoch": 0.7338207002693343, "grad_norm": 6.0625, "learning_rate": 3.923976146576961e-06, "loss": 1.1884487867355347, "step": 2384 }, { "epoch": 0.7344363216621778, "grad_norm": 6.53125, "learning_rate": 3.922207491702064e-06, "loss": 1.1821532249450684, "step": 2386 }, { "epoch": 0.7350519430550212, "grad_norm": 6.15625, "learning_rate": 3.9204379204089095e-06, "loss": 1.617356538772583, "step": 2388 }, { "epoch": 0.7356675644478645, "grad_norm": 4.90625, "learning_rate": 3.918667434455962e-06, "loss": 1.1489228010177612, "step": 2390 }, { "epoch": 0.736283185840708, "grad_norm": 4.5, "learning_rate": 3.916896035602592e-06, "loss": 1.4210864305496216, "step": 2392 }, { "epoch": 0.7368988072335514, "grad_norm": 4.9375, "learning_rate": 3.915123725609079e-06, "loss": 1.2937191724777222, "step": 2394 }, { "epoch": 0.7375144286263947, "grad_norm": 4.40625, "learning_rate": 3.913350506236606e-06, "loss": 1.2571167945861816, "step": 2396 }, { "epoch": 0.7381300500192381, "grad_norm": 6.625, "learning_rate": 3.9115763792472615e-06, "loss": 1.1669660806655884, "step": 2398 }, { "epoch": 0.7387456714120816, "grad_norm": 3.875, "learning_rate": 3.909801346404035e-06, "loss": 1.3945196866989136, "step": 2400 }, { "epoch": 0.739361292804925, "grad_norm": 5.71875, "learning_rate": 3.908025409470817e-06, "loss": 1.3932634592056274, "step": 2402 }, { "epoch": 0.7399769141977683, "grad_norm": 5.28125, "learning_rate": 3.906248570212395e-06, "loss": 1.3155955076217651, "step": 2404 }, { "epoch": 0.7405925355906118, "grad_norm": 7.03125, "learning_rate": 3.904470830394455e-06, "loss": 1.3457392454147339, "step": 2406 }, { "epoch": 0.7412081569834552, "grad_norm": 9.75, "learning_rate": 3.902692191783576e-06, "loss": 1.2566583156585693, "step": 2408 }, { "epoch": 0.7418237783762985, "grad_norm": 11.25, "learning_rate": 3.9009126561472325e-06, "loss": 1.4173073768615723, "step": 2410 }, { "epoch": 0.742439399769142, "grad_norm": 3.953125, "learning_rate": 3.899132225253786e-06, "loss": 1.030098557472229, "step": 2412 }, { "epoch": 0.7430550211619854, "grad_norm": 5.65625, "learning_rate": 3.897350900872494e-06, "loss": 1.1613000631332397, "step": 2414 }, { "epoch": 0.7436706425548287, "grad_norm": 8.0625, "learning_rate": 3.895568684773496e-06, "loss": 1.7477487325668335, "step": 2416 }, { "epoch": 0.7442862639476722, "grad_norm": 3.765625, "learning_rate": 3.893785578727821e-06, "loss": 1.2887831926345825, "step": 2418 }, { "epoch": 0.7449018853405156, "grad_norm": 5.34375, "learning_rate": 3.892001584507382e-06, "loss": 1.3257797956466675, "step": 2420 }, { "epoch": 0.745517506733359, "grad_norm": 2.890625, "learning_rate": 3.890216703884974e-06, "loss": 1.1927638053894043, "step": 2422 }, { "epoch": 0.7461331281262024, "grad_norm": 4.4375, "learning_rate": 3.888430938634272e-06, "loss": 1.2174712419509888, "step": 2424 }, { "epoch": 0.7467487495190458, "grad_norm": 36.75, "learning_rate": 3.886644290529831e-06, "loss": 0.8827616572380066, "step": 2426 }, { "epoch": 0.7473643709118892, "grad_norm": 6.4375, "learning_rate": 3.884856761347084e-06, "loss": 1.3602244853973389, "step": 2428 }, { "epoch": 0.7479799923047326, "grad_norm": 6.5, "learning_rate": 3.883068352862338e-06, "loss": 0.8709210157394409, "step": 2430 }, { "epoch": 0.748595613697576, "grad_norm": 7.09375, "learning_rate": 3.8812790668527755e-06, "loss": 1.2259773015975952, "step": 2432 }, { "epoch": 0.7492112350904194, "grad_norm": 6.15625, "learning_rate": 3.879488905096449e-06, "loss": 1.3588147163391113, "step": 2434 }, { "epoch": 0.7498268564832627, "grad_norm": 7.34375, "learning_rate": 3.877697869372284e-06, "loss": 1.464431643486023, "step": 2436 }, { "epoch": 0.7504424778761062, "grad_norm": 4.875, "learning_rate": 3.8759059614600705e-06, "loss": 1.271799087524414, "step": 2438 }, { "epoch": 0.7510580992689496, "grad_norm": 2.625, "learning_rate": 3.87411318314047e-06, "loss": 1.2482190132141113, "step": 2440 }, { "epoch": 0.7516737206617929, "grad_norm": 13.0, "learning_rate": 3.872319536195006e-06, "loss": 1.4509402513504028, "step": 2442 }, { "epoch": 0.7522893420546364, "grad_norm": 9.9375, "learning_rate": 3.870525022406064e-06, "loss": 1.517437219619751, "step": 2444 }, { "epoch": 0.7529049634474798, "grad_norm": 5.75, "learning_rate": 3.8687296435568945e-06, "loss": 1.2650505304336548, "step": 2446 }, { "epoch": 0.7535205848403232, "grad_norm": 14.125, "learning_rate": 3.866933401431604e-06, "loss": 1.5364501476287842, "step": 2448 }, { "epoch": 0.7541362062331666, "grad_norm": 1.8984375, "learning_rate": 3.865136297815161e-06, "loss": 1.1213115453720093, "step": 2450 }, { "epoch": 0.75475182762601, "grad_norm": 4.3125, "learning_rate": 3.863338334493384e-06, "loss": 1.0022122859954834, "step": 2452 }, { "epoch": 0.7553674490188534, "grad_norm": 15.8125, "learning_rate": 3.8615395132529536e-06, "loss": 1.7348800897598267, "step": 2454 }, { "epoch": 0.7559830704116968, "grad_norm": 3.109375, "learning_rate": 3.859739835881394e-06, "loss": 1.3278826475143433, "step": 2456 }, { "epoch": 0.7565986918045402, "grad_norm": 5.125, "learning_rate": 3.85793930416709e-06, "loss": 1.449325680732727, "step": 2458 }, { "epoch": 0.7572143131973836, "grad_norm": 7.96875, "learning_rate": 3.856137919899268e-06, "loss": 1.2966344356536865, "step": 2460 }, { "epoch": 0.757829934590227, "grad_norm": 8.75, "learning_rate": 3.854335684868004e-06, "loss": 1.4409971237182617, "step": 2462 }, { "epoch": 0.7584455559830704, "grad_norm": 7.9375, "learning_rate": 3.852532600864218e-06, "loss": 1.4250284433364868, "step": 2464 }, { "epoch": 0.7590611773759138, "grad_norm": 10.0, "learning_rate": 3.850728669679678e-06, "loss": 1.5082494020462036, "step": 2466 }, { "epoch": 0.7596767987687573, "grad_norm": 4.75, "learning_rate": 3.848923893106987e-06, "loss": 1.257767915725708, "step": 2468 }, { "epoch": 0.7602924201616006, "grad_norm": 4.46875, "learning_rate": 3.847118272939595e-06, "loss": 1.1704938411712646, "step": 2470 }, { "epoch": 0.760908041554444, "grad_norm": 9.9375, "learning_rate": 3.845311810971787e-06, "loss": 1.3709163665771484, "step": 2472 }, { "epoch": 0.7615236629472875, "grad_norm": 8.75, "learning_rate": 3.843504508998684e-06, "loss": 1.782489538192749, "step": 2474 }, { "epoch": 0.7621392843401308, "grad_norm": 5.90625, "learning_rate": 3.841696368816242e-06, "loss": 1.6315412521362305, "step": 2476 }, { "epoch": 0.7627549057329742, "grad_norm": 13.1875, "learning_rate": 3.839887392221252e-06, "loss": 1.3365678787231445, "step": 2478 }, { "epoch": 0.7633705271258177, "grad_norm": 10.375, "learning_rate": 3.838077581011332e-06, "loss": 1.4168919324874878, "step": 2480 }, { "epoch": 0.763986148518661, "grad_norm": 10.5, "learning_rate": 3.836266936984934e-06, "loss": 1.4411951303482056, "step": 2482 }, { "epoch": 0.7646017699115044, "grad_norm": 11.4375, "learning_rate": 3.834455461941335e-06, "loss": 1.7773278951644897, "step": 2484 }, { "epoch": 0.7652173913043478, "grad_norm": 4.96875, "learning_rate": 3.83264315768064e-06, "loss": 1.1491798162460327, "step": 2486 }, { "epoch": 0.7658330126971912, "grad_norm": 4.625, "learning_rate": 3.830830026003774e-06, "loss": 1.0899100303649902, "step": 2488 }, { "epoch": 0.7664486340900346, "grad_norm": 12.4375, "learning_rate": 3.829016068712486e-06, "loss": 1.6271624565124512, "step": 2490 }, { "epoch": 0.767064255482878, "grad_norm": 5.8125, "learning_rate": 3.827201287609349e-06, "loss": 1.2503747940063477, "step": 2492 }, { "epoch": 0.7676798768757215, "grad_norm": 7.40625, "learning_rate": 3.82538568449775e-06, "loss": 0.8986793160438538, "step": 2494 }, { "epoch": 0.7682954982685648, "grad_norm": 15.1875, "learning_rate": 3.823569261181894e-06, "loss": 1.471342921257019, "step": 2496 }, { "epoch": 0.7689111196614082, "grad_norm": 9.0, "learning_rate": 3.821752019466803e-06, "loss": 1.291356086730957, "step": 2498 }, { "epoch": 0.7695267410542517, "grad_norm": 5.0625, "learning_rate": 3.819933961158308e-06, "loss": 1.318943738937378, "step": 2500 }, { "epoch": 0.770142362447095, "grad_norm": 6.0, "learning_rate": 3.818115088063058e-06, "loss": 1.4840826988220215, "step": 2502 }, { "epoch": 0.7707579838399384, "grad_norm": 4.6875, "learning_rate": 3.816295401988507e-06, "loss": 1.2456152439117432, "step": 2504 }, { "epoch": 0.7713736052327819, "grad_norm": 6.28125, "learning_rate": 3.814474904742916e-06, "loss": 1.1569583415985107, "step": 2506 }, { "epoch": 0.7719892266256252, "grad_norm": 14.5625, "learning_rate": 3.812653598135355e-06, "loss": 1.612607717514038, "step": 2508 }, { "epoch": 0.7726048480184686, "grad_norm": 6.40625, "learning_rate": 3.8108314839756976e-06, "loss": 1.265571117401123, "step": 2510 }, { "epoch": 0.7732204694113121, "grad_norm": 16.125, "learning_rate": 3.809008564074619e-06, "loss": 1.9577373266220093, "step": 2512 }, { "epoch": 0.7738360908041555, "grad_norm": 5.125, "learning_rate": 3.807184840243595e-06, "loss": 1.4305379390716553, "step": 2514 }, { "epoch": 0.7744517121969988, "grad_norm": 11.875, "learning_rate": 3.8053603142949024e-06, "loss": 1.0490630865097046, "step": 2516 }, { "epoch": 0.7750673335898423, "grad_norm": 5.875, "learning_rate": 3.803534988041613e-06, "loss": 1.2258621454238892, "step": 2518 }, { "epoch": 0.7756829549826857, "grad_norm": 7.59375, "learning_rate": 3.8017088632975928e-06, "loss": 1.3509862422943115, "step": 2520 }, { "epoch": 0.776298576375529, "grad_norm": 6.90625, "learning_rate": 3.7998819418775044e-06, "loss": 1.5921310186386108, "step": 2522 }, { "epoch": 0.7769141977683724, "grad_norm": 11.1875, "learning_rate": 3.798054225596801e-06, "loss": 1.2857153415679932, "step": 2524 }, { "epoch": 0.7775298191612159, "grad_norm": 2.78125, "learning_rate": 3.7962257162717242e-06, "loss": 1.2583740949630737, "step": 2526 }, { "epoch": 0.7781454405540592, "grad_norm": 4.28125, "learning_rate": 3.7943964157193057e-06, "loss": 1.31504225730896, "step": 2528 }, { "epoch": 0.7787610619469026, "grad_norm": 11.375, "learning_rate": 3.792566325757361e-06, "loss": 1.482974886894226, "step": 2530 }, { "epoch": 0.7793766833397461, "grad_norm": 7.46875, "learning_rate": 3.790735448204492e-06, "loss": 1.0821229219436646, "step": 2532 }, { "epoch": 0.7799923047325894, "grad_norm": 5.75, "learning_rate": 3.7889037848800846e-06, "loss": 1.1284804344177246, "step": 2534 }, { "epoch": 0.7806079261254328, "grad_norm": 1.8984375, "learning_rate": 3.787071337604301e-06, "loss": 1.0225565433502197, "step": 2536 }, { "epoch": 0.7812235475182763, "grad_norm": 5.5, "learning_rate": 3.785238108198087e-06, "loss": 1.2461652755737305, "step": 2538 }, { "epoch": 0.7818391689111197, "grad_norm": 8.125, "learning_rate": 3.783404098483163e-06, "loss": 0.9364984035491943, "step": 2540 }, { "epoch": 0.782454790303963, "grad_norm": 3.984375, "learning_rate": 3.781569310282026e-06, "loss": 1.2495629787445068, "step": 2542 }, { "epoch": 0.7830704116968065, "grad_norm": 12.25, "learning_rate": 3.779733745417945e-06, "loss": 0.924501895904541, "step": 2544 }, { "epoch": 0.7836860330896499, "grad_norm": 9.4375, "learning_rate": 3.7778974057149632e-06, "loss": 1.6362602710723877, "step": 2546 }, { "epoch": 0.7843016544824932, "grad_norm": 10.125, "learning_rate": 3.776060292997893e-06, "loss": 1.1575554609298706, "step": 2548 }, { "epoch": 0.7849172758753367, "grad_norm": 7.78125, "learning_rate": 3.774222409092313e-06, "loss": 1.4146034717559814, "step": 2550 }, { "epoch": 0.7855328972681801, "grad_norm": 10.1875, "learning_rate": 3.772383755824569e-06, "loss": 1.4862112998962402, "step": 2552 }, { "epoch": 0.7861485186610234, "grad_norm": 8.625, "learning_rate": 3.770544335021774e-06, "loss": 1.2228214740753174, "step": 2554 }, { "epoch": 0.7867641400538669, "grad_norm": 10.875, "learning_rate": 3.7687041485118025e-06, "loss": 1.2453134059906006, "step": 2556 }, { "epoch": 0.7873797614467103, "grad_norm": 11.25, "learning_rate": 3.7668631981232852e-06, "loss": 1.4665039777755737, "step": 2558 }, { "epoch": 0.7879953828395537, "grad_norm": 4.8125, "learning_rate": 3.7650214856856192e-06, "loss": 1.2003722190856934, "step": 2560 }, { "epoch": 0.788611004232397, "grad_norm": 7.53125, "learning_rate": 3.7631790130289548e-06, "loss": 1.515622615814209, "step": 2562 }, { "epoch": 0.7892266256252405, "grad_norm": 8.3125, "learning_rate": 3.7613357819841968e-06, "loss": 1.1541849374771118, "step": 2564 }, { "epoch": 0.7898422470180839, "grad_norm": 6.8125, "learning_rate": 3.7594917943830065e-06, "loss": 1.406124472618103, "step": 2566 }, { "epoch": 0.7904578684109272, "grad_norm": 4.5625, "learning_rate": 3.757647052057796e-06, "loss": 1.10648775100708, "step": 2568 }, { "epoch": 0.7910734898037707, "grad_norm": 6.625, "learning_rate": 3.755801556841726e-06, "loss": 1.2991050481796265, "step": 2570 }, { "epoch": 0.7916891111966141, "grad_norm": 5.78125, "learning_rate": 3.7539553105687067e-06, "loss": 1.4035756587982178, "step": 2572 }, { "epoch": 0.7923047325894574, "grad_norm": 8.4375, "learning_rate": 3.7521083150733952e-06, "loss": 1.6464347839355469, "step": 2574 }, { "epoch": 0.7929203539823009, "grad_norm": 7.6875, "learning_rate": 3.7502605721911923e-06, "loss": 1.4995614290237427, "step": 2576 }, { "epoch": 0.7935359753751443, "grad_norm": 8.0, "learning_rate": 3.7484120837582405e-06, "loss": 1.1336731910705566, "step": 2578 }, { "epoch": 0.7941515967679876, "grad_norm": 3.90625, "learning_rate": 3.746562851611425e-06, "loss": 0.9471673369407654, "step": 2580 }, { "epoch": 0.7947672181608311, "grad_norm": 13.125, "learning_rate": 3.744712877588369e-06, "loss": 1.2849431037902832, "step": 2582 }, { "epoch": 0.7953828395536745, "grad_norm": 5.03125, "learning_rate": 3.7428621635274333e-06, "loss": 1.2788987159729004, "step": 2584 }, { "epoch": 0.7959984609465179, "grad_norm": 9.625, "learning_rate": 3.741010711267714e-06, "loss": 1.6069245338439941, "step": 2586 }, { "epoch": 0.7966140823393613, "grad_norm": 7.25, "learning_rate": 3.73915852264904e-06, "loss": 1.4633699655532837, "step": 2588 }, { "epoch": 0.7972297037322047, "grad_norm": 6.625, "learning_rate": 3.7373055995119745e-06, "loss": 1.132173776626587, "step": 2590 }, { "epoch": 0.7978453251250481, "grad_norm": 6.59375, "learning_rate": 3.7354519436978056e-06, "loss": 1.3197592496871948, "step": 2592 }, { "epoch": 0.7984609465178915, "grad_norm": 20.5, "learning_rate": 3.733597557048555e-06, "loss": 1.333539605140686, "step": 2594 }, { "epoch": 0.7990765679107349, "grad_norm": 5.75, "learning_rate": 3.731742441406969e-06, "loss": 1.344053864479065, "step": 2596 }, { "epoch": 0.7996921893035783, "grad_norm": 6.6875, "learning_rate": 3.7298865986165154e-06, "loss": 1.3256186246871948, "step": 2598 }, { "epoch": 0.8003078106964217, "grad_norm": 10.75, "learning_rate": 3.728030030521387e-06, "loss": 1.4850950241088867, "step": 2600 }, { "epoch": 0.8009234320892651, "grad_norm": 4.5, "learning_rate": 3.7261727389664993e-06, "loss": 1.096773386001587, "step": 2602 }, { "epoch": 0.8015390534821085, "grad_norm": 1.96875, "learning_rate": 3.7243147257974832e-06, "loss": 1.0473754405975342, "step": 2604 }, { "epoch": 0.802154674874952, "grad_norm": 7.875, "learning_rate": 3.7224559928606862e-06, "loss": 1.1904107332229614, "step": 2606 }, { "epoch": 0.8027702962677953, "grad_norm": 9.25, "learning_rate": 3.7205965420031763e-06, "loss": 1.3250120878219604, "step": 2608 }, { "epoch": 0.8033859176606387, "grad_norm": 5.96875, "learning_rate": 3.718736375072729e-06, "loss": 1.4196326732635498, "step": 2610 }, { "epoch": 0.8040015390534822, "grad_norm": 9.75, "learning_rate": 3.716875493917834e-06, "loss": 0.9277944564819336, "step": 2612 }, { "epoch": 0.8046171604463255, "grad_norm": 8.625, "learning_rate": 3.715013900387693e-06, "loss": 1.0608748197555542, "step": 2614 }, { "epoch": 0.8052327818391689, "grad_norm": 5.875, "learning_rate": 3.7131515963322106e-06, "loss": 1.126943826675415, "step": 2616 }, { "epoch": 0.8058484032320123, "grad_norm": 15.875, "learning_rate": 3.7112885836020017e-06, "loss": 1.363018274307251, "step": 2618 }, { "epoch": 0.8064640246248557, "grad_norm": 5.21875, "learning_rate": 3.7094248640483834e-06, "loss": 1.227006196975708, "step": 2620 }, { "epoch": 0.8070796460176991, "grad_norm": 8.1875, "learning_rate": 3.7075604395233745e-06, "loss": 1.02199125289917, "step": 2622 }, { "epoch": 0.8076952674105425, "grad_norm": 1.9375, "learning_rate": 3.705695311879696e-06, "loss": 1.1358833312988281, "step": 2624 }, { "epoch": 0.8083108888033859, "grad_norm": 6.875, "learning_rate": 3.7038294829707675e-06, "loss": 1.2867252826690674, "step": 2626 }, { "epoch": 0.8089265101962293, "grad_norm": 8.5, "learning_rate": 3.701962954650705e-06, "loss": 1.364060401916504, "step": 2628 }, { "epoch": 0.8095421315890727, "grad_norm": 4.78125, "learning_rate": 3.70009572877432e-06, "loss": 0.9407942295074463, "step": 2630 }, { "epoch": 0.8101577529819162, "grad_norm": 2.4375, "learning_rate": 3.6982278071971158e-06, "loss": 1.1905866861343384, "step": 2632 }, { "epoch": 0.8107733743747595, "grad_norm": 9.0, "learning_rate": 3.696359191775288e-06, "loss": 1.0922561883926392, "step": 2634 }, { "epoch": 0.8113889957676029, "grad_norm": 7.03125, "learning_rate": 3.694489884365722e-06, "loss": 1.2597466707229614, "step": 2636 }, { "epoch": 0.8120046171604464, "grad_norm": 9.875, "learning_rate": 3.69261988682599e-06, "loss": 1.451778769493103, "step": 2638 }, { "epoch": 0.8126202385532897, "grad_norm": 11.0, "learning_rate": 3.690749201014352e-06, "loss": 1.3754864931106567, "step": 2640 }, { "epoch": 0.8132358599461331, "grad_norm": 9.5, "learning_rate": 3.6888778287897477e-06, "loss": 1.460006833076477, "step": 2642 }, { "epoch": 0.8138514813389766, "grad_norm": 7.78125, "learning_rate": 3.6870057720118036e-06, "loss": 1.3695088624954224, "step": 2644 }, { "epoch": 0.8144671027318199, "grad_norm": 4.625, "learning_rate": 3.6851330325408242e-06, "loss": 1.2939337491989136, "step": 2646 }, { "epoch": 0.8150827241246633, "grad_norm": 3.6875, "learning_rate": 3.6832596122377926e-06, "loss": 1.1832215785980225, "step": 2648 }, { "epoch": 0.8156983455175068, "grad_norm": 4.84375, "learning_rate": 3.681385512964368e-06, "loss": 1.4975953102111816, "step": 2650 }, { "epoch": 0.8163139669103502, "grad_norm": 5.6875, "learning_rate": 3.6795107365828862e-06, "loss": 1.2492905855178833, "step": 2652 }, { "epoch": 0.8169295883031935, "grad_norm": 49.75, "learning_rate": 3.6776352849563534e-06, "loss": 1.4812604188919067, "step": 2654 }, { "epoch": 0.817545209696037, "grad_norm": 9.875, "learning_rate": 3.6757591599484476e-06, "loss": 1.4885751008987427, "step": 2656 }, { "epoch": 0.8181608310888804, "grad_norm": 6.09375, "learning_rate": 3.6738823634235175e-06, "loss": 1.0009148120880127, "step": 2658 }, { "epoch": 0.8187764524817237, "grad_norm": 5.1875, "learning_rate": 3.6720048972465773e-06, "loss": 1.2781193256378174, "step": 2660 }, { "epoch": 0.8193920738745671, "grad_norm": 7.875, "learning_rate": 3.670126763283307e-06, "loss": 1.4532701969146729, "step": 2662 }, { "epoch": 0.8200076952674106, "grad_norm": 4.9375, "learning_rate": 3.66824796340005e-06, "loss": 1.2384744882583618, "step": 2664 }, { "epoch": 0.8206233166602539, "grad_norm": 7.75, "learning_rate": 3.666368499463814e-06, "loss": 1.1035149097442627, "step": 2666 }, { "epoch": 0.8212389380530973, "grad_norm": 9.6875, "learning_rate": 3.664488373342261e-06, "loss": 1.0949159860610962, "step": 2668 }, { "epoch": 0.8218545594459408, "grad_norm": 7.96875, "learning_rate": 3.662607586903717e-06, "loss": 1.260567545890808, "step": 2670 }, { "epoch": 0.8224701808387841, "grad_norm": 5.65625, "learning_rate": 3.6607261420171614e-06, "loss": 1.4338582754135132, "step": 2672 }, { "epoch": 0.8230858022316275, "grad_norm": 7.21875, "learning_rate": 3.6588440405522275e-06, "loss": 1.106650710105896, "step": 2674 }, { "epoch": 0.823701423624471, "grad_norm": 7.9375, "learning_rate": 3.6569612843792015e-06, "loss": 1.488775610923767, "step": 2676 }, { "epoch": 0.8243170450173144, "grad_norm": 4.8125, "learning_rate": 3.655077875369022e-06, "loss": 1.230544090270996, "step": 2678 }, { "epoch": 0.8249326664101577, "grad_norm": 4.625, "learning_rate": 3.653193815393272e-06, "loss": 1.0270192623138428, "step": 2680 }, { "epoch": 0.8255482878030012, "grad_norm": 8.5, "learning_rate": 3.6513091063241878e-06, "loss": 1.27272367477417, "step": 2682 }, { "epoch": 0.8261639091958446, "grad_norm": 10.6875, "learning_rate": 3.649423750034643e-06, "loss": 1.2475188970565796, "step": 2684 }, { "epoch": 0.8267795305886879, "grad_norm": 6.71875, "learning_rate": 3.647537748398162e-06, "loss": 1.383508563041687, "step": 2686 }, { "epoch": 0.8273951519815314, "grad_norm": 2.59375, "learning_rate": 3.645651103288904e-06, "loss": 1.3146872520446777, "step": 2688 }, { "epoch": 0.8280107733743748, "grad_norm": 8.375, "learning_rate": 3.6437638165816725e-06, "loss": 1.4075785875320435, "step": 2690 }, { "epoch": 0.8286263947672181, "grad_norm": 4.8125, "learning_rate": 3.641875890151906e-06, "loss": 1.0752105712890625, "step": 2692 }, { "epoch": 0.8292420161600615, "grad_norm": 14.6875, "learning_rate": 3.63998732587568e-06, "loss": 1.566037654876709, "step": 2694 }, { "epoch": 0.829857637552905, "grad_norm": 9.5, "learning_rate": 3.638098125629701e-06, "loss": 1.4545506238937378, "step": 2696 }, { "epoch": 0.8304732589457484, "grad_norm": 9.4375, "learning_rate": 3.636208291291312e-06, "loss": 1.1511316299438477, "step": 2698 }, { "epoch": 0.8310888803385917, "grad_norm": 9.125, "learning_rate": 3.6343178247384827e-06, "loss": 1.2352555990219116, "step": 2700 }, { "epoch": 0.8317045017314352, "grad_norm": 4.71875, "learning_rate": 3.6324267278498114e-06, "loss": 1.4677183628082275, "step": 2702 }, { "epoch": 0.8323201231242786, "grad_norm": 1.609375, "learning_rate": 3.630535002504526e-06, "loss": 1.2390086650848389, "step": 2704 }, { "epoch": 0.8329357445171219, "grad_norm": 36.25, "learning_rate": 3.6286426505824734e-06, "loss": 1.2680785655975342, "step": 2706 }, { "epoch": 0.8335513659099654, "grad_norm": 15.75, "learning_rate": 3.6267496739641272e-06, "loss": 1.656466007232666, "step": 2708 }, { "epoch": 0.8341669873028088, "grad_norm": 14.125, "learning_rate": 3.6248560745305818e-06, "loss": 1.5022876262664795, "step": 2710 }, { "epoch": 0.8347826086956521, "grad_norm": 8.75, "learning_rate": 3.622961854163548e-06, "loss": 1.4779717922210693, "step": 2712 }, { "epoch": 0.8353982300884956, "grad_norm": 2.125, "learning_rate": 3.6210670147453555e-06, "loss": 0.9939274191856384, "step": 2714 }, { "epoch": 0.836013851481339, "grad_norm": 14.75, "learning_rate": 3.6191715581589482e-06, "loss": 1.1958377361297607, "step": 2716 }, { "epoch": 0.8366294728741823, "grad_norm": 8.9375, "learning_rate": 3.6172754862878844e-06, "loss": 1.7432280778884888, "step": 2718 }, { "epoch": 0.8372450942670258, "grad_norm": 7.53125, "learning_rate": 3.6153788010163336e-06, "loss": 1.4366165399551392, "step": 2720 }, { "epoch": 0.8378607156598692, "grad_norm": 6.25, "learning_rate": 3.6134815042290737e-06, "loss": 1.3425190448760986, "step": 2722 }, { "epoch": 0.8384763370527126, "grad_norm": 23.875, "learning_rate": 3.611583597811491e-06, "loss": 1.2663666009902954, "step": 2724 }, { "epoch": 0.839091958445556, "grad_norm": 4.0625, "learning_rate": 3.609685083649579e-06, "loss": 1.2251818180084229, "step": 2726 }, { "epoch": 0.8397075798383994, "grad_norm": 11.125, "learning_rate": 3.6077859636299316e-06, "loss": 1.2505325078964233, "step": 2728 }, { "epoch": 0.8403232012312428, "grad_norm": 3.578125, "learning_rate": 3.60588623963975e-06, "loss": 1.2615528106689453, "step": 2730 }, { "epoch": 0.8409388226240861, "grad_norm": 4.78125, "learning_rate": 3.6039859135668287e-06, "loss": 1.3801602125167847, "step": 2732 }, { "epoch": 0.8415544440169296, "grad_norm": 12.125, "learning_rate": 3.602084987299567e-06, "loss": 1.6640632152557373, "step": 2734 }, { "epoch": 0.842170065409773, "grad_norm": 9.0, "learning_rate": 3.6001834627269573e-06, "loss": 1.6788702011108398, "step": 2736 }, { "epoch": 0.8427856868026163, "grad_norm": 8.125, "learning_rate": 3.5982813417385876e-06, "loss": 1.5393545627593994, "step": 2738 }, { "epoch": 0.8434013081954598, "grad_norm": 2.375, "learning_rate": 3.596378626224636e-06, "loss": 1.3888500928878784, "step": 2740 }, { "epoch": 0.8440169295883032, "grad_norm": 6.71875, "learning_rate": 3.594475318075876e-06, "loss": 1.542112112045288, "step": 2742 }, { "epoch": 0.8446325509811466, "grad_norm": 5.65625, "learning_rate": 3.592571419183667e-06, "loss": 1.5727214813232422, "step": 2744 }, { "epoch": 0.84524817237399, "grad_norm": 8.3125, "learning_rate": 3.5906669314399555e-06, "loss": 1.481203556060791, "step": 2746 }, { "epoch": 0.8458637937668334, "grad_norm": 11.625, "learning_rate": 3.5887618567372752e-06, "loss": 1.3226168155670166, "step": 2748 }, { "epoch": 0.8464794151596768, "grad_norm": 15.6875, "learning_rate": 3.5868561969687387e-06, "loss": 1.7433969974517822, "step": 2750 }, { "epoch": 0.8470950365525202, "grad_norm": 5.0, "learning_rate": 3.584949954028045e-06, "loss": 1.3892580270767212, "step": 2752 }, { "epoch": 0.8477106579453636, "grad_norm": 6.5625, "learning_rate": 3.583043129809469e-06, "loss": 1.2613940238952637, "step": 2754 }, { "epoch": 0.848326279338207, "grad_norm": 2.4375, "learning_rate": 3.581135726207867e-06, "loss": 0.9564993381500244, "step": 2756 }, { "epoch": 0.8489419007310504, "grad_norm": 7.90625, "learning_rate": 3.5792277451186665e-06, "loss": 1.3259934186935425, "step": 2758 }, { "epoch": 0.8495575221238938, "grad_norm": 6.03125, "learning_rate": 3.577319188437872e-06, "loss": 1.374593734741211, "step": 2760 }, { "epoch": 0.8501731435167372, "grad_norm": 7.625, "learning_rate": 3.5754100580620587e-06, "loss": 1.2887630462646484, "step": 2762 }, { "epoch": 0.8507887649095806, "grad_norm": 6.25, "learning_rate": 3.573500355888372e-06, "loss": 1.3124195337295532, "step": 2764 }, { "epoch": 0.851404386302424, "grad_norm": 2.125, "learning_rate": 3.5715900838145267e-06, "loss": 1.0512268543243408, "step": 2766 }, { "epoch": 0.8520200076952674, "grad_norm": 7.53125, "learning_rate": 3.569679243738803e-06, "loss": 1.4116840362548828, "step": 2768 }, { "epoch": 0.8526356290881109, "grad_norm": 6.15625, "learning_rate": 3.5677678375600443e-06, "loss": 1.2163076400756836, "step": 2770 }, { "epoch": 0.8532512504809542, "grad_norm": 15.0, "learning_rate": 3.5658558671776577e-06, "loss": 1.4438374042510986, "step": 2772 }, { "epoch": 0.8538668718737976, "grad_norm": 9.1875, "learning_rate": 3.5639433344916117e-06, "loss": 1.4797570705413818, "step": 2774 }, { "epoch": 0.854482493266641, "grad_norm": 10.375, "learning_rate": 3.5620302414024345e-06, "loss": 1.3446606397628784, "step": 2776 }, { "epoch": 0.8550981146594844, "grad_norm": 3.296875, "learning_rate": 3.560116589811207e-06, "loss": 1.3919823169708252, "step": 2778 }, { "epoch": 0.8557137360523278, "grad_norm": 56.25, "learning_rate": 3.5582023816195687e-06, "loss": 1.4049403667449951, "step": 2780 }, { "epoch": 0.8563293574451712, "grad_norm": 6.4375, "learning_rate": 3.5562876187297125e-06, "loss": 1.478960394859314, "step": 2782 }, { "epoch": 0.8569449788380146, "grad_norm": 1.4453125, "learning_rate": 3.554372303044379e-06, "loss": 1.004663109779358, "step": 2784 }, { "epoch": 0.857560600230858, "grad_norm": 35.5, "learning_rate": 3.5524564364668602e-06, "loss": 1.060371994972229, "step": 2786 }, { "epoch": 0.8581762216237014, "grad_norm": 4.0625, "learning_rate": 3.550540020900998e-06, "loss": 1.1833809614181519, "step": 2788 }, { "epoch": 0.8587918430165449, "grad_norm": 7.5625, "learning_rate": 3.548623058251176e-06, "loss": 1.2524820566177368, "step": 2790 }, { "epoch": 0.8594074644093882, "grad_norm": 5.90625, "learning_rate": 3.5467055504223225e-06, "loss": 0.8956518173217773, "step": 2792 }, { "epoch": 0.8600230858022316, "grad_norm": 6.125, "learning_rate": 3.5447874993199095e-06, "loss": 1.0373610258102417, "step": 2794 }, { "epoch": 0.8606387071950751, "grad_norm": 10.3125, "learning_rate": 3.542868906849947e-06, "loss": 1.2199324369430542, "step": 2796 }, { "epoch": 0.8612543285879184, "grad_norm": 4.4375, "learning_rate": 3.5409497749189814e-06, "loss": 1.2490025758743286, "step": 2798 }, { "epoch": 0.8618699499807618, "grad_norm": 6.96875, "learning_rate": 3.539030105434099e-06, "loss": 1.2936158180236816, "step": 2800 }, { "epoch": 0.8624855713736053, "grad_norm": 6.1875, "learning_rate": 3.5371099003029184e-06, "loss": 0.8811333179473877, "step": 2802 }, { "epoch": 0.8631011927664486, "grad_norm": 7.3125, "learning_rate": 3.535189161433591e-06, "loss": 1.2990635633468628, "step": 2804 }, { "epoch": 0.863716814159292, "grad_norm": 7.65625, "learning_rate": 3.5332678907347963e-06, "loss": 1.4486134052276611, "step": 2806 }, { "epoch": 0.8643324355521355, "grad_norm": 8.5, "learning_rate": 3.531346090115745e-06, "loss": 1.3083335161209106, "step": 2808 }, { "epoch": 0.8649480569449788, "grad_norm": 2.359375, "learning_rate": 3.5294237614861738e-06, "loss": 1.341057300567627, "step": 2810 }, { "epoch": 0.8655636783378222, "grad_norm": 5.3125, "learning_rate": 3.5275009067563413e-06, "loss": 1.33603835105896, "step": 2812 }, { "epoch": 0.8661792997306657, "grad_norm": 4.78125, "learning_rate": 3.5255775278370363e-06, "loss": 1.279923677444458, "step": 2814 }, { "epoch": 0.8667949211235091, "grad_norm": 5.125, "learning_rate": 3.5236536266395594e-06, "loss": 1.374291181564331, "step": 2816 }, { "epoch": 0.8674105425163524, "grad_norm": 6.3125, "learning_rate": 3.521729205075736e-06, "loss": 1.5705500841140747, "step": 2818 }, { "epoch": 0.8680261639091958, "grad_norm": 14.6875, "learning_rate": 3.5198042650579043e-06, "loss": 1.232345700263977, "step": 2820 }, { "epoch": 0.8686417853020393, "grad_norm": 3.59375, "learning_rate": 3.5178788084989244e-06, "loss": 1.432027816772461, "step": 2822 }, { "epoch": 0.8692574066948826, "grad_norm": 4.46875, "learning_rate": 3.5159528373121645e-06, "loss": 1.2847026586532593, "step": 2824 }, { "epoch": 0.869873028087726, "grad_norm": 3.46875, "learning_rate": 3.5140263534115038e-06, "loss": 1.088910460472107, "step": 2826 }, { "epoch": 0.8704886494805695, "grad_norm": 3.5, "learning_rate": 3.512099358711333e-06, "loss": 1.2364428043365479, "step": 2828 }, { "epoch": 0.8711042708734128, "grad_norm": 9.4375, "learning_rate": 3.5101718551265505e-06, "loss": 1.2363696098327637, "step": 2830 }, { "epoch": 0.8717198922662562, "grad_norm": 4.78125, "learning_rate": 3.50824384457256e-06, "loss": 1.062808871269226, "step": 2832 }, { "epoch": 0.8723355136590997, "grad_norm": 11.625, "learning_rate": 3.5063153289652685e-06, "loss": 1.218300700187683, "step": 2834 }, { "epoch": 0.8729511350519431, "grad_norm": 8.5, "learning_rate": 3.5043863102210853e-06, "loss": 1.5316327810287476, "step": 2836 }, { "epoch": 0.8735667564447864, "grad_norm": 44.5, "learning_rate": 3.5024567902569196e-06, "loss": 1.658814549446106, "step": 2838 }, { "epoch": 0.8741823778376299, "grad_norm": 24.375, "learning_rate": 3.500526770990177e-06, "loss": 1.4052304029464722, "step": 2840 }, { "epoch": 0.8747979992304733, "grad_norm": 17.75, "learning_rate": 3.4985962543387632e-06, "loss": 1.5683777332305908, "step": 2842 }, { "epoch": 0.8754136206233166, "grad_norm": 8.4375, "learning_rate": 3.4966652422210746e-06, "loss": 1.5750112533569336, "step": 2844 }, { "epoch": 0.8760292420161601, "grad_norm": 11.4375, "learning_rate": 3.4947337365560023e-06, "loss": 1.8130897283554077, "step": 2846 }, { "epoch": 0.8766448634090035, "grad_norm": 7.5, "learning_rate": 3.4928017392629265e-06, "loss": 1.3907134532928467, "step": 2848 }, { "epoch": 0.8772604848018468, "grad_norm": 7.03125, "learning_rate": 3.4908692522617147e-06, "loss": 1.6184890270233154, "step": 2850 }, { "epoch": 0.8778761061946903, "grad_norm": 8.3125, "learning_rate": 3.4889362774727244e-06, "loss": 1.3463777303695679, "step": 2852 }, { "epoch": 0.8784917275875337, "grad_norm": 6.59375, "learning_rate": 3.487002816816796e-06, "loss": 1.1038285493850708, "step": 2854 }, { "epoch": 0.8791073489803771, "grad_norm": 4.59375, "learning_rate": 3.4850688722152498e-06, "loss": 1.0783321857452393, "step": 2856 }, { "epoch": 0.8797229703732204, "grad_norm": 6.46875, "learning_rate": 3.4831344455898937e-06, "loss": 1.2051217555999756, "step": 2858 }, { "epoch": 0.8803385917660639, "grad_norm": 8.875, "learning_rate": 3.4811995388630093e-06, "loss": 1.3849332332611084, "step": 2860 }, { "epoch": 0.8809542131589073, "grad_norm": 11.1875, "learning_rate": 3.4792641539573558e-06, "loss": 1.4876724481582642, "step": 2862 }, { "epoch": 0.8815698345517506, "grad_norm": 15.375, "learning_rate": 3.4773282927961693e-06, "loss": 1.4236429929733276, "step": 2864 }, { "epoch": 0.8821854559445941, "grad_norm": 17.125, "learning_rate": 3.475391957303159e-06, "loss": 0.7241360545158386, "step": 2866 }, { "epoch": 0.8828010773374375, "grad_norm": 31.375, "learning_rate": 3.4734551494025047e-06, "loss": 1.4327094554901123, "step": 2868 }, { "epoch": 0.8834166987302808, "grad_norm": 7.46875, "learning_rate": 3.4715178710188552e-06, "loss": 1.6214407682418823, "step": 2870 }, { "epoch": 0.8840323201231243, "grad_norm": 11.875, "learning_rate": 3.469580124077328e-06, "loss": 1.151062250137329, "step": 2872 }, { "epoch": 0.8846479415159677, "grad_norm": 4.6875, "learning_rate": 3.4676419105035054e-06, "loss": 1.0868035554885864, "step": 2874 }, { "epoch": 0.885263562908811, "grad_norm": 9.3125, "learning_rate": 3.465703232223433e-06, "loss": 1.3283675909042358, "step": 2876 }, { "epoch": 0.8858791843016545, "grad_norm": 7.28125, "learning_rate": 3.4637640911636206e-06, "loss": 1.2539609670639038, "step": 2878 }, { "epoch": 0.8864948056944979, "grad_norm": 3.40625, "learning_rate": 3.4618244892510346e-06, "loss": 1.2640516757965088, "step": 2880 }, { "epoch": 0.8871104270873413, "grad_norm": 5.4375, "learning_rate": 3.4598844284131017e-06, "loss": 1.2733051776885986, "step": 2882 }, { "epoch": 0.8877260484801847, "grad_norm": 9.4375, "learning_rate": 3.4579439105777034e-06, "loss": 1.0956273078918457, "step": 2884 }, { "epoch": 0.8883416698730281, "grad_norm": 5.09375, "learning_rate": 3.4560029376731765e-06, "loss": 1.4090516567230225, "step": 2886 }, { "epoch": 0.8889572912658715, "grad_norm": 7.125, "learning_rate": 3.454061511628308e-06, "loss": 1.0756299495697021, "step": 2888 }, { "epoch": 0.8895729126587149, "grad_norm": 13.8125, "learning_rate": 3.4521196343723377e-06, "loss": 0.6679868102073669, "step": 2890 }, { "epoch": 0.8901885340515583, "grad_norm": 2.765625, "learning_rate": 3.450177307834952e-06, "loss": 1.3705148696899414, "step": 2892 }, { "epoch": 0.8908041554444017, "grad_norm": 5.03125, "learning_rate": 3.448234533946284e-06, "loss": 1.2735440731048584, "step": 2894 }, { "epoch": 0.891419776837245, "grad_norm": 12.1875, "learning_rate": 3.4462913146369124e-06, "loss": 1.1582732200622559, "step": 2896 }, { "epoch": 0.8920353982300885, "grad_norm": 28.25, "learning_rate": 3.4443476518378583e-06, "loss": 1.429880976676941, "step": 2898 }, { "epoch": 0.8926510196229319, "grad_norm": 6.96875, "learning_rate": 3.4424035474805808e-06, "loss": 1.5774238109588623, "step": 2900 }, { "epoch": 0.8932666410157754, "grad_norm": 5.25, "learning_rate": 3.440459003496982e-06, "loss": 1.1342121362686157, "step": 2902 }, { "epoch": 0.8938822624086187, "grad_norm": 7.28125, "learning_rate": 3.4385140218193978e-06, "loss": 1.3799785375595093, "step": 2904 }, { "epoch": 0.8944978838014621, "grad_norm": 6.46875, "learning_rate": 3.4365686043806014e-06, "loss": 1.3847746849060059, "step": 2906 }, { "epoch": 0.8951135051943055, "grad_norm": 9.0625, "learning_rate": 3.4346227531137954e-06, "loss": 1.256567120552063, "step": 2908 }, { "epoch": 0.8957291265871489, "grad_norm": 21.25, "learning_rate": 3.4326764699526184e-06, "loss": 1.6475443840026855, "step": 2910 }, { "epoch": 0.8963447479799923, "grad_norm": 7.9375, "learning_rate": 3.4307297568311337e-06, "loss": 1.379146933555603, "step": 2912 }, { "epoch": 0.8969603693728357, "grad_norm": 8.625, "learning_rate": 3.428782615683835e-06, "loss": 1.2929694652557373, "step": 2914 }, { "epoch": 0.8975759907656791, "grad_norm": 13.25, "learning_rate": 3.4268350484456385e-06, "loss": 1.4918608665466309, "step": 2916 }, { "epoch": 0.8981916121585225, "grad_norm": 2.421875, "learning_rate": 3.4248870570518875e-06, "loss": 1.288706660270691, "step": 2918 }, { "epoch": 0.8988072335513659, "grad_norm": 5.1875, "learning_rate": 3.4229386434383438e-06, "loss": 1.5433827638626099, "step": 2920 }, { "epoch": 0.8994228549442093, "grad_norm": 5.03125, "learning_rate": 3.4209898095411894e-06, "loss": 1.0797992944717407, "step": 2922 }, { "epoch": 0.9000384763370527, "grad_norm": 26.875, "learning_rate": 3.4190405572970242e-06, "loss": 1.0770173072814941, "step": 2924 }, { "epoch": 0.9006540977298961, "grad_norm": 6.375, "learning_rate": 3.4170908886428644e-06, "loss": 1.7204927206039429, "step": 2926 }, { "epoch": 0.9012697191227396, "grad_norm": 5.09375, "learning_rate": 3.4151408055161385e-06, "loss": 1.4469215869903564, "step": 2928 }, { "epoch": 0.9018853405155829, "grad_norm": 4.46875, "learning_rate": 3.413190309854688e-06, "loss": 1.4087347984313965, "step": 2930 }, { "epoch": 0.9025009619084263, "grad_norm": 13.125, "learning_rate": 3.4112394035967656e-06, "loss": 1.660951852798462, "step": 2932 }, { "epoch": 0.9031165833012698, "grad_norm": 2.671875, "learning_rate": 3.40928808868103e-06, "loss": 1.0870805978775024, "step": 2934 }, { "epoch": 0.9037322046941131, "grad_norm": 4.375, "learning_rate": 3.407336367046545e-06, "loss": 1.313590168952942, "step": 2936 }, { "epoch": 0.9043478260869565, "grad_norm": 5.8125, "learning_rate": 3.405384240632782e-06, "loss": 1.594503402709961, "step": 2938 }, { "epoch": 0.9049634474798, "grad_norm": 4.71875, "learning_rate": 3.4034317113796125e-06, "loss": 1.244261384010315, "step": 2940 }, { "epoch": 0.9055790688726433, "grad_norm": 17.375, "learning_rate": 3.4014787812273063e-06, "loss": 1.5862045288085938, "step": 2942 }, { "epoch": 0.9061946902654867, "grad_norm": 12.125, "learning_rate": 3.3995254521165376e-06, "loss": 1.4014017581939697, "step": 2944 }, { "epoch": 0.9068103116583301, "grad_norm": 6.375, "learning_rate": 3.397571725988371e-06, "loss": 1.2643262147903442, "step": 2946 }, { "epoch": 0.9074259330511736, "grad_norm": 9.9375, "learning_rate": 3.3956176047842683e-06, "loss": 1.7110437154769897, "step": 2948 }, { "epoch": 0.9080415544440169, "grad_norm": 7.21875, "learning_rate": 3.393663090446083e-06, "loss": 1.3050544261932373, "step": 2950 }, { "epoch": 0.9086571758368603, "grad_norm": 20.375, "learning_rate": 3.391708184916061e-06, "loss": 0.8747286796569824, "step": 2952 }, { "epoch": 0.9092727972297038, "grad_norm": 5.9375, "learning_rate": 3.389752890136835e-06, "loss": 0.7631102800369263, "step": 2954 }, { "epoch": 0.9098884186225471, "grad_norm": 22.0, "learning_rate": 3.3877972080514255e-06, "loss": 1.5913681983947754, "step": 2956 }, { "epoch": 0.9105040400153905, "grad_norm": 4.78125, "learning_rate": 3.385841140603238e-06, "loss": 1.36069655418396, "step": 2958 }, { "epoch": 0.911119661408234, "grad_norm": 5.53125, "learning_rate": 3.3838846897360595e-06, "loss": 1.1376392841339111, "step": 2960 }, { "epoch": 0.9117352828010773, "grad_norm": 12.75, "learning_rate": 3.3819278573940595e-06, "loss": 1.6261056661605835, "step": 2962 }, { "epoch": 0.9123509041939207, "grad_norm": 6.9375, "learning_rate": 3.3799706455217875e-06, "loss": 1.4007984399795532, "step": 2964 }, { "epoch": 0.9129665255867642, "grad_norm": 4.28125, "learning_rate": 3.3780130560641666e-06, "loss": 1.3082530498504639, "step": 2966 }, { "epoch": 0.9135821469796075, "grad_norm": 11.0625, "learning_rate": 3.376055090966499e-06, "loss": 1.2865114212036133, "step": 2968 }, { "epoch": 0.9141977683724509, "grad_norm": 16.625, "learning_rate": 3.3740967521744584e-06, "loss": 1.3429794311523438, "step": 2970 }, { "epoch": 0.9148133897652944, "grad_norm": 5.90625, "learning_rate": 3.372138041634088e-06, "loss": 1.3486827611923218, "step": 2972 }, { "epoch": 0.9154290111581378, "grad_norm": 27.5, "learning_rate": 3.3701789612918047e-06, "loss": 1.6032177209854126, "step": 2974 }, { "epoch": 0.9160446325509811, "grad_norm": 5.25, "learning_rate": 3.3682195130943897e-06, "loss": 1.3208991289138794, "step": 2976 }, { "epoch": 0.9166602539438246, "grad_norm": 22.875, "learning_rate": 3.3662596989889906e-06, "loss": 1.5932797193527222, "step": 2978 }, { "epoch": 0.917275875336668, "grad_norm": 2.9375, "learning_rate": 3.364299520923118e-06, "loss": 1.0427111387252808, "step": 2980 }, { "epoch": 0.9178914967295113, "grad_norm": 3.578125, "learning_rate": 3.3623389808446468e-06, "loss": 1.0028395652770996, "step": 2982 }, { "epoch": 0.9185071181223547, "grad_norm": 7.75, "learning_rate": 3.360378080701807e-06, "loss": 1.156231164932251, "step": 2984 }, { "epoch": 0.9191227395151982, "grad_norm": 1.9140625, "learning_rate": 3.3584168224431902e-06, "loss": 1.1786229610443115, "step": 2986 }, { "epoch": 0.9197383609080415, "grad_norm": 5.9375, "learning_rate": 3.3564552080177438e-06, "loss": 1.3133583068847656, "step": 2988 }, { "epoch": 0.9203539823008849, "grad_norm": 5.53125, "learning_rate": 3.354493239374766e-06, "loss": 1.6019172668457031, "step": 2990 }, { "epoch": 0.9209696036937284, "grad_norm": 6.15625, "learning_rate": 3.35253091846391e-06, "loss": 1.4596740007400513, "step": 2992 }, { "epoch": 0.9215852250865718, "grad_norm": 5.65625, "learning_rate": 3.350568247235178e-06, "loss": 0.813117265701294, "step": 2994 }, { "epoch": 0.9222008464794151, "grad_norm": 9.125, "learning_rate": 3.348605227638921e-06, "loss": 1.4932818412780762, "step": 2996 }, { "epoch": 0.9228164678722586, "grad_norm": 9.625, "learning_rate": 3.3466418616258345e-06, "loss": 1.745734453201294, "step": 2998 }, { "epoch": 0.923432089265102, "grad_norm": 6.65625, "learning_rate": 3.3446781511469606e-06, "loss": 1.5422130823135376, "step": 3000 }, { "epoch": 0.9240477106579453, "grad_norm": 3.0, "learning_rate": 3.342714098153681e-06, "loss": 1.240844964981079, "step": 3002 }, { "epoch": 0.9246633320507888, "grad_norm": 7.0625, "learning_rate": 3.34074970459772e-06, "loss": 1.431497573852539, "step": 3004 }, { "epoch": 0.9252789534436322, "grad_norm": 3.390625, "learning_rate": 3.3387849724311383e-06, "loss": 1.3990793228149414, "step": 3006 }, { "epoch": 0.9258945748364755, "grad_norm": 6.3125, "learning_rate": 3.336819903606337e-06, "loss": 1.4542325735092163, "step": 3008 }, { "epoch": 0.926510196229319, "grad_norm": 1.6328125, "learning_rate": 3.3348545000760468e-06, "loss": 1.1801506280899048, "step": 3010 }, { "epoch": 0.9271258176221624, "grad_norm": 2.75, "learning_rate": 3.332888763793334e-06, "loss": 1.095503807067871, "step": 3012 }, { "epoch": 0.9277414390150057, "grad_norm": 6.40625, "learning_rate": 3.3309226967115936e-06, "loss": 1.2318484783172607, "step": 3014 }, { "epoch": 0.9283570604078492, "grad_norm": 3.828125, "learning_rate": 3.3289563007845525e-06, "loss": 1.251956820487976, "step": 3016 }, { "epoch": 0.9289726818006926, "grad_norm": 7.5625, "learning_rate": 3.326989577966262e-06, "loss": 1.3468469381332397, "step": 3018 }, { "epoch": 0.929588303193536, "grad_norm": 8.75, "learning_rate": 3.3250225302110973e-06, "loss": 1.019060730934143, "step": 3020 }, { "epoch": 0.9302039245863793, "grad_norm": 3.046875, "learning_rate": 3.323055159473759e-06, "loss": 1.1843026876449585, "step": 3022 }, { "epoch": 0.9308195459792228, "grad_norm": 7.59375, "learning_rate": 3.3210874677092675e-06, "loss": 1.3412996530532837, "step": 3024 }, { "epoch": 0.9314351673720662, "grad_norm": 11.1875, "learning_rate": 3.319119456872961e-06, "loss": 1.5219874382019043, "step": 3026 }, { "epoch": 0.9320507887649095, "grad_norm": 6.0625, "learning_rate": 3.3171511289204973e-06, "loss": 1.341783881187439, "step": 3028 }, { "epoch": 0.932666410157753, "grad_norm": 7.65625, "learning_rate": 3.3151824858078474e-06, "loss": 1.3534069061279297, "step": 3030 }, { "epoch": 0.9332820315505964, "grad_norm": 5.375, "learning_rate": 3.313213529491297e-06, "loss": 1.3870600461959839, "step": 3032 }, { "epoch": 0.9338976529434397, "grad_norm": 9.1875, "learning_rate": 3.311244261927441e-06, "loss": 1.5487487316131592, "step": 3034 }, { "epoch": 0.9345132743362832, "grad_norm": 6.1875, "learning_rate": 3.309274685073185e-06, "loss": 1.4522387981414795, "step": 3036 }, { "epoch": 0.9351288957291266, "grad_norm": 11.9375, "learning_rate": 3.307304800885741e-06, "loss": 1.3280080556869507, "step": 3038 }, { "epoch": 0.93574451712197, "grad_norm": 10.375, "learning_rate": 3.3053346113226287e-06, "loss": 1.4234414100646973, "step": 3040 }, { "epoch": 0.9363601385148134, "grad_norm": 5.09375, "learning_rate": 3.3033641183416686e-06, "loss": 1.592652440071106, "step": 3042 }, { "epoch": 0.9369757599076568, "grad_norm": 4.21875, "learning_rate": 3.301393323900984e-06, "loss": 1.0118399858474731, "step": 3044 }, { "epoch": 0.9375913813005002, "grad_norm": 3.59375, "learning_rate": 3.2994222299589986e-06, "loss": 1.1533851623535156, "step": 3046 }, { "epoch": 0.9382070026933436, "grad_norm": 5.1875, "learning_rate": 3.2974508384744303e-06, "loss": 1.3363091945648193, "step": 3048 }, { "epoch": 0.938822624086187, "grad_norm": 5.4375, "learning_rate": 3.295479151406296e-06, "loss": 1.2613701820373535, "step": 3050 }, { "epoch": 0.9394382454790304, "grad_norm": 12.5, "learning_rate": 3.293507170713906e-06, "loss": 1.6866613626480103, "step": 3052 }, { "epoch": 0.9400538668718738, "grad_norm": 7.875, "learning_rate": 3.2915348983568612e-06, "loss": 1.5392214059829712, "step": 3054 }, { "epoch": 0.9406694882647172, "grad_norm": 5.53125, "learning_rate": 3.2895623362950533e-06, "loss": 1.2921439409255981, "step": 3056 }, { "epoch": 0.9412851096575606, "grad_norm": 21.875, "learning_rate": 3.2875894864886604e-06, "loss": 1.4320118427276611, "step": 3058 }, { "epoch": 0.941900731050404, "grad_norm": 5.625, "learning_rate": 3.2856163508981486e-06, "loss": 1.557847023010254, "step": 3060 }, { "epoch": 0.9425163524432474, "grad_norm": 5.625, "learning_rate": 3.283642931484266e-06, "loss": 1.168736457824707, "step": 3062 }, { "epoch": 0.9431319738360908, "grad_norm": 6.46875, "learning_rate": 3.281669230208045e-06, "loss": 1.7805376052856445, "step": 3064 }, { "epoch": 0.9437475952289343, "grad_norm": 8.375, "learning_rate": 3.2796952490307953e-06, "loss": 1.4829258918762207, "step": 3066 }, { "epoch": 0.9443632166217776, "grad_norm": 4.4375, "learning_rate": 3.2777209899141084e-06, "loss": 1.155414342880249, "step": 3068 }, { "epoch": 0.944978838014621, "grad_norm": 5.90625, "learning_rate": 3.275746454819847e-06, "loss": 1.3559682369232178, "step": 3070 }, { "epoch": 0.9455944594074644, "grad_norm": 8.9375, "learning_rate": 3.273771645710153e-06, "loss": 1.1242122650146484, "step": 3072 }, { "epoch": 0.9462100808003078, "grad_norm": 3.109375, "learning_rate": 3.2717965645474382e-06, "loss": 1.1673762798309326, "step": 3074 }, { "epoch": 0.9468257021931512, "grad_norm": 4.375, "learning_rate": 3.269821213294384e-06, "loss": 1.437583088874817, "step": 3076 }, { "epoch": 0.9474413235859946, "grad_norm": 4.875, "learning_rate": 3.2678455939139418e-06, "loss": 1.2453110218048096, "step": 3078 }, { "epoch": 0.948056944978838, "grad_norm": 12.5, "learning_rate": 3.2658697083693302e-06, "loss": 1.655014991760254, "step": 3080 }, { "epoch": 0.9486725663716814, "grad_norm": 9.375, "learning_rate": 3.263893558624028e-06, "loss": 1.2827417850494385, "step": 3082 }, { "epoch": 0.9492881877645248, "grad_norm": 7.53125, "learning_rate": 3.2619171466417823e-06, "loss": 0.9656928777694702, "step": 3084 }, { "epoch": 0.9499038091573683, "grad_norm": 5.3125, "learning_rate": 3.259940474386597e-06, "loss": 1.532065987586975, "step": 3086 }, { "epoch": 0.9505194305502116, "grad_norm": 8.5, "learning_rate": 3.2579635438227354e-06, "loss": 1.1672122478485107, "step": 3088 }, { "epoch": 0.951135051943055, "grad_norm": 2.265625, "learning_rate": 3.2559863569147167e-06, "loss": 1.1641114950180054, "step": 3090 }, { "epoch": 0.9517506733358985, "grad_norm": 16.25, "learning_rate": 3.2540089156273185e-06, "loss": 1.7468979358673096, "step": 3092 }, { "epoch": 0.9523662947287418, "grad_norm": 13.6875, "learning_rate": 3.2520312219255656e-06, "loss": 1.3798656463623047, "step": 3094 }, { "epoch": 0.9529819161215852, "grad_norm": 11.0, "learning_rate": 3.250053277774739e-06, "loss": 1.3403314352035522, "step": 3096 }, { "epoch": 0.9535975375144287, "grad_norm": 5.375, "learning_rate": 3.2480750851403652e-06, "loss": 1.4658020734786987, "step": 3098 }, { "epoch": 0.954213158907272, "grad_norm": 5.84375, "learning_rate": 3.2460966459882182e-06, "loss": 1.1230003833770752, "step": 3100 }, { "epoch": 0.9548287803001154, "grad_norm": 3.9375, "learning_rate": 3.2441179622843178e-06, "loss": 1.2341731786727905, "step": 3102 }, { "epoch": 0.9554444016929589, "grad_norm": 11.375, "learning_rate": 3.242139035994926e-06, "loss": 1.2085893154144287, "step": 3104 }, { "epoch": 0.9560600230858022, "grad_norm": 3.265625, "learning_rate": 3.240159869086546e-06, "loss": 1.1262381076812744, "step": 3106 }, { "epoch": 0.9566756444786456, "grad_norm": 4.28125, "learning_rate": 3.2381804635259208e-06, "loss": 1.32058846950531, "step": 3108 }, { "epoch": 0.957291265871489, "grad_norm": 4.78125, "learning_rate": 3.236200821280031e-06, "loss": 1.0688804388046265, "step": 3110 }, { "epoch": 0.9579068872643325, "grad_norm": 21.375, "learning_rate": 3.2342209443160895e-06, "loss": 1.5210665464401245, "step": 3112 }, { "epoch": 0.9585225086571758, "grad_norm": 31.25, "learning_rate": 3.2322408346015453e-06, "loss": 1.36966073513031, "step": 3114 }, { "epoch": 0.9591381300500192, "grad_norm": 5.875, "learning_rate": 3.230260494104078e-06, "loss": 1.208855390548706, "step": 3116 }, { "epoch": 0.9597537514428627, "grad_norm": 53.0, "learning_rate": 3.2282799247915964e-06, "loss": 1.295619010925293, "step": 3118 }, { "epoch": 0.960369372835706, "grad_norm": 4.15625, "learning_rate": 3.2262991286322366e-06, "loss": 1.0792986154556274, "step": 3120 }, { "epoch": 0.9609849942285494, "grad_norm": 4.125, "learning_rate": 3.2243181075943595e-06, "loss": 1.2362124919891357, "step": 3122 }, { "epoch": 0.9616006156213929, "grad_norm": 46.75, "learning_rate": 3.2223368636465513e-06, "loss": 1.3643451929092407, "step": 3124 }, { "epoch": 0.9622162370142362, "grad_norm": 9.6875, "learning_rate": 3.2203553987576175e-06, "loss": 1.571152687072754, "step": 3126 }, { "epoch": 0.9628318584070796, "grad_norm": 28.0, "learning_rate": 3.2183737148965845e-06, "loss": 1.1599127054214478, "step": 3128 }, { "epoch": 0.9634474797999231, "grad_norm": 6.15625, "learning_rate": 3.216391814032696e-06, "loss": 0.9567220211029053, "step": 3130 }, { "epoch": 0.9640631011927665, "grad_norm": 3.0, "learning_rate": 3.2144096981354113e-06, "loss": 1.0243805646896362, "step": 3132 }, { "epoch": 0.9646787225856098, "grad_norm": 11.8125, "learning_rate": 3.2124273691744032e-06, "loss": 1.4391425848007202, "step": 3134 }, { "epoch": 0.9652943439784533, "grad_norm": 3.765625, "learning_rate": 3.2104448291195567e-06, "loss": 1.2657476663589478, "step": 3136 }, { "epoch": 0.9659099653712967, "grad_norm": 4.125, "learning_rate": 3.208462079940966e-06, "loss": 1.4858558177947998, "step": 3138 }, { "epoch": 0.96652558676414, "grad_norm": 17.625, "learning_rate": 3.2064791236089344e-06, "loss": 1.4508737325668335, "step": 3140 }, { "epoch": 0.9671412081569835, "grad_norm": 4.75, "learning_rate": 3.2044959620939685e-06, "loss": 1.6149027347564697, "step": 3142 }, { "epoch": 0.9677568295498269, "grad_norm": 7.4375, "learning_rate": 3.2025125973667815e-06, "loss": 1.2314261198043823, "step": 3144 }, { "epoch": 0.9683724509426702, "grad_norm": 5.65625, "learning_rate": 3.2005290313982864e-06, "loss": 1.1191548109054565, "step": 3146 }, { "epoch": 0.9689880723355137, "grad_norm": 15.75, "learning_rate": 3.1985452661595984e-06, "loss": 1.3416482210159302, "step": 3148 }, { "epoch": 0.9696036937283571, "grad_norm": 13.3125, "learning_rate": 3.1965613036220283e-06, "loss": 1.5457309484481812, "step": 3150 }, { "epoch": 0.9702193151212004, "grad_norm": 9.125, "learning_rate": 3.1945771457570855e-06, "loss": 1.552435040473938, "step": 3152 }, { "epoch": 0.9708349365140438, "grad_norm": 7.65625, "learning_rate": 3.192592794536471e-06, "loss": 1.7783514261245728, "step": 3154 }, { "epoch": 0.9714505579068873, "grad_norm": 6.875, "learning_rate": 3.1906082519320793e-06, "loss": 1.5031394958496094, "step": 3156 }, { "epoch": 0.9720661792997307, "grad_norm": 2.9375, "learning_rate": 3.1886235199159955e-06, "loss": 1.2081122398376465, "step": 3158 }, { "epoch": 0.972681800692574, "grad_norm": 4.3125, "learning_rate": 3.186638600460491e-06, "loss": 1.0296821594238281, "step": 3160 }, { "epoch": 0.9732974220854175, "grad_norm": 8.375, "learning_rate": 3.1846534955380257e-06, "loss": 1.1985293626785278, "step": 3162 }, { "epoch": 0.9739130434782609, "grad_norm": 7.78125, "learning_rate": 3.1826682071212435e-06, "loss": 1.211530089378357, "step": 3164 }, { "epoch": 0.9745286648711042, "grad_norm": 5.375, "learning_rate": 3.1806827371829687e-06, "loss": 1.4879037141799927, "step": 3166 }, { "epoch": 0.9751442862639477, "grad_norm": 8.125, "learning_rate": 3.1786970876962076e-06, "loss": 1.6099568605422974, "step": 3168 }, { "epoch": 0.9757599076567911, "grad_norm": 9.1875, "learning_rate": 3.1767112606341454e-06, "loss": 1.311849594116211, "step": 3170 }, { "epoch": 0.9763755290496344, "grad_norm": 6.34375, "learning_rate": 3.1747252579701415e-06, "loss": 1.1136656999588013, "step": 3172 }, { "epoch": 0.9769911504424779, "grad_norm": 15.0625, "learning_rate": 3.1727390816777326e-06, "loss": 1.661421775817871, "step": 3174 }, { "epoch": 0.9776067718353213, "grad_norm": 17.875, "learning_rate": 3.1707527337306266e-06, "loss": 1.4666205644607544, "step": 3176 }, { "epoch": 0.9782223932281647, "grad_norm": 9.625, "learning_rate": 3.168766216102701e-06, "loss": 1.4527794122695923, "step": 3178 }, { "epoch": 0.9788380146210081, "grad_norm": 11.625, "learning_rate": 3.166779530768004e-06, "loss": 1.731745719909668, "step": 3180 }, { "epoch": 0.9794536360138515, "grad_norm": 14.375, "learning_rate": 3.164792679700748e-06, "loss": 1.977335810661316, "step": 3182 }, { "epoch": 0.9800692574066949, "grad_norm": 30.125, "learning_rate": 3.1628056648753127e-06, "loss": 1.8837709426879883, "step": 3184 }, { "epoch": 0.9806848787995383, "grad_norm": 24.75, "learning_rate": 3.1608184882662386e-06, "loss": 1.3388452529907227, "step": 3186 }, { "epoch": 0.9813005001923817, "grad_norm": 11.5, "learning_rate": 3.158831151848228e-06, "loss": 1.4896000623703003, "step": 3188 }, { "epoch": 0.9819161215852251, "grad_norm": 22.875, "learning_rate": 3.1568436575961412e-06, "loss": 1.3206768035888672, "step": 3190 }, { "epoch": 0.9825317429780684, "grad_norm": 7.21875, "learning_rate": 3.1548560074849965e-06, "loss": 0.7416938543319702, "step": 3192 }, { "epoch": 0.9831473643709119, "grad_norm": 8.8125, "learning_rate": 3.152868203489965e-06, "loss": 1.5659494400024414, "step": 3194 }, { "epoch": 0.9837629857637553, "grad_norm": 3.765625, "learning_rate": 3.150880247586374e-06, "loss": 1.1568924188613892, "step": 3196 }, { "epoch": 0.9843786071565986, "grad_norm": 3.875, "learning_rate": 3.1488921417496985e-06, "loss": 1.079269528388977, "step": 3198 }, { "epoch": 0.9849942285494421, "grad_norm": 5.625, "learning_rate": 3.1469038879555647e-06, "loss": 1.287787914276123, "step": 3200 }, { "epoch": 0.9856098499422855, "grad_norm": 13.3125, "learning_rate": 3.144915488179744e-06, "loss": 1.1777808666229248, "step": 3202 }, { "epoch": 0.986225471335129, "grad_norm": 3.78125, "learning_rate": 3.1429269443981537e-06, "loss": 0.975529134273529, "step": 3204 }, { "epoch": 0.9868410927279723, "grad_norm": 10.4375, "learning_rate": 3.1409382585868553e-06, "loss": 1.20067298412323, "step": 3206 }, { "epoch": 0.9874567141208157, "grad_norm": 7.125, "learning_rate": 3.1389494327220506e-06, "loss": 1.0966123342514038, "step": 3208 }, { "epoch": 0.9880723355136591, "grad_norm": 6.4375, "learning_rate": 3.1369604687800804e-06, "loss": 1.4545329809188843, "step": 3210 }, { "epoch": 0.9886879569065025, "grad_norm": 2.875, "learning_rate": 3.1349713687374213e-06, "loss": 1.449361801147461, "step": 3212 }, { "epoch": 0.9893035782993459, "grad_norm": 2.59375, "learning_rate": 3.1329821345706877e-06, "loss": 1.2503468990325928, "step": 3214 }, { "epoch": 0.9899191996921893, "grad_norm": 8.125, "learning_rate": 3.1309927682566266e-06, "loss": 1.3865764141082764, "step": 3216 }, { "epoch": 0.9905348210850327, "grad_norm": 4.34375, "learning_rate": 3.1290032717721143e-06, "loss": 1.2038789987564087, "step": 3218 }, { "epoch": 0.9911504424778761, "grad_norm": 25.375, "learning_rate": 3.1270136470941604e-06, "loss": 1.167089819908142, "step": 3220 }, { "epoch": 0.9917660638707195, "grad_norm": 24.875, "learning_rate": 3.1250238961998972e-06, "loss": 1.6324681043624878, "step": 3222 }, { "epoch": 0.992381685263563, "grad_norm": 12.0625, "learning_rate": 3.1230340210665866e-06, "loss": 1.3044756650924683, "step": 3224 }, { "epoch": 0.9929973066564063, "grad_norm": 6.1875, "learning_rate": 3.121044023671611e-06, "loss": 1.5603355169296265, "step": 3226 }, { "epoch": 0.9936129280492497, "grad_norm": 14.375, "learning_rate": 3.1190539059924756e-06, "loss": 1.5595190525054932, "step": 3228 }, { "epoch": 0.9942285494420932, "grad_norm": 11.0, "learning_rate": 3.117063670006806e-06, "loss": 1.5781952142715454, "step": 3230 }, { "epoch": 0.9948441708349365, "grad_norm": 5.46875, "learning_rate": 3.115073317692342e-06, "loss": 1.336469292640686, "step": 3232 }, { "epoch": 0.9954597922277799, "grad_norm": 3.265625, "learning_rate": 3.113082851026944e-06, "loss": 0.8571915626525879, "step": 3234 }, { "epoch": 0.9960754136206234, "grad_norm": 4.84375, "learning_rate": 3.1110922719885817e-06, "loss": 0.9757181406021118, "step": 3236 }, { "epoch": 0.9966910350134667, "grad_norm": 6.34375, "learning_rate": 3.109101582555338e-06, "loss": 0.931984543800354, "step": 3238 }, { "epoch": 0.9973066564063101, "grad_norm": 4.03125, "learning_rate": 3.1071107847054074e-06, "loss": 0.8089004755020142, "step": 3240 }, { "epoch": 0.9979222777991535, "grad_norm": 4.6875, "learning_rate": 3.1051198804170877e-06, "loss": 1.2749488353729248, "step": 3242 }, { "epoch": 0.9985378991919969, "grad_norm": 4.34375, "learning_rate": 3.103128871668787e-06, "loss": 1.2046691179275513, "step": 3244 }, { "epoch": 0.9991535205848403, "grad_norm": 25.625, "learning_rate": 3.1011377604390147e-06, "loss": 1.3801567554473877, "step": 3246 }, { "epoch": 0.9997691419776837, "grad_norm": 54.0, "learning_rate": 3.099146548706383e-06, "loss": 1.3879741430282593, "step": 3248 }, { "epoch": 1.0003078106964216, "grad_norm": 2.9375, "learning_rate": 3.0971552384496028e-06, "loss": 1.3940231800079346, "step": 3250 }, { "epoch": 1.000923432089265, "grad_norm": 10.5625, "learning_rate": 3.0951638316474853e-06, "loss": 1.43914794921875, "step": 3252 }, { "epoch": 1.0015390534821085, "grad_norm": 6.75, "learning_rate": 3.0931723302789352e-06, "loss": 1.196732997894287, "step": 3254 }, { "epoch": 1.002154674874952, "grad_norm": 9.3125, "learning_rate": 3.0911807363229505e-06, "loss": 1.4213953018188477, "step": 3256 }, { "epoch": 1.0027702962677953, "grad_norm": 12.6875, "learning_rate": 3.0891890517586254e-06, "loss": 1.6695702075958252, "step": 3258 }, { "epoch": 1.0033859176606388, "grad_norm": 3.8125, "learning_rate": 3.0871972785651395e-06, "loss": 1.1721190214157104, "step": 3260 }, { "epoch": 1.0040015390534822, "grad_norm": 10.4375, "learning_rate": 3.085205418721764e-06, "loss": 0.7390797138214111, "step": 3262 }, { "epoch": 1.0046171604463254, "grad_norm": 8.75, "learning_rate": 3.083213474207854e-06, "loss": 1.515286922454834, "step": 3264 }, { "epoch": 1.0052327818391689, "grad_norm": 3.3125, "learning_rate": 3.081221447002849e-06, "loss": 1.2244067192077637, "step": 3266 }, { "epoch": 1.0058484032320123, "grad_norm": 24.875, "learning_rate": 3.0792293390862715e-06, "loss": 1.3070162534713745, "step": 3268 }, { "epoch": 1.0064640246248557, "grad_norm": 7.59375, "learning_rate": 3.0772371524377242e-06, "loss": 1.2169034481048584, "step": 3270 }, { "epoch": 1.0070796460176992, "grad_norm": 9.0, "learning_rate": 3.0752448890368865e-06, "loss": 1.5556094646453857, "step": 3272 }, { "epoch": 1.0076952674105426, "grad_norm": 7.9375, "learning_rate": 3.0732525508635157e-06, "loss": 1.0534472465515137, "step": 3274 }, { "epoch": 1.0083108888033858, "grad_norm": 6.09375, "learning_rate": 3.071260139897445e-06, "loss": 1.1789206266403198, "step": 3276 }, { "epoch": 1.0089265101962293, "grad_norm": 7.78125, "learning_rate": 3.069267658118574e-06, "loss": 1.4941965341567993, "step": 3278 }, { "epoch": 1.0095421315890727, "grad_norm": 28.125, "learning_rate": 3.0672751075068796e-06, "loss": 1.2050433158874512, "step": 3280 }, { "epoch": 1.0101577529819161, "grad_norm": 14.625, "learning_rate": 3.0652824900424015e-06, "loss": 1.8228259086608887, "step": 3282 }, { "epoch": 1.0107733743747596, "grad_norm": 4.4375, "learning_rate": 3.063289807705251e-06, "loss": 1.3229200839996338, "step": 3284 }, { "epoch": 1.011388995767603, "grad_norm": 6.21875, "learning_rate": 3.061297062475599e-06, "loss": 1.2523341178894043, "step": 3286 }, { "epoch": 1.0120046171604464, "grad_norm": 2.234375, "learning_rate": 3.059304256333682e-06, "loss": 1.1175116300582886, "step": 3288 }, { "epoch": 1.0126202385532896, "grad_norm": 2.5625, "learning_rate": 3.0573113912597967e-06, "loss": 1.0994659662246704, "step": 3290 }, { "epoch": 1.013235859946133, "grad_norm": 8.25, "learning_rate": 3.0553184692342967e-06, "loss": 1.1365960836410522, "step": 3292 }, { "epoch": 1.0138514813389765, "grad_norm": 6.15625, "learning_rate": 3.0533254922375942e-06, "loss": 1.5383667945861816, "step": 3294 }, { "epoch": 1.01446710273182, "grad_norm": 25.375, "learning_rate": 3.051332462250155e-06, "loss": 1.1995936632156372, "step": 3296 }, { "epoch": 1.0150827241246634, "grad_norm": 5.3125, "learning_rate": 3.049339381252497e-06, "loss": 1.3785426616668701, "step": 3298 }, { "epoch": 1.0156983455175068, "grad_norm": 3.0625, "learning_rate": 3.04734625122519e-06, "loss": 1.074662446975708, "step": 3300 }, { "epoch": 1.01631396691035, "grad_norm": 6.25, "learning_rate": 3.045353074148851e-06, "loss": 1.5231975317001343, "step": 3302 }, { "epoch": 1.0169295883031935, "grad_norm": 3.578125, "learning_rate": 3.0433598520041462e-06, "loss": 1.3498982191085815, "step": 3304 }, { "epoch": 1.017545209696037, "grad_norm": 8.4375, "learning_rate": 3.041366586771786e-06, "loss": 1.6604855060577393, "step": 3306 }, { "epoch": 1.0181608310888803, "grad_norm": 9.3125, "learning_rate": 3.0393732804325193e-06, "loss": 1.5140495300292969, "step": 3308 }, { "epoch": 1.0187764524817238, "grad_norm": 2.5, "learning_rate": 3.037379934967142e-06, "loss": 1.2019877433776855, "step": 3310 }, { "epoch": 1.0193920738745672, "grad_norm": 3.34375, "learning_rate": 3.0353865523564854e-06, "loss": 1.2346773147583008, "step": 3312 }, { "epoch": 1.0200076952674106, "grad_norm": 5.5625, "learning_rate": 3.0333931345814177e-06, "loss": 1.1609855890274048, "step": 3314 }, { "epoch": 1.0206233166602539, "grad_norm": 4.34375, "learning_rate": 3.031399683622844e-06, "loss": 1.006929874420166, "step": 3316 }, { "epoch": 1.0212389380530973, "grad_norm": 20.5, "learning_rate": 3.029406201461702e-06, "loss": 1.6011276245117188, "step": 3318 }, { "epoch": 1.0218545594459407, "grad_norm": 9.4375, "learning_rate": 3.0274126900789575e-06, "loss": 1.4379968643188477, "step": 3320 }, { "epoch": 1.0224701808387842, "grad_norm": 15.25, "learning_rate": 3.0254191514556084e-06, "loss": 1.412289023399353, "step": 3322 }, { "epoch": 1.0230858022316276, "grad_norm": 5.84375, "learning_rate": 3.023425587572678e-06, "loss": 1.3547412157058716, "step": 3324 }, { "epoch": 1.023701423624471, "grad_norm": 7.65625, "learning_rate": 3.0214320004112176e-06, "loss": 1.49954354763031, "step": 3326 }, { "epoch": 1.0243170450173142, "grad_norm": 27.75, "learning_rate": 3.019438391952297e-06, "loss": 1.1961394548416138, "step": 3328 }, { "epoch": 1.0249326664101577, "grad_norm": 5.1875, "learning_rate": 3.017444764177011e-06, "loss": 1.1155706644058228, "step": 3330 }, { "epoch": 1.0255482878030011, "grad_norm": 6.28125, "learning_rate": 3.0154511190664713e-06, "loss": 1.260684847831726, "step": 3332 }, { "epoch": 1.0261639091958445, "grad_norm": 2.5, "learning_rate": 3.0134574586018085e-06, "loss": 0.9367880821228027, "step": 3334 }, { "epoch": 1.026779530588688, "grad_norm": 12.375, "learning_rate": 3.011463784764168e-06, "loss": 1.618381381034851, "step": 3336 }, { "epoch": 1.0273951519815314, "grad_norm": 11.25, "learning_rate": 3.009470099534707e-06, "loss": 1.3859292268753052, "step": 3338 }, { "epoch": 1.0280107733743749, "grad_norm": 3.546875, "learning_rate": 3.0074764048945963e-06, "loss": 1.3100966215133667, "step": 3340 }, { "epoch": 1.028626394767218, "grad_norm": 14.8125, "learning_rate": 3.005482702825014e-06, "loss": 1.727941632270813, "step": 3342 }, { "epoch": 1.0292420161600615, "grad_norm": 7.34375, "learning_rate": 3.0034889953071466e-06, "loss": 1.530333161354065, "step": 3344 }, { "epoch": 1.029857637552905, "grad_norm": 10.25, "learning_rate": 3.0014952843221874e-06, "loss": 1.1521104574203491, "step": 3346 }, { "epoch": 1.0304732589457484, "grad_norm": 2.78125, "learning_rate": 2.9995015718513296e-06, "loss": 1.0660818815231323, "step": 3348 }, { "epoch": 1.0310888803385918, "grad_norm": 2.265625, "learning_rate": 2.9975078598757723e-06, "loss": 1.217354416847229, "step": 3350 }, { "epoch": 1.0317045017314352, "grad_norm": 6.0, "learning_rate": 2.9955141503767093e-06, "loss": 1.3975917100906372, "step": 3352 }, { "epoch": 1.0323201231242787, "grad_norm": 5.5625, "learning_rate": 2.9935204453353363e-06, "loss": 1.3113981485366821, "step": 3354 }, { "epoch": 1.032935744517122, "grad_norm": 4.90625, "learning_rate": 2.9915267467328414e-06, "loss": 1.3454499244689941, "step": 3356 }, { "epoch": 1.0335513659099653, "grad_norm": 13.25, "learning_rate": 2.9895330565504088e-06, "loss": 1.5505820512771606, "step": 3358 }, { "epoch": 1.0341669873028088, "grad_norm": 8.8125, "learning_rate": 2.9875393767692117e-06, "loss": 1.083367109298706, "step": 3360 }, { "epoch": 1.0347826086956522, "grad_norm": 27.0, "learning_rate": 2.9855457093704166e-06, "loss": 1.8957247734069824, "step": 3362 }, { "epoch": 1.0353982300884956, "grad_norm": 9.375, "learning_rate": 2.9835520563351737e-06, "loss": 1.5325194597244263, "step": 3364 }, { "epoch": 1.036013851481339, "grad_norm": 6.0, "learning_rate": 2.98155841964462e-06, "loss": 1.1251282691955566, "step": 3366 }, { "epoch": 1.0366294728741823, "grad_norm": 4.78125, "learning_rate": 2.9795648012798795e-06, "loss": 1.5618937015533447, "step": 3368 }, { "epoch": 1.0372450942670257, "grad_norm": 12.0625, "learning_rate": 2.9775712032220526e-06, "loss": 1.2356228828430176, "step": 3370 }, { "epoch": 1.0378607156598691, "grad_norm": 6.875, "learning_rate": 2.975577627452225e-06, "loss": 1.0679433345794678, "step": 3372 }, { "epoch": 1.0384763370527126, "grad_norm": 1.796875, "learning_rate": 2.973584075951456e-06, "loss": 1.0582077503204346, "step": 3374 }, { "epoch": 1.039091958445556, "grad_norm": 14.8125, "learning_rate": 2.9715905507007837e-06, "loss": 1.3411349058151245, "step": 3376 }, { "epoch": 1.0397075798383995, "grad_norm": 6.15625, "learning_rate": 2.969597053681217e-06, "loss": 1.0146530866622925, "step": 3378 }, { "epoch": 1.0403232012312429, "grad_norm": 6.46875, "learning_rate": 2.9676035868737397e-06, "loss": 1.305014967918396, "step": 3380 }, { "epoch": 1.040938822624086, "grad_norm": 48.75, "learning_rate": 2.965610152259304e-06, "loss": 1.4220374822616577, "step": 3382 }, { "epoch": 1.0415544440169295, "grad_norm": 1.796875, "learning_rate": 2.9636167518188308e-06, "loss": 1.1082732677459717, "step": 3384 }, { "epoch": 1.042170065409773, "grad_norm": 2.78125, "learning_rate": 2.961623387533208e-06, "loss": 0.9837604761123657, "step": 3386 }, { "epoch": 1.0427856868026164, "grad_norm": 6.875, "learning_rate": 2.959630061383285e-06, "loss": 1.675440788269043, "step": 3388 }, { "epoch": 1.0434013081954598, "grad_norm": 2.6875, "learning_rate": 2.957636775349874e-06, "loss": 1.466201663017273, "step": 3390 }, { "epoch": 1.0440169295883033, "grad_norm": 6.84375, "learning_rate": 2.9556435314137495e-06, "loss": 1.3465406894683838, "step": 3392 }, { "epoch": 1.0446325509811465, "grad_norm": 4.34375, "learning_rate": 2.953650331555642e-06, "loss": 1.222961664199829, "step": 3394 }, { "epoch": 1.04524817237399, "grad_norm": 8.8125, "learning_rate": 2.9516571777562387e-06, "loss": 1.6121528148651123, "step": 3396 }, { "epoch": 1.0458637937668334, "grad_norm": 6.8125, "learning_rate": 2.949664071996182e-06, "loss": 1.0301077365875244, "step": 3398 }, { "epoch": 1.0464794151596768, "grad_norm": 6.90625, "learning_rate": 2.947671016256066e-06, "loss": 1.1865458488464355, "step": 3400 }, { "epoch": 1.0470950365525202, "grad_norm": 10.8125, "learning_rate": 2.945678012516433e-06, "loss": 1.6517291069030762, "step": 3402 }, { "epoch": 1.0477106579453637, "grad_norm": 3.96875, "learning_rate": 2.943685062757777e-06, "loss": 1.078000783920288, "step": 3404 }, { "epoch": 1.048326279338207, "grad_norm": 8.75, "learning_rate": 2.941692168960536e-06, "loss": 1.5187592506408691, "step": 3406 }, { "epoch": 1.0489419007310503, "grad_norm": 20.375, "learning_rate": 2.9396993331050944e-06, "loss": 1.0346647500991821, "step": 3408 }, { "epoch": 1.0495575221238937, "grad_norm": 6.15625, "learning_rate": 2.937706557171777e-06, "loss": 1.3847370147705078, "step": 3410 }, { "epoch": 1.0501731435167372, "grad_norm": 3.0625, "learning_rate": 2.9357138431408493e-06, "loss": 0.9305753707885742, "step": 3412 }, { "epoch": 1.0507887649095806, "grad_norm": 32.5, "learning_rate": 2.933721192992518e-06, "loss": 0.8851282000541687, "step": 3414 }, { "epoch": 1.051404386302424, "grad_norm": 9.3125, "learning_rate": 2.9317286087069225e-06, "loss": 1.888558268547058, "step": 3416 }, { "epoch": 1.0520200076952675, "grad_norm": 4.65625, "learning_rate": 2.9297360922641393e-06, "loss": 1.3558225631713867, "step": 3418 }, { "epoch": 1.052635629088111, "grad_norm": 3.21875, "learning_rate": 2.9277436456441755e-06, "loss": 1.1919571161270142, "step": 3420 }, { "epoch": 1.0532512504809541, "grad_norm": 6.53125, "learning_rate": 2.92575127082697e-06, "loss": 1.3299031257629395, "step": 3422 }, { "epoch": 1.0538668718737976, "grad_norm": 1.484375, "learning_rate": 2.9237589697923917e-06, "loss": 1.3148071765899658, "step": 3424 }, { "epoch": 1.054482493266641, "grad_norm": 6.0, "learning_rate": 2.921766744520235e-06, "loss": 1.4010940790176392, "step": 3426 }, { "epoch": 1.0550981146594844, "grad_norm": 3.4375, "learning_rate": 2.919774596990217e-06, "loss": 1.2768487930297852, "step": 3428 }, { "epoch": 1.0557137360523279, "grad_norm": 5.125, "learning_rate": 2.917782529181981e-06, "loss": 1.5137887001037598, "step": 3430 }, { "epoch": 1.0563293574451713, "grad_norm": 9.3125, "learning_rate": 2.9157905430750884e-06, "loss": 0.9090896844863892, "step": 3432 }, { "epoch": 1.0569449788380145, "grad_norm": 7.28125, "learning_rate": 2.9137986406490205e-06, "loss": 1.3498903512954712, "step": 3434 }, { "epoch": 1.057560600230858, "grad_norm": 7.90625, "learning_rate": 2.9118068238831755e-06, "loss": 0.933256983757019, "step": 3436 }, { "epoch": 1.0581762216237014, "grad_norm": 5.3125, "learning_rate": 2.909815094756867e-06, "loss": 1.6633082628250122, "step": 3438 }, { "epoch": 1.0587918430165448, "grad_norm": 4.625, "learning_rate": 2.907823455249321e-06, "loss": 1.3229087591171265, "step": 3440 }, { "epoch": 1.0594074644093883, "grad_norm": 2.515625, "learning_rate": 2.9058319073396725e-06, "loss": 1.1567513942718506, "step": 3442 }, { "epoch": 1.0600230858022317, "grad_norm": 7.8125, "learning_rate": 2.9038404530069687e-06, "loss": 1.1566039323806763, "step": 3444 }, { "epoch": 1.060638707195075, "grad_norm": 7.875, "learning_rate": 2.9018490942301625e-06, "loss": 1.2508800029754639, "step": 3446 }, { "epoch": 1.0612543285879183, "grad_norm": 10.75, "learning_rate": 2.899857832988112e-06, "loss": 1.4512087106704712, "step": 3448 }, { "epoch": 1.0618699499807618, "grad_norm": 6.03125, "learning_rate": 2.897866671259577e-06, "loss": 1.2040528059005737, "step": 3450 }, { "epoch": 1.0624855713736052, "grad_norm": 6.125, "learning_rate": 2.8958756110232212e-06, "loss": 1.1625609397888184, "step": 3452 }, { "epoch": 1.0631011927664487, "grad_norm": 10.875, "learning_rate": 2.893884654257604e-06, "loss": 1.39517343044281, "step": 3454 }, { "epoch": 1.063716814159292, "grad_norm": 12.8125, "learning_rate": 2.8918938029411836e-06, "loss": 1.4427934885025024, "step": 3456 }, { "epoch": 1.0643324355521355, "grad_norm": 4.1875, "learning_rate": 2.889903059052315e-06, "loss": 1.4380152225494385, "step": 3458 }, { "epoch": 1.0649480569449787, "grad_norm": 11.875, "learning_rate": 2.8879124245692456e-06, "loss": 1.8976993560791016, "step": 3460 }, { "epoch": 1.0655636783378222, "grad_norm": 6.96875, "learning_rate": 2.8859219014701112e-06, "loss": 1.3139724731445312, "step": 3462 }, { "epoch": 1.0661792997306656, "grad_norm": 7.65625, "learning_rate": 2.883931491732942e-06, "loss": 1.3236113786697388, "step": 3464 }, { "epoch": 1.066794921123509, "grad_norm": 11.375, "learning_rate": 2.8819411973356497e-06, "loss": 1.1875003576278687, "step": 3466 }, { "epoch": 1.0674105425163525, "grad_norm": 9.5625, "learning_rate": 2.8799510202560366e-06, "loss": 1.552634358406067, "step": 3468 }, { "epoch": 1.068026163909196, "grad_norm": 6.34375, "learning_rate": 2.8779609624717854e-06, "loss": 1.2890095710754395, "step": 3470 }, { "epoch": 1.0686417853020393, "grad_norm": 6.4375, "learning_rate": 2.8759710259604616e-06, "loss": 1.207531452178955, "step": 3472 }, { "epoch": 1.0692574066948826, "grad_norm": 5.6875, "learning_rate": 2.8739812126995094e-06, "loss": 1.124848484992981, "step": 3474 }, { "epoch": 1.069873028087726, "grad_norm": 1.1953125, "learning_rate": 2.871991524666251e-06, "loss": 1.2248820066452026, "step": 3476 }, { "epoch": 1.0704886494805694, "grad_norm": 6.125, "learning_rate": 2.8700019638378846e-06, "loss": 1.2643013000488281, "step": 3478 }, { "epoch": 1.0711042708734129, "grad_norm": 4.21875, "learning_rate": 2.86801253219148e-06, "loss": 1.5682262182235718, "step": 3480 }, { "epoch": 1.0717198922662563, "grad_norm": 5.625, "learning_rate": 2.8660232317039804e-06, "loss": 1.1708942651748657, "step": 3482 }, { "epoch": 1.0723355136590997, "grad_norm": 6.8125, "learning_rate": 2.8640340643521996e-06, "loss": 1.252043604850769, "step": 3484 }, { "epoch": 1.0729511350519432, "grad_norm": 5.65625, "learning_rate": 2.862045032112817e-06, "loss": 1.1808435916900635, "step": 3486 }, { "epoch": 1.0735667564447864, "grad_norm": 8.0625, "learning_rate": 2.860056136962377e-06, "loss": 1.2189979553222656, "step": 3488 }, { "epoch": 1.0741823778376298, "grad_norm": 7.8125, "learning_rate": 2.858067380877292e-06, "loss": 1.2846750020980835, "step": 3490 }, { "epoch": 1.0747979992304733, "grad_norm": 14.4375, "learning_rate": 2.856078765833832e-06, "loss": 1.0122082233428955, "step": 3492 }, { "epoch": 1.0754136206233167, "grad_norm": 10.5, "learning_rate": 2.8540902938081285e-06, "loss": 1.2930790185928345, "step": 3494 }, { "epoch": 1.0760292420161601, "grad_norm": 5.59375, "learning_rate": 2.8521019667761697e-06, "loss": 1.3262078762054443, "step": 3496 }, { "epoch": 1.0766448634090036, "grad_norm": 5.1875, "learning_rate": 2.8501137867138017e-06, "loss": 1.196105718612671, "step": 3498 }, { "epoch": 1.0772604848018468, "grad_norm": 8.1875, "learning_rate": 2.8481257555967244e-06, "loss": 1.1585220098495483, "step": 3500 }, { "epoch": 1.0778761061946902, "grad_norm": 4.65625, "learning_rate": 2.8461378754004886e-06, "loss": 1.36484956741333, "step": 3502 }, { "epoch": 1.0784917275875336, "grad_norm": 2.59375, "learning_rate": 2.844150148100495e-06, "loss": 1.497213363647461, "step": 3504 }, { "epoch": 1.079107348980377, "grad_norm": 5.09375, "learning_rate": 2.8421625756719923e-06, "loss": 1.8112924098968506, "step": 3506 }, { "epoch": 1.0797229703732205, "grad_norm": 9.8125, "learning_rate": 2.840175160090076e-06, "loss": 1.5275667905807495, "step": 3508 }, { "epoch": 1.080338591766064, "grad_norm": 12.3125, "learning_rate": 2.8381879033296856e-06, "loss": 1.0221048593521118, "step": 3510 }, { "epoch": 1.0809542131589072, "grad_norm": 6.3125, "learning_rate": 2.8362008073656033e-06, "loss": 1.7415841817855835, "step": 3512 }, { "epoch": 1.0815698345517506, "grad_norm": 1.703125, "learning_rate": 2.83421387417245e-06, "loss": 1.1348897218704224, "step": 3514 }, { "epoch": 1.082185455944594, "grad_norm": 2.59375, "learning_rate": 2.8322271057246864e-06, "loss": 1.2618352174758911, "step": 3516 }, { "epoch": 1.0828010773374375, "grad_norm": 8.3125, "learning_rate": 2.830240503996609e-06, "loss": 1.410486102104187, "step": 3518 }, { "epoch": 1.083416698730281, "grad_norm": 11.25, "learning_rate": 2.8282540709623473e-06, "loss": 1.1378185749053955, "step": 3520 }, { "epoch": 1.0840323201231243, "grad_norm": 9.375, "learning_rate": 2.8262678085958657e-06, "loss": 1.0309453010559082, "step": 3522 }, { "epoch": 1.0846479415159678, "grad_norm": 14.6875, "learning_rate": 2.8242817188709563e-06, "loss": 1.4624661207199097, "step": 3524 }, { "epoch": 1.085263562908811, "grad_norm": 9.6875, "learning_rate": 2.8222958037612423e-06, "loss": 1.2220213413238525, "step": 3526 }, { "epoch": 1.0858791843016544, "grad_norm": 5.375, "learning_rate": 2.8203100652401714e-06, "loss": 1.385746955871582, "step": 3528 }, { "epoch": 1.0864948056944979, "grad_norm": 18.625, "learning_rate": 2.818324505281017e-06, "loss": 1.4827823638916016, "step": 3530 }, { "epoch": 1.0871104270873413, "grad_norm": 1.984375, "learning_rate": 2.8163391258568738e-06, "loss": 1.048915982246399, "step": 3532 }, { "epoch": 1.0877260484801847, "grad_norm": 9.6875, "learning_rate": 2.8143539289406583e-06, "loss": 1.7094111442565918, "step": 3534 }, { "epoch": 1.0883416698730282, "grad_norm": 6.46875, "learning_rate": 2.8123689165051042e-06, "loss": 1.5769238471984863, "step": 3536 }, { "epoch": 1.0889572912658716, "grad_norm": 5.96875, "learning_rate": 2.810384090522765e-06, "loss": 0.83470618724823, "step": 3538 }, { "epoch": 1.0895729126587148, "grad_norm": 6.625, "learning_rate": 2.808399452966004e-06, "loss": 1.711340308189392, "step": 3540 }, { "epoch": 1.0901885340515582, "grad_norm": 2.671875, "learning_rate": 2.8064150058070026e-06, "loss": 0.7356083393096924, "step": 3542 }, { "epoch": 1.0908041554444017, "grad_norm": 38.5, "learning_rate": 2.804430751017749e-06, "loss": 1.528712272644043, "step": 3544 }, { "epoch": 1.0914197768372451, "grad_norm": 5.03125, "learning_rate": 2.802446690570042e-06, "loss": 1.3249646425247192, "step": 3546 }, { "epoch": 1.0920353982300885, "grad_norm": 6.34375, "learning_rate": 2.8004628264354873e-06, "loss": 0.8770416378974915, "step": 3548 }, { "epoch": 1.092651019622932, "grad_norm": 7.53125, "learning_rate": 2.7984791605854935e-06, "loss": 1.2947643995285034, "step": 3550 }, { "epoch": 1.0932666410157752, "grad_norm": 8.5, "learning_rate": 2.796495694991276e-06, "loss": 1.35930597782135, "step": 3552 }, { "epoch": 1.0938822624086186, "grad_norm": 6.78125, "learning_rate": 2.794512431623847e-06, "loss": 1.733473300933838, "step": 3554 }, { "epoch": 1.094497883801462, "grad_norm": 10.125, "learning_rate": 2.7925293724540226e-06, "loss": 1.0471633672714233, "step": 3556 }, { "epoch": 1.0951135051943055, "grad_norm": 13.4375, "learning_rate": 2.7905465194524106e-06, "loss": 1.462620735168457, "step": 3558 }, { "epoch": 1.095729126587149, "grad_norm": 10.0, "learning_rate": 2.788563874589417e-06, "loss": 2.0687508583068848, "step": 3560 }, { "epoch": 1.0963447479799924, "grad_norm": 6.375, "learning_rate": 2.7865814398352412e-06, "loss": 1.5122491121292114, "step": 3562 }, { "epoch": 1.0969603693728358, "grad_norm": 5.53125, "learning_rate": 2.7845992171598724e-06, "loss": 0.8604429960250854, "step": 3564 }, { "epoch": 1.097575990765679, "grad_norm": 19.625, "learning_rate": 2.7826172085330895e-06, "loss": 1.1481788158416748, "step": 3566 }, { "epoch": 1.0981916121585225, "grad_norm": 6.96875, "learning_rate": 2.780635415924461e-06, "loss": 0.8968583941459656, "step": 3568 }, { "epoch": 1.098807233551366, "grad_norm": 4.9375, "learning_rate": 2.7786538413033347e-06, "loss": 1.1648143529891968, "step": 3570 }, { "epoch": 1.0994228549442093, "grad_norm": 8.25, "learning_rate": 2.7766724866388496e-06, "loss": 1.365343689918518, "step": 3572 }, { "epoch": 1.1000384763370528, "grad_norm": 1.8203125, "learning_rate": 2.7746913538999197e-06, "loss": 0.9039756655693054, "step": 3574 }, { "epoch": 1.1006540977298962, "grad_norm": 5.3125, "learning_rate": 2.772710445055242e-06, "loss": 1.5619524717330933, "step": 3576 }, { "epoch": 1.1012697191227394, "grad_norm": 2.390625, "learning_rate": 2.7707297620732897e-06, "loss": 1.3508001565933228, "step": 3578 }, { "epoch": 1.1018853405155828, "grad_norm": 3.984375, "learning_rate": 2.7687493069223128e-06, "loss": 1.1135735511779785, "step": 3580 }, { "epoch": 1.1025009619084263, "grad_norm": 4.84375, "learning_rate": 2.766769081570333e-06, "loss": 1.3249479532241821, "step": 3582 }, { "epoch": 1.1031165833012697, "grad_norm": 6.21875, "learning_rate": 2.7647890879851447e-06, "loss": 1.39180326461792, "step": 3584 }, { "epoch": 1.1037322046941132, "grad_norm": 3.46875, "learning_rate": 2.7628093281343127e-06, "loss": 0.8148363828659058, "step": 3586 }, { "epoch": 1.1043478260869566, "grad_norm": 4.3125, "learning_rate": 2.760829803985167e-06, "loss": 1.3193633556365967, "step": 3588 }, { "epoch": 1.1049634474798, "grad_norm": 5.03125, "learning_rate": 2.7588505175048074e-06, "loss": 1.305018424987793, "step": 3590 }, { "epoch": 1.1055790688726432, "grad_norm": 4.90625, "learning_rate": 2.7568714706600935e-06, "loss": 1.3810641765594482, "step": 3592 }, { "epoch": 1.1061946902654867, "grad_norm": 5.71875, "learning_rate": 2.75489266541765e-06, "loss": 1.2623786926269531, "step": 3594 }, { "epoch": 1.10681031165833, "grad_norm": 16.5, "learning_rate": 2.7529141037438584e-06, "loss": 1.2916265726089478, "step": 3596 }, { "epoch": 1.1074259330511735, "grad_norm": 3.0, "learning_rate": 2.7509357876048604e-06, "loss": 1.1904457807540894, "step": 3598 }, { "epoch": 1.108041554444017, "grad_norm": 3.53125, "learning_rate": 2.7489577189665535e-06, "loss": 0.8278931975364685, "step": 3600 }, { "epoch": 1.1086571758368604, "grad_norm": 5.25, "learning_rate": 2.7469798997945886e-06, "loss": 1.2530715465545654, "step": 3602 }, { "epoch": 1.1092727972297038, "grad_norm": 4.96875, "learning_rate": 2.7450023320543685e-06, "loss": 1.4102144241333008, "step": 3604 }, { "epoch": 1.109888418622547, "grad_norm": 7.15625, "learning_rate": 2.743025017711047e-06, "loss": 1.2377433776855469, "step": 3606 }, { "epoch": 1.1105040400153905, "grad_norm": 7.03125, "learning_rate": 2.7410479587295272e-06, "loss": 1.302953839302063, "step": 3608 }, { "epoch": 1.111119661408234, "grad_norm": 10.875, "learning_rate": 2.7390711570744542e-06, "loss": 1.2824311256408691, "step": 3610 }, { "epoch": 1.1117352828010774, "grad_norm": 17.0, "learning_rate": 2.7370946147102216e-06, "loss": 1.6581602096557617, "step": 3612 }, { "epoch": 1.1123509041939208, "grad_norm": 3.78125, "learning_rate": 2.7351183336009633e-06, "loss": 1.1298326253890991, "step": 3614 }, { "epoch": 1.1129665255867642, "grad_norm": 5.71875, "learning_rate": 2.733142315710555e-06, "loss": 0.7118703126907349, "step": 3616 }, { "epoch": 1.1135821469796074, "grad_norm": 5.5625, "learning_rate": 2.7311665630026086e-06, "loss": 1.3887100219726562, "step": 3618 }, { "epoch": 1.1141977683724509, "grad_norm": 1.78125, "learning_rate": 2.7291910774404764e-06, "loss": 1.3764194250106812, "step": 3620 }, { "epoch": 1.1148133897652943, "grad_norm": 17.25, "learning_rate": 2.727215860987239e-06, "loss": 1.3574413061141968, "step": 3622 }, { "epoch": 1.1154290111581378, "grad_norm": 7.75, "learning_rate": 2.725240915605716e-06, "loss": 1.3049046993255615, "step": 3624 }, { "epoch": 1.1160446325509812, "grad_norm": 5.0625, "learning_rate": 2.7232662432584546e-06, "loss": 1.1837899684906006, "step": 3626 }, { "epoch": 1.1166602539438246, "grad_norm": 5.75, "learning_rate": 2.7212918459077293e-06, "loss": 1.251051425933838, "step": 3628 }, { "epoch": 1.1172758753366678, "grad_norm": 5.6875, "learning_rate": 2.7193177255155447e-06, "loss": 1.1709699630737305, "step": 3630 }, { "epoch": 1.1178914967295113, "grad_norm": 6.1875, "learning_rate": 2.717343884043628e-06, "loss": 1.2499865293502808, "step": 3632 }, { "epoch": 1.1185071181223547, "grad_norm": 3.640625, "learning_rate": 2.71537032345343e-06, "loss": 1.274937629699707, "step": 3634 }, { "epoch": 1.1191227395151981, "grad_norm": 10.5, "learning_rate": 2.713397045706122e-06, "loss": 1.678328514099121, "step": 3636 }, { "epoch": 1.1197383609080416, "grad_norm": 12.8125, "learning_rate": 2.7114240527625935e-06, "loss": 0.8977378010749817, "step": 3638 }, { "epoch": 1.120353982300885, "grad_norm": 13.5, "learning_rate": 2.7094513465834528e-06, "loss": 1.4107129573822021, "step": 3640 }, { "epoch": 1.1209696036937284, "grad_norm": 6.03125, "learning_rate": 2.7074789291290214e-06, "loss": 1.2703056335449219, "step": 3642 }, { "epoch": 1.1215852250865717, "grad_norm": 6.0, "learning_rate": 2.7055068023593356e-06, "loss": 1.2788171768188477, "step": 3644 }, { "epoch": 1.122200846479415, "grad_norm": 2.484375, "learning_rate": 2.703534968234142e-06, "loss": 1.0924688577651978, "step": 3646 }, { "epoch": 1.1228164678722585, "grad_norm": 4.09375, "learning_rate": 2.7015634287128955e-06, "loss": 1.3006528615951538, "step": 3648 }, { "epoch": 1.123432089265102, "grad_norm": 11.5625, "learning_rate": 2.6995921857547604e-06, "loss": 1.451360821723938, "step": 3650 }, { "epoch": 1.1240477106579454, "grad_norm": 7.71875, "learning_rate": 2.697621241318603e-06, "loss": 0.8371405005455017, "step": 3652 }, { "epoch": 1.1246633320507888, "grad_norm": 3.28125, "learning_rate": 2.6956505973629965e-06, "loss": 1.2515637874603271, "step": 3654 }, { "epoch": 1.1252789534436323, "grad_norm": 7.53125, "learning_rate": 2.6936802558462136e-06, "loss": 1.2885754108428955, "step": 3656 }, { "epoch": 1.1258945748364755, "grad_norm": 6.375, "learning_rate": 2.6917102187262266e-06, "loss": 1.4919085502624512, "step": 3658 }, { "epoch": 1.126510196229319, "grad_norm": 4.03125, "learning_rate": 2.689740487960707e-06, "loss": 1.0740493535995483, "step": 3660 }, { "epoch": 1.1271258176221624, "grad_norm": 11.375, "learning_rate": 2.687771065507019e-06, "loss": 1.2783704996109009, "step": 3662 }, { "epoch": 1.1277414390150058, "grad_norm": 7.34375, "learning_rate": 2.6858019533222215e-06, "loss": 1.3667179346084595, "step": 3664 }, { "epoch": 1.1283570604078492, "grad_norm": 11.1875, "learning_rate": 2.6838331533630658e-06, "loss": 1.2072407007217407, "step": 3666 }, { "epoch": 1.1289726818006927, "grad_norm": 10.25, "learning_rate": 2.6818646675859926e-06, "loss": 0.8582851886749268, "step": 3668 }, { "epoch": 1.129588303193536, "grad_norm": 14.8125, "learning_rate": 2.679896497947131e-06, "loss": 1.3421087265014648, "step": 3670 }, { "epoch": 1.1302039245863793, "grad_norm": 8.1875, "learning_rate": 2.677928646402296e-06, "loss": 1.018212080001831, "step": 3672 }, { "epoch": 1.1308195459792227, "grad_norm": 4.53125, "learning_rate": 2.6759611149069826e-06, "loss": 1.347084641456604, "step": 3674 }, { "epoch": 1.1314351673720662, "grad_norm": 8.3125, "learning_rate": 2.6739939054163734e-06, "loss": 1.146074891090393, "step": 3676 }, { "epoch": 1.1320507887649096, "grad_norm": 6.65625, "learning_rate": 2.672027019885328e-06, "loss": 1.2172248363494873, "step": 3678 }, { "epoch": 1.132666410157753, "grad_norm": 6.0, "learning_rate": 2.6700604602683856e-06, "loss": 1.3072665929794312, "step": 3680 }, { "epoch": 1.1332820315505965, "grad_norm": 9.375, "learning_rate": 2.6680942285197586e-06, "loss": 1.6006169319152832, "step": 3682 }, { "epoch": 1.1338976529434397, "grad_norm": 18.125, "learning_rate": 2.666128326593337e-06, "loss": 1.7242209911346436, "step": 3684 }, { "epoch": 1.1345132743362831, "grad_norm": 8.375, "learning_rate": 2.664162756442682e-06, "loss": 1.8018231391906738, "step": 3686 }, { "epoch": 1.1351288957291266, "grad_norm": 5.46875, "learning_rate": 2.6621975200210238e-06, "loss": 0.7340916395187378, "step": 3688 }, { "epoch": 1.13574451712197, "grad_norm": 7.0, "learning_rate": 2.660232619281261e-06, "loss": 1.3796991109848022, "step": 3690 }, { "epoch": 1.1363601385148134, "grad_norm": 19.0, "learning_rate": 2.6582680561759615e-06, "loss": 1.5105079412460327, "step": 3692 }, { "epoch": 1.1369757599076569, "grad_norm": 15.9375, "learning_rate": 2.656303832657354e-06, "loss": 1.0867021083831787, "step": 3694 }, { "epoch": 1.1375913813005, "grad_norm": 14.875, "learning_rate": 2.6543399506773333e-06, "loss": 1.6368738412857056, "step": 3696 }, { "epoch": 1.1382070026933435, "grad_norm": 8.5625, "learning_rate": 2.652376412187452e-06, "loss": 1.5982671976089478, "step": 3698 }, { "epoch": 1.138822624086187, "grad_norm": 12.3125, "learning_rate": 2.650413219138921e-06, "loss": 1.488477110862732, "step": 3700 }, { "epoch": 1.1394382454790304, "grad_norm": 4.375, "learning_rate": 2.648450373482612e-06, "loss": 1.094407320022583, "step": 3702 }, { "epoch": 1.1400538668718738, "grad_norm": 18.625, "learning_rate": 2.646487877169045e-06, "loss": 1.3227121829986572, "step": 3704 }, { "epoch": 1.1406694882647173, "grad_norm": 8.4375, "learning_rate": 2.6445257321483998e-06, "loss": 1.2866860628128052, "step": 3706 }, { "epoch": 1.1412851096575607, "grad_norm": 9.1875, "learning_rate": 2.6425639403705028e-06, "loss": 1.722301721572876, "step": 3708 }, { "epoch": 1.141900731050404, "grad_norm": 7.09375, "learning_rate": 2.6406025037848316e-06, "loss": 1.615889310836792, "step": 3710 }, { "epoch": 1.1425163524432473, "grad_norm": 10.5, "learning_rate": 2.6386414243405068e-06, "loss": 1.5408185720443726, "step": 3712 }, { "epoch": 1.1431319738360908, "grad_norm": 18.25, "learning_rate": 2.6366807039863e-06, "loss": 1.4188480377197266, "step": 3714 }, { "epoch": 1.1437475952289342, "grad_norm": 6.15625, "learning_rate": 2.6347203446706214e-06, "loss": 1.2408636808395386, "step": 3716 }, { "epoch": 1.1443632166217776, "grad_norm": 13.1875, "learning_rate": 2.632760348341524e-06, "loss": 1.8462144136428833, "step": 3718 }, { "epoch": 1.144978838014621, "grad_norm": 5.40625, "learning_rate": 2.6308007169467003e-06, "loss": 1.3214315176010132, "step": 3720 }, { "epoch": 1.1455944594074645, "grad_norm": 4.71875, "learning_rate": 2.6288414524334803e-06, "loss": 1.3338682651519775, "step": 3722 }, { "epoch": 1.1462100808003077, "grad_norm": 28.5, "learning_rate": 2.6268825567488297e-06, "loss": 1.6675822734832764, "step": 3724 }, { "epoch": 1.1468257021931512, "grad_norm": 6.75, "learning_rate": 2.6249240318393454e-06, "loss": 1.1487302780151367, "step": 3726 }, { "epoch": 1.1474413235859946, "grad_norm": 3.625, "learning_rate": 2.622965879651258e-06, "loss": 1.1260499954223633, "step": 3728 }, { "epoch": 1.148056944978838, "grad_norm": 3.5625, "learning_rate": 2.6210081021304278e-06, "loss": 1.261810302734375, "step": 3730 }, { "epoch": 1.1486725663716815, "grad_norm": 9.5, "learning_rate": 2.619050701222342e-06, "loss": 1.1598726511001587, "step": 3732 }, { "epoch": 1.149288187764525, "grad_norm": 5.3125, "learning_rate": 2.617093678872114e-06, "loss": 1.3789466619491577, "step": 3734 }, { "epoch": 1.1499038091573683, "grad_norm": 5.625, "learning_rate": 2.6151370370244807e-06, "loss": 1.467581033706665, "step": 3736 }, { "epoch": 1.1505194305502116, "grad_norm": 3.125, "learning_rate": 2.6131807776238007e-06, "loss": 1.3655937910079956, "step": 3738 }, { "epoch": 1.151135051943055, "grad_norm": 4.65625, "learning_rate": 2.6112249026140515e-06, "loss": 1.4911147356033325, "step": 3740 }, { "epoch": 1.1517506733358984, "grad_norm": 4.46875, "learning_rate": 2.609269413938832e-06, "loss": 0.998041033744812, "step": 3742 }, { "epoch": 1.1523662947287419, "grad_norm": 6.875, "learning_rate": 2.6073143135413546e-06, "loss": 0.9422230124473572, "step": 3744 }, { "epoch": 1.1529819161215853, "grad_norm": 7.75, "learning_rate": 2.6053596033644463e-06, "loss": 1.4808106422424316, "step": 3746 }, { "epoch": 1.1535975375144285, "grad_norm": 3.625, "learning_rate": 2.603405285350546e-06, "loss": 1.4457093477249146, "step": 3748 }, { "epoch": 1.154213158907272, "grad_norm": 4.96875, "learning_rate": 2.601451361441705e-06, "loss": 1.3998022079467773, "step": 3750 }, { "epoch": 1.1548287803001154, "grad_norm": 42.25, "learning_rate": 2.5994978335795784e-06, "loss": 1.4046134948730469, "step": 3752 }, { "epoch": 1.1554444016929588, "grad_norm": 6.78125, "learning_rate": 2.5975447037054325e-06, "loss": 1.610612392425537, "step": 3754 }, { "epoch": 1.1560600230858022, "grad_norm": 3.671875, "learning_rate": 2.595591973760135e-06, "loss": 1.1340110301971436, "step": 3756 }, { "epoch": 1.1566756444786457, "grad_norm": 16.375, "learning_rate": 2.5936396456841597e-06, "loss": 1.4932774305343628, "step": 3758 }, { "epoch": 1.1572912658714891, "grad_norm": 10.3125, "learning_rate": 2.5916877214175774e-06, "loss": 1.3877044916152954, "step": 3760 }, { "epoch": 1.1579068872643323, "grad_norm": 5.59375, "learning_rate": 2.5897362029000583e-06, "loss": 1.7278505563735962, "step": 3762 }, { "epoch": 1.1585225086571758, "grad_norm": 6.28125, "learning_rate": 2.5877850920708714e-06, "loss": 1.4104070663452148, "step": 3764 }, { "epoch": 1.1591381300500192, "grad_norm": 50.5, "learning_rate": 2.585834390868878e-06, "loss": 1.2621724605560303, "step": 3766 }, { "epoch": 1.1597537514428626, "grad_norm": 7.5, "learning_rate": 2.5838841012325344e-06, "loss": 1.3192962408065796, "step": 3768 }, { "epoch": 1.160369372835706, "grad_norm": 7.09375, "learning_rate": 2.581934225099887e-06, "loss": 1.565580129623413, "step": 3770 }, { "epoch": 1.1609849942285495, "grad_norm": 5.96875, "learning_rate": 2.579984764408572e-06, "loss": 1.4773286581039429, "step": 3772 }, { "epoch": 1.161600615621393, "grad_norm": 18.625, "learning_rate": 2.578035721095811e-06, "loss": 1.5876959562301636, "step": 3774 }, { "epoch": 1.1622162370142362, "grad_norm": 9.875, "learning_rate": 2.5760870970984132e-06, "loss": 1.556991696357727, "step": 3776 }, { "epoch": 1.1628318584070796, "grad_norm": 4.78125, "learning_rate": 2.5741388943527684e-06, "loss": 1.2104475498199463, "step": 3778 }, { "epoch": 1.163447479799923, "grad_norm": 12.5, "learning_rate": 2.572191114794851e-06, "loss": 1.2030988931655884, "step": 3780 }, { "epoch": 1.1640631011927665, "grad_norm": 8.4375, "learning_rate": 2.5702437603602125e-06, "loss": 1.2617757320404053, "step": 3782 }, { "epoch": 1.16467872258561, "grad_norm": 36.0, "learning_rate": 2.568296832983982e-06, "loss": 1.4930431842803955, "step": 3784 }, { "epoch": 1.1652943439784533, "grad_norm": 8.625, "learning_rate": 2.5663503346008663e-06, "loss": 1.2744383811950684, "step": 3786 }, { "epoch": 1.1659099653712968, "grad_norm": 3.1875, "learning_rate": 2.564404267145144e-06, "loss": 1.3536694049835205, "step": 3788 }, { "epoch": 1.16652558676414, "grad_norm": 11.75, "learning_rate": 2.562458632550665e-06, "loss": 1.6043957471847534, "step": 3790 }, { "epoch": 1.1671412081569834, "grad_norm": 6.15625, "learning_rate": 2.5605134327508506e-06, "loss": 1.3562557697296143, "step": 3792 }, { "epoch": 1.1677568295498268, "grad_norm": 11.125, "learning_rate": 2.5585686696786903e-06, "loss": 0.7872848510742188, "step": 3794 }, { "epoch": 1.1683724509426703, "grad_norm": 7.21875, "learning_rate": 2.5566243452667374e-06, "loss": 1.2110083103179932, "step": 3796 }, { "epoch": 1.1689880723355137, "grad_norm": 5.15625, "learning_rate": 2.554680461447111e-06, "loss": 1.2281920909881592, "step": 3798 }, { "epoch": 1.1696036937283572, "grad_norm": 13.375, "learning_rate": 2.5527370201514924e-06, "loss": 1.279605507850647, "step": 3800 }, { "epoch": 1.1702193151212004, "grad_norm": 8.1875, "learning_rate": 2.550794023311124e-06, "loss": 1.4011292457580566, "step": 3802 }, { "epoch": 1.1708349365140438, "grad_norm": 9.4375, "learning_rate": 2.5488514728568026e-06, "loss": 1.5559638738632202, "step": 3804 }, { "epoch": 1.1714505579068872, "grad_norm": 11.625, "learning_rate": 2.5469093707188854e-06, "loss": 1.8137891292572021, "step": 3806 }, { "epoch": 1.1720661792997307, "grad_norm": 7.96875, "learning_rate": 2.5449677188272825e-06, "loss": 1.4091744422912598, "step": 3808 }, { "epoch": 1.172681800692574, "grad_norm": 12.5625, "learning_rate": 2.5430265191114587e-06, "loss": 0.9213383197784424, "step": 3810 }, { "epoch": 1.1732974220854175, "grad_norm": 6.625, "learning_rate": 2.541085773500426e-06, "loss": 1.2011784315109253, "step": 3812 }, { "epoch": 1.1739130434782608, "grad_norm": 6.71875, "learning_rate": 2.539145483922747e-06, "loss": 0.9825503826141357, "step": 3814 }, { "epoch": 1.1745286648711042, "grad_norm": 27.375, "learning_rate": 2.5372056523065304e-06, "loss": 1.07127845287323, "step": 3816 }, { "epoch": 1.1751442862639476, "grad_norm": 13.9375, "learning_rate": 2.5352662805794313e-06, "loss": 1.530601143836975, "step": 3818 }, { "epoch": 1.175759907656791, "grad_norm": 5.125, "learning_rate": 2.533327370668647e-06, "loss": 1.3024001121520996, "step": 3820 }, { "epoch": 1.1763755290496345, "grad_norm": 16.5, "learning_rate": 2.531388924500915e-06, "loss": 1.3389467000961304, "step": 3822 }, { "epoch": 1.176991150442478, "grad_norm": 10.5, "learning_rate": 2.5294509440025127e-06, "loss": 1.2247848510742188, "step": 3824 }, { "epoch": 1.1776067718353214, "grad_norm": 6.25, "learning_rate": 2.5275134310992554e-06, "loss": 1.303011417388916, "step": 3826 }, { "epoch": 1.1782223932281646, "grad_norm": 5.5, "learning_rate": 2.5255763877164933e-06, "loss": 0.8400078415870667, "step": 3828 }, { "epoch": 1.178838014621008, "grad_norm": 11.9375, "learning_rate": 2.5236398157791085e-06, "loss": 1.3081486225128174, "step": 3830 }, { "epoch": 1.1794536360138514, "grad_norm": 10.3125, "learning_rate": 2.521703717211518e-06, "loss": 1.2455041408538818, "step": 3832 }, { "epoch": 1.1800692574066949, "grad_norm": 16.0, "learning_rate": 2.519768093937664e-06, "loss": 1.7230615615844727, "step": 3834 }, { "epoch": 1.1806848787995383, "grad_norm": 7.59375, "learning_rate": 2.5178329478810198e-06, "loss": 1.3537473678588867, "step": 3836 }, { "epoch": 1.1813005001923818, "grad_norm": 4.28125, "learning_rate": 2.5158982809645838e-06, "loss": 1.4008030891418457, "step": 3838 }, { "epoch": 1.1819161215852252, "grad_norm": 7.75, "learning_rate": 2.5139640951108777e-06, "loss": 1.5628221035003662, "step": 3840 }, { "epoch": 1.1825317429780684, "grad_norm": 6.6875, "learning_rate": 2.512030392241945e-06, "loss": 1.3440524339675903, "step": 3842 }, { "epoch": 1.1831473643709118, "grad_norm": 5.78125, "learning_rate": 2.5100971742793502e-06, "loss": 1.3246959447860718, "step": 3844 }, { "epoch": 1.1837629857637553, "grad_norm": 4.09375, "learning_rate": 2.508164443144174e-06, "loss": 1.1068414449691772, "step": 3846 }, { "epoch": 1.1843786071565987, "grad_norm": 3.90625, "learning_rate": 2.506232200757016e-06, "loss": 0.8754851818084717, "step": 3848 }, { "epoch": 1.1849942285494421, "grad_norm": 4.9375, "learning_rate": 2.5043004490379887e-06, "loss": 0.9590293169021606, "step": 3850 }, { "epoch": 1.1856098499422856, "grad_norm": 14.5, "learning_rate": 2.502369189906716e-06, "loss": 1.3972516059875488, "step": 3852 }, { "epoch": 1.186225471335129, "grad_norm": 8.75, "learning_rate": 2.5004384252823353e-06, "loss": 1.3978736400604248, "step": 3854 }, { "epoch": 1.1868410927279722, "grad_norm": 7.5, "learning_rate": 2.498508157083489e-06, "loss": 1.71484375, "step": 3856 }, { "epoch": 1.1874567141208157, "grad_norm": 6.875, "learning_rate": 2.4965783872283275e-06, "loss": 1.3648632764816284, "step": 3858 }, { "epoch": 1.188072335513659, "grad_norm": 10.625, "learning_rate": 2.4946491176345077e-06, "loss": 1.0106247663497925, "step": 3860 }, { "epoch": 1.1886879569065025, "grad_norm": 5.0, "learning_rate": 2.4927203502191873e-06, "loss": 0.8573707938194275, "step": 3862 }, { "epoch": 1.189303578299346, "grad_norm": 72.5, "learning_rate": 2.4907920868990266e-06, "loss": 1.3709218502044678, "step": 3864 }, { "epoch": 1.1899191996921894, "grad_norm": 13.5, "learning_rate": 2.4888643295901834e-06, "loss": 1.093931794166565, "step": 3866 }, { "epoch": 1.1905348210850326, "grad_norm": 4.75, "learning_rate": 2.4869370802083135e-06, "loss": 0.9337255358695984, "step": 3868 }, { "epoch": 1.191150442477876, "grad_norm": 6.21875, "learning_rate": 2.485010340668567e-06, "loss": 1.3781424760818481, "step": 3870 }, { "epoch": 1.1917660638707195, "grad_norm": 6.5, "learning_rate": 2.4830841128855894e-06, "loss": 1.520790934562683, "step": 3872 }, { "epoch": 1.192381685263563, "grad_norm": 6.25, "learning_rate": 2.4811583987735157e-06, "loss": 1.0379527807235718, "step": 3874 }, { "epoch": 1.1929973066564064, "grad_norm": 9.875, "learning_rate": 2.4792332002459717e-06, "loss": 1.5793462991714478, "step": 3876 }, { "epoch": 1.1936129280492498, "grad_norm": 14.6875, "learning_rate": 2.4773085192160697e-06, "loss": 1.8597513437271118, "step": 3878 }, { "epoch": 1.194228549442093, "grad_norm": 7.15625, "learning_rate": 2.4753843575964094e-06, "loss": 1.4552783966064453, "step": 3880 }, { "epoch": 1.1948441708349364, "grad_norm": 4.9375, "learning_rate": 2.473460717299072e-06, "loss": 1.150854468345642, "step": 3882 }, { "epoch": 1.1954597922277799, "grad_norm": 7.5625, "learning_rate": 2.4715376002356225e-06, "loss": 1.5145831108093262, "step": 3884 }, { "epoch": 1.1960754136206233, "grad_norm": 17.0, "learning_rate": 2.4696150083171057e-06, "loss": 1.0157501697540283, "step": 3886 }, { "epoch": 1.1966910350134667, "grad_norm": 10.4375, "learning_rate": 2.4676929434540444e-06, "loss": 1.3922085762023926, "step": 3888 }, { "epoch": 1.1973066564063102, "grad_norm": 13.5625, "learning_rate": 2.4657714075564374e-06, "loss": 1.631951093673706, "step": 3890 }, { "epoch": 1.1979222777991536, "grad_norm": 5.84375, "learning_rate": 2.463850402533758e-06, "loss": 1.3009285926818848, "step": 3892 }, { "epoch": 1.1985378991919968, "grad_norm": 14.125, "learning_rate": 2.4619299302949517e-06, "loss": 1.3850191831588745, "step": 3894 }, { "epoch": 1.1991535205848403, "grad_norm": 7.0625, "learning_rate": 2.4600099927484345e-06, "loss": 0.9943208694458008, "step": 3896 }, { "epoch": 1.1997691419776837, "grad_norm": 12.0, "learning_rate": 2.458090591802092e-06, "loss": 1.3799011707305908, "step": 3898 }, { "epoch": 1.2003847633705271, "grad_norm": 4.40625, "learning_rate": 2.456171729363276e-06, "loss": 1.3535175323486328, "step": 3900 }, { "epoch": 1.2010003847633706, "grad_norm": 6.21875, "learning_rate": 2.4542534073388026e-06, "loss": 1.3181215524673462, "step": 3902 }, { "epoch": 1.201616006156214, "grad_norm": 8.5, "learning_rate": 2.4523356276349515e-06, "loss": 1.240586280822754, "step": 3904 }, { "epoch": 1.2022316275490574, "grad_norm": 8.0625, "learning_rate": 2.4504183921574648e-06, "loss": 1.71377694606781, "step": 3906 }, { "epoch": 1.2028472489419006, "grad_norm": 32.25, "learning_rate": 2.44850170281154e-06, "loss": 1.4551841020584106, "step": 3908 }, { "epoch": 1.203462870334744, "grad_norm": 10.6875, "learning_rate": 2.446585561501836e-06, "loss": 1.301320195198059, "step": 3910 }, { "epoch": 1.2040784917275875, "grad_norm": 4.875, "learning_rate": 2.4446699701324643e-06, "loss": 1.2411322593688965, "step": 3912 }, { "epoch": 1.204694113120431, "grad_norm": 5.65625, "learning_rate": 2.4427549306069918e-06, "loss": 0.7906562089920044, "step": 3914 }, { "epoch": 1.2053097345132744, "grad_norm": 7.71875, "learning_rate": 2.4408404448284352e-06, "loss": 1.1022000312805176, "step": 3916 }, { "epoch": 1.2059253559061178, "grad_norm": 6.53125, "learning_rate": 2.4389265146992637e-06, "loss": 1.2730300426483154, "step": 3918 }, { "epoch": 1.2065409772989613, "grad_norm": 7.53125, "learning_rate": 2.437013142121391e-06, "loss": 1.5247621536254883, "step": 3920 }, { "epoch": 1.2071565986918045, "grad_norm": 9.125, "learning_rate": 2.435100328996179e-06, "loss": 1.2567329406738281, "step": 3922 }, { "epoch": 1.207772220084648, "grad_norm": 4.0, "learning_rate": 2.433188077224432e-06, "loss": 1.0988311767578125, "step": 3924 }, { "epoch": 1.2083878414774913, "grad_norm": 5.15625, "learning_rate": 2.431276388706398e-06, "loss": 1.4548678398132324, "step": 3926 }, { "epoch": 1.2090034628703348, "grad_norm": 10.625, "learning_rate": 2.429365265341766e-06, "loss": 1.5173746347427368, "step": 3928 }, { "epoch": 1.2096190842631782, "grad_norm": 5.8125, "learning_rate": 2.4274547090296614e-06, "loss": 1.312397837638855, "step": 3930 }, { "epoch": 1.2102347056560214, "grad_norm": 4.125, "learning_rate": 2.4255447216686455e-06, "loss": 1.3173680305480957, "step": 3932 }, { "epoch": 1.2108503270488649, "grad_norm": 3.6875, "learning_rate": 2.4236353051567172e-06, "loss": 1.3009153604507446, "step": 3934 }, { "epoch": 1.2114659484417083, "grad_norm": 8.125, "learning_rate": 2.4217264613913053e-06, "loss": 1.756670594215393, "step": 3936 }, { "epoch": 1.2120815698345517, "grad_norm": 9.8125, "learning_rate": 2.4198181922692714e-06, "loss": 1.6502559185028076, "step": 3938 }, { "epoch": 1.2126971912273952, "grad_norm": 7.21875, "learning_rate": 2.417910499686905e-06, "loss": 1.3588635921478271, "step": 3940 }, { "epoch": 1.2133128126202386, "grad_norm": 12.3125, "learning_rate": 2.4160033855399235e-06, "loss": 1.100928783416748, "step": 3942 }, { "epoch": 1.213928434013082, "grad_norm": 4.78125, "learning_rate": 2.4140968517234682e-06, "loss": 1.347513198852539, "step": 3944 }, { "epoch": 1.2145440554059252, "grad_norm": 6.6875, "learning_rate": 2.4121909001321054e-06, "loss": 1.5707118511199951, "step": 3946 }, { "epoch": 1.2151596767987687, "grad_norm": 3.171875, "learning_rate": 2.4102855326598205e-06, "loss": 1.0723353624343872, "step": 3948 }, { "epoch": 1.2157752981916121, "grad_norm": 5.78125, "learning_rate": 2.408380751200021e-06, "loss": 1.1773736476898193, "step": 3950 }, { "epoch": 1.2163909195844556, "grad_norm": 6.0625, "learning_rate": 2.4064765576455307e-06, "loss": 1.230358600616455, "step": 3952 }, { "epoch": 1.217006540977299, "grad_norm": 7.90625, "learning_rate": 2.40457295388859e-06, "loss": 1.334424614906311, "step": 3954 }, { "epoch": 1.2176221623701424, "grad_norm": 8.25, "learning_rate": 2.402669941820852e-06, "loss": 1.5765841007232666, "step": 3956 }, { "epoch": 1.2182377837629859, "grad_norm": 10.375, "learning_rate": 2.4007675233333816e-06, "loss": 1.255386471748352, "step": 3958 }, { "epoch": 1.218853405155829, "grad_norm": 8.4375, "learning_rate": 2.398865700316656e-06, "loss": 1.4202911853790283, "step": 3960 }, { "epoch": 1.2194690265486725, "grad_norm": 8.1875, "learning_rate": 2.3969644746605584e-06, "loss": 1.4272559881210327, "step": 3962 }, { "epoch": 1.220084647941516, "grad_norm": 2.046875, "learning_rate": 2.39506384825438e-06, "loss": 1.2267391681671143, "step": 3964 }, { "epoch": 1.2207002693343594, "grad_norm": 8.375, "learning_rate": 2.3931638229868163e-06, "loss": 1.056898593902588, "step": 3966 }, { "epoch": 1.2213158907272028, "grad_norm": 8.5625, "learning_rate": 2.391264400745964e-06, "loss": 1.5538334846496582, "step": 3968 }, { "epoch": 1.2219315121200462, "grad_norm": 8.625, "learning_rate": 2.389365583419323e-06, "loss": 1.3304157257080078, "step": 3970 }, { "epoch": 1.2225471335128897, "grad_norm": 6.71875, "learning_rate": 2.3874673728937886e-06, "loss": 1.3482346534729004, "step": 3972 }, { "epoch": 1.223162754905733, "grad_norm": 7.65625, "learning_rate": 2.3855697710556562e-06, "loss": 0.9510328769683838, "step": 3974 }, { "epoch": 1.2237783762985763, "grad_norm": 7.6875, "learning_rate": 2.3836727797906157e-06, "loss": 1.4577893018722534, "step": 3976 }, { "epoch": 1.2243939976914198, "grad_norm": 5.6875, "learning_rate": 2.381776400983749e-06, "loss": 1.333723545074463, "step": 3978 }, { "epoch": 1.2250096190842632, "grad_norm": 10.625, "learning_rate": 2.3798806365195305e-06, "loss": 1.211458444595337, "step": 3980 }, { "epoch": 1.2256252404771066, "grad_norm": 2.34375, "learning_rate": 2.377985488281825e-06, "loss": 1.0996417999267578, "step": 3982 }, { "epoch": 1.22624086186995, "grad_norm": 3.9375, "learning_rate": 2.3760909581538818e-06, "loss": 1.1678493022918701, "step": 3984 }, { "epoch": 1.2268564832627935, "grad_norm": 14.9375, "learning_rate": 2.374197048018339e-06, "loss": 1.5703445672988892, "step": 3986 }, { "epoch": 1.2274721046556367, "grad_norm": 6.25, "learning_rate": 2.372303759757218e-06, "loss": 1.2629894018173218, "step": 3988 }, { "epoch": 1.2280877260484802, "grad_norm": 16.75, "learning_rate": 2.3704110952519206e-06, "loss": 2.015996217727661, "step": 3990 }, { "epoch": 1.2287033474413236, "grad_norm": 17.0, "learning_rate": 2.3685190563832307e-06, "loss": 0.9846193790435791, "step": 3992 }, { "epoch": 1.229318968834167, "grad_norm": 3.828125, "learning_rate": 2.36662764503131e-06, "loss": 1.3436334133148193, "step": 3994 }, { "epoch": 1.2299345902270105, "grad_norm": 6.625, "learning_rate": 2.3647368630756964e-06, "loss": 1.0975793600082397, "step": 3996 }, { "epoch": 1.2305502116198537, "grad_norm": 5.875, "learning_rate": 2.3628467123953015e-06, "loss": 1.2426040172576904, "step": 3998 }, { "epoch": 1.231165833012697, "grad_norm": 4.90625, "learning_rate": 2.3609571948684107e-06, "loss": 1.2495640516281128, "step": 4000 }, { "epoch": 1.2317814544055405, "grad_norm": 9.125, "learning_rate": 2.35906831237268e-06, "loss": 1.39968740940094, "step": 4002 }, { "epoch": 1.232397075798384, "grad_norm": 13.125, "learning_rate": 2.3571800667851343e-06, "loss": 1.3630619049072266, "step": 4004 }, { "epoch": 1.2330126971912274, "grad_norm": 5.09375, "learning_rate": 2.355292459982165e-06, "loss": 1.2856032848358154, "step": 4006 }, { "epoch": 1.2336283185840708, "grad_norm": 6.8125, "learning_rate": 2.3534054938395313e-06, "loss": 1.4433751106262207, "step": 4008 }, { "epoch": 1.2342439399769143, "grad_norm": 5.5, "learning_rate": 2.351519170232352e-06, "loss": 1.1141834259033203, "step": 4010 }, { "epoch": 1.2348595613697575, "grad_norm": 7.71875, "learning_rate": 2.3496334910351086e-06, "loss": 1.4758944511413574, "step": 4012 }, { "epoch": 1.235475182762601, "grad_norm": 12.25, "learning_rate": 2.3477484581216435e-06, "loss": 1.576067328453064, "step": 4014 }, { "epoch": 1.2360908041554444, "grad_norm": 9.125, "learning_rate": 2.345864073365157e-06, "loss": 1.7607650756835938, "step": 4016 }, { "epoch": 1.2367064255482878, "grad_norm": 4.9375, "learning_rate": 2.3439803386382033e-06, "loss": 1.2492669820785522, "step": 4018 }, { "epoch": 1.2373220469411312, "grad_norm": 3.890625, "learning_rate": 2.3420972558126933e-06, "loss": 1.0438222885131836, "step": 4020 }, { "epoch": 1.2379376683339747, "grad_norm": 11.6875, "learning_rate": 2.3402148267598875e-06, "loss": 1.4356199502944946, "step": 4022 }, { "epoch": 1.238553289726818, "grad_norm": 2.890625, "learning_rate": 2.3383330533503973e-06, "loss": 1.433849573135376, "step": 4024 }, { "epoch": 1.2391689111196613, "grad_norm": 11.0625, "learning_rate": 2.3364519374541838e-06, "loss": 1.0784897804260254, "step": 4026 }, { "epoch": 1.2397845325125048, "grad_norm": 5.78125, "learning_rate": 2.334571480940554e-06, "loss": 1.433074712753296, "step": 4028 }, { "epoch": 1.2404001539053482, "grad_norm": 10.125, "learning_rate": 2.3326916856781603e-06, "loss": 0.9229840636253357, "step": 4030 }, { "epoch": 1.2410157752981916, "grad_norm": 9.125, "learning_rate": 2.330812553534996e-06, "loss": 1.4485481977462769, "step": 4032 }, { "epoch": 1.241631396691035, "grad_norm": 5.0625, "learning_rate": 2.3289340863783993e-06, "loss": 1.4929358959197998, "step": 4034 }, { "epoch": 1.2422470180838785, "grad_norm": 7.0, "learning_rate": 2.327056286075042e-06, "loss": 1.5561566352844238, "step": 4036 }, { "epoch": 1.242862639476722, "grad_norm": 11.875, "learning_rate": 2.325179154490938e-06, "loss": 1.411828637123108, "step": 4038 }, { "epoch": 1.2434782608695651, "grad_norm": 4.84375, "learning_rate": 2.3233026934914347e-06, "loss": 1.2632527351379395, "step": 4040 }, { "epoch": 1.2440938822624086, "grad_norm": 10.8125, "learning_rate": 2.3214269049412142e-06, "loss": 1.2041977643966675, "step": 4042 }, { "epoch": 1.244709503655252, "grad_norm": 6.9375, "learning_rate": 2.3195517907042884e-06, "loss": 1.3839242458343506, "step": 4044 }, { "epoch": 1.2453251250480954, "grad_norm": 5.375, "learning_rate": 2.317677352644001e-06, "loss": 1.4752936363220215, "step": 4046 }, { "epoch": 1.2459407464409389, "grad_norm": 7.34375, "learning_rate": 2.315803592623024e-06, "loss": 1.6179171800613403, "step": 4048 }, { "epoch": 1.2465563678337823, "grad_norm": 4.46875, "learning_rate": 2.3139305125033533e-06, "loss": 1.3558285236358643, "step": 4050 }, { "epoch": 1.2471719892266255, "grad_norm": 5.28125, "learning_rate": 2.3120581141463107e-06, "loss": 1.619982123374939, "step": 4052 }, { "epoch": 1.247787610619469, "grad_norm": 10.0, "learning_rate": 2.3101863994125417e-06, "loss": 1.641546368598938, "step": 4054 }, { "epoch": 1.2484032320123124, "grad_norm": 5.25, "learning_rate": 2.30831537016201e-06, "loss": 1.7766023874282837, "step": 4056 }, { "epoch": 1.2490188534051558, "grad_norm": 18.625, "learning_rate": 2.3064450282539993e-06, "loss": 1.1997973918914795, "step": 4058 }, { "epoch": 1.2496344747979993, "grad_norm": 12.0625, "learning_rate": 2.3045753755471114e-06, "loss": 1.1165738105773926, "step": 4060 }, { "epoch": 1.2502500961908427, "grad_norm": 5.46875, "learning_rate": 2.3027064138992604e-06, "loss": 1.2666813135147095, "step": 4062 }, { "epoch": 1.250865717583686, "grad_norm": 26.25, "learning_rate": 2.3008381451676764e-06, "loss": 1.0904207229614258, "step": 4064 }, { "epoch": 1.2514813389765294, "grad_norm": 13.125, "learning_rate": 2.2989705712089004e-06, "loss": 1.314873456954956, "step": 4066 }, { "epoch": 1.2520969603693728, "grad_norm": 5.4375, "learning_rate": 2.2971036938787816e-06, "loss": 0.7683751583099365, "step": 4068 }, { "epoch": 1.2527125817622162, "grad_norm": 4.40625, "learning_rate": 2.2952375150324785e-06, "loss": 1.3286861181259155, "step": 4070 }, { "epoch": 1.2533282031550597, "grad_norm": 6.0, "learning_rate": 2.293372036524454e-06, "loss": 1.5416184663772583, "step": 4072 }, { "epoch": 1.253943824547903, "grad_norm": 10.0625, "learning_rate": 2.2915072602084778e-06, "loss": 1.1479551792144775, "step": 4074 }, { "epoch": 1.2545594459407465, "grad_norm": 17.0, "learning_rate": 2.2896431879376177e-06, "loss": 0.9122674465179443, "step": 4076 }, { "epoch": 1.2551750673335897, "grad_norm": 4.28125, "learning_rate": 2.2877798215642445e-06, "loss": 0.7844657897949219, "step": 4078 }, { "epoch": 1.2557906887264332, "grad_norm": 4.78125, "learning_rate": 2.285917162940028e-06, "loss": 1.523891568183899, "step": 4080 }, { "epoch": 1.2564063101192766, "grad_norm": 7.0, "learning_rate": 2.2840552139159335e-06, "loss": 1.2284789085388184, "step": 4082 }, { "epoch": 1.25702193151212, "grad_norm": 8.0625, "learning_rate": 2.2821939763422217e-06, "loss": 1.3383231163024902, "step": 4084 }, { "epoch": 1.2576375529049635, "grad_norm": 4.6875, "learning_rate": 2.2803334520684456e-06, "loss": 0.7896304726600647, "step": 4086 }, { "epoch": 1.258253174297807, "grad_norm": 10.875, "learning_rate": 2.2784736429434505e-06, "loss": 0.9124029874801636, "step": 4088 }, { "epoch": 1.2588687956906504, "grad_norm": 25.125, "learning_rate": 2.276614550815369e-06, "loss": 1.7797210216522217, "step": 4090 }, { "epoch": 1.2594844170834936, "grad_norm": 6.84375, "learning_rate": 2.274756177531624e-06, "loss": 1.1753852367401123, "step": 4092 }, { "epoch": 1.260100038476337, "grad_norm": 7.09375, "learning_rate": 2.2728985249389225e-06, "loss": 1.2321572303771973, "step": 4094 }, { "epoch": 1.2607156598691804, "grad_norm": 6.5625, "learning_rate": 2.2710415948832557e-06, "loss": 1.5728049278259277, "step": 4096 }, { "epoch": 1.2613312812620239, "grad_norm": 6.53125, "learning_rate": 2.2691853892098957e-06, "loss": 1.596179485321045, "step": 4098 }, { "epoch": 1.2619469026548673, "grad_norm": 6.71875, "learning_rate": 2.267329909763397e-06, "loss": 1.1732280254364014, "step": 4100 }, { "epoch": 1.2625625240477105, "grad_norm": 4.0625, "learning_rate": 2.26547515838759e-06, "loss": 1.041813850402832, "step": 4102 }, { "epoch": 1.2631781454405542, "grad_norm": 19.875, "learning_rate": 2.263621136925583e-06, "loss": 1.3980480432510376, "step": 4104 }, { "epoch": 1.2637937668333974, "grad_norm": 7.5, "learning_rate": 2.261767847219758e-06, "loss": 1.753159761428833, "step": 4106 }, { "epoch": 1.2644093882262408, "grad_norm": 8.625, "learning_rate": 2.2599152911117726e-06, "loss": 1.6130338907241821, "step": 4108 }, { "epoch": 1.2650250096190843, "grad_norm": 3.34375, "learning_rate": 2.2580634704425513e-06, "loss": 1.5305721759796143, "step": 4110 }, { "epoch": 1.2656406310119277, "grad_norm": 6.25, "learning_rate": 2.2562123870522914e-06, "loss": 1.4050418138504028, "step": 4112 }, { "epoch": 1.2662562524047711, "grad_norm": 9.8125, "learning_rate": 2.254362042780454e-06, "loss": 1.247245192527771, "step": 4114 }, { "epoch": 1.2668718737976143, "grad_norm": 5.875, "learning_rate": 2.2525124394657694e-06, "loss": 0.8592467308044434, "step": 4116 }, { "epoch": 1.267487495190458, "grad_norm": 7.09375, "learning_rate": 2.2506635789462287e-06, "loss": 1.0305540561676025, "step": 4118 }, { "epoch": 1.2681031165833012, "grad_norm": 3.09375, "learning_rate": 2.2488154630590876e-06, "loss": 1.1216397285461426, "step": 4120 }, { "epoch": 1.2687187379761447, "grad_norm": 4.59375, "learning_rate": 2.2469680936408584e-06, "loss": 1.134533405303955, "step": 4122 }, { "epoch": 1.269334359368988, "grad_norm": 7.1875, "learning_rate": 2.2451214725273143e-06, "loss": 0.668088436126709, "step": 4124 }, { "epoch": 1.2699499807618315, "grad_norm": 7.8125, "learning_rate": 2.2432756015534853e-06, "loss": 1.3163663148880005, "step": 4126 }, { "epoch": 1.270565602154675, "grad_norm": 10.0, "learning_rate": 2.2414304825536526e-06, "loss": 1.5193448066711426, "step": 4128 }, { "epoch": 1.2711812235475182, "grad_norm": 4.53125, "learning_rate": 2.239586117361354e-06, "loss": 1.1491132974624634, "step": 4130 }, { "epoch": 1.2717968449403616, "grad_norm": 7.71875, "learning_rate": 2.237742507809375e-06, "loss": 0.9073981046676636, "step": 4132 }, { "epoch": 1.272412466333205, "grad_norm": 5.90625, "learning_rate": 2.2358996557297534e-06, "loss": 1.2663335800170898, "step": 4134 }, { "epoch": 1.2730280877260485, "grad_norm": 5.8125, "learning_rate": 2.2340575629537713e-06, "loss": 1.2100262641906738, "step": 4136 }, { "epoch": 1.273643709118892, "grad_norm": 8.5625, "learning_rate": 2.232216231311959e-06, "loss": 1.367043375968933, "step": 4138 }, { "epoch": 1.2742593305117353, "grad_norm": 6.4375, "learning_rate": 2.2303756626340875e-06, "loss": 1.49073326587677, "step": 4140 }, { "epoch": 1.2748749519045788, "grad_norm": 5.1875, "learning_rate": 2.228535858749172e-06, "loss": 1.3855880498886108, "step": 4142 }, { "epoch": 1.275490573297422, "grad_norm": 11.25, "learning_rate": 2.2266968214854664e-06, "loss": 1.4641410112380981, "step": 4144 }, { "epoch": 1.2761061946902654, "grad_norm": 5.21875, "learning_rate": 2.2248585526704635e-06, "loss": 1.4227370023727417, "step": 4146 }, { "epoch": 1.2767218160831089, "grad_norm": 6.0, "learning_rate": 2.223021054130892e-06, "loss": 1.0786595344543457, "step": 4148 }, { "epoch": 1.2773374374759523, "grad_norm": 13.9375, "learning_rate": 2.221184327692717e-06, "loss": 1.286320447921753, "step": 4150 }, { "epoch": 1.2779530588687957, "grad_norm": 2.046875, "learning_rate": 2.2193483751811324e-06, "loss": 1.0497817993164062, "step": 4152 }, { "epoch": 1.2785686802616392, "grad_norm": 5.53125, "learning_rate": 2.2175131984205664e-06, "loss": 1.2185951471328735, "step": 4154 }, { "epoch": 1.2791843016544826, "grad_norm": 7.34375, "learning_rate": 2.2156787992346752e-06, "loss": 1.302861213684082, "step": 4156 }, { "epoch": 1.2797999230473258, "grad_norm": 7.875, "learning_rate": 2.2138451794463423e-06, "loss": 1.4149022102355957, "step": 4158 }, { "epoch": 1.2804155444401693, "grad_norm": 5.6875, "learning_rate": 2.2120123408776765e-06, "loss": 0.7368856072425842, "step": 4160 }, { "epoch": 1.2810311658330127, "grad_norm": 2.53125, "learning_rate": 2.2101802853500118e-06, "loss": 1.0423181056976318, "step": 4162 }, { "epoch": 1.2816467872258561, "grad_norm": 11.5, "learning_rate": 2.2083490146839023e-06, "loss": 0.8842816352844238, "step": 4164 }, { "epoch": 1.2822624086186996, "grad_norm": 7.0, "learning_rate": 2.206518530699122e-06, "loss": 1.50794517993927, "step": 4166 }, { "epoch": 1.2828780300115428, "grad_norm": 7.625, "learning_rate": 2.2046888352146632e-06, "loss": 1.2277774810791016, "step": 4168 }, { "epoch": 1.2834936514043864, "grad_norm": 6.34375, "learning_rate": 2.2028599300487372e-06, "loss": 1.3741124868392944, "step": 4170 }, { "epoch": 1.2841092727972296, "grad_norm": 5.96875, "learning_rate": 2.2010318170187676e-06, "loss": 0.7409383654594421, "step": 4172 }, { "epoch": 1.284724894190073, "grad_norm": 6.46875, "learning_rate": 2.199204497941391e-06, "loss": 1.3880950212478638, "step": 4174 }, { "epoch": 1.2853405155829165, "grad_norm": 13.8125, "learning_rate": 2.1973779746324556e-06, "loss": 1.5435235500335693, "step": 4176 }, { "epoch": 1.28595613697576, "grad_norm": 7.34375, "learning_rate": 2.195552248907018e-06, "loss": 1.1817162036895752, "step": 4178 }, { "epoch": 1.2865717583686034, "grad_norm": 5.75, "learning_rate": 2.1937273225793422e-06, "loss": 1.2965961694717407, "step": 4180 }, { "epoch": 1.2871873797614466, "grad_norm": 6.78125, "learning_rate": 2.1919031974628995e-06, "loss": 1.1692346334457397, "step": 4182 }, { "epoch": 1.2878030011542902, "grad_norm": 7.9375, "learning_rate": 2.190079875370364e-06, "loss": 1.1079033613204956, "step": 4184 }, { "epoch": 1.2884186225471335, "grad_norm": 3.515625, "learning_rate": 2.1882573581136112e-06, "loss": 1.3430814743041992, "step": 4186 }, { "epoch": 1.289034243939977, "grad_norm": 5.75, "learning_rate": 2.1864356475037175e-06, "loss": 1.2273108959197998, "step": 4188 }, { "epoch": 1.2896498653328203, "grad_norm": 3.65625, "learning_rate": 2.184614745350958e-06, "loss": 1.1358712911605835, "step": 4190 }, { "epoch": 1.2902654867256638, "grad_norm": 4.75, "learning_rate": 2.1827946534648035e-06, "loss": 1.0054996013641357, "step": 4192 }, { "epoch": 1.2908811081185072, "grad_norm": 2.4375, "learning_rate": 2.1809753736539195e-06, "loss": 0.9101351499557495, "step": 4194 }, { "epoch": 1.2914967295113504, "grad_norm": 9.4375, "learning_rate": 2.1791569077261663e-06, "loss": 1.3053233623504639, "step": 4196 }, { "epoch": 1.2921123509041939, "grad_norm": 9.125, "learning_rate": 2.177339257488594e-06, "loss": 1.711419939994812, "step": 4198 }, { "epoch": 1.2927279722970373, "grad_norm": 4.625, "learning_rate": 2.1755224247474416e-06, "loss": 1.2489417791366577, "step": 4200 }, { "epoch": 1.2933435936898807, "grad_norm": 8.9375, "learning_rate": 2.1737064113081376e-06, "loss": 1.293163776397705, "step": 4202 }, { "epoch": 1.2939592150827242, "grad_norm": 4.34375, "learning_rate": 2.1718912189752945e-06, "loss": 1.2553480863571167, "step": 4204 }, { "epoch": 1.2945748364755676, "grad_norm": 5.40625, "learning_rate": 2.1700768495527097e-06, "loss": 1.1565001010894775, "step": 4206 }, { "epoch": 1.295190457868411, "grad_norm": 4.65625, "learning_rate": 2.168263304843363e-06, "loss": 1.3635306358337402, "step": 4208 }, { "epoch": 1.2958060792612542, "grad_norm": 11.625, "learning_rate": 2.1664505866494143e-06, "loss": 1.366041898727417, "step": 4210 }, { "epoch": 1.2964217006540977, "grad_norm": 7.625, "learning_rate": 2.1646386967722024e-06, "loss": 1.3417686223983765, "step": 4212 }, { "epoch": 1.297037322046941, "grad_norm": 8.9375, "learning_rate": 2.1628276370122435e-06, "loss": 1.6105479001998901, "step": 4214 }, { "epoch": 1.2976529434397845, "grad_norm": 9.0, "learning_rate": 2.161017409169227e-06, "loss": 1.046107530593872, "step": 4216 }, { "epoch": 1.298268564832628, "grad_norm": 7.65625, "learning_rate": 2.159208015042018e-06, "loss": 1.0018730163574219, "step": 4218 }, { "epoch": 1.2988841862254714, "grad_norm": 11.125, "learning_rate": 2.1573994564286504e-06, "loss": 0.7739481925964355, "step": 4220 }, { "epoch": 1.2994998076183149, "grad_norm": 6.1875, "learning_rate": 2.1555917351263315e-06, "loss": 1.2665600776672363, "step": 4222 }, { "epoch": 1.300115429011158, "grad_norm": 14.4375, "learning_rate": 2.153784852931433e-06, "loss": 1.6424885988235474, "step": 4224 }, { "epoch": 1.3007310504040015, "grad_norm": 8.3125, "learning_rate": 2.1519788116394944e-06, "loss": 0.8370161056518555, "step": 4226 }, { "epoch": 1.301346671796845, "grad_norm": 7.5, "learning_rate": 2.1501736130452215e-06, "loss": 1.579601526260376, "step": 4228 }, { "epoch": 1.3019622931896884, "grad_norm": 20.125, "learning_rate": 2.148369258942477e-06, "loss": 1.3582897186279297, "step": 4230 }, { "epoch": 1.3025779145825318, "grad_norm": 7.96875, "learning_rate": 2.146565751124291e-06, "loss": 1.4349735975265503, "step": 4232 }, { "epoch": 1.303193535975375, "grad_norm": 7.21875, "learning_rate": 2.1447630913828486e-06, "loss": 1.290523648262024, "step": 4234 }, { "epoch": 1.3038091573682187, "grad_norm": 5.375, "learning_rate": 2.1429612815094917e-06, "loss": 1.2350491285324097, "step": 4236 }, { "epoch": 1.3044247787610619, "grad_norm": 7.5625, "learning_rate": 2.141160323294722e-06, "loss": 1.2998807430267334, "step": 4238 }, { "epoch": 1.3050404001539053, "grad_norm": 23.5, "learning_rate": 2.1393602185281895e-06, "loss": 1.1882212162017822, "step": 4240 }, { "epoch": 1.3056560215467488, "grad_norm": 11.1875, "learning_rate": 2.1375609689987018e-06, "loss": 1.3439292907714844, "step": 4242 }, { "epoch": 1.3062716429395922, "grad_norm": 5.4375, "learning_rate": 2.1357625764942096e-06, "loss": 1.469037652015686, "step": 4244 }, { "epoch": 1.3068872643324356, "grad_norm": 5.5, "learning_rate": 2.133965042801819e-06, "loss": 1.5019780397415161, "step": 4246 }, { "epoch": 1.3075028857252788, "grad_norm": 13.5, "learning_rate": 2.1321683697077774e-06, "loss": 0.697911262512207, "step": 4248 }, { "epoch": 1.3081185071181223, "grad_norm": 11.875, "learning_rate": 2.1303725589974797e-06, "loss": 1.7564258575439453, "step": 4250 }, { "epoch": 1.3087341285109657, "grad_norm": 5.625, "learning_rate": 2.1285776124554644e-06, "loss": 1.2818973064422607, "step": 4252 }, { "epoch": 1.3093497499038091, "grad_norm": 13.3125, "learning_rate": 2.126783531865409e-06, "loss": 1.2938495874404907, "step": 4254 }, { "epoch": 1.3099653712966526, "grad_norm": 4.21875, "learning_rate": 2.124990319010132e-06, "loss": 1.5566442012786865, "step": 4256 }, { "epoch": 1.310580992689496, "grad_norm": 19.125, "learning_rate": 2.123197975671589e-06, "loss": 1.201784610748291, "step": 4258 }, { "epoch": 1.3111966140823395, "grad_norm": 4.25, "learning_rate": 2.121406503630871e-06, "loss": 1.2029253244400024, "step": 4260 }, { "epoch": 1.3118122354751827, "grad_norm": 7.0, "learning_rate": 2.1196159046682058e-06, "loss": 1.1904261112213135, "step": 4262 }, { "epoch": 1.312427856868026, "grad_norm": 33.25, "learning_rate": 2.1178261805629495e-06, "loss": 1.1313016414642334, "step": 4264 }, { "epoch": 1.3130434782608695, "grad_norm": 7.5, "learning_rate": 2.1160373330935937e-06, "loss": 1.395995020866394, "step": 4266 }, { "epoch": 1.313659099653713, "grad_norm": 10.375, "learning_rate": 2.114249364037754e-06, "loss": 1.5595605373382568, "step": 4268 }, { "epoch": 1.3142747210465564, "grad_norm": 3.890625, "learning_rate": 2.112462275172176e-06, "loss": 1.3686449527740479, "step": 4270 }, { "epoch": 1.3148903424393998, "grad_norm": 17.0, "learning_rate": 2.110676068272731e-06, "loss": 0.719721794128418, "step": 4272 }, { "epoch": 1.3155059638322433, "grad_norm": 5.28125, "learning_rate": 2.1088907451144105e-06, "loss": 1.0529125928878784, "step": 4274 }, { "epoch": 1.3161215852250865, "grad_norm": 4.625, "learning_rate": 2.107106307471332e-06, "loss": 0.9316014051437378, "step": 4276 }, { "epoch": 1.31673720661793, "grad_norm": 5.03125, "learning_rate": 2.1053227571167316e-06, "loss": 0.9757555723190308, "step": 4278 }, { "epoch": 1.3173528280107734, "grad_norm": 30.875, "learning_rate": 2.1035400958229617e-06, "loss": 1.6467084884643555, "step": 4280 }, { "epoch": 1.3179684494036168, "grad_norm": 32.75, "learning_rate": 2.1017583253614936e-06, "loss": 1.2693678140640259, "step": 4282 }, { "epoch": 1.3185840707964602, "grad_norm": 20.25, "learning_rate": 2.099977447502912e-06, "loss": 1.5373982191085815, "step": 4284 }, { "epoch": 1.3191996921893034, "grad_norm": 6.0, "learning_rate": 2.0981974640169155e-06, "loss": 1.1194545030593872, "step": 4286 }, { "epoch": 1.319815313582147, "grad_norm": 12.4375, "learning_rate": 2.0964183766723143e-06, "loss": 1.5202691555023193, "step": 4288 }, { "epoch": 1.3204309349749903, "grad_norm": 13.4375, "learning_rate": 2.094640187237026e-06, "loss": 1.4231841564178467, "step": 4290 }, { "epoch": 1.3210465563678337, "grad_norm": 10.5, "learning_rate": 2.0928628974780784e-06, "loss": 1.7791295051574707, "step": 4292 }, { "epoch": 1.3216621777606772, "grad_norm": 2.875, "learning_rate": 2.0910865091616044e-06, "loss": 0.8141826391220093, "step": 4294 }, { "epoch": 1.3222777991535206, "grad_norm": 8.5625, "learning_rate": 2.08931102405284e-06, "loss": 1.2941914796829224, "step": 4296 }, { "epoch": 1.322893420546364, "grad_norm": 11.8125, "learning_rate": 2.087536443916124e-06, "loss": 1.2962119579315186, "step": 4298 }, { "epoch": 1.3235090419392073, "grad_norm": 7.75, "learning_rate": 2.0857627705148985e-06, "loss": 0.9499344825744629, "step": 4300 }, { "epoch": 1.324124663332051, "grad_norm": 5.90625, "learning_rate": 2.083990005611701e-06, "loss": 1.2333036661148071, "step": 4302 }, { "epoch": 1.3247402847248941, "grad_norm": 7.25, "learning_rate": 2.082218150968167e-06, "loss": 0.8718281984329224, "step": 4304 }, { "epoch": 1.3253559061177376, "grad_norm": 13.625, "learning_rate": 2.080447208345031e-06, "loss": 1.2827845811843872, "step": 4306 }, { "epoch": 1.325971527510581, "grad_norm": 14.25, "learning_rate": 2.078677179502115e-06, "loss": 1.189026951789856, "step": 4308 }, { "epoch": 1.3265871489034244, "grad_norm": 16.25, "learning_rate": 2.076908066198339e-06, "loss": 1.5428333282470703, "step": 4310 }, { "epoch": 1.3272027702962679, "grad_norm": 11.3125, "learning_rate": 2.0751398701917092e-06, "loss": 1.5201754570007324, "step": 4312 }, { "epoch": 1.327818391689111, "grad_norm": 9.1875, "learning_rate": 2.073372593239321e-06, "loss": 1.4332178831100464, "step": 4314 }, { "epoch": 1.3284340130819545, "grad_norm": 7.3125, "learning_rate": 2.0716062370973587e-06, "loss": 0.520851731300354, "step": 4316 }, { "epoch": 1.329049634474798, "grad_norm": 4.1875, "learning_rate": 2.069840803521089e-06, "loss": 1.1611192226409912, "step": 4318 }, { "epoch": 1.3296652558676414, "grad_norm": 11.9375, "learning_rate": 2.0680762942648646e-06, "loss": 1.5519795417785645, "step": 4320 }, { "epoch": 1.3302808772604848, "grad_norm": 8.75, "learning_rate": 2.0663127110821144e-06, "loss": 1.5584685802459717, "step": 4322 }, { "epoch": 1.3308964986533283, "grad_norm": 9.5625, "learning_rate": 2.0645500557253544e-06, "loss": 1.6258420944213867, "step": 4324 }, { "epoch": 1.3315121200461717, "grad_norm": 7.03125, "learning_rate": 2.062788329946172e-06, "loss": 1.5503873825073242, "step": 4326 }, { "epoch": 1.332127741439015, "grad_norm": 9.0, "learning_rate": 2.0610275354952338e-06, "loss": 1.4425421953201294, "step": 4328 }, { "epoch": 1.3327433628318583, "grad_norm": 10.125, "learning_rate": 2.059267674122283e-06, "loss": 1.1959125995635986, "step": 4330 }, { "epoch": 1.3333589842247018, "grad_norm": 2.09375, "learning_rate": 2.057508747576131e-06, "loss": 1.3274942636489868, "step": 4332 }, { "epoch": 1.3339746056175452, "grad_norm": 6.53125, "learning_rate": 2.0557507576046632e-06, "loss": 1.3199305534362793, "step": 4334 }, { "epoch": 1.3345902270103887, "grad_norm": 39.5, "learning_rate": 2.0539937059548336e-06, "loss": 1.1806223392486572, "step": 4336 }, { "epoch": 1.335205848403232, "grad_norm": 3.75, "learning_rate": 2.0522375943726634e-06, "loss": 1.0360240936279297, "step": 4338 }, { "epoch": 1.3358214697960755, "grad_norm": 3.0625, "learning_rate": 2.050482424603242e-06, "loss": 1.1422436237335205, "step": 4340 }, { "epoch": 1.3364370911889187, "grad_norm": 6.3125, "learning_rate": 2.0487281983907185e-06, "loss": 0.6544202566146851, "step": 4342 }, { "epoch": 1.3370527125817622, "grad_norm": 9.4375, "learning_rate": 2.0469749174783072e-06, "loss": 0.6959512829780579, "step": 4344 }, { "epoch": 1.3376683339746056, "grad_norm": 7.75, "learning_rate": 2.045222583608285e-06, "loss": 1.5423005819320679, "step": 4346 }, { "epoch": 1.338283955367449, "grad_norm": 5.96875, "learning_rate": 2.0434711985219823e-06, "loss": 1.20880126953125, "step": 4348 }, { "epoch": 1.3388995767602925, "grad_norm": 6.21875, "learning_rate": 2.041720763959791e-06, "loss": 1.2663626670837402, "step": 4350 }, { "epoch": 1.3395151981531357, "grad_norm": 10.625, "learning_rate": 2.0399712816611573e-06, "loss": 1.7219278812408447, "step": 4352 }, { "epoch": 1.3401308195459793, "grad_norm": 3.0, "learning_rate": 2.0382227533645813e-06, "loss": 0.7353307008743286, "step": 4354 }, { "epoch": 1.3407464409388226, "grad_norm": 6.15625, "learning_rate": 2.0364751808076142e-06, "loss": 1.160996437072754, "step": 4356 }, { "epoch": 1.341362062331666, "grad_norm": 7.71875, "learning_rate": 2.034728565726858e-06, "loss": 1.242622971534729, "step": 4358 }, { "epoch": 1.3419776837245094, "grad_norm": 18.875, "learning_rate": 2.032982909857964e-06, "loss": 1.3442938327789307, "step": 4360 }, { "epoch": 1.3425933051173529, "grad_norm": 12.125, "learning_rate": 2.0312382149356276e-06, "loss": 1.2941884994506836, "step": 4362 }, { "epoch": 1.3432089265101963, "grad_norm": 15.3125, "learning_rate": 2.0294944826935937e-06, "loss": 1.6418509483337402, "step": 4364 }, { "epoch": 1.3438245479030395, "grad_norm": 4.84375, "learning_rate": 2.027751714864647e-06, "loss": 1.308324933052063, "step": 4366 }, { "epoch": 1.3444401692958832, "grad_norm": 6.40625, "learning_rate": 2.0260099131806137e-06, "loss": 1.695947289466858, "step": 4368 }, { "epoch": 1.3450557906887264, "grad_norm": 2.09375, "learning_rate": 2.024269079372365e-06, "loss": 1.0717357397079468, "step": 4370 }, { "epoch": 1.3456714120815698, "grad_norm": 7.1875, "learning_rate": 2.0225292151698016e-06, "loss": 1.1823691129684448, "step": 4372 }, { "epoch": 1.3462870334744133, "grad_norm": 19.125, "learning_rate": 2.0207903223018686e-06, "loss": 1.3260829448699951, "step": 4374 }, { "epoch": 1.3469026548672567, "grad_norm": 4.84375, "learning_rate": 2.019052402496542e-06, "loss": 1.543877363204956, "step": 4376 }, { "epoch": 1.3475182762601001, "grad_norm": 6.8125, "learning_rate": 2.017315457480832e-06, "loss": 1.5688681602478027, "step": 4378 }, { "epoch": 1.3481338976529433, "grad_norm": 4.375, "learning_rate": 2.0155794889807802e-06, "loss": 1.315077781677246, "step": 4380 }, { "epoch": 1.3487495190457868, "grad_norm": 9.75, "learning_rate": 2.0138444987214556e-06, "loss": 1.5093774795532227, "step": 4382 }, { "epoch": 1.3493651404386302, "grad_norm": 11.75, "learning_rate": 2.0121104884269598e-06, "loss": 1.6181879043579102, "step": 4384 }, { "epoch": 1.3499807618314736, "grad_norm": 7.0625, "learning_rate": 2.0103774598204144e-06, "loss": 1.4826487302780151, "step": 4386 }, { "epoch": 1.350596383224317, "grad_norm": 7.90625, "learning_rate": 2.008645414623971e-06, "loss": 1.058173418045044, "step": 4388 }, { "epoch": 1.3512120046171605, "grad_norm": 13.0625, "learning_rate": 2.006914354558801e-06, "loss": 1.2639877796173096, "step": 4390 }, { "epoch": 1.351827626010004, "grad_norm": 4.75, "learning_rate": 2.0051842813450977e-06, "loss": 1.0501971244812012, "step": 4392 }, { "epoch": 1.3524432474028472, "grad_norm": 4.75, "learning_rate": 2.003455196702074e-06, "loss": 1.299783706665039, "step": 4394 }, { "epoch": 1.3530588687956906, "grad_norm": 9.9375, "learning_rate": 2.0017271023479595e-06, "loss": 1.511252522468567, "step": 4396 }, { "epoch": 1.353674490188534, "grad_norm": 6.78125, "learning_rate": 2.0000000000000008e-06, "loss": 1.2938201427459717, "step": 4398 }, { "epoch": 1.3542901115813775, "grad_norm": 7.28125, "learning_rate": 1.9982738913744574e-06, "loss": 1.4771069288253784, "step": 4400 }, { "epoch": 1.354905732974221, "grad_norm": 2.984375, "learning_rate": 1.9965487781866026e-06, "loss": 1.3068078756332397, "step": 4402 }, { "epoch": 1.3555213543670643, "grad_norm": 34.25, "learning_rate": 1.9948246621507204e-06, "loss": 1.4712064266204834, "step": 4404 }, { "epoch": 1.3561369757599078, "grad_norm": 6.84375, "learning_rate": 1.993101544980103e-06, "loss": 1.3041491508483887, "step": 4406 }, { "epoch": 1.356752597152751, "grad_norm": 4.5, "learning_rate": 1.9913794283870513e-06, "loss": 1.106006145477295, "step": 4408 }, { "epoch": 1.3573682185455944, "grad_norm": 7.75, "learning_rate": 1.9896583140828707e-06, "loss": 1.4679481983184814, "step": 4410 }, { "epoch": 1.3579838399384379, "grad_norm": 9.5625, "learning_rate": 1.987938203777871e-06, "loss": 1.508618950843811, "step": 4412 }, { "epoch": 1.3585994613312813, "grad_norm": 8.125, "learning_rate": 1.9862190991813642e-06, "loss": 1.4362380504608154, "step": 4414 }, { "epoch": 1.3592150827241247, "grad_norm": 5.71875, "learning_rate": 1.984501002001663e-06, "loss": 0.9908819198608398, "step": 4416 }, { "epoch": 1.359830704116968, "grad_norm": 5.3125, "learning_rate": 1.9827839139460793e-06, "loss": 1.3242170810699463, "step": 4418 }, { "epoch": 1.3604463255098116, "grad_norm": 10.125, "learning_rate": 1.981067836720923e-06, "loss": 1.7057260274887085, "step": 4420 }, { "epoch": 1.3610619469026548, "grad_norm": 7.6875, "learning_rate": 1.979352772031497e-06, "loss": 1.3802309036254883, "step": 4422 }, { "epoch": 1.3616775682954982, "grad_norm": 6.84375, "learning_rate": 1.9776387215821e-06, "loss": 1.4686613082885742, "step": 4424 }, { "epoch": 1.3622931896883417, "grad_norm": 5.625, "learning_rate": 1.9759256870760226e-06, "loss": 1.37690269947052, "step": 4426 }, { "epoch": 1.3629088110811851, "grad_norm": 13.0625, "learning_rate": 1.9742136702155452e-06, "loss": 1.2001221179962158, "step": 4428 }, { "epoch": 1.3635244324740285, "grad_norm": 19.75, "learning_rate": 1.9725026727019368e-06, "loss": 1.3925925493240356, "step": 4430 }, { "epoch": 1.3641400538668718, "grad_norm": 9.375, "learning_rate": 1.970792696235456e-06, "loss": 1.4831708669662476, "step": 4432 }, { "epoch": 1.3647556752597152, "grad_norm": 6.03125, "learning_rate": 1.9690837425153433e-06, "loss": 1.2565829753875732, "step": 4434 }, { "epoch": 1.3653712966525586, "grad_norm": 6.71875, "learning_rate": 1.9673758132398245e-06, "loss": 1.4044508934020996, "step": 4436 }, { "epoch": 1.365986918045402, "grad_norm": 7.53125, "learning_rate": 1.9656689101061076e-06, "loss": 1.3536463975906372, "step": 4438 }, { "epoch": 1.3666025394382455, "grad_norm": 4.0625, "learning_rate": 1.963963034810379e-06, "loss": 1.1673328876495361, "step": 4440 }, { "epoch": 1.367218160831089, "grad_norm": 6.34375, "learning_rate": 1.9622581890478066e-06, "loss": 1.3062219619750977, "step": 4442 }, { "epoch": 1.3678337822239324, "grad_norm": 10.875, "learning_rate": 1.9605543745125343e-06, "loss": 1.3968011140823364, "step": 4444 }, { "epoch": 1.3684494036167756, "grad_norm": 10.0625, "learning_rate": 1.9588515928976793e-06, "loss": 1.5252107381820679, "step": 4446 }, { "epoch": 1.369065025009619, "grad_norm": 9.0, "learning_rate": 1.957149845895336e-06, "loss": 1.6871984004974365, "step": 4448 }, { "epoch": 1.3696806464024625, "grad_norm": 11.625, "learning_rate": 1.9554491351965654e-06, "loss": 1.533086895942688, "step": 4450 }, { "epoch": 1.370296267795306, "grad_norm": 4.09375, "learning_rate": 1.9537494624914046e-06, "loss": 1.1465895175933838, "step": 4452 }, { "epoch": 1.3709118891881493, "grad_norm": 5.5625, "learning_rate": 1.9520508294688558e-06, "loss": 1.3533809185028076, "step": 4454 }, { "epoch": 1.3715275105809928, "grad_norm": 18.125, "learning_rate": 1.950353237816887e-06, "loss": 1.780302882194519, "step": 4456 }, { "epoch": 1.3721431319738362, "grad_norm": 4.21875, "learning_rate": 1.9486566892224355e-06, "loss": 1.293370246887207, "step": 4458 }, { "epoch": 1.3727587533666794, "grad_norm": 8.375, "learning_rate": 1.9469611853713984e-06, "loss": 1.4877686500549316, "step": 4460 }, { "epoch": 1.3733743747595228, "grad_norm": 12.625, "learning_rate": 1.945266727948637e-06, "loss": 0.8963188529014587, "step": 4462 }, { "epoch": 1.3739899961523663, "grad_norm": 8.625, "learning_rate": 1.9435733186379694e-06, "loss": 1.6254758834838867, "step": 4464 }, { "epoch": 1.3746056175452097, "grad_norm": 38.5, "learning_rate": 1.941880959122177e-06, "loss": 0.9384620785713196, "step": 4466 }, { "epoch": 1.3752212389380531, "grad_norm": 12.25, "learning_rate": 1.9401896510829935e-06, "loss": 1.291838526725769, "step": 4468 }, { "epoch": 1.3758368603308964, "grad_norm": 12.6875, "learning_rate": 1.93849939620111e-06, "loss": 1.2295382022857666, "step": 4470 }, { "epoch": 1.37645248172374, "grad_norm": 9.625, "learning_rate": 1.9368101961561712e-06, "loss": 1.405733346939087, "step": 4472 }, { "epoch": 1.3770681031165832, "grad_norm": 4.5, "learning_rate": 1.935122052626773e-06, "loss": 1.1982272863388062, "step": 4474 }, { "epoch": 1.3776837245094267, "grad_norm": 19.875, "learning_rate": 1.933434967290461e-06, "loss": 1.320880651473999, "step": 4476 }, { "epoch": 1.37829934590227, "grad_norm": 6.28125, "learning_rate": 1.9317489418237303e-06, "loss": 1.209782361984253, "step": 4478 }, { "epoch": 1.3789149672951135, "grad_norm": 5.09375, "learning_rate": 1.930063977902021e-06, "loss": 1.2707037925720215, "step": 4480 }, { "epoch": 1.379530588687957, "grad_norm": 6.40625, "learning_rate": 1.928380077199721e-06, "loss": 1.4870203733444214, "step": 4482 }, { "epoch": 1.3801462100808002, "grad_norm": 65.0, "learning_rate": 1.926697241390159e-06, "loss": 1.3282127380371094, "step": 4484 }, { "epoch": 1.3807618314736438, "grad_norm": 12.3125, "learning_rate": 1.9250154721456075e-06, "loss": 1.694354772567749, "step": 4486 }, { "epoch": 1.381377452866487, "grad_norm": 9.375, "learning_rate": 1.9233347711372794e-06, "loss": 1.4913123846054077, "step": 4488 }, { "epoch": 1.3819930742593305, "grad_norm": 7.9375, "learning_rate": 1.9216551400353213e-06, "loss": 1.6237177848815918, "step": 4490 }, { "epoch": 1.382608695652174, "grad_norm": 4.96875, "learning_rate": 1.9199765805088237e-06, "loss": 1.5483479499816895, "step": 4492 }, { "epoch": 1.3832243170450174, "grad_norm": 22.875, "learning_rate": 1.9182990942258074e-06, "loss": 1.4631401300430298, "step": 4494 }, { "epoch": 1.3838399384378608, "grad_norm": 5.8125, "learning_rate": 1.9166226828532285e-06, "loss": 1.4134818315505981, "step": 4496 }, { "epoch": 1.384455559830704, "grad_norm": 6.25, "learning_rate": 1.9149473480569747e-06, "loss": 1.2620103359222412, "step": 4498 }, { "epoch": 1.3850711812235474, "grad_norm": 11.375, "learning_rate": 1.913273091501863e-06, "loss": 1.4693878889083862, "step": 4500 }, { "epoch": 1.3856868026163909, "grad_norm": 7.0, "learning_rate": 1.9115999148516408e-06, "loss": 1.0056512355804443, "step": 4502 }, { "epoch": 1.3863024240092343, "grad_norm": 8.5, "learning_rate": 1.9099278197689796e-06, "loss": 1.149704098701477, "step": 4504 }, { "epoch": 1.3869180454020777, "grad_norm": 12.3125, "learning_rate": 1.9082568079154797e-06, "loss": 1.523463249206543, "step": 4506 }, { "epoch": 1.3875336667949212, "grad_norm": 22.25, "learning_rate": 1.906586880951662e-06, "loss": 1.2891063690185547, "step": 4508 }, { "epoch": 1.3881492881877646, "grad_norm": 14.25, "learning_rate": 1.9049180405369693e-06, "loss": 1.7021067142486572, "step": 4510 }, { "epoch": 1.3887649095806078, "grad_norm": 14.875, "learning_rate": 1.9032502883297683e-06, "loss": 1.4896409511566162, "step": 4512 }, { "epoch": 1.3893805309734513, "grad_norm": 11.9375, "learning_rate": 1.9015836259873399e-06, "loss": 1.4434374570846558, "step": 4514 }, { "epoch": 1.3899961523662947, "grad_norm": 3.84375, "learning_rate": 1.8999180551658844e-06, "loss": 1.1028622388839722, "step": 4516 }, { "epoch": 1.3906117737591381, "grad_norm": 11.625, "learning_rate": 1.898253577520516e-06, "loss": 1.107566237449646, "step": 4518 }, { "epoch": 1.3912273951519816, "grad_norm": 8.6875, "learning_rate": 1.8965901947052648e-06, "loss": 1.5406841039657593, "step": 4520 }, { "epoch": 1.391843016544825, "grad_norm": 8.0625, "learning_rate": 1.8949279083730713e-06, "loss": 1.5343679189682007, "step": 4522 }, { "epoch": 1.3924586379376684, "grad_norm": 12.125, "learning_rate": 1.8932667201757853e-06, "loss": 1.6217793226242065, "step": 4524 }, { "epoch": 1.3930742593305117, "grad_norm": 6.375, "learning_rate": 1.8916066317641692e-06, "loss": 1.1800013780593872, "step": 4526 }, { "epoch": 1.393689880723355, "grad_norm": 2.703125, "learning_rate": 1.8899476447878875e-06, "loss": 1.1012816429138184, "step": 4528 }, { "epoch": 1.3943055021161985, "grad_norm": 11.5, "learning_rate": 1.8882897608955147e-06, "loss": 1.1068960428237915, "step": 4530 }, { "epoch": 1.394921123509042, "grad_norm": 9.3125, "learning_rate": 1.8866329817345264e-06, "loss": 0.9407857656478882, "step": 4532 }, { "epoch": 1.3955367449018854, "grad_norm": 6.34375, "learning_rate": 1.8849773089513002e-06, "loss": 1.0099589824676514, "step": 4534 }, { "epoch": 1.3961523662947286, "grad_norm": 8.75, "learning_rate": 1.8833227441911173e-06, "loss": 1.339590072631836, "step": 4536 }, { "epoch": 1.3967679876875723, "grad_norm": 6.84375, "learning_rate": 1.8816692890981535e-06, "loss": 1.2085269689559937, "step": 4538 }, { "epoch": 1.3973836090804155, "grad_norm": 9.1875, "learning_rate": 1.8800169453154873e-06, "loss": 1.6690895557403564, "step": 4540 }, { "epoch": 1.397999230473259, "grad_norm": 7.875, "learning_rate": 1.8783657144850873e-06, "loss": 1.625824213027954, "step": 4542 }, { "epoch": 1.3986148518661023, "grad_norm": 7.625, "learning_rate": 1.876715598247818e-06, "loss": 1.6058636903762817, "step": 4544 }, { "epoch": 1.3992304732589458, "grad_norm": 7.125, "learning_rate": 1.875066598243439e-06, "loss": 1.2708394527435303, "step": 4546 }, { "epoch": 1.3998460946517892, "grad_norm": 5.09375, "learning_rate": 1.8734187161105971e-06, "loss": 1.3050729036331177, "step": 4548 }, { "epoch": 1.4004617160446324, "grad_norm": 6.46875, "learning_rate": 1.8717719534868305e-06, "loss": 1.233088493347168, "step": 4550 }, { "epoch": 1.401077337437476, "grad_norm": 3.859375, "learning_rate": 1.8701263120085644e-06, "loss": 1.26491379737854, "step": 4552 }, { "epoch": 1.4016929588303193, "grad_norm": 7.625, "learning_rate": 1.8684817933111092e-06, "loss": 1.4451459646224976, "step": 4554 }, { "epoch": 1.4023085802231627, "grad_norm": 6.78125, "learning_rate": 1.8668383990286595e-06, "loss": 1.622502326965332, "step": 4556 }, { "epoch": 1.4029242016160062, "grad_norm": 11.25, "learning_rate": 1.8651961307942927e-06, "loss": 1.3570857048034668, "step": 4558 }, { "epoch": 1.4035398230088496, "grad_norm": 9.8125, "learning_rate": 1.8635549902399693e-06, "loss": 1.2812453508377075, "step": 4560 }, { "epoch": 1.404155444401693, "grad_norm": 8.75, "learning_rate": 1.8619149789965262e-06, "loss": 1.1815876960754395, "step": 4562 }, { "epoch": 1.4047710657945363, "grad_norm": 188.0, "learning_rate": 1.860276098693679e-06, "loss": 1.2427574396133423, "step": 4564 }, { "epoch": 1.4053866871873797, "grad_norm": 6.84375, "learning_rate": 1.858638350960022e-06, "loss": 1.0153923034667969, "step": 4566 }, { "epoch": 1.4060023085802231, "grad_norm": 13.875, "learning_rate": 1.8570017374230186e-06, "loss": 1.691391110420227, "step": 4568 }, { "epoch": 1.4066179299730666, "grad_norm": 9.4375, "learning_rate": 1.8553662597090108e-06, "loss": 1.2465100288391113, "step": 4570 }, { "epoch": 1.40723355136591, "grad_norm": 11.0625, "learning_rate": 1.8537319194432079e-06, "loss": 1.3901227712631226, "step": 4572 }, { "epoch": 1.4078491727587534, "grad_norm": 6.53125, "learning_rate": 1.852098718249692e-06, "loss": 1.0444080829620361, "step": 4574 }, { "epoch": 1.4084647941515969, "grad_norm": 6.8125, "learning_rate": 1.8504666577514107e-06, "loss": 1.3018534183502197, "step": 4576 }, { "epoch": 1.40908041554444, "grad_norm": 4.4375, "learning_rate": 1.8488357395701795e-06, "loss": 1.1676541566848755, "step": 4578 }, { "epoch": 1.4096960369372835, "grad_norm": 3.9375, "learning_rate": 1.847205965326678e-06, "loss": 1.2985882759094238, "step": 4580 }, { "epoch": 1.410311658330127, "grad_norm": 18.0, "learning_rate": 1.845577336640449e-06, "loss": 1.411516785621643, "step": 4582 }, { "epoch": 1.4109272797229704, "grad_norm": 7.21875, "learning_rate": 1.8439498551298984e-06, "loss": 1.4975998401641846, "step": 4584 }, { "epoch": 1.4115429011158138, "grad_norm": 6.34375, "learning_rate": 1.8423235224122909e-06, "loss": 1.2311623096466064, "step": 4586 }, { "epoch": 1.4121585225086573, "grad_norm": 3.28125, "learning_rate": 1.8406983401037487e-06, "loss": 1.2641805410385132, "step": 4588 }, { "epoch": 1.4127741439015007, "grad_norm": 3.609375, "learning_rate": 1.8390743098192543e-06, "loss": 1.3353376388549805, "step": 4590 }, { "epoch": 1.413389765294344, "grad_norm": 8.875, "learning_rate": 1.8374514331726396e-06, "loss": 1.0704381465911865, "step": 4592 }, { "epoch": 1.4140053866871873, "grad_norm": 7.0, "learning_rate": 1.8358297117765958e-06, "loss": 1.4866266250610352, "step": 4594 }, { "epoch": 1.4146210080800308, "grad_norm": 6.0625, "learning_rate": 1.8342091472426637e-06, "loss": 1.5926576852798462, "step": 4596 }, { "epoch": 1.4152366294728742, "grad_norm": 9.625, "learning_rate": 1.8325897411812333e-06, "loss": 1.5305733680725098, "step": 4598 }, { "epoch": 1.4158522508657176, "grad_norm": 7.0625, "learning_rate": 1.830971495201546e-06, "loss": 1.4690890312194824, "step": 4600 }, { "epoch": 1.4164678722585609, "grad_norm": 12.875, "learning_rate": 1.829354410911688e-06, "loss": 0.9163476228713989, "step": 4602 }, { "epoch": 1.4170834936514045, "grad_norm": 5.125, "learning_rate": 1.8277384899185946e-06, "loss": 1.257507085800171, "step": 4604 }, { "epoch": 1.4176991150442477, "grad_norm": 10.5625, "learning_rate": 1.8261237338280393e-06, "loss": 1.5716406106948853, "step": 4606 }, { "epoch": 1.4183147364370912, "grad_norm": 6.71875, "learning_rate": 1.824510144244644e-06, "loss": 1.0980907678604126, "step": 4608 }, { "epoch": 1.4189303578299346, "grad_norm": 7.78125, "learning_rate": 1.822897722771868e-06, "loss": 1.5042476654052734, "step": 4610 }, { "epoch": 1.419545979222778, "grad_norm": 5.46875, "learning_rate": 1.8212864710120096e-06, "loss": 1.3456066846847534, "step": 4612 }, { "epoch": 1.4201616006156215, "grad_norm": 13.125, "learning_rate": 1.8196763905662077e-06, "loss": 0.7732136845588684, "step": 4614 }, { "epoch": 1.4207772220084647, "grad_norm": 9.5625, "learning_rate": 1.8180674830344343e-06, "loss": 1.8712453842163086, "step": 4616 }, { "epoch": 1.4213928434013081, "grad_norm": 8.4375, "learning_rate": 1.816459750015497e-06, "loss": 1.4007591009140015, "step": 4618 }, { "epoch": 1.4220084647941515, "grad_norm": 10.625, "learning_rate": 1.8148531931070365e-06, "loss": 1.0826690196990967, "step": 4620 }, { "epoch": 1.422624086186995, "grad_norm": 11.375, "learning_rate": 1.813247813905523e-06, "loss": 1.226710557937622, "step": 4622 }, { "epoch": 1.4232397075798384, "grad_norm": 6.5625, "learning_rate": 1.81164361400626e-06, "loss": 1.5129951238632202, "step": 4624 }, { "epoch": 1.4238553289726819, "grad_norm": 7.25, "learning_rate": 1.8100405950033744e-06, "loss": 1.3884327411651611, "step": 4626 }, { "epoch": 1.4244709503655253, "grad_norm": 5.0, "learning_rate": 1.8084387584898244e-06, "loss": 1.3586499691009521, "step": 4628 }, { "epoch": 1.4250865717583685, "grad_norm": 7.15625, "learning_rate": 1.8068381060573903e-06, "loss": 1.124431848526001, "step": 4630 }, { "epoch": 1.425702193151212, "grad_norm": 16.25, "learning_rate": 1.8052386392966756e-06, "loss": 1.155695915222168, "step": 4632 }, { "epoch": 1.4263178145440554, "grad_norm": 5.59375, "learning_rate": 1.8036403597971064e-06, "loss": 1.4743564128875732, "step": 4634 }, { "epoch": 1.4269334359368988, "grad_norm": 3.984375, "learning_rate": 1.8020432691469289e-06, "loss": 1.2493964433670044, "step": 4636 }, { "epoch": 1.4275490573297422, "grad_norm": 4.6875, "learning_rate": 1.8004473689332082e-06, "loss": 1.23715078830719, "step": 4638 }, { "epoch": 1.4281646787225857, "grad_norm": 8.875, "learning_rate": 1.7988526607418264e-06, "loss": 1.0521396398544312, "step": 4640 }, { "epoch": 1.4287803001154291, "grad_norm": 7.90625, "learning_rate": 1.7972591461574801e-06, "loss": 1.257058024406433, "step": 4642 }, { "epoch": 1.4293959215082723, "grad_norm": 4.21875, "learning_rate": 1.7956668267636806e-06, "loss": 0.9944857358932495, "step": 4644 }, { "epoch": 1.4300115429011158, "grad_norm": 10.6875, "learning_rate": 1.7940757041427512e-06, "loss": 1.5173877477645874, "step": 4646 }, { "epoch": 1.4306271642939592, "grad_norm": 6.0, "learning_rate": 1.7924857798758265e-06, "loss": 1.4341071844100952, "step": 4648 }, { "epoch": 1.4312427856868026, "grad_norm": 8.375, "learning_rate": 1.7908970555428504e-06, "loss": 1.187394142150879, "step": 4650 }, { "epoch": 1.431858407079646, "grad_norm": 4.46875, "learning_rate": 1.789309532722572e-06, "loss": 1.5108386278152466, "step": 4652 }, { "epoch": 1.4324740284724893, "grad_norm": 8.875, "learning_rate": 1.7877232129925506e-06, "loss": 1.2611860036849976, "step": 4654 }, { "epoch": 1.433089649865333, "grad_norm": 11.5, "learning_rate": 1.7861380979291464e-06, "loss": 1.0800515413284302, "step": 4656 }, { "epoch": 1.4337052712581762, "grad_norm": 5.8125, "learning_rate": 1.7845541891075245e-06, "loss": 1.4038598537445068, "step": 4658 }, { "epoch": 1.4343208926510196, "grad_norm": 7.0, "learning_rate": 1.7829714881016489e-06, "loss": 1.3155494928359985, "step": 4660 }, { "epoch": 1.434936514043863, "grad_norm": 6.75, "learning_rate": 1.781389996484287e-06, "loss": 1.3487207889556885, "step": 4662 }, { "epoch": 1.4355521354367065, "grad_norm": 8.3125, "learning_rate": 1.779809715827002e-06, "loss": 1.2048066854476929, "step": 4664 }, { "epoch": 1.43616775682955, "grad_norm": 7.53125, "learning_rate": 1.7782306477001533e-06, "loss": 1.5318856239318848, "step": 4666 }, { "epoch": 1.436783378222393, "grad_norm": 8.1875, "learning_rate": 1.776652793672898e-06, "loss": 1.6808677911758423, "step": 4668 }, { "epoch": 1.4373989996152368, "grad_norm": 14.9375, "learning_rate": 1.775076155313183e-06, "loss": 1.43800950050354, "step": 4670 }, { "epoch": 1.43801462100808, "grad_norm": 4.28125, "learning_rate": 1.7735007341877505e-06, "loss": 1.326431393623352, "step": 4672 }, { "epoch": 1.4386302424009234, "grad_norm": 26.25, "learning_rate": 1.7719265318621314e-06, "loss": 0.7720835208892822, "step": 4674 }, { "epoch": 1.4392458637937668, "grad_norm": 13.125, "learning_rate": 1.7703535499006455e-06, "loss": 1.4564543962478638, "step": 4676 }, { "epoch": 1.4398614851866103, "grad_norm": 3.984375, "learning_rate": 1.7687817898664012e-06, "loss": 0.8778886795043945, "step": 4678 }, { "epoch": 1.4404771065794537, "grad_norm": 12.0, "learning_rate": 1.7672112533212904e-06, "loss": 1.2717019319534302, "step": 4680 }, { "epoch": 1.441092727972297, "grad_norm": 13.25, "learning_rate": 1.7656419418259923e-06, "loss": 1.0154832601547241, "step": 4682 }, { "epoch": 1.4417083493651404, "grad_norm": 10.875, "learning_rate": 1.764073856939965e-06, "loss": 1.366729974746704, "step": 4684 }, { "epoch": 1.4423239707579838, "grad_norm": 8.5, "learning_rate": 1.7625070002214502e-06, "loss": 1.3453766107559204, "step": 4686 }, { "epoch": 1.4429395921508272, "grad_norm": 2.96875, "learning_rate": 1.7609413732274694e-06, "loss": 1.1809775829315186, "step": 4688 }, { "epoch": 1.4435552135436707, "grad_norm": 14.125, "learning_rate": 1.7593769775138196e-06, "loss": 1.6814355850219727, "step": 4690 }, { "epoch": 1.444170834936514, "grad_norm": 7.9375, "learning_rate": 1.7578138146350776e-06, "loss": 1.379990577697754, "step": 4692 }, { "epoch": 1.4447864563293575, "grad_norm": 5.0, "learning_rate": 1.7562518861445923e-06, "loss": 1.1842437982559204, "step": 4694 }, { "epoch": 1.4454020777222008, "grad_norm": 2.546875, "learning_rate": 1.7546911935944878e-06, "loss": 1.024581789970398, "step": 4696 }, { "epoch": 1.4460176991150442, "grad_norm": 6.21875, "learning_rate": 1.7531317385356587e-06, "loss": 1.291495680809021, "step": 4698 }, { "epoch": 1.4466333205078876, "grad_norm": 3.78125, "learning_rate": 1.7515735225177698e-06, "loss": 1.3858778476715088, "step": 4700 }, { "epoch": 1.447248941900731, "grad_norm": 2.46875, "learning_rate": 1.7500165470892571e-06, "loss": 1.2054401636123657, "step": 4702 }, { "epoch": 1.4478645632935745, "grad_norm": 6.84375, "learning_rate": 1.7484608137973207e-06, "loss": 1.1113160848617554, "step": 4704 }, { "epoch": 1.448480184686418, "grad_norm": 4.84375, "learning_rate": 1.7469063241879272e-06, "loss": 1.0340756177902222, "step": 4706 }, { "epoch": 1.4490958060792614, "grad_norm": 5.90625, "learning_rate": 1.74535307980581e-06, "loss": 1.1889235973358154, "step": 4708 }, { "epoch": 1.4497114274721046, "grad_norm": 7.65625, "learning_rate": 1.7438010821944602e-06, "loss": 1.5387898683547974, "step": 4710 }, { "epoch": 1.450327048864948, "grad_norm": 1.78125, "learning_rate": 1.742250332896134e-06, "loss": 1.1450103521347046, "step": 4712 }, { "epoch": 1.4509426702577914, "grad_norm": 5.9375, "learning_rate": 1.7407008334518451e-06, "loss": 1.2448246479034424, "step": 4714 }, { "epoch": 1.4515582916506349, "grad_norm": 4.0, "learning_rate": 1.7391525854013668e-06, "loss": 1.2321335077285767, "step": 4716 }, { "epoch": 1.4521739130434783, "grad_norm": 9.9375, "learning_rate": 1.7376055902832273e-06, "loss": 1.2992148399353027, "step": 4718 }, { "epoch": 1.4527895344363215, "grad_norm": 9.25, "learning_rate": 1.7360598496347105e-06, "loss": 1.4072332382202148, "step": 4720 }, { "epoch": 1.4534051558291652, "grad_norm": 16.5, "learning_rate": 1.7345153649918533e-06, "loss": 1.4186739921569824, "step": 4722 }, { "epoch": 1.4540207772220084, "grad_norm": 7.9375, "learning_rate": 1.7329721378894443e-06, "loss": 1.5055826902389526, "step": 4724 }, { "epoch": 1.4546363986148518, "grad_norm": 9.1875, "learning_rate": 1.731430169861024e-06, "loss": 1.5261921882629395, "step": 4726 }, { "epoch": 1.4552520200076953, "grad_norm": 6.78125, "learning_rate": 1.7298894624388796e-06, "loss": 1.4818764925003052, "step": 4728 }, { "epoch": 1.4558676414005387, "grad_norm": 16.375, "learning_rate": 1.7283500171540468e-06, "loss": 1.5456936359405518, "step": 4730 }, { "epoch": 1.4564832627933821, "grad_norm": 7.15625, "learning_rate": 1.7268118355363074e-06, "loss": 1.3593312501907349, "step": 4732 }, { "epoch": 1.4570988841862254, "grad_norm": 10.625, "learning_rate": 1.7252749191141866e-06, "loss": 1.1384668350219727, "step": 4734 }, { "epoch": 1.457714505579069, "grad_norm": 5.1875, "learning_rate": 1.7237392694149527e-06, "loss": 1.149019479751587, "step": 4736 }, { "epoch": 1.4583301269719122, "grad_norm": 7.46875, "learning_rate": 1.7222048879646147e-06, "loss": 0.5427249670028687, "step": 4738 }, { "epoch": 1.4589457483647557, "grad_norm": 12.1875, "learning_rate": 1.7206717762879228e-06, "loss": 0.9779499173164368, "step": 4740 }, { "epoch": 1.459561369757599, "grad_norm": 6.8125, "learning_rate": 1.7191399359083642e-06, "loss": 1.6622265577316284, "step": 4742 }, { "epoch": 1.4601769911504425, "grad_norm": 9.5625, "learning_rate": 1.717609368348162e-06, "loss": 1.4094470739364624, "step": 4744 }, { "epoch": 1.460792612543286, "grad_norm": 7.8125, "learning_rate": 1.716080075128278e-06, "loss": 1.5001131296157837, "step": 4746 }, { "epoch": 1.4614082339361292, "grad_norm": 12.1875, "learning_rate": 1.7145520577684015e-06, "loss": 0.9509897232055664, "step": 4748 }, { "epoch": 1.4620238553289726, "grad_norm": 11.6875, "learning_rate": 1.7130253177869607e-06, "loss": 1.2298247814178467, "step": 4750 }, { "epoch": 1.462639476721816, "grad_norm": 9.0, "learning_rate": 1.7114998567011105e-06, "loss": 1.5896788835525513, "step": 4752 }, { "epoch": 1.4632550981146595, "grad_norm": 12.5625, "learning_rate": 1.7099756760267345e-06, "loss": 1.5959491729736328, "step": 4754 }, { "epoch": 1.463870719507503, "grad_norm": 8.75, "learning_rate": 1.7084527772784466e-06, "loss": 1.5655696392059326, "step": 4756 }, { "epoch": 1.4644863409003464, "grad_norm": 5.40625, "learning_rate": 1.7069311619695852e-06, "loss": 1.3257312774658203, "step": 4758 }, { "epoch": 1.4651019622931898, "grad_norm": 9.0, "learning_rate": 1.7054108316122136e-06, "loss": 1.3359758853912354, "step": 4760 }, { "epoch": 1.465717583686033, "grad_norm": 4.90625, "learning_rate": 1.7038917877171179e-06, "loss": 1.2051633596420288, "step": 4762 }, { "epoch": 1.4663332050788764, "grad_norm": 2.984375, "learning_rate": 1.7023740317938053e-06, "loss": 0.8368760347366333, "step": 4764 }, { "epoch": 1.4669488264717199, "grad_norm": 6.0625, "learning_rate": 1.700857565350505e-06, "loss": 0.9664328694343567, "step": 4766 }, { "epoch": 1.4675644478645633, "grad_norm": 5.53125, "learning_rate": 1.6993423898941632e-06, "loss": 1.2796266078948975, "step": 4768 }, { "epoch": 1.4681800692574067, "grad_norm": 11.4375, "learning_rate": 1.6978285069304444e-06, "loss": 1.601299524307251, "step": 4770 }, { "epoch": 1.4687956906502502, "grad_norm": 15.875, "learning_rate": 1.6963159179637274e-06, "loss": 1.381443738937378, "step": 4772 }, { "epoch": 1.4694113120430936, "grad_norm": 7.75, "learning_rate": 1.6948046244971062e-06, "loss": 1.4764504432678223, "step": 4774 }, { "epoch": 1.4700269334359368, "grad_norm": 6.1875, "learning_rate": 1.6932946280323865e-06, "loss": 1.3972898721694946, "step": 4776 }, { "epoch": 1.4706425548287803, "grad_norm": 2.390625, "learning_rate": 1.6917859300700848e-06, "loss": 1.0846296548843384, "step": 4778 }, { "epoch": 1.4712581762216237, "grad_norm": 6.59375, "learning_rate": 1.69027853210943e-06, "loss": 1.0266474485397339, "step": 4780 }, { "epoch": 1.4718737976144671, "grad_norm": 6.09375, "learning_rate": 1.6887724356483564e-06, "loss": 1.5290355682373047, "step": 4782 }, { "epoch": 1.4724894190073106, "grad_norm": 5.6875, "learning_rate": 1.6872676421835055e-06, "loss": 1.6265225410461426, "step": 4784 }, { "epoch": 1.4731050404001538, "grad_norm": 8.8125, "learning_rate": 1.6857641532102254e-06, "loss": 1.7117468118667603, "step": 4786 }, { "epoch": 1.4737206617929974, "grad_norm": 8.125, "learning_rate": 1.6842619702225643e-06, "loss": 1.8459373712539673, "step": 4788 }, { "epoch": 1.4743362831858406, "grad_norm": 3.90625, "learning_rate": 1.682761094713278e-06, "loss": 1.2410180568695068, "step": 4790 }, { "epoch": 1.474951904578684, "grad_norm": 5.625, "learning_rate": 1.6812615281738178e-06, "loss": 1.2673044204711914, "step": 4792 }, { "epoch": 1.4755675259715275, "grad_norm": 8.5625, "learning_rate": 1.6797632720943385e-06, "loss": 1.2248817682266235, "step": 4794 }, { "epoch": 1.476183147364371, "grad_norm": 6.6875, "learning_rate": 1.6782663279636902e-06, "loss": 1.089085340499878, "step": 4796 }, { "epoch": 1.4767987687572144, "grad_norm": 4.4375, "learning_rate": 1.6767706972694192e-06, "loss": 1.323860764503479, "step": 4798 }, { "epoch": 1.4774143901500576, "grad_norm": 13.0, "learning_rate": 1.6752763814977679e-06, "loss": 1.0628877878189087, "step": 4800 }, { "epoch": 1.478030011542901, "grad_norm": 6.4375, "learning_rate": 1.67378338213367e-06, "loss": 1.3006656169891357, "step": 4802 }, { "epoch": 1.4786456329357445, "grad_norm": 6.5, "learning_rate": 1.6722917006607548e-06, "loss": 1.4505863189697266, "step": 4804 }, { "epoch": 1.479261254328588, "grad_norm": 5.0, "learning_rate": 1.6708013385613378e-06, "loss": 1.3453786373138428, "step": 4806 }, { "epoch": 1.4798768757214313, "grad_norm": 8.0625, "learning_rate": 1.6693122973164255e-06, "loss": 0.9350771903991699, "step": 4808 }, { "epoch": 1.4804924971142748, "grad_norm": 12.375, "learning_rate": 1.6678245784057124e-06, "loss": 1.6242949962615967, "step": 4810 }, { "epoch": 1.4811081185071182, "grad_norm": 4.21875, "learning_rate": 1.666338183307577e-06, "loss": 1.2193092107772827, "step": 4812 }, { "epoch": 1.4817237398999614, "grad_norm": 5.96875, "learning_rate": 1.6648531134990845e-06, "loss": 1.388013482093811, "step": 4814 }, { "epoch": 1.4823393612928049, "grad_norm": 6.15625, "learning_rate": 1.6633693704559816e-06, "loss": 1.156988263130188, "step": 4816 }, { "epoch": 1.4829549826856483, "grad_norm": 6.28125, "learning_rate": 1.6618869556526962e-06, "loss": 1.3346011638641357, "step": 4818 }, { "epoch": 1.4835706040784917, "grad_norm": 5.96875, "learning_rate": 1.6604058705623383e-06, "loss": 1.282492756843567, "step": 4820 }, { "epoch": 1.4841862254713352, "grad_norm": 3.125, "learning_rate": 1.6589261166566945e-06, "loss": 1.2600009441375732, "step": 4822 }, { "epoch": 1.4848018468641786, "grad_norm": 6.46875, "learning_rate": 1.6574476954062312e-06, "loss": 1.3078112602233887, "step": 4824 }, { "epoch": 1.485417468257022, "grad_norm": 6.78125, "learning_rate": 1.6559706082800859e-06, "loss": 1.066993236541748, "step": 4826 }, { "epoch": 1.4860330896498652, "grad_norm": 4.375, "learning_rate": 1.6544948567460755e-06, "loss": 1.383507490158081, "step": 4828 }, { "epoch": 1.4866487110427087, "grad_norm": 2.328125, "learning_rate": 1.6530204422706867e-06, "loss": 1.0866873264312744, "step": 4830 }, { "epoch": 1.4872643324355521, "grad_norm": 4.0625, "learning_rate": 1.6515473663190774e-06, "loss": 1.2688915729522705, "step": 4832 }, { "epoch": 1.4878799538283956, "grad_norm": 8.625, "learning_rate": 1.6500756303550775e-06, "loss": 1.510236144065857, "step": 4834 }, { "epoch": 1.488495575221239, "grad_norm": 2.28125, "learning_rate": 1.6486052358411831e-06, "loss": 1.077452301979065, "step": 4836 }, { "epoch": 1.4891111966140824, "grad_norm": 7.5625, "learning_rate": 1.6471361842385586e-06, "loss": 1.2230280637741089, "step": 4838 }, { "epoch": 1.4897268180069259, "grad_norm": 9.5625, "learning_rate": 1.6456684770070336e-06, "loss": 1.319644570350647, "step": 4840 }, { "epoch": 1.490342439399769, "grad_norm": 15.0625, "learning_rate": 1.6442021156051009e-06, "loss": 1.1388092041015625, "step": 4842 }, { "epoch": 1.4909580607926125, "grad_norm": 4.28125, "learning_rate": 1.6427371014899175e-06, "loss": 0.6261699795722961, "step": 4844 }, { "epoch": 1.491573682185456, "grad_norm": 31.5, "learning_rate": 1.6412734361173e-06, "loss": 0.9949028491973877, "step": 4846 }, { "epoch": 1.4921893035782994, "grad_norm": 8.875, "learning_rate": 1.6398111209417266e-06, "loss": 1.4906691312789917, "step": 4848 }, { "epoch": 1.4928049249711428, "grad_norm": 8.0, "learning_rate": 1.638350157416333e-06, "loss": 1.877355694770813, "step": 4850 }, { "epoch": 1.493420546363986, "grad_norm": 14.125, "learning_rate": 1.6368905469929091e-06, "loss": 1.703723669052124, "step": 4852 }, { "epoch": 1.4940361677568297, "grad_norm": 7.03125, "learning_rate": 1.6354322911219045e-06, "loss": 1.234142780303955, "step": 4854 }, { "epoch": 1.494651789149673, "grad_norm": 11.0625, "learning_rate": 1.6339753912524196e-06, "loss": 1.5459946393966675, "step": 4856 }, { "epoch": 1.4952674105425163, "grad_norm": 12.0, "learning_rate": 1.6325198488322095e-06, "loss": 1.1419613361358643, "step": 4858 }, { "epoch": 1.4958830319353598, "grad_norm": 5.25, "learning_rate": 1.631065665307679e-06, "loss": 1.4642760753631592, "step": 4860 }, { "epoch": 1.4964986533282032, "grad_norm": 2.78125, "learning_rate": 1.6296128421238822e-06, "loss": 1.274398684501648, "step": 4862 }, { "epoch": 1.4971142747210466, "grad_norm": 4.21875, "learning_rate": 1.6281613807245228e-06, "loss": 1.0630048513412476, "step": 4864 }, { "epoch": 1.4977298961138898, "grad_norm": 27.875, "learning_rate": 1.6267112825519498e-06, "loss": 1.2358163595199585, "step": 4866 }, { "epoch": 1.4983455175067333, "grad_norm": 5.4375, "learning_rate": 1.6252625490471591e-06, "loss": 1.0745879411697388, "step": 4868 }, { "epoch": 1.4989611388995767, "grad_norm": 6.84375, "learning_rate": 1.6238151816497896e-06, "loss": 1.559770107269287, "step": 4870 }, { "epoch": 1.4995767602924202, "grad_norm": 2.15625, "learning_rate": 1.622369181798122e-06, "loss": 0.9814578890800476, "step": 4872 }, { "epoch": 1.5001923816852636, "grad_norm": 4.96875, "learning_rate": 1.6209245509290794e-06, "loss": 1.20391845703125, "step": 4874 }, { "epoch": 1.5008080030781068, "grad_norm": 4.96875, "learning_rate": 1.6194812904782236e-06, "loss": 1.2740033864974976, "step": 4876 }, { "epoch": 1.5014236244709505, "grad_norm": 9.125, "learning_rate": 1.6180394018797552e-06, "loss": 1.7918224334716797, "step": 4878 }, { "epoch": 1.5020392458637937, "grad_norm": 3.046875, "learning_rate": 1.61659888656651e-06, "loss": 1.2199079990386963, "step": 4880 }, { "epoch": 1.5026548672566373, "grad_norm": 20.75, "learning_rate": 1.6151597459699622e-06, "loss": 0.8411628007888794, "step": 4882 }, { "epoch": 1.5032704886494805, "grad_norm": 6.6875, "learning_rate": 1.613721981520217e-06, "loss": 1.7471990585327148, "step": 4884 }, { "epoch": 1.503886110042324, "grad_norm": 7.09375, "learning_rate": 1.6122855946460128e-06, "loss": 1.0529389381408691, "step": 4886 }, { "epoch": 1.5045017314351674, "grad_norm": 5.4375, "learning_rate": 1.6108505867747215e-06, "loss": 1.3521735668182373, "step": 4888 }, { "epoch": 1.5051173528280106, "grad_norm": 8.8125, "learning_rate": 1.6094169593323395e-06, "loss": 1.0746639966964722, "step": 4890 }, { "epoch": 1.5057329742208543, "grad_norm": 16.125, "learning_rate": 1.6079847137434967e-06, "loss": 1.5944452285766602, "step": 4892 }, { "epoch": 1.5063485956136975, "grad_norm": 5.78125, "learning_rate": 1.6065538514314472e-06, "loss": 1.073503017425537, "step": 4894 }, { "epoch": 1.506964217006541, "grad_norm": 81.0, "learning_rate": 1.60512437381807e-06, "loss": 0.8253582119941711, "step": 4896 }, { "epoch": 1.5075798383993844, "grad_norm": 3.984375, "learning_rate": 1.6036962823238703e-06, "loss": 0.8447735905647278, "step": 4898 }, { "epoch": 1.5081954597922278, "grad_norm": 4.78125, "learning_rate": 1.6022695783679736e-06, "loss": 1.2586779594421387, "step": 4900 }, { "epoch": 1.5088110811850712, "grad_norm": 10.375, "learning_rate": 1.6008442633681298e-06, "loss": 1.3773995637893677, "step": 4902 }, { "epoch": 1.5094267025779144, "grad_norm": 3.046875, "learning_rate": 1.5994203387407036e-06, "loss": 1.2577307224273682, "step": 4904 }, { "epoch": 1.510042323970758, "grad_norm": 8.1875, "learning_rate": 1.5979978059006819e-06, "loss": 1.2247735261917114, "step": 4906 }, { "epoch": 1.5106579453636013, "grad_norm": 4.90625, "learning_rate": 1.5965766662616677e-06, "loss": 1.536833643913269, "step": 4908 }, { "epoch": 1.5112735667564448, "grad_norm": 10.8125, "learning_rate": 1.5951569212358787e-06, "loss": 0.6794060468673706, "step": 4910 }, { "epoch": 1.5118891881492882, "grad_norm": 17.625, "learning_rate": 1.5937385722341481e-06, "loss": 0.9972232580184937, "step": 4912 }, { "epoch": 1.5125048095421316, "grad_norm": 2.828125, "learning_rate": 1.5923216206659213e-06, "loss": 1.1143596172332764, "step": 4914 }, { "epoch": 1.513120430934975, "grad_norm": 3.421875, "learning_rate": 1.590906067939254e-06, "loss": 1.128780722618103, "step": 4916 }, { "epoch": 1.5137360523278183, "grad_norm": 2.5625, "learning_rate": 1.589491915460813e-06, "loss": 1.1181021928787231, "step": 4918 }, { "epoch": 1.514351673720662, "grad_norm": 4.875, "learning_rate": 1.5880791646358728e-06, "loss": 1.110456943511963, "step": 4920 }, { "epoch": 1.5149672951135051, "grad_norm": 5.90625, "learning_rate": 1.5866678168683167e-06, "loss": 1.3714308738708496, "step": 4922 }, { "epoch": 1.5155829165063486, "grad_norm": 39.5, "learning_rate": 1.5852578735606317e-06, "loss": 1.3347901105880737, "step": 4924 }, { "epoch": 1.516198537899192, "grad_norm": 6.03125, "learning_rate": 1.58384933611391e-06, "loss": 1.6708590984344482, "step": 4926 }, { "epoch": 1.5168141592920354, "grad_norm": 7.5, "learning_rate": 1.5824422059278486e-06, "loss": 1.5403391122817993, "step": 4928 }, { "epoch": 1.5174297806848789, "grad_norm": 22.25, "learning_rate": 1.5810364844007414e-06, "loss": 1.3507730960845947, "step": 4930 }, { "epoch": 1.518045402077722, "grad_norm": 8.3125, "learning_rate": 1.5796321729294875e-06, "loss": 1.4804326295852661, "step": 4932 }, { "epoch": 1.5186610234705658, "grad_norm": 13.5625, "learning_rate": 1.5782292729095815e-06, "loss": 1.4317504167556763, "step": 4934 }, { "epoch": 1.519276644863409, "grad_norm": 7.65625, "learning_rate": 1.576827785735118e-06, "loss": 1.2551989555358887, "step": 4936 }, { "epoch": 1.5198922662562524, "grad_norm": 5.84375, "learning_rate": 1.5754277127987852e-06, "loss": 1.3449933528900146, "step": 4938 }, { "epoch": 1.5205078876490958, "grad_norm": 11.6875, "learning_rate": 1.5740290554918675e-06, "loss": 1.3214181661605835, "step": 4940 }, { "epoch": 1.521123509041939, "grad_norm": 7.34375, "learning_rate": 1.5726318152042413e-06, "loss": 1.3067471981048584, "step": 4942 }, { "epoch": 1.5217391304347827, "grad_norm": 5.53125, "learning_rate": 1.5712359933243754e-06, "loss": 1.39982008934021, "step": 4944 }, { "epoch": 1.522354751827626, "grad_norm": 8.6875, "learning_rate": 1.5698415912393306e-06, "loss": 1.399191975593567, "step": 4946 }, { "epoch": 1.5229703732204696, "grad_norm": 3.203125, "learning_rate": 1.5684486103347549e-06, "loss": 1.3007683753967285, "step": 4948 }, { "epoch": 1.5235859946133128, "grad_norm": 5.40625, "learning_rate": 1.5670570519948836e-06, "loss": 1.1364392042160034, "step": 4950 }, { "epoch": 1.5242016160061562, "grad_norm": 21.875, "learning_rate": 1.565666917602541e-06, "loss": 1.522515892982483, "step": 4952 }, { "epoch": 1.5248172373989997, "grad_norm": 5.1875, "learning_rate": 1.5642782085391345e-06, "loss": 1.0475881099700928, "step": 4954 }, { "epoch": 1.5254328587918429, "grad_norm": 6.3125, "learning_rate": 1.5628909261846547e-06, "loss": 1.2147290706634521, "step": 4956 }, { "epoch": 1.5260484801846865, "grad_norm": 7.71875, "learning_rate": 1.5615050719176758e-06, "loss": 1.2593631744384766, "step": 4958 }, { "epoch": 1.5266641015775297, "grad_norm": 6.21875, "learning_rate": 1.560120647115351e-06, "loss": 1.6543095111846924, "step": 4960 }, { "epoch": 1.5272797229703732, "grad_norm": 5.375, "learning_rate": 1.5587376531534162e-06, "loss": 1.0033334493637085, "step": 4962 }, { "epoch": 1.5278953443632166, "grad_norm": 11.3125, "learning_rate": 1.557356091406182e-06, "loss": 1.2959232330322266, "step": 4964 }, { "epoch": 1.52851096575606, "grad_norm": 5.90625, "learning_rate": 1.5559759632465388e-06, "loss": 1.2834677696228027, "step": 4966 }, { "epoch": 1.5291265871489035, "grad_norm": 5.875, "learning_rate": 1.554597270045949e-06, "loss": 1.1852271556854248, "step": 4968 }, { "epoch": 1.5297422085417467, "grad_norm": 11.5625, "learning_rate": 1.553220013174452e-06, "loss": 1.0119701623916626, "step": 4970 }, { "epoch": 1.5303578299345904, "grad_norm": 9.0625, "learning_rate": 1.551844194000659e-06, "loss": 1.247909665107727, "step": 4972 }, { "epoch": 1.5309734513274336, "grad_norm": 1.7890625, "learning_rate": 1.5504698138917515e-06, "loss": 1.0428611040115356, "step": 4974 }, { "epoch": 1.531589072720277, "grad_norm": 5.28125, "learning_rate": 1.5490968742134826e-06, "loss": 1.3514962196350098, "step": 4976 }, { "epoch": 1.5322046941131204, "grad_norm": 7.46875, "learning_rate": 1.5477253763301734e-06, "loss": 1.2463090419769287, "step": 4978 }, { "epoch": 1.5328203155059639, "grad_norm": 7.25, "learning_rate": 1.5463553216047114e-06, "loss": 1.3994954824447632, "step": 4980 }, { "epoch": 1.5334359368988073, "grad_norm": 10.0, "learning_rate": 1.5449867113985512e-06, "loss": 1.6348204612731934, "step": 4982 }, { "epoch": 1.5340515582916505, "grad_norm": 5.25, "learning_rate": 1.5436195470717104e-06, "loss": 1.3202322721481323, "step": 4984 }, { "epoch": 1.5346671796844942, "grad_norm": 8.5, "learning_rate": 1.5422538299827725e-06, "loss": 1.6668038368225098, "step": 4986 }, { "epoch": 1.5352828010773374, "grad_norm": 16.75, "learning_rate": 1.5408895614888798e-06, "loss": 1.4328434467315674, "step": 4988 }, { "epoch": 1.5358984224701808, "grad_norm": 8.625, "learning_rate": 1.5395267429457371e-06, "loss": 1.0568163394927979, "step": 4990 }, { "epoch": 1.5365140438630243, "grad_norm": 2.984375, "learning_rate": 1.5381653757076082e-06, "loss": 1.1498777866363525, "step": 4992 }, { "epoch": 1.5371296652558677, "grad_norm": 4.15625, "learning_rate": 1.5368054611273133e-06, "loss": 1.2708563804626465, "step": 4994 }, { "epoch": 1.5377452866487111, "grad_norm": 9.0, "learning_rate": 1.5354470005562306e-06, "loss": 1.5392816066741943, "step": 4996 }, { "epoch": 1.5383609080415543, "grad_norm": 6.15625, "learning_rate": 1.5340899953442923e-06, "loss": 1.0745151042938232, "step": 4998 }, { "epoch": 1.538976529434398, "grad_norm": 13.4375, "learning_rate": 1.5327344468399852e-06, "loss": 1.365796685218811, "step": 5000 }, { "epoch": 1.5395921508272412, "grad_norm": 8.25, "learning_rate": 1.5313803563903485e-06, "loss": 1.3695905208587646, "step": 5002 }, { "epoch": 1.5402077722200846, "grad_norm": 4.65625, "learning_rate": 1.5300277253409715e-06, "loss": 0.8892248272895813, "step": 5004 }, { "epoch": 1.540823393612928, "grad_norm": 10.3125, "learning_rate": 1.5286765550359958e-06, "loss": 1.1660937070846558, "step": 5006 }, { "epoch": 1.5414390150057713, "grad_norm": 2.1875, "learning_rate": 1.527326846818107e-06, "loss": 1.207476258277893, "step": 5008 }, { "epoch": 1.542054636398615, "grad_norm": 5.96875, "learning_rate": 1.525978602028542e-06, "loss": 1.2643166780471802, "step": 5010 }, { "epoch": 1.5426702577914582, "grad_norm": 5.9375, "learning_rate": 1.5246318220070818e-06, "loss": 1.335673451423645, "step": 5012 }, { "epoch": 1.5432858791843016, "grad_norm": 8.875, "learning_rate": 1.5232865080920512e-06, "loss": 1.5163029432296753, "step": 5014 }, { "epoch": 1.543901500577145, "grad_norm": 5.15625, "learning_rate": 1.5219426616203198e-06, "loss": 1.4841883182525635, "step": 5016 }, { "epoch": 1.5445171219699885, "grad_norm": 8.5625, "learning_rate": 1.5206002839272973e-06, "loss": 1.6200695037841797, "step": 5018 }, { "epoch": 1.545132743362832, "grad_norm": 5.21875, "learning_rate": 1.5192593763469346e-06, "loss": 1.3461923599243164, "step": 5020 }, { "epoch": 1.5457483647556751, "grad_norm": 5.78125, "learning_rate": 1.5179199402117214e-06, "loss": 1.234081745147705, "step": 5022 }, { "epoch": 1.5463639861485188, "grad_norm": 8.9375, "learning_rate": 1.516581976852686e-06, "loss": 1.3456507921218872, "step": 5024 }, { "epoch": 1.546979607541362, "grad_norm": 7.53125, "learning_rate": 1.5152454875993921e-06, "loss": 1.0912538766860962, "step": 5026 }, { "epoch": 1.5475952289342054, "grad_norm": 10.1875, "learning_rate": 1.513910473779939e-06, "loss": 1.7042229175567627, "step": 5028 }, { "epoch": 1.5482108503270489, "grad_norm": 7.625, "learning_rate": 1.5125769367209603e-06, "loss": 1.3107324838638306, "step": 5030 }, { "epoch": 1.5488264717198923, "grad_norm": 5.28125, "learning_rate": 1.5112448777476216e-06, "loss": 1.7281067371368408, "step": 5032 }, { "epoch": 1.5494420931127357, "grad_norm": 6.75, "learning_rate": 1.5099142981836193e-06, "loss": 1.3627159595489502, "step": 5034 }, { "epoch": 1.550057714505579, "grad_norm": 9.75, "learning_rate": 1.5085851993511807e-06, "loss": 1.335439682006836, "step": 5036 }, { "epoch": 1.5506733358984226, "grad_norm": 6.9375, "learning_rate": 1.5072575825710601e-06, "loss": 1.2400782108306885, "step": 5038 }, { "epoch": 1.5512889572912658, "grad_norm": 6.46875, "learning_rate": 1.5059314491625413e-06, "loss": 1.5848368406295776, "step": 5040 }, { "epoch": 1.5519045786841092, "grad_norm": 8.0, "learning_rate": 1.5046068004434318e-06, "loss": 1.104585886001587, "step": 5042 }, { "epoch": 1.5525202000769527, "grad_norm": 8.5, "learning_rate": 1.5032836377300663e-06, "loss": 1.2151025533676147, "step": 5044 }, { "epoch": 1.5531358214697961, "grad_norm": 18.25, "learning_rate": 1.501961962337299e-06, "loss": 1.3426923751831055, "step": 5046 }, { "epoch": 1.5537514428626396, "grad_norm": 8.3125, "learning_rate": 1.5006417755785096e-06, "loss": 1.0586085319519043, "step": 5048 }, { "epoch": 1.5543670642554828, "grad_norm": 7.96875, "learning_rate": 1.4993230787655978e-06, "loss": 1.2460078001022339, "step": 5050 }, { "epoch": 1.5549826856483264, "grad_norm": 15.125, "learning_rate": 1.4980058732089807e-06, "loss": 1.0223263502120972, "step": 5052 }, { "epoch": 1.5555983070411696, "grad_norm": 16.75, "learning_rate": 1.4966901602175965e-06, "loss": 1.5279709100723267, "step": 5054 }, { "epoch": 1.556213928434013, "grad_norm": 10.5625, "learning_rate": 1.495375941098898e-06, "loss": 1.2279974222183228, "step": 5056 }, { "epoch": 1.5568295498268565, "grad_norm": 4.84375, "learning_rate": 1.4940632171588544e-06, "loss": 1.3564412593841553, "step": 5058 }, { "epoch": 1.5574451712196997, "grad_norm": 7.875, "learning_rate": 1.4927519897019482e-06, "loss": 1.4334050416946411, "step": 5060 }, { "epoch": 1.5580607926125434, "grad_norm": 39.75, "learning_rate": 1.491442260031176e-06, "loss": 1.7179346084594727, "step": 5062 }, { "epoch": 1.5586764140053866, "grad_norm": 15.3125, "learning_rate": 1.490134029448046e-06, "loss": 1.6199084520339966, "step": 5064 }, { "epoch": 1.5592920353982302, "grad_norm": 2.515625, "learning_rate": 1.4888272992525758e-06, "loss": 1.2121710777282715, "step": 5066 }, { "epoch": 1.5599076567910735, "grad_norm": 5.0, "learning_rate": 1.487522070743292e-06, "loss": 1.2717112302780151, "step": 5068 }, { "epoch": 1.560523278183917, "grad_norm": 3.296875, "learning_rate": 1.486218345217231e-06, "loss": 1.2327752113342285, "step": 5070 }, { "epoch": 1.5611388995767603, "grad_norm": 9.3125, "learning_rate": 1.484916123969932e-06, "loss": 1.564252257347107, "step": 5072 }, { "epoch": 1.5617545209696035, "grad_norm": 2.953125, "learning_rate": 1.4836154082954428e-06, "loss": 1.0845710039138794, "step": 5074 }, { "epoch": 1.5623701423624472, "grad_norm": 2.546875, "learning_rate": 1.4823161994863134e-06, "loss": 1.1228383779525757, "step": 5076 }, { "epoch": 1.5629857637552904, "grad_norm": 19.625, "learning_rate": 1.4810184988335965e-06, "loss": 1.4887943267822266, "step": 5078 }, { "epoch": 1.5636013851481338, "grad_norm": 9.625, "learning_rate": 1.479722307626847e-06, "loss": 1.802656888961792, "step": 5080 }, { "epoch": 1.5642170065409773, "grad_norm": 32.5, "learning_rate": 1.4784276271541188e-06, "loss": 1.276349663734436, "step": 5082 }, { "epoch": 1.5648326279338207, "grad_norm": 5.15625, "learning_rate": 1.4771344587019644e-06, "loss": 1.297951579093933, "step": 5084 }, { "epoch": 1.5654482493266642, "grad_norm": 6.1875, "learning_rate": 1.4758428035554345e-06, "loss": 1.5146312713623047, "step": 5086 }, { "epoch": 1.5660638707195074, "grad_norm": 3.59375, "learning_rate": 1.4745526629980766e-06, "loss": 1.4900462627410889, "step": 5088 }, { "epoch": 1.566679492112351, "grad_norm": 10.75, "learning_rate": 1.4732640383119312e-06, "loss": 1.4104520082473755, "step": 5090 }, { "epoch": 1.5672951135051942, "grad_norm": 8.5, "learning_rate": 1.4719769307775337e-06, "loss": 1.2928396463394165, "step": 5092 }, { "epoch": 1.5679107348980377, "grad_norm": 2.40625, "learning_rate": 1.4706913416739123e-06, "loss": 0.8257898092269897, "step": 5094 }, { "epoch": 1.568526356290881, "grad_norm": 9.875, "learning_rate": 1.4694072722785857e-06, "loss": 1.2909157276153564, "step": 5096 }, { "epoch": 1.5691419776837245, "grad_norm": 4.15625, "learning_rate": 1.4681247238675622e-06, "loss": 1.4911627769470215, "step": 5098 }, { "epoch": 1.569757599076568, "grad_norm": 5.9375, "learning_rate": 1.4668436977153383e-06, "loss": 1.3358662128448486, "step": 5100 }, { "epoch": 1.5703732204694112, "grad_norm": 1.921875, "learning_rate": 1.4655641950948993e-06, "loss": 0.9619826078414917, "step": 5102 }, { "epoch": 1.5709888418622548, "grad_norm": 9.25, "learning_rate": 1.4642862172777154e-06, "loss": 1.237134337425232, "step": 5104 }, { "epoch": 1.571604463255098, "grad_norm": 5.4375, "learning_rate": 1.463009765533741e-06, "loss": 1.2212834358215332, "step": 5106 }, { "epoch": 1.5722200846479415, "grad_norm": 6.90625, "learning_rate": 1.4617348411314167e-06, "loss": 1.2365351915359497, "step": 5108 }, { "epoch": 1.572835706040785, "grad_norm": 10.25, "learning_rate": 1.4604614453376613e-06, "loss": 1.1786137819290161, "step": 5110 }, { "epoch": 1.5734513274336284, "grad_norm": 15.375, "learning_rate": 1.459189579417878e-06, "loss": 1.056581735610962, "step": 5112 }, { "epoch": 1.5740669488264718, "grad_norm": 5.03125, "learning_rate": 1.4579192446359483e-06, "loss": 1.3243074417114258, "step": 5114 }, { "epoch": 1.574682570219315, "grad_norm": 10.375, "learning_rate": 1.4566504422542316e-06, "loss": 1.2739485502243042, "step": 5116 }, { "epoch": 1.5752981916121587, "grad_norm": 5.0, "learning_rate": 1.4553831735335667e-06, "loss": 1.2523506879806519, "step": 5118 }, { "epoch": 1.5759138130050019, "grad_norm": 5.5, "learning_rate": 1.4541174397332659e-06, "loss": 1.0985594987869263, "step": 5120 }, { "epoch": 1.5765294343978453, "grad_norm": 5.9375, "learning_rate": 1.4528532421111175e-06, "loss": 1.0287668704986572, "step": 5122 }, { "epoch": 1.5771450557906888, "grad_norm": 8.75, "learning_rate": 1.451590581923383e-06, "loss": 1.2971521615982056, "step": 5124 }, { "epoch": 1.577760677183532, "grad_norm": 4.65625, "learning_rate": 1.4503294604247953e-06, "loss": 0.7459732890129089, "step": 5126 }, { "epoch": 1.5783762985763756, "grad_norm": 139.0, "learning_rate": 1.449069878868561e-06, "loss": 1.7319278717041016, "step": 5128 }, { "epoch": 1.5789919199692188, "grad_norm": 7.125, "learning_rate": 1.4478118385063526e-06, "loss": 1.4584087133407593, "step": 5130 }, { "epoch": 1.5796075413620625, "grad_norm": 42.75, "learning_rate": 1.4465553405883146e-06, "loss": 1.3035287857055664, "step": 5132 }, { "epoch": 1.5802231627549057, "grad_norm": 17.625, "learning_rate": 1.4453003863630564e-06, "loss": 1.0673809051513672, "step": 5134 }, { "epoch": 1.5808387841477491, "grad_norm": 6.15625, "learning_rate": 1.4440469770776538e-06, "loss": 1.2335408926010132, "step": 5136 }, { "epoch": 1.5814544055405926, "grad_norm": 4.0625, "learning_rate": 1.4427951139776483e-06, "loss": 1.5894789695739746, "step": 5138 }, { "epoch": 1.5820700269334358, "grad_norm": 15.75, "learning_rate": 1.4415447983070435e-06, "loss": 1.413337230682373, "step": 5140 }, { "epoch": 1.5826856483262794, "grad_norm": 7.5, "learning_rate": 1.4402960313083072e-06, "loss": 1.2019569873809814, "step": 5142 }, { "epoch": 1.5833012697191227, "grad_norm": 10.25, "learning_rate": 1.4390488142223668e-06, "loss": 1.3251562118530273, "step": 5144 }, { "epoch": 1.583916891111966, "grad_norm": 9.25, "learning_rate": 1.437803148288609e-06, "loss": 1.3022871017456055, "step": 5146 }, { "epoch": 1.5845325125048095, "grad_norm": 6.0, "learning_rate": 1.436559034744882e-06, "loss": 1.4235812425613403, "step": 5148 }, { "epoch": 1.585148133897653, "grad_norm": 8.125, "learning_rate": 1.4353164748274867e-06, "loss": 1.2802785634994507, "step": 5150 }, { "epoch": 1.5857637552904964, "grad_norm": 10.9375, "learning_rate": 1.4340754697711848e-06, "loss": 1.7972906827926636, "step": 5152 }, { "epoch": 1.5863793766833396, "grad_norm": 8.6875, "learning_rate": 1.4328360208091893e-06, "loss": 1.5309021472930908, "step": 5154 }, { "epoch": 1.5869949980761833, "grad_norm": 9.25, "learning_rate": 1.4315981291731698e-06, "loss": 1.2634186744689941, "step": 5156 }, { "epoch": 1.5876106194690265, "grad_norm": 4.65625, "learning_rate": 1.4303617960932467e-06, "loss": 1.1108946800231934, "step": 5158 }, { "epoch": 1.58822624086187, "grad_norm": 18.375, "learning_rate": 1.4291270227979912e-06, "loss": 1.176788091659546, "step": 5160 }, { "epoch": 1.5888418622547134, "grad_norm": 7.6875, "learning_rate": 1.4278938105144257e-06, "loss": 1.330367922782898, "step": 5162 }, { "epoch": 1.5894574836475568, "grad_norm": 7.40625, "learning_rate": 1.42666216046802e-06, "loss": 1.5494861602783203, "step": 5164 }, { "epoch": 1.5900731050404002, "grad_norm": 4.5, "learning_rate": 1.425432073882694e-06, "loss": 1.1721563339233398, "step": 5166 }, { "epoch": 1.5906887264332434, "grad_norm": 14.1875, "learning_rate": 1.4242035519808113e-06, "loss": 1.1466803550720215, "step": 5168 }, { "epoch": 1.591304347826087, "grad_norm": 7.65625, "learning_rate": 1.4229765959831813e-06, "loss": 1.0815727710723877, "step": 5170 }, { "epoch": 1.5919199692189303, "grad_norm": 6.875, "learning_rate": 1.4217512071090587e-06, "loss": 1.200536847114563, "step": 5172 }, { "epoch": 1.5925355906117737, "grad_norm": 5.375, "learning_rate": 1.4205273865761393e-06, "loss": 1.2767208814620972, "step": 5174 }, { "epoch": 1.5931512120046172, "grad_norm": 12.25, "learning_rate": 1.4193051356005608e-06, "loss": 1.1916471719741821, "step": 5176 }, { "epoch": 1.5937668333974606, "grad_norm": 6.96875, "learning_rate": 1.418084455396902e-06, "loss": 1.357552170753479, "step": 5178 }, { "epoch": 1.594382454790304, "grad_norm": 5.15625, "learning_rate": 1.416865347178179e-06, "loss": 1.1372745037078857, "step": 5180 }, { "epoch": 1.5949980761831473, "grad_norm": 4.3125, "learning_rate": 1.4156478121558484e-06, "loss": 1.5234270095825195, "step": 5182 }, { "epoch": 1.595613697575991, "grad_norm": 9.25, "learning_rate": 1.4144318515398012e-06, "loss": 1.21889328956604, "step": 5184 }, { "epoch": 1.5962293189688341, "grad_norm": 5.90625, "learning_rate": 1.4132174665383658e-06, "loss": 1.5588219165802002, "step": 5186 }, { "epoch": 1.5968449403616776, "grad_norm": 16.125, "learning_rate": 1.4120046583583019e-06, "loss": 1.4573081731796265, "step": 5188 }, { "epoch": 1.597460561754521, "grad_norm": 5.0, "learning_rate": 1.4107934282048056e-06, "loss": 1.3332809209823608, "step": 5190 }, { "epoch": 1.5980761831473642, "grad_norm": 4.5, "learning_rate": 1.4095837772815033e-06, "loss": 1.2771632671356201, "step": 5192 }, { "epoch": 1.5986918045402079, "grad_norm": 12.0625, "learning_rate": 1.4083757067904513e-06, "loss": 1.2745952606201172, "step": 5194 }, { "epoch": 1.599307425933051, "grad_norm": 1.640625, "learning_rate": 1.4071692179321378e-06, "loss": 1.1325695514678955, "step": 5196 }, { "epoch": 1.5999230473258945, "grad_norm": 6.3125, "learning_rate": 1.405964311905477e-06, "loss": 1.1452646255493164, "step": 5198 }, { "epoch": 1.600538668718738, "grad_norm": 8.375, "learning_rate": 1.4047609899078107e-06, "loss": 1.3691991567611694, "step": 5200 }, { "epoch": 1.6011542901115814, "grad_norm": 21.375, "learning_rate": 1.4035592531349079e-06, "loss": 1.2523620128631592, "step": 5202 }, { "epoch": 1.6017699115044248, "grad_norm": 5.6875, "learning_rate": 1.4023591027809601e-06, "loss": 1.025434136390686, "step": 5204 }, { "epoch": 1.602385532897268, "grad_norm": 9.375, "learning_rate": 1.4011605400385847e-06, "loss": 1.1921247243881226, "step": 5206 }, { "epoch": 1.6030011542901117, "grad_norm": 7.0, "learning_rate": 1.3999635660988199e-06, "loss": 1.3524107933044434, "step": 5208 }, { "epoch": 1.603616775682955, "grad_norm": 6.1875, "learning_rate": 1.3987681821511255e-06, "loss": 1.6408205032348633, "step": 5210 }, { "epoch": 1.6042323970757983, "grad_norm": 4.09375, "learning_rate": 1.3975743893833823e-06, "loss": 1.2043273448944092, "step": 5212 }, { "epoch": 1.6048480184686418, "grad_norm": 11.75, "learning_rate": 1.3963821889818868e-06, "loss": 1.4415565729141235, "step": 5214 }, { "epoch": 1.6054636398614852, "grad_norm": 8.875, "learning_rate": 1.3951915821313572e-06, "loss": 0.6851305365562439, "step": 5216 }, { "epoch": 1.6060792612543286, "grad_norm": 5.21875, "learning_rate": 1.394002570014925e-06, "loss": 1.123796820640564, "step": 5218 }, { "epoch": 1.6066948826471719, "grad_norm": 9.1875, "learning_rate": 1.3928151538141393e-06, "loss": 1.3693852424621582, "step": 5220 }, { "epoch": 1.6073105040400155, "grad_norm": 6.8125, "learning_rate": 1.3916293347089618e-06, "loss": 1.1036242246627808, "step": 5222 }, { "epoch": 1.6079261254328587, "grad_norm": 7.625, "learning_rate": 1.3904451138777666e-06, "loss": 1.0903420448303223, "step": 5224 }, { "epoch": 1.6085417468257022, "grad_norm": 2.5625, "learning_rate": 1.3892624924973425e-06, "loss": 1.148220419883728, "step": 5226 }, { "epoch": 1.6091573682185456, "grad_norm": 6.625, "learning_rate": 1.3880814717428844e-06, "loss": 0.9726516008377075, "step": 5228 }, { "epoch": 1.609772989611389, "grad_norm": 6.90625, "learning_rate": 1.386902052788001e-06, "loss": 1.5604653358459473, "step": 5230 }, { "epoch": 1.6103886110042325, "grad_norm": 11.5625, "learning_rate": 1.3857242368047065e-06, "loss": 1.1536065340042114, "step": 5232 }, { "epoch": 1.6110042323970757, "grad_norm": 9.6875, "learning_rate": 1.3845480249634227e-06, "loss": 2.0095927715301514, "step": 5234 }, { "epoch": 1.6116198537899193, "grad_norm": 5.65625, "learning_rate": 1.383373418432979e-06, "loss": 1.5020904541015625, "step": 5236 }, { "epoch": 1.6122354751827626, "grad_norm": 6.65625, "learning_rate": 1.382200418380607e-06, "loss": 1.2842357158660889, "step": 5238 }, { "epoch": 1.612851096575606, "grad_norm": 4.5, "learning_rate": 1.381029025971944e-06, "loss": 1.2769551277160645, "step": 5240 }, { "epoch": 1.6134667179684494, "grad_norm": 25.5, "learning_rate": 1.3798592423710278e-06, "loss": 1.1915051937103271, "step": 5242 }, { "epoch": 1.6140823393612926, "grad_norm": 10.0625, "learning_rate": 1.3786910687402998e-06, "loss": 1.229771614074707, "step": 5244 }, { "epoch": 1.6146979607541363, "grad_norm": 7.6875, "learning_rate": 1.3775245062405996e-06, "loss": 1.2317193746566772, "step": 5246 }, { "epoch": 1.6153135821469795, "grad_norm": 7.40625, "learning_rate": 1.3763595560311663e-06, "loss": 1.4626492261886597, "step": 5248 }, { "epoch": 1.6159292035398232, "grad_norm": 5.71875, "learning_rate": 1.3751962192696378e-06, "loss": 1.6538395881652832, "step": 5250 }, { "epoch": 1.6165448249326664, "grad_norm": 7.1875, "learning_rate": 1.3740344971120478e-06, "loss": 0.9849225282669067, "step": 5252 }, { "epoch": 1.6171604463255098, "grad_norm": 3.90625, "learning_rate": 1.372874390712825e-06, "loss": 1.276718020439148, "step": 5254 }, { "epoch": 1.6177760677183533, "grad_norm": 10.5625, "learning_rate": 1.3717159012247938e-06, "loss": 1.4531131982803345, "step": 5256 }, { "epoch": 1.6183916891111965, "grad_norm": 16.375, "learning_rate": 1.3705590297991705e-06, "loss": 0.8134285807609558, "step": 5258 }, { "epoch": 1.6190073105040401, "grad_norm": 11.5625, "learning_rate": 1.3694037775855651e-06, "loss": 1.3254494667053223, "step": 5260 }, { "epoch": 1.6196229318968833, "grad_norm": 21.625, "learning_rate": 1.3682501457319764e-06, "loss": 1.3920022249221802, "step": 5262 }, { "epoch": 1.6202385532897268, "grad_norm": 12.3125, "learning_rate": 1.3670981353847955e-06, "loss": 1.470127820968628, "step": 5264 }, { "epoch": 1.6208541746825702, "grad_norm": 5.125, "learning_rate": 1.3659477476888006e-06, "loss": 1.1842358112335205, "step": 5266 }, { "epoch": 1.6214697960754136, "grad_norm": 12.5625, "learning_rate": 1.3647989837871565e-06, "loss": 1.2555248737335205, "step": 5268 }, { "epoch": 1.622085417468257, "grad_norm": 7.5, "learning_rate": 1.3636518448214172e-06, "loss": 1.632139801979065, "step": 5270 }, { "epoch": 1.6227010388611003, "grad_norm": 7.59375, "learning_rate": 1.362506331931519e-06, "loss": 1.117124080657959, "step": 5272 }, { "epoch": 1.623316660253944, "grad_norm": 5.4375, "learning_rate": 1.3613624462557857e-06, "loss": 1.1123738288879395, "step": 5274 }, { "epoch": 1.6239322816467872, "grad_norm": 12.75, "learning_rate": 1.3602201889309204e-06, "loss": 1.6312143802642822, "step": 5276 }, { "epoch": 1.6245479030396306, "grad_norm": 8.8125, "learning_rate": 1.3590795610920108e-06, "loss": 1.4402046203613281, "step": 5278 }, { "epoch": 1.625163524432474, "grad_norm": 6.15625, "learning_rate": 1.3579405638725238e-06, "loss": 1.1443254947662354, "step": 5280 }, { "epoch": 1.6257791458253175, "grad_norm": 7.09375, "learning_rate": 1.356803198404306e-06, "loss": 1.3337159156799316, "step": 5282 }, { "epoch": 1.626394767218161, "grad_norm": 4.5, "learning_rate": 1.355667465817584e-06, "loss": 1.227888584136963, "step": 5284 }, { "epoch": 1.627010388611004, "grad_norm": 6.3125, "learning_rate": 1.3545333672409605e-06, "loss": 1.1797523498535156, "step": 5286 }, { "epoch": 1.6276260100038478, "grad_norm": 9.1875, "learning_rate": 1.353400903801414e-06, "loss": 1.4626578092575073, "step": 5288 }, { "epoch": 1.628241631396691, "grad_norm": 11.3125, "learning_rate": 1.3522700766243e-06, "loss": 0.43031102418899536, "step": 5290 }, { "epoch": 1.6288572527895344, "grad_norm": 4.53125, "learning_rate": 1.3511408868333453e-06, "loss": 1.227097511291504, "step": 5292 }, { "epoch": 1.6294728741823779, "grad_norm": 2.984375, "learning_rate": 1.3500133355506523e-06, "loss": 1.1872293949127197, "step": 5294 }, { "epoch": 1.6300884955752213, "grad_norm": 8.0, "learning_rate": 1.3488874238966931e-06, "loss": 1.690487027168274, "step": 5296 }, { "epoch": 1.6307041169680647, "grad_norm": 5.40625, "learning_rate": 1.3477631529903124e-06, "loss": 1.1481996774673462, "step": 5298 }, { "epoch": 1.631319738360908, "grad_norm": 8.3125, "learning_rate": 1.346640523948723e-06, "loss": 1.4720221757888794, "step": 5300 }, { "epoch": 1.6319353597537516, "grad_norm": 2.609375, "learning_rate": 1.345519537887506e-06, "loss": 1.1942338943481445, "step": 5302 }, { "epoch": 1.6325509811465948, "grad_norm": 10.9375, "learning_rate": 1.344400195920611e-06, "loss": 1.0284016132354736, "step": 5304 }, { "epoch": 1.6331666025394382, "grad_norm": 51.25, "learning_rate": 1.3432824991603525e-06, "loss": 1.1344106197357178, "step": 5306 }, { "epoch": 1.6337822239322817, "grad_norm": 16.875, "learning_rate": 1.3421664487174116e-06, "loss": 1.4245868921279907, "step": 5308 }, { "epoch": 1.6343978453251249, "grad_norm": 7.53125, "learning_rate": 1.3410520457008325e-06, "loss": 1.54558265209198, "step": 5310 }, { "epoch": 1.6350134667179685, "grad_norm": 9.1875, "learning_rate": 1.3399392912180214e-06, "loss": 1.724228024482727, "step": 5312 }, { "epoch": 1.6356290881108118, "grad_norm": 3.109375, "learning_rate": 1.3388281863747494e-06, "loss": 1.1608362197875977, "step": 5314 }, { "epoch": 1.6362447095036554, "grad_norm": 6.09375, "learning_rate": 1.3377187322751448e-06, "loss": 1.1420176029205322, "step": 5316 }, { "epoch": 1.6368603308964986, "grad_norm": 5.34375, "learning_rate": 1.336610930021697e-06, "loss": 1.0587470531463623, "step": 5318 }, { "epoch": 1.637475952289342, "grad_norm": 5.90625, "learning_rate": 1.3355047807152543e-06, "loss": 1.3292616605758667, "step": 5320 }, { "epoch": 1.6380915736821855, "grad_norm": 10.8125, "learning_rate": 1.3344002854550222e-06, "loss": 1.4881070852279663, "step": 5322 }, { "epoch": 1.6387071950750287, "grad_norm": 11.125, "learning_rate": 1.3332974453385628e-06, "loss": 1.8732441663742065, "step": 5324 }, { "epoch": 1.6393228164678724, "grad_norm": 8.6875, "learning_rate": 1.3321962614617914e-06, "loss": 1.35651695728302, "step": 5326 }, { "epoch": 1.6399384378607156, "grad_norm": 15.25, "learning_rate": 1.3310967349189815e-06, "loss": 1.5227655172348022, "step": 5328 }, { "epoch": 1.640554059253559, "grad_norm": 2.90625, "learning_rate": 1.329998866802755e-06, "loss": 1.1766257286071777, "step": 5330 }, { "epoch": 1.6411696806464025, "grad_norm": 9.8125, "learning_rate": 1.3289026582040892e-06, "loss": 1.5235556364059448, "step": 5332 }, { "epoch": 1.6417853020392459, "grad_norm": 5.40625, "learning_rate": 1.3278081102123111e-06, "loss": 1.3055951595306396, "step": 5334 }, { "epoch": 1.6424009234320893, "grad_norm": 16.5, "learning_rate": 1.3267152239150971e-06, "loss": 1.365351676940918, "step": 5336 }, { "epoch": 1.6430165448249325, "grad_norm": 199.0, "learning_rate": 1.3256240003984736e-06, "loss": 1.2809425592422485, "step": 5338 }, { "epoch": 1.6436321662177762, "grad_norm": 6.3125, "learning_rate": 1.3245344407468133e-06, "loss": 1.3115761280059814, "step": 5340 }, { "epoch": 1.6442477876106194, "grad_norm": 6.5, "learning_rate": 1.3234465460428363e-06, "loss": 1.4750884771347046, "step": 5342 }, { "epoch": 1.6448634090034628, "grad_norm": 8.75, "learning_rate": 1.322360317367608e-06, "loss": 1.533599853515625, "step": 5344 }, { "epoch": 1.6454790303963063, "grad_norm": 5.59375, "learning_rate": 1.3212757558005374e-06, "loss": 1.107212781906128, "step": 5346 }, { "epoch": 1.6460946517891497, "grad_norm": 7.09375, "learning_rate": 1.3201928624193785e-06, "loss": 1.17499840259552, "step": 5348 }, { "epoch": 1.6467102731819931, "grad_norm": 10.0625, "learning_rate": 1.3191116383002265e-06, "loss": 1.496663212776184, "step": 5350 }, { "epoch": 1.6473258945748364, "grad_norm": 4.78125, "learning_rate": 1.3180320845175181e-06, "loss": 1.499389410018921, "step": 5352 }, { "epoch": 1.64794151596768, "grad_norm": 15.625, "learning_rate": 1.31695420214403e-06, "loss": 1.3854055404663086, "step": 5354 }, { "epoch": 1.6485571373605232, "grad_norm": 11.8125, "learning_rate": 1.3158779922508782e-06, "loss": 1.507443904876709, "step": 5356 }, { "epoch": 1.6491727587533667, "grad_norm": 7.96875, "learning_rate": 1.3148034559075169e-06, "loss": 1.6484109163284302, "step": 5358 }, { "epoch": 1.64978838014621, "grad_norm": 12.875, "learning_rate": 1.3137305941817354e-06, "loss": 1.4092316627502441, "step": 5360 }, { "epoch": 1.6504040015390535, "grad_norm": 13.625, "learning_rate": 1.3126594081396627e-06, "loss": 1.1647193431854248, "step": 5362 }, { "epoch": 1.651019622931897, "grad_norm": 8.8125, "learning_rate": 1.3115898988457586e-06, "loss": 1.6195112466812134, "step": 5364 }, { "epoch": 1.6516352443247402, "grad_norm": 8.125, "learning_rate": 1.3105220673628195e-06, "loss": 1.6777485609054565, "step": 5366 }, { "epoch": 1.6522508657175838, "grad_norm": 9.3125, "learning_rate": 1.3094559147519733e-06, "loss": 1.0617992877960205, "step": 5368 }, { "epoch": 1.652866487110427, "grad_norm": 7.09375, "learning_rate": 1.3083914420726787e-06, "loss": 1.3714184761047363, "step": 5370 }, { "epoch": 1.6534821085032705, "grad_norm": 31.75, "learning_rate": 1.3073286503827275e-06, "loss": 1.0258792638778687, "step": 5372 }, { "epoch": 1.654097729896114, "grad_norm": 3.265625, "learning_rate": 1.3062675407382389e-06, "loss": 1.311981439590454, "step": 5374 }, { "epoch": 1.6547133512889571, "grad_norm": 25.625, "learning_rate": 1.3052081141936618e-06, "loss": 1.4438797235488892, "step": 5376 }, { "epoch": 1.6553289726818008, "grad_norm": 11.5, "learning_rate": 1.3041503718017715e-06, "loss": 1.2950350046157837, "step": 5378 }, { "epoch": 1.655944594074644, "grad_norm": 4.96875, "learning_rate": 1.303094314613671e-06, "loss": 1.5233476161956787, "step": 5380 }, { "epoch": 1.6565602154674874, "grad_norm": 7.15625, "learning_rate": 1.3020399436787876e-06, "loss": 1.4304616451263428, "step": 5382 }, { "epoch": 1.6571758368603309, "grad_norm": 7.21875, "learning_rate": 1.3009872600448725e-06, "loss": 1.5034236907958984, "step": 5384 }, { "epoch": 1.6577914582531743, "grad_norm": 7.25, "learning_rate": 1.2999362647580027e-06, "loss": 1.215216040611267, "step": 5386 }, { "epoch": 1.6584070796460177, "grad_norm": 9.1875, "learning_rate": 1.2988869588625746e-06, "loss": 1.0233787298202515, "step": 5388 }, { "epoch": 1.659022701038861, "grad_norm": 7.46875, "learning_rate": 1.297839343401307e-06, "loss": 1.4407703876495361, "step": 5390 }, { "epoch": 1.6596383224317046, "grad_norm": 9.6875, "learning_rate": 1.2967934194152399e-06, "loss": 1.1850214004516602, "step": 5392 }, { "epoch": 1.6602539438245478, "grad_norm": 12.375, "learning_rate": 1.2957491879437306e-06, "loss": 1.499040126800537, "step": 5394 }, { "epoch": 1.6608695652173913, "grad_norm": 9.25, "learning_rate": 1.2947066500244554e-06, "loss": 1.5880337953567505, "step": 5396 }, { "epoch": 1.6614851866102347, "grad_norm": 7.78125, "learning_rate": 1.2936658066934077e-06, "loss": 1.1170679330825806, "step": 5398 }, { "epoch": 1.6621008080030781, "grad_norm": 7.03125, "learning_rate": 1.2926266589848965e-06, "loss": 1.1979840993881226, "step": 5400 }, { "epoch": 1.6627164293959216, "grad_norm": 2.25, "learning_rate": 1.2915892079315465e-06, "loss": 1.1809914112091064, "step": 5402 }, { "epoch": 1.6633320507887648, "grad_norm": 9.5, "learning_rate": 1.2905534545642958e-06, "loss": 1.5309373140335083, "step": 5404 }, { "epoch": 1.6639476721816084, "grad_norm": 5.40625, "learning_rate": 1.2895193999123966e-06, "loss": 1.4166136980056763, "step": 5406 }, { "epoch": 1.6645632935744517, "grad_norm": 6.90625, "learning_rate": 1.2884870450034112e-06, "loss": 1.5316441059112549, "step": 5408 }, { "epoch": 1.665178914967295, "grad_norm": 4.1875, "learning_rate": 1.2874563908632142e-06, "loss": 1.3547664880752563, "step": 5410 }, { "epoch": 1.6657945363601385, "grad_norm": 11.375, "learning_rate": 1.28642743851599e-06, "loss": 1.549019455909729, "step": 5412 }, { "epoch": 1.666410157752982, "grad_norm": 9.1875, "learning_rate": 1.2854001889842305e-06, "loss": 1.3322649002075195, "step": 5414 }, { "epoch": 1.6670257791458254, "grad_norm": 6.40625, "learning_rate": 1.2843746432887382e-06, "loss": 1.2669012546539307, "step": 5416 }, { "epoch": 1.6676414005386686, "grad_norm": 9.0, "learning_rate": 1.2833508024486197e-06, "loss": 1.4787484407424927, "step": 5418 }, { "epoch": 1.6682570219315123, "grad_norm": 6.40625, "learning_rate": 1.282328667481289e-06, "loss": 1.5382004976272583, "step": 5420 }, { "epoch": 1.6688726433243555, "grad_norm": 11.0, "learning_rate": 1.2813082394024646e-06, "loss": 1.46946382522583, "step": 5422 }, { "epoch": 1.669488264717199, "grad_norm": 2.859375, "learning_rate": 1.280289519226168e-06, "loss": 1.4135793447494507, "step": 5424 }, { "epoch": 1.6701038861100423, "grad_norm": 580.0, "learning_rate": 1.2792725079647253e-06, "loss": 0.8567907214164734, "step": 5426 }, { "epoch": 1.6707195075028856, "grad_norm": 11.25, "learning_rate": 1.2782572066287626e-06, "loss": 1.6358792781829834, "step": 5428 }, { "epoch": 1.6713351288957292, "grad_norm": 4.71875, "learning_rate": 1.2772436162272084e-06, "loss": 1.5351207256317139, "step": 5430 }, { "epoch": 1.6719507502885724, "grad_norm": 11.625, "learning_rate": 1.2762317377672905e-06, "loss": 1.840936541557312, "step": 5432 }, { "epoch": 1.672566371681416, "grad_norm": 4.6875, "learning_rate": 1.2752215722545334e-06, "loss": 1.1363348960876465, "step": 5434 }, { "epoch": 1.6731819930742593, "grad_norm": 6.6875, "learning_rate": 1.2742131206927624e-06, "loss": 1.3856546878814697, "step": 5436 }, { "epoch": 1.6737976144671027, "grad_norm": 5.875, "learning_rate": 1.273206384084098e-06, "loss": 1.0931546688079834, "step": 5438 }, { "epoch": 1.6744132358599462, "grad_norm": 11.3125, "learning_rate": 1.2722013634289579e-06, "loss": 1.5303289890289307, "step": 5440 }, { "epoch": 1.6750288572527894, "grad_norm": 5.59375, "learning_rate": 1.2711980597260532e-06, "loss": 1.5187264680862427, "step": 5442 }, { "epoch": 1.675644478645633, "grad_norm": 7.4375, "learning_rate": 1.2701964739723883e-06, "loss": 1.1173781156539917, "step": 5444 }, { "epoch": 1.6762601000384763, "grad_norm": 6.8125, "learning_rate": 1.2691966071632634e-06, "loss": 0.7528840899467468, "step": 5446 }, { "epoch": 1.6768757214313197, "grad_norm": 6.21875, "learning_rate": 1.2681984602922659e-06, "loss": 0.8180416822433472, "step": 5448 }, { "epoch": 1.6774913428241631, "grad_norm": 7.78125, "learning_rate": 1.2672020343512788e-06, "loss": 1.5359752178192139, "step": 5450 }, { "epoch": 1.6781069642170066, "grad_norm": 2.265625, "learning_rate": 1.2662073303304726e-06, "loss": 1.2845362424850464, "step": 5452 }, { "epoch": 1.67872258560985, "grad_norm": 12.5, "learning_rate": 1.265214349218306e-06, "loss": 0.669722318649292, "step": 5454 }, { "epoch": 1.6793382070026932, "grad_norm": 7.65625, "learning_rate": 1.2642230920015279e-06, "loss": 1.5199944972991943, "step": 5456 }, { "epoch": 1.6799538283955369, "grad_norm": 13.875, "learning_rate": 1.2632335596651717e-06, "loss": 1.471770167350769, "step": 5458 }, { "epoch": 1.68056944978838, "grad_norm": 23.0, "learning_rate": 1.2622457531925586e-06, "loss": 1.2522399425506592, "step": 5460 }, { "epoch": 1.6811850711812235, "grad_norm": 13.625, "learning_rate": 1.2612596735652935e-06, "loss": 1.019469976425171, "step": 5462 }, { "epoch": 1.681800692574067, "grad_norm": 6.09375, "learning_rate": 1.2602753217632662e-06, "loss": 1.0745410919189453, "step": 5464 }, { "epoch": 1.6824163139669104, "grad_norm": 1.9453125, "learning_rate": 1.2592926987646492e-06, "loss": 1.2746351957321167, "step": 5466 }, { "epoch": 1.6830319353597538, "grad_norm": 8.0625, "learning_rate": 1.2583118055458965e-06, "loss": 1.1866888999938965, "step": 5468 }, { "epoch": 1.683647556752597, "grad_norm": 1.9765625, "learning_rate": 1.2573326430817443e-06, "loss": 1.1329030990600586, "step": 5470 }, { "epoch": 1.6842631781454407, "grad_norm": 6.71875, "learning_rate": 1.256355212345208e-06, "loss": 1.3046042919158936, "step": 5472 }, { "epoch": 1.684878799538284, "grad_norm": 4.6875, "learning_rate": 1.2553795143075825e-06, "loss": 1.1448999643325806, "step": 5474 }, { "epoch": 1.6854944209311273, "grad_norm": 4.84375, "learning_rate": 1.2544055499384408e-06, "loss": 1.2929960489273071, "step": 5476 }, { "epoch": 1.6861100423239708, "grad_norm": 8.75, "learning_rate": 1.2534333202056326e-06, "loss": 1.6156351566314697, "step": 5478 }, { "epoch": 1.6867256637168142, "grad_norm": 8.4375, "learning_rate": 1.252462826075285e-06, "loss": 1.4472776651382446, "step": 5480 }, { "epoch": 1.6873412851096576, "grad_norm": 13.75, "learning_rate": 1.2514940685117996e-06, "loss": 1.7093762159347534, "step": 5482 }, { "epoch": 1.6879569065025009, "grad_norm": 24.0, "learning_rate": 1.2505270484778532e-06, "loss": 1.6886651515960693, "step": 5484 }, { "epoch": 1.6885725278953445, "grad_norm": 3.359375, "learning_rate": 1.2495617669343943e-06, "loss": 1.4793897867202759, "step": 5486 }, { "epoch": 1.6891881492881877, "grad_norm": 14.125, "learning_rate": 1.2485982248406445e-06, "loss": 1.4237476587295532, "step": 5488 }, { "epoch": 1.6898037706810312, "grad_norm": 5.96875, "learning_rate": 1.2476364231540982e-06, "loss": 1.5307400226593018, "step": 5490 }, { "epoch": 1.6904193920738746, "grad_norm": 4.96875, "learning_rate": 1.2466763628305189e-06, "loss": 1.1641037464141846, "step": 5492 }, { "epoch": 1.6910350134667178, "grad_norm": 13.1875, "learning_rate": 1.24571804482394e-06, "loss": 1.4488732814788818, "step": 5494 }, { "epoch": 1.6916506348595615, "grad_norm": 8.6875, "learning_rate": 1.2447614700866639e-06, "loss": 1.524038553237915, "step": 5496 }, { "epoch": 1.6922662562524047, "grad_norm": 7.5, "learning_rate": 1.24380663956926e-06, "loss": 1.4217767715454102, "step": 5498 }, { "epoch": 1.6928818776452483, "grad_norm": 7.65625, "learning_rate": 1.2428535542205651e-06, "loss": 1.250489592552185, "step": 5500 }, { "epoch": 1.6934974990380915, "grad_norm": 4.3125, "learning_rate": 1.2419022149876808e-06, "loss": 1.3538804054260254, "step": 5502 }, { "epoch": 1.694113120430935, "grad_norm": 10.8125, "learning_rate": 1.240952622815975e-06, "loss": 0.9905145168304443, "step": 5504 }, { "epoch": 1.6947287418237784, "grad_norm": 3.203125, "learning_rate": 1.2400047786490783e-06, "loss": 1.1671013832092285, "step": 5506 }, { "epoch": 1.6953443632166216, "grad_norm": 8.25, "learning_rate": 1.2390586834288846e-06, "loss": 1.2432751655578613, "step": 5508 }, { "epoch": 1.6959599846094653, "grad_norm": 2.90625, "learning_rate": 1.238114338095551e-06, "loss": 1.2975242137908936, "step": 5510 }, { "epoch": 1.6965756060023085, "grad_norm": 5.4375, "learning_rate": 1.2371717435874926e-06, "loss": 1.561279535293579, "step": 5512 }, { "epoch": 1.697191227395152, "grad_norm": 6.5, "learning_rate": 1.2362309008413887e-06, "loss": 1.4640417098999023, "step": 5514 }, { "epoch": 1.6978068487879954, "grad_norm": 8.875, "learning_rate": 1.2352918107921744e-06, "loss": 1.2020208835601807, "step": 5516 }, { "epoch": 1.6984224701808388, "grad_norm": 8.5, "learning_rate": 1.2343544743730454e-06, "loss": 1.6358736753463745, "step": 5518 }, { "epoch": 1.6990380915736822, "grad_norm": 9.5625, "learning_rate": 1.233418892515454e-06, "loss": 1.4060251712799072, "step": 5520 }, { "epoch": 1.6996537129665255, "grad_norm": 3.984375, "learning_rate": 1.232485066149108e-06, "loss": 1.262081503868103, "step": 5522 }, { "epoch": 1.7002693343593691, "grad_norm": 11.0, "learning_rate": 1.2315529962019722e-06, "loss": 1.615898847579956, "step": 5524 }, { "epoch": 1.7008849557522123, "grad_norm": 3.515625, "learning_rate": 1.230622683600265e-06, "loss": 1.0624699592590332, "step": 5526 }, { "epoch": 1.7015005771450558, "grad_norm": 13.1875, "learning_rate": 1.2296941292684595e-06, "loss": 1.3232251405715942, "step": 5528 }, { "epoch": 1.7021161985378992, "grad_norm": 2.4375, "learning_rate": 1.2287673341292808e-06, "loss": 1.1552577018737793, "step": 5530 }, { "epoch": 1.7027318199307426, "grad_norm": 12.25, "learning_rate": 1.2278422991037051e-06, "loss": 1.103263258934021, "step": 5532 }, { "epoch": 1.703347441323586, "grad_norm": 4.5625, "learning_rate": 1.2269190251109619e-06, "loss": 1.0746948719024658, "step": 5534 }, { "epoch": 1.7039630627164293, "grad_norm": 6.9375, "learning_rate": 1.2259975130685285e-06, "loss": 1.2396745681762695, "step": 5536 }, { "epoch": 1.704578684109273, "grad_norm": 5.6875, "learning_rate": 1.2250777638921318e-06, "loss": 1.2555797100067139, "step": 5538 }, { "epoch": 1.7051943055021161, "grad_norm": 5.75, "learning_rate": 1.2241597784957477e-06, "loss": 1.088400959968567, "step": 5540 }, { "epoch": 1.7058099268949596, "grad_norm": 3.453125, "learning_rate": 1.2232435577915982e-06, "loss": 1.0315688848495483, "step": 5542 }, { "epoch": 1.706425548287803, "grad_norm": 3.0, "learning_rate": 1.2223291026901534e-06, "loss": 1.2652621269226074, "step": 5544 }, { "epoch": 1.7070411696806465, "grad_norm": 5.875, "learning_rate": 1.2214164141001266e-06, "loss": 1.4031015634536743, "step": 5546 }, { "epoch": 1.7076567910734899, "grad_norm": 7.53125, "learning_rate": 1.2205054929284784e-06, "loss": 1.2202022075653076, "step": 5548 }, { "epoch": 1.708272412466333, "grad_norm": 4.28125, "learning_rate": 1.21959634008041e-06, "loss": 1.0692622661590576, "step": 5550 }, { "epoch": 1.7088880338591768, "grad_norm": 7.6875, "learning_rate": 1.2186889564593678e-06, "loss": 1.356179118156433, "step": 5552 }, { "epoch": 1.70950365525202, "grad_norm": 11.1875, "learning_rate": 1.2177833429670395e-06, "loss": 1.7683215141296387, "step": 5554 }, { "epoch": 1.7101192766448634, "grad_norm": 4.65625, "learning_rate": 1.2168795005033524e-06, "loss": 1.0441101789474487, "step": 5556 }, { "epoch": 1.7107348980377068, "grad_norm": 5.4375, "learning_rate": 1.2159774299664765e-06, "loss": 1.1631641387939453, "step": 5558 }, { "epoch": 1.71135051943055, "grad_norm": 14.0, "learning_rate": 1.2150771322528187e-06, "loss": 1.212846040725708, "step": 5560 }, { "epoch": 1.7119661408233937, "grad_norm": 9.8125, "learning_rate": 1.2141786082570248e-06, "loss": 1.3858622312545776, "step": 5562 }, { "epoch": 1.712581762216237, "grad_norm": 6.40625, "learning_rate": 1.2132818588719788e-06, "loss": 1.2670592069625854, "step": 5564 }, { "epoch": 1.7131973836090806, "grad_norm": 18.625, "learning_rate": 1.2123868849888e-06, "loss": 1.259639859199524, "step": 5566 }, { "epoch": 1.7138130050019238, "grad_norm": 12.9375, "learning_rate": 1.2114936874968452e-06, "loss": 1.3705947399139404, "step": 5568 }, { "epoch": 1.7144286263947672, "grad_norm": 6.875, "learning_rate": 1.210602267283703e-06, "loss": 0.7072739601135254, "step": 5570 }, { "epoch": 1.7150442477876107, "grad_norm": 4.875, "learning_rate": 1.2097126252351992e-06, "loss": 1.2308495044708252, "step": 5572 }, { "epoch": 1.7156598691804539, "grad_norm": 5.25, "learning_rate": 1.2088247622353907e-06, "loss": 1.469330906867981, "step": 5574 }, { "epoch": 1.7162754905732975, "grad_norm": 7.28125, "learning_rate": 1.2079386791665664e-06, "loss": 1.3243225812911987, "step": 5576 }, { "epoch": 1.7168911119661407, "grad_norm": 9.3125, "learning_rate": 1.2070543769092475e-06, "loss": 1.68306565284729, "step": 5578 }, { "epoch": 1.7175067333589842, "grad_norm": 8.25, "learning_rate": 1.206171856342184e-06, "loss": 1.2183438539505005, "step": 5580 }, { "epoch": 1.7181223547518276, "grad_norm": 13.3125, "learning_rate": 1.205291118342357e-06, "loss": 1.412492275238037, "step": 5582 }, { "epoch": 1.718737976144671, "grad_norm": 17.25, "learning_rate": 1.2044121637849762e-06, "loss": 1.0876752138137817, "step": 5584 }, { "epoch": 1.7193535975375145, "grad_norm": 7.15625, "learning_rate": 1.203534993543477e-06, "loss": 1.2739953994750977, "step": 5586 }, { "epoch": 1.7199692189303577, "grad_norm": 4.625, "learning_rate": 1.202659608489525e-06, "loss": 1.3260688781738281, "step": 5588 }, { "epoch": 1.7205848403232014, "grad_norm": 6.5, "learning_rate": 1.2017860094930084e-06, "loss": 0.9359119534492493, "step": 5590 }, { "epoch": 1.7212004617160446, "grad_norm": 14.4375, "learning_rate": 1.2009141974220428e-06, "loss": 1.80499267578125, "step": 5592 }, { "epoch": 1.721816083108888, "grad_norm": 6.84375, "learning_rate": 1.2000441731429669e-06, "loss": 1.2378640174865723, "step": 5594 }, { "epoch": 1.7224317045017314, "grad_norm": 3.90625, "learning_rate": 1.1991759375203437e-06, "loss": 1.1640088558197021, "step": 5596 }, { "epoch": 1.7230473258945749, "grad_norm": 4.4375, "learning_rate": 1.1983094914169586e-06, "loss": 1.1311296224594116, "step": 5598 }, { "epoch": 1.7236629472874183, "grad_norm": 12.8125, "learning_rate": 1.197444835693818e-06, "loss": 1.6203620433807373, "step": 5600 }, { "epoch": 1.7242785686802615, "grad_norm": 12.8125, "learning_rate": 1.19658197121015e-06, "loss": 1.2333106994628906, "step": 5602 }, { "epoch": 1.7248941900731052, "grad_norm": 7.5, "learning_rate": 1.1957208988234025e-06, "loss": 1.2552940845489502, "step": 5604 }, { "epoch": 1.7255098114659484, "grad_norm": 9.8125, "learning_rate": 1.1948616193892421e-06, "loss": 1.3899599313735962, "step": 5606 }, { "epoch": 1.7261254328587918, "grad_norm": 8.25, "learning_rate": 1.1940041337615544e-06, "loss": 1.5031278133392334, "step": 5608 }, { "epoch": 1.7267410542516353, "grad_norm": 11.625, "learning_rate": 1.1931484427924415e-06, "loss": 1.7559168338775635, "step": 5610 }, { "epoch": 1.7273566756444787, "grad_norm": 4.59375, "learning_rate": 1.1922945473322233e-06, "loss": 1.2270822525024414, "step": 5612 }, { "epoch": 1.7279722970373221, "grad_norm": 9.0625, "learning_rate": 1.1914424482294347e-06, "loss": 1.2980186939239502, "step": 5614 }, { "epoch": 1.7285879184301653, "grad_norm": 14.8125, "learning_rate": 1.1905921463308256e-06, "loss": 1.246907114982605, "step": 5616 }, { "epoch": 1.729203539823009, "grad_norm": 20.125, "learning_rate": 1.18974364248136e-06, "loss": 1.6529814004898071, "step": 5618 }, { "epoch": 1.7298191612158522, "grad_norm": 14.0, "learning_rate": 1.1888969375242153e-06, "loss": 1.4205410480499268, "step": 5620 }, { "epoch": 1.7304347826086957, "grad_norm": 3.140625, "learning_rate": 1.1880520323007811e-06, "loss": 1.503267765045166, "step": 5622 }, { "epoch": 1.731050404001539, "grad_norm": 5.625, "learning_rate": 1.1872089276506584e-06, "loss": 1.4263373613357544, "step": 5624 }, { "epoch": 1.7316660253943823, "grad_norm": 9.375, "learning_rate": 1.1863676244116604e-06, "loss": 1.579979419708252, "step": 5626 }, { "epoch": 1.732281646787226, "grad_norm": 6.8125, "learning_rate": 1.1855281234198073e-06, "loss": 1.5714635848999023, "step": 5628 }, { "epoch": 1.7328972681800692, "grad_norm": 10.5625, "learning_rate": 1.1846904255093312e-06, "loss": 1.6432576179504395, "step": 5630 }, { "epoch": 1.7335128895729126, "grad_norm": 4.9375, "learning_rate": 1.183854531512671e-06, "loss": 1.2803030014038086, "step": 5632 }, { "epoch": 1.734128510965756, "grad_norm": 2.484375, "learning_rate": 1.1830204422604728e-06, "loss": 1.0546574592590332, "step": 5634 }, { "epoch": 1.7347441323585995, "grad_norm": 19.375, "learning_rate": 1.1821881585815907e-06, "loss": 1.4187873601913452, "step": 5636 }, { "epoch": 1.735359753751443, "grad_norm": 7.5625, "learning_rate": 1.1813576813030831e-06, "loss": 1.383895754814148, "step": 5638 }, { "epoch": 1.7359753751442861, "grad_norm": 6.1875, "learning_rate": 1.180529011250214e-06, "loss": 1.2424262762069702, "step": 5640 }, { "epoch": 1.7365909965371298, "grad_norm": 4.90625, "learning_rate": 1.1797021492464514e-06, "loss": 1.362465739250183, "step": 5642 }, { "epoch": 1.737206617929973, "grad_norm": 26.625, "learning_rate": 1.1788770961134662e-06, "loss": 1.3541010618209839, "step": 5644 }, { "epoch": 1.7378222393228164, "grad_norm": 9.875, "learning_rate": 1.1780538526711329e-06, "loss": 0.9746760725975037, "step": 5646 }, { "epoch": 1.7384378607156599, "grad_norm": 9.9375, "learning_rate": 1.1772324197375267e-06, "loss": 1.4916549921035767, "step": 5648 }, { "epoch": 1.7390534821085033, "grad_norm": 4.28125, "learning_rate": 1.1764127981289234e-06, "loss": 1.221276044845581, "step": 5650 }, { "epoch": 1.7396691035013467, "grad_norm": 9.1875, "learning_rate": 1.1755949886598006e-06, "loss": 1.383480191230774, "step": 5652 }, { "epoch": 1.74028472489419, "grad_norm": 5.15625, "learning_rate": 1.1747789921428324e-06, "loss": 1.2294129133224487, "step": 5654 }, { "epoch": 1.7409003462870336, "grad_norm": 3.921875, "learning_rate": 1.1739648093888938e-06, "loss": 1.397127628326416, "step": 5656 }, { "epoch": 1.7415159676798768, "grad_norm": 8.1875, "learning_rate": 1.1731524412070562e-06, "loss": 1.2268887758255005, "step": 5658 }, { "epoch": 1.7421315890727203, "grad_norm": 5.59375, "learning_rate": 1.1723418884045881e-06, "loss": 1.2938385009765625, "step": 5660 }, { "epoch": 1.7427472104655637, "grad_norm": 3.625, "learning_rate": 1.171533151786954e-06, "loss": 1.0179165601730347, "step": 5662 }, { "epoch": 1.7433628318584071, "grad_norm": 30.25, "learning_rate": 1.1707262321578134e-06, "loss": 1.429439663887024, "step": 5664 }, { "epoch": 1.7439784532512506, "grad_norm": 7.96875, "learning_rate": 1.1699211303190212e-06, "loss": 1.526271104812622, "step": 5666 }, { "epoch": 1.7445940746440938, "grad_norm": 3.4375, "learning_rate": 1.169117847070624e-06, "loss": 1.0915848016738892, "step": 5668 }, { "epoch": 1.7452096960369374, "grad_norm": 9.125, "learning_rate": 1.1683163832108626e-06, "loss": 1.369112491607666, "step": 5670 }, { "epoch": 1.7458253174297806, "grad_norm": 4.84375, "learning_rate": 1.1675167395361705e-06, "loss": 1.1842763423919678, "step": 5672 }, { "epoch": 1.746440938822624, "grad_norm": 5.4375, "learning_rate": 1.1667189168411706e-06, "loss": 1.0369771718978882, "step": 5674 }, { "epoch": 1.7470565602154675, "grad_norm": 13.625, "learning_rate": 1.1659229159186779e-06, "loss": 1.2211787700653076, "step": 5676 }, { "epoch": 1.7476721816083107, "grad_norm": 5.46875, "learning_rate": 1.165128737559696e-06, "loss": 0.7121513485908508, "step": 5678 }, { "epoch": 1.7482878030011544, "grad_norm": 3.421875, "learning_rate": 1.1643363825534173e-06, "loss": 1.1748327016830444, "step": 5680 }, { "epoch": 1.7489034243939976, "grad_norm": 5.65625, "learning_rate": 1.1635458516872234e-06, "loss": 1.2920325994491577, "step": 5682 }, { "epoch": 1.7495190457868413, "grad_norm": 8.8125, "learning_rate": 1.1627571457466824e-06, "loss": 1.368028998374939, "step": 5684 }, { "epoch": 1.7501346671796845, "grad_norm": 11.375, "learning_rate": 1.161970265515549e-06, "loss": 1.3619840145111084, "step": 5686 }, { "epoch": 1.750750288572528, "grad_norm": 5.25, "learning_rate": 1.1611852117757634e-06, "loss": 1.3654226064682007, "step": 5688 }, { "epoch": 1.7513659099653713, "grad_norm": 6.09375, "learning_rate": 1.1604019853074518e-06, "loss": 1.1195334196090698, "step": 5690 }, { "epoch": 1.7519815313582145, "grad_norm": 23.75, "learning_rate": 1.1596205868889238e-06, "loss": 1.4777530431747437, "step": 5692 }, { "epoch": 1.7525971527510582, "grad_norm": 6.375, "learning_rate": 1.1588410172966719e-06, "loss": 1.3182849884033203, "step": 5694 }, { "epoch": 1.7532127741439014, "grad_norm": 9.125, "learning_rate": 1.1580632773053722e-06, "loss": 1.2617470026016235, "step": 5696 }, { "epoch": 1.7538283955367449, "grad_norm": 5.0, "learning_rate": 1.1572873676878822e-06, "loss": 1.5043034553527832, "step": 5698 }, { "epoch": 1.7544440169295883, "grad_norm": 1.8203125, "learning_rate": 1.156513289215241e-06, "loss": 0.9596728086471558, "step": 5700 }, { "epoch": 1.7550596383224317, "grad_norm": 5.09375, "learning_rate": 1.1557410426566678e-06, "loss": 1.29399573802948, "step": 5702 }, { "epoch": 1.7556752597152752, "grad_norm": 6.25, "learning_rate": 1.154970628779561e-06, "loss": 1.644959568977356, "step": 5704 }, { "epoch": 1.7562908811081184, "grad_norm": 14.25, "learning_rate": 1.1542020483494982e-06, "loss": 1.3602919578552246, "step": 5706 }, { "epoch": 1.756906502500962, "grad_norm": 5.46875, "learning_rate": 1.1534353021302347e-06, "loss": 1.2901757955551147, "step": 5708 }, { "epoch": 1.7575221238938052, "grad_norm": 8.25, "learning_rate": 1.1526703908837043e-06, "loss": 1.4418219327926636, "step": 5710 }, { "epoch": 1.7581377452866487, "grad_norm": 15.75, "learning_rate": 1.1519073153700156e-06, "loss": 1.4010531902313232, "step": 5712 }, { "epoch": 1.7587533666794921, "grad_norm": 5.40625, "learning_rate": 1.1511460763474543e-06, "loss": 1.2491612434387207, "step": 5714 }, { "epoch": 1.7593689880723355, "grad_norm": 5.375, "learning_rate": 1.1503866745724807e-06, "loss": 1.45999014377594, "step": 5716 }, { "epoch": 1.759984609465179, "grad_norm": 2.859375, "learning_rate": 1.1496291107997288e-06, "loss": 1.1590230464935303, "step": 5718 }, { "epoch": 1.7606002308580222, "grad_norm": 9.75, "learning_rate": 1.1488733857820073e-06, "loss": 1.1593949794769287, "step": 5720 }, { "epoch": 1.7612158522508659, "grad_norm": 22.375, "learning_rate": 1.1481195002702968e-06, "loss": 1.744612455368042, "step": 5722 }, { "epoch": 1.761831473643709, "grad_norm": 8.125, "learning_rate": 1.1473674550137503e-06, "loss": 1.607894778251648, "step": 5724 }, { "epoch": 1.7624470950365525, "grad_norm": 5.71875, "learning_rate": 1.1466172507596923e-06, "loss": 1.4263869524002075, "step": 5726 }, { "epoch": 1.763062716429396, "grad_norm": 5.4375, "learning_rate": 1.1458688882536168e-06, "loss": 1.3822015523910522, "step": 5728 }, { "epoch": 1.7636783378222394, "grad_norm": 15.75, "learning_rate": 1.14512236823919e-06, "loss": 1.2835092544555664, "step": 5730 }, { "epoch": 1.7642939592150828, "grad_norm": 11.1875, "learning_rate": 1.1443776914582434e-06, "loss": 1.523394227027893, "step": 5732 }, { "epoch": 1.764909580607926, "grad_norm": 4.09375, "learning_rate": 1.1436348586507807e-06, "loss": 1.4853382110595703, "step": 5734 }, { "epoch": 1.7655252020007697, "grad_norm": 2.25, "learning_rate": 1.1428938705549704e-06, "loss": 1.1671732664108276, "step": 5736 }, { "epoch": 1.766140823393613, "grad_norm": 7.15625, "learning_rate": 1.1421547279071499e-06, "loss": 1.3251278400421143, "step": 5738 }, { "epoch": 1.7667564447864563, "grad_norm": 7.53125, "learning_rate": 1.1414174314418217e-06, "loss": 1.5456947088241577, "step": 5740 }, { "epoch": 1.7673720661792998, "grad_norm": 5.90625, "learning_rate": 1.1406819818916533e-06, "loss": 0.7835991978645325, "step": 5742 }, { "epoch": 1.767987687572143, "grad_norm": 15.25, "learning_rate": 1.1399483799874777e-06, "loss": 1.0862616300582886, "step": 5744 }, { "epoch": 1.7686033089649866, "grad_norm": 12.75, "learning_rate": 1.139216626458291e-06, "loss": 1.527071475982666, "step": 5746 }, { "epoch": 1.7692189303578298, "grad_norm": 6.0625, "learning_rate": 1.1384867220312541e-06, "loss": 1.3800554275512695, "step": 5748 }, { "epoch": 1.7698345517506735, "grad_norm": 6.15625, "learning_rate": 1.1377586674316887e-06, "loss": 1.2057721614837646, "step": 5750 }, { "epoch": 1.7704501731435167, "grad_norm": 4.46875, "learning_rate": 1.137032463383079e-06, "loss": 1.502542495727539, "step": 5752 }, { "epoch": 1.7710657945363601, "grad_norm": 6.3125, "learning_rate": 1.1363081106070709e-06, "loss": 1.1586823463439941, "step": 5754 }, { "epoch": 1.7716814159292036, "grad_norm": 9.625, "learning_rate": 1.1355856098234693e-06, "loss": 1.2189154624938965, "step": 5756 }, { "epoch": 1.7722970373220468, "grad_norm": 7.9375, "learning_rate": 1.1348649617502395e-06, "loss": 1.298841118812561, "step": 5758 }, { "epoch": 1.7729126587148905, "grad_norm": 10.875, "learning_rate": 1.1341461671035059e-06, "loss": 1.798980474472046, "step": 5760 }, { "epoch": 1.7735282801077337, "grad_norm": 10.875, "learning_rate": 1.1334292265975506e-06, "loss": 1.6718307733535767, "step": 5762 }, { "epoch": 1.774143901500577, "grad_norm": 3.65625, "learning_rate": 1.1327141409448134e-06, "loss": 1.12022864818573, "step": 5764 }, { "epoch": 1.7747595228934205, "grad_norm": 4.84375, "learning_rate": 1.132000910855891e-06, "loss": 1.088094711303711, "step": 5766 }, { "epoch": 1.775375144286264, "grad_norm": 3.21875, "learning_rate": 1.131289537039537e-06, "loss": 1.2651698589324951, "step": 5768 }, { "epoch": 1.7759907656791074, "grad_norm": 18.25, "learning_rate": 1.1305800202026581e-06, "loss": 1.527762770652771, "step": 5770 }, { "epoch": 1.7766063870719506, "grad_norm": 5.46875, "learning_rate": 1.1298723610503178e-06, "loss": 1.3321083784103394, "step": 5772 }, { "epoch": 1.7772220084647943, "grad_norm": 29.75, "learning_rate": 1.129166560285733e-06, "loss": 1.3004841804504395, "step": 5774 }, { "epoch": 1.7778376298576375, "grad_norm": 6.53125, "learning_rate": 1.1284626186102733e-06, "loss": 1.1199957132339478, "step": 5776 }, { "epoch": 1.778453251250481, "grad_norm": 9.5625, "learning_rate": 1.1277605367234617e-06, "loss": 1.4505853652954102, "step": 5778 }, { "epoch": 1.7790688726433244, "grad_norm": 3.625, "learning_rate": 1.1270603153229725e-06, "loss": 1.1563801765441895, "step": 5780 }, { "epoch": 1.7796844940361678, "grad_norm": 3.5, "learning_rate": 1.1263619551046315e-06, "loss": 1.1516625881195068, "step": 5782 }, { "epoch": 1.7803001154290112, "grad_norm": 2.234375, "learning_rate": 1.1256654567624151e-06, "loss": 1.0001325607299805, "step": 5784 }, { "epoch": 1.7809157368218544, "grad_norm": 3.140625, "learning_rate": 1.1249708209884485e-06, "loss": 1.1107304096221924, "step": 5786 }, { "epoch": 1.781531358214698, "grad_norm": 7.71875, "learning_rate": 1.124278048473008e-06, "loss": 1.2107503414154053, "step": 5788 }, { "epoch": 1.7821469796075413, "grad_norm": 5.875, "learning_rate": 1.1235871399045157e-06, "loss": 1.021038293838501, "step": 5790 }, { "epoch": 1.7827626010003848, "grad_norm": 2.234375, "learning_rate": 1.1228980959695447e-06, "loss": 1.0064464807510376, "step": 5792 }, { "epoch": 1.7833782223932282, "grad_norm": 5.375, "learning_rate": 1.1222109173528127e-06, "loss": 1.3229267597198486, "step": 5794 }, { "epoch": 1.7839938437860716, "grad_norm": 5.90625, "learning_rate": 1.1215256047371837e-06, "loss": 1.33932363986969, "step": 5796 }, { "epoch": 1.784609465178915, "grad_norm": 5.46875, "learning_rate": 1.120842158803669e-06, "loss": 1.2903056144714355, "step": 5798 }, { "epoch": 1.7852250865717583, "grad_norm": 5.1875, "learning_rate": 1.120160580231424e-06, "loss": 1.3245118856430054, "step": 5800 }, { "epoch": 1.785840707964602, "grad_norm": 2.6875, "learning_rate": 1.1194808696977487e-06, "loss": 1.2954384088516235, "step": 5802 }, { "epoch": 1.7864563293574451, "grad_norm": 2.921875, "learning_rate": 1.1188030278780867e-06, "loss": 1.1626046895980835, "step": 5804 }, { "epoch": 1.7870719507502886, "grad_norm": 5.71875, "learning_rate": 1.118127055446024e-06, "loss": 1.4342396259307861, "step": 5806 }, { "epoch": 1.787687572143132, "grad_norm": 5.46875, "learning_rate": 1.1174529530732908e-06, "loss": 1.3454452753067017, "step": 5808 }, { "epoch": 1.7883031935359752, "grad_norm": 5.09375, "learning_rate": 1.1167807214297562e-06, "loss": 1.3604456186294556, "step": 5810 }, { "epoch": 1.7889188149288189, "grad_norm": 5.09375, "learning_rate": 1.116110361183433e-06, "loss": 1.27693772315979, "step": 5812 }, { "epoch": 1.789534436321662, "grad_norm": 4.34375, "learning_rate": 1.115441873000473e-06, "loss": 1.1668188571929932, "step": 5814 }, { "epoch": 1.7901500577145055, "grad_norm": 16.625, "learning_rate": 1.1147752575451674e-06, "loss": 1.3752187490463257, "step": 5816 }, { "epoch": 1.790765679107349, "grad_norm": 3.796875, "learning_rate": 1.1141105154799475e-06, "loss": 1.0731788873672485, "step": 5818 }, { "epoch": 1.7913813005001924, "grad_norm": 5.4375, "learning_rate": 1.113447647465382e-06, "loss": 1.426667332649231, "step": 5820 }, { "epoch": 1.7919969218930358, "grad_norm": 7.375, "learning_rate": 1.112786654160178e-06, "loss": 1.4523508548736572, "step": 5822 }, { "epoch": 1.792612543285879, "grad_norm": 9.875, "learning_rate": 1.112127536221179e-06, "loss": 1.5422025918960571, "step": 5824 }, { "epoch": 1.7932281646787227, "grad_norm": 15.0, "learning_rate": 1.1114702943033656e-06, "loss": 1.28928542137146, "step": 5826 }, { "epoch": 1.793843786071566, "grad_norm": 2.28125, "learning_rate": 1.1108149290598537e-06, "loss": 1.0345327854156494, "step": 5828 }, { "epoch": 1.7944594074644094, "grad_norm": 2.671875, "learning_rate": 1.1101614411418945e-06, "loss": 1.075919270515442, "step": 5830 }, { "epoch": 1.7950750288572528, "grad_norm": 5.53125, "learning_rate": 1.1095098311988735e-06, "loss": 1.2990070581436157, "step": 5832 }, { "epoch": 1.7956906502500962, "grad_norm": 7.5, "learning_rate": 1.1088600998783101e-06, "loss": 1.3440529108047485, "step": 5834 }, { "epoch": 1.7963062716429397, "grad_norm": 9.25, "learning_rate": 1.1082122478258572e-06, "loss": 1.5774719715118408, "step": 5836 }, { "epoch": 1.7969218930357829, "grad_norm": 6.875, "learning_rate": 1.1075662756852996e-06, "loss": 1.2371692657470703, "step": 5838 }, { "epoch": 1.7975375144286265, "grad_norm": 3.515625, "learning_rate": 1.106922184098554e-06, "loss": 1.199177861213684, "step": 5840 }, { "epoch": 1.7981531358214697, "grad_norm": 8.0625, "learning_rate": 1.106279973705669e-06, "loss": 1.2665684223175049, "step": 5842 }, { "epoch": 1.7987687572143132, "grad_norm": 6.65625, "learning_rate": 1.1056396451448233e-06, "loss": 1.4166505336761475, "step": 5844 }, { "epoch": 1.7993843786071566, "grad_norm": 7.15625, "learning_rate": 1.1050011990523263e-06, "loss": 1.285638451576233, "step": 5846 }, { "epoch": 1.8, "grad_norm": 6.6875, "learning_rate": 1.1043646360626159e-06, "loss": 1.3939177989959717, "step": 5848 }, { "epoch": 1.8006156213928435, "grad_norm": 8.5625, "learning_rate": 1.1037299568082583e-06, "loss": 1.106064796447754, "step": 5850 }, { "epoch": 1.8012312427856867, "grad_norm": 10.875, "learning_rate": 1.1030971619199496e-06, "loss": 1.2404805421829224, "step": 5852 }, { "epoch": 1.8018468641785303, "grad_norm": 5.28125, "learning_rate": 1.1024662520265113e-06, "loss": 0.9673604369163513, "step": 5854 }, { "epoch": 1.8024624855713736, "grad_norm": 4.6875, "learning_rate": 1.1018372277548934e-06, "loss": 1.3252668380737305, "step": 5856 }, { "epoch": 1.803078106964217, "grad_norm": 9.75, "learning_rate": 1.1012100897301712e-06, "loss": 1.380995512008667, "step": 5858 }, { "epoch": 1.8036937283570604, "grad_norm": 10.1875, "learning_rate": 1.1005848385755457e-06, "loss": 1.1895322799682617, "step": 5860 }, { "epoch": 1.8043093497499036, "grad_norm": 7.84375, "learning_rate": 1.0999614749123433e-06, "loss": 0.9498265981674194, "step": 5862 }, { "epoch": 1.8049249711427473, "grad_norm": 2.40625, "learning_rate": 1.099339999360014e-06, "loss": 0.9431395530700684, "step": 5864 }, { "epoch": 1.8055405925355905, "grad_norm": 5.5625, "learning_rate": 1.0987204125361324e-06, "loss": 1.2742674350738525, "step": 5866 }, { "epoch": 1.8061562139284342, "grad_norm": 5.53125, "learning_rate": 1.0981027150563954e-06, "loss": 1.5245552062988281, "step": 5868 }, { "epoch": 1.8067718353212774, "grad_norm": 7.125, "learning_rate": 1.0974869075346228e-06, "loss": 0.723529040813446, "step": 5870 }, { "epoch": 1.8073874567141208, "grad_norm": 4.59375, "learning_rate": 1.0968729905827575e-06, "loss": 1.2482393980026245, "step": 5872 }, { "epoch": 1.8080030781069643, "grad_norm": 3.859375, "learning_rate": 1.0962609648108607e-06, "loss": 1.2178765535354614, "step": 5874 }, { "epoch": 1.8086186994998075, "grad_norm": 7.59375, "learning_rate": 1.0956508308271174e-06, "loss": 1.0812054872512817, "step": 5876 }, { "epoch": 1.8092343208926511, "grad_norm": 4.9375, "learning_rate": 1.0950425892378309e-06, "loss": 1.3730570077896118, "step": 5878 }, { "epoch": 1.8098499422854943, "grad_norm": 65.0, "learning_rate": 1.094436240647425e-06, "loss": 0.9377802610397339, "step": 5880 }, { "epoch": 1.8104655636783378, "grad_norm": 10.75, "learning_rate": 1.0938317856584415e-06, "loss": 1.1239988803863525, "step": 5882 }, { "epoch": 1.8110811850711812, "grad_norm": 16.125, "learning_rate": 1.0932292248715407e-06, "loss": 1.2952073812484741, "step": 5884 }, { "epoch": 1.8116968064640246, "grad_norm": 6.03125, "learning_rate": 1.0926285588855016e-06, "loss": 1.3250057697296143, "step": 5886 }, { "epoch": 1.812312427856868, "grad_norm": 17.625, "learning_rate": 1.0920297882972183e-06, "loss": 1.3514528274536133, "step": 5888 }, { "epoch": 1.8129280492497113, "grad_norm": 6.5625, "learning_rate": 1.0914329137017032e-06, "loss": 1.3845181465148926, "step": 5890 }, { "epoch": 1.813543670642555, "grad_norm": 39.5, "learning_rate": 1.0908379356920838e-06, "loss": 1.256075382232666, "step": 5892 }, { "epoch": 1.8141592920353982, "grad_norm": 7.46875, "learning_rate": 1.0902448548596034e-06, "loss": 1.3791804313659668, "step": 5894 }, { "epoch": 1.8147749134282416, "grad_norm": 4.90625, "learning_rate": 1.089653671793619e-06, "loss": 1.1116917133331299, "step": 5896 }, { "epoch": 1.815390534821085, "grad_norm": 8.375, "learning_rate": 1.0890643870816033e-06, "loss": 1.268815279006958, "step": 5898 }, { "epoch": 1.8160061562139285, "grad_norm": 3.171875, "learning_rate": 1.0884770013091416e-06, "loss": 1.3997559547424316, "step": 5900 }, { "epoch": 1.816621777606772, "grad_norm": 7.03125, "learning_rate": 1.0878915150599318e-06, "loss": 1.3274210691452026, "step": 5902 }, { "epoch": 1.8172373989996151, "grad_norm": 6.8125, "learning_rate": 1.087307928915785e-06, "loss": 1.410146713256836, "step": 5904 }, { "epoch": 1.8178530203924588, "grad_norm": 22.375, "learning_rate": 1.0867262434566237e-06, "loss": 1.1526761054992676, "step": 5906 }, { "epoch": 1.818468641785302, "grad_norm": 8.75, "learning_rate": 1.0861464592604818e-06, "loss": 1.2998844385147095, "step": 5908 }, { "epoch": 1.8190842631781454, "grad_norm": 6.15625, "learning_rate": 1.0855685769035044e-06, "loss": 1.3243539333343506, "step": 5910 }, { "epoch": 1.8196998845709889, "grad_norm": 4.375, "learning_rate": 1.0849925969599454e-06, "loss": 1.346010446548462, "step": 5912 }, { "epoch": 1.8203155059638323, "grad_norm": 3.265625, "learning_rate": 1.0844185200021695e-06, "loss": 1.0687695741653442, "step": 5914 }, { "epoch": 1.8209311273566757, "grad_norm": 10.4375, "learning_rate": 1.0838463466006496e-06, "loss": 0.9370536804199219, "step": 5916 }, { "epoch": 1.821546748749519, "grad_norm": 5.375, "learning_rate": 1.0832760773239668e-06, "loss": 1.3170430660247803, "step": 5918 }, { "epoch": 1.8221623701423626, "grad_norm": 5.3125, "learning_rate": 1.0827077127388114e-06, "loss": 1.3774001598358154, "step": 5920 }, { "epoch": 1.8227779915352058, "grad_norm": 14.0625, "learning_rate": 1.0821412534099794e-06, "loss": 0.8619877696037292, "step": 5922 }, { "epoch": 1.8233936129280492, "grad_norm": 9.6875, "learning_rate": 1.0815766999003744e-06, "loss": 1.6785545349121094, "step": 5924 }, { "epoch": 1.8240092343208927, "grad_norm": 4.90625, "learning_rate": 1.0810140527710057e-06, "loss": 1.270500898361206, "step": 5926 }, { "epoch": 1.824624855713736, "grad_norm": 4.4375, "learning_rate": 1.0804533125809879e-06, "loss": 1.0367622375488281, "step": 5928 }, { "epoch": 1.8252404771065796, "grad_norm": 11.6875, "learning_rate": 1.0798944798875425e-06, "loss": 1.118377923965454, "step": 5930 }, { "epoch": 1.8258560984994228, "grad_norm": 6.40625, "learning_rate": 1.0793375552459925e-06, "loss": 1.2934290170669556, "step": 5932 }, { "epoch": 1.8264717198922664, "grad_norm": 8.3125, "learning_rate": 1.0787825392097673e-06, "loss": 1.301269292831421, "step": 5934 }, { "epoch": 1.8270873412851096, "grad_norm": 5.09375, "learning_rate": 1.0782294323303987e-06, "loss": 1.43231999874115, "step": 5936 }, { "epoch": 1.827702962677953, "grad_norm": 8.125, "learning_rate": 1.0776782351575212e-06, "loss": 1.0624133348464966, "step": 5938 }, { "epoch": 1.8283185840707965, "grad_norm": 5.09375, "learning_rate": 1.077128948238872e-06, "loss": 1.4891644716262817, "step": 5940 }, { "epoch": 1.8289342054636397, "grad_norm": 7.03125, "learning_rate": 1.0765815721202895e-06, "loss": 1.1424224376678467, "step": 5942 }, { "epoch": 1.8295498268564834, "grad_norm": 6.53125, "learning_rate": 1.076036107345714e-06, "loss": 1.5409581661224365, "step": 5944 }, { "epoch": 1.8301654482493266, "grad_norm": 4.625, "learning_rate": 1.0754925544571858e-06, "loss": 1.2086446285247803, "step": 5946 }, { "epoch": 1.83078106964217, "grad_norm": 6.375, "learning_rate": 1.0749509139948455e-06, "loss": 1.3200281858444214, "step": 5948 }, { "epoch": 1.8313966910350135, "grad_norm": 9.0, "learning_rate": 1.074411186496934e-06, "loss": 1.3422338962554932, "step": 5950 }, { "epoch": 1.832012312427857, "grad_norm": 7.0, "learning_rate": 1.0738733724997896e-06, "loss": 1.2960294485092163, "step": 5952 }, { "epoch": 1.8326279338207003, "grad_norm": 3.8125, "learning_rate": 1.0733374725378508e-06, "loss": 1.1552178859710693, "step": 5954 }, { "epoch": 1.8332435552135435, "grad_norm": 8.0625, "learning_rate": 1.0728034871436536e-06, "loss": 1.386427879333496, "step": 5956 }, { "epoch": 1.8338591766063872, "grad_norm": 5.40625, "learning_rate": 1.0722714168478306e-06, "loss": 1.6475698947906494, "step": 5958 }, { "epoch": 1.8344747979992304, "grad_norm": 5.875, "learning_rate": 1.0717412621791123e-06, "loss": 1.4369161128997803, "step": 5960 }, { "epoch": 1.8350904193920738, "grad_norm": 3.578125, "learning_rate": 1.0712130236643257e-06, "loss": 1.274042010307312, "step": 5962 }, { "epoch": 1.8357060407849173, "grad_norm": 21.125, "learning_rate": 1.0706867018283924e-06, "loss": 0.9764050245285034, "step": 5964 }, { "epoch": 1.8363216621777607, "grad_norm": 8.125, "learning_rate": 1.070162297194331e-06, "loss": 1.4333478212356567, "step": 5966 }, { "epoch": 1.8369372835706042, "grad_norm": 16.125, "learning_rate": 1.0696398102832534e-06, "loss": 1.6606907844543457, "step": 5968 }, { "epoch": 1.8375529049634474, "grad_norm": 5.125, "learning_rate": 1.0691192416143673e-06, "loss": 1.4241994619369507, "step": 5970 }, { "epoch": 1.838168526356291, "grad_norm": 6.28125, "learning_rate": 1.068600591704973e-06, "loss": 1.156654953956604, "step": 5972 }, { "epoch": 1.8387841477491342, "grad_norm": 4.75, "learning_rate": 1.0680838610704645e-06, "loss": 1.4421087503433228, "step": 5974 }, { "epoch": 1.8393997691419777, "grad_norm": 2.90625, "learning_rate": 1.0675690502243288e-06, "loss": 1.1113789081573486, "step": 5976 }, { "epoch": 1.840015390534821, "grad_norm": 7.75, "learning_rate": 1.0670561596781454e-06, "loss": 1.331964373588562, "step": 5978 }, { "epoch": 1.8406310119276645, "grad_norm": 3.8125, "learning_rate": 1.0665451899415843e-06, "loss": 1.075302004814148, "step": 5980 }, { "epoch": 1.841246633320508, "grad_norm": 9.0, "learning_rate": 1.0660361415224077e-06, "loss": 1.4577410221099854, "step": 5982 }, { "epoch": 1.8418622547133512, "grad_norm": 29.75, "learning_rate": 1.0655290149264688e-06, "loss": 1.7311087846755981, "step": 5984 }, { "epoch": 1.8424778761061948, "grad_norm": 8.0625, "learning_rate": 1.0650238106577104e-06, "loss": 1.6321951150894165, "step": 5986 }, { "epoch": 1.843093497499038, "grad_norm": 17.0, "learning_rate": 1.0645205292181662e-06, "loss": 1.5196198225021362, "step": 5988 }, { "epoch": 1.8437091188918815, "grad_norm": 7.0, "learning_rate": 1.0640191711079568e-06, "loss": 1.3114598989486694, "step": 5990 }, { "epoch": 1.844324740284725, "grad_norm": 6.25, "learning_rate": 1.063519736825294e-06, "loss": 1.561679482460022, "step": 5992 }, { "epoch": 1.8449403616775681, "grad_norm": 5.625, "learning_rate": 1.0630222268664764e-06, "loss": 1.591888666152954, "step": 5994 }, { "epoch": 1.8455559830704118, "grad_norm": 19.375, "learning_rate": 1.062526641725891e-06, "loss": 1.102703332901001, "step": 5996 }, { "epoch": 1.846171604463255, "grad_norm": 6.3125, "learning_rate": 1.0620329818960116e-06, "loss": 1.6592758893966675, "step": 5998 }, { "epoch": 1.8467872258560984, "grad_norm": 8.4375, "learning_rate": 1.0615412478673996e-06, "loss": 1.662277340888977, "step": 6000 }, { "epoch": 1.8474028472489419, "grad_norm": 9.3125, "learning_rate": 1.0610514401287015e-06, "loss": 1.3304259777069092, "step": 6002 }, { "epoch": 1.8480184686417853, "grad_norm": 6.1875, "learning_rate": 1.0605635591666505e-06, "loss": 0.9376093149185181, "step": 6004 }, { "epoch": 1.8486340900346288, "grad_norm": 3.671875, "learning_rate": 1.0600776054660646e-06, "loss": 1.0420424938201904, "step": 6006 }, { "epoch": 1.849249711427472, "grad_norm": 13.0625, "learning_rate": 1.0595935795098474e-06, "loss": 1.5878580808639526, "step": 6008 }, { "epoch": 1.8498653328203156, "grad_norm": 6.09375, "learning_rate": 1.0591114817789861e-06, "loss": 1.1772234439849854, "step": 6010 }, { "epoch": 1.8504809542131588, "grad_norm": 6.625, "learning_rate": 1.058631312752552e-06, "loss": 1.3105149269104004, "step": 6012 }, { "epoch": 1.8510965756060023, "grad_norm": 5.4375, "learning_rate": 1.0581530729076997e-06, "loss": 1.240427851676941, "step": 6014 }, { "epoch": 1.8517121969988457, "grad_norm": 2.703125, "learning_rate": 1.057676762719667e-06, "loss": 0.9596577882766724, "step": 6016 }, { "epoch": 1.8523278183916891, "grad_norm": 2.734375, "learning_rate": 1.057202382661774e-06, "loss": 1.3915760517120361, "step": 6018 }, { "epoch": 1.8529434397845326, "grad_norm": 8.6875, "learning_rate": 1.0567299332054225e-06, "loss": 1.3262848854064941, "step": 6020 }, { "epoch": 1.8535590611773758, "grad_norm": 33.25, "learning_rate": 1.0562594148200966e-06, "loss": 1.464792251586914, "step": 6022 }, { "epoch": 1.8541746825702194, "grad_norm": 7.5, "learning_rate": 1.055790827973361e-06, "loss": 1.4374839067459106, "step": 6024 }, { "epoch": 1.8547903039630627, "grad_norm": 6.59375, "learning_rate": 1.0553241731308602e-06, "loss": 1.3735730648040771, "step": 6026 }, { "epoch": 1.855405925355906, "grad_norm": 8.5, "learning_rate": 1.0548594507563207e-06, "loss": 1.1841590404510498, "step": 6028 }, { "epoch": 1.8560215467487495, "grad_norm": 12.0, "learning_rate": 1.0543966613115464e-06, "loss": 1.6395710706710815, "step": 6030 }, { "epoch": 1.856637168141593, "grad_norm": 5.03125, "learning_rate": 1.0539358052564224e-06, "loss": 1.2497833967208862, "step": 6032 }, { "epoch": 1.8572527895344364, "grad_norm": 36.0, "learning_rate": 1.0534768830489111e-06, "loss": 0.7703664898872375, "step": 6034 }, { "epoch": 1.8578684109272796, "grad_norm": 16.875, "learning_rate": 1.0530198951450542e-06, "loss": 1.0920758247375488, "step": 6036 }, { "epoch": 1.8584840323201233, "grad_norm": 10.1875, "learning_rate": 1.0525648419989705e-06, "loss": 1.3005291223526, "step": 6038 }, { "epoch": 1.8590996537129665, "grad_norm": 4.875, "learning_rate": 1.052111724062857e-06, "loss": 0.813774049282074, "step": 6040 }, { "epoch": 1.85971527510581, "grad_norm": 7.84375, "learning_rate": 1.0516605417869865e-06, "loss": 1.2903823852539062, "step": 6042 }, { "epoch": 1.8603308964986534, "grad_norm": 4.75, "learning_rate": 1.0512112956197094e-06, "loss": 0.8724620938301086, "step": 6044 }, { "epoch": 1.8609465178914966, "grad_norm": 8.8125, "learning_rate": 1.0507639860074517e-06, "loss": 1.3100684881210327, "step": 6046 }, { "epoch": 1.8615621392843402, "grad_norm": 5.375, "learning_rate": 1.0503186133947148e-06, "loss": 1.3518569469451904, "step": 6048 }, { "epoch": 1.8621777606771834, "grad_norm": 3.734375, "learning_rate": 1.0498751782240752e-06, "loss": 0.7228596210479736, "step": 6050 }, { "epoch": 1.862793382070027, "grad_norm": 5.0625, "learning_rate": 1.049433680936185e-06, "loss": 1.2877094745635986, "step": 6052 }, { "epoch": 1.8634090034628703, "grad_norm": 7.78125, "learning_rate": 1.0489941219697695e-06, "loss": 1.3102952241897583, "step": 6054 }, { "epoch": 1.8640246248557137, "grad_norm": 8.6875, "learning_rate": 1.0485565017616286e-06, "loss": 1.3873236179351807, "step": 6056 }, { "epoch": 1.8646402462485572, "grad_norm": 6.09375, "learning_rate": 1.0481208207466349e-06, "loss": 1.362139344215393, "step": 6058 }, { "epoch": 1.8652558676414004, "grad_norm": 6.03125, "learning_rate": 1.0476870793577346e-06, "loss": 1.3270050287246704, "step": 6060 }, { "epoch": 1.865871489034244, "grad_norm": 8.375, "learning_rate": 1.0472552780259464e-06, "loss": 1.2958656549453735, "step": 6062 }, { "epoch": 1.8664871104270873, "grad_norm": 6.34375, "learning_rate": 1.0468254171803607e-06, "loss": 1.2908275127410889, "step": 6064 }, { "epoch": 1.8671027318199307, "grad_norm": 5.625, "learning_rate": 1.0463974972481402e-06, "loss": 1.4341435432434082, "step": 6066 }, { "epoch": 1.8677183532127741, "grad_norm": 3.375, "learning_rate": 1.045971518654518e-06, "loss": 1.2812824249267578, "step": 6068 }, { "epoch": 1.8683339746056176, "grad_norm": 11.25, "learning_rate": 1.045547481822799e-06, "loss": 1.5339080095291138, "step": 6070 }, { "epoch": 1.868949595998461, "grad_norm": 6.40625, "learning_rate": 1.0451253871743582e-06, "loss": 1.1604589223861694, "step": 6072 }, { "epoch": 1.8695652173913042, "grad_norm": 6.625, "learning_rate": 1.0447052351286401e-06, "loss": 1.3172587156295776, "step": 6074 }, { "epoch": 1.8701808387841479, "grad_norm": 5.3125, "learning_rate": 1.0442870261031593e-06, "loss": 1.0107141733169556, "step": 6076 }, { "epoch": 1.870796460176991, "grad_norm": 7.46875, "learning_rate": 1.0438707605134996e-06, "loss": 1.1750514507293701, "step": 6078 }, { "epoch": 1.8714120815698345, "grad_norm": 5.6875, "learning_rate": 1.0434564387733138e-06, "loss": 1.1378748416900635, "step": 6080 }, { "epoch": 1.872027702962678, "grad_norm": 4.34375, "learning_rate": 1.0430440612943222e-06, "loss": 1.150460958480835, "step": 6082 }, { "epoch": 1.8726433243555214, "grad_norm": 7.84375, "learning_rate": 1.0426336284863136e-06, "loss": 1.2356613874435425, "step": 6084 }, { "epoch": 1.8732589457483648, "grad_norm": 12.5, "learning_rate": 1.0422251407571444e-06, "loss": 1.6873862743377686, "step": 6086 }, { "epoch": 1.873874567141208, "grad_norm": 8.0, "learning_rate": 1.0418185985127379e-06, "loss": 1.527343988418579, "step": 6088 }, { "epoch": 1.8744901885340517, "grad_norm": 23.25, "learning_rate": 1.041414002157084e-06, "loss": 1.393473505973816, "step": 6090 }, { "epoch": 1.875105809926895, "grad_norm": 7.1875, "learning_rate": 1.0410113520922402e-06, "loss": 1.6080451011657715, "step": 6092 }, { "epoch": 1.8757214313197383, "grad_norm": 28.0, "learning_rate": 1.0406106487183277e-06, "loss": 1.6712785959243774, "step": 6094 }, { "epoch": 1.8763370527125818, "grad_norm": 9.375, "learning_rate": 1.040211892433535e-06, "loss": 1.5237807035446167, "step": 6096 }, { "epoch": 1.8769526741054252, "grad_norm": 11.5, "learning_rate": 1.039815083634115e-06, "loss": 1.5383855104446411, "step": 6098 }, { "epoch": 1.8775682954982686, "grad_norm": 16.125, "learning_rate": 1.0394202227143857e-06, "loss": 1.5905861854553223, "step": 6100 }, { "epoch": 1.8781839168911119, "grad_norm": 16.625, "learning_rate": 1.0390273100667291e-06, "loss": 1.1382654905319214, "step": 6102 }, { "epoch": 1.8787995382839555, "grad_norm": 12.25, "learning_rate": 1.0386363460815913e-06, "loss": 0.9064276218414307, "step": 6104 }, { "epoch": 1.8794151596767987, "grad_norm": 3.078125, "learning_rate": 1.0382473311474821e-06, "loss": 1.2223496437072754, "step": 6106 }, { "epoch": 1.8800307810696422, "grad_norm": 111.0, "learning_rate": 1.037860265650974e-06, "loss": 1.2195074558258057, "step": 6108 }, { "epoch": 1.8806464024624856, "grad_norm": 8.8125, "learning_rate": 1.037475149976703e-06, "loss": 1.5098378658294678, "step": 6110 }, { "epoch": 1.8812620238553288, "grad_norm": 9.6875, "learning_rate": 1.0370919845073674e-06, "loss": 1.5166021585464478, "step": 6112 }, { "epoch": 1.8818776452481725, "grad_norm": 5.5625, "learning_rate": 1.0367107696237266e-06, "loss": 1.114842176437378, "step": 6114 }, { "epoch": 1.8824932666410157, "grad_norm": 6.4375, "learning_rate": 1.036331505704603e-06, "loss": 0.7568085789680481, "step": 6116 }, { "epoch": 1.8831088880338593, "grad_norm": 11.0, "learning_rate": 1.0359541931268793e-06, "loss": 1.5966717004776, "step": 6118 }, { "epoch": 1.8837245094267026, "grad_norm": 7.3125, "learning_rate": 1.0355788322655e-06, "loss": 1.569027304649353, "step": 6120 }, { "epoch": 1.884340130819546, "grad_norm": 2.609375, "learning_rate": 1.0352054234934688e-06, "loss": 0.9419485330581665, "step": 6122 }, { "epoch": 1.8849557522123894, "grad_norm": 7.03125, "learning_rate": 1.0348339671818509e-06, "loss": 1.1914021968841553, "step": 6124 }, { "epoch": 1.8855713736052326, "grad_norm": 4.09375, "learning_rate": 1.0344644636997705e-06, "loss": 1.2729462385177612, "step": 6126 }, { "epoch": 1.8861869949980763, "grad_norm": 6.65625, "learning_rate": 1.0340969134144118e-06, "loss": 1.2701356410980225, "step": 6128 }, { "epoch": 1.8868026163909195, "grad_norm": 7.15625, "learning_rate": 1.0337313166910176e-06, "loss": 1.2504490613937378, "step": 6130 }, { "epoch": 1.887418237783763, "grad_norm": 5.125, "learning_rate": 1.0333676738928895e-06, "loss": 1.3305485248565674, "step": 6132 }, { "epoch": 1.8880338591766064, "grad_norm": 8.8125, "learning_rate": 1.0330059853813875e-06, "loss": 1.276931881904602, "step": 6134 }, { "epoch": 1.8886494805694498, "grad_norm": 6.125, "learning_rate": 1.0326462515159297e-06, "loss": 0.7779605388641357, "step": 6136 }, { "epoch": 1.8892651019622932, "grad_norm": 6.0625, "learning_rate": 1.0322884726539915e-06, "loss": 1.1487722396850586, "step": 6138 }, { "epoch": 1.8898807233551365, "grad_norm": 7.21875, "learning_rate": 1.0319326491511062e-06, "loss": 1.1358317136764526, "step": 6140 }, { "epoch": 1.8904963447479801, "grad_norm": 2.375, "learning_rate": 1.0315787813608631e-06, "loss": 1.1888786554336548, "step": 6142 }, { "epoch": 1.8911119661408233, "grad_norm": 9.6875, "learning_rate": 1.031226869634909e-06, "loss": 0.9452347159385681, "step": 6144 }, { "epoch": 1.8917275875336668, "grad_norm": 5.09375, "learning_rate": 1.0308769143229458e-06, "loss": 1.399794578552246, "step": 6146 }, { "epoch": 1.8923432089265102, "grad_norm": 22.25, "learning_rate": 1.0305289157727326e-06, "loss": 1.4802740812301636, "step": 6148 }, { "epoch": 1.8929588303193536, "grad_norm": 14.625, "learning_rate": 1.030182874330083e-06, "loss": 1.3717149496078491, "step": 6150 }, { "epoch": 1.893574451712197, "grad_norm": 3.34375, "learning_rate": 1.0298387903388665e-06, "loss": 1.2769005298614502, "step": 6152 }, { "epoch": 1.8941900731050403, "grad_norm": 9.75, "learning_rate": 1.0294966641410067e-06, "loss": 1.3929970264434814, "step": 6154 }, { "epoch": 1.894805694497884, "grad_norm": 8.0, "learning_rate": 1.0291564960764822e-06, "loss": 1.286952018737793, "step": 6156 }, { "epoch": 1.8954213158907272, "grad_norm": 8.8125, "learning_rate": 1.028818286483326e-06, "loss": 1.2915070056915283, "step": 6158 }, { "epoch": 1.8960369372835706, "grad_norm": 7.375, "learning_rate": 1.0284820356976239e-06, "loss": 1.7561068534851074, "step": 6160 }, { "epoch": 1.896652558676414, "grad_norm": 12.9375, "learning_rate": 1.0281477440535157e-06, "loss": 0.9700071215629578, "step": 6162 }, { "epoch": 1.8972681800692575, "grad_norm": 5.5625, "learning_rate": 1.0278154118831956e-06, "loss": 1.5087077617645264, "step": 6164 }, { "epoch": 1.897883801462101, "grad_norm": 6.6875, "learning_rate": 1.0274850395169086e-06, "loss": 1.482177972793579, "step": 6166 }, { "epoch": 1.898499422854944, "grad_norm": 5.6875, "learning_rate": 1.0271566272829532e-06, "loss": 1.3078364133834839, "step": 6168 }, { "epoch": 1.8991150442477878, "grad_norm": 3.953125, "learning_rate": 1.0268301755076806e-06, "loss": 1.3884791135787964, "step": 6170 }, { "epoch": 1.899730665640631, "grad_norm": 6.34375, "learning_rate": 1.0265056845154927e-06, "loss": 0.9594384431838989, "step": 6172 }, { "epoch": 1.9003462870334744, "grad_norm": 11.1875, "learning_rate": 1.0261831546288435e-06, "loss": 1.4539716243743896, "step": 6174 }, { "epoch": 1.9009619084263178, "grad_norm": 9.0625, "learning_rate": 1.0258625861682383e-06, "loss": 1.788408637046814, "step": 6176 }, { "epoch": 1.901577529819161, "grad_norm": 4.8125, "learning_rate": 1.0255439794522332e-06, "loss": 1.148524284362793, "step": 6178 }, { "epoch": 1.9021931512120047, "grad_norm": 8.0625, "learning_rate": 1.025227334797435e-06, "loss": 1.4598939418792725, "step": 6180 }, { "epoch": 1.902808772604848, "grad_norm": 4.625, "learning_rate": 1.0249126525185e-06, "loss": 1.4986612796783447, "step": 6182 }, { "epoch": 1.9034243939976914, "grad_norm": 7.84375, "learning_rate": 1.0245999329281356e-06, "loss": 1.09617280960083, "step": 6184 }, { "epoch": 1.9040400153905348, "grad_norm": 7.1875, "learning_rate": 1.024289176337098e-06, "loss": 1.5821504592895508, "step": 6186 }, { "epoch": 1.9046556367833782, "grad_norm": 4.21875, "learning_rate": 1.0239803830541933e-06, "loss": 1.1723260879516602, "step": 6188 }, { "epoch": 1.9052712581762217, "grad_norm": 6.5625, "learning_rate": 1.023673553386276e-06, "loss": 1.4746092557907104, "step": 6190 }, { "epoch": 1.9058868795690649, "grad_norm": 19.75, "learning_rate": 1.0233686876382493e-06, "loss": 1.318878412246704, "step": 6192 }, { "epoch": 1.9065025009619085, "grad_norm": 5.5, "learning_rate": 1.023065786113066e-06, "loss": 1.4316555261611938, "step": 6194 }, { "epoch": 1.9071181223547518, "grad_norm": 8.375, "learning_rate": 1.0227648491117256e-06, "loss": 1.5165979862213135, "step": 6196 }, { "epoch": 1.9077337437475952, "grad_norm": 7.28125, "learning_rate": 1.0224658769332758e-06, "loss": 1.47892427444458, "step": 6198 }, { "epoch": 1.9083493651404386, "grad_norm": 17.5, "learning_rate": 1.0221688698748124e-06, "loss": 1.3471444845199585, "step": 6200 }, { "epoch": 1.908964986533282, "grad_norm": 7.125, "learning_rate": 1.0218738282314776e-06, "loss": 0.6739861369132996, "step": 6202 }, { "epoch": 1.9095806079261255, "grad_norm": 8.5, "learning_rate": 1.021580752296461e-06, "loss": 0.8699933886528015, "step": 6204 }, { "epoch": 1.9101962293189687, "grad_norm": 5.75, "learning_rate": 1.0212896423609986e-06, "loss": 1.5890634059906006, "step": 6206 }, { "epoch": 1.9108118507118124, "grad_norm": 7.3125, "learning_rate": 1.0210004987143736e-06, "loss": 1.2174679040908813, "step": 6208 }, { "epoch": 1.9114274721046556, "grad_norm": 4.25, "learning_rate": 1.0207133216439136e-06, "loss": 1.385654091835022, "step": 6210 }, { "epoch": 1.912043093497499, "grad_norm": 4.3125, "learning_rate": 1.020428111434993e-06, "loss": 1.5168359279632568, "step": 6212 }, { "epoch": 1.9126587148903424, "grad_norm": 8.6875, "learning_rate": 1.020144868371032e-06, "loss": 1.3468291759490967, "step": 6214 }, { "epoch": 1.9132743362831859, "grad_norm": 2.671875, "learning_rate": 1.0198635927334954e-06, "loss": 1.1638237237930298, "step": 6216 }, { "epoch": 1.9138899576760293, "grad_norm": 49.5, "learning_rate": 1.0195842848018932e-06, "loss": 1.425129771232605, "step": 6218 }, { "epoch": 1.9145055790688725, "grad_norm": 6.15625, "learning_rate": 1.01930694485378e-06, "loss": 1.3448690176010132, "step": 6220 }, { "epoch": 1.9151212004617162, "grad_norm": 6.28125, "learning_rate": 1.0190315731647542e-06, "loss": 1.4132188558578491, "step": 6222 }, { "epoch": 1.9157368218545594, "grad_norm": 6.625, "learning_rate": 1.0187581700084593e-06, "loss": 1.5360431671142578, "step": 6224 }, { "epoch": 1.9163524432474028, "grad_norm": 5.375, "learning_rate": 1.018486735656582e-06, "loss": 1.2933526039123535, "step": 6226 }, { "epoch": 1.9169680646402463, "grad_norm": 25.0, "learning_rate": 1.0182172703788529e-06, "loss": 1.4225658178329468, "step": 6228 }, { "epoch": 1.9175836860330895, "grad_norm": 1.8828125, "learning_rate": 1.0179497744430456e-06, "loss": 1.0137361288070679, "step": 6230 }, { "epoch": 1.9181993074259331, "grad_norm": 6.625, "learning_rate": 1.0176842481149765e-06, "loss": 1.0409231185913086, "step": 6232 }, { "epoch": 1.9188149288187764, "grad_norm": 7.34375, "learning_rate": 1.0174206916585056e-06, "loss": 1.2320098876953125, "step": 6234 }, { "epoch": 1.91943055021162, "grad_norm": 4.0, "learning_rate": 1.017159105335534e-06, "loss": 1.1500188112258911, "step": 6236 }, { "epoch": 1.9200461716044632, "grad_norm": 8.875, "learning_rate": 1.016899489406007e-06, "loss": 1.364874005317688, "step": 6238 }, { "epoch": 1.9206617929973067, "grad_norm": 7.1875, "learning_rate": 1.0166418441279101e-06, "loss": 1.4855751991271973, "step": 6240 }, { "epoch": 1.92127741439015, "grad_norm": 3.015625, "learning_rate": 1.0163861697572714e-06, "loss": 1.0191508531570435, "step": 6242 }, { "epoch": 1.9218930357829933, "grad_norm": 6.15625, "learning_rate": 1.0161324665481598e-06, "loss": 1.259505033493042, "step": 6244 }, { "epoch": 1.922508657175837, "grad_norm": 7.78125, "learning_rate": 1.0158807347526865e-06, "loss": 1.6142199039459229, "step": 6246 }, { "epoch": 1.9231242785686802, "grad_norm": 4.75, "learning_rate": 1.0156309746210028e-06, "loss": 1.6033637523651123, "step": 6248 }, { "epoch": 1.9237398999615236, "grad_norm": 5.90625, "learning_rate": 1.0153831864013008e-06, "loss": 1.5636179447174072, "step": 6250 }, { "epoch": 1.924355521354367, "grad_norm": 14.4375, "learning_rate": 1.0151373703398134e-06, "loss": 1.2434797286987305, "step": 6252 }, { "epoch": 1.9249711427472105, "grad_norm": 10.75, "learning_rate": 1.0148935266808134e-06, "loss": 1.4169031381607056, "step": 6254 }, { "epoch": 1.925586764140054, "grad_norm": 6.25, "learning_rate": 1.0146516556666135e-06, "loss": 1.2062106132507324, "step": 6256 }, { "epoch": 1.9262023855328971, "grad_norm": 6.6875, "learning_rate": 1.0144117575375667e-06, "loss": 1.5046076774597168, "step": 6258 }, { "epoch": 1.9268180069257408, "grad_norm": 3.15625, "learning_rate": 1.0141738325320656e-06, "loss": 1.0233358144760132, "step": 6260 }, { "epoch": 1.927433628318584, "grad_norm": 5.625, "learning_rate": 1.0139378808865404e-06, "loss": 1.2588615417480469, "step": 6262 }, { "epoch": 1.9280492497114274, "grad_norm": 2.828125, "learning_rate": 1.0137039028354624e-06, "loss": 1.157249093055725, "step": 6264 }, { "epoch": 1.9286648711042709, "grad_norm": 7.96875, "learning_rate": 1.0134718986113406e-06, "loss": 1.1970008611679077, "step": 6266 }, { "epoch": 1.9292804924971143, "grad_norm": 5.125, "learning_rate": 1.0132418684447227e-06, "loss": 1.3345950841903687, "step": 6268 }, { "epoch": 1.9298961138899577, "grad_norm": 8.0, "learning_rate": 1.013013812564195e-06, "loss": 1.1379836797714233, "step": 6270 }, { "epoch": 1.930511735282801, "grad_norm": 13.8125, "learning_rate": 1.0127877311963818e-06, "loss": 1.1116348505020142, "step": 6272 }, { "epoch": 1.9311273566756446, "grad_norm": 4.28125, "learning_rate": 1.0125636245659453e-06, "loss": 1.2857990264892578, "step": 6274 }, { "epoch": 1.9317429780684878, "grad_norm": 5.34375, "learning_rate": 1.012341492895585e-06, "loss": 1.5975061655044556, "step": 6276 }, { "epoch": 1.9323585994613313, "grad_norm": 3.796875, "learning_rate": 1.0121213364060383e-06, "loss": 1.213213324546814, "step": 6278 }, { "epoch": 1.9329742208541747, "grad_norm": 7.71875, "learning_rate": 1.0119031553160791e-06, "loss": 1.5098799467086792, "step": 6280 }, { "epoch": 1.9335898422470181, "grad_norm": 7.375, "learning_rate": 1.01168694984252e-06, "loss": 1.2202961444854736, "step": 6282 }, { "epoch": 1.9342054636398616, "grad_norm": 6.40625, "learning_rate": 1.011472720200208e-06, "loss": 1.6116353273391724, "step": 6284 }, { "epoch": 1.9348210850327048, "grad_norm": 5.28125, "learning_rate": 1.0112604666020288e-06, "loss": 1.3997621536254883, "step": 6286 }, { "epoch": 1.9354367064255484, "grad_norm": 15.6875, "learning_rate": 1.011050189258903e-06, "loss": 1.4360119104385376, "step": 6288 }, { "epoch": 1.9360523278183916, "grad_norm": 11.1875, "learning_rate": 1.010841888379788e-06, "loss": 1.6121647357940674, "step": 6290 }, { "epoch": 1.936667949211235, "grad_norm": 3.203125, "learning_rate": 1.0106355641716772e-06, "loss": 1.1204249858856201, "step": 6292 }, { "epoch": 1.9372835706040785, "grad_norm": 5.53125, "learning_rate": 1.0104312168395996e-06, "loss": 0.9558459520339966, "step": 6294 }, { "epoch": 1.9378991919969217, "grad_norm": 6.40625, "learning_rate": 1.0102288465866196e-06, "loss": 1.29323148727417, "step": 6296 }, { "epoch": 1.9385148133897654, "grad_norm": 4.28125, "learning_rate": 1.0100284536138372e-06, "loss": 1.245251178741455, "step": 6298 }, { "epoch": 1.9391304347826086, "grad_norm": 18.125, "learning_rate": 1.0098300381203873e-06, "loss": 1.4215116500854492, "step": 6300 }, { "epoch": 1.9397460561754523, "grad_norm": 8.5, "learning_rate": 1.0096336003034398e-06, "loss": 1.6721007823944092, "step": 6302 }, { "epoch": 1.9403616775682955, "grad_norm": 4.5, "learning_rate": 1.0094391403581991e-06, "loss": 1.408481478691101, "step": 6304 }, { "epoch": 1.940977298961139, "grad_norm": 7.9375, "learning_rate": 1.0092466584779052e-06, "loss": 1.1964179277420044, "step": 6306 }, { "epoch": 1.9415929203539823, "grad_norm": 6.25, "learning_rate": 1.009056154853831e-06, "loss": 1.6166949272155762, "step": 6308 }, { "epoch": 1.9422085417468256, "grad_norm": 7.78125, "learning_rate": 1.008867629675284e-06, "loss": 1.4972113370895386, "step": 6310 }, { "epoch": 1.9428241631396692, "grad_norm": 13.1875, "learning_rate": 1.0086810831296071e-06, "loss": 1.3295609951019287, "step": 6312 }, { "epoch": 1.9434397845325124, "grad_norm": 6.28125, "learning_rate": 1.0084965154021741e-06, "loss": 1.528801679611206, "step": 6314 }, { "epoch": 1.9440554059253559, "grad_norm": 3.109375, "learning_rate": 1.0083139266763955e-06, "loss": 1.281317949295044, "step": 6316 }, { "epoch": 1.9446710273181993, "grad_norm": 4.84375, "learning_rate": 1.0081333171337132e-06, "loss": 1.282319188117981, "step": 6318 }, { "epoch": 1.9452866487110427, "grad_norm": 7.65625, "learning_rate": 1.0079546869536027e-06, "loss": 1.2659920454025269, "step": 6320 }, { "epoch": 1.9459022701038862, "grad_norm": 5.0, "learning_rate": 1.0077780363135736e-06, "loss": 1.0919814109802246, "step": 6322 }, { "epoch": 1.9465178914967294, "grad_norm": 10.625, "learning_rate": 1.0076033653891667e-06, "loss": 1.3609809875488281, "step": 6324 }, { "epoch": 1.947133512889573, "grad_norm": 2.6875, "learning_rate": 1.007430674353957e-06, "loss": 1.2212693691253662, "step": 6326 }, { "epoch": 1.9477491342824163, "grad_norm": 5.96875, "learning_rate": 1.0072599633795512e-06, "loss": 1.3360657691955566, "step": 6328 }, { "epoch": 1.9483647556752597, "grad_norm": 5.375, "learning_rate": 1.007091232635589e-06, "loss": 1.6322662830352783, "step": 6330 }, { "epoch": 1.9489803770681031, "grad_norm": 9.8125, "learning_rate": 1.0069244822897413e-06, "loss": 0.840678334236145, "step": 6332 }, { "epoch": 1.9495959984609466, "grad_norm": 12.625, "learning_rate": 1.006759712507712e-06, "loss": 1.6254369020462036, "step": 6334 }, { "epoch": 1.95021161985379, "grad_norm": 9.0, "learning_rate": 1.0065969234532367e-06, "loss": 0.9958906769752502, "step": 6336 }, { "epoch": 1.9508272412466332, "grad_norm": 5.90625, "learning_rate": 1.0064361152880823e-06, "loss": 1.294455885887146, "step": 6338 }, { "epoch": 1.9514428626394769, "grad_norm": 10.875, "learning_rate": 1.0062772881720476e-06, "loss": 1.2550309896469116, "step": 6340 }, { "epoch": 1.95205848403232, "grad_norm": 5.125, "learning_rate": 1.0061204422629625e-06, "loss": 1.6748965978622437, "step": 6342 }, { "epoch": 1.9526741054251635, "grad_norm": 6.8125, "learning_rate": 1.0059655777166883e-06, "loss": 1.3717496395111084, "step": 6344 }, { "epoch": 1.953289726818007, "grad_norm": 10.375, "learning_rate": 1.0058126946871174e-06, "loss": 1.4777202606201172, "step": 6346 }, { "epoch": 1.9539053482108504, "grad_norm": 3.78125, "learning_rate": 1.0056617933261735e-06, "loss": 1.1079200506210327, "step": 6348 }, { "epoch": 1.9545209696036938, "grad_norm": 3.0, "learning_rate": 1.0055128737838101e-06, "loss": 1.182327389717102, "step": 6350 }, { "epoch": 1.955136590996537, "grad_norm": 23.625, "learning_rate": 1.0053659362080123e-06, "loss": 1.4486178159713745, "step": 6352 }, { "epoch": 1.9557522123893807, "grad_norm": 13.1875, "learning_rate": 1.0052209807447948e-06, "loss": 1.0059131383895874, "step": 6354 }, { "epoch": 1.956367833782224, "grad_norm": 5.3125, "learning_rate": 1.0050780075382033e-06, "loss": 1.1534645557403564, "step": 6356 }, { "epoch": 1.9569834551750673, "grad_norm": 13.0, "learning_rate": 1.0049370167303138e-06, "loss": 1.2540004253387451, "step": 6358 }, { "epoch": 1.9575990765679108, "grad_norm": 7.1875, "learning_rate": 1.0047980084612318e-06, "loss": 1.0319916009902954, "step": 6360 }, { "epoch": 1.958214697960754, "grad_norm": 12.3125, "learning_rate": 1.0046609828690929e-06, "loss": 1.5139015913009644, "step": 6362 }, { "epoch": 1.9588303193535976, "grad_norm": 6.5625, "learning_rate": 1.0045259400900622e-06, "loss": 1.390751838684082, "step": 6364 }, { "epoch": 1.9594459407464409, "grad_norm": 5.125, "learning_rate": 1.0043928802583352e-06, "loss": 1.1808812618255615, "step": 6366 }, { "epoch": 1.9600615621392843, "grad_norm": 2.3125, "learning_rate": 1.0042618035061364e-06, "loss": 1.1729049682617188, "step": 6368 }, { "epoch": 1.9606771835321277, "grad_norm": 6.125, "learning_rate": 1.0041327099637196e-06, "loss": 1.105539083480835, "step": 6370 }, { "epoch": 1.9612928049249712, "grad_norm": 7.96875, "learning_rate": 1.0040055997593677e-06, "loss": 1.1965091228485107, "step": 6372 }, { "epoch": 1.9619084263178146, "grad_norm": 10.625, "learning_rate": 1.0038804730193933e-06, "loss": 1.5604122877120972, "step": 6374 }, { "epoch": 1.9625240477106578, "grad_norm": 5.75, "learning_rate": 1.0037573298681375e-06, "loss": 1.297991156578064, "step": 6376 }, { "epoch": 1.9631396691035015, "grad_norm": 12.6875, "learning_rate": 1.0036361704279705e-06, "loss": 1.061846375465393, "step": 6378 }, { "epoch": 1.9637552904963447, "grad_norm": 6.53125, "learning_rate": 1.0035169948192912e-06, "loss": 1.0523725748062134, "step": 6380 }, { "epoch": 1.964370911889188, "grad_norm": 5.25, "learning_rate": 1.003399803160527e-06, "loss": 1.1259384155273438, "step": 6382 }, { "epoch": 1.9649865332820315, "grad_norm": 8.875, "learning_rate": 1.0032845955681337e-06, "loss": 1.567067265510559, "step": 6384 }, { "epoch": 1.965602154674875, "grad_norm": 17.0, "learning_rate": 1.0031713721565957e-06, "loss": 1.315956950187683, "step": 6386 }, { "epoch": 1.9662177760677184, "grad_norm": 9.3125, "learning_rate": 1.003060133038426e-06, "loss": 1.0785685777664185, "step": 6388 }, { "epoch": 1.9668333974605616, "grad_norm": 10.0, "learning_rate": 1.002950878324165e-06, "loss": 1.875533103942871, "step": 6390 }, { "epoch": 1.9674490188534053, "grad_norm": 6.09375, "learning_rate": 1.0028436081223818e-06, "loss": 1.2702431678771973, "step": 6392 }, { "epoch": 1.9680646402462485, "grad_norm": 38.5, "learning_rate": 1.0027383225396731e-06, "loss": 1.201972484588623, "step": 6394 }, { "epoch": 1.968680261639092, "grad_norm": 4.6875, "learning_rate": 1.0026350216806638e-06, "loss": 1.2024223804473877, "step": 6396 }, { "epoch": 1.9692958830319354, "grad_norm": 9.9375, "learning_rate": 1.0025337056480055e-06, "loss": 1.4981609582901, "step": 6398 }, { "epoch": 1.9699115044247788, "grad_norm": 5.96875, "learning_rate": 1.0024343745423792e-06, "loss": 1.516045093536377, "step": 6400 }, { "epoch": 1.9705271258176222, "grad_norm": 10.375, "learning_rate": 1.002337028462492e-06, "loss": 1.532149076461792, "step": 6402 }, { "epoch": 1.9711427472104655, "grad_norm": 8.875, "learning_rate": 1.002241667505079e-06, "loss": 1.6822595596313477, "step": 6404 }, { "epoch": 1.971758368603309, "grad_norm": 6.25, "learning_rate": 1.0021482917649021e-06, "loss": 1.4750088453292847, "step": 6406 }, { "epoch": 1.9723739899961523, "grad_norm": 4.78125, "learning_rate": 1.0020569013347512e-06, "loss": 0.9572700262069702, "step": 6408 }, { "epoch": 1.9729896113889958, "grad_norm": 2.71875, "learning_rate": 1.0019674963054432e-06, "loss": 1.1694449186325073, "step": 6410 }, { "epoch": 1.9736052327818392, "grad_norm": 61.25, "learning_rate": 1.0018800767658216e-06, "loss": 1.1208440065383911, "step": 6412 }, { "epoch": 1.9742208541746824, "grad_norm": 11.0625, "learning_rate": 1.0017946428027572e-06, "loss": 1.4088279008865356, "step": 6414 }, { "epoch": 1.974836475567526, "grad_norm": 10.1875, "learning_rate": 1.0017111945011477e-06, "loss": 1.4532158374786377, "step": 6416 }, { "epoch": 1.9754520969603693, "grad_norm": 6.03125, "learning_rate": 1.0016297319439175e-06, "loss": 1.4767062664031982, "step": 6418 }, { "epoch": 1.976067718353213, "grad_norm": 3.625, "learning_rate": 1.0015502552120178e-06, "loss": 1.2021182775497437, "step": 6420 }, { "epoch": 1.9766833397460561, "grad_norm": 8.5625, "learning_rate": 1.0014727643844265e-06, "loss": 1.348162293434143, "step": 6422 }, { "epoch": 1.9772989611388996, "grad_norm": 9.4375, "learning_rate": 1.001397259538148e-06, "loss": 1.5533214807510376, "step": 6424 }, { "epoch": 1.977914582531743, "grad_norm": 6.125, "learning_rate": 1.0013237407482126e-06, "loss": 1.439581274986267, "step": 6426 }, { "epoch": 1.9785302039245862, "grad_norm": 6.28125, "learning_rate": 1.0012522080876784e-06, "loss": 1.3912233114242554, "step": 6428 }, { "epoch": 1.9791458253174299, "grad_norm": 8.5, "learning_rate": 1.0011826616276283e-06, "loss": 2.146826982498169, "step": 6430 }, { "epoch": 1.979761446710273, "grad_norm": 18.625, "learning_rate": 1.0011151014371728e-06, "loss": 1.7773079872131348, "step": 6432 }, { "epoch": 1.9803770681031165, "grad_norm": 6.3125, "learning_rate": 1.0010495275834475e-06, "loss": 1.4938530921936035, "step": 6434 }, { "epoch": 1.98099268949596, "grad_norm": 6.71875, "learning_rate": 1.000985940131615e-06, "loss": 1.3083902597427368, "step": 6436 }, { "epoch": 1.9816083108888034, "grad_norm": 8.125, "learning_rate": 1.0009243391448629e-06, "loss": 1.6811070442199707, "step": 6438 }, { "epoch": 1.9822239322816468, "grad_norm": 5.71875, "learning_rate": 1.0008647246844064e-06, "loss": 1.0163898468017578, "step": 6440 }, { "epoch": 1.98283955367449, "grad_norm": 9.4375, "learning_rate": 1.000807096809485e-06, "loss": 0.9855446219444275, "step": 6442 }, { "epoch": 1.9834551750673337, "grad_norm": 6.3125, "learning_rate": 1.0007514555773652e-06, "loss": 1.5436961650848389, "step": 6444 }, { "epoch": 1.984070796460177, "grad_norm": 10.375, "learning_rate": 1.0006978010433386e-06, "loss": 0.9845457673072815, "step": 6446 }, { "epoch": 1.9846864178530204, "grad_norm": 7.09375, "learning_rate": 1.000646133260723e-06, "loss": 1.182098388671875, "step": 6448 }, { "epoch": 1.9853020392458638, "grad_norm": 2.453125, "learning_rate": 1.0005964522808626e-06, "loss": 1.1003978252410889, "step": 6450 }, { "epoch": 1.9859176606387072, "grad_norm": 5.25, "learning_rate": 1.0005487581531254e-06, "loss": 1.3184493780136108, "step": 6452 }, { "epoch": 1.9865332820315507, "grad_norm": 18.375, "learning_rate": 1.0005030509249064e-06, "loss": 0.7050902247428894, "step": 6454 }, { "epoch": 1.9871489034243939, "grad_norm": 3.359375, "learning_rate": 1.0004593306416267e-06, "loss": 1.2746435403823853, "step": 6456 }, { "epoch": 1.9877645248172375, "grad_norm": 8.75, "learning_rate": 1.000417597346731e-06, "loss": 1.3945417404174805, "step": 6458 }, { "epoch": 1.9883801462100807, "grad_norm": 12.1875, "learning_rate": 1.0003778510816915e-06, "loss": 1.4266530275344849, "step": 6460 }, { "epoch": 1.9889957676029242, "grad_norm": 8.25, "learning_rate": 1.000340091886004e-06, "loss": 1.3669017553329468, "step": 6462 }, { "epoch": 1.9896113889957676, "grad_norm": 9.1875, "learning_rate": 1.0003043197971917e-06, "loss": 1.1172243356704712, "step": 6464 }, { "epoch": 1.990227010388611, "grad_norm": 6.75, "learning_rate": 1.0002705348508016e-06, "loss": 1.4020445346832275, "step": 6466 }, { "epoch": 1.9908426317814545, "grad_norm": 5.90625, "learning_rate": 1.0002387370804063e-06, "loss": 1.021568775177002, "step": 6468 }, { "epoch": 1.9914582531742977, "grad_norm": 9.25, "learning_rate": 1.0002089265176046e-06, "loss": 1.5302304029464722, "step": 6470 }, { "epoch": 1.9920738745671414, "grad_norm": 8.9375, "learning_rate": 1.0001811031920195e-06, "loss": 1.5604091882705688, "step": 6472 }, { "epoch": 1.9926894959599846, "grad_norm": 7.96875, "learning_rate": 1.0001552671312996e-06, "loss": 1.1880793571472168, "step": 6474 }, { "epoch": 1.993305117352828, "grad_norm": 7.5, "learning_rate": 1.0001314183611194e-06, "loss": 1.5420341491699219, "step": 6476 }, { "epoch": 1.9939207387456714, "grad_norm": 9.0, "learning_rate": 1.0001095569051772e-06, "loss": 1.7072868347167969, "step": 6478 }, { "epoch": 1.9945363601385147, "grad_norm": 13.0, "learning_rate": 1.0000896827851974e-06, "loss": 1.5550557374954224, "step": 6480 }, { "epoch": 1.9951519815313583, "grad_norm": 14.6875, "learning_rate": 1.0000717960209295e-06, "loss": 0.8384658694267273, "step": 6482 }, { "epoch": 1.9957676029242015, "grad_norm": 3.34375, "learning_rate": 1.0000558966301483e-06, "loss": 0.9205080270767212, "step": 6484 }, { "epoch": 1.9963832243170452, "grad_norm": 16.875, "learning_rate": 1.0000419846286524e-06, "loss": 1.325616478919983, "step": 6486 }, { "epoch": 1.9969988457098884, "grad_norm": 3.046875, "learning_rate": 1.0000300600302676e-06, "loss": 0.5225828886032104, "step": 6488 }, { "epoch": 1.9976144671027318, "grad_norm": 7.6875, "learning_rate": 1.0000201228468429e-06, "loss": 1.0502115488052368, "step": 6490 }, { "epoch": 1.9982300884955753, "grad_norm": 5.71875, "learning_rate": 1.0000121730882534e-06, "loss": 1.224287748336792, "step": 6492 }, { "epoch": 1.9988457098884185, "grad_norm": 17.75, "learning_rate": 1.000006210762399e-06, "loss": 1.3089892864227295, "step": 6494 }, { "epoch": 1.9994613312812621, "grad_norm": 9.125, "learning_rate": 1.0000022358752043e-06, "loss": 1.4170578718185425, "step": 6496 }, { "epoch": 2.0, "grad_norm": 7.375, "learning_rate": 1.0000002484306195e-06, "loss": 1.3172667026519775, "step": 6498 }, { "epoch": 2.0, "step": 6498, "total_flos": 2.5760029558366536e+18, "train_loss": 1.3367212960888106, "train_runtime": 21551.3774, "train_samples_per_second": 1.206, "train_steps_per_second": 0.302 } ], "logging_steps": 2, "max_steps": 6498, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5760029558366536e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }