{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 6498, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006156213928434013, "grad_norm": 3.078125, "learning_rate": 2.5641025641025643e-08, "loss": 1.2494438886642456, "step": 2 }, { "epoch": 0.0012312427856868025, "grad_norm": 10.0625, "learning_rate": 7.692307692307694e-08, "loss": 1.9442476034164429, "step": 4 }, { "epoch": 0.001846864178530204, "grad_norm": 4.84375, "learning_rate": 1.282051282051282e-07, "loss": 1.6205459833145142, "step": 6 }, { "epoch": 0.002462485571373605, "grad_norm": 5.375, "learning_rate": 1.7948717948717948e-07, "loss": 1.9548132419586182, "step": 8 }, { "epoch": 0.0030781069642170067, "grad_norm": 7.9375, "learning_rate": 2.307692307692308e-07, "loss": 2.279244899749756, "step": 10 }, { "epoch": 0.003693728357060408, "grad_norm": 10.3125, "learning_rate": 2.820512820512821e-07, "loss": 1.478683590888977, "step": 12 }, { "epoch": 0.004309349749903809, "grad_norm": 22.625, "learning_rate": 3.3333333333333335e-07, "loss": 2.582879066467285, "step": 14 }, { "epoch": 0.00492497114274721, "grad_norm": 7.1875, "learning_rate": 3.846153846153847e-07, "loss": 1.8323612213134766, "step": 16 }, { "epoch": 0.005540592535590611, "grad_norm": 7.84375, "learning_rate": 4.358974358974359e-07, "loss": 1.6451525688171387, "step": 18 }, { "epoch": 0.0061562139284340135, "grad_norm": 8.625, "learning_rate": 4.871794871794872e-07, "loss": 1.5036852359771729, "step": 20 }, { "epoch": 0.006771835321277415, "grad_norm": 7.6875, "learning_rate": 5.384615384615386e-07, "loss": 1.8006218671798706, "step": 22 }, { "epoch": 0.007387456714120816, "grad_norm": 6.1875, "learning_rate": 5.897435897435898e-07, "loss": 1.83491051197052, "step": 24 }, { "epoch": 0.008003078106964217, "grad_norm": 2.890625, "learning_rate": 6.41025641025641e-07, "loss": 1.6935982704162598, "step": 26 }, { "epoch": 0.008618699499807618, "grad_norm": 5.8125, "learning_rate": 6.923076923076924e-07, "loss": 1.590232491493225, "step": 28 }, { "epoch": 0.00923432089265102, "grad_norm": 3.546875, "learning_rate": 7.435897435897436e-07, "loss": 1.8031585216522217, "step": 30 }, { "epoch": 0.00984994228549442, "grad_norm": 8.125, "learning_rate": 7.948717948717949e-07, "loss": 1.8574869632720947, "step": 32 }, { "epoch": 0.010465563678337822, "grad_norm": 5.90625, "learning_rate": 8.461538461538463e-07, "loss": 2.2104434967041016, "step": 34 }, { "epoch": 0.011081185071181223, "grad_norm": 4.125, "learning_rate": 8.974358974358975e-07, "loss": 1.4840224981307983, "step": 36 }, { "epoch": 0.011696806464024625, "grad_norm": 12.6875, "learning_rate": 9.487179487179487e-07, "loss": 1.8832218647003174, "step": 38 }, { "epoch": 0.012312427856868027, "grad_norm": 2.71875, "learning_rate": 1.0000000000000002e-06, "loss": 1.1985594034194946, "step": 40 }, { "epoch": 0.012928049249711427, "grad_norm": 8.0625, "learning_rate": 1.0512820512820514e-06, "loss": 1.2397284507751465, "step": 42 }, { "epoch": 0.01354367064255483, "grad_norm": 8.125, "learning_rate": 1.1025641025641026e-06, "loss": 2.0114519596099854, "step": 44 }, { "epoch": 0.01415929203539823, "grad_norm": 2.859375, "learning_rate": 1.153846153846154e-06, "loss": 1.4508113861083984, "step": 46 }, { "epoch": 0.014774913428241632, "grad_norm": 14.625, "learning_rate": 1.2051282051282053e-06, "loss": 1.73067307472229, "step": 48 }, { "epoch": 0.015390534821085032, "grad_norm": 2.40625, "learning_rate": 1.2564102564102565e-06, "loss": 1.3646880388259888, "step": 50 }, { "epoch": 0.016006156213928435, "grad_norm": 9.875, "learning_rate": 1.307692307692308e-06, "loss": 1.6563876867294312, "step": 52 }, { "epoch": 0.016621777606771835, "grad_norm": 7.71875, "learning_rate": 1.358974358974359e-06, "loss": 1.985392689704895, "step": 54 }, { "epoch": 0.017237398999615235, "grad_norm": 8.4375, "learning_rate": 1.4102564102564104e-06, "loss": 1.5193960666656494, "step": 56 }, { "epoch": 0.01785302039245864, "grad_norm": 3.453125, "learning_rate": 1.4615384615384618e-06, "loss": 1.9381763935089111, "step": 58 }, { "epoch": 0.01846864178530204, "grad_norm": 6.40625, "learning_rate": 1.5128205128205128e-06, "loss": 1.8013283014297485, "step": 60 }, { "epoch": 0.01908426317814544, "grad_norm": 10.8125, "learning_rate": 1.5641025641025642e-06, "loss": 1.5148835182189941, "step": 62 }, { "epoch": 0.01969988457098884, "grad_norm": 2.484375, "learning_rate": 1.6153846153846157e-06, "loss": 1.1726927757263184, "step": 64 }, { "epoch": 0.020315505963832244, "grad_norm": 9.5, "learning_rate": 1.6666666666666667e-06, "loss": 1.6863510608673096, "step": 66 }, { "epoch": 0.020931127356675645, "grad_norm": 5.15625, "learning_rate": 1.717948717948718e-06, "loss": 1.2719526290893555, "step": 68 }, { "epoch": 0.021546748749519045, "grad_norm": 7.5, "learning_rate": 1.7692307692307695e-06, "loss": 2.3042705059051514, "step": 70 }, { "epoch": 0.022162370142362445, "grad_norm": 6.78125, "learning_rate": 1.8205128205128205e-06, "loss": 1.8919185400009155, "step": 72 }, { "epoch": 0.02277799153520585, "grad_norm": 7.1875, "learning_rate": 1.871794871794872e-06, "loss": 1.6833558082580566, "step": 74 }, { "epoch": 0.02339361292804925, "grad_norm": 26.625, "learning_rate": 1.9230769230769234e-06, "loss": 2.0307669639587402, "step": 76 }, { "epoch": 0.02400923432089265, "grad_norm": 17.125, "learning_rate": 1.9743589743589744e-06, "loss": 1.9644056558609009, "step": 78 }, { "epoch": 0.024624855713736054, "grad_norm": 1.5390625, "learning_rate": 2.025641025641026e-06, "loss": 1.228562593460083, "step": 80 }, { "epoch": 0.025240477106579454, "grad_norm": 9.3125, "learning_rate": 2.0769230769230773e-06, "loss": 1.5443778038024902, "step": 82 }, { "epoch": 0.025856098499422855, "grad_norm": 6.1875, "learning_rate": 2.1282051282051283e-06, "loss": 1.4227708578109741, "step": 84 }, { "epoch": 0.026471719892266255, "grad_norm": 6.15625, "learning_rate": 2.1794871794871797e-06, "loss": 1.4684773683547974, "step": 86 }, { "epoch": 0.02708734128510966, "grad_norm": 4.75, "learning_rate": 2.230769230769231e-06, "loss": 1.6047524213790894, "step": 88 }, { "epoch": 0.02770296267795306, "grad_norm": 26.25, "learning_rate": 2.282051282051282e-06, "loss": 2.01340389251709, "step": 90 }, { "epoch": 0.02831858407079646, "grad_norm": 4.9375, "learning_rate": 2.3333333333333336e-06, "loss": 1.4057049751281738, "step": 92 }, { "epoch": 0.02893420546363986, "grad_norm": 6.5, "learning_rate": 2.384615384615385e-06, "loss": 1.8485151529312134, "step": 94 }, { "epoch": 0.029549826856483264, "grad_norm": 3.3125, "learning_rate": 2.435897435897436e-06, "loss": 1.523245930671692, "step": 96 }, { "epoch": 0.030165448249326664, "grad_norm": 5.53125, "learning_rate": 2.4871794871794875e-06, "loss": 1.679101824760437, "step": 98 }, { "epoch": 0.030781069642170065, "grad_norm": 3.6875, "learning_rate": 2.5384615384615385e-06, "loss": 1.103708028793335, "step": 100 }, { "epoch": 0.03139669103501347, "grad_norm": 6.875, "learning_rate": 2.5897435897435903e-06, "loss": 1.5518531799316406, "step": 102 }, { "epoch": 0.03201231242785687, "grad_norm": 4.84375, "learning_rate": 2.6410256410256413e-06, "loss": 1.558458924293518, "step": 104 }, { "epoch": 0.03262793382070027, "grad_norm": 22.0, "learning_rate": 2.6923076923076923e-06, "loss": 1.7127501964569092, "step": 106 }, { "epoch": 0.03324355521354367, "grad_norm": 8.5, "learning_rate": 2.743589743589744e-06, "loss": 1.4668926000595093, "step": 108 }, { "epoch": 0.03385917660638707, "grad_norm": 4.125, "learning_rate": 2.794871794871795e-06, "loss": 1.6548444032669067, "step": 110 }, { "epoch": 0.03447479799923047, "grad_norm": 13.875, "learning_rate": 2.846153846153846e-06, "loss": 1.927480936050415, "step": 112 }, { "epoch": 0.03509041939207388, "grad_norm": 7.4375, "learning_rate": 2.897435897435898e-06, "loss": 1.9920681715011597, "step": 114 }, { "epoch": 0.03570604078491728, "grad_norm": 30.0, "learning_rate": 2.948717948717949e-06, "loss": 1.4198029041290283, "step": 116 }, { "epoch": 0.03632166217776068, "grad_norm": 21.0, "learning_rate": 3e-06, "loss": 1.8268883228302002, "step": 118 }, { "epoch": 0.03693728357060408, "grad_norm": 5.09375, "learning_rate": 3.051282051282052e-06, "loss": 1.550619125366211, "step": 120 }, { "epoch": 0.03755290496344748, "grad_norm": 4.5625, "learning_rate": 3.102564102564103e-06, "loss": 1.4174524545669556, "step": 122 }, { "epoch": 0.03816852635629088, "grad_norm": 2.421875, "learning_rate": 3.153846153846154e-06, "loss": 1.085978627204895, "step": 124 }, { "epoch": 0.03878414774913428, "grad_norm": 4.34375, "learning_rate": 3.205128205128206e-06, "loss": 1.4020051956176758, "step": 126 }, { "epoch": 0.03939976914197768, "grad_norm": 10.25, "learning_rate": 3.256410256410257e-06, "loss": 1.7224973440170288, "step": 128 }, { "epoch": 0.04001539053482109, "grad_norm": 8.5625, "learning_rate": 3.307692307692308e-06, "loss": 1.2587796449661255, "step": 130 }, { "epoch": 0.04063101192766449, "grad_norm": 8.5, "learning_rate": 3.358974358974359e-06, "loss": 1.6709303855895996, "step": 132 }, { "epoch": 0.04124663332050789, "grad_norm": 6.5, "learning_rate": 3.4102564102564107e-06, "loss": 1.6689445972442627, "step": 134 }, { "epoch": 0.04186225471335129, "grad_norm": 1.6328125, "learning_rate": 3.4615384615384617e-06, "loss": 0.9564234018325806, "step": 136 }, { "epoch": 0.04247787610619469, "grad_norm": 23.5, "learning_rate": 3.5128205128205127e-06, "loss": 1.5858899354934692, "step": 138 }, { "epoch": 0.04309349749903809, "grad_norm": 14.3125, "learning_rate": 3.5641025641025646e-06, "loss": 1.7939025163650513, "step": 140 }, { "epoch": 0.04370911889188149, "grad_norm": 4.96875, "learning_rate": 3.6153846153846156e-06, "loss": 1.3694020509719849, "step": 142 }, { "epoch": 0.04432474028472489, "grad_norm": 3.0625, "learning_rate": 3.6666666666666666e-06, "loss": 1.423534631729126, "step": 144 }, { "epoch": 0.0449403616775683, "grad_norm": 9.5625, "learning_rate": 3.7179487179487184e-06, "loss": 1.492781162261963, "step": 146 }, { "epoch": 0.0455559830704117, "grad_norm": 4.90625, "learning_rate": 3.7692307692307694e-06, "loss": 1.906273603439331, "step": 148 }, { "epoch": 0.0461716044632551, "grad_norm": 5.625, "learning_rate": 3.8205128205128204e-06, "loss": 1.3480786085128784, "step": 150 }, { "epoch": 0.0467872258560985, "grad_norm": 14.5, "learning_rate": 3.871794871794872e-06, "loss": 1.4525244235992432, "step": 152 }, { "epoch": 0.0474028472489419, "grad_norm": 7.46875, "learning_rate": 3.923076923076923e-06, "loss": 1.6392451524734497, "step": 154 }, { "epoch": 0.0480184686417853, "grad_norm": 7.5, "learning_rate": 3.974358974358974e-06, "loss": 1.236540675163269, "step": 156 }, { "epoch": 0.0486340900346287, "grad_norm": 4.3125, "learning_rate": 4.025641025641026e-06, "loss": 1.391970157623291, "step": 158 }, { "epoch": 0.04924971142747211, "grad_norm": 35.0, "learning_rate": 4.076923076923077e-06, "loss": 1.6326066255569458, "step": 160 }, { "epoch": 0.04986533282031551, "grad_norm": 2.875, "learning_rate": 4.128205128205128e-06, "loss": 1.3114484548568726, "step": 162 }, { "epoch": 0.05048095421315891, "grad_norm": 7.53125, "learning_rate": 4.17948717948718e-06, "loss": 1.2678321599960327, "step": 164 }, { "epoch": 0.05109657560600231, "grad_norm": 14.8125, "learning_rate": 4.230769230769231e-06, "loss": 1.3473753929138184, "step": 166 }, { "epoch": 0.05171219699884571, "grad_norm": 2.78125, "learning_rate": 4.282051282051282e-06, "loss": 1.7800545692443848, "step": 168 }, { "epoch": 0.05232781839168911, "grad_norm": 4.03125, "learning_rate": 4.333333333333334e-06, "loss": 1.421060562133789, "step": 170 }, { "epoch": 0.05294343978453251, "grad_norm": 17.25, "learning_rate": 4.384615384615385e-06, "loss": 1.8355920314788818, "step": 172 }, { "epoch": 0.05355906117737591, "grad_norm": 7.0625, "learning_rate": 4.435897435897436e-06, "loss": 1.2254821062088013, "step": 174 }, { "epoch": 0.05417468257021932, "grad_norm": 6.1875, "learning_rate": 4.487179487179488e-06, "loss": 1.5050994157791138, "step": 176 }, { "epoch": 0.05479030396306272, "grad_norm": 8.5625, "learning_rate": 4.538461538461539e-06, "loss": 1.6933058500289917, "step": 178 }, { "epoch": 0.05540592535590612, "grad_norm": 6.625, "learning_rate": 4.58974358974359e-06, "loss": 1.3030725717544556, "step": 180 }, { "epoch": 0.05602154674874952, "grad_norm": 5.09375, "learning_rate": 4.641025641025642e-06, "loss": 1.407893180847168, "step": 182 }, { "epoch": 0.05663716814159292, "grad_norm": 4.25, "learning_rate": 4.692307692307693e-06, "loss": 0.9550248980522156, "step": 184 }, { "epoch": 0.05725278953443632, "grad_norm": 8.875, "learning_rate": 4.743589743589744e-06, "loss": 1.7092329263687134, "step": 186 }, { "epoch": 0.05786841092727972, "grad_norm": 9.4375, "learning_rate": 4.7948717948717955e-06, "loss": 1.3839688301086426, "step": 188 }, { "epoch": 0.05848403232012313, "grad_norm": 7.78125, "learning_rate": 4.8461538461538465e-06, "loss": 1.676175594329834, "step": 190 }, { "epoch": 0.05909965371296653, "grad_norm": 2.296875, "learning_rate": 4.8974358974358975e-06, "loss": 1.3374234437942505, "step": 192 }, { "epoch": 0.05971527510580993, "grad_norm": 3.71875, "learning_rate": 4.948717948717949e-06, "loss": 1.2153236865997314, "step": 194 }, { "epoch": 0.06033089649865333, "grad_norm": 4.71875, "learning_rate": 5e-06, "loss": 1.4874577522277832, "step": 196 }, { "epoch": 0.06094651789149673, "grad_norm": 4.125, "learning_rate": 4.999999006277585e-06, "loss": 1.27693772315979, "step": 198 }, { "epoch": 0.06156213928434013, "grad_norm": 36.25, "learning_rate": 4.9999960251113246e-06, "loss": 1.3822270631790161, "step": 200 }, { "epoch": 0.06217776067718353, "grad_norm": 3.984375, "learning_rate": 4.999991056504183e-06, "loss": 1.6080832481384277, "step": 202 }, { "epoch": 0.06279338207002694, "grad_norm": 4.5625, "learning_rate": 4.9999841004610975e-06, "loss": 1.046898365020752, "step": 204 }, { "epoch": 0.06340900346287033, "grad_norm": 14.1875, "learning_rate": 4.999975156988978e-06, "loss": 1.6927094459533691, "step": 206 }, { "epoch": 0.06402462485571374, "grad_norm": 5.96875, "learning_rate": 4.999964226096716e-06, "loss": 1.65798819065094, "step": 208 }, { "epoch": 0.06464024624855713, "grad_norm": 6.53125, "learning_rate": 4.999951307795171e-06, "loss": 1.7509567737579346, "step": 210 }, { "epoch": 0.06525586764140054, "grad_norm": 6.9375, "learning_rate": 4.999936402097182e-06, "loss": 1.7493160963058472, "step": 212 }, { "epoch": 0.06587148903424395, "grad_norm": 14.0, "learning_rate": 4.999919509017559e-06, "loss": 1.6608965396881104, "step": 214 }, { "epoch": 0.06648711042708734, "grad_norm": 11.3125, "learning_rate": 4.99990062857309e-06, "loss": 1.330740213394165, "step": 216 }, { "epoch": 0.06710273181993075, "grad_norm": 5.6875, "learning_rate": 4.999879760782537e-06, "loss": 1.5713825225830078, "step": 218 }, { "epoch": 0.06771835321277414, "grad_norm": 8.625, "learning_rate": 4.999856905666636e-06, "loss": 1.4168877601623535, "step": 220 }, { "epoch": 0.06833397460561755, "grad_norm": 7.3125, "learning_rate": 4.9998320632481e-06, "loss": 1.3851971626281738, "step": 222 }, { "epoch": 0.06894959599846094, "grad_norm": 3.15625, "learning_rate": 4.999805233551616e-06, "loss": 1.1393821239471436, "step": 224 }, { "epoch": 0.06956521739130435, "grad_norm": 2.046875, "learning_rate": 4.999776416603842e-06, "loss": 1.4283275604248047, "step": 226 }, { "epoch": 0.07018083878414776, "grad_norm": 7.0625, "learning_rate": 4.999745612433418e-06, "loss": 1.4363598823547363, "step": 228 }, { "epoch": 0.07079646017699115, "grad_norm": 10.3125, "learning_rate": 4.999712821070951e-06, "loss": 1.6502529382705688, "step": 230 }, { "epoch": 0.07141208156983456, "grad_norm": 9.1875, "learning_rate": 4.99967804254903e-06, "loss": 1.3001021146774292, "step": 232 }, { "epoch": 0.07202770296267795, "grad_norm": 3.875, "learning_rate": 4.999641276902213e-06, "loss": 1.3146867752075195, "step": 234 }, { "epoch": 0.07264332435552136, "grad_norm": 2.359375, "learning_rate": 4.999602524167036e-06, "loss": 1.344026803970337, "step": 236 }, { "epoch": 0.07325894574836475, "grad_norm": 5.03125, "learning_rate": 4.999561784382009e-06, "loss": 1.5490647554397583, "step": 238 }, { "epoch": 0.07387456714120816, "grad_norm": 5.84375, "learning_rate": 4.999519057587613e-06, "loss": 1.3091920614242554, "step": 240 }, { "epoch": 0.07449018853405155, "grad_norm": 3.453125, "learning_rate": 4.999474343826309e-06, "loss": 1.1746587753295898, "step": 242 }, { "epoch": 0.07510580992689496, "grad_norm": 51.75, "learning_rate": 4.999427643142531e-06, "loss": 1.331214427947998, "step": 244 }, { "epoch": 0.07572143131973837, "grad_norm": 6.96875, "learning_rate": 4.999378955582684e-06, "loss": 1.5764216184616089, "step": 246 }, { "epoch": 0.07633705271258176, "grad_norm": 8.25, "learning_rate": 4.9993282811951514e-06, "loss": 1.4954771995544434, "step": 248 }, { "epoch": 0.07695267410542517, "grad_norm": 4.0625, "learning_rate": 4.99927562003029e-06, "loss": 1.064731240272522, "step": 250 }, { "epoch": 0.07756829549826856, "grad_norm": 5.4375, "learning_rate": 4.999220972140427e-06, "loss": 1.4773770570755005, "step": 252 }, { "epoch": 0.07818391689111197, "grad_norm": 4.375, "learning_rate": 4.999164337579873e-06, "loss": 1.6769360303878784, "step": 254 }, { "epoch": 0.07879953828395536, "grad_norm": 11.5, "learning_rate": 4.999105716404901e-06, "loss": 1.8475109338760376, "step": 256 }, { "epoch": 0.07941515967679877, "grad_norm": 4.3125, "learning_rate": 4.999045108673769e-06, "loss": 1.46021568775177, "step": 258 }, { "epoch": 0.08003078106964218, "grad_norm": 2.546875, "learning_rate": 4.998982514446702e-06, "loss": 1.5628776550292969, "step": 260 }, { "epoch": 0.08064640246248557, "grad_norm": 7.8125, "learning_rate": 4.9989179337859e-06, "loss": 1.5176582336425781, "step": 262 }, { "epoch": 0.08126202385532898, "grad_norm": 8.375, "learning_rate": 4.998851366755541e-06, "loss": 1.5871182680130005, "step": 264 }, { "epoch": 0.08187764524817237, "grad_norm": 5.375, "learning_rate": 4.998782813421773e-06, "loss": 1.2723009586334229, "step": 266 }, { "epoch": 0.08249326664101578, "grad_norm": 8.375, "learning_rate": 4.998712273852719e-06, "loss": 1.3980518579483032, "step": 268 }, { "epoch": 0.08310888803385917, "grad_norm": 4.53125, "learning_rate": 4.998639748118476e-06, "loss": 1.306288480758667, "step": 270 }, { "epoch": 0.08372450942670258, "grad_norm": 7.3125, "learning_rate": 4.998565236291114e-06, "loss": 1.5996097326278687, "step": 272 }, { "epoch": 0.08434013081954599, "grad_norm": 7.03125, "learning_rate": 4.9984887384446755e-06, "loss": 1.1350281238555908, "step": 274 }, { "epoch": 0.08495575221238938, "grad_norm": 2.546875, "learning_rate": 4.998410254655181e-06, "loss": 1.4976806640625, "step": 276 }, { "epoch": 0.08557137360523279, "grad_norm": 4.09375, "learning_rate": 4.998329785000621e-06, "loss": 1.4027005434036255, "step": 278 }, { "epoch": 0.08618699499807618, "grad_norm": 11.5625, "learning_rate": 4.998247329560959e-06, "loss": 1.5084396600723267, "step": 280 }, { "epoch": 0.08680261639091959, "grad_norm": 7.5, "learning_rate": 4.9981628884181335e-06, "loss": 1.5678538084030151, "step": 282 }, { "epoch": 0.08741823778376298, "grad_norm": 16.875, "learning_rate": 4.9980764616560555e-06, "loss": 1.4148145914077759, "step": 284 }, { "epoch": 0.08803385917660639, "grad_norm": 11.8125, "learning_rate": 4.997988049360608e-06, "loss": 2.0164575576782227, "step": 286 }, { "epoch": 0.08864948056944978, "grad_norm": 11.25, "learning_rate": 4.99789765161965e-06, "loss": 1.0314432382583618, "step": 288 }, { "epoch": 0.08926510196229319, "grad_norm": 6.875, "learning_rate": 4.9978052685230105e-06, "loss": 1.5884900093078613, "step": 290 }, { "epoch": 0.0898807233551366, "grad_norm": 5.40625, "learning_rate": 4.997710900162494e-06, "loss": 1.0817818641662598, "step": 292 }, { "epoch": 0.09049634474797999, "grad_norm": 9.125, "learning_rate": 4.997614546631875e-06, "loss": 1.442818284034729, "step": 294 }, { "epoch": 0.0911119661408234, "grad_norm": 4.0625, "learning_rate": 4.997516208026902e-06, "loss": 1.5313433408737183, "step": 296 }, { "epoch": 0.09172758753366679, "grad_norm": 7.25, "learning_rate": 4.997415884445299e-06, "loss": 1.1391329765319824, "step": 298 }, { "epoch": 0.0923432089265102, "grad_norm": 12.875, "learning_rate": 4.997313575986756e-06, "loss": 1.4202252626419067, "step": 300 }, { "epoch": 0.09295883031935359, "grad_norm": 3.84375, "learning_rate": 4.997209282752943e-06, "loss": 1.0266681909561157, "step": 302 }, { "epoch": 0.093574451712197, "grad_norm": 12.375, "learning_rate": 4.997103004847496e-06, "loss": 1.9265862703323364, "step": 304 }, { "epoch": 0.0941900731050404, "grad_norm": 6.46875, "learning_rate": 4.996994742376025e-06, "loss": 1.753663182258606, "step": 306 }, { "epoch": 0.0948056944978838, "grad_norm": 3.328125, "learning_rate": 4.996884495446116e-06, "loss": 0.8436685800552368, "step": 308 }, { "epoch": 0.0954213158907272, "grad_norm": 10.0625, "learning_rate": 4.996772264167321e-06, "loss": 2.0714216232299805, "step": 310 }, { "epoch": 0.0960369372835706, "grad_norm": 15.1875, "learning_rate": 4.996658048651169e-06, "loss": 1.8349342346191406, "step": 312 }, { "epoch": 0.09665255867641401, "grad_norm": 15.9375, "learning_rate": 4.996541849011156e-06, "loss": 1.8003356456756592, "step": 314 }, { "epoch": 0.0972681800692574, "grad_norm": 1.796875, "learning_rate": 4.996423665362754e-06, "loss": 0.6770284175872803, "step": 316 }, { "epoch": 0.09788380146210081, "grad_norm": 3.015625, "learning_rate": 4.9963034978234035e-06, "loss": 1.2355016469955444, "step": 318 }, { "epoch": 0.09849942285494422, "grad_norm": 5.46875, "learning_rate": 4.99618134651252e-06, "loss": 0.9716172814369202, "step": 320 }, { "epoch": 0.09911504424778761, "grad_norm": 5.875, "learning_rate": 4.996057211551485e-06, "loss": 1.3039079904556274, "step": 322 }, { "epoch": 0.09973066564063102, "grad_norm": 2.59375, "learning_rate": 4.995931093063656e-06, "loss": 1.5024352073669434, "step": 324 }, { "epoch": 0.10034628703347441, "grad_norm": 14.375, "learning_rate": 4.99580299117436e-06, "loss": 1.4408152103424072, "step": 326 }, { "epoch": 0.10096190842631782, "grad_norm": 7.53125, "learning_rate": 4.995672906010893e-06, "loss": 1.5990712642669678, "step": 328 }, { "epoch": 0.10157752981916121, "grad_norm": 13.25, "learning_rate": 4.9955408377025245e-06, "loss": 1.2190871238708496, "step": 330 }, { "epoch": 0.10219315121200462, "grad_norm": 6.09375, "learning_rate": 4.995406786380496e-06, "loss": 1.2393221855163574, "step": 332 }, { "epoch": 0.10280877260484801, "grad_norm": 11.125, "learning_rate": 4.995270752178013e-06, "loss": 1.5301152467727661, "step": 334 }, { "epoch": 0.10342439399769142, "grad_norm": 1.7734375, "learning_rate": 4.995132735230258e-06, "loss": 1.2268280982971191, "step": 336 }, { "epoch": 0.10404001539053483, "grad_norm": 5.09375, "learning_rate": 4.994992735674382e-06, "loss": 1.1164804697036743, "step": 338 }, { "epoch": 0.10465563678337822, "grad_norm": 8.75, "learning_rate": 4.994850753649506e-06, "loss": 1.6136159896850586, "step": 340 }, { "epoch": 0.10527125817622163, "grad_norm": 5.28125, "learning_rate": 4.99470678929672e-06, "loss": 1.2142976522445679, "step": 342 }, { "epoch": 0.10588687956906502, "grad_norm": 6.125, "learning_rate": 4.9945608427590834e-06, "loss": 1.4761240482330322, "step": 344 }, { "epoch": 0.10650250096190843, "grad_norm": 2.0, "learning_rate": 4.994412914181627e-06, "loss": 1.3172059059143066, "step": 346 }, { "epoch": 0.10711812235475182, "grad_norm": 4.40625, "learning_rate": 4.994263003711351e-06, "loss": 1.3690990209579468, "step": 348 }, { "epoch": 0.10773374374759523, "grad_norm": 6.5, "learning_rate": 4.994111111497227e-06, "loss": 0.9969586730003357, "step": 350 }, { "epoch": 0.10834936514043864, "grad_norm": 5.0625, "learning_rate": 4.993957237690191e-06, "loss": 1.1765248775482178, "step": 352 }, { "epoch": 0.10896498653328203, "grad_norm": 5.28125, "learning_rate": 4.993801382443152e-06, "loss": 1.5850834846496582, "step": 354 }, { "epoch": 0.10958060792612544, "grad_norm": 4.40625, "learning_rate": 4.993643545910986e-06, "loss": 1.4101109504699707, "step": 356 }, { "epoch": 0.11019622931896883, "grad_norm": 7.59375, "learning_rate": 4.99348372825054e-06, "loss": 1.3804712295532227, "step": 358 }, { "epoch": 0.11081185071181224, "grad_norm": 4.90625, "learning_rate": 4.993321929620627e-06, "loss": 1.5072553157806396, "step": 360 }, { "epoch": 0.11142747210465563, "grad_norm": 7.9375, "learning_rate": 4.9931581501820315e-06, "loss": 1.5175684690475464, "step": 362 }, { "epoch": 0.11204309349749904, "grad_norm": 4.71875, "learning_rate": 4.992992390097503e-06, "loss": 1.5557037591934204, "step": 364 }, { "epoch": 0.11265871489034245, "grad_norm": 11.625, "learning_rate": 4.992824649531762e-06, "loss": 0.6004631519317627, "step": 366 }, { "epoch": 0.11327433628318584, "grad_norm": 7.625, "learning_rate": 4.992654928651496e-06, "loss": 1.6547404527664185, "step": 368 }, { "epoch": 0.11388995767602925, "grad_norm": 116.0, "learning_rate": 4.99248322762536e-06, "loss": 1.4214962720870972, "step": 370 }, { "epoch": 0.11450557906887264, "grad_norm": 8.8125, "learning_rate": 4.992309546623978e-06, "loss": 1.0065133571624756, "step": 372 }, { "epoch": 0.11512120046171605, "grad_norm": 8.0625, "learning_rate": 4.99213388581994e-06, "loss": 1.671399712562561, "step": 374 }, { "epoch": 0.11573682185455944, "grad_norm": 6.6875, "learning_rate": 4.991956245387805e-06, "loss": 1.2735620737075806, "step": 376 }, { "epoch": 0.11635244324740285, "grad_norm": 7.0625, "learning_rate": 4.991776625504097e-06, "loss": 1.4278959035873413, "step": 378 }, { "epoch": 0.11696806464024626, "grad_norm": 3.015625, "learning_rate": 4.991595026347309e-06, "loss": 1.1840717792510986, "step": 380 }, { "epoch": 0.11758368603308965, "grad_norm": 3.21875, "learning_rate": 4.9914114480979e-06, "loss": 1.2029433250427246, "step": 382 }, { "epoch": 0.11819930742593306, "grad_norm": 13.5, "learning_rate": 4.991225890938296e-06, "loss": 1.6791255474090576, "step": 384 }, { "epoch": 0.11881492881877645, "grad_norm": 11.625, "learning_rate": 4.991038355052889e-06, "loss": 1.2933179140090942, "step": 386 }, { "epoch": 0.11943055021161986, "grad_norm": 7.375, "learning_rate": 4.9908488406280375e-06, "loss": 1.6819498538970947, "step": 388 }, { "epoch": 0.12004617160446325, "grad_norm": 6.125, "learning_rate": 4.990657347852067e-06, "loss": 0.9763534665107727, "step": 390 }, { "epoch": 0.12066179299730666, "grad_norm": 4.9375, "learning_rate": 4.990463876915268e-06, "loss": 1.4245015382766724, "step": 392 }, { "epoch": 0.12127741439015005, "grad_norm": 2.265625, "learning_rate": 4.9902684280098964e-06, "loss": 1.4248372316360474, "step": 394 }, { "epoch": 0.12189303578299346, "grad_norm": 25.5, "learning_rate": 4.990071001330174e-06, "loss": 1.306705355644226, "step": 396 }, { "epoch": 0.12250865717583687, "grad_norm": 4.03125, "learning_rate": 4.989871597072289e-06, "loss": 1.2770874500274658, "step": 398 }, { "epoch": 0.12312427856868026, "grad_norm": 6.125, "learning_rate": 4.989670215434393e-06, "loss": 1.4374216794967651, "step": 400 }, { "epoch": 0.12373989996152367, "grad_norm": 5.875, "learning_rate": 4.989466856616604e-06, "loss": 1.5654191970825195, "step": 402 }, { "epoch": 0.12435552135436706, "grad_norm": 7.09375, "learning_rate": 4.989261520821004e-06, "loss": 1.0301884412765503, "step": 404 }, { "epoch": 0.12497114274721047, "grad_norm": 7.65625, "learning_rate": 4.98905420825164e-06, "loss": 1.2070664167404175, "step": 406 }, { "epoch": 0.12558676414005387, "grad_norm": 5.5, "learning_rate": 4.988844919114523e-06, "loss": 1.388759732246399, "step": 408 }, { "epoch": 0.12620238553289725, "grad_norm": 3.578125, "learning_rate": 4.988633653617628e-06, "loss": 1.3255653381347656, "step": 410 }, { "epoch": 0.12681800692574066, "grad_norm": 6.3125, "learning_rate": 4.9884204119708946e-06, "loss": 1.2467639446258545, "step": 412 }, { "epoch": 0.12743362831858407, "grad_norm": 5.09375, "learning_rate": 4.988205194386225e-06, "loss": 1.5396754741668701, "step": 414 }, { "epoch": 0.12804924971142748, "grad_norm": 5.90625, "learning_rate": 4.987988001077487e-06, "loss": 1.5393030643463135, "step": 416 }, { "epoch": 0.12866487110427088, "grad_norm": 20.0, "learning_rate": 4.98776883226051e-06, "loss": 0.8135099411010742, "step": 418 }, { "epoch": 0.12928049249711426, "grad_norm": 7.9375, "learning_rate": 4.987547688153087e-06, "loss": 1.421095848083496, "step": 420 }, { "epoch": 0.12989611388995767, "grad_norm": 83.0, "learning_rate": 4.987324568974974e-06, "loss": 1.5129131078720093, "step": 422 }, { "epoch": 0.13051173528280108, "grad_norm": 6.6875, "learning_rate": 4.987099474947889e-06, "loss": 1.033460259437561, "step": 424 }, { "epoch": 0.13112735667564449, "grad_norm": 4.71875, "learning_rate": 4.986872406295513e-06, "loss": 1.4056336879730225, "step": 426 }, { "epoch": 0.1317429780684879, "grad_norm": 8.4375, "learning_rate": 4.9866433632434895e-06, "loss": 1.235420823097229, "step": 428 }, { "epoch": 0.13235859946133127, "grad_norm": 8.6875, "learning_rate": 4.986412346019423e-06, "loss": 1.3629176616668701, "step": 430 }, { "epoch": 0.13297422085417468, "grad_norm": 8.0625, "learning_rate": 4.9861793548528835e-06, "loss": 1.478594422340393, "step": 432 }, { "epoch": 0.1335898422470181, "grad_norm": 25.25, "learning_rate": 4.985944389975396e-06, "loss": 1.8666064739227295, "step": 434 }, { "epoch": 0.1342054636398615, "grad_norm": 10.0, "learning_rate": 4.98570745162045e-06, "loss": 1.877537727355957, "step": 436 }, { "epoch": 0.13482108503270487, "grad_norm": 6.4375, "learning_rate": 4.985468540023501e-06, "loss": 1.4299794435501099, "step": 438 }, { "epoch": 0.13543670642554828, "grad_norm": 4.65625, "learning_rate": 4.985227655421956e-06, "loss": 0.9729835987091064, "step": 440 }, { "epoch": 0.1360523278183917, "grad_norm": 11.25, "learning_rate": 4.984984798055189e-06, "loss": 1.6033118963241577, "step": 442 }, { "epoch": 0.1366679492112351, "grad_norm": 30.875, "learning_rate": 4.984739968164534e-06, "loss": 1.3174901008605957, "step": 444 }, { "epoch": 0.1372835706040785, "grad_norm": 9.5, "learning_rate": 4.9844931659932825e-06, "loss": 1.4525535106658936, "step": 446 }, { "epoch": 0.13789919199692188, "grad_norm": 22.5, "learning_rate": 4.984244391786688e-06, "loss": 2.0567686557769775, "step": 448 }, { "epoch": 0.1385148133897653, "grad_norm": 3.03125, "learning_rate": 4.983993645791962e-06, "loss": 1.2985535860061646, "step": 450 }, { "epoch": 0.1391304347826087, "grad_norm": 18.5, "learning_rate": 4.9837409282582795e-06, "loss": 1.554973840713501, "step": 452 }, { "epoch": 0.1397460561754521, "grad_norm": 24.5, "learning_rate": 4.983486239436768e-06, "loss": 1.2067277431488037, "step": 454 }, { "epoch": 0.1403616775682955, "grad_norm": 9.75, "learning_rate": 4.983229579580519e-06, "loss": 1.499319076538086, "step": 456 }, { "epoch": 0.1409772989611389, "grad_norm": 7.59375, "learning_rate": 4.982970948944581e-06, "loss": 1.479343295097351, "step": 458 }, { "epoch": 0.1415929203539823, "grad_norm": 7.0, "learning_rate": 4.98271034778596e-06, "loss": 1.8565704822540283, "step": 460 }, { "epoch": 0.1422085417468257, "grad_norm": 21.125, "learning_rate": 4.982447776363625e-06, "loss": 1.7143534421920776, "step": 462 }, { "epoch": 0.1428241631396691, "grad_norm": 7.96875, "learning_rate": 4.982183234938495e-06, "loss": 1.408637523651123, "step": 464 }, { "epoch": 0.1434397845325125, "grad_norm": 6.96875, "learning_rate": 4.9819167237734515e-06, "loss": 1.3933427333831787, "step": 466 }, { "epoch": 0.1440554059253559, "grad_norm": 11.4375, "learning_rate": 4.981648243133334e-06, "loss": 1.767836570739746, "step": 468 }, { "epoch": 0.1446710273181993, "grad_norm": 5.21875, "learning_rate": 4.9813777932849365e-06, "loss": 1.680699348449707, "step": 470 }, { "epoch": 0.14528664871104272, "grad_norm": 25.125, "learning_rate": 4.981105374497012e-06, "loss": 1.4010961055755615, "step": 472 }, { "epoch": 0.14590227010388612, "grad_norm": 6.875, "learning_rate": 4.9808309870402685e-06, "loss": 1.5148462057113647, "step": 474 }, { "epoch": 0.1465178914967295, "grad_norm": 4.8125, "learning_rate": 4.980554631187371e-06, "loss": 1.7220298051834106, "step": 476 }, { "epoch": 0.1471335128895729, "grad_norm": 4.46875, "learning_rate": 4.980276307212941e-06, "loss": 1.0316753387451172, "step": 478 }, { "epoch": 0.14774913428241632, "grad_norm": 6.9375, "learning_rate": 4.9799960153935555e-06, "loss": 1.2958691120147705, "step": 480 }, { "epoch": 0.14836475567525972, "grad_norm": 4.96875, "learning_rate": 4.9797137560077456e-06, "loss": 1.341382384300232, "step": 482 }, { "epoch": 0.1489803770681031, "grad_norm": 3.671875, "learning_rate": 4.979429529335999e-06, "loss": 1.2698701620101929, "step": 484 }, { "epoch": 0.1495959984609465, "grad_norm": 8.875, "learning_rate": 4.97914333566076e-06, "loss": 1.5648541450500488, "step": 486 }, { "epoch": 0.15021161985378992, "grad_norm": 7.59375, "learning_rate": 4.978855175266423e-06, "loss": 1.6046477556228638, "step": 488 }, { "epoch": 0.15082724124663333, "grad_norm": 10.3125, "learning_rate": 4.978565048439341e-06, "loss": 1.4956114292144775, "step": 490 }, { "epoch": 0.15144286263947673, "grad_norm": 3.125, "learning_rate": 4.9782729554678185e-06, "loss": 1.2179930210113525, "step": 492 }, { "epoch": 0.1520584840323201, "grad_norm": 14.875, "learning_rate": 4.977978896642117e-06, "loss": 0.909509539604187, "step": 494 }, { "epoch": 0.15267410542516352, "grad_norm": 12.0, "learning_rate": 4.9776828722544465e-06, "loss": 1.531646728515625, "step": 496 }, { "epoch": 0.15328972681800693, "grad_norm": 8.5625, "learning_rate": 4.977384882598976e-06, "loss": 1.7037618160247803, "step": 498 }, { "epoch": 0.15390534821085033, "grad_norm": 5.0625, "learning_rate": 4.9770849279718215e-06, "loss": 1.2394758462905884, "step": 500 }, { "epoch": 0.15452096960369374, "grad_norm": 6.0, "learning_rate": 4.9767830086710565e-06, "loss": 1.4906891584396362, "step": 502 }, { "epoch": 0.15513659099653712, "grad_norm": 10.25, "learning_rate": 4.976479124996705e-06, "loss": 1.6954540014266968, "step": 504 }, { "epoch": 0.15575221238938053, "grad_norm": 9.8125, "learning_rate": 4.976173277250742e-06, "loss": 1.4690717458724976, "step": 506 }, { "epoch": 0.15636783378222394, "grad_norm": 12.5625, "learning_rate": 4.975865465737096e-06, "loss": 1.3483026027679443, "step": 508 }, { "epoch": 0.15698345517506734, "grad_norm": 4.375, "learning_rate": 4.9755556907616455e-06, "loss": 1.5260601043701172, "step": 510 }, { "epoch": 0.15759907656791072, "grad_norm": 13.25, "learning_rate": 4.9752439526322224e-06, "loss": 1.7949278354644775, "step": 512 }, { "epoch": 0.15821469796075413, "grad_norm": 7.5625, "learning_rate": 4.974930251658606e-06, "loss": 1.5277800559997559, "step": 514 }, { "epoch": 0.15883031935359754, "grad_norm": 3.78125, "learning_rate": 4.97461458815253e-06, "loss": 1.358290433883667, "step": 516 }, { "epoch": 0.15944594074644095, "grad_norm": 4.1875, "learning_rate": 4.9742969624276735e-06, "loss": 1.4866092205047607, "step": 518 }, { "epoch": 0.16006156213928435, "grad_norm": 9.625, "learning_rate": 4.9739773747996715e-06, "loss": 1.4200929403305054, "step": 520 }, { "epoch": 0.16067718353212773, "grad_norm": 7.71875, "learning_rate": 4.973655825586102e-06, "loss": 1.8140249252319336, "step": 522 }, { "epoch": 0.16129280492497114, "grad_norm": 5.78125, "learning_rate": 4.973332315106499e-06, "loss": 1.4459000825881958, "step": 524 }, { "epoch": 0.16190842631781455, "grad_norm": 8.9375, "learning_rate": 4.97300684368234e-06, "loss": 1.7631019353866577, "step": 526 }, { "epoch": 0.16252404771065795, "grad_norm": 5.96875, "learning_rate": 4.972679411637053e-06, "loss": 1.517167568206787, "step": 528 }, { "epoch": 0.16313966910350133, "grad_norm": 5.15625, "learning_rate": 4.972350019296017e-06, "loss": 0.9867444038391113, "step": 530 }, { "epoch": 0.16375529049634474, "grad_norm": 4.5, "learning_rate": 4.972018666986554e-06, "loss": 1.376389503479004, "step": 532 }, { "epoch": 0.16437091188918815, "grad_norm": 4.875, "learning_rate": 4.971685355037938e-06, "loss": 1.3735544681549072, "step": 534 }, { "epoch": 0.16498653328203156, "grad_norm": 13.5, "learning_rate": 4.971350083781387e-06, "loss": 1.414724588394165, "step": 536 }, { "epoch": 0.16560215467487496, "grad_norm": 15.5, "learning_rate": 4.971012853550069e-06, "loss": 1.7341519594192505, "step": 538 }, { "epoch": 0.16621777606771834, "grad_norm": 4.40625, "learning_rate": 4.970673664679097e-06, "loss": 1.353968858718872, "step": 540 }, { "epoch": 0.16683339746056175, "grad_norm": 9.4375, "learning_rate": 4.9703325175055285e-06, "loss": 1.5440301895141602, "step": 542 }, { "epoch": 0.16744901885340516, "grad_norm": 8.3125, "learning_rate": 4.969989412368371e-06, "loss": 1.3509860038757324, "step": 544 }, { "epoch": 0.16806464024624856, "grad_norm": 5.65625, "learning_rate": 4.969644349608576e-06, "loss": 0.9262280464172363, "step": 546 }, { "epoch": 0.16868026163909197, "grad_norm": 5.21875, "learning_rate": 4.969297329569039e-06, "loss": 1.278700828552246, "step": 548 }, { "epoch": 0.16929588303193535, "grad_norm": 5.75, "learning_rate": 4.968948352594604e-06, "loss": 1.3852548599243164, "step": 550 }, { "epoch": 0.16991150442477876, "grad_norm": 11.3125, "learning_rate": 4.968597419032053e-06, "loss": 1.6079179048538208, "step": 552 }, { "epoch": 0.17052712581762217, "grad_norm": 12.125, "learning_rate": 4.96824452923012e-06, "loss": 1.416031837463379, "step": 554 }, { "epoch": 0.17114274721046557, "grad_norm": 13.75, "learning_rate": 4.967889683539479e-06, "loss": 1.6487233638763428, "step": 556 }, { "epoch": 0.17175836860330895, "grad_norm": 8.1875, "learning_rate": 4.9675328823127465e-06, "loss": 1.6864871978759766, "step": 558 }, { "epoch": 0.17237398999615236, "grad_norm": 7.59375, "learning_rate": 4.967174125904486e-06, "loss": 1.3610726594924927, "step": 560 }, { "epoch": 0.17298961138899577, "grad_norm": 6.6875, "learning_rate": 4.9668134146712e-06, "loss": 0.9785069823265076, "step": 562 }, { "epoch": 0.17360523278183917, "grad_norm": 10.3125, "learning_rate": 4.966450748971336e-06, "loss": 1.3967070579528809, "step": 564 }, { "epoch": 0.17422085417468258, "grad_norm": 2.703125, "learning_rate": 4.966086129165283e-06, "loss": 0.9256719946861267, "step": 566 }, { "epoch": 0.17483647556752596, "grad_norm": 6.21875, "learning_rate": 4.9657195556153725e-06, "loss": 1.2384088039398193, "step": 568 }, { "epoch": 0.17545209696036937, "grad_norm": 6.03125, "learning_rate": 4.965351028685876e-06, "loss": 1.5370593070983887, "step": 570 }, { "epoch": 0.17606771835321278, "grad_norm": 7.71875, "learning_rate": 4.964980548743009e-06, "loss": 1.3585302829742432, "step": 572 }, { "epoch": 0.17668333974605618, "grad_norm": 4.3125, "learning_rate": 4.964608116154922e-06, "loss": 1.2399895191192627, "step": 574 }, { "epoch": 0.17729896113889956, "grad_norm": 15.6875, "learning_rate": 4.9642337312917125e-06, "loss": 1.6500157117843628, "step": 576 }, { "epoch": 0.17791458253174297, "grad_norm": 3.5625, "learning_rate": 4.963857394525414e-06, "loss": 0.9791191816329956, "step": 578 }, { "epoch": 0.17853020392458638, "grad_norm": 6.15625, "learning_rate": 4.963479106230001e-06, "loss": 0.9877485036849976, "step": 580 }, { "epoch": 0.17914582531742979, "grad_norm": 12.625, "learning_rate": 4.963098866781387e-06, "loss": 1.2987223863601685, "step": 582 }, { "epoch": 0.1797614467102732, "grad_norm": 18.625, "learning_rate": 4.9627166765574255e-06, "loss": 1.7646374702453613, "step": 584 }, { "epoch": 0.18037706810311657, "grad_norm": 6.125, "learning_rate": 4.962332535937906e-06, "loss": 1.5843291282653809, "step": 586 }, { "epoch": 0.18099268949595998, "grad_norm": 5.75, "learning_rate": 4.961946445304559e-06, "loss": 1.6552579402923584, "step": 588 }, { "epoch": 0.1816083108888034, "grad_norm": 8.3125, "learning_rate": 4.961558405041048e-06, "loss": 1.3857282400131226, "step": 590 }, { "epoch": 0.1822239322816468, "grad_norm": 4.28125, "learning_rate": 4.961168415532983e-06, "loss": 1.4815261363983154, "step": 592 }, { "epoch": 0.1828395536744902, "grad_norm": 5.46875, "learning_rate": 4.9607764771679e-06, "loss": 1.4543473720550537, "step": 594 }, { "epoch": 0.18345517506733358, "grad_norm": 4.84375, "learning_rate": 4.960382590335281e-06, "loss": 1.4324307441711426, "step": 596 }, { "epoch": 0.184070796460177, "grad_norm": 2.953125, "learning_rate": 4.959986755426538e-06, "loss": 0.9503142237663269, "step": 598 }, { "epoch": 0.1846864178530204, "grad_norm": 9.6875, "learning_rate": 4.95958897283502e-06, "loss": 0.8729435801506042, "step": 600 }, { "epoch": 0.1853020392458638, "grad_norm": 8.5, "learning_rate": 4.959189242956015e-06, "loss": 1.3921244144439697, "step": 602 }, { "epoch": 0.18591766063870718, "grad_norm": 15.1875, "learning_rate": 4.958787566186743e-06, "loss": 1.465436577796936, "step": 604 }, { "epoch": 0.1865332820315506, "grad_norm": 14.375, "learning_rate": 4.958383942926358e-06, "loss": 1.4243252277374268, "step": 606 }, { "epoch": 0.187148903424394, "grad_norm": 3.921875, "learning_rate": 4.95797837357595e-06, "loss": 1.597616195678711, "step": 608 }, { "epoch": 0.1877645248172374, "grad_norm": 5.9375, "learning_rate": 4.957570858538543e-06, "loss": 1.6251206398010254, "step": 610 }, { "epoch": 0.1883801462100808, "grad_norm": 12.625, "learning_rate": 4.957161398219092e-06, "loss": 0.6412205696105957, "step": 612 }, { "epoch": 0.1889957676029242, "grad_norm": 7.5625, "learning_rate": 4.956749993024489e-06, "loss": 1.2991650104522705, "step": 614 }, { "epoch": 0.1896113889957676, "grad_norm": 6.875, "learning_rate": 4.956336643363556e-06, "loss": 1.4068098068237305, "step": 616 }, { "epoch": 0.190227010388611, "grad_norm": 11.0625, "learning_rate": 4.955921349647047e-06, "loss": 1.3036388158798218, "step": 618 }, { "epoch": 0.1908426317814544, "grad_norm": 10.75, "learning_rate": 4.95550411228765e-06, "loss": 0.9869240522384644, "step": 620 }, { "epoch": 0.1914582531742978, "grad_norm": 8.375, "learning_rate": 4.955084931699982e-06, "loss": 1.533276081085205, "step": 622 }, { "epoch": 0.1920738745671412, "grad_norm": 9.4375, "learning_rate": 4.954663808300593e-06, "loss": 1.6364980936050415, "step": 624 }, { "epoch": 0.1926894959599846, "grad_norm": 16.25, "learning_rate": 4.954240742507961e-06, "loss": 1.0341116189956665, "step": 626 }, { "epoch": 0.19330511735282802, "grad_norm": 8.4375, "learning_rate": 4.9538157347424985e-06, "loss": 1.7232986688613892, "step": 628 }, { "epoch": 0.19392073874567142, "grad_norm": 8.75, "learning_rate": 4.953388785426544e-06, "loss": 1.8982014656066895, "step": 630 }, { "epoch": 0.1945363601385148, "grad_norm": 15.25, "learning_rate": 4.952959894984365e-06, "loss": 1.3605602979660034, "step": 632 }, { "epoch": 0.1951519815313582, "grad_norm": 14.0, "learning_rate": 4.952529063842163e-06, "loss": 1.2464309930801392, "step": 634 }, { "epoch": 0.19576760292420162, "grad_norm": 6.0625, "learning_rate": 4.952096292428062e-06, "loss": 1.502238154411316, "step": 636 }, { "epoch": 0.19638322431704502, "grad_norm": 11.75, "learning_rate": 4.951661581172117e-06, "loss": 1.25320565700531, "step": 638 }, { "epoch": 0.19699884570988843, "grad_norm": 39.25, "learning_rate": 4.951224930506311e-06, "loss": 1.4860585927963257, "step": 640 }, { "epoch": 0.1976144671027318, "grad_norm": 5.1875, "learning_rate": 4.950786340864553e-06, "loss": 1.6566805839538574, "step": 642 }, { "epoch": 0.19823008849557522, "grad_norm": 3.921875, "learning_rate": 4.95034581268268e-06, "loss": 1.3507304191589355, "step": 644 }, { "epoch": 0.19884570988841863, "grad_norm": 5.90625, "learning_rate": 4.9499033463984535e-06, "loss": 1.424899697303772, "step": 646 }, { "epoch": 0.19946133128126203, "grad_norm": 6.5, "learning_rate": 4.9494589424515636e-06, "loss": 1.1606203317642212, "step": 648 }, { "epoch": 0.2000769526741054, "grad_norm": 5.9375, "learning_rate": 4.949012601283624e-06, "loss": 1.5425701141357422, "step": 650 }, { "epoch": 0.20069257406694882, "grad_norm": 9.0, "learning_rate": 4.948564323338174e-06, "loss": 1.4933964014053345, "step": 652 }, { "epoch": 0.20130819545979223, "grad_norm": 9.125, "learning_rate": 4.948114109060677e-06, "loss": 1.0542612075805664, "step": 654 }, { "epoch": 0.20192381685263563, "grad_norm": 9.9375, "learning_rate": 4.947661958898521e-06, "loss": 1.675760269165039, "step": 656 }, { "epoch": 0.20253943824547904, "grad_norm": 4.03125, "learning_rate": 4.947207873301018e-06, "loss": 1.4727646112442017, "step": 658 }, { "epoch": 0.20315505963832242, "grad_norm": 5.9375, "learning_rate": 4.946751852719403e-06, "loss": 1.482978105545044, "step": 660 }, { "epoch": 0.20377068103116583, "grad_norm": 3.265625, "learning_rate": 4.946293897606833e-06, "loss": 1.354433536529541, "step": 662 }, { "epoch": 0.20438630242400924, "grad_norm": 6.46875, "learning_rate": 4.945834008418391e-06, "loss": 1.3635056018829346, "step": 664 }, { "epoch": 0.20500192381685264, "grad_norm": 3.234375, "learning_rate": 4.945372185611076e-06, "loss": 0.7794215679168701, "step": 666 }, { "epoch": 0.20561754520969602, "grad_norm": 6.09375, "learning_rate": 4.9449084296438135e-06, "loss": 1.195050597190857, "step": 668 }, { "epoch": 0.20623316660253943, "grad_norm": 9.125, "learning_rate": 4.944442740977447e-06, "loss": 1.4416937828063965, "step": 670 }, { "epoch": 0.20684878799538284, "grad_norm": 5.15625, "learning_rate": 4.943975120074743e-06, "loss": 1.5561989545822144, "step": 672 }, { "epoch": 0.20746440938822625, "grad_norm": 7.625, "learning_rate": 4.943505567400387e-06, "loss": 1.3376786708831787, "step": 674 }, { "epoch": 0.20808003078106965, "grad_norm": 7.28125, "learning_rate": 4.943034083420983e-06, "loss": 1.2193069458007812, "step": 676 }, { "epoch": 0.20869565217391303, "grad_norm": 51.25, "learning_rate": 4.942560668605055e-06, "loss": 1.6837108135223389, "step": 678 }, { "epoch": 0.20931127356675644, "grad_norm": 9.1875, "learning_rate": 4.942085323423048e-06, "loss": 1.5166053771972656, "step": 680 }, { "epoch": 0.20992689495959985, "grad_norm": 5.84375, "learning_rate": 4.941608048347321e-06, "loss": 1.3111361265182495, "step": 682 }, { "epoch": 0.21054251635244325, "grad_norm": 4.59375, "learning_rate": 4.941128843852152e-06, "loss": 1.3362860679626465, "step": 684 }, { "epoch": 0.21115813774528666, "grad_norm": 8.1875, "learning_rate": 4.940647710413741e-06, "loss": 1.576740026473999, "step": 686 }, { "epoch": 0.21177375913813004, "grad_norm": 8.625, "learning_rate": 4.940164648510197e-06, "loss": 1.8432643413543701, "step": 688 }, { "epoch": 0.21238938053097345, "grad_norm": 5.125, "learning_rate": 4.939679658621552e-06, "loss": 1.5032325983047485, "step": 690 }, { "epoch": 0.21300500192381686, "grad_norm": 10.25, "learning_rate": 4.9391927412297525e-06, "loss": 1.168021321296692, "step": 692 }, { "epoch": 0.21362062331666026, "grad_norm": 5.46875, "learning_rate": 4.938703896818655e-06, "loss": 1.455658197402954, "step": 694 }, { "epoch": 0.21423624470950364, "grad_norm": 7.0625, "learning_rate": 4.938213125874039e-06, "loss": 1.5365169048309326, "step": 696 }, { "epoch": 0.21485186610234705, "grad_norm": 3.515625, "learning_rate": 4.937720428883594e-06, "loss": 1.3698763847351074, "step": 698 }, { "epoch": 0.21546748749519046, "grad_norm": 4.59375, "learning_rate": 4.937225806336921e-06, "loss": 1.2674376964569092, "step": 700 }, { "epoch": 0.21608310888803386, "grad_norm": 3.921875, "learning_rate": 4.93672925872554e-06, "loss": 1.3184254169464111, "step": 702 }, { "epoch": 0.21669873028087727, "grad_norm": 15.375, "learning_rate": 4.936230786542883e-06, "loss": 1.1187396049499512, "step": 704 }, { "epoch": 0.21731435167372065, "grad_norm": 9.625, "learning_rate": 4.935730390284289e-06, "loss": 1.689683198928833, "step": 706 }, { "epoch": 0.21792997306656406, "grad_norm": 6.09375, "learning_rate": 4.935228070447017e-06, "loss": 1.3738960027694702, "step": 708 }, { "epoch": 0.21854559445940747, "grad_norm": 4.8125, "learning_rate": 4.934723827530231e-06, "loss": 1.432965874671936, "step": 710 }, { "epoch": 0.21916121585225087, "grad_norm": 7.34375, "learning_rate": 4.934217662035008e-06, "loss": 1.4561982154846191, "step": 712 }, { "epoch": 0.21977683724509428, "grad_norm": 10.5625, "learning_rate": 4.9337095744643385e-06, "loss": 1.4224251508712769, "step": 714 }, { "epoch": 0.22039245863793766, "grad_norm": 8.8125, "learning_rate": 4.933199565323119e-06, "loss": 1.2862472534179688, "step": 716 }, { "epoch": 0.22100808003078107, "grad_norm": 16.75, "learning_rate": 4.932687635118157e-06, "loss": 1.1861869096755981, "step": 718 }, { "epoch": 0.22162370142362448, "grad_norm": 4.1875, "learning_rate": 4.9321737843581685e-06, "loss": 1.4926865100860596, "step": 720 }, { "epoch": 0.22223932281646788, "grad_norm": 8.1875, "learning_rate": 4.931658013553781e-06, "loss": 1.7047574520111084, "step": 722 }, { "epoch": 0.22285494420931126, "grad_norm": 7.53125, "learning_rate": 4.931140323217524e-06, "loss": 0.7368004322052002, "step": 724 }, { "epoch": 0.22347056560215467, "grad_norm": 8.125, "learning_rate": 4.93062071386384e-06, "loss": 1.560957670211792, "step": 726 }, { "epoch": 0.22408618699499808, "grad_norm": 8.0, "learning_rate": 4.930099186009077e-06, "loss": 1.7710736989974976, "step": 728 }, { "epoch": 0.22470180838784148, "grad_norm": 7.21875, "learning_rate": 4.929575740171488e-06, "loss": 1.3468711376190186, "step": 730 }, { "epoch": 0.2253174297806849, "grad_norm": 3.4375, "learning_rate": 4.929050376871231e-06, "loss": 1.2263801097869873, "step": 732 }, { "epoch": 0.22593305117352827, "grad_norm": 4.09375, "learning_rate": 4.928523096630376e-06, "loss": 1.151431679725647, "step": 734 }, { "epoch": 0.22654867256637168, "grad_norm": 13.3125, "learning_rate": 4.9279938999728886e-06, "loss": 1.481982946395874, "step": 736 }, { "epoch": 0.22716429395921509, "grad_norm": 7.875, "learning_rate": 4.927462787424646e-06, "loss": 1.4301162958145142, "step": 738 }, { "epoch": 0.2277799153520585, "grad_norm": 10.375, "learning_rate": 4.926929759513426e-06, "loss": 1.7959684133529663, "step": 740 }, { "epoch": 0.22839553674490187, "grad_norm": 9.3125, "learning_rate": 4.926394816768909e-06, "loss": 1.7166894674301147, "step": 742 }, { "epoch": 0.22901115813774528, "grad_norm": 4.28125, "learning_rate": 4.925857959722682e-06, "loss": 0.9727653861045837, "step": 744 }, { "epoch": 0.2296267795305887, "grad_norm": 7.0625, "learning_rate": 4.92531918890823e-06, "loss": 1.389609456062317, "step": 746 }, { "epoch": 0.2302424009234321, "grad_norm": 7.65625, "learning_rate": 4.924778504860943e-06, "loss": 1.2141828536987305, "step": 748 }, { "epoch": 0.2308580223162755, "grad_norm": 3.65625, "learning_rate": 4.92423590811811e-06, "loss": 1.245687484741211, "step": 750 }, { "epoch": 0.23147364370911888, "grad_norm": 13.375, "learning_rate": 4.923691399218921e-06, "loss": 1.4262340068817139, "step": 752 }, { "epoch": 0.2320892651019623, "grad_norm": 7.78125, "learning_rate": 4.9231449787044695e-06, "loss": 1.483527660369873, "step": 754 }, { "epoch": 0.2327048864948057, "grad_norm": 9.875, "learning_rate": 4.922596647117742e-06, "loss": 1.4266114234924316, "step": 756 }, { "epoch": 0.2333205078876491, "grad_norm": 8.625, "learning_rate": 4.92204640500363e-06, "loss": 1.367720365524292, "step": 758 }, { "epoch": 0.2339361292804925, "grad_norm": 4.28125, "learning_rate": 4.9214942529089215e-06, "loss": 1.498761534690857, "step": 760 }, { "epoch": 0.2345517506733359, "grad_norm": 5.21875, "learning_rate": 4.920940191382302e-06, "loss": 1.1964250802993774, "step": 762 }, { "epoch": 0.2351673720661793, "grad_norm": 10.0625, "learning_rate": 4.920384220974355e-06, "loss": 1.7517389059066772, "step": 764 }, { "epoch": 0.2357829934590227, "grad_norm": 16.5, "learning_rate": 4.919826342237559e-06, "loss": 1.6877859830856323, "step": 766 }, { "epoch": 0.2363986148518661, "grad_norm": 8.9375, "learning_rate": 4.919266555726293e-06, "loss": 1.5608092546463013, "step": 768 }, { "epoch": 0.2370142362447095, "grad_norm": 4.1875, "learning_rate": 4.918704861996829e-06, "loss": 1.2134896516799927, "step": 770 }, { "epoch": 0.2376298576375529, "grad_norm": 12.875, "learning_rate": 4.918141261607335e-06, "loss": 1.2214515209197998, "step": 772 }, { "epoch": 0.2382454790303963, "grad_norm": 14.25, "learning_rate": 4.917575755117872e-06, "loss": 1.697631597518921, "step": 774 }, { "epoch": 0.23886110042323971, "grad_norm": 9.8125, "learning_rate": 4.917008343090397e-06, "loss": 1.0490994453430176, "step": 776 }, { "epoch": 0.23947672181608312, "grad_norm": 7.5, "learning_rate": 4.91643902608876e-06, "loss": 1.4371521472930908, "step": 778 }, { "epoch": 0.2400923432089265, "grad_norm": 8.1875, "learning_rate": 4.915867804678704e-06, "loss": 1.4070208072662354, "step": 780 }, { "epoch": 0.2407079646017699, "grad_norm": 6.6875, "learning_rate": 4.915294679427865e-06, "loss": 1.0935542583465576, "step": 782 }, { "epoch": 0.24132358599461332, "grad_norm": 7.3125, "learning_rate": 4.91471965090577e-06, "loss": 1.4794899225234985, "step": 784 }, { "epoch": 0.24193920738745672, "grad_norm": 19.875, "learning_rate": 4.914142719683839e-06, "loss": 1.6757147312164307, "step": 786 }, { "epoch": 0.2425548287803001, "grad_norm": 7.34375, "learning_rate": 4.913563886335379e-06, "loss": 1.3972264528274536, "step": 788 }, { "epoch": 0.2431704501731435, "grad_norm": 4.4375, "learning_rate": 4.9129831514355915e-06, "loss": 1.4343969821929932, "step": 790 }, { "epoch": 0.24378607156598692, "grad_norm": 6.34375, "learning_rate": 4.912400515561565e-06, "loss": 1.3144001960754395, "step": 792 }, { "epoch": 0.24440169295883032, "grad_norm": 5.03125, "learning_rate": 4.911815979292278e-06, "loss": 1.2800910472869873, "step": 794 }, { "epoch": 0.24501731435167373, "grad_norm": 7.53125, "learning_rate": 4.911229543208598e-06, "loss": 1.637013554573059, "step": 796 }, { "epoch": 0.2456329357445171, "grad_norm": 8.0625, "learning_rate": 4.9106412078932785e-06, "loss": 1.5050179958343506, "step": 798 }, { "epoch": 0.24624855713736052, "grad_norm": 7.9375, "learning_rate": 4.9100509739309635e-06, "loss": 1.5874851942062378, "step": 800 }, { "epoch": 0.24686417853020393, "grad_norm": 7.96875, "learning_rate": 4.909458841908179e-06, "loss": 1.6707507371902466, "step": 802 }, { "epoch": 0.24747979992304733, "grad_norm": 6.0, "learning_rate": 4.908864812413341e-06, "loss": 1.627681016921997, "step": 804 }, { "epoch": 0.24809542131589074, "grad_norm": 164.0, "learning_rate": 4.908268886036751e-06, "loss": 1.8221757411956787, "step": 806 }, { "epoch": 0.24871104270873412, "grad_norm": 4.46875, "learning_rate": 4.907671063370592e-06, "loss": 1.4139869213104248, "step": 808 }, { "epoch": 0.24932666410157753, "grad_norm": 20.625, "learning_rate": 4.907071345008938e-06, "loss": 1.001795768737793, "step": 810 }, { "epoch": 0.24994228549442093, "grad_norm": 7.40625, "learning_rate": 4.906469731547738e-06, "loss": 1.647517442703247, "step": 812 }, { "epoch": 0.2505579068872643, "grad_norm": 16.125, "learning_rate": 4.905866223584831e-06, "loss": 1.1413493156433105, "step": 814 }, { "epoch": 0.25117352828010775, "grad_norm": 3.265625, "learning_rate": 4.905260821719936e-06, "loss": 1.199564814567566, "step": 816 }, { "epoch": 0.25178914967295113, "grad_norm": 10.8125, "learning_rate": 4.904653526554655e-06, "loss": 1.039031982421875, "step": 818 }, { "epoch": 0.2524047710657945, "grad_norm": 2.234375, "learning_rate": 4.9040443386924694e-06, "loss": 1.157308578491211, "step": 820 }, { "epoch": 0.25302039245863794, "grad_norm": 8.6875, "learning_rate": 4.903433258738744e-06, "loss": 1.594495177268982, "step": 822 }, { "epoch": 0.2536360138514813, "grad_norm": 2.703125, "learning_rate": 4.9028202873007216e-06, "loss": 1.30024254322052, "step": 824 }, { "epoch": 0.25425163524432476, "grad_norm": 8.5625, "learning_rate": 4.902205424987528e-06, "loss": 1.2227691411972046, "step": 826 }, { "epoch": 0.25486725663716814, "grad_norm": 6.03125, "learning_rate": 4.901588672410163e-06, "loss": 0.6920562386512756, "step": 828 }, { "epoch": 0.2554828780300115, "grad_norm": 9.0625, "learning_rate": 4.900970030181509e-06, "loss": 1.546901822090149, "step": 830 }, { "epoch": 0.25609849942285495, "grad_norm": 5.625, "learning_rate": 4.900349498916324e-06, "loss": 1.3025083541870117, "step": 832 }, { "epoch": 0.25671412081569833, "grad_norm": 8.5625, "learning_rate": 4.899727079231244e-06, "loss": 1.447066307067871, "step": 834 }, { "epoch": 0.25732974220854177, "grad_norm": 7.28125, "learning_rate": 4.899102771744781e-06, "loss": 1.023911476135254, "step": 836 }, { "epoch": 0.25794536360138515, "grad_norm": 13.875, "learning_rate": 4.898476577077325e-06, "loss": 0.6992422342300415, "step": 838 }, { "epoch": 0.2585609849942285, "grad_norm": 11.75, "learning_rate": 4.897848495851137e-06, "loss": 1.657697081565857, "step": 840 }, { "epoch": 0.25917660638707196, "grad_norm": 10.6875, "learning_rate": 4.897218528690357e-06, "loss": 1.3900957107543945, "step": 842 }, { "epoch": 0.25979222777991534, "grad_norm": 6.0, "learning_rate": 4.896586676220998e-06, "loss": 1.4221551418304443, "step": 844 }, { "epoch": 0.2604078491727588, "grad_norm": 9.6875, "learning_rate": 4.895952939070946e-06, "loss": 1.4861396551132202, "step": 846 }, { "epoch": 0.26102347056560216, "grad_norm": 11.9375, "learning_rate": 4.8953173178699575e-06, "loss": 1.8264024257659912, "step": 848 }, { "epoch": 0.26163909195844554, "grad_norm": 2.734375, "learning_rate": 4.894679813249666e-06, "loss": 1.1094304323196411, "step": 850 }, { "epoch": 0.26225471335128897, "grad_norm": 10.6875, "learning_rate": 4.8940404258435725e-06, "loss": 1.3908812999725342, "step": 852 }, { "epoch": 0.26287033474413235, "grad_norm": 12.625, "learning_rate": 4.893399156287052e-06, "loss": 1.177417278289795, "step": 854 }, { "epoch": 0.2634859561369758, "grad_norm": 10.375, "learning_rate": 4.892756005217347e-06, "loss": 1.703174114227295, "step": 856 }, { "epoch": 0.26410157752981916, "grad_norm": 6.03125, "learning_rate": 4.892110973273573e-06, "loss": 1.7122807502746582, "step": 858 }, { "epoch": 0.26471719892266254, "grad_norm": 1.515625, "learning_rate": 4.891464061096711e-06, "loss": 1.5598692893981934, "step": 860 }, { "epoch": 0.265332820315506, "grad_norm": 9.0625, "learning_rate": 4.890815269329613e-06, "loss": 1.562238335609436, "step": 862 }, { "epoch": 0.26594844170834936, "grad_norm": 8.125, "learning_rate": 4.890164598616997e-06, "loss": 1.5305628776550293, "step": 864 }, { "epoch": 0.2665640631011928, "grad_norm": 7.9375, "learning_rate": 4.88951204960545e-06, "loss": 0.80830979347229, "step": 866 }, { "epoch": 0.2671796844940362, "grad_norm": 3.21875, "learning_rate": 4.888857622943426e-06, "loss": 1.1971948146820068, "step": 868 }, { "epoch": 0.26779530588687955, "grad_norm": 19.375, "learning_rate": 4.88820131928124e-06, "loss": 1.206146478652954, "step": 870 }, { "epoch": 0.268410927279723, "grad_norm": 5.1875, "learning_rate": 4.887543139271078e-06, "loss": 1.107580542564392, "step": 872 }, { "epoch": 0.26902654867256637, "grad_norm": 5.5, "learning_rate": 4.886883083566988e-06, "loss": 1.0649266242980957, "step": 874 }, { "epoch": 0.26964217006540975, "grad_norm": 6.125, "learning_rate": 4.88622115282488e-06, "loss": 0.9791417717933655, "step": 876 }, { "epoch": 0.2702577914582532, "grad_norm": 7.25, "learning_rate": 4.885557347702533e-06, "loss": 1.4648741483688354, "step": 878 }, { "epoch": 0.27087341285109656, "grad_norm": 15.1875, "learning_rate": 4.884891668859583e-06, "loss": 1.4896833896636963, "step": 880 }, { "epoch": 0.27148903424394, "grad_norm": 6.5625, "learning_rate": 4.88422411695753e-06, "loss": 0.9050318002700806, "step": 882 }, { "epoch": 0.2721046556367834, "grad_norm": 8.625, "learning_rate": 4.883554692659736e-06, "loss": 1.3426930904388428, "step": 884 }, { "epoch": 0.27272027702962676, "grad_norm": 3.84375, "learning_rate": 4.882883396631421e-06, "loss": 1.2128629684448242, "step": 886 }, { "epoch": 0.2733358984224702, "grad_norm": 6.96875, "learning_rate": 4.88221022953967e-06, "loss": 1.4272143840789795, "step": 888 }, { "epoch": 0.27395151981531357, "grad_norm": 7.4375, "learning_rate": 4.881535192053423e-06, "loss": 1.621766448020935, "step": 890 }, { "epoch": 0.274567141208157, "grad_norm": 6.375, "learning_rate": 4.880858284843477e-06, "loss": 1.4103642702102661, "step": 892 }, { "epoch": 0.2751827626010004, "grad_norm": 9.0625, "learning_rate": 4.8801795085824945e-06, "loss": 1.7125623226165771, "step": 894 }, { "epoch": 0.27579838399384377, "grad_norm": 6.40625, "learning_rate": 4.879498863944988e-06, "loss": 1.400087594985962, "step": 896 }, { "epoch": 0.2764140053866872, "grad_norm": 4.625, "learning_rate": 4.87881635160733e-06, "loss": 1.3133388757705688, "step": 898 }, { "epoch": 0.2770296267795306, "grad_norm": 11.0, "learning_rate": 4.878131972247747e-06, "loss": 1.203717589378357, "step": 900 }, { "epoch": 0.277645248172374, "grad_norm": 5.46875, "learning_rate": 4.8774457265463245e-06, "loss": 1.2662968635559082, "step": 902 }, { "epoch": 0.2782608695652174, "grad_norm": 4.8125, "learning_rate": 4.8767576151849985e-06, "loss": 1.1130250692367554, "step": 904 }, { "epoch": 0.2788764909580608, "grad_norm": 6.71875, "learning_rate": 4.876067638847561e-06, "loss": 1.460016131401062, "step": 906 }, { "epoch": 0.2794921123509042, "grad_norm": 3.375, "learning_rate": 4.875375798219658e-06, "loss": 1.1902095079421997, "step": 908 }, { "epoch": 0.2801077337437476, "grad_norm": 7.34375, "learning_rate": 4.874682093988786e-06, "loss": 1.420830488204956, "step": 910 }, { "epoch": 0.280723355136591, "grad_norm": 8.25, "learning_rate": 4.873986526844294e-06, "loss": 0.9275550246238708, "step": 912 }, { "epoch": 0.2813389765294344, "grad_norm": 7.15625, "learning_rate": 4.873289097477384e-06, "loss": 0.959951639175415, "step": 914 }, { "epoch": 0.2819545979222778, "grad_norm": 4.6875, "learning_rate": 4.872589806581106e-06, "loss": 0.9401820302009583, "step": 916 }, { "epoch": 0.2825702193151212, "grad_norm": 6.9375, "learning_rate": 4.871888654850362e-06, "loss": 1.7376612424850464, "step": 918 }, { "epoch": 0.2831858407079646, "grad_norm": 9.6875, "learning_rate": 4.871185642981901e-06, "loss": 1.251401424407959, "step": 920 }, { "epoch": 0.283801462100808, "grad_norm": 6.1875, "learning_rate": 4.870480771674324e-06, "loss": 1.3512121438980103, "step": 922 }, { "epoch": 0.2844170834936514, "grad_norm": 6.53125, "learning_rate": 4.869774041628075e-06, "loss": 0.9417418241500854, "step": 924 }, { "epoch": 0.2850327048864948, "grad_norm": 10.125, "learning_rate": 4.869065453545447e-06, "loss": 1.549818515777588, "step": 926 }, { "epoch": 0.2856483262793382, "grad_norm": 14.4375, "learning_rate": 4.868355008130583e-06, "loss": 1.5407216548919678, "step": 928 }, { "epoch": 0.2862639476721816, "grad_norm": 6.46875, "learning_rate": 4.867642706089466e-06, "loss": 1.3004157543182373, "step": 930 }, { "epoch": 0.286879569065025, "grad_norm": 4.34375, "learning_rate": 4.866928548129927e-06, "loss": 1.1468735933303833, "step": 932 }, { "epoch": 0.2874951904578684, "grad_norm": 21.875, "learning_rate": 4.866212534961641e-06, "loss": 1.1942775249481201, "step": 934 }, { "epoch": 0.2881108118507118, "grad_norm": 6.96875, "learning_rate": 4.865494667296126e-06, "loss": 1.3379075527191162, "step": 936 }, { "epoch": 0.28872643324355524, "grad_norm": 3.515625, "learning_rate": 4.864774945846744e-06, "loss": 1.234367847442627, "step": 938 }, { "epoch": 0.2893420546363986, "grad_norm": 6.65625, "learning_rate": 4.864053371328697e-06, "loss": 1.3895533084869385, "step": 940 }, { "epoch": 0.289957676029242, "grad_norm": 7.40625, "learning_rate": 4.8633299444590324e-06, "loss": 1.0340811014175415, "step": 942 }, { "epoch": 0.29057329742208543, "grad_norm": 6.3125, "learning_rate": 4.862604665956633e-06, "loss": 0.9675121307373047, "step": 944 }, { "epoch": 0.2911889188149288, "grad_norm": 6.4375, "learning_rate": 4.8618775365422246e-06, "loss": 1.2580920457839966, "step": 946 }, { "epoch": 0.29180454020777224, "grad_norm": 15.4375, "learning_rate": 4.861148556938372e-06, "loss": 1.769187569618225, "step": 948 }, { "epoch": 0.2924201616006156, "grad_norm": 9.3125, "learning_rate": 4.860417727869481e-06, "loss": 1.3316900730133057, "step": 950 }, { "epoch": 0.293035782993459, "grad_norm": 11.875, "learning_rate": 4.85968505006179e-06, "loss": 1.470916748046875, "step": 952 }, { "epoch": 0.29365140438630244, "grad_norm": 6.53125, "learning_rate": 4.858950524243379e-06, "loss": 1.1799218654632568, "step": 954 }, { "epoch": 0.2942670257791458, "grad_norm": 12.25, "learning_rate": 4.858214151144161e-06, "loss": 1.172690510749817, "step": 956 }, { "epoch": 0.29488264717198925, "grad_norm": 6.1875, "learning_rate": 4.857475931495888e-06, "loss": 1.3986486196517944, "step": 958 }, { "epoch": 0.29549826856483263, "grad_norm": 7.1875, "learning_rate": 4.8567358660321465e-06, "loss": 1.6064389944076538, "step": 960 }, { "epoch": 0.296113889957676, "grad_norm": 11.5, "learning_rate": 4.8559939554883526e-06, "loss": 1.3742074966430664, "step": 962 }, { "epoch": 0.29672951135051945, "grad_norm": 7.71875, "learning_rate": 4.855250200601762e-06, "loss": 1.3658512830734253, "step": 964 }, { "epoch": 0.2973451327433628, "grad_norm": 7.375, "learning_rate": 4.854504602111461e-06, "loss": 1.521564245223999, "step": 966 }, { "epoch": 0.2979607541362062, "grad_norm": 5.59375, "learning_rate": 4.853757160758367e-06, "loss": 1.0583288669586182, "step": 968 }, { "epoch": 0.29857637552904964, "grad_norm": 7.8125, "learning_rate": 4.853007877285226e-06, "loss": 1.126049280166626, "step": 970 }, { "epoch": 0.299191996921893, "grad_norm": 4.78125, "learning_rate": 4.852256752436623e-06, "loss": 0.9671115875244141, "step": 972 }, { "epoch": 0.29980761831473646, "grad_norm": 6.40625, "learning_rate": 4.851503786958965e-06, "loss": 1.3904610872268677, "step": 974 }, { "epoch": 0.30042323970757984, "grad_norm": 8.3125, "learning_rate": 4.85074898160049e-06, "loss": 1.5142306089401245, "step": 976 }, { "epoch": 0.3010388611004232, "grad_norm": 8.75, "learning_rate": 4.849992337111267e-06, "loss": 1.1751598119735718, "step": 978 }, { "epoch": 0.30165448249326665, "grad_norm": 3.234375, "learning_rate": 4.849233854243189e-06, "loss": 1.5039445161819458, "step": 980 }, { "epoch": 0.30227010388611003, "grad_norm": 10.375, "learning_rate": 4.848473533749979e-06, "loss": 1.6068918704986572, "step": 982 }, { "epoch": 0.30288572527895347, "grad_norm": 4.96875, "learning_rate": 4.847711376387182e-06, "loss": 1.2263261079788208, "step": 984 }, { "epoch": 0.30350134667179685, "grad_norm": 13.625, "learning_rate": 4.846947382912173e-06, "loss": 1.4331692457199097, "step": 986 }, { "epoch": 0.3041169680646402, "grad_norm": 7.1875, "learning_rate": 4.846181554084147e-06, "loss": 1.3864588737487793, "step": 988 }, { "epoch": 0.30473258945748366, "grad_norm": 6.46875, "learning_rate": 4.845413890664129e-06, "loss": 1.1368046998977661, "step": 990 }, { "epoch": 0.30534821085032704, "grad_norm": 6.125, "learning_rate": 4.844644393414961e-06, "loss": 1.4460575580596924, "step": 992 }, { "epoch": 0.3059638322431705, "grad_norm": 9.375, "learning_rate": 4.84387306310131e-06, "loss": 1.6091135740280151, "step": 994 }, { "epoch": 0.30657945363601385, "grad_norm": 8.5, "learning_rate": 4.843099900489664e-06, "loss": 1.3350764513015747, "step": 996 }, { "epoch": 0.30719507502885723, "grad_norm": 5.375, "learning_rate": 4.842324906348333e-06, "loss": 1.414305329322815, "step": 998 }, { "epoch": 0.30781069642170067, "grad_norm": 38.25, "learning_rate": 4.841548081447445e-06, "loss": 1.2241977453231812, "step": 1000 }, { "epoch": 0.30842631781454405, "grad_norm": 5.96875, "learning_rate": 4.840769426558948e-06, "loss": 1.6980602741241455, "step": 1002 }, { "epoch": 0.3090419392073875, "grad_norm": 6.78125, "learning_rate": 4.839988942456609e-06, "loss": 1.1203399896621704, "step": 1004 }, { "epoch": 0.30965756060023086, "grad_norm": 19.25, "learning_rate": 4.839206629916015e-06, "loss": 1.7229124307632446, "step": 1006 }, { "epoch": 0.31027318199307424, "grad_norm": 2.1875, "learning_rate": 4.838422489714564e-06, "loss": 1.3425105810165405, "step": 1008 }, { "epoch": 0.3108888033859177, "grad_norm": 6.0625, "learning_rate": 4.837636522631475e-06, "loss": 1.2921473979949951, "step": 1010 }, { "epoch": 0.31150442477876106, "grad_norm": 6.375, "learning_rate": 4.8368487294477815e-06, "loss": 1.2659180164337158, "step": 1012 }, { "epoch": 0.31212004617160444, "grad_norm": 3.28125, "learning_rate": 4.836059110946332e-06, "loss": 1.1795529127120972, "step": 1014 }, { "epoch": 0.3127356675644479, "grad_norm": 9.75, "learning_rate": 4.835267667911786e-06, "loss": 1.4138458967208862, "step": 1016 }, { "epoch": 0.31335128895729125, "grad_norm": 19.75, "learning_rate": 4.83447440113062e-06, "loss": 1.4681267738342285, "step": 1018 }, { "epoch": 0.3139669103501347, "grad_norm": 16.75, "learning_rate": 4.833679311391121e-06, "loss": 1.5608904361724854, "step": 1020 }, { "epoch": 0.31458253174297807, "grad_norm": 4.875, "learning_rate": 4.832882399483385e-06, "loss": 1.239699125289917, "step": 1022 }, { "epoch": 0.31519815313582145, "grad_norm": 5.5, "learning_rate": 4.832083666199324e-06, "loss": 1.064995527267456, "step": 1024 }, { "epoch": 0.3158137745286649, "grad_norm": 10.1875, "learning_rate": 4.8312831123326565e-06, "loss": 0.7298382520675659, "step": 1026 }, { "epoch": 0.31642939592150826, "grad_norm": 5.84375, "learning_rate": 4.83048073867891e-06, "loss": 1.388184666633606, "step": 1028 }, { "epoch": 0.3170450173143517, "grad_norm": 7.09375, "learning_rate": 4.829676546035422e-06, "loss": 1.092585563659668, "step": 1030 }, { "epoch": 0.3176606387071951, "grad_norm": 4.6875, "learning_rate": 4.828870535201336e-06, "loss": 1.5091979503631592, "step": 1032 }, { "epoch": 0.31827626010003846, "grad_norm": 8.3125, "learning_rate": 4.828062706977605e-06, "loss": 1.4676424264907837, "step": 1034 }, { "epoch": 0.3188918814928819, "grad_norm": 7.46875, "learning_rate": 4.827253062166985e-06, "loss": 1.5855411291122437, "step": 1036 }, { "epoch": 0.31950750288572527, "grad_norm": 3.40625, "learning_rate": 4.826441601574035e-06, "loss": 1.0988051891326904, "step": 1038 }, { "epoch": 0.3201231242785687, "grad_norm": 12.5, "learning_rate": 4.825628326005126e-06, "loss": 1.5161454677581787, "step": 1040 }, { "epoch": 0.3207387456714121, "grad_norm": 11.125, "learning_rate": 4.824813236268425e-06, "loss": 1.51992928981781, "step": 1042 }, { "epoch": 0.32135436706425546, "grad_norm": 17.625, "learning_rate": 4.823996333173908e-06, "loss": 1.4970186948776245, "step": 1044 }, { "epoch": 0.3219699884570989, "grad_norm": 4.78125, "learning_rate": 4.823177617533348e-06, "loss": 1.2613115310668945, "step": 1046 }, { "epoch": 0.3225856098499423, "grad_norm": 13.4375, "learning_rate": 4.822357090160321e-06, "loss": 1.331524133682251, "step": 1048 }, { "epoch": 0.3232012312427857, "grad_norm": 5.78125, "learning_rate": 4.821534751870205e-06, "loss": 1.2422449588775635, "step": 1050 }, { "epoch": 0.3238168526356291, "grad_norm": 5.28125, "learning_rate": 4.8207106034801735e-06, "loss": 1.1214247941970825, "step": 1052 }, { "epoch": 0.3244324740284725, "grad_norm": 10.9375, "learning_rate": 4.819884645809203e-06, "loss": 0.9166821837425232, "step": 1054 }, { "epoch": 0.3250480954213159, "grad_norm": 5.59375, "learning_rate": 4.819056879678066e-06, "loss": 1.309885859489441, "step": 1056 }, { "epoch": 0.3256637168141593, "grad_norm": 4.15625, "learning_rate": 4.818227305909332e-06, "loss": 1.3728529214859009, "step": 1058 }, { "epoch": 0.32627933820700267, "grad_norm": 13.0625, "learning_rate": 4.817395925327367e-06, "loss": 1.260845422744751, "step": 1060 }, { "epoch": 0.3268949595998461, "grad_norm": 6.78125, "learning_rate": 4.8165627387583316e-06, "loss": 1.671797513961792, "step": 1062 }, { "epoch": 0.3275105809926895, "grad_norm": 9.0625, "learning_rate": 4.815727747030184e-06, "loss": 1.5508997440338135, "step": 1064 }, { "epoch": 0.3281262023855329, "grad_norm": 18.375, "learning_rate": 4.814890950972672e-06, "loss": 1.1947568655014038, "step": 1066 }, { "epoch": 0.3287418237783763, "grad_norm": 11.625, "learning_rate": 4.814052351417341e-06, "loss": 0.9314720630645752, "step": 1068 }, { "epoch": 0.3293574451712197, "grad_norm": 6.125, "learning_rate": 4.813211949197525e-06, "loss": 1.2254494428634644, "step": 1070 }, { "epoch": 0.3299730665640631, "grad_norm": 11.0, "learning_rate": 4.81236974514835e-06, "loss": 1.730814814567566, "step": 1072 }, { "epoch": 0.3305886879569065, "grad_norm": 10.75, "learning_rate": 4.811525740106734e-06, "loss": 1.7095310688018799, "step": 1074 }, { "epoch": 0.3312043093497499, "grad_norm": 8.5625, "learning_rate": 4.810679934911382e-06, "loss": 1.582645058631897, "step": 1076 }, { "epoch": 0.3318199307425933, "grad_norm": 13.8125, "learning_rate": 4.8098323304027915e-06, "loss": 1.3837974071502686, "step": 1078 }, { "epoch": 0.3324355521354367, "grad_norm": 1.6171875, "learning_rate": 4.808982927423246e-06, "loss": 1.4626548290252686, "step": 1080 }, { "epoch": 0.3330511735282801, "grad_norm": 9.6875, "learning_rate": 4.808131726816814e-06, "loss": 1.3742682933807373, "step": 1082 }, { "epoch": 0.3336667949211235, "grad_norm": 4.5, "learning_rate": 4.807278729429356e-06, "loss": 1.2568737268447876, "step": 1084 }, { "epoch": 0.33428241631396693, "grad_norm": 45.0, "learning_rate": 4.8064239361085115e-06, "loss": 1.3473093509674072, "step": 1086 }, { "epoch": 0.3348980377068103, "grad_norm": 15.875, "learning_rate": 4.80556734770371e-06, "loss": 1.001320719718933, "step": 1088 }, { "epoch": 0.3355136590996537, "grad_norm": 3.609375, "learning_rate": 4.804708965066162e-06, "loss": 1.238239049911499, "step": 1090 }, { "epoch": 0.33612928049249713, "grad_norm": 6.71875, "learning_rate": 4.803848789048861e-06, "loss": 1.1691683530807495, "step": 1092 }, { "epoch": 0.3367449018853405, "grad_norm": 3.796875, "learning_rate": 4.802986820506583e-06, "loss": 0.498982697725296, "step": 1094 }, { "epoch": 0.33736052327818394, "grad_norm": 9.1875, "learning_rate": 4.802123060295887e-06, "loss": 1.345341682434082, "step": 1096 }, { "epoch": 0.3379761446710273, "grad_norm": 4.40625, "learning_rate": 4.801257509275109e-06, "loss": 1.1568982601165771, "step": 1098 }, { "epoch": 0.3385917660638707, "grad_norm": 8.9375, "learning_rate": 4.8003901683043675e-06, "loss": 1.3907790184020996, "step": 1100 }, { "epoch": 0.33920738745671414, "grad_norm": 13.375, "learning_rate": 4.799521038245559e-06, "loss": 1.615919828414917, "step": 1102 }, { "epoch": 0.3398230088495575, "grad_norm": 11.125, "learning_rate": 4.798650119962357e-06, "loss": 1.0830153226852417, "step": 1104 }, { "epoch": 0.3404386302424009, "grad_norm": 20.75, "learning_rate": 4.797777414320213e-06, "loss": 1.1351860761642456, "step": 1106 }, { "epoch": 0.34105425163524433, "grad_norm": 5.25, "learning_rate": 4.796902922186353e-06, "loss": 1.2003566026687622, "step": 1108 }, { "epoch": 0.3416698730280877, "grad_norm": 7.125, "learning_rate": 4.7960266444297794e-06, "loss": 1.4255852699279785, "step": 1110 }, { "epoch": 0.34228549442093115, "grad_norm": 10.8125, "learning_rate": 4.79514858192127e-06, "loss": 1.2999197244644165, "step": 1112 }, { "epoch": 0.3429011158137745, "grad_norm": 5.65625, "learning_rate": 4.794268735533377e-06, "loss": 1.4916157722473145, "step": 1114 }, { "epoch": 0.3435167372066179, "grad_norm": 9.125, "learning_rate": 4.7933871061404204e-06, "loss": 1.7867798805236816, "step": 1116 }, { "epoch": 0.34413235859946134, "grad_norm": 6.3125, "learning_rate": 4.792503694618495e-06, "loss": 1.3523523807525635, "step": 1118 }, { "epoch": 0.3447479799923047, "grad_norm": 4.34375, "learning_rate": 4.791618501845469e-06, "loss": 1.5108684301376343, "step": 1120 }, { "epoch": 0.34536360138514816, "grad_norm": 2.671875, "learning_rate": 4.790731528700977e-06, "loss": 1.0168988704681396, "step": 1122 }, { "epoch": 0.34597922277799154, "grad_norm": 8.3125, "learning_rate": 4.789842776066425e-06, "loss": 1.5370982885360718, "step": 1124 }, { "epoch": 0.3465948441708349, "grad_norm": 9.0625, "learning_rate": 4.788952244824984e-06, "loss": 1.5379194021224976, "step": 1126 }, { "epoch": 0.34721046556367835, "grad_norm": 3.765625, "learning_rate": 4.788059935861597e-06, "loss": 1.3551844358444214, "step": 1128 }, { "epoch": 0.34782608695652173, "grad_norm": 34.5, "learning_rate": 4.78716585006297e-06, "loss": 1.744898796081543, "step": 1130 }, { "epoch": 0.34844170834936516, "grad_norm": 11.625, "learning_rate": 4.786269988317579e-06, "loss": 1.322486162185669, "step": 1132 }, { "epoch": 0.34905732974220854, "grad_norm": 11.6875, "learning_rate": 4.785372351515659e-06, "loss": 1.6481207609176636, "step": 1134 }, { "epoch": 0.3496729511350519, "grad_norm": 11.75, "learning_rate": 4.784472940549213e-06, "loss": 1.5510214567184448, "step": 1136 }, { "epoch": 0.35028857252789536, "grad_norm": 2.28125, "learning_rate": 4.7835717563120044e-06, "loss": 1.2969998121261597, "step": 1138 }, { "epoch": 0.35090419392073874, "grad_norm": 7.09375, "learning_rate": 4.782668799699563e-06, "loss": 1.3686840534210205, "step": 1140 }, { "epoch": 0.3515198153135822, "grad_norm": 2.3125, "learning_rate": 4.781764071609173e-06, "loss": 1.109289526939392, "step": 1142 }, { "epoch": 0.35213543670642555, "grad_norm": 27.0, "learning_rate": 4.7808575729398865e-06, "loss": 1.244390845298767, "step": 1144 }, { "epoch": 0.35275105809926893, "grad_norm": 25.5, "learning_rate": 4.779949304592511e-06, "loss": 1.4960665702819824, "step": 1146 }, { "epoch": 0.35336667949211237, "grad_norm": 4.0, "learning_rate": 4.779039267469612e-06, "loss": 1.4322017431259155, "step": 1148 }, { "epoch": 0.35398230088495575, "grad_norm": 10.75, "learning_rate": 4.778127462475513e-06, "loss": 1.4980297088623047, "step": 1150 }, { "epoch": 0.3545979222777991, "grad_norm": 15.5, "learning_rate": 4.777213890516299e-06, "loss": 1.6447110176086426, "step": 1152 }, { "epoch": 0.35521354367064256, "grad_norm": 7.15625, "learning_rate": 4.776298552499803e-06, "loss": 1.1732640266418457, "step": 1154 }, { "epoch": 0.35582916506348594, "grad_norm": 21.25, "learning_rate": 4.775381449335617e-06, "loss": 1.5481268167495728, "step": 1156 }, { "epoch": 0.3564447864563294, "grad_norm": 7.34375, "learning_rate": 4.77446258193509e-06, "loss": 1.207810401916504, "step": 1158 }, { "epoch": 0.35706040784917276, "grad_norm": 8.0, "learning_rate": 4.773541951211318e-06, "loss": 1.3997482061386108, "step": 1160 }, { "epoch": 0.35767602924201614, "grad_norm": 7.65625, "learning_rate": 4.772619558079154e-06, "loss": 1.4421358108520508, "step": 1162 }, { "epoch": 0.35829165063485957, "grad_norm": 8.625, "learning_rate": 4.771695403455201e-06, "loss": 1.5459229946136475, "step": 1164 }, { "epoch": 0.35890727202770295, "grad_norm": 5.96875, "learning_rate": 4.770769488257812e-06, "loss": 1.160110592842102, "step": 1166 }, { "epoch": 0.3595228934205464, "grad_norm": 3.6875, "learning_rate": 4.769841813407088e-06, "loss": 1.345860481262207, "step": 1168 }, { "epoch": 0.36013851481338977, "grad_norm": 9.75, "learning_rate": 4.768912379824882e-06, "loss": 1.449210524559021, "step": 1170 }, { "epoch": 0.36075413620623314, "grad_norm": 3.59375, "learning_rate": 4.767981188434791e-06, "loss": 1.6172339916229248, "step": 1172 }, { "epoch": 0.3613697575990766, "grad_norm": 15.3125, "learning_rate": 4.767048240162164e-06, "loss": 1.7002043724060059, "step": 1174 }, { "epoch": 0.36198537899191996, "grad_norm": 5.4375, "learning_rate": 4.7661135359340915e-06, "loss": 1.232842206954956, "step": 1176 }, { "epoch": 0.3626010003847634, "grad_norm": 5.375, "learning_rate": 4.7651770766794085e-06, "loss": 1.5186643600463867, "step": 1178 }, { "epoch": 0.3632166217776068, "grad_norm": 5.65625, "learning_rate": 4.764238863328696e-06, "loss": 1.2324049472808838, "step": 1180 }, { "epoch": 0.36383224317045015, "grad_norm": 6.4375, "learning_rate": 4.763298896814279e-06, "loss": 1.5353456735610962, "step": 1182 }, { "epoch": 0.3644478645632936, "grad_norm": 7.40625, "learning_rate": 4.762357178070221e-06, "loss": 1.3807618618011475, "step": 1184 }, { "epoch": 0.36506348595613697, "grad_norm": 15.1875, "learning_rate": 4.761413708032332e-06, "loss": 1.4120433330535889, "step": 1186 }, { "epoch": 0.3656791073489804, "grad_norm": 10.375, "learning_rate": 4.760468487638158e-06, "loss": 1.3134698867797852, "step": 1188 }, { "epoch": 0.3662947287418238, "grad_norm": 3.875, "learning_rate": 4.759521517826985e-06, "loss": 1.4803966283798218, "step": 1190 }, { "epoch": 0.36691035013466716, "grad_norm": 18.125, "learning_rate": 4.7585727995398376e-06, "loss": 1.1557652950286865, "step": 1192 }, { "epoch": 0.3675259715275106, "grad_norm": 5.1875, "learning_rate": 4.75762233371948e-06, "loss": 1.406283974647522, "step": 1194 }, { "epoch": 0.368141592920354, "grad_norm": 6.28125, "learning_rate": 4.756670121310411e-06, "loss": 1.388087272644043, "step": 1196 }, { "epoch": 0.36875721431319736, "grad_norm": 14.4375, "learning_rate": 4.7557161632588655e-06, "loss": 1.896012783050537, "step": 1198 }, { "epoch": 0.3693728357060408, "grad_norm": 9.25, "learning_rate": 4.754760460512813e-06, "loss": 1.5799840688705444, "step": 1200 }, { "epoch": 0.36998845709888417, "grad_norm": 4.59375, "learning_rate": 4.753803014021956e-06, "loss": 1.3843291997909546, "step": 1202 }, { "epoch": 0.3706040784917276, "grad_norm": 11.4375, "learning_rate": 4.75284382473773e-06, "loss": 1.2989155054092407, "step": 1204 }, { "epoch": 0.371219699884571, "grad_norm": 8.5, "learning_rate": 4.751882893613305e-06, "loss": 1.5606530904769897, "step": 1206 }, { "epoch": 0.37183532127741437, "grad_norm": 6.875, "learning_rate": 4.75092022160358e-06, "loss": 1.6842151880264282, "step": 1208 }, { "epoch": 0.3724509426702578, "grad_norm": 8.5625, "learning_rate": 4.7499558096651796e-06, "loss": 1.3427581787109375, "step": 1210 }, { "epoch": 0.3730665640631012, "grad_norm": 5.25, "learning_rate": 4.748989658756467e-06, "loss": 1.4908112287521362, "step": 1212 }, { "epoch": 0.3736821854559446, "grad_norm": 7.5625, "learning_rate": 4.748021769837524e-06, "loss": 1.002152681350708, "step": 1214 }, { "epoch": 0.374297806848788, "grad_norm": 9.0, "learning_rate": 4.747052143870166e-06, "loss": 1.6298128366470337, "step": 1216 }, { "epoch": 0.3749134282416314, "grad_norm": 6.3125, "learning_rate": 4.746080781817929e-06, "loss": 1.0059893131256104, "step": 1218 }, { "epoch": 0.3755290496344748, "grad_norm": 6.65625, "learning_rate": 4.745107684646081e-06, "loss": 1.0655871629714966, "step": 1220 }, { "epoch": 0.3761446710273182, "grad_norm": 7.8125, "learning_rate": 4.744132853321608e-06, "loss": 1.5112130641937256, "step": 1222 }, { "epoch": 0.3767602924201616, "grad_norm": 5.0625, "learning_rate": 4.743156288813223e-06, "loss": 1.4251501560211182, "step": 1224 }, { "epoch": 0.377375913813005, "grad_norm": 3.4375, "learning_rate": 4.742177992091359e-06, "loss": 1.295206904411316, "step": 1226 }, { "epoch": 0.3779915352058484, "grad_norm": 5.4375, "learning_rate": 4.7411979641281724e-06, "loss": 1.2756762504577637, "step": 1228 }, { "epoch": 0.3786071565986918, "grad_norm": 4.75, "learning_rate": 4.7402162058975375e-06, "loss": 1.355668306350708, "step": 1230 }, { "epoch": 0.3792227779915352, "grad_norm": 13.5, "learning_rate": 4.7392327183750516e-06, "loss": 1.397149682044983, "step": 1232 }, { "epoch": 0.37983839938437863, "grad_norm": 8.0, "learning_rate": 4.738247502538027e-06, "loss": 1.4336791038513184, "step": 1234 }, { "epoch": 0.380454020777222, "grad_norm": 19.125, "learning_rate": 4.737260559365494e-06, "loss": 1.4073939323425293, "step": 1236 }, { "epoch": 0.3810696421700654, "grad_norm": 6.53125, "learning_rate": 4.736271889838201e-06, "loss": 1.9138761758804321, "step": 1238 }, { "epoch": 0.3816852635629088, "grad_norm": 9.25, "learning_rate": 4.735281494938612e-06, "loss": 1.3105454444885254, "step": 1240 }, { "epoch": 0.3823008849557522, "grad_norm": 4.5625, "learning_rate": 4.734289375650903e-06, "loss": 1.9018452167510986, "step": 1242 }, { "epoch": 0.3829165063485956, "grad_norm": 15.125, "learning_rate": 4.733295532960966e-06, "loss": 1.3528389930725098, "step": 1244 }, { "epoch": 0.383532127741439, "grad_norm": 7.15625, "learning_rate": 4.732299967856405e-06, "loss": 1.6921008825302124, "step": 1246 }, { "epoch": 0.3841477491342824, "grad_norm": 10.8125, "learning_rate": 4.731302681326535e-06, "loss": 1.2409745454788208, "step": 1248 }, { "epoch": 0.38476337052712584, "grad_norm": 5.6875, "learning_rate": 4.730303674362382e-06, "loss": 1.4924330711364746, "step": 1250 }, { "epoch": 0.3853789919199692, "grad_norm": 10.75, "learning_rate": 4.729302947956681e-06, "loss": 1.2952157258987427, "step": 1252 }, { "epoch": 0.3859946133128126, "grad_norm": 3.09375, "learning_rate": 4.7283005031038775e-06, "loss": 1.1797876358032227, "step": 1254 }, { "epoch": 0.38661023470565603, "grad_norm": 7.9375, "learning_rate": 4.727296340800123e-06, "loss": 1.361810564994812, "step": 1256 }, { "epoch": 0.3872258560984994, "grad_norm": 11.0, "learning_rate": 4.726290462043275e-06, "loss": 1.5369617938995361, "step": 1258 }, { "epoch": 0.38784147749134285, "grad_norm": 10.3125, "learning_rate": 4.725282867832899e-06, "loss": 1.7465407848358154, "step": 1260 }, { "epoch": 0.3884570988841862, "grad_norm": 4.8125, "learning_rate": 4.724273559170264e-06, "loss": 1.4688948392868042, "step": 1262 }, { "epoch": 0.3890727202770296, "grad_norm": 8.375, "learning_rate": 4.723262537058342e-06, "loss": 1.671391487121582, "step": 1264 }, { "epoch": 0.38968834166987304, "grad_norm": 3.296875, "learning_rate": 4.722249802501807e-06, "loss": 1.2540719509124756, "step": 1266 }, { "epoch": 0.3903039630627164, "grad_norm": 3.84375, "learning_rate": 4.72123535650704e-06, "loss": 1.1532858610153198, "step": 1268 }, { "epoch": 0.39091958445555985, "grad_norm": 4.84375, "learning_rate": 4.720219200082116e-06, "loss": 1.2491059303283691, "step": 1270 }, { "epoch": 0.39153520584840323, "grad_norm": 20.25, "learning_rate": 4.719201334236811e-06, "loss": 1.8230793476104736, "step": 1272 }, { "epoch": 0.3921508272412466, "grad_norm": 8.5, "learning_rate": 4.718181759982604e-06, "loss": 1.5554561614990234, "step": 1274 }, { "epoch": 0.39276644863409005, "grad_norm": 4.125, "learning_rate": 4.7171604783326674e-06, "loss": 1.3932298421859741, "step": 1276 }, { "epoch": 0.39338207002693343, "grad_norm": 19.75, "learning_rate": 4.716137490301872e-06, "loss": 1.1679414510726929, "step": 1278 }, { "epoch": 0.39399769141977686, "grad_norm": 4.75, "learning_rate": 4.715112796906784e-06, "loss": 1.247887134552002, "step": 1280 }, { "epoch": 0.39461331281262024, "grad_norm": 51.5, "learning_rate": 4.714086399165664e-06, "loss": 0.8747434616088867, "step": 1282 }, { "epoch": 0.3952289342054636, "grad_norm": 4.28125, "learning_rate": 4.713058298098467e-06, "loss": 1.1395118236541748, "step": 1284 }, { "epoch": 0.39584455559830706, "grad_norm": 5.4375, "learning_rate": 4.712028494726838e-06, "loss": 1.3030931949615479, "step": 1286 }, { "epoch": 0.39646017699115044, "grad_norm": 6.375, "learning_rate": 4.7109969900741185e-06, "loss": 1.2976770401000977, "step": 1288 }, { "epoch": 0.3970757983839938, "grad_norm": 4.90625, "learning_rate": 4.709963785165336e-06, "loss": 1.4521037340164185, "step": 1290 }, { "epoch": 0.39769141977683725, "grad_norm": 10.125, "learning_rate": 4.708928881027209e-06, "loss": 1.8213273286819458, "step": 1292 }, { "epoch": 0.39830704116968063, "grad_norm": 6.5625, "learning_rate": 4.707892278688148e-06, "loss": 1.713027834892273, "step": 1294 }, { "epoch": 0.39892266256252407, "grad_norm": 9.8125, "learning_rate": 4.706853979178244e-06, "loss": 1.3795325756072998, "step": 1296 }, { "epoch": 0.39953828395536745, "grad_norm": 6.03125, "learning_rate": 4.705813983529282e-06, "loss": 1.3580855131149292, "step": 1298 }, { "epoch": 0.4001539053482108, "grad_norm": 15.1875, "learning_rate": 4.704772292774726e-06, "loss": 1.2726483345031738, "step": 1300 }, { "epoch": 0.40076952674105426, "grad_norm": 5.46875, "learning_rate": 4.703728907949729e-06, "loss": 1.3816659450531006, "step": 1302 }, { "epoch": 0.40138514813389764, "grad_norm": 6.4375, "learning_rate": 4.702683830091127e-06, "loss": 1.3593336343765259, "step": 1304 }, { "epoch": 0.4020007695267411, "grad_norm": 7.46875, "learning_rate": 4.701637060237434e-06, "loss": 1.5321509838104248, "step": 1306 }, { "epoch": 0.40261639091958445, "grad_norm": 33.0, "learning_rate": 4.700588599428851e-06, "loss": 1.5812572240829468, "step": 1308 }, { "epoch": 0.40323201231242783, "grad_norm": 7.59375, "learning_rate": 4.699538448707258e-06, "loss": 1.2317802906036377, "step": 1310 }, { "epoch": 0.40384763370527127, "grad_norm": 12.375, "learning_rate": 4.698486609116212e-06, "loss": 1.445168375968933, "step": 1312 }, { "epoch": 0.40446325509811465, "grad_norm": 6.59375, "learning_rate": 4.697433081700949e-06, "loss": 1.0394923686981201, "step": 1314 }, { "epoch": 0.4050788764909581, "grad_norm": 1.984375, "learning_rate": 4.6963778675083815e-06, "loss": 1.3255703449249268, "step": 1316 }, { "epoch": 0.40569449788380146, "grad_norm": 15.5625, "learning_rate": 4.695320967587104e-06, "loss": 1.6242492198944092, "step": 1318 }, { "epoch": 0.40631011927664484, "grad_norm": 6.15625, "learning_rate": 4.694262382987377e-06, "loss": 1.4963597059249878, "step": 1320 }, { "epoch": 0.4069257406694883, "grad_norm": 6.5, "learning_rate": 4.693202114761143e-06, "loss": 1.311753511428833, "step": 1322 }, { "epoch": 0.40754136206233166, "grad_norm": 2.953125, "learning_rate": 4.692140163962012e-06, "loss": 1.1284924745559692, "step": 1324 }, { "epoch": 0.4081569834551751, "grad_norm": 6.84375, "learning_rate": 4.69107653164527e-06, "loss": 1.5050592422485352, "step": 1326 }, { "epoch": 0.4087726048480185, "grad_norm": 10.4375, "learning_rate": 4.6900112188678715e-06, "loss": 1.091437578201294, "step": 1328 }, { "epoch": 0.40938822624086185, "grad_norm": 3.9375, "learning_rate": 4.688944226688442e-06, "loss": 1.2527481317520142, "step": 1330 }, { "epoch": 0.4100038476337053, "grad_norm": 29.625, "learning_rate": 4.687875556167275e-06, "loss": 1.3746745586395264, "step": 1332 }, { "epoch": 0.41061946902654867, "grad_norm": 24.0, "learning_rate": 4.686805208366333e-06, "loss": 1.671664834022522, "step": 1334 }, { "epoch": 0.41123509041939205, "grad_norm": 8.875, "learning_rate": 4.685733184349245e-06, "loss": 1.5099245309829712, "step": 1336 }, { "epoch": 0.4118507118122355, "grad_norm": 6.46875, "learning_rate": 4.684659485181303e-06, "loss": 1.2871772050857544, "step": 1338 }, { "epoch": 0.41246633320507886, "grad_norm": 4.8125, "learning_rate": 4.683584111929469e-06, "loss": 1.2058351039886475, "step": 1340 }, { "epoch": 0.4130819545979223, "grad_norm": 5.28125, "learning_rate": 4.682507065662363e-06, "loss": 1.219216227531433, "step": 1342 }, { "epoch": 0.4136975759907657, "grad_norm": 9.5625, "learning_rate": 4.681428347450271e-06, "loss": 1.354074239730835, "step": 1344 }, { "epoch": 0.41431319738360906, "grad_norm": 7.65625, "learning_rate": 4.68034795836514e-06, "loss": 1.5927313566207886, "step": 1346 }, { "epoch": 0.4149288187764525, "grad_norm": 8.125, "learning_rate": 4.679265899480577e-06, "loss": 1.5225095748901367, "step": 1348 }, { "epoch": 0.41554444016929587, "grad_norm": 5.75, "learning_rate": 4.678182171871847e-06, "loss": 1.5918680429458618, "step": 1350 }, { "epoch": 0.4161600615621393, "grad_norm": 6.28125, "learning_rate": 4.677096776615875e-06, "loss": 1.3410913944244385, "step": 1352 }, { "epoch": 0.4167756829549827, "grad_norm": 3.765625, "learning_rate": 4.676009714791242e-06, "loss": 1.076409101486206, "step": 1354 }, { "epoch": 0.41739130434782606, "grad_norm": 2.90625, "learning_rate": 4.6749209874781864e-06, "loss": 1.2970428466796875, "step": 1356 }, { "epoch": 0.4180069257406695, "grad_norm": 16.0, "learning_rate": 4.6738305957586e-06, "loss": 1.6537973880767822, "step": 1358 }, { "epoch": 0.4186225471335129, "grad_norm": 11.5, "learning_rate": 4.672738540716032e-06, "loss": 1.1844156980514526, "step": 1360 }, { "epoch": 0.4192381685263563, "grad_norm": 6.78125, "learning_rate": 4.671644823435681e-06, "loss": 1.7953647375106812, "step": 1362 }, { "epoch": 0.4198537899191997, "grad_norm": 6.375, "learning_rate": 4.670549445004395e-06, "loss": 1.0888378620147705, "step": 1364 }, { "epoch": 0.4204694113120431, "grad_norm": 13.1875, "learning_rate": 4.669452406510681e-06, "loss": 1.3127028942108154, "step": 1366 }, { "epoch": 0.4210850327048865, "grad_norm": 6.78125, "learning_rate": 4.6683537090446875e-06, "loss": 1.6716501712799072, "step": 1368 }, { "epoch": 0.4217006540977299, "grad_norm": 9.75, "learning_rate": 4.667253353698216e-06, "loss": 1.297078251838684, "step": 1370 }, { "epoch": 0.4223162754905733, "grad_norm": 5.8125, "learning_rate": 4.666151341564713e-06, "loss": 1.1920912265777588, "step": 1372 }, { "epoch": 0.4229318968834167, "grad_norm": 14.4375, "learning_rate": 4.665047673739275e-06, "loss": 1.5134333372116089, "step": 1374 }, { "epoch": 0.4235475182762601, "grad_norm": 6.09375, "learning_rate": 4.66394235131864e-06, "loss": 1.4252889156341553, "step": 1376 }, { "epoch": 0.4241631396691035, "grad_norm": 7.5625, "learning_rate": 4.662835375401191e-06, "loss": 1.397026538848877, "step": 1378 }, { "epoch": 0.4247787610619469, "grad_norm": 5.21875, "learning_rate": 4.661726747086957e-06, "loss": 1.2420134544372559, "step": 1380 }, { "epoch": 0.4253943824547903, "grad_norm": 6.15625, "learning_rate": 4.660616467477604e-06, "loss": 1.0156877040863037, "step": 1382 }, { "epoch": 0.4260100038476337, "grad_norm": 5.34375, "learning_rate": 4.659504537676444e-06, "loss": 1.616020679473877, "step": 1384 }, { "epoch": 0.4266256252404771, "grad_norm": 4.28125, "learning_rate": 4.658390958788426e-06, "loss": 1.2731225490570068, "step": 1386 }, { "epoch": 0.4272412466333205, "grad_norm": 10.6875, "learning_rate": 4.6572757319201366e-06, "loss": 1.3901350498199463, "step": 1388 }, { "epoch": 0.4278568680261639, "grad_norm": 6.90625, "learning_rate": 4.656158858179805e-06, "loss": 1.0529388189315796, "step": 1390 }, { "epoch": 0.4284724894190073, "grad_norm": 4.4375, "learning_rate": 4.655040338677292e-06, "loss": 1.18723464012146, "step": 1392 }, { "epoch": 0.4290881108118507, "grad_norm": 7.875, "learning_rate": 4.6539201745240925e-06, "loss": 1.2830877304077148, "step": 1394 }, { "epoch": 0.4297037322046941, "grad_norm": 7.03125, "learning_rate": 4.652798366833344e-06, "loss": 1.3695780038833618, "step": 1396 }, { "epoch": 0.43031935359753754, "grad_norm": 7.84375, "learning_rate": 4.651674916719809e-06, "loss": 1.5429341793060303, "step": 1398 }, { "epoch": 0.4309349749903809, "grad_norm": 3.265625, "learning_rate": 4.650549825299886e-06, "loss": 1.2008081674575806, "step": 1400 }, { "epoch": 0.4315505963832243, "grad_norm": 9.25, "learning_rate": 4.649423093691603e-06, "loss": 1.5424776077270508, "step": 1402 }, { "epoch": 0.43216621777606773, "grad_norm": 6.28125, "learning_rate": 4.648294723014618e-06, "loss": 1.3717658519744873, "step": 1404 }, { "epoch": 0.4327818391689111, "grad_norm": 3.625, "learning_rate": 4.647164714390219e-06, "loss": 1.1719211339950562, "step": 1406 }, { "epoch": 0.43339746056175454, "grad_norm": 5.90625, "learning_rate": 4.6460330689413214e-06, "loss": 1.252063512802124, "step": 1408 }, { "epoch": 0.4340130819545979, "grad_norm": 8.125, "learning_rate": 4.644899787792465e-06, "loss": 1.3777265548706055, "step": 1410 }, { "epoch": 0.4346287033474413, "grad_norm": 28.25, "learning_rate": 4.643764872069819e-06, "loss": 1.412273645401001, "step": 1412 }, { "epoch": 0.43524432474028474, "grad_norm": 3.8125, "learning_rate": 4.642628322901171e-06, "loss": 1.134641170501709, "step": 1414 }, { "epoch": 0.4358599461331281, "grad_norm": 6.1875, "learning_rate": 4.64149014141594e-06, "loss": 1.6808122396469116, "step": 1416 }, { "epoch": 0.43647556752597155, "grad_norm": 9.875, "learning_rate": 4.640350328745159e-06, "loss": 1.5614060163497925, "step": 1418 }, { "epoch": 0.43709118891881493, "grad_norm": 11.4375, "learning_rate": 4.6392088860214865e-06, "loss": 1.5738409757614136, "step": 1420 }, { "epoch": 0.4377068103116583, "grad_norm": 11.125, "learning_rate": 4.638065814379201e-06, "loss": 1.6225403547286987, "step": 1422 }, { "epoch": 0.43832243170450175, "grad_norm": 6.46875, "learning_rate": 4.636921114954196e-06, "loss": 1.1251602172851562, "step": 1424 }, { "epoch": 0.4389380530973451, "grad_norm": 14.5, "learning_rate": 4.635774788883986e-06, "loss": 0.9046669602394104, "step": 1426 }, { "epoch": 0.43955367449018856, "grad_norm": 7.0, "learning_rate": 4.634626837307702e-06, "loss": 1.2894409894943237, "step": 1428 }, { "epoch": 0.44016929588303194, "grad_norm": 3.5, "learning_rate": 4.633477261366087e-06, "loss": 1.0991194248199463, "step": 1430 }, { "epoch": 0.4407849172758753, "grad_norm": 5.9375, "learning_rate": 4.632326062201502e-06, "loss": 1.2833058834075928, "step": 1432 }, { "epoch": 0.44140053866871876, "grad_norm": 7.71875, "learning_rate": 4.631173240957919e-06, "loss": 1.098385214805603, "step": 1434 }, { "epoch": 0.44201616006156214, "grad_norm": 6.125, "learning_rate": 4.630018798780923e-06, "loss": 1.4895821809768677, "step": 1436 }, { "epoch": 0.4426317814544055, "grad_norm": 6.15625, "learning_rate": 4.628862736817707e-06, "loss": 1.2825528383255005, "step": 1438 }, { "epoch": 0.44324740284724895, "grad_norm": 11.6875, "learning_rate": 4.627705056217079e-06, "loss": 1.4047847986221313, "step": 1440 }, { "epoch": 0.44386302424009233, "grad_norm": 4.5, "learning_rate": 4.626545758129449e-06, "loss": 1.418422818183899, "step": 1442 }, { "epoch": 0.44447864563293576, "grad_norm": 6.4375, "learning_rate": 4.62538484370684e-06, "loss": 1.5138154029846191, "step": 1444 }, { "epoch": 0.44509426702577914, "grad_norm": 5.09375, "learning_rate": 4.624222314102876e-06, "loss": 1.2055473327636719, "step": 1446 }, { "epoch": 0.4457098884186225, "grad_norm": 4.25, "learning_rate": 4.623058170472792e-06, "loss": 1.1256903409957886, "step": 1448 }, { "epoch": 0.44632550981146596, "grad_norm": 6.625, "learning_rate": 4.62189241397342e-06, "loss": 1.4850614070892334, "step": 1450 }, { "epoch": 0.44694113120430934, "grad_norm": 13.1875, "learning_rate": 4.6207250457632e-06, "loss": 1.3964794874191284, "step": 1452 }, { "epoch": 0.4475567525971528, "grad_norm": 3.328125, "learning_rate": 4.619556067002173e-06, "loss": 1.011217474937439, "step": 1454 }, { "epoch": 0.44817237398999615, "grad_norm": 11.125, "learning_rate": 4.6183854788519785e-06, "loss": 1.03416907787323, "step": 1456 }, { "epoch": 0.44878799538283953, "grad_norm": 4.96875, "learning_rate": 4.6172132824758565e-06, "loss": 1.2690712213516235, "step": 1458 }, { "epoch": 0.44940361677568297, "grad_norm": 31.125, "learning_rate": 4.616039479038644e-06, "loss": 1.6051621437072754, "step": 1460 }, { "epoch": 0.45001923816852635, "grad_norm": 7.375, "learning_rate": 4.614864069706777e-06, "loss": 1.3212553262710571, "step": 1462 }, { "epoch": 0.4506348595613698, "grad_norm": 7.96875, "learning_rate": 4.613687055648285e-06, "loss": 1.2707087993621826, "step": 1464 }, { "epoch": 0.45125048095421316, "grad_norm": 4.625, "learning_rate": 4.6125084380327935e-06, "loss": 1.2656570672988892, "step": 1466 }, { "epoch": 0.45186610234705654, "grad_norm": 5.84375, "learning_rate": 4.611328218031521e-06, "loss": 1.237438440322876, "step": 1468 }, { "epoch": 0.4524817237399, "grad_norm": 7.6875, "learning_rate": 4.6101463968172795e-06, "loss": 1.4386591911315918, "step": 1470 }, { "epoch": 0.45309734513274336, "grad_norm": 8.625, "learning_rate": 4.608962975564471e-06, "loss": 1.5615410804748535, "step": 1472 }, { "epoch": 0.4537129665255868, "grad_norm": 11.25, "learning_rate": 4.6077779554490875e-06, "loss": 1.4186005592346191, "step": 1474 }, { "epoch": 0.45432858791843017, "grad_norm": 11.5625, "learning_rate": 4.606591337648709e-06, "loss": 1.6277843713760376, "step": 1476 }, { "epoch": 0.45494420931127355, "grad_norm": 8.5, "learning_rate": 4.605403123342506e-06, "loss": 1.523292064666748, "step": 1478 }, { "epoch": 0.455559830704117, "grad_norm": 10.5625, "learning_rate": 4.604213313711232e-06, "loss": 1.4570480585098267, "step": 1480 }, { "epoch": 0.45617545209696037, "grad_norm": 6.0625, "learning_rate": 4.60302190993723e-06, "loss": 1.4717897176742554, "step": 1482 }, { "epoch": 0.45679107348980375, "grad_norm": 5.6875, "learning_rate": 4.601828913204421e-06, "loss": 1.3334667682647705, "step": 1484 }, { "epoch": 0.4574066948826472, "grad_norm": 7.3125, "learning_rate": 4.600634324698317e-06, "loss": 1.1617430448532104, "step": 1486 }, { "epoch": 0.45802231627549056, "grad_norm": 24.375, "learning_rate": 4.599438145606003e-06, "loss": 0.9103840589523315, "step": 1488 }, { "epoch": 0.458637937668334, "grad_norm": 8.875, "learning_rate": 4.5982403771161525e-06, "loss": 0.7573866248130798, "step": 1490 }, { "epoch": 0.4592535590611774, "grad_norm": 14.75, "learning_rate": 4.597041020419012e-06, "loss": 1.3226635456085205, "step": 1492 }, { "epoch": 0.45986918045402075, "grad_norm": 11.25, "learning_rate": 4.595840076706411e-06, "loss": 1.7974557876586914, "step": 1494 }, { "epoch": 0.4604848018468642, "grad_norm": 15.25, "learning_rate": 4.5946375471717545e-06, "loss": 1.465869665145874, "step": 1496 }, { "epoch": 0.46110042323970757, "grad_norm": 12.6875, "learning_rate": 4.593433433010021e-06, "loss": 1.2467507123947144, "step": 1498 }, { "epoch": 0.461716044632551, "grad_norm": 11.8125, "learning_rate": 4.592227735417768e-06, "loss": 0.9114681482315063, "step": 1500 }, { "epoch": 0.4623316660253944, "grad_norm": 15.125, "learning_rate": 4.591020455593123e-06, "loss": 1.6152416467666626, "step": 1502 }, { "epoch": 0.46294728741823776, "grad_norm": 16.125, "learning_rate": 4.589811594735785e-06, "loss": 1.343848466873169, "step": 1504 }, { "epoch": 0.4635629088110812, "grad_norm": 14.75, "learning_rate": 4.588601154047031e-06, "loss": 1.9935762882232666, "step": 1506 }, { "epoch": 0.4641785302039246, "grad_norm": 6.40625, "learning_rate": 4.5873891347296995e-06, "loss": 1.3867267370224, "step": 1508 }, { "epoch": 0.464794151596768, "grad_norm": 9.4375, "learning_rate": 4.586175537988204e-06, "loss": 1.250286340713501, "step": 1510 }, { "epoch": 0.4654097729896114, "grad_norm": 9.375, "learning_rate": 4.584960365028519e-06, "loss": 1.512013554573059, "step": 1512 }, { "epoch": 0.46602539438245477, "grad_norm": 8.5625, "learning_rate": 4.58374361705819e-06, "loss": 0.8709845542907715, "step": 1514 }, { "epoch": 0.4666410157752982, "grad_norm": 10.25, "learning_rate": 4.58252529528633e-06, "loss": 0.8513764142990112, "step": 1516 }, { "epoch": 0.4672566371681416, "grad_norm": 42.25, "learning_rate": 4.58130540092361e-06, "loss": 1.3510565757751465, "step": 1518 }, { "epoch": 0.467872258560985, "grad_norm": 15.5625, "learning_rate": 4.5800839351822665e-06, "loss": 1.391147494316101, "step": 1520 }, { "epoch": 0.4684878799538284, "grad_norm": 4.46875, "learning_rate": 4.578860899276097e-06, "loss": 1.5558403730392456, "step": 1522 }, { "epoch": 0.4691035013466718, "grad_norm": 15.875, "learning_rate": 4.577636294420462e-06, "loss": 1.5080465078353882, "step": 1524 }, { "epoch": 0.4697191227395152, "grad_norm": 6.5625, "learning_rate": 4.5764101218322765e-06, "loss": 1.4655946493148804, "step": 1526 }, { "epoch": 0.4703347441323586, "grad_norm": 8.9375, "learning_rate": 4.575182382730016e-06, "loss": 1.309800148010254, "step": 1528 }, { "epoch": 0.470950365525202, "grad_norm": 2.890625, "learning_rate": 4.573953078333712e-06, "loss": 0.9789494276046753, "step": 1530 }, { "epoch": 0.4715659869180454, "grad_norm": 9.0, "learning_rate": 4.572722209864955e-06, "loss": 1.3504770994186401, "step": 1532 }, { "epoch": 0.4721816083108888, "grad_norm": 13.875, "learning_rate": 4.571489778546883e-06, "loss": 1.7013717889785767, "step": 1534 }, { "epoch": 0.4727972297037322, "grad_norm": 10.25, "learning_rate": 4.57025578560419e-06, "loss": 1.694466471672058, "step": 1536 }, { "epoch": 0.4734128510965756, "grad_norm": 6.3125, "learning_rate": 4.569020232263127e-06, "loss": 1.8504588603973389, "step": 1538 }, { "epoch": 0.474028472489419, "grad_norm": 4.28125, "learning_rate": 4.567783119751487e-06, "loss": 1.4777541160583496, "step": 1540 }, { "epoch": 0.4746440938822624, "grad_norm": 3.109375, "learning_rate": 4.566544449298618e-06, "loss": 1.3929392099380493, "step": 1542 }, { "epoch": 0.4752597152751058, "grad_norm": 5.15625, "learning_rate": 4.565304222135414e-06, "loss": 1.2495237588882446, "step": 1544 }, { "epoch": 0.47587533666794923, "grad_norm": 2.25, "learning_rate": 4.5640624394943164e-06, "loss": 1.0778892040252686, "step": 1546 }, { "epoch": 0.4764909580607926, "grad_norm": 7.34375, "learning_rate": 4.562819102609314e-06, "loss": 1.4203083515167236, "step": 1548 }, { "epoch": 0.477106579453636, "grad_norm": 7.1875, "learning_rate": 4.5615742127159365e-06, "loss": 1.2454944849014282, "step": 1550 }, { "epoch": 0.47772220084647943, "grad_norm": 9.875, "learning_rate": 4.560327771051262e-06, "loss": 1.0231895446777344, "step": 1552 }, { "epoch": 0.4783378222393228, "grad_norm": 7.15625, "learning_rate": 4.5590797788539035e-06, "loss": 1.5143892765045166, "step": 1554 }, { "epoch": 0.47895344363216624, "grad_norm": 15.6875, "learning_rate": 4.55783023736402e-06, "loss": 1.607107400894165, "step": 1556 }, { "epoch": 0.4795690650250096, "grad_norm": 7.96875, "learning_rate": 4.556579147823311e-06, "loss": 1.1287742853164673, "step": 1558 }, { "epoch": 0.480184686417853, "grad_norm": 7.3125, "learning_rate": 4.55532651147501e-06, "loss": 1.1777609586715698, "step": 1560 }, { "epoch": 0.48080030781069644, "grad_norm": 8.5, "learning_rate": 4.554072329563891e-06, "loss": 1.5708506107330322, "step": 1562 }, { "epoch": 0.4814159292035398, "grad_norm": 9.0625, "learning_rate": 4.552816603336262e-06, "loss": 1.292683482170105, "step": 1564 }, { "epoch": 0.48203155059638325, "grad_norm": 2.953125, "learning_rate": 4.551559334039966e-06, "loss": 1.262157917022705, "step": 1566 }, { "epoch": 0.48264717198922663, "grad_norm": 4.1875, "learning_rate": 4.550300522924383e-06, "loss": 1.291046142578125, "step": 1568 }, { "epoch": 0.48326279338207, "grad_norm": 7.5, "learning_rate": 4.549040171240416e-06, "loss": 1.324859619140625, "step": 1570 }, { "epoch": 0.48387841477491345, "grad_norm": 7.9375, "learning_rate": 4.54777828024051e-06, "loss": 1.4657518863677979, "step": 1572 }, { "epoch": 0.4844940361677568, "grad_norm": 5.625, "learning_rate": 4.546514851178631e-06, "loss": 1.2068415880203247, "step": 1574 }, { "epoch": 0.4851096575606002, "grad_norm": 5.375, "learning_rate": 4.545249885310278e-06, "loss": 1.10733163356781, "step": 1576 }, { "epoch": 0.48572527895344364, "grad_norm": 6.15625, "learning_rate": 4.543983383892477e-06, "loss": 1.432086706161499, "step": 1578 }, { "epoch": 0.486340900346287, "grad_norm": 2.265625, "learning_rate": 4.542715348183776e-06, "loss": 1.1427652835845947, "step": 1580 }, { "epoch": 0.48695652173913045, "grad_norm": 6.0, "learning_rate": 4.541445779444252e-06, "loss": 1.2995665073394775, "step": 1582 }, { "epoch": 0.48757214313197383, "grad_norm": 6.375, "learning_rate": 4.540174678935506e-06, "loss": 1.3644684553146362, "step": 1584 }, { "epoch": 0.4881877645248172, "grad_norm": 4.40625, "learning_rate": 4.538902047920657e-06, "loss": 1.3820481300354004, "step": 1586 }, { "epoch": 0.48880338591766065, "grad_norm": 12.125, "learning_rate": 4.537627887664346e-06, "loss": 1.1909376382827759, "step": 1588 }, { "epoch": 0.48941900731050403, "grad_norm": 6.78125, "learning_rate": 4.536352199432737e-06, "loss": 1.278643250465393, "step": 1590 }, { "epoch": 0.49003462870334746, "grad_norm": 6.3125, "learning_rate": 4.535074984493508e-06, "loss": 1.367564082145691, "step": 1592 }, { "epoch": 0.49065025009619084, "grad_norm": 4.1875, "learning_rate": 4.533796244115858e-06, "loss": 1.0127589702606201, "step": 1594 }, { "epoch": 0.4912658714890342, "grad_norm": 8.75, "learning_rate": 4.532515979570498e-06, "loss": 0.5572319626808167, "step": 1596 }, { "epoch": 0.49188149288187766, "grad_norm": 8.125, "learning_rate": 4.5312341921296565e-06, "loss": 1.367499589920044, "step": 1598 }, { "epoch": 0.49249711427472104, "grad_norm": 15.0625, "learning_rate": 4.5299508830670745e-06, "loss": 1.641442060470581, "step": 1600 }, { "epoch": 0.4931127356675645, "grad_norm": 20.0, "learning_rate": 4.528666053658005e-06, "loss": 1.9643449783325195, "step": 1602 }, { "epoch": 0.49372835706040785, "grad_norm": 9.9375, "learning_rate": 4.5273797051792114e-06, "loss": 1.4262056350708008, "step": 1604 }, { "epoch": 0.49434397845325123, "grad_norm": 5.8125, "learning_rate": 4.526091838908968e-06, "loss": 1.4248754978179932, "step": 1606 }, { "epoch": 0.49495959984609467, "grad_norm": 4.71875, "learning_rate": 4.524802456127054e-06, "loss": 1.3877049684524536, "step": 1608 }, { "epoch": 0.49557522123893805, "grad_norm": 19.5, "learning_rate": 4.523511558114762e-06, "loss": 1.483176350593567, "step": 1610 }, { "epoch": 0.4961908426317815, "grad_norm": 4.84375, "learning_rate": 4.522219146154883e-06, "loss": 1.4454123973846436, "step": 1612 }, { "epoch": 0.49680646402462486, "grad_norm": 3.203125, "learning_rate": 4.520925221531716e-06, "loss": 1.0618836879730225, "step": 1614 }, { "epoch": 0.49742208541746824, "grad_norm": 9.4375, "learning_rate": 4.519629785531063e-06, "loss": 1.1982765197753906, "step": 1616 }, { "epoch": 0.4980377068103117, "grad_norm": 4.8125, "learning_rate": 4.518332839440231e-06, "loss": 1.1161822080612183, "step": 1618 }, { "epoch": 0.49865332820315506, "grad_norm": 8.1875, "learning_rate": 4.517034384548019e-06, "loss": 1.463593602180481, "step": 1620 }, { "epoch": 0.49926894959599843, "grad_norm": 11.0625, "learning_rate": 4.515734422144734e-06, "loss": 1.3318003416061401, "step": 1622 }, { "epoch": 0.49988457098884187, "grad_norm": 3.5, "learning_rate": 4.514432953522178e-06, "loss": 1.0682951211929321, "step": 1624 }, { "epoch": 0.5005001923816853, "grad_norm": 4.875, "learning_rate": 4.513129979973648e-06, "loss": 1.3080323934555054, "step": 1626 }, { "epoch": 0.5011158137745286, "grad_norm": 9.0625, "learning_rate": 4.51182550279394e-06, "loss": 1.5416233539581299, "step": 1628 }, { "epoch": 0.5017314351673721, "grad_norm": 7.78125, "learning_rate": 4.5105195232793405e-06, "loss": 1.6960456371307373, "step": 1630 }, { "epoch": 0.5023470565602155, "grad_norm": 46.5, "learning_rate": 4.509212042727632e-06, "loss": 0.8543944358825684, "step": 1632 }, { "epoch": 0.5029626779530588, "grad_norm": 12.25, "learning_rate": 4.5079030624380845e-06, "loss": 1.5435972213745117, "step": 1634 }, { "epoch": 0.5035782993459023, "grad_norm": 16.5, "learning_rate": 4.5065925837114645e-06, "loss": 1.229882001876831, "step": 1636 }, { "epoch": 0.5041939207387457, "grad_norm": 5.65625, "learning_rate": 4.5052806078500225e-06, "loss": 1.4174797534942627, "step": 1638 }, { "epoch": 0.504809542131589, "grad_norm": 4.46875, "learning_rate": 4.503967136157498e-06, "loss": 1.102844476699829, "step": 1640 }, { "epoch": 0.5054251635244325, "grad_norm": 9.5625, "learning_rate": 4.502652169939117e-06, "loss": 1.3901140689849854, "step": 1642 }, { "epoch": 0.5060407849172759, "grad_norm": 38.25, "learning_rate": 4.501335710501592e-06, "loss": 1.737921118736267, "step": 1644 }, { "epoch": 0.5066564063101193, "grad_norm": 4.25, "learning_rate": 4.500017759153118e-06, "loss": 0.5540255904197693, "step": 1646 }, { "epoch": 0.5072720277029626, "grad_norm": 5.1875, "learning_rate": 4.498698317203373e-06, "loss": 1.030527114868164, "step": 1648 }, { "epoch": 0.5078876490958061, "grad_norm": 5.28125, "learning_rate": 4.497377385963514e-06, "loss": 1.1004064083099365, "step": 1650 }, { "epoch": 0.5085032704886495, "grad_norm": 2.421875, "learning_rate": 4.496054966746183e-06, "loss": 1.3087314367294312, "step": 1652 }, { "epoch": 0.5091188918814928, "grad_norm": 10.0625, "learning_rate": 4.494731060865496e-06, "loss": 1.518137812614441, "step": 1654 }, { "epoch": 0.5097345132743363, "grad_norm": 4.84375, "learning_rate": 4.493405669637048e-06, "loss": 1.0819928646087646, "step": 1656 }, { "epoch": 0.5103501346671797, "grad_norm": 9.1875, "learning_rate": 4.49207879437791e-06, "loss": 1.685136318206787, "step": 1658 }, { "epoch": 0.510965756060023, "grad_norm": 4.46875, "learning_rate": 4.490750436406628e-06, "loss": 1.1140801906585693, "step": 1660 }, { "epoch": 0.5115813774528665, "grad_norm": 37.5, "learning_rate": 4.489420597043221e-06, "loss": 0.5321840047836304, "step": 1662 }, { "epoch": 0.5121969988457099, "grad_norm": 3.828125, "learning_rate": 4.48808927760918e-06, "loss": 1.404366374015808, "step": 1664 }, { "epoch": 0.5128126202385533, "grad_norm": 3.125, "learning_rate": 4.486756479427467e-06, "loss": 1.0384865999221802, "step": 1666 }, { "epoch": 0.5134282416313967, "grad_norm": 4.84375, "learning_rate": 4.485422203822515e-06, "loss": 1.2788240909576416, "step": 1668 }, { "epoch": 0.5140438630242401, "grad_norm": 2.796875, "learning_rate": 4.484086452120221e-06, "loss": 0.9911500215530396, "step": 1670 }, { "epoch": 0.5146594844170835, "grad_norm": 7.3125, "learning_rate": 4.482749225647952e-06, "loss": 1.3307186365127563, "step": 1672 }, { "epoch": 0.5152751058099269, "grad_norm": 8.375, "learning_rate": 4.481410525734541e-06, "loss": 1.4370609521865845, "step": 1674 }, { "epoch": 0.5158907272027703, "grad_norm": 9.4375, "learning_rate": 4.480070353710283e-06, "loss": 1.6557636260986328, "step": 1676 }, { "epoch": 0.5165063485956137, "grad_norm": 6.6875, "learning_rate": 4.478728710906938e-06, "loss": 1.4819470643997192, "step": 1678 }, { "epoch": 0.517121969988457, "grad_norm": 5.625, "learning_rate": 4.4773855986577255e-06, "loss": 1.5152249336242676, "step": 1680 }, { "epoch": 0.5177375913813005, "grad_norm": 7.25, "learning_rate": 4.476041018297327e-06, "loss": 1.5486692190170288, "step": 1682 }, { "epoch": 0.5183532127741439, "grad_norm": 7.53125, "learning_rate": 4.474694971161882e-06, "loss": 1.4799833297729492, "step": 1684 }, { "epoch": 0.5189688341669872, "grad_norm": 11.5, "learning_rate": 4.473347458588987e-06, "loss": 1.4414798021316528, "step": 1686 }, { "epoch": 0.5195844555598307, "grad_norm": 9.6875, "learning_rate": 4.471998481917698e-06, "loss": 1.2920417785644531, "step": 1688 }, { "epoch": 0.5202000769526741, "grad_norm": 4.625, "learning_rate": 4.47064804248852e-06, "loss": 1.2978482246398926, "step": 1690 }, { "epoch": 0.5208156983455176, "grad_norm": 3.75, "learning_rate": 4.4692961416434156e-06, "loss": 1.2956743240356445, "step": 1692 }, { "epoch": 0.5214313197383609, "grad_norm": 2.953125, "learning_rate": 4.467942780725801e-06, "loss": 1.393992304801941, "step": 1694 }, { "epoch": 0.5220469411312043, "grad_norm": 8.5625, "learning_rate": 4.46658796108054e-06, "loss": 1.42585027217865, "step": 1696 }, { "epoch": 0.5226625625240477, "grad_norm": 20.125, "learning_rate": 4.465231684053947e-06, "loss": 1.6024885177612305, "step": 1698 }, { "epoch": 0.5232781839168911, "grad_norm": 5.875, "learning_rate": 4.463873950993786e-06, "loss": 1.0916786193847656, "step": 1700 }, { "epoch": 0.5238938053097345, "grad_norm": 6.96875, "learning_rate": 4.462514763249265e-06, "loss": 1.3746522665023804, "step": 1702 }, { "epoch": 0.5245094267025779, "grad_norm": 3.953125, "learning_rate": 4.46115412217104e-06, "loss": 1.3183560371398926, "step": 1704 }, { "epoch": 0.5251250480954213, "grad_norm": 6.59375, "learning_rate": 4.459792029111211e-06, "loss": 1.0775420665740967, "step": 1706 }, { "epoch": 0.5257406694882647, "grad_norm": 1.8125, "learning_rate": 4.45842848542332e-06, "loss": 1.253765344619751, "step": 1708 }, { "epoch": 0.5263562908811081, "grad_norm": 39.25, "learning_rate": 4.457063492462352e-06, "loss": 1.7046610116958618, "step": 1710 }, { "epoch": 0.5269719122739516, "grad_norm": 5.28125, "learning_rate": 4.4556970515847305e-06, "loss": 1.16429603099823, "step": 1712 }, { "epoch": 0.5275875336667949, "grad_norm": 4.53125, "learning_rate": 4.454329164148317e-06, "loss": 1.1705595254898071, "step": 1714 }, { "epoch": 0.5282031550596383, "grad_norm": 7.5, "learning_rate": 4.452959831512414e-06, "loss": 1.4573957920074463, "step": 1716 }, { "epoch": 0.5288187764524818, "grad_norm": 2.71875, "learning_rate": 4.451589055037757e-06, "loss": 1.1135655641555786, "step": 1718 }, { "epoch": 0.5294343978453251, "grad_norm": 4.03125, "learning_rate": 4.4502168360865175e-06, "loss": 0.9940666556358337, "step": 1720 }, { "epoch": 0.5300500192381685, "grad_norm": 3.96875, "learning_rate": 4.448843176022299e-06, "loss": 1.332295536994934, "step": 1722 }, { "epoch": 0.530665640631012, "grad_norm": 7.03125, "learning_rate": 4.44746807621014e-06, "loss": 1.3662868738174438, "step": 1724 }, { "epoch": 0.5312812620238553, "grad_norm": 9.25, "learning_rate": 4.44609153801651e-06, "loss": 1.1173661947250366, "step": 1726 }, { "epoch": 0.5318968834166987, "grad_norm": 5.90625, "learning_rate": 4.4447135628093e-06, "loss": 1.3571062088012695, "step": 1728 }, { "epoch": 0.5325125048095422, "grad_norm": 16.0, "learning_rate": 4.44333415195784e-06, "loss": 1.3048704862594604, "step": 1730 }, { "epoch": 0.5331281262023856, "grad_norm": 7.125, "learning_rate": 4.441953306832879e-06, "loss": 1.6084398031234741, "step": 1732 }, { "epoch": 0.5337437475952289, "grad_norm": 13.75, "learning_rate": 4.440571028806594e-06, "loss": 1.482635498046875, "step": 1734 }, { "epoch": 0.5343593689880723, "grad_norm": 9.625, "learning_rate": 4.439187319252586e-06, "loss": 1.7756417989730835, "step": 1736 }, { "epoch": 0.5349749903809158, "grad_norm": 6.75, "learning_rate": 4.437802179545879e-06, "loss": 1.5119597911834717, "step": 1738 }, { "epoch": 0.5355906117737591, "grad_norm": 4.71875, "learning_rate": 4.436415611062916e-06, "loss": 1.1370530128479004, "step": 1740 }, { "epoch": 0.5362062331666025, "grad_norm": 13.0625, "learning_rate": 4.435027615181563e-06, "loss": 1.3913326263427734, "step": 1742 }, { "epoch": 0.536821854559446, "grad_norm": 2.875, "learning_rate": 4.4336381932811e-06, "loss": 1.1053932905197144, "step": 1744 }, { "epoch": 0.5374374759522893, "grad_norm": 6.40625, "learning_rate": 4.43224734674223e-06, "loss": 1.414675235748291, "step": 1746 }, { "epoch": 0.5380530973451327, "grad_norm": 5.6875, "learning_rate": 4.4308550769470645e-06, "loss": 1.3577004671096802, "step": 1748 }, { "epoch": 0.5386687187379762, "grad_norm": 5.71875, "learning_rate": 4.429461385279136e-06, "loss": 1.3203229904174805, "step": 1750 }, { "epoch": 0.5392843401308195, "grad_norm": 6.4375, "learning_rate": 4.428066273123387e-06, "loss": 1.2799878120422363, "step": 1752 }, { "epoch": 0.5398999615236629, "grad_norm": 2.203125, "learning_rate": 4.4266697418661705e-06, "loss": 1.2069662809371948, "step": 1754 }, { "epoch": 0.5405155829165064, "grad_norm": 4.90625, "learning_rate": 4.425271792895252e-06, "loss": 1.0387519598007202, "step": 1756 }, { "epoch": 0.5411312043093498, "grad_norm": 17.125, "learning_rate": 4.423872427599804e-06, "loss": 1.3242170810699463, "step": 1758 }, { "epoch": 0.5417468257021931, "grad_norm": 3.078125, "learning_rate": 4.422471647370406e-06, "loss": 1.1795176267623901, "step": 1760 }, { "epoch": 0.5423624470950366, "grad_norm": 10.6875, "learning_rate": 4.421069453599049e-06, "loss": 1.476176381111145, "step": 1762 }, { "epoch": 0.54297806848788, "grad_norm": 11.125, "learning_rate": 4.41966584767912e-06, "loss": 1.396909475326538, "step": 1764 }, { "epoch": 0.5435936898807233, "grad_norm": 12.5, "learning_rate": 4.418260831005415e-06, "loss": 1.7101448774337769, "step": 1766 }, { "epoch": 0.5442093112735668, "grad_norm": 5.34375, "learning_rate": 4.4168544049741304e-06, "loss": 1.282928228378296, "step": 1768 }, { "epoch": 0.5448249326664102, "grad_norm": 16.875, "learning_rate": 4.415446570982864e-06, "loss": 1.662355899810791, "step": 1770 }, { "epoch": 0.5454405540592535, "grad_norm": 5.25, "learning_rate": 4.414037330430611e-06, "loss": 1.2465815544128418, "step": 1772 }, { "epoch": 0.546056175452097, "grad_norm": 6.40625, "learning_rate": 4.412626684717768e-06, "loss": 1.2851492166519165, "step": 1774 }, { "epoch": 0.5466717968449404, "grad_norm": 15.625, "learning_rate": 4.4112146352461216e-06, "loss": 1.4682855606079102, "step": 1776 }, { "epoch": 0.5472874182377838, "grad_norm": 7.125, "learning_rate": 4.409801183418858e-06, "loss": 1.3794105052947998, "step": 1778 }, { "epoch": 0.5479030396306271, "grad_norm": 9.25, "learning_rate": 4.408386330640559e-06, "loss": 1.5041764974594116, "step": 1780 }, { "epoch": 0.5485186610234706, "grad_norm": 9.3125, "learning_rate": 4.40697007831719e-06, "loss": 1.758446216583252, "step": 1782 }, { "epoch": 0.549134282416314, "grad_norm": 97.5, "learning_rate": 4.4055524278561175e-06, "loss": 1.5930302143096924, "step": 1784 }, { "epoch": 0.5497499038091573, "grad_norm": 7.15625, "learning_rate": 4.40413338066609e-06, "loss": 1.1427586078643799, "step": 1786 }, { "epoch": 0.5503655252020008, "grad_norm": 7.46875, "learning_rate": 4.402712938157249e-06, "loss": 1.3729523420333862, "step": 1788 }, { "epoch": 0.5509811465948442, "grad_norm": 6.1875, "learning_rate": 4.401291101741116e-06, "loss": 1.446144700050354, "step": 1790 }, { "epoch": 0.5515967679876875, "grad_norm": 5.0625, "learning_rate": 4.399867872830607e-06, "loss": 1.4674649238586426, "step": 1792 }, { "epoch": 0.552212389380531, "grad_norm": 5.59375, "learning_rate": 4.398443252840011e-06, "loss": 1.0208038091659546, "step": 1794 }, { "epoch": 0.5528280107733744, "grad_norm": 9.6875, "learning_rate": 4.397017243185008e-06, "loss": 1.3345563411712646, "step": 1796 }, { "epoch": 0.5534436321662177, "grad_norm": 4.71875, "learning_rate": 4.395589845282656e-06, "loss": 1.4287055730819702, "step": 1798 }, { "epoch": 0.5540592535590612, "grad_norm": 6.46875, "learning_rate": 4.3941610605513905e-06, "loss": 1.1713486909866333, "step": 1800 }, { "epoch": 0.5546748749519046, "grad_norm": 7.875, "learning_rate": 4.392730890411029e-06, "loss": 1.1164500713348389, "step": 1802 }, { "epoch": 0.555290496344748, "grad_norm": 8.25, "learning_rate": 4.391299336282761e-06, "loss": 1.515763759613037, "step": 1804 }, { "epoch": 0.5559061177375914, "grad_norm": 7.0625, "learning_rate": 4.389866399589157e-06, "loss": 0.984856367111206, "step": 1806 }, { "epoch": 0.5565217391304348, "grad_norm": 8.625, "learning_rate": 4.388432081754155e-06, "loss": 1.7644293308258057, "step": 1808 }, { "epoch": 0.5571373605232782, "grad_norm": 3.421875, "learning_rate": 4.386996384203072e-06, "loss": 1.1138197183609009, "step": 1810 }, { "epoch": 0.5577529819161215, "grad_norm": 15.375, "learning_rate": 4.3855593083625904e-06, "loss": 1.775923490524292, "step": 1812 }, { "epoch": 0.558368603308965, "grad_norm": 10.5625, "learning_rate": 4.384120855660765e-06, "loss": 1.8163801431655884, "step": 1814 }, { "epoch": 0.5589842247018084, "grad_norm": 12.8125, "learning_rate": 4.382681027527021e-06, "loss": 1.5391318798065186, "step": 1816 }, { "epoch": 0.5595998460946517, "grad_norm": 5.75, "learning_rate": 4.381239825392144e-06, "loss": 1.098721981048584, "step": 1818 }, { "epoch": 0.5602154674874952, "grad_norm": 9.875, "learning_rate": 4.379797250688292e-06, "loss": 1.27237868309021, "step": 1820 }, { "epoch": 0.5608310888803386, "grad_norm": 8.375, "learning_rate": 4.378353304848982e-06, "loss": 1.4349870681762695, "step": 1822 }, { "epoch": 0.561446710273182, "grad_norm": 4.9375, "learning_rate": 4.376907989309097e-06, "loss": 1.4746294021606445, "step": 1824 }, { "epoch": 0.5620623316660254, "grad_norm": 12.4375, "learning_rate": 4.375461305504879e-06, "loss": 1.102752447128296, "step": 1826 }, { "epoch": 0.5626779530588688, "grad_norm": 8.1875, "learning_rate": 4.374013254873929e-06, "loss": 1.1980957984924316, "step": 1828 }, { "epoch": 0.5632935744517122, "grad_norm": 6.21875, "learning_rate": 4.372563838855207e-06, "loss": 1.722758412361145, "step": 1830 }, { "epoch": 0.5639091958445556, "grad_norm": 13.625, "learning_rate": 4.3711130588890315e-06, "loss": 1.632874846458435, "step": 1832 }, { "epoch": 0.564524817237399, "grad_norm": 4.53125, "learning_rate": 4.369660916417076e-06, "loss": 1.4207483530044556, "step": 1834 }, { "epoch": 0.5651404386302424, "grad_norm": 17.875, "learning_rate": 4.3682074128823645e-06, "loss": 1.476704716682434, "step": 1836 }, { "epoch": 0.5657560600230858, "grad_norm": 10.1875, "learning_rate": 4.3667525497292776e-06, "loss": 1.6292390823364258, "step": 1838 }, { "epoch": 0.5663716814159292, "grad_norm": 5.21875, "learning_rate": 4.365296328403546e-06, "loss": 1.3333988189697266, "step": 1840 }, { "epoch": 0.5669873028087726, "grad_norm": 7.3125, "learning_rate": 4.363838750352247e-06, "loss": 1.524222493171692, "step": 1842 }, { "epoch": 0.567602924201616, "grad_norm": 7.46875, "learning_rate": 4.362379817023811e-06, "loss": 1.0296413898468018, "step": 1844 }, { "epoch": 0.5682185455944594, "grad_norm": 3.71875, "learning_rate": 4.3609195298680115e-06, "loss": 1.0932461023330688, "step": 1846 }, { "epoch": 0.5688341669873028, "grad_norm": 12.5625, "learning_rate": 4.3594578903359695e-06, "loss": 1.5462793111801147, "step": 1848 }, { "epoch": 0.5694497883801463, "grad_norm": 8.375, "learning_rate": 4.357994899880149e-06, "loss": 1.412261962890625, "step": 1850 }, { "epoch": 0.5700654097729896, "grad_norm": 6.28125, "learning_rate": 4.356530559954356e-06, "loss": 1.2975988388061523, "step": 1852 }, { "epoch": 0.570681031165833, "grad_norm": 14.5, "learning_rate": 4.355064872013737e-06, "loss": 1.2473691701889038, "step": 1854 }, { "epoch": 0.5712966525586765, "grad_norm": 15.75, "learning_rate": 4.353597837514779e-06, "loss": 0.9717381000518799, "step": 1856 }, { "epoch": 0.5719122739515198, "grad_norm": 5.4375, "learning_rate": 4.3521294579153096e-06, "loss": 1.3041472434997559, "step": 1858 }, { "epoch": 0.5725278953443632, "grad_norm": 7.09375, "learning_rate": 4.350659734674488e-06, "loss": 1.1828478574752808, "step": 1860 }, { "epoch": 0.5731435167372066, "grad_norm": 7.875, "learning_rate": 4.3491886692528115e-06, "loss": 1.293772578239441, "step": 1862 }, { "epoch": 0.57375913813005, "grad_norm": 6.15625, "learning_rate": 4.347716263112112e-06, "loss": 1.188981533050537, "step": 1864 }, { "epoch": 0.5743747595228934, "grad_norm": 5.0625, "learning_rate": 4.346242517715551e-06, "loss": 1.2617030143737793, "step": 1866 }, { "epoch": 0.5749903809157368, "grad_norm": 4.4375, "learning_rate": 4.344767434527623e-06, "loss": 1.3794877529144287, "step": 1868 }, { "epoch": 0.5756060023085803, "grad_norm": 1.6171875, "learning_rate": 4.343291015014152e-06, "loss": 1.2357971668243408, "step": 1870 }, { "epoch": 0.5762216237014236, "grad_norm": 10.375, "learning_rate": 4.341813260642288e-06, "loss": 0.8696410655975342, "step": 1872 }, { "epoch": 0.576837245094267, "grad_norm": 5.375, "learning_rate": 4.340334172880508e-06, "loss": 1.406108021736145, "step": 1874 }, { "epoch": 0.5774528664871105, "grad_norm": 4.875, "learning_rate": 4.338853753198618e-06, "loss": 1.1402201652526855, "step": 1876 }, { "epoch": 0.5780684878799538, "grad_norm": 8.0, "learning_rate": 4.33737200306774e-06, "loss": 1.1661087274551392, "step": 1878 }, { "epoch": 0.5786841092727972, "grad_norm": 3.984375, "learning_rate": 4.3358889239603245e-06, "loss": 1.516743540763855, "step": 1880 }, { "epoch": 0.5792997306656407, "grad_norm": 6.65625, "learning_rate": 4.33440451735014e-06, "loss": 1.434889793395996, "step": 1882 }, { "epoch": 0.579915352058484, "grad_norm": 8.5625, "learning_rate": 4.332918784712276e-06, "loss": 1.2617021799087524, "step": 1884 }, { "epoch": 0.5805309734513274, "grad_norm": 4.375, "learning_rate": 4.331431727523136e-06, "loss": 1.2327024936676025, "step": 1886 }, { "epoch": 0.5811465948441709, "grad_norm": 12.8125, "learning_rate": 4.3299433472604445e-06, "loss": 1.5364668369293213, "step": 1888 }, { "epoch": 0.5817622162370142, "grad_norm": 5.625, "learning_rate": 4.3284536454032356e-06, "loss": 1.5384386777877808, "step": 1890 }, { "epoch": 0.5823778376298576, "grad_norm": 7.625, "learning_rate": 4.326962623431862e-06, "loss": 0.9961131811141968, "step": 1892 }, { "epoch": 0.582993459022701, "grad_norm": 3.375, "learning_rate": 4.325470282827984e-06, "loss": 1.555985450744629, "step": 1894 }, { "epoch": 0.5836090804155445, "grad_norm": 5.03125, "learning_rate": 4.323976625074574e-06, "loss": 1.347398281097412, "step": 1896 }, { "epoch": 0.5842247018083878, "grad_norm": 6.15625, "learning_rate": 4.3224816516559145e-06, "loss": 1.2748348712921143, "step": 1898 }, { "epoch": 0.5848403232012312, "grad_norm": 2.171875, "learning_rate": 4.320985364057593e-06, "loss": 1.444568157196045, "step": 1900 }, { "epoch": 0.5854559445940747, "grad_norm": 15.4375, "learning_rate": 4.3194877637665035e-06, "loss": 1.672950267791748, "step": 1902 }, { "epoch": 0.586071565986918, "grad_norm": 5.65625, "learning_rate": 4.317988852270845e-06, "loss": 1.6386940479278564, "step": 1904 }, { "epoch": 0.5866871873797614, "grad_norm": 10.1875, "learning_rate": 4.3164886310601224e-06, "loss": 1.4679235219955444, "step": 1906 }, { "epoch": 0.5873028087726049, "grad_norm": 7.75, "learning_rate": 4.3149871016251365e-06, "loss": 1.3325090408325195, "step": 1908 }, { "epoch": 0.5879184301654482, "grad_norm": 8.0, "learning_rate": 4.31348426545799e-06, "loss": 1.1321873664855957, "step": 1910 }, { "epoch": 0.5885340515582916, "grad_norm": 6.40625, "learning_rate": 4.311980124052087e-06, "loss": 1.2628101110458374, "step": 1912 }, { "epoch": 0.5891496729511351, "grad_norm": 5.625, "learning_rate": 4.310474678902126e-06, "loss": 1.4517428874969482, "step": 1914 }, { "epoch": 0.5897652943439785, "grad_norm": 2.25, "learning_rate": 4.3089679315041e-06, "loss": 1.2754348516464233, "step": 1916 }, { "epoch": 0.5903809157368218, "grad_norm": 21.125, "learning_rate": 4.307459883355299e-06, "loss": 1.384401559829712, "step": 1918 }, { "epoch": 0.5909965371296653, "grad_norm": 24.5, "learning_rate": 4.305950535954305e-06, "loss": 0.9517618417739868, "step": 1920 }, { "epoch": 0.5916121585225087, "grad_norm": 6.3125, "learning_rate": 4.30443989080099e-06, "loss": 1.3214876651763916, "step": 1922 }, { "epoch": 0.592227779915352, "grad_norm": 3.40625, "learning_rate": 4.3029279493965135e-06, "loss": 1.2694828510284424, "step": 1924 }, { "epoch": 0.5928434013081955, "grad_norm": 4.71875, "learning_rate": 4.301414713243328e-06, "loss": 1.17191481590271, "step": 1926 }, { "epoch": 0.5934590227010389, "grad_norm": 5.09375, "learning_rate": 4.299900183845171e-06, "loss": 1.352325677871704, "step": 1928 }, { "epoch": 0.5940746440938822, "grad_norm": 4.875, "learning_rate": 4.2983843627070625e-06, "loss": 1.3218797445297241, "step": 1930 }, { "epoch": 0.5946902654867257, "grad_norm": 37.25, "learning_rate": 4.2968672513353075e-06, "loss": 1.535480260848999, "step": 1932 }, { "epoch": 0.5953058868795691, "grad_norm": 9.3125, "learning_rate": 4.295348851237494e-06, "loss": 1.1567133665084839, "step": 1934 }, { "epoch": 0.5959215082724124, "grad_norm": 9.1875, "learning_rate": 4.293829163922491e-06, "loss": 1.4145631790161133, "step": 1936 }, { "epoch": 0.5965371296652558, "grad_norm": 12.125, "learning_rate": 4.2923081909004475e-06, "loss": 1.6092185974121094, "step": 1938 }, { "epoch": 0.5971527510580993, "grad_norm": 10.5, "learning_rate": 4.290785933682785e-06, "loss": 1.466023325920105, "step": 1940 }, { "epoch": 0.5977683724509427, "grad_norm": 4.5625, "learning_rate": 4.289262393782206e-06, "loss": 1.269843339920044, "step": 1942 }, { "epoch": 0.598383993843786, "grad_norm": 6.375, "learning_rate": 4.287737572712687e-06, "loss": 1.4272327423095703, "step": 1944 }, { "epoch": 0.5989996152366295, "grad_norm": 5.3125, "learning_rate": 4.2862114719894754e-06, "loss": 1.1759858131408691, "step": 1946 }, { "epoch": 0.5996152366294729, "grad_norm": 4.15625, "learning_rate": 4.284684093129093e-06, "loss": 1.0027165412902832, "step": 1948 }, { "epoch": 0.6002308580223162, "grad_norm": 8.5625, "learning_rate": 4.283155437649331e-06, "loss": 1.5012065172195435, "step": 1950 }, { "epoch": 0.6008464794151597, "grad_norm": 6.65625, "learning_rate": 4.281625507069247e-06, "loss": 1.0649147033691406, "step": 1952 }, { "epoch": 0.6014621008080031, "grad_norm": 13.5625, "learning_rate": 4.280094302909168e-06, "loss": 1.3106036186218262, "step": 1954 }, { "epoch": 0.6020777222008464, "grad_norm": 13.625, "learning_rate": 4.2785618266906844e-06, "loss": 1.2972180843353271, "step": 1956 }, { "epoch": 0.6026933435936899, "grad_norm": 5.5, "learning_rate": 4.277028079936656e-06, "loss": 1.08073091506958, "step": 1958 }, { "epoch": 0.6033089649865333, "grad_norm": 5.65625, "learning_rate": 4.2754930641711974e-06, "loss": 1.7052501440048218, "step": 1960 }, { "epoch": 0.6039245863793767, "grad_norm": 4.625, "learning_rate": 4.27395678091969e-06, "loss": 1.3487554788589478, "step": 1962 }, { "epoch": 0.6045402077722201, "grad_norm": 11.3125, "learning_rate": 4.272419231708773e-06, "loss": 1.341812014579773, "step": 1964 }, { "epoch": 0.6051558291650635, "grad_norm": 10.75, "learning_rate": 4.270880418066342e-06, "loss": 1.1420282125473022, "step": 1966 }, { "epoch": 0.6057714505579069, "grad_norm": 16.0, "learning_rate": 4.2693403415215525e-06, "loss": 0.7540841102600098, "step": 1968 }, { "epoch": 0.6063870719507503, "grad_norm": 6.8125, "learning_rate": 4.267799003604812e-06, "loss": 1.4339842796325684, "step": 1970 }, { "epoch": 0.6070026933435937, "grad_norm": 4.78125, "learning_rate": 4.266256405847784e-06, "loss": 1.4078599214553833, "step": 1972 }, { "epoch": 0.6076183147364371, "grad_norm": 2.25, "learning_rate": 4.264712549783381e-06, "loss": 0.9816248416900635, "step": 1974 }, { "epoch": 0.6082339361292805, "grad_norm": 5.1875, "learning_rate": 4.263167436945767e-06, "loss": 1.346351146697998, "step": 1976 }, { "epoch": 0.6088495575221239, "grad_norm": 28.375, "learning_rate": 4.261621068870355e-06, "loss": 0.8800836205482483, "step": 1978 }, { "epoch": 0.6094651789149673, "grad_norm": 7.0625, "learning_rate": 4.260073447093808e-06, "loss": 1.407514214515686, "step": 1980 }, { "epoch": 0.6100808003078106, "grad_norm": 5.125, "learning_rate": 4.258524573154031e-06, "loss": 1.2680063247680664, "step": 1982 }, { "epoch": 0.6106964217006541, "grad_norm": 13.8125, "learning_rate": 4.256974448590174e-06, "loss": 1.7805291414260864, "step": 1984 }, { "epoch": 0.6113120430934975, "grad_norm": 7.5625, "learning_rate": 4.255423074942632e-06, "loss": 1.780397653579712, "step": 1986 }, { "epoch": 0.611927664486341, "grad_norm": 9.1875, "learning_rate": 4.25387045375304e-06, "loss": 1.231481909751892, "step": 1988 }, { "epoch": 0.6125432858791843, "grad_norm": 6.8125, "learning_rate": 4.252316586564269e-06, "loss": 1.4257529973983765, "step": 1990 }, { "epoch": 0.6131589072720277, "grad_norm": 8.375, "learning_rate": 4.250761474920437e-06, "loss": 1.2827187776565552, "step": 1992 }, { "epoch": 0.6137745286648711, "grad_norm": 7.40625, "learning_rate": 4.249205120366888e-06, "loss": 1.349419355392456, "step": 1994 }, { "epoch": 0.6143901500577145, "grad_norm": 9.5625, "learning_rate": 4.2476475244502105e-06, "loss": 1.0123541355133057, "step": 1996 }, { "epoch": 0.6150057714505579, "grad_norm": 9.625, "learning_rate": 4.246088688718221e-06, "loss": 1.5329852104187012, "step": 1998 }, { "epoch": 0.6156213928434013, "grad_norm": 13.0625, "learning_rate": 4.244528614719968e-06, "loss": 1.7838776111602783, "step": 2000 }, { "epoch": 0.6162370142362447, "grad_norm": 11.25, "learning_rate": 4.242967304005734e-06, "loss": 1.074199914932251, "step": 2002 }, { "epoch": 0.6168526356290881, "grad_norm": 6.28125, "learning_rate": 4.241404758127029e-06, "loss": 1.3185136318206787, "step": 2004 }, { "epoch": 0.6174682570219315, "grad_norm": 7.53125, "learning_rate": 4.239840978636588e-06, "loss": 1.418949842453003, "step": 2006 }, { "epoch": 0.618083878414775, "grad_norm": 8.5625, "learning_rate": 4.238275967088375e-06, "loss": 1.2543166875839233, "step": 2008 }, { "epoch": 0.6186994998076183, "grad_norm": 3.375, "learning_rate": 4.2367097250375744e-06, "loss": 1.0142097473144531, "step": 2010 }, { "epoch": 0.6193151212004617, "grad_norm": 13.5625, "learning_rate": 4.2351422540406005e-06, "loss": 1.2836097478866577, "step": 2012 }, { "epoch": 0.6199307425933052, "grad_norm": 11.1875, "learning_rate": 4.23357355565508e-06, "loss": 1.5026246309280396, "step": 2014 }, { "epoch": 0.6205463639861485, "grad_norm": 8.6875, "learning_rate": 4.232003631439868e-06, "loss": 1.4889832735061646, "step": 2016 }, { "epoch": 0.6211619853789919, "grad_norm": 5.4375, "learning_rate": 4.2304324829550306e-06, "loss": 1.1584733724594116, "step": 2018 }, { "epoch": 0.6217776067718354, "grad_norm": 22.0, "learning_rate": 4.228860111761852e-06, "loss": 1.358148217201233, "step": 2020 }, { "epoch": 0.6223932281646787, "grad_norm": 10.125, "learning_rate": 4.2272865194228355e-06, "loss": 1.7109938859939575, "step": 2022 }, { "epoch": 0.6230088495575221, "grad_norm": 20.75, "learning_rate": 4.225711707501694e-06, "loss": 0.9796476364135742, "step": 2024 }, { "epoch": 0.6236244709503656, "grad_norm": 5.125, "learning_rate": 4.224135677563354e-06, "loss": 1.3197376728057861, "step": 2026 }, { "epoch": 0.6242400923432089, "grad_norm": 15.9375, "learning_rate": 4.22255843117395e-06, "loss": 1.6285589933395386, "step": 2028 }, { "epoch": 0.6248557137360523, "grad_norm": 10.0, "learning_rate": 4.220979969900828e-06, "loss": 1.7880958318710327, "step": 2030 }, { "epoch": 0.6254713351288957, "grad_norm": 8.5625, "learning_rate": 4.219400295312542e-06, "loss": 1.0276007652282715, "step": 2032 }, { "epoch": 0.6260869565217392, "grad_norm": 8.9375, "learning_rate": 4.217819408978848e-06, "loss": 1.206255316734314, "step": 2034 }, { "epoch": 0.6267025779145825, "grad_norm": 4.28125, "learning_rate": 4.216237312470709e-06, "loss": 1.2645046710968018, "step": 2036 }, { "epoch": 0.6273181993074259, "grad_norm": 6.75, "learning_rate": 4.214654007360289e-06, "loss": 1.246014952659607, "step": 2038 }, { "epoch": 0.6279338207002694, "grad_norm": 12.625, "learning_rate": 4.213069495220955e-06, "loss": 1.0208582878112793, "step": 2040 }, { "epoch": 0.6285494420931127, "grad_norm": 7.28125, "learning_rate": 4.211483777627272e-06, "loss": 0.9263346195220947, "step": 2042 }, { "epoch": 0.6291650634859561, "grad_norm": 4.1875, "learning_rate": 4.2098968561550025e-06, "loss": 1.1192724704742432, "step": 2044 }, { "epoch": 0.6297806848787996, "grad_norm": 23.5, "learning_rate": 4.208308732381106e-06, "loss": 1.644690990447998, "step": 2046 }, { "epoch": 0.6303963062716429, "grad_norm": 3.828125, "learning_rate": 4.206719407883737e-06, "loss": 1.3574427366256714, "step": 2048 }, { "epoch": 0.6310119276644863, "grad_norm": 14.0625, "learning_rate": 4.205128884242243e-06, "loss": 1.3872565031051636, "step": 2050 }, { "epoch": 0.6316275490573298, "grad_norm": 4.5, "learning_rate": 4.203537163037163e-06, "loss": 1.401400089263916, "step": 2052 }, { "epoch": 0.6322431704501732, "grad_norm": 3.0625, "learning_rate": 4.201944245850224e-06, "loss": 1.0317822694778442, "step": 2054 }, { "epoch": 0.6328587918430165, "grad_norm": 6.1875, "learning_rate": 4.200350134264347e-06, "loss": 1.2016314268112183, "step": 2056 }, { "epoch": 0.63347441323586, "grad_norm": 4.53125, "learning_rate": 4.198754829863635e-06, "loss": 1.081105351448059, "step": 2058 }, { "epoch": 0.6340900346287034, "grad_norm": 9.625, "learning_rate": 4.197158334233376e-06, "loss": 1.6976094245910645, "step": 2060 }, { "epoch": 0.6347056560215467, "grad_norm": 7.15625, "learning_rate": 4.195560648960046e-06, "loss": 1.545295238494873, "step": 2062 }, { "epoch": 0.6353212774143902, "grad_norm": 7.03125, "learning_rate": 4.1939617756313e-06, "loss": 1.5460435152053833, "step": 2064 }, { "epoch": 0.6359368988072336, "grad_norm": 2.984375, "learning_rate": 4.192361715835973e-06, "loss": 1.1674773693084717, "step": 2066 }, { "epoch": 0.6365525202000769, "grad_norm": 7.21875, "learning_rate": 4.190760471164081e-06, "loss": 1.2440143823623657, "step": 2068 }, { "epoch": 0.6371681415929203, "grad_norm": 17.125, "learning_rate": 4.189158043206818e-06, "loss": 1.123397946357727, "step": 2070 }, { "epoch": 0.6377837629857638, "grad_norm": 7.21875, "learning_rate": 4.187554433556552e-06, "loss": 1.394108772277832, "step": 2072 }, { "epoch": 0.6383993843786071, "grad_norm": 5.6875, "learning_rate": 4.185949643806824e-06, "loss": 1.698440670967102, "step": 2074 }, { "epoch": 0.6390150057714505, "grad_norm": 6.53125, "learning_rate": 4.184343675552351e-06, "loss": 1.7033265829086304, "step": 2076 }, { "epoch": 0.639630627164294, "grad_norm": 38.0, "learning_rate": 4.18273653038902e-06, "loss": 1.600013017654419, "step": 2078 }, { "epoch": 0.6402462485571374, "grad_norm": 5.75, "learning_rate": 4.1811282099138865e-06, "loss": 1.2738231420516968, "step": 2080 }, { "epoch": 0.6408618699499807, "grad_norm": 6.25, "learning_rate": 4.179518715725175e-06, "loss": 1.2792097330093384, "step": 2082 }, { "epoch": 0.6414774913428242, "grad_norm": 4.625, "learning_rate": 4.177908049422276e-06, "loss": 1.5177111625671387, "step": 2084 }, { "epoch": 0.6420931127356676, "grad_norm": 10.0625, "learning_rate": 4.176296212605744e-06, "loss": 1.3783992528915405, "step": 2086 }, { "epoch": 0.6427087341285109, "grad_norm": 7.9375, "learning_rate": 4.174683206877298e-06, "loss": 1.4645200967788696, "step": 2088 }, { "epoch": 0.6433243555213544, "grad_norm": 4.375, "learning_rate": 4.173069033839818e-06, "loss": 1.2238026857376099, "step": 2090 }, { "epoch": 0.6439399769141978, "grad_norm": 6.25, "learning_rate": 4.171453695097344e-06, "loss": 1.3408904075622559, "step": 2092 }, { "epoch": 0.6445555983070411, "grad_norm": 7.28125, "learning_rate": 4.169837192255073e-06, "loss": 1.7966108322143555, "step": 2094 }, { "epoch": 0.6451712196998846, "grad_norm": 8.9375, "learning_rate": 4.168219526919361e-06, "loss": 1.266811728477478, "step": 2096 }, { "epoch": 0.645786841092728, "grad_norm": 5.125, "learning_rate": 4.16660070069772e-06, "loss": 1.0839991569519043, "step": 2098 }, { "epoch": 0.6464024624855714, "grad_norm": 4.84375, "learning_rate": 4.164980715198812e-06, "loss": 1.306950330734253, "step": 2100 }, { "epoch": 0.6470180838784148, "grad_norm": 16.625, "learning_rate": 4.1633595720324525e-06, "loss": 1.7637819051742554, "step": 2102 }, { "epoch": 0.6476337052712582, "grad_norm": 8.9375, "learning_rate": 4.161737272809609e-06, "loss": 1.285837173461914, "step": 2104 }, { "epoch": 0.6482493266641016, "grad_norm": 29.375, "learning_rate": 4.160113819142398e-06, "loss": 1.4777885675430298, "step": 2106 }, { "epoch": 0.648864948056945, "grad_norm": 22.875, "learning_rate": 4.158489212644078e-06, "loss": 1.935289740562439, "step": 2108 }, { "epoch": 0.6494805694497884, "grad_norm": 9.8125, "learning_rate": 4.1568634549290585e-06, "loss": 1.1520946025848389, "step": 2110 }, { "epoch": 0.6500961908426318, "grad_norm": 52.5, "learning_rate": 4.155236547612893e-06, "loss": 1.2270162105560303, "step": 2112 }, { "epoch": 0.6507118122354751, "grad_norm": 18.125, "learning_rate": 4.153608492312273e-06, "loss": 1.6834357976913452, "step": 2114 }, { "epoch": 0.6513274336283186, "grad_norm": 16.75, "learning_rate": 4.151979290645037e-06, "loss": 1.7510524988174438, "step": 2116 }, { "epoch": 0.651943055021162, "grad_norm": 5.1875, "learning_rate": 4.150348944230157e-06, "loss": 1.317051649093628, "step": 2118 }, { "epoch": 0.6525586764140053, "grad_norm": 7.21875, "learning_rate": 4.148717454687744e-06, "loss": 1.1501628160476685, "step": 2120 }, { "epoch": 0.6531742978068488, "grad_norm": 2.046875, "learning_rate": 4.147084823639048e-06, "loss": 1.2211923599243164, "step": 2122 }, { "epoch": 0.6537899191996922, "grad_norm": 6.4375, "learning_rate": 4.1454510527064495e-06, "loss": 1.4013869762420654, "step": 2124 }, { "epoch": 0.6544055405925356, "grad_norm": 6.21875, "learning_rate": 4.143816143513463e-06, "loss": 1.236532211303711, "step": 2126 }, { "epoch": 0.655021161985379, "grad_norm": 5.09375, "learning_rate": 4.1421800976847355e-06, "loss": 1.4775946140289307, "step": 2128 }, { "epoch": 0.6556367833782224, "grad_norm": 4.90625, "learning_rate": 4.140542916846042e-06, "loss": 1.3135361671447754, "step": 2130 }, { "epoch": 0.6562524047710658, "grad_norm": 7.03125, "learning_rate": 4.138904602624286e-06, "loss": 1.6919790506362915, "step": 2132 }, { "epoch": 0.6568680261639092, "grad_norm": 3.03125, "learning_rate": 4.137265156647496e-06, "loss": 1.3405189514160156, "step": 2134 }, { "epoch": 0.6574836475567526, "grad_norm": 2.203125, "learning_rate": 4.135624580544829e-06, "loss": 1.3917291164398193, "step": 2136 }, { "epoch": 0.658099268949596, "grad_norm": 7.5, "learning_rate": 4.13398287594656e-06, "loss": 1.2286945581436157, "step": 2138 }, { "epoch": 0.6587148903424394, "grad_norm": 10.125, "learning_rate": 4.132340044484086e-06, "loss": 1.1168937683105469, "step": 2140 }, { "epoch": 0.6593305117352828, "grad_norm": 14.4375, "learning_rate": 4.130696087789929e-06, "loss": 1.4081250429153442, "step": 2142 }, { "epoch": 0.6599461331281262, "grad_norm": 7.40625, "learning_rate": 4.129051007497723e-06, "loss": 1.380058765411377, "step": 2144 }, { "epoch": 0.6605617545209697, "grad_norm": 16.5, "learning_rate": 4.127404805242224e-06, "loss": 1.6792216300964355, "step": 2146 }, { "epoch": 0.661177375913813, "grad_norm": 8.4375, "learning_rate": 4.1257574826592975e-06, "loss": 1.2235708236694336, "step": 2148 }, { "epoch": 0.6617929973066564, "grad_norm": 8.9375, "learning_rate": 4.124109041385925e-06, "loss": 1.3387386798858643, "step": 2150 }, { "epoch": 0.6624086186994999, "grad_norm": 17.0, "learning_rate": 4.122459483060201e-06, "loss": 1.3065528869628906, "step": 2152 }, { "epoch": 0.6630242400923432, "grad_norm": 6.46875, "learning_rate": 4.1208088093213275e-06, "loss": 1.2853572368621826, "step": 2154 }, { "epoch": 0.6636398614851866, "grad_norm": 11.0625, "learning_rate": 4.119157021809616e-06, "loss": 1.7220256328582764, "step": 2156 }, { "epoch": 0.66425548287803, "grad_norm": 4.96875, "learning_rate": 4.1175041221664855e-06, "loss": 1.3406869173049927, "step": 2158 }, { "epoch": 0.6648711042708734, "grad_norm": 4.875, "learning_rate": 4.11585011203446e-06, "loss": 1.4518709182739258, "step": 2160 }, { "epoch": 0.6654867256637168, "grad_norm": 4.65625, "learning_rate": 4.114194993057163e-06, "loss": 1.38444983959198, "step": 2162 }, { "epoch": 0.6661023470565602, "grad_norm": 10.3125, "learning_rate": 4.112538766879328e-06, "loss": 1.6982120275497437, "step": 2164 }, { "epoch": 0.6667179684494036, "grad_norm": 11.875, "learning_rate": 4.110881435146782e-06, "loss": 1.2242650985717773, "step": 2166 }, { "epoch": 0.667333589842247, "grad_norm": 6.46875, "learning_rate": 4.109222999506452e-06, "loss": 1.4547520875930786, "step": 2168 }, { "epoch": 0.6679492112350904, "grad_norm": 93.5, "learning_rate": 4.107563461606362e-06, "loss": 1.5448589324951172, "step": 2170 }, { "epoch": 0.6685648326279339, "grad_norm": 5.75, "learning_rate": 4.105902823095634e-06, "loss": 1.3755910396575928, "step": 2172 }, { "epoch": 0.6691804540207772, "grad_norm": 32.25, "learning_rate": 4.104241085624482e-06, "loss": 1.5684752464294434, "step": 2174 }, { "epoch": 0.6697960754136206, "grad_norm": 4.59375, "learning_rate": 4.102578250844209e-06, "loss": 1.2582721710205078, "step": 2176 }, { "epoch": 0.6704116968064641, "grad_norm": 5.3125, "learning_rate": 4.100914320407213e-06, "loss": 1.0611647367477417, "step": 2178 }, { "epoch": 0.6710273181993074, "grad_norm": 14.6875, "learning_rate": 4.099249295966981e-06, "loss": 1.7605546712875366, "step": 2180 }, { "epoch": 0.6716429395921508, "grad_norm": 13.4375, "learning_rate": 4.0975831791780815e-06, "loss": 1.76204514503479, "step": 2182 }, { "epoch": 0.6722585609849943, "grad_norm": 5.75, "learning_rate": 4.0959159716961735e-06, "loss": 1.4151215553283691, "step": 2184 }, { "epoch": 0.6728741823778376, "grad_norm": 5.96875, "learning_rate": 4.094247675177999e-06, "loss": 1.2722121477127075, "step": 2186 }, { "epoch": 0.673489803770681, "grad_norm": 9.0625, "learning_rate": 4.0925782912813815e-06, "loss": 1.2459155321121216, "step": 2188 }, { "epoch": 0.6741054251635245, "grad_norm": 4.8125, "learning_rate": 4.0909078216652245e-06, "loss": 1.1633710861206055, "step": 2190 }, { "epoch": 0.6747210465563679, "grad_norm": 9.0625, "learning_rate": 4.089236267989512e-06, "loss": 1.6654694080352783, "step": 2192 }, { "epoch": 0.6753366679492112, "grad_norm": 6.0, "learning_rate": 4.0875636319153025e-06, "loss": 1.6824748516082764, "step": 2194 }, { "epoch": 0.6759522893420546, "grad_norm": 9.875, "learning_rate": 4.085889915104735e-06, "loss": 0.9210827946662903, "step": 2196 }, { "epoch": 0.6765679107348981, "grad_norm": 7.65625, "learning_rate": 4.084215119221016e-06, "loss": 0.7590182423591614, "step": 2198 }, { "epoch": 0.6771835321277414, "grad_norm": 8.8125, "learning_rate": 4.0825392459284305e-06, "loss": 1.0328636169433594, "step": 2200 }, { "epoch": 0.6777991535205848, "grad_norm": 9.125, "learning_rate": 4.08086229689233e-06, "loss": 1.7246052026748657, "step": 2202 }, { "epoch": 0.6784147749134283, "grad_norm": 11.25, "learning_rate": 4.079184273779138e-06, "loss": 0.9623408913612366, "step": 2204 }, { "epoch": 0.6790303963062716, "grad_norm": 19.375, "learning_rate": 4.07750517825634e-06, "loss": 0.8454251289367676, "step": 2206 }, { "epoch": 0.679646017699115, "grad_norm": 8.0, "learning_rate": 4.075825011992495e-06, "loss": 1.83228600025177, "step": 2208 }, { "epoch": 0.6802616390919585, "grad_norm": 13.3125, "learning_rate": 4.074143776657219e-06, "loss": 1.3253730535507202, "step": 2210 }, { "epoch": 0.6808772604848018, "grad_norm": 5.4375, "learning_rate": 4.072461473921196e-06, "loss": 0.8865528106689453, "step": 2212 }, { "epoch": 0.6814928818776452, "grad_norm": 6.09375, "learning_rate": 4.070778105456164e-06, "loss": 1.3036880493164062, "step": 2214 }, { "epoch": 0.6821085032704887, "grad_norm": 16.875, "learning_rate": 4.0690936729349275e-06, "loss": 1.3068422079086304, "step": 2216 }, { "epoch": 0.6827241246633321, "grad_norm": 12.4375, "learning_rate": 4.067408178031342e-06, "loss": 1.1644279956817627, "step": 2218 }, { "epoch": 0.6833397460561754, "grad_norm": 3.96875, "learning_rate": 4.0657216224203255e-06, "loss": 1.2166335582733154, "step": 2220 }, { "epoch": 0.6839553674490189, "grad_norm": 7.03125, "learning_rate": 4.064034007777843e-06, "loss": 1.2225016355514526, "step": 2222 }, { "epoch": 0.6845709888418623, "grad_norm": 4.40625, "learning_rate": 4.062345335780915e-06, "loss": 1.2606008052825928, "step": 2224 }, { "epoch": 0.6851866102347056, "grad_norm": 6.53125, "learning_rate": 4.0606556081076145e-06, "loss": 1.2266736030578613, "step": 2226 }, { "epoch": 0.685802231627549, "grad_norm": 6.3125, "learning_rate": 4.058964826437059e-06, "loss": 1.3379666805267334, "step": 2228 }, { "epoch": 0.6864178530203925, "grad_norm": 11.0625, "learning_rate": 4.057272992449419e-06, "loss": 1.8048510551452637, "step": 2230 }, { "epoch": 0.6870334744132358, "grad_norm": 8.1875, "learning_rate": 4.0555801078259085e-06, "loss": 1.3982155323028564, "step": 2232 }, { "epoch": 0.6876490958060792, "grad_norm": 16.0, "learning_rate": 4.0538861742487815e-06, "loss": 1.7776756286621094, "step": 2234 }, { "epoch": 0.6882647171989227, "grad_norm": 26.125, "learning_rate": 4.052191193401339e-06, "loss": 1.7840557098388672, "step": 2236 }, { "epoch": 0.6888803385917661, "grad_norm": 10.75, "learning_rate": 4.050495166967922e-06, "loss": 1.3246575593948364, "step": 2238 }, { "epoch": 0.6894959599846094, "grad_norm": 7.125, "learning_rate": 4.04879809663391e-06, "loss": 1.4963364601135254, "step": 2240 }, { "epoch": 0.6901115813774529, "grad_norm": 12.8125, "learning_rate": 4.04709998408572e-06, "loss": 1.3805711269378662, "step": 2242 }, { "epoch": 0.6907272027702963, "grad_norm": 7.1875, "learning_rate": 4.045400831010804e-06, "loss": 1.2389583587646484, "step": 2244 }, { "epoch": 0.6913428241631396, "grad_norm": 8.0625, "learning_rate": 4.043700639097649e-06, "loss": 1.7255418300628662, "step": 2246 }, { "epoch": 0.6919584455559831, "grad_norm": 7.34375, "learning_rate": 4.0419994100357725e-06, "loss": 1.4835617542266846, "step": 2248 }, { "epoch": 0.6925740669488265, "grad_norm": 3.984375, "learning_rate": 4.0402971455157255e-06, "loss": 1.1938284635543823, "step": 2250 }, { "epoch": 0.6931896883416698, "grad_norm": 8.8125, "learning_rate": 4.038593847229088e-06, "loss": 1.6337261199951172, "step": 2252 }, { "epoch": 0.6938053097345133, "grad_norm": 13.75, "learning_rate": 4.036889516868461e-06, "loss": 0.6948480606079102, "step": 2254 }, { "epoch": 0.6944209311273567, "grad_norm": 2.546875, "learning_rate": 4.035184156127478e-06, "loss": 1.3340535163879395, "step": 2256 }, { "epoch": 0.6950365525202, "grad_norm": 5.875, "learning_rate": 4.0334777667007966e-06, "loss": 1.317762851715088, "step": 2258 }, { "epoch": 0.6956521739130435, "grad_norm": 7.125, "learning_rate": 4.031770350284091e-06, "loss": 1.231817364692688, "step": 2260 }, { "epoch": 0.6962677953058869, "grad_norm": 14.875, "learning_rate": 4.03006190857406e-06, "loss": 1.566786766052246, "step": 2262 }, { "epoch": 0.6968834166987303, "grad_norm": 10.25, "learning_rate": 4.028352443268422e-06, "loss": 1.5564160346984863, "step": 2264 }, { "epoch": 0.6974990380915737, "grad_norm": 3.296875, "learning_rate": 4.026641956065908e-06, "loss": 1.2401013374328613, "step": 2266 }, { "epoch": 0.6981146594844171, "grad_norm": 17.375, "learning_rate": 4.02493044866627e-06, "loss": 1.467827320098877, "step": 2268 }, { "epoch": 0.6987302808772605, "grad_norm": 6.8125, "learning_rate": 4.023217922770272e-06, "loss": 1.536721110343933, "step": 2270 }, { "epoch": 0.6993459022701038, "grad_norm": 7.28125, "learning_rate": 4.021504380079686e-06, "loss": 1.3971998691558838, "step": 2272 }, { "epoch": 0.6999615236629473, "grad_norm": 6.25, "learning_rate": 4.0197898222973e-06, "loss": 1.3819046020507812, "step": 2274 }, { "epoch": 0.7005771450557907, "grad_norm": 3.71875, "learning_rate": 4.018074251126908e-06, "loss": 1.3601036071777344, "step": 2276 }, { "epoch": 0.701192766448634, "grad_norm": 14.0, "learning_rate": 4.016357668273313e-06, "loss": 1.337782621383667, "step": 2278 }, { "epoch": 0.7018083878414775, "grad_norm": 3.203125, "learning_rate": 4.014640075442318e-06, "loss": 1.136154055595398, "step": 2280 }, { "epoch": 0.7024240092343209, "grad_norm": 4.78125, "learning_rate": 4.012921474340738e-06, "loss": 1.2227451801300049, "step": 2282 }, { "epoch": 0.7030396306271643, "grad_norm": 10.1875, "learning_rate": 4.011201866676383e-06, "loss": 0.9052634239196777, "step": 2284 }, { "epoch": 0.7036552520200077, "grad_norm": 5.0625, "learning_rate": 4.009481254158066e-06, "loss": 1.4201956987380981, "step": 2286 }, { "epoch": 0.7042708734128511, "grad_norm": 2.625, "learning_rate": 4.007759638495599e-06, "loss": 1.151263952255249, "step": 2288 }, { "epoch": 0.7048864948056945, "grad_norm": 2.734375, "learning_rate": 4.006037021399789e-06, "loss": 1.2394721508026123, "step": 2290 }, { "epoch": 0.7055021161985379, "grad_norm": 2.234375, "learning_rate": 4.004313404582439e-06, "loss": 1.0372200012207031, "step": 2292 }, { "epoch": 0.7061177375913813, "grad_norm": 6.25, "learning_rate": 4.002588789756349e-06, "loss": 1.233241081237793, "step": 2294 }, { "epoch": 0.7067333589842247, "grad_norm": 6.46875, "learning_rate": 4.000863178635301e-06, "loss": 1.434636116027832, "step": 2296 }, { "epoch": 0.7073489803770681, "grad_norm": 3.328125, "learning_rate": 3.9991365729340805e-06, "loss": 1.223391056060791, "step": 2298 }, { "epoch": 0.7079646017699115, "grad_norm": 7.3125, "learning_rate": 3.997408974368449e-06, "loss": 1.1761574745178223, "step": 2300 }, { "epoch": 0.7085802231627549, "grad_norm": 9.4375, "learning_rate": 3.995680384655162e-06, "loss": 1.2080774307250977, "step": 2302 }, { "epoch": 0.7091958445555983, "grad_norm": 8.4375, "learning_rate": 3.993950805511959e-06, "loss": 1.641865611076355, "step": 2304 }, { "epoch": 0.7098114659484417, "grad_norm": 9.875, "learning_rate": 3.992220238657559e-06, "loss": 1.4001866579055786, "step": 2306 }, { "epoch": 0.7104270873412851, "grad_norm": 4.125, "learning_rate": 3.990488685811667e-06, "loss": 1.180934190750122, "step": 2308 }, { "epoch": 0.7110427087341286, "grad_norm": 6.96875, "learning_rate": 3.9887561486949655e-06, "loss": 1.1157152652740479, "step": 2310 }, { "epoch": 0.7116583301269719, "grad_norm": 8.8125, "learning_rate": 3.987022629029115e-06, "loss": 1.4063937664031982, "step": 2312 }, { "epoch": 0.7122739515198153, "grad_norm": 4.03125, "learning_rate": 3.985288128536753e-06, "loss": 1.39382803440094, "step": 2314 }, { "epoch": 0.7128895729126588, "grad_norm": 4.09375, "learning_rate": 3.983552648941492e-06, "loss": 1.2778561115264893, "step": 2316 }, { "epoch": 0.7135051943055021, "grad_norm": 11.1875, "learning_rate": 3.981816191967917e-06, "loss": 1.3750758171081543, "step": 2318 }, { "epoch": 0.7141208156983455, "grad_norm": 14.6875, "learning_rate": 3.980078759341582e-06, "loss": 1.2443888187408447, "step": 2320 }, { "epoch": 0.714736437091189, "grad_norm": 7.1875, "learning_rate": 3.978340352789017e-06, "loss": 0.7416563630104065, "step": 2322 }, { "epoch": 0.7153520584840323, "grad_norm": 6.46875, "learning_rate": 3.976600974037711e-06, "loss": 1.4614344835281372, "step": 2324 }, { "epoch": 0.7159676798768757, "grad_norm": 10.5, "learning_rate": 3.974860624816126e-06, "loss": 1.3321248292922974, "step": 2326 }, { "epoch": 0.7165833012697191, "grad_norm": 11.375, "learning_rate": 3.973119306853687e-06, "loss": 1.6063240766525269, "step": 2328 }, { "epoch": 0.7171989226625626, "grad_norm": 6.15625, "learning_rate": 3.971377021880779e-06, "loss": 1.5237957239151, "step": 2330 }, { "epoch": 0.7178145440554059, "grad_norm": 4.75, "learning_rate": 3.96963377162875e-06, "loss": 1.20784330368042, "step": 2332 }, { "epoch": 0.7184301654482493, "grad_norm": 4.25, "learning_rate": 3.967889557829907e-06, "loss": 1.2961535453796387, "step": 2334 }, { "epoch": 0.7190457868410928, "grad_norm": 13.875, "learning_rate": 3.966144382217514e-06, "loss": 1.1308797597885132, "step": 2336 }, { "epoch": 0.7196614082339361, "grad_norm": 4.9375, "learning_rate": 3.964398246525791e-06, "loss": 1.4275403022766113, "step": 2338 }, { "epoch": 0.7202770296267795, "grad_norm": 4.34375, "learning_rate": 3.962651152489914e-06, "loss": 1.1467701196670532, "step": 2340 }, { "epoch": 0.720892651019623, "grad_norm": 11.4375, "learning_rate": 3.960903101846009e-06, "loss": 1.4461891651153564, "step": 2342 }, { "epoch": 0.7215082724124663, "grad_norm": 7.375, "learning_rate": 3.959154096331149e-06, "loss": 1.6958913803100586, "step": 2344 }, { "epoch": 0.7221238938053097, "grad_norm": 5.9375, "learning_rate": 3.957404137683366e-06, "loss": 1.124305248260498, "step": 2346 }, { "epoch": 0.7227395151981532, "grad_norm": 3.71875, "learning_rate": 3.955653227641629e-06, "loss": 1.18574857711792, "step": 2348 }, { "epoch": 0.7233551365909965, "grad_norm": 9.4375, "learning_rate": 3.953901367945858e-06, "loss": 1.2754722833633423, "step": 2350 }, { "epoch": 0.7239707579838399, "grad_norm": 6.6875, "learning_rate": 3.952148560336916e-06, "loss": 1.480337381362915, "step": 2352 }, { "epoch": 0.7245863793766834, "grad_norm": 5.15625, "learning_rate": 3.950394806556607e-06, "loss": 1.277705192565918, "step": 2354 }, { "epoch": 0.7252020007695268, "grad_norm": 8.375, "learning_rate": 3.948640108347673e-06, "loss": 1.3931273221969604, "step": 2356 }, { "epoch": 0.7258176221623701, "grad_norm": 4.3125, "learning_rate": 3.9468844674537995e-06, "loss": 1.2593852281570435, "step": 2358 }, { "epoch": 0.7264332435552135, "grad_norm": 11.125, "learning_rate": 3.945127885619605e-06, "loss": 1.9256591796875, "step": 2360 }, { "epoch": 0.727048864948057, "grad_norm": 7.375, "learning_rate": 3.943370364590646e-06, "loss": 1.451329231262207, "step": 2362 }, { "epoch": 0.7276644863409003, "grad_norm": 5.5625, "learning_rate": 3.941611906113409e-06, "loss": 1.3462300300598145, "step": 2364 }, { "epoch": 0.7282801077337437, "grad_norm": 3.90625, "learning_rate": 3.939852511935313e-06, "loss": 1.1861860752105713, "step": 2366 }, { "epoch": 0.7288957291265872, "grad_norm": 10.6875, "learning_rate": 3.938092183804709e-06, "loss": 1.5588297843933105, "step": 2368 }, { "epoch": 0.7295113505194305, "grad_norm": 5.09375, "learning_rate": 3.936330923470874e-06, "loss": 1.4599204063415527, "step": 2370 }, { "epoch": 0.7301269719122739, "grad_norm": 9.9375, "learning_rate": 3.934568732684011e-06, "loss": 1.70735764503479, "step": 2372 }, { "epoch": 0.7307425933051174, "grad_norm": 6.28125, "learning_rate": 3.932805613195249e-06, "loss": 1.2503821849822998, "step": 2374 }, { "epoch": 0.7313582146979608, "grad_norm": 8.4375, "learning_rate": 3.9310415667566405e-06, "loss": 1.5381474494934082, "step": 2376 }, { "epoch": 0.7319738360908041, "grad_norm": 7.4375, "learning_rate": 3.929276595121157e-06, "loss": 1.7803454399108887, "step": 2378 }, { "epoch": 0.7325894574836476, "grad_norm": 6.59375, "learning_rate": 3.927510700042689e-06, "loss": 1.4499974250793457, "step": 2380 }, { "epoch": 0.733205078876491, "grad_norm": 17.375, "learning_rate": 3.9257438832760485e-06, "loss": 1.590423822402954, "step": 2382 }, { "epoch": 0.7338207002693343, "grad_norm": 12.0625, "learning_rate": 3.923976146576961e-06, "loss": 1.1864643096923828, "step": 2384 }, { "epoch": 0.7344363216621778, "grad_norm": 5.8125, "learning_rate": 3.922207491702064e-06, "loss": 1.1859465837478638, "step": 2386 }, { "epoch": 0.7350519430550212, "grad_norm": 13.0, "learning_rate": 3.9204379204089095e-06, "loss": 1.6148601770401, "step": 2388 }, { "epoch": 0.7356675644478645, "grad_norm": 4.53125, "learning_rate": 3.918667434455962e-06, "loss": 1.1462031602859497, "step": 2390 }, { "epoch": 0.736283185840708, "grad_norm": 4.875, "learning_rate": 3.916896035602592e-06, "loss": 1.4192490577697754, "step": 2392 }, { "epoch": 0.7368988072335514, "grad_norm": 5.09375, "learning_rate": 3.915123725609079e-06, "loss": 1.2958197593688965, "step": 2394 }, { "epoch": 0.7375144286263947, "grad_norm": 9.5, "learning_rate": 3.913350506236606e-06, "loss": 1.2567051649093628, "step": 2396 }, { "epoch": 0.7381300500192381, "grad_norm": 7.6875, "learning_rate": 3.9115763792472615e-06, "loss": 1.1681147813796997, "step": 2398 }, { "epoch": 0.7387456714120816, "grad_norm": 4.71875, "learning_rate": 3.909801346404035e-06, "loss": 1.3985934257507324, "step": 2400 }, { "epoch": 0.739361292804925, "grad_norm": 6.96875, "learning_rate": 3.908025409470817e-06, "loss": 1.3959019184112549, "step": 2402 }, { "epoch": 0.7399769141977683, "grad_norm": 10.5, "learning_rate": 3.906248570212395e-06, "loss": 1.3087546825408936, "step": 2404 }, { "epoch": 0.7405925355906118, "grad_norm": 9.375, "learning_rate": 3.904470830394455e-06, "loss": 1.3431845903396606, "step": 2406 }, { "epoch": 0.7412081569834552, "grad_norm": 6.65625, "learning_rate": 3.902692191783576e-06, "loss": 1.2573740482330322, "step": 2408 }, { "epoch": 0.7418237783762985, "grad_norm": 8.6875, "learning_rate": 3.9009126561472325e-06, "loss": 1.419797420501709, "step": 2410 }, { "epoch": 0.742439399769142, "grad_norm": 5.90625, "learning_rate": 3.899132225253786e-06, "loss": 1.0288840532302856, "step": 2412 }, { "epoch": 0.7430550211619854, "grad_norm": 4.59375, "learning_rate": 3.897350900872494e-06, "loss": 1.1642630100250244, "step": 2414 }, { "epoch": 0.7436706425548287, "grad_norm": 8.9375, "learning_rate": 3.895568684773496e-06, "loss": 1.7512762546539307, "step": 2416 }, { "epoch": 0.7442862639476722, "grad_norm": 4.21875, "learning_rate": 3.893785578727821e-06, "loss": 1.2895947694778442, "step": 2418 }, { "epoch": 0.7449018853405156, "grad_norm": 4.6875, "learning_rate": 3.892001584507382e-06, "loss": 1.3287444114685059, "step": 2420 }, { "epoch": 0.745517506733359, "grad_norm": 12.0625, "learning_rate": 3.890216703884974e-06, "loss": 1.1935874223709106, "step": 2422 }, { "epoch": 0.7461331281262024, "grad_norm": 5.6875, "learning_rate": 3.888430938634272e-06, "loss": 1.219550371170044, "step": 2424 }, { "epoch": 0.7467487495190458, "grad_norm": 8.625, "learning_rate": 3.886644290529831e-06, "loss": 0.879625678062439, "step": 2426 }, { "epoch": 0.7473643709118892, "grad_norm": 5.875, "learning_rate": 3.884856761347084e-06, "loss": 1.354616641998291, "step": 2428 }, { "epoch": 0.7479799923047326, "grad_norm": 5.09375, "learning_rate": 3.883068352862338e-06, "loss": 0.8705620169639587, "step": 2430 }, { "epoch": 0.748595613697576, "grad_norm": 7.84375, "learning_rate": 3.8812790668527755e-06, "loss": 1.2265740633010864, "step": 2432 }, { "epoch": 0.7492112350904194, "grad_norm": 7.4375, "learning_rate": 3.879488905096449e-06, "loss": 1.355034351348877, "step": 2434 }, { "epoch": 0.7498268564832627, "grad_norm": 9.0, "learning_rate": 3.877697869372284e-06, "loss": 1.4637103080749512, "step": 2436 }, { "epoch": 0.7504424778761062, "grad_norm": 5.125, "learning_rate": 3.8759059614600705e-06, "loss": 1.2718815803527832, "step": 2438 }, { "epoch": 0.7510580992689496, "grad_norm": 17.375, "learning_rate": 3.87411318314047e-06, "loss": 1.2510896921157837, "step": 2440 }, { "epoch": 0.7516737206617929, "grad_norm": 15.6875, "learning_rate": 3.872319536195006e-06, "loss": 1.4513335227966309, "step": 2442 }, { "epoch": 0.7522893420546364, "grad_norm": 8.1875, "learning_rate": 3.870525022406064e-06, "loss": 1.5151240825653076, "step": 2444 }, { "epoch": 0.7529049634474798, "grad_norm": 9.375, "learning_rate": 3.8687296435568945e-06, "loss": 1.266099452972412, "step": 2446 }, { "epoch": 0.7535205848403232, "grad_norm": 9.3125, "learning_rate": 3.866933401431604e-06, "loss": 1.5317912101745605, "step": 2448 }, { "epoch": 0.7541362062331666, "grad_norm": 1.6015625, "learning_rate": 3.865136297815161e-06, "loss": 1.1209218502044678, "step": 2450 }, { "epoch": 0.75475182762601, "grad_norm": 2.671875, "learning_rate": 3.863338334493384e-06, "loss": 1.0036613941192627, "step": 2452 }, { "epoch": 0.7553674490188534, "grad_norm": 12.3125, "learning_rate": 3.8615395132529536e-06, "loss": 1.73202645778656, "step": 2454 }, { "epoch": 0.7559830704116968, "grad_norm": 3.71875, "learning_rate": 3.859739835881394e-06, "loss": 1.3284037113189697, "step": 2456 }, { "epoch": 0.7565986918045402, "grad_norm": 6.53125, "learning_rate": 3.85793930416709e-06, "loss": 1.451532006263733, "step": 2458 }, { "epoch": 0.7572143131973836, "grad_norm": 7.0625, "learning_rate": 3.856137919899268e-06, "loss": 1.2960894107818604, "step": 2460 }, { "epoch": 0.757829934590227, "grad_norm": 5.8125, "learning_rate": 3.854335684868004e-06, "loss": 1.4375635385513306, "step": 2462 }, { "epoch": 0.7584455559830704, "grad_norm": 6.125, "learning_rate": 3.852532600864218e-06, "loss": 1.4283324480056763, "step": 2464 }, { "epoch": 0.7590611773759138, "grad_norm": 7.6875, "learning_rate": 3.850728669679678e-06, "loss": 1.5085079669952393, "step": 2466 }, { "epoch": 0.7596767987687573, "grad_norm": 5.5, "learning_rate": 3.848923893106987e-06, "loss": 1.2598025798797607, "step": 2468 }, { "epoch": 0.7602924201616006, "grad_norm": 2.5, "learning_rate": 3.847118272939595e-06, "loss": 1.171806812286377, "step": 2470 }, { "epoch": 0.760908041554444, "grad_norm": 12.3125, "learning_rate": 3.845311810971787e-06, "loss": 1.3810973167419434, "step": 2472 }, { "epoch": 0.7615236629472875, "grad_norm": 12.0, "learning_rate": 3.843504508998684e-06, "loss": 1.7773661613464355, "step": 2474 }, { "epoch": 0.7621392843401308, "grad_norm": 10.375, "learning_rate": 3.841696368816242e-06, "loss": 1.6338831186294556, "step": 2476 }, { "epoch": 0.7627549057329742, "grad_norm": 7.75, "learning_rate": 3.839887392221252e-06, "loss": 1.3375442028045654, "step": 2478 }, { "epoch": 0.7633705271258177, "grad_norm": 6.40625, "learning_rate": 3.838077581011332e-06, "loss": 1.4185311794281006, "step": 2480 }, { "epoch": 0.763986148518661, "grad_norm": 9.5625, "learning_rate": 3.836266936984934e-06, "loss": 1.4465787410736084, "step": 2482 }, { "epoch": 0.7646017699115044, "grad_norm": 17.75, "learning_rate": 3.834455461941335e-06, "loss": 1.7770164012908936, "step": 2484 }, { "epoch": 0.7652173913043478, "grad_norm": 5.40625, "learning_rate": 3.83264315768064e-06, "loss": 1.1469110250473022, "step": 2486 }, { "epoch": 0.7658330126971912, "grad_norm": 4.4375, "learning_rate": 3.830830026003774e-06, "loss": 1.093663215637207, "step": 2488 }, { "epoch": 0.7664486340900346, "grad_norm": 20.125, "learning_rate": 3.829016068712486e-06, "loss": 1.6215949058532715, "step": 2490 }, { "epoch": 0.767064255482878, "grad_norm": 5.59375, "learning_rate": 3.827201287609349e-06, "loss": 1.2483386993408203, "step": 2492 }, { "epoch": 0.7676798768757215, "grad_norm": 6.9375, "learning_rate": 3.82538568449775e-06, "loss": 0.8979315757751465, "step": 2494 }, { "epoch": 0.7682954982685648, "grad_norm": 6.15625, "learning_rate": 3.823569261181894e-06, "loss": 1.4703541994094849, "step": 2496 }, { "epoch": 0.7689111196614082, "grad_norm": 7.0, "learning_rate": 3.821752019466803e-06, "loss": 1.2911673784255981, "step": 2498 }, { "epoch": 0.7695267410542517, "grad_norm": 3.65625, "learning_rate": 3.819933961158308e-06, "loss": 1.3201124668121338, "step": 2500 }, { "epoch": 0.770142362447095, "grad_norm": 27.25, "learning_rate": 3.818115088063058e-06, "loss": 1.4830491542816162, "step": 2502 }, { "epoch": 0.7707579838399384, "grad_norm": 3.4375, "learning_rate": 3.816295401988507e-06, "loss": 1.2479162216186523, "step": 2504 }, { "epoch": 0.7713736052327819, "grad_norm": 5.46875, "learning_rate": 3.814474904742916e-06, "loss": 1.1597003936767578, "step": 2506 }, { "epoch": 0.7719892266256252, "grad_norm": 20.0, "learning_rate": 3.812653598135355e-06, "loss": 1.6120350360870361, "step": 2508 }, { "epoch": 0.7726048480184686, "grad_norm": 7.625, "learning_rate": 3.8108314839756976e-06, "loss": 1.2619602680206299, "step": 2510 }, { "epoch": 0.7732204694113121, "grad_norm": 8.3125, "learning_rate": 3.809008564074619e-06, "loss": 1.9463766813278198, "step": 2512 }, { "epoch": 0.7738360908041555, "grad_norm": 5.34375, "learning_rate": 3.807184840243595e-06, "loss": 1.4215641021728516, "step": 2514 }, { "epoch": 0.7744517121969988, "grad_norm": 5.1875, "learning_rate": 3.8053603142949024e-06, "loss": 1.05170476436615, "step": 2516 }, { "epoch": 0.7750673335898423, "grad_norm": 11.625, "learning_rate": 3.803534988041613e-06, "loss": 1.2248324155807495, "step": 2518 }, { "epoch": 0.7756829549826857, "grad_norm": 4.9375, "learning_rate": 3.8017088632975928e-06, "loss": 1.3490006923675537, "step": 2520 }, { "epoch": 0.776298576375529, "grad_norm": 7.9375, "learning_rate": 3.7998819418775044e-06, "loss": 1.5887987613677979, "step": 2522 }, { "epoch": 0.7769141977683724, "grad_norm": 8.75, "learning_rate": 3.798054225596801e-06, "loss": 1.286887526512146, "step": 2524 }, { "epoch": 0.7775298191612159, "grad_norm": 4.875, "learning_rate": 3.7962257162717242e-06, "loss": 1.2695399522781372, "step": 2526 }, { "epoch": 0.7781454405540592, "grad_norm": 3.25, "learning_rate": 3.7943964157193057e-06, "loss": 1.313680648803711, "step": 2528 }, { "epoch": 0.7787610619469026, "grad_norm": 5.625, "learning_rate": 3.792566325757361e-06, "loss": 1.4804496765136719, "step": 2530 }, { "epoch": 0.7793766833397461, "grad_norm": 5.3125, "learning_rate": 3.790735448204492e-06, "loss": 1.0813490152359009, "step": 2532 }, { "epoch": 0.7799923047325894, "grad_norm": 10.1875, "learning_rate": 3.7889037848800846e-06, "loss": 1.128103256225586, "step": 2534 }, { "epoch": 0.7806079261254328, "grad_norm": 7.3125, "learning_rate": 3.787071337604301e-06, "loss": 1.0231573581695557, "step": 2536 }, { "epoch": 0.7812235475182763, "grad_norm": 6.625, "learning_rate": 3.785238108198087e-06, "loss": 1.2463830709457397, "step": 2538 }, { "epoch": 0.7818391689111197, "grad_norm": 6.1875, "learning_rate": 3.783404098483163e-06, "loss": 0.9330945014953613, "step": 2540 }, { "epoch": 0.782454790303963, "grad_norm": 6.5625, "learning_rate": 3.781569310282026e-06, "loss": 1.2505488395690918, "step": 2542 }, { "epoch": 0.7830704116968065, "grad_norm": 4.71875, "learning_rate": 3.779733745417945e-06, "loss": 0.9242513179779053, "step": 2544 }, { "epoch": 0.7836860330896499, "grad_norm": 5.75, "learning_rate": 3.7778974057149632e-06, "loss": 1.6376546621322632, "step": 2546 }, { "epoch": 0.7843016544824932, "grad_norm": 5.6875, "learning_rate": 3.776060292997893e-06, "loss": 1.1617989540100098, "step": 2548 }, { "epoch": 0.7849172758753367, "grad_norm": 7.25, "learning_rate": 3.774222409092313e-06, "loss": 1.4114969968795776, "step": 2550 }, { "epoch": 0.7855328972681801, "grad_norm": 7.90625, "learning_rate": 3.772383755824569e-06, "loss": 1.4932143688201904, "step": 2552 }, { "epoch": 0.7861485186610234, "grad_norm": 18.0, "learning_rate": 3.770544335021774e-06, "loss": 1.221050500869751, "step": 2554 }, { "epoch": 0.7867641400538669, "grad_norm": 7.28125, "learning_rate": 3.7687041485118025e-06, "loss": 1.2442152500152588, "step": 2556 }, { "epoch": 0.7873797614467103, "grad_norm": 5.96875, "learning_rate": 3.7668631981232852e-06, "loss": 1.4660810232162476, "step": 2558 }, { "epoch": 0.7879953828395537, "grad_norm": 7.5, "learning_rate": 3.7650214856856192e-06, "loss": 1.2007068395614624, "step": 2560 }, { "epoch": 0.788611004232397, "grad_norm": 6.9375, "learning_rate": 3.7631790130289548e-06, "loss": 1.5176289081573486, "step": 2562 }, { "epoch": 0.7892266256252405, "grad_norm": 4.9375, "learning_rate": 3.7613357819841968e-06, "loss": 1.1563633680343628, "step": 2564 }, { "epoch": 0.7898422470180839, "grad_norm": 6.0, "learning_rate": 3.7594917943830065e-06, "loss": 1.4055408239364624, "step": 2566 }, { "epoch": 0.7904578684109272, "grad_norm": 4.53125, "learning_rate": 3.757647052057796e-06, "loss": 1.1063488721847534, "step": 2568 }, { "epoch": 0.7910734898037707, "grad_norm": 5.25, "learning_rate": 3.755801556841726e-06, "loss": 1.3035168647766113, "step": 2570 }, { "epoch": 0.7916891111966141, "grad_norm": 6.03125, "learning_rate": 3.7539553105687067e-06, "loss": 1.4018332958221436, "step": 2572 }, { "epoch": 0.7923047325894574, "grad_norm": 25.0, "learning_rate": 3.7521083150733952e-06, "loss": 1.6415070295333862, "step": 2574 }, { "epoch": 0.7929203539823009, "grad_norm": 12.5625, "learning_rate": 3.7502605721911923e-06, "loss": 1.4985864162445068, "step": 2576 }, { "epoch": 0.7935359753751443, "grad_norm": 9.75, "learning_rate": 3.7484120837582405e-06, "loss": 1.1360118389129639, "step": 2578 }, { "epoch": 0.7941515967679876, "grad_norm": 5.78125, "learning_rate": 3.746562851611425e-06, "loss": 0.9488309621810913, "step": 2580 }, { "epoch": 0.7947672181608311, "grad_norm": 4.59375, "learning_rate": 3.744712877588369e-06, "loss": 1.281587839126587, "step": 2582 }, { "epoch": 0.7953828395536745, "grad_norm": 5.0625, "learning_rate": 3.7428621635274333e-06, "loss": 1.2814773321151733, "step": 2584 }, { "epoch": 0.7959984609465179, "grad_norm": 17.25, "learning_rate": 3.741010711267714e-06, "loss": 1.6053236722946167, "step": 2586 }, { "epoch": 0.7966140823393613, "grad_norm": 4.84375, "learning_rate": 3.73915852264904e-06, "loss": 1.4618890285491943, "step": 2588 }, { "epoch": 0.7972297037322047, "grad_norm": 4.90625, "learning_rate": 3.7373055995119745e-06, "loss": 1.130091667175293, "step": 2590 }, { "epoch": 0.7978453251250481, "grad_norm": 8.8125, "learning_rate": 3.7354519436978056e-06, "loss": 1.317619800567627, "step": 2592 }, { "epoch": 0.7984609465178915, "grad_norm": 28.25, "learning_rate": 3.733597557048555e-06, "loss": 1.3336650133132935, "step": 2594 }, { "epoch": 0.7990765679107349, "grad_norm": 11.125, "learning_rate": 3.731742441406969e-06, "loss": 1.345863699913025, "step": 2596 }, { "epoch": 0.7996921893035783, "grad_norm": 8.625, "learning_rate": 3.7298865986165154e-06, "loss": 1.3263499736785889, "step": 2598 }, { "epoch": 0.8003078106964217, "grad_norm": 7.75, "learning_rate": 3.728030030521387e-06, "loss": 1.4835124015808105, "step": 2600 }, { "epoch": 0.8009234320892651, "grad_norm": 4.96875, "learning_rate": 3.7261727389664993e-06, "loss": 1.102487564086914, "step": 2602 }, { "epoch": 0.8015390534821085, "grad_norm": 10.4375, "learning_rate": 3.7243147257974832e-06, "loss": 1.0480748414993286, "step": 2604 }, { "epoch": 0.802154674874952, "grad_norm": 8.1875, "learning_rate": 3.7224559928606862e-06, "loss": 1.1928825378417969, "step": 2606 }, { "epoch": 0.8027702962677953, "grad_norm": 4.75, "learning_rate": 3.7205965420031763e-06, "loss": 1.3224366903305054, "step": 2608 }, { "epoch": 0.8033859176606387, "grad_norm": 6.21875, "learning_rate": 3.718736375072729e-06, "loss": 1.4208377599716187, "step": 2610 }, { "epoch": 0.8040015390534822, "grad_norm": 8.375, "learning_rate": 3.716875493917834e-06, "loss": 0.9219213724136353, "step": 2612 }, { "epoch": 0.8046171604463255, "grad_norm": 1.5625, "learning_rate": 3.715013900387693e-06, "loss": 1.0641413927078247, "step": 2614 }, { "epoch": 0.8052327818391689, "grad_norm": 34.5, "learning_rate": 3.7131515963322106e-06, "loss": 1.1293139457702637, "step": 2616 }, { "epoch": 0.8058484032320123, "grad_norm": 11.25, "learning_rate": 3.7112885836020017e-06, "loss": 1.3597586154937744, "step": 2618 }, { "epoch": 0.8064640246248557, "grad_norm": 15.0625, "learning_rate": 3.7094248640483834e-06, "loss": 1.231532335281372, "step": 2620 }, { "epoch": 0.8070796460176991, "grad_norm": 5.6875, "learning_rate": 3.7075604395233745e-06, "loss": 1.0208721160888672, "step": 2622 }, { "epoch": 0.8076952674105425, "grad_norm": 2.40625, "learning_rate": 3.705695311879696e-06, "loss": 1.136234164237976, "step": 2624 }, { "epoch": 0.8083108888033859, "grad_norm": 4.4375, "learning_rate": 3.7038294829707675e-06, "loss": 1.288115382194519, "step": 2626 }, { "epoch": 0.8089265101962293, "grad_norm": 11.125, "learning_rate": 3.701962954650705e-06, "loss": 1.3650559186935425, "step": 2628 }, { "epoch": 0.8095421315890727, "grad_norm": 5.5, "learning_rate": 3.70009572877432e-06, "loss": 0.9391842484474182, "step": 2630 }, { "epoch": 0.8101577529819162, "grad_norm": 2.8125, "learning_rate": 3.6982278071971158e-06, "loss": 1.1927106380462646, "step": 2632 }, { "epoch": 0.8107733743747595, "grad_norm": 5.6875, "learning_rate": 3.696359191775288e-06, "loss": 1.094351887702942, "step": 2634 }, { "epoch": 0.8113889957676029, "grad_norm": 13.75, "learning_rate": 3.694489884365722e-06, "loss": 1.2645761966705322, "step": 2636 }, { "epoch": 0.8120046171604464, "grad_norm": 10.6875, "learning_rate": 3.69261988682599e-06, "loss": 1.4461915493011475, "step": 2638 }, { "epoch": 0.8126202385532897, "grad_norm": 6.0, "learning_rate": 3.690749201014352e-06, "loss": 1.3748584985733032, "step": 2640 }, { "epoch": 0.8132358599461331, "grad_norm": 8.25, "learning_rate": 3.6888778287897477e-06, "loss": 1.4600911140441895, "step": 2642 }, { "epoch": 0.8138514813389766, "grad_norm": 12.25, "learning_rate": 3.6870057720118036e-06, "loss": 1.3699744939804077, "step": 2644 }, { "epoch": 0.8144671027318199, "grad_norm": 8.25, "learning_rate": 3.6851330325408242e-06, "loss": 1.312798261642456, "step": 2646 }, { "epoch": 0.8150827241246633, "grad_norm": 3.53125, "learning_rate": 3.6832596122377926e-06, "loss": 1.1834629774093628, "step": 2648 }, { "epoch": 0.8156983455175068, "grad_norm": 7.4375, "learning_rate": 3.681385512964368e-06, "loss": 1.501805067062378, "step": 2650 }, { "epoch": 0.8163139669103502, "grad_norm": 11.6875, "learning_rate": 3.6795107365828862e-06, "loss": 1.251523733139038, "step": 2652 }, { "epoch": 0.8169295883031935, "grad_norm": 16.0, "learning_rate": 3.6776352849563534e-06, "loss": 1.4806562662124634, "step": 2654 }, { "epoch": 0.817545209696037, "grad_norm": 8.375, "learning_rate": 3.6757591599484476e-06, "loss": 1.484478235244751, "step": 2656 }, { "epoch": 0.8181608310888804, "grad_norm": 6.71875, "learning_rate": 3.6738823634235175e-06, "loss": 0.9983839392662048, "step": 2658 }, { "epoch": 0.8187764524817237, "grad_norm": 6.34375, "learning_rate": 3.6720048972465773e-06, "loss": 1.276545763015747, "step": 2660 }, { "epoch": 0.8193920738745671, "grad_norm": 8.1875, "learning_rate": 3.670126763283307e-06, "loss": 1.4565844535827637, "step": 2662 }, { "epoch": 0.8200076952674106, "grad_norm": 4.5, "learning_rate": 3.66824796340005e-06, "loss": 1.2369879484176636, "step": 2664 }, { "epoch": 0.8206233166602539, "grad_norm": 12.5, "learning_rate": 3.666368499463814e-06, "loss": 1.103905439376831, "step": 2666 }, { "epoch": 0.8212389380530973, "grad_norm": 5.78125, "learning_rate": 3.664488373342261e-06, "loss": 1.0926010608673096, "step": 2668 }, { "epoch": 0.8218545594459408, "grad_norm": 5.96875, "learning_rate": 3.662607586903717e-06, "loss": 1.2622876167297363, "step": 2670 }, { "epoch": 0.8224701808387841, "grad_norm": 5.75, "learning_rate": 3.6607261420171614e-06, "loss": 1.4322469234466553, "step": 2672 }, { "epoch": 0.8230858022316275, "grad_norm": 6.625, "learning_rate": 3.6588440405522275e-06, "loss": 1.1148122549057007, "step": 2674 }, { "epoch": 0.823701423624471, "grad_norm": 36.5, "learning_rate": 3.6569612843792015e-06, "loss": 1.4914367198944092, "step": 2676 }, { "epoch": 0.8243170450173144, "grad_norm": 4.1875, "learning_rate": 3.655077875369022e-06, "loss": 1.2348322868347168, "step": 2678 }, { "epoch": 0.8249326664101577, "grad_norm": 7.9375, "learning_rate": 3.653193815393272e-06, "loss": 1.0290086269378662, "step": 2680 }, { "epoch": 0.8255482878030012, "grad_norm": 11.0, "learning_rate": 3.6513091063241878e-06, "loss": 1.2747092247009277, "step": 2682 }, { "epoch": 0.8261639091958446, "grad_norm": 8.125, "learning_rate": 3.649423750034643e-06, "loss": 1.2471359968185425, "step": 2684 }, { "epoch": 0.8267795305886879, "grad_norm": 8.25, "learning_rate": 3.647537748398162e-06, "loss": 1.3850839138031006, "step": 2686 }, { "epoch": 0.8273951519815314, "grad_norm": 3.6875, "learning_rate": 3.645651103288904e-06, "loss": 1.3188068866729736, "step": 2688 }, { "epoch": 0.8280107733743748, "grad_norm": 11.25, "learning_rate": 3.6437638165816725e-06, "loss": 1.4054416418075562, "step": 2690 }, { "epoch": 0.8286263947672181, "grad_norm": 2.90625, "learning_rate": 3.641875890151906e-06, "loss": 1.072651982307434, "step": 2692 }, { "epoch": 0.8292420161600615, "grad_norm": 10.8125, "learning_rate": 3.63998732587568e-06, "loss": 1.5652683973312378, "step": 2694 }, { "epoch": 0.829857637552905, "grad_norm": 12.6875, "learning_rate": 3.638098125629701e-06, "loss": 1.457189917564392, "step": 2696 }, { "epoch": 0.8304732589457484, "grad_norm": 5.21875, "learning_rate": 3.636208291291312e-06, "loss": 1.1520062685012817, "step": 2698 }, { "epoch": 0.8310888803385917, "grad_norm": 6.71875, "learning_rate": 3.6343178247384827e-06, "loss": 1.2357407808303833, "step": 2700 }, { "epoch": 0.8317045017314352, "grad_norm": 7.28125, "learning_rate": 3.6324267278498114e-06, "loss": 1.4669443368911743, "step": 2702 }, { "epoch": 0.8323201231242786, "grad_norm": 1.984375, "learning_rate": 3.630535002504526e-06, "loss": 1.239076852798462, "step": 2704 }, { "epoch": 0.8329357445171219, "grad_norm": 11.5, "learning_rate": 3.6286426505824734e-06, "loss": 1.2649511098861694, "step": 2706 }, { "epoch": 0.8335513659099654, "grad_norm": 15.1875, "learning_rate": 3.6267496739641272e-06, "loss": 1.6545941829681396, "step": 2708 }, { "epoch": 0.8341669873028088, "grad_norm": 19.75, "learning_rate": 3.6248560745305818e-06, "loss": 1.504084587097168, "step": 2710 }, { "epoch": 0.8347826086956521, "grad_norm": 13.75, "learning_rate": 3.622961854163548e-06, "loss": 1.4818305969238281, "step": 2712 }, { "epoch": 0.8353982300884956, "grad_norm": 2.21875, "learning_rate": 3.6210670147453555e-06, "loss": 0.9951951503753662, "step": 2714 }, { "epoch": 0.836013851481339, "grad_norm": 8.1875, "learning_rate": 3.6191715581589482e-06, "loss": 1.1955739259719849, "step": 2716 }, { "epoch": 0.8366294728741823, "grad_norm": 9.125, "learning_rate": 3.6172754862878844e-06, "loss": 1.7448787689208984, "step": 2718 }, { "epoch": 0.8372450942670258, "grad_norm": 5.625, "learning_rate": 3.6153788010163336e-06, "loss": 1.4390456676483154, "step": 2720 }, { "epoch": 0.8378607156598692, "grad_norm": 3.421875, "learning_rate": 3.6134815042290737e-06, "loss": 1.3452774286270142, "step": 2722 }, { "epoch": 0.8384763370527126, "grad_norm": 7.09375, "learning_rate": 3.611583597811491e-06, "loss": 1.2631638050079346, "step": 2724 }, { "epoch": 0.839091958445556, "grad_norm": 4.875, "learning_rate": 3.609685083649579e-06, "loss": 1.2300083637237549, "step": 2726 }, { "epoch": 0.8397075798383994, "grad_norm": 8.0625, "learning_rate": 3.6077859636299316e-06, "loss": 1.2552623748779297, "step": 2728 }, { "epoch": 0.8403232012312428, "grad_norm": 4.6875, "learning_rate": 3.60588623963975e-06, "loss": 1.2631356716156006, "step": 2730 }, { "epoch": 0.8409388226240861, "grad_norm": 5.25, "learning_rate": 3.6039859135668287e-06, "loss": 1.3826861381530762, "step": 2732 }, { "epoch": 0.8415544440169296, "grad_norm": 49.75, "learning_rate": 3.602084987299567e-06, "loss": 1.6629542112350464, "step": 2734 }, { "epoch": 0.842170065409773, "grad_norm": 11.5, "learning_rate": 3.6001834627269573e-06, "loss": 1.6803635358810425, "step": 2736 }, { "epoch": 0.8427856868026163, "grad_norm": 25.75, "learning_rate": 3.5982813417385876e-06, "loss": 1.5373589992523193, "step": 2738 }, { "epoch": 0.8434013081954598, "grad_norm": 3.890625, "learning_rate": 3.596378626224636e-06, "loss": 1.3916454315185547, "step": 2740 }, { "epoch": 0.8440169295883032, "grad_norm": 7.09375, "learning_rate": 3.594475318075876e-06, "loss": 1.5421522855758667, "step": 2742 }, { "epoch": 0.8446325509811466, "grad_norm": 6.46875, "learning_rate": 3.592571419183667e-06, "loss": 1.5785051584243774, "step": 2744 }, { "epoch": 0.84524817237399, "grad_norm": 16.75, "learning_rate": 3.5906669314399555e-06, "loss": 1.4765441417694092, "step": 2746 }, { "epoch": 0.8458637937668334, "grad_norm": 7.125, "learning_rate": 3.5887618567372752e-06, "loss": 1.3185290098190308, "step": 2748 }, { "epoch": 0.8464794151596768, "grad_norm": 8.125, "learning_rate": 3.5868561969687387e-06, "loss": 1.7368346452713013, "step": 2750 }, { "epoch": 0.8470950365525202, "grad_norm": 4.5, "learning_rate": 3.584949954028045e-06, "loss": 1.392224669456482, "step": 2752 }, { "epoch": 0.8477106579453636, "grad_norm": 5.28125, "learning_rate": 3.583043129809469e-06, "loss": 1.2633379697799683, "step": 2754 }, { "epoch": 0.848326279338207, "grad_norm": 2.40625, "learning_rate": 3.581135726207867e-06, "loss": 0.9549652338027954, "step": 2756 }, { "epoch": 0.8489419007310504, "grad_norm": 12.875, "learning_rate": 3.5792277451186665e-06, "loss": 1.3236083984375, "step": 2758 }, { "epoch": 0.8495575221238938, "grad_norm": 4.40625, "learning_rate": 3.577319188437872e-06, "loss": 1.37073814868927, "step": 2760 }, { "epoch": 0.8501731435167372, "grad_norm": 9.75, "learning_rate": 3.5754100580620587e-06, "loss": 1.2857497930526733, "step": 2762 }, { "epoch": 0.8507887649095806, "grad_norm": 3.875, "learning_rate": 3.573500355888372e-06, "loss": 1.3076057434082031, "step": 2764 }, { "epoch": 0.851404386302424, "grad_norm": 2.140625, "learning_rate": 3.5715900838145267e-06, "loss": 1.052871823310852, "step": 2766 }, { "epoch": 0.8520200076952674, "grad_norm": 10.3125, "learning_rate": 3.569679243738803e-06, "loss": 1.4118722677230835, "step": 2768 }, { "epoch": 0.8526356290881109, "grad_norm": 5.8125, "learning_rate": 3.5677678375600443e-06, "loss": 1.2163760662078857, "step": 2770 }, { "epoch": 0.8532512504809542, "grad_norm": 12.1875, "learning_rate": 3.5658558671776577e-06, "loss": 1.4401609897613525, "step": 2772 }, { "epoch": 0.8538668718737976, "grad_norm": 7.53125, "learning_rate": 3.5639433344916117e-06, "loss": 1.4799726009368896, "step": 2774 }, { "epoch": 0.854482493266641, "grad_norm": 5.8125, "learning_rate": 3.5620302414024345e-06, "loss": 1.3472819328308105, "step": 2776 }, { "epoch": 0.8550981146594844, "grad_norm": 5.03125, "learning_rate": 3.560116589811207e-06, "loss": 1.391073226928711, "step": 2778 }, { "epoch": 0.8557137360523278, "grad_norm": 11.1875, "learning_rate": 3.5582023816195687e-06, "loss": 1.4109700918197632, "step": 2780 }, { "epoch": 0.8563293574451712, "grad_norm": 33.25, "learning_rate": 3.5562876187297125e-06, "loss": 1.4943645000457764, "step": 2782 }, { "epoch": 0.8569449788380146, "grad_norm": 3.6875, "learning_rate": 3.554372303044379e-06, "loss": 1.0076923370361328, "step": 2784 }, { "epoch": 0.857560600230858, "grad_norm": 5.3125, "learning_rate": 3.5524564364668602e-06, "loss": 1.0487892627716064, "step": 2786 }, { "epoch": 0.8581762216237014, "grad_norm": 10.0, "learning_rate": 3.550540020900998e-06, "loss": 1.1835980415344238, "step": 2788 }, { "epoch": 0.8587918430165449, "grad_norm": 8.1875, "learning_rate": 3.548623058251176e-06, "loss": 1.2528934478759766, "step": 2790 }, { "epoch": 0.8594074644093882, "grad_norm": 8.375, "learning_rate": 3.5467055504223225e-06, "loss": 0.8922328352928162, "step": 2792 }, { "epoch": 0.8600230858022316, "grad_norm": 6.59375, "learning_rate": 3.5447874993199095e-06, "loss": 1.041680932044983, "step": 2794 }, { "epoch": 0.8606387071950751, "grad_norm": 6.9375, "learning_rate": 3.542868906849947e-06, "loss": 1.2172621488571167, "step": 2796 }, { "epoch": 0.8612543285879184, "grad_norm": 5.46875, "learning_rate": 3.5409497749189814e-06, "loss": 1.2450368404388428, "step": 2798 }, { "epoch": 0.8618699499807618, "grad_norm": 5.5, "learning_rate": 3.539030105434099e-06, "loss": 1.2909342050552368, "step": 2800 }, { "epoch": 0.8624855713736053, "grad_norm": 5.75, "learning_rate": 3.5371099003029184e-06, "loss": 0.8794264793395996, "step": 2802 }, { "epoch": 0.8631011927664486, "grad_norm": 7.34375, "learning_rate": 3.535189161433591e-06, "loss": 1.2951686382293701, "step": 2804 }, { "epoch": 0.863716814159292, "grad_norm": 6.75, "learning_rate": 3.5332678907347963e-06, "loss": 1.4462780952453613, "step": 2806 }, { "epoch": 0.8643324355521355, "grad_norm": 7.5625, "learning_rate": 3.531346090115745e-06, "loss": 1.3095674514770508, "step": 2808 }, { "epoch": 0.8649480569449788, "grad_norm": 9.375, "learning_rate": 3.5294237614861738e-06, "loss": 1.342227816581726, "step": 2810 }, { "epoch": 0.8655636783378222, "grad_norm": 5.5625, "learning_rate": 3.5275009067563413e-06, "loss": 1.3350898027420044, "step": 2812 }, { "epoch": 0.8661792997306657, "grad_norm": 5.65625, "learning_rate": 3.5255775278370363e-06, "loss": 1.2826955318450928, "step": 2814 }, { "epoch": 0.8667949211235091, "grad_norm": 4.09375, "learning_rate": 3.5236536266395594e-06, "loss": 1.3708828687667847, "step": 2816 }, { "epoch": 0.8674105425163524, "grad_norm": 8.375, "learning_rate": 3.521729205075736e-06, "loss": 1.5767796039581299, "step": 2818 }, { "epoch": 0.8680261639091958, "grad_norm": 20.5, "learning_rate": 3.5198042650579043e-06, "loss": 1.23195219039917, "step": 2820 }, { "epoch": 0.8686417853020393, "grad_norm": 4.15625, "learning_rate": 3.5178788084989244e-06, "loss": 1.432385802268982, "step": 2822 }, { "epoch": 0.8692574066948826, "grad_norm": 4.6875, "learning_rate": 3.5159528373121645e-06, "loss": 1.2838640213012695, "step": 2824 }, { "epoch": 0.869873028087726, "grad_norm": 3.984375, "learning_rate": 3.5140263534115038e-06, "loss": 1.0871201753616333, "step": 2826 }, { "epoch": 0.8704886494805695, "grad_norm": 3.90625, "learning_rate": 3.512099358711333e-06, "loss": 1.2386858463287354, "step": 2828 }, { "epoch": 0.8711042708734128, "grad_norm": 6.9375, "learning_rate": 3.5101718551265505e-06, "loss": 1.2389919757843018, "step": 2830 }, { "epoch": 0.8717198922662562, "grad_norm": 6.875, "learning_rate": 3.50824384457256e-06, "loss": 1.061251163482666, "step": 2832 }, { "epoch": 0.8723355136590997, "grad_norm": 2.546875, "learning_rate": 3.5063153289652685e-06, "loss": 1.2190308570861816, "step": 2834 }, { "epoch": 0.8729511350519431, "grad_norm": 10.75, "learning_rate": 3.5043863102210853e-06, "loss": 1.5301597118377686, "step": 2836 }, { "epoch": 0.8735667564447864, "grad_norm": 8.25, "learning_rate": 3.5024567902569196e-06, "loss": 1.6561012268066406, "step": 2838 }, { "epoch": 0.8741823778376299, "grad_norm": 5.1875, "learning_rate": 3.500526770990177e-06, "loss": 1.3980400562286377, "step": 2840 }, { "epoch": 0.8747979992304733, "grad_norm": 12.0625, "learning_rate": 3.4985962543387632e-06, "loss": 1.5745309591293335, "step": 2842 }, { "epoch": 0.8754136206233166, "grad_norm": 15.6875, "learning_rate": 3.4966652422210746e-06, "loss": 1.5733387470245361, "step": 2844 }, { "epoch": 0.8760292420161601, "grad_norm": 10.4375, "learning_rate": 3.4947337365560023e-06, "loss": 1.807689905166626, "step": 2846 }, { "epoch": 0.8766448634090035, "grad_norm": 7.78125, "learning_rate": 3.4928017392629265e-06, "loss": 1.3859963417053223, "step": 2848 }, { "epoch": 0.8772604848018468, "grad_norm": 8.9375, "learning_rate": 3.4908692522617147e-06, "loss": 1.6209248304367065, "step": 2850 }, { "epoch": 0.8778761061946903, "grad_norm": 3.40625, "learning_rate": 3.4889362774727244e-06, "loss": 1.3499466180801392, "step": 2852 }, { "epoch": 0.8784917275875337, "grad_norm": 6.8125, "learning_rate": 3.487002816816796e-06, "loss": 1.1040327548980713, "step": 2854 }, { "epoch": 0.8791073489803771, "grad_norm": 4.28125, "learning_rate": 3.4850688722152498e-06, "loss": 1.0755717754364014, "step": 2856 }, { "epoch": 0.8797229703732204, "grad_norm": 13.1875, "learning_rate": 3.4831344455898937e-06, "loss": 1.2058649063110352, "step": 2858 }, { "epoch": 0.8803385917660639, "grad_norm": 8.8125, "learning_rate": 3.4811995388630093e-06, "loss": 1.3847553730010986, "step": 2860 }, { "epoch": 0.8809542131589073, "grad_norm": 7.0, "learning_rate": 3.4792641539573558e-06, "loss": 1.4876608848571777, "step": 2862 }, { "epoch": 0.8815698345517506, "grad_norm": 4.5, "learning_rate": 3.4773282927961693e-06, "loss": 1.424057960510254, "step": 2864 }, { "epoch": 0.8821854559445941, "grad_norm": 17.125, "learning_rate": 3.475391957303159e-06, "loss": 0.7264156341552734, "step": 2866 }, { "epoch": 0.8828010773374375, "grad_norm": 9.125, "learning_rate": 3.4734551494025047e-06, "loss": 1.4327208995819092, "step": 2868 }, { "epoch": 0.8834166987302808, "grad_norm": 9.875, "learning_rate": 3.4715178710188552e-06, "loss": 1.6192026138305664, "step": 2870 }, { "epoch": 0.8840323201231243, "grad_norm": 8.875, "learning_rate": 3.469580124077328e-06, "loss": 1.152923583984375, "step": 2872 }, { "epoch": 0.8846479415159677, "grad_norm": 4.71875, "learning_rate": 3.4676419105035054e-06, "loss": 1.0860756635665894, "step": 2874 }, { "epoch": 0.885263562908811, "grad_norm": 5.375, "learning_rate": 3.465703232223433e-06, "loss": 1.3257733583450317, "step": 2876 }, { "epoch": 0.8858791843016545, "grad_norm": 7.5, "learning_rate": 3.4637640911636206e-06, "loss": 1.2568975687026978, "step": 2878 }, { "epoch": 0.8864948056944979, "grad_norm": 3.890625, "learning_rate": 3.4618244892510346e-06, "loss": 1.2634060382843018, "step": 2880 }, { "epoch": 0.8871104270873413, "grad_norm": 33.0, "learning_rate": 3.4598844284131017e-06, "loss": 1.273327112197876, "step": 2882 }, { "epoch": 0.8877260484801847, "grad_norm": 5.0625, "learning_rate": 3.4579439105777034e-06, "loss": 1.0977848768234253, "step": 2884 }, { "epoch": 0.8883416698730281, "grad_norm": 5.03125, "learning_rate": 3.4560029376731765e-06, "loss": 1.4040604829788208, "step": 2886 }, { "epoch": 0.8889572912658715, "grad_norm": 14.125, "learning_rate": 3.454061511628308e-06, "loss": 1.073675274848938, "step": 2888 }, { "epoch": 0.8895729126587149, "grad_norm": 7.65625, "learning_rate": 3.4521196343723377e-06, "loss": 0.6765494346618652, "step": 2890 }, { "epoch": 0.8901885340515583, "grad_norm": 5.46875, "learning_rate": 3.450177307834952e-06, "loss": 1.3744484186172485, "step": 2892 }, { "epoch": 0.8908041554444017, "grad_norm": 12.375, "learning_rate": 3.448234533946284e-06, "loss": 1.2740247249603271, "step": 2894 }, { "epoch": 0.891419776837245, "grad_norm": 82.0, "learning_rate": 3.4462913146369124e-06, "loss": 1.1558359861373901, "step": 2896 }, { "epoch": 0.8920353982300885, "grad_norm": 8.6875, "learning_rate": 3.4443476518378583e-06, "loss": 1.429876446723938, "step": 2898 }, { "epoch": 0.8926510196229319, "grad_norm": 6.125, "learning_rate": 3.4424035474805808e-06, "loss": 1.5772767066955566, "step": 2900 }, { "epoch": 0.8932666410157754, "grad_norm": 5.625, "learning_rate": 3.440459003496982e-06, "loss": 1.1360305547714233, "step": 2902 }, { "epoch": 0.8938822624086187, "grad_norm": 17.25, "learning_rate": 3.4385140218193978e-06, "loss": 1.3795175552368164, "step": 2904 }, { "epoch": 0.8944978838014621, "grad_norm": 8.625, "learning_rate": 3.4365686043806014e-06, "loss": 1.3732916116714478, "step": 2906 }, { "epoch": 0.8951135051943055, "grad_norm": 18.25, "learning_rate": 3.4346227531137954e-06, "loss": 1.2515709400177002, "step": 2908 }, { "epoch": 0.8957291265871489, "grad_norm": 8.0625, "learning_rate": 3.4326764699526184e-06, "loss": 1.6415996551513672, "step": 2910 }, { "epoch": 0.8963447479799923, "grad_norm": 9.3125, "learning_rate": 3.4307297568311337e-06, "loss": 1.3767842054367065, "step": 2912 }, { "epoch": 0.8969603693728357, "grad_norm": 7.4375, "learning_rate": 3.428782615683835e-06, "loss": 1.2890866994857788, "step": 2914 }, { "epoch": 0.8975759907656791, "grad_norm": 66.0, "learning_rate": 3.4268350484456385e-06, "loss": 1.5002754926681519, "step": 2916 }, { "epoch": 0.8981916121585225, "grad_norm": 2.671875, "learning_rate": 3.4248870570518875e-06, "loss": 1.2909613847732544, "step": 2918 }, { "epoch": 0.8988072335513659, "grad_norm": 8.125, "learning_rate": 3.4229386434383438e-06, "loss": 1.5465000867843628, "step": 2920 }, { "epoch": 0.8994228549442093, "grad_norm": 19.0, "learning_rate": 3.4209898095411894e-06, "loss": 1.0798399448394775, "step": 2922 }, { "epoch": 0.9000384763370527, "grad_norm": 13.5625, "learning_rate": 3.4190405572970242e-06, "loss": 1.0773447751998901, "step": 2924 }, { "epoch": 0.9006540977298961, "grad_norm": 8.5, "learning_rate": 3.4170908886428644e-06, "loss": 1.7177311182022095, "step": 2926 }, { "epoch": 0.9012697191227396, "grad_norm": 1.9375, "learning_rate": 3.4151408055161385e-06, "loss": 1.4434187412261963, "step": 2928 }, { "epoch": 0.9018853405155829, "grad_norm": 7.75, "learning_rate": 3.413190309854688e-06, "loss": 1.4095145463943481, "step": 2930 }, { "epoch": 0.9025009619084263, "grad_norm": 10.3125, "learning_rate": 3.4112394035967656e-06, "loss": 1.6630830764770508, "step": 2932 }, { "epoch": 0.9031165833012698, "grad_norm": 2.984375, "learning_rate": 3.40928808868103e-06, "loss": 1.0885878801345825, "step": 2934 }, { "epoch": 0.9037322046941131, "grad_norm": 3.828125, "learning_rate": 3.407336367046545e-06, "loss": 1.311049461364746, "step": 2936 }, { "epoch": 0.9043478260869565, "grad_norm": 6.96875, "learning_rate": 3.405384240632782e-06, "loss": 1.5935975313186646, "step": 2938 }, { "epoch": 0.9049634474798, "grad_norm": 5.4375, "learning_rate": 3.4034317113796125e-06, "loss": 1.2446832656860352, "step": 2940 }, { "epoch": 0.9055790688726433, "grad_norm": 5.46875, "learning_rate": 3.4014787812273063e-06, "loss": 1.5879740715026855, "step": 2942 }, { "epoch": 0.9061946902654867, "grad_norm": 21.375, "learning_rate": 3.3995254521165376e-06, "loss": 1.4036391973495483, "step": 2944 }, { "epoch": 0.9068103116583301, "grad_norm": 5.5625, "learning_rate": 3.397571725988371e-06, "loss": 1.2651968002319336, "step": 2946 }, { "epoch": 0.9074259330511736, "grad_norm": 15.75, "learning_rate": 3.3956176047842683e-06, "loss": 1.7061303853988647, "step": 2948 }, { "epoch": 0.9080415544440169, "grad_norm": 5.0625, "learning_rate": 3.393663090446083e-06, "loss": 1.3028591871261597, "step": 2950 }, { "epoch": 0.9086571758368603, "grad_norm": 7.25, "learning_rate": 3.391708184916061e-06, "loss": 0.870864748954773, "step": 2952 }, { "epoch": 0.9092727972297038, "grad_norm": 7.15625, "learning_rate": 3.389752890136835e-06, "loss": 0.7668548822402954, "step": 2954 }, { "epoch": 0.9098884186225471, "grad_norm": 15.75, "learning_rate": 3.3877972080514255e-06, "loss": 1.5941828489303589, "step": 2956 }, { "epoch": 0.9105040400153905, "grad_norm": 4.71875, "learning_rate": 3.385841140603238e-06, "loss": 1.3613324165344238, "step": 2958 }, { "epoch": 0.911119661408234, "grad_norm": 6.90625, "learning_rate": 3.3838846897360595e-06, "loss": 1.1362364292144775, "step": 2960 }, { "epoch": 0.9117352828010773, "grad_norm": 7.78125, "learning_rate": 3.3819278573940595e-06, "loss": 1.6314747333526611, "step": 2962 }, { "epoch": 0.9123509041939207, "grad_norm": 7.53125, "learning_rate": 3.3799706455217875e-06, "loss": 1.4049698114395142, "step": 2964 }, { "epoch": 0.9129665255867642, "grad_norm": 4.8125, "learning_rate": 3.3780130560641666e-06, "loss": 1.3051364421844482, "step": 2966 }, { "epoch": 0.9135821469796075, "grad_norm": 7.40625, "learning_rate": 3.376055090966499e-06, "loss": 1.285605788230896, "step": 2968 }, { "epoch": 0.9141977683724509, "grad_norm": 11.125, "learning_rate": 3.3740967521744584e-06, "loss": 1.344340205192566, "step": 2970 }, { "epoch": 0.9148133897652944, "grad_norm": 5.5625, "learning_rate": 3.372138041634088e-06, "loss": 1.3495433330535889, "step": 2972 }, { "epoch": 0.9154290111581378, "grad_norm": 10.375, "learning_rate": 3.3701789612918047e-06, "loss": 1.6084606647491455, "step": 2974 }, { "epoch": 0.9160446325509811, "grad_norm": 2.90625, "learning_rate": 3.3682195130943897e-06, "loss": 1.3200281858444214, "step": 2976 }, { "epoch": 0.9166602539438246, "grad_norm": 9.8125, "learning_rate": 3.3662596989889906e-06, "loss": 1.5909281969070435, "step": 2978 }, { "epoch": 0.917275875336668, "grad_norm": 4.71875, "learning_rate": 3.364299520923118e-06, "loss": 1.0454299449920654, "step": 2980 }, { "epoch": 0.9178914967295113, "grad_norm": 21.75, "learning_rate": 3.3623389808446468e-06, "loss": 1.0043212175369263, "step": 2982 }, { "epoch": 0.9185071181223547, "grad_norm": 4.1875, "learning_rate": 3.360378080701807e-06, "loss": 1.15622878074646, "step": 2984 }, { "epoch": 0.9191227395151982, "grad_norm": 1.625, "learning_rate": 3.3584168224431902e-06, "loss": 1.1797165870666504, "step": 2986 }, { "epoch": 0.9197383609080415, "grad_norm": 8.5625, "learning_rate": 3.3564552080177438e-06, "loss": 1.3120102882385254, "step": 2988 }, { "epoch": 0.9203539823008849, "grad_norm": 6.59375, "learning_rate": 3.354493239374766e-06, "loss": 1.605320692062378, "step": 2990 }, { "epoch": 0.9209696036937284, "grad_norm": 6.5, "learning_rate": 3.35253091846391e-06, "loss": 1.461379885673523, "step": 2992 }, { "epoch": 0.9215852250865718, "grad_norm": 5.65625, "learning_rate": 3.350568247235178e-06, "loss": 0.8134555816650391, "step": 2994 }, { "epoch": 0.9222008464794151, "grad_norm": 17.5, "learning_rate": 3.348605227638921e-06, "loss": 1.4931071996688843, "step": 2996 }, { "epoch": 0.9228164678722586, "grad_norm": 10.0, "learning_rate": 3.3466418616258345e-06, "loss": 1.7454947233200073, "step": 2998 }, { "epoch": 0.923432089265102, "grad_norm": 11.5625, "learning_rate": 3.3446781511469606e-06, "loss": 1.5419141054153442, "step": 3000 }, { "epoch": 0.9240477106579453, "grad_norm": 3.75, "learning_rate": 3.342714098153681e-06, "loss": 1.2411764860153198, "step": 3002 }, { "epoch": 0.9246633320507888, "grad_norm": 5.25, "learning_rate": 3.34074970459772e-06, "loss": 1.4362714290618896, "step": 3004 }, { "epoch": 0.9252789534436322, "grad_norm": 3.765625, "learning_rate": 3.3387849724311383e-06, "loss": 1.4024693965911865, "step": 3006 }, { "epoch": 0.9258945748364755, "grad_norm": 18.375, "learning_rate": 3.336819903606337e-06, "loss": 1.450740933418274, "step": 3008 }, { "epoch": 0.926510196229319, "grad_norm": 3.34375, "learning_rate": 3.3348545000760468e-06, "loss": 1.1831083297729492, "step": 3010 }, { "epoch": 0.9271258176221624, "grad_norm": 2.9375, "learning_rate": 3.332888763793334e-06, "loss": 1.0978304147720337, "step": 3012 }, { "epoch": 0.9277414390150057, "grad_norm": 5.78125, "learning_rate": 3.3309226967115936e-06, "loss": 1.233439326286316, "step": 3014 }, { "epoch": 0.9283570604078492, "grad_norm": 2.46875, "learning_rate": 3.3289563007845525e-06, "loss": 1.2525725364685059, "step": 3016 }, { "epoch": 0.9289726818006926, "grad_norm": 11.1875, "learning_rate": 3.326989577966262e-06, "loss": 1.3444368839263916, "step": 3018 }, { "epoch": 0.929588303193536, "grad_norm": 6.71875, "learning_rate": 3.3250225302110973e-06, "loss": 1.0215452909469604, "step": 3020 }, { "epoch": 0.9302039245863793, "grad_norm": 4.6875, "learning_rate": 3.323055159473759e-06, "loss": 1.1843159198760986, "step": 3022 }, { "epoch": 0.9308195459792228, "grad_norm": 4.28125, "learning_rate": 3.3210874677092675e-06, "loss": 1.3392987251281738, "step": 3024 }, { "epoch": 0.9314351673720662, "grad_norm": 11.25, "learning_rate": 3.319119456872961e-06, "loss": 1.518965482711792, "step": 3026 }, { "epoch": 0.9320507887649095, "grad_norm": 9.625, "learning_rate": 3.3171511289204973e-06, "loss": 1.3396942615509033, "step": 3028 }, { "epoch": 0.932666410157753, "grad_norm": 6.15625, "learning_rate": 3.3151824858078474e-06, "loss": 1.3504267930984497, "step": 3030 }, { "epoch": 0.9332820315505964, "grad_norm": 7.28125, "learning_rate": 3.313213529491297e-06, "loss": 1.3895577192306519, "step": 3032 }, { "epoch": 0.9338976529434397, "grad_norm": 9.5625, "learning_rate": 3.311244261927441e-06, "loss": 1.548140048980713, "step": 3034 }, { "epoch": 0.9345132743362832, "grad_norm": 7.78125, "learning_rate": 3.309274685073185e-06, "loss": 1.4520243406295776, "step": 3036 }, { "epoch": 0.9351288957291266, "grad_norm": 7.375, "learning_rate": 3.307304800885741e-06, "loss": 1.3225038051605225, "step": 3038 }, { "epoch": 0.93574451712197, "grad_norm": 12.9375, "learning_rate": 3.3053346113226287e-06, "loss": 1.4269733428955078, "step": 3040 }, { "epoch": 0.9363601385148134, "grad_norm": 5.0625, "learning_rate": 3.3033641183416686e-06, "loss": 1.5906643867492676, "step": 3042 }, { "epoch": 0.9369757599076568, "grad_norm": 5.0625, "learning_rate": 3.301393323900984e-06, "loss": 1.0133838653564453, "step": 3044 }, { "epoch": 0.9375913813005002, "grad_norm": 5.0625, "learning_rate": 3.2994222299589986e-06, "loss": 1.1549919843673706, "step": 3046 }, { "epoch": 0.9382070026933436, "grad_norm": 5.0625, "learning_rate": 3.2974508384744303e-06, "loss": 1.338111162185669, "step": 3048 }, { "epoch": 0.938822624086187, "grad_norm": 8.8125, "learning_rate": 3.295479151406296e-06, "loss": 1.2632451057434082, "step": 3050 }, { "epoch": 0.9394382454790304, "grad_norm": 26.375, "learning_rate": 3.293507170713906e-06, "loss": 1.6881314516067505, "step": 3052 }, { "epoch": 0.9400538668718738, "grad_norm": 15.1875, "learning_rate": 3.2915348983568612e-06, "loss": 1.5322569608688354, "step": 3054 }, { "epoch": 0.9406694882647172, "grad_norm": 6.5625, "learning_rate": 3.2895623362950533e-06, "loss": 1.2912862300872803, "step": 3056 }, { "epoch": 0.9412851096575606, "grad_norm": 16.125, "learning_rate": 3.2875894864886604e-06, "loss": 1.4331493377685547, "step": 3058 }, { "epoch": 0.941900731050404, "grad_norm": 13.4375, "learning_rate": 3.2856163508981486e-06, "loss": 1.5571575164794922, "step": 3060 }, { "epoch": 0.9425163524432474, "grad_norm": 6.625, "learning_rate": 3.283642931484266e-06, "loss": 1.1650996208190918, "step": 3062 }, { "epoch": 0.9431319738360908, "grad_norm": 4.71875, "learning_rate": 3.281669230208045e-06, "loss": 1.779885172843933, "step": 3064 }, { "epoch": 0.9437475952289343, "grad_norm": 11.25, "learning_rate": 3.2796952490307953e-06, "loss": 1.4820382595062256, "step": 3066 }, { "epoch": 0.9443632166217776, "grad_norm": 6.8125, "learning_rate": 3.2777209899141084e-06, "loss": 1.1543258428573608, "step": 3068 }, { "epoch": 0.944978838014621, "grad_norm": 10.5625, "learning_rate": 3.275746454819847e-06, "loss": 1.3555870056152344, "step": 3070 }, { "epoch": 0.9455944594074644, "grad_norm": 21.75, "learning_rate": 3.273771645710153e-06, "loss": 1.120388150215149, "step": 3072 }, { "epoch": 0.9462100808003078, "grad_norm": 4.84375, "learning_rate": 3.2717965645474382e-06, "loss": 1.1688385009765625, "step": 3074 }, { "epoch": 0.9468257021931512, "grad_norm": 7.90625, "learning_rate": 3.269821213294384e-06, "loss": 1.4369738101959229, "step": 3076 }, { "epoch": 0.9474413235859946, "grad_norm": 5.96875, "learning_rate": 3.2678455939139418e-06, "loss": 1.2481718063354492, "step": 3078 }, { "epoch": 0.948056944978838, "grad_norm": 10.0, "learning_rate": 3.2658697083693302e-06, "loss": 1.651442289352417, "step": 3080 }, { "epoch": 0.9486725663716814, "grad_norm": 6.34375, "learning_rate": 3.263893558624028e-06, "loss": 1.2866804599761963, "step": 3082 }, { "epoch": 0.9492881877645248, "grad_norm": 6.46875, "learning_rate": 3.2619171466417823e-06, "loss": 0.966914176940918, "step": 3084 }, { "epoch": 0.9499038091573683, "grad_norm": 3.453125, "learning_rate": 3.259940474386597e-06, "loss": 1.5348058938980103, "step": 3086 }, { "epoch": 0.9505194305502116, "grad_norm": 8.875, "learning_rate": 3.2579635438227354e-06, "loss": 1.1678472757339478, "step": 3088 }, { "epoch": 0.951135051943055, "grad_norm": 2.75, "learning_rate": 3.2559863569147167e-06, "loss": 1.1636126041412354, "step": 3090 }, { "epoch": 0.9517506733358985, "grad_norm": 10.9375, "learning_rate": 3.2540089156273185e-06, "loss": 1.7442286014556885, "step": 3092 }, { "epoch": 0.9523662947287418, "grad_norm": 8.375, "learning_rate": 3.2520312219255656e-06, "loss": 1.3746250867843628, "step": 3094 }, { "epoch": 0.9529819161215852, "grad_norm": 5.71875, "learning_rate": 3.250053277774739e-06, "loss": 1.3419276475906372, "step": 3096 }, { "epoch": 0.9535975375144287, "grad_norm": 7.28125, "learning_rate": 3.2480750851403652e-06, "loss": 1.4645942449569702, "step": 3098 }, { "epoch": 0.954213158907272, "grad_norm": 9.5, "learning_rate": 3.2460966459882182e-06, "loss": 1.1250495910644531, "step": 3100 }, { "epoch": 0.9548287803001154, "grad_norm": 4.15625, "learning_rate": 3.2441179622843178e-06, "loss": 1.2315913438796997, "step": 3102 }, { "epoch": 0.9554444016929589, "grad_norm": 11.4375, "learning_rate": 3.242139035994926e-06, "loss": 1.202285647392273, "step": 3104 }, { "epoch": 0.9560600230858022, "grad_norm": 2.671875, "learning_rate": 3.240159869086546e-06, "loss": 1.1278797388076782, "step": 3106 }, { "epoch": 0.9566756444786456, "grad_norm": 5.1875, "learning_rate": 3.2381804635259208e-06, "loss": 1.3194780349731445, "step": 3108 }, { "epoch": 0.957291265871489, "grad_norm": 2.015625, "learning_rate": 3.236200821280031e-06, "loss": 1.0704129934310913, "step": 3110 }, { "epoch": 0.9579068872643325, "grad_norm": 19.5, "learning_rate": 3.2342209443160895e-06, "loss": 1.5187937021255493, "step": 3112 }, { "epoch": 0.9585225086571758, "grad_norm": 6.21875, "learning_rate": 3.2322408346015453e-06, "loss": 1.3669098615646362, "step": 3114 }, { "epoch": 0.9591381300500192, "grad_norm": 4.09375, "learning_rate": 3.230260494104078e-06, "loss": 1.2145713567733765, "step": 3116 }, { "epoch": 0.9597537514428627, "grad_norm": 11.75, "learning_rate": 3.2282799247915964e-06, "loss": 1.2938891649246216, "step": 3118 }, { "epoch": 0.960369372835706, "grad_norm": 9.75, "learning_rate": 3.2262991286322366e-06, "loss": 1.0800869464874268, "step": 3120 }, { "epoch": 0.9609849942285494, "grad_norm": 4.125, "learning_rate": 3.2243181075943595e-06, "loss": 1.239424467086792, "step": 3122 }, { "epoch": 0.9616006156213929, "grad_norm": 4.59375, "learning_rate": 3.2223368636465513e-06, "loss": 1.3668651580810547, "step": 3124 }, { "epoch": 0.9622162370142362, "grad_norm": 25.125, "learning_rate": 3.2203553987576175e-06, "loss": 1.5709583759307861, "step": 3126 }, { "epoch": 0.9628318584070796, "grad_norm": 12.375, "learning_rate": 3.2183737148965845e-06, "loss": 1.1637746095657349, "step": 3128 }, { "epoch": 0.9634474797999231, "grad_norm": 5.34375, "learning_rate": 3.216391814032696e-06, "loss": 0.9557535648345947, "step": 3130 }, { "epoch": 0.9640631011927665, "grad_norm": 2.875, "learning_rate": 3.2144096981354113e-06, "loss": 1.0244866609573364, "step": 3132 }, { "epoch": 0.9646787225856098, "grad_norm": 8.9375, "learning_rate": 3.2124273691744032e-06, "loss": 1.4394854307174683, "step": 3134 }, { "epoch": 0.9652943439784533, "grad_norm": 3.890625, "learning_rate": 3.2104448291195567e-06, "loss": 1.2630878686904907, "step": 3136 }, { "epoch": 0.9659099653712967, "grad_norm": 4.53125, "learning_rate": 3.208462079940966e-06, "loss": 1.4885867834091187, "step": 3138 }, { "epoch": 0.96652558676414, "grad_norm": 15.75, "learning_rate": 3.2064791236089344e-06, "loss": 1.4528062343597412, "step": 3140 }, { "epoch": 0.9671412081569835, "grad_norm": 7.90625, "learning_rate": 3.2044959620939685e-06, "loss": 1.6152876615524292, "step": 3142 }, { "epoch": 0.9677568295498269, "grad_norm": 5.84375, "learning_rate": 3.2025125973667815e-06, "loss": 1.2320517301559448, "step": 3144 }, { "epoch": 0.9683724509426702, "grad_norm": 15.0625, "learning_rate": 3.2005290313982864e-06, "loss": 1.1179535388946533, "step": 3146 }, { "epoch": 0.9689880723355137, "grad_norm": 6.34375, "learning_rate": 3.1985452661595984e-06, "loss": 1.3401814699172974, "step": 3148 }, { "epoch": 0.9696036937283571, "grad_norm": 5.65625, "learning_rate": 3.1965613036220283e-06, "loss": 1.5490792989730835, "step": 3150 }, { "epoch": 0.9702193151212004, "grad_norm": 9.0, "learning_rate": 3.1945771457570855e-06, "loss": 1.5507392883300781, "step": 3152 }, { "epoch": 0.9708349365140438, "grad_norm": 13.8125, "learning_rate": 3.192592794536471e-06, "loss": 1.7767086029052734, "step": 3154 }, { "epoch": 0.9714505579068873, "grad_norm": 8.25, "learning_rate": 3.1906082519320793e-06, "loss": 1.5002129077911377, "step": 3156 }, { "epoch": 0.9720661792997307, "grad_norm": 4.8125, "learning_rate": 3.1886235199159955e-06, "loss": 1.209383487701416, "step": 3158 }, { "epoch": 0.972681800692574, "grad_norm": 2.1875, "learning_rate": 3.186638600460491e-06, "loss": 1.0300054550170898, "step": 3160 }, { "epoch": 0.9732974220854175, "grad_norm": 10.4375, "learning_rate": 3.1846534955380257e-06, "loss": 1.1945561170578003, "step": 3162 }, { "epoch": 0.9739130434782609, "grad_norm": 13.0, "learning_rate": 3.1826682071212435e-06, "loss": 1.2087066173553467, "step": 3164 }, { "epoch": 0.9745286648711042, "grad_norm": 6.4375, "learning_rate": 3.1806827371829687e-06, "loss": 1.490106463432312, "step": 3166 }, { "epoch": 0.9751442862639477, "grad_norm": 7.75, "learning_rate": 3.1786970876962076e-06, "loss": 1.6063194274902344, "step": 3168 }, { "epoch": 0.9757599076567911, "grad_norm": 12.125, "learning_rate": 3.1767112606341454e-06, "loss": 1.3124420642852783, "step": 3170 }, { "epoch": 0.9763755290496344, "grad_norm": 9.6875, "learning_rate": 3.1747252579701415e-06, "loss": 1.1164565086364746, "step": 3172 }, { "epoch": 0.9769911504424779, "grad_norm": 8.5625, "learning_rate": 3.1727390816777326e-06, "loss": 1.6541111469268799, "step": 3174 }, { "epoch": 0.9776067718353213, "grad_norm": 8.0625, "learning_rate": 3.1707527337306266e-06, "loss": 1.4673432111740112, "step": 3176 }, { "epoch": 0.9782223932281647, "grad_norm": 6.21875, "learning_rate": 3.168766216102701e-06, "loss": 1.450014352798462, "step": 3178 }, { "epoch": 0.9788380146210081, "grad_norm": 17.875, "learning_rate": 3.166779530768004e-06, "loss": 1.7292819023132324, "step": 3180 }, { "epoch": 0.9794536360138515, "grad_norm": 33.25, "learning_rate": 3.164792679700748e-06, "loss": 1.9745506048202515, "step": 3182 }, { "epoch": 0.9800692574066949, "grad_norm": 14.375, "learning_rate": 3.1628056648753127e-06, "loss": 1.881894826889038, "step": 3184 }, { "epoch": 0.9806848787995383, "grad_norm": 11.875, "learning_rate": 3.1608184882662386e-06, "loss": 1.3349779844284058, "step": 3186 }, { "epoch": 0.9813005001923817, "grad_norm": 14.625, "learning_rate": 3.158831151848228e-06, "loss": 1.4845685958862305, "step": 3188 }, { "epoch": 0.9819161215852251, "grad_norm": 5.09375, "learning_rate": 3.1568436575961412e-06, "loss": 1.3236198425292969, "step": 3190 }, { "epoch": 0.9825317429780684, "grad_norm": 7.46875, "learning_rate": 3.1548560074849965e-06, "loss": 0.7392821311950684, "step": 3192 }, { "epoch": 0.9831473643709119, "grad_norm": 11.375, "learning_rate": 3.152868203489965e-06, "loss": 1.5636484622955322, "step": 3194 }, { "epoch": 0.9837629857637553, "grad_norm": 2.25, "learning_rate": 3.150880247586374e-06, "loss": 1.1580954790115356, "step": 3196 }, { "epoch": 0.9843786071565986, "grad_norm": 6.125, "learning_rate": 3.1488921417496985e-06, "loss": 1.0804388523101807, "step": 3198 }, { "epoch": 0.9849942285494421, "grad_norm": 7.8125, "learning_rate": 3.1469038879555647e-06, "loss": 1.2902541160583496, "step": 3200 }, { "epoch": 0.9856098499422855, "grad_norm": 5.0, "learning_rate": 3.144915488179744e-06, "loss": 1.177592158317566, "step": 3202 }, { "epoch": 0.986225471335129, "grad_norm": 4.9375, "learning_rate": 3.1429269443981537e-06, "loss": 0.9780274629592896, "step": 3204 }, { "epoch": 0.9868410927279723, "grad_norm": 8.5, "learning_rate": 3.1409382585868553e-06, "loss": 1.2001250982284546, "step": 3206 }, { "epoch": 0.9874567141208157, "grad_norm": 7.78125, "learning_rate": 3.1389494327220506e-06, "loss": 1.0966755151748657, "step": 3208 }, { "epoch": 0.9880723355136591, "grad_norm": 5.8125, "learning_rate": 3.1369604687800804e-06, "loss": 1.4548532962799072, "step": 3210 }, { "epoch": 0.9886879569065025, "grad_norm": 11.5625, "learning_rate": 3.1349713687374213e-06, "loss": 1.450698733329773, "step": 3212 }, { "epoch": 0.9893035782993459, "grad_norm": 3.234375, "learning_rate": 3.1329821345706877e-06, "loss": 1.2463198900222778, "step": 3214 }, { "epoch": 0.9899191996921893, "grad_norm": 10.125, "learning_rate": 3.1309927682566266e-06, "loss": 1.4055688381195068, "step": 3216 }, { "epoch": 0.9905348210850327, "grad_norm": 11.3125, "learning_rate": 3.1290032717721143e-06, "loss": 1.20082426071167, "step": 3218 }, { "epoch": 0.9911504424778761, "grad_norm": 16.625, "learning_rate": 3.1270136470941604e-06, "loss": 1.1681442260742188, "step": 3220 }, { "epoch": 0.9917660638707195, "grad_norm": 8.125, "learning_rate": 3.1250238961998972e-06, "loss": 1.6339752674102783, "step": 3222 }, { "epoch": 0.992381685263563, "grad_norm": 11.875, "learning_rate": 3.1230340210665866e-06, "loss": 1.3062167167663574, "step": 3224 }, { "epoch": 0.9929973066564063, "grad_norm": 7.53125, "learning_rate": 3.121044023671611e-06, "loss": 1.5589596033096313, "step": 3226 }, { "epoch": 0.9936129280492497, "grad_norm": 7.78125, "learning_rate": 3.1190539059924756e-06, "loss": 1.5615897178649902, "step": 3228 }, { "epoch": 0.9942285494420932, "grad_norm": 13.375, "learning_rate": 3.117063670006806e-06, "loss": 1.5788617134094238, "step": 3230 }, { "epoch": 0.9948441708349365, "grad_norm": 11.125, "learning_rate": 3.115073317692342e-06, "loss": 1.3353925943374634, "step": 3232 }, { "epoch": 0.9954597922277799, "grad_norm": 4.03125, "learning_rate": 3.113082851026944e-06, "loss": 0.8592925667762756, "step": 3234 }, { "epoch": 0.9960754136206234, "grad_norm": 5.125, "learning_rate": 3.1110922719885817e-06, "loss": 0.9766306281089783, "step": 3236 }, { "epoch": 0.9966910350134667, "grad_norm": 5.21875, "learning_rate": 3.109101582555338e-06, "loss": 0.9334098100662231, "step": 3238 }, { "epoch": 0.9973066564063101, "grad_norm": 2.640625, "learning_rate": 3.1071107847054074e-06, "loss": 0.8099700212478638, "step": 3240 }, { "epoch": 0.9979222777991535, "grad_norm": 4.1875, "learning_rate": 3.1051198804170877e-06, "loss": 1.2748355865478516, "step": 3242 }, { "epoch": 0.9985378991919969, "grad_norm": 9.5625, "learning_rate": 3.103128871668787e-06, "loss": 1.2080063819885254, "step": 3244 }, { "epoch": 0.9991535205848403, "grad_norm": 11.1875, "learning_rate": 3.1011377604390147e-06, "loss": 1.3822190761566162, "step": 3246 }, { "epoch": 0.9997691419776837, "grad_norm": 6.34375, "learning_rate": 3.099146548706383e-06, "loss": 1.387730360031128, "step": 3248 }, { "epoch": 1.0003078106964216, "grad_norm": 1.5546875, "learning_rate": 3.0971552384496028e-06, "loss": 1.398690938949585, "step": 3250 }, { "epoch": 1.000923432089265, "grad_norm": 10.375, "learning_rate": 3.0951638316474853e-06, "loss": 1.44259774684906, "step": 3252 }, { "epoch": 1.0015390534821085, "grad_norm": 5.4375, "learning_rate": 3.0931723302789352e-06, "loss": 1.1998789310455322, "step": 3254 }, { "epoch": 1.002154674874952, "grad_norm": 11.125, "learning_rate": 3.0911807363229505e-06, "loss": 1.4243640899658203, "step": 3256 }, { "epoch": 1.0027702962677953, "grad_norm": 6.65625, "learning_rate": 3.0891890517586254e-06, "loss": 1.6670856475830078, "step": 3258 }, { "epoch": 1.0033859176606388, "grad_norm": 1.6796875, "learning_rate": 3.0871972785651395e-06, "loss": 1.1755706071853638, "step": 3260 }, { "epoch": 1.0040015390534822, "grad_norm": 12.5, "learning_rate": 3.085205418721764e-06, "loss": 0.7385867238044739, "step": 3262 }, { "epoch": 1.0046171604463254, "grad_norm": 5.15625, "learning_rate": 3.083213474207854e-06, "loss": 1.5130101442337036, "step": 3264 }, { "epoch": 1.0052327818391689, "grad_norm": 5.6875, "learning_rate": 3.081221447002849e-06, "loss": 1.2244466543197632, "step": 3266 }, { "epoch": 1.0058484032320123, "grad_norm": 6.9375, "learning_rate": 3.0792293390862715e-06, "loss": 1.3093106746673584, "step": 3268 }, { "epoch": 1.0064640246248557, "grad_norm": 19.625, "learning_rate": 3.0772371524377242e-06, "loss": 1.218044638633728, "step": 3270 }, { "epoch": 1.0070796460176992, "grad_norm": 13.4375, "learning_rate": 3.0752448890368865e-06, "loss": 1.5576331615447998, "step": 3272 }, { "epoch": 1.0076952674105426, "grad_norm": 12.5, "learning_rate": 3.0732525508635157e-06, "loss": 1.055126667022705, "step": 3274 }, { "epoch": 1.0083108888033858, "grad_norm": 10.8125, "learning_rate": 3.071260139897445e-06, "loss": 1.1821177005767822, "step": 3276 }, { "epoch": 1.0089265101962293, "grad_norm": 10.6875, "learning_rate": 3.069267658118574e-06, "loss": 1.4936022758483887, "step": 3278 }, { "epoch": 1.0095421315890727, "grad_norm": 6.25, "learning_rate": 3.0672751075068796e-06, "loss": 1.2071452140808105, "step": 3280 }, { "epoch": 1.0101577529819161, "grad_norm": 8.875, "learning_rate": 3.0652824900424015e-06, "loss": 1.8205440044403076, "step": 3282 }, { "epoch": 1.0107733743747596, "grad_norm": 4.75, "learning_rate": 3.063289807705251e-06, "loss": 1.3226101398468018, "step": 3284 }, { "epoch": 1.011388995767603, "grad_norm": 7.40625, "learning_rate": 3.061297062475599e-06, "loss": 1.2532638311386108, "step": 3286 }, { "epoch": 1.0120046171604464, "grad_norm": 2.765625, "learning_rate": 3.059304256333682e-06, "loss": 1.1202727556228638, "step": 3288 }, { "epoch": 1.0126202385532896, "grad_norm": 4.65625, "learning_rate": 3.0573113912597967e-06, "loss": 1.1009318828582764, "step": 3290 }, { "epoch": 1.013235859946133, "grad_norm": 9.6875, "learning_rate": 3.0553184692342967e-06, "loss": 1.1363499164581299, "step": 3292 }, { "epoch": 1.0138514813389765, "grad_norm": 5.8125, "learning_rate": 3.0533254922375942e-06, "loss": 1.536495327949524, "step": 3294 }, { "epoch": 1.01446710273182, "grad_norm": 5.15625, "learning_rate": 3.051332462250155e-06, "loss": 1.1987110376358032, "step": 3296 }, { "epoch": 1.0150827241246634, "grad_norm": 5.8125, "learning_rate": 3.049339381252497e-06, "loss": 1.3810436725616455, "step": 3298 }, { "epoch": 1.0156983455175068, "grad_norm": 1.6953125, "learning_rate": 3.04734625122519e-06, "loss": 1.0787886381149292, "step": 3300 }, { "epoch": 1.01631396691035, "grad_norm": 5.375, "learning_rate": 3.045353074148851e-06, "loss": 1.5245403051376343, "step": 3302 }, { "epoch": 1.0169295883031935, "grad_norm": 3.515625, "learning_rate": 3.0433598520041462e-06, "loss": 1.351635217666626, "step": 3304 }, { "epoch": 1.017545209696037, "grad_norm": 14.3125, "learning_rate": 3.041366586771786e-06, "loss": 1.6581358909606934, "step": 3306 }, { "epoch": 1.0181608310888803, "grad_norm": 9.5, "learning_rate": 3.0393732804325193e-06, "loss": 1.514939785003662, "step": 3308 }, { "epoch": 1.0187764524817238, "grad_norm": 17.0, "learning_rate": 3.037379934967142e-06, "loss": 1.204219102859497, "step": 3310 }, { "epoch": 1.0193920738745672, "grad_norm": 1.921875, "learning_rate": 3.0353865523564854e-06, "loss": 1.2337111234664917, "step": 3312 }, { "epoch": 1.0200076952674106, "grad_norm": 9.1875, "learning_rate": 3.0333931345814177e-06, "loss": 1.1594010591506958, "step": 3314 }, { "epoch": 1.0206233166602539, "grad_norm": 3.96875, "learning_rate": 3.031399683622844e-06, "loss": 1.0055603981018066, "step": 3316 }, { "epoch": 1.0212389380530973, "grad_norm": 13.375, "learning_rate": 3.029406201461702e-06, "loss": 1.6027551889419556, "step": 3318 }, { "epoch": 1.0218545594459407, "grad_norm": 35.75, "learning_rate": 3.0274126900789575e-06, "loss": 1.4393030405044556, "step": 3320 }, { "epoch": 1.0224701808387842, "grad_norm": 6.46875, "learning_rate": 3.0254191514556084e-06, "loss": 1.414679765701294, "step": 3322 }, { "epoch": 1.0230858022316276, "grad_norm": 13.5, "learning_rate": 3.023425587572678e-06, "loss": 1.354950189590454, "step": 3324 }, { "epoch": 1.023701423624471, "grad_norm": 8.875, "learning_rate": 3.0214320004112176e-06, "loss": 1.4962619543075562, "step": 3326 }, { "epoch": 1.0243170450173142, "grad_norm": 31.125, "learning_rate": 3.019438391952297e-06, "loss": 1.1890467405319214, "step": 3328 }, { "epoch": 1.0249326664101577, "grad_norm": 4.53125, "learning_rate": 3.017444764177011e-06, "loss": 1.1171259880065918, "step": 3330 }, { "epoch": 1.0255482878030011, "grad_norm": 5.8125, "learning_rate": 3.0154511190664713e-06, "loss": 1.2560460567474365, "step": 3332 }, { "epoch": 1.0261639091958445, "grad_norm": 4.40625, "learning_rate": 3.0134574586018085e-06, "loss": 0.9332529902458191, "step": 3334 }, { "epoch": 1.026779530588688, "grad_norm": 12.5625, "learning_rate": 3.011463784764168e-06, "loss": 1.6173343658447266, "step": 3336 }, { "epoch": 1.0273951519815314, "grad_norm": 12.25, "learning_rate": 3.009470099534707e-06, "loss": 1.3854453563690186, "step": 3338 }, { "epoch": 1.0280107733743749, "grad_norm": 5.15625, "learning_rate": 3.0074764048945963e-06, "loss": 1.3140357732772827, "step": 3340 }, { "epoch": 1.028626394767218, "grad_norm": 7.59375, "learning_rate": 3.005482702825014e-06, "loss": 1.7254990339279175, "step": 3342 }, { "epoch": 1.0292420161600615, "grad_norm": 13.5, "learning_rate": 3.0034889953071466e-06, "loss": 1.5347199440002441, "step": 3344 }, { "epoch": 1.029857637552905, "grad_norm": 8.5625, "learning_rate": 3.0014952843221874e-06, "loss": 1.156404733657837, "step": 3346 }, { "epoch": 1.0304732589457484, "grad_norm": 10.0625, "learning_rate": 2.9995015718513296e-06, "loss": 1.064646601676941, "step": 3348 }, { "epoch": 1.0310888803385918, "grad_norm": 7.5, "learning_rate": 2.9975078598757723e-06, "loss": 1.219983696937561, "step": 3350 }, { "epoch": 1.0317045017314352, "grad_norm": 16.375, "learning_rate": 2.9955141503767093e-06, "loss": 1.3969101905822754, "step": 3352 }, { "epoch": 1.0323201231242787, "grad_norm": 8.1875, "learning_rate": 2.9935204453353363e-06, "loss": 1.3140618801116943, "step": 3354 }, { "epoch": 1.032935744517122, "grad_norm": 7.0625, "learning_rate": 2.9915267467328414e-06, "loss": 1.3451359272003174, "step": 3356 }, { "epoch": 1.0335513659099653, "grad_norm": 60.25, "learning_rate": 2.9895330565504088e-06, "loss": 1.5487874746322632, "step": 3358 }, { "epoch": 1.0341669873028088, "grad_norm": 4.90625, "learning_rate": 2.9875393767692117e-06, "loss": 1.08141028881073, "step": 3360 }, { "epoch": 1.0347826086956522, "grad_norm": 11.375, "learning_rate": 2.9855457093704166e-06, "loss": 1.8934881687164307, "step": 3362 }, { "epoch": 1.0353982300884956, "grad_norm": 8.875, "learning_rate": 2.9835520563351737e-06, "loss": 1.5357357263565063, "step": 3364 }, { "epoch": 1.036013851481339, "grad_norm": 6.40625, "learning_rate": 2.98155841964462e-06, "loss": 1.1267738342285156, "step": 3366 }, { "epoch": 1.0366294728741823, "grad_norm": 4.5, "learning_rate": 2.9795648012798795e-06, "loss": 1.5629507303237915, "step": 3368 }, { "epoch": 1.0372450942670257, "grad_norm": 22.0, "learning_rate": 2.9775712032220526e-06, "loss": 1.2372854948043823, "step": 3370 }, { "epoch": 1.0378607156598691, "grad_norm": 3.78125, "learning_rate": 2.975577627452225e-06, "loss": 1.0697031021118164, "step": 3372 }, { "epoch": 1.0384763370527126, "grad_norm": 1.921875, "learning_rate": 2.973584075951456e-06, "loss": 1.0637809038162231, "step": 3374 }, { "epoch": 1.039091958445556, "grad_norm": 8.25, "learning_rate": 2.9715905507007837e-06, "loss": 1.3433064222335815, "step": 3376 }, { "epoch": 1.0397075798383995, "grad_norm": 5.75, "learning_rate": 2.969597053681217e-06, "loss": 1.0099010467529297, "step": 3378 }, { "epoch": 1.0403232012312429, "grad_norm": 5.34375, "learning_rate": 2.9676035868737397e-06, "loss": 1.3029708862304688, "step": 3380 }, { "epoch": 1.040938822624086, "grad_norm": 18.75, "learning_rate": 2.965610152259304e-06, "loss": 1.4256157875061035, "step": 3382 }, { "epoch": 1.0415544440169295, "grad_norm": 1.84375, "learning_rate": 2.9636167518188308e-06, "loss": 1.1102657318115234, "step": 3384 }, { "epoch": 1.042170065409773, "grad_norm": 3.046875, "learning_rate": 2.961623387533208e-06, "loss": 0.9856078624725342, "step": 3386 }, { "epoch": 1.0427856868026164, "grad_norm": 5.96875, "learning_rate": 2.959630061383285e-06, "loss": 1.6799319982528687, "step": 3388 }, { "epoch": 1.0434013081954598, "grad_norm": 6.5, "learning_rate": 2.957636775349874e-06, "loss": 1.4677543640136719, "step": 3390 }, { "epoch": 1.0440169295883033, "grad_norm": 5.78125, "learning_rate": 2.9556435314137495e-06, "loss": 1.3481136560440063, "step": 3392 }, { "epoch": 1.0446325509811465, "grad_norm": 4.1875, "learning_rate": 2.953650331555642e-06, "loss": 1.2225199937820435, "step": 3394 }, { "epoch": 1.04524817237399, "grad_norm": 20.25, "learning_rate": 2.9516571777562387e-06, "loss": 1.614674687385559, "step": 3396 }, { "epoch": 1.0458637937668334, "grad_norm": 17.75, "learning_rate": 2.949664071996182e-06, "loss": 1.031624674797058, "step": 3398 }, { "epoch": 1.0464794151596768, "grad_norm": 6.1875, "learning_rate": 2.947671016256066e-06, "loss": 1.179604172706604, "step": 3400 }, { "epoch": 1.0470950365525202, "grad_norm": 25.0, "learning_rate": 2.945678012516433e-06, "loss": 1.6507970094680786, "step": 3402 }, { "epoch": 1.0477106579453637, "grad_norm": 4.71875, "learning_rate": 2.943685062757777e-06, "loss": 1.0796221494674683, "step": 3404 }, { "epoch": 1.048326279338207, "grad_norm": 8.6875, "learning_rate": 2.941692168960536e-06, "loss": 1.5195220708847046, "step": 3406 }, { "epoch": 1.0489419007310503, "grad_norm": 11.9375, "learning_rate": 2.9396993331050944e-06, "loss": 1.0404980182647705, "step": 3408 }, { "epoch": 1.0495575221238937, "grad_norm": 6.0, "learning_rate": 2.937706557171777e-06, "loss": 1.3892185688018799, "step": 3410 }, { "epoch": 1.0501731435167372, "grad_norm": 3.3125, "learning_rate": 2.9357138431408493e-06, "loss": 0.933286726474762, "step": 3412 }, { "epoch": 1.0507887649095806, "grad_norm": 21.125, "learning_rate": 2.933721192992518e-06, "loss": 0.8847688436508179, "step": 3414 }, { "epoch": 1.051404386302424, "grad_norm": 32.0, "learning_rate": 2.9317286087069225e-06, "loss": 1.8928682804107666, "step": 3416 }, { "epoch": 1.0520200076952675, "grad_norm": 6.15625, "learning_rate": 2.9297360922641393e-06, "loss": 1.357107162475586, "step": 3418 }, { "epoch": 1.052635629088111, "grad_norm": 1.9765625, "learning_rate": 2.9277436456441755e-06, "loss": 1.1912208795547485, "step": 3420 }, { "epoch": 1.0532512504809541, "grad_norm": 19.625, "learning_rate": 2.92575127082697e-06, "loss": 1.3276985883712769, "step": 3422 }, { "epoch": 1.0538668718737976, "grad_norm": 3.765625, "learning_rate": 2.9237589697923917e-06, "loss": 1.3126859664916992, "step": 3424 }, { "epoch": 1.054482493266641, "grad_norm": 4.75, "learning_rate": 2.921766744520235e-06, "loss": 1.3983420133590698, "step": 3426 }, { "epoch": 1.0550981146594844, "grad_norm": 7.1875, "learning_rate": 2.919774596990217e-06, "loss": 1.2710156440734863, "step": 3428 }, { "epoch": 1.0557137360523279, "grad_norm": 6.21875, "learning_rate": 2.917782529181981e-06, "loss": 1.5055437088012695, "step": 3430 }, { "epoch": 1.0563293574451713, "grad_norm": 36.75, "learning_rate": 2.9157905430750884e-06, "loss": 0.8962615132331848, "step": 3432 }, { "epoch": 1.0569449788380145, "grad_norm": 18.875, "learning_rate": 2.9137986406490205e-06, "loss": 1.3401298522949219, "step": 3434 }, { "epoch": 1.057560600230858, "grad_norm": 9.1875, "learning_rate": 2.9118068238831755e-06, "loss": 0.9295080900192261, "step": 3436 }, { "epoch": 1.0581762216237014, "grad_norm": 7.0, "learning_rate": 2.909815094756867e-06, "loss": 1.655866026878357, "step": 3438 }, { "epoch": 1.0587918430165448, "grad_norm": 6.3125, "learning_rate": 2.907823455249321e-06, "loss": 1.320584774017334, "step": 3440 }, { "epoch": 1.0594074644093883, "grad_norm": 2.1875, "learning_rate": 2.9058319073396725e-06, "loss": 1.1577818393707275, "step": 3442 }, { "epoch": 1.0600230858022317, "grad_norm": 4.53125, "learning_rate": 2.9038404530069687e-06, "loss": 1.1551495790481567, "step": 3444 }, { "epoch": 1.060638707195075, "grad_norm": 7.6875, "learning_rate": 2.9018490942301625e-06, "loss": 1.2440279722213745, "step": 3446 }, { "epoch": 1.0612543285879183, "grad_norm": 9.3125, "learning_rate": 2.899857832988112e-06, "loss": 1.4411247968673706, "step": 3448 }, { "epoch": 1.0618699499807618, "grad_norm": 12.375, "learning_rate": 2.897866671259577e-06, "loss": 1.2047297954559326, "step": 3450 }, { "epoch": 1.0624855713736052, "grad_norm": 8.3125, "learning_rate": 2.8958756110232212e-06, "loss": 1.1603564023971558, "step": 3452 }, { "epoch": 1.0631011927664487, "grad_norm": 8.0, "learning_rate": 2.893884654257604e-06, "loss": 1.3994611501693726, "step": 3454 }, { "epoch": 1.063716814159292, "grad_norm": 8.375, "learning_rate": 2.8918938029411836e-06, "loss": 1.4442249536514282, "step": 3456 }, { "epoch": 1.0643324355521355, "grad_norm": 19.625, "learning_rate": 2.889903059052315e-06, "loss": 1.4329032897949219, "step": 3458 }, { "epoch": 1.0649480569449787, "grad_norm": 8.5, "learning_rate": 2.8879124245692456e-06, "loss": 1.8946828842163086, "step": 3460 }, { "epoch": 1.0655636783378222, "grad_norm": 6.625, "learning_rate": 2.8859219014701112e-06, "loss": 1.313145637512207, "step": 3462 }, { "epoch": 1.0661792997306656, "grad_norm": 11.8125, "learning_rate": 2.883931491732942e-06, "loss": 1.3232002258300781, "step": 3464 }, { "epoch": 1.066794921123509, "grad_norm": 10.5625, "learning_rate": 2.8819411973356497e-06, "loss": 1.1899701356887817, "step": 3466 }, { "epoch": 1.0674105425163525, "grad_norm": 10.25, "learning_rate": 2.8799510202560366e-06, "loss": 1.5477254390716553, "step": 3468 }, { "epoch": 1.068026163909196, "grad_norm": 4.8125, "learning_rate": 2.8779609624717854e-06, "loss": 1.286978006362915, "step": 3470 }, { "epoch": 1.0686417853020393, "grad_norm": 8.25, "learning_rate": 2.8759710259604616e-06, "loss": 1.2044036388397217, "step": 3472 }, { "epoch": 1.0692574066948826, "grad_norm": 6.09375, "learning_rate": 2.8739812126995094e-06, "loss": 1.1230486631393433, "step": 3474 }, { "epoch": 1.069873028087726, "grad_norm": 1.3359375, "learning_rate": 2.871991524666251e-06, "loss": 1.222065806388855, "step": 3476 }, { "epoch": 1.0704886494805694, "grad_norm": 10.1875, "learning_rate": 2.8700019638378846e-06, "loss": 1.2572171688079834, "step": 3478 }, { "epoch": 1.0711042708734129, "grad_norm": 5.28125, "learning_rate": 2.86801253219148e-06, "loss": 1.5641423463821411, "step": 3480 }, { "epoch": 1.0717198922662563, "grad_norm": 4.84375, "learning_rate": 2.8660232317039804e-06, "loss": 1.1697967052459717, "step": 3482 }, { "epoch": 1.0723355136590997, "grad_norm": 10.25, "learning_rate": 2.8640340643521996e-06, "loss": 1.2459495067596436, "step": 3484 }, { "epoch": 1.0729511350519432, "grad_norm": 8.0, "learning_rate": 2.862045032112817e-06, "loss": 1.1779091358184814, "step": 3486 }, { "epoch": 1.0735667564447864, "grad_norm": 2.78125, "learning_rate": 2.860056136962377e-06, "loss": 1.218005657196045, "step": 3488 }, { "epoch": 1.0741823778376298, "grad_norm": 11.4375, "learning_rate": 2.858067380877292e-06, "loss": 1.2789708375930786, "step": 3490 }, { "epoch": 1.0747979992304733, "grad_norm": 6.1875, "learning_rate": 2.856078765833832e-06, "loss": 1.0197508335113525, "step": 3492 }, { "epoch": 1.0754136206233167, "grad_norm": 4.34375, "learning_rate": 2.8540902938081285e-06, "loss": 1.2939587831497192, "step": 3494 }, { "epoch": 1.0760292420161601, "grad_norm": 4.28125, "learning_rate": 2.8521019667761697e-06, "loss": 1.3281444311141968, "step": 3496 }, { "epoch": 1.0766448634090036, "grad_norm": 2.40625, "learning_rate": 2.8501137867138017e-06, "loss": 1.1961374282836914, "step": 3498 }, { "epoch": 1.0772604848018468, "grad_norm": 12.4375, "learning_rate": 2.8481257555967244e-06, "loss": 1.1601097583770752, "step": 3500 }, { "epoch": 1.0778761061946902, "grad_norm": 5.0, "learning_rate": 2.8461378754004886e-06, "loss": 1.362778663635254, "step": 3502 }, { "epoch": 1.0784917275875336, "grad_norm": 11.6875, "learning_rate": 2.844150148100495e-06, "loss": 1.5003156661987305, "step": 3504 }, { "epoch": 1.079107348980377, "grad_norm": 5.53125, "learning_rate": 2.8421625756719923e-06, "loss": 1.8150582313537598, "step": 3506 }, { "epoch": 1.0797229703732205, "grad_norm": 9.6875, "learning_rate": 2.840175160090076e-06, "loss": 1.5286773443222046, "step": 3508 }, { "epoch": 1.080338591766064, "grad_norm": 16.625, "learning_rate": 2.8381879033296856e-06, "loss": 1.0091629028320312, "step": 3510 }, { "epoch": 1.0809542131589072, "grad_norm": 11.6875, "learning_rate": 2.8362008073656033e-06, "loss": 1.7487086057662964, "step": 3512 }, { "epoch": 1.0815698345517506, "grad_norm": 3.734375, "learning_rate": 2.83421387417245e-06, "loss": 1.1365066766738892, "step": 3514 }, { "epoch": 1.082185455944594, "grad_norm": 2.3125, "learning_rate": 2.8322271057246864e-06, "loss": 1.261230230331421, "step": 3516 }, { "epoch": 1.0828010773374375, "grad_norm": 19.25, "learning_rate": 2.830240503996609e-06, "loss": 1.410510778427124, "step": 3518 }, { "epoch": 1.083416698730281, "grad_norm": 8.4375, "learning_rate": 2.8282540709623473e-06, "loss": 1.1378717422485352, "step": 3520 }, { "epoch": 1.0840323201231243, "grad_norm": 5.6875, "learning_rate": 2.8262678085958657e-06, "loss": 1.0326919555664062, "step": 3522 }, { "epoch": 1.0846479415159678, "grad_norm": 11.4375, "learning_rate": 2.8242817188709563e-06, "loss": 1.4649014472961426, "step": 3524 }, { "epoch": 1.085263562908811, "grad_norm": 8.0625, "learning_rate": 2.8222958037612423e-06, "loss": 1.2231855392456055, "step": 3526 }, { "epoch": 1.0858791843016544, "grad_norm": 7.09375, "learning_rate": 2.8203100652401714e-06, "loss": 1.3849393129348755, "step": 3528 }, { "epoch": 1.0864948056944979, "grad_norm": 12.875, "learning_rate": 2.818324505281017e-06, "loss": 1.4800243377685547, "step": 3530 }, { "epoch": 1.0871104270873413, "grad_norm": 4.1875, "learning_rate": 2.8163391258568738e-06, "loss": 1.0491694211959839, "step": 3532 }, { "epoch": 1.0877260484801847, "grad_norm": 18.5, "learning_rate": 2.8143539289406583e-06, "loss": 1.6905831098556519, "step": 3534 }, { "epoch": 1.0883416698730282, "grad_norm": 7.71875, "learning_rate": 2.8123689165051042e-06, "loss": 1.5698007345199585, "step": 3536 }, { "epoch": 1.0889572912658716, "grad_norm": 4.1875, "learning_rate": 2.810384090522765e-06, "loss": 0.8298982381820679, "step": 3538 }, { "epoch": 1.0895729126587148, "grad_norm": 7.0, "learning_rate": 2.808399452966004e-06, "loss": 1.7035311460494995, "step": 3540 }, { "epoch": 1.0901885340515582, "grad_norm": 2.53125, "learning_rate": 2.8064150058070026e-06, "loss": 0.7278132438659668, "step": 3542 }, { "epoch": 1.0908041554444017, "grad_norm": 15.625, "learning_rate": 2.804430751017749e-06, "loss": 1.5310912132263184, "step": 3544 }, { "epoch": 1.0914197768372451, "grad_norm": 4.59375, "learning_rate": 2.802446690570042e-06, "loss": 1.3212103843688965, "step": 3546 }, { "epoch": 1.0920353982300885, "grad_norm": 7.75, "learning_rate": 2.8004628264354873e-06, "loss": 0.8523740768432617, "step": 3548 }, { "epoch": 1.092651019622932, "grad_norm": 4.8125, "learning_rate": 2.7984791605854935e-06, "loss": 1.2967448234558105, "step": 3550 }, { "epoch": 1.0932666410157752, "grad_norm": 46.5, "learning_rate": 2.796495694991276e-06, "loss": 1.3590329885482788, "step": 3552 }, { "epoch": 1.0938822624086186, "grad_norm": 7.625, "learning_rate": 2.794512431623847e-06, "loss": 1.7286721467971802, "step": 3554 }, { "epoch": 1.094497883801462, "grad_norm": 12.0625, "learning_rate": 2.7925293724540226e-06, "loss": 1.0495727062225342, "step": 3556 }, { "epoch": 1.0951135051943055, "grad_norm": 17.875, "learning_rate": 2.7905465194524106e-06, "loss": 1.4596565961837769, "step": 3558 }, { "epoch": 1.095729126587149, "grad_norm": 9.3125, "learning_rate": 2.788563874589417e-06, "loss": 2.065638780593872, "step": 3560 }, { "epoch": 1.0963447479799924, "grad_norm": 8.9375, "learning_rate": 2.7865814398352412e-06, "loss": 1.5109493732452393, "step": 3562 }, { "epoch": 1.0969603693728358, "grad_norm": 6.90625, "learning_rate": 2.7845992171598724e-06, "loss": 0.8623924255371094, "step": 3564 }, { "epoch": 1.097575990765679, "grad_norm": 9.4375, "learning_rate": 2.7826172085330895e-06, "loss": 1.1472525596618652, "step": 3566 }, { "epoch": 1.0981916121585225, "grad_norm": 36.75, "learning_rate": 2.780635415924461e-06, "loss": 0.913570761680603, "step": 3568 }, { "epoch": 1.098807233551366, "grad_norm": 4.84375, "learning_rate": 2.7786538413033347e-06, "loss": 1.163435459136963, "step": 3570 }, { "epoch": 1.0994228549442093, "grad_norm": 11.8125, "learning_rate": 2.7766724866388496e-06, "loss": 1.3768595457077026, "step": 3572 }, { "epoch": 1.1000384763370528, "grad_norm": 2.296875, "learning_rate": 2.7746913538999197e-06, "loss": 0.9041175246238708, "step": 3574 }, { "epoch": 1.1006540977298962, "grad_norm": 5.15625, "learning_rate": 2.772710445055242e-06, "loss": 1.5667126178741455, "step": 3576 }, { "epoch": 1.1012697191227394, "grad_norm": 2.8125, "learning_rate": 2.7707297620732897e-06, "loss": 1.352919340133667, "step": 3578 }, { "epoch": 1.1018853405155828, "grad_norm": 12.5625, "learning_rate": 2.7687493069223128e-06, "loss": 1.1133570671081543, "step": 3580 }, { "epoch": 1.1025009619084263, "grad_norm": 4.9375, "learning_rate": 2.766769081570333e-06, "loss": 1.3237600326538086, "step": 3582 }, { "epoch": 1.1031165833012697, "grad_norm": 7.25, "learning_rate": 2.7647890879851447e-06, "loss": 1.3910489082336426, "step": 3584 }, { "epoch": 1.1037322046941132, "grad_norm": 12.9375, "learning_rate": 2.7628093281343127e-06, "loss": 0.8161633014678955, "step": 3586 }, { "epoch": 1.1043478260869566, "grad_norm": 4.5625, "learning_rate": 2.760829803985167e-06, "loss": 1.3154006004333496, "step": 3588 }, { "epoch": 1.1049634474798, "grad_norm": 2.59375, "learning_rate": 2.7588505175048074e-06, "loss": 1.3041410446166992, "step": 3590 }, { "epoch": 1.1055790688726432, "grad_norm": 8.0, "learning_rate": 2.7568714706600935e-06, "loss": 1.370842695236206, "step": 3592 }, { "epoch": 1.1061946902654867, "grad_norm": 6.40625, "learning_rate": 2.75489266541765e-06, "loss": 1.2594032287597656, "step": 3594 }, { "epoch": 1.10681031165833, "grad_norm": 4.34375, "learning_rate": 2.7529141037438584e-06, "loss": 1.2902520895004272, "step": 3596 }, { "epoch": 1.1074259330511735, "grad_norm": 2.671875, "learning_rate": 2.7509357876048604e-06, "loss": 1.1863526105880737, "step": 3598 }, { "epoch": 1.108041554444017, "grad_norm": 3.625, "learning_rate": 2.7489577189665535e-06, "loss": 0.8220889568328857, "step": 3600 }, { "epoch": 1.1086571758368604, "grad_norm": 12.875, "learning_rate": 2.7469798997945886e-06, "loss": 1.2545669078826904, "step": 3602 }, { "epoch": 1.1092727972297038, "grad_norm": 5.625, "learning_rate": 2.7450023320543685e-06, "loss": 1.405876874923706, "step": 3604 }, { "epoch": 1.109888418622547, "grad_norm": 5.09375, "learning_rate": 2.743025017711047e-06, "loss": 1.234447717666626, "step": 3606 }, { "epoch": 1.1105040400153905, "grad_norm": 11.25, "learning_rate": 2.7410479587295272e-06, "loss": 1.2985813617706299, "step": 3608 }, { "epoch": 1.111119661408234, "grad_norm": 2.921875, "learning_rate": 2.7390711570744542e-06, "loss": 1.278555154800415, "step": 3610 }, { "epoch": 1.1117352828010774, "grad_norm": 8.625, "learning_rate": 2.7370946147102216e-06, "loss": 1.6551470756530762, "step": 3612 }, { "epoch": 1.1123509041939208, "grad_norm": 3.015625, "learning_rate": 2.7351183336009633e-06, "loss": 1.1302613019943237, "step": 3614 }, { "epoch": 1.1129665255867642, "grad_norm": 4.8125, "learning_rate": 2.733142315710555e-06, "loss": 0.7164484858512878, "step": 3616 }, { "epoch": 1.1135821469796074, "grad_norm": 4.25, "learning_rate": 2.7311665630026086e-06, "loss": 1.392930269241333, "step": 3618 }, { "epoch": 1.1141977683724509, "grad_norm": 3.890625, "learning_rate": 2.7291910774404764e-06, "loss": 1.3824591636657715, "step": 3620 }, { "epoch": 1.1148133897652943, "grad_norm": 10.9375, "learning_rate": 2.727215860987239e-06, "loss": 1.3611738681793213, "step": 3622 }, { "epoch": 1.1154290111581378, "grad_norm": 10.125, "learning_rate": 2.725240915605716e-06, "loss": 1.305256724357605, "step": 3624 }, { "epoch": 1.1160446325509812, "grad_norm": 4.6875, "learning_rate": 2.7232662432584546e-06, "loss": 1.1907715797424316, "step": 3626 }, { "epoch": 1.1166602539438246, "grad_norm": 6.75, "learning_rate": 2.7212918459077293e-06, "loss": 1.2510671615600586, "step": 3628 }, { "epoch": 1.1172758753366678, "grad_norm": 9.5625, "learning_rate": 2.7193177255155447e-06, "loss": 1.1721570491790771, "step": 3630 }, { "epoch": 1.1178914967295113, "grad_norm": 6.6875, "learning_rate": 2.717343884043628e-06, "loss": 1.253555178642273, "step": 3632 }, { "epoch": 1.1185071181223547, "grad_norm": 6.15625, "learning_rate": 2.71537032345343e-06, "loss": 1.282861351966858, "step": 3634 }, { "epoch": 1.1191227395151981, "grad_norm": 11.5625, "learning_rate": 2.713397045706122e-06, "loss": 1.6779778003692627, "step": 3636 }, { "epoch": 1.1197383609080416, "grad_norm": 9.6875, "learning_rate": 2.7114240527625935e-06, "loss": 0.9080994725227356, "step": 3638 }, { "epoch": 1.120353982300885, "grad_norm": 9.25, "learning_rate": 2.7094513465834528e-06, "loss": 1.4127110242843628, "step": 3640 }, { "epoch": 1.1209696036937284, "grad_norm": 6.84375, "learning_rate": 2.7074789291290214e-06, "loss": 1.2764534950256348, "step": 3642 }, { "epoch": 1.1215852250865717, "grad_norm": 4.59375, "learning_rate": 2.7055068023593356e-06, "loss": 1.2790331840515137, "step": 3644 }, { "epoch": 1.122200846479415, "grad_norm": 1.6171875, "learning_rate": 2.703534968234142e-06, "loss": 1.0954314470291138, "step": 3646 }, { "epoch": 1.1228164678722585, "grad_norm": 6.0625, "learning_rate": 2.7015634287128955e-06, "loss": 1.2995126247406006, "step": 3648 }, { "epoch": 1.123432089265102, "grad_norm": 20.5, "learning_rate": 2.6995921857547604e-06, "loss": 1.4568636417388916, "step": 3650 }, { "epoch": 1.1240477106579454, "grad_norm": 8.6875, "learning_rate": 2.697621241318603e-06, "loss": 0.8446935415267944, "step": 3652 }, { "epoch": 1.1246633320507888, "grad_norm": 4.4375, "learning_rate": 2.6956505973629965e-06, "loss": 1.2519776821136475, "step": 3654 }, { "epoch": 1.1252789534436323, "grad_norm": 6.75, "learning_rate": 2.6936802558462136e-06, "loss": 1.2862558364868164, "step": 3656 }, { "epoch": 1.1258945748364755, "grad_norm": 8.75, "learning_rate": 2.6917102187262266e-06, "loss": 1.4903209209442139, "step": 3658 }, { "epoch": 1.126510196229319, "grad_norm": 6.53125, "learning_rate": 2.689740487960707e-06, "loss": 1.075721263885498, "step": 3660 }, { "epoch": 1.1271258176221624, "grad_norm": 15.1875, "learning_rate": 2.687771065507019e-06, "loss": 1.2730168104171753, "step": 3662 }, { "epoch": 1.1277414390150058, "grad_norm": 6.03125, "learning_rate": 2.6858019533222215e-06, "loss": 1.3608458042144775, "step": 3664 }, { "epoch": 1.1283570604078492, "grad_norm": 7.5625, "learning_rate": 2.6838331533630658e-06, "loss": 1.1968542337417603, "step": 3666 }, { "epoch": 1.1289726818006927, "grad_norm": 6.3125, "learning_rate": 2.6818646675859926e-06, "loss": 0.8596151471138, "step": 3668 }, { "epoch": 1.129588303193536, "grad_norm": 22.125, "learning_rate": 2.679896497947131e-06, "loss": 1.338981032371521, "step": 3670 }, { "epoch": 1.1302039245863793, "grad_norm": 15.875, "learning_rate": 2.677928646402296e-06, "loss": 1.0218119621276855, "step": 3672 }, { "epoch": 1.1308195459792227, "grad_norm": 4.90625, "learning_rate": 2.6759611149069826e-06, "loss": 1.3446364402770996, "step": 3674 }, { "epoch": 1.1314351673720662, "grad_norm": 5.03125, "learning_rate": 2.6739939054163734e-06, "loss": 1.1468969583511353, "step": 3676 }, { "epoch": 1.1320507887649096, "grad_norm": 6.59375, "learning_rate": 2.672027019885328e-06, "loss": 1.2110896110534668, "step": 3678 }, { "epoch": 1.132666410157753, "grad_norm": 6.125, "learning_rate": 2.6700604602683856e-06, "loss": 1.3085262775421143, "step": 3680 }, { "epoch": 1.1332820315505965, "grad_norm": 9.4375, "learning_rate": 2.6680942285197586e-06, "loss": 1.596697449684143, "step": 3682 }, { "epoch": 1.1338976529434397, "grad_norm": 11.9375, "learning_rate": 2.666128326593337e-06, "loss": 1.7206666469573975, "step": 3684 }, { "epoch": 1.1345132743362831, "grad_norm": 6.0625, "learning_rate": 2.664162756442682e-06, "loss": 1.8020777702331543, "step": 3686 }, { "epoch": 1.1351288957291266, "grad_norm": 3.96875, "learning_rate": 2.6621975200210238e-06, "loss": 0.736263632774353, "step": 3688 }, { "epoch": 1.13574451712197, "grad_norm": 5.40625, "learning_rate": 2.660232619281261e-06, "loss": 1.3801339864730835, "step": 3690 }, { "epoch": 1.1363601385148134, "grad_norm": 27.25, "learning_rate": 2.6582680561759615e-06, "loss": 1.5115694999694824, "step": 3692 }, { "epoch": 1.1369757599076569, "grad_norm": 3.625, "learning_rate": 2.656303832657354e-06, "loss": 1.0857009887695312, "step": 3694 }, { "epoch": 1.1375913813005, "grad_norm": 24.625, "learning_rate": 2.6543399506773333e-06, "loss": 1.650892734527588, "step": 3696 }, { "epoch": 1.1382070026933435, "grad_norm": 10.4375, "learning_rate": 2.652376412187452e-06, "loss": 1.6021376848220825, "step": 3698 }, { "epoch": 1.138822624086187, "grad_norm": 10.625, "learning_rate": 2.650413219138921e-06, "loss": 1.4878451824188232, "step": 3700 }, { "epoch": 1.1394382454790304, "grad_norm": 3.609375, "learning_rate": 2.648450373482612e-06, "loss": 1.094240427017212, "step": 3702 }, { "epoch": 1.1400538668718738, "grad_norm": 8.875, "learning_rate": 2.646487877169045e-06, "loss": 1.3141735792160034, "step": 3704 }, { "epoch": 1.1406694882647173, "grad_norm": 5.4375, "learning_rate": 2.6445257321483998e-06, "loss": 1.2903178930282593, "step": 3706 }, { "epoch": 1.1412851096575607, "grad_norm": 10.1875, "learning_rate": 2.6425639403705028e-06, "loss": 1.7199811935424805, "step": 3708 }, { "epoch": 1.141900731050404, "grad_norm": 7.21875, "learning_rate": 2.6406025037848316e-06, "loss": 1.6154519319534302, "step": 3710 }, { "epoch": 1.1425163524432473, "grad_norm": 6.3125, "learning_rate": 2.6386414243405068e-06, "loss": 1.5471071004867554, "step": 3712 }, { "epoch": 1.1431319738360908, "grad_norm": 10.8125, "learning_rate": 2.6366807039863e-06, "loss": 1.4274885654449463, "step": 3714 }, { "epoch": 1.1437475952289342, "grad_norm": 12.6875, "learning_rate": 2.6347203446706214e-06, "loss": 1.2440565824508667, "step": 3716 }, { "epoch": 1.1443632166217776, "grad_norm": 12.75, "learning_rate": 2.632760348341524e-06, "loss": 1.8487552404403687, "step": 3718 }, { "epoch": 1.144978838014621, "grad_norm": 3.984375, "learning_rate": 2.6308007169467003e-06, "loss": 1.325544834136963, "step": 3720 }, { "epoch": 1.1455944594074645, "grad_norm": 5.03125, "learning_rate": 2.6288414524334803e-06, "loss": 1.3365601301193237, "step": 3722 }, { "epoch": 1.1462100808003077, "grad_norm": 11.0625, "learning_rate": 2.6268825567488297e-06, "loss": 1.6731278896331787, "step": 3724 }, { "epoch": 1.1468257021931512, "grad_norm": 3.25, "learning_rate": 2.6249240318393454e-06, "loss": 1.1490329504013062, "step": 3726 }, { "epoch": 1.1474413235859946, "grad_norm": 4.15625, "learning_rate": 2.622965879651258e-06, "loss": 1.1322708129882812, "step": 3728 }, { "epoch": 1.148056944978838, "grad_norm": 4.5, "learning_rate": 2.6210081021304278e-06, "loss": 1.2662780284881592, "step": 3730 }, { "epoch": 1.1486725663716815, "grad_norm": 9.8125, "learning_rate": 2.619050701222342e-06, "loss": 1.161523461341858, "step": 3732 }, { "epoch": 1.149288187764525, "grad_norm": 5.0, "learning_rate": 2.617093678872114e-06, "loss": 1.3838979005813599, "step": 3734 }, { "epoch": 1.1499038091573683, "grad_norm": 8.875, "learning_rate": 2.6151370370244807e-06, "loss": 1.4697721004486084, "step": 3736 }, { "epoch": 1.1505194305502116, "grad_norm": 7.90625, "learning_rate": 2.6131807776238007e-06, "loss": 1.370856761932373, "step": 3738 }, { "epoch": 1.151135051943055, "grad_norm": 6.46875, "learning_rate": 2.6112249026140515e-06, "loss": 1.4948546886444092, "step": 3740 }, { "epoch": 1.1517506733358984, "grad_norm": 3.015625, "learning_rate": 2.609269413938832e-06, "loss": 1.002959132194519, "step": 3742 }, { "epoch": 1.1523662947287419, "grad_norm": 7.75, "learning_rate": 2.6073143135413546e-06, "loss": 0.9472619891166687, "step": 3744 }, { "epoch": 1.1529819161215853, "grad_norm": 5.90625, "learning_rate": 2.6053596033644463e-06, "loss": 1.480543851852417, "step": 3746 }, { "epoch": 1.1535975375144285, "grad_norm": 2.6875, "learning_rate": 2.603405285350546e-06, "loss": 1.4469226598739624, "step": 3748 }, { "epoch": 1.154213158907272, "grad_norm": 6.96875, "learning_rate": 2.601451361441705e-06, "loss": 1.3982146978378296, "step": 3750 }, { "epoch": 1.1548287803001154, "grad_norm": 11.5, "learning_rate": 2.5994978335795784e-06, "loss": 1.4059464931488037, "step": 3752 }, { "epoch": 1.1554444016929588, "grad_norm": 13.6875, "learning_rate": 2.5975447037054325e-06, "loss": 1.6139023303985596, "step": 3754 }, { "epoch": 1.1560600230858022, "grad_norm": 4.21875, "learning_rate": 2.595591973760135e-06, "loss": 1.1390726566314697, "step": 3756 }, { "epoch": 1.1566756444786457, "grad_norm": 8.1875, "learning_rate": 2.5936396456841597e-06, "loss": 1.4934390783309937, "step": 3758 }, { "epoch": 1.1572912658714891, "grad_norm": 6.96875, "learning_rate": 2.5916877214175774e-06, "loss": 1.3812040090560913, "step": 3760 }, { "epoch": 1.1579068872643323, "grad_norm": 7.46875, "learning_rate": 2.5897362029000583e-06, "loss": 1.7317631244659424, "step": 3762 }, { "epoch": 1.1585225086571758, "grad_norm": 7.0625, "learning_rate": 2.5877850920708714e-06, "loss": 1.406481385231018, "step": 3764 }, { "epoch": 1.1591381300500192, "grad_norm": 68.5, "learning_rate": 2.585834390868878e-06, "loss": 1.2627127170562744, "step": 3766 }, { "epoch": 1.1597537514428626, "grad_norm": 9.3125, "learning_rate": 2.5838841012325344e-06, "loss": 1.3155207633972168, "step": 3768 }, { "epoch": 1.160369372835706, "grad_norm": 122.5, "learning_rate": 2.581934225099887e-06, "loss": 1.5597208738327026, "step": 3770 }, { "epoch": 1.1609849942285495, "grad_norm": 6.34375, "learning_rate": 2.579984764408572e-06, "loss": 1.4694806337356567, "step": 3772 }, { "epoch": 1.161600615621393, "grad_norm": 9.1875, "learning_rate": 2.578035721095811e-06, "loss": 1.5787906646728516, "step": 3774 }, { "epoch": 1.1622162370142362, "grad_norm": 9.0, "learning_rate": 2.5760870970984132e-06, "loss": 1.554677128791809, "step": 3776 }, { "epoch": 1.1628318584070796, "grad_norm": 7.75, "learning_rate": 2.5741388943527684e-06, "loss": 1.2102149724960327, "step": 3778 }, { "epoch": 1.163447479799923, "grad_norm": 7.4375, "learning_rate": 2.572191114794851e-06, "loss": 1.199662208557129, "step": 3780 }, { "epoch": 1.1640631011927665, "grad_norm": 15.4375, "learning_rate": 2.5702437603602125e-06, "loss": 1.2612106800079346, "step": 3782 }, { "epoch": 1.16467872258561, "grad_norm": 9.875, "learning_rate": 2.568296832983982e-06, "loss": 1.4821970462799072, "step": 3784 }, { "epoch": 1.1652943439784533, "grad_norm": 9.375, "learning_rate": 2.5663503346008663e-06, "loss": 1.2696473598480225, "step": 3786 }, { "epoch": 1.1659099653712968, "grad_norm": 2.015625, "learning_rate": 2.564404267145144e-06, "loss": 1.352256178855896, "step": 3788 }, { "epoch": 1.16652558676414, "grad_norm": 8.8125, "learning_rate": 2.562458632550665e-06, "loss": 1.5974379777908325, "step": 3790 }, { "epoch": 1.1671412081569834, "grad_norm": 8.25, "learning_rate": 2.5605134327508506e-06, "loss": 1.3554526567459106, "step": 3792 }, { "epoch": 1.1677568295498268, "grad_norm": 8.25, "learning_rate": 2.5585686696786903e-06, "loss": 0.7919501662254333, "step": 3794 }, { "epoch": 1.1683724509426703, "grad_norm": 7.4375, "learning_rate": 2.5566243452667374e-06, "loss": 1.200896143913269, "step": 3796 }, { "epoch": 1.1689880723355137, "grad_norm": 5.84375, "learning_rate": 2.554680461447111e-06, "loss": 1.2222011089324951, "step": 3798 }, { "epoch": 1.1696036937283572, "grad_norm": 7.21875, "learning_rate": 2.5527370201514924e-06, "loss": 1.274246096611023, "step": 3800 }, { "epoch": 1.1702193151212004, "grad_norm": 6.1875, "learning_rate": 2.550794023311124e-06, "loss": 1.403409719467163, "step": 3802 }, { "epoch": 1.1708349365140438, "grad_norm": 10.0, "learning_rate": 2.5488514728568026e-06, "loss": 1.5534183979034424, "step": 3804 }, { "epoch": 1.1714505579068872, "grad_norm": 9.4375, "learning_rate": 2.5469093707188854e-06, "loss": 1.8067338466644287, "step": 3806 }, { "epoch": 1.1720661792997307, "grad_norm": 10.5, "learning_rate": 2.5449677188272825e-06, "loss": 1.4116039276123047, "step": 3808 }, { "epoch": 1.172681800692574, "grad_norm": 17.75, "learning_rate": 2.5430265191114587e-06, "loss": 0.9207834601402283, "step": 3810 }, { "epoch": 1.1732974220854175, "grad_norm": 10.375, "learning_rate": 2.541085773500426e-06, "loss": 1.201561450958252, "step": 3812 }, { "epoch": 1.1739130434782608, "grad_norm": 9.3125, "learning_rate": 2.539145483922747e-06, "loss": 0.9789211750030518, "step": 3814 }, { "epoch": 1.1745286648711042, "grad_norm": 2.140625, "learning_rate": 2.5372056523065304e-06, "loss": 1.0726956129074097, "step": 3816 }, { "epoch": 1.1751442862639476, "grad_norm": 9.8125, "learning_rate": 2.5352662805794313e-06, "loss": 1.5304408073425293, "step": 3818 }, { "epoch": 1.175759907656791, "grad_norm": 6.375, "learning_rate": 2.533327370668647e-06, "loss": 1.2936549186706543, "step": 3820 }, { "epoch": 1.1763755290496345, "grad_norm": 6.9375, "learning_rate": 2.531388924500915e-06, "loss": 1.3403570652008057, "step": 3822 }, { "epoch": 1.176991150442478, "grad_norm": 11.75, "learning_rate": 2.5294509440025127e-06, "loss": 1.2184439897537231, "step": 3824 }, { "epoch": 1.1776067718353214, "grad_norm": 4.53125, "learning_rate": 2.5275134310992554e-06, "loss": 1.3041874170303345, "step": 3826 }, { "epoch": 1.1782223932281646, "grad_norm": 4.65625, "learning_rate": 2.5255763877164933e-06, "loss": 0.834338366985321, "step": 3828 }, { "epoch": 1.178838014621008, "grad_norm": 11.4375, "learning_rate": 2.5236398157791085e-06, "loss": 1.3058403730392456, "step": 3830 }, { "epoch": 1.1794536360138514, "grad_norm": 15.1875, "learning_rate": 2.521703717211518e-06, "loss": 1.2352006435394287, "step": 3832 }, { "epoch": 1.1800692574066949, "grad_norm": 12.0, "learning_rate": 2.519768093937664e-06, "loss": 1.7258977890014648, "step": 3834 }, { "epoch": 1.1806848787995383, "grad_norm": 12.1875, "learning_rate": 2.5178329478810198e-06, "loss": 1.3492827415466309, "step": 3836 }, { "epoch": 1.1813005001923818, "grad_norm": 5.34375, "learning_rate": 2.5158982809645838e-06, "loss": 1.3997857570648193, "step": 3838 }, { "epoch": 1.1819161215852252, "grad_norm": 16.375, "learning_rate": 2.5139640951108777e-06, "loss": 1.5591062307357788, "step": 3840 }, { "epoch": 1.1825317429780684, "grad_norm": 9.125, "learning_rate": 2.512030392241945e-06, "loss": 1.342834234237671, "step": 3842 }, { "epoch": 1.1831473643709118, "grad_norm": 6.96875, "learning_rate": 2.5100971742793502e-06, "loss": 1.3188698291778564, "step": 3844 }, { "epoch": 1.1837629857637553, "grad_norm": 2.46875, "learning_rate": 2.508164443144174e-06, "loss": 1.1037402153015137, "step": 3846 }, { "epoch": 1.1843786071565987, "grad_norm": 2.8125, "learning_rate": 2.506232200757016e-06, "loss": 0.8773598670959473, "step": 3848 }, { "epoch": 1.1849942285494421, "grad_norm": 5.9375, "learning_rate": 2.5043004490379887e-06, "loss": 0.9601057767868042, "step": 3850 }, { "epoch": 1.1856098499422856, "grad_norm": 9.0625, "learning_rate": 2.502369189906716e-06, "loss": 1.3968665599822998, "step": 3852 }, { "epoch": 1.186225471335129, "grad_norm": 4.40625, "learning_rate": 2.5004384252823353e-06, "loss": 1.4004179239273071, "step": 3854 }, { "epoch": 1.1868410927279722, "grad_norm": 7.65625, "learning_rate": 2.498508157083489e-06, "loss": 1.71554434299469, "step": 3856 }, { "epoch": 1.1874567141208157, "grad_norm": 10.4375, "learning_rate": 2.4965783872283275e-06, "loss": 1.362620234489441, "step": 3858 }, { "epoch": 1.188072335513659, "grad_norm": 6.25, "learning_rate": 2.4946491176345077e-06, "loss": 1.016183853149414, "step": 3860 }, { "epoch": 1.1886879569065025, "grad_norm": 3.703125, "learning_rate": 2.4927203502191873e-06, "loss": 0.8576395511627197, "step": 3862 }, { "epoch": 1.189303578299346, "grad_norm": 7.3125, "learning_rate": 2.4907920868990266e-06, "loss": 1.3699417114257812, "step": 3864 }, { "epoch": 1.1899191996921894, "grad_norm": 14.0625, "learning_rate": 2.4888643295901834e-06, "loss": 1.112636685371399, "step": 3866 }, { "epoch": 1.1905348210850326, "grad_norm": 7.71875, "learning_rate": 2.4869370802083135e-06, "loss": 0.9367272853851318, "step": 3868 }, { "epoch": 1.191150442477876, "grad_norm": 5.9375, "learning_rate": 2.485010340668567e-06, "loss": 1.3729990720748901, "step": 3870 }, { "epoch": 1.1917660638707195, "grad_norm": 8.375, "learning_rate": 2.4830841128855894e-06, "loss": 1.5221357345581055, "step": 3872 }, { "epoch": 1.192381685263563, "grad_norm": 5.78125, "learning_rate": 2.4811583987735157e-06, "loss": 1.02495539188385, "step": 3874 }, { "epoch": 1.1929973066564064, "grad_norm": 15.25, "learning_rate": 2.4792332002459717e-06, "loss": 1.5791809558868408, "step": 3876 }, { "epoch": 1.1936129280492498, "grad_norm": 11.625, "learning_rate": 2.4773085192160697e-06, "loss": 1.8533213138580322, "step": 3878 }, { "epoch": 1.194228549442093, "grad_norm": 6.46875, "learning_rate": 2.4753843575964094e-06, "loss": 1.459651231765747, "step": 3880 }, { "epoch": 1.1948441708349364, "grad_norm": 4.625, "learning_rate": 2.473460717299072e-06, "loss": 1.155147910118103, "step": 3882 }, { "epoch": 1.1954597922277799, "grad_norm": 15.1875, "learning_rate": 2.4715376002356225e-06, "loss": 1.5217078924179077, "step": 3884 }, { "epoch": 1.1960754136206233, "grad_norm": 9.8125, "learning_rate": 2.4696150083171057e-06, "loss": 1.0151829719543457, "step": 3886 }, { "epoch": 1.1966910350134667, "grad_norm": 12.5, "learning_rate": 2.4676929434540444e-06, "loss": 1.3974363803863525, "step": 3888 }, { "epoch": 1.1973066564063102, "grad_norm": 9.0625, "learning_rate": 2.4657714075564374e-06, "loss": 1.6371207237243652, "step": 3890 }, { "epoch": 1.1979222777991536, "grad_norm": 4.71875, "learning_rate": 2.463850402533758e-06, "loss": 1.3001331090927124, "step": 3892 }, { "epoch": 1.1985378991919968, "grad_norm": 9.625, "learning_rate": 2.4619299302949517e-06, "loss": 1.3801743984222412, "step": 3894 }, { "epoch": 1.1991535205848403, "grad_norm": 16.5, "learning_rate": 2.4600099927484345e-06, "loss": 0.9950218200683594, "step": 3896 }, { "epoch": 1.1997691419776837, "grad_norm": 7.375, "learning_rate": 2.458090591802092e-06, "loss": 1.379498839378357, "step": 3898 }, { "epoch": 1.2003847633705271, "grad_norm": 5.125, "learning_rate": 2.456171729363276e-06, "loss": 1.3566272258758545, "step": 3900 }, { "epoch": 1.2010003847633706, "grad_norm": 2.1875, "learning_rate": 2.4542534073388026e-06, "loss": 1.3192147016525269, "step": 3902 }, { "epoch": 1.201616006156214, "grad_norm": 4.875, "learning_rate": 2.4523356276349515e-06, "loss": 1.2463765144348145, "step": 3904 }, { "epoch": 1.2022316275490574, "grad_norm": 18.875, "learning_rate": 2.4504183921574648e-06, "loss": 1.7187427282333374, "step": 3906 }, { "epoch": 1.2028472489419006, "grad_norm": 15.75, "learning_rate": 2.44850170281154e-06, "loss": 1.4562066793441772, "step": 3908 }, { "epoch": 1.203462870334744, "grad_norm": 5.59375, "learning_rate": 2.446585561501836e-06, "loss": 1.3070893287658691, "step": 3910 }, { "epoch": 1.2040784917275875, "grad_norm": 4.78125, "learning_rate": 2.4446699701324643e-06, "loss": 1.2424821853637695, "step": 3912 }, { "epoch": 1.204694113120431, "grad_norm": 4.84375, "learning_rate": 2.4427549306069918e-06, "loss": 0.7941330671310425, "step": 3914 }, { "epoch": 1.2053097345132744, "grad_norm": 29.25, "learning_rate": 2.4408404448284352e-06, "loss": 1.1013128757476807, "step": 3916 }, { "epoch": 1.2059253559061178, "grad_norm": 4.90625, "learning_rate": 2.4389265146992637e-06, "loss": 1.2746576070785522, "step": 3918 }, { "epoch": 1.2065409772989613, "grad_norm": 7.03125, "learning_rate": 2.437013142121391e-06, "loss": 1.5280157327651978, "step": 3920 }, { "epoch": 1.2071565986918045, "grad_norm": 4.40625, "learning_rate": 2.435100328996179e-06, "loss": 1.2533605098724365, "step": 3922 }, { "epoch": 1.207772220084648, "grad_norm": 3.09375, "learning_rate": 2.433188077224432e-06, "loss": 1.099176287651062, "step": 3924 }, { "epoch": 1.2083878414774913, "grad_norm": 13.0625, "learning_rate": 2.431276388706398e-06, "loss": 1.4526443481445312, "step": 3926 }, { "epoch": 1.2090034628703348, "grad_norm": 7.46875, "learning_rate": 2.429365265341766e-06, "loss": 1.5245413780212402, "step": 3928 }, { "epoch": 1.2096190842631782, "grad_norm": 5.46875, "learning_rate": 2.4274547090296614e-06, "loss": 1.3142131567001343, "step": 3930 }, { "epoch": 1.2102347056560214, "grad_norm": 16.875, "learning_rate": 2.4255447216686455e-06, "loss": 1.3165017366409302, "step": 3932 }, { "epoch": 1.2108503270488649, "grad_norm": 5.6875, "learning_rate": 2.4236353051567172e-06, "loss": 1.2935256958007812, "step": 3934 }, { "epoch": 1.2114659484417083, "grad_norm": 10.3125, "learning_rate": 2.4217264613913053e-06, "loss": 1.751842737197876, "step": 3936 }, { "epoch": 1.2120815698345517, "grad_norm": 6.9375, "learning_rate": 2.4198181922692714e-06, "loss": 1.647628903388977, "step": 3938 }, { "epoch": 1.2126971912273952, "grad_norm": 7.78125, "learning_rate": 2.417910499686905e-06, "loss": 1.3499926328659058, "step": 3940 }, { "epoch": 1.2133128126202386, "grad_norm": 3.921875, "learning_rate": 2.4160033855399235e-06, "loss": 1.0936715602874756, "step": 3942 }, { "epoch": 1.213928434013082, "grad_norm": 4.15625, "learning_rate": 2.4140968517234682e-06, "loss": 1.3401120901107788, "step": 3944 }, { "epoch": 1.2145440554059252, "grad_norm": 7.25, "learning_rate": 2.4121909001321054e-06, "loss": 1.5647368431091309, "step": 3946 }, { "epoch": 1.2151596767987687, "grad_norm": 3.234375, "learning_rate": 2.4102855326598205e-06, "loss": 1.066684365272522, "step": 3948 }, { "epoch": 1.2157752981916121, "grad_norm": 4.4375, "learning_rate": 2.408380751200021e-06, "loss": 1.1742613315582275, "step": 3950 }, { "epoch": 1.2163909195844556, "grad_norm": 2.65625, "learning_rate": 2.4064765576455307e-06, "loss": 1.228191614151001, "step": 3952 }, { "epoch": 1.217006540977299, "grad_norm": 7.78125, "learning_rate": 2.40457295388859e-06, "loss": 1.3355095386505127, "step": 3954 }, { "epoch": 1.2176221623701424, "grad_norm": 5.875, "learning_rate": 2.402669941820852e-06, "loss": 1.578446388244629, "step": 3956 }, { "epoch": 1.2182377837629859, "grad_norm": 4.03125, "learning_rate": 2.4007675233333816e-06, "loss": 1.2577093839645386, "step": 3958 }, { "epoch": 1.218853405155829, "grad_norm": 12.625, "learning_rate": 2.398865700316656e-06, "loss": 1.4146099090576172, "step": 3960 }, { "epoch": 1.2194690265486725, "grad_norm": 7.75, "learning_rate": 2.3969644746605584e-06, "loss": 1.4271454811096191, "step": 3962 }, { "epoch": 1.220084647941516, "grad_norm": 6.9375, "learning_rate": 2.39506384825438e-06, "loss": 1.2378193140029907, "step": 3964 }, { "epoch": 1.2207002693343594, "grad_norm": 10.8125, "learning_rate": 2.3931638229868163e-06, "loss": 1.0556186437606812, "step": 3966 }, { "epoch": 1.2213158907272028, "grad_norm": 13.4375, "learning_rate": 2.391264400745964e-06, "loss": 1.55692720413208, "step": 3968 }, { "epoch": 1.2219315121200462, "grad_norm": 6.625, "learning_rate": 2.389365583419323e-06, "loss": 1.3303570747375488, "step": 3970 }, { "epoch": 1.2225471335128897, "grad_norm": 5.1875, "learning_rate": 2.3874673728937886e-06, "loss": 1.3450106382369995, "step": 3972 }, { "epoch": 1.223162754905733, "grad_norm": 5.875, "learning_rate": 2.3855697710556562e-06, "loss": 0.9559692144393921, "step": 3974 }, { "epoch": 1.2237783762985763, "grad_norm": 8.75, "learning_rate": 2.3836727797906157e-06, "loss": 1.4707281589508057, "step": 3976 }, { "epoch": 1.2243939976914198, "grad_norm": 8.1875, "learning_rate": 2.381776400983749e-06, "loss": 1.3327157497406006, "step": 3978 }, { "epoch": 1.2250096190842632, "grad_norm": 3.765625, "learning_rate": 2.3798806365195305e-06, "loss": 1.2126898765563965, "step": 3980 }, { "epoch": 1.2256252404771066, "grad_norm": 4.125, "learning_rate": 2.377985488281825e-06, "loss": 1.0974383354187012, "step": 3982 }, { "epoch": 1.22624086186995, "grad_norm": 5.5625, "learning_rate": 2.3760909581538818e-06, "loss": 1.1715731620788574, "step": 3984 }, { "epoch": 1.2268564832627935, "grad_norm": 7.875, "learning_rate": 2.374197048018339e-06, "loss": 1.570125937461853, "step": 3986 }, { "epoch": 1.2274721046556367, "grad_norm": 6.75, "learning_rate": 2.372303759757218e-06, "loss": 1.2609387636184692, "step": 3988 }, { "epoch": 1.2280877260484802, "grad_norm": 25.625, "learning_rate": 2.3704110952519206e-06, "loss": 2.0279440879821777, "step": 3990 }, { "epoch": 1.2287033474413236, "grad_norm": 25.5, "learning_rate": 2.3685190563832307e-06, "loss": 0.9932742118835449, "step": 3992 }, { "epoch": 1.229318968834167, "grad_norm": 3.90625, "learning_rate": 2.36662764503131e-06, "loss": 1.3351396322250366, "step": 3994 }, { "epoch": 1.2299345902270105, "grad_norm": 7.21875, "learning_rate": 2.3647368630756964e-06, "loss": 1.0999829769134521, "step": 3996 }, { "epoch": 1.2305502116198537, "grad_norm": 7.71875, "learning_rate": 2.3628467123953015e-06, "loss": 1.2393935918807983, "step": 3998 }, { "epoch": 1.231165833012697, "grad_norm": 5.90625, "learning_rate": 2.3609571948684107e-06, "loss": 1.250370740890503, "step": 4000 }, { "epoch": 1.2317814544055405, "grad_norm": 9.3125, "learning_rate": 2.35906831237268e-06, "loss": 1.3964062929153442, "step": 4002 }, { "epoch": 1.232397075798384, "grad_norm": 8.4375, "learning_rate": 2.3571800667851343e-06, "loss": 1.3615357875823975, "step": 4004 }, { "epoch": 1.2330126971912274, "grad_norm": 4.34375, "learning_rate": 2.355292459982165e-06, "loss": 1.28764009475708, "step": 4006 }, { "epoch": 1.2336283185840708, "grad_norm": 6.90625, "learning_rate": 2.3534054938395313e-06, "loss": 1.4439440965652466, "step": 4008 }, { "epoch": 1.2342439399769143, "grad_norm": 4.625, "learning_rate": 2.351519170232352e-06, "loss": 1.112069010734558, "step": 4010 }, { "epoch": 1.2348595613697575, "grad_norm": 6.84375, "learning_rate": 2.3496334910351086e-06, "loss": 1.4705801010131836, "step": 4012 }, { "epoch": 1.235475182762601, "grad_norm": 12.6875, "learning_rate": 2.3477484581216435e-06, "loss": 1.5753551721572876, "step": 4014 }, { "epoch": 1.2360908041554444, "grad_norm": 7.59375, "learning_rate": 2.345864073365157e-06, "loss": 1.7668852806091309, "step": 4016 }, { "epoch": 1.2367064255482878, "grad_norm": 8.5625, "learning_rate": 2.3439803386382033e-06, "loss": 1.2533469200134277, "step": 4018 }, { "epoch": 1.2373220469411312, "grad_norm": 4.28125, "learning_rate": 2.3420972558126933e-06, "loss": 1.04129159450531, "step": 4020 }, { "epoch": 1.2379376683339747, "grad_norm": 6.15625, "learning_rate": 2.3402148267598875e-06, "loss": 1.4373548030853271, "step": 4022 }, { "epoch": 1.238553289726818, "grad_norm": 2.578125, "learning_rate": 2.3383330533503973e-06, "loss": 1.4359878301620483, "step": 4024 }, { "epoch": 1.2391689111196613, "grad_norm": 7.21875, "learning_rate": 2.3364519374541838e-06, "loss": 1.0765472650527954, "step": 4026 }, { "epoch": 1.2397845325125048, "grad_norm": 11.0625, "learning_rate": 2.334571480940554e-06, "loss": 1.436760425567627, "step": 4028 }, { "epoch": 1.2404001539053482, "grad_norm": 9.375, "learning_rate": 2.3326916856781603e-06, "loss": 0.9236443042755127, "step": 4030 }, { "epoch": 1.2410157752981916, "grad_norm": 8.25, "learning_rate": 2.330812553534996e-06, "loss": 1.4468777179718018, "step": 4032 }, { "epoch": 1.241631396691035, "grad_norm": 9.375, "learning_rate": 2.3289340863783993e-06, "loss": 1.4952726364135742, "step": 4034 }, { "epoch": 1.2422470180838785, "grad_norm": 5.96875, "learning_rate": 2.327056286075042e-06, "loss": 1.5572830438613892, "step": 4036 }, { "epoch": 1.242862639476722, "grad_norm": 10.125, "learning_rate": 2.325179154490938e-06, "loss": 1.4064810276031494, "step": 4038 }, { "epoch": 1.2434782608695651, "grad_norm": 7.53125, "learning_rate": 2.3233026934914347e-06, "loss": 1.265794277191162, "step": 4040 }, { "epoch": 1.2440938822624086, "grad_norm": 8.875, "learning_rate": 2.3214269049412142e-06, "loss": 1.2024005651474, "step": 4042 }, { "epoch": 1.244709503655252, "grad_norm": 12.4375, "learning_rate": 2.3195517907042884e-06, "loss": 1.3893858194351196, "step": 4044 }, { "epoch": 1.2453251250480954, "grad_norm": 4.71875, "learning_rate": 2.317677352644001e-06, "loss": 1.4748245477676392, "step": 4046 }, { "epoch": 1.2459407464409389, "grad_norm": 8.5625, "learning_rate": 2.315803592623024e-06, "loss": 1.6225428581237793, "step": 4048 }, { "epoch": 1.2465563678337823, "grad_norm": 8.5, "learning_rate": 2.3139305125033533e-06, "loss": 1.3552569150924683, "step": 4050 }, { "epoch": 1.2471719892266255, "grad_norm": 5.6875, "learning_rate": 2.3120581141463107e-06, "loss": 1.6191600561141968, "step": 4052 }, { "epoch": 1.247787610619469, "grad_norm": 12.0625, "learning_rate": 2.3101863994125417e-06, "loss": 1.6349756717681885, "step": 4054 }, { "epoch": 1.2484032320123124, "grad_norm": 9.625, "learning_rate": 2.30831537016201e-06, "loss": 1.7783564329147339, "step": 4056 }, { "epoch": 1.2490188534051558, "grad_norm": 15.625, "learning_rate": 2.3064450282539993e-06, "loss": 1.1966140270233154, "step": 4058 }, { "epoch": 1.2496344747979993, "grad_norm": 33.0, "learning_rate": 2.3045753755471114e-06, "loss": 1.1083506345748901, "step": 4060 }, { "epoch": 1.2502500961908427, "grad_norm": 4.4375, "learning_rate": 2.3027064138992604e-06, "loss": 1.2617923021316528, "step": 4062 }, { "epoch": 1.250865717583686, "grad_norm": 6.34375, "learning_rate": 2.3008381451676764e-06, "loss": 1.0938457250595093, "step": 4064 }, { "epoch": 1.2514813389765294, "grad_norm": 15.25, "learning_rate": 2.2989705712089004e-06, "loss": 1.302169680595398, "step": 4066 }, { "epoch": 1.2520969603693728, "grad_norm": 6.25, "learning_rate": 2.2971036938787816e-06, "loss": 0.7615894675254822, "step": 4068 }, { "epoch": 1.2527125817622162, "grad_norm": 5.03125, "learning_rate": 2.2952375150324785e-06, "loss": 1.3285247087478638, "step": 4070 }, { "epoch": 1.2533282031550597, "grad_norm": 8.75, "learning_rate": 2.293372036524454e-06, "loss": 1.5400751829147339, "step": 4072 }, { "epoch": 1.253943824547903, "grad_norm": 8.5625, "learning_rate": 2.2915072602084778e-06, "loss": 1.14712393283844, "step": 4074 }, { "epoch": 1.2545594459407465, "grad_norm": 3.546875, "learning_rate": 2.2896431879376177e-06, "loss": 0.9063671231269836, "step": 4076 }, { "epoch": 1.2551750673335897, "grad_norm": 9.5625, "learning_rate": 2.2877798215642445e-06, "loss": 0.7759019136428833, "step": 4078 }, { "epoch": 1.2557906887264332, "grad_norm": 6.28125, "learning_rate": 2.285917162940028e-06, "loss": 1.5217760801315308, "step": 4080 }, { "epoch": 1.2564063101192766, "grad_norm": 8.1875, "learning_rate": 2.2840552139159335e-06, "loss": 1.2249348163604736, "step": 4082 }, { "epoch": 1.25702193151212, "grad_norm": 6.25, "learning_rate": 2.2821939763422217e-06, "loss": 1.3272895812988281, "step": 4084 }, { "epoch": 1.2576375529049635, "grad_norm": 5.34375, "learning_rate": 2.2803334520684456e-06, "loss": 0.7774797081947327, "step": 4086 }, { "epoch": 1.258253174297807, "grad_norm": 9.3125, "learning_rate": 2.2784736429434505e-06, "loss": 0.8993749022483826, "step": 4088 }, { "epoch": 1.2588687956906504, "grad_norm": 15.875, "learning_rate": 2.276614550815369e-06, "loss": 1.7677662372589111, "step": 4090 }, { "epoch": 1.2594844170834936, "grad_norm": 11.3125, "learning_rate": 2.274756177531624e-06, "loss": 1.1717032194137573, "step": 4092 }, { "epoch": 1.260100038476337, "grad_norm": 7.8125, "learning_rate": 2.2728985249389225e-06, "loss": 1.233552098274231, "step": 4094 }, { "epoch": 1.2607156598691804, "grad_norm": 10.0, "learning_rate": 2.2710415948832557e-06, "loss": 1.563547968864441, "step": 4096 }, { "epoch": 1.2613312812620239, "grad_norm": 6.0, "learning_rate": 2.2691853892098957e-06, "loss": 1.5977107286453247, "step": 4098 }, { "epoch": 1.2619469026548673, "grad_norm": 7.59375, "learning_rate": 2.267329909763397e-06, "loss": 1.1745226383209229, "step": 4100 }, { "epoch": 1.2625625240477105, "grad_norm": 8.3125, "learning_rate": 2.26547515838759e-06, "loss": 1.0408663749694824, "step": 4102 }, { "epoch": 1.2631781454405542, "grad_norm": 14.75, "learning_rate": 2.263621136925583e-06, "loss": 1.402496099472046, "step": 4104 }, { "epoch": 1.2637937668333974, "grad_norm": 8.0, "learning_rate": 2.261767847219758e-06, "loss": 1.7521933317184448, "step": 4106 }, { "epoch": 1.2644093882262408, "grad_norm": 9.3125, "learning_rate": 2.2599152911117726e-06, "loss": 1.6123474836349487, "step": 4108 }, { "epoch": 1.2650250096190843, "grad_norm": 4.6875, "learning_rate": 2.2580634704425513e-06, "loss": 1.5347142219543457, "step": 4110 }, { "epoch": 1.2656406310119277, "grad_norm": 7.28125, "learning_rate": 2.2562123870522914e-06, "loss": 1.4022611379623413, "step": 4112 }, { "epoch": 1.2662562524047711, "grad_norm": 5.65625, "learning_rate": 2.254362042780454e-06, "loss": 1.2427886724472046, "step": 4114 }, { "epoch": 1.2668718737976143, "grad_norm": 8.9375, "learning_rate": 2.2525124394657694e-06, "loss": 0.8506733179092407, "step": 4116 }, { "epoch": 1.267487495190458, "grad_norm": 5.71875, "learning_rate": 2.2506635789462287e-06, "loss": 1.035892128944397, "step": 4118 }, { "epoch": 1.2681031165833012, "grad_norm": 4.25, "learning_rate": 2.2488154630590876e-06, "loss": 1.1248090267181396, "step": 4120 }, { "epoch": 1.2687187379761447, "grad_norm": 6.96875, "learning_rate": 2.2469680936408584e-06, "loss": 1.1317126750946045, "step": 4122 }, { "epoch": 1.269334359368988, "grad_norm": 28.375, "learning_rate": 2.2451214725273143e-06, "loss": 0.6671968698501587, "step": 4124 }, { "epoch": 1.2699499807618315, "grad_norm": 9.4375, "learning_rate": 2.2432756015534853e-06, "loss": 1.3187196254730225, "step": 4126 }, { "epoch": 1.270565602154675, "grad_norm": 9.125, "learning_rate": 2.2414304825536526e-06, "loss": 1.5205804109573364, "step": 4128 }, { "epoch": 1.2711812235475182, "grad_norm": 3.953125, "learning_rate": 2.239586117361354e-06, "loss": 1.1491056680679321, "step": 4130 }, { "epoch": 1.2717968449403616, "grad_norm": 14.5, "learning_rate": 2.237742507809375e-06, "loss": 0.8924199938774109, "step": 4132 }, { "epoch": 1.272412466333205, "grad_norm": 6.125, "learning_rate": 2.2358996557297534e-06, "loss": 1.269026279449463, "step": 4134 }, { "epoch": 1.2730280877260485, "grad_norm": 12.25, "learning_rate": 2.2340575629537713e-06, "loss": 1.2126991748809814, "step": 4136 }, { "epoch": 1.273643709118892, "grad_norm": 13.625, "learning_rate": 2.232216231311959e-06, "loss": 1.360539436340332, "step": 4138 }, { "epoch": 1.2742593305117353, "grad_norm": 5.75, "learning_rate": 2.2303756626340875e-06, "loss": 1.5025355815887451, "step": 4140 }, { "epoch": 1.2748749519045788, "grad_norm": 10.0625, "learning_rate": 2.228535858749172e-06, "loss": 1.3831332921981812, "step": 4142 }, { "epoch": 1.275490573297422, "grad_norm": 7.34375, "learning_rate": 2.2266968214854664e-06, "loss": 1.4608408212661743, "step": 4144 }, { "epoch": 1.2761061946902654, "grad_norm": 7.28125, "learning_rate": 2.2248585526704635e-06, "loss": 1.4185645580291748, "step": 4146 }, { "epoch": 1.2767218160831089, "grad_norm": 2.78125, "learning_rate": 2.223021054130892e-06, "loss": 1.0750359296798706, "step": 4148 }, { "epoch": 1.2773374374759523, "grad_norm": 6.625, "learning_rate": 2.221184327692717e-06, "loss": 1.2798019647598267, "step": 4150 }, { "epoch": 1.2779530588687957, "grad_norm": 1.953125, "learning_rate": 2.2193483751811324e-06, "loss": 1.0501108169555664, "step": 4152 }, { "epoch": 1.2785686802616392, "grad_norm": 12.0, "learning_rate": 2.2175131984205664e-06, "loss": 1.222936749458313, "step": 4154 }, { "epoch": 1.2791843016544826, "grad_norm": 4.75, "learning_rate": 2.2156787992346752e-06, "loss": 1.301552653312683, "step": 4156 }, { "epoch": 1.2797999230473258, "grad_norm": 77.0, "learning_rate": 2.2138451794463423e-06, "loss": 1.4193586111068726, "step": 4158 }, { "epoch": 1.2804155444401693, "grad_norm": 8.0625, "learning_rate": 2.2120123408776765e-06, "loss": 0.7374695539474487, "step": 4160 }, { "epoch": 1.2810311658330127, "grad_norm": 1.8671875, "learning_rate": 2.2101802853500118e-06, "loss": 1.0434259176254272, "step": 4162 }, { "epoch": 1.2816467872258561, "grad_norm": 35.0, "learning_rate": 2.2083490146839023e-06, "loss": 0.865664541721344, "step": 4164 }, { "epoch": 1.2822624086186996, "grad_norm": 7.4375, "learning_rate": 2.206518530699122e-06, "loss": 1.5127063989639282, "step": 4166 }, { "epoch": 1.2828780300115428, "grad_norm": 4.875, "learning_rate": 2.2046888352146632e-06, "loss": 1.2243505716323853, "step": 4168 }, { "epoch": 1.2834936514043864, "grad_norm": 11.25, "learning_rate": 2.2028599300487372e-06, "loss": 1.3695871829986572, "step": 4170 }, { "epoch": 1.2841092727972296, "grad_norm": 4.125, "learning_rate": 2.2010318170187676e-06, "loss": 0.7426531314849854, "step": 4172 }, { "epoch": 1.284724894190073, "grad_norm": 6.4375, "learning_rate": 2.199204497941391e-06, "loss": 1.3837558031082153, "step": 4174 }, { "epoch": 1.2853405155829165, "grad_norm": 16.375, "learning_rate": 2.1973779746324556e-06, "loss": 1.5432078838348389, "step": 4176 }, { "epoch": 1.28595613697576, "grad_norm": 6.625, "learning_rate": 2.195552248907018e-06, "loss": 1.1665561199188232, "step": 4178 }, { "epoch": 1.2865717583686034, "grad_norm": 9.8125, "learning_rate": 2.1937273225793422e-06, "loss": 1.2903505563735962, "step": 4180 }, { "epoch": 1.2871873797614466, "grad_norm": 23.625, "learning_rate": 2.1919031974628995e-06, "loss": 1.1587284803390503, "step": 4182 }, { "epoch": 1.2878030011542902, "grad_norm": 12.6875, "learning_rate": 2.190079875370364e-06, "loss": 1.1106494665145874, "step": 4184 }, { "epoch": 1.2884186225471335, "grad_norm": 5.78125, "learning_rate": 2.1882573581136112e-06, "loss": 1.3384305238723755, "step": 4186 }, { "epoch": 1.289034243939977, "grad_norm": 6.90625, "learning_rate": 2.1864356475037175e-06, "loss": 1.2218106985092163, "step": 4188 }, { "epoch": 1.2896498653328203, "grad_norm": 7.0625, "learning_rate": 2.184614745350958e-06, "loss": 1.1343225240707397, "step": 4190 }, { "epoch": 1.2902654867256638, "grad_norm": 24.5, "learning_rate": 2.1827946534648035e-06, "loss": 1.0077672004699707, "step": 4192 }, { "epoch": 1.2908811081185072, "grad_norm": 3.421875, "learning_rate": 2.1809753736539195e-06, "loss": 0.9141346216201782, "step": 4194 }, { "epoch": 1.2914967295113504, "grad_norm": 8.6875, "learning_rate": 2.1791569077261663e-06, "loss": 1.302938461303711, "step": 4196 }, { "epoch": 1.2921123509041939, "grad_norm": 6.96875, "learning_rate": 2.177339257488594e-06, "loss": 1.711605191230774, "step": 4198 }, { "epoch": 1.2927279722970373, "grad_norm": 10.1875, "learning_rate": 2.1755224247474416e-06, "loss": 1.245891809463501, "step": 4200 }, { "epoch": 1.2933435936898807, "grad_norm": 5.4375, "learning_rate": 2.1737064113081376e-06, "loss": 1.2915186882019043, "step": 4202 }, { "epoch": 1.2939592150827242, "grad_norm": 6.40625, "learning_rate": 2.1718912189752945e-06, "loss": 1.257743239402771, "step": 4204 }, { "epoch": 1.2945748364755676, "grad_norm": 8.0625, "learning_rate": 2.1700768495527097e-06, "loss": 1.1580772399902344, "step": 4206 }, { "epoch": 1.295190457868411, "grad_norm": 4.78125, "learning_rate": 2.168263304843363e-06, "loss": 1.3657855987548828, "step": 4208 }, { "epoch": 1.2958060792612542, "grad_norm": 12.25, "learning_rate": 2.1664505866494143e-06, "loss": 1.3685224056243896, "step": 4210 }, { "epoch": 1.2964217006540977, "grad_norm": 6.0625, "learning_rate": 2.1646386967722024e-06, "loss": 1.3441635370254517, "step": 4212 }, { "epoch": 1.297037322046941, "grad_norm": 11.4375, "learning_rate": 2.1628276370122435e-06, "loss": 1.6039296388626099, "step": 4214 }, { "epoch": 1.2976529434397845, "grad_norm": 4.03125, "learning_rate": 2.161017409169227e-06, "loss": 1.04316246509552, "step": 4216 }, { "epoch": 1.298268564832628, "grad_norm": 15.4375, "learning_rate": 2.159208015042018e-06, "loss": 1.00014066696167, "step": 4218 }, { "epoch": 1.2988841862254714, "grad_norm": 6.625, "learning_rate": 2.1573994564286504e-06, "loss": 0.7740896940231323, "step": 4220 }, { "epoch": 1.2994998076183149, "grad_norm": 29.375, "learning_rate": 2.1555917351263315e-06, "loss": 1.262576937675476, "step": 4222 }, { "epoch": 1.300115429011158, "grad_norm": 11.0, "learning_rate": 2.153784852931433e-06, "loss": 1.6396496295928955, "step": 4224 }, { "epoch": 1.3007310504040015, "grad_norm": 11.0625, "learning_rate": 2.1519788116394944e-06, "loss": 0.8453181982040405, "step": 4226 }, { "epoch": 1.301346671796845, "grad_norm": 9.5, "learning_rate": 2.1501736130452215e-06, "loss": 1.5795183181762695, "step": 4228 }, { "epoch": 1.3019622931896884, "grad_norm": 11.6875, "learning_rate": 2.148369258942477e-06, "loss": 1.3546230792999268, "step": 4230 }, { "epoch": 1.3025779145825318, "grad_norm": 5.5625, "learning_rate": 2.146565751124291e-06, "loss": 1.4344923496246338, "step": 4232 }, { "epoch": 1.303193535975375, "grad_norm": 9.375, "learning_rate": 2.1447630913828486e-06, "loss": 1.288711428642273, "step": 4234 }, { "epoch": 1.3038091573682187, "grad_norm": 5.90625, "learning_rate": 2.1429612815094917e-06, "loss": 1.2355188131332397, "step": 4236 }, { "epoch": 1.3044247787610619, "grad_norm": 7.0, "learning_rate": 2.141160323294722e-06, "loss": 1.3009905815124512, "step": 4238 }, { "epoch": 1.3050404001539053, "grad_norm": 9.375, "learning_rate": 2.1393602185281895e-06, "loss": 1.1852028369903564, "step": 4240 }, { "epoch": 1.3056560215467488, "grad_norm": 7.625, "learning_rate": 2.1375609689987018e-06, "loss": 1.349449872970581, "step": 4242 }, { "epoch": 1.3062716429395922, "grad_norm": 2.453125, "learning_rate": 2.1357625764942096e-06, "loss": 1.4713563919067383, "step": 4244 }, { "epoch": 1.3068872643324356, "grad_norm": 5.75, "learning_rate": 2.133965042801819e-06, "loss": 1.5209068059921265, "step": 4246 }, { "epoch": 1.3075028857252788, "grad_norm": 6.125, "learning_rate": 2.1321683697077774e-06, "loss": 0.6988930702209473, "step": 4248 }, { "epoch": 1.3081185071181223, "grad_norm": 19.75, "learning_rate": 2.1303725589974797e-06, "loss": 1.7587997913360596, "step": 4250 }, { "epoch": 1.3087341285109657, "grad_norm": 7.09375, "learning_rate": 2.1285776124554644e-06, "loss": 1.2838658094406128, "step": 4252 }, { "epoch": 1.3093497499038091, "grad_norm": 9.0625, "learning_rate": 2.126783531865409e-06, "loss": 1.2931767702102661, "step": 4254 }, { "epoch": 1.3099653712966526, "grad_norm": 71.0, "learning_rate": 2.124990319010132e-06, "loss": 1.549433946609497, "step": 4256 }, { "epoch": 1.310580992689496, "grad_norm": 6.3125, "learning_rate": 2.123197975671589e-06, "loss": 1.1958732604980469, "step": 4258 }, { "epoch": 1.3111966140823395, "grad_norm": 5.0, "learning_rate": 2.121406503630871e-06, "loss": 1.2058968544006348, "step": 4260 }, { "epoch": 1.3118122354751827, "grad_norm": 4.1875, "learning_rate": 2.1196159046682058e-06, "loss": 1.1884846687316895, "step": 4262 }, { "epoch": 1.312427856868026, "grad_norm": 7.9375, "learning_rate": 2.1178261805629495e-06, "loss": 1.1313529014587402, "step": 4264 }, { "epoch": 1.3130434782608695, "grad_norm": 5.75, "learning_rate": 2.1160373330935937e-06, "loss": 1.3965493440628052, "step": 4266 }, { "epoch": 1.313659099653713, "grad_norm": 12.0625, "learning_rate": 2.114249364037754e-06, "loss": 1.5671403408050537, "step": 4268 }, { "epoch": 1.3142747210465564, "grad_norm": 3.34375, "learning_rate": 2.112462275172176e-06, "loss": 1.3740296363830566, "step": 4270 }, { "epoch": 1.3148903424393998, "grad_norm": 49.5, "learning_rate": 2.110676068272731e-06, "loss": 0.7293486595153809, "step": 4272 }, { "epoch": 1.3155059638322433, "grad_norm": 5.375, "learning_rate": 2.1088907451144105e-06, "loss": 1.0549618005752563, "step": 4274 }, { "epoch": 1.3161215852250865, "grad_norm": 5.9375, "learning_rate": 2.107106307471332e-06, "loss": 0.9314700365066528, "step": 4276 }, { "epoch": 1.31673720661793, "grad_norm": 5.0, "learning_rate": 2.1053227571167316e-06, "loss": 0.9728541970252991, "step": 4278 }, { "epoch": 1.3173528280107734, "grad_norm": 10.9375, "learning_rate": 2.1035400958229617e-06, "loss": 1.6516066789627075, "step": 4280 }, { "epoch": 1.3179684494036168, "grad_norm": 43.25, "learning_rate": 2.1017583253614936e-06, "loss": 1.2666043043136597, "step": 4282 }, { "epoch": 1.3185840707964602, "grad_norm": 23.75, "learning_rate": 2.099977447502912e-06, "loss": 1.5421197414398193, "step": 4284 }, { "epoch": 1.3191996921893034, "grad_norm": 7.75, "learning_rate": 2.0981974640169155e-06, "loss": 1.1174343824386597, "step": 4286 }, { "epoch": 1.319815313582147, "grad_norm": 13.0, "learning_rate": 2.0964183766723143e-06, "loss": 1.520904779434204, "step": 4288 }, { "epoch": 1.3204309349749903, "grad_norm": 7.96875, "learning_rate": 2.094640187237026e-06, "loss": 1.4290568828582764, "step": 4290 }, { "epoch": 1.3210465563678337, "grad_norm": 9.875, "learning_rate": 2.0928628974780784e-06, "loss": 1.786804437637329, "step": 4292 }, { "epoch": 1.3216621777606772, "grad_norm": 3.0, "learning_rate": 2.0910865091616044e-06, "loss": 0.8363227844238281, "step": 4294 }, { "epoch": 1.3222777991535206, "grad_norm": 5.90625, "learning_rate": 2.08931102405284e-06, "loss": 1.2891432046890259, "step": 4296 }, { "epoch": 1.322893420546364, "grad_norm": 16.25, "learning_rate": 2.087536443916124e-06, "loss": 1.2994776964187622, "step": 4298 }, { "epoch": 1.3235090419392073, "grad_norm": 10.4375, "learning_rate": 2.0857627705148985e-06, "loss": 0.9553351402282715, "step": 4300 }, { "epoch": 1.324124663332051, "grad_norm": 8.625, "learning_rate": 2.083990005611701e-06, "loss": 1.2285696268081665, "step": 4302 }, { "epoch": 1.3247402847248941, "grad_norm": 4.6875, "learning_rate": 2.082218150968167e-06, "loss": 0.8677452802658081, "step": 4304 }, { "epoch": 1.3253559061177376, "grad_norm": 11.1875, "learning_rate": 2.080447208345031e-06, "loss": 1.2829258441925049, "step": 4306 }, { "epoch": 1.325971527510581, "grad_norm": 3.53125, "learning_rate": 2.078677179502115e-06, "loss": 1.1913378238677979, "step": 4308 }, { "epoch": 1.3265871489034244, "grad_norm": 9.0625, "learning_rate": 2.076908066198339e-06, "loss": 1.5480560064315796, "step": 4310 }, { "epoch": 1.3272027702962679, "grad_norm": 10.375, "learning_rate": 2.0751398701917092e-06, "loss": 1.5165691375732422, "step": 4312 }, { "epoch": 1.327818391689111, "grad_norm": 8.25, "learning_rate": 2.073372593239321e-06, "loss": 1.4265310764312744, "step": 4314 }, { "epoch": 1.3284340130819545, "grad_norm": 4.375, "learning_rate": 2.0716062370973587e-06, "loss": 0.530724048614502, "step": 4316 }, { "epoch": 1.329049634474798, "grad_norm": 2.921875, "learning_rate": 2.069840803521089e-06, "loss": 1.161312222480774, "step": 4318 }, { "epoch": 1.3296652558676414, "grad_norm": 33.75, "learning_rate": 2.0680762942648646e-06, "loss": 1.5510377883911133, "step": 4320 }, { "epoch": 1.3302808772604848, "grad_norm": 7.90625, "learning_rate": 2.0663127110821144e-06, "loss": 1.5545850992202759, "step": 4322 }, { "epoch": 1.3308964986533283, "grad_norm": 6.84375, "learning_rate": 2.0645500557253544e-06, "loss": 1.6228781938552856, "step": 4324 }, { "epoch": 1.3315121200461717, "grad_norm": 5.21875, "learning_rate": 2.062788329946172e-06, "loss": 1.5430296659469604, "step": 4326 }, { "epoch": 1.332127741439015, "grad_norm": 15.6875, "learning_rate": 2.0610275354952338e-06, "loss": 1.4414647817611694, "step": 4328 }, { "epoch": 1.3327433628318583, "grad_norm": 5.5625, "learning_rate": 2.059267674122283e-06, "loss": 1.1935805082321167, "step": 4330 }, { "epoch": 1.3333589842247018, "grad_norm": 2.8125, "learning_rate": 2.057508747576131e-06, "loss": 1.3282206058502197, "step": 4332 }, { "epoch": 1.3339746056175452, "grad_norm": 21.0, "learning_rate": 2.0557507576046632e-06, "loss": 1.315294623374939, "step": 4334 }, { "epoch": 1.3345902270103887, "grad_norm": 12.75, "learning_rate": 2.0539937059548336e-06, "loss": 1.1800450086593628, "step": 4336 }, { "epoch": 1.335205848403232, "grad_norm": 4.625, "learning_rate": 2.0522375943726634e-06, "loss": 1.0343067646026611, "step": 4338 }, { "epoch": 1.3358214697960755, "grad_norm": 22.875, "learning_rate": 2.050482424603242e-06, "loss": 1.1441932916641235, "step": 4340 }, { "epoch": 1.3364370911889187, "grad_norm": 3.78125, "learning_rate": 2.0487281983907185e-06, "loss": 0.6436120867729187, "step": 4342 }, { "epoch": 1.3370527125817622, "grad_norm": 13.6875, "learning_rate": 2.0469749174783072e-06, "loss": 0.700677752494812, "step": 4344 }, { "epoch": 1.3376683339746056, "grad_norm": 4.1875, "learning_rate": 2.045222583608285e-06, "loss": 1.539798617362976, "step": 4346 }, { "epoch": 1.338283955367449, "grad_norm": 5.53125, "learning_rate": 2.0434711985219823e-06, "loss": 1.2045598030090332, "step": 4348 }, { "epoch": 1.3388995767602925, "grad_norm": 4.65625, "learning_rate": 2.041720763959791e-06, "loss": 1.2669674158096313, "step": 4350 }, { "epoch": 1.3395151981531357, "grad_norm": 12.25, "learning_rate": 2.0399712816611573e-06, "loss": 1.7195749282836914, "step": 4352 }, { "epoch": 1.3401308195459793, "grad_norm": 2.953125, "learning_rate": 2.0382227533645813e-06, "loss": 0.7352992296218872, "step": 4354 }, { "epoch": 1.3407464409388226, "grad_norm": 7.78125, "learning_rate": 2.0364751808076142e-06, "loss": 1.1636168956756592, "step": 4356 }, { "epoch": 1.341362062331666, "grad_norm": 9.0625, "learning_rate": 2.034728565726858e-06, "loss": 1.2400199174880981, "step": 4358 }, { "epoch": 1.3419776837245094, "grad_norm": 9.1875, "learning_rate": 2.032982909857964e-06, "loss": 1.349189281463623, "step": 4360 }, { "epoch": 1.3425933051173529, "grad_norm": 5.90625, "learning_rate": 2.0312382149356276e-06, "loss": 1.2955114841461182, "step": 4362 }, { "epoch": 1.3432089265101963, "grad_norm": 19.375, "learning_rate": 2.0294944826935937e-06, "loss": 1.6380174160003662, "step": 4364 }, { "epoch": 1.3438245479030395, "grad_norm": 2.6875, "learning_rate": 2.027751714864647e-06, "loss": 1.3109372854232788, "step": 4366 }, { "epoch": 1.3444401692958832, "grad_norm": 6.84375, "learning_rate": 2.0260099131806137e-06, "loss": 1.6937569379806519, "step": 4368 }, { "epoch": 1.3450557906887264, "grad_norm": 1.84375, "learning_rate": 2.024269079372365e-06, "loss": 1.0671393871307373, "step": 4370 }, { "epoch": 1.3456714120815698, "grad_norm": 4.1875, "learning_rate": 2.0225292151698016e-06, "loss": 1.1917141675949097, "step": 4372 }, { "epoch": 1.3462870334744133, "grad_norm": 3.421875, "learning_rate": 2.0207903223018686e-06, "loss": 1.3352512121200562, "step": 4374 }, { "epoch": 1.3469026548672567, "grad_norm": 5.28125, "learning_rate": 2.019052402496542e-06, "loss": 1.538804292678833, "step": 4376 }, { "epoch": 1.3475182762601001, "grad_norm": 13.9375, "learning_rate": 2.017315457480832e-06, "loss": 1.5620875358581543, "step": 4378 }, { "epoch": 1.3481338976529433, "grad_norm": 5.9375, "learning_rate": 2.0155794889807802e-06, "loss": 1.3226521015167236, "step": 4380 }, { "epoch": 1.3487495190457868, "grad_norm": 10.5, "learning_rate": 2.0138444987214556e-06, "loss": 1.5064817667007446, "step": 4382 }, { "epoch": 1.3493651404386302, "grad_norm": 10.25, "learning_rate": 2.0121104884269598e-06, "loss": 1.6169400215148926, "step": 4384 }, { "epoch": 1.3499807618314736, "grad_norm": 9.5, "learning_rate": 2.0103774598204144e-06, "loss": 1.4810489416122437, "step": 4386 }, { "epoch": 1.350596383224317, "grad_norm": 13.5625, "learning_rate": 2.008645414623971e-06, "loss": 1.0603591203689575, "step": 4388 }, { "epoch": 1.3512120046171605, "grad_norm": 13.125, "learning_rate": 2.006914354558801e-06, "loss": 1.2640867233276367, "step": 4390 }, { "epoch": 1.351827626010004, "grad_norm": 3.71875, "learning_rate": 2.0051842813450977e-06, "loss": 1.0505619049072266, "step": 4392 }, { "epoch": 1.3524432474028472, "grad_norm": 5.6875, "learning_rate": 2.003455196702074e-06, "loss": 1.3017810583114624, "step": 4394 }, { "epoch": 1.3530588687956906, "grad_norm": 9.5, "learning_rate": 2.0017271023479595e-06, "loss": 1.5109614133834839, "step": 4396 }, { "epoch": 1.353674490188534, "grad_norm": 6.34375, "learning_rate": 2.0000000000000008e-06, "loss": 1.2867122888565063, "step": 4398 }, { "epoch": 1.3542901115813775, "grad_norm": 8.5625, "learning_rate": 1.9982738913744574e-06, "loss": 1.4767355918884277, "step": 4400 }, { "epoch": 1.354905732974221, "grad_norm": 1.953125, "learning_rate": 1.9965487781866026e-06, "loss": 1.3075792789459229, "step": 4402 }, { "epoch": 1.3555213543670643, "grad_norm": 11.6875, "learning_rate": 1.9948246621507204e-06, "loss": 1.4660849571228027, "step": 4404 }, { "epoch": 1.3561369757599078, "grad_norm": 6.71875, "learning_rate": 1.993101544980103e-06, "loss": 1.3034684658050537, "step": 4406 }, { "epoch": 1.356752597152751, "grad_norm": 7.25, "learning_rate": 1.9913794283870513e-06, "loss": 1.106898546218872, "step": 4408 }, { "epoch": 1.3573682185455944, "grad_norm": 33.75, "learning_rate": 1.9896583140828707e-06, "loss": 1.467529296875, "step": 4410 }, { "epoch": 1.3579838399384379, "grad_norm": 13.25, "learning_rate": 1.987938203777871e-06, "loss": 1.5077505111694336, "step": 4412 }, { "epoch": 1.3585994613312813, "grad_norm": 8.0625, "learning_rate": 1.9862190991813642e-06, "loss": 1.4373395442962646, "step": 4414 }, { "epoch": 1.3592150827241247, "grad_norm": 5.71875, "learning_rate": 1.984501002001663e-06, "loss": 0.9870994091033936, "step": 4416 }, { "epoch": 1.359830704116968, "grad_norm": 5.125, "learning_rate": 1.9827839139460793e-06, "loss": 1.319697380065918, "step": 4418 }, { "epoch": 1.3604463255098116, "grad_norm": 10.3125, "learning_rate": 1.981067836720923e-06, "loss": 1.703061819076538, "step": 4420 }, { "epoch": 1.3610619469026548, "grad_norm": 7.8125, "learning_rate": 1.979352772031497e-06, "loss": 1.3773094415664673, "step": 4422 }, { "epoch": 1.3616775682954982, "grad_norm": 4.75, "learning_rate": 1.9776387215821e-06, "loss": 1.4723691940307617, "step": 4424 }, { "epoch": 1.3622931896883417, "grad_norm": 9.0625, "learning_rate": 1.9759256870760226e-06, "loss": 1.3658338785171509, "step": 4426 }, { "epoch": 1.3629088110811851, "grad_norm": 5.59375, "learning_rate": 1.9742136702155452e-06, "loss": 1.1979423761367798, "step": 4428 }, { "epoch": 1.3635244324740285, "grad_norm": 6.1875, "learning_rate": 1.9725026727019368e-06, "loss": 1.3881921768188477, "step": 4430 }, { "epoch": 1.3641400538668718, "grad_norm": 11.375, "learning_rate": 1.970792696235456e-06, "loss": 1.4705370664596558, "step": 4432 }, { "epoch": 1.3647556752597152, "grad_norm": 7.75, "learning_rate": 1.9690837425153433e-06, "loss": 1.252502679824829, "step": 4434 }, { "epoch": 1.3653712966525586, "grad_norm": 6.71875, "learning_rate": 1.9673758132398245e-06, "loss": 1.3999512195587158, "step": 4436 }, { "epoch": 1.365986918045402, "grad_norm": 7.625, "learning_rate": 1.9656689101061076e-06, "loss": 1.3501346111297607, "step": 4438 }, { "epoch": 1.3666025394382455, "grad_norm": 4.5, "learning_rate": 1.963963034810379e-06, "loss": 1.1640175580978394, "step": 4440 }, { "epoch": 1.367218160831089, "grad_norm": 5.375, "learning_rate": 1.9622581890478066e-06, "loss": 1.3057405948638916, "step": 4442 }, { "epoch": 1.3678337822239324, "grad_norm": 7.625, "learning_rate": 1.9605543745125343e-06, "loss": 1.3908565044403076, "step": 4444 }, { "epoch": 1.3684494036167756, "grad_norm": 9.9375, "learning_rate": 1.9588515928976793e-06, "loss": 1.5239262580871582, "step": 4446 }, { "epoch": 1.369065025009619, "grad_norm": 12.4375, "learning_rate": 1.957149845895336e-06, "loss": 1.6909699440002441, "step": 4448 }, { "epoch": 1.3696806464024625, "grad_norm": 8.625, "learning_rate": 1.9554491351965654e-06, "loss": 1.5218443870544434, "step": 4450 }, { "epoch": 1.370296267795306, "grad_norm": 3.9375, "learning_rate": 1.9537494624914046e-06, "loss": 1.1506916284561157, "step": 4452 }, { "epoch": 1.3709118891881493, "grad_norm": 15.5, "learning_rate": 1.9520508294688558e-06, "loss": 1.3528341054916382, "step": 4454 }, { "epoch": 1.3715275105809928, "grad_norm": 13.875, "learning_rate": 1.950353237816887e-06, "loss": 1.7821015119552612, "step": 4456 }, { "epoch": 1.3721431319738362, "grad_norm": 5.34375, "learning_rate": 1.9486566892224355e-06, "loss": 1.294496774673462, "step": 4458 }, { "epoch": 1.3727587533666794, "grad_norm": 8.375, "learning_rate": 1.9469611853713984e-06, "loss": 1.490309238433838, "step": 4460 }, { "epoch": 1.3733743747595228, "grad_norm": 15.875, "learning_rate": 1.945266727948637e-06, "loss": 0.8911181092262268, "step": 4462 }, { "epoch": 1.3739899961523663, "grad_norm": 7.40625, "learning_rate": 1.9435733186379694e-06, "loss": 1.6356643438339233, "step": 4464 }, { "epoch": 1.3746056175452097, "grad_norm": 17.0, "learning_rate": 1.941880959122177e-06, "loss": 0.9448275566101074, "step": 4466 }, { "epoch": 1.3752212389380531, "grad_norm": 5.59375, "learning_rate": 1.9401896510829935e-06, "loss": 1.2833199501037598, "step": 4468 }, { "epoch": 1.3758368603308964, "grad_norm": 18.875, "learning_rate": 1.93849939620111e-06, "loss": 1.2400034666061401, "step": 4470 }, { "epoch": 1.37645248172374, "grad_norm": 8.0, "learning_rate": 1.9368101961561712e-06, "loss": 1.4103025197982788, "step": 4472 }, { "epoch": 1.3770681031165832, "grad_norm": 4.4375, "learning_rate": 1.935122052626773e-06, "loss": 1.195777416229248, "step": 4474 }, { "epoch": 1.3776837245094267, "grad_norm": 55.5, "learning_rate": 1.933434967290461e-06, "loss": 1.3255105018615723, "step": 4476 }, { "epoch": 1.37829934590227, "grad_norm": 9.5, "learning_rate": 1.9317489418237303e-06, "loss": 1.213265299797058, "step": 4478 }, { "epoch": 1.3789149672951135, "grad_norm": 2.75, "learning_rate": 1.930063977902021e-06, "loss": 1.274376392364502, "step": 4480 }, { "epoch": 1.379530588687957, "grad_norm": 4.9375, "learning_rate": 1.928380077199721e-06, "loss": 1.4887018203735352, "step": 4482 }, { "epoch": 1.3801462100808002, "grad_norm": 6.5625, "learning_rate": 1.926697241390159e-06, "loss": 1.33385169506073, "step": 4484 }, { "epoch": 1.3807618314736438, "grad_norm": 8.1875, "learning_rate": 1.9250154721456075e-06, "loss": 1.7023966312408447, "step": 4486 }, { "epoch": 1.381377452866487, "grad_norm": 4.71875, "learning_rate": 1.9233347711372794e-06, "loss": 1.4938396215438843, "step": 4488 }, { "epoch": 1.3819930742593305, "grad_norm": 14.6875, "learning_rate": 1.9216551400353213e-06, "loss": 1.6272310018539429, "step": 4490 }, { "epoch": 1.382608695652174, "grad_norm": 8.0, "learning_rate": 1.9199765805088237e-06, "loss": 1.546007752418518, "step": 4492 }, { "epoch": 1.3832243170450174, "grad_norm": 8.5625, "learning_rate": 1.9182990942258074e-06, "loss": 1.4731391668319702, "step": 4494 }, { "epoch": 1.3838399384378608, "grad_norm": 43.5, "learning_rate": 1.9166226828532285e-06, "loss": 1.415830373764038, "step": 4496 }, { "epoch": 1.384455559830704, "grad_norm": 6.1875, "learning_rate": 1.9149473480569747e-06, "loss": 1.2627977132797241, "step": 4498 }, { "epoch": 1.3850711812235474, "grad_norm": 13.375, "learning_rate": 1.913273091501863e-06, "loss": 1.481513261795044, "step": 4500 }, { "epoch": 1.3856868026163909, "grad_norm": 6.03125, "learning_rate": 1.9115999148516408e-06, "loss": 0.9864988327026367, "step": 4502 }, { "epoch": 1.3863024240092343, "grad_norm": 6.96875, "learning_rate": 1.9099278197689796e-06, "loss": 1.1418694257736206, "step": 4504 }, { "epoch": 1.3869180454020777, "grad_norm": 12.75, "learning_rate": 1.9082568079154797e-06, "loss": 1.5251163244247437, "step": 4506 }, { "epoch": 1.3875336667949212, "grad_norm": 5.78125, "learning_rate": 1.906586880951662e-06, "loss": 1.3032033443450928, "step": 4508 }, { "epoch": 1.3881492881877646, "grad_norm": 16.375, "learning_rate": 1.9049180405369693e-06, "loss": 1.7152700424194336, "step": 4510 }, { "epoch": 1.3887649095806078, "grad_norm": 10.4375, "learning_rate": 1.9032502883297683e-06, "loss": 1.4913779497146606, "step": 4512 }, { "epoch": 1.3893805309734513, "grad_norm": 10.5625, "learning_rate": 1.9015836259873399e-06, "loss": 1.4394402503967285, "step": 4514 }, { "epoch": 1.3899961523662947, "grad_norm": 4.875, "learning_rate": 1.8999180551658844e-06, "loss": 1.1065744161605835, "step": 4516 }, { "epoch": 1.3906117737591381, "grad_norm": 12.1875, "learning_rate": 1.898253577520516e-06, "loss": 1.1111674308776855, "step": 4518 }, { "epoch": 1.3912273951519816, "grad_norm": 9.0625, "learning_rate": 1.8965901947052648e-06, "loss": 1.5368804931640625, "step": 4520 }, { "epoch": 1.391843016544825, "grad_norm": 9.625, "learning_rate": 1.8949279083730713e-06, "loss": 1.5333424806594849, "step": 4522 }, { "epoch": 1.3924586379376684, "grad_norm": 6.5625, "learning_rate": 1.8932667201757853e-06, "loss": 1.6218065023422241, "step": 4524 }, { "epoch": 1.3930742593305117, "grad_norm": 6.40625, "learning_rate": 1.8916066317641692e-06, "loss": 1.1975196599960327, "step": 4526 }, { "epoch": 1.393689880723355, "grad_norm": 3.390625, "learning_rate": 1.8899476447878875e-06, "loss": 1.0931484699249268, "step": 4528 }, { "epoch": 1.3943055021161985, "grad_norm": 4.625, "learning_rate": 1.8882897608955147e-06, "loss": 1.1063181161880493, "step": 4530 }, { "epoch": 1.394921123509042, "grad_norm": 6.3125, "learning_rate": 1.8866329817345264e-06, "loss": 0.9578396677970886, "step": 4532 }, { "epoch": 1.3955367449018854, "grad_norm": 5.34375, "learning_rate": 1.8849773089513002e-06, "loss": 1.0011837482452393, "step": 4534 }, { "epoch": 1.3961523662947286, "grad_norm": 6.40625, "learning_rate": 1.8833227441911173e-06, "loss": 1.3394837379455566, "step": 4536 }, { "epoch": 1.3967679876875723, "grad_norm": 7.28125, "learning_rate": 1.8816692890981535e-06, "loss": 1.2099369764328003, "step": 4538 }, { "epoch": 1.3973836090804155, "grad_norm": 215.0, "learning_rate": 1.8800169453154873e-06, "loss": 1.6604599952697754, "step": 4540 }, { "epoch": 1.397999230473259, "grad_norm": 12.8125, "learning_rate": 1.8783657144850873e-06, "loss": 1.6320379972457886, "step": 4542 }, { "epoch": 1.3986148518661023, "grad_norm": 7.15625, "learning_rate": 1.876715598247818e-06, "loss": 1.6006578207015991, "step": 4544 }, { "epoch": 1.3992304732589458, "grad_norm": 6.71875, "learning_rate": 1.875066598243439e-06, "loss": 1.2672290802001953, "step": 4546 }, { "epoch": 1.3998460946517892, "grad_norm": 5.46875, "learning_rate": 1.8734187161105971e-06, "loss": 1.3041892051696777, "step": 4548 }, { "epoch": 1.4004617160446324, "grad_norm": 5.875, "learning_rate": 1.8717719534868305e-06, "loss": 1.2353522777557373, "step": 4550 }, { "epoch": 1.401077337437476, "grad_norm": 3.359375, "learning_rate": 1.8701263120085644e-06, "loss": 1.2635188102722168, "step": 4552 }, { "epoch": 1.4016929588303193, "grad_norm": 6.4375, "learning_rate": 1.8684817933111092e-06, "loss": 1.4470961093902588, "step": 4554 }, { "epoch": 1.4023085802231627, "grad_norm": 5.40625, "learning_rate": 1.8668383990286595e-06, "loss": 1.6117013692855835, "step": 4556 }, { "epoch": 1.4029242016160062, "grad_norm": 5.875, "learning_rate": 1.8651961307942927e-06, "loss": 1.3637393712997437, "step": 4558 }, { "epoch": 1.4035398230088496, "grad_norm": 11.5, "learning_rate": 1.8635549902399693e-06, "loss": 1.286697268486023, "step": 4560 }, { "epoch": 1.404155444401693, "grad_norm": 6.21875, "learning_rate": 1.8619149789965262e-06, "loss": 1.1871602535247803, "step": 4562 }, { "epoch": 1.4047710657945363, "grad_norm": 7.125, "learning_rate": 1.860276098693679e-06, "loss": 1.2495115995407104, "step": 4564 }, { "epoch": 1.4053866871873797, "grad_norm": 6.96875, "learning_rate": 1.858638350960022e-06, "loss": 1.014350175857544, "step": 4566 }, { "epoch": 1.4060023085802231, "grad_norm": 19.5, "learning_rate": 1.8570017374230186e-06, "loss": 1.6906726360321045, "step": 4568 }, { "epoch": 1.4066179299730666, "grad_norm": 18.125, "learning_rate": 1.8553662597090108e-06, "loss": 1.248024344444275, "step": 4570 }, { "epoch": 1.40723355136591, "grad_norm": 19.0, "learning_rate": 1.8537319194432079e-06, "loss": 1.3858565092086792, "step": 4572 }, { "epoch": 1.4078491727587534, "grad_norm": 7.8125, "learning_rate": 1.852098718249692e-06, "loss": 1.0407990217208862, "step": 4574 }, { "epoch": 1.4084647941515969, "grad_norm": 4.5, "learning_rate": 1.8504666577514107e-06, "loss": 1.3021090030670166, "step": 4576 }, { "epoch": 1.40908041554444, "grad_norm": 5.4375, "learning_rate": 1.8488357395701795e-06, "loss": 1.17103910446167, "step": 4578 }, { "epoch": 1.4096960369372835, "grad_norm": 4.4375, "learning_rate": 1.847205965326678e-06, "loss": 1.296510934829712, "step": 4580 }, { "epoch": 1.410311658330127, "grad_norm": 17.125, "learning_rate": 1.845577336640449e-06, "loss": 1.4172823429107666, "step": 4582 }, { "epoch": 1.4109272797229704, "grad_norm": 5.75, "learning_rate": 1.8439498551298984e-06, "loss": 1.5029948949813843, "step": 4584 }, { "epoch": 1.4115429011158138, "grad_norm": 8.5625, "learning_rate": 1.8423235224122909e-06, "loss": 1.2382487058639526, "step": 4586 }, { "epoch": 1.4121585225086573, "grad_norm": 4.375, "learning_rate": 1.8406983401037487e-06, "loss": 1.267309546470642, "step": 4588 }, { "epoch": 1.4127741439015007, "grad_norm": 3.625, "learning_rate": 1.8390743098192543e-06, "loss": 1.3358652591705322, "step": 4590 }, { "epoch": 1.413389765294344, "grad_norm": 3.6875, "learning_rate": 1.8374514331726396e-06, "loss": 1.0695524215698242, "step": 4592 }, { "epoch": 1.4140053866871873, "grad_norm": 6.0625, "learning_rate": 1.8358297117765958e-06, "loss": 1.4901816844940186, "step": 4594 }, { "epoch": 1.4146210080800308, "grad_norm": 5.625, "learning_rate": 1.8342091472426637e-06, "loss": 1.594797134399414, "step": 4596 }, { "epoch": 1.4152366294728742, "grad_norm": 9.875, "learning_rate": 1.8325897411812333e-06, "loss": 1.5337915420532227, "step": 4598 }, { "epoch": 1.4158522508657176, "grad_norm": 7.625, "learning_rate": 1.830971495201546e-06, "loss": 1.4680191278457642, "step": 4600 }, { "epoch": 1.4164678722585609, "grad_norm": 18.75, "learning_rate": 1.829354410911688e-06, "loss": 0.921638548374176, "step": 4602 }, { "epoch": 1.4170834936514045, "grad_norm": 4.09375, "learning_rate": 1.8277384899185946e-06, "loss": 1.2589328289031982, "step": 4604 }, { "epoch": 1.4176991150442477, "grad_norm": 12.875, "learning_rate": 1.8261237338280393e-06, "loss": 1.569953441619873, "step": 4606 }, { "epoch": 1.4183147364370912, "grad_norm": 43.25, "learning_rate": 1.824510144244644e-06, "loss": 1.1034214496612549, "step": 4608 }, { "epoch": 1.4189303578299346, "grad_norm": 10.5, "learning_rate": 1.822897722771868e-06, "loss": 1.5207964181900024, "step": 4610 }, { "epoch": 1.419545979222778, "grad_norm": 4.5, "learning_rate": 1.8212864710120096e-06, "loss": 1.3485201597213745, "step": 4612 }, { "epoch": 1.4201616006156215, "grad_norm": 11.25, "learning_rate": 1.8196763905662077e-06, "loss": 0.7749507427215576, "step": 4614 }, { "epoch": 1.4207772220084647, "grad_norm": 48.0, "learning_rate": 1.8180674830344343e-06, "loss": 1.8789231777191162, "step": 4616 }, { "epoch": 1.4213928434013081, "grad_norm": 8.6875, "learning_rate": 1.816459750015497e-06, "loss": 1.3978246450424194, "step": 4618 }, { "epoch": 1.4220084647941515, "grad_norm": 6.125, "learning_rate": 1.8148531931070365e-06, "loss": 1.079241394996643, "step": 4620 }, { "epoch": 1.422624086186995, "grad_norm": 11.0625, "learning_rate": 1.813247813905523e-06, "loss": 1.2193536758422852, "step": 4622 }, { "epoch": 1.4232397075798384, "grad_norm": 5.03125, "learning_rate": 1.81164361400626e-06, "loss": 1.5119633674621582, "step": 4624 }, { "epoch": 1.4238553289726819, "grad_norm": 4.3125, "learning_rate": 1.8100405950033744e-06, "loss": 1.38486909866333, "step": 4626 }, { "epoch": 1.4244709503655253, "grad_norm": 6.65625, "learning_rate": 1.8084387584898244e-06, "loss": 1.3553379774093628, "step": 4628 }, { "epoch": 1.4250865717583685, "grad_norm": 9.375, "learning_rate": 1.8068381060573903e-06, "loss": 1.1260082721710205, "step": 4630 }, { "epoch": 1.425702193151212, "grad_norm": 14.4375, "learning_rate": 1.8052386392966756e-06, "loss": 1.1511904001235962, "step": 4632 }, { "epoch": 1.4263178145440554, "grad_norm": 8.8125, "learning_rate": 1.8036403597971064e-06, "loss": 1.4743871688842773, "step": 4634 }, { "epoch": 1.4269334359368988, "grad_norm": 6.125, "learning_rate": 1.8020432691469289e-06, "loss": 1.2460463047027588, "step": 4636 }, { "epoch": 1.4275490573297422, "grad_norm": 10.625, "learning_rate": 1.8004473689332082e-06, "loss": 1.2400809526443481, "step": 4638 }, { "epoch": 1.4281646787225857, "grad_norm": 10.25, "learning_rate": 1.7988526607418264e-06, "loss": 1.0554633140563965, "step": 4640 }, { "epoch": 1.4287803001154291, "grad_norm": 8.625, "learning_rate": 1.7972591461574801e-06, "loss": 1.2530990839004517, "step": 4642 }, { "epoch": 1.4293959215082723, "grad_norm": 3.53125, "learning_rate": 1.7956668267636806e-06, "loss": 0.9893711805343628, "step": 4644 }, { "epoch": 1.4300115429011158, "grad_norm": 20.25, "learning_rate": 1.7940757041427512e-06, "loss": 1.5199898481369019, "step": 4646 }, { "epoch": 1.4306271642939592, "grad_norm": 21.0, "learning_rate": 1.7924857798758265e-06, "loss": 1.4373661279678345, "step": 4648 }, { "epoch": 1.4312427856868026, "grad_norm": 8.75, "learning_rate": 1.7908970555428504e-06, "loss": 1.1833205223083496, "step": 4650 }, { "epoch": 1.431858407079646, "grad_norm": 5.3125, "learning_rate": 1.789309532722572e-06, "loss": 1.5111973285675049, "step": 4652 }, { "epoch": 1.4324740284724893, "grad_norm": 8.3125, "learning_rate": 1.7877232129925506e-06, "loss": 1.2623931169509888, "step": 4654 }, { "epoch": 1.433089649865333, "grad_norm": 4.09375, "learning_rate": 1.7861380979291464e-06, "loss": 1.082903504371643, "step": 4656 }, { "epoch": 1.4337052712581762, "grad_norm": 6.40625, "learning_rate": 1.7845541891075245e-06, "loss": 1.4036815166473389, "step": 4658 }, { "epoch": 1.4343208926510196, "grad_norm": 12.25, "learning_rate": 1.7829714881016489e-06, "loss": 1.3116552829742432, "step": 4660 }, { "epoch": 1.434936514043863, "grad_norm": 6.40625, "learning_rate": 1.781389996484287e-06, "loss": 1.3516740798950195, "step": 4662 }, { "epoch": 1.4355521354367065, "grad_norm": 12.875, "learning_rate": 1.779809715827002e-06, "loss": 1.2047392129898071, "step": 4664 }, { "epoch": 1.43616775682955, "grad_norm": 6.25, "learning_rate": 1.7782306477001533e-06, "loss": 1.528123378753662, "step": 4666 }, { "epoch": 1.436783378222393, "grad_norm": 8.6875, "learning_rate": 1.776652793672898e-06, "loss": 1.6827585697174072, "step": 4668 }, { "epoch": 1.4373989996152368, "grad_norm": 6.40625, "learning_rate": 1.775076155313183e-06, "loss": 1.4424442052841187, "step": 4670 }, { "epoch": 1.43801462100808, "grad_norm": 3.34375, "learning_rate": 1.7735007341877505e-06, "loss": 1.3284802436828613, "step": 4672 }, { "epoch": 1.4386302424009234, "grad_norm": 6.53125, "learning_rate": 1.7719265318621314e-06, "loss": 0.7595493197441101, "step": 4674 }, { "epoch": 1.4392458637937668, "grad_norm": 12.8125, "learning_rate": 1.7703535499006455e-06, "loss": 1.4551305770874023, "step": 4676 }, { "epoch": 1.4398614851866103, "grad_norm": 2.359375, "learning_rate": 1.7687817898664012e-06, "loss": 0.8711044788360596, "step": 4678 }, { "epoch": 1.4404771065794537, "grad_norm": 8.75, "learning_rate": 1.7672112533212904e-06, "loss": 1.2691121101379395, "step": 4680 }, { "epoch": 1.441092727972297, "grad_norm": 22.75, "learning_rate": 1.7656419418259923e-06, "loss": 1.0050324201583862, "step": 4682 }, { "epoch": 1.4417083493651404, "grad_norm": 12.125, "learning_rate": 1.764073856939965e-06, "loss": 1.3560082912445068, "step": 4684 }, { "epoch": 1.4423239707579838, "grad_norm": 5.78125, "learning_rate": 1.7625070002214502e-06, "loss": 1.340012788772583, "step": 4686 }, { "epoch": 1.4429395921508272, "grad_norm": 3.453125, "learning_rate": 1.7609413732274694e-06, "loss": 1.1772860288619995, "step": 4688 }, { "epoch": 1.4435552135436707, "grad_norm": 8.25, "learning_rate": 1.7593769775138196e-06, "loss": 1.6817734241485596, "step": 4690 }, { "epoch": 1.444170834936514, "grad_norm": 8.4375, "learning_rate": 1.7578138146350776e-06, "loss": 1.3809301853179932, "step": 4692 }, { "epoch": 1.4447864563293575, "grad_norm": 6.53125, "learning_rate": 1.7562518861445923e-06, "loss": 1.1861056089401245, "step": 4694 }, { "epoch": 1.4454020777222008, "grad_norm": 3.28125, "learning_rate": 1.7546911935944878e-06, "loss": 1.0259594917297363, "step": 4696 }, { "epoch": 1.4460176991150442, "grad_norm": 5.1875, "learning_rate": 1.7531317385356587e-06, "loss": 1.2867045402526855, "step": 4698 }, { "epoch": 1.4466333205078876, "grad_norm": 3.546875, "learning_rate": 1.7515735225177698e-06, "loss": 1.3793871402740479, "step": 4700 }, { "epoch": 1.447248941900731, "grad_norm": 7.78125, "learning_rate": 1.7500165470892571e-06, "loss": 1.208519697189331, "step": 4702 }, { "epoch": 1.4478645632935745, "grad_norm": 10.4375, "learning_rate": 1.7484608137973207e-06, "loss": 1.0785921812057495, "step": 4704 }, { "epoch": 1.448480184686418, "grad_norm": 9.1875, "learning_rate": 1.7469063241879272e-06, "loss": 1.0429582595825195, "step": 4706 }, { "epoch": 1.4490958060792614, "grad_norm": 7.21875, "learning_rate": 1.74535307980581e-06, "loss": 1.1869057416915894, "step": 4708 }, { "epoch": 1.4497114274721046, "grad_norm": 5.5625, "learning_rate": 1.7438010821944602e-06, "loss": 1.54345703125, "step": 4710 }, { "epoch": 1.450327048864948, "grad_norm": 4.96875, "learning_rate": 1.742250332896134e-06, "loss": 1.1383198499679565, "step": 4712 }, { "epoch": 1.4509426702577914, "grad_norm": 4.71875, "learning_rate": 1.7407008334518451e-06, "loss": 1.2358094453811646, "step": 4714 }, { "epoch": 1.4515582916506349, "grad_norm": 10.0625, "learning_rate": 1.7391525854013668e-06, "loss": 1.2327531576156616, "step": 4716 }, { "epoch": 1.4521739130434783, "grad_norm": 10.375, "learning_rate": 1.7376055902832273e-06, "loss": 1.297253966331482, "step": 4718 }, { "epoch": 1.4527895344363215, "grad_norm": 7.78125, "learning_rate": 1.7360598496347105e-06, "loss": 1.4092620611190796, "step": 4720 }, { "epoch": 1.4534051558291652, "grad_norm": 11.6875, "learning_rate": 1.7345153649918533e-06, "loss": 1.4216560125350952, "step": 4722 }, { "epoch": 1.4540207772220084, "grad_norm": 6.75, "learning_rate": 1.7329721378894443e-06, "loss": 1.501382827758789, "step": 4724 }, { "epoch": 1.4546363986148518, "grad_norm": 12.25, "learning_rate": 1.731430169861024e-06, "loss": 1.525866985321045, "step": 4726 }, { "epoch": 1.4552520200076953, "grad_norm": 8.1875, "learning_rate": 1.7298894624388796e-06, "loss": 1.4777371883392334, "step": 4728 }, { "epoch": 1.4558676414005387, "grad_norm": 14.125, "learning_rate": 1.7283500171540468e-06, "loss": 1.544064998626709, "step": 4730 }, { "epoch": 1.4564832627933821, "grad_norm": 7.03125, "learning_rate": 1.7268118355363074e-06, "loss": 1.3566229343414307, "step": 4732 }, { "epoch": 1.4570988841862254, "grad_norm": 8.4375, "learning_rate": 1.7252749191141866e-06, "loss": 1.1351234912872314, "step": 4734 }, { "epoch": 1.457714505579069, "grad_norm": 6.78125, "learning_rate": 1.7237392694149527e-06, "loss": 1.142598032951355, "step": 4736 }, { "epoch": 1.4583301269719122, "grad_norm": 7.25, "learning_rate": 1.7222048879646147e-06, "loss": 0.549152672290802, "step": 4738 }, { "epoch": 1.4589457483647557, "grad_norm": 140.0, "learning_rate": 1.7206717762879228e-06, "loss": 0.9815592765808105, "step": 4740 }, { "epoch": 1.459561369757599, "grad_norm": 6.1875, "learning_rate": 1.7191399359083642e-06, "loss": 1.659904956817627, "step": 4742 }, { "epoch": 1.4601769911504425, "grad_norm": 5.09375, "learning_rate": 1.717609368348162e-06, "loss": 1.4122743606567383, "step": 4744 }, { "epoch": 1.460792612543286, "grad_norm": 11.75, "learning_rate": 1.716080075128278e-06, "loss": 1.494467854499817, "step": 4746 }, { "epoch": 1.4614082339361292, "grad_norm": 7.96875, "learning_rate": 1.7145520577684015e-06, "loss": 0.9555761218070984, "step": 4748 }, { "epoch": 1.4620238553289726, "grad_norm": 11.1875, "learning_rate": 1.7130253177869607e-06, "loss": 1.2343807220458984, "step": 4750 }, { "epoch": 1.462639476721816, "grad_norm": 9.6875, "learning_rate": 1.7114998567011105e-06, "loss": 1.5910029411315918, "step": 4752 }, { "epoch": 1.4632550981146595, "grad_norm": 14.0, "learning_rate": 1.7099756760267345e-06, "loss": 1.6015785932540894, "step": 4754 }, { "epoch": 1.463870719507503, "grad_norm": 28.25, "learning_rate": 1.7084527772784466e-06, "loss": 1.5735604763031006, "step": 4756 }, { "epoch": 1.4644863409003464, "grad_norm": 5.53125, "learning_rate": 1.7069311619695852e-06, "loss": 1.325727105140686, "step": 4758 }, { "epoch": 1.4651019622931898, "grad_norm": 11.5625, "learning_rate": 1.7054108316122136e-06, "loss": 1.341804027557373, "step": 4760 }, { "epoch": 1.465717583686033, "grad_norm": 3.390625, "learning_rate": 1.7038917877171179e-06, "loss": 1.2094422578811646, "step": 4762 }, { "epoch": 1.4663332050788764, "grad_norm": 6.1875, "learning_rate": 1.7023740317938053e-06, "loss": 0.8405437469482422, "step": 4764 }, { "epoch": 1.4669488264717199, "grad_norm": 8.125, "learning_rate": 1.700857565350505e-06, "loss": 0.9737740159034729, "step": 4766 }, { "epoch": 1.4675644478645633, "grad_norm": 5.6875, "learning_rate": 1.6993423898941632e-06, "loss": 1.284681797027588, "step": 4768 }, { "epoch": 1.4681800692574067, "grad_norm": 10.5625, "learning_rate": 1.6978285069304444e-06, "loss": 1.6055063009262085, "step": 4770 }, { "epoch": 1.4687956906502502, "grad_norm": 6.84375, "learning_rate": 1.6963159179637274e-06, "loss": 1.386444330215454, "step": 4772 }, { "epoch": 1.4694113120430936, "grad_norm": 8.5, "learning_rate": 1.6948046244971062e-06, "loss": 1.4745707511901855, "step": 4774 }, { "epoch": 1.4700269334359368, "grad_norm": 6.375, "learning_rate": 1.6932946280323865e-06, "loss": 1.4012553691864014, "step": 4776 }, { "epoch": 1.4706425548287803, "grad_norm": 14.5625, "learning_rate": 1.6917859300700848e-06, "loss": 1.0863161087036133, "step": 4778 }, { "epoch": 1.4712581762216237, "grad_norm": 4.78125, "learning_rate": 1.69027853210943e-06, "loss": 1.0291978120803833, "step": 4780 }, { "epoch": 1.4718737976144671, "grad_norm": 5.625, "learning_rate": 1.6887724356483564e-06, "loss": 1.5243959426879883, "step": 4782 }, { "epoch": 1.4724894190073106, "grad_norm": 6.34375, "learning_rate": 1.6872676421835055e-06, "loss": 1.6225019693374634, "step": 4784 }, { "epoch": 1.4731050404001538, "grad_norm": 15.625, "learning_rate": 1.6857641532102254e-06, "loss": 1.6960830688476562, "step": 4786 }, { "epoch": 1.4737206617929974, "grad_norm": 11.0, "learning_rate": 1.6842619702225643e-06, "loss": 1.8426251411437988, "step": 4788 }, { "epoch": 1.4743362831858406, "grad_norm": 3.5625, "learning_rate": 1.682761094713278e-06, "loss": 1.2329729795455933, "step": 4790 }, { "epoch": 1.474951904578684, "grad_norm": 8.3125, "learning_rate": 1.6812615281738178e-06, "loss": 1.261023759841919, "step": 4792 }, { "epoch": 1.4755675259715275, "grad_norm": 4.0, "learning_rate": 1.6797632720943385e-06, "loss": 1.2236747741699219, "step": 4794 }, { "epoch": 1.476183147364371, "grad_norm": 3.953125, "learning_rate": 1.6782663279636902e-06, "loss": 1.0907131433486938, "step": 4796 }, { "epoch": 1.4767987687572144, "grad_norm": 7.625, "learning_rate": 1.6767706972694192e-06, "loss": 1.322651743888855, "step": 4798 }, { "epoch": 1.4774143901500576, "grad_norm": 9.5625, "learning_rate": 1.6752763814977679e-06, "loss": 1.056107759475708, "step": 4800 }, { "epoch": 1.478030011542901, "grad_norm": 6.5, "learning_rate": 1.67378338213367e-06, "loss": 1.2994539737701416, "step": 4802 }, { "epoch": 1.4786456329357445, "grad_norm": 5.6875, "learning_rate": 1.6722917006607548e-06, "loss": 1.452077865600586, "step": 4804 }, { "epoch": 1.479261254328588, "grad_norm": 2.5625, "learning_rate": 1.6708013385613378e-06, "loss": 1.3469157218933105, "step": 4806 }, { "epoch": 1.4798768757214313, "grad_norm": 10.5, "learning_rate": 1.6693122973164255e-06, "loss": 0.9303495287895203, "step": 4808 }, { "epoch": 1.4804924971142748, "grad_norm": 9.0, "learning_rate": 1.6678245784057124e-06, "loss": 1.624355673789978, "step": 4810 }, { "epoch": 1.4811081185071182, "grad_norm": 3.5, "learning_rate": 1.666338183307577e-06, "loss": 1.2143288850784302, "step": 4812 }, { "epoch": 1.4817237398999614, "grad_norm": 10.5, "learning_rate": 1.6648531134990845e-06, "loss": 1.3841214179992676, "step": 4814 }, { "epoch": 1.4823393612928049, "grad_norm": 4.5625, "learning_rate": 1.6633693704559816e-06, "loss": 1.1594114303588867, "step": 4816 }, { "epoch": 1.4829549826856483, "grad_norm": 8.1875, "learning_rate": 1.6618869556526962e-06, "loss": 1.3344064950942993, "step": 4818 }, { "epoch": 1.4835706040784917, "grad_norm": 9.3125, "learning_rate": 1.6604058705623383e-06, "loss": 1.2847368717193604, "step": 4820 }, { "epoch": 1.4841862254713352, "grad_norm": 4.09375, "learning_rate": 1.6589261166566945e-06, "loss": 1.2618248462677002, "step": 4822 }, { "epoch": 1.4848018468641786, "grad_norm": 6.59375, "learning_rate": 1.6574476954062312e-06, "loss": 1.3073136806488037, "step": 4824 }, { "epoch": 1.485417468257022, "grad_norm": 3.140625, "learning_rate": 1.6559706082800859e-06, "loss": 1.0688142776489258, "step": 4826 }, { "epoch": 1.4860330896498652, "grad_norm": 7.0, "learning_rate": 1.6544948567460755e-06, "loss": 1.3833760023117065, "step": 4828 }, { "epoch": 1.4866487110427087, "grad_norm": 5.84375, "learning_rate": 1.6530204422706867e-06, "loss": 1.0907853841781616, "step": 4830 }, { "epoch": 1.4872643324355521, "grad_norm": 3.140625, "learning_rate": 1.6515473663190774e-06, "loss": 1.2678896188735962, "step": 4832 }, { "epoch": 1.4878799538283956, "grad_norm": 9.9375, "learning_rate": 1.6500756303550775e-06, "loss": 1.5102856159210205, "step": 4834 }, { "epoch": 1.488495575221239, "grad_norm": 2.984375, "learning_rate": 1.6486052358411831e-06, "loss": 1.0790684223175049, "step": 4836 }, { "epoch": 1.4891111966140824, "grad_norm": 9.3125, "learning_rate": 1.6471361842385586e-06, "loss": 1.2228312492370605, "step": 4838 }, { "epoch": 1.4897268180069259, "grad_norm": 6.59375, "learning_rate": 1.6456684770070336e-06, "loss": 1.3123070001602173, "step": 4840 }, { "epoch": 1.490342439399769, "grad_norm": 4.6875, "learning_rate": 1.6442021156051009e-06, "loss": 1.1420661211013794, "step": 4842 }, { "epoch": 1.4909580607926125, "grad_norm": 9.6875, "learning_rate": 1.6427371014899175e-06, "loss": 0.6289253830909729, "step": 4844 }, { "epoch": 1.491573682185456, "grad_norm": 7.1875, "learning_rate": 1.6412734361173e-06, "loss": 0.9971387386322021, "step": 4846 }, { "epoch": 1.4921893035782994, "grad_norm": 8.3125, "learning_rate": 1.6398111209417266e-06, "loss": 1.491666555404663, "step": 4848 }, { "epoch": 1.4928049249711428, "grad_norm": 12.625, "learning_rate": 1.638350157416333e-06, "loss": 1.8812816143035889, "step": 4850 }, { "epoch": 1.493420546363986, "grad_norm": 26.25, "learning_rate": 1.6368905469929091e-06, "loss": 1.6962043046951294, "step": 4852 }, { "epoch": 1.4940361677568297, "grad_norm": 7.125, "learning_rate": 1.6354322911219045e-06, "loss": 1.2345105409622192, "step": 4854 }, { "epoch": 1.494651789149673, "grad_norm": 11.0, "learning_rate": 1.6339753912524196e-06, "loss": 1.5342422723770142, "step": 4856 }, { "epoch": 1.4952674105425163, "grad_norm": 8.75, "learning_rate": 1.6325198488322095e-06, "loss": 1.1423943042755127, "step": 4858 }, { "epoch": 1.4958830319353598, "grad_norm": 7.125, "learning_rate": 1.631065665307679e-06, "loss": 1.4692134857177734, "step": 4860 }, { "epoch": 1.4964986533282032, "grad_norm": 1.796875, "learning_rate": 1.6296128421238822e-06, "loss": 1.274204134941101, "step": 4862 }, { "epoch": 1.4971142747210466, "grad_norm": 15.6875, "learning_rate": 1.6281613807245228e-06, "loss": 1.0645694732666016, "step": 4864 }, { "epoch": 1.4977298961138898, "grad_norm": 8.4375, "learning_rate": 1.6267112825519498e-06, "loss": 1.2314739227294922, "step": 4866 }, { "epoch": 1.4983455175067333, "grad_norm": 4.8125, "learning_rate": 1.6252625490471591e-06, "loss": 1.0799189805984497, "step": 4868 }, { "epoch": 1.4989611388995767, "grad_norm": 6.03125, "learning_rate": 1.6238151816497896e-06, "loss": 1.5553315877914429, "step": 4870 }, { "epoch": 1.4995767602924202, "grad_norm": 13.375, "learning_rate": 1.622369181798122e-06, "loss": 0.986903965473175, "step": 4872 }, { "epoch": 1.5001923816852636, "grad_norm": 7.65625, "learning_rate": 1.6209245509290794e-06, "loss": 1.2000123262405396, "step": 4874 }, { "epoch": 1.5008080030781068, "grad_norm": 16.25, "learning_rate": 1.6194812904782236e-06, "loss": 1.2726726531982422, "step": 4876 }, { "epoch": 1.5014236244709505, "grad_norm": 27.875, "learning_rate": 1.6180394018797552e-06, "loss": 1.7936526536941528, "step": 4878 }, { "epoch": 1.5020392458637937, "grad_norm": 3.21875, "learning_rate": 1.61659888656651e-06, "loss": 1.2177774906158447, "step": 4880 }, { "epoch": 1.5026548672566373, "grad_norm": 36.25, "learning_rate": 1.6151597459699622e-06, "loss": 0.8342002034187317, "step": 4882 }, { "epoch": 1.5032704886494805, "grad_norm": 4.8125, "learning_rate": 1.613721981520217e-06, "loss": 1.748741865158081, "step": 4884 }, { "epoch": 1.503886110042324, "grad_norm": 7.125, "learning_rate": 1.6122855946460128e-06, "loss": 1.059945821762085, "step": 4886 }, { "epoch": 1.5045017314351674, "grad_norm": 8.25, "learning_rate": 1.6108505867747215e-06, "loss": 1.352982997894287, "step": 4888 }, { "epoch": 1.5051173528280106, "grad_norm": 21.375, "learning_rate": 1.6094169593323395e-06, "loss": 1.0702686309814453, "step": 4890 }, { "epoch": 1.5057329742208543, "grad_norm": 11.25, "learning_rate": 1.6079847137434967e-06, "loss": 1.5966722965240479, "step": 4892 }, { "epoch": 1.5063485956136975, "grad_norm": 8.75, "learning_rate": 1.6065538514314472e-06, "loss": 1.0752497911453247, "step": 4894 }, { "epoch": 1.506964217006541, "grad_norm": 7.96875, "learning_rate": 1.60512437381807e-06, "loss": 0.8288393020629883, "step": 4896 }, { "epoch": 1.5075798383993844, "grad_norm": 19.875, "learning_rate": 1.6036962823238703e-06, "loss": 0.845901608467102, "step": 4898 }, { "epoch": 1.5081954597922278, "grad_norm": 3.734375, "learning_rate": 1.6022695783679736e-06, "loss": 1.2551945447921753, "step": 4900 }, { "epoch": 1.5088110811850712, "grad_norm": 8.0625, "learning_rate": 1.6008442633681298e-06, "loss": 1.3766522407531738, "step": 4902 }, { "epoch": 1.5094267025779144, "grad_norm": 5.625, "learning_rate": 1.5994203387407036e-06, "loss": 1.263414978981018, "step": 4904 }, { "epoch": 1.510042323970758, "grad_norm": 131.0, "learning_rate": 1.5979978059006819e-06, "loss": 1.2265853881835938, "step": 4906 }, { "epoch": 1.5106579453636013, "grad_norm": 5.625, "learning_rate": 1.5965766662616677e-06, "loss": 1.5353926420211792, "step": 4908 }, { "epoch": 1.5112735667564448, "grad_norm": 11.5625, "learning_rate": 1.5951569212358787e-06, "loss": 0.6604315638542175, "step": 4910 }, { "epoch": 1.5118891881492882, "grad_norm": 9.375, "learning_rate": 1.5937385722341481e-06, "loss": 1.011427402496338, "step": 4912 }, { "epoch": 1.5125048095421316, "grad_norm": 3.625, "learning_rate": 1.5923216206659213e-06, "loss": 1.1152572631835938, "step": 4914 }, { "epoch": 1.513120430934975, "grad_norm": 8.1875, "learning_rate": 1.590906067939254e-06, "loss": 1.1291025876998901, "step": 4916 }, { "epoch": 1.5137360523278183, "grad_norm": 2.4375, "learning_rate": 1.589491915460813e-06, "loss": 1.1154942512512207, "step": 4918 }, { "epoch": 1.514351673720662, "grad_norm": 5.3125, "learning_rate": 1.5880791646358728e-06, "loss": 1.1070373058319092, "step": 4920 }, { "epoch": 1.5149672951135051, "grad_norm": 6.5625, "learning_rate": 1.5866678168683167e-06, "loss": 1.3664308786392212, "step": 4922 }, { "epoch": 1.5155829165063486, "grad_norm": 5.0625, "learning_rate": 1.5852578735606317e-06, "loss": 1.3333261013031006, "step": 4924 }, { "epoch": 1.516198537899192, "grad_norm": 9.3125, "learning_rate": 1.58384933611391e-06, "loss": 1.675808072090149, "step": 4926 }, { "epoch": 1.5168141592920354, "grad_norm": 14.375, "learning_rate": 1.5824422059278486e-06, "loss": 1.5378636121749878, "step": 4928 }, { "epoch": 1.5174297806848789, "grad_norm": 14.8125, "learning_rate": 1.5810364844007414e-06, "loss": 1.34841787815094, "step": 4930 }, { "epoch": 1.518045402077722, "grad_norm": 7.375, "learning_rate": 1.5796321729294875e-06, "loss": 1.4807690382003784, "step": 4932 }, { "epoch": 1.5186610234705658, "grad_norm": 7.90625, "learning_rate": 1.5782292729095815e-06, "loss": 1.42064368724823, "step": 4934 }, { "epoch": 1.519276644863409, "grad_norm": 6.59375, "learning_rate": 1.576827785735118e-06, "loss": 1.2580657005310059, "step": 4936 }, { "epoch": 1.5198922662562524, "grad_norm": 4.8125, "learning_rate": 1.5754277127987852e-06, "loss": 1.345978021621704, "step": 4938 }, { "epoch": 1.5205078876490958, "grad_norm": 15.3125, "learning_rate": 1.5740290554918675e-06, "loss": 1.3176696300506592, "step": 4940 }, { "epoch": 1.521123509041939, "grad_norm": 6.6875, "learning_rate": 1.5726318152042413e-06, "loss": 1.3076728582382202, "step": 4942 }, { "epoch": 1.5217391304347827, "grad_norm": 21.0, "learning_rate": 1.5712359933243754e-06, "loss": 1.3914289474487305, "step": 4944 }, { "epoch": 1.522354751827626, "grad_norm": 8.5625, "learning_rate": 1.5698415912393306e-06, "loss": 1.3952226638793945, "step": 4946 }, { "epoch": 1.5229703732204696, "grad_norm": 5.96875, "learning_rate": 1.5684486103347549e-06, "loss": 1.305515170097351, "step": 4948 }, { "epoch": 1.5235859946133128, "grad_norm": 4.0, "learning_rate": 1.5670570519948836e-06, "loss": 1.1404972076416016, "step": 4950 }, { "epoch": 1.5242016160061562, "grad_norm": 7.21875, "learning_rate": 1.565666917602541e-06, "loss": 1.5215786695480347, "step": 4952 }, { "epoch": 1.5248172373989997, "grad_norm": 5.84375, "learning_rate": 1.5642782085391345e-06, "loss": 1.050858974456787, "step": 4954 }, { "epoch": 1.5254328587918429, "grad_norm": 9.9375, "learning_rate": 1.5628909261846547e-06, "loss": 1.2226909399032593, "step": 4956 }, { "epoch": 1.5260484801846865, "grad_norm": 2.921875, "learning_rate": 1.5615050719176758e-06, "loss": 1.2626575231552124, "step": 4958 }, { "epoch": 1.5266641015775297, "grad_norm": 6.75, "learning_rate": 1.560120647115351e-06, "loss": 1.6576193571090698, "step": 4960 }, { "epoch": 1.5272797229703732, "grad_norm": 3.890625, "learning_rate": 1.5587376531534162e-06, "loss": 0.989402711391449, "step": 4962 }, { "epoch": 1.5278953443632166, "grad_norm": 7.75, "learning_rate": 1.557356091406182e-06, "loss": 1.3007752895355225, "step": 4964 }, { "epoch": 1.52851096575606, "grad_norm": 4.78125, "learning_rate": 1.5559759632465388e-06, "loss": 1.287515640258789, "step": 4966 }, { "epoch": 1.5291265871489035, "grad_norm": 4.71875, "learning_rate": 1.554597270045949e-06, "loss": 1.1778684854507446, "step": 4968 }, { "epoch": 1.5297422085417467, "grad_norm": 16.625, "learning_rate": 1.553220013174452e-06, "loss": 1.0114957094192505, "step": 4970 }, { "epoch": 1.5303578299345904, "grad_norm": 61.5, "learning_rate": 1.551844194000659e-06, "loss": 1.2502862215042114, "step": 4972 }, { "epoch": 1.5309734513274336, "grad_norm": 1.8671875, "learning_rate": 1.5504698138917515e-06, "loss": 1.044121265411377, "step": 4974 }, { "epoch": 1.531589072720277, "grad_norm": 8.125, "learning_rate": 1.5490968742134826e-06, "loss": 1.355575680732727, "step": 4976 }, { "epoch": 1.5322046941131204, "grad_norm": 7.0, "learning_rate": 1.5477253763301734e-06, "loss": 1.244855523109436, "step": 4978 }, { "epoch": 1.5328203155059639, "grad_norm": 6.5, "learning_rate": 1.5463553216047114e-06, "loss": 1.4011290073394775, "step": 4980 }, { "epoch": 1.5334359368988073, "grad_norm": 8.375, "learning_rate": 1.5449867113985512e-06, "loss": 1.6376794576644897, "step": 4982 }, { "epoch": 1.5340515582916505, "grad_norm": 6.65625, "learning_rate": 1.5436195470717104e-06, "loss": 1.328403115272522, "step": 4984 }, { "epoch": 1.5346671796844942, "grad_norm": 9.75, "learning_rate": 1.5422538299827725e-06, "loss": 1.6750003099441528, "step": 4986 }, { "epoch": 1.5352828010773374, "grad_norm": 13.3125, "learning_rate": 1.5408895614888798e-06, "loss": 1.4268121719360352, "step": 4988 }, { "epoch": 1.5358984224701808, "grad_norm": 5.78125, "learning_rate": 1.5395267429457371e-06, "loss": 1.058072805404663, "step": 4990 }, { "epoch": 1.5365140438630243, "grad_norm": 2.328125, "learning_rate": 1.5381653757076082e-06, "loss": 1.1522841453552246, "step": 4992 }, { "epoch": 1.5371296652558677, "grad_norm": 5.96875, "learning_rate": 1.5368054611273133e-06, "loss": 1.2712756395339966, "step": 4994 }, { "epoch": 1.5377452866487111, "grad_norm": 14.75, "learning_rate": 1.5354470005562306e-06, "loss": 1.5364612340927124, "step": 4996 }, { "epoch": 1.5383609080415543, "grad_norm": 7.34375, "learning_rate": 1.5340899953442923e-06, "loss": 1.0772629976272583, "step": 4998 }, { "epoch": 1.538976529434398, "grad_norm": 9.625, "learning_rate": 1.5327344468399852e-06, "loss": 1.364283800125122, "step": 5000 }, { "epoch": 1.5395921508272412, "grad_norm": 8.8125, "learning_rate": 1.5313803563903485e-06, "loss": 1.3770108222961426, "step": 5002 }, { "epoch": 1.5402077722200846, "grad_norm": 3.75, "learning_rate": 1.5300277253409715e-06, "loss": 0.8861935138702393, "step": 5004 }, { "epoch": 1.540823393612928, "grad_norm": 6.90625, "learning_rate": 1.5286765550359958e-06, "loss": 1.168266773223877, "step": 5006 }, { "epoch": 1.5414390150057713, "grad_norm": 2.578125, "learning_rate": 1.527326846818107e-06, "loss": 1.2080166339874268, "step": 5008 }, { "epoch": 1.542054636398615, "grad_norm": 4.5, "learning_rate": 1.525978602028542e-06, "loss": 1.2628138065338135, "step": 5010 }, { "epoch": 1.5426702577914582, "grad_norm": 5.71875, "learning_rate": 1.5246318220070818e-06, "loss": 1.3403091430664062, "step": 5012 }, { "epoch": 1.5432858791843016, "grad_norm": 6.0625, "learning_rate": 1.5232865080920512e-06, "loss": 1.5233333110809326, "step": 5014 }, { "epoch": 1.543901500577145, "grad_norm": 3.875, "learning_rate": 1.5219426616203198e-06, "loss": 1.4889639616012573, "step": 5016 }, { "epoch": 1.5445171219699885, "grad_norm": 12.5625, "learning_rate": 1.5206002839272973e-06, "loss": 1.616183876991272, "step": 5018 }, { "epoch": 1.545132743362832, "grad_norm": 8.3125, "learning_rate": 1.5192593763469346e-06, "loss": 1.3438105583190918, "step": 5020 }, { "epoch": 1.5457483647556751, "grad_norm": 9.75, "learning_rate": 1.5179199402117214e-06, "loss": 1.2317559719085693, "step": 5022 }, { "epoch": 1.5463639861485188, "grad_norm": 28.25, "learning_rate": 1.516581976852686e-06, "loss": 1.338000774383545, "step": 5024 }, { "epoch": 1.546979607541362, "grad_norm": 6.65625, "learning_rate": 1.5152454875993921e-06, "loss": 1.1061460971832275, "step": 5026 }, { "epoch": 1.5475952289342054, "grad_norm": 11.125, "learning_rate": 1.513910473779939e-06, "loss": 1.7041362524032593, "step": 5028 }, { "epoch": 1.5482108503270489, "grad_norm": 9.0625, "learning_rate": 1.5125769367209603e-06, "loss": 1.3146107196807861, "step": 5030 }, { "epoch": 1.5488264717198923, "grad_norm": 6.125, "learning_rate": 1.5112448777476216e-06, "loss": 1.7176063060760498, "step": 5032 }, { "epoch": 1.5494420931127357, "grad_norm": 4.1875, "learning_rate": 1.5099142981836193e-06, "loss": 1.3725917339324951, "step": 5034 }, { "epoch": 1.550057714505579, "grad_norm": 9.625, "learning_rate": 1.5085851993511807e-06, "loss": 1.3335912227630615, "step": 5036 }, { "epoch": 1.5506733358984226, "grad_norm": 8.8125, "learning_rate": 1.5072575825710601e-06, "loss": 1.2308400869369507, "step": 5038 }, { "epoch": 1.5512889572912658, "grad_norm": 7.96875, "learning_rate": 1.5059314491625413e-06, "loss": 1.579362392425537, "step": 5040 }, { "epoch": 1.5519045786841092, "grad_norm": 7.40625, "learning_rate": 1.5046068004434318e-06, "loss": 1.0912820100784302, "step": 5042 }, { "epoch": 1.5525202000769527, "grad_norm": 22.25, "learning_rate": 1.5032836377300663e-06, "loss": 1.201651930809021, "step": 5044 }, { "epoch": 1.5531358214697961, "grad_norm": 13.6875, "learning_rate": 1.501961962337299e-06, "loss": 1.3447504043579102, "step": 5046 }, { "epoch": 1.5537514428626396, "grad_norm": 36.5, "learning_rate": 1.5006417755785096e-06, "loss": 1.0574091672897339, "step": 5048 }, { "epoch": 1.5543670642554828, "grad_norm": 9.5, "learning_rate": 1.4993230787655978e-06, "loss": 1.2444021701812744, "step": 5050 }, { "epoch": 1.5549826856483264, "grad_norm": 29.625, "learning_rate": 1.4980058732089807e-06, "loss": 1.0119298696517944, "step": 5052 }, { "epoch": 1.5555983070411696, "grad_norm": 6.53125, "learning_rate": 1.4966901602175965e-06, "loss": 1.5215494632720947, "step": 5054 }, { "epoch": 1.556213928434013, "grad_norm": 11.3125, "learning_rate": 1.495375941098898e-06, "loss": 1.2245548963546753, "step": 5056 }, { "epoch": 1.5568295498268565, "grad_norm": 4.65625, "learning_rate": 1.4940632171588544e-06, "loss": 1.3511286973953247, "step": 5058 }, { "epoch": 1.5574451712196997, "grad_norm": 15.75, "learning_rate": 1.4927519897019482e-06, "loss": 1.4330006837844849, "step": 5060 }, { "epoch": 1.5580607926125434, "grad_norm": 12.4375, "learning_rate": 1.491442260031176e-06, "loss": 1.7162457704544067, "step": 5062 }, { "epoch": 1.5586764140053866, "grad_norm": 22.5, "learning_rate": 1.490134029448046e-06, "loss": 1.6139112710952759, "step": 5064 }, { "epoch": 1.5592920353982302, "grad_norm": 15.875, "learning_rate": 1.4888272992525758e-06, "loss": 1.2123407125473022, "step": 5066 }, { "epoch": 1.5599076567910735, "grad_norm": 6.4375, "learning_rate": 1.487522070743292e-06, "loss": 1.2695972919464111, "step": 5068 }, { "epoch": 1.560523278183917, "grad_norm": 3.125, "learning_rate": 1.486218345217231e-06, "loss": 1.2351018190383911, "step": 5070 }, { "epoch": 1.5611388995767603, "grad_norm": 7.78125, "learning_rate": 1.484916123969932e-06, "loss": 1.5654300451278687, "step": 5072 }, { "epoch": 1.5617545209696035, "grad_norm": 2.828125, "learning_rate": 1.4836154082954428e-06, "loss": 1.0870963335037231, "step": 5074 }, { "epoch": 1.5623701423624472, "grad_norm": 3.28125, "learning_rate": 1.4823161994863134e-06, "loss": 1.126512050628662, "step": 5076 }, { "epoch": 1.5629857637552904, "grad_norm": 22.125, "learning_rate": 1.4810184988335965e-06, "loss": 1.484130620956421, "step": 5078 }, { "epoch": 1.5636013851481338, "grad_norm": 8.5625, "learning_rate": 1.479722307626847e-06, "loss": 1.8093156814575195, "step": 5080 }, { "epoch": 1.5642170065409773, "grad_norm": 20.0, "learning_rate": 1.4784276271541188e-06, "loss": 1.2959030866622925, "step": 5082 }, { "epoch": 1.5648326279338207, "grad_norm": 6.6875, "learning_rate": 1.4771344587019644e-06, "loss": 1.2962727546691895, "step": 5084 }, { "epoch": 1.5654482493266642, "grad_norm": 4.71875, "learning_rate": 1.4758428035554345e-06, "loss": 1.52423095703125, "step": 5086 }, { "epoch": 1.5660638707195074, "grad_norm": 3.734375, "learning_rate": 1.4745526629980766e-06, "loss": 1.4872410297393799, "step": 5088 }, { "epoch": 1.566679492112351, "grad_norm": 14.1875, "learning_rate": 1.4732640383119312e-06, "loss": 1.4265644550323486, "step": 5090 }, { "epoch": 1.5672951135051942, "grad_norm": 5.53125, "learning_rate": 1.4719769307775337e-06, "loss": 1.2954037189483643, "step": 5092 }, { "epoch": 1.5679107348980377, "grad_norm": 2.5, "learning_rate": 1.4706913416739123e-06, "loss": 0.825425386428833, "step": 5094 }, { "epoch": 1.568526356290881, "grad_norm": 17.875, "learning_rate": 1.4694072722785857e-06, "loss": 1.2906631231307983, "step": 5096 }, { "epoch": 1.5691419776837245, "grad_norm": 5.0, "learning_rate": 1.4681247238675622e-06, "loss": 1.4937584400177002, "step": 5098 }, { "epoch": 1.569757599076568, "grad_norm": 4.9375, "learning_rate": 1.4668436977153383e-06, "loss": 1.3391203880310059, "step": 5100 }, { "epoch": 1.5703732204694112, "grad_norm": 4.1875, "learning_rate": 1.4655641950948993e-06, "loss": 0.9629199504852295, "step": 5102 }, { "epoch": 1.5709888418622548, "grad_norm": 4.03125, "learning_rate": 1.4642862172777154e-06, "loss": 1.2397006750106812, "step": 5104 }, { "epoch": 1.571604463255098, "grad_norm": 9.5625, "learning_rate": 1.463009765533741e-06, "loss": 1.2248990535736084, "step": 5106 }, { "epoch": 1.5722200846479415, "grad_norm": 11.125, "learning_rate": 1.4617348411314167e-06, "loss": 1.2453210353851318, "step": 5108 }, { "epoch": 1.572835706040785, "grad_norm": 4.96875, "learning_rate": 1.4604614453376613e-06, "loss": 1.1807498931884766, "step": 5110 }, { "epoch": 1.5734513274336284, "grad_norm": 3.9375, "learning_rate": 1.459189579417878e-06, "loss": 1.058799386024475, "step": 5112 }, { "epoch": 1.5740669488264718, "grad_norm": 4.0, "learning_rate": 1.4579192446359483e-06, "loss": 1.3278148174285889, "step": 5114 }, { "epoch": 1.574682570219315, "grad_norm": 18.625, "learning_rate": 1.4566504422542316e-06, "loss": 1.2750755548477173, "step": 5116 }, { "epoch": 1.5752981916121587, "grad_norm": 4.59375, "learning_rate": 1.4553831735335667e-06, "loss": 1.2513916492462158, "step": 5118 }, { "epoch": 1.5759138130050019, "grad_norm": 3.765625, "learning_rate": 1.4541174397332659e-06, "loss": 1.1057507991790771, "step": 5120 }, { "epoch": 1.5765294343978453, "grad_norm": 10.4375, "learning_rate": 1.4528532421111175e-06, "loss": 1.0204139947891235, "step": 5122 }, { "epoch": 1.5771450557906888, "grad_norm": 8.3125, "learning_rate": 1.451590581923383e-06, "loss": 1.296351671218872, "step": 5124 }, { "epoch": 1.577760677183532, "grad_norm": 4.75, "learning_rate": 1.4503294604247953e-06, "loss": 0.7441580891609192, "step": 5126 }, { "epoch": 1.5783762985763756, "grad_norm": 18.75, "learning_rate": 1.449069878868561e-06, "loss": 1.7373453378677368, "step": 5128 }, { "epoch": 1.5789919199692188, "grad_norm": 6.96875, "learning_rate": 1.4478118385063526e-06, "loss": 1.455741047859192, "step": 5130 }, { "epoch": 1.5796075413620625, "grad_norm": 12.75, "learning_rate": 1.4465553405883146e-06, "loss": 1.3124862909317017, "step": 5132 }, { "epoch": 1.5802231627549057, "grad_norm": 9.1875, "learning_rate": 1.4453003863630564e-06, "loss": 1.0531606674194336, "step": 5134 }, { "epoch": 1.5808387841477491, "grad_norm": 4.9375, "learning_rate": 1.4440469770776538e-06, "loss": 1.234816551208496, "step": 5136 }, { "epoch": 1.5814544055405926, "grad_norm": 41.5, "learning_rate": 1.4427951139776483e-06, "loss": 1.5872516632080078, "step": 5138 }, { "epoch": 1.5820700269334358, "grad_norm": 9.125, "learning_rate": 1.4415447983070435e-06, "loss": 1.4138165712356567, "step": 5140 }, { "epoch": 1.5826856483262794, "grad_norm": 7.1875, "learning_rate": 1.4402960313083072e-06, "loss": 1.2016167640686035, "step": 5142 }, { "epoch": 1.5833012697191227, "grad_norm": 5.53125, "learning_rate": 1.4390488142223668e-06, "loss": 1.3272122144699097, "step": 5144 }, { "epoch": 1.583916891111966, "grad_norm": 6.25, "learning_rate": 1.437803148288609e-06, "loss": 1.3010327816009521, "step": 5146 }, { "epoch": 1.5845325125048095, "grad_norm": 12.5625, "learning_rate": 1.436559034744882e-06, "loss": 1.4370824098587036, "step": 5148 }, { "epoch": 1.585148133897653, "grad_norm": 13.5625, "learning_rate": 1.4353164748274867e-06, "loss": 1.282833456993103, "step": 5150 }, { "epoch": 1.5857637552904964, "grad_norm": 13.8125, "learning_rate": 1.4340754697711848e-06, "loss": 1.8019992113113403, "step": 5152 }, { "epoch": 1.5863793766833396, "grad_norm": 6.9375, "learning_rate": 1.4328360208091893e-06, "loss": 1.5347020626068115, "step": 5154 }, { "epoch": 1.5869949980761833, "grad_norm": 5.46875, "learning_rate": 1.4315981291731698e-06, "loss": 1.2625937461853027, "step": 5156 }, { "epoch": 1.5876106194690265, "grad_norm": 3.359375, "learning_rate": 1.4303617960932467e-06, "loss": 1.111945629119873, "step": 5158 }, { "epoch": 1.58822624086187, "grad_norm": 9.9375, "learning_rate": 1.4291270227979912e-06, "loss": 1.1782164573669434, "step": 5160 }, { "epoch": 1.5888418622547134, "grad_norm": 5.375, "learning_rate": 1.4278938105144257e-06, "loss": 1.3335965871810913, "step": 5162 }, { "epoch": 1.5894574836475568, "grad_norm": 7.46875, "learning_rate": 1.42666216046802e-06, "loss": 1.5547248125076294, "step": 5164 }, { "epoch": 1.5900731050404002, "grad_norm": 6.25, "learning_rate": 1.425432073882694e-06, "loss": 1.1728477478027344, "step": 5166 }, { "epoch": 1.5906887264332434, "grad_norm": 8.6875, "learning_rate": 1.4242035519808113e-06, "loss": 1.1617035865783691, "step": 5168 }, { "epoch": 1.591304347826087, "grad_norm": 5.9375, "learning_rate": 1.4229765959831813e-06, "loss": 1.0882588624954224, "step": 5170 }, { "epoch": 1.5919199692189303, "grad_norm": 8.5, "learning_rate": 1.4217512071090587e-06, "loss": 1.203486442565918, "step": 5172 }, { "epoch": 1.5925355906117737, "grad_norm": 5.3125, "learning_rate": 1.4205273865761393e-06, "loss": 1.2833119630813599, "step": 5174 }, { "epoch": 1.5931512120046172, "grad_norm": 9.8125, "learning_rate": 1.4193051356005608e-06, "loss": 1.192473292350769, "step": 5176 }, { "epoch": 1.5937668333974606, "grad_norm": 6.84375, "learning_rate": 1.418084455396902e-06, "loss": 1.3609299659729004, "step": 5178 }, { "epoch": 1.594382454790304, "grad_norm": 10.625, "learning_rate": 1.416865347178179e-06, "loss": 1.1379776000976562, "step": 5180 }, { "epoch": 1.5949980761831473, "grad_norm": 5.71875, "learning_rate": 1.4156478121558484e-06, "loss": 1.5259006023406982, "step": 5182 }, { "epoch": 1.595613697575991, "grad_norm": 7.90625, "learning_rate": 1.4144318515398012e-06, "loss": 1.2174557447433472, "step": 5184 }, { "epoch": 1.5962293189688341, "grad_norm": 12.875, "learning_rate": 1.4132174665383658e-06, "loss": 1.5642627477645874, "step": 5186 }, { "epoch": 1.5968449403616776, "grad_norm": 7.28125, "learning_rate": 1.4120046583583019e-06, "loss": 1.466805100440979, "step": 5188 }, { "epoch": 1.597460561754521, "grad_norm": 6.15625, "learning_rate": 1.4107934282048056e-06, "loss": 1.3336026668548584, "step": 5190 }, { "epoch": 1.5980761831473642, "grad_norm": 5.5625, "learning_rate": 1.4095837772815033e-06, "loss": 1.275206208229065, "step": 5192 }, { "epoch": 1.5986918045402079, "grad_norm": 6.3125, "learning_rate": 1.4083757067904513e-06, "loss": 1.2710078954696655, "step": 5194 }, { "epoch": 1.599307425933051, "grad_norm": 2.40625, "learning_rate": 1.4071692179321378e-06, "loss": 1.1343597173690796, "step": 5196 }, { "epoch": 1.5999230473258945, "grad_norm": 17.5, "learning_rate": 1.405964311905477e-06, "loss": 1.1442995071411133, "step": 5198 }, { "epoch": 1.600538668718738, "grad_norm": 8.4375, "learning_rate": 1.4047609899078107e-06, "loss": 1.3664504289627075, "step": 5200 }, { "epoch": 1.6011542901115814, "grad_norm": 23.875, "learning_rate": 1.4035592531349079e-06, "loss": 1.2492691278457642, "step": 5202 }, { "epoch": 1.6017699115044248, "grad_norm": 9.1875, "learning_rate": 1.4023591027809601e-06, "loss": 1.0259876251220703, "step": 5204 }, { "epoch": 1.602385532897268, "grad_norm": 2.203125, "learning_rate": 1.4011605400385847e-06, "loss": 1.194952368736267, "step": 5206 }, { "epoch": 1.6030011542901117, "grad_norm": 9.3125, "learning_rate": 1.3999635660988199e-06, "loss": 1.3531898260116577, "step": 5208 }, { "epoch": 1.603616775682955, "grad_norm": 6.3125, "learning_rate": 1.3987681821511255e-06, "loss": 1.63215172290802, "step": 5210 }, { "epoch": 1.6042323970757983, "grad_norm": 3.265625, "learning_rate": 1.3975743893833823e-06, "loss": 1.2054312229156494, "step": 5212 }, { "epoch": 1.6048480184686418, "grad_norm": 7.25, "learning_rate": 1.3963821889818868e-06, "loss": 1.4407352209091187, "step": 5214 }, { "epoch": 1.6054636398614852, "grad_norm": 28.625, "learning_rate": 1.3951915821313572e-06, "loss": 0.6785837411880493, "step": 5216 }, { "epoch": 1.6060792612543286, "grad_norm": 9.9375, "learning_rate": 1.394002570014925e-06, "loss": 1.1350946426391602, "step": 5218 }, { "epoch": 1.6066948826471719, "grad_norm": 7.96875, "learning_rate": 1.3928151538141393e-06, "loss": 1.3565690517425537, "step": 5220 }, { "epoch": 1.6073105040400155, "grad_norm": 8.3125, "learning_rate": 1.3916293347089618e-06, "loss": 1.1012502908706665, "step": 5222 }, { "epoch": 1.6079261254328587, "grad_norm": 3.96875, "learning_rate": 1.3904451138777666e-06, "loss": 1.0895153284072876, "step": 5224 }, { "epoch": 1.6085417468257022, "grad_norm": 3.71875, "learning_rate": 1.3892624924973425e-06, "loss": 1.1453642845153809, "step": 5226 }, { "epoch": 1.6091573682185456, "grad_norm": 4.90625, "learning_rate": 1.3880814717428844e-06, "loss": 0.972252607345581, "step": 5228 }, { "epoch": 1.609772989611389, "grad_norm": 11.375, "learning_rate": 1.386902052788001e-06, "loss": 1.5603035688400269, "step": 5230 }, { "epoch": 1.6103886110042325, "grad_norm": 10.5625, "learning_rate": 1.3857242368047065e-06, "loss": 1.1489510536193848, "step": 5232 }, { "epoch": 1.6110042323970757, "grad_norm": 9.0, "learning_rate": 1.3845480249634227e-06, "loss": 2.005187511444092, "step": 5234 }, { "epoch": 1.6116198537899193, "grad_norm": 4.6875, "learning_rate": 1.383373418432979e-06, "loss": 1.503325343132019, "step": 5236 }, { "epoch": 1.6122354751827626, "grad_norm": 5.59375, "learning_rate": 1.382200418380607e-06, "loss": 1.2843276262283325, "step": 5238 }, { "epoch": 1.612851096575606, "grad_norm": 6.25, "learning_rate": 1.381029025971944e-06, "loss": 1.280766487121582, "step": 5240 }, { "epoch": 1.6134667179684494, "grad_norm": 5.75, "learning_rate": 1.3798592423710278e-06, "loss": 1.1898144483566284, "step": 5242 }, { "epoch": 1.6140823393612926, "grad_norm": 6.78125, "learning_rate": 1.3786910687402998e-06, "loss": 1.2278223037719727, "step": 5244 }, { "epoch": 1.6146979607541363, "grad_norm": 14.625, "learning_rate": 1.3775245062405996e-06, "loss": 1.2321878671646118, "step": 5246 }, { "epoch": 1.6153135821469795, "grad_norm": 6.8125, "learning_rate": 1.3763595560311663e-06, "loss": 1.470040202140808, "step": 5248 }, { "epoch": 1.6159292035398232, "grad_norm": 7.625, "learning_rate": 1.3751962192696378e-06, "loss": 1.649686336517334, "step": 5250 }, { "epoch": 1.6165448249326664, "grad_norm": 11.25, "learning_rate": 1.3740344971120478e-06, "loss": 0.983123242855072, "step": 5252 }, { "epoch": 1.6171604463255098, "grad_norm": 4.3125, "learning_rate": 1.372874390712825e-06, "loss": 1.2789380550384521, "step": 5254 }, { "epoch": 1.6177760677183533, "grad_norm": 12.75, "learning_rate": 1.3717159012247938e-06, "loss": 1.4643131494522095, "step": 5256 }, { "epoch": 1.6183916891111965, "grad_norm": 9.875, "learning_rate": 1.3705590297991705e-06, "loss": 0.8083310127258301, "step": 5258 }, { "epoch": 1.6190073105040401, "grad_norm": 6.78125, "learning_rate": 1.3694037775855651e-06, "loss": 1.3276646137237549, "step": 5260 }, { "epoch": 1.6196229318968833, "grad_norm": 14.6875, "learning_rate": 1.3682501457319764e-06, "loss": 1.3942234516143799, "step": 5262 }, { "epoch": 1.6202385532897268, "grad_norm": 20.0, "learning_rate": 1.3670981353847955e-06, "loss": 1.4684497117996216, "step": 5264 }, { "epoch": 1.6208541746825702, "grad_norm": 6.59375, "learning_rate": 1.3659477476888006e-06, "loss": 1.1888436079025269, "step": 5266 }, { "epoch": 1.6214697960754136, "grad_norm": 5.875, "learning_rate": 1.3647989837871565e-06, "loss": 1.2587193250656128, "step": 5268 }, { "epoch": 1.622085417468257, "grad_norm": 7.5, "learning_rate": 1.3636518448214172e-06, "loss": 1.6369602680206299, "step": 5270 }, { "epoch": 1.6227010388611003, "grad_norm": 5.78125, "learning_rate": 1.362506331931519e-06, "loss": 1.1179128885269165, "step": 5272 }, { "epoch": 1.623316660253944, "grad_norm": 4.75, "learning_rate": 1.3613624462557857e-06, "loss": 1.1202988624572754, "step": 5274 }, { "epoch": 1.6239322816467872, "grad_norm": 32.25, "learning_rate": 1.3602201889309204e-06, "loss": 1.6366544961929321, "step": 5276 }, { "epoch": 1.6245479030396306, "grad_norm": 13.625, "learning_rate": 1.3590795610920108e-06, "loss": 1.451014518737793, "step": 5278 }, { "epoch": 1.625163524432474, "grad_norm": 8.375, "learning_rate": 1.3579405638725238e-06, "loss": 1.1452707052230835, "step": 5280 }, { "epoch": 1.6257791458253175, "grad_norm": 9.8125, "learning_rate": 1.356803198404306e-06, "loss": 1.3378841876983643, "step": 5282 }, { "epoch": 1.626394767218161, "grad_norm": 5.03125, "learning_rate": 1.355667465817584e-06, "loss": 1.2216230630874634, "step": 5284 }, { "epoch": 1.627010388611004, "grad_norm": 26.875, "learning_rate": 1.3545333672409605e-06, "loss": 1.1749513149261475, "step": 5286 }, { "epoch": 1.6276260100038478, "grad_norm": 32.0, "learning_rate": 1.353400903801414e-06, "loss": 1.466373324394226, "step": 5288 }, { "epoch": 1.628241631396691, "grad_norm": 10.8125, "learning_rate": 1.3522700766243e-06, "loss": 0.42502403259277344, "step": 5290 }, { "epoch": 1.6288572527895344, "grad_norm": 3.875, "learning_rate": 1.3511408868333453e-06, "loss": 1.2273374795913696, "step": 5292 }, { "epoch": 1.6294728741823779, "grad_norm": 8.25, "learning_rate": 1.3500133355506523e-06, "loss": 1.1918621063232422, "step": 5294 }, { "epoch": 1.6300884955752213, "grad_norm": 7.125, "learning_rate": 1.3488874238966931e-06, "loss": 1.6822189092636108, "step": 5296 }, { "epoch": 1.6307041169680647, "grad_norm": 4.65625, "learning_rate": 1.3477631529903124e-06, "loss": 1.1439008712768555, "step": 5298 }, { "epoch": 1.631319738360908, "grad_norm": 8.0, "learning_rate": 1.346640523948723e-06, "loss": 1.478432297706604, "step": 5300 }, { "epoch": 1.6319353597537516, "grad_norm": 2.296875, "learning_rate": 1.345519537887506e-06, "loss": 1.190623164176941, "step": 5302 }, { "epoch": 1.6325509811465948, "grad_norm": 6.3125, "learning_rate": 1.344400195920611e-06, "loss": 1.0302263498306274, "step": 5304 }, { "epoch": 1.6331666025394382, "grad_norm": 13.625, "learning_rate": 1.3432824991603525e-06, "loss": 1.1354777812957764, "step": 5306 }, { "epoch": 1.6337822239322817, "grad_norm": 10.75, "learning_rate": 1.3421664487174116e-06, "loss": 1.4184706211090088, "step": 5308 }, { "epoch": 1.6343978453251249, "grad_norm": 55.75, "learning_rate": 1.3410520457008325e-06, "loss": 1.549203634262085, "step": 5310 }, { "epoch": 1.6350134667179685, "grad_norm": 9.1875, "learning_rate": 1.3399392912180214e-06, "loss": 1.7178195714950562, "step": 5312 }, { "epoch": 1.6356290881108118, "grad_norm": 3.734375, "learning_rate": 1.3388281863747494e-06, "loss": 1.1633416414260864, "step": 5314 }, { "epoch": 1.6362447095036554, "grad_norm": 5.21875, "learning_rate": 1.3377187322751448e-06, "loss": 1.1372298002243042, "step": 5316 }, { "epoch": 1.6368603308964986, "grad_norm": 6.96875, "learning_rate": 1.336610930021697e-06, "loss": 1.0590790510177612, "step": 5318 }, { "epoch": 1.637475952289342, "grad_norm": 6.0, "learning_rate": 1.3355047807152543e-06, "loss": 1.332948088645935, "step": 5320 }, { "epoch": 1.6380915736821855, "grad_norm": 7.59375, "learning_rate": 1.3344002854550222e-06, "loss": 1.4869979619979858, "step": 5322 }, { "epoch": 1.6387071950750287, "grad_norm": 12.8125, "learning_rate": 1.3332974453385628e-06, "loss": 1.8726364374160767, "step": 5324 }, { "epoch": 1.6393228164678724, "grad_norm": 9.625, "learning_rate": 1.3321962614617914e-06, "loss": 1.3572361469268799, "step": 5326 }, { "epoch": 1.6399384378607156, "grad_norm": 4.5, "learning_rate": 1.3310967349189815e-06, "loss": 1.5230677127838135, "step": 5328 }, { "epoch": 1.640554059253559, "grad_norm": 5.90625, "learning_rate": 1.329998866802755e-06, "loss": 1.179467797279358, "step": 5330 }, { "epoch": 1.6411696806464025, "grad_norm": 9.6875, "learning_rate": 1.3289026582040892e-06, "loss": 1.527015209197998, "step": 5332 }, { "epoch": 1.6417853020392459, "grad_norm": 5.84375, "learning_rate": 1.3278081102123111e-06, "loss": 1.3052524328231812, "step": 5334 }, { "epoch": 1.6424009234320893, "grad_norm": 7.65625, "learning_rate": 1.3267152239150971e-06, "loss": 1.367884635925293, "step": 5336 }, { "epoch": 1.6430165448249325, "grad_norm": 5.03125, "learning_rate": 1.3256240003984736e-06, "loss": 1.2928811311721802, "step": 5338 }, { "epoch": 1.6436321662177762, "grad_norm": 5.34375, "learning_rate": 1.3245344407468133e-06, "loss": 1.3148117065429688, "step": 5340 }, { "epoch": 1.6442477876106194, "grad_norm": 6.03125, "learning_rate": 1.3234465460428363e-06, "loss": 1.475799798965454, "step": 5342 }, { "epoch": 1.6448634090034628, "grad_norm": 6.375, "learning_rate": 1.322360317367608e-06, "loss": 1.5288089513778687, "step": 5344 }, { "epoch": 1.6454790303963063, "grad_norm": 4.03125, "learning_rate": 1.3212757558005374e-06, "loss": 1.108746886253357, "step": 5346 }, { "epoch": 1.6460946517891497, "grad_norm": 6.25, "learning_rate": 1.3201928624193785e-06, "loss": 1.1718138456344604, "step": 5348 }, { "epoch": 1.6467102731819931, "grad_norm": 26.25, "learning_rate": 1.3191116383002265e-06, "loss": 1.495430588722229, "step": 5350 }, { "epoch": 1.6473258945748364, "grad_norm": 49.5, "learning_rate": 1.3180320845175181e-06, "loss": 1.5044770240783691, "step": 5352 }, { "epoch": 1.64794151596768, "grad_norm": 14.3125, "learning_rate": 1.31695420214403e-06, "loss": 1.3877954483032227, "step": 5354 }, { "epoch": 1.6485571373605232, "grad_norm": 9.8125, "learning_rate": 1.3158779922508782e-06, "loss": 1.5113141536712646, "step": 5356 }, { "epoch": 1.6491727587533667, "grad_norm": 7.0625, "learning_rate": 1.3148034559075169e-06, "loss": 1.6475886106491089, "step": 5358 }, { "epoch": 1.64978838014621, "grad_norm": 7.71875, "learning_rate": 1.3137305941817354e-06, "loss": 1.3991791009902954, "step": 5360 }, { "epoch": 1.6504040015390535, "grad_norm": 20.875, "learning_rate": 1.3126594081396627e-06, "loss": 1.1645764112472534, "step": 5362 }, { "epoch": 1.651019622931897, "grad_norm": 9.5, "learning_rate": 1.3115898988457586e-06, "loss": 1.6117132902145386, "step": 5364 }, { "epoch": 1.6516352443247402, "grad_norm": 12.5, "learning_rate": 1.3105220673628195e-06, "loss": 1.682780385017395, "step": 5366 }, { "epoch": 1.6522508657175838, "grad_norm": 28.5, "learning_rate": 1.3094559147519733e-06, "loss": 1.0650416612625122, "step": 5368 }, { "epoch": 1.652866487110427, "grad_norm": 13.9375, "learning_rate": 1.3083914420726787e-06, "loss": 1.3738309144973755, "step": 5370 }, { "epoch": 1.6534821085032705, "grad_norm": 7.46875, "learning_rate": 1.3073286503827275e-06, "loss": 1.0279654264450073, "step": 5372 }, { "epoch": 1.654097729896114, "grad_norm": 4.59375, "learning_rate": 1.3062675407382389e-06, "loss": 1.3116581439971924, "step": 5374 }, { "epoch": 1.6547133512889571, "grad_norm": 11.0, "learning_rate": 1.3052081141936618e-06, "loss": 1.4373891353607178, "step": 5376 }, { "epoch": 1.6553289726818008, "grad_norm": 16.625, "learning_rate": 1.3041503718017715e-06, "loss": 1.296805500984192, "step": 5378 }, { "epoch": 1.655944594074644, "grad_norm": 10.375, "learning_rate": 1.303094314613671e-06, "loss": 1.520833969116211, "step": 5380 }, { "epoch": 1.6565602154674874, "grad_norm": 6.25, "learning_rate": 1.3020399436787876e-06, "loss": 1.4238828420639038, "step": 5382 }, { "epoch": 1.6571758368603309, "grad_norm": 11.6875, "learning_rate": 1.3009872600448725e-06, "loss": 1.5053352117538452, "step": 5384 }, { "epoch": 1.6577914582531743, "grad_norm": 16.75, "learning_rate": 1.2999362647580027e-06, "loss": 1.2121310234069824, "step": 5386 }, { "epoch": 1.6584070796460177, "grad_norm": 4.53125, "learning_rate": 1.2988869588625746e-06, "loss": 1.0248559713363647, "step": 5388 }, { "epoch": 1.659022701038861, "grad_norm": 8.0, "learning_rate": 1.297839343401307e-06, "loss": 1.4401817321777344, "step": 5390 }, { "epoch": 1.6596383224317046, "grad_norm": 6.96875, "learning_rate": 1.2967934194152399e-06, "loss": 1.1867748498916626, "step": 5392 }, { "epoch": 1.6602539438245478, "grad_norm": 9.9375, "learning_rate": 1.2957491879437306e-06, "loss": 1.5059815645217896, "step": 5394 }, { "epoch": 1.6608695652173913, "grad_norm": 11.5, "learning_rate": 1.2947066500244554e-06, "loss": 1.5864746570587158, "step": 5396 }, { "epoch": 1.6614851866102347, "grad_norm": 10.125, "learning_rate": 1.2936658066934077e-06, "loss": 1.1161748170852661, "step": 5398 }, { "epoch": 1.6621008080030781, "grad_norm": 10.375, "learning_rate": 1.2926266589848965e-06, "loss": 1.196049690246582, "step": 5400 }, { "epoch": 1.6627164293959216, "grad_norm": 6.1875, "learning_rate": 1.2915892079315465e-06, "loss": 1.1869969367980957, "step": 5402 }, { "epoch": 1.6633320507887648, "grad_norm": 8.5625, "learning_rate": 1.2905534545642958e-06, "loss": 1.5340697765350342, "step": 5404 }, { "epoch": 1.6639476721816084, "grad_norm": 47.0, "learning_rate": 1.2895193999123966e-06, "loss": 1.421201467514038, "step": 5406 }, { "epoch": 1.6645632935744517, "grad_norm": 13.8125, "learning_rate": 1.2884870450034112e-06, "loss": 1.5346685647964478, "step": 5408 }, { "epoch": 1.665178914967295, "grad_norm": 4.875, "learning_rate": 1.2874563908632142e-06, "loss": 1.3476852178573608, "step": 5410 }, { "epoch": 1.6657945363601385, "grad_norm": 7.625, "learning_rate": 1.28642743851599e-06, "loss": 1.5519896745681763, "step": 5412 }, { "epoch": 1.666410157752982, "grad_norm": 6.15625, "learning_rate": 1.2854001889842305e-06, "loss": 1.3321746587753296, "step": 5414 }, { "epoch": 1.6670257791458254, "grad_norm": 6.40625, "learning_rate": 1.2843746432887382e-06, "loss": 1.2712730169296265, "step": 5416 }, { "epoch": 1.6676414005386686, "grad_norm": 5.34375, "learning_rate": 1.2833508024486197e-06, "loss": 1.4763667583465576, "step": 5418 }, { "epoch": 1.6682570219315123, "grad_norm": 11.8125, "learning_rate": 1.282328667481289e-06, "loss": 1.537577509880066, "step": 5420 }, { "epoch": 1.6688726433243555, "grad_norm": 23.25, "learning_rate": 1.2813082394024646e-06, "loss": 1.4748051166534424, "step": 5422 }, { "epoch": 1.669488264717199, "grad_norm": 3.9375, "learning_rate": 1.280289519226168e-06, "loss": 1.412636160850525, "step": 5424 }, { "epoch": 1.6701038861100423, "grad_norm": 11.1875, "learning_rate": 1.2792725079647253e-06, "loss": 0.8590376377105713, "step": 5426 }, { "epoch": 1.6707195075028856, "grad_norm": 8.1875, "learning_rate": 1.2782572066287626e-06, "loss": 1.628885269165039, "step": 5428 }, { "epoch": 1.6713351288957292, "grad_norm": 20.75, "learning_rate": 1.2772436162272084e-06, "loss": 1.5341277122497559, "step": 5430 }, { "epoch": 1.6719507502885724, "grad_norm": 13.75, "learning_rate": 1.2762317377672905e-06, "loss": 1.8415738344192505, "step": 5432 }, { "epoch": 1.672566371681416, "grad_norm": 3.0, "learning_rate": 1.2752215722545334e-06, "loss": 1.139654278755188, "step": 5434 }, { "epoch": 1.6731819930742593, "grad_norm": 5.5625, "learning_rate": 1.2742131206927624e-06, "loss": 1.3815727233886719, "step": 5436 }, { "epoch": 1.6737976144671027, "grad_norm": 7.34375, "learning_rate": 1.273206384084098e-06, "loss": 1.0991954803466797, "step": 5438 }, { "epoch": 1.6744132358599462, "grad_norm": 7.8125, "learning_rate": 1.2722013634289579e-06, "loss": 1.5331264734268188, "step": 5440 }, { "epoch": 1.6750288572527894, "grad_norm": 6.53125, "learning_rate": 1.2711980597260532e-06, "loss": 1.523708701133728, "step": 5442 }, { "epoch": 1.675644478645633, "grad_norm": 7.8125, "learning_rate": 1.2701964739723883e-06, "loss": 1.1231988668441772, "step": 5444 }, { "epoch": 1.6762601000384763, "grad_norm": 8.5, "learning_rate": 1.2691966071632634e-06, "loss": 0.7487651109695435, "step": 5446 }, { "epoch": 1.6768757214313197, "grad_norm": 10.9375, "learning_rate": 1.2681984602922659e-06, "loss": 0.8239330649375916, "step": 5448 }, { "epoch": 1.6774913428241631, "grad_norm": 8.125, "learning_rate": 1.2672020343512788e-06, "loss": 1.5421671867370605, "step": 5450 }, { "epoch": 1.6781069642170066, "grad_norm": 5.15625, "learning_rate": 1.2662073303304726e-06, "loss": 1.2927336692810059, "step": 5452 }, { "epoch": 1.67872258560985, "grad_norm": 17.0, "learning_rate": 1.265214349218306e-06, "loss": 0.6713177561759949, "step": 5454 }, { "epoch": 1.6793382070026932, "grad_norm": 9.25, "learning_rate": 1.2642230920015279e-06, "loss": 1.5233432054519653, "step": 5456 }, { "epoch": 1.6799538283955369, "grad_norm": 6.90625, "learning_rate": 1.2632335596651717e-06, "loss": 1.4733463525772095, "step": 5458 }, { "epoch": 1.68056944978838, "grad_norm": 14.4375, "learning_rate": 1.2622457531925586e-06, "loss": 1.2585396766662598, "step": 5460 }, { "epoch": 1.6811850711812235, "grad_norm": 15.5, "learning_rate": 1.2612596735652935e-06, "loss": 1.0226867198944092, "step": 5462 }, { "epoch": 1.681800692574067, "grad_norm": 5.875, "learning_rate": 1.2602753217632662e-06, "loss": 1.078691005706787, "step": 5464 }, { "epoch": 1.6824163139669104, "grad_norm": 5.5625, "learning_rate": 1.2592926987646492e-06, "loss": 1.2774691581726074, "step": 5466 }, { "epoch": 1.6830319353597538, "grad_norm": 16.625, "learning_rate": 1.2583118055458965e-06, "loss": 1.1871919631958008, "step": 5468 }, { "epoch": 1.683647556752597, "grad_norm": 5.46875, "learning_rate": 1.2573326430817443e-06, "loss": 1.133792519569397, "step": 5470 }, { "epoch": 1.6842631781454407, "grad_norm": 6.09375, "learning_rate": 1.256355212345208e-06, "loss": 1.3047640323638916, "step": 5472 }, { "epoch": 1.684878799538284, "grad_norm": 3.1875, "learning_rate": 1.2553795143075825e-06, "loss": 1.1482043266296387, "step": 5474 }, { "epoch": 1.6854944209311273, "grad_norm": 12.6875, "learning_rate": 1.2544055499384408e-06, "loss": 1.2934027910232544, "step": 5476 }, { "epoch": 1.6861100423239708, "grad_norm": 8.9375, "learning_rate": 1.2534333202056326e-06, "loss": 1.6211135387420654, "step": 5478 }, { "epoch": 1.6867256637168142, "grad_norm": 17.875, "learning_rate": 1.252462826075285e-06, "loss": 1.44912588596344, "step": 5480 }, { "epoch": 1.6873412851096576, "grad_norm": 9.1875, "learning_rate": 1.2514940685117996e-06, "loss": 1.7090988159179688, "step": 5482 }, { "epoch": 1.6879569065025009, "grad_norm": 17.625, "learning_rate": 1.2505270484778532e-06, "loss": 1.6906849145889282, "step": 5484 }, { "epoch": 1.6885725278953445, "grad_norm": 3.015625, "learning_rate": 1.2495617669343943e-06, "loss": 1.484596610069275, "step": 5486 }, { "epoch": 1.6891881492881877, "grad_norm": 16.75, "learning_rate": 1.2485982248406445e-06, "loss": 1.4228898286819458, "step": 5488 }, { "epoch": 1.6898037706810312, "grad_norm": 5.375, "learning_rate": 1.2476364231540982e-06, "loss": 1.5339558124542236, "step": 5490 }, { "epoch": 1.6904193920738746, "grad_norm": 5.875, "learning_rate": 1.2466763628305189e-06, "loss": 1.1721829175949097, "step": 5492 }, { "epoch": 1.6910350134667178, "grad_norm": 8.9375, "learning_rate": 1.24571804482394e-06, "loss": 1.454105257987976, "step": 5494 }, { "epoch": 1.6916506348595615, "grad_norm": 21.125, "learning_rate": 1.2447614700866639e-06, "loss": 1.5239249467849731, "step": 5496 }, { "epoch": 1.6922662562524047, "grad_norm": 14.5, "learning_rate": 1.24380663956926e-06, "loss": 1.422902226448059, "step": 5498 }, { "epoch": 1.6928818776452483, "grad_norm": 8.1875, "learning_rate": 1.2428535542205651e-06, "loss": 1.2538090944290161, "step": 5500 }, { "epoch": 1.6934974990380915, "grad_norm": 3.890625, "learning_rate": 1.2419022149876808e-06, "loss": 1.3582100868225098, "step": 5502 }, { "epoch": 1.694113120430935, "grad_norm": 9.375, "learning_rate": 1.240952622815975e-06, "loss": 0.9965105652809143, "step": 5504 }, { "epoch": 1.6947287418237784, "grad_norm": 3.578125, "learning_rate": 1.2400047786490783e-06, "loss": 1.1679785251617432, "step": 5506 }, { "epoch": 1.6953443632166216, "grad_norm": 10.625, "learning_rate": 1.2390586834288846e-06, "loss": 1.241326928138733, "step": 5508 }, { "epoch": 1.6959599846094653, "grad_norm": 2.953125, "learning_rate": 1.238114338095551e-06, "loss": 1.3023382425308228, "step": 5510 }, { "epoch": 1.6965756060023085, "grad_norm": 4.46875, "learning_rate": 1.2371717435874926e-06, "loss": 1.5654349327087402, "step": 5512 }, { "epoch": 1.697191227395152, "grad_norm": 7.625, "learning_rate": 1.2362309008413887e-06, "loss": 1.4605005979537964, "step": 5514 }, { "epoch": 1.6978068487879954, "grad_norm": 15.0, "learning_rate": 1.2352918107921744e-06, "loss": 1.203782081604004, "step": 5516 }, { "epoch": 1.6984224701808388, "grad_norm": 5.15625, "learning_rate": 1.2343544743730454e-06, "loss": 1.638678789138794, "step": 5518 }, { "epoch": 1.6990380915736822, "grad_norm": 2.578125, "learning_rate": 1.233418892515454e-06, "loss": 1.407549262046814, "step": 5520 }, { "epoch": 1.6996537129665255, "grad_norm": 5.59375, "learning_rate": 1.232485066149108e-06, "loss": 1.2579734325408936, "step": 5522 }, { "epoch": 1.7002693343593691, "grad_norm": 11.9375, "learning_rate": 1.2315529962019722e-06, "loss": 1.6120179891586304, "step": 5524 }, { "epoch": 1.7008849557522123, "grad_norm": 8.8125, "learning_rate": 1.230622683600265e-06, "loss": 1.0686167478561401, "step": 5526 }, { "epoch": 1.7015005771450558, "grad_norm": 6.53125, "learning_rate": 1.2296941292684595e-06, "loss": 1.3260496854782104, "step": 5528 }, { "epoch": 1.7021161985378992, "grad_norm": 9.0625, "learning_rate": 1.2287673341292808e-06, "loss": 1.1599143743515015, "step": 5530 }, { "epoch": 1.7027318199307426, "grad_norm": 7.96875, "learning_rate": 1.2278422991037051e-06, "loss": 1.1137622594833374, "step": 5532 }, { "epoch": 1.703347441323586, "grad_norm": 9.125, "learning_rate": 1.2269190251109619e-06, "loss": 1.0730335712432861, "step": 5534 }, { "epoch": 1.7039630627164293, "grad_norm": 6.0625, "learning_rate": 1.2259975130685285e-06, "loss": 1.2467234134674072, "step": 5536 }, { "epoch": 1.704578684109273, "grad_norm": 4.375, "learning_rate": 1.2250777638921318e-06, "loss": 1.2540618181228638, "step": 5538 }, { "epoch": 1.7051943055021161, "grad_norm": 3.890625, "learning_rate": 1.2241597784957477e-06, "loss": 1.0865514278411865, "step": 5540 }, { "epoch": 1.7058099268949596, "grad_norm": 5.6875, "learning_rate": 1.2232435577915982e-06, "loss": 1.032016634941101, "step": 5542 }, { "epoch": 1.706425548287803, "grad_norm": 18.75, "learning_rate": 1.2223291026901534e-06, "loss": 1.2673108577728271, "step": 5544 }, { "epoch": 1.7070411696806465, "grad_norm": 8.1875, "learning_rate": 1.2214164141001266e-06, "loss": 1.399294137954712, "step": 5546 }, { "epoch": 1.7076567910734899, "grad_norm": 9.5, "learning_rate": 1.2205054929284784e-06, "loss": 1.2184782028198242, "step": 5548 }, { "epoch": 1.708272412466333, "grad_norm": 16.0, "learning_rate": 1.21959634008041e-06, "loss": 1.0672837495803833, "step": 5550 }, { "epoch": 1.7088880338591768, "grad_norm": 6.1875, "learning_rate": 1.2186889564593678e-06, "loss": 1.352750301361084, "step": 5552 }, { "epoch": 1.70950365525202, "grad_norm": 25.625, "learning_rate": 1.2177833429670395e-06, "loss": 1.7680718898773193, "step": 5554 }, { "epoch": 1.7101192766448634, "grad_norm": 6.3125, "learning_rate": 1.2168795005033524e-06, "loss": 1.0429075956344604, "step": 5556 }, { "epoch": 1.7107348980377068, "grad_norm": 4.46875, "learning_rate": 1.2159774299664765e-06, "loss": 1.1620887517929077, "step": 5558 }, { "epoch": 1.71135051943055, "grad_norm": 9.125, "learning_rate": 1.2150771322528187e-06, "loss": 1.2178293466567993, "step": 5560 }, { "epoch": 1.7119661408233937, "grad_norm": 10.875, "learning_rate": 1.2141786082570248e-06, "loss": 1.3962485790252686, "step": 5562 }, { "epoch": 1.712581762216237, "grad_norm": 5.84375, "learning_rate": 1.2132818588719788e-06, "loss": 1.2660832405090332, "step": 5564 }, { "epoch": 1.7131973836090806, "grad_norm": 5.6875, "learning_rate": 1.2123868849888e-06, "loss": 1.2593755722045898, "step": 5566 }, { "epoch": 1.7138130050019238, "grad_norm": 6.46875, "learning_rate": 1.2114936874968452e-06, "loss": 1.3764986991882324, "step": 5568 }, { "epoch": 1.7144286263947672, "grad_norm": 7.28125, "learning_rate": 1.210602267283703e-06, "loss": 0.7025138139724731, "step": 5570 }, { "epoch": 1.7150442477876107, "grad_norm": 4.46875, "learning_rate": 1.2097126252351992e-06, "loss": 1.2333356142044067, "step": 5572 }, { "epoch": 1.7156598691804539, "grad_norm": 4.3125, "learning_rate": 1.2088247622353907e-06, "loss": 1.4705708026885986, "step": 5574 }, { "epoch": 1.7162754905732975, "grad_norm": 7.59375, "learning_rate": 1.2079386791665664e-06, "loss": 1.3183263540267944, "step": 5576 }, { "epoch": 1.7168911119661407, "grad_norm": 10.8125, "learning_rate": 1.2070543769092475e-06, "loss": 1.6780861616134644, "step": 5578 }, { "epoch": 1.7175067333589842, "grad_norm": 7.40625, "learning_rate": 1.206171856342184e-06, "loss": 1.2179884910583496, "step": 5580 }, { "epoch": 1.7181223547518276, "grad_norm": 8.375, "learning_rate": 1.205291118342357e-06, "loss": 1.4092509746551514, "step": 5582 }, { "epoch": 1.718737976144671, "grad_norm": 13.4375, "learning_rate": 1.2044121637849762e-06, "loss": 1.090433120727539, "step": 5584 }, { "epoch": 1.7193535975375145, "grad_norm": 7.65625, "learning_rate": 1.203534993543477e-06, "loss": 1.2665958404541016, "step": 5586 }, { "epoch": 1.7199692189303577, "grad_norm": 4.78125, "learning_rate": 1.202659608489525e-06, "loss": 1.3226650953292847, "step": 5588 }, { "epoch": 1.7205848403232014, "grad_norm": 4.34375, "learning_rate": 1.2017860094930084e-06, "loss": 0.938016414642334, "step": 5590 }, { "epoch": 1.7212004617160446, "grad_norm": 12.25, "learning_rate": 1.2009141974220428e-06, "loss": 1.8086051940917969, "step": 5592 }, { "epoch": 1.721816083108888, "grad_norm": 8.8125, "learning_rate": 1.2000441731429669e-06, "loss": 1.231706142425537, "step": 5594 }, { "epoch": 1.7224317045017314, "grad_norm": 3.15625, "learning_rate": 1.1991759375203437e-06, "loss": 1.162283182144165, "step": 5596 }, { "epoch": 1.7230473258945749, "grad_norm": 5.09375, "learning_rate": 1.1983094914169586e-06, "loss": 1.128656268119812, "step": 5598 }, { "epoch": 1.7236629472874183, "grad_norm": 30.875, "learning_rate": 1.197444835693818e-06, "loss": 1.6214772462844849, "step": 5600 }, { "epoch": 1.7242785686802615, "grad_norm": 11.375, "learning_rate": 1.19658197121015e-06, "loss": 1.2303175926208496, "step": 5602 }, { "epoch": 1.7248941900731052, "grad_norm": 12.25, "learning_rate": 1.1957208988234025e-06, "loss": 1.2540677785873413, "step": 5604 }, { "epoch": 1.7255098114659484, "grad_norm": 5.625, "learning_rate": 1.1948616193892421e-06, "loss": 1.3936173915863037, "step": 5606 }, { "epoch": 1.7261254328587918, "grad_norm": 11.8125, "learning_rate": 1.1940041337615544e-06, "loss": 1.5067126750946045, "step": 5608 }, { "epoch": 1.7267410542516353, "grad_norm": 7.5, "learning_rate": 1.1931484427924415e-06, "loss": 1.7533129453659058, "step": 5610 }, { "epoch": 1.7273566756444787, "grad_norm": 4.40625, "learning_rate": 1.1922945473322233e-06, "loss": 1.2284516096115112, "step": 5612 }, { "epoch": 1.7279722970373221, "grad_norm": 3.3125, "learning_rate": 1.1914424482294347e-06, "loss": 1.2963087558746338, "step": 5614 }, { "epoch": 1.7285879184301653, "grad_norm": 14.6875, "learning_rate": 1.1905921463308256e-06, "loss": 1.2415173053741455, "step": 5616 }, { "epoch": 1.729203539823009, "grad_norm": 16.875, "learning_rate": 1.18974364248136e-06, "loss": 1.656542420387268, "step": 5618 }, { "epoch": 1.7298191612158522, "grad_norm": 8.125, "learning_rate": 1.1888969375242153e-06, "loss": 1.4199869632720947, "step": 5620 }, { "epoch": 1.7304347826086957, "grad_norm": 7.375, "learning_rate": 1.1880520323007811e-06, "loss": 1.509156346321106, "step": 5622 }, { "epoch": 1.731050404001539, "grad_norm": 4.75, "learning_rate": 1.1872089276506584e-06, "loss": 1.424713373184204, "step": 5624 }, { "epoch": 1.7316660253943823, "grad_norm": 11.8125, "learning_rate": 1.1863676244116604e-06, "loss": 1.584140419960022, "step": 5626 }, { "epoch": 1.732281646787226, "grad_norm": 7.78125, "learning_rate": 1.1855281234198073e-06, "loss": 1.5721691846847534, "step": 5628 }, { "epoch": 1.7328972681800692, "grad_norm": 23.25, "learning_rate": 1.1846904255093312e-06, "loss": 1.642784595489502, "step": 5630 }, { "epoch": 1.7335128895729126, "grad_norm": 7.875, "learning_rate": 1.183854531512671e-06, "loss": 1.2797009944915771, "step": 5632 }, { "epoch": 1.734128510965756, "grad_norm": 2.375, "learning_rate": 1.1830204422604728e-06, "loss": 1.0580322742462158, "step": 5634 }, { "epoch": 1.7347441323585995, "grad_norm": 6.375, "learning_rate": 1.1821881585815907e-06, "loss": 1.4140422344207764, "step": 5636 }, { "epoch": 1.735359753751443, "grad_norm": 6.15625, "learning_rate": 1.1813576813030831e-06, "loss": 1.3851627111434937, "step": 5638 }, { "epoch": 1.7359753751442861, "grad_norm": 5.4375, "learning_rate": 1.180529011250214e-06, "loss": 1.2441668510437012, "step": 5640 }, { "epoch": 1.7365909965371298, "grad_norm": 10.875, "learning_rate": 1.1797021492464514e-06, "loss": 1.362105369567871, "step": 5642 }, { "epoch": 1.737206617929973, "grad_norm": 7.28125, "learning_rate": 1.1788770961134662e-06, "loss": 1.356371283531189, "step": 5644 }, { "epoch": 1.7378222393228164, "grad_norm": 21.625, "learning_rate": 1.1780538526711329e-06, "loss": 0.9809825420379639, "step": 5646 }, { "epoch": 1.7384378607156599, "grad_norm": 15.4375, "learning_rate": 1.1772324197375267e-06, "loss": 1.4957363605499268, "step": 5648 }, { "epoch": 1.7390534821085033, "grad_norm": 5.59375, "learning_rate": 1.1764127981289234e-06, "loss": 1.2225853204727173, "step": 5650 }, { "epoch": 1.7396691035013467, "grad_norm": 14.625, "learning_rate": 1.1755949886598006e-06, "loss": 1.3869491815567017, "step": 5652 }, { "epoch": 1.74028472489419, "grad_norm": 4.96875, "learning_rate": 1.1747789921428324e-06, "loss": 1.2316184043884277, "step": 5654 }, { "epoch": 1.7409003462870336, "grad_norm": 6.8125, "learning_rate": 1.1739648093888938e-06, "loss": 1.3992987871170044, "step": 5656 }, { "epoch": 1.7415159676798768, "grad_norm": 11.1875, "learning_rate": 1.1731524412070562e-06, "loss": 1.2253878116607666, "step": 5658 }, { "epoch": 1.7421315890727203, "grad_norm": 7.84375, "learning_rate": 1.1723418884045881e-06, "loss": 1.2958788871765137, "step": 5660 }, { "epoch": 1.7427472104655637, "grad_norm": 5.15625, "learning_rate": 1.171533151786954e-06, "loss": 1.0208817720413208, "step": 5662 }, { "epoch": 1.7433628318584071, "grad_norm": 9.375, "learning_rate": 1.1707262321578134e-06, "loss": 1.4322336912155151, "step": 5664 }, { "epoch": 1.7439784532512506, "grad_norm": 18.25, "learning_rate": 1.1699211303190212e-06, "loss": 1.5269854068756104, "step": 5666 }, { "epoch": 1.7445940746440938, "grad_norm": 5.59375, "learning_rate": 1.169117847070624e-06, "loss": 1.094125509262085, "step": 5668 }, { "epoch": 1.7452096960369374, "grad_norm": 5.3125, "learning_rate": 1.1683163832108626e-06, "loss": 1.3716709613800049, "step": 5670 }, { "epoch": 1.7458253174297806, "grad_norm": 7.8125, "learning_rate": 1.1675167395361705e-06, "loss": 1.190974473953247, "step": 5672 }, { "epoch": 1.746440938822624, "grad_norm": 10.8125, "learning_rate": 1.1667189168411706e-06, "loss": 1.0392242670059204, "step": 5674 }, { "epoch": 1.7470565602154675, "grad_norm": 14.25, "learning_rate": 1.1659229159186779e-06, "loss": 1.2160130739212036, "step": 5676 }, { "epoch": 1.7476721816083107, "grad_norm": 7.90625, "learning_rate": 1.165128737559696e-06, "loss": 0.7107000946998596, "step": 5678 }, { "epoch": 1.7482878030011544, "grad_norm": 4.03125, "learning_rate": 1.1643363825534173e-06, "loss": 1.1767587661743164, "step": 5680 }, { "epoch": 1.7489034243939976, "grad_norm": 7.6875, "learning_rate": 1.1635458516872234e-06, "loss": 1.2937829494476318, "step": 5682 }, { "epoch": 1.7495190457868413, "grad_norm": 10.1875, "learning_rate": 1.1627571457466824e-06, "loss": 1.3708268404006958, "step": 5684 }, { "epoch": 1.7501346671796845, "grad_norm": 6.53125, "learning_rate": 1.161970265515549e-06, "loss": 1.3696799278259277, "step": 5686 }, { "epoch": 1.750750288572528, "grad_norm": 3.953125, "learning_rate": 1.1611852117757634e-06, "loss": 1.3666961193084717, "step": 5688 }, { "epoch": 1.7513659099653713, "grad_norm": 4.625, "learning_rate": 1.1604019853074518e-06, "loss": 1.1216015815734863, "step": 5690 }, { "epoch": 1.7519815313582145, "grad_norm": 16.125, "learning_rate": 1.1596205868889238e-06, "loss": 1.474446177482605, "step": 5692 }, { "epoch": 1.7525971527510582, "grad_norm": 6.0, "learning_rate": 1.1588410172966719e-06, "loss": 1.3140782117843628, "step": 5694 }, { "epoch": 1.7532127741439014, "grad_norm": 8.75, "learning_rate": 1.1580632773053722e-06, "loss": 1.2671130895614624, "step": 5696 }, { "epoch": 1.7538283955367449, "grad_norm": 9.8125, "learning_rate": 1.1572873676878822e-06, "loss": 1.5012891292572021, "step": 5698 }, { "epoch": 1.7544440169295883, "grad_norm": 3.046875, "learning_rate": 1.156513289215241e-06, "loss": 0.9581146240234375, "step": 5700 }, { "epoch": 1.7550596383224317, "grad_norm": 6.25, "learning_rate": 1.1557410426566678e-06, "loss": 1.2958364486694336, "step": 5702 }, { "epoch": 1.7556752597152752, "grad_norm": 6.90625, "learning_rate": 1.154970628779561e-06, "loss": 1.6476054191589355, "step": 5704 }, { "epoch": 1.7562908811081184, "grad_norm": 9.6875, "learning_rate": 1.1542020483494982e-06, "loss": 1.3589799404144287, "step": 5706 }, { "epoch": 1.756906502500962, "grad_norm": 7.0625, "learning_rate": 1.1534353021302347e-06, "loss": 1.2966574430465698, "step": 5708 }, { "epoch": 1.7575221238938052, "grad_norm": 8.1875, "learning_rate": 1.1526703908837043e-06, "loss": 1.4398329257965088, "step": 5710 }, { "epoch": 1.7581377452866487, "grad_norm": 11.125, "learning_rate": 1.1519073153700156e-06, "loss": 1.3965197801589966, "step": 5712 }, { "epoch": 1.7587533666794921, "grad_norm": 3.03125, "learning_rate": 1.1511460763474543e-06, "loss": 1.247341275215149, "step": 5714 }, { "epoch": 1.7593689880723355, "grad_norm": 5.9375, "learning_rate": 1.1503866745724807e-06, "loss": 1.4572646617889404, "step": 5716 }, { "epoch": 1.759984609465179, "grad_norm": 2.609375, "learning_rate": 1.1496291107997288e-06, "loss": 1.1606372594833374, "step": 5718 }, { "epoch": 1.7606002308580222, "grad_norm": 4.3125, "learning_rate": 1.1488733857820073e-06, "loss": 1.1627566814422607, "step": 5720 }, { "epoch": 1.7612158522508659, "grad_norm": 8.75, "learning_rate": 1.1481195002702968e-06, "loss": 1.7606632709503174, "step": 5722 }, { "epoch": 1.761831473643709, "grad_norm": 10.75, "learning_rate": 1.1473674550137503e-06, "loss": 1.6082324981689453, "step": 5724 }, { "epoch": 1.7624470950365525, "grad_norm": 5.53125, "learning_rate": 1.1466172507596923e-06, "loss": 1.4290021657943726, "step": 5726 }, { "epoch": 1.763062716429396, "grad_norm": 8.5, "learning_rate": 1.1458688882536168e-06, "loss": 1.3907338380813599, "step": 5728 }, { "epoch": 1.7636783378222394, "grad_norm": 12.5, "learning_rate": 1.14512236823919e-06, "loss": 1.2802016735076904, "step": 5730 }, { "epoch": 1.7642939592150828, "grad_norm": 8.0, "learning_rate": 1.1443776914582434e-06, "loss": 1.526460886001587, "step": 5732 }, { "epoch": 1.764909580607926, "grad_norm": 5.84375, "learning_rate": 1.1436348586507807e-06, "loss": 1.4908353090286255, "step": 5734 }, { "epoch": 1.7655252020007697, "grad_norm": 4.3125, "learning_rate": 1.1428938705549704e-06, "loss": 1.1730027198791504, "step": 5736 }, { "epoch": 1.766140823393613, "grad_norm": 5.96875, "learning_rate": 1.1421547279071499e-06, "loss": 1.3278155326843262, "step": 5738 }, { "epoch": 1.7667564447864563, "grad_norm": 7.59375, "learning_rate": 1.1414174314418217e-06, "loss": 1.5488226413726807, "step": 5740 }, { "epoch": 1.7673720661792998, "grad_norm": 11.4375, "learning_rate": 1.1406819818916533e-06, "loss": 0.7858749032020569, "step": 5742 }, { "epoch": 1.767987687572143, "grad_norm": 9.125, "learning_rate": 1.1399483799874777e-06, "loss": 1.0775755643844604, "step": 5744 }, { "epoch": 1.7686033089649866, "grad_norm": 6.25, "learning_rate": 1.139216626458291e-06, "loss": 1.5314750671386719, "step": 5746 }, { "epoch": 1.7692189303578298, "grad_norm": 6.28125, "learning_rate": 1.1384867220312541e-06, "loss": 1.385054588317871, "step": 5748 }, { "epoch": 1.7698345517506735, "grad_norm": 15.25, "learning_rate": 1.1377586674316887e-06, "loss": 1.2082306146621704, "step": 5750 }, { "epoch": 1.7704501731435167, "grad_norm": 5.28125, "learning_rate": 1.137032463383079e-06, "loss": 1.5090394020080566, "step": 5752 }, { "epoch": 1.7710657945363601, "grad_norm": 5.8125, "learning_rate": 1.1363081106070709e-06, "loss": 1.1624975204467773, "step": 5754 }, { "epoch": 1.7716814159292036, "grad_norm": 8.6875, "learning_rate": 1.1355856098234693e-06, "loss": 1.2231210470199585, "step": 5756 }, { "epoch": 1.7722970373220468, "grad_norm": 2.359375, "learning_rate": 1.1348649617502395e-06, "loss": 1.3029277324676514, "step": 5758 }, { "epoch": 1.7729126587148905, "grad_norm": 15.875, "learning_rate": 1.1341461671035059e-06, "loss": 1.8060013055801392, "step": 5760 }, { "epoch": 1.7735282801077337, "grad_norm": 7.96875, "learning_rate": 1.1334292265975506e-06, "loss": 1.6648659706115723, "step": 5762 }, { "epoch": 1.774143901500577, "grad_norm": 4.625, "learning_rate": 1.1327141409448134e-06, "loss": 1.1244107484817505, "step": 5764 }, { "epoch": 1.7747595228934205, "grad_norm": 5.0, "learning_rate": 1.132000910855891e-06, "loss": 1.087131381034851, "step": 5766 }, { "epoch": 1.775375144286264, "grad_norm": 11.75, "learning_rate": 1.131289537039537e-06, "loss": 1.270734429359436, "step": 5768 }, { "epoch": 1.7759907656791074, "grad_norm": 14.875, "learning_rate": 1.1305800202026581e-06, "loss": 1.528784990310669, "step": 5770 }, { "epoch": 1.7766063870719506, "grad_norm": 7.25, "learning_rate": 1.1298723610503178e-06, "loss": 1.3354603052139282, "step": 5772 }, { "epoch": 1.7772220084647943, "grad_norm": 9.375, "learning_rate": 1.129166560285733e-06, "loss": 1.3287979364395142, "step": 5774 }, { "epoch": 1.7778376298576375, "grad_norm": 7.46875, "learning_rate": 1.1284626186102733e-06, "loss": 1.1215649843215942, "step": 5776 }, { "epoch": 1.778453251250481, "grad_norm": 7.21875, "learning_rate": 1.1277605367234617e-06, "loss": 1.456329107284546, "step": 5778 }, { "epoch": 1.7790688726433244, "grad_norm": 3.828125, "learning_rate": 1.1270603153229725e-06, "loss": 1.1575416326522827, "step": 5780 }, { "epoch": 1.7796844940361678, "grad_norm": 5.90625, "learning_rate": 1.1263619551046315e-06, "loss": 1.1512196063995361, "step": 5782 }, { "epoch": 1.7803001154290112, "grad_norm": 3.84375, "learning_rate": 1.1256654567624151e-06, "loss": 1.005980134010315, "step": 5784 }, { "epoch": 1.7809157368218544, "grad_norm": 5.1875, "learning_rate": 1.1249708209884485e-06, "loss": 1.1128065586090088, "step": 5786 }, { "epoch": 1.781531358214698, "grad_norm": 4.96875, "learning_rate": 1.124278048473008e-06, "loss": 1.2213716506958008, "step": 5788 }, { "epoch": 1.7821469796075413, "grad_norm": 4.375, "learning_rate": 1.1235871399045157e-06, "loss": 1.0188642740249634, "step": 5790 }, { "epoch": 1.7827626010003848, "grad_norm": 3.03125, "learning_rate": 1.1228980959695447e-06, "loss": 1.0099231004714966, "step": 5792 }, { "epoch": 1.7833782223932282, "grad_norm": 7.59375, "learning_rate": 1.1222109173528127e-06, "loss": 1.320572018623352, "step": 5794 }, { "epoch": 1.7839938437860716, "grad_norm": 13.3125, "learning_rate": 1.1215256047371837e-06, "loss": 1.3446568250656128, "step": 5796 }, { "epoch": 1.784609465178915, "grad_norm": 5.53125, "learning_rate": 1.120842158803669e-06, "loss": 1.2886813879013062, "step": 5798 }, { "epoch": 1.7852250865717583, "grad_norm": 7.28125, "learning_rate": 1.120160580231424e-06, "loss": 1.3227756023406982, "step": 5800 }, { "epoch": 1.785840707964602, "grad_norm": 5.09375, "learning_rate": 1.1194808696977487e-06, "loss": 1.3050503730773926, "step": 5802 }, { "epoch": 1.7864563293574451, "grad_norm": 16.375, "learning_rate": 1.1188030278780867e-06, "loss": 1.1649458408355713, "step": 5804 }, { "epoch": 1.7870719507502886, "grad_norm": 5.75, "learning_rate": 1.118127055446024e-06, "loss": 1.4305706024169922, "step": 5806 }, { "epoch": 1.787687572143132, "grad_norm": 5.6875, "learning_rate": 1.1174529530732908e-06, "loss": 1.34320867061615, "step": 5808 }, { "epoch": 1.7883031935359752, "grad_norm": 4.65625, "learning_rate": 1.1167807214297562e-06, "loss": 1.3574994802474976, "step": 5810 }, { "epoch": 1.7889188149288189, "grad_norm": 3.25, "learning_rate": 1.116110361183433e-06, "loss": 1.2846295833587646, "step": 5812 }, { "epoch": 1.789534436321662, "grad_norm": 2.8125, "learning_rate": 1.115441873000473e-06, "loss": 1.1642603874206543, "step": 5814 }, { "epoch": 1.7901500577145055, "grad_norm": 13.125, "learning_rate": 1.1147752575451674e-06, "loss": 1.3659430742263794, "step": 5816 }, { "epoch": 1.790765679107349, "grad_norm": 23.5, "learning_rate": 1.1141105154799475e-06, "loss": 1.0769296884536743, "step": 5818 }, { "epoch": 1.7913813005001924, "grad_norm": 5.28125, "learning_rate": 1.113447647465382e-06, "loss": 1.4280297756195068, "step": 5820 }, { "epoch": 1.7919969218930358, "grad_norm": 9.1875, "learning_rate": 1.112786654160178e-06, "loss": 1.4500038623809814, "step": 5822 }, { "epoch": 1.792612543285879, "grad_norm": 95.0, "learning_rate": 1.112127536221179e-06, "loss": 1.5453027486801147, "step": 5824 }, { "epoch": 1.7932281646787227, "grad_norm": 3.5, "learning_rate": 1.1114702943033656e-06, "loss": 1.2905791997909546, "step": 5826 }, { "epoch": 1.793843786071566, "grad_norm": 2.6875, "learning_rate": 1.1108149290598537e-06, "loss": 1.0351507663726807, "step": 5828 }, { "epoch": 1.7944594074644094, "grad_norm": 6.21875, "learning_rate": 1.1101614411418945e-06, "loss": 1.0793569087982178, "step": 5830 }, { "epoch": 1.7950750288572528, "grad_norm": 4.03125, "learning_rate": 1.1095098311988735e-06, "loss": 1.2975974082946777, "step": 5832 }, { "epoch": 1.7956906502500962, "grad_norm": 10.25, "learning_rate": 1.1088600998783101e-06, "loss": 1.3448054790496826, "step": 5834 }, { "epoch": 1.7963062716429397, "grad_norm": 6.75, "learning_rate": 1.1082122478258572e-06, "loss": 1.578335165977478, "step": 5836 }, { "epoch": 1.7969218930357829, "grad_norm": 8.5625, "learning_rate": 1.1075662756852996e-06, "loss": 1.236875295639038, "step": 5838 }, { "epoch": 1.7975375144286265, "grad_norm": 5.71875, "learning_rate": 1.106922184098554e-06, "loss": 1.1941136121749878, "step": 5840 }, { "epoch": 1.7981531358214697, "grad_norm": 6.53125, "learning_rate": 1.106279973705669e-06, "loss": 1.266106367111206, "step": 5842 }, { "epoch": 1.7987687572143132, "grad_norm": 56.5, "learning_rate": 1.1056396451448233e-06, "loss": 1.415964126586914, "step": 5844 }, { "epoch": 1.7993843786071566, "grad_norm": 6.28125, "learning_rate": 1.1050011990523263e-06, "loss": 1.287773609161377, "step": 5846 }, { "epoch": 1.8, "grad_norm": 7.84375, "learning_rate": 1.1043646360626159e-06, "loss": 1.3969554901123047, "step": 5848 }, { "epoch": 1.8006156213928435, "grad_norm": 5.40625, "learning_rate": 1.1037299568082583e-06, "loss": 1.0966699123382568, "step": 5850 }, { "epoch": 1.8012312427856867, "grad_norm": 5.03125, "learning_rate": 1.1030971619199496e-06, "loss": 1.239692211151123, "step": 5852 }, { "epoch": 1.8018468641785303, "grad_norm": 10.5625, "learning_rate": 1.1024662520265113e-06, "loss": 0.9717188477516174, "step": 5854 }, { "epoch": 1.8024624855713736, "grad_norm": 11.9375, "learning_rate": 1.1018372277548934e-06, "loss": 1.3267759084701538, "step": 5856 }, { "epoch": 1.803078106964217, "grad_norm": 7.5, "learning_rate": 1.1012100897301712e-06, "loss": 1.3745042085647583, "step": 5858 }, { "epoch": 1.8036937283570604, "grad_norm": 5.65625, "learning_rate": 1.1005848385755457e-06, "loss": 1.1953214406967163, "step": 5860 }, { "epoch": 1.8043093497499036, "grad_norm": 9.6875, "learning_rate": 1.0999614749123433e-06, "loss": 0.9452353715896606, "step": 5862 }, { "epoch": 1.8049249711427473, "grad_norm": 12.1875, "learning_rate": 1.099339999360014e-06, "loss": 0.9445773363113403, "step": 5864 }, { "epoch": 1.8055405925355905, "grad_norm": 10.375, "learning_rate": 1.0987204125361324e-06, "loss": 1.274017095565796, "step": 5866 }, { "epoch": 1.8061562139284342, "grad_norm": 9.0, "learning_rate": 1.0981027150563954e-06, "loss": 1.5239988565444946, "step": 5868 }, { "epoch": 1.8067718353212774, "grad_norm": 13.5, "learning_rate": 1.0974869075346228e-06, "loss": 0.7201202511787415, "step": 5870 }, { "epoch": 1.8073874567141208, "grad_norm": 6.65625, "learning_rate": 1.0968729905827575e-06, "loss": 1.2459853887557983, "step": 5872 }, { "epoch": 1.8080030781069643, "grad_norm": 3.953125, "learning_rate": 1.0962609648108607e-06, "loss": 1.2178658246994019, "step": 5874 }, { "epoch": 1.8086186994998075, "grad_norm": 6.46875, "learning_rate": 1.0956508308271174e-06, "loss": 1.0787028074264526, "step": 5876 }, { "epoch": 1.8092343208926511, "grad_norm": 8.625, "learning_rate": 1.0950425892378309e-06, "loss": 1.3741106986999512, "step": 5878 }, { "epoch": 1.8098499422854943, "grad_norm": 3.515625, "learning_rate": 1.094436240647425e-06, "loss": 0.9369704127311707, "step": 5880 }, { "epoch": 1.8104655636783378, "grad_norm": 2.4375, "learning_rate": 1.0938317856584415e-06, "loss": 1.1248363256454468, "step": 5882 }, { "epoch": 1.8110811850711812, "grad_norm": 10.75, "learning_rate": 1.0932292248715407e-06, "loss": 1.2924288511276245, "step": 5884 }, { "epoch": 1.8116968064640246, "grad_norm": 8.25, "learning_rate": 1.0926285588855016e-06, "loss": 1.328222632408142, "step": 5886 }, { "epoch": 1.812312427856868, "grad_norm": 11.25, "learning_rate": 1.0920297882972183e-06, "loss": 1.3337467908859253, "step": 5888 }, { "epoch": 1.8129280492497113, "grad_norm": 9.625, "learning_rate": 1.0914329137017032e-06, "loss": 1.3814985752105713, "step": 5890 }, { "epoch": 1.813543670642555, "grad_norm": 3.140625, "learning_rate": 1.0908379356920838e-06, "loss": 1.2589805126190186, "step": 5892 }, { "epoch": 1.8141592920353982, "grad_norm": 6.375, "learning_rate": 1.0902448548596034e-06, "loss": 1.3981999158859253, "step": 5894 }, { "epoch": 1.8147749134282416, "grad_norm": 5.75, "learning_rate": 1.089653671793619e-06, "loss": 1.116858959197998, "step": 5896 }, { "epoch": 1.815390534821085, "grad_norm": 10.3125, "learning_rate": 1.0890643870816033e-06, "loss": 1.2711323499679565, "step": 5898 }, { "epoch": 1.8160061562139285, "grad_norm": 5.0, "learning_rate": 1.0884770013091416e-06, "loss": 1.4036121368408203, "step": 5900 }, { "epoch": 1.816621777606772, "grad_norm": 14.3125, "learning_rate": 1.0878915150599318e-06, "loss": 1.3301712274551392, "step": 5902 }, { "epoch": 1.8172373989996151, "grad_norm": 5.5, "learning_rate": 1.087307928915785e-06, "loss": 1.4109411239624023, "step": 5904 }, { "epoch": 1.8178530203924588, "grad_norm": 10.8125, "learning_rate": 1.0867262434566237e-06, "loss": 1.1447744369506836, "step": 5906 }, { "epoch": 1.818468641785302, "grad_norm": 8.6875, "learning_rate": 1.0861464592604818e-06, "loss": 1.2968274354934692, "step": 5908 }, { "epoch": 1.8190842631781454, "grad_norm": 4.71875, "learning_rate": 1.0855685769035044e-06, "loss": 1.3239787817001343, "step": 5910 }, { "epoch": 1.8196998845709889, "grad_norm": 4.84375, "learning_rate": 1.0849925969599454e-06, "loss": 1.3455312252044678, "step": 5912 }, { "epoch": 1.8203155059638323, "grad_norm": 6.375, "learning_rate": 1.0844185200021695e-06, "loss": 1.0734500885009766, "step": 5914 }, { "epoch": 1.8209311273566757, "grad_norm": 24.375, "learning_rate": 1.0838463466006496e-06, "loss": 0.9379870891571045, "step": 5916 }, { "epoch": 1.821546748749519, "grad_norm": 6.0, "learning_rate": 1.0832760773239668e-06, "loss": 1.3134609460830688, "step": 5918 }, { "epoch": 1.8221623701423626, "grad_norm": 5.4375, "learning_rate": 1.0827077127388114e-06, "loss": 1.3826963901519775, "step": 5920 }, { "epoch": 1.8227779915352058, "grad_norm": 7.21875, "learning_rate": 1.0821412534099794e-06, "loss": 0.8602036237716675, "step": 5922 }, { "epoch": 1.8233936129280492, "grad_norm": 7.90625, "learning_rate": 1.0815766999003744e-06, "loss": 1.6724231243133545, "step": 5924 }, { "epoch": 1.8240092343208927, "grad_norm": 9.25, "learning_rate": 1.0810140527710057e-06, "loss": 1.274658203125, "step": 5926 }, { "epoch": 1.824624855713736, "grad_norm": 12.375, "learning_rate": 1.0804533125809879e-06, "loss": 1.0407257080078125, "step": 5928 }, { "epoch": 1.8252404771065796, "grad_norm": 4.71875, "learning_rate": 1.0798944798875425e-06, "loss": 1.1220669746398926, "step": 5930 }, { "epoch": 1.8258560984994228, "grad_norm": 7.3125, "learning_rate": 1.0793375552459925e-06, "loss": 1.2940001487731934, "step": 5932 }, { "epoch": 1.8264717198922664, "grad_norm": 7.0625, "learning_rate": 1.0787825392097673e-06, "loss": 1.3032958507537842, "step": 5934 }, { "epoch": 1.8270873412851096, "grad_norm": 5.21875, "learning_rate": 1.0782294323303987e-06, "loss": 1.4309355020523071, "step": 5936 }, { "epoch": 1.827702962677953, "grad_norm": 4.875, "learning_rate": 1.0776782351575212e-06, "loss": 1.062288761138916, "step": 5938 }, { "epoch": 1.8283185840707965, "grad_norm": 5.03125, "learning_rate": 1.077128948238872e-06, "loss": 1.4909385442733765, "step": 5940 }, { "epoch": 1.8289342054636397, "grad_norm": 18.375, "learning_rate": 1.0765815721202895e-06, "loss": 1.1382215023040771, "step": 5942 }, { "epoch": 1.8295498268564834, "grad_norm": 8.0, "learning_rate": 1.076036107345714e-06, "loss": 1.5346736907958984, "step": 5944 }, { "epoch": 1.8301654482493266, "grad_norm": 2.484375, "learning_rate": 1.0754925544571858e-06, "loss": 1.2139662504196167, "step": 5946 }, { "epoch": 1.83078106964217, "grad_norm": 7.96875, "learning_rate": 1.0749509139948455e-06, "loss": 1.3137943744659424, "step": 5948 }, { "epoch": 1.8313966910350135, "grad_norm": 10.625, "learning_rate": 1.074411186496934e-06, "loss": 1.3383773565292358, "step": 5950 }, { "epoch": 1.832012312427857, "grad_norm": 7.75, "learning_rate": 1.0738733724997896e-06, "loss": 1.2972408533096313, "step": 5952 }, { "epoch": 1.8326279338207003, "grad_norm": 5.25, "learning_rate": 1.0733374725378508e-06, "loss": 1.153618574142456, "step": 5954 }, { "epoch": 1.8332435552135435, "grad_norm": 13.5625, "learning_rate": 1.0728034871436536e-06, "loss": 1.384376883506775, "step": 5956 }, { "epoch": 1.8338591766063872, "grad_norm": 6.5, "learning_rate": 1.0722714168478306e-06, "loss": 1.6453121900558472, "step": 5958 }, { "epoch": 1.8344747979992304, "grad_norm": 9.3125, "learning_rate": 1.0717412621791123e-06, "loss": 1.43854558467865, "step": 5960 }, { "epoch": 1.8350904193920738, "grad_norm": 2.0, "learning_rate": 1.0712130236643257e-06, "loss": 1.280350685119629, "step": 5962 }, { "epoch": 1.8357060407849173, "grad_norm": 10.5625, "learning_rate": 1.0706867018283924e-06, "loss": 0.9778401255607605, "step": 5964 }, { "epoch": 1.8363216621777607, "grad_norm": 6.375, "learning_rate": 1.070162297194331e-06, "loss": 1.431647539138794, "step": 5966 }, { "epoch": 1.8369372835706042, "grad_norm": 12.375, "learning_rate": 1.0696398102832534e-06, "loss": 1.6630594730377197, "step": 5968 }, { "epoch": 1.8375529049634474, "grad_norm": 6.4375, "learning_rate": 1.0691192416143673e-06, "loss": 1.4239740371704102, "step": 5970 }, { "epoch": 1.838168526356291, "grad_norm": 6.21875, "learning_rate": 1.068600591704973e-06, "loss": 1.1576460599899292, "step": 5972 }, { "epoch": 1.8387841477491342, "grad_norm": 6.21875, "learning_rate": 1.0680838610704645e-06, "loss": 1.4384702444076538, "step": 5974 }, { "epoch": 1.8393997691419777, "grad_norm": 2.5, "learning_rate": 1.0675690502243288e-06, "loss": 1.1146204471588135, "step": 5976 }, { "epoch": 1.840015390534821, "grad_norm": 8.6875, "learning_rate": 1.0670561596781454e-06, "loss": 1.3305184841156006, "step": 5978 }, { "epoch": 1.8406310119276645, "grad_norm": 8.1875, "learning_rate": 1.0665451899415843e-06, "loss": 1.0790116786956787, "step": 5980 }, { "epoch": 1.841246633320508, "grad_norm": 7.75, "learning_rate": 1.0660361415224077e-06, "loss": 1.4586706161499023, "step": 5982 }, { "epoch": 1.8418622547133512, "grad_norm": 6.65625, "learning_rate": 1.0655290149264688e-06, "loss": 1.733391523361206, "step": 5984 }, { "epoch": 1.8424778761061948, "grad_norm": 15.625, "learning_rate": 1.0650238106577104e-06, "loss": 1.6308709383010864, "step": 5986 }, { "epoch": 1.843093497499038, "grad_norm": 11.875, "learning_rate": 1.0645205292181662e-06, "loss": 1.5266309976577759, "step": 5988 }, { "epoch": 1.8437091188918815, "grad_norm": 14.875, "learning_rate": 1.0640191711079568e-06, "loss": 1.3158437013626099, "step": 5990 }, { "epoch": 1.844324740284725, "grad_norm": 6.96875, "learning_rate": 1.063519736825294e-06, "loss": 1.5622031688690186, "step": 5992 }, { "epoch": 1.8449403616775681, "grad_norm": 4.5625, "learning_rate": 1.0630222268664764e-06, "loss": 1.5937286615371704, "step": 5994 }, { "epoch": 1.8455559830704118, "grad_norm": 12.5, "learning_rate": 1.062526641725891e-06, "loss": 1.1101106405258179, "step": 5996 }, { "epoch": 1.846171604463255, "grad_norm": 9.375, "learning_rate": 1.0620329818960116e-06, "loss": 1.6603482961654663, "step": 5998 }, { "epoch": 1.8467872258560984, "grad_norm": 9.375, "learning_rate": 1.0615412478673996e-06, "loss": 1.6565195322036743, "step": 6000 }, { "epoch": 1.8474028472489419, "grad_norm": 7.4375, "learning_rate": 1.0610514401287015e-06, "loss": 1.3372001647949219, "step": 6002 }, { "epoch": 1.8480184686417853, "grad_norm": 5.59375, "learning_rate": 1.0605635591666505e-06, "loss": 0.9332516193389893, "step": 6004 }, { "epoch": 1.8486340900346288, "grad_norm": 2.625, "learning_rate": 1.0600776054660646e-06, "loss": 1.0429884195327759, "step": 6006 }, { "epoch": 1.849249711427472, "grad_norm": 12.0625, "learning_rate": 1.0595935795098474e-06, "loss": 1.5856671333312988, "step": 6008 }, { "epoch": 1.8498653328203156, "grad_norm": 4.90625, "learning_rate": 1.0591114817789861e-06, "loss": 1.177258014678955, "step": 6010 }, { "epoch": 1.8504809542131588, "grad_norm": 5.84375, "learning_rate": 1.058631312752552e-06, "loss": 1.312177062034607, "step": 6012 }, { "epoch": 1.8510965756060023, "grad_norm": 4.28125, "learning_rate": 1.0581530729076997e-06, "loss": 1.2367324829101562, "step": 6014 }, { "epoch": 1.8517121969988457, "grad_norm": 2.8125, "learning_rate": 1.057676762719667e-06, "loss": 0.9590421915054321, "step": 6016 }, { "epoch": 1.8523278183916891, "grad_norm": 3.234375, "learning_rate": 1.057202382661774e-06, "loss": 1.3898155689239502, "step": 6018 }, { "epoch": 1.8529434397845326, "grad_norm": 5.1875, "learning_rate": 1.0567299332054225e-06, "loss": 1.3247897624969482, "step": 6020 }, { "epoch": 1.8535590611773758, "grad_norm": 12.75, "learning_rate": 1.0562594148200966e-06, "loss": 1.459519624710083, "step": 6022 }, { "epoch": 1.8541746825702194, "grad_norm": 11.0, "learning_rate": 1.055790827973361e-06, "loss": 1.4367085695266724, "step": 6024 }, { "epoch": 1.8547903039630627, "grad_norm": 10.5, "learning_rate": 1.0553241731308602e-06, "loss": 1.3709160089492798, "step": 6026 }, { "epoch": 1.855405925355906, "grad_norm": 4.625, "learning_rate": 1.0548594507563207e-06, "loss": 1.1860214471817017, "step": 6028 }, { "epoch": 1.8560215467487495, "grad_norm": 7.9375, "learning_rate": 1.0543966613115464e-06, "loss": 1.6553035974502563, "step": 6030 }, { "epoch": 1.856637168141593, "grad_norm": 12.125, "learning_rate": 1.0539358052564224e-06, "loss": 1.2580708265304565, "step": 6032 }, { "epoch": 1.8572527895344364, "grad_norm": 34.25, "learning_rate": 1.0534768830489111e-06, "loss": 0.7556243538856506, "step": 6034 }, { "epoch": 1.8578684109272796, "grad_norm": 6.125, "learning_rate": 1.0530198951450542e-06, "loss": 1.1045972108840942, "step": 6036 }, { "epoch": 1.8584840323201233, "grad_norm": 6.15625, "learning_rate": 1.0525648419989705e-06, "loss": 1.3060710430145264, "step": 6038 }, { "epoch": 1.8590996537129665, "grad_norm": 4.71875, "learning_rate": 1.052111724062857e-06, "loss": 0.8121116757392883, "step": 6040 }, { "epoch": 1.85971527510581, "grad_norm": 8.5625, "learning_rate": 1.0516605417869865e-06, "loss": 1.2903244495391846, "step": 6042 }, { "epoch": 1.8603308964986534, "grad_norm": 6.1875, "learning_rate": 1.0512112956197094e-06, "loss": 0.8824107050895691, "step": 6044 }, { "epoch": 1.8609465178914966, "grad_norm": 8.375, "learning_rate": 1.0507639860074517e-06, "loss": 1.311356544494629, "step": 6046 }, { "epoch": 1.8615621392843402, "grad_norm": 5.71875, "learning_rate": 1.0503186133947148e-06, "loss": 1.3515329360961914, "step": 6048 }, { "epoch": 1.8621777606771834, "grad_norm": 4.21875, "learning_rate": 1.0498751782240752e-06, "loss": 0.7219789028167725, "step": 6050 }, { "epoch": 1.862793382070027, "grad_norm": 4.84375, "learning_rate": 1.049433680936185e-06, "loss": 1.2869892120361328, "step": 6052 }, { "epoch": 1.8634090034628703, "grad_norm": 11.5, "learning_rate": 1.0489941219697695e-06, "loss": 1.3091208934783936, "step": 6054 }, { "epoch": 1.8640246248557137, "grad_norm": 6.625, "learning_rate": 1.0485565017616286e-06, "loss": 1.3862000703811646, "step": 6056 }, { "epoch": 1.8646402462485572, "grad_norm": 5.3125, "learning_rate": 1.0481208207466349e-06, "loss": 1.3635724782943726, "step": 6058 }, { "epoch": 1.8652558676414004, "grad_norm": 9.75, "learning_rate": 1.0476870793577346e-06, "loss": 1.3298139572143555, "step": 6060 }, { "epoch": 1.865871489034244, "grad_norm": 4.84375, "learning_rate": 1.0472552780259464e-06, "loss": 1.297196388244629, "step": 6062 }, { "epoch": 1.8664871104270873, "grad_norm": 11.8125, "learning_rate": 1.0468254171803607e-06, "loss": 1.2975666522979736, "step": 6064 }, { "epoch": 1.8671027318199307, "grad_norm": 7.15625, "learning_rate": 1.0463974972481402e-06, "loss": 1.4344289302825928, "step": 6066 }, { "epoch": 1.8677183532127741, "grad_norm": 3.328125, "learning_rate": 1.045971518654518e-06, "loss": 1.2920666933059692, "step": 6068 }, { "epoch": 1.8683339746056176, "grad_norm": 10.6875, "learning_rate": 1.045547481822799e-06, "loss": 1.531708002090454, "step": 6070 }, { "epoch": 1.868949595998461, "grad_norm": 3.84375, "learning_rate": 1.0451253871743582e-06, "loss": 1.1584322452545166, "step": 6072 }, { "epoch": 1.8695652173913042, "grad_norm": 4.25, "learning_rate": 1.0447052351286401e-06, "loss": 1.314656376838684, "step": 6074 }, { "epoch": 1.8701808387841479, "grad_norm": 7.9375, "learning_rate": 1.0442870261031593e-06, "loss": 1.0107910633087158, "step": 6076 }, { "epoch": 1.870796460176991, "grad_norm": 8.9375, "learning_rate": 1.0438707605134996e-06, "loss": 1.1779106855392456, "step": 6078 }, { "epoch": 1.8714120815698345, "grad_norm": 5.875, "learning_rate": 1.0434564387733138e-06, "loss": 1.1396067142486572, "step": 6080 }, { "epoch": 1.872027702962678, "grad_norm": 4.34375, "learning_rate": 1.0430440612943222e-06, "loss": 1.1491904258728027, "step": 6082 }, { "epoch": 1.8726433243555214, "grad_norm": 7.0, "learning_rate": 1.0426336284863136e-06, "loss": 1.234432339668274, "step": 6084 }, { "epoch": 1.8732589457483648, "grad_norm": 10.3125, "learning_rate": 1.0422251407571444e-06, "loss": 1.6887693405151367, "step": 6086 }, { "epoch": 1.873874567141208, "grad_norm": 7.09375, "learning_rate": 1.0418185985127379e-06, "loss": 1.5162160396575928, "step": 6088 }, { "epoch": 1.8744901885340517, "grad_norm": 7.03125, "learning_rate": 1.041414002157084e-06, "loss": 1.3888404369354248, "step": 6090 }, { "epoch": 1.875105809926895, "grad_norm": 7.75, "learning_rate": 1.0410113520922402e-06, "loss": 1.605562686920166, "step": 6092 }, { "epoch": 1.8757214313197383, "grad_norm": 19.0, "learning_rate": 1.0406106487183277e-06, "loss": 1.6701446771621704, "step": 6094 }, { "epoch": 1.8763370527125818, "grad_norm": 13.125, "learning_rate": 1.040211892433535e-06, "loss": 1.520393967628479, "step": 6096 }, { "epoch": 1.8769526741054252, "grad_norm": 6.34375, "learning_rate": 1.039815083634115e-06, "loss": 1.5344507694244385, "step": 6098 }, { "epoch": 1.8775682954982686, "grad_norm": 14.125, "learning_rate": 1.0394202227143857e-06, "loss": 1.590895175933838, "step": 6100 }, { "epoch": 1.8781839168911119, "grad_norm": 11.1875, "learning_rate": 1.0390273100667291e-06, "loss": 1.1395299434661865, "step": 6102 }, { "epoch": 1.8787995382839555, "grad_norm": 10.6875, "learning_rate": 1.0386363460815913e-06, "loss": 0.9038736820220947, "step": 6104 }, { "epoch": 1.8794151596767987, "grad_norm": 2.0625, "learning_rate": 1.0382473311474821e-06, "loss": 1.2217110395431519, "step": 6106 }, { "epoch": 1.8800307810696422, "grad_norm": 3.828125, "learning_rate": 1.037860265650974e-06, "loss": 1.2206361293792725, "step": 6108 }, { "epoch": 1.8806464024624856, "grad_norm": 13.8125, "learning_rate": 1.037475149976703e-06, "loss": 1.5064672231674194, "step": 6110 }, { "epoch": 1.8812620238553288, "grad_norm": 11.1875, "learning_rate": 1.0370919845073674e-06, "loss": 1.51680326461792, "step": 6112 }, { "epoch": 1.8818776452481725, "grad_norm": 8.6875, "learning_rate": 1.0367107696237266e-06, "loss": 1.1089879274368286, "step": 6114 }, { "epoch": 1.8824932666410157, "grad_norm": 11.9375, "learning_rate": 1.036331505704603e-06, "loss": 0.7587801218032837, "step": 6116 }, { "epoch": 1.8831088880338593, "grad_norm": 8.75, "learning_rate": 1.0359541931268793e-06, "loss": 1.591186285018921, "step": 6118 }, { "epoch": 1.8837245094267026, "grad_norm": 7.21875, "learning_rate": 1.0355788322655e-06, "loss": 1.5688434839248657, "step": 6120 }, { "epoch": 1.884340130819546, "grad_norm": 3.734375, "learning_rate": 1.0352054234934688e-06, "loss": 0.9427247643470764, "step": 6122 }, { "epoch": 1.8849557522123894, "grad_norm": 5.8125, "learning_rate": 1.0348339671818509e-06, "loss": 1.1877045631408691, "step": 6124 }, { "epoch": 1.8855713736052326, "grad_norm": 5.875, "learning_rate": 1.0344644636997705e-06, "loss": 1.2711281776428223, "step": 6126 }, { "epoch": 1.8861869949980763, "grad_norm": 11.0, "learning_rate": 1.0340969134144118e-06, "loss": 1.2702889442443848, "step": 6128 }, { "epoch": 1.8868026163909195, "grad_norm": 4.09375, "learning_rate": 1.0337313166910176e-06, "loss": 1.2502009868621826, "step": 6130 }, { "epoch": 1.887418237783763, "grad_norm": 123.5, "learning_rate": 1.0333676738928895e-06, "loss": 1.333520531654358, "step": 6132 }, { "epoch": 1.8880338591766064, "grad_norm": 6.71875, "learning_rate": 1.0330059853813875e-06, "loss": 1.27629816532135, "step": 6134 }, { "epoch": 1.8886494805694498, "grad_norm": 13.125, "learning_rate": 1.0326462515159297e-06, "loss": 0.7746158242225647, "step": 6136 }, { "epoch": 1.8892651019622932, "grad_norm": 9.25, "learning_rate": 1.0322884726539915e-06, "loss": 1.1550971269607544, "step": 6138 }, { "epoch": 1.8898807233551365, "grad_norm": 6.4375, "learning_rate": 1.0319326491511062e-06, "loss": 1.1305445432662964, "step": 6140 }, { "epoch": 1.8904963447479801, "grad_norm": 3.546875, "learning_rate": 1.0315787813608631e-06, "loss": 1.1933317184448242, "step": 6142 }, { "epoch": 1.8911119661408233, "grad_norm": 18.375, "learning_rate": 1.031226869634909e-06, "loss": 0.9492455720901489, "step": 6144 }, { "epoch": 1.8917275875336668, "grad_norm": 4.375, "learning_rate": 1.0308769143229458e-06, "loss": 1.4015666246414185, "step": 6146 }, { "epoch": 1.8923432089265102, "grad_norm": 10.8125, "learning_rate": 1.0305289157727326e-06, "loss": 1.4756766557693481, "step": 6148 }, { "epoch": 1.8929588303193536, "grad_norm": 4.34375, "learning_rate": 1.030182874330083e-06, "loss": 1.3707283735275269, "step": 6150 }, { "epoch": 1.893574451712197, "grad_norm": 3.546875, "learning_rate": 1.0298387903388665e-06, "loss": 1.2775065898895264, "step": 6152 }, { "epoch": 1.8941900731050403, "grad_norm": 13.25, "learning_rate": 1.0294966641410067e-06, "loss": 1.3877066373825073, "step": 6154 }, { "epoch": 1.894805694497884, "grad_norm": 11.4375, "learning_rate": 1.0291564960764822e-06, "loss": 1.2795771360397339, "step": 6156 }, { "epoch": 1.8954213158907272, "grad_norm": 8.6875, "learning_rate": 1.028818286483326e-06, "loss": 1.289372205734253, "step": 6158 }, { "epoch": 1.8960369372835706, "grad_norm": 8.1875, "learning_rate": 1.0284820356976239e-06, "loss": 1.7508002519607544, "step": 6160 }, { "epoch": 1.896652558676414, "grad_norm": 7.96875, "learning_rate": 1.0281477440535157e-06, "loss": 0.9736946821212769, "step": 6162 }, { "epoch": 1.8972681800692575, "grad_norm": 5.78125, "learning_rate": 1.0278154118831956e-06, "loss": 1.5066009759902954, "step": 6164 }, { "epoch": 1.897883801462101, "grad_norm": 2.671875, "learning_rate": 1.0274850395169086e-06, "loss": 1.4896010160446167, "step": 6166 }, { "epoch": 1.898499422854944, "grad_norm": 6.0, "learning_rate": 1.0271566272829532e-06, "loss": 1.3116507530212402, "step": 6168 }, { "epoch": 1.8991150442477878, "grad_norm": 3.9375, "learning_rate": 1.0268301755076806e-06, "loss": 1.390318512916565, "step": 6170 }, { "epoch": 1.899730665640631, "grad_norm": 3.671875, "learning_rate": 1.0265056845154927e-06, "loss": 0.9628296494483948, "step": 6172 }, { "epoch": 1.9003462870334744, "grad_norm": 12.1875, "learning_rate": 1.0261831546288435e-06, "loss": 1.4534339904785156, "step": 6174 }, { "epoch": 1.9009619084263178, "grad_norm": 13.1875, "learning_rate": 1.0258625861682383e-06, "loss": 1.78825044631958, "step": 6176 }, { "epoch": 1.901577529819161, "grad_norm": 11.5, "learning_rate": 1.0255439794522332e-06, "loss": 1.1523923873901367, "step": 6178 }, { "epoch": 1.9021931512120047, "grad_norm": 12.5625, "learning_rate": 1.025227334797435e-06, "loss": 1.4603246450424194, "step": 6180 }, { "epoch": 1.902808772604848, "grad_norm": 5.3125, "learning_rate": 1.0249126525185e-06, "loss": 1.5006656646728516, "step": 6182 }, { "epoch": 1.9034243939976914, "grad_norm": 6.0625, "learning_rate": 1.0245999329281356e-06, "loss": 1.0975650548934937, "step": 6184 }, { "epoch": 1.9040400153905348, "grad_norm": 8.125, "learning_rate": 1.024289176337098e-06, "loss": 1.5822598934173584, "step": 6186 }, { "epoch": 1.9046556367833782, "grad_norm": 3.859375, "learning_rate": 1.0239803830541933e-06, "loss": 1.1737971305847168, "step": 6188 }, { "epoch": 1.9052712581762217, "grad_norm": 4.875, "learning_rate": 1.023673553386276e-06, "loss": 1.4774574041366577, "step": 6190 }, { "epoch": 1.9058868795690649, "grad_norm": 22.0, "learning_rate": 1.0233686876382493e-06, "loss": 1.326117753982544, "step": 6192 }, { "epoch": 1.9065025009619085, "grad_norm": 5.21875, "learning_rate": 1.023065786113066e-06, "loss": 1.4425880908966064, "step": 6194 }, { "epoch": 1.9071181223547518, "grad_norm": 8.8125, "learning_rate": 1.0227648491117256e-06, "loss": 1.5173418521881104, "step": 6196 }, { "epoch": 1.9077337437475952, "grad_norm": 8.5, "learning_rate": 1.0224658769332758e-06, "loss": 1.4755529165267944, "step": 6198 }, { "epoch": 1.9083493651404386, "grad_norm": 12.4375, "learning_rate": 1.0221688698748124e-06, "loss": 1.3463419675827026, "step": 6200 }, { "epoch": 1.908964986533282, "grad_norm": 5.03125, "learning_rate": 1.0218738282314776e-06, "loss": 0.6657676100730896, "step": 6202 }, { "epoch": 1.9095806079261255, "grad_norm": 7.96875, "learning_rate": 1.021580752296461e-06, "loss": 0.8702752590179443, "step": 6204 }, { "epoch": 1.9101962293189687, "grad_norm": 5.53125, "learning_rate": 1.0212896423609986e-06, "loss": 1.584697961807251, "step": 6206 }, { "epoch": 1.9108118507118124, "grad_norm": 4.4375, "learning_rate": 1.0210004987143736e-06, "loss": 1.2164653539657593, "step": 6208 }, { "epoch": 1.9114274721046556, "grad_norm": 6.21875, "learning_rate": 1.0207133216439136e-06, "loss": 1.3886322975158691, "step": 6210 }, { "epoch": 1.912043093497499, "grad_norm": 6.28125, "learning_rate": 1.020428111434993e-06, "loss": 1.5179249048233032, "step": 6212 }, { "epoch": 1.9126587148903424, "grad_norm": 12.375, "learning_rate": 1.020144868371032e-06, "loss": 1.3470783233642578, "step": 6214 }, { "epoch": 1.9132743362831859, "grad_norm": 3.5, "learning_rate": 1.0198635927334954e-06, "loss": 1.1639832258224487, "step": 6216 }, { "epoch": 1.9138899576760293, "grad_norm": 8.25, "learning_rate": 1.0195842848018932e-06, "loss": 1.42462158203125, "step": 6218 }, { "epoch": 1.9145055790688725, "grad_norm": 9.1875, "learning_rate": 1.01930694485378e-06, "loss": 1.3437308073043823, "step": 6220 }, { "epoch": 1.9151212004617162, "grad_norm": 19.75, "learning_rate": 1.0190315731647542e-06, "loss": 1.4155735969543457, "step": 6222 }, { "epoch": 1.9157368218545594, "grad_norm": 39.5, "learning_rate": 1.0187581700084593e-06, "loss": 1.5331898927688599, "step": 6224 }, { "epoch": 1.9163524432474028, "grad_norm": 10.5625, "learning_rate": 1.018486735656582e-06, "loss": 1.295295238494873, "step": 6226 }, { "epoch": 1.9169680646402463, "grad_norm": 9.1875, "learning_rate": 1.0182172703788529e-06, "loss": 1.421062707901001, "step": 6228 }, { "epoch": 1.9175836860330895, "grad_norm": 2.671875, "learning_rate": 1.0179497744430456e-06, "loss": 1.0161007642745972, "step": 6230 }, { "epoch": 1.9181993074259331, "grad_norm": 5.15625, "learning_rate": 1.0176842481149765e-06, "loss": 1.0413410663604736, "step": 6232 }, { "epoch": 1.9188149288187764, "grad_norm": 4.625, "learning_rate": 1.0174206916585056e-06, "loss": 1.2316532135009766, "step": 6234 }, { "epoch": 1.91943055021162, "grad_norm": 6.03125, "learning_rate": 1.017159105335534e-06, "loss": 1.151000738143921, "step": 6236 }, { "epoch": 1.9200461716044632, "grad_norm": 8.75, "learning_rate": 1.016899489406007e-06, "loss": 1.3681657314300537, "step": 6238 }, { "epoch": 1.9206617929973067, "grad_norm": 6.96875, "learning_rate": 1.0166418441279101e-06, "loss": 1.4891537427902222, "step": 6240 }, { "epoch": 1.92127741439015, "grad_norm": 2.421875, "learning_rate": 1.0163861697572714e-06, "loss": 1.0187004804611206, "step": 6242 }, { "epoch": 1.9218930357829933, "grad_norm": 11.3125, "learning_rate": 1.0161324665481598e-06, "loss": 1.2609717845916748, "step": 6244 }, { "epoch": 1.922508657175837, "grad_norm": 10.6875, "learning_rate": 1.0158807347526865e-06, "loss": 1.6142393350601196, "step": 6246 }, { "epoch": 1.9231242785686802, "grad_norm": 6.78125, "learning_rate": 1.0156309746210028e-06, "loss": 1.5999330282211304, "step": 6248 }, { "epoch": 1.9237398999615236, "grad_norm": 4.40625, "learning_rate": 1.0153831864013008e-06, "loss": 1.568923830986023, "step": 6250 }, { "epoch": 1.924355521354367, "grad_norm": 11.6875, "learning_rate": 1.0151373703398134e-06, "loss": 1.2423279285430908, "step": 6252 }, { "epoch": 1.9249711427472105, "grad_norm": 7.6875, "learning_rate": 1.0148935266808134e-06, "loss": 1.4200328588485718, "step": 6254 }, { "epoch": 1.925586764140054, "grad_norm": 6.3125, "learning_rate": 1.0146516556666135e-06, "loss": 1.2052884101867676, "step": 6256 }, { "epoch": 1.9262023855328971, "grad_norm": 8.5625, "learning_rate": 1.0144117575375667e-06, "loss": 1.5078574419021606, "step": 6258 }, { "epoch": 1.9268180069257408, "grad_norm": 3.234375, "learning_rate": 1.0141738325320656e-06, "loss": 1.0258066654205322, "step": 6260 }, { "epoch": 1.927433628318584, "grad_norm": 5.625, "learning_rate": 1.0139378808865404e-06, "loss": 1.2590429782867432, "step": 6262 }, { "epoch": 1.9280492497114274, "grad_norm": 3.09375, "learning_rate": 1.0137039028354624e-06, "loss": 1.1616289615631104, "step": 6264 }, { "epoch": 1.9286648711042709, "grad_norm": 4.375, "learning_rate": 1.0134718986113406e-06, "loss": 1.1947599649429321, "step": 6266 }, { "epoch": 1.9292804924971143, "grad_norm": 9.625, "learning_rate": 1.0132418684447227e-06, "loss": 1.3349967002868652, "step": 6268 }, { "epoch": 1.9298961138899577, "grad_norm": 26.75, "learning_rate": 1.013013812564195e-06, "loss": 1.1362533569335938, "step": 6270 }, { "epoch": 1.930511735282801, "grad_norm": 7.03125, "learning_rate": 1.0127877311963818e-06, "loss": 1.1104720830917358, "step": 6272 }, { "epoch": 1.9311273566756446, "grad_norm": 7.15625, "learning_rate": 1.0125636245659453e-06, "loss": 1.2859126329421997, "step": 6274 }, { "epoch": 1.9317429780684878, "grad_norm": 10.25, "learning_rate": 1.012341492895585e-06, "loss": 1.598039984703064, "step": 6276 }, { "epoch": 1.9323585994613313, "grad_norm": 2.890625, "learning_rate": 1.0121213364060383e-06, "loss": 1.2150026559829712, "step": 6278 }, { "epoch": 1.9329742208541747, "grad_norm": 11.5625, "learning_rate": 1.0119031553160791e-06, "loss": 1.505211591720581, "step": 6280 }, { "epoch": 1.9335898422470181, "grad_norm": 6.09375, "learning_rate": 1.01168694984252e-06, "loss": 1.2169826030731201, "step": 6282 }, { "epoch": 1.9342054636398616, "grad_norm": 6.0, "learning_rate": 1.011472720200208e-06, "loss": 1.6118532419204712, "step": 6284 }, { "epoch": 1.9348210850327048, "grad_norm": 4.6875, "learning_rate": 1.0112604666020288e-06, "loss": 1.3988016843795776, "step": 6286 }, { "epoch": 1.9354367064255484, "grad_norm": 18.875, "learning_rate": 1.011050189258903e-06, "loss": 1.429690957069397, "step": 6288 }, { "epoch": 1.9360523278183916, "grad_norm": 10.0, "learning_rate": 1.010841888379788e-06, "loss": 1.6061837673187256, "step": 6290 }, { "epoch": 1.936667949211235, "grad_norm": 2.40625, "learning_rate": 1.0106355641716772e-06, "loss": 1.12090265750885, "step": 6292 }, { "epoch": 1.9372835706040785, "grad_norm": 2.984375, "learning_rate": 1.0104312168395996e-06, "loss": 0.9568726420402527, "step": 6294 }, { "epoch": 1.9378991919969217, "grad_norm": 6.34375, "learning_rate": 1.0102288465866196e-06, "loss": 1.2962055206298828, "step": 6296 }, { "epoch": 1.9385148133897654, "grad_norm": 3.84375, "learning_rate": 1.0100284536138372e-06, "loss": 1.2469260692596436, "step": 6298 }, { "epoch": 1.9391304347826086, "grad_norm": 13.5, "learning_rate": 1.0098300381203873e-06, "loss": 1.4265371561050415, "step": 6300 }, { "epoch": 1.9397460561754523, "grad_norm": 10.0625, "learning_rate": 1.0096336003034398e-06, "loss": 1.6725401878356934, "step": 6302 }, { "epoch": 1.9403616775682955, "grad_norm": 5.4375, "learning_rate": 1.0094391403581991e-06, "loss": 1.4111108779907227, "step": 6304 }, { "epoch": 1.940977298961139, "grad_norm": 11.0, "learning_rate": 1.0092466584779052e-06, "loss": 1.198007345199585, "step": 6306 }, { "epoch": 1.9415929203539823, "grad_norm": 4.3125, "learning_rate": 1.009056154853831e-06, "loss": 1.6168948411941528, "step": 6308 }, { "epoch": 1.9422085417468256, "grad_norm": 7.46875, "learning_rate": 1.008867629675284e-06, "loss": 1.5000094175338745, "step": 6310 }, { "epoch": 1.9428241631396692, "grad_norm": 11.625, "learning_rate": 1.0086810831296071e-06, "loss": 1.3290870189666748, "step": 6312 }, { "epoch": 1.9434397845325124, "grad_norm": 5.0, "learning_rate": 1.0084965154021741e-06, "loss": 1.5315940380096436, "step": 6314 }, { "epoch": 1.9440554059253559, "grad_norm": 4.65625, "learning_rate": 1.0083139266763955e-06, "loss": 1.2824970483779907, "step": 6316 }, { "epoch": 1.9446710273181993, "grad_norm": 4.59375, "learning_rate": 1.0081333171337132e-06, "loss": 1.2794500589370728, "step": 6318 }, { "epoch": 1.9452866487110427, "grad_norm": 7.28125, "learning_rate": 1.0079546869536027e-06, "loss": 1.26789391040802, "step": 6320 }, { "epoch": 1.9459022701038862, "grad_norm": 4.25, "learning_rate": 1.0077780363135736e-06, "loss": 1.1022186279296875, "step": 6322 }, { "epoch": 1.9465178914967294, "grad_norm": 6.375, "learning_rate": 1.0076033653891667e-06, "loss": 1.3582931756973267, "step": 6324 }, { "epoch": 1.947133512889573, "grad_norm": 14.375, "learning_rate": 1.007430674353957e-06, "loss": 1.2253589630126953, "step": 6326 }, { "epoch": 1.9477491342824163, "grad_norm": 6.03125, "learning_rate": 1.0072599633795512e-06, "loss": 1.3371095657348633, "step": 6328 }, { "epoch": 1.9483647556752597, "grad_norm": 5.8125, "learning_rate": 1.007091232635589e-06, "loss": 1.6374062299728394, "step": 6330 }, { "epoch": 1.9489803770681031, "grad_norm": 5.96875, "learning_rate": 1.0069244822897413e-06, "loss": 0.8464836478233337, "step": 6332 }, { "epoch": 1.9495959984609466, "grad_norm": 9.375, "learning_rate": 1.006759712507712e-06, "loss": 1.6271177530288696, "step": 6334 }, { "epoch": 1.95021161985379, "grad_norm": 2.375, "learning_rate": 1.0065969234532367e-06, "loss": 1.002731442451477, "step": 6336 }, { "epoch": 1.9508272412466332, "grad_norm": 4.8125, "learning_rate": 1.0064361152880823e-06, "loss": 1.2966411113739014, "step": 6338 }, { "epoch": 1.9514428626394769, "grad_norm": 8.5, "learning_rate": 1.0062772881720476e-06, "loss": 1.254443883895874, "step": 6340 }, { "epoch": 1.95205848403232, "grad_norm": 6.65625, "learning_rate": 1.0061204422629625e-06, "loss": 1.6713799238204956, "step": 6342 }, { "epoch": 1.9526741054251635, "grad_norm": 9.0625, "learning_rate": 1.0059655777166883e-06, "loss": 1.3720214366912842, "step": 6344 }, { "epoch": 1.953289726818007, "grad_norm": 5.46875, "learning_rate": 1.0058126946871174e-06, "loss": 1.4773958921432495, "step": 6346 }, { "epoch": 1.9539053482108504, "grad_norm": 4.96875, "learning_rate": 1.0056617933261735e-06, "loss": 1.110971212387085, "step": 6348 }, { "epoch": 1.9545209696036938, "grad_norm": 5.75, "learning_rate": 1.0055128737838101e-06, "loss": 1.187142014503479, "step": 6350 }, { "epoch": 1.955136590996537, "grad_norm": 13.125, "learning_rate": 1.0053659362080123e-06, "loss": 1.4464235305786133, "step": 6352 }, { "epoch": 1.9557522123893807, "grad_norm": 10.375, "learning_rate": 1.0052209807447948e-06, "loss": 1.0058897733688354, "step": 6354 }, { "epoch": 1.956367833782224, "grad_norm": 5.875, "learning_rate": 1.0050780075382033e-06, "loss": 1.1535087823867798, "step": 6356 }, { "epoch": 1.9569834551750673, "grad_norm": 7.5625, "learning_rate": 1.0049370167303138e-06, "loss": 1.2556126117706299, "step": 6358 }, { "epoch": 1.9575990765679108, "grad_norm": 13.25, "learning_rate": 1.0047980084612318e-06, "loss": 1.032423973083496, "step": 6360 }, { "epoch": 1.958214697960754, "grad_norm": 6.40625, "learning_rate": 1.0046609828690929e-06, "loss": 1.5129501819610596, "step": 6362 }, { "epoch": 1.9588303193535976, "grad_norm": 6.90625, "learning_rate": 1.0045259400900622e-06, "loss": 1.3893715143203735, "step": 6364 }, { "epoch": 1.9594459407464409, "grad_norm": 10.0625, "learning_rate": 1.0043928802583352e-06, "loss": 1.1793274879455566, "step": 6366 }, { "epoch": 1.9600615621392843, "grad_norm": 7.59375, "learning_rate": 1.0042618035061364e-06, "loss": 1.1719051599502563, "step": 6368 }, { "epoch": 1.9606771835321277, "grad_norm": 5.6875, "learning_rate": 1.0041327099637196e-06, "loss": 1.1064897775650024, "step": 6370 }, { "epoch": 1.9612928049249712, "grad_norm": 8.6875, "learning_rate": 1.0040055997593677e-06, "loss": 1.1995185613632202, "step": 6372 }, { "epoch": 1.9619084263178146, "grad_norm": 8.1875, "learning_rate": 1.0038804730193933e-06, "loss": 1.558251142501831, "step": 6374 }, { "epoch": 1.9625240477106578, "grad_norm": 4.21875, "learning_rate": 1.0037573298681375e-06, "loss": 1.2958089113235474, "step": 6376 }, { "epoch": 1.9631396691035015, "grad_norm": 12.1875, "learning_rate": 1.0036361704279705e-06, "loss": 1.064068078994751, "step": 6378 }, { "epoch": 1.9637552904963447, "grad_norm": 2.421875, "learning_rate": 1.0035169948192912e-06, "loss": 1.0521752834320068, "step": 6380 }, { "epoch": 1.964370911889188, "grad_norm": 5.15625, "learning_rate": 1.003399803160527e-06, "loss": 1.1212785243988037, "step": 6382 }, { "epoch": 1.9649865332820315, "grad_norm": 7.9375, "learning_rate": 1.0032845955681337e-06, "loss": 1.5647592544555664, "step": 6384 }, { "epoch": 1.965602154674875, "grad_norm": 12.3125, "learning_rate": 1.0031713721565957e-06, "loss": 1.3184818029403687, "step": 6386 }, { "epoch": 1.9662177760677184, "grad_norm": 7.78125, "learning_rate": 1.003060133038426e-06, "loss": 1.0812666416168213, "step": 6388 }, { "epoch": 1.9668333974605616, "grad_norm": 11.375, "learning_rate": 1.002950878324165e-06, "loss": 1.876505970954895, "step": 6390 }, { "epoch": 1.9674490188534053, "grad_norm": 5.0625, "learning_rate": 1.0028436081223818e-06, "loss": 1.272090196609497, "step": 6392 }, { "epoch": 1.9680646402462485, "grad_norm": 7.5, "learning_rate": 1.0027383225396731e-06, "loss": 1.2015711069107056, "step": 6394 }, { "epoch": 1.968680261639092, "grad_norm": 6.34375, "learning_rate": 1.0026350216806638e-06, "loss": 1.2012073993682861, "step": 6396 }, { "epoch": 1.9692958830319354, "grad_norm": 8.9375, "learning_rate": 1.0025337056480055e-06, "loss": 1.4998914003372192, "step": 6398 }, { "epoch": 1.9699115044247788, "grad_norm": 7.8125, "learning_rate": 1.0024343745423792e-06, "loss": 1.5121166706085205, "step": 6400 }, { "epoch": 1.9705271258176222, "grad_norm": 6.5625, "learning_rate": 1.002337028462492e-06, "loss": 1.5358381271362305, "step": 6402 }, { "epoch": 1.9711427472104655, "grad_norm": 6.875, "learning_rate": 1.002241667505079e-06, "loss": 1.6802611351013184, "step": 6404 }, { "epoch": 1.971758368603309, "grad_norm": 6.75, "learning_rate": 1.0021482917649021e-06, "loss": 1.4775930643081665, "step": 6406 }, { "epoch": 1.9723739899961523, "grad_norm": 4.625, "learning_rate": 1.0020569013347512e-06, "loss": 0.9560152292251587, "step": 6408 }, { "epoch": 1.9729896113889958, "grad_norm": 3.25, "learning_rate": 1.0019674963054432e-06, "loss": 1.1680761575698853, "step": 6410 }, { "epoch": 1.9736052327818392, "grad_norm": 6.875, "learning_rate": 1.0018800767658216e-06, "loss": 1.123976707458496, "step": 6412 }, { "epoch": 1.9742208541746824, "grad_norm": 10.0, "learning_rate": 1.0017946428027572e-06, "loss": 1.4125099182128906, "step": 6414 }, { "epoch": 1.974836475567526, "grad_norm": 7.21875, "learning_rate": 1.0017111945011477e-06, "loss": 1.4543452262878418, "step": 6416 }, { "epoch": 1.9754520969603693, "grad_norm": 9.875, "learning_rate": 1.0016297319439175e-06, "loss": 1.4717330932617188, "step": 6418 }, { "epoch": 1.976067718353213, "grad_norm": 16.25, "learning_rate": 1.0015502552120178e-06, "loss": 1.206028938293457, "step": 6420 }, { "epoch": 1.9766833397460561, "grad_norm": 11.0625, "learning_rate": 1.0014727643844265e-06, "loss": 1.3496100902557373, "step": 6422 }, { "epoch": 1.9772989611388996, "grad_norm": 7.375, "learning_rate": 1.001397259538148e-06, "loss": 1.554663896560669, "step": 6424 }, { "epoch": 1.977914582531743, "grad_norm": 8.4375, "learning_rate": 1.0013237407482126e-06, "loss": 1.4421809911727905, "step": 6426 }, { "epoch": 1.9785302039245862, "grad_norm": 7.34375, "learning_rate": 1.0012522080876784e-06, "loss": 1.3873109817504883, "step": 6428 }, { "epoch": 1.9791458253174299, "grad_norm": 13.75, "learning_rate": 1.0011826616276283e-06, "loss": 2.147081136703491, "step": 6430 }, { "epoch": 1.979761446710273, "grad_norm": 11.375, "learning_rate": 1.0011151014371728e-06, "loss": 1.7759194374084473, "step": 6432 }, { "epoch": 1.9803770681031165, "grad_norm": 6.40625, "learning_rate": 1.0010495275834475e-06, "loss": 1.4935888051986694, "step": 6434 }, { "epoch": 1.98099268949596, "grad_norm": 9.1875, "learning_rate": 1.000985940131615e-06, "loss": 1.297410011291504, "step": 6436 }, { "epoch": 1.9816083108888034, "grad_norm": 6.65625, "learning_rate": 1.0009243391448629e-06, "loss": 1.6829392910003662, "step": 6438 }, { "epoch": 1.9822239322816468, "grad_norm": 2.859375, "learning_rate": 1.0008647246844064e-06, "loss": 1.0125871896743774, "step": 6440 }, { "epoch": 1.98283955367449, "grad_norm": 7.6875, "learning_rate": 1.000807096809485e-06, "loss": 0.981855571269989, "step": 6442 }, { "epoch": 1.9834551750673337, "grad_norm": 6.25, "learning_rate": 1.0007514555773652e-06, "loss": 1.543346881866455, "step": 6444 }, { "epoch": 1.984070796460177, "grad_norm": 3.9375, "learning_rate": 1.0006978010433386e-06, "loss": 0.9839072227478027, "step": 6446 }, { "epoch": 1.9846864178530204, "grad_norm": 7.0, "learning_rate": 1.000646133260723e-06, "loss": 1.1815534830093384, "step": 6448 }, { "epoch": 1.9853020392458638, "grad_norm": 3.625, "learning_rate": 1.0005964522808626e-06, "loss": 1.1010407209396362, "step": 6450 }, { "epoch": 1.9859176606387072, "grad_norm": 4.40625, "learning_rate": 1.0005487581531254e-06, "loss": 1.3142129182815552, "step": 6452 }, { "epoch": 1.9865332820315507, "grad_norm": 14.6875, "learning_rate": 1.0005030509249064e-06, "loss": 0.7110906839370728, "step": 6454 }, { "epoch": 1.9871489034243939, "grad_norm": 4.90625, "learning_rate": 1.0004593306416267e-06, "loss": 1.2709407806396484, "step": 6456 }, { "epoch": 1.9877645248172375, "grad_norm": 5.0, "learning_rate": 1.000417597346731e-06, "loss": 1.3931374549865723, "step": 6458 }, { "epoch": 1.9883801462100807, "grad_norm": 9.0, "learning_rate": 1.0003778510816915e-06, "loss": 1.4285491704940796, "step": 6460 }, { "epoch": 1.9889957676029242, "grad_norm": 7.65625, "learning_rate": 1.000340091886004e-06, "loss": 1.3640979528427124, "step": 6462 }, { "epoch": 1.9896113889957676, "grad_norm": 5.78125, "learning_rate": 1.0003043197971917e-06, "loss": 1.1222772598266602, "step": 6464 }, { "epoch": 1.990227010388611, "grad_norm": 5.53125, "learning_rate": 1.0002705348508016e-06, "loss": 1.4084155559539795, "step": 6466 }, { "epoch": 1.9908426317814545, "grad_norm": 7.375, "learning_rate": 1.0002387370804063e-06, "loss": 1.0216439962387085, "step": 6468 }, { "epoch": 1.9914582531742977, "grad_norm": 6.3125, "learning_rate": 1.0002089265176046e-06, "loss": 1.532482385635376, "step": 6470 }, { "epoch": 1.9920738745671414, "grad_norm": 15.375, "learning_rate": 1.0001811031920195e-06, "loss": 1.5534192323684692, "step": 6472 }, { "epoch": 1.9926894959599846, "grad_norm": 5.25, "learning_rate": 1.0001552671312996e-06, "loss": 1.1872690916061401, "step": 6474 }, { "epoch": 1.993305117352828, "grad_norm": 6.8125, "learning_rate": 1.0001314183611194e-06, "loss": 1.5387009382247925, "step": 6476 }, { "epoch": 1.9939207387456714, "grad_norm": 8.5625, "learning_rate": 1.0001095569051772e-06, "loss": 1.7060209512710571, "step": 6478 }, { "epoch": 1.9945363601385147, "grad_norm": 9.0, "learning_rate": 1.0000896827851974e-06, "loss": 1.5537105798721313, "step": 6480 }, { "epoch": 1.9951519815313583, "grad_norm": 32.0, "learning_rate": 1.0000717960209295e-06, "loss": 0.8341541290283203, "step": 6482 }, { "epoch": 1.9957676029242015, "grad_norm": 2.53125, "learning_rate": 1.0000558966301483e-06, "loss": 0.9209161996841431, "step": 6484 }, { "epoch": 1.9963832243170452, "grad_norm": 19.25, "learning_rate": 1.0000419846286524e-06, "loss": 1.3282356262207031, "step": 6486 }, { "epoch": 1.9969988457098884, "grad_norm": 4.59375, "learning_rate": 1.0000300600302676e-06, "loss": 0.5242670178413391, "step": 6488 }, { "epoch": 1.9976144671027318, "grad_norm": 5.625, "learning_rate": 1.0000201228468429e-06, "loss": 1.050189733505249, "step": 6490 }, { "epoch": 1.9982300884955753, "grad_norm": 5.40625, "learning_rate": 1.0000121730882534e-06, "loss": 1.2277486324310303, "step": 6492 }, { "epoch": 1.9988457098884185, "grad_norm": 10.375, "learning_rate": 1.000006210762399e-06, "loss": 1.3151390552520752, "step": 6494 }, { "epoch": 1.9994613312812621, "grad_norm": 8.1875, "learning_rate": 1.0000022358752043e-06, "loss": 1.4126924276351929, "step": 6496 }, { "epoch": 2.0, "grad_norm": 10.5625, "learning_rate": 1.0000002484306195e-06, "loss": 1.3165018558502197, "step": 6498 }, { "epoch": 2.0, "step": 6498, "total_flos": 2.5760029558366536e+18, "train_loss": 1.3364153672508476, "train_runtime": 22623.2482, "train_samples_per_second": 1.149, "train_steps_per_second": 0.287 } ], "logging_steps": 2, "max_steps": 6498, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5760029558366536e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }