{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 6498, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006156213928434013, "grad_norm": 2.703125, "learning_rate": 5.1282051282051286e-08, "loss": 1.2488996982574463, "step": 2 }, { "epoch": 0.0012312427856868025, "grad_norm": 3.59375, "learning_rate": 1.5384615384615387e-07, "loss": 1.944703459739685, "step": 4 }, { "epoch": 0.001846864178530204, "grad_norm": 5.8125, "learning_rate": 2.564102564102564e-07, "loss": 1.6205146312713623, "step": 6 }, { "epoch": 0.002462485571373605, "grad_norm": 4.875, "learning_rate": 3.5897435897435896e-07, "loss": 1.951253056526184, "step": 8 }, { "epoch": 0.0030781069642170067, "grad_norm": 14.5, "learning_rate": 4.615384615384616e-07, "loss": 2.2729759216308594, "step": 10 }, { "epoch": 0.003693728357060408, "grad_norm": 8.75, "learning_rate": 5.641025641025642e-07, "loss": 1.4744157791137695, "step": 12 }, { "epoch": 0.004309349749903809, "grad_norm": 27.125, "learning_rate": 6.666666666666667e-07, "loss": 2.574921131134033, "step": 14 }, { "epoch": 0.00492497114274721, "grad_norm": 8.25, "learning_rate": 7.692307692307694e-07, "loss": 1.8204889297485352, "step": 16 }, { "epoch": 0.005540592535590611, "grad_norm": 15.6875, "learning_rate": 8.717948717948718e-07, "loss": 1.6300557851791382, "step": 18 }, { "epoch": 0.0061562139284340135, "grad_norm": 8.5, "learning_rate": 9.743589743589745e-07, "loss": 1.4897385835647583, "step": 20 }, { "epoch": 0.006771835321277415, "grad_norm": 4.125, "learning_rate": 1.076923076923077e-06, "loss": 1.7789047956466675, "step": 22 }, { "epoch": 0.007387456714120816, "grad_norm": 4.15625, "learning_rate": 1.1794871794871795e-06, "loss": 1.8086273670196533, "step": 24 }, { "epoch": 0.008003078106964217, "grad_norm": 5.5, "learning_rate": 1.282051282051282e-06, "loss": 1.6509721279144287, "step": 26 }, { "epoch": 0.008618699499807618, "grad_norm": 3.484375, "learning_rate": 1.3846153846153848e-06, "loss": 1.568681001663208, "step": 28 }, { "epoch": 0.00923432089265102, "grad_norm": 3.625, "learning_rate": 1.4871794871794873e-06, "loss": 1.7773399353027344, "step": 30 }, { "epoch": 0.00984994228549442, "grad_norm": 6.65625, "learning_rate": 1.5897435897435897e-06, "loss": 1.8189994096755981, "step": 32 }, { "epoch": 0.010465563678337822, "grad_norm": 4.75, "learning_rate": 1.6923076923076926e-06, "loss": 2.1563496589660645, "step": 34 }, { "epoch": 0.011081185071181223, "grad_norm": 5.0, "learning_rate": 1.794871794871795e-06, "loss": 1.4601736068725586, "step": 36 }, { "epoch": 0.011696806464024625, "grad_norm": 20.5, "learning_rate": 1.8974358974358975e-06, "loss": 1.8413267135620117, "step": 38 }, { "epoch": 0.012312427856868027, "grad_norm": 2.515625, "learning_rate": 2.0000000000000003e-06, "loss": 1.1767711639404297, "step": 40 }, { "epoch": 0.012928049249711427, "grad_norm": 7.28125, "learning_rate": 2.1025641025641028e-06, "loss": 1.2139570713043213, "step": 42 }, { "epoch": 0.01354367064255483, "grad_norm": 10.8125, "learning_rate": 2.2051282051282052e-06, "loss": 1.9730910062789917, "step": 44 }, { "epoch": 0.01415929203539823, "grad_norm": 6.03125, "learning_rate": 2.307692307692308e-06, "loss": 1.4171947240829468, "step": 46 }, { "epoch": 0.014774913428241632, "grad_norm": 8.1875, "learning_rate": 2.4102564102564105e-06, "loss": 1.6850796937942505, "step": 48 }, { "epoch": 0.015390534821085032, "grad_norm": 2.296875, "learning_rate": 2.512820512820513e-06, "loss": 1.3291962146759033, "step": 50 }, { "epoch": 0.016006156213928435, "grad_norm": 6.90625, "learning_rate": 2.615384615384616e-06, "loss": 1.6199973821640015, "step": 52 }, { "epoch": 0.016621777606771835, "grad_norm": 6.46875, "learning_rate": 2.717948717948718e-06, "loss": 1.920493721961975, "step": 54 }, { "epoch": 0.017237398999615235, "grad_norm": 6.53125, "learning_rate": 2.8205128205128207e-06, "loss": 1.4693753719329834, "step": 56 }, { "epoch": 0.01785302039245864, "grad_norm": 3.46875, "learning_rate": 2.9230769230769236e-06, "loss": 1.869727611541748, "step": 58 }, { "epoch": 0.01846864178530204, "grad_norm": 8.625, "learning_rate": 3.0256410256410256e-06, "loss": 1.7403851747512817, "step": 60 }, { "epoch": 0.01908426317814544, "grad_norm": 6.03125, "learning_rate": 3.1282051282051284e-06, "loss": 1.4627933502197266, "step": 62 }, { "epoch": 0.01969988457098884, "grad_norm": 3.0625, "learning_rate": 3.2307692307692313e-06, "loss": 1.1403491497039795, "step": 64 }, { "epoch": 0.020315505963832244, "grad_norm": 10.4375, "learning_rate": 3.3333333333333333e-06, "loss": 1.5978504419326782, "step": 66 }, { "epoch": 0.020931127356675645, "grad_norm": 4.1875, "learning_rate": 3.435897435897436e-06, "loss": 1.2248575687408447, "step": 68 }, { "epoch": 0.021546748749519045, "grad_norm": 7.5, "learning_rate": 3.538461538461539e-06, "loss": 2.153047800064087, "step": 70 }, { "epoch": 0.022162370142362445, "grad_norm": 8.75, "learning_rate": 3.641025641025641e-06, "loss": 1.7931642532348633, "step": 72 }, { "epoch": 0.02277799153520585, "grad_norm": 6.125, "learning_rate": 3.743589743589744e-06, "loss": 1.591568946838379, "step": 74 }, { "epoch": 0.02339361292804925, "grad_norm": 13.5625, "learning_rate": 3.846153846153847e-06, "loss": 1.8861238956451416, "step": 76 }, { "epoch": 0.02400923432089265, "grad_norm": 30.5, "learning_rate": 3.948717948717949e-06, "loss": 1.8302340507507324, "step": 78 }, { "epoch": 0.024624855713736054, "grad_norm": 6.84375, "learning_rate": 4.051282051282052e-06, "loss": 1.1091889142990112, "step": 80 }, { "epoch": 0.025240477106579454, "grad_norm": 4.15625, "learning_rate": 4.1538461538461545e-06, "loss": 1.4548031091690063, "step": 82 }, { "epoch": 0.025856098499422855, "grad_norm": 8.0625, "learning_rate": 4.2564102564102566e-06, "loss": 1.3143142461776733, "step": 84 }, { "epoch": 0.026471719892266255, "grad_norm": 4.34375, "learning_rate": 4.358974358974359e-06, "loss": 1.3913114070892334, "step": 86 }, { "epoch": 0.02708734128510966, "grad_norm": 4.4375, "learning_rate": 4.461538461538462e-06, "loss": 1.53354012966156, "step": 88 }, { "epoch": 0.02770296267795306, "grad_norm": 7.0625, "learning_rate": 4.564102564102564e-06, "loss": 1.8740226030349731, "step": 90 }, { "epoch": 0.02831858407079646, "grad_norm": 5.6875, "learning_rate": 4.666666666666667e-06, "loss": 1.359494924545288, "step": 92 }, { "epoch": 0.02893420546363986, "grad_norm": 6.78125, "learning_rate": 4.76923076923077e-06, "loss": 1.7467714548110962, "step": 94 }, { "epoch": 0.029549826856483264, "grad_norm": 3.828125, "learning_rate": 4.871794871794872e-06, "loss": 1.4591333866119385, "step": 96 }, { "epoch": 0.030165448249326664, "grad_norm": 6.6875, "learning_rate": 4.974358974358975e-06, "loss": 1.5871689319610596, "step": 98 }, { "epoch": 0.030781069642170065, "grad_norm": 3.46875, "learning_rate": 5.076923076923077e-06, "loss": 1.078829288482666, "step": 100 }, { "epoch": 0.03139669103501347, "grad_norm": 4.4375, "learning_rate": 5.179487179487181e-06, "loss": 1.4992876052856445, "step": 102 }, { "epoch": 0.03201231242785687, "grad_norm": 10.1875, "learning_rate": 5.282051282051283e-06, "loss": 1.4937949180603027, "step": 104 }, { "epoch": 0.03262793382070027, "grad_norm": 7.40625, "learning_rate": 5.384615384615385e-06, "loss": 1.634779930114746, "step": 106 }, { "epoch": 0.03324355521354367, "grad_norm": 5.5625, "learning_rate": 5.487179487179488e-06, "loss": 1.4202882051467896, "step": 108 }, { "epoch": 0.03385917660638707, "grad_norm": 4.625, "learning_rate": 5.58974358974359e-06, "loss": 1.5589418411254883, "step": 110 }, { "epoch": 0.03447479799923047, "grad_norm": 12.4375, "learning_rate": 5.692307692307692e-06, "loss": 1.8365751504898071, "step": 112 }, { "epoch": 0.03509041939207388, "grad_norm": 7.1875, "learning_rate": 5.794871794871796e-06, "loss": 1.9069055318832397, "step": 114 }, { "epoch": 0.03570604078491728, "grad_norm": 2.796875, "learning_rate": 5.897435897435898e-06, "loss": 1.3717936277389526, "step": 116 }, { "epoch": 0.03632166217776068, "grad_norm": 29.125, "learning_rate": 6e-06, "loss": 1.7499637603759766, "step": 118 }, { "epoch": 0.03693728357060408, "grad_norm": 3.953125, "learning_rate": 6.102564102564104e-06, "loss": 1.4855519533157349, "step": 120 }, { "epoch": 0.03755290496344748, "grad_norm": 4.65625, "learning_rate": 6.205128205128206e-06, "loss": 1.3522869348526, "step": 122 }, { "epoch": 0.03816852635629088, "grad_norm": 4.59375, "learning_rate": 6.307692307692308e-06, "loss": 1.0647168159484863, "step": 124 }, { "epoch": 0.03878414774913428, "grad_norm": 4.8125, "learning_rate": 6.410256410256412e-06, "loss": 1.3522032499313354, "step": 126 }, { "epoch": 0.03939976914197768, "grad_norm": 11.375, "learning_rate": 6.512820512820514e-06, "loss": 1.6329436302185059, "step": 128 }, { "epoch": 0.04001539053482109, "grad_norm": 8.3125, "learning_rate": 6.615384615384616e-06, "loss": 1.1570396423339844, "step": 130 }, { "epoch": 0.04063101192766449, "grad_norm": 6.21875, "learning_rate": 6.717948717948718e-06, "loss": 1.609910011291504, "step": 132 }, { "epoch": 0.04124663332050789, "grad_norm": 6.5625, "learning_rate": 6.820512820512821e-06, "loss": 1.589626669883728, "step": 134 }, { "epoch": 0.04186225471335129, "grad_norm": 1.7578125, "learning_rate": 6.923076923076923e-06, "loss": 0.9378024339675903, "step": 136 }, { "epoch": 0.04247787610619469, "grad_norm": 9.1875, "learning_rate": 7.025641025641025e-06, "loss": 1.545324444770813, "step": 138 }, { "epoch": 0.04309349749903809, "grad_norm": 8.625, "learning_rate": 7.128205128205129e-06, "loss": 1.7356501817703247, "step": 140 }, { "epoch": 0.04370911889188149, "grad_norm": 5.15625, "learning_rate": 7.230769230769231e-06, "loss": 1.3267698287963867, "step": 142 }, { "epoch": 0.04432474028472489, "grad_norm": 3.828125, "learning_rate": 7.333333333333333e-06, "loss": 1.3805948495864868, "step": 144 }, { "epoch": 0.0449403616775683, "grad_norm": 5.1875, "learning_rate": 7.435897435897437e-06, "loss": 1.43031644821167, "step": 146 }, { "epoch": 0.0455559830704117, "grad_norm": 4.96875, "learning_rate": 7.538461538461539e-06, "loss": 1.8443446159362793, "step": 148 }, { "epoch": 0.0461716044632551, "grad_norm": 6.09375, "learning_rate": 7.641025641025641e-06, "loss": 1.2642335891723633, "step": 150 }, { "epoch": 0.0467872258560985, "grad_norm": 30.125, "learning_rate": 7.743589743589745e-06, "loss": 1.3919174671173096, "step": 152 }, { "epoch": 0.0474028472489419, "grad_norm": 9.125, "learning_rate": 7.846153846153847e-06, "loss": 1.5884499549865723, "step": 154 }, { "epoch": 0.0480184686417853, "grad_norm": 5.875, "learning_rate": 7.948717948717949e-06, "loss": 1.1845054626464844, "step": 156 }, { "epoch": 0.0486340900346287, "grad_norm": 13.5, "learning_rate": 8.051282051282052e-06, "loss": 1.3628064393997192, "step": 158 }, { "epoch": 0.04924971142747211, "grad_norm": 9.6875, "learning_rate": 8.153846153846154e-06, "loss": 1.5696605443954468, "step": 160 }, { "epoch": 0.04986533282031551, "grad_norm": 3.25, "learning_rate": 8.256410256410256e-06, "loss": 1.2648931741714478, "step": 162 }, { "epoch": 0.05048095421315891, "grad_norm": 11.875, "learning_rate": 8.35897435897436e-06, "loss": 1.2277356386184692, "step": 164 }, { "epoch": 0.05109657560600231, "grad_norm": 18.625, "learning_rate": 8.461538461538462e-06, "loss": 1.2374329566955566, "step": 166 }, { "epoch": 0.05171219699884571, "grad_norm": 3.171875, "learning_rate": 8.564102564102564e-06, "loss": 1.7571760416030884, "step": 168 }, { "epoch": 0.05232781839168911, "grad_norm": 2.484375, "learning_rate": 8.666666666666668e-06, "loss": 1.3766837120056152, "step": 170 }, { "epoch": 0.05294343978453251, "grad_norm": 14.375, "learning_rate": 8.76923076923077e-06, "loss": 1.7967778444290161, "step": 172 }, { "epoch": 0.05355906117737591, "grad_norm": 4.3125, "learning_rate": 8.871794871794872e-06, "loss": 1.1553382873535156, "step": 174 }, { "epoch": 0.05417468257021932, "grad_norm": 4.5625, "learning_rate": 8.974358974358976e-06, "loss": 1.4610005617141724, "step": 176 }, { "epoch": 0.05479030396306272, "grad_norm": 15.4375, "learning_rate": 9.076923076923078e-06, "loss": 1.636716604232788, "step": 178 }, { "epoch": 0.05540592535590612, "grad_norm": 6.6875, "learning_rate": 9.17948717948718e-06, "loss": 1.2763687372207642, "step": 180 }, { "epoch": 0.05602154674874952, "grad_norm": 5.65625, "learning_rate": 9.282051282051283e-06, "loss": 1.361682653427124, "step": 182 }, { "epoch": 0.05663716814159292, "grad_norm": 5.46875, "learning_rate": 9.384615384615385e-06, "loss": 0.9292706251144409, "step": 184 }, { "epoch": 0.05725278953443632, "grad_norm": 7.28125, "learning_rate": 9.487179487179487e-06, "loss": 1.634671926498413, "step": 186 }, { "epoch": 0.05786841092727972, "grad_norm": 9.625, "learning_rate": 9.589743589743591e-06, "loss": 1.34225594997406, "step": 188 }, { "epoch": 0.05848403232012313, "grad_norm": 15.625, "learning_rate": 9.692307692307693e-06, "loss": 1.5954524278640747, "step": 190 }, { "epoch": 0.05909965371296653, "grad_norm": 3.0, "learning_rate": 9.794871794871795e-06, "loss": 1.2905179262161255, "step": 192 }, { "epoch": 0.05971527510580993, "grad_norm": 4.5, "learning_rate": 9.897435897435899e-06, "loss": 1.197383165359497, "step": 194 }, { "epoch": 0.06033089649865333, "grad_norm": 5.6875, "learning_rate": 1e-05, "loss": 1.425607442855835, "step": 196 }, { "epoch": 0.06094651789149673, "grad_norm": 2.875, "learning_rate": 9.99999801255517e-06, "loss": 1.2564163208007812, "step": 198 }, { "epoch": 0.06156213928434013, "grad_norm": 12.875, "learning_rate": 9.999992050222649e-06, "loss": 1.3558177947998047, "step": 200 }, { "epoch": 0.06217776067718353, "grad_norm": 5.125, "learning_rate": 9.999982113008366e-06, "loss": 1.5833901166915894, "step": 202 }, { "epoch": 0.06279338207002694, "grad_norm": 2.21875, "learning_rate": 9.999968200922195e-06, "loss": 1.0304534435272217, "step": 204 }, { "epoch": 0.06340900346287033, "grad_norm": 5.53125, "learning_rate": 9.999950313977957e-06, "loss": 1.6564018726348877, "step": 206 }, { "epoch": 0.06402462485571374, "grad_norm": 5.625, "learning_rate": 9.999928452193432e-06, "loss": 1.6364089250564575, "step": 208 }, { "epoch": 0.06464024624855713, "grad_norm": 6.84375, "learning_rate": 9.999902615590342e-06, "loss": 1.6970021724700928, "step": 210 }, { "epoch": 0.06525586764140054, "grad_norm": 7.46875, "learning_rate": 9.999872804194363e-06, "loss": 1.6989271640777588, "step": 212 }, { "epoch": 0.06587148903424395, "grad_norm": 22.375, "learning_rate": 9.999839018035117e-06, "loss": 1.6279175281524658, "step": 214 }, { "epoch": 0.06648711042708734, "grad_norm": 26.25, "learning_rate": 9.99980125714618e-06, "loss": 1.2584187984466553, "step": 216 }, { "epoch": 0.06710273181993075, "grad_norm": 5.96875, "learning_rate": 9.999759521565074e-06, "loss": 1.5264304876327515, "step": 218 }, { "epoch": 0.06771835321277414, "grad_norm": 6.375, "learning_rate": 9.999713811333272e-06, "loss": 1.37717604637146, "step": 220 }, { "epoch": 0.06833397460561755, "grad_norm": 2.5, "learning_rate": 9.9996641264962e-06, "loss": 1.3600963354110718, "step": 222 }, { "epoch": 0.06894959599846094, "grad_norm": 6.6875, "learning_rate": 9.999610467103231e-06, "loss": 1.1031935214996338, "step": 224 }, { "epoch": 0.06956521739130435, "grad_norm": 2.515625, "learning_rate": 9.999552833207684e-06, "loss": 1.3930033445358276, "step": 226 }, { "epoch": 0.07018083878414776, "grad_norm": 5.6875, "learning_rate": 9.999491224866836e-06, "loss": 1.3915252685546875, "step": 228 }, { "epoch": 0.07079646017699115, "grad_norm": 21.0, "learning_rate": 9.999425642141903e-06, "loss": 1.6080518960952759, "step": 230 }, { "epoch": 0.07141208156983456, "grad_norm": 4.40625, "learning_rate": 9.99935608509806e-06, "loss": 1.2827638387680054, "step": 232 }, { "epoch": 0.07202770296267795, "grad_norm": 4.625, "learning_rate": 9.999282553804425e-06, "loss": 1.281058669090271, "step": 234 }, { "epoch": 0.07264332435552136, "grad_norm": 3.265625, "learning_rate": 9.999205048334073e-06, "loss": 1.315993070602417, "step": 236 }, { "epoch": 0.07325894574836475, "grad_norm": 5.34375, "learning_rate": 9.999123568764018e-06, "loss": 1.48131263256073, "step": 238 }, { "epoch": 0.07387456714120816, "grad_norm": 6.1875, "learning_rate": 9.999038115175226e-06, "loss": 1.273392677307129, "step": 240 }, { "epoch": 0.07449018853405155, "grad_norm": 2.71875, "learning_rate": 9.998948687652619e-06, "loss": 1.138009786605835, "step": 242 }, { "epoch": 0.07510580992689496, "grad_norm": 5.40625, "learning_rate": 9.998855286285061e-06, "loss": 1.2859021425247192, "step": 244 }, { "epoch": 0.07572143131973837, "grad_norm": 15.0625, "learning_rate": 9.998757911165368e-06, "loss": 1.5143579244613647, "step": 246 }, { "epoch": 0.07633705271258176, "grad_norm": 4.9375, "learning_rate": 9.998656562390303e-06, "loss": 1.4446684122085571, "step": 248 }, { "epoch": 0.07695267410542517, "grad_norm": 1.71875, "learning_rate": 9.99855124006058e-06, "loss": 1.0485321283340454, "step": 250 }, { "epoch": 0.07756829549826856, "grad_norm": 6.09375, "learning_rate": 9.998441944280854e-06, "loss": 1.4379631280899048, "step": 252 }, { "epoch": 0.07818391689111197, "grad_norm": 4.5, "learning_rate": 9.998328675159746e-06, "loss": 1.640031099319458, "step": 254 }, { "epoch": 0.07879953828395536, "grad_norm": 28.25, "learning_rate": 9.998211432809803e-06, "loss": 1.8261964321136475, "step": 256 }, { "epoch": 0.07941515967679877, "grad_norm": 3.921875, "learning_rate": 9.998090217347537e-06, "loss": 1.4264521598815918, "step": 258 }, { "epoch": 0.08003078106964218, "grad_norm": 1.5625, "learning_rate": 9.997965028893404e-06, "loss": 1.5476711988449097, "step": 260 }, { "epoch": 0.08064640246248557, "grad_norm": 11.0625, "learning_rate": 9.9978358675718e-06, "loss": 1.4974404573440552, "step": 262 }, { "epoch": 0.08126202385532898, "grad_norm": 5.46875, "learning_rate": 9.997702733511082e-06, "loss": 1.5387169122695923, "step": 264 }, { "epoch": 0.08187764524817237, "grad_norm": 4.0625, "learning_rate": 9.997565626843546e-06, "loss": 1.240181803703308, "step": 266 }, { "epoch": 0.08249326664101578, "grad_norm": 5.21875, "learning_rate": 9.997424547705438e-06, "loss": 1.3706841468811035, "step": 268 }, { "epoch": 0.08310888803385917, "grad_norm": 5.8125, "learning_rate": 9.997279496236952e-06, "loss": 1.2704806327819824, "step": 270 }, { "epoch": 0.08372450942670258, "grad_norm": 6.53125, "learning_rate": 9.997130472582228e-06, "loss": 1.5347249507904053, "step": 272 }, { "epoch": 0.08434013081954599, "grad_norm": 4.65625, "learning_rate": 9.996977476889351e-06, "loss": 1.072393536567688, "step": 274 }, { "epoch": 0.08495575221238938, "grad_norm": 5.96875, "learning_rate": 9.996820509310363e-06, "loss": 1.465416669845581, "step": 276 }, { "epoch": 0.08557137360523279, "grad_norm": 4.78125, "learning_rate": 9.996659570001242e-06, "loss": 1.3557649850845337, "step": 278 }, { "epoch": 0.08618699499807618, "grad_norm": 14.875, "learning_rate": 9.996494659121919e-06, "loss": 1.492985486984253, "step": 280 }, { "epoch": 0.08680261639091959, "grad_norm": 8.6875, "learning_rate": 9.996325776836267e-06, "loss": 1.5023902654647827, "step": 282 }, { "epoch": 0.08741823778376298, "grad_norm": 15.125, "learning_rate": 9.996152923312111e-06, "loss": 1.3942416906356812, "step": 284 }, { "epoch": 0.08803385917660639, "grad_norm": 8.3125, "learning_rate": 9.995976098721216e-06, "loss": 1.9787180423736572, "step": 286 }, { "epoch": 0.08864948056944978, "grad_norm": 8.8125, "learning_rate": 9.9957953032393e-06, "loss": 0.9891330599784851, "step": 288 }, { "epoch": 0.08926510196229319, "grad_norm": 11.4375, "learning_rate": 9.995610537046021e-06, "loss": 1.5660462379455566, "step": 290 }, { "epoch": 0.0898807233551366, "grad_norm": 5.3125, "learning_rate": 9.995421800324987e-06, "loss": 1.0548125505447388, "step": 292 }, { "epoch": 0.09049634474797999, "grad_norm": 5.875, "learning_rate": 9.99522909326375e-06, "loss": 1.4109697341918945, "step": 294 }, { "epoch": 0.0911119661408234, "grad_norm": 4.375, "learning_rate": 9.995032416053804e-06, "loss": 1.4787043333053589, "step": 296 }, { "epoch": 0.09172758753366679, "grad_norm": 12.25, "learning_rate": 9.994831768890598e-06, "loss": 1.09103524684906, "step": 298 }, { "epoch": 0.0923432089265102, "grad_norm": 8.3125, "learning_rate": 9.994627151973513e-06, "loss": 1.40105140209198, "step": 300 }, { "epoch": 0.09295883031935359, "grad_norm": 2.171875, "learning_rate": 9.994418565505885e-06, "loss": 1.0116194486618042, "step": 302 }, { "epoch": 0.093574451712197, "grad_norm": 22.375, "learning_rate": 9.994206009694991e-06, "loss": 1.894078016281128, "step": 304 }, { "epoch": 0.0941900731050404, "grad_norm": 12.3125, "learning_rate": 9.99398948475205e-06, "loss": 1.7190589904785156, "step": 306 }, { "epoch": 0.0948056944978838, "grad_norm": 3.75, "learning_rate": 9.993768990892232e-06, "loss": 0.814287543296814, "step": 308 }, { "epoch": 0.0954213158907272, "grad_norm": 9.6875, "learning_rate": 9.993544528334641e-06, "loss": 2.03491473197937, "step": 310 }, { "epoch": 0.0960369372835706, "grad_norm": 11.6875, "learning_rate": 9.993316097302337e-06, "loss": 1.7918344736099243, "step": 312 }, { "epoch": 0.09665255867641401, "grad_norm": 8.875, "learning_rate": 9.993083698022313e-06, "loss": 1.7452021837234497, "step": 314 }, { "epoch": 0.0972681800692574, "grad_norm": 1.7578125, "learning_rate": 9.992847330725507e-06, "loss": 0.6417583227157593, "step": 316 }, { "epoch": 0.09788380146210081, "grad_norm": 4.125, "learning_rate": 9.992606995646807e-06, "loss": 1.199703335762024, "step": 318 }, { "epoch": 0.09849942285494422, "grad_norm": 15.75, "learning_rate": 9.99236269302504e-06, "loss": 0.9593316316604614, "step": 320 }, { "epoch": 0.09911504424778761, "grad_norm": 3.078125, "learning_rate": 9.99211442310297e-06, "loss": 1.2619590759277344, "step": 322 }, { "epoch": 0.09973066564063102, "grad_norm": 6.65625, "learning_rate": 9.991862186127312e-06, "loss": 1.4878219366073608, "step": 324 }, { "epoch": 0.10034628703347441, "grad_norm": 21.875, "learning_rate": 9.99160598234872e-06, "loss": 1.4164918661117554, "step": 326 }, { "epoch": 0.10096190842631782, "grad_norm": 32.5, "learning_rate": 9.991345812021786e-06, "loss": 1.5547345876693726, "step": 328 }, { "epoch": 0.10157752981916121, "grad_norm": 6.875, "learning_rate": 9.991081675405049e-06, "loss": 1.1818456649780273, "step": 330 }, { "epoch": 0.10219315121200462, "grad_norm": 6.5625, "learning_rate": 9.990813572760992e-06, "loss": 1.2009780406951904, "step": 332 }, { "epoch": 0.10280877260484801, "grad_norm": 9.75, "learning_rate": 9.990541504356027e-06, "loss": 1.4748668670654297, "step": 334 }, { "epoch": 0.10342439399769142, "grad_norm": 3.90625, "learning_rate": 9.990265470460516e-06, "loss": 1.1837290525436401, "step": 336 }, { "epoch": 0.10404001539053483, "grad_norm": 9.0, "learning_rate": 9.989985471348765e-06, "loss": 1.0815367698669434, "step": 338 }, { "epoch": 0.10465563678337822, "grad_norm": 16.875, "learning_rate": 9.989701507299013e-06, "loss": 1.576779842376709, "step": 340 }, { "epoch": 0.10527125817622163, "grad_norm": 6.34375, "learning_rate": 9.98941357859344e-06, "loss": 1.180511474609375, "step": 342 }, { "epoch": 0.10588687956906502, "grad_norm": 28.875, "learning_rate": 9.989121685518167e-06, "loss": 1.410712718963623, "step": 344 }, { "epoch": 0.10650250096190843, "grad_norm": 3.375, "learning_rate": 9.988825828363254e-06, "loss": 1.2783693075180054, "step": 346 }, { "epoch": 0.10711812235475182, "grad_norm": 3.234375, "learning_rate": 9.988526007422703e-06, "loss": 1.327851414680481, "step": 348 }, { "epoch": 0.10773374374759523, "grad_norm": 8.0625, "learning_rate": 9.988222222994455e-06, "loss": 0.9698463082313538, "step": 350 }, { "epoch": 0.10834936514043864, "grad_norm": 2.828125, "learning_rate": 9.987914475380382e-06, "loss": 1.1605567932128906, "step": 352 }, { "epoch": 0.10896498653328203, "grad_norm": 6.375, "learning_rate": 9.987602764886304e-06, "loss": 1.5284254550933838, "step": 354 }, { "epoch": 0.10958060792612544, "grad_norm": 5.71875, "learning_rate": 9.987287091821973e-06, "loss": 1.3579685688018799, "step": 356 }, { "epoch": 0.11019622931896883, "grad_norm": 6.125, "learning_rate": 9.98696745650108e-06, "loss": 1.316291093826294, "step": 358 }, { "epoch": 0.11081185071181224, "grad_norm": 6.1875, "learning_rate": 9.986643859241255e-06, "loss": 1.4510215520858765, "step": 360 }, { "epoch": 0.11142747210465563, "grad_norm": 10.3125, "learning_rate": 9.986316300364063e-06, "loss": 1.5002473592758179, "step": 362 }, { "epoch": 0.11204309349749904, "grad_norm": 6.375, "learning_rate": 9.985984780195006e-06, "loss": 1.5161786079406738, "step": 364 }, { "epoch": 0.11265871489034245, "grad_norm": 5.53125, "learning_rate": 9.985649299063524e-06, "loss": 0.5424119830131531, "step": 366 }, { "epoch": 0.11327433628318584, "grad_norm": 12.5, "learning_rate": 9.985309857302992e-06, "loss": 1.6288212537765503, "step": 368 }, { "epoch": 0.11388995767602925, "grad_norm": 10.125, "learning_rate": 9.98496645525072e-06, "loss": 1.3971303701400757, "step": 370 }, { "epoch": 0.11450557906887264, "grad_norm": 2.21875, "learning_rate": 9.984619093247956e-06, "loss": 0.9907740354537964, "step": 372 }, { "epoch": 0.11512120046171605, "grad_norm": 9.625, "learning_rate": 9.98426777163988e-06, "loss": 1.6520015001296997, "step": 374 }, { "epoch": 0.11573682185455944, "grad_norm": 3.84375, "learning_rate": 9.98391249077561e-06, "loss": 1.2411595582962036, "step": 376 }, { "epoch": 0.11635244324740285, "grad_norm": 9.5, "learning_rate": 9.983553251008194e-06, "loss": 1.3712900876998901, "step": 378 }, { "epoch": 0.11696806464024626, "grad_norm": 3.3125, "learning_rate": 9.983190052694618e-06, "loss": 1.1593055725097656, "step": 380 }, { "epoch": 0.11758368603308965, "grad_norm": 1.8359375, "learning_rate": 9.9828228961958e-06, "loss": 1.1706119775772095, "step": 382 }, { "epoch": 0.11819930742593306, "grad_norm": 11.9375, "learning_rate": 9.982451781876592e-06, "loss": 1.6324158906936646, "step": 384 }, { "epoch": 0.11881492881877645, "grad_norm": 6.5625, "learning_rate": 9.982076710105778e-06, "loss": 1.252874493598938, "step": 386 }, { "epoch": 0.11943055021161986, "grad_norm": 6.28125, "learning_rate": 9.981697681256075e-06, "loss": 1.670320749282837, "step": 388 }, { "epoch": 0.12004617160446325, "grad_norm": 6.28125, "learning_rate": 9.981314695704134e-06, "loss": 0.9480584859848022, "step": 390 }, { "epoch": 0.12066179299730666, "grad_norm": 9.6875, "learning_rate": 9.980927753830536e-06, "loss": 1.4103999137878418, "step": 392 }, { "epoch": 0.12127741439015005, "grad_norm": 3.34375, "learning_rate": 9.980536856019793e-06, "loss": 1.3862167596817017, "step": 394 }, { "epoch": 0.12189303578299346, "grad_norm": 4.0625, "learning_rate": 9.980142002660349e-06, "loss": 1.2710118293762207, "step": 396 }, { "epoch": 0.12250865717583687, "grad_norm": 3.890625, "learning_rate": 9.979743194144578e-06, "loss": 1.2509794235229492, "step": 398 }, { "epoch": 0.12312427856868026, "grad_norm": 4.15625, "learning_rate": 9.979340430868786e-06, "loss": 1.3792119026184082, "step": 400 }, { "epoch": 0.12373989996152367, "grad_norm": 5.90625, "learning_rate": 9.978933713233208e-06, "loss": 1.5315032005310059, "step": 402 }, { "epoch": 0.12435552135436706, "grad_norm": 6.375, "learning_rate": 9.978523041642007e-06, "loss": 0.9869703054428101, "step": 404 }, { "epoch": 0.12497114274721047, "grad_norm": 8.875, "learning_rate": 9.97810841650328e-06, "loss": 1.1880018711090088, "step": 406 }, { "epoch": 0.12558676414005387, "grad_norm": 11.3125, "learning_rate": 9.977689838229045e-06, "loss": 1.3651795387268066, "step": 408 }, { "epoch": 0.12620238553289725, "grad_norm": 10.9375, "learning_rate": 9.977267307235255e-06, "loss": 1.3049819469451904, "step": 410 }, { "epoch": 0.12681800692574066, "grad_norm": 4.5, "learning_rate": 9.976840823941789e-06, "loss": 1.2135716676712036, "step": 412 }, { "epoch": 0.12743362831858407, "grad_norm": 6.59375, "learning_rate": 9.97641038877245e-06, "loss": 1.504198670387268, "step": 414 }, { "epoch": 0.12804924971142748, "grad_norm": 8.1875, "learning_rate": 9.975976002154974e-06, "loss": 1.4882495403289795, "step": 416 }, { "epoch": 0.12866487110427088, "grad_norm": 10.1875, "learning_rate": 9.97553766452102e-06, "loss": 0.7723729610443115, "step": 418 }, { "epoch": 0.12928049249711426, "grad_norm": 6.25, "learning_rate": 9.975095376306174e-06, "loss": 1.3737043142318726, "step": 420 }, { "epoch": 0.12989611388995767, "grad_norm": 9.1875, "learning_rate": 9.974649137949947e-06, "loss": 1.471574068069458, "step": 422 }, { "epoch": 0.13051173528280108, "grad_norm": 6.875, "learning_rate": 9.974198949895778e-06, "loss": 0.9776772856712341, "step": 424 }, { "epoch": 0.13112735667564449, "grad_norm": 16.25, "learning_rate": 9.973744812591027e-06, "loss": 1.3625441789627075, "step": 426 }, { "epoch": 0.1317429780684879, "grad_norm": 6.53125, "learning_rate": 9.973286726486979e-06, "loss": 1.225606918334961, "step": 428 }, { "epoch": 0.13235859946133127, "grad_norm": 6.34375, "learning_rate": 9.972824692038846e-06, "loss": 1.3208441734313965, "step": 430 }, { "epoch": 0.13297422085417468, "grad_norm": 12.1875, "learning_rate": 9.972358709705767e-06, "loss": 1.4286389350891113, "step": 432 }, { "epoch": 0.1335898422470181, "grad_norm": 19.0, "learning_rate": 9.971888779950791e-06, "loss": 1.8515408039093018, "step": 434 }, { "epoch": 0.1342054636398615, "grad_norm": 10.1875, "learning_rate": 9.9714149032409e-06, "loss": 1.8427278995513916, "step": 436 }, { "epoch": 0.13482108503270487, "grad_norm": 8.3125, "learning_rate": 9.970937080047001e-06, "loss": 1.406190037727356, "step": 438 }, { "epoch": 0.13543670642554828, "grad_norm": 5.34375, "learning_rate": 9.970455310843911e-06, "loss": 0.9209026098251343, "step": 440 }, { "epoch": 0.1360523278183917, "grad_norm": 15.3125, "learning_rate": 9.969969596110378e-06, "loss": 1.5718461275100708, "step": 442 }, { "epoch": 0.1366679492112351, "grad_norm": 2.890625, "learning_rate": 9.969479936329067e-06, "loss": 1.2917200326919556, "step": 444 }, { "epoch": 0.1372835706040785, "grad_norm": 7.5, "learning_rate": 9.968986331986565e-06, "loss": 1.4151887893676758, "step": 446 }, { "epoch": 0.13789919199692188, "grad_norm": 14.75, "learning_rate": 9.968488783573376e-06, "loss": 1.9977552890777588, "step": 448 }, { "epoch": 0.1385148133897653, "grad_norm": 3.921875, "learning_rate": 9.967987291583924e-06, "loss": 1.2807127237319946, "step": 450 }, { "epoch": 0.1391304347826087, "grad_norm": 20.875, "learning_rate": 9.967481856516559e-06, "loss": 1.5343042612075806, "step": 452 }, { "epoch": 0.1397460561754521, "grad_norm": 7.6875, "learning_rate": 9.966972478873536e-06, "loss": 1.1871122121810913, "step": 454 }, { "epoch": 0.1403616775682955, "grad_norm": 7.96875, "learning_rate": 9.966459159161038e-06, "loss": 1.477895736694336, "step": 456 }, { "epoch": 0.1409772989611389, "grad_norm": 6.53125, "learning_rate": 9.965941897889162e-06, "loss": 1.4448455572128296, "step": 458 }, { "epoch": 0.1415929203539823, "grad_norm": 6.59375, "learning_rate": 9.96542069557192e-06, "loss": 1.834481954574585, "step": 460 }, { "epoch": 0.1422085417468257, "grad_norm": 10.5625, "learning_rate": 9.96489555272725e-06, "loss": 1.6769435405731201, "step": 462 }, { "epoch": 0.1428241631396691, "grad_norm": 5.34375, "learning_rate": 9.96436646987699e-06, "loss": 1.369468092918396, "step": 464 }, { "epoch": 0.1434397845325125, "grad_norm": 6.78125, "learning_rate": 9.963833447546903e-06, "loss": 1.3754758834838867, "step": 466 }, { "epoch": 0.1440554059253559, "grad_norm": 7.34375, "learning_rate": 9.963296486266667e-06, "loss": 1.7363370656967163, "step": 468 }, { "epoch": 0.1446710273181993, "grad_norm": 5.125, "learning_rate": 9.962755586569873e-06, "loss": 1.634676218032837, "step": 470 }, { "epoch": 0.14528664871104272, "grad_norm": 8.0625, "learning_rate": 9.962210748994023e-06, "loss": 1.3580645322799683, "step": 472 }, { "epoch": 0.14590227010388612, "grad_norm": 4.0, "learning_rate": 9.961661974080537e-06, "loss": 1.469011902809143, "step": 474 }, { "epoch": 0.1465178914967295, "grad_norm": 8.25, "learning_rate": 9.961109262374742e-06, "loss": 1.6970473527908325, "step": 476 }, { "epoch": 0.1471335128895729, "grad_norm": 2.546875, "learning_rate": 9.960552614425882e-06, "loss": 1.0189458131790161, "step": 478 }, { "epoch": 0.14774913428241632, "grad_norm": 11.0625, "learning_rate": 9.959992030787111e-06, "loss": 1.264059066772461, "step": 480 }, { "epoch": 0.14836475567525972, "grad_norm": 4.3125, "learning_rate": 9.959427512015491e-06, "loss": 1.3091117143630981, "step": 482 }, { "epoch": 0.1489803770681031, "grad_norm": 14.375, "learning_rate": 9.958859058671999e-06, "loss": 1.2481650114059448, "step": 484 }, { "epoch": 0.1495959984609465, "grad_norm": 5.28125, "learning_rate": 9.95828667132152e-06, "loss": 1.513774037361145, "step": 486 }, { "epoch": 0.15021161985378992, "grad_norm": 11.375, "learning_rate": 9.957710350532846e-06, "loss": 1.5790436267852783, "step": 488 }, { "epoch": 0.15082724124663333, "grad_norm": 10.5625, "learning_rate": 9.957130096878682e-06, "loss": 1.4887685775756836, "step": 490 }, { "epoch": 0.15144286263947673, "grad_norm": 3.546875, "learning_rate": 9.956545910935637e-06, "loss": 1.1842014789581299, "step": 492 }, { "epoch": 0.1520584840323201, "grad_norm": 6.65625, "learning_rate": 9.955957793284234e-06, "loss": 0.8832247257232666, "step": 494 }, { "epoch": 0.15267410542516352, "grad_norm": 8.0625, "learning_rate": 9.955365744508893e-06, "loss": 1.473832130432129, "step": 496 }, { "epoch": 0.15328972681800693, "grad_norm": 10.0625, "learning_rate": 9.954769765197952e-06, "loss": 1.6755919456481934, "step": 498 }, { "epoch": 0.15390534821085033, "grad_norm": 4.5, "learning_rate": 9.954169855943643e-06, "loss": 1.207249402999878, "step": 500 }, { "epoch": 0.15452096960369374, "grad_norm": 7.1875, "learning_rate": 9.953566017342113e-06, "loss": 1.4508755207061768, "step": 502 }, { "epoch": 0.15513659099653712, "grad_norm": 7.59375, "learning_rate": 9.95295824999341e-06, "loss": 1.6686768531799316, "step": 504 }, { "epoch": 0.15575221238938053, "grad_norm": 7.28125, "learning_rate": 9.952346554501485e-06, "loss": 1.4300891160964966, "step": 506 }, { "epoch": 0.15636783378222394, "grad_norm": 9.375, "learning_rate": 9.951730931474192e-06, "loss": 1.3210822343826294, "step": 508 }, { "epoch": 0.15698345517506734, "grad_norm": 6.03125, "learning_rate": 9.951111381523291e-06, "loss": 1.4945731163024902, "step": 510 }, { "epoch": 0.15759907656791072, "grad_norm": 11.875, "learning_rate": 9.950487905264445e-06, "loss": 1.7546916007995605, "step": 512 }, { "epoch": 0.15821469796075413, "grad_norm": 8.0625, "learning_rate": 9.949860503317213e-06, "loss": 1.4959115982055664, "step": 514 }, { "epoch": 0.15883031935359754, "grad_norm": 7.28125, "learning_rate": 9.94922917630506e-06, "loss": 1.3217384815216064, "step": 516 }, { "epoch": 0.15944594074644095, "grad_norm": 9.625, "learning_rate": 9.948593924855347e-06, "loss": 1.45122492313385, "step": 518 }, { "epoch": 0.16006156213928435, "grad_norm": 16.375, "learning_rate": 9.947954749599343e-06, "loss": 1.3676655292510986, "step": 520 }, { "epoch": 0.16067718353212773, "grad_norm": 11.4375, "learning_rate": 9.947311651172205e-06, "loss": 1.78343665599823, "step": 522 }, { "epoch": 0.16129280492497114, "grad_norm": 8.6875, "learning_rate": 9.946664630212998e-06, "loss": 1.3944776058197021, "step": 524 }, { "epoch": 0.16190842631781455, "grad_norm": 8.6875, "learning_rate": 9.94601368736468e-06, "loss": 1.7285417318344116, "step": 526 }, { "epoch": 0.16252404771065795, "grad_norm": 8.9375, "learning_rate": 9.945358823274107e-06, "loss": 1.4867370128631592, "step": 528 }, { "epoch": 0.16313966910350133, "grad_norm": 3.96875, "learning_rate": 9.944700038592034e-06, "loss": 0.9708921909332275, "step": 530 }, { "epoch": 0.16375529049634474, "grad_norm": 3.125, "learning_rate": 9.944037333973109e-06, "loss": 1.3556715250015259, "step": 532 }, { "epoch": 0.16437091188918815, "grad_norm": 5.625, "learning_rate": 9.943370710075877e-06, "loss": 1.361598253250122, "step": 534 }, { "epoch": 0.16498653328203156, "grad_norm": 8.5, "learning_rate": 9.942700167562774e-06, "loss": 1.3754247426986694, "step": 536 }, { "epoch": 0.16560215467487496, "grad_norm": 15.75, "learning_rate": 9.942025707100139e-06, "loss": 1.7027689218521118, "step": 538 }, { "epoch": 0.16621777606771834, "grad_norm": 6.375, "learning_rate": 9.941347329358193e-06, "loss": 1.3407535552978516, "step": 540 }, { "epoch": 0.16683339746056175, "grad_norm": 5.03125, "learning_rate": 9.940665035011057e-06, "loss": 1.505167007446289, "step": 542 }, { "epoch": 0.16744901885340516, "grad_norm": 6.46875, "learning_rate": 9.939978824736742e-06, "loss": 1.297150731086731, "step": 544 }, { "epoch": 0.16806464024624856, "grad_norm": 7.09375, "learning_rate": 9.939288699217152e-06, "loss": 0.9072936773300171, "step": 546 }, { "epoch": 0.16868026163909197, "grad_norm": 7.625, "learning_rate": 9.938594659138078e-06, "loss": 1.2618991136550903, "step": 548 }, { "epoch": 0.16929588303193535, "grad_norm": 39.25, "learning_rate": 9.937896705189207e-06, "loss": 1.3387863636016846, "step": 550 }, { "epoch": 0.16991150442477876, "grad_norm": 15.8125, "learning_rate": 9.937194838064106e-06, "loss": 1.590695858001709, "step": 552 }, { "epoch": 0.17052712581762217, "grad_norm": 6.96875, "learning_rate": 9.93648905846024e-06, "loss": 1.4013009071350098, "step": 554 }, { "epoch": 0.17114274721046557, "grad_norm": 9.9375, "learning_rate": 9.935779367078958e-06, "loss": 1.6209547519683838, "step": 556 }, { "epoch": 0.17175836860330895, "grad_norm": 5.1875, "learning_rate": 9.935065764625493e-06, "loss": 1.6528769731521606, "step": 558 }, { "epoch": 0.17237398999615236, "grad_norm": 12.25, "learning_rate": 9.934348251808972e-06, "loss": 1.3280056715011597, "step": 560 }, { "epoch": 0.17298961138899577, "grad_norm": 5.5, "learning_rate": 9.9336268293424e-06, "loss": 0.9575151801109314, "step": 562 }, { "epoch": 0.17360523278183917, "grad_norm": 7.9375, "learning_rate": 9.932901497942672e-06, "loss": 1.3487428426742554, "step": 564 }, { "epoch": 0.17422085417468258, "grad_norm": 2.828125, "learning_rate": 9.932172258330566e-06, "loss": 0.9193823337554932, "step": 566 }, { "epoch": 0.17483647556752596, "grad_norm": 4.4375, "learning_rate": 9.931439111230745e-06, "loss": 1.2267365455627441, "step": 568 }, { "epoch": 0.17545209696036937, "grad_norm": 8.8125, "learning_rate": 9.930702057371752e-06, "loss": 1.499516487121582, "step": 570 }, { "epoch": 0.17606771835321278, "grad_norm": 7.96875, "learning_rate": 9.929961097486018e-06, "loss": 1.337449312210083, "step": 572 }, { "epoch": 0.17668333974605618, "grad_norm": 3.921875, "learning_rate": 9.929216232309845e-06, "loss": 1.2182761430740356, "step": 574 }, { "epoch": 0.17729896113889956, "grad_norm": 8.375, "learning_rate": 9.928467462583425e-06, "loss": 1.60722815990448, "step": 576 }, { "epoch": 0.17791458253174297, "grad_norm": 3.515625, "learning_rate": 9.927714789050828e-06, "loss": 0.9616989493370056, "step": 578 }, { "epoch": 0.17853020392458638, "grad_norm": 6.78125, "learning_rate": 9.926958212460002e-06, "loss": 0.9722899794578552, "step": 580 }, { "epoch": 0.17914582531742979, "grad_norm": 75.0, "learning_rate": 9.926197733562774e-06, "loss": 1.2725977897644043, "step": 582 }, { "epoch": 0.1797614467102732, "grad_norm": 27.125, "learning_rate": 9.925433353114851e-06, "loss": 1.7348570823669434, "step": 584 }, { "epoch": 0.18037706810311657, "grad_norm": 10.5, "learning_rate": 9.924665071875812e-06, "loss": 1.5623992681503296, "step": 586 }, { "epoch": 0.18099268949595998, "grad_norm": 8.25, "learning_rate": 9.923892890609118e-06, "loss": 1.6208124160766602, "step": 588 }, { "epoch": 0.1816083108888034, "grad_norm": 6.875, "learning_rate": 9.923116810082096e-06, "loss": 1.3482056856155396, "step": 590 }, { "epoch": 0.1822239322816468, "grad_norm": 3.15625, "learning_rate": 9.922336831065966e-06, "loss": 1.4589314460754395, "step": 592 }, { "epoch": 0.1828395536744902, "grad_norm": 6.5, "learning_rate": 9.9215529543358e-06, "loss": 1.4144929647445679, "step": 594 }, { "epoch": 0.18345517506733358, "grad_norm": 4.03125, "learning_rate": 9.920765180670562e-06, "loss": 1.389893651008606, "step": 596 }, { "epoch": 0.184070796460177, "grad_norm": 2.875, "learning_rate": 9.919973510853076e-06, "loss": 0.9373916387557983, "step": 598 }, { "epoch": 0.1846864178530204, "grad_norm": 7.4375, "learning_rate": 9.91917794567004e-06, "loss": 0.8495765924453735, "step": 600 }, { "epoch": 0.1853020392458638, "grad_norm": 11.25, "learning_rate": 9.91837848591203e-06, "loss": 1.3704453706741333, "step": 602 }, { "epoch": 0.18591766063870718, "grad_norm": 17.625, "learning_rate": 9.917575132373485e-06, "loss": 1.4358397722244263, "step": 604 }, { "epoch": 0.1865332820315506, "grad_norm": 6.0, "learning_rate": 9.916767885852716e-06, "loss": 1.389540195465088, "step": 606 }, { "epoch": 0.187148903424394, "grad_norm": 4.90625, "learning_rate": 9.9159567471519e-06, "loss": 1.5843164920806885, "step": 608 }, { "epoch": 0.1877645248172374, "grad_norm": 5.03125, "learning_rate": 9.915141717077087e-06, "loss": 1.5799970626831055, "step": 610 }, { "epoch": 0.1883801462100808, "grad_norm": 15.375, "learning_rate": 9.914322796438185e-06, "loss": 0.6345373392105103, "step": 612 }, { "epoch": 0.1889957676029242, "grad_norm": 5.90625, "learning_rate": 9.913499986048979e-06, "loss": 1.2795158624649048, "step": 614 }, { "epoch": 0.1896113889957676, "grad_norm": 14.125, "learning_rate": 9.912673286727112e-06, "loss": 1.3732866048812866, "step": 616 }, { "epoch": 0.190227010388611, "grad_norm": 16.5, "learning_rate": 9.911842699294095e-06, "loss": 1.2570321559906006, "step": 618 }, { "epoch": 0.1908426317814544, "grad_norm": 5.53125, "learning_rate": 9.9110082245753e-06, "loss": 0.9575303792953491, "step": 620 }, { "epoch": 0.1914582531742978, "grad_norm": 12.25, "learning_rate": 9.910169863399964e-06, "loss": 1.4983577728271484, "step": 622 }, { "epoch": 0.1920738745671412, "grad_norm": 33.75, "learning_rate": 9.909327616601185e-06, "loss": 1.6086472272872925, "step": 624 }, { "epoch": 0.1926894959599846, "grad_norm": 14.8125, "learning_rate": 9.908481485015922e-06, "loss": 0.9734293818473816, "step": 626 }, { "epoch": 0.19330511735282802, "grad_norm": 10.5625, "learning_rate": 9.907631469484997e-06, "loss": 1.6982768774032593, "step": 628 }, { "epoch": 0.19392073874567142, "grad_norm": 11.25, "learning_rate": 9.906777570853088e-06, "loss": 1.8723676204681396, "step": 630 }, { "epoch": 0.1945363601385148, "grad_norm": 7.28125, "learning_rate": 9.90591978996873e-06, "loss": 1.3249781131744385, "step": 632 }, { "epoch": 0.1951519815313582, "grad_norm": 3.8125, "learning_rate": 9.905058127684326e-06, "loss": 1.2242438793182373, "step": 634 }, { "epoch": 0.19576760292420162, "grad_norm": 5.90625, "learning_rate": 9.904192584856123e-06, "loss": 1.4721521139144897, "step": 636 }, { "epoch": 0.19638322431704502, "grad_norm": 10.875, "learning_rate": 9.903323162344234e-06, "loss": 1.2291452884674072, "step": 638 }, { "epoch": 0.19699884570988843, "grad_norm": 10.8125, "learning_rate": 9.902449861012622e-06, "loss": 1.4630839824676514, "step": 640 }, { "epoch": 0.1976144671027318, "grad_norm": 6.59375, "learning_rate": 9.901572681729106e-06, "loss": 1.6249059438705444, "step": 642 }, { "epoch": 0.19823008849557522, "grad_norm": 7.75, "learning_rate": 9.90069162536536e-06, "loss": 1.314393162727356, "step": 644 }, { "epoch": 0.19884570988841863, "grad_norm": 6.8125, "learning_rate": 9.899806692796907e-06, "loss": 1.394429326057434, "step": 646 }, { "epoch": 0.19946133128126203, "grad_norm": 12.125, "learning_rate": 9.898917884903127e-06, "loss": 1.1286661624908447, "step": 648 }, { "epoch": 0.2000769526741054, "grad_norm": 5.84375, "learning_rate": 9.898025202567247e-06, "loss": 1.5071682929992676, "step": 650 }, { "epoch": 0.20069257406694882, "grad_norm": 8.4375, "learning_rate": 9.897128646676349e-06, "loss": 1.4699369668960571, "step": 652 }, { "epoch": 0.20130819545979223, "grad_norm": 4.90625, "learning_rate": 9.896228218121353e-06, "loss": 1.037987470626831, "step": 654 }, { "epoch": 0.20192381685263563, "grad_norm": 11.8125, "learning_rate": 9.895323917797042e-06, "loss": 1.6456223726272583, "step": 656 }, { "epoch": 0.20253943824547904, "grad_norm": 97.0, "learning_rate": 9.894415746602035e-06, "loss": 1.4587563276290894, "step": 658 }, { "epoch": 0.20315505963832242, "grad_norm": 5.875, "learning_rate": 9.893503705438806e-06, "loss": 1.4608798027038574, "step": 660 }, { "epoch": 0.20377068103116583, "grad_norm": 3.0, "learning_rate": 9.892587795213666e-06, "loss": 1.3375227451324463, "step": 662 }, { "epoch": 0.20438630242400924, "grad_norm": 15.625, "learning_rate": 9.891668016836782e-06, "loss": 1.3512498140335083, "step": 664 }, { "epoch": 0.20500192381685264, "grad_norm": 6.78125, "learning_rate": 9.890744371222152e-06, "loss": 0.7749507427215576, "step": 666 }, { "epoch": 0.20561754520969602, "grad_norm": 6.25, "learning_rate": 9.889816859287627e-06, "loss": 1.158947229385376, "step": 668 }, { "epoch": 0.20623316660253943, "grad_norm": 4.59375, "learning_rate": 9.888885481954895e-06, "loss": 1.391932487487793, "step": 670 }, { "epoch": 0.20684878799538284, "grad_norm": 5.09375, "learning_rate": 9.887950240149486e-06, "loss": 1.5135208368301392, "step": 672 }, { "epoch": 0.20746440938822625, "grad_norm": 7.375, "learning_rate": 9.887011134800774e-06, "loss": 1.2985072135925293, "step": 674 }, { "epoch": 0.20808003078106965, "grad_norm": 8.5625, "learning_rate": 9.886068166841966e-06, "loss": 1.1860077381134033, "step": 676 }, { "epoch": 0.20869565217391303, "grad_norm": 6.40625, "learning_rate": 9.88512133721011e-06, "loss": 1.6630322933197021, "step": 678 }, { "epoch": 0.20931127356675644, "grad_norm": 13.0, "learning_rate": 9.884170646846096e-06, "loss": 1.4792208671569824, "step": 680 }, { "epoch": 0.20992689495959985, "grad_norm": 14.6875, "learning_rate": 9.883216096694641e-06, "loss": 1.2808235883712769, "step": 682 }, { "epoch": 0.21054251635244325, "grad_norm": 3.9375, "learning_rate": 9.882257687704304e-06, "loss": 1.296074628829956, "step": 684 }, { "epoch": 0.21115813774528666, "grad_norm": 8.875, "learning_rate": 9.881295420827482e-06, "loss": 1.5456552505493164, "step": 686 }, { "epoch": 0.21177375913813004, "grad_norm": 7.96875, "learning_rate": 9.880329297020394e-06, "loss": 1.8220405578613281, "step": 688 }, { "epoch": 0.21238938053097345, "grad_norm": 3.90625, "learning_rate": 9.879359317243104e-06, "loss": 1.4817025661468506, "step": 690 }, { "epoch": 0.21300500192381686, "grad_norm": 10.125, "learning_rate": 9.878385482459505e-06, "loss": 1.1662178039550781, "step": 692 }, { "epoch": 0.21362062331666026, "grad_norm": 11.375, "learning_rate": 9.87740779363731e-06, "loss": 1.4008355140686035, "step": 694 }, { "epoch": 0.21423624470950364, "grad_norm": 8.8125, "learning_rate": 9.876426251748079e-06, "loss": 1.5103132724761963, "step": 696 }, { "epoch": 0.21485186610234705, "grad_norm": 4.0, "learning_rate": 9.875440857767187e-06, "loss": 1.3361725807189941, "step": 698 }, { "epoch": 0.21546748749519046, "grad_norm": 4.0, "learning_rate": 9.874451612673841e-06, "loss": 1.2430728673934937, "step": 700 }, { "epoch": 0.21608310888803386, "grad_norm": 4.75, "learning_rate": 9.87345851745108e-06, "loss": 1.2919254302978516, "step": 702 }, { "epoch": 0.21669873028087727, "grad_norm": 6.65625, "learning_rate": 9.872461573085766e-06, "loss": 1.1002001762390137, "step": 704 }, { "epoch": 0.21731435167372065, "grad_norm": 11.1875, "learning_rate": 9.871460780568578e-06, "loss": 1.653430700302124, "step": 706 }, { "epoch": 0.21792997306656406, "grad_norm": 57.25, "learning_rate": 9.870456140894033e-06, "loss": 1.343963384628296, "step": 708 }, { "epoch": 0.21854559445940747, "grad_norm": 12.5625, "learning_rate": 9.869447655060463e-06, "loss": 1.4002281427383423, "step": 710 }, { "epoch": 0.21916121585225087, "grad_norm": 6.75, "learning_rate": 9.868435324070016e-06, "loss": 1.4317114353179932, "step": 712 }, { "epoch": 0.21977683724509428, "grad_norm": 8.0625, "learning_rate": 9.867419148928677e-06, "loss": 1.3845818042755127, "step": 714 }, { "epoch": 0.22039245863793766, "grad_norm": 6.0625, "learning_rate": 9.866399130646238e-06, "loss": 1.2754722833633423, "step": 716 }, { "epoch": 0.22100808003078107, "grad_norm": 10.5625, "learning_rate": 9.865375270236314e-06, "loss": 1.173915147781372, "step": 718 }, { "epoch": 0.22162370142362448, "grad_norm": 6.65625, "learning_rate": 9.864347568716337e-06, "loss": 1.476589322090149, "step": 720 }, { "epoch": 0.22223932281646788, "grad_norm": 10.375, "learning_rate": 9.863316027107561e-06, "loss": 1.672022819519043, "step": 722 }, { "epoch": 0.22285494420931126, "grad_norm": 12.25, "learning_rate": 9.862280646435048e-06, "loss": 0.7103063464164734, "step": 724 }, { "epoch": 0.22347056560215467, "grad_norm": 5.875, "learning_rate": 9.86124142772768e-06, "loss": 1.5016530752182007, "step": 726 }, { "epoch": 0.22408618699499808, "grad_norm": 6.78125, "learning_rate": 9.860198372018153e-06, "loss": 1.6842578649520874, "step": 728 }, { "epoch": 0.22470180838784148, "grad_norm": 22.375, "learning_rate": 9.859151480342975e-06, "loss": 1.2881923913955688, "step": 730 }, { "epoch": 0.2253174297806849, "grad_norm": 4.53125, "learning_rate": 9.858100753742463e-06, "loss": 1.1919798851013184, "step": 732 }, { "epoch": 0.22593305117352827, "grad_norm": 5.59375, "learning_rate": 9.857046193260751e-06, "loss": 1.1353979110717773, "step": 734 }, { "epoch": 0.22654867256637168, "grad_norm": 8.9375, "learning_rate": 9.855987799945777e-06, "loss": 1.460937261581421, "step": 736 }, { "epoch": 0.22716429395921509, "grad_norm": 7.65625, "learning_rate": 9.854925574849292e-06, "loss": 1.3833016157150269, "step": 738 }, { "epoch": 0.2277799153520585, "grad_norm": 30.125, "learning_rate": 9.853859519026852e-06, "loss": 1.7745245695114136, "step": 740 }, { "epoch": 0.22839553674490187, "grad_norm": 7.53125, "learning_rate": 9.852789633537818e-06, "loss": 1.6859065294265747, "step": 742 }, { "epoch": 0.22901115813774528, "grad_norm": 6.34375, "learning_rate": 9.851715919445364e-06, "loss": 0.9433289170265198, "step": 744 }, { "epoch": 0.2296267795305887, "grad_norm": 9.125, "learning_rate": 9.85063837781646e-06, "loss": 1.363451361656189, "step": 746 }, { "epoch": 0.2302424009234321, "grad_norm": 8.1875, "learning_rate": 9.849557009721885e-06, "loss": 1.1893389225006104, "step": 748 }, { "epoch": 0.2308580223162755, "grad_norm": 3.84375, "learning_rate": 9.84847181623622e-06, "loss": 1.209572672843933, "step": 750 }, { "epoch": 0.23147364370911888, "grad_norm": 12.125, "learning_rate": 9.847382798437843e-06, "loss": 1.3902331590652466, "step": 752 }, { "epoch": 0.2320892651019623, "grad_norm": 5.875, "learning_rate": 9.846289957408939e-06, "loss": 1.4305294752120972, "step": 754 }, { "epoch": 0.2327048864948057, "grad_norm": 5.28125, "learning_rate": 9.845193294235484e-06, "loss": 1.384616732597351, "step": 756 }, { "epoch": 0.2333205078876491, "grad_norm": 5.40625, "learning_rate": 9.84409281000726e-06, "loss": 1.3366835117340088, "step": 758 }, { "epoch": 0.2339361292804925, "grad_norm": 4.09375, "learning_rate": 9.842988505817843e-06, "loss": 1.4528403282165527, "step": 760 }, { "epoch": 0.2345517506733359, "grad_norm": 4.78125, "learning_rate": 9.841880382764604e-06, "loss": 1.1686084270477295, "step": 762 }, { "epoch": 0.2351673720661793, "grad_norm": 6.125, "learning_rate": 9.84076844194871e-06, "loss": 1.7286490201950073, "step": 764 }, { "epoch": 0.2357829934590227, "grad_norm": 11.4375, "learning_rate": 9.839652684475118e-06, "loss": 1.6478850841522217, "step": 766 }, { "epoch": 0.2363986148518661, "grad_norm": 6.5625, "learning_rate": 9.838533111452586e-06, "loss": 1.5273628234863281, "step": 768 }, { "epoch": 0.2370142362447095, "grad_norm": 10.0, "learning_rate": 9.837409723993658e-06, "loss": 1.1861430406570435, "step": 770 }, { "epoch": 0.2376298576375529, "grad_norm": 4.5625, "learning_rate": 9.83628252321467e-06, "loss": 1.1912150382995605, "step": 772 }, { "epoch": 0.2382454790303963, "grad_norm": 18.5, "learning_rate": 9.835151510235744e-06, "loss": 1.6655570268630981, "step": 774 }, { "epoch": 0.23886110042323971, "grad_norm": 5.0, "learning_rate": 9.834016686180794e-06, "loss": 1.0292619466781616, "step": 776 }, { "epoch": 0.23947672181608312, "grad_norm": 6.90625, "learning_rate": 9.83287805217752e-06, "loss": 1.4104214906692505, "step": 778 }, { "epoch": 0.2400923432089265, "grad_norm": 12.875, "learning_rate": 9.831735609357408e-06, "loss": 1.3935370445251465, "step": 780 }, { "epoch": 0.2407079646017699, "grad_norm": 11.1875, "learning_rate": 9.83058935885573e-06, "loss": 1.1317634582519531, "step": 782 }, { "epoch": 0.24132358599461332, "grad_norm": 2.84375, "learning_rate": 9.82943930181154e-06, "loss": 1.459189772605896, "step": 784 }, { "epoch": 0.24193920738745672, "grad_norm": 8.625, "learning_rate": 9.828285439367678e-06, "loss": 1.6490992307662964, "step": 786 }, { "epoch": 0.2425548287803001, "grad_norm": 34.75, "learning_rate": 9.827127772670758e-06, "loss": 1.3461780548095703, "step": 788 }, { "epoch": 0.2431704501731435, "grad_norm": 6.21875, "learning_rate": 9.825966302871183e-06, "loss": 1.4193799495697021, "step": 790 }, { "epoch": 0.24378607156598692, "grad_norm": 5.28125, "learning_rate": 9.82480103112313e-06, "loss": 1.273714542388916, "step": 792 }, { "epoch": 0.24440169295883032, "grad_norm": 11.0625, "learning_rate": 9.823631958584556e-06, "loss": 1.255758285522461, "step": 794 }, { "epoch": 0.24501731435167373, "grad_norm": 5.9375, "learning_rate": 9.822459086417195e-06, "loss": 1.5950769186019897, "step": 796 }, { "epoch": 0.2456329357445171, "grad_norm": 6.96875, "learning_rate": 9.821282415786557e-06, "loss": 1.4654479026794434, "step": 798 }, { "epoch": 0.24624855713736052, "grad_norm": 10.625, "learning_rate": 9.820101947861927e-06, "loss": 1.5545507669448853, "step": 800 }, { "epoch": 0.24686417853020393, "grad_norm": 6.75, "learning_rate": 9.818917683816358e-06, "loss": 1.6426501274108887, "step": 802 }, { "epoch": 0.24747979992304733, "grad_norm": 7.40625, "learning_rate": 9.817729624826681e-06, "loss": 1.5924813747406006, "step": 804 }, { "epoch": 0.24809542131589074, "grad_norm": 16.25, "learning_rate": 9.816537772073502e-06, "loss": 1.791450023651123, "step": 806 }, { "epoch": 0.24871104270873412, "grad_norm": 14.375, "learning_rate": 9.815342126741185e-06, "loss": 1.4027529954910278, "step": 808 }, { "epoch": 0.24932666410157753, "grad_norm": 14.625, "learning_rate": 9.814142690017875e-06, "loss": 0.960854709148407, "step": 810 }, { "epoch": 0.24994228549442093, "grad_norm": 5.9375, "learning_rate": 9.812939463095476e-06, "loss": 1.6002954244613647, "step": 812 }, { "epoch": 0.2505579068872643, "grad_norm": 2.90625, "learning_rate": 9.811732447169662e-06, "loss": 1.1075432300567627, "step": 814 }, { "epoch": 0.25117352828010775, "grad_norm": 4.9375, "learning_rate": 9.810521643439872e-06, "loss": 1.1781002283096313, "step": 816 }, { "epoch": 0.25178914967295113, "grad_norm": 20.75, "learning_rate": 9.80930705310931e-06, "loss": 1.0230952501296997, "step": 818 }, { "epoch": 0.2524047710657945, "grad_norm": 1.7265625, "learning_rate": 9.808088677384939e-06, "loss": 1.1417490243911743, "step": 820 }, { "epoch": 0.25302039245863794, "grad_norm": 10.6875, "learning_rate": 9.806866517477487e-06, "loss": 1.560872197151184, "step": 822 }, { "epoch": 0.2536360138514813, "grad_norm": 3.0625, "learning_rate": 9.805640574601443e-06, "loss": 1.2669354677200317, "step": 824 }, { "epoch": 0.25425163524432476, "grad_norm": 5.03125, "learning_rate": 9.804410849975056e-06, "loss": 1.2113425731658936, "step": 826 }, { "epoch": 0.25486725663716814, "grad_norm": 8.75, "learning_rate": 9.803177344820326e-06, "loss": 0.6720989942550659, "step": 828 }, { "epoch": 0.2554828780300115, "grad_norm": 8.3125, "learning_rate": 9.801940060363018e-06, "loss": 1.5107386112213135, "step": 830 }, { "epoch": 0.25609849942285495, "grad_norm": 2.140625, "learning_rate": 9.800698997832647e-06, "loss": 1.2716776132583618, "step": 832 }, { "epoch": 0.25671412081569833, "grad_norm": 6.46875, "learning_rate": 9.799454158462487e-06, "loss": 1.4024817943572998, "step": 834 }, { "epoch": 0.25732974220854177, "grad_norm": 12.25, "learning_rate": 9.798205543489562e-06, "loss": 1.0063973665237427, "step": 836 }, { "epoch": 0.25794536360138515, "grad_norm": 9.5625, "learning_rate": 9.79695315415465e-06, "loss": 0.7014191150665283, "step": 838 }, { "epoch": 0.2585609849942285, "grad_norm": 19.625, "learning_rate": 9.795696991702274e-06, "loss": 1.6371238231658936, "step": 840 }, { "epoch": 0.25917660638707196, "grad_norm": 7.03125, "learning_rate": 9.794437057380714e-06, "loss": 1.376318097114563, "step": 842 }, { "epoch": 0.25979222777991534, "grad_norm": 4.5, "learning_rate": 9.793173352441996e-06, "loss": 1.3747721910476685, "step": 844 }, { "epoch": 0.2604078491727588, "grad_norm": 9.125, "learning_rate": 9.791905878141891e-06, "loss": 1.455966830253601, "step": 846 }, { "epoch": 0.26102347056560216, "grad_norm": 149.0, "learning_rate": 9.790634635739915e-06, "loss": 1.793878197669983, "step": 848 }, { "epoch": 0.26163909195844554, "grad_norm": 37.5, "learning_rate": 9.789359626499332e-06, "loss": 1.0845096111297607, "step": 850 }, { "epoch": 0.26225471335128897, "grad_norm": 9.9375, "learning_rate": 9.788080851687145e-06, "loss": 1.3753288984298706, "step": 852 }, { "epoch": 0.26287033474413235, "grad_norm": 5.15625, "learning_rate": 9.786798312574104e-06, "loss": 1.1450377702713013, "step": 854 }, { "epoch": 0.2634859561369758, "grad_norm": 11.625, "learning_rate": 9.785512010434695e-06, "loss": 1.6636018753051758, "step": 856 }, { "epoch": 0.26410157752981916, "grad_norm": 8.5, "learning_rate": 9.784221946547146e-06, "loss": 1.6864784955978394, "step": 858 }, { "epoch": 0.26471719892266254, "grad_norm": 6.1875, "learning_rate": 9.782928122193423e-06, "loss": 1.5488227605819702, "step": 860 }, { "epoch": 0.265332820315506, "grad_norm": 7.78125, "learning_rate": 9.781630538659226e-06, "loss": 1.5389184951782227, "step": 862 }, { "epoch": 0.26594844170834936, "grad_norm": 13.125, "learning_rate": 9.780329197233995e-06, "loss": 1.5071114301681519, "step": 864 }, { "epoch": 0.2665640631011928, "grad_norm": 496.0, "learning_rate": 9.7790240992109e-06, "loss": 0.7931236624717712, "step": 866 }, { "epoch": 0.2671796844940362, "grad_norm": 4.21875, "learning_rate": 9.777715245886852e-06, "loss": 1.1507668495178223, "step": 868 }, { "epoch": 0.26779530588687955, "grad_norm": 5.8125, "learning_rate": 9.77640263856248e-06, "loss": 1.1638729572296143, "step": 870 }, { "epoch": 0.268410927279723, "grad_norm": 4.34375, "learning_rate": 9.775086278542156e-06, "loss": 1.0706069469451904, "step": 872 }, { "epoch": 0.26902654867256637, "grad_norm": 4.65625, "learning_rate": 9.773766167133976e-06, "loss": 1.0477044582366943, "step": 874 }, { "epoch": 0.26964217006540975, "grad_norm": 5.59375, "learning_rate": 9.77244230564976e-06, "loss": 0.9538878798484802, "step": 876 }, { "epoch": 0.2702577914582532, "grad_norm": 9.625, "learning_rate": 9.771114695405066e-06, "loss": 1.4235448837280273, "step": 878 }, { "epoch": 0.27087341285109656, "grad_norm": 7.21875, "learning_rate": 9.769783337719166e-06, "loss": 1.4780068397521973, "step": 880 }, { "epoch": 0.27148903424394, "grad_norm": 26.0, "learning_rate": 9.76844823391506e-06, "loss": 0.8643165230751038, "step": 882 }, { "epoch": 0.2721046556367834, "grad_norm": 9.75, "learning_rate": 9.767109385319472e-06, "loss": 1.3260691165924072, "step": 884 }, { "epoch": 0.27272027702962676, "grad_norm": 11.125, "learning_rate": 9.765766793262843e-06, "loss": 1.1863996982574463, "step": 886 }, { "epoch": 0.2733358984224702, "grad_norm": 7.0, "learning_rate": 9.76442045907934e-06, "loss": 1.3834737539291382, "step": 888 }, { "epoch": 0.27395151981531357, "grad_norm": 14.1875, "learning_rate": 9.763070384106845e-06, "loss": 1.594760775566101, "step": 890 }, { "epoch": 0.274567141208157, "grad_norm": 23.625, "learning_rate": 9.761716569686954e-06, "loss": 1.3638572692871094, "step": 892 }, { "epoch": 0.2751827626010004, "grad_norm": 5.21875, "learning_rate": 9.760359017164989e-06, "loss": 1.6811443567276, "step": 894 }, { "epoch": 0.27579838399384377, "grad_norm": 8.1875, "learning_rate": 9.758997727889977e-06, "loss": 1.3500550985336304, "step": 896 }, { "epoch": 0.2764140053866872, "grad_norm": 5.875, "learning_rate": 9.75763270321466e-06, "loss": 1.2751091718673706, "step": 898 }, { "epoch": 0.2770296267795306, "grad_norm": 5.75, "learning_rate": 9.756263944495495e-06, "loss": 1.177546501159668, "step": 900 }, { "epoch": 0.277645248172374, "grad_norm": 5.40625, "learning_rate": 9.754891453092649e-06, "loss": 1.2334669828414917, "step": 902 }, { "epoch": 0.2782608695652174, "grad_norm": 3.71875, "learning_rate": 9.753515230369997e-06, "loss": 1.0920450687408447, "step": 904 }, { "epoch": 0.2788764909580608, "grad_norm": 5.15625, "learning_rate": 9.752135277695122e-06, "loss": 1.4145758152008057, "step": 906 }, { "epoch": 0.2794921123509042, "grad_norm": 5.6875, "learning_rate": 9.750751596439316e-06, "loss": 1.1625487804412842, "step": 908 }, { "epoch": 0.2801077337437476, "grad_norm": 8.375, "learning_rate": 9.749364187977572e-06, "loss": 1.4057106971740723, "step": 910 }, { "epoch": 0.280723355136591, "grad_norm": 16.125, "learning_rate": 9.747973053688589e-06, "loss": 0.9219164252281189, "step": 912 }, { "epoch": 0.2813389765294344, "grad_norm": 6.71875, "learning_rate": 9.746578194954767e-06, "loss": 0.9533755779266357, "step": 914 }, { "epoch": 0.2819545979222778, "grad_norm": 2.953125, "learning_rate": 9.745179613162213e-06, "loss": 0.9482744932174683, "step": 916 }, { "epoch": 0.2825702193151212, "grad_norm": 9.75, "learning_rate": 9.743777309700724e-06, "loss": 1.7051759958267212, "step": 918 }, { "epoch": 0.2831858407079646, "grad_norm": 6.03125, "learning_rate": 9.742371285963802e-06, "loss": 1.216848611831665, "step": 920 }, { "epoch": 0.283801462100808, "grad_norm": 9.0625, "learning_rate": 9.740961543348648e-06, "loss": 1.3270268440246582, "step": 922 }, { "epoch": 0.2844170834936514, "grad_norm": 4.5, "learning_rate": 9.73954808325615e-06, "loss": 0.9035344123840332, "step": 924 }, { "epoch": 0.2850327048864948, "grad_norm": 55.25, "learning_rate": 9.738130907090895e-06, "loss": 1.51797616481781, "step": 926 }, { "epoch": 0.2856483262793382, "grad_norm": 12.375, "learning_rate": 9.736710016261166e-06, "loss": 1.508449673652649, "step": 928 }, { "epoch": 0.2862639476721816, "grad_norm": 9.6875, "learning_rate": 9.735285412178931e-06, "loss": 1.2803070545196533, "step": 930 }, { "epoch": 0.286879569065025, "grad_norm": 11.25, "learning_rate": 9.733857096259854e-06, "loss": 1.1159873008728027, "step": 932 }, { "epoch": 0.2874951904578684, "grad_norm": 13.75, "learning_rate": 9.732425069923282e-06, "loss": 1.1685740947723389, "step": 934 }, { "epoch": 0.2881108118507118, "grad_norm": 7.09375, "learning_rate": 9.730989334592252e-06, "loss": 1.297022819519043, "step": 936 }, { "epoch": 0.28872643324355524, "grad_norm": 3.609375, "learning_rate": 9.729549891693487e-06, "loss": 1.2184970378875732, "step": 938 }, { "epoch": 0.2893420546363986, "grad_norm": 5.0625, "learning_rate": 9.728106742657394e-06, "loss": 1.3542568683624268, "step": 940 }, { "epoch": 0.289957676029242, "grad_norm": 2.296875, "learning_rate": 9.726659888918065e-06, "loss": 1.019122838973999, "step": 942 }, { "epoch": 0.29057329742208543, "grad_norm": 9.0, "learning_rate": 9.725209331913266e-06, "loss": 0.9411934614181519, "step": 944 }, { "epoch": 0.2911889188149288, "grad_norm": 7.34375, "learning_rate": 9.723755073084449e-06, "loss": 1.2213878631591797, "step": 946 }, { "epoch": 0.29180454020777224, "grad_norm": 8.5, "learning_rate": 9.722297113876744e-06, "loss": 1.7289658784866333, "step": 948 }, { "epoch": 0.2924201616006156, "grad_norm": 5.3125, "learning_rate": 9.720835455738961e-06, "loss": 1.3057657480239868, "step": 950 }, { "epoch": 0.293035782993459, "grad_norm": 25.625, "learning_rate": 9.71937010012358e-06, "loss": 1.4629443883895874, "step": 952 }, { "epoch": 0.29365140438630244, "grad_norm": 4.34375, "learning_rate": 9.717901048486758e-06, "loss": 1.1471151113510132, "step": 954 }, { "epoch": 0.2942670257791458, "grad_norm": 3.609375, "learning_rate": 9.716428302288323e-06, "loss": 1.144550085067749, "step": 956 }, { "epoch": 0.29488264717198925, "grad_norm": 5.78125, "learning_rate": 9.714951862991777e-06, "loss": 1.3617162704467773, "step": 958 }, { "epoch": 0.29549826856483263, "grad_norm": 7.3125, "learning_rate": 9.713471732064293e-06, "loss": 1.593535304069519, "step": 960 }, { "epoch": 0.296113889957676, "grad_norm": 13.5625, "learning_rate": 9.711987910976705e-06, "loss": 1.3249685764312744, "step": 962 }, { "epoch": 0.29672951135051945, "grad_norm": 5.96875, "learning_rate": 9.710500401203525e-06, "loss": 1.3375357389450073, "step": 964 }, { "epoch": 0.2973451327433628, "grad_norm": 4.9375, "learning_rate": 9.709009204222923e-06, "loss": 1.483339786529541, "step": 966 }, { "epoch": 0.2979607541362062, "grad_norm": 10.875, "learning_rate": 9.707514321516734e-06, "loss": 1.0293675661087036, "step": 968 }, { "epoch": 0.29857637552904964, "grad_norm": 5.5625, "learning_rate": 9.706015754570452e-06, "loss": 1.0801160335540771, "step": 970 }, { "epoch": 0.299191996921893, "grad_norm": 5.25, "learning_rate": 9.704513504873247e-06, "loss": 0.9405486583709717, "step": 972 }, { "epoch": 0.29980761831473646, "grad_norm": 5.65625, "learning_rate": 9.70300757391793e-06, "loss": 1.3500492572784424, "step": 974 }, { "epoch": 0.30042323970757984, "grad_norm": 26.875, "learning_rate": 9.70149796320098e-06, "loss": 1.480463981628418, "step": 976 }, { "epoch": 0.3010388611004232, "grad_norm": 5.25, "learning_rate": 9.699984674222534e-06, "loss": 1.1579872369766235, "step": 978 }, { "epoch": 0.30165448249326665, "grad_norm": 6.78125, "learning_rate": 9.698467708486379e-06, "loss": 1.4822840690612793, "step": 980 }, { "epoch": 0.30227010388611003, "grad_norm": 9.1875, "learning_rate": 9.696947067499958e-06, "loss": 1.5733311176300049, "step": 982 }, { "epoch": 0.30288572527895347, "grad_norm": 3.640625, "learning_rate": 9.695422752774364e-06, "loss": 1.190136432647705, "step": 984 }, { "epoch": 0.30350134667179685, "grad_norm": 6.8125, "learning_rate": 9.693894765824345e-06, "loss": 1.3894709348678589, "step": 986 }, { "epoch": 0.3041169680646402, "grad_norm": 7.25, "learning_rate": 9.692363108168294e-06, "loss": 1.3427432775497437, "step": 988 }, { "epoch": 0.30473258945748366, "grad_norm": 9.4375, "learning_rate": 9.690827781328259e-06, "loss": 1.1161162853240967, "step": 990 }, { "epoch": 0.30534821085032704, "grad_norm": 11.3125, "learning_rate": 9.689288786829922e-06, "loss": 1.4082365036010742, "step": 992 }, { "epoch": 0.3059638322431705, "grad_norm": 11.0625, "learning_rate": 9.68774612620262e-06, "loss": 1.6027635335922241, "step": 994 }, { "epoch": 0.30657945363601385, "grad_norm": 6.53125, "learning_rate": 9.686199800979328e-06, "loss": 1.2858593463897705, "step": 996 }, { "epoch": 0.30719507502885723, "grad_norm": 4.28125, "learning_rate": 9.684649812696665e-06, "loss": 1.340716004371643, "step": 998 }, { "epoch": 0.30781069642170067, "grad_norm": 12.5, "learning_rate": 9.68309616289489e-06, "loss": 1.1984376907348633, "step": 1000 }, { "epoch": 0.30842631781454405, "grad_norm": 5.4375, "learning_rate": 9.681538853117896e-06, "loss": 1.6487531661987305, "step": 1002 }, { "epoch": 0.3090419392073875, "grad_norm": 4.46875, "learning_rate": 9.679977884913219e-06, "loss": 1.0971587896347046, "step": 1004 }, { "epoch": 0.30965756060023086, "grad_norm": 7.28125, "learning_rate": 9.67841325983203e-06, "loss": 1.6865112781524658, "step": 1006 }, { "epoch": 0.31027318199307424, "grad_norm": 2.109375, "learning_rate": 9.676844979429127e-06, "loss": 1.3177874088287354, "step": 1008 }, { "epoch": 0.3108888033859177, "grad_norm": 8.125, "learning_rate": 9.67527304526295e-06, "loss": 1.2586731910705566, "step": 1010 }, { "epoch": 0.31150442477876106, "grad_norm": 5.625, "learning_rate": 9.673697458895563e-06, "loss": 1.2238794565200806, "step": 1012 }, { "epoch": 0.31212004617160444, "grad_norm": 4.96875, "learning_rate": 9.672118221892663e-06, "loss": 1.1530698537826538, "step": 1014 }, { "epoch": 0.3127356675644479, "grad_norm": 7.0, "learning_rate": 9.670535335823572e-06, "loss": 1.3727072477340698, "step": 1016 }, { "epoch": 0.31335128895729125, "grad_norm": 8.4375, "learning_rate": 9.66894880226124e-06, "loss": 1.4163241386413574, "step": 1018 }, { "epoch": 0.3139669103501347, "grad_norm": 7.84375, "learning_rate": 9.667358622782242e-06, "loss": 1.5218753814697266, "step": 1020 }, { "epoch": 0.31458253174297807, "grad_norm": 11.875, "learning_rate": 9.66576479896677e-06, "loss": 1.2111762762069702, "step": 1022 }, { "epoch": 0.31519815313582145, "grad_norm": 6.78125, "learning_rate": 9.664167332398649e-06, "loss": 1.0470290184020996, "step": 1024 }, { "epoch": 0.3158137745286649, "grad_norm": 11.25, "learning_rate": 9.662566224665313e-06, "loss": 0.7057480812072754, "step": 1026 }, { "epoch": 0.31642939592150826, "grad_norm": 6.125, "learning_rate": 9.66096147735782e-06, "loss": 1.3530975580215454, "step": 1028 }, { "epoch": 0.3170450173143517, "grad_norm": 12.625, "learning_rate": 9.659353092070844e-06, "loss": 1.084864854812622, "step": 1030 }, { "epoch": 0.3176606387071951, "grad_norm": 9.5625, "learning_rate": 9.657741070402673e-06, "loss": 1.4985663890838623, "step": 1032 }, { "epoch": 0.31827626010003846, "grad_norm": 4.75, "learning_rate": 9.65612541395521e-06, "loss": 1.4467542171478271, "step": 1034 }, { "epoch": 0.3188918814928819, "grad_norm": 9.8125, "learning_rate": 9.65450612433397e-06, "loss": 1.5565028190612793, "step": 1036 }, { "epoch": 0.31950750288572527, "grad_norm": 17.125, "learning_rate": 9.65288320314807e-06, "loss": 1.085126519203186, "step": 1038 }, { "epoch": 0.3201231242785687, "grad_norm": 4.53125, "learning_rate": 9.651256652010252e-06, "loss": 1.497908115386963, "step": 1040 }, { "epoch": 0.3207387456714121, "grad_norm": 12.9375, "learning_rate": 9.64962647253685e-06, "loss": 1.504968285560608, "step": 1042 }, { "epoch": 0.32135436706425546, "grad_norm": 7.03125, "learning_rate": 9.647992666347816e-06, "loss": 1.4675335884094238, "step": 1044 }, { "epoch": 0.3219699884570989, "grad_norm": 7.46875, "learning_rate": 9.646355235066696e-06, "loss": 1.2435938119888306, "step": 1046 }, { "epoch": 0.3225856098499423, "grad_norm": 4.21875, "learning_rate": 9.644714180320642e-06, "loss": 1.3065686225891113, "step": 1048 }, { "epoch": 0.3232012312427857, "grad_norm": 9.625, "learning_rate": 9.64306950374041e-06, "loss": 1.2061657905578613, "step": 1050 }, { "epoch": 0.3238168526356291, "grad_norm": 10.0625, "learning_rate": 9.641421206960347e-06, "loss": 1.10911226272583, "step": 1052 }, { "epoch": 0.3244324740284725, "grad_norm": 11.5, "learning_rate": 9.639769291618406e-06, "loss": 0.9055336117744446, "step": 1054 }, { "epoch": 0.3250480954213159, "grad_norm": 4.21875, "learning_rate": 9.638113759356132e-06, "loss": 1.2694463729858398, "step": 1056 }, { "epoch": 0.3256637168141593, "grad_norm": 3.25, "learning_rate": 9.636454611818665e-06, "loss": 1.3356084823608398, "step": 1058 }, { "epoch": 0.32627933820700267, "grad_norm": 7.84375, "learning_rate": 9.634791850654735e-06, "loss": 1.2268829345703125, "step": 1060 }, { "epoch": 0.3268949595998461, "grad_norm": 5.21875, "learning_rate": 9.633125477516663e-06, "loss": 1.638374924659729, "step": 1062 }, { "epoch": 0.3275105809926895, "grad_norm": 8.5, "learning_rate": 9.631455494060369e-06, "loss": 1.517810344696045, "step": 1064 }, { "epoch": 0.3281262023855329, "grad_norm": 44.75, "learning_rate": 9.629781901945345e-06, "loss": 1.184501051902771, "step": 1066 }, { "epoch": 0.3287418237783763, "grad_norm": 6.59375, "learning_rate": 9.628104702834681e-06, "loss": 0.9123966693878174, "step": 1068 }, { "epoch": 0.3293574451712197, "grad_norm": 11.3125, "learning_rate": 9.62642389839505e-06, "loss": 1.200285792350769, "step": 1070 }, { "epoch": 0.3299730665640631, "grad_norm": 18.5, "learning_rate": 9.6247394902967e-06, "loss": 1.7133913040161133, "step": 1072 }, { "epoch": 0.3305886879569065, "grad_norm": 11.75, "learning_rate": 9.623051480213468e-06, "loss": 1.683318018913269, "step": 1074 }, { "epoch": 0.3312043093497499, "grad_norm": 6.125, "learning_rate": 9.621359869822764e-06, "loss": 1.5562039613723755, "step": 1076 }, { "epoch": 0.3318199307425933, "grad_norm": 8.875, "learning_rate": 9.619664660805583e-06, "loss": 1.3498424291610718, "step": 1078 }, { "epoch": 0.3324355521354367, "grad_norm": 2.71875, "learning_rate": 9.617965854846492e-06, "loss": 1.4478331804275513, "step": 1080 }, { "epoch": 0.3330511735282801, "grad_norm": 9.75, "learning_rate": 9.616263453633628e-06, "loss": 1.3443846702575684, "step": 1082 }, { "epoch": 0.3336667949211235, "grad_norm": 5.125, "learning_rate": 9.614557458858712e-06, "loss": 1.2382532358169556, "step": 1084 }, { "epoch": 0.33428241631396693, "grad_norm": 4.125, "learning_rate": 9.612847872217023e-06, "loss": 1.3172005414962769, "step": 1086 }, { "epoch": 0.3348980377068103, "grad_norm": 14.1875, "learning_rate": 9.61113469540742e-06, "loss": 0.9626773595809937, "step": 1088 }, { "epoch": 0.3355136590996537, "grad_norm": 2.328125, "learning_rate": 9.609417930132324e-06, "loss": 1.2234991788864136, "step": 1090 }, { "epoch": 0.33612928049249713, "grad_norm": 9.5, "learning_rate": 9.607697578097721e-06, "loss": 1.1397364139556885, "step": 1092 }, { "epoch": 0.3367449018853405, "grad_norm": 2.796875, "learning_rate": 9.605973641013166e-06, "loss": 0.4958764910697937, "step": 1094 }, { "epoch": 0.33736052327818394, "grad_norm": 6.375, "learning_rate": 9.604246120591774e-06, "loss": 1.32682204246521, "step": 1096 }, { "epoch": 0.3379761446710273, "grad_norm": 12.375, "learning_rate": 9.602515018550217e-06, "loss": 1.1351960897445679, "step": 1098 }, { "epoch": 0.3385917660638707, "grad_norm": 6.40625, "learning_rate": 9.600780336608735e-06, "loss": 1.3413037061691284, "step": 1100 }, { "epoch": 0.33920738745671414, "grad_norm": 7.59375, "learning_rate": 9.599042076491118e-06, "loss": 1.577768087387085, "step": 1102 }, { "epoch": 0.3398230088495575, "grad_norm": 23.625, "learning_rate": 9.597300239924714e-06, "loss": 1.0736544132232666, "step": 1104 }, { "epoch": 0.3404386302424009, "grad_norm": 9.125, "learning_rate": 9.595554828640426e-06, "loss": 1.1022412776947021, "step": 1106 }, { "epoch": 0.34105425163524433, "grad_norm": 5.15625, "learning_rate": 9.593805844372706e-06, "loss": 1.1736547946929932, "step": 1108 }, { "epoch": 0.3416698730280877, "grad_norm": 9.0625, "learning_rate": 9.592053288859559e-06, "loss": 1.372632622718811, "step": 1110 }, { "epoch": 0.34228549442093115, "grad_norm": 5.96875, "learning_rate": 9.59029716384254e-06, "loss": 1.2505266666412354, "step": 1112 }, { "epoch": 0.3429011158137745, "grad_norm": 6.5, "learning_rate": 9.588537471066755e-06, "loss": 1.4534019231796265, "step": 1114 }, { "epoch": 0.3435167372066179, "grad_norm": 9.0, "learning_rate": 9.586774212280841e-06, "loss": 1.7671773433685303, "step": 1116 }, { "epoch": 0.34413235859946134, "grad_norm": 6.875, "learning_rate": 9.58500738923699e-06, "loss": 1.3340649604797363, "step": 1118 }, { "epoch": 0.3447479799923047, "grad_norm": 4.90625, "learning_rate": 9.583237003690939e-06, "loss": 1.4870283603668213, "step": 1120 }, { "epoch": 0.34536360138514816, "grad_norm": 6.40625, "learning_rate": 9.581463057401954e-06, "loss": 1.010940432548523, "step": 1122 }, { "epoch": 0.34597922277799154, "grad_norm": 5.75, "learning_rate": 9.57968555213285e-06, "loss": 1.4298595190048218, "step": 1124 }, { "epoch": 0.3465948441708349, "grad_norm": 20.625, "learning_rate": 9.577904489649968e-06, "loss": 1.5162783861160278, "step": 1126 }, { "epoch": 0.34721046556367835, "grad_norm": 4.625, "learning_rate": 9.576119871723194e-06, "loss": 1.3182761669158936, "step": 1128 }, { "epoch": 0.34782608695652173, "grad_norm": 11.375, "learning_rate": 9.57433170012594e-06, "loss": 1.7118732929229736, "step": 1130 }, { "epoch": 0.34844170834936516, "grad_norm": 7.71875, "learning_rate": 9.572539976635158e-06, "loss": 1.2914398908615112, "step": 1132 }, { "epoch": 0.34905732974220854, "grad_norm": 8.375, "learning_rate": 9.570744703031319e-06, "loss": 1.6061583757400513, "step": 1134 }, { "epoch": 0.3496729511350519, "grad_norm": 6.21875, "learning_rate": 9.568945881098426e-06, "loss": 1.5233328342437744, "step": 1136 }, { "epoch": 0.35028857252789536, "grad_norm": 2.0, "learning_rate": 9.567143512624009e-06, "loss": 1.2799041271209717, "step": 1138 }, { "epoch": 0.35090419392073874, "grad_norm": 7.8125, "learning_rate": 9.565337599399126e-06, "loss": 1.3228352069854736, "step": 1140 }, { "epoch": 0.3515198153135822, "grad_norm": 2.65625, "learning_rate": 9.563528143218346e-06, "loss": 1.082732915878296, "step": 1142 }, { "epoch": 0.35213543670642555, "grad_norm": 7.40625, "learning_rate": 9.561715145879773e-06, "loss": 1.2160236835479736, "step": 1144 }, { "epoch": 0.35275105809926893, "grad_norm": 19.75, "learning_rate": 9.559898609185023e-06, "loss": 1.4509515762329102, "step": 1146 }, { "epoch": 0.35336667949211237, "grad_norm": 5.59375, "learning_rate": 9.558078534939223e-06, "loss": 1.3905575275421143, "step": 1148 }, { "epoch": 0.35398230088495575, "grad_norm": 6.34375, "learning_rate": 9.556254924951026e-06, "loss": 1.4463965892791748, "step": 1150 }, { "epoch": 0.3545979222777991, "grad_norm": 10.75, "learning_rate": 9.554427781032597e-06, "loss": 1.6196813583374023, "step": 1152 }, { "epoch": 0.35521354367064256, "grad_norm": 16.625, "learning_rate": 9.552597104999606e-06, "loss": 1.157198429107666, "step": 1154 }, { "epoch": 0.35582916506348594, "grad_norm": 6.3125, "learning_rate": 9.550762898671235e-06, "loss": 1.513045310974121, "step": 1156 }, { "epoch": 0.3564447864563294, "grad_norm": 5.21875, "learning_rate": 9.54892516387018e-06, "loss": 1.1897473335266113, "step": 1158 }, { "epoch": 0.35706040784917276, "grad_norm": 11.3125, "learning_rate": 9.547083902422636e-06, "loss": 1.3680248260498047, "step": 1160 }, { "epoch": 0.35767602924201614, "grad_norm": 9.0625, "learning_rate": 9.545239116158308e-06, "loss": 1.395264744758606, "step": 1162 }, { "epoch": 0.35829165063485957, "grad_norm": 6.0, "learning_rate": 9.543390806910403e-06, "loss": 1.5195459127426147, "step": 1164 }, { "epoch": 0.35890727202770295, "grad_norm": 10.0625, "learning_rate": 9.541538976515624e-06, "loss": 1.1251592636108398, "step": 1166 }, { "epoch": 0.3595228934205464, "grad_norm": 5.65625, "learning_rate": 9.539683626814176e-06, "loss": 1.3194764852523804, "step": 1168 }, { "epoch": 0.36013851481338977, "grad_norm": 12.3125, "learning_rate": 9.537824759649763e-06, "loss": 1.413508415222168, "step": 1170 }, { "epoch": 0.36075413620623314, "grad_norm": 5.0, "learning_rate": 9.535962376869582e-06, "loss": 1.5969830751419067, "step": 1172 }, { "epoch": 0.3613697575990766, "grad_norm": 16.375, "learning_rate": 9.534096480324329e-06, "loss": 1.6726967096328735, "step": 1174 }, { "epoch": 0.36198537899191996, "grad_norm": 5.03125, "learning_rate": 9.532227071868183e-06, "loss": 1.2074344158172607, "step": 1176 }, { "epoch": 0.3626010003847634, "grad_norm": 8.0625, "learning_rate": 9.530354153358817e-06, "loss": 1.5075865983963013, "step": 1178 }, { "epoch": 0.3632166217776068, "grad_norm": 7.875, "learning_rate": 9.528477726657393e-06, "loss": 1.1990852355957031, "step": 1180 }, { "epoch": 0.36383224317045015, "grad_norm": 8.8125, "learning_rate": 9.526597793628558e-06, "loss": 1.5207868814468384, "step": 1182 }, { "epoch": 0.3644478645632936, "grad_norm": 17.75, "learning_rate": 9.524714356140443e-06, "loss": 1.3340822458267212, "step": 1184 }, { "epoch": 0.36506348595613697, "grad_norm": 7.40625, "learning_rate": 9.522827416064664e-06, "loss": 1.3735941648483276, "step": 1186 }, { "epoch": 0.3656791073489804, "grad_norm": 5.59375, "learning_rate": 9.520936975276316e-06, "loss": 1.2616640329360962, "step": 1188 }, { "epoch": 0.3662947287418238, "grad_norm": 8.5625, "learning_rate": 9.51904303565397e-06, "loss": 1.4502698183059692, "step": 1190 }, { "epoch": 0.36691035013466716, "grad_norm": 7.3125, "learning_rate": 9.517145599079675e-06, "loss": 1.1244027614593506, "step": 1192 }, { "epoch": 0.3675259715275106, "grad_norm": 8.3125, "learning_rate": 9.51524466743896e-06, "loss": 1.3728070259094238, "step": 1194 }, { "epoch": 0.368141592920354, "grad_norm": 13.875, "learning_rate": 9.513340242620823e-06, "loss": 1.3410530090332031, "step": 1196 }, { "epoch": 0.36875721431319736, "grad_norm": 8.3125, "learning_rate": 9.511432326517731e-06, "loss": 1.8403198719024658, "step": 1198 }, { "epoch": 0.3693728357060408, "grad_norm": 16.25, "learning_rate": 9.509520921025626e-06, "loss": 1.5507339239120483, "step": 1200 }, { "epoch": 0.36998845709888417, "grad_norm": 2.765625, "learning_rate": 9.507606028043912e-06, "loss": 1.3664857149124146, "step": 1202 }, { "epoch": 0.3706040784917276, "grad_norm": 8.25, "learning_rate": 9.50568764947546e-06, "loss": 1.2702953815460205, "step": 1204 }, { "epoch": 0.371219699884571, "grad_norm": 16.75, "learning_rate": 9.50376578722661e-06, "loss": 1.535370945930481, "step": 1206 }, { "epoch": 0.37183532127741437, "grad_norm": 8.25, "learning_rate": 9.50184044320716e-06, "loss": 1.6450167894363403, "step": 1208 }, { "epoch": 0.3724509426702578, "grad_norm": 9.125, "learning_rate": 9.499911619330359e-06, "loss": 1.3015737533569336, "step": 1210 }, { "epoch": 0.3730665640631012, "grad_norm": 6.53125, "learning_rate": 9.497979317512933e-06, "loss": 1.4610661268234253, "step": 1212 }, { "epoch": 0.3736821854559446, "grad_norm": 6.65625, "learning_rate": 9.496043539675048e-06, "loss": 0.986057460308075, "step": 1214 }, { "epoch": 0.374297806848788, "grad_norm": 15.0, "learning_rate": 9.494104287740332e-06, "loss": 1.6070356369018555, "step": 1216 }, { "epoch": 0.3749134282416314, "grad_norm": 4.9375, "learning_rate": 9.492161563635857e-06, "loss": 0.9645931720733643, "step": 1218 }, { "epoch": 0.3755290496344748, "grad_norm": 3.015625, "learning_rate": 9.490215369292162e-06, "loss": 1.0386302471160889, "step": 1220 }, { "epoch": 0.3761446710273182, "grad_norm": 5.78125, "learning_rate": 9.488265706643216e-06, "loss": 1.4799411296844482, "step": 1222 }, { "epoch": 0.3767602924201616, "grad_norm": 5.53125, "learning_rate": 9.486312577626446e-06, "loss": 1.4030064344406128, "step": 1224 }, { "epoch": 0.377375913813005, "grad_norm": 4.96875, "learning_rate": 9.484355984182718e-06, "loss": 1.2825661897659302, "step": 1226 }, { "epoch": 0.3779915352058484, "grad_norm": 5.46875, "learning_rate": 9.482395928256345e-06, "loss": 1.2394254207611084, "step": 1228 }, { "epoch": 0.3786071565986918, "grad_norm": 5.625, "learning_rate": 9.480432411795075e-06, "loss": 1.317694902420044, "step": 1230 }, { "epoch": 0.3792227779915352, "grad_norm": 15.6875, "learning_rate": 9.478465436750103e-06, "loss": 1.3715949058532715, "step": 1232 }, { "epoch": 0.37983839938437863, "grad_norm": 5.875, "learning_rate": 9.476495005076054e-06, "loss": 1.4082399606704712, "step": 1234 }, { "epoch": 0.380454020777222, "grad_norm": 6.6875, "learning_rate": 9.474521118730988e-06, "loss": 1.38877534866333, "step": 1236 }, { "epoch": 0.3810696421700654, "grad_norm": 5.46875, "learning_rate": 9.472543779676402e-06, "loss": 1.8902709484100342, "step": 1238 }, { "epoch": 0.3816852635629088, "grad_norm": 6.40625, "learning_rate": 9.470562989877224e-06, "loss": 1.2837282419204712, "step": 1240 }, { "epoch": 0.3823008849557522, "grad_norm": 4.625, "learning_rate": 9.468578751301806e-06, "loss": 1.8792657852172852, "step": 1242 }, { "epoch": 0.3829165063485956, "grad_norm": 4.875, "learning_rate": 9.466591065921932e-06, "loss": 1.319744348526001, "step": 1244 }, { "epoch": 0.383532127741439, "grad_norm": 5.03125, "learning_rate": 9.46459993571281e-06, "loss": 1.664703607559204, "step": 1246 }, { "epoch": 0.3841477491342824, "grad_norm": 5.96875, "learning_rate": 9.46260536265307e-06, "loss": 1.2072725296020508, "step": 1248 }, { "epoch": 0.38476337052712584, "grad_norm": 6.90625, "learning_rate": 9.460607348724763e-06, "loss": 1.4186588525772095, "step": 1250 }, { "epoch": 0.3853789919199692, "grad_norm": 22.875, "learning_rate": 9.458605895913362e-06, "loss": 1.2559492588043213, "step": 1252 }, { "epoch": 0.3859946133128126, "grad_norm": 2.34375, "learning_rate": 9.456601006207755e-06, "loss": 1.1489924192428589, "step": 1254 }, { "epoch": 0.38661023470565603, "grad_norm": 10.1875, "learning_rate": 9.454592681600246e-06, "loss": 1.3234691619873047, "step": 1256 }, { "epoch": 0.3872258560984994, "grad_norm": 6.34375, "learning_rate": 9.45258092408655e-06, "loss": 1.5079069137573242, "step": 1258 }, { "epoch": 0.38784147749134285, "grad_norm": 7.34375, "learning_rate": 9.450565735665797e-06, "loss": 1.5823462009429932, "step": 1260 }, { "epoch": 0.3884570988841862, "grad_norm": 6.09375, "learning_rate": 9.448547118340528e-06, "loss": 1.4394303560256958, "step": 1262 }, { "epoch": 0.3890727202770296, "grad_norm": 6.28125, "learning_rate": 9.446525074116684e-06, "loss": 1.6312642097473145, "step": 1264 }, { "epoch": 0.38968834166987304, "grad_norm": 2.421875, "learning_rate": 9.444499605003614e-06, "loss": 1.2395602464675903, "step": 1266 }, { "epoch": 0.3903039630627164, "grad_norm": 3.0, "learning_rate": 9.44247071301408e-06, "loss": 1.1280606985092163, "step": 1268 }, { "epoch": 0.39091958445555985, "grad_norm": 6.59375, "learning_rate": 9.440438400164232e-06, "loss": 1.2219340801239014, "step": 1270 }, { "epoch": 0.39153520584840323, "grad_norm": 8.4375, "learning_rate": 9.438402668473623e-06, "loss": 1.7627100944519043, "step": 1272 }, { "epoch": 0.3921508272412466, "grad_norm": 13.25, "learning_rate": 9.436363519965209e-06, "loss": 1.527239203453064, "step": 1274 }, { "epoch": 0.39276644863409005, "grad_norm": 6.3125, "learning_rate": 9.434320956665335e-06, "loss": 1.3796216249465942, "step": 1276 }, { "epoch": 0.39338207002693343, "grad_norm": 12.5, "learning_rate": 9.432274980603745e-06, "loss": 1.1514739990234375, "step": 1278 }, { "epoch": 0.39399769141977686, "grad_norm": 3.84375, "learning_rate": 9.430225593813567e-06, "loss": 1.2350151538848877, "step": 1280 }, { "epoch": 0.39461331281262024, "grad_norm": 16.25, "learning_rate": 9.428172798331328e-06, "loss": 0.8400089740753174, "step": 1282 }, { "epoch": 0.3952289342054636, "grad_norm": 4.0625, "learning_rate": 9.426116596196933e-06, "loss": 1.1161513328552246, "step": 1284 }, { "epoch": 0.39584455559830706, "grad_norm": 4.40625, "learning_rate": 9.424056989453677e-06, "loss": 1.2691034078598022, "step": 1286 }, { "epoch": 0.39646017699115044, "grad_norm": 6.28125, "learning_rate": 9.421993980148237e-06, "loss": 1.256693720817566, "step": 1288 }, { "epoch": 0.3970757983839938, "grad_norm": 5.0625, "learning_rate": 9.419927570330672e-06, "loss": 1.4276587963104248, "step": 1290 }, { "epoch": 0.39769141977683725, "grad_norm": 6.3125, "learning_rate": 9.417857762054418e-06, "loss": 1.7910493612289429, "step": 1292 }, { "epoch": 0.39830704116968063, "grad_norm": 6.03125, "learning_rate": 9.415784557376296e-06, "loss": 1.6821799278259277, "step": 1294 }, { "epoch": 0.39892266256252407, "grad_norm": 12.1875, "learning_rate": 9.413707958356489e-06, "loss": 1.3325204849243164, "step": 1296 }, { "epoch": 0.39953828395536745, "grad_norm": 6.28125, "learning_rate": 9.411627967058563e-06, "loss": 1.311694860458374, "step": 1298 }, { "epoch": 0.4001539053482108, "grad_norm": 5.875, "learning_rate": 9.409544585549452e-06, "loss": 1.2477080821990967, "step": 1300 }, { "epoch": 0.40076952674105426, "grad_norm": 5.53125, "learning_rate": 9.407457815899458e-06, "loss": 1.3540740013122559, "step": 1302 }, { "epoch": 0.40138514813389764, "grad_norm": 9.1875, "learning_rate": 9.405367660182254e-06, "loss": 1.3317627906799316, "step": 1304 }, { "epoch": 0.4020007695267411, "grad_norm": 7.8125, "learning_rate": 9.403274120474867e-06, "loss": 1.508737564086914, "step": 1306 }, { "epoch": 0.40261639091958445, "grad_norm": 7.3125, "learning_rate": 9.401177198857703e-06, "loss": 1.5539381504058838, "step": 1308 }, { "epoch": 0.40323201231242783, "grad_norm": 7.5, "learning_rate": 9.399076897414517e-06, "loss": 1.196575403213501, "step": 1310 }, { "epoch": 0.40384763370527127, "grad_norm": 4.34375, "learning_rate": 9.396973218232424e-06, "loss": 1.4205857515335083, "step": 1312 }, { "epoch": 0.40446325509811465, "grad_norm": 22.125, "learning_rate": 9.394866163401897e-06, "loss": 1.0114387273788452, "step": 1314 }, { "epoch": 0.4050788764909581, "grad_norm": 3.78125, "learning_rate": 9.392755735016763e-06, "loss": 1.302337646484375, "step": 1316 }, { "epoch": 0.40569449788380146, "grad_norm": 20.125, "learning_rate": 9.390641935174208e-06, "loss": 1.5426905155181885, "step": 1318 }, { "epoch": 0.40631011927664484, "grad_norm": 56.75, "learning_rate": 9.388524765974754e-06, "loss": 1.4665982723236084, "step": 1320 }, { "epoch": 0.4069257406694883, "grad_norm": 6.5625, "learning_rate": 9.386404229522286e-06, "loss": 1.2628440856933594, "step": 1322 }, { "epoch": 0.40754136206233166, "grad_norm": 4.15625, "learning_rate": 9.384280327924024e-06, "loss": 1.117812991142273, "step": 1324 }, { "epoch": 0.4081569834551751, "grad_norm": 5.03125, "learning_rate": 9.38215306329054e-06, "loss": 1.4815458059310913, "step": 1326 }, { "epoch": 0.4087726048480185, "grad_norm": 4.59375, "learning_rate": 9.380022437735743e-06, "loss": 1.057960033416748, "step": 1328 }, { "epoch": 0.40938822624086185, "grad_norm": 17.625, "learning_rate": 9.377888453376885e-06, "loss": 1.2156786918640137, "step": 1330 }, { "epoch": 0.4100038476337053, "grad_norm": 4.21875, "learning_rate": 9.37575111233455e-06, "loss": 1.3469533920288086, "step": 1332 }, { "epoch": 0.41061946902654867, "grad_norm": 7.90625, "learning_rate": 9.373610416732667e-06, "loss": 1.6464571952819824, "step": 1334 }, { "epoch": 0.41123509041939205, "grad_norm": 8.25, "learning_rate": 9.37146636869849e-06, "loss": 1.4672791957855225, "step": 1336 }, { "epoch": 0.4118507118122355, "grad_norm": 9.5, "learning_rate": 9.369318970362606e-06, "loss": 1.2561655044555664, "step": 1338 }, { "epoch": 0.41246633320507886, "grad_norm": 19.375, "learning_rate": 9.367168223858937e-06, "loss": 1.1853914260864258, "step": 1340 }, { "epoch": 0.4130819545979223, "grad_norm": 8.5, "learning_rate": 9.365014131324726e-06, "loss": 1.1935577392578125, "step": 1342 }, { "epoch": 0.4136975759907657, "grad_norm": 8.5, "learning_rate": 9.362856694900542e-06, "loss": 1.3152433633804321, "step": 1344 }, { "epoch": 0.41431319738360906, "grad_norm": 7.1875, "learning_rate": 9.36069591673028e-06, "loss": 1.5555031299591064, "step": 1346 }, { "epoch": 0.4149288187764525, "grad_norm": 19.0, "learning_rate": 9.358531798961154e-06, "loss": 1.4917670488357544, "step": 1348 }, { "epoch": 0.41554444016929587, "grad_norm": 4.8125, "learning_rate": 9.356364343743694e-06, "loss": 1.564723253250122, "step": 1350 }, { "epoch": 0.4161600615621393, "grad_norm": 6.5625, "learning_rate": 9.35419355323175e-06, "loss": 1.31098210811615, "step": 1352 }, { "epoch": 0.4167756829549827, "grad_norm": 3.9375, "learning_rate": 9.352019429582485e-06, "loss": 1.0555250644683838, "step": 1354 }, { "epoch": 0.41739130434782606, "grad_norm": 3.0625, "learning_rate": 9.349841974956373e-06, "loss": 1.2879685163497925, "step": 1356 }, { "epoch": 0.4180069257406695, "grad_norm": 7.34375, "learning_rate": 9.3476611915172e-06, "loss": 1.627746820449829, "step": 1358 }, { "epoch": 0.4186225471335129, "grad_norm": 14.75, "learning_rate": 9.345477081432065e-06, "loss": 1.1410566568374634, "step": 1360 }, { "epoch": 0.4192381685263563, "grad_norm": 14.6875, "learning_rate": 9.343289646871361e-06, "loss": 1.6584223508834839, "step": 1362 }, { "epoch": 0.4198537899191997, "grad_norm": 4.46875, "learning_rate": 9.34109889000879e-06, "loss": 1.063903570175171, "step": 1364 }, { "epoch": 0.4204694113120431, "grad_norm": 5.46875, "learning_rate": 9.338904813021361e-06, "loss": 1.2660856246948242, "step": 1366 }, { "epoch": 0.4210850327048865, "grad_norm": 8.9375, "learning_rate": 9.336707418089375e-06, "loss": 1.5954651832580566, "step": 1368 }, { "epoch": 0.4217006540977299, "grad_norm": 20.75, "learning_rate": 9.334506707396432e-06, "loss": 1.2890225648880005, "step": 1370 }, { "epoch": 0.4223162754905733, "grad_norm": 35.25, "learning_rate": 9.332302683129427e-06, "loss": 1.1560983657836914, "step": 1372 }, { "epoch": 0.4229318968834167, "grad_norm": 10.8125, "learning_rate": 9.33009534747855e-06, "loss": 1.4955801963806152, "step": 1374 }, { "epoch": 0.4235475182762601, "grad_norm": 11.75, "learning_rate": 9.32788470263728e-06, "loss": 1.3701071739196777, "step": 1376 }, { "epoch": 0.4241631396691035, "grad_norm": 8.0625, "learning_rate": 9.325670750802382e-06, "loss": 1.3674092292785645, "step": 1378 }, { "epoch": 0.4247787610619469, "grad_norm": 3.375, "learning_rate": 9.323453494173913e-06, "loss": 1.222502589225769, "step": 1380 }, { "epoch": 0.4253943824547903, "grad_norm": 8.875, "learning_rate": 9.321232934955208e-06, "loss": 0.9671890735626221, "step": 1382 }, { "epoch": 0.4260100038476337, "grad_norm": 20.375, "learning_rate": 9.319009075352888e-06, "loss": 1.5922764539718628, "step": 1384 }, { "epoch": 0.4266256252404771, "grad_norm": 3.390625, "learning_rate": 9.316781917576851e-06, "loss": 1.2351106405258179, "step": 1386 }, { "epoch": 0.4272412466333205, "grad_norm": 5.0625, "learning_rate": 9.314551463840273e-06, "loss": 1.357588529586792, "step": 1388 }, { "epoch": 0.4278568680261639, "grad_norm": 5.125, "learning_rate": 9.31231771635961e-06, "loss": 1.0194324254989624, "step": 1390 }, { "epoch": 0.4284724894190073, "grad_norm": 3.71875, "learning_rate": 9.310080677354583e-06, "loss": 1.1744673252105713, "step": 1392 }, { "epoch": 0.4290881108118507, "grad_norm": 6.75, "learning_rate": 9.307840349048185e-06, "loss": 1.2617883682250977, "step": 1394 }, { "epoch": 0.4297037322046941, "grad_norm": 17.875, "learning_rate": 9.305596733666688e-06, "loss": 1.3517186641693115, "step": 1396 }, { "epoch": 0.43031935359753754, "grad_norm": 7.46875, "learning_rate": 9.303349833439619e-06, "loss": 1.5301835536956787, "step": 1398 }, { "epoch": 0.4309349749903809, "grad_norm": 2.359375, "learning_rate": 9.301099650599771e-06, "loss": 1.1756891012191772, "step": 1400 }, { "epoch": 0.4315505963832243, "grad_norm": 17.5, "learning_rate": 9.298846187383206e-06, "loss": 1.5164425373077393, "step": 1402 }, { "epoch": 0.43216621777606773, "grad_norm": 13.3125, "learning_rate": 9.296589446029235e-06, "loss": 1.3366045951843262, "step": 1404 }, { "epoch": 0.4327818391689111, "grad_norm": 6.75, "learning_rate": 9.294329428780437e-06, "loss": 1.1451746225357056, "step": 1406 }, { "epoch": 0.43339746056175454, "grad_norm": 5.21875, "learning_rate": 9.292066137882643e-06, "loss": 1.2165199518203735, "step": 1408 }, { "epoch": 0.4340130819545979, "grad_norm": 4.40625, "learning_rate": 9.28979957558493e-06, "loss": 1.3512365818023682, "step": 1410 }, { "epoch": 0.4346287033474413, "grad_norm": 5.84375, "learning_rate": 9.287529744139638e-06, "loss": 1.3853118419647217, "step": 1412 }, { "epoch": 0.43524432474028474, "grad_norm": 3.46875, "learning_rate": 9.285256645802343e-06, "loss": 1.1237252950668335, "step": 1414 }, { "epoch": 0.4358599461331281, "grad_norm": 12.3125, "learning_rate": 9.28298028283188e-06, "loss": 1.6480573415756226, "step": 1416 }, { "epoch": 0.43647556752597155, "grad_norm": 10.1875, "learning_rate": 9.280700657490319e-06, "loss": 1.5300313234329224, "step": 1418 }, { "epoch": 0.43709118891881493, "grad_norm": 9.0625, "learning_rate": 9.278417772042973e-06, "loss": 1.5395290851593018, "step": 1420 }, { "epoch": 0.4377068103116583, "grad_norm": 11.4375, "learning_rate": 9.276131628758402e-06, "loss": 1.599255084991455, "step": 1422 }, { "epoch": 0.43832243170450175, "grad_norm": 5.96875, "learning_rate": 9.273842229908392e-06, "loss": 1.0969022512435913, "step": 1424 }, { "epoch": 0.4389380530973451, "grad_norm": 5.1875, "learning_rate": 9.271549577767972e-06, "loss": 0.8788780570030212, "step": 1426 }, { "epoch": 0.43955367449018856, "grad_norm": 18.5, "learning_rate": 9.269253674615404e-06, "loss": 1.2673485279083252, "step": 1428 }, { "epoch": 0.44016929588303194, "grad_norm": 3.421875, "learning_rate": 9.266954522732174e-06, "loss": 1.0636610984802246, "step": 1430 }, { "epoch": 0.4407849172758753, "grad_norm": 6.21875, "learning_rate": 9.264652124403004e-06, "loss": 1.2438650131225586, "step": 1432 }, { "epoch": 0.44140053866871876, "grad_norm": 8.6875, "learning_rate": 9.262346481915838e-06, "loss": 1.0668020248413086, "step": 1434 }, { "epoch": 0.44201616006156214, "grad_norm": 7.65625, "learning_rate": 9.260037597561846e-06, "loss": 1.4651720523834229, "step": 1436 }, { "epoch": 0.4426317814544055, "grad_norm": 6.96875, "learning_rate": 9.257725473635414e-06, "loss": 1.2487192153930664, "step": 1438 }, { "epoch": 0.44324740284724895, "grad_norm": 6.6875, "learning_rate": 9.255410112434158e-06, "loss": 1.3854714632034302, "step": 1440 }, { "epoch": 0.44386302424009233, "grad_norm": 5.71875, "learning_rate": 9.253091516258899e-06, "loss": 1.4076898097991943, "step": 1442 }, { "epoch": 0.44447864563293576, "grad_norm": 2.171875, "learning_rate": 9.25076968741368e-06, "loss": 1.4974058866500854, "step": 1444 }, { "epoch": 0.44509426702577914, "grad_norm": 4.3125, "learning_rate": 9.248444628205753e-06, "loss": 1.1812409162521362, "step": 1446 }, { "epoch": 0.4457098884186225, "grad_norm": 6.90625, "learning_rate": 9.246116340945584e-06, "loss": 1.0853921175003052, "step": 1448 }, { "epoch": 0.44632550981146596, "grad_norm": 5.59375, "learning_rate": 9.24378482794684e-06, "loss": 1.4460300207138062, "step": 1450 }, { "epoch": 0.44694113120430934, "grad_norm": 6.46875, "learning_rate": 9.2414500915264e-06, "loss": 1.3900498151779175, "step": 1452 }, { "epoch": 0.4475567525971528, "grad_norm": 4.9375, "learning_rate": 9.239112134004346e-06, "loss": 0.9988191723823547, "step": 1454 }, { "epoch": 0.44817237398999615, "grad_norm": 10.125, "learning_rate": 9.236770957703957e-06, "loss": 1.0114213228225708, "step": 1456 }, { "epoch": 0.44878799538283953, "grad_norm": 5.65625, "learning_rate": 9.234426564951713e-06, "loss": 1.2418731451034546, "step": 1458 }, { "epoch": 0.44940361677568297, "grad_norm": 19.25, "learning_rate": 9.232078958077289e-06, "loss": 1.515458106994629, "step": 1460 }, { "epoch": 0.45001923816852635, "grad_norm": 4.8125, "learning_rate": 9.229728139413553e-06, "loss": 1.2873460054397583, "step": 1462 }, { "epoch": 0.4506348595613698, "grad_norm": 12.75, "learning_rate": 9.22737411129657e-06, "loss": 1.2370492219924927, "step": 1464 }, { "epoch": 0.45125048095421316, "grad_norm": 6.75, "learning_rate": 9.225016876065587e-06, "loss": 1.2360658645629883, "step": 1466 }, { "epoch": 0.45186610234705654, "grad_norm": 7.4375, "learning_rate": 9.222656436063043e-06, "loss": 1.2146968841552734, "step": 1468 }, { "epoch": 0.4524817237399, "grad_norm": 6.46875, "learning_rate": 9.220292793634559e-06, "loss": 1.3954319953918457, "step": 1470 }, { "epoch": 0.45309734513274336, "grad_norm": 6.0625, "learning_rate": 9.217925951128941e-06, "loss": 1.5174306631088257, "step": 1472 }, { "epoch": 0.4537129665255868, "grad_norm": 10.5, "learning_rate": 9.215555910898175e-06, "loss": 1.3878395557403564, "step": 1474 }, { "epoch": 0.45432858791843017, "grad_norm": 9.8125, "learning_rate": 9.213182675297418e-06, "loss": 1.5955039262771606, "step": 1476 }, { "epoch": 0.45494420931127355, "grad_norm": 6.84375, "learning_rate": 9.210806246685012e-06, "loss": 1.4851185083389282, "step": 1478 }, { "epoch": 0.455559830704117, "grad_norm": 9.1875, "learning_rate": 9.208426627422464e-06, "loss": 1.4243017435073853, "step": 1480 }, { "epoch": 0.45617545209696037, "grad_norm": 5.75, "learning_rate": 9.20604381987446e-06, "loss": 1.4351035356521606, "step": 1482 }, { "epoch": 0.45679107348980375, "grad_norm": 7.0, "learning_rate": 9.203657826408842e-06, "loss": 1.302568793296814, "step": 1484 }, { "epoch": 0.4574066948826472, "grad_norm": 6.75, "learning_rate": 9.201268649396634e-06, "loss": 1.1568458080291748, "step": 1486 }, { "epoch": 0.45802231627549056, "grad_norm": 9.8125, "learning_rate": 9.198876291212006e-06, "loss": 0.8684755563735962, "step": 1488 }, { "epoch": 0.458637937668334, "grad_norm": 3.71875, "learning_rate": 9.196480754232305e-06, "loss": 0.7467790246009827, "step": 1490 }, { "epoch": 0.4592535590611774, "grad_norm": 11.5, "learning_rate": 9.194082040838025e-06, "loss": 1.2954021692276, "step": 1492 }, { "epoch": 0.45986918045402075, "grad_norm": 11.3125, "learning_rate": 9.191680153412823e-06, "loss": 1.7717770338058472, "step": 1494 }, { "epoch": 0.4604848018468642, "grad_norm": 10.8125, "learning_rate": 9.189275094343509e-06, "loss": 1.44906747341156, "step": 1496 }, { "epoch": 0.46110042323970757, "grad_norm": 12.5625, "learning_rate": 9.186866866020042e-06, "loss": 1.2336671352386475, "step": 1498 }, { "epoch": 0.461716044632551, "grad_norm": 5.25, "learning_rate": 9.184455470835536e-06, "loss": 0.8941473364830017, "step": 1500 }, { "epoch": 0.4623316660253944, "grad_norm": 10.75, "learning_rate": 9.182040911186246e-06, "loss": 1.598117470741272, "step": 1502 }, { "epoch": 0.46294728741823776, "grad_norm": 5.5, "learning_rate": 9.17962318947157e-06, "loss": 1.3159825801849365, "step": 1504 }, { "epoch": 0.4635629088110812, "grad_norm": 13.875, "learning_rate": 9.177202308094062e-06, "loss": 1.9779479503631592, "step": 1506 }, { "epoch": 0.4641785302039246, "grad_norm": 8.1875, "learning_rate": 9.174778269459399e-06, "loss": 1.3620253801345825, "step": 1508 }, { "epoch": 0.464794151596768, "grad_norm": 5.96875, "learning_rate": 9.172351075976407e-06, "loss": 1.2268351316452026, "step": 1510 }, { "epoch": 0.4654097729896114, "grad_norm": 6.46875, "learning_rate": 9.169920730057038e-06, "loss": 1.475582242012024, "step": 1512 }, { "epoch": 0.46602539438245477, "grad_norm": 4.6875, "learning_rate": 9.16748723411638e-06, "loss": 0.8650484681129456, "step": 1514 }, { "epoch": 0.4666410157752982, "grad_norm": 20.375, "learning_rate": 9.16505059057266e-06, "loss": 0.8505802750587463, "step": 1516 }, { "epoch": 0.4672566371681416, "grad_norm": 6.0625, "learning_rate": 9.16261080184722e-06, "loss": 1.3201478719711304, "step": 1518 }, { "epoch": 0.467872258560985, "grad_norm": 7.9375, "learning_rate": 9.160167870364533e-06, "loss": 1.345275640487671, "step": 1520 }, { "epoch": 0.4684878799538284, "grad_norm": 8.6875, "learning_rate": 9.157721798552194e-06, "loss": 1.5291322469711304, "step": 1522 }, { "epoch": 0.4691035013466718, "grad_norm": 9.8125, "learning_rate": 9.155272588840924e-06, "loss": 1.465450406074524, "step": 1524 }, { "epoch": 0.4697191227395152, "grad_norm": 4.625, "learning_rate": 9.152820243664553e-06, "loss": 1.4166549444198608, "step": 1526 }, { "epoch": 0.4703347441323586, "grad_norm": 2.21875, "learning_rate": 9.150364765460032e-06, "loss": 1.282368779182434, "step": 1528 }, { "epoch": 0.470950365525202, "grad_norm": 2.65625, "learning_rate": 9.147906156667425e-06, "loss": 0.9684574007987976, "step": 1530 }, { "epoch": 0.4715659869180454, "grad_norm": 8.125, "learning_rate": 9.14544441972991e-06, "loss": 1.324642300605774, "step": 1532 }, { "epoch": 0.4721816083108888, "grad_norm": 8.9375, "learning_rate": 9.142979557093766e-06, "loss": 1.6652965545654297, "step": 1534 }, { "epoch": 0.4727972297037322, "grad_norm": 16.875, "learning_rate": 9.14051157120838e-06, "loss": 1.6565061807632446, "step": 1536 }, { "epoch": 0.4734128510965756, "grad_norm": 6.75, "learning_rate": 9.138040464526254e-06, "loss": 1.8133372068405151, "step": 1538 }, { "epoch": 0.474028472489419, "grad_norm": 3.984375, "learning_rate": 9.135566239502974e-06, "loss": 1.4453431367874146, "step": 1540 }, { "epoch": 0.4746440938822624, "grad_norm": 4.5, "learning_rate": 9.133088898597236e-06, "loss": 1.37595796585083, "step": 1542 }, { "epoch": 0.4752597152751058, "grad_norm": 4.1875, "learning_rate": 9.130608444270828e-06, "loss": 1.2220466136932373, "step": 1544 }, { "epoch": 0.47587533666794923, "grad_norm": 2.71875, "learning_rate": 9.128124878988633e-06, "loss": 1.0569881200790405, "step": 1546 }, { "epoch": 0.4764909580607926, "grad_norm": 9.375, "learning_rate": 9.125638205218628e-06, "loss": 1.4060856103897095, "step": 1548 }, { "epoch": 0.477106579453636, "grad_norm": 6.1875, "learning_rate": 9.123148425431873e-06, "loss": 1.219241976737976, "step": 1550 }, { "epoch": 0.47772220084647943, "grad_norm": 5.1875, "learning_rate": 9.120655542102524e-06, "loss": 1.014161467552185, "step": 1552 }, { "epoch": 0.4783378222393228, "grad_norm": 6.875, "learning_rate": 9.118159557707807e-06, "loss": 1.4548730850219727, "step": 1554 }, { "epoch": 0.47895344363216624, "grad_norm": 11.4375, "learning_rate": 9.11566047472804e-06, "loss": 1.5502562522888184, "step": 1556 }, { "epoch": 0.4795690650250096, "grad_norm": 6.53125, "learning_rate": 9.113158295646623e-06, "loss": 1.093739628791809, "step": 1558 }, { "epoch": 0.480184686417853, "grad_norm": 9.875, "learning_rate": 9.11065302295002e-06, "loss": 1.1581438779830933, "step": 1560 }, { "epoch": 0.48080030781069644, "grad_norm": 5.25, "learning_rate": 9.108144659127782e-06, "loss": 1.5227125883102417, "step": 1562 }, { "epoch": 0.4814159292035398, "grad_norm": 6.5625, "learning_rate": 9.105633206672524e-06, "loss": 1.2725529670715332, "step": 1564 }, { "epoch": 0.48203155059638325, "grad_norm": 4.75, "learning_rate": 9.103118668079932e-06, "loss": 1.2470329999923706, "step": 1566 }, { "epoch": 0.48264717198922663, "grad_norm": 8.875, "learning_rate": 9.100601045848765e-06, "loss": 1.2510249614715576, "step": 1568 }, { "epoch": 0.48326279338207, "grad_norm": 5.25, "learning_rate": 9.098080342480832e-06, "loss": 1.2870439291000366, "step": 1570 }, { "epoch": 0.48387841477491345, "grad_norm": 6.84375, "learning_rate": 9.09555656048102e-06, "loss": 1.453609824180603, "step": 1572 }, { "epoch": 0.4844940361677568, "grad_norm": 11.3125, "learning_rate": 9.093029702357262e-06, "loss": 1.162030577659607, "step": 1574 }, { "epoch": 0.4851096575606002, "grad_norm": 1.9921875, "learning_rate": 9.090499770620556e-06, "loss": 1.0766057968139648, "step": 1576 }, { "epoch": 0.48572527895344364, "grad_norm": 8.6875, "learning_rate": 9.087966767784953e-06, "loss": 1.3970556259155273, "step": 1578 }, { "epoch": 0.486340900346287, "grad_norm": 10.6875, "learning_rate": 9.085430696367553e-06, "loss": 1.1191117763519287, "step": 1580 }, { "epoch": 0.48695652173913045, "grad_norm": 5.5625, "learning_rate": 9.082891558888505e-06, "loss": 1.2700603008270264, "step": 1582 }, { "epoch": 0.48757214313197383, "grad_norm": 10.25, "learning_rate": 9.080349357871013e-06, "loss": 1.3424956798553467, "step": 1584 }, { "epoch": 0.4881877645248172, "grad_norm": 3.234375, "learning_rate": 9.077804095841314e-06, "loss": 1.348867416381836, "step": 1586 }, { "epoch": 0.48880338591766065, "grad_norm": 4.875, "learning_rate": 9.075255775328692e-06, "loss": 1.1686131954193115, "step": 1588 }, { "epoch": 0.48941900731050403, "grad_norm": 30.125, "learning_rate": 9.072704398865473e-06, "loss": 1.240150809288025, "step": 1590 }, { "epoch": 0.49003462870334746, "grad_norm": 15.4375, "learning_rate": 9.070149968987017e-06, "loss": 1.333054780960083, "step": 1592 }, { "epoch": 0.49065025009619084, "grad_norm": 7.34375, "learning_rate": 9.067592488231716e-06, "loss": 0.98194819688797, "step": 1594 }, { "epoch": 0.4912658714890342, "grad_norm": 3.1875, "learning_rate": 9.065031959140995e-06, "loss": 0.5497357249259949, "step": 1596 }, { "epoch": 0.49188149288187766, "grad_norm": 5.59375, "learning_rate": 9.062468384259313e-06, "loss": 1.3398869037628174, "step": 1598 }, { "epoch": 0.49249711427472104, "grad_norm": 9.125, "learning_rate": 9.059901766134149e-06, "loss": 1.6062138080596924, "step": 1600 }, { "epoch": 0.4931127356675645, "grad_norm": 12.5, "learning_rate": 9.05733210731601e-06, "loss": 1.9328994750976562, "step": 1602 }, { "epoch": 0.49372835706040785, "grad_norm": 6.34375, "learning_rate": 9.054759410358423e-06, "loss": 1.378353476524353, "step": 1604 }, { "epoch": 0.49434397845325123, "grad_norm": 5.28125, "learning_rate": 9.052183677817936e-06, "loss": 1.3997539281845093, "step": 1606 }, { "epoch": 0.49495959984609467, "grad_norm": 3.203125, "learning_rate": 9.049604912254108e-06, "loss": 1.3598434925079346, "step": 1608 }, { "epoch": 0.49557522123893805, "grad_norm": 13.3125, "learning_rate": 9.047023116229523e-06, "loss": 1.4434458017349243, "step": 1610 }, { "epoch": 0.4961908426317815, "grad_norm": 4.375, "learning_rate": 9.044438292309766e-06, "loss": 1.3952776193618774, "step": 1612 }, { "epoch": 0.49680646402462486, "grad_norm": 3.15625, "learning_rate": 9.041850443063431e-06, "loss": 1.047157883644104, "step": 1614 }, { "epoch": 0.49742208541746824, "grad_norm": 8.8125, "learning_rate": 9.039259571062126e-06, "loss": 1.1723214387893677, "step": 1616 }, { "epoch": 0.4980377068103117, "grad_norm": 2.90625, "learning_rate": 9.036665678880462e-06, "loss": 1.1033647060394287, "step": 1618 }, { "epoch": 0.49865332820315506, "grad_norm": 11.0625, "learning_rate": 9.034068769096038e-06, "loss": 1.4442481994628906, "step": 1620 }, { "epoch": 0.49926894959599843, "grad_norm": 4.625, "learning_rate": 9.031468844289467e-06, "loss": 1.2997276782989502, "step": 1622 }, { "epoch": 0.49988457098884187, "grad_norm": 3.046875, "learning_rate": 9.028865907044356e-06, "loss": 1.0517377853393555, "step": 1624 }, { "epoch": 0.5005001923816853, "grad_norm": 4.78125, "learning_rate": 9.026259959947296e-06, "loss": 1.2825373411178589, "step": 1626 }, { "epoch": 0.5011158137745286, "grad_norm": 9.1875, "learning_rate": 9.02365100558788e-06, "loss": 1.5244855880737305, "step": 1628 }, { "epoch": 0.5017314351673721, "grad_norm": 5.84375, "learning_rate": 9.021039046558681e-06, "loss": 1.6843534708023071, "step": 1630 }, { "epoch": 0.5023470565602155, "grad_norm": 3.296875, "learning_rate": 9.018424085455264e-06, "loss": 0.8393566012382507, "step": 1632 }, { "epoch": 0.5029626779530588, "grad_norm": 4.84375, "learning_rate": 9.015806124876169e-06, "loss": 1.4292811155319214, "step": 1634 }, { "epoch": 0.5035782993459023, "grad_norm": 10.3125, "learning_rate": 9.013185167422929e-06, "loss": 1.1995810270309448, "step": 1636 }, { "epoch": 0.5041939207387457, "grad_norm": 6.71875, "learning_rate": 9.010561215700045e-06, "loss": 1.3768254518508911, "step": 1638 }, { "epoch": 0.504809542131589, "grad_norm": 8.1875, "learning_rate": 9.007934272314996e-06, "loss": 1.08784019947052, "step": 1640 }, { "epoch": 0.5054251635244325, "grad_norm": 17.0, "learning_rate": 9.005304339878234e-06, "loss": 1.3579922914505005, "step": 1642 }, { "epoch": 0.5060407849172759, "grad_norm": 11.6875, "learning_rate": 9.002671421003185e-06, "loss": 1.7261128425598145, "step": 1644 }, { "epoch": 0.5066564063101193, "grad_norm": 7.3125, "learning_rate": 9.000035518306236e-06, "loss": 0.5321837663650513, "step": 1646 }, { "epoch": 0.5072720277029626, "grad_norm": 68.5, "learning_rate": 8.997396634406746e-06, "loss": 1.0127274990081787, "step": 1648 }, { "epoch": 0.5078876490958061, "grad_norm": 7.1875, "learning_rate": 8.994754771927029e-06, "loss": 1.0840795040130615, "step": 1650 }, { "epoch": 0.5085032704886495, "grad_norm": 8.625, "learning_rate": 8.992109933492366e-06, "loss": 1.2792012691497803, "step": 1652 }, { "epoch": 0.5091188918814928, "grad_norm": 5.21875, "learning_rate": 8.989462121730991e-06, "loss": 1.4806631803512573, "step": 1654 }, { "epoch": 0.5097345132743363, "grad_norm": 3.34375, "learning_rate": 8.986811339274095e-06, "loss": 1.0648256540298462, "step": 1656 }, { "epoch": 0.5103501346671797, "grad_norm": 7.4375, "learning_rate": 8.98415758875582e-06, "loss": 1.6581193208694458, "step": 1658 }, { "epoch": 0.510965756060023, "grad_norm": 5.25, "learning_rate": 8.981500872813256e-06, "loss": 1.0713742971420288, "step": 1660 }, { "epoch": 0.5115813774528665, "grad_norm": 11.5, "learning_rate": 8.978841194086443e-06, "loss": 0.5658085942268372, "step": 1662 }, { "epoch": 0.5121969988457099, "grad_norm": 4.53125, "learning_rate": 8.97617855521836e-06, "loss": 1.364528775215149, "step": 1664 }, { "epoch": 0.5128126202385533, "grad_norm": 4.78125, "learning_rate": 8.973512958854934e-06, "loss": 1.0248429775238037, "step": 1666 }, { "epoch": 0.5134282416313967, "grad_norm": 5.15625, "learning_rate": 8.97084440764503e-06, "loss": 1.2535791397094727, "step": 1668 }, { "epoch": 0.5140438630242401, "grad_norm": 2.84375, "learning_rate": 8.968172904240441e-06, "loss": 0.9831995964050293, "step": 1670 }, { "epoch": 0.5146594844170835, "grad_norm": 12.3125, "learning_rate": 8.965498451295904e-06, "loss": 1.2839998006820679, "step": 1672 }, { "epoch": 0.5152751058099269, "grad_norm": 8.6875, "learning_rate": 8.962821051469082e-06, "loss": 1.3942573070526123, "step": 1674 }, { "epoch": 0.5158907272027703, "grad_norm": 13.75, "learning_rate": 8.960140707420566e-06, "loss": 1.6228309869766235, "step": 1676 }, { "epoch": 0.5165063485956137, "grad_norm": 5.0625, "learning_rate": 8.957457421813876e-06, "loss": 1.4481265544891357, "step": 1678 }, { "epoch": 0.517121969988457, "grad_norm": 7.5625, "learning_rate": 8.954771197315451e-06, "loss": 1.4853836297988892, "step": 1680 }, { "epoch": 0.5177375913813005, "grad_norm": 8.75, "learning_rate": 8.952082036594653e-06, "loss": 1.552171230316162, "step": 1682 }, { "epoch": 0.5183532127741439, "grad_norm": 6.4375, "learning_rate": 8.949389942323763e-06, "loss": 1.4480795860290527, "step": 1684 }, { "epoch": 0.5189688341669872, "grad_norm": 4.625, "learning_rate": 8.946694917177974e-06, "loss": 1.430462121963501, "step": 1686 }, { "epoch": 0.5195844555598307, "grad_norm": 8.0625, "learning_rate": 8.943996963835396e-06, "loss": 1.272444486618042, "step": 1688 }, { "epoch": 0.5202000769526741, "grad_norm": 4.96875, "learning_rate": 8.94129608497704e-06, "loss": 1.265394926071167, "step": 1690 }, { "epoch": 0.5208156983455176, "grad_norm": 10.8125, "learning_rate": 8.938592283286831e-06, "loss": 1.2681784629821777, "step": 1692 }, { "epoch": 0.5214313197383609, "grad_norm": 10.1875, "learning_rate": 8.935885561451602e-06, "loss": 1.3533116579055786, "step": 1694 }, { "epoch": 0.5220469411312043, "grad_norm": 8.5, "learning_rate": 8.93317592216108e-06, "loss": 1.3701342344284058, "step": 1696 }, { "epoch": 0.5226625625240477, "grad_norm": 8.25, "learning_rate": 8.930463368107894e-06, "loss": 1.5623503923416138, "step": 1698 }, { "epoch": 0.5232781839168911, "grad_norm": 6.53125, "learning_rate": 8.927747901987572e-06, "loss": 1.0573383569717407, "step": 1700 }, { "epoch": 0.5238938053097345, "grad_norm": 33.75, "learning_rate": 8.92502952649853e-06, "loss": 1.3457276821136475, "step": 1702 }, { "epoch": 0.5245094267025779, "grad_norm": 5.25, "learning_rate": 8.92230824434208e-06, "loss": 1.3082480430603027, "step": 1704 }, { "epoch": 0.5251250480954213, "grad_norm": 2.90625, "learning_rate": 8.919584058222422e-06, "loss": 1.063995361328125, "step": 1706 }, { "epoch": 0.5257406694882647, "grad_norm": 2.21875, "learning_rate": 8.91685697084664e-06, "loss": 1.228017807006836, "step": 1708 }, { "epoch": 0.5263562908811081, "grad_norm": 7.5625, "learning_rate": 8.914126984924705e-06, "loss": 1.6891307830810547, "step": 1710 }, { "epoch": 0.5269719122739516, "grad_norm": 6.15625, "learning_rate": 8.911394103169461e-06, "loss": 1.14736807346344, "step": 1712 }, { "epoch": 0.5275875336667949, "grad_norm": 3.890625, "learning_rate": 8.908658328296635e-06, "loss": 1.1494313478469849, "step": 1714 }, { "epoch": 0.5282031550596383, "grad_norm": 6.75, "learning_rate": 8.905919663024829e-06, "loss": 1.4376049041748047, "step": 1716 }, { "epoch": 0.5288187764524818, "grad_norm": 2.9375, "learning_rate": 8.903178110075514e-06, "loss": 1.0928840637207031, "step": 1718 }, { "epoch": 0.5294343978453251, "grad_norm": 4.03125, "learning_rate": 8.900433672173035e-06, "loss": 0.9837021827697754, "step": 1720 }, { "epoch": 0.5300500192381685, "grad_norm": 4.40625, "learning_rate": 8.897686352044599e-06, "loss": 1.3075284957885742, "step": 1722 }, { "epoch": 0.530665640631012, "grad_norm": 4.71875, "learning_rate": 8.89493615242028e-06, "loss": 1.307611107826233, "step": 1724 }, { "epoch": 0.5312812620238553, "grad_norm": 10.9375, "learning_rate": 8.89218307603302e-06, "loss": 1.0998104810714722, "step": 1726 }, { "epoch": 0.5318968834166987, "grad_norm": 11.625, "learning_rate": 8.8894271256186e-06, "loss": 1.3269050121307373, "step": 1728 }, { "epoch": 0.5325125048095422, "grad_norm": 7.125, "learning_rate": 8.88666830391568e-06, "loss": 1.2515839338302612, "step": 1730 }, { "epoch": 0.5331281262023856, "grad_norm": 10.1875, "learning_rate": 8.883906613665758e-06, "loss": 1.5673224925994873, "step": 1732 }, { "epoch": 0.5337437475952289, "grad_norm": 6.46875, "learning_rate": 8.881142057613187e-06, "loss": 1.440687656402588, "step": 1734 }, { "epoch": 0.5343593689880723, "grad_norm": 18.75, "learning_rate": 8.878374638505172e-06, "loss": 1.611922025680542, "step": 1736 }, { "epoch": 0.5349749903809158, "grad_norm": 7.375, "learning_rate": 8.875604359091759e-06, "loss": 1.4720699787139893, "step": 1738 }, { "epoch": 0.5355906117737591, "grad_norm": 4.75, "learning_rate": 8.872831222125833e-06, "loss": 1.1215943098068237, "step": 1740 }, { "epoch": 0.5362062331666025, "grad_norm": 7.59375, "learning_rate": 8.870055230363126e-06, "loss": 1.3788470029830933, "step": 1742 }, { "epoch": 0.536821854559446, "grad_norm": 4.09375, "learning_rate": 8.8672763865622e-06, "loss": 1.083932876586914, "step": 1744 }, { "epoch": 0.5374374759522893, "grad_norm": 6.15625, "learning_rate": 8.86449469348446e-06, "loss": 1.3741451501846313, "step": 1746 }, { "epoch": 0.5380530973451327, "grad_norm": 4.0625, "learning_rate": 8.861710153894129e-06, "loss": 1.334261417388916, "step": 1748 }, { "epoch": 0.5386687187379762, "grad_norm": 4.1875, "learning_rate": 8.858922770558272e-06, "loss": 1.2975589036941528, "step": 1750 }, { "epoch": 0.5392843401308195, "grad_norm": 24.25, "learning_rate": 8.856132546246774e-06, "loss": 1.2593014240264893, "step": 1752 }, { "epoch": 0.5398999615236629, "grad_norm": 3.265625, "learning_rate": 8.853339483732341e-06, "loss": 1.1915488243103027, "step": 1754 }, { "epoch": 0.5405155829165064, "grad_norm": 6.46875, "learning_rate": 8.850543585790504e-06, "loss": 1.025390625, "step": 1756 }, { "epoch": 0.5411312043093498, "grad_norm": 10.0625, "learning_rate": 8.847744855199607e-06, "loss": 1.3022722005844116, "step": 1758 }, { "epoch": 0.5417468257021931, "grad_norm": 14.1875, "learning_rate": 8.844943294740813e-06, "loss": 1.1693408489227295, "step": 1760 }, { "epoch": 0.5423624470950366, "grad_norm": 5.5625, "learning_rate": 8.842138907198098e-06, "loss": 1.4360636472702026, "step": 1762 }, { "epoch": 0.54297806848788, "grad_norm": 13.75, "learning_rate": 8.83933169535824e-06, "loss": 1.373488187789917, "step": 1764 }, { "epoch": 0.5435936898807233, "grad_norm": 9.6875, "learning_rate": 8.83652166201083e-06, "loss": 1.6782146692276, "step": 1766 }, { "epoch": 0.5442093112735668, "grad_norm": 4.28125, "learning_rate": 8.833708809948261e-06, "loss": 1.2512364387512207, "step": 1768 }, { "epoch": 0.5448249326664102, "grad_norm": 9.25, "learning_rate": 8.830893141965729e-06, "loss": 1.6331942081451416, "step": 1770 }, { "epoch": 0.5454405540592535, "grad_norm": 4.21875, "learning_rate": 8.828074660861223e-06, "loss": 1.2096216678619385, "step": 1772 }, { "epoch": 0.546056175452097, "grad_norm": 5.21875, "learning_rate": 8.825253369435536e-06, "loss": 1.2483621835708618, "step": 1774 }, { "epoch": 0.5466717968449404, "grad_norm": 9.5625, "learning_rate": 8.822429270492243e-06, "loss": 1.4440934658050537, "step": 1776 }, { "epoch": 0.5472874182377838, "grad_norm": 7.8125, "learning_rate": 8.819602366837716e-06, "loss": 1.355756402015686, "step": 1778 }, { "epoch": 0.5479030396306271, "grad_norm": 4.375, "learning_rate": 8.816772661281117e-06, "loss": 1.4682282209396362, "step": 1780 }, { "epoch": 0.5485186610234706, "grad_norm": 9.4375, "learning_rate": 8.81394015663438e-06, "loss": 1.7345459461212158, "step": 1782 }, { "epoch": 0.549134282416314, "grad_norm": 10.6875, "learning_rate": 8.811104855712235e-06, "loss": 1.5893970727920532, "step": 1784 }, { "epoch": 0.5497499038091573, "grad_norm": 6.6875, "learning_rate": 8.80826676133218e-06, "loss": 1.135110855102539, "step": 1786 }, { "epoch": 0.5503655252020008, "grad_norm": 7.6875, "learning_rate": 8.805425876314497e-06, "loss": 1.3346829414367676, "step": 1788 }, { "epoch": 0.5509811465948442, "grad_norm": 9.1875, "learning_rate": 8.802582203482232e-06, "loss": 1.416990876197815, "step": 1790 }, { "epoch": 0.5515967679876875, "grad_norm": 4.65625, "learning_rate": 8.799735745661214e-06, "loss": 1.4351859092712402, "step": 1792 }, { "epoch": 0.552212389380531, "grad_norm": 5.375, "learning_rate": 8.796886505680022e-06, "loss": 1.0165706872940063, "step": 1794 }, { "epoch": 0.5528280107733744, "grad_norm": 4.625, "learning_rate": 8.794034486370015e-06, "loss": 1.3028804063796997, "step": 1796 }, { "epoch": 0.5534436321662177, "grad_norm": 8.3125, "learning_rate": 8.791179690565312e-06, "loss": 1.4025111198425293, "step": 1798 }, { "epoch": 0.5540592535590612, "grad_norm": 4.96875, "learning_rate": 8.788322121102781e-06, "loss": 1.1301255226135254, "step": 1800 }, { "epoch": 0.5546748749519046, "grad_norm": 5.21875, "learning_rate": 8.785461780822058e-06, "loss": 1.0889443159103394, "step": 1802 }, { "epoch": 0.555290496344748, "grad_norm": 8.5625, "learning_rate": 8.782598672565521e-06, "loss": 1.5021328926086426, "step": 1804 }, { "epoch": 0.5559061177375914, "grad_norm": 6.28125, "learning_rate": 8.779732799178314e-06, "loss": 0.9383888840675354, "step": 1806 }, { "epoch": 0.5565217391304348, "grad_norm": 7.5625, "learning_rate": 8.77686416350831e-06, "loss": 1.7545676231384277, "step": 1808 }, { "epoch": 0.5571373605232782, "grad_norm": 5.09375, "learning_rate": 8.773992768406144e-06, "loss": 1.0912437438964844, "step": 1810 }, { "epoch": 0.5577529819161215, "grad_norm": 8.1875, "learning_rate": 8.771118616725181e-06, "loss": 1.7606732845306396, "step": 1812 }, { "epoch": 0.558368603308965, "grad_norm": 14.25, "learning_rate": 8.76824171132153e-06, "loss": 1.8077706098556519, "step": 1814 }, { "epoch": 0.5589842247018084, "grad_norm": 34.5, "learning_rate": 8.765362055054042e-06, "loss": 1.521872878074646, "step": 1816 }, { "epoch": 0.5595998460946517, "grad_norm": 5.75, "learning_rate": 8.762479650784287e-06, "loss": 1.0878723859786987, "step": 1818 }, { "epoch": 0.5602154674874952, "grad_norm": 6.09375, "learning_rate": 8.759594501376584e-06, "loss": 1.2345068454742432, "step": 1820 }, { "epoch": 0.5608310888803386, "grad_norm": 6.625, "learning_rate": 8.756706609697965e-06, "loss": 1.4004822969436646, "step": 1822 }, { "epoch": 0.561446710273182, "grad_norm": 20.625, "learning_rate": 8.753815978618194e-06, "loss": 1.4569531679153442, "step": 1824 }, { "epoch": 0.5620623316660254, "grad_norm": 12.25, "learning_rate": 8.750922611009757e-06, "loss": 1.0861998796463013, "step": 1826 }, { "epoch": 0.5626779530588688, "grad_norm": 7.0625, "learning_rate": 8.748026509747858e-06, "loss": 1.1484061479568481, "step": 1828 }, { "epoch": 0.5632935744517122, "grad_norm": 7.84375, "learning_rate": 8.745127677710415e-06, "loss": 1.6220837831497192, "step": 1830 }, { "epoch": 0.5639091958445556, "grad_norm": 6.3125, "learning_rate": 8.742226117778063e-06, "loss": 1.5962282419204712, "step": 1832 }, { "epoch": 0.564524817237399, "grad_norm": 12.875, "learning_rate": 8.739321832834151e-06, "loss": 1.4062249660491943, "step": 1834 }, { "epoch": 0.5651404386302424, "grad_norm": 6.34375, "learning_rate": 8.736414825764729e-06, "loss": 1.4309604167938232, "step": 1836 }, { "epoch": 0.5657560600230858, "grad_norm": 10.1875, "learning_rate": 8.733505099458555e-06, "loss": 1.6085896492004395, "step": 1838 }, { "epoch": 0.5663716814159292, "grad_norm": 6.875, "learning_rate": 8.730592656807091e-06, "loss": 1.260807991027832, "step": 1840 }, { "epoch": 0.5669873028087726, "grad_norm": 10.3125, "learning_rate": 8.727677500704494e-06, "loss": 1.4040920734405518, "step": 1842 }, { "epoch": 0.567602924201616, "grad_norm": 12.0625, "learning_rate": 8.724759634047622e-06, "loss": 1.0131279230117798, "step": 1844 }, { "epoch": 0.5682185455944594, "grad_norm": 5.59375, "learning_rate": 8.721839059736023e-06, "loss": 1.0781059265136719, "step": 1846 }, { "epoch": 0.5688341669873028, "grad_norm": 12.1875, "learning_rate": 8.718915780671939e-06, "loss": 1.4911892414093018, "step": 1848 }, { "epoch": 0.5694497883801463, "grad_norm": 5.8125, "learning_rate": 8.715989799760298e-06, "loss": 1.3792434930801392, "step": 1850 }, { "epoch": 0.5700654097729896, "grad_norm": 5.65625, "learning_rate": 8.713061119908713e-06, "loss": 1.2725658416748047, "step": 1852 }, { "epoch": 0.570681031165833, "grad_norm": 5.8125, "learning_rate": 8.710129744027474e-06, "loss": 1.2276039123535156, "step": 1854 }, { "epoch": 0.5712966525586765, "grad_norm": 4.21875, "learning_rate": 8.707195675029558e-06, "loss": 0.955919086933136, "step": 1856 }, { "epoch": 0.5719122739515198, "grad_norm": 4.53125, "learning_rate": 8.704258915830619e-06, "loss": 1.2571704387664795, "step": 1858 }, { "epoch": 0.5725278953443632, "grad_norm": 10.1875, "learning_rate": 8.701319469348975e-06, "loss": 1.1578973531723022, "step": 1860 }, { "epoch": 0.5731435167372066, "grad_norm": 4.21875, "learning_rate": 8.698377338505623e-06, "loss": 1.2620946168899536, "step": 1862 }, { "epoch": 0.57375913813005, "grad_norm": 5.5, "learning_rate": 8.695432526224223e-06, "loss": 1.1545274257659912, "step": 1864 }, { "epoch": 0.5743747595228934, "grad_norm": 5.34375, "learning_rate": 8.692485035431103e-06, "loss": 1.2381047010421753, "step": 1866 }, { "epoch": 0.5749903809157368, "grad_norm": 10.0625, "learning_rate": 8.689534869055247e-06, "loss": 1.3293325901031494, "step": 1868 }, { "epoch": 0.5756060023085803, "grad_norm": 3.859375, "learning_rate": 8.686582030028304e-06, "loss": 1.2085543870925903, "step": 1870 }, { "epoch": 0.5762216237014236, "grad_norm": 8.3125, "learning_rate": 8.683626521284576e-06, "loss": 0.848010241985321, "step": 1872 }, { "epoch": 0.576837245094267, "grad_norm": 4.46875, "learning_rate": 8.680668345761016e-06, "loss": 1.362102746963501, "step": 1874 }, { "epoch": 0.5774528664871105, "grad_norm": 6.90625, "learning_rate": 8.677707506397235e-06, "loss": 1.1126528978347778, "step": 1876 }, { "epoch": 0.5780684878799538, "grad_norm": 7.8125, "learning_rate": 8.67474400613548e-06, "loss": 1.150965929031372, "step": 1878 }, { "epoch": 0.5786841092727972, "grad_norm": 4.125, "learning_rate": 8.671777847920649e-06, "loss": 1.4929723739624023, "step": 1880 }, { "epoch": 0.5792997306656407, "grad_norm": 5.84375, "learning_rate": 8.66880903470028e-06, "loss": 1.4086570739746094, "step": 1882 }, { "epoch": 0.579915352058484, "grad_norm": 12.25, "learning_rate": 8.665837569424552e-06, "loss": 1.2356774806976318, "step": 1884 }, { "epoch": 0.5805309734513274, "grad_norm": 4.1875, "learning_rate": 8.662863455046272e-06, "loss": 1.2308299541473389, "step": 1886 }, { "epoch": 0.5811465948441709, "grad_norm": 13.3125, "learning_rate": 8.659886694520889e-06, "loss": 1.505770206451416, "step": 1888 }, { "epoch": 0.5817622162370142, "grad_norm": 5.59375, "learning_rate": 8.656907290806471e-06, "loss": 1.5034568309783936, "step": 1890 }, { "epoch": 0.5823778376298576, "grad_norm": 7.40625, "learning_rate": 8.653925246863724e-06, "loss": 0.9744303822517395, "step": 1892 }, { "epoch": 0.582993459022701, "grad_norm": 4.40625, "learning_rate": 8.650940565655968e-06, "loss": 1.5263757705688477, "step": 1894 }, { "epoch": 0.5836090804155445, "grad_norm": 5.5, "learning_rate": 8.647953250149149e-06, "loss": 1.3015506267547607, "step": 1896 }, { "epoch": 0.5842247018083878, "grad_norm": 7.4375, "learning_rate": 8.644963303311829e-06, "loss": 1.2412558794021606, "step": 1898 }, { "epoch": 0.5848403232012312, "grad_norm": 4.09375, "learning_rate": 8.641970728115186e-06, "loss": 1.366623044013977, "step": 1900 }, { "epoch": 0.5854559445940747, "grad_norm": 14.4375, "learning_rate": 8.638975527533007e-06, "loss": 1.654881238937378, "step": 1902 }, { "epoch": 0.586071565986918, "grad_norm": 5.34375, "learning_rate": 8.63597770454169e-06, "loss": 1.6023765802383423, "step": 1904 }, { "epoch": 0.5866871873797614, "grad_norm": 9.0, "learning_rate": 8.632977262120245e-06, "loss": 1.4414507150650024, "step": 1906 }, { "epoch": 0.5873028087726049, "grad_norm": 9.3125, "learning_rate": 8.629974203250273e-06, "loss": 1.3019405603408813, "step": 1908 }, { "epoch": 0.5879184301654482, "grad_norm": 7.28125, "learning_rate": 8.62696853091598e-06, "loss": 1.1021331548690796, "step": 1910 }, { "epoch": 0.5885340515582916, "grad_norm": 8.1875, "learning_rate": 8.623960248104175e-06, "loss": 1.2331730127334595, "step": 1912 }, { "epoch": 0.5891496729511351, "grad_norm": 5.75, "learning_rate": 8.620949357804252e-06, "loss": 1.4081820249557495, "step": 1914 }, { "epoch": 0.5897652943439785, "grad_norm": 4.90625, "learning_rate": 8.6179358630082e-06, "loss": 1.2616064548492432, "step": 1916 }, { "epoch": 0.5903809157368218, "grad_norm": 8.375, "learning_rate": 8.614919766710598e-06, "loss": 1.3621723651885986, "step": 1918 }, { "epoch": 0.5909965371296653, "grad_norm": 14.875, "learning_rate": 8.61190107190861e-06, "loss": 0.9340348243713379, "step": 1920 }, { "epoch": 0.5916121585225087, "grad_norm": 4.65625, "learning_rate": 8.60887978160198e-06, "loss": 1.278018593788147, "step": 1922 }, { "epoch": 0.592227779915352, "grad_norm": 4.53125, "learning_rate": 8.605855898793027e-06, "loss": 1.2426376342773438, "step": 1924 }, { "epoch": 0.5928434013081955, "grad_norm": 3.109375, "learning_rate": 8.602829426486657e-06, "loss": 1.1372792720794678, "step": 1926 }, { "epoch": 0.5934590227010389, "grad_norm": 5.3125, "learning_rate": 8.599800367690342e-06, "loss": 1.3143759965896606, "step": 1928 }, { "epoch": 0.5940746440938822, "grad_norm": 3.8125, "learning_rate": 8.596768725414125e-06, "loss": 1.2842066287994385, "step": 1930 }, { "epoch": 0.5946902654867257, "grad_norm": 6.34375, "learning_rate": 8.593734502670615e-06, "loss": 1.5189852714538574, "step": 1932 }, { "epoch": 0.5953058868795691, "grad_norm": 8.875, "learning_rate": 8.590697702474988e-06, "loss": 1.1181970834732056, "step": 1934 }, { "epoch": 0.5959215082724124, "grad_norm": 6.09375, "learning_rate": 8.587658327844982e-06, "loss": 1.3686904907226562, "step": 1936 }, { "epoch": 0.5965371296652558, "grad_norm": 8.5, "learning_rate": 8.584616381800895e-06, "loss": 1.596742033958435, "step": 1938 }, { "epoch": 0.5971527510580993, "grad_norm": 5.5625, "learning_rate": 8.58157186736557e-06, "loss": 1.4269849061965942, "step": 1940 }, { "epoch": 0.5977683724509427, "grad_norm": 4.5, "learning_rate": 8.578524787564412e-06, "loss": 1.2365959882736206, "step": 1942 }, { "epoch": 0.598383993843786, "grad_norm": 15.25, "learning_rate": 8.575475145425373e-06, "loss": 1.385693907737732, "step": 1944 }, { "epoch": 0.5989996152366295, "grad_norm": 4.5, "learning_rate": 8.572422943978951e-06, "loss": 1.1528739929199219, "step": 1946 }, { "epoch": 0.5996152366294729, "grad_norm": 3.25, "learning_rate": 8.569368186258187e-06, "loss": 0.9854751825332642, "step": 1948 }, { "epoch": 0.6002308580223162, "grad_norm": 15.75, "learning_rate": 8.566310875298662e-06, "loss": 1.4587029218673706, "step": 1950 }, { "epoch": 0.6008464794151597, "grad_norm": 12.0625, "learning_rate": 8.563251014138493e-06, "loss": 1.0503976345062256, "step": 1952 }, { "epoch": 0.6014621008080031, "grad_norm": 5.28125, "learning_rate": 8.560188605818335e-06, "loss": 1.2880537509918213, "step": 1954 }, { "epoch": 0.6020777222008464, "grad_norm": 22.25, "learning_rate": 8.557123653381369e-06, "loss": 1.2823199033737183, "step": 1956 }, { "epoch": 0.6026933435936899, "grad_norm": 3.640625, "learning_rate": 8.554056159873311e-06, "loss": 1.0577878952026367, "step": 1958 }, { "epoch": 0.6033089649865333, "grad_norm": 10.1875, "learning_rate": 8.550986128342395e-06, "loss": 1.6685055494308472, "step": 1960 }, { "epoch": 0.6039245863793767, "grad_norm": 3.75, "learning_rate": 8.54791356183938e-06, "loss": 1.2891515493392944, "step": 1962 }, { "epoch": 0.6045402077722201, "grad_norm": 8.8125, "learning_rate": 8.544838463417547e-06, "loss": 1.311370849609375, "step": 1964 }, { "epoch": 0.6051558291650635, "grad_norm": 10.3125, "learning_rate": 8.541760836132684e-06, "loss": 1.1342506408691406, "step": 1966 }, { "epoch": 0.6057714505579069, "grad_norm": 8.9375, "learning_rate": 8.538680683043105e-06, "loss": 0.7439443469047546, "step": 1968 }, { "epoch": 0.6063870719507503, "grad_norm": 6.28125, "learning_rate": 8.535598007209624e-06, "loss": 1.3986562490463257, "step": 1970 }, { "epoch": 0.6070026933435937, "grad_norm": 4.625, "learning_rate": 8.532512811695567e-06, "loss": 1.389469027519226, "step": 1972 }, { "epoch": 0.6076183147364371, "grad_norm": 1.875, "learning_rate": 8.529425099566761e-06, "loss": 0.9762718677520752, "step": 1974 }, { "epoch": 0.6082339361292805, "grad_norm": 7.09375, "learning_rate": 8.526334873891533e-06, "loss": 1.321108102798462, "step": 1976 }, { "epoch": 0.6088495575221239, "grad_norm": 39.5, "learning_rate": 8.52324213774071e-06, "loss": 0.8753465414047241, "step": 1978 }, { "epoch": 0.6094651789149673, "grad_norm": 9.6875, "learning_rate": 8.520146894187616e-06, "loss": 1.3895344734191895, "step": 1980 }, { "epoch": 0.6100808003078106, "grad_norm": 3.875, "learning_rate": 8.517049146308063e-06, "loss": 1.2270596027374268, "step": 1982 }, { "epoch": 0.6106964217006541, "grad_norm": 59.0, "learning_rate": 8.513948897180348e-06, "loss": 1.7473394870758057, "step": 1984 }, { "epoch": 0.6113120430934975, "grad_norm": 9.75, "learning_rate": 8.510846149885264e-06, "loss": 1.770764946937561, "step": 1986 }, { "epoch": 0.611927664486341, "grad_norm": 7.96875, "learning_rate": 8.50774090750608e-06, "loss": 1.2302086353302002, "step": 1988 }, { "epoch": 0.6125432858791843, "grad_norm": 7.0, "learning_rate": 8.504633173128539e-06, "loss": 1.3983111381530762, "step": 1990 }, { "epoch": 0.6131589072720277, "grad_norm": 7.15625, "learning_rate": 8.501522949840873e-06, "loss": 1.2710623741149902, "step": 1992 }, { "epoch": 0.6137745286648711, "grad_norm": 5.9375, "learning_rate": 8.498410240733776e-06, "loss": 1.2992455959320068, "step": 1994 }, { "epoch": 0.6143901500577145, "grad_norm": 9.25, "learning_rate": 8.495295048900421e-06, "loss": 1.0018309354782104, "step": 1996 }, { "epoch": 0.6150057714505579, "grad_norm": 16.625, "learning_rate": 8.492177377436442e-06, "loss": 1.5174227952957153, "step": 1998 }, { "epoch": 0.6156213928434013, "grad_norm": 7.75, "learning_rate": 8.489057229439937e-06, "loss": 1.758395791053772, "step": 2000 }, { "epoch": 0.6162370142362447, "grad_norm": 13.625, "learning_rate": 8.485934608011469e-06, "loss": 1.062825322151184, "step": 2002 }, { "epoch": 0.6168526356290881, "grad_norm": 7.375, "learning_rate": 8.482809516254058e-06, "loss": 1.2867090702056885, "step": 2004 }, { "epoch": 0.6174682570219315, "grad_norm": 10.8125, "learning_rate": 8.479681957273177e-06, "loss": 1.3977320194244385, "step": 2006 }, { "epoch": 0.618083878414775, "grad_norm": 8.125, "learning_rate": 8.47655193417675e-06, "loss": 1.230679988861084, "step": 2008 }, { "epoch": 0.6186994998076183, "grad_norm": 7.59375, "learning_rate": 8.473419450075149e-06, "loss": 0.9919013381004333, "step": 2010 }, { "epoch": 0.6193151212004617, "grad_norm": 6.0, "learning_rate": 8.470284508081201e-06, "loss": 1.2583743333816528, "step": 2012 }, { "epoch": 0.6199307425933052, "grad_norm": 14.0625, "learning_rate": 8.46714711131016e-06, "loss": 1.474548578262329, "step": 2014 }, { "epoch": 0.6205463639861485, "grad_norm": 11.875, "learning_rate": 8.464007262879736e-06, "loss": 1.4689182043075562, "step": 2016 }, { "epoch": 0.6211619853789919, "grad_norm": 21.125, "learning_rate": 8.460864965910061e-06, "loss": 1.1271132230758667, "step": 2018 }, { "epoch": 0.6217776067718354, "grad_norm": 8.9375, "learning_rate": 8.457720223523704e-06, "loss": 1.3200182914733887, "step": 2020 }, { "epoch": 0.6223932281646787, "grad_norm": 7.6875, "learning_rate": 8.454573038845671e-06, "loss": 1.6913087368011475, "step": 2022 }, { "epoch": 0.6230088495575221, "grad_norm": 8.1875, "learning_rate": 8.451423415003387e-06, "loss": 0.9711453914642334, "step": 2024 }, { "epoch": 0.6236244709503656, "grad_norm": 7.3125, "learning_rate": 8.448271355126707e-06, "loss": 1.281950831413269, "step": 2026 }, { "epoch": 0.6242400923432089, "grad_norm": 4.59375, "learning_rate": 8.4451168623479e-06, "loss": 1.5933538675308228, "step": 2028 }, { "epoch": 0.6248557137360523, "grad_norm": 13.4375, "learning_rate": 8.441959939801657e-06, "loss": 1.6639219522476196, "step": 2030 }, { "epoch": 0.6254713351288957, "grad_norm": 16.5, "learning_rate": 8.438800590625084e-06, "loss": 1.0111881494522095, "step": 2032 }, { "epoch": 0.6260869565217392, "grad_norm": 6.59375, "learning_rate": 8.435638817957696e-06, "loss": 1.194913387298584, "step": 2034 }, { "epoch": 0.6267025779145825, "grad_norm": 6.1875, "learning_rate": 8.432474624941418e-06, "loss": 1.225528359413147, "step": 2036 }, { "epoch": 0.6273181993074259, "grad_norm": 5.75, "learning_rate": 8.429308014720578e-06, "loss": 1.2120106220245361, "step": 2038 }, { "epoch": 0.6279338207002694, "grad_norm": 10.625, "learning_rate": 8.42613899044191e-06, "loss": 1.016963243484497, "step": 2040 }, { "epoch": 0.6285494420931127, "grad_norm": 6.78125, "learning_rate": 8.422967555254544e-06, "loss": 0.8804018497467041, "step": 2042 }, { "epoch": 0.6291650634859561, "grad_norm": 2.828125, "learning_rate": 8.419793712310005e-06, "loss": 1.086695671081543, "step": 2044 }, { "epoch": 0.6297806848787996, "grad_norm": 8.25, "learning_rate": 8.416617464762213e-06, "loss": 1.6183313131332397, "step": 2046 }, { "epoch": 0.6303963062716429, "grad_norm": 4.1875, "learning_rate": 8.413438815767474e-06, "loss": 1.333935260772705, "step": 2048 }, { "epoch": 0.6310119276644863, "grad_norm": 9.75, "learning_rate": 8.410257768484486e-06, "loss": 1.3460040092468262, "step": 2050 }, { "epoch": 0.6316275490573298, "grad_norm": 6.5625, "learning_rate": 8.407074326074325e-06, "loss": 1.3743432760238647, "step": 2052 }, { "epoch": 0.6322431704501732, "grad_norm": 3.921875, "learning_rate": 8.403888491700449e-06, "loss": 1.0204885005950928, "step": 2054 }, { "epoch": 0.6328587918430165, "grad_norm": 10.125, "learning_rate": 8.400700268528695e-06, "loss": 1.1670832633972168, "step": 2056 }, { "epoch": 0.63347441323586, "grad_norm": 5.40625, "learning_rate": 8.39750965972727e-06, "loss": 1.0655497312545776, "step": 2058 }, { "epoch": 0.6340900346287034, "grad_norm": 14.75, "learning_rate": 8.394316668466753e-06, "loss": 1.6687726974487305, "step": 2060 }, { "epoch": 0.6347056560215467, "grad_norm": 9.25, "learning_rate": 8.391121297920093e-06, "loss": 1.5025804042816162, "step": 2062 }, { "epoch": 0.6353212774143902, "grad_norm": 4.6875, "learning_rate": 8.3879235512626e-06, "loss": 1.5211477279663086, "step": 2064 }, { "epoch": 0.6359368988072336, "grad_norm": 3.46875, "learning_rate": 8.384723431671947e-06, "loss": 1.1468708515167236, "step": 2066 }, { "epoch": 0.6365525202000769, "grad_norm": 8.8125, "learning_rate": 8.381520942328163e-06, "loss": 1.2303088903427124, "step": 2068 }, { "epoch": 0.6371681415929203, "grad_norm": 5.84375, "learning_rate": 8.378316086413637e-06, "loss": 1.098226547241211, "step": 2070 }, { "epoch": 0.6377837629857638, "grad_norm": 4.5625, "learning_rate": 8.375108867113104e-06, "loss": 1.3485264778137207, "step": 2072 }, { "epoch": 0.6383993843786071, "grad_norm": 7.40625, "learning_rate": 8.371899287613648e-06, "loss": 1.6760691404342651, "step": 2074 }, { "epoch": 0.6390150057714505, "grad_norm": 4.09375, "learning_rate": 8.368687351104702e-06, "loss": 1.6598718166351318, "step": 2076 }, { "epoch": 0.639630627164294, "grad_norm": 16.0, "learning_rate": 8.36547306077804e-06, "loss": 1.560227394104004, "step": 2078 }, { "epoch": 0.6402462485571374, "grad_norm": 4.25, "learning_rate": 8.362256419827773e-06, "loss": 1.228755235671997, "step": 2080 }, { "epoch": 0.6408618699499807, "grad_norm": 7.28125, "learning_rate": 8.35903743145035e-06, "loss": 1.2501813173294067, "step": 2082 }, { "epoch": 0.6414774913428242, "grad_norm": 3.828125, "learning_rate": 8.355816098844551e-06, "loss": 1.4830968379974365, "step": 2084 }, { "epoch": 0.6420931127356676, "grad_norm": 7.5625, "learning_rate": 8.352592425211488e-06, "loss": 1.3437997102737427, "step": 2086 }, { "epoch": 0.6427087341285109, "grad_norm": 9.3125, "learning_rate": 8.349366413754595e-06, "loss": 1.3608304262161255, "step": 2088 }, { "epoch": 0.6433243555213544, "grad_norm": 3.78125, "learning_rate": 8.346138067679635e-06, "loss": 1.2023274898529053, "step": 2090 }, { "epoch": 0.6439399769141978, "grad_norm": 16.125, "learning_rate": 8.342907390194687e-06, "loss": 1.297385811805725, "step": 2092 }, { "epoch": 0.6445555983070411, "grad_norm": 8.5, "learning_rate": 8.339674384510145e-06, "loss": 1.7631044387817383, "step": 2094 }, { "epoch": 0.6451712196998846, "grad_norm": 4.375, "learning_rate": 8.336439053838722e-06, "loss": 1.2460544109344482, "step": 2096 }, { "epoch": 0.645786841092728, "grad_norm": 13.0, "learning_rate": 8.33320140139544e-06, "loss": 1.054074764251709, "step": 2098 }, { "epoch": 0.6464024624855714, "grad_norm": 8.75, "learning_rate": 8.329961430397623e-06, "loss": 1.279534101486206, "step": 2100 }, { "epoch": 0.6470180838784148, "grad_norm": 10.75, "learning_rate": 8.326719144064905e-06, "loss": 1.72566819190979, "step": 2102 }, { "epoch": 0.6476337052712582, "grad_norm": 5.78125, "learning_rate": 8.323474545619219e-06, "loss": 1.2224308252334595, "step": 2104 }, { "epoch": 0.6482493266641016, "grad_norm": 7.71875, "learning_rate": 8.320227638284795e-06, "loss": 1.363440752029419, "step": 2106 }, { "epoch": 0.648864948056945, "grad_norm": 9.3125, "learning_rate": 8.316978425288157e-06, "loss": 1.9052454233169556, "step": 2108 }, { "epoch": 0.6494805694497884, "grad_norm": 25.125, "learning_rate": 8.313726909858117e-06, "loss": 1.117143988609314, "step": 2110 }, { "epoch": 0.6500961908426318, "grad_norm": 38.75, "learning_rate": 8.310473095225786e-06, "loss": 1.2457895278930664, "step": 2112 }, { "epoch": 0.6507118122354751, "grad_norm": 16.25, "learning_rate": 8.307216984624547e-06, "loss": 1.6599624156951904, "step": 2114 }, { "epoch": 0.6513274336283186, "grad_norm": 9.375, "learning_rate": 8.303958581290074e-06, "loss": 1.7203136682510376, "step": 2116 }, { "epoch": 0.651943055021162, "grad_norm": 7.71875, "learning_rate": 8.300697888460314e-06, "loss": 1.2997355461120605, "step": 2118 }, { "epoch": 0.6525586764140053, "grad_norm": 4.9375, "learning_rate": 8.297434909375488e-06, "loss": 1.1495037078857422, "step": 2120 }, { "epoch": 0.6531742978068488, "grad_norm": 2.921875, "learning_rate": 8.294169647278097e-06, "loss": 1.1950782537460327, "step": 2122 }, { "epoch": 0.6537899191996922, "grad_norm": 7.34375, "learning_rate": 8.290902105412899e-06, "loss": 1.379351258277893, "step": 2124 }, { "epoch": 0.6544055405925356, "grad_norm": 19.125, "learning_rate": 8.287632287026925e-06, "loss": 1.2099511623382568, "step": 2126 }, { "epoch": 0.655021161985379, "grad_norm": 8.1875, "learning_rate": 8.284360195369471e-06, "loss": 1.4267417192459106, "step": 2128 }, { "epoch": 0.6556367833782224, "grad_norm": 4.0, "learning_rate": 8.281085833692083e-06, "loss": 1.3050733804702759, "step": 2130 }, { "epoch": 0.6562524047710658, "grad_norm": 9.3125, "learning_rate": 8.277809205248572e-06, "loss": 1.6474896669387817, "step": 2132 }, { "epoch": 0.6568680261639092, "grad_norm": 5.4375, "learning_rate": 8.274530313294992e-06, "loss": 1.317122459411621, "step": 2134 }, { "epoch": 0.6574836475567526, "grad_norm": 2.453125, "learning_rate": 8.271249161089658e-06, "loss": 1.3636703491210938, "step": 2136 }, { "epoch": 0.658099268949596, "grad_norm": 5.125, "learning_rate": 8.26796575189312e-06, "loss": 1.190585970878601, "step": 2138 }, { "epoch": 0.6587148903424394, "grad_norm": 8.75, "learning_rate": 8.264680088968173e-06, "loss": 1.0943142175674438, "step": 2140 }, { "epoch": 0.6593305117352828, "grad_norm": 4.4375, "learning_rate": 8.261392175579859e-06, "loss": 1.3861849308013916, "step": 2142 }, { "epoch": 0.6599461331281262, "grad_norm": 28.25, "learning_rate": 8.258102014995446e-06, "loss": 1.3402506113052368, "step": 2144 }, { "epoch": 0.6605617545209697, "grad_norm": 16.375, "learning_rate": 8.254809610484449e-06, "loss": 1.638556718826294, "step": 2146 }, { "epoch": 0.661177375913813, "grad_norm": 30.25, "learning_rate": 8.251514965318595e-06, "loss": 1.2110763788223267, "step": 2148 }, { "epoch": 0.6617929973066564, "grad_norm": 6.71875, "learning_rate": 8.24821808277185e-06, "loss": 1.3014509677886963, "step": 2150 }, { "epoch": 0.6624086186994999, "grad_norm": 6.78125, "learning_rate": 8.244918966120402e-06, "loss": 1.265984296798706, "step": 2152 }, { "epoch": 0.6630242400923432, "grad_norm": 10.1875, "learning_rate": 8.241617618642655e-06, "loss": 1.253446102142334, "step": 2154 }, { "epoch": 0.6636398614851866, "grad_norm": 10.5625, "learning_rate": 8.238314043619233e-06, "loss": 1.6335866451263428, "step": 2156 }, { "epoch": 0.66425548287803, "grad_norm": 3.59375, "learning_rate": 8.235008244332971e-06, "loss": 1.3307693004608154, "step": 2158 }, { "epoch": 0.6648711042708734, "grad_norm": 4.96875, "learning_rate": 8.23170022406892e-06, "loss": 1.4052999019622803, "step": 2160 }, { "epoch": 0.6654867256637168, "grad_norm": 4.46875, "learning_rate": 8.228389986114326e-06, "loss": 1.3534044027328491, "step": 2162 }, { "epoch": 0.6661023470565602, "grad_norm": 11.375, "learning_rate": 8.225077533758656e-06, "loss": 1.6899948120117188, "step": 2164 }, { "epoch": 0.6667179684494036, "grad_norm": 8.6875, "learning_rate": 8.221762870293564e-06, "loss": 1.1940737962722778, "step": 2166 }, { "epoch": 0.667333589842247, "grad_norm": 21.25, "learning_rate": 8.218445999012903e-06, "loss": 1.4204351902008057, "step": 2168 }, { "epoch": 0.6679492112350904, "grad_norm": 5.75, "learning_rate": 8.215126923212724e-06, "loss": 1.5102076530456543, "step": 2170 }, { "epoch": 0.6685648326279339, "grad_norm": 32.25, "learning_rate": 8.211805646191268e-06, "loss": 1.3515056371688843, "step": 2172 }, { "epoch": 0.6691804540207772, "grad_norm": 18.0, "learning_rate": 8.208482171248964e-06, "loss": 1.5586285591125488, "step": 2174 }, { "epoch": 0.6697960754136206, "grad_norm": 4.34375, "learning_rate": 8.205156501688418e-06, "loss": 1.2292786836624146, "step": 2176 }, { "epoch": 0.6704116968064641, "grad_norm": 14.9375, "learning_rate": 8.201828640814426e-06, "loss": 1.0528019666671753, "step": 2178 }, { "epoch": 0.6710273181993074, "grad_norm": 10.5, "learning_rate": 8.198498591933961e-06, "loss": 1.7368333339691162, "step": 2180 }, { "epoch": 0.6716429395921508, "grad_norm": 9.6875, "learning_rate": 8.195166358356163e-06, "loss": 1.732184648513794, "step": 2182 }, { "epoch": 0.6722585609849943, "grad_norm": 6.1875, "learning_rate": 8.191831943392347e-06, "loss": 1.375901699066162, "step": 2184 }, { "epoch": 0.6728741823778376, "grad_norm": 4.90625, "learning_rate": 8.188495350355998e-06, "loss": 1.2327485084533691, "step": 2186 }, { "epoch": 0.673489803770681, "grad_norm": 6.875, "learning_rate": 8.185156582562763e-06, "loss": 1.2183061838150024, "step": 2188 }, { "epoch": 0.6741054251635245, "grad_norm": 4.5, "learning_rate": 8.181815643330449e-06, "loss": 1.133251428604126, "step": 2190 }, { "epoch": 0.6747210465563679, "grad_norm": 8.875, "learning_rate": 8.178472535979023e-06, "loss": 1.6299774646759033, "step": 2192 }, { "epoch": 0.6753366679492112, "grad_norm": 8.5, "learning_rate": 8.175127263830605e-06, "loss": 1.6542222499847412, "step": 2194 }, { "epoch": 0.6759522893420546, "grad_norm": 9.875, "learning_rate": 8.17177983020947e-06, "loss": 0.9074614644050598, "step": 2196 }, { "epoch": 0.6765679107348981, "grad_norm": 4.71875, "learning_rate": 8.168430238442033e-06, "loss": 0.7373267412185669, "step": 2198 }, { "epoch": 0.6771835321277414, "grad_norm": 7.125, "learning_rate": 8.165078491856861e-06, "loss": 1.0262969732284546, "step": 2200 }, { "epoch": 0.6777991535205848, "grad_norm": 8.0625, "learning_rate": 8.16172459378466e-06, "loss": 1.5758912563323975, "step": 2202 }, { "epoch": 0.6784147749134283, "grad_norm": 4.96875, "learning_rate": 8.158368547558276e-06, "loss": 0.9562698006629944, "step": 2204 }, { "epoch": 0.6790303963062716, "grad_norm": 6.15625, "learning_rate": 8.15501035651268e-06, "loss": 0.8322404623031616, "step": 2206 }, { "epoch": 0.679646017699115, "grad_norm": 12.6875, "learning_rate": 8.15165002398499e-06, "loss": 1.8041938543319702, "step": 2208 }, { "epoch": 0.6802616390919585, "grad_norm": 8.125, "learning_rate": 8.148287553314438e-06, "loss": 1.2884726524353027, "step": 2210 }, { "epoch": 0.6808772604848018, "grad_norm": 6.9375, "learning_rate": 8.144922947842391e-06, "loss": 0.8856427669525146, "step": 2212 }, { "epoch": 0.6814928818776452, "grad_norm": 3.09375, "learning_rate": 8.141556210912328e-06, "loss": 1.3027023077011108, "step": 2214 }, { "epoch": 0.6821085032704887, "grad_norm": 15.1875, "learning_rate": 8.138187345869855e-06, "loss": 1.2902532815933228, "step": 2216 }, { "epoch": 0.6827241246633321, "grad_norm": 3.671875, "learning_rate": 8.134816356062684e-06, "loss": 1.1476560831069946, "step": 2218 }, { "epoch": 0.6833397460561754, "grad_norm": 1.6875, "learning_rate": 8.131443244840651e-06, "loss": 1.194162130355835, "step": 2220 }, { "epoch": 0.6839553674490189, "grad_norm": 6.21875, "learning_rate": 8.128068015555686e-06, "loss": 1.206117033958435, "step": 2222 }, { "epoch": 0.6845709888418623, "grad_norm": 4.1875, "learning_rate": 8.12469067156183e-06, "loss": 1.229992389678955, "step": 2224 }, { "epoch": 0.6851866102347056, "grad_norm": 5.625, "learning_rate": 8.121311216215229e-06, "loss": 1.1885910034179688, "step": 2226 }, { "epoch": 0.685802231627549, "grad_norm": 7.21875, "learning_rate": 8.117929652874119e-06, "loss": 1.3053927421569824, "step": 2228 }, { "epoch": 0.6864178530203925, "grad_norm": 7.15625, "learning_rate": 8.114545984898838e-06, "loss": 1.7367584705352783, "step": 2230 }, { "epoch": 0.6870334744132358, "grad_norm": 9.875, "learning_rate": 8.111160215651817e-06, "loss": 1.3689879179000854, "step": 2232 }, { "epoch": 0.6876490958060792, "grad_norm": 7.03125, "learning_rate": 8.107772348497563e-06, "loss": 1.7468862533569336, "step": 2234 }, { "epoch": 0.6882647171989227, "grad_norm": 8.75, "learning_rate": 8.104382386802678e-06, "loss": 1.7702608108520508, "step": 2236 }, { "epoch": 0.6888803385917661, "grad_norm": 5.6875, "learning_rate": 8.100990333935845e-06, "loss": 1.2906309366226196, "step": 2238 }, { "epoch": 0.6894959599846094, "grad_norm": 5.6875, "learning_rate": 8.09759619326782e-06, "loss": 1.4636949300765991, "step": 2240 }, { "epoch": 0.6901115813774529, "grad_norm": 6.15625, "learning_rate": 8.09419996817144e-06, "loss": 1.3649095296859741, "step": 2242 }, { "epoch": 0.6907272027702963, "grad_norm": 56.5, "learning_rate": 8.090801662021609e-06, "loss": 1.213357925415039, "step": 2244 }, { "epoch": 0.6913428241631396, "grad_norm": 13.75, "learning_rate": 8.087401278195297e-06, "loss": 1.6974331140518188, "step": 2246 }, { "epoch": 0.6919584455559831, "grad_norm": 7.65625, "learning_rate": 8.083998820071545e-06, "loss": 1.4499943256378174, "step": 2248 }, { "epoch": 0.6925740669488265, "grad_norm": 4.25, "learning_rate": 8.080594291031451e-06, "loss": 1.1680039167404175, "step": 2250 }, { "epoch": 0.6931896883416698, "grad_norm": 6.3125, "learning_rate": 8.077187694458175e-06, "loss": 1.6170227527618408, "step": 2252 }, { "epoch": 0.6938053097345133, "grad_norm": 10.5625, "learning_rate": 8.073779033736922e-06, "loss": 0.6868647336959839, "step": 2254 }, { "epoch": 0.6944209311273567, "grad_norm": 7.34375, "learning_rate": 8.070368312254956e-06, "loss": 1.337400197982788, "step": 2256 }, { "epoch": 0.6950365525202, "grad_norm": 4.71875, "learning_rate": 8.066955533401593e-06, "loss": 1.283442497253418, "step": 2258 }, { "epoch": 0.6956521739130435, "grad_norm": 8.4375, "learning_rate": 8.063540700568182e-06, "loss": 1.2001821994781494, "step": 2260 }, { "epoch": 0.6962677953058869, "grad_norm": 12.25, "learning_rate": 8.06012381714812e-06, "loss": 1.5367563962936401, "step": 2262 }, { "epoch": 0.6968834166987303, "grad_norm": 9.625, "learning_rate": 8.056704886536844e-06, "loss": 1.5206408500671387, "step": 2264 }, { "epoch": 0.6974990380915737, "grad_norm": 3.328125, "learning_rate": 8.053283912131817e-06, "loss": 1.2205138206481934, "step": 2266 }, { "epoch": 0.6981146594844171, "grad_norm": 11.375, "learning_rate": 8.04986089733254e-06, "loss": 1.4563145637512207, "step": 2268 }, { "epoch": 0.6987302808772605, "grad_norm": 4.75, "learning_rate": 8.046435845540543e-06, "loss": 1.5146596431732178, "step": 2270 }, { "epoch": 0.6993459022701038, "grad_norm": 8.0625, "learning_rate": 8.043008760159372e-06, "loss": 1.3689314126968384, "step": 2272 }, { "epoch": 0.6999615236629473, "grad_norm": 14.9375, "learning_rate": 8.0395796445946e-06, "loss": 1.366464614868164, "step": 2274 }, { "epoch": 0.7005771450557907, "grad_norm": 2.40625, "learning_rate": 8.036148502253816e-06, "loss": 1.3394603729248047, "step": 2276 }, { "epoch": 0.701192766448634, "grad_norm": 8.8125, "learning_rate": 8.032715336546627e-06, "loss": 1.3192006349563599, "step": 2278 }, { "epoch": 0.7018083878414775, "grad_norm": 2.734375, "learning_rate": 8.029280150884637e-06, "loss": 1.1054191589355469, "step": 2280 }, { "epoch": 0.7024240092343209, "grad_norm": 5.9375, "learning_rate": 8.025842948681477e-06, "loss": 1.1914633512496948, "step": 2282 }, { "epoch": 0.7030396306271643, "grad_norm": 6.71875, "learning_rate": 8.022403733352767e-06, "loss": 0.8835414052009583, "step": 2284 }, { "epoch": 0.7036552520200077, "grad_norm": 7.0625, "learning_rate": 8.018962508316132e-06, "loss": 1.3785141706466675, "step": 2286 }, { "epoch": 0.7042708734128511, "grad_norm": 2.3125, "learning_rate": 8.015519276991199e-06, "loss": 1.1339272260665894, "step": 2288 }, { "epoch": 0.7048864948056945, "grad_norm": 15.125, "learning_rate": 8.012074042799578e-06, "loss": 1.2178083658218384, "step": 2290 }, { "epoch": 0.7055021161985379, "grad_norm": 4.1875, "learning_rate": 8.008626809164878e-06, "loss": 1.0197231769561768, "step": 2292 }, { "epoch": 0.7061177375913813, "grad_norm": 6.21875, "learning_rate": 8.005177579512698e-06, "loss": 1.201414942741394, "step": 2294 }, { "epoch": 0.7067333589842247, "grad_norm": 6.5, "learning_rate": 8.001726357270602e-06, "loss": 1.4018722772598267, "step": 2296 }, { "epoch": 0.7073489803770681, "grad_norm": 7.84375, "learning_rate": 7.998273145868161e-06, "loss": 1.2003297805786133, "step": 2298 }, { "epoch": 0.7079646017699115, "grad_norm": 3.765625, "learning_rate": 7.994817948736898e-06, "loss": 1.143210530281067, "step": 2300 }, { "epoch": 0.7085802231627549, "grad_norm": 6.5625, "learning_rate": 7.991360769310324e-06, "loss": 1.1977264881134033, "step": 2302 }, { "epoch": 0.7091958445555983, "grad_norm": 34.75, "learning_rate": 7.987901611023918e-06, "loss": 1.6158286333084106, "step": 2304 }, { "epoch": 0.7098114659484417, "grad_norm": 17.75, "learning_rate": 7.984440477315118e-06, "loss": 1.3872449398040771, "step": 2306 }, { "epoch": 0.7104270873412851, "grad_norm": 14.125, "learning_rate": 7.980977371623335e-06, "loss": 1.1497303247451782, "step": 2308 }, { "epoch": 0.7110427087341286, "grad_norm": 5.46875, "learning_rate": 7.977512297389931e-06, "loss": 1.0877271890640259, "step": 2310 }, { "epoch": 0.7116583301269719, "grad_norm": 8.0625, "learning_rate": 7.97404525805823e-06, "loss": 1.3802058696746826, "step": 2312 }, { "epoch": 0.7122739515198153, "grad_norm": 4.0, "learning_rate": 7.970576257073506e-06, "loss": 1.307868480682373, "step": 2314 }, { "epoch": 0.7128895729126588, "grad_norm": 4.5, "learning_rate": 7.967105297882984e-06, "loss": 1.2434139251708984, "step": 2316 }, { "epoch": 0.7135051943055021, "grad_norm": 9.0, "learning_rate": 7.963632383935834e-06, "loss": 1.3490757942199707, "step": 2318 }, { "epoch": 0.7141208156983455, "grad_norm": 9.375, "learning_rate": 7.960157518683164e-06, "loss": 1.2088630199432373, "step": 2320 }, { "epoch": 0.714736437091189, "grad_norm": 9.625, "learning_rate": 7.956680705578033e-06, "loss": 0.7232799530029297, "step": 2322 }, { "epoch": 0.7153520584840323, "grad_norm": 6.25, "learning_rate": 7.953201948075423e-06, "loss": 1.441251277923584, "step": 2324 }, { "epoch": 0.7159676798768757, "grad_norm": 8.875, "learning_rate": 7.949721249632251e-06, "loss": 1.3019921779632568, "step": 2326 }, { "epoch": 0.7165833012697191, "grad_norm": 7.625, "learning_rate": 7.946238613707374e-06, "loss": 1.57804274559021, "step": 2328 }, { "epoch": 0.7171989226625626, "grad_norm": 13.25, "learning_rate": 7.942754043761558e-06, "loss": 1.4858185052871704, "step": 2330 }, { "epoch": 0.7178145440554059, "grad_norm": 3.609375, "learning_rate": 7.9392675432575e-06, "loss": 1.1838147640228271, "step": 2332 }, { "epoch": 0.7184301654482493, "grad_norm": 7.5, "learning_rate": 7.935779115659813e-06, "loss": 1.2579776048660278, "step": 2334 }, { "epoch": 0.7190457868410928, "grad_norm": 6.0, "learning_rate": 7.932288764435028e-06, "loss": 1.0854580402374268, "step": 2336 }, { "epoch": 0.7196614082339361, "grad_norm": 4.9375, "learning_rate": 7.928796493051582e-06, "loss": 1.3975675106048584, "step": 2338 }, { "epoch": 0.7202770296267795, "grad_norm": 4.75, "learning_rate": 7.925302304979827e-06, "loss": 1.115556001663208, "step": 2340 }, { "epoch": 0.720892651019623, "grad_norm": 9.4375, "learning_rate": 7.921806203692017e-06, "loss": 1.4209973812103271, "step": 2342 }, { "epoch": 0.7215082724124663, "grad_norm": 7.90625, "learning_rate": 7.918308192662298e-06, "loss": 1.6648054122924805, "step": 2344 }, { "epoch": 0.7221238938053097, "grad_norm": 7.5, "learning_rate": 7.914808275366733e-06, "loss": 1.1019439697265625, "step": 2346 }, { "epoch": 0.7227395151981532, "grad_norm": 4.125, "learning_rate": 7.911306455283258e-06, "loss": 1.1492326259613037, "step": 2348 }, { "epoch": 0.7233551365909965, "grad_norm": 6.65625, "learning_rate": 7.907802735891716e-06, "loss": 1.2494227886199951, "step": 2350 }, { "epoch": 0.7239707579838399, "grad_norm": 5.78125, "learning_rate": 7.904297120673831e-06, "loss": 1.4483420848846436, "step": 2352 }, { "epoch": 0.7245863793766834, "grad_norm": 2.453125, "learning_rate": 7.900789613113214e-06, "loss": 1.2527090311050415, "step": 2354 }, { "epoch": 0.7252020007695268, "grad_norm": 7.59375, "learning_rate": 7.897280216695346e-06, "loss": 1.3328126668930054, "step": 2356 }, { "epoch": 0.7258176221623701, "grad_norm": 4.3125, "learning_rate": 7.893768934907599e-06, "loss": 1.2253814935684204, "step": 2358 }, { "epoch": 0.7264332435552135, "grad_norm": 24.625, "learning_rate": 7.89025577123921e-06, "loss": 1.8969019651412964, "step": 2360 }, { "epoch": 0.727048864948057, "grad_norm": 5.8125, "learning_rate": 7.886740729181292e-06, "loss": 1.4229713678359985, "step": 2362 }, { "epoch": 0.7276644863409003, "grad_norm": 4.5625, "learning_rate": 7.883223812226817e-06, "loss": 1.3291300535202026, "step": 2364 }, { "epoch": 0.7282801077337437, "grad_norm": 4.625, "learning_rate": 7.879705023870626e-06, "loss": 1.1539807319641113, "step": 2366 }, { "epoch": 0.7288957291265872, "grad_norm": 8.4375, "learning_rate": 7.876184367609418e-06, "loss": 1.536054015159607, "step": 2368 }, { "epoch": 0.7295113505194305, "grad_norm": 5.125, "learning_rate": 7.872661846941747e-06, "loss": 1.4130350351333618, "step": 2370 }, { "epoch": 0.7301269719122739, "grad_norm": 9.4375, "learning_rate": 7.869137465368023e-06, "loss": 1.6681222915649414, "step": 2372 }, { "epoch": 0.7307425933051174, "grad_norm": 6.28125, "learning_rate": 7.865611226390499e-06, "loss": 1.2161335945129395, "step": 2374 }, { "epoch": 0.7313582146979608, "grad_norm": 10.875, "learning_rate": 7.862083133513281e-06, "loss": 1.5017954111099243, "step": 2376 }, { "epoch": 0.7319738360908041, "grad_norm": 7.78125, "learning_rate": 7.858553190242314e-06, "loss": 1.7430108785629272, "step": 2378 }, { "epoch": 0.7325894574836476, "grad_norm": 6.75, "learning_rate": 7.855021400085378e-06, "loss": 1.3919011354446411, "step": 2380 }, { "epoch": 0.733205078876491, "grad_norm": 6.0, "learning_rate": 7.851487766552097e-06, "loss": 1.5471138954162598, "step": 2382 }, { "epoch": 0.7338207002693343, "grad_norm": 6.5625, "learning_rate": 7.847952293153923e-06, "loss": 1.1629184484481812, "step": 2384 }, { "epoch": 0.7344363216621778, "grad_norm": 7.3125, "learning_rate": 7.844414983404128e-06, "loss": 1.168907880783081, "step": 2386 }, { "epoch": 0.7350519430550212, "grad_norm": 6.5, "learning_rate": 7.840875840817819e-06, "loss": 1.5863640308380127, "step": 2388 }, { "epoch": 0.7356675644478645, "grad_norm": 4.28125, "learning_rate": 7.837334868911923e-06, "loss": 1.121029257774353, "step": 2390 }, { "epoch": 0.736283185840708, "grad_norm": 4.5, "learning_rate": 7.833792071205184e-06, "loss": 1.3915820121765137, "step": 2392 }, { "epoch": 0.7368988072335514, "grad_norm": 4.375, "learning_rate": 7.830247451218158e-06, "loss": 1.2680091857910156, "step": 2394 }, { "epoch": 0.7375144286263947, "grad_norm": 5.6875, "learning_rate": 7.826701012473213e-06, "loss": 1.2384248971939087, "step": 2396 }, { "epoch": 0.7381300500192381, "grad_norm": 4.65625, "learning_rate": 7.823152758494523e-06, "loss": 1.1445600986480713, "step": 2398 }, { "epoch": 0.7387456714120816, "grad_norm": 12.75, "learning_rate": 7.81960269280807e-06, "loss": 1.365366816520691, "step": 2400 }, { "epoch": 0.739361292804925, "grad_norm": 5.59375, "learning_rate": 7.816050818941634e-06, "loss": 1.353005051612854, "step": 2402 }, { "epoch": 0.7399769141977683, "grad_norm": 5.875, "learning_rate": 7.81249714042479e-06, "loss": 1.2653348445892334, "step": 2404 }, { "epoch": 0.7405925355906118, "grad_norm": 7.03125, "learning_rate": 7.80894166078891e-06, "loss": 1.3015531301498413, "step": 2406 }, { "epoch": 0.7412081569834552, "grad_norm": 3.6875, "learning_rate": 7.805384383567152e-06, "loss": 1.2278153896331787, "step": 2408 }, { "epoch": 0.7418237783762985, "grad_norm": 9.6875, "learning_rate": 7.801825312294465e-06, "loss": 1.3797082901000977, "step": 2410 }, { "epoch": 0.742439399769142, "grad_norm": 2.578125, "learning_rate": 7.798264450507573e-06, "loss": 1.002259373664856, "step": 2412 }, { "epoch": 0.7430550211619854, "grad_norm": 13.0625, "learning_rate": 7.794701801744989e-06, "loss": 1.1419209241867065, "step": 2414 }, { "epoch": 0.7436706425548287, "grad_norm": 8.1875, "learning_rate": 7.791137369546992e-06, "loss": 1.7288269996643066, "step": 2416 }, { "epoch": 0.7442862639476722, "grad_norm": 2.4375, "learning_rate": 7.787571157455643e-06, "loss": 1.265411615371704, "step": 2418 }, { "epoch": 0.7449018853405156, "grad_norm": 5.5, "learning_rate": 7.784003169014764e-06, "loss": 1.294354796409607, "step": 2420 }, { "epoch": 0.745517506733359, "grad_norm": 4.625, "learning_rate": 7.780433407769948e-06, "loss": 1.1728662252426147, "step": 2422 }, { "epoch": 0.7461331281262024, "grad_norm": 5.3125, "learning_rate": 7.776861877268544e-06, "loss": 1.1810388565063477, "step": 2424 }, { "epoch": 0.7467487495190458, "grad_norm": 9.0, "learning_rate": 7.773288581059661e-06, "loss": 0.8759303689002991, "step": 2426 }, { "epoch": 0.7473643709118892, "grad_norm": 5.71875, "learning_rate": 7.769713522694167e-06, "loss": 1.3331336975097656, "step": 2428 }, { "epoch": 0.7479799923047326, "grad_norm": 4.5, "learning_rate": 7.766136705724675e-06, "loss": 0.8516298532485962, "step": 2430 }, { "epoch": 0.748595613697576, "grad_norm": 20.875, "learning_rate": 7.762558133705551e-06, "loss": 1.1931649446487427, "step": 2432 }, { "epoch": 0.7492112350904194, "grad_norm": 3.953125, "learning_rate": 7.758977810192898e-06, "loss": 1.3147087097167969, "step": 2434 }, { "epoch": 0.7498268564832627, "grad_norm": 12.0, "learning_rate": 7.755395738744567e-06, "loss": 1.4341905117034912, "step": 2436 }, { "epoch": 0.7504424778761062, "grad_norm": 4.46875, "learning_rate": 7.751811922920141e-06, "loss": 1.2278903722763062, "step": 2438 }, { "epoch": 0.7510580992689496, "grad_norm": 7.25, "learning_rate": 7.74822636628094e-06, "loss": 1.236400842666626, "step": 2440 }, { "epoch": 0.7516737206617929, "grad_norm": 19.875, "learning_rate": 7.744639072390013e-06, "loss": 1.4345285892486572, "step": 2442 }, { "epoch": 0.7522893420546364, "grad_norm": 7.78125, "learning_rate": 7.741050044812128e-06, "loss": 1.4369693994522095, "step": 2444 }, { "epoch": 0.7529049634474798, "grad_norm": 3.859375, "learning_rate": 7.737459287113789e-06, "loss": 1.2332228422164917, "step": 2446 }, { "epoch": 0.7535205848403232, "grad_norm": 7.25, "learning_rate": 7.733866802863207e-06, "loss": 1.5218693017959595, "step": 2448 }, { "epoch": 0.7541362062331666, "grad_norm": 1.5703125, "learning_rate": 7.730272595630322e-06, "loss": 1.0981626510620117, "step": 2450 }, { "epoch": 0.75475182762601, "grad_norm": 3.484375, "learning_rate": 7.726676668986769e-06, "loss": 0.9859358668327332, "step": 2452 }, { "epoch": 0.7553674490188534, "grad_norm": 7.90625, "learning_rate": 7.723079026505907e-06, "loss": 1.6938118934631348, "step": 2454 }, { "epoch": 0.7559830704116968, "grad_norm": 3.78125, "learning_rate": 7.719479671762788e-06, "loss": 1.2973098754882812, "step": 2456 }, { "epoch": 0.7565986918045402, "grad_norm": 5.40625, "learning_rate": 7.71587860833418e-06, "loss": 1.4085596799850464, "step": 2458 }, { "epoch": 0.7572143131973836, "grad_norm": 6.90625, "learning_rate": 7.712275839798536e-06, "loss": 1.2574015855789185, "step": 2460 }, { "epoch": 0.757829934590227, "grad_norm": 5.0, "learning_rate": 7.708671369736007e-06, "loss": 1.3973033428192139, "step": 2462 }, { "epoch": 0.7584455559830704, "grad_norm": 6.96875, "learning_rate": 7.705065201728436e-06, "loss": 1.380937099456787, "step": 2464 }, { "epoch": 0.7590611773759138, "grad_norm": 6.8125, "learning_rate": 7.701457339359356e-06, "loss": 1.4758228063583374, "step": 2466 }, { "epoch": 0.7596767987687573, "grad_norm": 3.71875, "learning_rate": 7.697847786213974e-06, "loss": 1.2328314781188965, "step": 2468 }, { "epoch": 0.7602924201616006, "grad_norm": 2.859375, "learning_rate": 7.69423654587919e-06, "loss": 1.1552143096923828, "step": 2470 }, { "epoch": 0.760908041554444, "grad_norm": 6.875, "learning_rate": 7.690623621943574e-06, "loss": 1.1700533628463745, "step": 2472 }, { "epoch": 0.7615236629472875, "grad_norm": 24.0, "learning_rate": 7.687009017997369e-06, "loss": 1.7394222021102905, "step": 2474 }, { "epoch": 0.7621392843401308, "grad_norm": 5.40625, "learning_rate": 7.683392737632484e-06, "loss": 1.5715559720993042, "step": 2476 }, { "epoch": 0.7627549057329742, "grad_norm": 4.21875, "learning_rate": 7.679774784442504e-06, "loss": 1.3047338724136353, "step": 2478 }, { "epoch": 0.7633705271258177, "grad_norm": 6.03125, "learning_rate": 7.676155162022664e-06, "loss": 1.3770108222961426, "step": 2480 }, { "epoch": 0.763986148518661, "grad_norm": 7.65625, "learning_rate": 7.672533873969867e-06, "loss": 1.4164502620697021, "step": 2482 }, { "epoch": 0.7646017699115044, "grad_norm": 9.4375, "learning_rate": 7.66891092388267e-06, "loss": 1.746399164199829, "step": 2484 }, { "epoch": 0.7652173913043478, "grad_norm": 5.375, "learning_rate": 7.66528631536128e-06, "loss": 1.126002311706543, "step": 2486 }, { "epoch": 0.7658330126971912, "grad_norm": 3.953125, "learning_rate": 7.661660052007547e-06, "loss": 1.0803347826004028, "step": 2488 }, { "epoch": 0.7664486340900346, "grad_norm": 6.84375, "learning_rate": 7.658032137424973e-06, "loss": 1.6090704202651978, "step": 2490 }, { "epoch": 0.767064255482878, "grad_norm": 5.625, "learning_rate": 7.654402575218698e-06, "loss": 1.215574026107788, "step": 2492 }, { "epoch": 0.7676798768757215, "grad_norm": 8.6875, "learning_rate": 7.6507713689955e-06, "loss": 0.8813379406929016, "step": 2494 }, { "epoch": 0.7682954982685648, "grad_norm": 7.65625, "learning_rate": 7.647138522363788e-06, "loss": 1.4518318176269531, "step": 2496 }, { "epoch": 0.7689111196614082, "grad_norm": 7.21875, "learning_rate": 7.643504038933607e-06, "loss": 1.2638100385665894, "step": 2498 }, { "epoch": 0.7695267410542517, "grad_norm": 2.109375, "learning_rate": 7.639867922316616e-06, "loss": 1.2956639528274536, "step": 2500 }, { "epoch": 0.770142362447095, "grad_norm": 4.78125, "learning_rate": 7.636230176126116e-06, "loss": 1.4407836198806763, "step": 2502 }, { "epoch": 0.7707579838399384, "grad_norm": 5.96875, "learning_rate": 7.632590803977014e-06, "loss": 1.2232799530029297, "step": 2504 }, { "epoch": 0.7713736052327819, "grad_norm": 6.0625, "learning_rate": 7.628949809485832e-06, "loss": 1.1274707317352295, "step": 2506 }, { "epoch": 0.7719892266256252, "grad_norm": 7.625, "learning_rate": 7.62530719627071e-06, "loss": 1.583906888961792, "step": 2508 }, { "epoch": 0.7726048480184686, "grad_norm": 5.1875, "learning_rate": 7.621662967951395e-06, "loss": 1.2432018518447876, "step": 2510 }, { "epoch": 0.7732204694113121, "grad_norm": 7.90625, "learning_rate": 7.618017128149238e-06, "loss": 1.9361577033996582, "step": 2512 }, { "epoch": 0.7738360908041555, "grad_norm": 3.671875, "learning_rate": 7.61436968048719e-06, "loss": 1.390897512435913, "step": 2514 }, { "epoch": 0.7744517121969988, "grad_norm": 3.78125, "learning_rate": 7.610720628589805e-06, "loss": 1.0258307456970215, "step": 2516 }, { "epoch": 0.7750673335898423, "grad_norm": 4.8125, "learning_rate": 7.607069976083226e-06, "loss": 1.1963353157043457, "step": 2518 }, { "epoch": 0.7756829549826857, "grad_norm": 5.8125, "learning_rate": 7.6034177265951855e-06, "loss": 1.3238426446914673, "step": 2520 }, { "epoch": 0.776298576375529, "grad_norm": 4.71875, "learning_rate": 7.599763883755009e-06, "loss": 1.54075026512146, "step": 2522 }, { "epoch": 0.7769141977683724, "grad_norm": 4.6875, "learning_rate": 7.596108451193602e-06, "loss": 1.1980767250061035, "step": 2524 }, { "epoch": 0.7775298191612159, "grad_norm": 4.46875, "learning_rate": 7.5924514325434484e-06, "loss": 1.0477863550186157, "step": 2526 }, { "epoch": 0.7781454405540592, "grad_norm": 4.25, "learning_rate": 7.5887928314386115e-06, "loss": 1.2851420640945435, "step": 2528 }, { "epoch": 0.7787610619469026, "grad_norm": 4.09375, "learning_rate": 7.585132651514722e-06, "loss": 1.459242343902588, "step": 2530 }, { "epoch": 0.7793766833397461, "grad_norm": 7.40625, "learning_rate": 7.581470896408984e-06, "loss": 1.0518429279327393, "step": 2532 }, { "epoch": 0.7799923047325894, "grad_norm": 3.671875, "learning_rate": 7.577807569760169e-06, "loss": 1.1042135953903198, "step": 2534 }, { "epoch": 0.7806079261254328, "grad_norm": 6.96875, "learning_rate": 7.574142675208602e-06, "loss": 1.0141704082489014, "step": 2536 }, { "epoch": 0.7812235475182763, "grad_norm": 4.625, "learning_rate": 7.570476216396174e-06, "loss": 1.2268452644348145, "step": 2538 }, { "epoch": 0.7818391689111197, "grad_norm": 9.6875, "learning_rate": 7.566808196966326e-06, "loss": 0.9167345762252808, "step": 2540 }, { "epoch": 0.782454790303963, "grad_norm": 2.5, "learning_rate": 7.563138620564052e-06, "loss": 1.2149308919906616, "step": 2542 }, { "epoch": 0.7830704116968065, "grad_norm": 7.3125, "learning_rate": 7.55946749083589e-06, "loss": 0.9064674973487854, "step": 2544 }, { "epoch": 0.7836860330896499, "grad_norm": 5.03125, "learning_rate": 7.5557948114299265e-06, "loss": 1.6139088869094849, "step": 2546 }, { "epoch": 0.7843016544824932, "grad_norm": 12.5, "learning_rate": 7.552120585995786e-06, "loss": 1.1430484056472778, "step": 2548 }, { "epoch": 0.7849172758753367, "grad_norm": 10.375, "learning_rate": 7.548444818184626e-06, "loss": 1.3700685501098633, "step": 2550 }, { "epoch": 0.7855328972681801, "grad_norm": 5.5, "learning_rate": 7.544767511649138e-06, "loss": 1.3238214254379272, "step": 2552 }, { "epoch": 0.7861485186610234, "grad_norm": 5.625, "learning_rate": 7.541088670043548e-06, "loss": 1.196365237236023, "step": 2554 }, { "epoch": 0.7867641400538669, "grad_norm": 9.75, "learning_rate": 7.537408297023605e-06, "loss": 1.2222591638565063, "step": 2556 }, { "epoch": 0.7873797614467103, "grad_norm": 8.0625, "learning_rate": 7.5337263962465704e-06, "loss": 1.451030969619751, "step": 2558 }, { "epoch": 0.7879953828395537, "grad_norm": 3.5625, "learning_rate": 7.5300429713712385e-06, "loss": 1.190792441368103, "step": 2560 }, { "epoch": 0.788611004232397, "grad_norm": 8.8125, "learning_rate": 7.5263580260579096e-06, "loss": 1.4873063564300537, "step": 2562 }, { "epoch": 0.7892266256252405, "grad_norm": 4.0625, "learning_rate": 7.5226715639683936e-06, "loss": 1.1327518224716187, "step": 2564 }, { "epoch": 0.7898422470180839, "grad_norm": 6.375, "learning_rate": 7.518983588766013e-06, "loss": 1.3757460117340088, "step": 2566 }, { "epoch": 0.7904578684109272, "grad_norm": 3.109375, "learning_rate": 7.515294104115592e-06, "loss": 1.0891473293304443, "step": 2568 }, { "epoch": 0.7910734898037707, "grad_norm": 5.1875, "learning_rate": 7.511603113683452e-06, "loss": 1.2719448804855347, "step": 2570 }, { "epoch": 0.7916891111966141, "grad_norm": 5.96875, "learning_rate": 7.507910621137413e-06, "loss": 1.379211187362671, "step": 2572 }, { "epoch": 0.7923047325894574, "grad_norm": 7.65625, "learning_rate": 7.5042166301467904e-06, "loss": 1.6094646453857422, "step": 2574 }, { "epoch": 0.7929203539823009, "grad_norm": 6.5625, "learning_rate": 7.500521144382385e-06, "loss": 1.4890835285186768, "step": 2576 }, { "epoch": 0.7935359753751443, "grad_norm": 13.625, "learning_rate": 7.496824167516481e-06, "loss": 1.1069023609161377, "step": 2578 }, { "epoch": 0.7941515967679876, "grad_norm": 3.5, "learning_rate": 7.49312570322285e-06, "loss": 0.9310998916625977, "step": 2580 }, { "epoch": 0.7947672181608311, "grad_norm": 7.71875, "learning_rate": 7.489425755176738e-06, "loss": 1.2464100122451782, "step": 2582 }, { "epoch": 0.7953828395536745, "grad_norm": 5.03125, "learning_rate": 7.4857243270548666e-06, "loss": 1.2356469631195068, "step": 2584 }, { "epoch": 0.7959984609465179, "grad_norm": 7.625, "learning_rate": 7.482021422535428e-06, "loss": 1.5616161823272705, "step": 2586 }, { "epoch": 0.7966140823393613, "grad_norm": 24.25, "learning_rate": 7.47831704529808e-06, "loss": 1.4321647882461548, "step": 2588 }, { "epoch": 0.7972297037322047, "grad_norm": 5.0625, "learning_rate": 7.474611199023949e-06, "loss": 1.0948270559310913, "step": 2590 }, { "epoch": 0.7978453251250481, "grad_norm": 5.9375, "learning_rate": 7.470903887395611e-06, "loss": 1.3025152683258057, "step": 2592 }, { "epoch": 0.7984609465178915, "grad_norm": 9.25, "learning_rate": 7.46719511409711e-06, "loss": 1.2904844284057617, "step": 2594 }, { "epoch": 0.7990765679107349, "grad_norm": 6.71875, "learning_rate": 7.463484882813938e-06, "loss": 1.301687240600586, "step": 2596 }, { "epoch": 0.7996921893035783, "grad_norm": 11.625, "learning_rate": 7.459773197233031e-06, "loss": 1.275160551071167, "step": 2598 }, { "epoch": 0.8003078106964217, "grad_norm": 8.625, "learning_rate": 7.456060061042774e-06, "loss": 1.452974557876587, "step": 2600 }, { "epoch": 0.8009234320892651, "grad_norm": 3.3125, "learning_rate": 7.452345477932999e-06, "loss": 1.0892770290374756, "step": 2602 }, { "epoch": 0.8015390534821085, "grad_norm": 8.75, "learning_rate": 7.4486294515949665e-06, "loss": 1.018752932548523, "step": 2604 }, { "epoch": 0.802154674874952, "grad_norm": 27.375, "learning_rate": 7.4449119857213725e-06, "loss": 1.1587097644805908, "step": 2606 }, { "epoch": 0.8027702962677953, "grad_norm": 6.125, "learning_rate": 7.441193084006353e-06, "loss": 1.2957143783569336, "step": 2608 }, { "epoch": 0.8033859176606387, "grad_norm": 9.875, "learning_rate": 7.437472750145458e-06, "loss": 1.380386471748352, "step": 2610 }, { "epoch": 0.8040015390534822, "grad_norm": 12.875, "learning_rate": 7.433750987835668e-06, "loss": 0.9111630916595459, "step": 2612 }, { "epoch": 0.8046171604463255, "grad_norm": 1.9765625, "learning_rate": 7.430027800775386e-06, "loss": 1.0486061573028564, "step": 2614 }, { "epoch": 0.8052327818391689, "grad_norm": 7.71875, "learning_rate": 7.426303192664421e-06, "loss": 1.111664056777954, "step": 2616 }, { "epoch": 0.8058484032320123, "grad_norm": 5.21875, "learning_rate": 7.422577167204003e-06, "loss": 1.3210314512252808, "step": 2618 }, { "epoch": 0.8064640246248557, "grad_norm": 5.34375, "learning_rate": 7.418849728096767e-06, "loss": 1.2019422054290771, "step": 2620 }, { "epoch": 0.8070796460176991, "grad_norm": 41.5, "learning_rate": 7.415120879046749e-06, "loss": 0.9851787090301514, "step": 2622 }, { "epoch": 0.8076952674105425, "grad_norm": 2.6875, "learning_rate": 7.411390623759392e-06, "loss": 1.1129413843154907, "step": 2624 }, { "epoch": 0.8083108888033859, "grad_norm": 6.53125, "learning_rate": 7.407658965941535e-06, "loss": 1.2529385089874268, "step": 2626 }, { "epoch": 0.8089265101962293, "grad_norm": 12.0625, "learning_rate": 7.40392590930141e-06, "loss": 1.3294597864151, "step": 2628 }, { "epoch": 0.8095421315890727, "grad_norm": 3.625, "learning_rate": 7.40019145754864e-06, "loss": 0.9272560477256775, "step": 2630 }, { "epoch": 0.8101577529819162, "grad_norm": 1.7265625, "learning_rate": 7.3964556143942315e-06, "loss": 1.1824629306793213, "step": 2632 }, { "epoch": 0.8107733743747595, "grad_norm": 11.0625, "learning_rate": 7.392718383550576e-06, "loss": 1.0783371925354004, "step": 2634 }, { "epoch": 0.8113889957676029, "grad_norm": 6.3125, "learning_rate": 7.388979768731444e-06, "loss": 1.2361860275268555, "step": 2636 }, { "epoch": 0.8120046171604464, "grad_norm": 6.15625, "learning_rate": 7.38523977365198e-06, "loss": 1.3848330974578857, "step": 2638 }, { "epoch": 0.8126202385532897, "grad_norm": 6.59375, "learning_rate": 7.381498402028704e-06, "loss": 1.2543951272964478, "step": 2640 }, { "epoch": 0.8132358599461331, "grad_norm": 6.3125, "learning_rate": 7.377755657579495e-06, "loss": 1.4167609214782715, "step": 2642 }, { "epoch": 0.8138514813389766, "grad_norm": 5.375, "learning_rate": 7.374011544023607e-06, "loss": 1.3318917751312256, "step": 2644 }, { "epoch": 0.8144671027318199, "grad_norm": 7.375, "learning_rate": 7.3702660650816485e-06, "loss": 1.0573267936706543, "step": 2646 }, { "epoch": 0.8150827241246633, "grad_norm": 3.328125, "learning_rate": 7.366519224475585e-06, "loss": 1.1513400077819824, "step": 2648 }, { "epoch": 0.8156983455175068, "grad_norm": 7.09375, "learning_rate": 7.362771025928736e-06, "loss": 1.4746935367584229, "step": 2650 }, { "epoch": 0.8163139669103502, "grad_norm": 5.65625, "learning_rate": 7.3590214731657724e-06, "loss": 1.2213988304138184, "step": 2652 }, { "epoch": 0.8169295883031935, "grad_norm": 8.6875, "learning_rate": 7.355270569912707e-06, "loss": 1.4360812902450562, "step": 2654 }, { "epoch": 0.817545209696037, "grad_norm": 6.90625, "learning_rate": 7.351518319896895e-06, "loss": 1.447126030921936, "step": 2656 }, { "epoch": 0.8181608310888804, "grad_norm": 6.40625, "learning_rate": 7.347764726847035e-06, "loss": 0.9661370515823364, "step": 2658 }, { "epoch": 0.8187764524817237, "grad_norm": 24.5, "learning_rate": 7.3440097944931545e-06, "loss": 1.2369790077209473, "step": 2660 }, { "epoch": 0.8193920738745671, "grad_norm": 7.90625, "learning_rate": 7.340253526566614e-06, "loss": 1.4130780696868896, "step": 2662 }, { "epoch": 0.8200076952674106, "grad_norm": 3.953125, "learning_rate": 7.3364959268001e-06, "loss": 1.210066318511963, "step": 2664 }, { "epoch": 0.8206233166602539, "grad_norm": 18.375, "learning_rate": 7.332736998927628e-06, "loss": 1.090823769569397, "step": 2666 }, { "epoch": 0.8212389380530973, "grad_norm": 5.5625, "learning_rate": 7.328976746684522e-06, "loss": 1.0669875144958496, "step": 2668 }, { "epoch": 0.8218545594459408, "grad_norm": 6.21875, "learning_rate": 7.325215173807434e-06, "loss": 1.2229068279266357, "step": 2670 }, { "epoch": 0.8224701808387841, "grad_norm": 11.1875, "learning_rate": 7.321452284034323e-06, "loss": 1.389937400817871, "step": 2672 }, { "epoch": 0.8230858022316275, "grad_norm": 13.25, "learning_rate": 7.317688081104455e-06, "loss": 1.0739669799804688, "step": 2674 }, { "epoch": 0.823701423624471, "grad_norm": 6.625, "learning_rate": 7.313922568758403e-06, "loss": 1.4765902757644653, "step": 2676 }, { "epoch": 0.8243170450173144, "grad_norm": 3.78125, "learning_rate": 7.310155750738044e-06, "loss": 1.188515543937683, "step": 2678 }, { "epoch": 0.8249326664101577, "grad_norm": 6.96875, "learning_rate": 7.306387630786544e-06, "loss": 1.0044671297073364, "step": 2680 }, { "epoch": 0.8255482878030012, "grad_norm": 6.90625, "learning_rate": 7.3026182126483755e-06, "loss": 1.2412277460098267, "step": 2682 }, { "epoch": 0.8261639091958446, "grad_norm": 4.09375, "learning_rate": 7.298847500069286e-06, "loss": 1.2116367816925049, "step": 2684 }, { "epoch": 0.8267795305886879, "grad_norm": 5.28125, "learning_rate": 7.295075496796324e-06, "loss": 1.3458802700042725, "step": 2686 }, { "epoch": 0.8273951519815314, "grad_norm": 5.25, "learning_rate": 7.291302206577808e-06, "loss": 1.289811611175537, "step": 2688 }, { "epoch": 0.8280107733743748, "grad_norm": 6.90625, "learning_rate": 7.287527633163345e-06, "loss": 1.3732936382293701, "step": 2690 }, { "epoch": 0.8286263947672181, "grad_norm": 3.984375, "learning_rate": 7.283751780303812e-06, "loss": 1.0649974346160889, "step": 2692 }, { "epoch": 0.8292420161600615, "grad_norm": 12.75, "learning_rate": 7.27997465175136e-06, "loss": 1.5276716947555542, "step": 2694 }, { "epoch": 0.829857637552905, "grad_norm": 10.0625, "learning_rate": 7.276196251259402e-06, "loss": 1.4253476858139038, "step": 2696 }, { "epoch": 0.8304732589457484, "grad_norm": 6.5625, "learning_rate": 7.272416582582624e-06, "loss": 1.1293399333953857, "step": 2698 }, { "epoch": 0.8310888803385917, "grad_norm": 3.921875, "learning_rate": 7.268635649476965e-06, "loss": 1.224055528640747, "step": 2700 }, { "epoch": 0.8317045017314352, "grad_norm": 3.828125, "learning_rate": 7.264853455699623e-06, "loss": 1.432799220085144, "step": 2702 }, { "epoch": 0.8323201231242786, "grad_norm": 5.3125, "learning_rate": 7.261070005009052e-06, "loss": 1.2237067222595215, "step": 2704 }, { "epoch": 0.8329357445171219, "grad_norm": 9.4375, "learning_rate": 7.257285301164947e-06, "loss": 1.2335262298583984, "step": 2706 }, { "epoch": 0.8335513659099654, "grad_norm": 10.5, "learning_rate": 7.2534993479282545e-06, "loss": 1.619771957397461, "step": 2708 }, { "epoch": 0.8341669873028088, "grad_norm": 10.0, "learning_rate": 7.2497121490611636e-06, "loss": 1.460617184638977, "step": 2710 }, { "epoch": 0.8347826086956521, "grad_norm": 10.8125, "learning_rate": 7.245923708327096e-06, "loss": 1.450473427772522, "step": 2712 }, { "epoch": 0.8353982300884956, "grad_norm": 2.234375, "learning_rate": 7.242134029490711e-06, "loss": 0.9833471179008484, "step": 2714 }, { "epoch": 0.836013851481339, "grad_norm": 6.6875, "learning_rate": 7.2383431163178965e-06, "loss": 1.161080241203308, "step": 2716 }, { "epoch": 0.8366294728741823, "grad_norm": 8.25, "learning_rate": 7.234550972575769e-06, "loss": 1.712884545326233, "step": 2718 }, { "epoch": 0.8372450942670258, "grad_norm": 5.71875, "learning_rate": 7.230757602032667e-06, "loss": 1.40744149684906, "step": 2720 }, { "epoch": 0.8378607156598692, "grad_norm": 3.140625, "learning_rate": 7.2269630084581475e-06, "loss": 1.321396827697754, "step": 2722 }, { "epoch": 0.8384763370527126, "grad_norm": 5.3125, "learning_rate": 7.223167195622982e-06, "loss": 1.2216382026672363, "step": 2724 }, { "epoch": 0.839091958445556, "grad_norm": 52.0, "learning_rate": 7.219370167299158e-06, "loss": 1.2050942182540894, "step": 2726 }, { "epoch": 0.8397075798383994, "grad_norm": 12.125, "learning_rate": 7.215571927259863e-06, "loss": 1.2433619499206543, "step": 2728 }, { "epoch": 0.8403232012312428, "grad_norm": 2.953125, "learning_rate": 7.2117724792795e-06, "loss": 1.2390086650848389, "step": 2730 }, { "epoch": 0.8409388226240861, "grad_norm": 11.375, "learning_rate": 7.207971827133657e-06, "loss": 1.348872184753418, "step": 2732 }, { "epoch": 0.8415544440169296, "grad_norm": 23.375, "learning_rate": 7.204169974599134e-06, "loss": 1.6377098560333252, "step": 2734 }, { "epoch": 0.842170065409773, "grad_norm": 6.5, "learning_rate": 7.200366925453915e-06, "loss": 1.6480786800384521, "step": 2736 }, { "epoch": 0.8427856868026163, "grad_norm": 5.96875, "learning_rate": 7.196562683477175e-06, "loss": 1.4472780227661133, "step": 2738 }, { "epoch": 0.8434013081954598, "grad_norm": 2.5625, "learning_rate": 7.192757252449272e-06, "loss": 1.3245518207550049, "step": 2740 }, { "epoch": 0.8440169295883032, "grad_norm": 5.59375, "learning_rate": 7.188950636151752e-06, "loss": 1.510989785194397, "step": 2742 }, { "epoch": 0.8446325509811466, "grad_norm": 10.625, "learning_rate": 7.185142838367334e-06, "loss": 1.5210520029067993, "step": 2744 }, { "epoch": 0.84524817237399, "grad_norm": 8.375, "learning_rate": 7.181333862879911e-06, "loss": 1.4478716850280762, "step": 2746 }, { "epoch": 0.8458637937668334, "grad_norm": 8.5, "learning_rate": 7.1775237134745505e-06, "loss": 1.3034485578536987, "step": 2748 }, { "epoch": 0.8464794151596768, "grad_norm": 8.25, "learning_rate": 7.173712393937477e-06, "loss": 1.708869457244873, "step": 2750 }, { "epoch": 0.8470950365525202, "grad_norm": 5.90625, "learning_rate": 7.16989990805609e-06, "loss": 1.3472611904144287, "step": 2752 }, { "epoch": 0.8477106579453636, "grad_norm": 3.015625, "learning_rate": 7.166086259618938e-06, "loss": 1.2394222021102905, "step": 2754 }, { "epoch": 0.848326279338207, "grad_norm": 2.890625, "learning_rate": 7.162271452415734e-06, "loss": 0.943736732006073, "step": 2756 }, { "epoch": 0.8489419007310504, "grad_norm": 41.5, "learning_rate": 7.158455490237333e-06, "loss": 1.3057875633239746, "step": 2758 }, { "epoch": 0.8495575221238938, "grad_norm": 8.6875, "learning_rate": 7.154638376875744e-06, "loss": 1.333833932876587, "step": 2760 }, { "epoch": 0.8501731435167372, "grad_norm": 38.25, "learning_rate": 7.150820116124117e-06, "loss": 1.2368322610855103, "step": 2762 }, { "epoch": 0.8507887649095806, "grad_norm": 8.0625, "learning_rate": 7.147000711776744e-06, "loss": 1.2598323822021484, "step": 2764 }, { "epoch": 0.851404386302424, "grad_norm": 2.125, "learning_rate": 7.1431801676290535e-06, "loss": 1.039093017578125, "step": 2766 }, { "epoch": 0.8520200076952674, "grad_norm": 5.53125, "learning_rate": 7.139358487477606e-06, "loss": 1.363373041152954, "step": 2768 }, { "epoch": 0.8526356290881109, "grad_norm": 8.375, "learning_rate": 7.1355356751200886e-06, "loss": 1.186076283454895, "step": 2770 }, { "epoch": 0.8532512504809542, "grad_norm": 6.46875, "learning_rate": 7.131711734355315e-06, "loss": 1.3930425643920898, "step": 2772 }, { "epoch": 0.8538668718737976, "grad_norm": 8.875, "learning_rate": 7.1278866689832235e-06, "loss": 1.4591971635818481, "step": 2774 }, { "epoch": 0.854482493266641, "grad_norm": 4.75, "learning_rate": 7.124060482804869e-06, "loss": 1.313432216644287, "step": 2776 }, { "epoch": 0.8550981146594844, "grad_norm": 9.5625, "learning_rate": 7.120233179622414e-06, "loss": 1.3527673482894897, "step": 2778 }, { "epoch": 0.8557137360523278, "grad_norm": 11.3125, "learning_rate": 7.1164047632391375e-06, "loss": 1.3231563568115234, "step": 2780 }, { "epoch": 0.8563293574451712, "grad_norm": 7.75, "learning_rate": 7.112575237459425e-06, "loss": 1.3130462169647217, "step": 2782 }, { "epoch": 0.8569449788380146, "grad_norm": 3.21875, "learning_rate": 7.108744606088758e-06, "loss": 0.9800470471382141, "step": 2784 }, { "epoch": 0.857560600230858, "grad_norm": 6.0, "learning_rate": 7.1049128729337205e-06, "loss": 1.008358359336853, "step": 2786 }, { "epoch": 0.8581762216237014, "grad_norm": 5.0, "learning_rate": 7.101080041801996e-06, "loss": 1.1537256240844727, "step": 2788 }, { "epoch": 0.8587918430165449, "grad_norm": 8.5625, "learning_rate": 7.097246116502352e-06, "loss": 1.2312843799591064, "step": 2790 }, { "epoch": 0.8594074644093882, "grad_norm": 14.625, "learning_rate": 7.093411100844645e-06, "loss": 0.8555831909179688, "step": 2792 }, { "epoch": 0.8600230858022316, "grad_norm": 6.6875, "learning_rate": 7.089574998639819e-06, "loss": 1.0235496759414673, "step": 2794 }, { "epoch": 0.8606387071950751, "grad_norm": 13.9375, "learning_rate": 7.085737813699894e-06, "loss": 1.1901612281799316, "step": 2796 }, { "epoch": 0.8612543285879184, "grad_norm": 8.4375, "learning_rate": 7.081899549837963e-06, "loss": 1.1888796091079712, "step": 2798 }, { "epoch": 0.8618699499807618, "grad_norm": 6.125, "learning_rate": 7.078060210868198e-06, "loss": 1.2371288537979126, "step": 2800 }, { "epoch": 0.8624855713736053, "grad_norm": 4.6875, "learning_rate": 7.074219800605837e-06, "loss": 0.8463555574417114, "step": 2802 }, { "epoch": 0.8631011927664486, "grad_norm": 7.8125, "learning_rate": 7.070378322867182e-06, "loss": 1.2541903257369995, "step": 2804 }, { "epoch": 0.863716814159292, "grad_norm": 7.15625, "learning_rate": 7.066535781469593e-06, "loss": 1.4080777168273926, "step": 2806 }, { "epoch": 0.8643324355521355, "grad_norm": 8.5625, "learning_rate": 7.06269218023149e-06, "loss": 1.2658737897872925, "step": 2808 }, { "epoch": 0.8649480569449788, "grad_norm": 3.171875, "learning_rate": 7.0588475229723475e-06, "loss": 1.2996368408203125, "step": 2810 }, { "epoch": 0.8655636783378222, "grad_norm": 7.34375, "learning_rate": 7.055001813512683e-06, "loss": 1.2913126945495605, "step": 2812 }, { "epoch": 0.8661792997306657, "grad_norm": 4.21875, "learning_rate": 7.051155055674073e-06, "loss": 1.2469276189804077, "step": 2814 }, { "epoch": 0.8667949211235091, "grad_norm": 4.0625, "learning_rate": 7.047307253279119e-06, "loss": 1.3468555212020874, "step": 2816 }, { "epoch": 0.8674105425163524, "grad_norm": 8.3125, "learning_rate": 7.043458410151472e-06, "loss": 1.4589338302612305, "step": 2818 }, { "epoch": 0.8680261639091958, "grad_norm": 7.28125, "learning_rate": 7.039608530115809e-06, "loss": 1.2077659368515015, "step": 2820 }, { "epoch": 0.8686417853020393, "grad_norm": 3.859375, "learning_rate": 7.035757616997849e-06, "loss": 1.411242127418518, "step": 2822 }, { "epoch": 0.8692574066948826, "grad_norm": 4.5625, "learning_rate": 7.031905674624329e-06, "loss": 1.2462449073791504, "step": 2824 }, { "epoch": 0.869873028087726, "grad_norm": 2.328125, "learning_rate": 7.0280527068230076e-06, "loss": 1.0618919134140015, "step": 2826 }, { "epoch": 0.8704886494805695, "grad_norm": 4.09375, "learning_rate": 7.024198717422666e-06, "loss": 1.2162715196609497, "step": 2828 }, { "epoch": 0.8711042708734128, "grad_norm": 7.78125, "learning_rate": 7.020343710253101e-06, "loss": 1.230750322341919, "step": 2830 }, { "epoch": 0.8717198922662562, "grad_norm": 6.84375, "learning_rate": 7.01648768914512e-06, "loss": 1.0324935913085938, "step": 2832 }, { "epoch": 0.8723355136590997, "grad_norm": 4.125, "learning_rate": 7.012630657930537e-06, "loss": 1.205783724784851, "step": 2834 }, { "epoch": 0.8729511350519431, "grad_norm": 7.75, "learning_rate": 7.008772620442171e-06, "loss": 1.4939160346984863, "step": 2836 }, { "epoch": 0.8735667564447864, "grad_norm": 9.0625, "learning_rate": 7.004913580513839e-06, "loss": 1.623045802116394, "step": 2838 }, { "epoch": 0.8741823778376299, "grad_norm": 10.625, "learning_rate": 7.001053541980354e-06, "loss": 1.3726472854614258, "step": 2840 }, { "epoch": 0.8747979992304733, "grad_norm": 18.0, "learning_rate": 6.9971925086775264e-06, "loss": 1.5557606220245361, "step": 2842 }, { "epoch": 0.8754136206233166, "grad_norm": 11.5625, "learning_rate": 6.993330484442149e-06, "loss": 1.5380194187164307, "step": 2844 }, { "epoch": 0.8760292420161601, "grad_norm": 7.375, "learning_rate": 6.989467473112005e-06, "loss": 1.790273904800415, "step": 2846 }, { "epoch": 0.8766448634090035, "grad_norm": 7.59375, "learning_rate": 6.985603478525853e-06, "loss": 1.3569434881210327, "step": 2848 }, { "epoch": 0.8772604848018468, "grad_norm": 5.4375, "learning_rate": 6.9817385045234294e-06, "loss": 1.58054780960083, "step": 2850 }, { "epoch": 0.8778761061946903, "grad_norm": 4.5625, "learning_rate": 6.977872554945449e-06, "loss": 1.3338651657104492, "step": 2852 }, { "epoch": 0.8784917275875337, "grad_norm": 6.03125, "learning_rate": 6.974005633633592e-06, "loss": 1.0849279165267944, "step": 2854 }, { "epoch": 0.8791073489803771, "grad_norm": 4.125, "learning_rate": 6.9701377444304995e-06, "loss": 1.0520415306091309, "step": 2856 }, { "epoch": 0.8797229703732204, "grad_norm": 21.875, "learning_rate": 6.9662688911797874e-06, "loss": 1.1952751874923706, "step": 2858 }, { "epoch": 0.8803385917660639, "grad_norm": 7.34375, "learning_rate": 6.962399077726019e-06, "loss": 1.3586761951446533, "step": 2860 }, { "epoch": 0.8809542131589073, "grad_norm": 8.0, "learning_rate": 6.9585283079147116e-06, "loss": 1.4400031566619873, "step": 2862 }, { "epoch": 0.8815698345517506, "grad_norm": 7.0625, "learning_rate": 6.954656585592339e-06, "loss": 1.38856840133667, "step": 2864 }, { "epoch": 0.8821854559445941, "grad_norm": 12.875, "learning_rate": 6.950783914606318e-06, "loss": 0.7037546038627625, "step": 2866 }, { "epoch": 0.8828010773374375, "grad_norm": 8.75, "learning_rate": 6.946910298805009e-06, "loss": 1.418529748916626, "step": 2868 }, { "epoch": 0.8834166987302808, "grad_norm": 7.15625, "learning_rate": 6.9430357420377104e-06, "loss": 1.5819289684295654, "step": 2870 }, { "epoch": 0.8840323201231243, "grad_norm": 3.171875, "learning_rate": 6.939160248154656e-06, "loss": 1.130005121231079, "step": 2872 }, { "epoch": 0.8846479415159677, "grad_norm": 4.40625, "learning_rate": 6.935283821007011e-06, "loss": 1.0710422992706299, "step": 2874 }, { "epoch": 0.885263562908811, "grad_norm": 9.625, "learning_rate": 6.931406464446866e-06, "loss": 1.295141339302063, "step": 2876 }, { "epoch": 0.8858791843016545, "grad_norm": 4.625, "learning_rate": 6.927528182327241e-06, "loss": 1.2207305431365967, "step": 2878 }, { "epoch": 0.8864948056944979, "grad_norm": 3.125, "learning_rate": 6.923648978502069e-06, "loss": 1.2300981283187866, "step": 2880 }, { "epoch": 0.8871104270873413, "grad_norm": 6.875, "learning_rate": 6.9197688568262035e-06, "loss": 1.2342026233673096, "step": 2882 }, { "epoch": 0.8877260484801847, "grad_norm": 5.25, "learning_rate": 6.915887821155407e-06, "loss": 1.0839194059371948, "step": 2884 }, { "epoch": 0.8883416698730281, "grad_norm": 4.65625, "learning_rate": 6.912005875346353e-06, "loss": 1.3776112794876099, "step": 2886 }, { "epoch": 0.8889572912658715, "grad_norm": 17.5, "learning_rate": 6.908123023256616e-06, "loss": 1.059165120124817, "step": 2888 }, { "epoch": 0.8895729126587149, "grad_norm": 9.8125, "learning_rate": 6.904239268744675e-06, "loss": 0.6640627980232239, "step": 2890 }, { "epoch": 0.8901885340515583, "grad_norm": 2.328125, "learning_rate": 6.900354615669904e-06, "loss": 1.352664589881897, "step": 2892 }, { "epoch": 0.8908041554444017, "grad_norm": 4.15625, "learning_rate": 6.896469067892568e-06, "loss": 1.2407547235488892, "step": 2894 }, { "epoch": 0.891419776837245, "grad_norm": 13.75, "learning_rate": 6.892582629273825e-06, "loss": 1.150381326675415, "step": 2896 }, { "epoch": 0.8920353982300885, "grad_norm": 12.25, "learning_rate": 6.8886953036757165e-06, "loss": 1.401031494140625, "step": 2898 }, { "epoch": 0.8926510196229319, "grad_norm": 8.3125, "learning_rate": 6.8848070949611616e-06, "loss": 1.51918625831604, "step": 2900 }, { "epoch": 0.8932666410157754, "grad_norm": 9.875, "learning_rate": 6.880918006993964e-06, "loss": 1.1139386892318726, "step": 2902 }, { "epoch": 0.8938822624086187, "grad_norm": 7.15625, "learning_rate": 6.8770280436387956e-06, "loss": 1.3600401878356934, "step": 2904 }, { "epoch": 0.8944978838014621, "grad_norm": 9.3125, "learning_rate": 6.873137208761203e-06, "loss": 1.346145749092102, "step": 2906 }, { "epoch": 0.8951135051943055, "grad_norm": 6.40625, "learning_rate": 6.869245506227591e-06, "loss": 1.2260481119155884, "step": 2908 }, { "epoch": 0.8957291265871489, "grad_norm": 22.875, "learning_rate": 6.865352939905237e-06, "loss": 1.6141583919525146, "step": 2910 }, { "epoch": 0.8963447479799923, "grad_norm": 7.6875, "learning_rate": 6.861459513662267e-06, "loss": 1.3436152935028076, "step": 2912 }, { "epoch": 0.8969603693728357, "grad_norm": 16.625, "learning_rate": 6.85756523136767e-06, "loss": 1.2726424932479858, "step": 2914 }, { "epoch": 0.8975759907656791, "grad_norm": 13.1875, "learning_rate": 6.853670096891277e-06, "loss": 1.37815260887146, "step": 2916 }, { "epoch": 0.8981916121585225, "grad_norm": 4.375, "learning_rate": 6.849774114103775e-06, "loss": 1.2725327014923096, "step": 2918 }, { "epoch": 0.8988072335513659, "grad_norm": 11.0625, "learning_rate": 6.8458772868766875e-06, "loss": 1.5259203910827637, "step": 2920 }, { "epoch": 0.8994228549442093, "grad_norm": 4.5, "learning_rate": 6.841979619082379e-06, "loss": 1.0570274591445923, "step": 2922 }, { "epoch": 0.9000384763370527, "grad_norm": 9.3125, "learning_rate": 6.8380811145940485e-06, "loss": 1.0440081357955933, "step": 2924 }, { "epoch": 0.9006540977298961, "grad_norm": 7.25, "learning_rate": 6.834181777285729e-06, "loss": 1.6796268224716187, "step": 2926 }, { "epoch": 0.9012697191227396, "grad_norm": 2.109375, "learning_rate": 6.830281611032277e-06, "loss": 1.435825228691101, "step": 2928 }, { "epoch": 0.9018853405155829, "grad_norm": 4.40625, "learning_rate": 6.826380619709376e-06, "loss": 1.38742196559906, "step": 2930 }, { "epoch": 0.9025009619084263, "grad_norm": 9.9375, "learning_rate": 6.822478807193531e-06, "loss": 1.6338295936584473, "step": 2932 }, { "epoch": 0.9031165833012698, "grad_norm": 11.8125, "learning_rate": 6.81857617736206e-06, "loss": 1.0583865642547607, "step": 2934 }, { "epoch": 0.9037322046941131, "grad_norm": 7.21875, "learning_rate": 6.81467273409309e-06, "loss": 1.2847459316253662, "step": 2936 }, { "epoch": 0.9043478260869565, "grad_norm": 14.875, "learning_rate": 6.810768481265564e-06, "loss": 1.5531105995178223, "step": 2938 }, { "epoch": 0.9049634474798, "grad_norm": 3.53125, "learning_rate": 6.806863422759225e-06, "loss": 1.2093020677566528, "step": 2940 }, { "epoch": 0.9055790688726433, "grad_norm": 6.09375, "learning_rate": 6.802957562454613e-06, "loss": 1.5628454685211182, "step": 2942 }, { "epoch": 0.9061946902654867, "grad_norm": 9.875, "learning_rate": 6.799050904233075e-06, "loss": 1.3811249732971191, "step": 2944 }, { "epoch": 0.9068103116583301, "grad_norm": 6.9375, "learning_rate": 6.795143451976742e-06, "loss": 1.2331663370132446, "step": 2946 }, { "epoch": 0.9074259330511736, "grad_norm": 12.9375, "learning_rate": 6.7912352095685366e-06, "loss": 1.6550911664962769, "step": 2948 }, { "epoch": 0.9080415544440169, "grad_norm": 6.4375, "learning_rate": 6.787326180892166e-06, "loss": 1.2516546249389648, "step": 2950 }, { "epoch": 0.9086571758368603, "grad_norm": 4.5625, "learning_rate": 6.783416369832122e-06, "loss": 0.8436126708984375, "step": 2952 }, { "epoch": 0.9092727972297038, "grad_norm": 12.5625, "learning_rate": 6.77950578027367e-06, "loss": 0.747209906578064, "step": 2954 }, { "epoch": 0.9098884186225471, "grad_norm": 13.9375, "learning_rate": 6.775594416102851e-06, "loss": 1.5576605796813965, "step": 2956 }, { "epoch": 0.9105040400153905, "grad_norm": 4.46875, "learning_rate": 6.771682281206476e-06, "loss": 1.3183671236038208, "step": 2958 }, { "epoch": 0.911119661408234, "grad_norm": 6.40625, "learning_rate": 6.767769379472119e-06, "loss": 1.1161308288574219, "step": 2960 }, { "epoch": 0.9117352828010773, "grad_norm": 10.125, "learning_rate": 6.763855714788119e-06, "loss": 1.6109819412231445, "step": 2962 }, { "epoch": 0.9123509041939207, "grad_norm": 7.375, "learning_rate": 6.759941291043575e-06, "loss": 1.3613532781600952, "step": 2964 }, { "epoch": 0.9129665255867642, "grad_norm": 5.625, "learning_rate": 6.756026112128333e-06, "loss": 1.263413667678833, "step": 2966 }, { "epoch": 0.9135821469796075, "grad_norm": 5.59375, "learning_rate": 6.752110181932998e-06, "loss": 1.2548730373382568, "step": 2968 }, { "epoch": 0.9141977683724509, "grad_norm": 4.78125, "learning_rate": 6.748193504348917e-06, "loss": 1.312070369720459, "step": 2970 }, { "epoch": 0.9148133897652944, "grad_norm": 7.03125, "learning_rate": 6.744276083268176e-06, "loss": 1.305625081062317, "step": 2972 }, { "epoch": 0.9154290111581378, "grad_norm": 16.125, "learning_rate": 6.7403579225836094e-06, "loss": 1.5548710823059082, "step": 2974 }, { "epoch": 0.9160446325509811, "grad_norm": 4.5625, "learning_rate": 6.736439026188779e-06, "loss": 1.302093505859375, "step": 2976 }, { "epoch": 0.9166602539438246, "grad_norm": 32.5, "learning_rate": 6.732519397977981e-06, "loss": 1.5850815773010254, "step": 2978 }, { "epoch": 0.917275875336668, "grad_norm": 2.4375, "learning_rate": 6.728599041846236e-06, "loss": 1.028442144393921, "step": 2980 }, { "epoch": 0.9178914967295113, "grad_norm": 2.75, "learning_rate": 6.7246779616892936e-06, "loss": 0.9915659427642822, "step": 2982 }, { "epoch": 0.9185071181223547, "grad_norm": 4.03125, "learning_rate": 6.720756161403614e-06, "loss": 1.1359045505523682, "step": 2984 }, { "epoch": 0.9191227395151982, "grad_norm": 1.4453125, "learning_rate": 6.7168336448863805e-06, "loss": 1.156688928604126, "step": 2986 }, { "epoch": 0.9197383609080415, "grad_norm": 8.0, "learning_rate": 6.7129104160354875e-06, "loss": 1.278592824935913, "step": 2988 }, { "epoch": 0.9203539823008849, "grad_norm": 7.34375, "learning_rate": 6.708986478749532e-06, "loss": 1.583968997001648, "step": 2990 }, { "epoch": 0.9209696036937284, "grad_norm": 6.71875, "learning_rate": 6.70506183692782e-06, "loss": 1.4367609024047852, "step": 2992 }, { "epoch": 0.9215852250865718, "grad_norm": 5.34375, "learning_rate": 6.701136494470356e-06, "loss": 0.7980353236198425, "step": 2994 }, { "epoch": 0.9222008464794151, "grad_norm": 8.75, "learning_rate": 6.697210455277842e-06, "loss": 1.4646888971328735, "step": 2996 }, { "epoch": 0.9228164678722586, "grad_norm": 9.375, "learning_rate": 6.693283723251669e-06, "loss": 1.732331395149231, "step": 2998 }, { "epoch": 0.923432089265102, "grad_norm": 13.3125, "learning_rate": 6.689356302293921e-06, "loss": 1.5182671546936035, "step": 3000 }, { "epoch": 0.9240477106579453, "grad_norm": 13.125, "learning_rate": 6.685428196307362e-06, "loss": 1.2069380283355713, "step": 3002 }, { "epoch": 0.9246633320507888, "grad_norm": 5.78125, "learning_rate": 6.68149940919544e-06, "loss": 1.3300877809524536, "step": 3004 }, { "epoch": 0.9252789534436322, "grad_norm": 25.125, "learning_rate": 6.677569944862277e-06, "loss": 1.355804204940796, "step": 3006 }, { "epoch": 0.9258945748364755, "grad_norm": 8.5625, "learning_rate": 6.673639807212674e-06, "loss": 1.4251302480697632, "step": 3008 }, { "epoch": 0.926510196229319, "grad_norm": 1.8046875, "learning_rate": 6.6697090001520936e-06, "loss": 1.1559216976165771, "step": 3010 }, { "epoch": 0.9271258176221624, "grad_norm": 3.59375, "learning_rate": 6.665777527586668e-06, "loss": 1.0845048427581787, "step": 3012 }, { "epoch": 0.9277414390150057, "grad_norm": 10.125, "learning_rate": 6.661845393423187e-06, "loss": 1.2049851417541504, "step": 3014 }, { "epoch": 0.9283570604078492, "grad_norm": 9.6875, "learning_rate": 6.657912601569105e-06, "loss": 1.232240080833435, "step": 3016 }, { "epoch": 0.9289726818006926, "grad_norm": 6.4375, "learning_rate": 6.653979155932524e-06, "loss": 1.3066433668136597, "step": 3018 }, { "epoch": 0.929588303193536, "grad_norm": 22.5, "learning_rate": 6.6500450604221945e-06, "loss": 1.005613923072815, "step": 3020 }, { "epoch": 0.9302039245863793, "grad_norm": 19.75, "learning_rate": 6.646110318947518e-06, "loss": 1.1477642059326172, "step": 3022 }, { "epoch": 0.9308195459792228, "grad_norm": 7.375, "learning_rate": 6.642174935418535e-06, "loss": 1.2949566841125488, "step": 3024 }, { "epoch": 0.9314351673720662, "grad_norm": 10.8125, "learning_rate": 6.638238913745922e-06, "loss": 1.5063673257827759, "step": 3026 }, { "epoch": 0.9320507887649095, "grad_norm": 6.71875, "learning_rate": 6.634302257840995e-06, "loss": 1.325649619102478, "step": 3028 }, { "epoch": 0.932666410157753, "grad_norm": 8.375, "learning_rate": 6.630364971615695e-06, "loss": 1.3196182250976562, "step": 3030 }, { "epoch": 0.9332820315505964, "grad_norm": 7.125, "learning_rate": 6.626427058982594e-06, "loss": 1.3414075374603271, "step": 3032 }, { "epoch": 0.9338976529434397, "grad_norm": 17.5, "learning_rate": 6.622488523854882e-06, "loss": 1.5244091749191284, "step": 3034 }, { "epoch": 0.9345132743362832, "grad_norm": 6.53125, "learning_rate": 6.61854937014637e-06, "loss": 1.4249980449676514, "step": 3036 }, { "epoch": 0.9351288957291266, "grad_norm": 8.8125, "learning_rate": 6.614609601771482e-06, "loss": 1.284740924835205, "step": 3038 }, { "epoch": 0.93574451712197, "grad_norm": 6.96875, "learning_rate": 6.6106692226452574e-06, "loss": 1.396584153175354, "step": 3040 }, { "epoch": 0.9363601385148134, "grad_norm": 5.6875, "learning_rate": 6.606728236683337e-06, "loss": 1.5623862743377686, "step": 3042 }, { "epoch": 0.9369757599076568, "grad_norm": 2.15625, "learning_rate": 6.602786647801968e-06, "loss": 0.997999906539917, "step": 3044 }, { "epoch": 0.9375913813005002, "grad_norm": 4.8125, "learning_rate": 6.598844459917997e-06, "loss": 1.1285858154296875, "step": 3046 }, { "epoch": 0.9382070026933436, "grad_norm": 7.53125, "learning_rate": 6.594901676948861e-06, "loss": 1.3015860319137573, "step": 3048 }, { "epoch": 0.938822624086187, "grad_norm": 6.375, "learning_rate": 6.590958302812592e-06, "loss": 1.2325267791748047, "step": 3050 }, { "epoch": 0.9394382454790304, "grad_norm": 10.125, "learning_rate": 6.587014341427812e-06, "loss": 1.6582579612731934, "step": 3052 }, { "epoch": 0.9400538668718738, "grad_norm": 11.1875, "learning_rate": 6.5830697967137225e-06, "loss": 1.4930779933929443, "step": 3054 }, { "epoch": 0.9406694882647172, "grad_norm": 4.03125, "learning_rate": 6.579124672590107e-06, "loss": 1.2360354661941528, "step": 3056 }, { "epoch": 0.9412851096575606, "grad_norm": 9.0625, "learning_rate": 6.575178972977321e-06, "loss": 1.4100964069366455, "step": 3058 }, { "epoch": 0.941900731050404, "grad_norm": 5.0625, "learning_rate": 6.571232701796297e-06, "loss": 1.505408763885498, "step": 3060 }, { "epoch": 0.9425163524432474, "grad_norm": 4.78125, "learning_rate": 6.567285862968532e-06, "loss": 1.143608808517456, "step": 3062 }, { "epoch": 0.9431319738360908, "grad_norm": 21.5, "learning_rate": 6.56333846041609e-06, "loss": 1.7274588346481323, "step": 3064 }, { "epoch": 0.9437475952289343, "grad_norm": 6.125, "learning_rate": 6.559390498061591e-06, "loss": 1.4496876001358032, "step": 3066 }, { "epoch": 0.9443632166217776, "grad_norm": 4.125, "learning_rate": 6.555441979828217e-06, "loss": 1.1227610111236572, "step": 3068 }, { "epoch": 0.944978838014621, "grad_norm": 6.0625, "learning_rate": 6.551492909639694e-06, "loss": 1.3069566488265991, "step": 3070 }, { "epoch": 0.9455944594074644, "grad_norm": 9.0, "learning_rate": 6.547543291420306e-06, "loss": 1.0902361869812012, "step": 3072 }, { "epoch": 0.9462100808003078, "grad_norm": 17.375, "learning_rate": 6.5435931290948765e-06, "loss": 1.1386152505874634, "step": 3074 }, { "epoch": 0.9468257021931512, "grad_norm": 8.0, "learning_rate": 6.539642426588768e-06, "loss": 1.3928558826446533, "step": 3076 }, { "epoch": 0.9474413235859946, "grad_norm": 31.625, "learning_rate": 6.5356911878278835e-06, "loss": 1.2120009660720825, "step": 3078 }, { "epoch": 0.948056944978838, "grad_norm": 25.5, "learning_rate": 6.5317394167386605e-06, "loss": 1.620686411857605, "step": 3080 }, { "epoch": 0.9486725663716814, "grad_norm": 5.96875, "learning_rate": 6.527787117248056e-06, "loss": 1.2502431869506836, "step": 3082 }, { "epoch": 0.9492881877645248, "grad_norm": 7.75, "learning_rate": 6.5238342932835645e-06, "loss": 0.956054151058197, "step": 3084 }, { "epoch": 0.9499038091573683, "grad_norm": 2.40625, "learning_rate": 6.519880948773194e-06, "loss": 1.5191279649734497, "step": 3086 }, { "epoch": 0.9505194305502116, "grad_norm": 4.84375, "learning_rate": 6.515927087645471e-06, "loss": 1.1445995569229126, "step": 3088 }, { "epoch": 0.951135051943055, "grad_norm": 4.125, "learning_rate": 6.511972713829433e-06, "loss": 1.129223108291626, "step": 3090 }, { "epoch": 0.9517506733358985, "grad_norm": 13.625, "learning_rate": 6.508017831254637e-06, "loss": 1.7071363925933838, "step": 3092 }, { "epoch": 0.9523662947287418, "grad_norm": 4.28125, "learning_rate": 6.504062443851131e-06, "loss": 1.3347673416137695, "step": 3094 }, { "epoch": 0.9529819161215852, "grad_norm": 4.53125, "learning_rate": 6.500106555549478e-06, "loss": 1.29487943649292, "step": 3096 }, { "epoch": 0.9535975375144287, "grad_norm": 4.75, "learning_rate": 6.4961501702807305e-06, "loss": 1.4279407262802124, "step": 3098 }, { "epoch": 0.954213158907272, "grad_norm": 8.875, "learning_rate": 6.4921932919764365e-06, "loss": 1.0926623344421387, "step": 3100 }, { "epoch": 0.9548287803001154, "grad_norm": 5.65625, "learning_rate": 6.4882359245686355e-06, "loss": 1.2280659675598145, "step": 3102 }, { "epoch": 0.9554444016929589, "grad_norm": 10.125, "learning_rate": 6.484278071989852e-06, "loss": 1.1809242963790894, "step": 3104 }, { "epoch": 0.9560600230858022, "grad_norm": 3.265625, "learning_rate": 6.480319738173092e-06, "loss": 1.0962247848510742, "step": 3106 }, { "epoch": 0.9566756444786456, "grad_norm": 8.125, "learning_rate": 6.4763609270518416e-06, "loss": 1.2776129245758057, "step": 3108 }, { "epoch": 0.957291265871489, "grad_norm": 6.75, "learning_rate": 6.472401642560062e-06, "loss": 1.0463416576385498, "step": 3110 }, { "epoch": 0.9579068872643325, "grad_norm": 15.6875, "learning_rate": 6.468441888632179e-06, "loss": 1.5015335083007812, "step": 3112 }, { "epoch": 0.9585225086571758, "grad_norm": 7.0, "learning_rate": 6.4644816692030905e-06, "loss": 1.3311984539031982, "step": 3114 }, { "epoch": 0.9591381300500192, "grad_norm": 5.1875, "learning_rate": 6.460520988208156e-06, "loss": 1.1144503355026245, "step": 3116 }, { "epoch": 0.9597537514428627, "grad_norm": 4.78125, "learning_rate": 6.456559849583193e-06, "loss": 1.2613763809204102, "step": 3118 }, { "epoch": 0.960369372835706, "grad_norm": 27.5, "learning_rate": 6.452598257264473e-06, "loss": 1.0620733499526978, "step": 3120 }, { "epoch": 0.9609849942285494, "grad_norm": 4.09375, "learning_rate": 6.448636215188719e-06, "loss": 1.211551308631897, "step": 3122 }, { "epoch": 0.9616006156213929, "grad_norm": 4.1875, "learning_rate": 6.444673727293103e-06, "loss": 1.3237829208374023, "step": 3124 }, { "epoch": 0.9622162370142362, "grad_norm": 11.5625, "learning_rate": 6.440710797515235e-06, "loss": 1.543803095817566, "step": 3126 }, { "epoch": 0.9628318584070796, "grad_norm": 10.0, "learning_rate": 6.436747429793169e-06, "loss": 1.1492283344268799, "step": 3128 }, { "epoch": 0.9634474797999231, "grad_norm": 5.25, "learning_rate": 6.432783628065392e-06, "loss": 0.9545158743858337, "step": 3130 }, { "epoch": 0.9640631011927665, "grad_norm": 2.734375, "learning_rate": 6.4288193962708225e-06, "loss": 1.005800485610962, "step": 3132 }, { "epoch": 0.9646787225856098, "grad_norm": 10.75, "learning_rate": 6.4248547383488065e-06, "loss": 1.4155142307281494, "step": 3134 }, { "epoch": 0.9652943439784533, "grad_norm": 9.125, "learning_rate": 6.420889658239113e-06, "loss": 1.2309889793395996, "step": 3136 }, { "epoch": 0.9659099653712967, "grad_norm": 7.28125, "learning_rate": 6.416924159881932e-06, "loss": 1.452165126800537, "step": 3138 }, { "epoch": 0.96652558676414, "grad_norm": 11.75, "learning_rate": 6.412958247217869e-06, "loss": 1.4262676239013672, "step": 3140 }, { "epoch": 0.9671412081569835, "grad_norm": 4.0625, "learning_rate": 6.408991924187937e-06, "loss": 1.578167200088501, "step": 3142 }, { "epoch": 0.9677568295498269, "grad_norm": 4.75, "learning_rate": 6.405025194733563e-06, "loss": 1.1947578191757202, "step": 3144 }, { "epoch": 0.9683724509426702, "grad_norm": 8.5, "learning_rate": 6.401058062796573e-06, "loss": 1.0839043855667114, "step": 3146 }, { "epoch": 0.9689880723355137, "grad_norm": 5.8125, "learning_rate": 6.397090532319197e-06, "loss": 1.2948286533355713, "step": 3148 }, { "epoch": 0.9696036937283571, "grad_norm": 5.46875, "learning_rate": 6.393122607244057e-06, "loss": 1.5003513097763062, "step": 3150 }, { "epoch": 0.9702193151212004, "grad_norm": 5.9375, "learning_rate": 6.389154291514171e-06, "loss": 1.5157687664031982, "step": 3152 }, { "epoch": 0.9708349365140438, "grad_norm": 12.3125, "learning_rate": 6.385185589072942e-06, "loss": 1.7549535036087036, "step": 3154 }, { "epoch": 0.9714505579068873, "grad_norm": 7.53125, "learning_rate": 6.3812165038641585e-06, "loss": 1.477494716644287, "step": 3156 }, { "epoch": 0.9720661792997307, "grad_norm": 2.9375, "learning_rate": 6.377247039831991e-06, "loss": 1.1848636865615845, "step": 3158 }, { "epoch": 0.972681800692574, "grad_norm": 6.125, "learning_rate": 6.373277200920982e-06, "loss": 1.0180506706237793, "step": 3160 }, { "epoch": 0.9732974220854175, "grad_norm": 9.5625, "learning_rate": 6.3693069910760515e-06, "loss": 1.167291283607483, "step": 3162 }, { "epoch": 0.9739130434782609, "grad_norm": 7.5625, "learning_rate": 6.365336414242487e-06, "loss": 1.1879794597625732, "step": 3164 }, { "epoch": 0.9745286648711042, "grad_norm": 6.78125, "learning_rate": 6.361365474365937e-06, "loss": 1.449440598487854, "step": 3166 }, { "epoch": 0.9751442862639477, "grad_norm": 7.9375, "learning_rate": 6.357394175392415e-06, "loss": 1.5806314945220947, "step": 3168 }, { "epoch": 0.9757599076567911, "grad_norm": 9.375, "learning_rate": 6.353422521268291e-06, "loss": 1.2685093879699707, "step": 3170 }, { "epoch": 0.9763755290496344, "grad_norm": 9.8125, "learning_rate": 6.349450515940283e-06, "loss": 1.0854675769805908, "step": 3172 }, { "epoch": 0.9769911504424779, "grad_norm": 11.0625, "learning_rate": 6.345478163355465e-06, "loss": 1.622758150100708, "step": 3174 }, { "epoch": 0.9776067718353213, "grad_norm": 9.4375, "learning_rate": 6.341505467461253e-06, "loss": 1.4243215322494507, "step": 3176 }, { "epoch": 0.9782223932281647, "grad_norm": 4.5625, "learning_rate": 6.337532432205402e-06, "loss": 1.414436936378479, "step": 3178 }, { "epoch": 0.9788380146210081, "grad_norm": 10.75, "learning_rate": 6.333559061536008e-06, "loss": 1.6876072883605957, "step": 3180 }, { "epoch": 0.9794536360138515, "grad_norm": 13.3125, "learning_rate": 6.329585359401496e-06, "loss": 1.9350146055221558, "step": 3182 }, { "epoch": 0.9800692574066949, "grad_norm": 12.5625, "learning_rate": 6.325611329750625e-06, "loss": 1.7809089422225952, "step": 3184 }, { "epoch": 0.9806848787995383, "grad_norm": 11.0, "learning_rate": 6.321636976532477e-06, "loss": 1.300473690032959, "step": 3186 }, { "epoch": 0.9813005001923817, "grad_norm": 8.5625, "learning_rate": 6.317662303696456e-06, "loss": 1.4619872570037842, "step": 3188 }, { "epoch": 0.9819161215852251, "grad_norm": 4.53125, "learning_rate": 6.3136873151922825e-06, "loss": 1.2975380420684814, "step": 3190 }, { "epoch": 0.9825317429780684, "grad_norm": 10.625, "learning_rate": 6.309712014969993e-06, "loss": 0.7392944693565369, "step": 3192 }, { "epoch": 0.9831473643709119, "grad_norm": 10.375, "learning_rate": 6.30573640697993e-06, "loss": 1.530669927597046, "step": 3194 }, { "epoch": 0.9837629857637553, "grad_norm": 3.25, "learning_rate": 6.301760495172748e-06, "loss": 1.13042414188385, "step": 3196 }, { "epoch": 0.9843786071565986, "grad_norm": 5.1875, "learning_rate": 6.297784283499397e-06, "loss": 1.0704107284545898, "step": 3198 }, { "epoch": 0.9849942285494421, "grad_norm": 5.625, "learning_rate": 6.293807775911129e-06, "loss": 1.2520568370819092, "step": 3200 }, { "epoch": 0.9856098499422855, "grad_norm": 9.875, "learning_rate": 6.289830976359488e-06, "loss": 1.1383583545684814, "step": 3202 }, { "epoch": 0.986225471335129, "grad_norm": 2.421875, "learning_rate": 6.285853888796307e-06, "loss": 0.9404821991920471, "step": 3204 }, { "epoch": 0.9868410927279723, "grad_norm": 6.78125, "learning_rate": 6.2818765171737106e-06, "loss": 1.1734299659729004, "step": 3206 }, { "epoch": 0.9874567141208157, "grad_norm": 6.8125, "learning_rate": 6.277898865444101e-06, "loss": 1.0691970586776733, "step": 3208 }, { "epoch": 0.9880723355136591, "grad_norm": 7.625, "learning_rate": 6.273920937560161e-06, "loss": 1.4263228178024292, "step": 3210 }, { "epoch": 0.9886879569065025, "grad_norm": 6.46875, "learning_rate": 6.269942737474843e-06, "loss": 1.433255672454834, "step": 3212 }, { "epoch": 0.9893035782993459, "grad_norm": 3.109375, "learning_rate": 6.265964269141375e-06, "loss": 1.2142915725708008, "step": 3214 }, { "epoch": 0.9899191996921893, "grad_norm": 6.90625, "learning_rate": 6.261985536513253e-06, "loss": 1.1429392099380493, "step": 3216 }, { "epoch": 0.9905348210850327, "grad_norm": 8.125, "learning_rate": 6.258006543544229e-06, "loss": 1.1579214334487915, "step": 3218 }, { "epoch": 0.9911504424778761, "grad_norm": 9.0625, "learning_rate": 6.254027294188321e-06, "loss": 1.15116548538208, "step": 3220 }, { "epoch": 0.9917660638707195, "grad_norm": 11.0625, "learning_rate": 6.2500477923997945e-06, "loss": 1.5957486629486084, "step": 3222 }, { "epoch": 0.992381685263563, "grad_norm": 11.8125, "learning_rate": 6.246068042133173e-06, "loss": 1.2969751358032227, "step": 3224 }, { "epoch": 0.9929973066564063, "grad_norm": 6.28125, "learning_rate": 6.242088047343222e-06, "loss": 1.5214343070983887, "step": 3226 }, { "epoch": 0.9936129280492497, "grad_norm": 11.3125, "learning_rate": 6.238107811984951e-06, "loss": 1.5400235652923584, "step": 3228 }, { "epoch": 0.9942285494420932, "grad_norm": 8.0, "learning_rate": 6.234127340013612e-06, "loss": 1.5471324920654297, "step": 3230 }, { "epoch": 0.9948441708349365, "grad_norm": 5.34375, "learning_rate": 6.230146635384684e-06, "loss": 1.3047486543655396, "step": 3232 }, { "epoch": 0.9954597922277799, "grad_norm": 2.40625, "learning_rate": 6.226165702053888e-06, "loss": 0.83452308177948, "step": 3234 }, { "epoch": 0.9960754136206234, "grad_norm": 3.09375, "learning_rate": 6.222184543977163e-06, "loss": 0.9607036113739014, "step": 3236 }, { "epoch": 0.9966910350134667, "grad_norm": 4.59375, "learning_rate": 6.218203165110676e-06, "loss": 0.9204300045967102, "step": 3238 }, { "epoch": 0.9973066564063101, "grad_norm": 7.40625, "learning_rate": 6.214221569410815e-06, "loss": 0.8008885383605957, "step": 3240 }, { "epoch": 0.9979222777991535, "grad_norm": 5.1875, "learning_rate": 6.2102397608341755e-06, "loss": 1.2449378967285156, "step": 3242 }, { "epoch": 0.9985378991919969, "grad_norm": 4.90625, "learning_rate": 6.206257743337574e-06, "loss": 1.1880314350128174, "step": 3244 }, { "epoch": 0.9991535205848403, "grad_norm": 5.90625, "learning_rate": 6.202275520878029e-06, "loss": 1.374240517616272, "step": 3246 }, { "epoch": 0.9997691419776837, "grad_norm": 7.03125, "learning_rate": 6.198293097412766e-06, "loss": 1.3699828386306763, "step": 3248 }, { "epoch": 1.0003078106964216, "grad_norm": 1.28125, "learning_rate": 6.1943104768992055e-06, "loss": 1.3666250705718994, "step": 3250 }, { "epoch": 1.000923432089265, "grad_norm": 7.125, "learning_rate": 6.190327663294971e-06, "loss": 1.4202823638916016, "step": 3252 }, { "epoch": 1.0015390534821085, "grad_norm": 3.703125, "learning_rate": 6.1863446605578705e-06, "loss": 1.1655609607696533, "step": 3254 }, { "epoch": 1.002154674874952, "grad_norm": 10.5625, "learning_rate": 6.182361472645901e-06, "loss": 1.410911202430725, "step": 3256 }, { "epoch": 1.0027702962677953, "grad_norm": 10.0625, "learning_rate": 6.178378103517251e-06, "loss": 1.6366688013076782, "step": 3258 }, { "epoch": 1.0033859176606388, "grad_norm": 3.59375, "learning_rate": 6.174394557130279e-06, "loss": 1.1457003355026245, "step": 3260 }, { "epoch": 1.0040015390534822, "grad_norm": 18.0, "learning_rate": 6.170410837443528e-06, "loss": 0.724550187587738, "step": 3262 }, { "epoch": 1.0046171604463254, "grad_norm": 3.515625, "learning_rate": 6.166426948415708e-06, "loss": 1.4719600677490234, "step": 3264 }, { "epoch": 1.0052327818391689, "grad_norm": 6.34375, "learning_rate": 6.162442894005698e-06, "loss": 1.1914829015731812, "step": 3266 }, { "epoch": 1.0058484032320123, "grad_norm": 7.09375, "learning_rate": 6.158458678172543e-06, "loss": 1.26486337184906, "step": 3268 }, { "epoch": 1.0064640246248557, "grad_norm": 7.78125, "learning_rate": 6.1544743048754484e-06, "loss": 1.1825785636901855, "step": 3270 }, { "epoch": 1.0070796460176992, "grad_norm": 10.375, "learning_rate": 6.150489778073773e-06, "loss": 1.5313773155212402, "step": 3272 }, { "epoch": 1.0076952674105426, "grad_norm": 12.6875, "learning_rate": 6.146505101727031e-06, "loss": 1.0273815393447876, "step": 3274 }, { "epoch": 1.0083108888033858, "grad_norm": 4.15625, "learning_rate": 6.14252027979489e-06, "loss": 1.1481852531433105, "step": 3276 }, { "epoch": 1.0089265101962293, "grad_norm": 10.1875, "learning_rate": 6.138535316237148e-06, "loss": 1.4595853090286255, "step": 3278 }, { "epoch": 1.0095421315890727, "grad_norm": 8.1875, "learning_rate": 6.134550215013759e-06, "loss": 1.178184151649475, "step": 3280 }, { "epoch": 1.0101577529819161, "grad_norm": 9.5, "learning_rate": 6.130564980084803e-06, "loss": 1.7836490869522095, "step": 3282 }, { "epoch": 1.0107733743747596, "grad_norm": 3.859375, "learning_rate": 6.126579615410502e-06, "loss": 1.275104284286499, "step": 3284 }, { "epoch": 1.011388995767603, "grad_norm": 11.125, "learning_rate": 6.122594124951198e-06, "loss": 1.2140474319458008, "step": 3286 }, { "epoch": 1.0120046171604464, "grad_norm": 3.375, "learning_rate": 6.118608512667364e-06, "loss": 1.1134839057922363, "step": 3288 }, { "epoch": 1.0126202385532896, "grad_norm": 7.5625, "learning_rate": 6.1146227825195934e-06, "loss": 1.0847816467285156, "step": 3290 }, { "epoch": 1.013235859946133, "grad_norm": 4.9375, "learning_rate": 6.110636938468593e-06, "loss": 1.0978894233703613, "step": 3292 }, { "epoch": 1.0138514813389765, "grad_norm": 6.5, "learning_rate": 6.1066509844751884e-06, "loss": 1.5050498247146606, "step": 3294 }, { "epoch": 1.01446710273182, "grad_norm": 3.671875, "learning_rate": 6.10266492450031e-06, "loss": 1.1623247861862183, "step": 3296 }, { "epoch": 1.0150827241246634, "grad_norm": 4.75, "learning_rate": 6.098678762504994e-06, "loss": 1.3516665697097778, "step": 3298 }, { "epoch": 1.0156983455175068, "grad_norm": 1.7109375, "learning_rate": 6.09469250245038e-06, "loss": 1.0650442838668823, "step": 3300 }, { "epoch": 1.01631396691035, "grad_norm": 5.75, "learning_rate": 6.090706148297702e-06, "loss": 1.4884039163589478, "step": 3302 }, { "epoch": 1.0169295883031935, "grad_norm": 3.375, "learning_rate": 6.0867197040082925e-06, "loss": 1.312299132347107, "step": 3304 }, { "epoch": 1.017545209696037, "grad_norm": 11.625, "learning_rate": 6.082733173543572e-06, "loss": 1.6194450855255127, "step": 3306 }, { "epoch": 1.0181608310888803, "grad_norm": 9.6875, "learning_rate": 6.078746560865039e-06, "loss": 1.4919713735580444, "step": 3308 }, { "epoch": 1.0187764524817238, "grad_norm": 2.421875, "learning_rate": 6.074759869934284e-06, "loss": 1.1863977909088135, "step": 3310 }, { "epoch": 1.0193920738745672, "grad_norm": 2.9375, "learning_rate": 6.070773104712971e-06, "loss": 1.208958625793457, "step": 3312 }, { "epoch": 1.0200076952674106, "grad_norm": 5.3125, "learning_rate": 6.0667862691628355e-06, "loss": 1.1294976472854614, "step": 3314 }, { "epoch": 1.0206233166602539, "grad_norm": 2.796875, "learning_rate": 6.062799367245688e-06, "loss": 0.984968364238739, "step": 3316 }, { "epoch": 1.0212389380530973, "grad_norm": 32.75, "learning_rate": 6.058812402923404e-06, "loss": 1.56447172164917, "step": 3318 }, { "epoch": 1.0218545594459407, "grad_norm": 6.125, "learning_rate": 6.054825380157915e-06, "loss": 1.4007642269134521, "step": 3320 }, { "epoch": 1.0224701808387842, "grad_norm": 4.90625, "learning_rate": 6.050838302911217e-06, "loss": 1.3601939678192139, "step": 3322 }, { "epoch": 1.0230858022316276, "grad_norm": 4.625, "learning_rate": 6.046851175145356e-06, "loss": 1.3050976991653442, "step": 3324 }, { "epoch": 1.023701423624471, "grad_norm": 7.65625, "learning_rate": 6.042864000822435e-06, "loss": 1.4357898235321045, "step": 3326 }, { "epoch": 1.0243170450173142, "grad_norm": 14.1875, "learning_rate": 6.038876783904594e-06, "loss": 1.1699950695037842, "step": 3328 }, { "epoch": 1.0249326664101577, "grad_norm": 4.9375, "learning_rate": 6.034889528354022e-06, "loss": 1.083859920501709, "step": 3330 }, { "epoch": 1.0255482878030011, "grad_norm": 5.53125, "learning_rate": 6.030902238132943e-06, "loss": 1.2051880359649658, "step": 3332 }, { "epoch": 1.0261639091958445, "grad_norm": 3.265625, "learning_rate": 6.026914917203617e-06, "loss": 0.9221932888031006, "step": 3334 }, { "epoch": 1.026779530588688, "grad_norm": 47.75, "learning_rate": 6.022927569528336e-06, "loss": 1.5843613147735596, "step": 3336 }, { "epoch": 1.0273951519815314, "grad_norm": 17.5, "learning_rate": 6.018940199069414e-06, "loss": 1.3591049909591675, "step": 3338 }, { "epoch": 1.0280107733743749, "grad_norm": 6.9375, "learning_rate": 6.014952809789193e-06, "loss": 1.2899994850158691, "step": 3340 }, { "epoch": 1.028626394767218, "grad_norm": 7.25, "learning_rate": 6.010965405650028e-06, "loss": 1.698613166809082, "step": 3342 }, { "epoch": 1.0292420161600615, "grad_norm": 7.21875, "learning_rate": 6.006977990614293e-06, "loss": 1.4977262020111084, "step": 3344 }, { "epoch": 1.029857637552905, "grad_norm": 7.34375, "learning_rate": 6.002990568644375e-06, "loss": 1.124574065208435, "step": 3346 }, { "epoch": 1.0304732589457484, "grad_norm": 2.25, "learning_rate": 5.999003143702659e-06, "loss": 1.0379462242126465, "step": 3348 }, { "epoch": 1.0310888803385918, "grad_norm": 3.28125, "learning_rate": 5.9950157197515445e-06, "loss": 1.1994037628173828, "step": 3350 }, { "epoch": 1.0317045017314352, "grad_norm": 8.4375, "learning_rate": 5.9910283007534185e-06, "loss": 1.3688772916793823, "step": 3352 }, { "epoch": 1.0323201231242787, "grad_norm": 5.65625, "learning_rate": 5.9870408906706725e-06, "loss": 1.2714589834213257, "step": 3354 }, { "epoch": 1.032935744517122, "grad_norm": 4.5625, "learning_rate": 5.983053493465683e-06, "loss": 1.300389051437378, "step": 3356 }, { "epoch": 1.0335513659099653, "grad_norm": 12.5, "learning_rate": 5.9790661131008175e-06, "loss": 1.529327630996704, "step": 3358 }, { "epoch": 1.0341669873028088, "grad_norm": 4.84375, "learning_rate": 5.975078753538423e-06, "loss": 1.053821086883545, "step": 3360 }, { "epoch": 1.0347826086956522, "grad_norm": 12.9375, "learning_rate": 5.971091418740833e-06, "loss": 1.8510454893112183, "step": 3362 }, { "epoch": 1.0353982300884956, "grad_norm": 6.03125, "learning_rate": 5.9671041126703475e-06, "loss": 1.4777189493179321, "step": 3364 }, { "epoch": 1.036013851481339, "grad_norm": 8.5625, "learning_rate": 5.96311683928924e-06, "loss": 1.08958899974823, "step": 3366 }, { "epoch": 1.0366294728741823, "grad_norm": 3.78125, "learning_rate": 5.959129602559759e-06, "loss": 1.5156233310699463, "step": 3368 }, { "epoch": 1.0372450942670257, "grad_norm": 4.625, "learning_rate": 5.955142406444105e-06, "loss": 1.18940007686615, "step": 3370 }, { "epoch": 1.0378607156598691, "grad_norm": 3.390625, "learning_rate": 5.95115525490445e-06, "loss": 1.036544919013977, "step": 3372 }, { "epoch": 1.0384763370527126, "grad_norm": 1.75, "learning_rate": 5.947168151902912e-06, "loss": 1.041877269744873, "step": 3374 }, { "epoch": 1.039091958445556, "grad_norm": 16.625, "learning_rate": 5.943181101401567e-06, "loss": 1.3080252408981323, "step": 3376 }, { "epoch": 1.0397075798383995, "grad_norm": 4.8125, "learning_rate": 5.939194107362434e-06, "loss": 0.9550781846046448, "step": 3378 }, { "epoch": 1.0403232012312429, "grad_norm": 5.0, "learning_rate": 5.9352071737474795e-06, "loss": 1.2598165273666382, "step": 3380 }, { "epoch": 1.040938822624086, "grad_norm": 7.75, "learning_rate": 5.931220304518608e-06, "loss": 1.376179575920105, "step": 3382 }, { "epoch": 1.0415544440169295, "grad_norm": 1.6796875, "learning_rate": 5.9272335036376615e-06, "loss": 1.0743067264556885, "step": 3384 }, { "epoch": 1.042170065409773, "grad_norm": 2.875, "learning_rate": 5.923246775066416e-06, "loss": 0.9674758315086365, "step": 3386 }, { "epoch": 1.0427856868026164, "grad_norm": 6.03125, "learning_rate": 5.91926012276657e-06, "loss": 1.6355992555618286, "step": 3388 }, { "epoch": 1.0434013081954598, "grad_norm": 3.40625, "learning_rate": 5.915273550699748e-06, "loss": 1.4389272928237915, "step": 3390 }, { "epoch": 1.0440169295883033, "grad_norm": 5.8125, "learning_rate": 5.911287062827499e-06, "loss": 1.3074742555618286, "step": 3392 }, { "epoch": 1.0446325509811465, "grad_norm": 4.9375, "learning_rate": 5.907300663111284e-06, "loss": 1.1949514150619507, "step": 3394 }, { "epoch": 1.04524817237399, "grad_norm": 11.9375, "learning_rate": 5.903314355512477e-06, "loss": 1.5670204162597656, "step": 3396 }, { "epoch": 1.0458637937668334, "grad_norm": 6.375, "learning_rate": 5.899328143992364e-06, "loss": 0.9783201813697815, "step": 3398 }, { "epoch": 1.0464794151596768, "grad_norm": 5.15625, "learning_rate": 5.895342032512132e-06, "loss": 1.126775860786438, "step": 3400 }, { "epoch": 1.0470950365525202, "grad_norm": 13.5, "learning_rate": 5.891356025032866e-06, "loss": 1.6031320095062256, "step": 3402 }, { "epoch": 1.0477106579453637, "grad_norm": 3.375, "learning_rate": 5.887370125515554e-06, "loss": 1.0438646078109741, "step": 3404 }, { "epoch": 1.048326279338207, "grad_norm": 14.375, "learning_rate": 5.883384337921072e-06, "loss": 1.4825729131698608, "step": 3406 }, { "epoch": 1.0489419007310503, "grad_norm": 12.1875, "learning_rate": 5.879398666210189e-06, "loss": 0.9457087516784668, "step": 3408 }, { "epoch": 1.0495575221238937, "grad_norm": 5.15625, "learning_rate": 5.875413114343554e-06, "loss": 1.3323965072631836, "step": 3410 }, { "epoch": 1.0501731435167372, "grad_norm": 6.09375, "learning_rate": 5.871427686281699e-06, "loss": 0.9091135859489441, "step": 3412 }, { "epoch": 1.0507887649095806, "grad_norm": 40.0, "learning_rate": 5.867442385985036e-06, "loss": 0.8349736928939819, "step": 3414 }, { "epoch": 1.051404386302424, "grad_norm": 24.625, "learning_rate": 5.863457217413845e-06, "loss": 1.8638713359832764, "step": 3416 }, { "epoch": 1.0520200076952675, "grad_norm": 11.125, "learning_rate": 5.859472184528279e-06, "loss": 1.314063549041748, "step": 3418 }, { "epoch": 1.052635629088111, "grad_norm": 1.8515625, "learning_rate": 5.855487291288351e-06, "loss": 1.1677442789077759, "step": 3420 }, { "epoch": 1.0532512504809541, "grad_norm": 7.84375, "learning_rate": 5.85150254165394e-06, "loss": 1.2674967050552368, "step": 3422 }, { "epoch": 1.0538668718737976, "grad_norm": 6.375, "learning_rate": 5.847517939584783e-06, "loss": 1.268509864807129, "step": 3424 }, { "epoch": 1.054482493266641, "grad_norm": 4.25, "learning_rate": 5.84353348904047e-06, "loss": 1.3440495729446411, "step": 3426 }, { "epoch": 1.0550981146594844, "grad_norm": 11.4375, "learning_rate": 5.839549193980434e-06, "loss": 1.240549087524414, "step": 3428 }, { "epoch": 1.0557137360523279, "grad_norm": 6.03125, "learning_rate": 5.835565058363962e-06, "loss": 1.4629731178283691, "step": 3430 }, { "epoch": 1.0563293574451713, "grad_norm": 13.375, "learning_rate": 5.831581086150177e-06, "loss": 0.8581249117851257, "step": 3432 }, { "epoch": 1.0569449788380145, "grad_norm": 14.25, "learning_rate": 5.827597281298041e-06, "loss": 1.3019016981124878, "step": 3434 }, { "epoch": 1.057560600230858, "grad_norm": 11.0, "learning_rate": 5.823613647766351e-06, "loss": 0.8576952815055847, "step": 3436 }, { "epoch": 1.0581762216237014, "grad_norm": 7.0, "learning_rate": 5.819630189513734e-06, "loss": 1.5954262018203735, "step": 3438 }, { "epoch": 1.0587918430165448, "grad_norm": 3.484375, "learning_rate": 5.815646910498642e-06, "loss": 1.2628898620605469, "step": 3440 }, { "epoch": 1.0594074644093883, "grad_norm": 2.234375, "learning_rate": 5.811663814679345e-06, "loss": 1.136056900024414, "step": 3442 }, { "epoch": 1.0600230858022317, "grad_norm": 8.5, "learning_rate": 5.807680906013937e-06, "loss": 1.1236200332641602, "step": 3444 }, { "epoch": 1.060638707195075, "grad_norm": 7.03125, "learning_rate": 5.803698188460325e-06, "loss": 1.1918787956237793, "step": 3446 }, { "epoch": 1.0612543285879183, "grad_norm": 8.5625, "learning_rate": 5.799715665976224e-06, "loss": 1.402172327041626, "step": 3448 }, { "epoch": 1.0618699499807618, "grad_norm": 7.03125, "learning_rate": 5.795733342519154e-06, "loss": 1.1775987148284912, "step": 3450 }, { "epoch": 1.0624855713736052, "grad_norm": 25.375, "learning_rate": 5.7917512220464424e-06, "loss": 1.1295747756958008, "step": 3452 }, { "epoch": 1.0631011927664487, "grad_norm": 7.03125, "learning_rate": 5.787769308515208e-06, "loss": 1.3732116222381592, "step": 3454 }, { "epoch": 1.063716814159292, "grad_norm": 8.9375, "learning_rate": 5.783787605882367e-06, "loss": 1.401436448097229, "step": 3456 }, { "epoch": 1.0643324355521355, "grad_norm": 6.53125, "learning_rate": 5.77980611810463e-06, "loss": 1.3957934379577637, "step": 3458 }, { "epoch": 1.0649480569449787, "grad_norm": 8.1875, "learning_rate": 5.775824849138491e-06, "loss": 1.8235782384872437, "step": 3460 }, { "epoch": 1.0655636783378222, "grad_norm": 5.65625, "learning_rate": 5.7718438029402225e-06, "loss": 1.2653405666351318, "step": 3462 }, { "epoch": 1.0661792997306656, "grad_norm": 10.5, "learning_rate": 5.767862983465884e-06, "loss": 1.286799669265747, "step": 3464 }, { "epoch": 1.066794921123509, "grad_norm": 10.9375, "learning_rate": 5.763882394671299e-06, "loss": 1.1525325775146484, "step": 3466 }, { "epoch": 1.0674105425163525, "grad_norm": 10.1875, "learning_rate": 5.759902040512073e-06, "loss": 1.480642318725586, "step": 3468 }, { "epoch": 1.068026163909196, "grad_norm": 5.59375, "learning_rate": 5.755921924943571e-06, "loss": 1.250403642654419, "step": 3470 }, { "epoch": 1.0686417853020393, "grad_norm": 5.3125, "learning_rate": 5.751942051920923e-06, "loss": 1.1772561073303223, "step": 3472 }, { "epoch": 1.0692574066948826, "grad_norm": 7.9375, "learning_rate": 5.747962425399019e-06, "loss": 1.0843044519424438, "step": 3474 }, { "epoch": 1.069873028087726, "grad_norm": 1.8671875, "learning_rate": 5.743983049332502e-06, "loss": 1.2011973857879639, "step": 3476 }, { "epoch": 1.0704886494805694, "grad_norm": 5.8125, "learning_rate": 5.740003927675769e-06, "loss": 1.196240782737732, "step": 3478 }, { "epoch": 1.0711042708734129, "grad_norm": 6.15625, "learning_rate": 5.73602506438296e-06, "loss": 1.5370745658874512, "step": 3480 }, { "epoch": 1.0717198922662563, "grad_norm": 7.3125, "learning_rate": 5.732046463407961e-06, "loss": 1.1346544027328491, "step": 3482 }, { "epoch": 1.0723355136590997, "grad_norm": 7.34375, "learning_rate": 5.728068128704399e-06, "loss": 1.1971721649169922, "step": 3484 }, { "epoch": 1.0729511350519432, "grad_norm": 5.90625, "learning_rate": 5.724090064225634e-06, "loss": 1.1389195919036865, "step": 3486 }, { "epoch": 1.0735667564447864, "grad_norm": 5.625, "learning_rate": 5.720112273924754e-06, "loss": 1.1738673448562622, "step": 3488 }, { "epoch": 1.0741823778376298, "grad_norm": 4.25, "learning_rate": 5.716134761754584e-06, "loss": 1.225260853767395, "step": 3490 }, { "epoch": 1.0747979992304733, "grad_norm": 10.3125, "learning_rate": 5.712157531667664e-06, "loss": 0.9891336560249329, "step": 3492 }, { "epoch": 1.0754136206233167, "grad_norm": 4.5, "learning_rate": 5.708180587616257e-06, "loss": 1.2247391939163208, "step": 3494 }, { "epoch": 1.0760292420161601, "grad_norm": 5.8125, "learning_rate": 5.704203933552339e-06, "loss": 1.2745413780212402, "step": 3496 }, { "epoch": 1.0766448634090036, "grad_norm": 4.34375, "learning_rate": 5.7002275734276034e-06, "loss": 1.155380368232727, "step": 3498 }, { "epoch": 1.0772604848018468, "grad_norm": 5.8125, "learning_rate": 5.696251511193449e-06, "loss": 1.122201681137085, "step": 3500 }, { "epoch": 1.0778761061946902, "grad_norm": 5.40625, "learning_rate": 5.692275750800977e-06, "loss": 1.2920207977294922, "step": 3502 }, { "epoch": 1.0784917275875336, "grad_norm": 51.25, "learning_rate": 5.68830029620099e-06, "loss": 1.4547216892242432, "step": 3504 }, { "epoch": 1.079107348980377, "grad_norm": 7.0625, "learning_rate": 5.6843251513439845e-06, "loss": 1.7656255960464478, "step": 3506 }, { "epoch": 1.0797229703732205, "grad_norm": 8.1875, "learning_rate": 5.680350320180152e-06, "loss": 1.4713743925094604, "step": 3508 }, { "epoch": 1.080338591766064, "grad_norm": 13.75, "learning_rate": 5.676375806659371e-06, "loss": 0.9524730443954468, "step": 3510 }, { "epoch": 1.0809542131589072, "grad_norm": 18.375, "learning_rate": 5.6724016147312065e-06, "loss": 1.7022722959518433, "step": 3512 }, { "epoch": 1.0815698345517506, "grad_norm": 5.84375, "learning_rate": 5.6684277483449e-06, "loss": 1.0908372402191162, "step": 3514 }, { "epoch": 1.082185455944594, "grad_norm": 2.875, "learning_rate": 5.664454211449373e-06, "loss": 1.2222366333007812, "step": 3516 }, { "epoch": 1.0828010773374375, "grad_norm": 5.0, "learning_rate": 5.660481007993218e-06, "loss": 1.3541144132614136, "step": 3518 }, { "epoch": 1.083416698730281, "grad_norm": 8.875, "learning_rate": 5.656508141924695e-06, "loss": 1.100854754447937, "step": 3520 }, { "epoch": 1.0840323201231243, "grad_norm": 5.6875, "learning_rate": 5.6525356171917314e-06, "loss": 0.957051157951355, "step": 3522 }, { "epoch": 1.0846479415159678, "grad_norm": 6.96875, "learning_rate": 5.648563437741913e-06, "loss": 1.4000749588012695, "step": 3524 }, { "epoch": 1.085263562908811, "grad_norm": 4.90625, "learning_rate": 5.6445916075224845e-06, "loss": 1.1802750825881958, "step": 3526 }, { "epoch": 1.0858791843016544, "grad_norm": 9.875, "learning_rate": 5.640620130480343e-06, "loss": 1.3265138864517212, "step": 3528 }, { "epoch": 1.0864948056944979, "grad_norm": 6.4375, "learning_rate": 5.636649010562034e-06, "loss": 1.4276715517044067, "step": 3530 }, { "epoch": 1.0871104270873413, "grad_norm": 5.15625, "learning_rate": 5.6326782517137475e-06, "loss": 1.0193065404891968, "step": 3532 }, { "epoch": 1.0877260484801847, "grad_norm": 10.8125, "learning_rate": 5.628707857881317e-06, "loss": 1.513476014137268, "step": 3534 }, { "epoch": 1.0883416698730282, "grad_norm": 4.59375, "learning_rate": 5.6247378330102085e-06, "loss": 1.5175204277038574, "step": 3536 }, { "epoch": 1.0889572912658716, "grad_norm": 3.8125, "learning_rate": 5.62076818104553e-06, "loss": 0.7854060530662537, "step": 3538 }, { "epoch": 1.0895729126587148, "grad_norm": 7.71875, "learning_rate": 5.616798905932008e-06, "loss": 1.6784924268722534, "step": 3540 }, { "epoch": 1.0901885340515582, "grad_norm": 1.7265625, "learning_rate": 5.612830011614005e-06, "loss": 0.6873236298561096, "step": 3542 }, { "epoch": 1.0908041554444017, "grad_norm": 10.1875, "learning_rate": 5.608861502035498e-06, "loss": 1.4695472717285156, "step": 3544 }, { "epoch": 1.0914197768372451, "grad_norm": 7.1875, "learning_rate": 5.604893381140084e-06, "loss": 1.2640148401260376, "step": 3546 }, { "epoch": 1.0920353982300885, "grad_norm": 8.1875, "learning_rate": 5.600925652870975e-06, "loss": 0.8067486882209778, "step": 3548 }, { "epoch": 1.092651019622932, "grad_norm": 5.75, "learning_rate": 5.596958321170987e-06, "loss": 1.2527005672454834, "step": 3550 }, { "epoch": 1.0932666410157752, "grad_norm": 8.8125, "learning_rate": 5.592991389982552e-06, "loss": 1.3179057836532593, "step": 3552 }, { "epoch": 1.0938822624086186, "grad_norm": 9.6875, "learning_rate": 5.589024863247694e-06, "loss": 1.688755989074707, "step": 3554 }, { "epoch": 1.094497883801462, "grad_norm": 11.875, "learning_rate": 5.585058744908045e-06, "loss": 1.0170738697052002, "step": 3556 }, { "epoch": 1.0951135051943055, "grad_norm": 10.75, "learning_rate": 5.581093038904821e-06, "loss": 1.4297583103179932, "step": 3558 }, { "epoch": 1.095729126587149, "grad_norm": 10.25, "learning_rate": 5.577127749178834e-06, "loss": 1.9947293996810913, "step": 3560 }, { "epoch": 1.0963447479799924, "grad_norm": 8.4375, "learning_rate": 5.5731628796704825e-06, "loss": 1.440248727798462, "step": 3562 }, { "epoch": 1.0969603693728358, "grad_norm": 12.3125, "learning_rate": 5.569198434319745e-06, "loss": 0.8194329142570496, "step": 3564 }, { "epoch": 1.097575990765679, "grad_norm": 6.96875, "learning_rate": 5.565234417066179e-06, "loss": 1.1109315156936646, "step": 3566 }, { "epoch": 1.0981916121585225, "grad_norm": 7.96875, "learning_rate": 5.561270831848922e-06, "loss": 0.8463287949562073, "step": 3568 }, { "epoch": 1.098807233551366, "grad_norm": 8.4375, "learning_rate": 5.557307682606669e-06, "loss": 1.1162123680114746, "step": 3570 }, { "epoch": 1.0994228549442093, "grad_norm": 12.5625, "learning_rate": 5.553344973277699e-06, "loss": 1.2072829008102417, "step": 3572 }, { "epoch": 1.1000384763370528, "grad_norm": 1.1484375, "learning_rate": 5.5493827077998395e-06, "loss": 0.888592004776001, "step": 3574 }, { "epoch": 1.1006540977298962, "grad_norm": 11.875, "learning_rate": 5.545420890110484e-06, "loss": 1.5153419971466064, "step": 3576 }, { "epoch": 1.1012697191227394, "grad_norm": 2.328125, "learning_rate": 5.541459524146579e-06, "loss": 1.3337959051132202, "step": 3578 }, { "epoch": 1.1018853405155828, "grad_norm": 3.5625, "learning_rate": 5.5374986138446255e-06, "loss": 1.0780278444290161, "step": 3580 }, { "epoch": 1.1025009619084263, "grad_norm": 4.5, "learning_rate": 5.533538163140666e-06, "loss": 1.2636305093765259, "step": 3582 }, { "epoch": 1.1031165833012697, "grad_norm": 4.15625, "learning_rate": 5.5295781759702895e-06, "loss": 1.325244426727295, "step": 3584 }, { "epoch": 1.1037322046941132, "grad_norm": 7.28125, "learning_rate": 5.5256186562686255e-06, "loss": 0.8051739931106567, "step": 3586 }, { "epoch": 1.1043478260869566, "grad_norm": 4.78125, "learning_rate": 5.521659607970334e-06, "loss": 1.2588369846343994, "step": 3588 }, { "epoch": 1.1049634474798, "grad_norm": 2.546875, "learning_rate": 5.517701035009615e-06, "loss": 1.2784472703933716, "step": 3590 }, { "epoch": 1.1055790688726432, "grad_norm": 7.0625, "learning_rate": 5.513742941320187e-06, "loss": 1.315181016921997, "step": 3592 }, { "epoch": 1.1061946902654867, "grad_norm": 4.65625, "learning_rate": 5.5097853308353e-06, "loss": 1.1988142728805542, "step": 3594 }, { "epoch": 1.10681031165833, "grad_norm": 10.25, "learning_rate": 5.505828207487717e-06, "loss": 1.2534257173538208, "step": 3596 }, { "epoch": 1.1074259330511735, "grad_norm": 5.78125, "learning_rate": 5.501871575209721e-06, "loss": 1.1419970989227295, "step": 3598 }, { "epoch": 1.108041554444017, "grad_norm": 2.859375, "learning_rate": 5.497915437933107e-06, "loss": 0.7719895839691162, "step": 3600 }, { "epoch": 1.1086571758368604, "grad_norm": 5.59375, "learning_rate": 5.493959799589177e-06, "loss": 1.2032313346862793, "step": 3602 }, { "epoch": 1.1092727972297038, "grad_norm": 3.796875, "learning_rate": 5.490004664108737e-06, "loss": 1.3593542575836182, "step": 3604 }, { "epoch": 1.109888418622547, "grad_norm": 4.4375, "learning_rate": 5.486050035422094e-06, "loss": 1.1797250509262085, "step": 3606 }, { "epoch": 1.1105040400153905, "grad_norm": 5.78125, "learning_rate": 5.4820959174590545e-06, "loss": 1.2517540454864502, "step": 3608 }, { "epoch": 1.111119661408234, "grad_norm": 9.5, "learning_rate": 5.4781423141489085e-06, "loss": 1.248819351196289, "step": 3610 }, { "epoch": 1.1117352828010774, "grad_norm": 21.0, "learning_rate": 5.474189229420443e-06, "loss": 1.6034396886825562, "step": 3612 }, { "epoch": 1.1123509041939208, "grad_norm": 4.125, "learning_rate": 5.470236667201927e-06, "loss": 1.0933693647384644, "step": 3614 }, { "epoch": 1.1129665255867642, "grad_norm": 7.28125, "learning_rate": 5.46628463142111e-06, "loss": 0.682421863079071, "step": 3616 }, { "epoch": 1.1135821469796074, "grad_norm": 8.5625, "learning_rate": 5.462333126005217e-06, "loss": 1.3613861799240112, "step": 3618 }, { "epoch": 1.1141977683724509, "grad_norm": 6.34375, "learning_rate": 5.458382154880953e-06, "loss": 1.3445461988449097, "step": 3620 }, { "epoch": 1.1148133897652943, "grad_norm": 17.75, "learning_rate": 5.454431721974478e-06, "loss": 1.3255103826522827, "step": 3622 }, { "epoch": 1.1154290111581378, "grad_norm": 5.96875, "learning_rate": 5.450481831211432e-06, "loss": 1.2539777755737305, "step": 3624 }, { "epoch": 1.1160446325509812, "grad_norm": 6.9375, "learning_rate": 5.446532486516909e-06, "loss": 1.1418551206588745, "step": 3626 }, { "epoch": 1.1166602539438246, "grad_norm": 5.8125, "learning_rate": 5.4425836918154585e-06, "loss": 1.2032052278518677, "step": 3628 }, { "epoch": 1.1172758753366678, "grad_norm": 7.3125, "learning_rate": 5.438635451031089e-06, "loss": 1.1165993213653564, "step": 3630 }, { "epoch": 1.1178914967295113, "grad_norm": 7.40625, "learning_rate": 5.434687768087256e-06, "loss": 1.2143586874008179, "step": 3632 }, { "epoch": 1.1185071181223547, "grad_norm": 4.6875, "learning_rate": 5.43074064690686e-06, "loss": 1.2352961301803589, "step": 3634 }, { "epoch": 1.1191227395151981, "grad_norm": 12.0, "learning_rate": 5.426794091412244e-06, "loss": 1.598303198814392, "step": 3636 }, { "epoch": 1.1197383609080416, "grad_norm": 32.5, "learning_rate": 5.422848105525187e-06, "loss": 0.8510976433753967, "step": 3638 }, { "epoch": 1.120353982300885, "grad_norm": 15.8125, "learning_rate": 5.4189026931669056e-06, "loss": 1.3632495403289795, "step": 3640 }, { "epoch": 1.1209696036937284, "grad_norm": 6.15625, "learning_rate": 5.414957858258043e-06, "loss": 1.2300214767456055, "step": 3642 }, { "epoch": 1.1215852250865717, "grad_norm": 5.34375, "learning_rate": 5.411013604718671e-06, "loss": 1.2433948516845703, "step": 3644 }, { "epoch": 1.122200846479415, "grad_norm": 1.8203125, "learning_rate": 5.407069936468284e-06, "loss": 1.080621361732483, "step": 3646 }, { "epoch": 1.1228164678722585, "grad_norm": 4.09375, "learning_rate": 5.403126857425791e-06, "loss": 1.2390270233154297, "step": 3648 }, { "epoch": 1.123432089265102, "grad_norm": 9.5625, "learning_rate": 5.399184371509521e-06, "loss": 1.39670991897583, "step": 3650 }, { "epoch": 1.1240477106579454, "grad_norm": 7.25, "learning_rate": 5.395242482637206e-06, "loss": 0.806347131729126, "step": 3652 }, { "epoch": 1.1246633320507888, "grad_norm": 3.40625, "learning_rate": 5.391301194725993e-06, "loss": 1.2208232879638672, "step": 3654 }, { "epoch": 1.1252789534436323, "grad_norm": 5.53125, "learning_rate": 5.387360511692427e-06, "loss": 1.2304521799087524, "step": 3656 }, { "epoch": 1.1258945748364755, "grad_norm": 5.25, "learning_rate": 5.383420437452453e-06, "loss": 1.4465998411178589, "step": 3658 }, { "epoch": 1.126510196229319, "grad_norm": 3.734375, "learning_rate": 5.379480975921414e-06, "loss": 1.0305050611495972, "step": 3660 }, { "epoch": 1.1271258176221624, "grad_norm": 8.1875, "learning_rate": 5.375542131014038e-06, "loss": 1.223438024520874, "step": 3662 }, { "epoch": 1.1277414390150058, "grad_norm": 5.4375, "learning_rate": 5.371603906644443e-06, "loss": 1.3073363304138184, "step": 3664 }, { "epoch": 1.1283570604078492, "grad_norm": 6.875, "learning_rate": 5.3676663067261315e-06, "loss": 1.14788818359375, "step": 3666 }, { "epoch": 1.1289726818006927, "grad_norm": 6.71875, "learning_rate": 5.363729335171985e-06, "loss": 0.8086817264556885, "step": 3668 }, { "epoch": 1.129588303193536, "grad_norm": 12.5625, "learning_rate": 5.359792995894262e-06, "loss": 1.2843672037124634, "step": 3670 }, { "epoch": 1.1302039245863793, "grad_norm": 8.5625, "learning_rate": 5.355857292804592e-06, "loss": 0.9466153383255005, "step": 3672 }, { "epoch": 1.1308195459792227, "grad_norm": 4.5, "learning_rate": 5.351922229813965e-06, "loss": 1.2944337129592896, "step": 3674 }, { "epoch": 1.1314351673720662, "grad_norm": 3.40625, "learning_rate": 5.347987810832747e-06, "loss": 1.0941487550735474, "step": 3676 }, { "epoch": 1.1320507887649096, "grad_norm": 4.09375, "learning_rate": 5.344054039770656e-06, "loss": 1.1674392223358154, "step": 3678 }, { "epoch": 1.132666410157753, "grad_norm": 10.0, "learning_rate": 5.340120920536771e-06, "loss": 1.2395570278167725, "step": 3680 }, { "epoch": 1.1332820315505965, "grad_norm": 8.1875, "learning_rate": 5.336188457039517e-06, "loss": 1.541485071182251, "step": 3682 }, { "epoch": 1.1338976529434397, "grad_norm": 10.875, "learning_rate": 5.332256653186674e-06, "loss": 1.6763718128204346, "step": 3684 }, { "epoch": 1.1345132743362831, "grad_norm": 6.875, "learning_rate": 5.328325512885364e-06, "loss": 1.7518420219421387, "step": 3686 }, { "epoch": 1.1351288957291266, "grad_norm": 19.5, "learning_rate": 5.3243950400420476e-06, "loss": 0.7111321687698364, "step": 3688 }, { "epoch": 1.13574451712197, "grad_norm": 5.8125, "learning_rate": 5.320465238562522e-06, "loss": 1.3289068937301636, "step": 3690 }, { "epoch": 1.1363601385148134, "grad_norm": 7.9375, "learning_rate": 5.316536112351923e-06, "loss": 1.4628081321716309, "step": 3692 }, { "epoch": 1.1369757599076569, "grad_norm": 3.765625, "learning_rate": 5.312607665314708e-06, "loss": 1.062358021736145, "step": 3694 }, { "epoch": 1.1375913813005, "grad_norm": 10.25, "learning_rate": 5.308679901354667e-06, "loss": 1.35207200050354, "step": 3696 }, { "epoch": 1.1382070026933435, "grad_norm": 5.15625, "learning_rate": 5.304752824374904e-06, "loss": 1.5505430698394775, "step": 3698 }, { "epoch": 1.138822624086187, "grad_norm": 42.5, "learning_rate": 5.300826438277842e-06, "loss": 1.450918436050415, "step": 3700 }, { "epoch": 1.1394382454790304, "grad_norm": 3.734375, "learning_rate": 5.296900746965224e-06, "loss": 1.0654526948928833, "step": 3702 }, { "epoch": 1.1400538668718738, "grad_norm": 8.5, "learning_rate": 5.29297575433809e-06, "loss": 1.2837327718734741, "step": 3704 }, { "epoch": 1.1406694882647173, "grad_norm": 4.84375, "learning_rate": 5.2890514642967995e-06, "loss": 1.2365937232971191, "step": 3706 }, { "epoch": 1.1412851096575607, "grad_norm": 11.25, "learning_rate": 5.2851278807410055e-06, "loss": 1.6589319705963135, "step": 3708 }, { "epoch": 1.141900731050404, "grad_norm": 5.59375, "learning_rate": 5.281205007569663e-06, "loss": 1.5612269639968872, "step": 3710 }, { "epoch": 1.1425163524432473, "grad_norm": 4.4375, "learning_rate": 5.2772828486810135e-06, "loss": 1.5178145170211792, "step": 3712 }, { "epoch": 1.1431319738360908, "grad_norm": 16.625, "learning_rate": 5.2733614079726e-06, "loss": 1.3569532632827759, "step": 3714 }, { "epoch": 1.1437475952289342, "grad_norm": 9.25, "learning_rate": 5.269440689341243e-06, "loss": 1.2014302015304565, "step": 3716 }, { "epoch": 1.1443632166217776, "grad_norm": 19.5, "learning_rate": 5.265520696683048e-06, "loss": 1.7885704040527344, "step": 3718 }, { "epoch": 1.144978838014621, "grad_norm": 8.3125, "learning_rate": 5.2616014338934005e-06, "loss": 1.2593146562576294, "step": 3720 }, { "epoch": 1.1455944594074645, "grad_norm": 4.75, "learning_rate": 5.2576829048669606e-06, "loss": 1.2763214111328125, "step": 3722 }, { "epoch": 1.1462100808003077, "grad_norm": 10.1875, "learning_rate": 5.253765113497659e-06, "loss": 1.6070430278778076, "step": 3724 }, { "epoch": 1.1468257021931512, "grad_norm": 3.828125, "learning_rate": 5.249848063678691e-06, "loss": 1.1164813041687012, "step": 3726 }, { "epoch": 1.1474413235859946, "grad_norm": 3.125, "learning_rate": 5.245931759302516e-06, "loss": 1.0933690071105957, "step": 3728 }, { "epoch": 1.148056944978838, "grad_norm": 4.28125, "learning_rate": 5.2420162042608555e-06, "loss": 1.2221918106079102, "step": 3730 }, { "epoch": 1.1486725663716815, "grad_norm": 4.96875, "learning_rate": 5.238101402444684e-06, "loss": 1.1146072149276733, "step": 3732 }, { "epoch": 1.149288187764525, "grad_norm": 8.75, "learning_rate": 5.234187357744228e-06, "loss": 1.342393159866333, "step": 3734 }, { "epoch": 1.1499038091573683, "grad_norm": 4.5, "learning_rate": 5.230274074048961e-06, "loss": 1.3994978666305542, "step": 3736 }, { "epoch": 1.1505194305502116, "grad_norm": 1.90625, "learning_rate": 5.226361555247601e-06, "loss": 1.3354339599609375, "step": 3738 }, { "epoch": 1.151135051943055, "grad_norm": 6.4375, "learning_rate": 5.222449805228103e-06, "loss": 1.4526199102401733, "step": 3740 }, { "epoch": 1.1517506733358984, "grad_norm": 5.71875, "learning_rate": 5.218538827877664e-06, "loss": 0.9815947413444519, "step": 3742 }, { "epoch": 1.1523662947287419, "grad_norm": 7.46875, "learning_rate": 5.214628627082709e-06, "loss": 0.8592740297317505, "step": 3744 }, { "epoch": 1.1529819161215853, "grad_norm": 5.78125, "learning_rate": 5.2107192067288925e-06, "loss": 1.4137502908706665, "step": 3746 }, { "epoch": 1.1535975375144285, "grad_norm": 2.96875, "learning_rate": 5.206810570701092e-06, "loss": 1.4178833961486816, "step": 3748 }, { "epoch": 1.154213158907272, "grad_norm": 4.71875, "learning_rate": 5.20290272288341e-06, "loss": 1.3415439128875732, "step": 3750 }, { "epoch": 1.1548287803001154, "grad_norm": 8.375, "learning_rate": 5.198995667159157e-06, "loss": 1.3343955278396606, "step": 3752 }, { "epoch": 1.1554444016929588, "grad_norm": 9.6875, "learning_rate": 5.195089407410865e-06, "loss": 1.5473103523254395, "step": 3754 }, { "epoch": 1.1560600230858022, "grad_norm": 4.9375, "learning_rate": 5.19118394752027e-06, "loss": 1.0742156505584717, "step": 3756 }, { "epoch": 1.1566756444786457, "grad_norm": 8.5, "learning_rate": 5.187279291368319e-06, "loss": 1.4407182931900024, "step": 3758 }, { "epoch": 1.1572912658714891, "grad_norm": 33.5, "learning_rate": 5.183375442835155e-06, "loss": 1.318236231803894, "step": 3760 }, { "epoch": 1.1579068872643323, "grad_norm": 9.6875, "learning_rate": 5.1794724058001165e-06, "loss": 1.6564728021621704, "step": 3762 }, { "epoch": 1.1585225086571758, "grad_norm": 3.203125, "learning_rate": 5.175570184141743e-06, "loss": 1.357529640197754, "step": 3764 }, { "epoch": 1.1591381300500192, "grad_norm": 4.65625, "learning_rate": 5.171668781737756e-06, "loss": 1.2191883325576782, "step": 3766 }, { "epoch": 1.1597537514428626, "grad_norm": 7.78125, "learning_rate": 5.167768202465069e-06, "loss": 1.2753338813781738, "step": 3768 }, { "epoch": 1.160369372835706, "grad_norm": 9.5625, "learning_rate": 5.163868450199774e-06, "loss": 1.4886462688446045, "step": 3770 }, { "epoch": 1.1609849942285495, "grad_norm": 6.34375, "learning_rate": 5.159969528817144e-06, "loss": 1.4246331453323364, "step": 3772 }, { "epoch": 1.161600615621393, "grad_norm": 8.0, "learning_rate": 5.156071442191622e-06, "loss": 1.5249407291412354, "step": 3774 }, { "epoch": 1.1622162370142362, "grad_norm": 12.8125, "learning_rate": 5.1521741941968265e-06, "loss": 1.5107097625732422, "step": 3776 }, { "epoch": 1.1628318584070796, "grad_norm": 2.09375, "learning_rate": 5.148277788705537e-06, "loss": 1.1826157569885254, "step": 3778 }, { "epoch": 1.163447479799923, "grad_norm": 9.3125, "learning_rate": 5.144382229589702e-06, "loss": 1.1656603813171387, "step": 3780 }, { "epoch": 1.1640631011927665, "grad_norm": 6.53125, "learning_rate": 5.140487520720425e-06, "loss": 1.2253952026367188, "step": 3782 }, { "epoch": 1.16467872258561, "grad_norm": 15.0625, "learning_rate": 5.136593665967964e-06, "loss": 1.449235200881958, "step": 3784 }, { "epoch": 1.1652943439784533, "grad_norm": 22.75, "learning_rate": 5.1327006692017325e-06, "loss": 1.1967664957046509, "step": 3786 }, { "epoch": 1.1659099653712968, "grad_norm": 1.8359375, "learning_rate": 5.128808534290288e-06, "loss": 1.3245983123779297, "step": 3788 }, { "epoch": 1.16652558676414, "grad_norm": 7.09375, "learning_rate": 5.12491726510133e-06, "loss": 1.5706483125686646, "step": 3790 }, { "epoch": 1.1671412081569834, "grad_norm": 5.78125, "learning_rate": 5.121026865501701e-06, "loss": 1.29669189453125, "step": 3792 }, { "epoch": 1.1677568295498268, "grad_norm": 21.5, "learning_rate": 5.117137339357381e-06, "loss": 0.7407981753349304, "step": 3794 }, { "epoch": 1.1683724509426703, "grad_norm": 8.5, "learning_rate": 5.113248690533475e-06, "loss": 1.147154688835144, "step": 3796 }, { "epoch": 1.1689880723355137, "grad_norm": 6.09375, "learning_rate": 5.109360922894222e-06, "loss": 1.1887047290802002, "step": 3798 }, { "epoch": 1.1696036937283572, "grad_norm": 9.375, "learning_rate": 5.105474040302985e-06, "loss": 1.2298659086227417, "step": 3800 }, { "epoch": 1.1702193151212004, "grad_norm": 5.375, "learning_rate": 5.101588046622248e-06, "loss": 1.3768701553344727, "step": 3802 }, { "epoch": 1.1708349365140438, "grad_norm": 6.0, "learning_rate": 5.097702945713605e-06, "loss": 1.4939483404159546, "step": 3804 }, { "epoch": 1.1714505579068872, "grad_norm": 20.0, "learning_rate": 5.093818741437771e-06, "loss": 1.765651822090149, "step": 3806 }, { "epoch": 1.1720661792997307, "grad_norm": 6.59375, "learning_rate": 5.089935437654565e-06, "loss": 1.3605940341949463, "step": 3808 }, { "epoch": 1.172681800692574, "grad_norm": 14.4375, "learning_rate": 5.0860530382229175e-06, "loss": 0.8839163184165955, "step": 3810 }, { "epoch": 1.1732974220854175, "grad_norm": 9.375, "learning_rate": 5.082171547000852e-06, "loss": 1.1597310304641724, "step": 3812 }, { "epoch": 1.1739130434782608, "grad_norm": 6.40625, "learning_rate": 5.078290967845494e-06, "loss": 0.9128301739692688, "step": 3814 }, { "epoch": 1.1745286648711042, "grad_norm": 6.6875, "learning_rate": 5.074411304613061e-06, "loss": 1.056254267692566, "step": 3816 }, { "epoch": 1.1751442862639476, "grad_norm": 13.875, "learning_rate": 5.0705325611588626e-06, "loss": 1.498201847076416, "step": 3818 }, { "epoch": 1.175759907656791, "grad_norm": 8.3125, "learning_rate": 5.066654741337294e-06, "loss": 1.243221402168274, "step": 3820 }, { "epoch": 1.1763755290496345, "grad_norm": 5.625, "learning_rate": 5.06277784900183e-06, "loss": 1.3041177988052368, "step": 3822 }, { "epoch": 1.176991150442478, "grad_norm": 81.5, "learning_rate": 5.0589018880050255e-06, "loss": 1.1544214487075806, "step": 3824 }, { "epoch": 1.1776067718353214, "grad_norm": 23.125, "learning_rate": 5.055026862198511e-06, "loss": 1.2737535238265991, "step": 3826 }, { "epoch": 1.1782223932281646, "grad_norm": 5.15625, "learning_rate": 5.051152775432987e-06, "loss": 0.8115167021751404, "step": 3828 }, { "epoch": 1.178838014621008, "grad_norm": 10.3125, "learning_rate": 5.047279631558217e-06, "loss": 1.2570488452911377, "step": 3830 }, { "epoch": 1.1794536360138514, "grad_norm": 16.25, "learning_rate": 5.043407434423036e-06, "loss": 1.199446201324463, "step": 3832 }, { "epoch": 1.1800692574066949, "grad_norm": 54.25, "learning_rate": 5.039536187875328e-06, "loss": 1.6809020042419434, "step": 3834 }, { "epoch": 1.1806848787995383, "grad_norm": 9.8125, "learning_rate": 5.0356658957620395e-06, "loss": 1.2956881523132324, "step": 3836 }, { "epoch": 1.1813005001923818, "grad_norm": 4.15625, "learning_rate": 5.0317965619291676e-06, "loss": 1.3684914112091064, "step": 3838 }, { "epoch": 1.1819161215852252, "grad_norm": 7.25, "learning_rate": 5.0279281902217555e-06, "loss": 1.5077707767486572, "step": 3840 }, { "epoch": 1.1825317429780684, "grad_norm": 5.90625, "learning_rate": 5.02406078448389e-06, "loss": 1.291996955871582, "step": 3842 }, { "epoch": 1.1831473643709118, "grad_norm": 47.0, "learning_rate": 5.0201943485587004e-06, "loss": 1.2624106407165527, "step": 3844 }, { "epoch": 1.1837629857637553, "grad_norm": 4.53125, "learning_rate": 5.016328886288348e-06, "loss": 1.0628662109375, "step": 3846 }, { "epoch": 1.1843786071565987, "grad_norm": 8.625, "learning_rate": 5.012464401514032e-06, "loss": 0.8396918773651123, "step": 3848 }, { "epoch": 1.1849942285494421, "grad_norm": 5.09375, "learning_rate": 5.0086008980759775e-06, "loss": 0.8910311460494995, "step": 3850 }, { "epoch": 1.1856098499422856, "grad_norm": 6.8125, "learning_rate": 5.004738379813432e-06, "loss": 1.350878119468689, "step": 3852 }, { "epoch": 1.186225471335129, "grad_norm": 4.8125, "learning_rate": 5.000876850564671e-06, "loss": 1.3546310663223267, "step": 3854 }, { "epoch": 1.1868410927279722, "grad_norm": 8.9375, "learning_rate": 4.997016314166978e-06, "loss": 1.653855800628662, "step": 3856 }, { "epoch": 1.1874567141208157, "grad_norm": 9.9375, "learning_rate": 4.993156774456655e-06, "loss": 1.289168119430542, "step": 3858 }, { "epoch": 1.188072335513659, "grad_norm": 7.25, "learning_rate": 4.989298235269015e-06, "loss": 0.9712845683097839, "step": 3860 }, { "epoch": 1.1886879569065025, "grad_norm": 2.484375, "learning_rate": 4.985440700438375e-06, "loss": 0.8265883326530457, "step": 3862 }, { "epoch": 1.189303578299346, "grad_norm": 8.5625, "learning_rate": 4.981584173798053e-06, "loss": 1.333146333694458, "step": 3864 }, { "epoch": 1.1899191996921894, "grad_norm": 20.375, "learning_rate": 4.977728659180367e-06, "loss": 1.0070924758911133, "step": 3866 }, { "epoch": 1.1905348210850326, "grad_norm": 3.796875, "learning_rate": 4.973874160416627e-06, "loss": 0.8928322792053223, "step": 3868 }, { "epoch": 1.191150442477876, "grad_norm": 6.65625, "learning_rate": 4.970020681337134e-06, "loss": 1.2990491390228271, "step": 3870 }, { "epoch": 1.1917660638707195, "grad_norm": 5.84375, "learning_rate": 4.966168225771179e-06, "loss": 1.4741895198822021, "step": 3872 }, { "epoch": 1.192381685263563, "grad_norm": 6.125, "learning_rate": 4.962316797547031e-06, "loss": 0.9886192083358765, "step": 3874 }, { "epoch": 1.1929973066564064, "grad_norm": 8.5625, "learning_rate": 4.958466400491943e-06, "loss": 1.517229437828064, "step": 3876 }, { "epoch": 1.1936129280492498, "grad_norm": 11.1875, "learning_rate": 4.954617038432139e-06, "loss": 1.8023860454559326, "step": 3878 }, { "epoch": 1.194228549442093, "grad_norm": 5.78125, "learning_rate": 4.950768715192819e-06, "loss": 1.4020851850509644, "step": 3880 }, { "epoch": 1.1948441708349364, "grad_norm": 4.78125, "learning_rate": 4.946921434598144e-06, "loss": 1.0979114770889282, "step": 3882 }, { "epoch": 1.1954597922277799, "grad_norm": 6.75, "learning_rate": 4.943075200471245e-06, "loss": 1.4677519798278809, "step": 3884 }, { "epoch": 1.1960754136206233, "grad_norm": 6.0625, "learning_rate": 4.939230016634211e-06, "loss": 0.9861754179000854, "step": 3886 }, { "epoch": 1.1966910350134667, "grad_norm": 27.25, "learning_rate": 4.935385886908089e-06, "loss": 1.3366788625717163, "step": 3888 }, { "epoch": 1.1973066564063102, "grad_norm": 9.875, "learning_rate": 4.931542815112875e-06, "loss": 1.57736337184906, "step": 3890 }, { "epoch": 1.1979222777991536, "grad_norm": 6.5625, "learning_rate": 4.927700805067516e-06, "loss": 1.2437076568603516, "step": 3892 }, { "epoch": 1.1985378991919968, "grad_norm": 23.625, "learning_rate": 4.9238598605899035e-06, "loss": 1.3512110710144043, "step": 3894 }, { "epoch": 1.1991535205848403, "grad_norm": 7.0, "learning_rate": 4.920019985496869e-06, "loss": 0.9439166188240051, "step": 3896 }, { "epoch": 1.1997691419776837, "grad_norm": 7.09375, "learning_rate": 4.916181183604184e-06, "loss": 1.318813681602478, "step": 3898 }, { "epoch": 1.2003847633705271, "grad_norm": 4.6875, "learning_rate": 4.912343458726552e-06, "loss": 1.2898008823394775, "step": 3900 }, { "epoch": 1.2010003847633706, "grad_norm": 10.0, "learning_rate": 4.908506814677605e-06, "loss": 1.2824454307556152, "step": 3902 }, { "epoch": 1.201616006156214, "grad_norm": 4.4375, "learning_rate": 4.904671255269903e-06, "loss": 1.2138501405715942, "step": 3904 }, { "epoch": 1.2022316275490574, "grad_norm": 11.375, "learning_rate": 4.9008367843149296e-06, "loss": 1.675101637840271, "step": 3906 }, { "epoch": 1.2028472489419006, "grad_norm": 11.4375, "learning_rate": 4.89700340562308e-06, "loss": 1.4074642658233643, "step": 3908 }, { "epoch": 1.203462870334744, "grad_norm": 4.40625, "learning_rate": 4.893171123003672e-06, "loss": 1.2531607151031494, "step": 3910 }, { "epoch": 1.2040784917275875, "grad_norm": 5.40625, "learning_rate": 4.889339940264929e-06, "loss": 1.2239080667495728, "step": 3912 }, { "epoch": 1.204694113120431, "grad_norm": 5.4375, "learning_rate": 4.8855098612139835e-06, "loss": 0.7669562101364136, "step": 3914 }, { "epoch": 1.2053097345132744, "grad_norm": 6.8125, "learning_rate": 4.8816808896568705e-06, "loss": 1.0785932540893555, "step": 3916 }, { "epoch": 1.2059253559061178, "grad_norm": 6.78125, "learning_rate": 4.877853029398527e-06, "loss": 1.1975194215774536, "step": 3918 }, { "epoch": 1.2065409772989613, "grad_norm": 7.59375, "learning_rate": 4.874026284242782e-06, "loss": 1.4719704389572144, "step": 3920 }, { "epoch": 1.2071565986918045, "grad_norm": 5.03125, "learning_rate": 4.870200657992358e-06, "loss": 1.1891669034957886, "step": 3922 }, { "epoch": 1.207772220084648, "grad_norm": 7.71875, "learning_rate": 4.866376154448864e-06, "loss": 1.0619840621948242, "step": 3924 }, { "epoch": 1.2083878414774913, "grad_norm": 6.8125, "learning_rate": 4.862552777412796e-06, "loss": 1.3883061408996582, "step": 3926 }, { "epoch": 1.2090034628703348, "grad_norm": 7.8125, "learning_rate": 4.858730530683532e-06, "loss": 1.4634684324264526, "step": 3928 }, { "epoch": 1.2096190842631782, "grad_norm": 6.59375, "learning_rate": 4.854909418059323e-06, "loss": 1.2528977394104004, "step": 3930 }, { "epoch": 1.2102347056560214, "grad_norm": 66.0, "learning_rate": 4.851089443337291e-06, "loss": 1.2767219543457031, "step": 3932 }, { "epoch": 1.2108503270488649, "grad_norm": 4.6875, "learning_rate": 4.8472706103134344e-06, "loss": 1.2410752773284912, "step": 3934 }, { "epoch": 1.2114659484417083, "grad_norm": 9.875, "learning_rate": 4.8434529227826106e-06, "loss": 1.6885144710540771, "step": 3936 }, { "epoch": 1.2120815698345517, "grad_norm": 9.25, "learning_rate": 4.839636384538543e-06, "loss": 1.6024225950241089, "step": 3938 }, { "epoch": 1.2126971912273952, "grad_norm": 8.0625, "learning_rate": 4.83582099937381e-06, "loss": 1.3082659244537354, "step": 3940 }, { "epoch": 1.2133128126202386, "grad_norm": 4.53125, "learning_rate": 4.832006771079847e-06, "loss": 1.046728253364563, "step": 3942 }, { "epoch": 1.213928434013082, "grad_norm": 5.40625, "learning_rate": 4.8281937034469364e-06, "loss": 1.2913883924484253, "step": 3944 }, { "epoch": 1.2145440554059252, "grad_norm": 9.375, "learning_rate": 4.824381800264211e-06, "loss": 1.5368022918701172, "step": 3946 }, { "epoch": 1.2151596767987687, "grad_norm": 5.65625, "learning_rate": 4.820571065319641e-06, "loss": 1.0252327919006348, "step": 3948 }, { "epoch": 1.2157752981916121, "grad_norm": 6.125, "learning_rate": 4.816761502400042e-06, "loss": 1.1273882389068604, "step": 3950 }, { "epoch": 1.2163909195844556, "grad_norm": 4.21875, "learning_rate": 4.8129531152910615e-06, "loss": 1.1979624032974243, "step": 3952 }, { "epoch": 1.217006540977299, "grad_norm": 18.125, "learning_rate": 4.80914590777718e-06, "loss": 1.2820459604263306, "step": 3954 }, { "epoch": 1.2176221623701424, "grad_norm": 4.9375, "learning_rate": 4.805339883641704e-06, "loss": 1.5242979526519775, "step": 3956 }, { "epoch": 1.2182377837629859, "grad_norm": 5.21875, "learning_rate": 4.801535046666763e-06, "loss": 1.2126457691192627, "step": 3958 }, { "epoch": 1.218853405155829, "grad_norm": 10.1875, "learning_rate": 4.797731400633312e-06, "loss": 1.3734692335128784, "step": 3960 }, { "epoch": 1.2194690265486725, "grad_norm": 10.875, "learning_rate": 4.793928949321117e-06, "loss": 1.368640661239624, "step": 3962 }, { "epoch": 1.220084647941516, "grad_norm": 2.140625, "learning_rate": 4.79012769650876e-06, "loss": 1.1736152172088623, "step": 3964 }, { "epoch": 1.2207002693343594, "grad_norm": 9.625, "learning_rate": 4.786327645973633e-06, "loss": 1.005432367324829, "step": 3966 }, { "epoch": 1.2213158907272028, "grad_norm": 9.875, "learning_rate": 4.782528801491928e-06, "loss": 1.5107418298721313, "step": 3968 }, { "epoch": 1.2219315121200462, "grad_norm": 7.8125, "learning_rate": 4.778731166838646e-06, "loss": 1.278852939605713, "step": 3970 }, { "epoch": 1.2225471335128897, "grad_norm": 6.0, "learning_rate": 4.774934745787577e-06, "loss": 1.2914077043533325, "step": 3972 }, { "epoch": 1.223162754905733, "grad_norm": 5.21875, "learning_rate": 4.7711395421113124e-06, "loss": 0.9130945801734924, "step": 3974 }, { "epoch": 1.2237783762985763, "grad_norm": 50.75, "learning_rate": 4.767345559581231e-06, "loss": 1.1317358016967773, "step": 3976 }, { "epoch": 1.2243939976914198, "grad_norm": 7.4375, "learning_rate": 4.763552801967498e-06, "loss": 1.285693645477295, "step": 3978 }, { "epoch": 1.2250096190842632, "grad_norm": 2.96875, "learning_rate": 4.759761273039061e-06, "loss": 1.1612380743026733, "step": 3980 }, { "epoch": 1.2256252404771066, "grad_norm": 2.53125, "learning_rate": 4.75597097656365e-06, "loss": 1.0672385692596436, "step": 3982 }, { "epoch": 1.22624086186995, "grad_norm": 47.25, "learning_rate": 4.7521819163077635e-06, "loss": 1.1390371322631836, "step": 3984 }, { "epoch": 1.2268564832627935, "grad_norm": 7.6875, "learning_rate": 4.748394096036678e-06, "loss": 1.5185624361038208, "step": 3986 }, { "epoch": 1.2274721046556367, "grad_norm": 7.15625, "learning_rate": 4.744607519514436e-06, "loss": 1.2074450254440308, "step": 3988 }, { "epoch": 1.2280877260484802, "grad_norm": 10.625, "learning_rate": 4.740822190503841e-06, "loss": 1.9707778692245483, "step": 3990 }, { "epoch": 1.2287033474413236, "grad_norm": 12.4375, "learning_rate": 4.737038112766461e-06, "loss": 0.9399597644805908, "step": 3992 }, { "epoch": 1.229318968834167, "grad_norm": 3.609375, "learning_rate": 4.73325529006262e-06, "loss": 1.2759641408920288, "step": 3994 }, { "epoch": 1.2299345902270105, "grad_norm": 6.90625, "learning_rate": 4.729473726151393e-06, "loss": 1.0565696954727173, "step": 3996 }, { "epoch": 1.2305502116198537, "grad_norm": 5.6875, "learning_rate": 4.725693424790603e-06, "loss": 1.1895898580551147, "step": 3998 }, { "epoch": 1.231165833012697, "grad_norm": 7.625, "learning_rate": 4.721914389736821e-06, "loss": 1.195793867111206, "step": 4000 }, { "epoch": 1.2317814544055405, "grad_norm": 12.8125, "learning_rate": 4.71813662474536e-06, "loss": 1.334660530090332, "step": 4002 }, { "epoch": 1.232397075798384, "grad_norm": 6.25, "learning_rate": 4.7143601335702686e-06, "loss": 1.3096296787261963, "step": 4004 }, { "epoch": 1.2330126971912274, "grad_norm": 9.3125, "learning_rate": 4.71058491996433e-06, "loss": 1.246321201324463, "step": 4006 }, { "epoch": 1.2336283185840708, "grad_norm": 6.46875, "learning_rate": 4.706810987679063e-06, "loss": 1.372193694114685, "step": 4008 }, { "epoch": 1.2342439399769143, "grad_norm": 5.96875, "learning_rate": 4.703038340464704e-06, "loss": 1.0236767530441284, "step": 4010 }, { "epoch": 1.2348595613697575, "grad_norm": 6.375, "learning_rate": 4.699266982070217e-06, "loss": 1.4130743741989136, "step": 4012 }, { "epoch": 1.235475182762601, "grad_norm": 7.65625, "learning_rate": 4.695496916243287e-06, "loss": 1.526268482208252, "step": 4014 }, { "epoch": 1.2360908041554444, "grad_norm": 10.375, "learning_rate": 4.691728146730314e-06, "loss": 1.710693597793579, "step": 4016 }, { "epoch": 1.2367064255482878, "grad_norm": 5.625, "learning_rate": 4.6879606772764066e-06, "loss": 1.1987316608428955, "step": 4018 }, { "epoch": 1.2373220469411312, "grad_norm": 2.46875, "learning_rate": 4.6841945116253865e-06, "loss": 1.0062463283538818, "step": 4020 }, { "epoch": 1.2379376683339747, "grad_norm": 8.0625, "learning_rate": 4.680429653519775e-06, "loss": 1.3855282068252563, "step": 4022 }, { "epoch": 1.238553289726818, "grad_norm": 6.46875, "learning_rate": 4.6766661067007945e-06, "loss": 1.396597981452942, "step": 4024 }, { "epoch": 1.2391689111196613, "grad_norm": 12.0, "learning_rate": 4.6729038749083675e-06, "loss": 1.0071481466293335, "step": 4026 }, { "epoch": 1.2397845325125048, "grad_norm": 8.5, "learning_rate": 4.669142961881108e-06, "loss": 1.3875513076782227, "step": 4028 }, { "epoch": 1.2404001539053482, "grad_norm": 8.3125, "learning_rate": 4.665383371356321e-06, "loss": 0.9031148552894592, "step": 4030 }, { "epoch": 1.2410157752981916, "grad_norm": 6.09375, "learning_rate": 4.661625107069992e-06, "loss": 1.4057817459106445, "step": 4032 }, { "epoch": 1.241631396691035, "grad_norm": 5.3125, "learning_rate": 4.657868172756799e-06, "loss": 1.468703269958496, "step": 4034 }, { "epoch": 1.2422470180838785, "grad_norm": 6.90625, "learning_rate": 4.654112572150084e-06, "loss": 1.4883900880813599, "step": 4036 }, { "epoch": 1.242862639476722, "grad_norm": 12.0, "learning_rate": 4.650358308981876e-06, "loss": 1.3661744594573975, "step": 4038 }, { "epoch": 1.2434782608695651, "grad_norm": 5.34375, "learning_rate": 4.6466053869828695e-06, "loss": 1.223865270614624, "step": 4040 }, { "epoch": 1.2440938822624086, "grad_norm": 12.25, "learning_rate": 4.6428538098824284e-06, "loss": 1.1496392488479614, "step": 4042 }, { "epoch": 1.244709503655252, "grad_norm": 8.0, "learning_rate": 4.639103581408577e-06, "loss": 1.3361393213272095, "step": 4044 }, { "epoch": 1.2453251250480954, "grad_norm": 19.125, "learning_rate": 4.635354705288002e-06, "loss": 1.4294644594192505, "step": 4046 }, { "epoch": 1.2459407464409389, "grad_norm": 9.25, "learning_rate": 4.631607185246048e-06, "loss": 1.5690128803253174, "step": 4048 }, { "epoch": 1.2465563678337823, "grad_norm": 5.65625, "learning_rate": 4.6278610250067065e-06, "loss": 1.3025476932525635, "step": 4050 }, { "epoch": 1.2471719892266255, "grad_norm": 5.5625, "learning_rate": 4.624116228292621e-06, "loss": 1.5564523935317993, "step": 4052 }, { "epoch": 1.247787610619469, "grad_norm": 7.25, "learning_rate": 4.620372798825083e-06, "loss": 1.5863494873046875, "step": 4054 }, { "epoch": 1.2484032320123124, "grad_norm": 9.875, "learning_rate": 4.61663074032402e-06, "loss": 1.7407833337783813, "step": 4056 }, { "epoch": 1.2490188534051558, "grad_norm": 53.5, "learning_rate": 4.6128900565079985e-06, "loss": 1.1799893379211426, "step": 4058 }, { "epoch": 1.2496344747979993, "grad_norm": 14.1875, "learning_rate": 4.609150751094223e-06, "loss": 1.0459473133087158, "step": 4060 }, { "epoch": 1.2502500961908427, "grad_norm": 4.0625, "learning_rate": 4.605412827798521e-06, "loss": 1.190739393234253, "step": 4062 }, { "epoch": 1.250865717583686, "grad_norm": 17.625, "learning_rate": 4.601676290335353e-06, "loss": 1.0495179891586304, "step": 4064 }, { "epoch": 1.2514813389765294, "grad_norm": 17.25, "learning_rate": 4.597941142417801e-06, "loss": 1.2779337167739868, "step": 4066 }, { "epoch": 1.2520969603693728, "grad_norm": 2.96875, "learning_rate": 4.594207387757563e-06, "loss": 0.7309108376502991, "step": 4068 }, { "epoch": 1.2527125817622162, "grad_norm": 4.875, "learning_rate": 4.590475030064957e-06, "loss": 1.3036484718322754, "step": 4070 }, { "epoch": 1.2533282031550597, "grad_norm": 7.125, "learning_rate": 4.586744073048908e-06, "loss": 1.4698227643966675, "step": 4072 }, { "epoch": 1.253943824547903, "grad_norm": 9.5, "learning_rate": 4.5830145204169555e-06, "loss": 1.1250114440917969, "step": 4074 }, { "epoch": 1.2545594459407465, "grad_norm": 2.921875, "learning_rate": 4.5792863758752355e-06, "loss": 0.8555299639701843, "step": 4076 }, { "epoch": 1.2551750673335897, "grad_norm": 4.0625, "learning_rate": 4.575559643128489e-06, "loss": 0.7287892699241638, "step": 4078 }, { "epoch": 1.2557906887264332, "grad_norm": 5.9375, "learning_rate": 4.571834325880056e-06, "loss": 1.4676601886749268, "step": 4080 }, { "epoch": 1.2564063101192766, "grad_norm": 7.78125, "learning_rate": 4.568110427831867e-06, "loss": 1.1823492050170898, "step": 4082 }, { "epoch": 1.25702193151212, "grad_norm": 4.90625, "learning_rate": 4.5643879526844435e-06, "loss": 1.2526085376739502, "step": 4084 }, { "epoch": 1.2576375529049635, "grad_norm": 7.875, "learning_rate": 4.560666904136891e-06, "loss": 0.753780722618103, "step": 4086 }, { "epoch": 1.258253174297807, "grad_norm": 23.625, "learning_rate": 4.556947285886901e-06, "loss": 0.8512752056121826, "step": 4088 }, { "epoch": 1.2588687956906504, "grad_norm": 20.75, "learning_rate": 4.553229101630738e-06, "loss": 1.7275090217590332, "step": 4090 }, { "epoch": 1.2594844170834936, "grad_norm": 6.65625, "learning_rate": 4.549512355063248e-06, "loss": 1.0966256856918335, "step": 4092 }, { "epoch": 1.260100038476337, "grad_norm": 6.4375, "learning_rate": 4.545797049877845e-06, "loss": 1.1546615362167358, "step": 4094 }, { "epoch": 1.2607156598691804, "grad_norm": 7.0, "learning_rate": 4.5420831897665115e-06, "loss": 1.4722708463668823, "step": 4096 }, { "epoch": 1.2613312812620239, "grad_norm": 9.125, "learning_rate": 4.538370778419791e-06, "loss": 1.5684009790420532, "step": 4098 }, { "epoch": 1.2619469026548673, "grad_norm": 5.75, "learning_rate": 4.534659819526794e-06, "loss": 1.1395797729492188, "step": 4100 }, { "epoch": 1.2625625240477105, "grad_norm": 3.71875, "learning_rate": 4.53095031677518e-06, "loss": 0.9826986193656921, "step": 4102 }, { "epoch": 1.2631781454405542, "grad_norm": 8.0625, "learning_rate": 4.527242273851166e-06, "loss": 1.339382529258728, "step": 4104 }, { "epoch": 1.2637937668333974, "grad_norm": 13.875, "learning_rate": 4.523535694439516e-06, "loss": 1.7129334211349487, "step": 4106 }, { "epoch": 1.2644093882262408, "grad_norm": 7.53125, "learning_rate": 4.519830582223545e-06, "loss": 1.558545470237732, "step": 4108 }, { "epoch": 1.2650250096190843, "grad_norm": 8.8125, "learning_rate": 4.516126940885103e-06, "loss": 1.5102803707122803, "step": 4110 }, { "epoch": 1.2656406310119277, "grad_norm": 3.9375, "learning_rate": 4.512424774104583e-06, "loss": 1.365267038345337, "step": 4112 }, { "epoch": 1.2662562524047711, "grad_norm": 8.4375, "learning_rate": 4.508724085560908e-06, "loss": 1.2057257890701294, "step": 4114 }, { "epoch": 1.2668718737976143, "grad_norm": 5.34375, "learning_rate": 4.505024878931539e-06, "loss": 0.8204096555709839, "step": 4116 }, { "epoch": 1.267487495190458, "grad_norm": 7.09375, "learning_rate": 4.501327157892457e-06, "loss": 0.9797679781913757, "step": 4118 }, { "epoch": 1.2681031165833012, "grad_norm": 3.828125, "learning_rate": 4.497630926118175e-06, "loss": 1.0729451179504395, "step": 4120 }, { "epoch": 1.2687187379761447, "grad_norm": 12.25, "learning_rate": 4.493936187281717e-06, "loss": 1.0971177816390991, "step": 4122 }, { "epoch": 1.269334359368988, "grad_norm": 3.4375, "learning_rate": 4.490242945054629e-06, "loss": 0.6305558085441589, "step": 4124 }, { "epoch": 1.2699499807618315, "grad_norm": 9.6875, "learning_rate": 4.486551203106971e-06, "loss": 1.2655073404312134, "step": 4126 }, { "epoch": 1.270565602154675, "grad_norm": 9.75, "learning_rate": 4.482860965107305e-06, "loss": 1.462432861328125, "step": 4128 }, { "epoch": 1.2711812235475182, "grad_norm": 19.25, "learning_rate": 4.479172234722708e-06, "loss": 1.0850521326065063, "step": 4130 }, { "epoch": 1.2717968449403616, "grad_norm": 7.0, "learning_rate": 4.47548501561875e-06, "loss": 0.8474297523498535, "step": 4132 }, { "epoch": 1.272412466333205, "grad_norm": 7.9375, "learning_rate": 4.471799311459507e-06, "loss": 1.212317943572998, "step": 4134 }, { "epoch": 1.2730280877260485, "grad_norm": 5.5, "learning_rate": 4.468115125907543e-06, "loss": 1.1656124591827393, "step": 4136 }, { "epoch": 1.273643709118892, "grad_norm": 5.8125, "learning_rate": 4.464432462623918e-06, "loss": 1.2986111640930176, "step": 4138 }, { "epoch": 1.2742593305117353, "grad_norm": 7.875, "learning_rate": 4.460751325268175e-06, "loss": 1.3736796379089355, "step": 4140 }, { "epoch": 1.2748749519045788, "grad_norm": 4.34375, "learning_rate": 4.457071717498344e-06, "loss": 1.3222219944000244, "step": 4142 }, { "epoch": 1.275490573297422, "grad_norm": 6.0625, "learning_rate": 4.453393642970933e-06, "loss": 1.3862910270690918, "step": 4144 }, { "epoch": 1.2761061946902654, "grad_norm": 5.25, "learning_rate": 4.449717105340927e-06, "loss": 1.3663526773452759, "step": 4146 }, { "epoch": 1.2767218160831089, "grad_norm": 6.46875, "learning_rate": 4.446042108261784e-06, "loss": 1.046976089477539, "step": 4148 }, { "epoch": 1.2773374374759523, "grad_norm": 10.1875, "learning_rate": 4.442368655385434e-06, "loss": 1.2302095890045166, "step": 4150 }, { "epoch": 1.2779530588687957, "grad_norm": 6.90625, "learning_rate": 4.438696750362265e-06, "loss": 1.021092176437378, "step": 4152 }, { "epoch": 1.2785686802616392, "grad_norm": 4.25, "learning_rate": 4.435026396841133e-06, "loss": 1.173077940940857, "step": 4154 }, { "epoch": 1.2791843016544826, "grad_norm": 8.4375, "learning_rate": 4.4313575984693505e-06, "loss": 1.241083025932312, "step": 4156 }, { "epoch": 1.2797999230473258, "grad_norm": 10.3125, "learning_rate": 4.4276903588926846e-06, "loss": 1.3768047094345093, "step": 4158 }, { "epoch": 1.2804155444401693, "grad_norm": 6.75, "learning_rate": 4.424024681755353e-06, "loss": 0.6591784358024597, "step": 4160 }, { "epoch": 1.2810311658330127, "grad_norm": 2.125, "learning_rate": 4.4203605707000236e-06, "loss": 1.0183517932891846, "step": 4162 }, { "epoch": 1.2816467872258561, "grad_norm": 18.125, "learning_rate": 4.4166980293678045e-06, "loss": 0.8171372413635254, "step": 4164 }, { "epoch": 1.2822624086186996, "grad_norm": 6.59375, "learning_rate": 4.413037061398244e-06, "loss": 1.4593392610549927, "step": 4166 }, { "epoch": 1.2828780300115428, "grad_norm": 21.375, "learning_rate": 4.4093776704293265e-06, "loss": 1.1947382688522339, "step": 4168 }, { "epoch": 1.2834936514043864, "grad_norm": 5.9375, "learning_rate": 4.4057198600974745e-06, "loss": 1.3039485216140747, "step": 4170 }, { "epoch": 1.2841092727972296, "grad_norm": 3.171875, "learning_rate": 4.402063634037535e-06, "loss": 0.7116605043411255, "step": 4172 }, { "epoch": 1.284724894190073, "grad_norm": 10.0625, "learning_rate": 4.398408995882782e-06, "loss": 1.313848614692688, "step": 4174 }, { "epoch": 1.2853405155829165, "grad_norm": 14.625, "learning_rate": 4.394755949264911e-06, "loss": 1.5144187211990356, "step": 4176 }, { "epoch": 1.28595613697576, "grad_norm": 21.25, "learning_rate": 4.391104497814036e-06, "loss": 1.0581088066101074, "step": 4178 }, { "epoch": 1.2865717583686034, "grad_norm": 21.25, "learning_rate": 4.3874546451586845e-06, "loss": 1.2269830703735352, "step": 4180 }, { "epoch": 1.2871873797614466, "grad_norm": 5.90625, "learning_rate": 4.383806394925799e-06, "loss": 1.0916420221328735, "step": 4182 }, { "epoch": 1.2878030011542902, "grad_norm": 14.6875, "learning_rate": 4.380159750740728e-06, "loss": 1.0353869199752808, "step": 4184 }, { "epoch": 1.2884186225471335, "grad_norm": 2.34375, "learning_rate": 4.3765147162272225e-06, "loss": 1.2946035861968994, "step": 4186 }, { "epoch": 1.289034243939977, "grad_norm": 6.4375, "learning_rate": 4.372871295007435e-06, "loss": 1.1765669584274292, "step": 4188 }, { "epoch": 1.2896498653328203, "grad_norm": 2.984375, "learning_rate": 4.369229490701916e-06, "loss": 1.099198341369629, "step": 4190 }, { "epoch": 1.2902654867256638, "grad_norm": 4.84375, "learning_rate": 4.365589306929607e-06, "loss": 0.9606583118438721, "step": 4192 }, { "epoch": 1.2908811081185072, "grad_norm": 2.609375, "learning_rate": 4.361950747307839e-06, "loss": 0.8651901483535767, "step": 4194 }, { "epoch": 1.2914967295113504, "grad_norm": 6.53125, "learning_rate": 4.358313815452333e-06, "loss": 1.2352056503295898, "step": 4196 }, { "epoch": 1.2921123509041939, "grad_norm": 8.3125, "learning_rate": 4.354678514977188e-06, "loss": 1.6388479471206665, "step": 4198 }, { "epoch": 1.2927279722970373, "grad_norm": 4.25, "learning_rate": 4.351044849494883e-06, "loss": 1.20128333568573, "step": 4200 }, { "epoch": 1.2933435936898807, "grad_norm": 8.5, "learning_rate": 4.347412822616275e-06, "loss": 1.2727470397949219, "step": 4202 }, { "epoch": 1.2939592150827242, "grad_norm": 4.625, "learning_rate": 4.343782437950589e-06, "loss": 1.2012213468551636, "step": 4204 }, { "epoch": 1.2945748364755676, "grad_norm": 4.0625, "learning_rate": 4.3401536991054194e-06, "loss": 1.129618525505066, "step": 4206 }, { "epoch": 1.295190457868411, "grad_norm": 4.125, "learning_rate": 4.336526609686726e-06, "loss": 1.3237369060516357, "step": 4208 }, { "epoch": 1.2958060792612542, "grad_norm": 15.125, "learning_rate": 4.3329011732988285e-06, "loss": 1.2727644443511963, "step": 4210 }, { "epoch": 1.2964217006540977, "grad_norm": 12.6875, "learning_rate": 4.329277393544405e-06, "loss": 1.2533864974975586, "step": 4212 }, { "epoch": 1.297037322046941, "grad_norm": 8.4375, "learning_rate": 4.325655274024487e-06, "loss": 1.5394253730773926, "step": 4214 }, { "epoch": 1.2976529434397845, "grad_norm": 4.03125, "learning_rate": 4.322034818338454e-06, "loss": 0.9984561204910278, "step": 4216 }, { "epoch": 1.298268564832628, "grad_norm": 8.0625, "learning_rate": 4.318416030084036e-06, "loss": 0.9267721176147461, "step": 4218 }, { "epoch": 1.2988841862254714, "grad_norm": 5.4375, "learning_rate": 4.314798912857301e-06, "loss": 0.7328625917434692, "step": 4220 }, { "epoch": 1.2994998076183149, "grad_norm": 6.6875, "learning_rate": 4.311183470252663e-06, "loss": 1.1941163539886475, "step": 4222 }, { "epoch": 1.300115429011158, "grad_norm": 10.75, "learning_rate": 4.307569705862866e-06, "loss": 1.5790549516677856, "step": 4224 }, { "epoch": 1.3007310504040015, "grad_norm": 29.25, "learning_rate": 4.303957623278989e-06, "loss": 0.7785030603408813, "step": 4226 }, { "epoch": 1.301346671796845, "grad_norm": 7.0, "learning_rate": 4.300347226090443e-06, "loss": 1.5293329954147339, "step": 4228 }, { "epoch": 1.3019622931896884, "grad_norm": 10.75, "learning_rate": 4.296738517884954e-06, "loss": 1.3094733953475952, "step": 4230 }, { "epoch": 1.3025779145825318, "grad_norm": 4.8125, "learning_rate": 4.293131502248582e-06, "loss": 1.3940355777740479, "step": 4232 }, { "epoch": 1.303193535975375, "grad_norm": 5.90625, "learning_rate": 4.289526182765697e-06, "loss": 1.2179944515228271, "step": 4234 }, { "epoch": 1.3038091573682187, "grad_norm": 4.1875, "learning_rate": 4.285922563018983e-06, "loss": 1.1753324270248413, "step": 4236 }, { "epoch": 1.3044247787610619, "grad_norm": 25.625, "learning_rate": 4.282320646589444e-06, "loss": 1.2323254346847534, "step": 4238 }, { "epoch": 1.3050404001539053, "grad_norm": 6.6875, "learning_rate": 4.278720437056379e-06, "loss": 1.1362353563308716, "step": 4240 }, { "epoch": 1.3056560215467488, "grad_norm": 8.8125, "learning_rate": 4.2751219379974035e-06, "loss": 1.307444453239441, "step": 4242 }, { "epoch": 1.3062716429395922, "grad_norm": 2.5625, "learning_rate": 4.271525152988419e-06, "loss": 1.4350255727767944, "step": 4244 }, { "epoch": 1.3068872643324356, "grad_norm": 22.5, "learning_rate": 4.267930085603638e-06, "loss": 1.1926822662353516, "step": 4246 }, { "epoch": 1.3075028857252788, "grad_norm": 14.5, "learning_rate": 4.264336739415555e-06, "loss": 0.6354160308837891, "step": 4248 }, { "epoch": 1.3081185071181223, "grad_norm": 13.6875, "learning_rate": 4.260745117994959e-06, "loss": 1.6849381923675537, "step": 4250 }, { "epoch": 1.3087341285109657, "grad_norm": 5.5, "learning_rate": 4.257155224910929e-06, "loss": 1.223006010055542, "step": 4252 }, { "epoch": 1.3093497499038091, "grad_norm": 13.5, "learning_rate": 4.253567063730818e-06, "loss": 1.2167497873306274, "step": 4254 }, { "epoch": 1.3099653712966526, "grad_norm": 5.4375, "learning_rate": 4.249980638020264e-06, "loss": 1.4918882846832275, "step": 4256 }, { "epoch": 1.310580992689496, "grad_norm": 4.53125, "learning_rate": 4.246395951343178e-06, "loss": 1.132615566253662, "step": 4258 }, { "epoch": 1.3111966140823395, "grad_norm": 4.75, "learning_rate": 4.242813007261742e-06, "loss": 1.1479458808898926, "step": 4260 }, { "epoch": 1.3118122354751827, "grad_norm": 8.5, "learning_rate": 4.2392318093364115e-06, "loss": 1.1298502683639526, "step": 4262 }, { "epoch": 1.312427856868026, "grad_norm": 6.5625, "learning_rate": 4.235652361125899e-06, "loss": 1.0727702379226685, "step": 4264 }, { "epoch": 1.3130434782608695, "grad_norm": 9.375, "learning_rate": 4.232074666187187e-06, "loss": 1.3204834461212158, "step": 4266 }, { "epoch": 1.313659099653713, "grad_norm": 10.0, "learning_rate": 4.228498728075508e-06, "loss": 1.491713523864746, "step": 4268 }, { "epoch": 1.3142747210465564, "grad_norm": 3.359375, "learning_rate": 4.224924550344352e-06, "loss": 1.342588186264038, "step": 4270 }, { "epoch": 1.3148903424393998, "grad_norm": 28.5, "learning_rate": 4.221352136545462e-06, "loss": 0.6640535593032837, "step": 4272 }, { "epoch": 1.3155059638322433, "grad_norm": 5.96875, "learning_rate": 4.217781490228821e-06, "loss": 0.9824773073196411, "step": 4274 }, { "epoch": 1.3161215852250865, "grad_norm": 5.25, "learning_rate": 4.214212614942664e-06, "loss": 0.8653430938720703, "step": 4276 }, { "epoch": 1.31673720661793, "grad_norm": 6.0, "learning_rate": 4.210645514233463e-06, "loss": 0.9413928389549255, "step": 4278 }, { "epoch": 1.3173528280107734, "grad_norm": 19.125, "learning_rate": 4.207080191645923e-06, "loss": 1.6054041385650635, "step": 4280 }, { "epoch": 1.3179684494036168, "grad_norm": 15.875, "learning_rate": 4.203516650722987e-06, "loss": 1.221308946609497, "step": 4282 }, { "epoch": 1.3185840707964602, "grad_norm": 8.4375, "learning_rate": 4.199954895005824e-06, "loss": 1.490601658821106, "step": 4284 }, { "epoch": 1.3191996921893034, "grad_norm": 5.40625, "learning_rate": 4.196394928033831e-06, "loss": 1.0584721565246582, "step": 4286 }, { "epoch": 1.319815313582147, "grad_norm": 20.875, "learning_rate": 4.192836753344629e-06, "loss": 1.4903297424316406, "step": 4288 }, { "epoch": 1.3204309349749903, "grad_norm": 8.5625, "learning_rate": 4.189280374474052e-06, "loss": 1.3928847312927246, "step": 4290 }, { "epoch": 1.3210465563678337, "grad_norm": 11.875, "learning_rate": 4.185725794956157e-06, "loss": 1.7273478507995605, "step": 4292 }, { "epoch": 1.3216621777606772, "grad_norm": 4.53125, "learning_rate": 4.182173018323209e-06, "loss": 0.7521393895149231, "step": 4294 }, { "epoch": 1.3222777991535206, "grad_norm": 6.0625, "learning_rate": 4.17862204810568e-06, "loss": 1.2451914548873901, "step": 4296 }, { "epoch": 1.322893420546364, "grad_norm": 5.25, "learning_rate": 4.175072887832248e-06, "loss": 1.2428488731384277, "step": 4298 }, { "epoch": 1.3235090419392073, "grad_norm": 6.875, "learning_rate": 4.171525541029797e-06, "loss": 0.8714505434036255, "step": 4300 }, { "epoch": 1.324124663332051, "grad_norm": 6.53125, "learning_rate": 4.167980011223402e-06, "loss": 1.1614984273910522, "step": 4302 }, { "epoch": 1.3247402847248941, "grad_norm": 5.25, "learning_rate": 4.164436301936334e-06, "loss": 0.8371998071670532, "step": 4304 }, { "epoch": 1.3253559061177376, "grad_norm": 4.9375, "learning_rate": 4.160894416690062e-06, "loss": 1.2088501453399658, "step": 4306 }, { "epoch": 1.325971527510581, "grad_norm": 7.71875, "learning_rate": 4.15735435900423e-06, "loss": 1.1465224027633667, "step": 4308 }, { "epoch": 1.3265871489034244, "grad_norm": 6.625, "learning_rate": 4.153816132396678e-06, "loss": 1.4936845302581787, "step": 4310 }, { "epoch": 1.3272027702962679, "grad_norm": 10.9375, "learning_rate": 4.1502797403834184e-06, "loss": 1.4611656665802002, "step": 4312 }, { "epoch": 1.327818391689111, "grad_norm": 6.0, "learning_rate": 4.146745186478642e-06, "loss": 1.3357412815093994, "step": 4314 }, { "epoch": 1.3284340130819545, "grad_norm": 4.5625, "learning_rate": 4.143212474194717e-06, "loss": 0.46920517086982727, "step": 4316 }, { "epoch": 1.329049634474798, "grad_norm": 3.03125, "learning_rate": 4.139681607042178e-06, "loss": 1.1236724853515625, "step": 4318 }, { "epoch": 1.3296652558676414, "grad_norm": 9.0, "learning_rate": 4.136152588529729e-06, "loss": 1.4975790977478027, "step": 4320 }, { "epoch": 1.3302808772604848, "grad_norm": 11.625, "learning_rate": 4.132625422164229e-06, "loss": 1.5210164785385132, "step": 4322 }, { "epoch": 1.3308964986533283, "grad_norm": 7.34375, "learning_rate": 4.129100111450709e-06, "loss": 1.5824909210205078, "step": 4324 }, { "epoch": 1.3315121200461717, "grad_norm": 4.75, "learning_rate": 4.125576659892344e-06, "loss": 1.4773908853530884, "step": 4326 }, { "epoch": 1.332127741439015, "grad_norm": 81.5, "learning_rate": 4.1220550709904676e-06, "loss": 1.4020800590515137, "step": 4328 }, { "epoch": 1.3327433628318583, "grad_norm": 4.8125, "learning_rate": 4.118535348244566e-06, "loss": 1.1659955978393555, "step": 4330 }, { "epoch": 1.3333589842247018, "grad_norm": 3.75, "learning_rate": 4.115017495152262e-06, "loss": 1.2958264350891113, "step": 4332 }, { "epoch": 1.3339746056175452, "grad_norm": 4.59375, "learning_rate": 4.1115015152093264e-06, "loss": 1.2718833684921265, "step": 4334 }, { "epoch": 1.3345902270103887, "grad_norm": 10.5625, "learning_rate": 4.107987411909667e-06, "loss": 1.120107889175415, "step": 4336 }, { "epoch": 1.335205848403232, "grad_norm": 12.125, "learning_rate": 4.104475188745327e-06, "loss": 0.9704511761665344, "step": 4338 }, { "epoch": 1.3358214697960755, "grad_norm": 14.5, "learning_rate": 4.100964849206484e-06, "loss": 1.0948693752288818, "step": 4340 }, { "epoch": 1.3364370911889187, "grad_norm": 2.1875, "learning_rate": 4.097456396781437e-06, "loss": 0.613154411315918, "step": 4342 }, { "epoch": 1.3370527125817622, "grad_norm": 14.75, "learning_rate": 4.0939498349566145e-06, "loss": 0.6655212640762329, "step": 4344 }, { "epoch": 1.3376683339746056, "grad_norm": 5.125, "learning_rate": 4.09044516721657e-06, "loss": 1.4865241050720215, "step": 4346 }, { "epoch": 1.338283955367449, "grad_norm": 4.40625, "learning_rate": 4.0869423970439646e-06, "loss": 1.161873459815979, "step": 4348 }, { "epoch": 1.3388995767602925, "grad_norm": 9.8125, "learning_rate": 4.083441527919582e-06, "loss": 1.1888126134872437, "step": 4350 }, { "epoch": 1.3395151981531357, "grad_norm": 8.8125, "learning_rate": 4.079942563322315e-06, "loss": 1.6624459028244019, "step": 4352 }, { "epoch": 1.3401308195459793, "grad_norm": 3.671875, "learning_rate": 4.0764455067291625e-06, "loss": 0.6965327262878418, "step": 4354 }, { "epoch": 1.3407464409388226, "grad_norm": 8.3125, "learning_rate": 4.0729503616152284e-06, "loss": 1.100224256515503, "step": 4356 }, { "epoch": 1.341362062331666, "grad_norm": 4.9375, "learning_rate": 4.069457131453716e-06, "loss": 1.1976124048233032, "step": 4358 }, { "epoch": 1.3419776837245094, "grad_norm": 5.03125, "learning_rate": 4.065965819715928e-06, "loss": 1.2772959470748901, "step": 4360 }, { "epoch": 1.3425933051173529, "grad_norm": 4.15625, "learning_rate": 4.062476429871255e-06, "loss": 1.228773832321167, "step": 4362 }, { "epoch": 1.3432089265101963, "grad_norm": 9.625, "learning_rate": 4.058988965387187e-06, "loss": 1.5692869424819946, "step": 4364 }, { "epoch": 1.3438245479030395, "grad_norm": 4.71875, "learning_rate": 4.055503429729294e-06, "loss": 1.2732508182525635, "step": 4366 }, { "epoch": 1.3444401692958832, "grad_norm": 55.75, "learning_rate": 4.052019826361227e-06, "loss": 1.653706669807434, "step": 4368 }, { "epoch": 1.3450557906887264, "grad_norm": 2.5625, "learning_rate": 4.04853815874473e-06, "loss": 1.0349485874176025, "step": 4370 }, { "epoch": 1.3456714120815698, "grad_norm": 4.53125, "learning_rate": 4.045058430339603e-06, "loss": 1.0696982145309448, "step": 4372 }, { "epoch": 1.3462870334744133, "grad_norm": 4.125, "learning_rate": 4.041580644603737e-06, "loss": 1.13498854637146, "step": 4374 }, { "epoch": 1.3469026548672567, "grad_norm": 5.5, "learning_rate": 4.038104804993084e-06, "loss": 1.483115315437317, "step": 4376 }, { "epoch": 1.3475182762601001, "grad_norm": 7.75, "learning_rate": 4.034630914961664e-06, "loss": 1.5009520053863525, "step": 4378 }, { "epoch": 1.3481338976529433, "grad_norm": 5.9375, "learning_rate": 4.0311589779615605e-06, "loss": 1.265244960784912, "step": 4380 }, { "epoch": 1.3487495190457868, "grad_norm": 13.5, "learning_rate": 4.027688997442911e-06, "loss": 1.444014072418213, "step": 4382 }, { "epoch": 1.3493651404386302, "grad_norm": 9.3125, "learning_rate": 4.0242209768539195e-06, "loss": 1.5586994886398315, "step": 4384 }, { "epoch": 1.3499807618314736, "grad_norm": 6.21875, "learning_rate": 4.020754919640829e-06, "loss": 1.4298980236053467, "step": 4386 }, { "epoch": 1.350596383224317, "grad_norm": 5.6875, "learning_rate": 4.017290829247942e-06, "loss": 1.0092744827270508, "step": 4388 }, { "epoch": 1.3512120046171605, "grad_norm": 13.125, "learning_rate": 4.013828709117602e-06, "loss": 1.1938982009887695, "step": 4390 }, { "epoch": 1.351827626010004, "grad_norm": 5.84375, "learning_rate": 4.010368562690195e-06, "loss": 1.0090004205703735, "step": 4392 }, { "epoch": 1.3524432474028472, "grad_norm": 8.75, "learning_rate": 4.006910393404148e-06, "loss": 1.2579269409179688, "step": 4394 }, { "epoch": 1.3530588687956906, "grad_norm": 10.5625, "learning_rate": 4.003454204695919e-06, "loss": 1.4525524377822876, "step": 4396 }, { "epoch": 1.353674490188534, "grad_norm": 5.03125, "learning_rate": 4.0000000000000015e-06, "loss": 1.2327172756195068, "step": 4398 }, { "epoch": 1.3542901115813775, "grad_norm": 6.875, "learning_rate": 3.996547782748915e-06, "loss": 1.3894731998443604, "step": 4400 }, { "epoch": 1.354905732974221, "grad_norm": 3.453125, "learning_rate": 3.993097556373205e-06, "loss": 1.2647353410720825, "step": 4402 }, { "epoch": 1.3555213543670643, "grad_norm": 16.375, "learning_rate": 3.989649324301441e-06, "loss": 1.43026864528656, "step": 4404 }, { "epoch": 1.3561369757599078, "grad_norm": 6.625, "learning_rate": 3.986203089960206e-06, "loss": 1.2500108480453491, "step": 4406 }, { "epoch": 1.356752597152751, "grad_norm": 5.15625, "learning_rate": 3.982758856774103e-06, "loss": 1.0625942945480347, "step": 4408 }, { "epoch": 1.3573682185455944, "grad_norm": 12.0625, "learning_rate": 3.979316628165741e-06, "loss": 1.4211559295654297, "step": 4410 }, { "epoch": 1.3579838399384379, "grad_norm": 8.625, "learning_rate": 3.975876407555742e-06, "loss": 1.446379542350769, "step": 4412 }, { "epoch": 1.3585994613312813, "grad_norm": 9.6875, "learning_rate": 3.9724381983627285e-06, "loss": 1.3842189311981201, "step": 4414 }, { "epoch": 1.3592150827241247, "grad_norm": 6.78125, "learning_rate": 3.969002004003326e-06, "loss": 0.9346789717674255, "step": 4416 }, { "epoch": 1.359830704116968, "grad_norm": 3.0, "learning_rate": 3.965567827892159e-06, "loss": 1.2848070859909058, "step": 4418 }, { "epoch": 1.3604463255098116, "grad_norm": 12.75, "learning_rate": 3.962135673441846e-06, "loss": 1.651962161064148, "step": 4420 }, { "epoch": 1.3610619469026548, "grad_norm": 7.84375, "learning_rate": 3.958705544062994e-06, "loss": 1.3421986103057861, "step": 4422 }, { "epoch": 1.3616775682954982, "grad_norm": 5.59375, "learning_rate": 3.9552774431642e-06, "loss": 1.4327430725097656, "step": 4424 }, { "epoch": 1.3622931896883417, "grad_norm": 4.65625, "learning_rate": 3.951851374152045e-06, "loss": 1.3108030557632446, "step": 4426 }, { "epoch": 1.3629088110811851, "grad_norm": 3.390625, "learning_rate": 3.9484273404310905e-06, "loss": 1.1684361696243286, "step": 4428 }, { "epoch": 1.3635244324740285, "grad_norm": 5.03125, "learning_rate": 3.9450053454038735e-06, "loss": 1.3234820365905762, "step": 4430 }, { "epoch": 1.3641400538668718, "grad_norm": 5.84375, "learning_rate": 3.941585392470912e-06, "loss": 1.429876446723938, "step": 4432 }, { "epoch": 1.3647556752597152, "grad_norm": 5.8125, "learning_rate": 3.938167485030687e-06, "loss": 1.1931172609329224, "step": 4434 }, { "epoch": 1.3653712966525586, "grad_norm": 10.5625, "learning_rate": 3.934751626479649e-06, "loss": 1.3414031267166138, "step": 4436 }, { "epoch": 1.365986918045402, "grad_norm": 8.125, "learning_rate": 3.931337820212215e-06, "loss": 1.2865818738937378, "step": 4438 }, { "epoch": 1.3666025394382455, "grad_norm": 5.1875, "learning_rate": 3.927926069620758e-06, "loss": 1.122363567352295, "step": 4440 }, { "epoch": 1.367218160831089, "grad_norm": 4.1875, "learning_rate": 3.924516378095613e-06, "loss": 1.2529959678649902, "step": 4442 }, { "epoch": 1.3678337822239324, "grad_norm": 6.75, "learning_rate": 3.921108749025069e-06, "loss": 1.335496425628662, "step": 4444 }, { "epoch": 1.3684494036167756, "grad_norm": 9.5, "learning_rate": 3.917703185795359e-06, "loss": 1.4739428758621216, "step": 4446 }, { "epoch": 1.369065025009619, "grad_norm": 15.0625, "learning_rate": 3.914299691790672e-06, "loss": 1.5854828357696533, "step": 4448 }, { "epoch": 1.3696806464024625, "grad_norm": 8.875, "learning_rate": 3.910898270393131e-06, "loss": 1.4780482053756714, "step": 4450 }, { "epoch": 1.370296267795306, "grad_norm": 14.5625, "learning_rate": 3.907498924982809e-06, "loss": 1.1311975717544556, "step": 4452 }, { "epoch": 1.3709118891881493, "grad_norm": 5.46875, "learning_rate": 3.9041016589377115e-06, "loss": 1.292945146560669, "step": 4454 }, { "epoch": 1.3715275105809928, "grad_norm": 16.875, "learning_rate": 3.900706475633774e-06, "loss": 1.7364095449447632, "step": 4456 }, { "epoch": 1.3721431319738362, "grad_norm": 4.46875, "learning_rate": 3.897313378444871e-06, "loss": 1.2401243448257446, "step": 4458 }, { "epoch": 1.3727587533666794, "grad_norm": 12.8125, "learning_rate": 3.893922370742797e-06, "loss": 1.4392495155334473, "step": 4460 }, { "epoch": 1.3733743747595228, "grad_norm": 5.46875, "learning_rate": 3.890533455897274e-06, "loss": 0.848065197467804, "step": 4462 }, { "epoch": 1.3739899961523663, "grad_norm": 19.75, "learning_rate": 3.887146637275939e-06, "loss": 1.5929031372070312, "step": 4464 }, { "epoch": 1.3746056175452097, "grad_norm": 31.25, "learning_rate": 3.883761918244354e-06, "loss": 0.9019598960876465, "step": 4466 }, { "epoch": 1.3752212389380531, "grad_norm": 7.28125, "learning_rate": 3.880379302165987e-06, "loss": 1.2336246967315674, "step": 4468 }, { "epoch": 1.3758368603308964, "grad_norm": 10.0, "learning_rate": 3.87699879240222e-06, "loss": 1.1775295734405518, "step": 4470 }, { "epoch": 1.37645248172374, "grad_norm": 8.1875, "learning_rate": 3.8736203923123425e-06, "loss": 1.3640468120574951, "step": 4472 }, { "epoch": 1.3770681031165832, "grad_norm": 4.65625, "learning_rate": 3.870244105253546e-06, "loss": 1.1681087017059326, "step": 4474 }, { "epoch": 1.3776837245094267, "grad_norm": 8.625, "learning_rate": 3.866869934580922e-06, "loss": 1.2777169942855835, "step": 4476 }, { "epoch": 1.37829934590227, "grad_norm": 7.75, "learning_rate": 3.8634978836474605e-06, "loss": 1.1488378047943115, "step": 4478 }, { "epoch": 1.3789149672951135, "grad_norm": 10.375, "learning_rate": 3.860127955804042e-06, "loss": 1.2367651462554932, "step": 4480 }, { "epoch": 1.379530588687957, "grad_norm": 8.1875, "learning_rate": 3.856760154399442e-06, "loss": 1.4411249160766602, "step": 4482 }, { "epoch": 1.3801462100808002, "grad_norm": 11.25, "learning_rate": 3.853394482780318e-06, "loss": 1.2920490503311157, "step": 4484 }, { "epoch": 1.3807618314736438, "grad_norm": 13.4375, "learning_rate": 3.850030944291215e-06, "loss": 1.6774686574935913, "step": 4486 }, { "epoch": 1.381377452866487, "grad_norm": 5.65625, "learning_rate": 3.846669542274559e-06, "loss": 1.4606332778930664, "step": 4488 }, { "epoch": 1.3819930742593305, "grad_norm": 6.5625, "learning_rate": 3.843310280070643e-06, "loss": 1.5854536294937134, "step": 4490 }, { "epoch": 1.382608695652174, "grad_norm": 8.1875, "learning_rate": 3.839953161017647e-06, "loss": 1.4916932582855225, "step": 4492 }, { "epoch": 1.3832243170450174, "grad_norm": 10.1875, "learning_rate": 3.836598188451615e-06, "loss": 1.4063359498977661, "step": 4494 }, { "epoch": 1.3838399384378608, "grad_norm": 4.46875, "learning_rate": 3.833245365706457e-06, "loss": 1.3658607006072998, "step": 4496 }, { "epoch": 1.384455559830704, "grad_norm": 9.1875, "learning_rate": 3.829894696113949e-06, "loss": 1.2148422002792358, "step": 4498 }, { "epoch": 1.3850711812235474, "grad_norm": 12.625, "learning_rate": 3.826546183003726e-06, "loss": 1.2281498908996582, "step": 4500 }, { "epoch": 1.3856868026163909, "grad_norm": 5.1875, "learning_rate": 3.8231998297032815e-06, "loss": 0.9534695744514465, "step": 4502 }, { "epoch": 1.3863024240092343, "grad_norm": 11.125, "learning_rate": 3.819855639537959e-06, "loss": 1.0877275466918945, "step": 4504 }, { "epoch": 1.3869180454020777, "grad_norm": 10.75, "learning_rate": 3.816513615830959e-06, "loss": 1.4760278463363647, "step": 4506 }, { "epoch": 1.3875336667949212, "grad_norm": 7.0625, "learning_rate": 3.813173761903324e-06, "loss": 1.136398434638977, "step": 4508 }, { "epoch": 1.3881492881877646, "grad_norm": 21.375, "learning_rate": 3.8098360810739386e-06, "loss": 1.4807792901992798, "step": 4510 }, { "epoch": 1.3887649095806078, "grad_norm": 9.75, "learning_rate": 3.8065005766595366e-06, "loss": 1.4393525123596191, "step": 4512 }, { "epoch": 1.3893805309734513, "grad_norm": 7.40625, "learning_rate": 3.8031672519746797e-06, "loss": 1.3941676616668701, "step": 4514 }, { "epoch": 1.3899961523662947, "grad_norm": 3.515625, "learning_rate": 3.7998361103317688e-06, "loss": 1.0671101808547974, "step": 4516 }, { "epoch": 1.3906117737591381, "grad_norm": 5.15625, "learning_rate": 3.796507155041032e-06, "loss": 1.0656688213348389, "step": 4518 }, { "epoch": 1.3912273951519816, "grad_norm": 11.0, "learning_rate": 3.7931803894105296e-06, "loss": 1.498962163925171, "step": 4520 }, { "epoch": 1.391843016544825, "grad_norm": 23.375, "learning_rate": 3.7898558167461426e-06, "loss": 1.4071496725082397, "step": 4522 }, { "epoch": 1.3924586379376684, "grad_norm": 5.875, "learning_rate": 3.7865334403515706e-06, "loss": 1.5745182037353516, "step": 4524 }, { "epoch": 1.3930742593305117, "grad_norm": 6.65625, "learning_rate": 3.7832132635283385e-06, "loss": 1.134566307067871, "step": 4526 }, { "epoch": 1.393689880723355, "grad_norm": 32.5, "learning_rate": 3.779895289575775e-06, "loss": 1.036484718322754, "step": 4528 }, { "epoch": 1.3943055021161985, "grad_norm": 4.34375, "learning_rate": 3.7765795217910294e-06, "loss": 1.0628092288970947, "step": 4530 }, { "epoch": 1.394921123509042, "grad_norm": 5.40625, "learning_rate": 3.7732659634690528e-06, "loss": 0.8714097738265991, "step": 4532 }, { "epoch": 1.3955367449018854, "grad_norm": 11.1875, "learning_rate": 3.7699546179026003e-06, "loss": 0.9510615468025208, "step": 4534 }, { "epoch": 1.3961523662947286, "grad_norm": 6.46875, "learning_rate": 3.7666454883822345e-06, "loss": 1.2836421728134155, "step": 4536 }, { "epoch": 1.3967679876875723, "grad_norm": 6.28125, "learning_rate": 3.763338578196307e-06, "loss": 1.164069652557373, "step": 4538 }, { "epoch": 1.3973836090804155, "grad_norm": 24.125, "learning_rate": 3.7600338906309747e-06, "loss": 1.5945087671279907, "step": 4540 }, { "epoch": 1.397999230473259, "grad_norm": 7.71875, "learning_rate": 3.7567314289701746e-06, "loss": 1.5566492080688477, "step": 4542 }, { "epoch": 1.3986148518661023, "grad_norm": 6.28125, "learning_rate": 3.753431196495636e-06, "loss": 1.5426421165466309, "step": 4544 }, { "epoch": 1.3992304732589458, "grad_norm": 6.25, "learning_rate": 3.750133196486878e-06, "loss": 1.209553837776184, "step": 4546 }, { "epoch": 1.3998460946517892, "grad_norm": 5.84375, "learning_rate": 3.7468374322211943e-06, "loss": 1.249995470046997, "step": 4548 }, { "epoch": 1.4004617160446324, "grad_norm": 16.0, "learning_rate": 3.743543906973661e-06, "loss": 1.1695905923843384, "step": 4550 }, { "epoch": 1.401077337437476, "grad_norm": 8.0625, "learning_rate": 3.740252624017129e-06, "loss": 1.2239853143692017, "step": 4552 }, { "epoch": 1.4016929588303193, "grad_norm": 5.65625, "learning_rate": 3.7369635866222183e-06, "loss": 1.3880513906478882, "step": 4554 }, { "epoch": 1.4023085802231627, "grad_norm": 4.96875, "learning_rate": 3.733676798057319e-06, "loss": 1.5688056945800781, "step": 4556 }, { "epoch": 1.4029242016160062, "grad_norm": 6.71875, "learning_rate": 3.7303922615885855e-06, "loss": 1.2916425466537476, "step": 4558 }, { "epoch": 1.4035398230088496, "grad_norm": 12.5625, "learning_rate": 3.7271099804799387e-06, "loss": 1.238887071609497, "step": 4560 }, { "epoch": 1.404155444401693, "grad_norm": 8.0, "learning_rate": 3.7238299579930525e-06, "loss": 1.1047825813293457, "step": 4562 }, { "epoch": 1.4047710657945363, "grad_norm": 10.4375, "learning_rate": 3.720552197387358e-06, "loss": 1.199622392654419, "step": 4564 }, { "epoch": 1.4053866871873797, "grad_norm": 6.40625, "learning_rate": 3.717276701920044e-06, "loss": 0.9500876665115356, "step": 4566 }, { "epoch": 1.4060023085802231, "grad_norm": 12.9375, "learning_rate": 3.7140034748460373e-06, "loss": 1.5336376428604126, "step": 4568 }, { "epoch": 1.4066179299730666, "grad_norm": 8.4375, "learning_rate": 3.7107325194180216e-06, "loss": 1.1892744302749634, "step": 4570 }, { "epoch": 1.40723355136591, "grad_norm": 17.375, "learning_rate": 3.7074638388864157e-06, "loss": 1.3420631885528564, "step": 4572 }, { "epoch": 1.4078491727587534, "grad_norm": 9.5, "learning_rate": 3.704197436499384e-06, "loss": 1.0091776847839355, "step": 4574 }, { "epoch": 1.4084647941515969, "grad_norm": 4.375, "learning_rate": 3.7009333155028215e-06, "loss": 1.2374558448791504, "step": 4576 }, { "epoch": 1.40908041554444, "grad_norm": 4.28125, "learning_rate": 3.697671479140359e-06, "loss": 1.1223832368850708, "step": 4578 }, { "epoch": 1.4096960369372835, "grad_norm": 4.15625, "learning_rate": 3.694411930653356e-06, "loss": 1.2537848949432373, "step": 4580 }, { "epoch": 1.410311658330127, "grad_norm": 8.1875, "learning_rate": 3.691154673280898e-06, "loss": 1.3645119667053223, "step": 4582 }, { "epoch": 1.4109272797229704, "grad_norm": 7.28125, "learning_rate": 3.6878997102597967e-06, "loss": 1.4384486675262451, "step": 4584 }, { "epoch": 1.4115429011158138, "grad_norm": 14.0625, "learning_rate": 3.6846470448245817e-06, "loss": 1.1669893264770508, "step": 4586 }, { "epoch": 1.4121585225086573, "grad_norm": 2.375, "learning_rate": 3.6813966802074975e-06, "loss": 1.2316890954971313, "step": 4588 }, { "epoch": 1.4127741439015007, "grad_norm": 15.5625, "learning_rate": 3.6781486196385085e-06, "loss": 1.3015397787094116, "step": 4590 }, { "epoch": 1.413389765294344, "grad_norm": 5.75, "learning_rate": 3.674902866345279e-06, "loss": 1.0278640985488892, "step": 4592 }, { "epoch": 1.4140053866871873, "grad_norm": 4.875, "learning_rate": 3.6716594235531915e-06, "loss": 1.4170969724655151, "step": 4594 }, { "epoch": 1.4146210080800308, "grad_norm": 5.625, "learning_rate": 3.6684182944853274e-06, "loss": 1.532051920890808, "step": 4596 }, { "epoch": 1.4152366294728742, "grad_norm": 8.75, "learning_rate": 3.6651794823624665e-06, "loss": 1.4891656637191772, "step": 4598 }, { "epoch": 1.4158522508657176, "grad_norm": 10.625, "learning_rate": 3.661942990403092e-06, "loss": 1.4055567979812622, "step": 4600 }, { "epoch": 1.4164678722585609, "grad_norm": 8.0625, "learning_rate": 3.658708821823376e-06, "loss": 0.8534173965454102, "step": 4602 }, { "epoch": 1.4170834936514045, "grad_norm": 4.15625, "learning_rate": 3.655476979837189e-06, "loss": 1.2280539274215698, "step": 4604 }, { "epoch": 1.4176991150442477, "grad_norm": 8.0, "learning_rate": 3.6522474676560786e-06, "loss": 1.5194727182388306, "step": 4606 }, { "epoch": 1.4183147364370912, "grad_norm": 5.8125, "learning_rate": 3.649020288489288e-06, "loss": 1.046549916267395, "step": 4608 }, { "epoch": 1.4189303578299346, "grad_norm": 10.875, "learning_rate": 3.645795445543736e-06, "loss": 1.0877692699432373, "step": 4610 }, { "epoch": 1.419545979222778, "grad_norm": 4.34375, "learning_rate": 3.6425729420240193e-06, "loss": 1.3161126375198364, "step": 4612 }, { "epoch": 1.4201616006156215, "grad_norm": 5.46875, "learning_rate": 3.6393527811324154e-06, "loss": 0.7252700328826904, "step": 4614 }, { "epoch": 1.4207772220084647, "grad_norm": 8.25, "learning_rate": 3.6361349660688687e-06, "loss": 1.6500828266143799, "step": 4616 }, { "epoch": 1.4213928434013081, "grad_norm": 9.0, "learning_rate": 3.632919500030994e-06, "loss": 1.3527953624725342, "step": 4618 }, { "epoch": 1.4220084647941515, "grad_norm": 5.125, "learning_rate": 3.629706386214073e-06, "loss": 1.0250791311264038, "step": 4620 }, { "epoch": 1.422624086186995, "grad_norm": 17.375, "learning_rate": 3.626495627811046e-06, "loss": 1.1802740097045898, "step": 4622 }, { "epoch": 1.4232397075798384, "grad_norm": 5.0625, "learning_rate": 3.62328722801252e-06, "loss": 1.4389358758926392, "step": 4624 }, { "epoch": 1.4238553289726819, "grad_norm": 4.40625, "learning_rate": 3.6200811900067488e-06, "loss": 1.3220165967941284, "step": 4626 }, { "epoch": 1.4244709503655253, "grad_norm": 10.5625, "learning_rate": 3.616877516979649e-06, "loss": 1.3158611059188843, "step": 4628 }, { "epoch": 1.4250865717583685, "grad_norm": 15.0, "learning_rate": 3.6136762121147805e-06, "loss": 1.0538830757141113, "step": 4630 }, { "epoch": 1.425702193151212, "grad_norm": 9.8125, "learning_rate": 3.610477278593351e-06, "loss": 1.0830562114715576, "step": 4632 }, { "epoch": 1.4263178145440554, "grad_norm": 5.59375, "learning_rate": 3.607280719594213e-06, "loss": 1.409989833831787, "step": 4634 }, { "epoch": 1.4269334359368988, "grad_norm": 3.84375, "learning_rate": 3.6040865382938578e-06, "loss": 1.2030140161514282, "step": 4636 }, { "epoch": 1.4275490573297422, "grad_norm": 3.96875, "learning_rate": 3.6008947378664164e-06, "loss": 1.1751364469528198, "step": 4638 }, { "epoch": 1.4281646787225857, "grad_norm": 6.21875, "learning_rate": 3.597705321483653e-06, "loss": 1.0235944986343384, "step": 4640 }, { "epoch": 1.4287803001154291, "grad_norm": 12.375, "learning_rate": 3.5945182923149602e-06, "loss": 1.2032253742218018, "step": 4642 }, { "epoch": 1.4293959215082723, "grad_norm": 3.15625, "learning_rate": 3.5913336535273613e-06, "loss": 0.9244052171707153, "step": 4644 }, { "epoch": 1.4300115429011158, "grad_norm": 8.875, "learning_rate": 3.5881514082855023e-06, "loss": 1.4916614294052124, "step": 4646 }, { "epoch": 1.4306271642939592, "grad_norm": 7.34375, "learning_rate": 3.584971559751653e-06, "loss": 1.3980257511138916, "step": 4648 }, { "epoch": 1.4312427856868026, "grad_norm": 37.25, "learning_rate": 3.5817941110857008e-06, "loss": 1.134635090827942, "step": 4650 }, { "epoch": 1.431858407079646, "grad_norm": 4.6875, "learning_rate": 3.578619065445144e-06, "loss": 1.467618465423584, "step": 4652 }, { "epoch": 1.4324740284724893, "grad_norm": 5.90625, "learning_rate": 3.5754464259851013e-06, "loss": 1.207594871520996, "step": 4654 }, { "epoch": 1.433089649865333, "grad_norm": 10.5625, "learning_rate": 3.572276195858293e-06, "loss": 1.0639207363128662, "step": 4656 }, { "epoch": 1.4337052712581762, "grad_norm": 4.9375, "learning_rate": 3.569108378215049e-06, "loss": 1.3432626724243164, "step": 4658 }, { "epoch": 1.4343208926510196, "grad_norm": 7.3125, "learning_rate": 3.5659429762032977e-06, "loss": 1.2566211223602295, "step": 4660 }, { "epoch": 1.434936514043863, "grad_norm": 4.21875, "learning_rate": 3.562779992968574e-06, "loss": 1.310718297958374, "step": 4662 }, { "epoch": 1.4355521354367065, "grad_norm": 13.875, "learning_rate": 3.559619431654004e-06, "loss": 1.1663079261779785, "step": 4664 }, { "epoch": 1.43616775682955, "grad_norm": 4.9375, "learning_rate": 3.5564612954003066e-06, "loss": 1.4862658977508545, "step": 4666 }, { "epoch": 1.436783378222393, "grad_norm": 24.75, "learning_rate": 3.553305587345796e-06, "loss": 1.6377918720245361, "step": 4668 }, { "epoch": 1.4373989996152368, "grad_norm": 6.6875, "learning_rate": 3.550152310626366e-06, "loss": 1.3760124444961548, "step": 4670 }, { "epoch": 1.43801462100808, "grad_norm": 3.359375, "learning_rate": 3.547001468375501e-06, "loss": 1.28596830368042, "step": 4672 }, { "epoch": 1.4386302424009234, "grad_norm": 18.625, "learning_rate": 3.543853063724263e-06, "loss": 0.7132194638252258, "step": 4674 }, { "epoch": 1.4392458637937668, "grad_norm": 10.0, "learning_rate": 3.540707099801291e-06, "loss": 1.4063074588775635, "step": 4676 }, { "epoch": 1.4398614851866103, "grad_norm": 2.484375, "learning_rate": 3.5375635797328024e-06, "loss": 0.8440459966659546, "step": 4678 }, { "epoch": 1.4404771065794537, "grad_norm": 4.5, "learning_rate": 3.534422506642581e-06, "loss": 1.209274172782898, "step": 4680 }, { "epoch": 1.441092727972297, "grad_norm": 7.34375, "learning_rate": 3.5312838836519846e-06, "loss": 0.9432491660118103, "step": 4682 }, { "epoch": 1.4417083493651404, "grad_norm": 16.0, "learning_rate": 3.52814771387993e-06, "loss": 1.304944634437561, "step": 4684 }, { "epoch": 1.4423239707579838, "grad_norm": 5.09375, "learning_rate": 3.5250140004429005e-06, "loss": 1.2836129665374756, "step": 4686 }, { "epoch": 1.4429395921508272, "grad_norm": 2.65625, "learning_rate": 3.521882746454939e-06, "loss": 1.123637080192566, "step": 4688 }, { "epoch": 1.4435552135436707, "grad_norm": 9.375, "learning_rate": 3.518753955027639e-06, "loss": 1.6363736391067505, "step": 4690 }, { "epoch": 1.444170834936514, "grad_norm": 9.1875, "learning_rate": 3.5156276292701552e-06, "loss": 1.3315503597259521, "step": 4692 }, { "epoch": 1.4447864563293575, "grad_norm": 3.484375, "learning_rate": 3.5125037722891846e-06, "loss": 1.1604689359664917, "step": 4694 }, { "epoch": 1.4454020777222008, "grad_norm": 3.125, "learning_rate": 3.5093823871889756e-06, "loss": 0.9855441451072693, "step": 4696 }, { "epoch": 1.4460176991150442, "grad_norm": 5.40625, "learning_rate": 3.5062634770713174e-06, "loss": 1.2205106019973755, "step": 4698 }, { "epoch": 1.4466333205078876, "grad_norm": 3.3125, "learning_rate": 3.5031470450355396e-06, "loss": 1.3333935737609863, "step": 4700 }, { "epoch": 1.447248941900731, "grad_norm": 4.03125, "learning_rate": 3.5000330941785142e-06, "loss": 1.1714081764221191, "step": 4702 }, { "epoch": 1.4478645632935745, "grad_norm": 6.25, "learning_rate": 3.4969216275946414e-06, "loss": 1.0127254724502563, "step": 4704 }, { "epoch": 1.448480184686418, "grad_norm": 6.09375, "learning_rate": 3.4938126483758544e-06, "loss": 0.9646738767623901, "step": 4706 }, { "epoch": 1.4490958060792614, "grad_norm": 5.28125, "learning_rate": 3.49070615961162e-06, "loss": 1.1320888996124268, "step": 4708 }, { "epoch": 1.4497114274721046, "grad_norm": 10.5, "learning_rate": 3.4876021643889203e-06, "loss": 1.3203264474868774, "step": 4710 }, { "epoch": 1.450327048864948, "grad_norm": 2.625, "learning_rate": 3.484500665792268e-06, "loss": 1.0886530876159668, "step": 4712 }, { "epoch": 1.4509426702577914, "grad_norm": 28.75, "learning_rate": 3.4814016669036903e-06, "loss": 1.164709210395813, "step": 4714 }, { "epoch": 1.4515582916506349, "grad_norm": 3.90625, "learning_rate": 3.4783051708027337e-06, "loss": 1.1819281578063965, "step": 4716 }, { "epoch": 1.4521739130434783, "grad_norm": 9.0625, "learning_rate": 3.4752111805664547e-06, "loss": 1.2423570156097412, "step": 4718 }, { "epoch": 1.4527895344363215, "grad_norm": 7.03125, "learning_rate": 3.472119699269421e-06, "loss": 1.3394665718078613, "step": 4720 }, { "epoch": 1.4534051558291652, "grad_norm": 7.96875, "learning_rate": 3.4690307299837065e-06, "loss": 1.3577899932861328, "step": 4722 }, { "epoch": 1.4540207772220084, "grad_norm": 6.40625, "learning_rate": 3.4659442757788886e-06, "loss": 1.440277338027954, "step": 4724 }, { "epoch": 1.4546363986148518, "grad_norm": 4.8125, "learning_rate": 3.462860339722048e-06, "loss": 1.4500181674957275, "step": 4726 }, { "epoch": 1.4552520200076953, "grad_norm": 5.5625, "learning_rate": 3.459778924877759e-06, "loss": 1.429072618484497, "step": 4728 }, { "epoch": 1.4558676414005387, "grad_norm": 6.125, "learning_rate": 3.4567000343080936e-06, "loss": 1.4825221300125122, "step": 4730 }, { "epoch": 1.4564832627933821, "grad_norm": 5.65625, "learning_rate": 3.4536236710726147e-06, "loss": 1.3014346361160278, "step": 4732 }, { "epoch": 1.4570988841862254, "grad_norm": 4.65625, "learning_rate": 3.450549838228373e-06, "loss": 1.0785235166549683, "step": 4734 }, { "epoch": 1.457714505579069, "grad_norm": 5.03125, "learning_rate": 3.4474785388299054e-06, "loss": 1.0648795366287231, "step": 4736 }, { "epoch": 1.4583301269719122, "grad_norm": 6.40625, "learning_rate": 3.4444097759292294e-06, "loss": 0.4583609700202942, "step": 4738 }, { "epoch": 1.4589457483647557, "grad_norm": 4.1875, "learning_rate": 3.4413435525758456e-06, "loss": 0.9563672542572021, "step": 4740 }, { "epoch": 1.459561369757599, "grad_norm": 17.5, "learning_rate": 3.4382798718167283e-06, "loss": 1.5865509510040283, "step": 4742 }, { "epoch": 1.4601769911504425, "grad_norm": 4.53125, "learning_rate": 3.435218736696324e-06, "loss": 1.3797905445098877, "step": 4744 }, { "epoch": 1.460792612543286, "grad_norm": 18.25, "learning_rate": 3.432160150256556e-06, "loss": 1.4692882299423218, "step": 4746 }, { "epoch": 1.4614082339361292, "grad_norm": 6.0625, "learning_rate": 3.429104115536803e-06, "loss": 0.8994933366775513, "step": 4748 }, { "epoch": 1.4620238553289726, "grad_norm": 21.25, "learning_rate": 3.4260506355739214e-06, "loss": 1.1846752166748047, "step": 4750 }, { "epoch": 1.462639476721816, "grad_norm": 7.5, "learning_rate": 3.422999713402221e-06, "loss": 1.5342363119125366, "step": 4752 }, { "epoch": 1.4632550981146595, "grad_norm": 17.125, "learning_rate": 3.419951352053469e-06, "loss": 1.5448050498962402, "step": 4754 }, { "epoch": 1.463870719507503, "grad_norm": 5.0625, "learning_rate": 3.416905554556893e-06, "loss": 1.5295370817184448, "step": 4756 }, { "epoch": 1.4644863409003464, "grad_norm": 7.71875, "learning_rate": 3.4138623239391705e-06, "loss": 1.2686611413955688, "step": 4758 }, { "epoch": 1.4651019622931898, "grad_norm": 8.375, "learning_rate": 3.4108216632244272e-06, "loss": 1.2924885749816895, "step": 4760 }, { "epoch": 1.465717583686033, "grad_norm": 6.0625, "learning_rate": 3.4077835754342357e-06, "loss": 1.1660832166671753, "step": 4762 }, { "epoch": 1.4663332050788764, "grad_norm": 4.71875, "learning_rate": 3.4047480635876106e-06, "loss": 0.7886714339256287, "step": 4764 }, { "epoch": 1.4669488264717199, "grad_norm": 6.125, "learning_rate": 3.40171513070101e-06, "loss": 0.928934633731842, "step": 4766 }, { "epoch": 1.4675644478645633, "grad_norm": 4.09375, "learning_rate": 3.3986847797883265e-06, "loss": 1.2280499935150146, "step": 4768 }, { "epoch": 1.4681800692574067, "grad_norm": 12.625, "learning_rate": 3.395657013860889e-06, "loss": 1.5266685485839844, "step": 4770 }, { "epoch": 1.4687956906502502, "grad_norm": 8.4375, "learning_rate": 3.392631835927455e-06, "loss": 1.3424854278564453, "step": 4772 }, { "epoch": 1.4694113120430936, "grad_norm": 5.5, "learning_rate": 3.3896092489942123e-06, "loss": 1.4064112901687622, "step": 4774 }, { "epoch": 1.4700269334359368, "grad_norm": 11.625, "learning_rate": 3.386589256064773e-06, "loss": 1.328787922859192, "step": 4776 }, { "epoch": 1.4706425548287803, "grad_norm": 4.6875, "learning_rate": 3.3835718601401696e-06, "loss": 1.0625183582305908, "step": 4778 }, { "epoch": 1.4712581762216237, "grad_norm": 6.78125, "learning_rate": 3.38055706421886e-06, "loss": 0.9996272921562195, "step": 4780 }, { "epoch": 1.4718737976144671, "grad_norm": 6.625, "learning_rate": 3.3775448712967128e-06, "loss": 1.4594124555587769, "step": 4782 }, { "epoch": 1.4724894190073106, "grad_norm": 13.75, "learning_rate": 3.374535284367011e-06, "loss": 1.557099461555481, "step": 4784 }, { "epoch": 1.4731050404001538, "grad_norm": 8.6875, "learning_rate": 3.371528306420451e-06, "loss": 1.6163135766983032, "step": 4786 }, { "epoch": 1.4737206617929974, "grad_norm": 9.375, "learning_rate": 3.3685239404451286e-06, "loss": 1.7769767045974731, "step": 4788 }, { "epoch": 1.4743362831858406, "grad_norm": 3.15625, "learning_rate": 3.365522189426556e-06, "loss": 1.1799372434616089, "step": 4790 }, { "epoch": 1.474951904578684, "grad_norm": 6.71875, "learning_rate": 3.3625230563476356e-06, "loss": 1.2087534666061401, "step": 4792 }, { "epoch": 1.4755675259715275, "grad_norm": 7.59375, "learning_rate": 3.359526544188677e-06, "loss": 1.1725058555603027, "step": 4794 }, { "epoch": 1.476183147364371, "grad_norm": 3.46875, "learning_rate": 3.3565326559273803e-06, "loss": 1.0747044086456299, "step": 4796 }, { "epoch": 1.4767987687572144, "grad_norm": 5.46875, "learning_rate": 3.3535413945388385e-06, "loss": 1.2907452583312988, "step": 4798 }, { "epoch": 1.4774143901500576, "grad_norm": 96.5, "learning_rate": 3.3505527629955357e-06, "loss": 0.9903527498245239, "step": 4800 }, { "epoch": 1.478030011542901, "grad_norm": 4.9375, "learning_rate": 3.34756676426734e-06, "loss": 1.253095030784607, "step": 4802 }, { "epoch": 1.4786456329357445, "grad_norm": 5.625, "learning_rate": 3.3445834013215095e-06, "loss": 1.3294130563735962, "step": 4804 }, { "epoch": 1.479261254328588, "grad_norm": 2.84375, "learning_rate": 3.3416026771226756e-06, "loss": 1.3141107559204102, "step": 4806 }, { "epoch": 1.4798768757214313, "grad_norm": 8.0625, "learning_rate": 3.338624594632851e-06, "loss": 0.8612028956413269, "step": 4808 }, { "epoch": 1.4804924971142748, "grad_norm": 8.375, "learning_rate": 3.335649156811425e-06, "loss": 1.5777348279953003, "step": 4810 }, { "epoch": 1.4811081185071182, "grad_norm": 3.515625, "learning_rate": 3.332676366615154e-06, "loss": 1.1648386716842651, "step": 4812 }, { "epoch": 1.4817237398999614, "grad_norm": 8.5625, "learning_rate": 3.329706226998169e-06, "loss": 1.3452742099761963, "step": 4814 }, { "epoch": 1.4823393612928049, "grad_norm": 6.03125, "learning_rate": 3.3267387409119633e-06, "loss": 1.1355098485946655, "step": 4816 }, { "epoch": 1.4829549826856483, "grad_norm": 8.9375, "learning_rate": 3.3237739113053924e-06, "loss": 1.2803765535354614, "step": 4818 }, { "epoch": 1.4835706040784917, "grad_norm": 7.46875, "learning_rate": 3.3208117411246766e-06, "loss": 1.2495794296264648, "step": 4820 }, { "epoch": 1.4841862254713352, "grad_norm": 5.0625, "learning_rate": 3.317852233313389e-06, "loss": 1.2289323806762695, "step": 4822 }, { "epoch": 1.4848018468641786, "grad_norm": 3.890625, "learning_rate": 3.3148953908124624e-06, "loss": 1.246183156967163, "step": 4824 }, { "epoch": 1.485417468257022, "grad_norm": 6.34375, "learning_rate": 3.3119412165601717e-06, "loss": 1.047895073890686, "step": 4826 }, { "epoch": 1.4860330896498652, "grad_norm": 7.1875, "learning_rate": 3.308989713492151e-06, "loss": 1.32063627243042, "step": 4828 }, { "epoch": 1.4866487110427087, "grad_norm": 3.796875, "learning_rate": 3.3060408845413733e-06, "loss": 1.0678014755249023, "step": 4830 }, { "epoch": 1.4872643324355521, "grad_norm": 5.15625, "learning_rate": 3.3030947326381548e-06, "loss": 1.2184605598449707, "step": 4832 }, { "epoch": 1.4878799538283956, "grad_norm": 12.625, "learning_rate": 3.300151260710155e-06, "loss": 1.4712916612625122, "step": 4834 }, { "epoch": 1.488495575221239, "grad_norm": 14.5, "learning_rate": 3.2972104716823663e-06, "loss": 1.0407633781433105, "step": 4836 }, { "epoch": 1.4891111966140824, "grad_norm": 5.4375, "learning_rate": 3.2942723684771172e-06, "loss": 1.1654162406921387, "step": 4838 }, { "epoch": 1.4897268180069259, "grad_norm": 8.3125, "learning_rate": 3.2913369540140673e-06, "loss": 1.262948989868164, "step": 4840 }, { "epoch": 1.490342439399769, "grad_norm": 7.34375, "learning_rate": 3.2884042312102017e-06, "loss": 1.097400188446045, "step": 4842 }, { "epoch": 1.4909580607926125, "grad_norm": 3.0, "learning_rate": 3.285474202979835e-06, "loss": 0.5754028558731079, "step": 4844 }, { "epoch": 1.491573682185456, "grad_norm": 4.28125, "learning_rate": 3.2825468722346e-06, "loss": 0.969556450843811, "step": 4846 }, { "epoch": 1.4921893035782994, "grad_norm": 8.1875, "learning_rate": 3.2796222418834533e-06, "loss": 1.434523105621338, "step": 4848 }, { "epoch": 1.4928049249711428, "grad_norm": 9.3125, "learning_rate": 3.276700314832666e-06, "loss": 1.8160494565963745, "step": 4850 }, { "epoch": 1.493420546363986, "grad_norm": 17.5, "learning_rate": 3.2737810939858183e-06, "loss": 1.635191559791565, "step": 4852 }, { "epoch": 1.4940361677568297, "grad_norm": 5.4375, "learning_rate": 3.270864582243809e-06, "loss": 1.1667835712432861, "step": 4854 }, { "epoch": 1.494651789149673, "grad_norm": 9.1875, "learning_rate": 3.267950782504839e-06, "loss": 1.4574627876281738, "step": 4856 }, { "epoch": 1.4952674105425163, "grad_norm": 5.78125, "learning_rate": 3.265039697664419e-06, "loss": 1.0753768682479858, "step": 4858 }, { "epoch": 1.4958830319353598, "grad_norm": 5.3125, "learning_rate": 3.262131330615358e-06, "loss": 1.4011366367340088, "step": 4860 }, { "epoch": 1.4964986533282032, "grad_norm": 1.7421875, "learning_rate": 3.2592256842477644e-06, "loss": 1.22893226146698, "step": 4862 }, { "epoch": 1.4971142747210466, "grad_norm": 7.375, "learning_rate": 3.2563227614490456e-06, "loss": 1.0237904787063599, "step": 4864 }, { "epoch": 1.4977298961138898, "grad_norm": 5.78125, "learning_rate": 3.2534225651038997e-06, "loss": 1.1823903322219849, "step": 4866 }, { "epoch": 1.4983455175067333, "grad_norm": 9.5625, "learning_rate": 3.2505250980943182e-06, "loss": 1.0303336381912231, "step": 4868 }, { "epoch": 1.4989611388995767, "grad_norm": 5.0, "learning_rate": 3.2476303632995792e-06, "loss": 1.4868566989898682, "step": 4870 }, { "epoch": 1.4995767602924202, "grad_norm": 8.25, "learning_rate": 3.244738363596244e-06, "loss": 0.9418045282363892, "step": 4872 }, { "epoch": 1.5001923816852636, "grad_norm": 5.84375, "learning_rate": 3.241849101858159e-06, "loss": 1.1360383033752441, "step": 4874 }, { "epoch": 1.5008080030781068, "grad_norm": 33.0, "learning_rate": 3.238962580956447e-06, "loss": 1.2348039150238037, "step": 4876 }, { "epoch": 1.5014236244709505, "grad_norm": 7.0625, "learning_rate": 3.2360788037595104e-06, "loss": 1.745518445968628, "step": 4878 }, { "epoch": 1.5020392458637937, "grad_norm": 1.8515625, "learning_rate": 3.23319777313302e-06, "loss": 1.1881951093673706, "step": 4880 }, { "epoch": 1.5026548672566373, "grad_norm": 10.625, "learning_rate": 3.2303194919399244e-06, "loss": 0.7165201902389526, "step": 4882 }, { "epoch": 1.5032704886494805, "grad_norm": 6.40625, "learning_rate": 3.227443963040434e-06, "loss": 1.5470805168151855, "step": 4884 }, { "epoch": 1.503886110042324, "grad_norm": 5.375, "learning_rate": 3.2245711892920256e-06, "loss": 0.9742276668548584, "step": 4886 }, { "epoch": 1.5045017314351674, "grad_norm": 5.65625, "learning_rate": 3.221701173549443e-06, "loss": 1.2734863758087158, "step": 4888 }, { "epoch": 1.5051173528280106, "grad_norm": 12.75, "learning_rate": 3.218833918664679e-06, "loss": 1.0193525552749634, "step": 4890 }, { "epoch": 1.5057329742208543, "grad_norm": 16.875, "learning_rate": 3.2159694274869935e-06, "loss": 1.5467280149459839, "step": 4892 }, { "epoch": 1.5063485956136975, "grad_norm": 4.9375, "learning_rate": 3.2131077028628945e-06, "loss": 1.0171399116516113, "step": 4894 }, { "epoch": 1.506964217006541, "grad_norm": 4.71875, "learning_rate": 3.21024874763614e-06, "loss": 0.7832403182983398, "step": 4896 }, { "epoch": 1.5075798383993844, "grad_norm": 2.921875, "learning_rate": 3.2073925646477406e-06, "loss": 0.8320021629333496, "step": 4898 }, { "epoch": 1.5081954597922278, "grad_norm": 4.28125, "learning_rate": 3.2045391567359473e-06, "loss": 1.2064433097839355, "step": 4900 }, { "epoch": 1.5088110811850712, "grad_norm": 9.9375, "learning_rate": 3.2016885267362595e-06, "loss": 1.3419796228408813, "step": 4902 }, { "epoch": 1.5094267025779144, "grad_norm": 3.234375, "learning_rate": 3.198840677481407e-06, "loss": 1.22185480594635, "step": 4904 }, { "epoch": 1.510042323970758, "grad_norm": 6.8125, "learning_rate": 3.1959956118013637e-06, "loss": 1.1904466152191162, "step": 4906 }, { "epoch": 1.5106579453636013, "grad_norm": 4.75, "learning_rate": 3.1931533325233354e-06, "loss": 1.4791064262390137, "step": 4908 }, { "epoch": 1.5112735667564448, "grad_norm": 31.5, "learning_rate": 3.1903138424717573e-06, "loss": 0.6052420139312744, "step": 4910 }, { "epoch": 1.5118891881492882, "grad_norm": 10.375, "learning_rate": 3.1874771444682962e-06, "loss": 0.9311878085136414, "step": 4912 }, { "epoch": 1.5125048095421316, "grad_norm": 12.25, "learning_rate": 3.1846432413318425e-06, "loss": 1.0832551717758179, "step": 4914 }, { "epoch": 1.513120430934975, "grad_norm": 4.4375, "learning_rate": 3.181812135878508e-06, "loss": 1.0968680381774902, "step": 4916 }, { "epoch": 1.5137360523278183, "grad_norm": 1.828125, "learning_rate": 3.178983830921626e-06, "loss": 1.0732853412628174, "step": 4918 }, { "epoch": 1.514351673720662, "grad_norm": 3.9375, "learning_rate": 3.1761583292717456e-06, "loss": 1.0559816360473633, "step": 4920 }, { "epoch": 1.5149672951135051, "grad_norm": 4.1875, "learning_rate": 3.1733356337366334e-06, "loss": 1.3119699954986572, "step": 4922 }, { "epoch": 1.5155829165063486, "grad_norm": 7.40625, "learning_rate": 3.1705157471212634e-06, "loss": 1.2674381732940674, "step": 4924 }, { "epoch": 1.516198537899192, "grad_norm": 9.6875, "learning_rate": 3.16769867222782e-06, "loss": 1.6064198017120361, "step": 4926 }, { "epoch": 1.5168141592920354, "grad_norm": 8.6875, "learning_rate": 3.164884411855697e-06, "loss": 1.4921178817749023, "step": 4928 }, { "epoch": 1.5174297806848789, "grad_norm": 13.0625, "learning_rate": 3.162072968801483e-06, "loss": 1.308781623840332, "step": 4930 }, { "epoch": 1.518045402077722, "grad_norm": 4.65625, "learning_rate": 3.159264345858975e-06, "loss": 1.440758228302002, "step": 4932 }, { "epoch": 1.5186610234705658, "grad_norm": 8.8125, "learning_rate": 3.156458545819163e-06, "loss": 1.3631476163864136, "step": 4934 }, { "epoch": 1.519276644863409, "grad_norm": 6.09375, "learning_rate": 3.153655571470236e-06, "loss": 1.188265085220337, "step": 4936 }, { "epoch": 1.5198922662562524, "grad_norm": 5.625, "learning_rate": 3.1508554255975705e-06, "loss": 1.2975990772247314, "step": 4938 }, { "epoch": 1.5205078876490958, "grad_norm": 5.03125, "learning_rate": 3.148058110983735e-06, "loss": 1.2463330030441284, "step": 4940 }, { "epoch": 1.521123509041939, "grad_norm": 7.71875, "learning_rate": 3.1452636304084827e-06, "loss": 1.2518240213394165, "step": 4942 }, { "epoch": 1.5217391304347827, "grad_norm": 23.125, "learning_rate": 3.142471986648751e-06, "loss": 1.3494226932525635, "step": 4944 }, { "epoch": 1.522354751827626, "grad_norm": 6.15625, "learning_rate": 3.1396831824786612e-06, "loss": 1.3387173414230347, "step": 4946 }, { "epoch": 1.5229703732204696, "grad_norm": 3.96875, "learning_rate": 3.1368972206695097e-06, "loss": 1.245314359664917, "step": 4948 }, { "epoch": 1.5235859946133128, "grad_norm": 6.46875, "learning_rate": 3.134114103989767e-06, "loss": 1.087040901184082, "step": 4950 }, { "epoch": 1.5242016160061562, "grad_norm": 9.625, "learning_rate": 3.131333835205082e-06, "loss": 1.4781947135925293, "step": 4952 }, { "epoch": 1.5248172373989997, "grad_norm": 4.15625, "learning_rate": 3.128556417078269e-06, "loss": 1.0316975116729736, "step": 4954 }, { "epoch": 1.5254328587918429, "grad_norm": 5.84375, "learning_rate": 3.1257818523693094e-06, "loss": 1.1803417205810547, "step": 4956 }, { "epoch": 1.5260484801846865, "grad_norm": 3.515625, "learning_rate": 3.1230101438353516e-06, "loss": 1.2382619380950928, "step": 4958 }, { "epoch": 1.5266641015775297, "grad_norm": 11.6875, "learning_rate": 3.120241294230702e-06, "loss": 1.5910042524337769, "step": 4960 }, { "epoch": 1.5272797229703732, "grad_norm": 4.9375, "learning_rate": 3.1174753063068324e-06, "loss": 0.9350588321685791, "step": 4962 }, { "epoch": 1.5278953443632166, "grad_norm": 11.9375, "learning_rate": 3.114712182812364e-06, "loss": 1.262253999710083, "step": 4964 }, { "epoch": 1.52851096575606, "grad_norm": 11.875, "learning_rate": 3.1119519264930777e-06, "loss": 1.2304760217666626, "step": 4966 }, { "epoch": 1.5291265871489035, "grad_norm": 7.125, "learning_rate": 3.109194540091898e-06, "loss": 1.1309937238693237, "step": 4968 }, { "epoch": 1.5297422085417467, "grad_norm": 14.5, "learning_rate": 3.106440026348904e-06, "loss": 0.9601719975471497, "step": 4970 }, { "epoch": 1.5303578299345904, "grad_norm": 6.6875, "learning_rate": 3.103688388001318e-06, "loss": 1.187595248222351, "step": 4972 }, { "epoch": 1.5309734513274336, "grad_norm": 1.5703125, "learning_rate": 3.100939627783503e-06, "loss": 0.9698360562324524, "step": 4974 }, { "epoch": 1.531589072720277, "grad_norm": 7.4375, "learning_rate": 3.098193748426965e-06, "loss": 1.3194845914840698, "step": 4976 }, { "epoch": 1.5322046941131204, "grad_norm": 8.5625, "learning_rate": 3.095450752660347e-06, "loss": 1.1873408555984497, "step": 4978 }, { "epoch": 1.5328203155059639, "grad_norm": 11.875, "learning_rate": 3.0927106432094228e-06, "loss": 1.3280301094055176, "step": 4980 }, { "epoch": 1.5334359368988073, "grad_norm": 9.8125, "learning_rate": 3.0899734227971025e-06, "loss": 1.564595103263855, "step": 4982 }, { "epoch": 1.5340515582916505, "grad_norm": 5.6875, "learning_rate": 3.087239094143421e-06, "loss": 1.1651568412780762, "step": 4984 }, { "epoch": 1.5346671796844942, "grad_norm": 10.125, "learning_rate": 3.084507659965545e-06, "loss": 1.458910346031189, "step": 4986 }, { "epoch": 1.5352828010773374, "grad_norm": 9.375, "learning_rate": 3.0817791229777595e-06, "loss": 1.3746843338012695, "step": 4988 }, { "epoch": 1.5358984224701808, "grad_norm": 5.96875, "learning_rate": 3.0790534858914742e-06, "loss": 1.043514609336853, "step": 4990 }, { "epoch": 1.5365140438630243, "grad_norm": 2.46875, "learning_rate": 3.0763307514152163e-06, "loss": 1.1240226030349731, "step": 4992 }, { "epoch": 1.5371296652558677, "grad_norm": 4.40625, "learning_rate": 3.0736109222546267e-06, "loss": 1.2111371755599976, "step": 4994 }, { "epoch": 1.5377452866487111, "grad_norm": 9.1875, "learning_rate": 3.0708940011124613e-06, "loss": 1.4809215068817139, "step": 4996 }, { "epoch": 1.5383609080415543, "grad_norm": 4.125, "learning_rate": 3.0681799906885846e-06, "loss": 1.032042145729065, "step": 4998 }, { "epoch": 1.538976529434398, "grad_norm": 3.171875, "learning_rate": 3.0654688936799704e-06, "loss": 1.322123646736145, "step": 5000 }, { "epoch": 1.5395921508272412, "grad_norm": 9.9375, "learning_rate": 3.062760712780697e-06, "loss": 1.3321819305419922, "step": 5002 }, { "epoch": 1.5402077722200846, "grad_norm": 4.375, "learning_rate": 3.060055450681943e-06, "loss": 0.8392131924629211, "step": 5004 }, { "epoch": 1.540823393612928, "grad_norm": 5.46875, "learning_rate": 3.0573531100719915e-06, "loss": 1.1302207708358765, "step": 5006 }, { "epoch": 1.5414390150057713, "grad_norm": 1.875, "learning_rate": 3.054653693636214e-06, "loss": 1.157600998878479, "step": 5008 }, { "epoch": 1.542054636398615, "grad_norm": 5.21875, "learning_rate": 3.051957204057084e-06, "loss": 1.2203869819641113, "step": 5010 }, { "epoch": 1.5426702577914582, "grad_norm": 33.0, "learning_rate": 3.0492636440141637e-06, "loss": 1.2869846820831299, "step": 5012 }, { "epoch": 1.5432858791843016, "grad_norm": 3.640625, "learning_rate": 3.0465730161841023e-06, "loss": 1.4599010944366455, "step": 5014 }, { "epoch": 1.543901500577145, "grad_norm": 5.75, "learning_rate": 3.0438853232406395e-06, "loss": 1.4534536600112915, "step": 5016 }, { "epoch": 1.5445171219699885, "grad_norm": 14.9375, "learning_rate": 3.0412005678545947e-06, "loss": 1.5622611045837402, "step": 5018 }, { "epoch": 1.545132743362832, "grad_norm": 4.8125, "learning_rate": 3.038518752693869e-06, "loss": 1.2805486917495728, "step": 5020 }, { "epoch": 1.5457483647556751, "grad_norm": 5.21875, "learning_rate": 3.035839880423443e-06, "loss": 1.184311866760254, "step": 5022 }, { "epoch": 1.5463639861485188, "grad_norm": 6.15625, "learning_rate": 3.033163953705372e-06, "loss": 1.2769352197647095, "step": 5024 }, { "epoch": 1.546979607541362, "grad_norm": 7.1875, "learning_rate": 3.0304909751987842e-06, "loss": 1.059838056564331, "step": 5026 }, { "epoch": 1.5475952289342054, "grad_norm": 9.625, "learning_rate": 3.027820947559878e-06, "loss": 1.6449565887451172, "step": 5028 }, { "epoch": 1.5482108503270489, "grad_norm": 6.5, "learning_rate": 3.0251538734419205e-06, "loss": 1.2422211170196533, "step": 5030 }, { "epoch": 1.5488264717198923, "grad_norm": 6.625, "learning_rate": 3.0224897554952433e-06, "loss": 1.6606228351593018, "step": 5032 }, { "epoch": 1.5494420931127357, "grad_norm": 4.3125, "learning_rate": 3.0198285963672386e-06, "loss": 1.3177484273910522, "step": 5034 }, { "epoch": 1.550057714505579, "grad_norm": 9.25, "learning_rate": 3.0171703987023615e-06, "loss": 1.2722313404083252, "step": 5036 }, { "epoch": 1.5506733358984226, "grad_norm": 9.125, "learning_rate": 3.0145151651421202e-06, "loss": 1.1697152853012085, "step": 5038 }, { "epoch": 1.5512889572912658, "grad_norm": 6.6875, "learning_rate": 3.0118628983250826e-06, "loss": 1.511368751525879, "step": 5040 }, { "epoch": 1.5519045786841092, "grad_norm": 8.5, "learning_rate": 3.0092136008868635e-06, "loss": 1.0446054935455322, "step": 5042 }, { "epoch": 1.5525202000769527, "grad_norm": 7.84375, "learning_rate": 3.0065672754601326e-06, "loss": 1.1255905628204346, "step": 5044 }, { "epoch": 1.5531358214697961, "grad_norm": 12.3125, "learning_rate": 3.003923924674598e-06, "loss": 1.307901382446289, "step": 5046 }, { "epoch": 1.5537514428626396, "grad_norm": 9.0625, "learning_rate": 3.0012835511570193e-06, "loss": 1.0257079601287842, "step": 5048 }, { "epoch": 1.5543670642554828, "grad_norm": 4.96875, "learning_rate": 2.9986461575311955e-06, "loss": 1.1753787994384766, "step": 5050 }, { "epoch": 1.5549826856483264, "grad_norm": 14.4375, "learning_rate": 2.9960117464179615e-06, "loss": 0.9505599737167358, "step": 5052 }, { "epoch": 1.5555983070411696, "grad_norm": 5.9375, "learning_rate": 2.993380320435193e-06, "loss": 1.4326746463775635, "step": 5054 }, { "epoch": 1.556213928434013, "grad_norm": 8.625, "learning_rate": 2.990751882197796e-06, "loss": 1.1694228649139404, "step": 5056 }, { "epoch": 1.5568295498268565, "grad_norm": 6.375, "learning_rate": 2.9881264343177087e-06, "loss": 1.2966336011886597, "step": 5058 }, { "epoch": 1.5574451712196997, "grad_norm": 13.75, "learning_rate": 2.9855039794038964e-06, "loss": 1.4063342809677124, "step": 5060 }, { "epoch": 1.5580607926125434, "grad_norm": 21.75, "learning_rate": 2.982884520062352e-06, "loss": 1.6873064041137695, "step": 5062 }, { "epoch": 1.5586764140053866, "grad_norm": 18.25, "learning_rate": 2.980268058896092e-06, "loss": 1.5693508386611938, "step": 5064 }, { "epoch": 1.5592920353982302, "grad_norm": 51.25, "learning_rate": 2.9776545985051515e-06, "loss": 1.1830573081970215, "step": 5066 }, { "epoch": 1.5599076567910735, "grad_norm": 7.0625, "learning_rate": 2.975044141486584e-06, "loss": 1.215011477470398, "step": 5068 }, { "epoch": 1.560523278183917, "grad_norm": 3.84375, "learning_rate": 2.972436690434462e-06, "loss": 1.1971906423568726, "step": 5070 }, { "epoch": 1.5611388995767603, "grad_norm": 10.6875, "learning_rate": 2.969832247939864e-06, "loss": 1.50948965549469, "step": 5072 }, { "epoch": 1.5617545209696035, "grad_norm": 4.5625, "learning_rate": 2.9672308165908857e-06, "loss": 1.0662416219711304, "step": 5074 }, { "epoch": 1.5623701423624472, "grad_norm": 3.46875, "learning_rate": 2.9646323989726267e-06, "loss": 1.0881036520004272, "step": 5076 }, { "epoch": 1.5629857637552904, "grad_norm": 16.5, "learning_rate": 2.962036997667193e-06, "loss": 1.2012338638305664, "step": 5078 }, { "epoch": 1.5636013851481338, "grad_norm": 9.625, "learning_rate": 2.959444615253694e-06, "loss": 1.7494795322418213, "step": 5080 }, { "epoch": 1.5642170065409773, "grad_norm": 19.125, "learning_rate": 2.9568552543082375e-06, "loss": 1.1780784130096436, "step": 5082 }, { "epoch": 1.5648326279338207, "grad_norm": 4.53125, "learning_rate": 2.954268917403929e-06, "loss": 1.244818925857544, "step": 5084 }, { "epoch": 1.5654482493266642, "grad_norm": 5.09375, "learning_rate": 2.951685607110869e-06, "loss": 1.466505765914917, "step": 5086 }, { "epoch": 1.5660638707195074, "grad_norm": 3.78125, "learning_rate": 2.949105325996153e-06, "loss": 1.434507131576538, "step": 5088 }, { "epoch": 1.566679492112351, "grad_norm": 7.6875, "learning_rate": 2.9465280766238625e-06, "loss": 1.105286955833435, "step": 5090 }, { "epoch": 1.5672951135051942, "grad_norm": 5.90625, "learning_rate": 2.9439538615550674e-06, "loss": 1.214867353439331, "step": 5092 }, { "epoch": 1.5679107348980377, "grad_norm": 4.125, "learning_rate": 2.9413826833478246e-06, "loss": 0.799168586730957, "step": 5094 }, { "epoch": 1.568526356290881, "grad_norm": 35.5, "learning_rate": 2.9388145445571715e-06, "loss": 1.2401793003082275, "step": 5096 }, { "epoch": 1.5691419776837245, "grad_norm": 5.8125, "learning_rate": 2.9362494477351245e-06, "loss": 1.4166289567947388, "step": 5098 }, { "epoch": 1.569757599076568, "grad_norm": 4.8125, "learning_rate": 2.9336873954306765e-06, "loss": 1.2858507633209229, "step": 5100 }, { "epoch": 1.5703732204694112, "grad_norm": 5.28125, "learning_rate": 2.9311283901897985e-06, "loss": 0.9255216121673584, "step": 5102 }, { "epoch": 1.5709888418622548, "grad_norm": 6.40625, "learning_rate": 2.928572434555431e-06, "loss": 1.2053285837173462, "step": 5104 }, { "epoch": 1.571604463255098, "grad_norm": 5.5, "learning_rate": 2.926019531067482e-06, "loss": 1.1776148080825806, "step": 5106 }, { "epoch": 1.5722200846479415, "grad_norm": 3.953125, "learning_rate": 2.9234696822628334e-06, "loss": 1.183230996131897, "step": 5108 }, { "epoch": 1.572835706040785, "grad_norm": 7.1875, "learning_rate": 2.9209228906753225e-06, "loss": 1.1474194526672363, "step": 5110 }, { "epoch": 1.5734513274336284, "grad_norm": 20.25, "learning_rate": 2.918379158835756e-06, "loss": 1.014983057975769, "step": 5112 }, { "epoch": 1.5740669488264718, "grad_norm": 4.59375, "learning_rate": 2.9158384892718966e-06, "loss": 1.2621625661849976, "step": 5114 }, { "epoch": 1.574682570219315, "grad_norm": 4.34375, "learning_rate": 2.9133008845084632e-06, "loss": 1.2142401933670044, "step": 5116 }, { "epoch": 1.5752981916121587, "grad_norm": 3.421875, "learning_rate": 2.9107663470671334e-06, "loss": 1.1952993869781494, "step": 5118 }, { "epoch": 1.5759138130050019, "grad_norm": 19.25, "learning_rate": 2.9082348794665317e-06, "loss": 1.0763602256774902, "step": 5120 }, { "epoch": 1.5765294343978453, "grad_norm": 5.125, "learning_rate": 2.905706484222235e-06, "loss": 0.9360074400901794, "step": 5122 }, { "epoch": 1.5771450557906888, "grad_norm": 4.40625, "learning_rate": 2.903181163846766e-06, "loss": 1.23157799243927, "step": 5124 }, { "epoch": 1.577760677183532, "grad_norm": 5.03125, "learning_rate": 2.9006589208495907e-06, "loss": 0.7338969707489014, "step": 5126 }, { "epoch": 1.5783762985763756, "grad_norm": 29.625, "learning_rate": 2.898139757737122e-06, "loss": 1.6849936246871948, "step": 5128 }, { "epoch": 1.5789919199692188, "grad_norm": 11.5, "learning_rate": 2.895623677012705e-06, "loss": 1.4091507196426392, "step": 5130 }, { "epoch": 1.5796075413620625, "grad_norm": 10.4375, "learning_rate": 2.8931106811766292e-06, "loss": 1.269464135169983, "step": 5132 }, { "epoch": 1.5802231627549057, "grad_norm": 10.5, "learning_rate": 2.890600772726113e-06, "loss": 0.9996485710144043, "step": 5134 }, { "epoch": 1.5808387841477491, "grad_norm": 8.625, "learning_rate": 2.8880939541553075e-06, "loss": 1.2088068723678589, "step": 5136 }, { "epoch": 1.5814544055405926, "grad_norm": 13.5, "learning_rate": 2.8855902279552966e-06, "loss": 1.5215330123901367, "step": 5138 }, { "epoch": 1.5820700269334358, "grad_norm": 18.0, "learning_rate": 2.883089596614087e-06, "loss": 1.3437632322311401, "step": 5140 }, { "epoch": 1.5826856483262794, "grad_norm": 10.8125, "learning_rate": 2.8805920626166144e-06, "loss": 1.1248289346694946, "step": 5142 }, { "epoch": 1.5833012697191227, "grad_norm": 11.5625, "learning_rate": 2.8780976284447337e-06, "loss": 1.271138310432434, "step": 5144 }, { "epoch": 1.583916891111966, "grad_norm": 8.6875, "learning_rate": 2.875606296577218e-06, "loss": 1.2558194398880005, "step": 5146 }, { "epoch": 1.5845325125048095, "grad_norm": 8.875, "learning_rate": 2.873118069489764e-06, "loss": 1.2829164266586304, "step": 5148 }, { "epoch": 1.585148133897653, "grad_norm": 12.125, "learning_rate": 2.8706329496549734e-06, "loss": 1.261286735534668, "step": 5150 }, { "epoch": 1.5857637552904964, "grad_norm": 13.5, "learning_rate": 2.8681509395423695e-06, "loss": 1.7474175691604614, "step": 5152 }, { "epoch": 1.5863793766833396, "grad_norm": 9.25, "learning_rate": 2.8656720416183786e-06, "loss": 1.4634180068969727, "step": 5154 }, { "epoch": 1.5869949980761833, "grad_norm": 3.375, "learning_rate": 2.8631962583463396e-06, "loss": 1.2069108486175537, "step": 5156 }, { "epoch": 1.5876106194690265, "grad_norm": 3.125, "learning_rate": 2.8607235921864934e-06, "loss": 1.0664864778518677, "step": 5158 }, { "epoch": 1.58822624086187, "grad_norm": 3.921875, "learning_rate": 2.8582540455959824e-06, "loss": 1.1400189399719238, "step": 5160 }, { "epoch": 1.5888418622547134, "grad_norm": 12.25, "learning_rate": 2.8557876210288513e-06, "loss": 1.2755979299545288, "step": 5162 }, { "epoch": 1.5894574836475568, "grad_norm": 8.3125, "learning_rate": 2.85332432093604e-06, "loss": 1.5039992332458496, "step": 5164 }, { "epoch": 1.5900731050404002, "grad_norm": 2.859375, "learning_rate": 2.850864147765388e-06, "loss": 1.158522367477417, "step": 5166 }, { "epoch": 1.5906887264332434, "grad_norm": 6.5625, "learning_rate": 2.8484071039616227e-06, "loss": 1.0927605628967285, "step": 5168 }, { "epoch": 1.591304347826087, "grad_norm": 7.5625, "learning_rate": 2.8459531919663626e-06, "loss": 1.0280814170837402, "step": 5170 }, { "epoch": 1.5919199692189303, "grad_norm": 4.21875, "learning_rate": 2.8435024142181174e-06, "loss": 1.1364541053771973, "step": 5172 }, { "epoch": 1.5925355906117737, "grad_norm": 4.71875, "learning_rate": 2.8410547731522787e-06, "loss": 1.2341628074645996, "step": 5174 }, { "epoch": 1.5931512120046172, "grad_norm": 13.875, "learning_rate": 2.8386102712011215e-06, "loss": 1.1459120512008667, "step": 5176 }, { "epoch": 1.5937668333974606, "grad_norm": 6.3125, "learning_rate": 2.836168910793804e-06, "loss": 1.302492380142212, "step": 5178 }, { "epoch": 1.594382454790304, "grad_norm": 5.84375, "learning_rate": 2.833730694356358e-06, "loss": 1.0967020988464355, "step": 5180 }, { "epoch": 1.5949980761831473, "grad_norm": 5.46875, "learning_rate": 2.831295624311697e-06, "loss": 1.4645941257476807, "step": 5182 }, { "epoch": 1.595613697575991, "grad_norm": 6.4375, "learning_rate": 2.8288637030796023e-06, "loss": 1.1216599941253662, "step": 5184 }, { "epoch": 1.5962293189688341, "grad_norm": 6.5625, "learning_rate": 2.8264349330767316e-06, "loss": 1.520884394645691, "step": 5186 }, { "epoch": 1.5968449403616776, "grad_norm": 10.6875, "learning_rate": 2.8240093167166037e-06, "loss": 1.397294044494629, "step": 5188 }, { "epoch": 1.597460561754521, "grad_norm": 5.4375, "learning_rate": 2.821586856409611e-06, "loss": 1.2669717073440552, "step": 5190 }, { "epoch": 1.5980761831473642, "grad_norm": 4.65625, "learning_rate": 2.8191675545630066e-06, "loss": 1.2159026861190796, "step": 5192 }, { "epoch": 1.5986918045402079, "grad_norm": 7.4375, "learning_rate": 2.8167514135809025e-06, "loss": 1.2182968854904175, "step": 5194 }, { "epoch": 1.599307425933051, "grad_norm": 15.3125, "learning_rate": 2.8143384358642757e-06, "loss": 1.1196813583374023, "step": 5196 }, { "epoch": 1.5999230473258945, "grad_norm": 6.4375, "learning_rate": 2.811928623810954e-06, "loss": 1.0934268236160278, "step": 5198 }, { "epoch": 1.600538668718738, "grad_norm": 5.6875, "learning_rate": 2.8095219798156213e-06, "loss": 1.3068904876708984, "step": 5200 }, { "epoch": 1.6011542901115814, "grad_norm": 7.09375, "learning_rate": 2.8071185062698158e-06, "loss": 1.1975951194763184, "step": 5202 }, { "epoch": 1.6017699115044248, "grad_norm": 5.34375, "learning_rate": 2.8047182055619203e-06, "loss": 1.0088521242141724, "step": 5204 }, { "epoch": 1.602385532897268, "grad_norm": 2.78125, "learning_rate": 2.8023210800771694e-06, "loss": 1.170898199081421, "step": 5206 }, { "epoch": 1.6030011542901117, "grad_norm": 5.46875, "learning_rate": 2.7999271321976397e-06, "loss": 1.3110368251800537, "step": 5208 }, { "epoch": 1.603616775682955, "grad_norm": 13.1875, "learning_rate": 2.797536364302251e-06, "loss": 1.5783711671829224, "step": 5210 }, { "epoch": 1.6042323970757983, "grad_norm": 2.96875, "learning_rate": 2.7951487787667646e-06, "loss": 1.162759780883789, "step": 5212 }, { "epoch": 1.6048480184686418, "grad_norm": 10.1875, "learning_rate": 2.7927643779637736e-06, "loss": 1.3883463144302368, "step": 5214 }, { "epoch": 1.6054636398614852, "grad_norm": 7.4375, "learning_rate": 2.7903831642627144e-06, "loss": 0.6480385661125183, "step": 5216 }, { "epoch": 1.6060792612543286, "grad_norm": 8.0625, "learning_rate": 2.78800514002985e-06, "loss": 1.0798194408416748, "step": 5218 }, { "epoch": 1.6066948826471719, "grad_norm": 6.28125, "learning_rate": 2.7856303076282786e-06, "loss": 1.295226812362671, "step": 5220 }, { "epoch": 1.6073105040400155, "grad_norm": 8.0, "learning_rate": 2.7832586694179235e-06, "loss": 1.0225231647491455, "step": 5222 }, { "epoch": 1.6079261254328587, "grad_norm": 10.4375, "learning_rate": 2.780890227755533e-06, "loss": 1.0516223907470703, "step": 5224 }, { "epoch": 1.6085417468257022, "grad_norm": 2.9375, "learning_rate": 2.778524984994685e-06, "loss": 1.1044272184371948, "step": 5226 }, { "epoch": 1.6091573682185456, "grad_norm": 2.984375, "learning_rate": 2.776162943485769e-06, "loss": 0.9470773339271545, "step": 5228 }, { "epoch": 1.609772989611389, "grad_norm": 7.03125, "learning_rate": 2.773804105576002e-06, "loss": 1.5163894891738892, "step": 5230 }, { "epoch": 1.6103886110042325, "grad_norm": 16.875, "learning_rate": 2.771448473609413e-06, "loss": 1.079423427581787, "step": 5232 }, { "epoch": 1.6110042323970757, "grad_norm": 8.875, "learning_rate": 2.7690960499268453e-06, "loss": 1.9689688682556152, "step": 5234 }, { "epoch": 1.6116198537899193, "grad_norm": 5.6875, "learning_rate": 2.766746836865958e-06, "loss": 1.467721700668335, "step": 5236 }, { "epoch": 1.6122354751827626, "grad_norm": 5.65625, "learning_rate": 2.764400836761214e-06, "loss": 1.2351285219192505, "step": 5238 }, { "epoch": 1.612851096575606, "grad_norm": 3.4375, "learning_rate": 2.762058051943888e-06, "loss": 1.2483223676681519, "step": 5240 }, { "epoch": 1.6134667179684494, "grad_norm": 5.09375, "learning_rate": 2.7597184847420557e-06, "loss": 1.146704077720642, "step": 5242 }, { "epoch": 1.6140823393612926, "grad_norm": 5.375, "learning_rate": 2.7573821374805997e-06, "loss": 1.1703463792800903, "step": 5244 }, { "epoch": 1.6146979607541363, "grad_norm": 7.1875, "learning_rate": 2.7550490124811992e-06, "loss": 1.1746296882629395, "step": 5246 }, { "epoch": 1.6153135821469795, "grad_norm": 5.5625, "learning_rate": 2.7527191120623325e-06, "loss": 1.4364252090454102, "step": 5248 }, { "epoch": 1.6159292035398232, "grad_norm": 5.8125, "learning_rate": 2.7503924385392757e-06, "loss": 1.6019375324249268, "step": 5250 }, { "epoch": 1.6165448249326664, "grad_norm": 6.71875, "learning_rate": 2.7480689942240957e-06, "loss": 0.9271045923233032, "step": 5252 }, { "epoch": 1.6171604463255098, "grad_norm": 3.65625, "learning_rate": 2.74574878142565e-06, "loss": 1.2441916465759277, "step": 5254 }, { "epoch": 1.6177760677183533, "grad_norm": 9.8125, "learning_rate": 2.7434318024495875e-06, "loss": 1.3981280326843262, "step": 5256 }, { "epoch": 1.6183916891111965, "grad_norm": 20.75, "learning_rate": 2.741118059598341e-06, "loss": 0.778854489326477, "step": 5258 }, { "epoch": 1.6190073105040401, "grad_norm": 6.3125, "learning_rate": 2.7388075551711302e-06, "loss": 1.2868058681488037, "step": 5260 }, { "epoch": 1.6196229318968833, "grad_norm": 12.75, "learning_rate": 2.736500291463953e-06, "loss": 1.343778133392334, "step": 5262 }, { "epoch": 1.6202385532897268, "grad_norm": 15.8125, "learning_rate": 2.734196270769591e-06, "loss": 1.4329502582550049, "step": 5264 }, { "epoch": 1.6208541746825702, "grad_norm": 2.546875, "learning_rate": 2.7318954953776013e-06, "loss": 1.150846004486084, "step": 5266 }, { "epoch": 1.6214697960754136, "grad_norm": 6.03125, "learning_rate": 2.729597967574313e-06, "loss": 1.2093613147735596, "step": 5268 }, { "epoch": 1.622085417468257, "grad_norm": 12.125, "learning_rate": 2.7273036896428343e-06, "loss": 1.6008520126342773, "step": 5270 }, { "epoch": 1.6227010388611003, "grad_norm": 6.9375, "learning_rate": 2.725012663863038e-06, "loss": 1.0350145101547241, "step": 5272 }, { "epoch": 1.623316660253944, "grad_norm": 11.0, "learning_rate": 2.7227248925115713e-06, "loss": 1.0717015266418457, "step": 5274 }, { "epoch": 1.6239322816467872, "grad_norm": 7.75, "learning_rate": 2.720440377861841e-06, "loss": 1.5751841068267822, "step": 5276 }, { "epoch": 1.6245479030396306, "grad_norm": 9.4375, "learning_rate": 2.7181591221840215e-06, "loss": 1.2236948013305664, "step": 5278 }, { "epoch": 1.625163524432474, "grad_norm": 5.9375, "learning_rate": 2.7158811277450476e-06, "loss": 1.093230962753296, "step": 5280 }, { "epoch": 1.6257791458253175, "grad_norm": 7.90625, "learning_rate": 2.713606396808612e-06, "loss": 1.3141717910766602, "step": 5282 }, { "epoch": 1.626394767218161, "grad_norm": 56.75, "learning_rate": 2.711334931635168e-06, "loss": 1.1603947877883911, "step": 5284 }, { "epoch": 1.627010388611004, "grad_norm": 4.84375, "learning_rate": 2.709066734481921e-06, "loss": 1.1331205368041992, "step": 5286 }, { "epoch": 1.6276260100038478, "grad_norm": 11.5, "learning_rate": 2.706801807602828e-06, "loss": 1.4139946699142456, "step": 5288 }, { "epoch": 1.628241631396691, "grad_norm": 7.9375, "learning_rate": 2.7045401532486e-06, "loss": 0.35363346338272095, "step": 5290 }, { "epoch": 1.6288572527895344, "grad_norm": 5.375, "learning_rate": 2.7022817736666905e-06, "loss": 1.1643867492675781, "step": 5292 }, { "epoch": 1.6294728741823779, "grad_norm": 5.9375, "learning_rate": 2.7000266711013046e-06, "loss": 1.1620657444000244, "step": 5294 }, { "epoch": 1.6300884955752213, "grad_norm": 9.375, "learning_rate": 2.6977748477933863e-06, "loss": 1.6389256715774536, "step": 5296 }, { "epoch": 1.6307041169680647, "grad_norm": 4.03125, "learning_rate": 2.6955263059806247e-06, "loss": 1.0903058052062988, "step": 5298 }, { "epoch": 1.631319738360908, "grad_norm": 8.75, "learning_rate": 2.693281047897446e-06, "loss": 1.423241138458252, "step": 5300 }, { "epoch": 1.6319353597537516, "grad_norm": 4.40625, "learning_rate": 2.691039075775012e-06, "loss": 1.153511881828308, "step": 5302 }, { "epoch": 1.6325509811465948, "grad_norm": 3.0, "learning_rate": 2.688800391841222e-06, "loss": 1.011278748512268, "step": 5304 }, { "epoch": 1.6331666025394382, "grad_norm": 2.203125, "learning_rate": 2.686564998320705e-06, "loss": 1.1042957305908203, "step": 5306 }, { "epoch": 1.6337822239322817, "grad_norm": 12.125, "learning_rate": 2.684332897434823e-06, "loss": 1.385214924812317, "step": 5308 }, { "epoch": 1.6343978453251249, "grad_norm": 8.375, "learning_rate": 2.682104091401665e-06, "loss": 1.499825358390808, "step": 5310 }, { "epoch": 1.6350134667179685, "grad_norm": 8.5, "learning_rate": 2.679878582436043e-06, "loss": 1.6597236394882202, "step": 5312 }, { "epoch": 1.6356290881108118, "grad_norm": 6.03125, "learning_rate": 2.6776563727494987e-06, "loss": 1.1139764785766602, "step": 5314 }, { "epoch": 1.6362447095036554, "grad_norm": 6.40625, "learning_rate": 2.6754374645502896e-06, "loss": 1.1137142181396484, "step": 5316 }, { "epoch": 1.6368603308964986, "grad_norm": 7.15625, "learning_rate": 2.673221860043394e-06, "loss": 1.0289076566696167, "step": 5318 }, { "epoch": 1.637475952289342, "grad_norm": 4.875, "learning_rate": 2.6710095614305085e-06, "loss": 1.285740852355957, "step": 5320 }, { "epoch": 1.6380915736821855, "grad_norm": 26.25, "learning_rate": 2.6688005709100445e-06, "loss": 1.420121669769287, "step": 5322 }, { "epoch": 1.6387071950750287, "grad_norm": 17.375, "learning_rate": 2.6665948906771257e-06, "loss": 1.826535940170288, "step": 5324 }, { "epoch": 1.6393228164678724, "grad_norm": 12.625, "learning_rate": 2.6643925229235827e-06, "loss": 1.2961101531982422, "step": 5326 }, { "epoch": 1.6399384378607156, "grad_norm": 10.375, "learning_rate": 2.662193469837963e-06, "loss": 1.4630703926086426, "step": 5328 }, { "epoch": 1.640554059253559, "grad_norm": 4.0, "learning_rate": 2.65999773360551e-06, "loss": 1.136149525642395, "step": 5330 }, { "epoch": 1.6411696806464025, "grad_norm": 10.625, "learning_rate": 2.6578053164081784e-06, "loss": 1.469642162322998, "step": 5332 }, { "epoch": 1.6417853020392459, "grad_norm": 4.9375, "learning_rate": 2.6556162204246223e-06, "loss": 1.253054141998291, "step": 5334 }, { "epoch": 1.6424009234320893, "grad_norm": 17.875, "learning_rate": 2.6534304478301942e-06, "loss": 1.297611951828003, "step": 5336 }, { "epoch": 1.6430165448249325, "grad_norm": 4.09375, "learning_rate": 2.6512480007969472e-06, "loss": 1.1495213508605957, "step": 5338 }, { "epoch": 1.6436321662177762, "grad_norm": 4.71875, "learning_rate": 2.6490688814936265e-06, "loss": 1.2631639242172241, "step": 5340 }, { "epoch": 1.6442477876106194, "grad_norm": 6.0625, "learning_rate": 2.6468930920856727e-06, "loss": 1.4238358736038208, "step": 5342 }, { "epoch": 1.6448634090034628, "grad_norm": 6.21875, "learning_rate": 2.644720634735216e-06, "loss": 1.4778305292129517, "step": 5344 }, { "epoch": 1.6454790303963063, "grad_norm": 3.703125, "learning_rate": 2.6425515116010748e-06, "loss": 1.061782717704773, "step": 5346 }, { "epoch": 1.6460946517891497, "grad_norm": 7.90625, "learning_rate": 2.640385724838757e-06, "loss": 1.1082851886749268, "step": 5348 }, { "epoch": 1.6467102731819931, "grad_norm": 8.8125, "learning_rate": 2.638223276600453e-06, "loss": 1.4387338161468506, "step": 5350 }, { "epoch": 1.6473258945748364, "grad_norm": 6.28125, "learning_rate": 2.6360641690350362e-06, "loss": 1.4443461894989014, "step": 5352 }, { "epoch": 1.64794151596768, "grad_norm": 12.375, "learning_rate": 2.63390840428806e-06, "loss": 1.0974818468093872, "step": 5354 }, { "epoch": 1.6485571373605232, "grad_norm": 16.25, "learning_rate": 2.6317559845017564e-06, "loss": 1.4540315866470337, "step": 5356 }, { "epoch": 1.6491727587533667, "grad_norm": 13.5625, "learning_rate": 2.6296069118150337e-06, "loss": 1.5910497903823853, "step": 5358 }, { "epoch": 1.64978838014621, "grad_norm": 15.4375, "learning_rate": 2.627461188363471e-06, "loss": 1.3330596685409546, "step": 5360 }, { "epoch": 1.6504040015390535, "grad_norm": 15.3125, "learning_rate": 2.6253188162793254e-06, "loss": 1.1511274576187134, "step": 5362 }, { "epoch": 1.651019622931897, "grad_norm": 58.5, "learning_rate": 2.6231797976915173e-06, "loss": 1.5621612071990967, "step": 5364 }, { "epoch": 1.6516352443247402, "grad_norm": 8.1875, "learning_rate": 2.621044134725639e-06, "loss": 1.622155785560608, "step": 5366 }, { "epoch": 1.6522508657175838, "grad_norm": 6.21875, "learning_rate": 2.6189118295039465e-06, "loss": 1.0462428331375122, "step": 5368 }, { "epoch": 1.652866487110427, "grad_norm": 9.0625, "learning_rate": 2.6167828841453575e-06, "loss": 1.3204021453857422, "step": 5370 }, { "epoch": 1.6534821085032705, "grad_norm": 16.5, "learning_rate": 2.614657300765455e-06, "loss": 1.005763053894043, "step": 5372 }, { "epoch": 1.654097729896114, "grad_norm": 2.484375, "learning_rate": 2.6125350814764777e-06, "loss": 1.2809065580368042, "step": 5374 }, { "epoch": 1.6547133512889571, "grad_norm": 9.125, "learning_rate": 2.6104162283873236e-06, "loss": 1.371181607246399, "step": 5376 }, { "epoch": 1.6553289726818008, "grad_norm": 9.6875, "learning_rate": 2.608300743603543e-06, "loss": 1.2377533912658691, "step": 5378 }, { "epoch": 1.655944594074644, "grad_norm": 8.75, "learning_rate": 2.606188629227342e-06, "loss": 1.4686627388000488, "step": 5380 }, { "epoch": 1.6565602154674874, "grad_norm": 5.46875, "learning_rate": 2.604079887357575e-06, "loss": 1.3642234802246094, "step": 5382 }, { "epoch": 1.6571758368603309, "grad_norm": 7.9375, "learning_rate": 2.601974520089745e-06, "loss": 1.4514119625091553, "step": 5384 }, { "epoch": 1.6577914582531743, "grad_norm": 6.46875, "learning_rate": 2.5998725295160053e-06, "loss": 1.166435956954956, "step": 5386 }, { "epoch": 1.6584070796460177, "grad_norm": 3.453125, "learning_rate": 2.5977739177251492e-06, "loss": 0.9771444797515869, "step": 5388 }, { "epoch": 1.659022701038861, "grad_norm": 7.96875, "learning_rate": 2.595678686802614e-06, "loss": 1.4036625623703003, "step": 5390 }, { "epoch": 1.6596383224317046, "grad_norm": 6.03125, "learning_rate": 2.5935868388304797e-06, "loss": 1.1340692043304443, "step": 5392 }, { "epoch": 1.6602539438245478, "grad_norm": 8.125, "learning_rate": 2.5914983758874612e-06, "loss": 1.44871187210083, "step": 5394 }, { "epoch": 1.6608695652173913, "grad_norm": 9.375, "learning_rate": 2.5894133000489108e-06, "loss": 1.542242169380188, "step": 5396 }, { "epoch": 1.6614851866102347, "grad_norm": 9.1875, "learning_rate": 2.5873316133868154e-06, "loss": 1.0620455741882324, "step": 5398 }, { "epoch": 1.6621008080030781, "grad_norm": 5.28125, "learning_rate": 2.585253317969793e-06, "loss": 1.1398513317108154, "step": 5400 }, { "epoch": 1.6627164293959216, "grad_norm": 28.125, "learning_rate": 2.583178415863093e-06, "loss": 1.1372501850128174, "step": 5402 }, { "epoch": 1.6633320507887648, "grad_norm": 8.8125, "learning_rate": 2.5811069091285916e-06, "loss": 1.3599408864974976, "step": 5404 }, { "epoch": 1.6639476721816084, "grad_norm": 4.875, "learning_rate": 2.5790387998247933e-06, "loss": 1.3852657079696655, "step": 5406 }, { "epoch": 1.6645632935744517, "grad_norm": 6.5625, "learning_rate": 2.5769740900068223e-06, "loss": 1.4934041500091553, "step": 5408 }, { "epoch": 1.665178914967295, "grad_norm": 6.78125, "learning_rate": 2.5749127817264284e-06, "loss": 1.2885091304779053, "step": 5410 }, { "epoch": 1.6657945363601385, "grad_norm": 5.46875, "learning_rate": 2.57285487703198e-06, "loss": 1.5227521657943726, "step": 5412 }, { "epoch": 1.666410157752982, "grad_norm": 8.25, "learning_rate": 2.570800377968461e-06, "loss": 1.2954890727996826, "step": 5414 }, { "epoch": 1.6670257791458254, "grad_norm": 5.40625, "learning_rate": 2.5687492865774765e-06, "loss": 1.2142372131347656, "step": 5416 }, { "epoch": 1.6676414005386686, "grad_norm": 5.5625, "learning_rate": 2.5667016048972394e-06, "loss": 1.4201480150222778, "step": 5418 }, { "epoch": 1.6682570219315123, "grad_norm": 9.6875, "learning_rate": 2.564657334962578e-06, "loss": 1.475806713104248, "step": 5420 }, { "epoch": 1.6688726433243555, "grad_norm": 7.75, "learning_rate": 2.562616478804929e-06, "loss": 1.4380114078521729, "step": 5422 }, { "epoch": 1.669488264717199, "grad_norm": 3.921875, "learning_rate": 2.560579038452336e-06, "loss": 1.3787497282028198, "step": 5424 }, { "epoch": 1.6701038861100423, "grad_norm": 9.8125, "learning_rate": 2.5585450159294506e-06, "loss": 0.8029026985168457, "step": 5426 }, { "epoch": 1.6707195075028856, "grad_norm": 20.0, "learning_rate": 2.556514413257525e-06, "loss": 1.5744322538375854, "step": 5428 }, { "epoch": 1.6713351288957292, "grad_norm": 3.96875, "learning_rate": 2.5544872324544168e-06, "loss": 1.4971494674682617, "step": 5430 }, { "epoch": 1.6719507502885724, "grad_norm": 11.75, "learning_rate": 2.552463475534581e-06, "loss": 1.7740942239761353, "step": 5432 }, { "epoch": 1.672566371681416, "grad_norm": 4.6875, "learning_rate": 2.5504431445090668e-06, "loss": 1.09224534034729, "step": 5434 }, { "epoch": 1.6731819930742593, "grad_norm": 4.625, "learning_rate": 2.5484262413855247e-06, "loss": 1.319622278213501, "step": 5436 }, { "epoch": 1.6737976144671027, "grad_norm": 3.59375, "learning_rate": 2.546412768168196e-06, "loss": 1.0411967039108276, "step": 5438 }, { "epoch": 1.6744132358599462, "grad_norm": 6.90625, "learning_rate": 2.5444027268579157e-06, "loss": 1.4855449199676514, "step": 5440 }, { "epoch": 1.6750288572527894, "grad_norm": 8.125, "learning_rate": 2.5423961194521064e-06, "loss": 1.4647443294525146, "step": 5442 }, { "epoch": 1.675644478645633, "grad_norm": 5.8125, "learning_rate": 2.5403929479447765e-06, "loss": 1.0761570930480957, "step": 5444 }, { "epoch": 1.6762601000384763, "grad_norm": 5.9375, "learning_rate": 2.538393214326527e-06, "loss": 0.6961817741394043, "step": 5446 }, { "epoch": 1.6768757214313197, "grad_norm": 9.375, "learning_rate": 2.5363969205845317e-06, "loss": 0.7719630599021912, "step": 5448 }, { "epoch": 1.6774913428241631, "grad_norm": 5.375, "learning_rate": 2.5344040687025577e-06, "loss": 1.4532030820846558, "step": 5450 }, { "epoch": 1.6781069642170066, "grad_norm": 4.4375, "learning_rate": 2.5324146606609452e-06, "loss": 1.089078664779663, "step": 5452 }, { "epoch": 1.67872258560985, "grad_norm": 8.5625, "learning_rate": 2.530428698436612e-06, "loss": 0.639711320400238, "step": 5454 }, { "epoch": 1.6793382070026932, "grad_norm": 8.375, "learning_rate": 2.5284461840030557e-06, "loss": 1.4705522060394287, "step": 5456 }, { "epoch": 1.6799538283955369, "grad_norm": 5.65625, "learning_rate": 2.5264671193303434e-06, "loss": 1.4255211353302002, "step": 5458 }, { "epoch": 1.68056944978838, "grad_norm": 24.375, "learning_rate": 2.524491506385117e-06, "loss": 1.188090205192566, "step": 5460 }, { "epoch": 1.6811850711812235, "grad_norm": 11.8125, "learning_rate": 2.522519347130587e-06, "loss": 1.0061157941818237, "step": 5462 }, { "epoch": 1.681800692574067, "grad_norm": 5.59375, "learning_rate": 2.5205506435265325e-06, "loss": 1.0411052703857422, "step": 5464 }, { "epoch": 1.6824163139669104, "grad_norm": 2.40625, "learning_rate": 2.5185853975292984e-06, "loss": 1.256507396697998, "step": 5466 }, { "epoch": 1.6830319353597538, "grad_norm": 4.96875, "learning_rate": 2.516623611091793e-06, "loss": 1.1471688747406006, "step": 5468 }, { "epoch": 1.683647556752597, "grad_norm": 5.34375, "learning_rate": 2.5146652861634887e-06, "loss": 1.1057099103927612, "step": 5470 }, { "epoch": 1.6842631781454407, "grad_norm": 20.75, "learning_rate": 2.512710424690416e-06, "loss": 1.255007028579712, "step": 5472 }, { "epoch": 1.684878799538284, "grad_norm": 3.46875, "learning_rate": 2.510759028615165e-06, "loss": 1.100907564163208, "step": 5474 }, { "epoch": 1.6854944209311273, "grad_norm": 10.8125, "learning_rate": 2.5088110998768817e-06, "loss": 1.236773133277893, "step": 5476 }, { "epoch": 1.6861100423239708, "grad_norm": 10.4375, "learning_rate": 2.506866640411265e-06, "loss": 1.5754953622817993, "step": 5478 }, { "epoch": 1.6867256637168142, "grad_norm": 4.78125, "learning_rate": 2.50492565215057e-06, "loss": 1.3212549686431885, "step": 5480 }, { "epoch": 1.6873412851096576, "grad_norm": 13.0, "learning_rate": 2.5029881370235993e-06, "loss": 1.6552295684814453, "step": 5482 }, { "epoch": 1.6879569065025009, "grad_norm": 8.875, "learning_rate": 2.5010540969557064e-06, "loss": 1.6298810243606567, "step": 5484 }, { "epoch": 1.6885725278953445, "grad_norm": 5.125, "learning_rate": 2.4991235338687886e-06, "loss": 1.4609882831573486, "step": 5486 }, { "epoch": 1.6891881492881877, "grad_norm": 12.4375, "learning_rate": 2.497196449681289e-06, "loss": 1.3491747379302979, "step": 5488 }, { "epoch": 1.6898037706810312, "grad_norm": 5.1875, "learning_rate": 2.4952728463081964e-06, "loss": 1.4812688827514648, "step": 5490 }, { "epoch": 1.6904193920738746, "grad_norm": 6.5, "learning_rate": 2.4933527256610377e-06, "loss": 1.118019461631775, "step": 5492 }, { "epoch": 1.6910350134667178, "grad_norm": 9.9375, "learning_rate": 2.49143608964788e-06, "loss": 1.4161806106567383, "step": 5494 }, { "epoch": 1.6916506348595615, "grad_norm": 14.1875, "learning_rate": 2.4895229401733278e-06, "loss": 1.4623079299926758, "step": 5496 }, { "epoch": 1.6922662562524047, "grad_norm": 9.5, "learning_rate": 2.48761327913852e-06, "loss": 1.365538239479065, "step": 5498 }, { "epoch": 1.6928818776452483, "grad_norm": 19.0, "learning_rate": 2.4857071084411302e-06, "loss": 1.220156192779541, "step": 5500 }, { "epoch": 1.6934974990380915, "grad_norm": 4.59375, "learning_rate": 2.4838044299753615e-06, "loss": 1.311576008796692, "step": 5502 }, { "epoch": 1.694113120430935, "grad_norm": 18.125, "learning_rate": 2.48190524563195e-06, "loss": 0.9696519374847412, "step": 5504 }, { "epoch": 1.6947287418237784, "grad_norm": 3.375, "learning_rate": 2.4800095572981567e-06, "loss": 1.153354287147522, "step": 5506 }, { "epoch": 1.6953443632166216, "grad_norm": 14.0, "learning_rate": 2.4781173668577692e-06, "loss": 1.186468482017517, "step": 5508 }, { "epoch": 1.6959599846094653, "grad_norm": 3.796875, "learning_rate": 2.476228676191102e-06, "loss": 1.2508482933044434, "step": 5510 }, { "epoch": 1.6965756060023085, "grad_norm": 6.375, "learning_rate": 2.474343487174985e-06, "loss": 1.5084669589996338, "step": 5512 }, { "epoch": 1.697191227395152, "grad_norm": 7.875, "learning_rate": 2.4724618016827775e-06, "loss": 1.4095842838287354, "step": 5514 }, { "epoch": 1.6978068487879954, "grad_norm": 8.5625, "learning_rate": 2.470583621584349e-06, "loss": 1.1798251867294312, "step": 5516 }, { "epoch": 1.6984224701808388, "grad_norm": 5.75, "learning_rate": 2.468708948746091e-06, "loss": 1.5921574831008911, "step": 5518 }, { "epoch": 1.6990380915736822, "grad_norm": 2.9375, "learning_rate": 2.466837785030908e-06, "loss": 1.3569403886795044, "step": 5520 }, { "epoch": 1.6996537129665255, "grad_norm": 2.546875, "learning_rate": 2.464970132298216e-06, "loss": 1.2199639081954956, "step": 5522 }, { "epoch": 1.7002693343593691, "grad_norm": 8.25, "learning_rate": 2.4631059924039444e-06, "loss": 1.5668450593948364, "step": 5524 }, { "epoch": 1.7008849557522123, "grad_norm": 7.0625, "learning_rate": 2.46124536720053e-06, "loss": 1.0412358045578003, "step": 5526 }, { "epoch": 1.7015005771450558, "grad_norm": 7.0, "learning_rate": 2.459388258536919e-06, "loss": 1.2781226634979248, "step": 5528 }, { "epoch": 1.7021161985378992, "grad_norm": 4.6875, "learning_rate": 2.4575346682585616e-06, "loss": 1.128685474395752, "step": 5530 }, { "epoch": 1.7027318199307426, "grad_norm": 8.625, "learning_rate": 2.4556845982074103e-06, "loss": 1.0571131706237793, "step": 5532 }, { "epoch": 1.703347441323586, "grad_norm": 4.75, "learning_rate": 2.4538380502219238e-06, "loss": 1.0286558866500854, "step": 5534 }, { "epoch": 1.7039630627164293, "grad_norm": 5.53125, "learning_rate": 2.451995026137057e-06, "loss": 1.1925334930419922, "step": 5536 }, { "epoch": 1.704578684109273, "grad_norm": 7.46875, "learning_rate": 2.4501555277842636e-06, "loss": 1.2129243612289429, "step": 5538 }, { "epoch": 1.7051943055021161, "grad_norm": 4.09375, "learning_rate": 2.4483195569914954e-06, "loss": 1.0670663118362427, "step": 5540 }, { "epoch": 1.7058099268949596, "grad_norm": 6.40625, "learning_rate": 2.4464871155831963e-06, "loss": 1.0114057064056396, "step": 5542 }, { "epoch": 1.706425548287803, "grad_norm": 11.5, "learning_rate": 2.4446582053803068e-06, "loss": 1.2254160642623901, "step": 5544 }, { "epoch": 1.7070411696806465, "grad_norm": 5.25, "learning_rate": 2.442832828200253e-06, "loss": 1.3474894762039185, "step": 5546 }, { "epoch": 1.7076567910734899, "grad_norm": 4.1875, "learning_rate": 2.4410109858569567e-06, "loss": 1.1840492486953735, "step": 5548 }, { "epoch": 1.708272412466333, "grad_norm": 4.6875, "learning_rate": 2.43919268016082e-06, "loss": 1.0255926847457886, "step": 5550 }, { "epoch": 1.7088880338591768, "grad_norm": 70.5, "learning_rate": 2.4373779129187356e-06, "loss": 1.3053051233291626, "step": 5552 }, { "epoch": 1.70950365525202, "grad_norm": 12.5625, "learning_rate": 2.435566685934079e-06, "loss": 1.7308895587921143, "step": 5554 }, { "epoch": 1.7101192766448634, "grad_norm": 5.6875, "learning_rate": 2.433759001006705e-06, "loss": 0.9967052340507507, "step": 5556 }, { "epoch": 1.7107348980377068, "grad_norm": 4.0, "learning_rate": 2.431954859932953e-06, "loss": 1.1225043535232544, "step": 5558 }, { "epoch": 1.71135051943055, "grad_norm": 10.0, "learning_rate": 2.4301542645056373e-06, "loss": 1.176830530166626, "step": 5560 }, { "epoch": 1.7119661408233937, "grad_norm": 8.25, "learning_rate": 2.4283572165140496e-06, "loss": 1.2773972749710083, "step": 5562 }, { "epoch": 1.712581762216237, "grad_norm": 5.15625, "learning_rate": 2.4265637177439577e-06, "loss": 1.1938509941101074, "step": 5564 }, { "epoch": 1.7131973836090806, "grad_norm": 5.5625, "learning_rate": 2.4247737699776e-06, "loss": 1.229737401008606, "step": 5566 }, { "epoch": 1.7138130050019238, "grad_norm": 5.125, "learning_rate": 2.4229873749936904e-06, "loss": 1.314058542251587, "step": 5568 }, { "epoch": 1.7144286263947672, "grad_norm": 119.5, "learning_rate": 2.421204534567406e-06, "loss": 0.6797417402267456, "step": 5570 }, { "epoch": 1.7150442477876107, "grad_norm": 4.1875, "learning_rate": 2.4194252504703985e-06, "loss": 1.1942250728607178, "step": 5572 }, { "epoch": 1.7156598691804539, "grad_norm": 4.1875, "learning_rate": 2.4176495244707814e-06, "loss": 1.420361876487732, "step": 5574 }, { "epoch": 1.7162754905732975, "grad_norm": 15.0625, "learning_rate": 2.415877358333133e-06, "loss": 1.2674133777618408, "step": 5576 }, { "epoch": 1.7168911119661407, "grad_norm": 10.375, "learning_rate": 2.414108753818495e-06, "loss": 1.6348576545715332, "step": 5578 }, { "epoch": 1.7175067333589842, "grad_norm": 5.125, "learning_rate": 2.412343712684368e-06, "loss": 1.1598114967346191, "step": 5580 }, { "epoch": 1.7181223547518276, "grad_norm": 10.8125, "learning_rate": 2.410582236684714e-06, "loss": 1.364866018295288, "step": 5582 }, { "epoch": 1.718737976144671, "grad_norm": 13.0625, "learning_rate": 2.4088243275699523e-06, "loss": 1.0493624210357666, "step": 5584 }, { "epoch": 1.7193535975375145, "grad_norm": 5.0, "learning_rate": 2.407069987086954e-06, "loss": 1.2027814388275146, "step": 5586 }, { "epoch": 1.7199692189303577, "grad_norm": 14.5, "learning_rate": 2.40531921697905e-06, "loss": 1.262241005897522, "step": 5588 }, { "epoch": 1.7205848403232014, "grad_norm": 3.4375, "learning_rate": 2.4035720189860167e-06, "loss": 0.9158905148506165, "step": 5590 }, { "epoch": 1.7212004617160446, "grad_norm": 12.6875, "learning_rate": 2.4018283948440856e-06, "loss": 1.7495272159576416, "step": 5592 }, { "epoch": 1.721816083108888, "grad_norm": 8.8125, "learning_rate": 2.4000883462859337e-06, "loss": 1.157536268234253, "step": 5594 }, { "epoch": 1.7224317045017314, "grad_norm": 4.1875, "learning_rate": 2.3983518750406874e-06, "loss": 1.124690055847168, "step": 5596 }, { "epoch": 1.7230473258945749, "grad_norm": 4.3125, "learning_rate": 2.396618982833917e-06, "loss": 1.0826674699783325, "step": 5598 }, { "epoch": 1.7236629472874183, "grad_norm": 17.625, "learning_rate": 2.394889671387636e-06, "loss": 1.5495002269744873, "step": 5600 }, { "epoch": 1.7242785686802615, "grad_norm": 9.25, "learning_rate": 2.3931639424203e-06, "loss": 1.1986749172210693, "step": 5602 }, { "epoch": 1.7248941900731052, "grad_norm": 11.125, "learning_rate": 2.391441797646805e-06, "loss": 1.1907917261123657, "step": 5604 }, { "epoch": 1.7255098114659484, "grad_norm": 4.3125, "learning_rate": 2.3897232387784842e-06, "loss": 1.3200044631958008, "step": 5606 }, { "epoch": 1.7261254328587918, "grad_norm": 15.75, "learning_rate": 2.3880082675231088e-06, "loss": 1.4564788341522217, "step": 5608 }, { "epoch": 1.7267410542516353, "grad_norm": 8.5625, "learning_rate": 2.386296885584883e-06, "loss": 1.6941089630126953, "step": 5610 }, { "epoch": 1.7273566756444787, "grad_norm": 7.03125, "learning_rate": 2.3845890946644466e-06, "loss": 1.1814099550247192, "step": 5612 }, { "epoch": 1.7279722970373221, "grad_norm": 3.0, "learning_rate": 2.3828848964588694e-06, "loss": 1.255849838256836, "step": 5614 }, { "epoch": 1.7285879184301653, "grad_norm": 6.34375, "learning_rate": 2.3811842926616513e-06, "loss": 1.195947289466858, "step": 5616 }, { "epoch": 1.729203539823009, "grad_norm": 9.9375, "learning_rate": 2.37948728496272e-06, "loss": 1.5996789932250977, "step": 5618 }, { "epoch": 1.7298191612158522, "grad_norm": 11.75, "learning_rate": 2.3777938750484306e-06, "loss": 1.3444377183914185, "step": 5620 }, { "epoch": 1.7304347826086957, "grad_norm": 3.078125, "learning_rate": 2.3761040646015623e-06, "loss": 1.450816035270691, "step": 5622 }, { "epoch": 1.731050404001539, "grad_norm": 11.0, "learning_rate": 2.374417855301317e-06, "loss": 1.3607956171035767, "step": 5624 }, { "epoch": 1.7316660253943823, "grad_norm": 7.59375, "learning_rate": 2.372735248823321e-06, "loss": 1.529814600944519, "step": 5626 }, { "epoch": 1.732281646787226, "grad_norm": 6.25, "learning_rate": 2.3710562468396146e-06, "loss": 1.4916486740112305, "step": 5628 }, { "epoch": 1.7328972681800692, "grad_norm": 10.1875, "learning_rate": 2.3693808510186625e-06, "loss": 1.5742865800857544, "step": 5630 }, { "epoch": 1.7335128895729126, "grad_norm": 3.78125, "learning_rate": 2.367709063025342e-06, "loss": 1.2341521978378296, "step": 5632 }, { "epoch": 1.734128510965756, "grad_norm": 1.8671875, "learning_rate": 2.3660408845209455e-06, "loss": 1.0239232778549194, "step": 5634 }, { "epoch": 1.7347441323585995, "grad_norm": 8.875, "learning_rate": 2.3643763171631815e-06, "loss": 1.3741108179092407, "step": 5636 }, { "epoch": 1.735359753751443, "grad_norm": 5.0625, "learning_rate": 2.3627153626061663e-06, "loss": 1.338080883026123, "step": 5638 }, { "epoch": 1.7359753751442861, "grad_norm": 6.03125, "learning_rate": 2.361058022500428e-06, "loss": 1.20331609249115, "step": 5640 }, { "epoch": 1.7365909965371298, "grad_norm": 4.0625, "learning_rate": 2.359404298492903e-06, "loss": 1.3246911764144897, "step": 5642 }, { "epoch": 1.737206617929973, "grad_norm": 15.8125, "learning_rate": 2.3577541922269324e-06, "loss": 1.3114361763000488, "step": 5644 }, { "epoch": 1.7378222393228164, "grad_norm": 102.0, "learning_rate": 2.3561077053422658e-06, "loss": 0.9264519214630127, "step": 5646 }, { "epoch": 1.7384378607156599, "grad_norm": 12.125, "learning_rate": 2.3544648394750535e-06, "loss": 1.4408466815948486, "step": 5648 }, { "epoch": 1.7390534821085033, "grad_norm": 5.34375, "learning_rate": 2.352825596257847e-06, "loss": 1.1803123950958252, "step": 5650 }, { "epoch": 1.7396691035013467, "grad_norm": 8.625, "learning_rate": 2.351189977319601e-06, "loss": 1.317006230354309, "step": 5652 }, { "epoch": 1.74028472489419, "grad_norm": 5.8125, "learning_rate": 2.349557984285665e-06, "loss": 1.1664330959320068, "step": 5654 }, { "epoch": 1.7409003462870336, "grad_norm": 3.96875, "learning_rate": 2.3479296187777877e-06, "loss": 1.3429275751113892, "step": 5656 }, { "epoch": 1.7415159676798768, "grad_norm": 6.84375, "learning_rate": 2.3463048824141123e-06, "loss": 1.1863704919815063, "step": 5658 }, { "epoch": 1.7421315890727203, "grad_norm": 5.5625, "learning_rate": 2.3446837768091763e-06, "loss": 1.2393282651901245, "step": 5660 }, { "epoch": 1.7427472104655637, "grad_norm": 5.0, "learning_rate": 2.343066303573908e-06, "loss": 0.9789924621582031, "step": 5662 }, { "epoch": 1.7433628318584071, "grad_norm": 8.75, "learning_rate": 2.341452464315627e-06, "loss": 1.4149452447891235, "step": 5664 }, { "epoch": 1.7439784532512506, "grad_norm": 6.5625, "learning_rate": 2.3398422606380424e-06, "loss": 1.4665279388427734, "step": 5666 }, { "epoch": 1.7445940746440938, "grad_norm": 3.328125, "learning_rate": 2.338235694141248e-06, "loss": 1.0495976209640503, "step": 5668 }, { "epoch": 1.7452096960369374, "grad_norm": 5.96875, "learning_rate": 2.3366327664217253e-06, "loss": 1.3150982856750488, "step": 5670 }, { "epoch": 1.7458253174297806, "grad_norm": 4.15625, "learning_rate": 2.335033479072341e-06, "loss": 1.1453654766082764, "step": 5672 }, { "epoch": 1.746440938822624, "grad_norm": 4.875, "learning_rate": 2.3334378336823413e-06, "loss": 1.0010666847229004, "step": 5674 }, { "epoch": 1.7470565602154675, "grad_norm": 17.5, "learning_rate": 2.3318458318373558e-06, "loss": 1.1657469272613525, "step": 5676 }, { "epoch": 1.7476721816083107, "grad_norm": 3.609375, "learning_rate": 2.330257475119392e-06, "loss": 0.6801499128341675, "step": 5678 }, { "epoch": 1.7482878030011544, "grad_norm": 33.75, "learning_rate": 2.3286727651068346e-06, "loss": 1.1293959617614746, "step": 5680 }, { "epoch": 1.7489034243939976, "grad_norm": 15.8125, "learning_rate": 2.327091703374447e-06, "loss": 1.2393264770507812, "step": 5682 }, { "epoch": 1.7495190457868413, "grad_norm": 6.9375, "learning_rate": 2.325514291493365e-06, "loss": 1.3086076974868774, "step": 5684 }, { "epoch": 1.7501346671796845, "grad_norm": 9.0, "learning_rate": 2.323940531031098e-06, "loss": 1.3145592212677002, "step": 5686 }, { "epoch": 1.750750288572528, "grad_norm": 2.609375, "learning_rate": 2.322370423551527e-06, "loss": 1.3300237655639648, "step": 5688 }, { "epoch": 1.7513659099653713, "grad_norm": 10.125, "learning_rate": 2.3208039706149037e-06, "loss": 1.1004767417907715, "step": 5690 }, { "epoch": 1.7519815313582145, "grad_norm": 19.125, "learning_rate": 2.3192411737778476e-06, "loss": 1.3882306814193726, "step": 5692 }, { "epoch": 1.7525971527510582, "grad_norm": 6.96875, "learning_rate": 2.3176820345933437e-06, "loss": 1.197304368019104, "step": 5694 }, { "epoch": 1.7532127741439014, "grad_norm": 18.125, "learning_rate": 2.3161265546107443e-06, "loss": 1.220046877861023, "step": 5696 }, { "epoch": 1.7538283955367449, "grad_norm": 5.0, "learning_rate": 2.3145747353757643e-06, "loss": 1.4614460468292236, "step": 5698 }, { "epoch": 1.7544440169295883, "grad_norm": 3.46875, "learning_rate": 2.313026578430482e-06, "loss": 0.9256251454353333, "step": 5700 }, { "epoch": 1.7550596383224317, "grad_norm": 14.75, "learning_rate": 2.3114820853133356e-06, "loss": 1.2474175691604614, "step": 5702 }, { "epoch": 1.7556752597152752, "grad_norm": 6.59375, "learning_rate": 2.309941257559122e-06, "loss": 1.5943374633789062, "step": 5704 }, { "epoch": 1.7562908811081184, "grad_norm": 5.75, "learning_rate": 2.3084040966989964e-06, "loss": 1.3186969757080078, "step": 5706 }, { "epoch": 1.756906502500962, "grad_norm": 26.375, "learning_rate": 2.3068706042604694e-06, "loss": 1.2382252216339111, "step": 5708 }, { "epoch": 1.7575221238938052, "grad_norm": 7.1875, "learning_rate": 2.3053407817674087e-06, "loss": 1.3816118240356445, "step": 5710 }, { "epoch": 1.7581377452866487, "grad_norm": 12.0625, "learning_rate": 2.3038146307400313e-06, "loss": 1.339174509048462, "step": 5712 }, { "epoch": 1.7587533666794921, "grad_norm": 7.28125, "learning_rate": 2.3022921526949087e-06, "loss": 1.2081241607666016, "step": 5714 }, { "epoch": 1.7593689880723355, "grad_norm": 4.84375, "learning_rate": 2.3007733491449615e-06, "loss": 1.3891562223434448, "step": 5716 }, { "epoch": 1.759984609465179, "grad_norm": 2.984375, "learning_rate": 2.2992582215994576e-06, "loss": 1.126984715461731, "step": 5718 }, { "epoch": 1.7606002308580222, "grad_norm": 4.34375, "learning_rate": 2.2977467715640147e-06, "loss": 1.0940567255020142, "step": 5720 }, { "epoch": 1.7612158522508659, "grad_norm": 17.5, "learning_rate": 2.2962390005405935e-06, "loss": 1.4153151512145996, "step": 5722 }, { "epoch": 1.761831473643709, "grad_norm": 8.3125, "learning_rate": 2.2947349100275007e-06, "loss": 1.535245418548584, "step": 5724 }, { "epoch": 1.7624470950365525, "grad_norm": 4.53125, "learning_rate": 2.2932345015193845e-06, "loss": 1.3496298789978027, "step": 5726 }, { "epoch": 1.763062716429396, "grad_norm": 7.125, "learning_rate": 2.2917377765072336e-06, "loss": 1.3143893480300903, "step": 5728 }, { "epoch": 1.7636783378222394, "grad_norm": 8.0625, "learning_rate": 2.29024473647838e-06, "loss": 1.2238497734069824, "step": 5730 }, { "epoch": 1.7642939592150828, "grad_norm": 6.03125, "learning_rate": 2.288755382916487e-06, "loss": 1.4694186449050903, "step": 5732 }, { "epoch": 1.764909580607926, "grad_norm": 3.109375, "learning_rate": 2.2872697173015614e-06, "loss": 1.4567644596099854, "step": 5734 }, { "epoch": 1.7655252020007697, "grad_norm": 3.0625, "learning_rate": 2.2857877411099407e-06, "loss": 1.1318411827087402, "step": 5736 }, { "epoch": 1.766140823393613, "grad_norm": 5.8125, "learning_rate": 2.2843094558142998e-06, "loss": 1.2868903875350952, "step": 5738 }, { "epoch": 1.7667564447864563, "grad_norm": 10.0, "learning_rate": 2.2828348628836434e-06, "loss": 1.4868735074996948, "step": 5740 }, { "epoch": 1.7673720661792998, "grad_norm": 10.5, "learning_rate": 2.2813639637833065e-06, "loss": 0.7236781120300293, "step": 5742 }, { "epoch": 1.767987687572143, "grad_norm": 6.28125, "learning_rate": 2.2798967599749554e-06, "loss": 1.0289052724838257, "step": 5744 }, { "epoch": 1.7686033089649866, "grad_norm": 8.0625, "learning_rate": 2.278433252916582e-06, "loss": 1.4637351036071777, "step": 5746 }, { "epoch": 1.7692189303578298, "grad_norm": 7.71875, "learning_rate": 2.2769734440625083e-06, "loss": 1.3271539211273193, "step": 5748 }, { "epoch": 1.7698345517506735, "grad_norm": 9.0, "learning_rate": 2.2755173348633773e-06, "loss": 1.1616591215133667, "step": 5750 }, { "epoch": 1.7704501731435167, "grad_norm": 8.9375, "learning_rate": 2.274064926766158e-06, "loss": 1.4358797073364258, "step": 5752 }, { "epoch": 1.7710657945363601, "grad_norm": 4.8125, "learning_rate": 2.2726162212141417e-06, "loss": 1.1068189144134521, "step": 5754 }, { "epoch": 1.7716814159292036, "grad_norm": 6.625, "learning_rate": 2.2711712196469386e-06, "loss": 1.1925125122070312, "step": 5756 }, { "epoch": 1.7722970373220468, "grad_norm": 2.1875, "learning_rate": 2.269729923500479e-06, "loss": 1.2517881393432617, "step": 5758 }, { "epoch": 1.7729126587148905, "grad_norm": 14.4375, "learning_rate": 2.2682923342070118e-06, "loss": 1.7617454528808594, "step": 5760 }, { "epoch": 1.7735282801077337, "grad_norm": 9.625, "learning_rate": 2.266858453195101e-06, "loss": 1.614905834197998, "step": 5762 }, { "epoch": 1.774143901500577, "grad_norm": 6.78125, "learning_rate": 2.2654282818896268e-06, "loss": 1.064343810081482, "step": 5764 }, { "epoch": 1.7747595228934205, "grad_norm": 4.25, "learning_rate": 2.264001821711782e-06, "loss": 1.0498532056808472, "step": 5766 }, { "epoch": 1.775375144286264, "grad_norm": 3.03125, "learning_rate": 2.262579074079074e-06, "loss": 1.2279499769210815, "step": 5768 }, { "epoch": 1.7759907656791074, "grad_norm": 9.125, "learning_rate": 2.2611600404053162e-06, "loss": 1.4600101709365845, "step": 5770 }, { "epoch": 1.7766063870719506, "grad_norm": 7.84375, "learning_rate": 2.2597447221006355e-06, "loss": 1.2678983211517334, "step": 5772 }, { "epoch": 1.7772220084647943, "grad_norm": 9.375, "learning_rate": 2.258333120571466e-06, "loss": 0.8515924215316772, "step": 5774 }, { "epoch": 1.7778376298576375, "grad_norm": 3.859375, "learning_rate": 2.2569252372205465e-06, "loss": 1.0808274745941162, "step": 5776 }, { "epoch": 1.778453251250481, "grad_norm": 10.1875, "learning_rate": 2.2555210734469233e-06, "loss": 1.4102704524993896, "step": 5778 }, { "epoch": 1.7790688726433244, "grad_norm": 3.640625, "learning_rate": 2.254120630645945e-06, "loss": 1.1184544563293457, "step": 5780 }, { "epoch": 1.7796844940361678, "grad_norm": 2.75, "learning_rate": 2.252723910209263e-06, "loss": 1.1054328680038452, "step": 5782 }, { "epoch": 1.7803001154290112, "grad_norm": 5.625, "learning_rate": 2.2513309135248302e-06, "loss": 0.9634500741958618, "step": 5784 }, { "epoch": 1.7809157368218544, "grad_norm": 3.75, "learning_rate": 2.249941641976897e-06, "loss": 1.0928064584732056, "step": 5786 }, { "epoch": 1.781531358214698, "grad_norm": 18.125, "learning_rate": 2.248556096946016e-06, "loss": 1.1610251665115356, "step": 5788 }, { "epoch": 1.7821469796075413, "grad_norm": 4.34375, "learning_rate": 2.2471742798090315e-06, "loss": 0.9581281542778015, "step": 5790 }, { "epoch": 1.7827626010003848, "grad_norm": 5.59375, "learning_rate": 2.2457961919390893e-06, "loss": 0.9840832948684692, "step": 5792 }, { "epoch": 1.7833782223932282, "grad_norm": 10.125, "learning_rate": 2.2444218347056253e-06, "loss": 1.285952091217041, "step": 5794 }, { "epoch": 1.7839938437860716, "grad_norm": 6.0625, "learning_rate": 2.2430512094743674e-06, "loss": 1.3057432174682617, "step": 5796 }, { "epoch": 1.784609465178915, "grad_norm": 4.78125, "learning_rate": 2.241684317607338e-06, "loss": 1.2441459894180298, "step": 5798 }, { "epoch": 1.7852250865717583, "grad_norm": 5.0625, "learning_rate": 2.240321160462848e-06, "loss": 1.260965347290039, "step": 5800 }, { "epoch": 1.785840707964602, "grad_norm": 3.796875, "learning_rate": 2.2389617393954974e-06, "loss": 1.0719890594482422, "step": 5802 }, { "epoch": 1.7864563293574451, "grad_norm": 3.359375, "learning_rate": 2.2376060557561734e-06, "loss": 1.1214145421981812, "step": 5804 }, { "epoch": 1.7870719507502886, "grad_norm": 5.25, "learning_rate": 2.236254110892048e-06, "loss": 1.4030085802078247, "step": 5806 }, { "epoch": 1.787687572143132, "grad_norm": 3.203125, "learning_rate": 2.2349059061465816e-06, "loss": 1.3156555891036987, "step": 5808 }, { "epoch": 1.7883031935359752, "grad_norm": 5.40625, "learning_rate": 2.2335614428595125e-06, "loss": 1.3215585947036743, "step": 5810 }, { "epoch": 1.7889188149288189, "grad_norm": 3.390625, "learning_rate": 2.232220722366866e-06, "loss": 1.2306294441223145, "step": 5812 }, { "epoch": 1.789534436321662, "grad_norm": 6.5, "learning_rate": 2.230883746000946e-06, "loss": 1.1097067594528198, "step": 5814 }, { "epoch": 1.7901500577145055, "grad_norm": 6.6875, "learning_rate": 2.2295505150903348e-06, "loss": 1.2976242303848267, "step": 5816 }, { "epoch": 1.790765679107349, "grad_norm": 3.15625, "learning_rate": 2.228221030959895e-06, "loss": 1.0502114295959473, "step": 5818 }, { "epoch": 1.7913813005001924, "grad_norm": 6.71875, "learning_rate": 2.226895294930764e-06, "loss": 1.392374038696289, "step": 5820 }, { "epoch": 1.7919969218930358, "grad_norm": 6.78125, "learning_rate": 2.225573308320356e-06, "loss": 1.3847123384475708, "step": 5822 }, { "epoch": 1.792612543285879, "grad_norm": 18.625, "learning_rate": 2.224255072442358e-06, "loss": 1.499535322189331, "step": 5824 }, { "epoch": 1.7932281646787227, "grad_norm": 3.765625, "learning_rate": 2.222940588606731e-06, "loss": 1.2504417896270752, "step": 5826 }, { "epoch": 1.793843786071566, "grad_norm": 3.03125, "learning_rate": 2.2216298581197075e-06, "loss": 1.0057941675186157, "step": 5828 }, { "epoch": 1.7944594074644094, "grad_norm": 3.40625, "learning_rate": 2.220322882283789e-06, "loss": 1.0351872444152832, "step": 5830 }, { "epoch": 1.7950750288572528, "grad_norm": 3.65625, "learning_rate": 2.219019662397747e-06, "loss": 1.2333468198776245, "step": 5832 }, { "epoch": 1.7956906502500962, "grad_norm": 16.5, "learning_rate": 2.2177201997566203e-06, "loss": 1.2922084331512451, "step": 5834 }, { "epoch": 1.7963062716429397, "grad_norm": 6.1875, "learning_rate": 2.2164244956517144e-06, "loss": 1.5321375131607056, "step": 5836 }, { "epoch": 1.7969218930357829, "grad_norm": 11.9375, "learning_rate": 2.215132551370599e-06, "loss": 1.1942152976989746, "step": 5838 }, { "epoch": 1.7975375144286265, "grad_norm": 6.03125, "learning_rate": 2.213844368197108e-06, "loss": 1.1645042896270752, "step": 5840 }, { "epoch": 1.7981531358214697, "grad_norm": 5.53125, "learning_rate": 2.212559947411338e-06, "loss": 1.2141934633255005, "step": 5842 }, { "epoch": 1.7987687572143132, "grad_norm": 8.6875, "learning_rate": 2.2112792902896467e-06, "loss": 1.35050368309021, "step": 5844 }, { "epoch": 1.7993843786071566, "grad_norm": 13.5, "learning_rate": 2.2100023981046526e-06, "loss": 1.2252672910690308, "step": 5846 }, { "epoch": 1.8, "grad_norm": 5.84375, "learning_rate": 2.2087292721252317e-06, "loss": 1.3437782526016235, "step": 5848 }, { "epoch": 1.8006156213928435, "grad_norm": 6.9375, "learning_rate": 2.2074599136165165e-06, "loss": 1.0520520210266113, "step": 5850 }, { "epoch": 1.8012312427856867, "grad_norm": 6.9375, "learning_rate": 2.2061943238398992e-06, "loss": 1.1961694955825806, "step": 5852 }, { "epoch": 1.8018468641785303, "grad_norm": 6.0625, "learning_rate": 2.2049325040530226e-06, "loss": 0.9422978162765503, "step": 5854 }, { "epoch": 1.8024624855713736, "grad_norm": 6.375, "learning_rate": 2.2036744555097867e-06, "loss": 1.279676079750061, "step": 5856 }, { "epoch": 1.803078106964217, "grad_norm": 9.9375, "learning_rate": 2.2024201794603424e-06, "loss": 1.333317756652832, "step": 5858 }, { "epoch": 1.8036937283570604, "grad_norm": 6.03125, "learning_rate": 2.2011696771510914e-06, "loss": 1.1424082517623901, "step": 5860 }, { "epoch": 1.8043093497499036, "grad_norm": 4.90625, "learning_rate": 2.1999229498246865e-06, "loss": 0.9212673902511597, "step": 5862 }, { "epoch": 1.8049249711427473, "grad_norm": 2.75, "learning_rate": 2.198679998720028e-06, "loss": 0.9288144707679749, "step": 5864 }, { "epoch": 1.8055405925355905, "grad_norm": 9.9375, "learning_rate": 2.1974408250722647e-06, "loss": 1.2315566539764404, "step": 5866 }, { "epoch": 1.8061562139284342, "grad_norm": 16.25, "learning_rate": 2.1962054301127907e-06, "loss": 1.463548183441162, "step": 5868 }, { "epoch": 1.8067718353212774, "grad_norm": 9.8125, "learning_rate": 2.1949738150692455e-06, "loss": 0.6639575958251953, "step": 5870 }, { "epoch": 1.8073874567141208, "grad_norm": 6.0, "learning_rate": 2.193745981165515e-06, "loss": 1.1918666362762451, "step": 5872 }, { "epoch": 1.8080030781069643, "grad_norm": 7.875, "learning_rate": 2.1925219296217213e-06, "loss": 1.178608775138855, "step": 5874 }, { "epoch": 1.8086186994998075, "grad_norm": 7.4375, "learning_rate": 2.1913016616542348e-06, "loss": 1.0431655645370483, "step": 5876 }, { "epoch": 1.8092343208926511, "grad_norm": 14.625, "learning_rate": 2.1900851784756618e-06, "loss": 1.3192921876907349, "step": 5878 }, { "epoch": 1.8098499422854943, "grad_norm": 4.53125, "learning_rate": 2.18887248129485e-06, "loss": 0.8913655877113342, "step": 5880 }, { "epoch": 1.8104655636783378, "grad_norm": 3.421875, "learning_rate": 2.187663571316883e-06, "loss": 1.0939395427703857, "step": 5882 }, { "epoch": 1.8110811850711812, "grad_norm": 9.6875, "learning_rate": 2.1864584497430813e-06, "loss": 1.2626577615737915, "step": 5884 }, { "epoch": 1.8116968064640246, "grad_norm": 12.125, "learning_rate": 2.185257117771003e-06, "loss": 1.2554960250854492, "step": 5886 }, { "epoch": 1.812312427856868, "grad_norm": 12.3125, "learning_rate": 2.1840595765944366e-06, "loss": 1.126465916633606, "step": 5888 }, { "epoch": 1.8129280492497113, "grad_norm": 8.125, "learning_rate": 2.1828658274034063e-06, "loss": 1.2984936237335205, "step": 5890 }, { "epoch": 1.813543670642555, "grad_norm": 4.09375, "learning_rate": 2.1816758713841676e-06, "loss": 1.214903473854065, "step": 5892 }, { "epoch": 1.8141592920353982, "grad_norm": 7.96875, "learning_rate": 2.1804897097192067e-06, "loss": 1.0102453231811523, "step": 5894 }, { "epoch": 1.8147749134282416, "grad_norm": 6.3125, "learning_rate": 2.179307343587238e-06, "loss": 1.0699224472045898, "step": 5896 }, { "epoch": 1.815390534821085, "grad_norm": 10.625, "learning_rate": 2.1781287741632067e-06, "loss": 1.2344567775726318, "step": 5898 }, { "epoch": 1.8160061562139285, "grad_norm": 6.90625, "learning_rate": 2.176954002618283e-06, "loss": 1.3633794784545898, "step": 5900 }, { "epoch": 1.816621777606772, "grad_norm": 7.84375, "learning_rate": 2.1757830301198637e-06, "loss": 1.2593004703521729, "step": 5902 }, { "epoch": 1.8172373989996151, "grad_norm": 7.375, "learning_rate": 2.17461585783157e-06, "loss": 1.3392139673233032, "step": 5904 }, { "epoch": 1.8178530203924588, "grad_norm": 13.0625, "learning_rate": 2.1734524869132475e-06, "loss": 1.0816066265106201, "step": 5906 }, { "epoch": 1.818468641785302, "grad_norm": 5.75, "learning_rate": 2.1722929185209637e-06, "loss": 1.2291300296783447, "step": 5908 }, { "epoch": 1.8190842631781454, "grad_norm": 6.53125, "learning_rate": 2.1711371538070088e-06, "loss": 1.2727948427200317, "step": 5910 }, { "epoch": 1.8196998845709889, "grad_norm": 7.3125, "learning_rate": 2.169985193919891e-06, "loss": 1.287793517112732, "step": 5912 }, { "epoch": 1.8203155059638323, "grad_norm": 3.0, "learning_rate": 2.168837040004339e-06, "loss": 1.0403543710708618, "step": 5914 }, { "epoch": 1.8209311273566757, "grad_norm": 9.5625, "learning_rate": 2.167692693201299e-06, "loss": 0.8734148740768433, "step": 5916 }, { "epoch": 1.821546748749519, "grad_norm": 7.46875, "learning_rate": 2.1665521546479336e-06, "loss": 1.2621644735336304, "step": 5918 }, { "epoch": 1.8221623701423626, "grad_norm": 5.625, "learning_rate": 2.165415425477623e-06, "loss": 1.3135576248168945, "step": 5920 }, { "epoch": 1.8227779915352058, "grad_norm": 7.84375, "learning_rate": 2.164282506819959e-06, "loss": 0.8223294615745544, "step": 5922 }, { "epoch": 1.8233936129280492, "grad_norm": 9.625, "learning_rate": 2.163153399800749e-06, "loss": 1.6391355991363525, "step": 5924 }, { "epoch": 1.8240092343208927, "grad_norm": 48.0, "learning_rate": 2.1620281055420113e-06, "loss": 1.230574607849121, "step": 5926 }, { "epoch": 1.824624855713736, "grad_norm": 3.09375, "learning_rate": 2.1609066251619757e-06, "loss": 1.003134846687317, "step": 5928 }, { "epoch": 1.8252404771065796, "grad_norm": 3.703125, "learning_rate": 2.159788959775085e-06, "loss": 1.0853413343429565, "step": 5930 }, { "epoch": 1.8258560984994228, "grad_norm": 12.125, "learning_rate": 2.158675110491985e-06, "loss": 1.2373244762420654, "step": 5932 }, { "epoch": 1.8264717198922664, "grad_norm": 7.21875, "learning_rate": 2.1575650784195346e-06, "loss": 1.2410924434661865, "step": 5934 }, { "epoch": 1.8270873412851096, "grad_norm": 8.1875, "learning_rate": 2.1564588646607974e-06, "loss": 1.377103567123413, "step": 5936 }, { "epoch": 1.827702962677953, "grad_norm": 6.09375, "learning_rate": 2.1553564703150425e-06, "loss": 1.0280659198760986, "step": 5938 }, { "epoch": 1.8283185840707965, "grad_norm": 5.9375, "learning_rate": 2.154257896477744e-06, "loss": 1.4314674139022827, "step": 5940 }, { "epoch": 1.8289342054636397, "grad_norm": 6.0, "learning_rate": 2.153163144240579e-06, "loss": 1.0865339040756226, "step": 5942 }, { "epoch": 1.8295498268564834, "grad_norm": 11.3125, "learning_rate": 2.152072214691428e-06, "loss": 1.4894191026687622, "step": 5944 }, { "epoch": 1.8301654482493266, "grad_norm": 3.625, "learning_rate": 2.1509851089143717e-06, "loss": 1.176236629486084, "step": 5946 }, { "epoch": 1.83078106964217, "grad_norm": 6.15625, "learning_rate": 2.149901827989691e-06, "loss": 1.2651861906051636, "step": 5948 }, { "epoch": 1.8313966910350135, "grad_norm": 7.40625, "learning_rate": 2.148822372993868e-06, "loss": 1.2794727087020874, "step": 5950 }, { "epoch": 1.832012312427857, "grad_norm": 5.46875, "learning_rate": 2.1477467449995793e-06, "loss": 1.244179606437683, "step": 5952 }, { "epoch": 1.8326279338207003, "grad_norm": 3.65625, "learning_rate": 2.1466749450757016e-06, "loss": 1.132509469985962, "step": 5954 }, { "epoch": 1.8332435552135435, "grad_norm": 11.875, "learning_rate": 2.145606974287307e-06, "loss": 1.3131150007247925, "step": 5956 }, { "epoch": 1.8338591766063872, "grad_norm": 8.5625, "learning_rate": 2.144542833695661e-06, "loss": 1.5854219198226929, "step": 5958 }, { "epoch": 1.8344747979992304, "grad_norm": 12.8125, "learning_rate": 2.1434825243582247e-06, "loss": 1.37516450881958, "step": 5960 }, { "epoch": 1.8350904193920738, "grad_norm": 2.578125, "learning_rate": 2.1424260473286515e-06, "loss": 1.2534147500991821, "step": 5962 }, { "epoch": 1.8357060407849173, "grad_norm": 9.125, "learning_rate": 2.141373403656785e-06, "loss": 0.9582955241203308, "step": 5964 }, { "epoch": 1.8363216621777607, "grad_norm": 6.1875, "learning_rate": 2.140324594388662e-06, "loss": 1.3661773204803467, "step": 5966 }, { "epoch": 1.8369372835706042, "grad_norm": 15.875, "learning_rate": 2.139279620566507e-06, "loss": 1.6128534078598022, "step": 5968 }, { "epoch": 1.8375529049634474, "grad_norm": 5.03125, "learning_rate": 2.1382384832287345e-06, "loss": 1.3733783960342407, "step": 5970 }, { "epoch": 1.838168526356291, "grad_norm": 7.59375, "learning_rate": 2.137201183409946e-06, "loss": 1.1199281215667725, "step": 5972 }, { "epoch": 1.8387841477491342, "grad_norm": 5.125, "learning_rate": 2.136167722140929e-06, "loss": 1.3824279308319092, "step": 5974 }, { "epoch": 1.8393997691419777, "grad_norm": 2.453125, "learning_rate": 2.1351381004486575e-06, "loss": 1.0962920188903809, "step": 5976 }, { "epoch": 1.840015390534821, "grad_norm": 9.125, "learning_rate": 2.134112319356291e-06, "loss": 1.3079670667648315, "step": 5978 }, { "epoch": 1.8406310119276645, "grad_norm": 4.15625, "learning_rate": 2.1330903798831685e-06, "loss": 1.0482728481292725, "step": 5980 }, { "epoch": 1.841246633320508, "grad_norm": 10.75, "learning_rate": 2.1320722830448155e-06, "loss": 1.4023866653442383, "step": 5982 }, { "epoch": 1.8418622547133512, "grad_norm": 12.9375, "learning_rate": 2.1310580298529375e-06, "loss": 1.6925112009048462, "step": 5984 }, { "epoch": 1.8424778761061948, "grad_norm": 13.1875, "learning_rate": 2.130047621315421e-06, "loss": 1.5157252550125122, "step": 5986 }, { "epoch": 1.843093497499038, "grad_norm": 14.1875, "learning_rate": 2.1290410584363324e-06, "loss": 1.3847084045410156, "step": 5988 }, { "epoch": 1.8437091188918815, "grad_norm": 7.59375, "learning_rate": 2.1280383422159135e-06, "loss": 1.2718254327774048, "step": 5990 }, { "epoch": 1.844324740284725, "grad_norm": 6.875, "learning_rate": 2.127039473650588e-06, "loss": 1.5083742141723633, "step": 5992 }, { "epoch": 1.8449403616775681, "grad_norm": 9.3125, "learning_rate": 2.1260444537329527e-06, "loss": 1.5251927375793457, "step": 5994 }, { "epoch": 1.8455559830704118, "grad_norm": 15.5625, "learning_rate": 2.125053283451782e-06, "loss": 1.0528500080108643, "step": 5996 }, { "epoch": 1.846171604463255, "grad_norm": 5.8125, "learning_rate": 2.1240659637920232e-06, "loss": 1.6003139019012451, "step": 5998 }, { "epoch": 1.8467872258560984, "grad_norm": 7.96875, "learning_rate": 2.123082495734799e-06, "loss": 1.5982369184494019, "step": 6000 }, { "epoch": 1.8474028472489419, "grad_norm": 6.28125, "learning_rate": 2.122102880257403e-06, "loss": 1.2855113744735718, "step": 6002 }, { "epoch": 1.8480184686417853, "grad_norm": 4.40625, "learning_rate": 2.121127118333301e-06, "loss": 0.905225932598114, "step": 6004 }, { "epoch": 1.8486340900346288, "grad_norm": 3.65625, "learning_rate": 2.1201552109321293e-06, "loss": 1.025927186012268, "step": 6006 }, { "epoch": 1.849249711427472, "grad_norm": 7.59375, "learning_rate": 2.119187159019695e-06, "loss": 1.531143307685852, "step": 6008 }, { "epoch": 1.8498653328203156, "grad_norm": 5.28125, "learning_rate": 2.1182229635579722e-06, "loss": 1.1162152290344238, "step": 6010 }, { "epoch": 1.8504809542131588, "grad_norm": 6.65625, "learning_rate": 2.117262625505104e-06, "loss": 1.2625081539154053, "step": 6012 }, { "epoch": 1.8510965756060023, "grad_norm": 5.15625, "learning_rate": 2.1163061458153994e-06, "loss": 1.179467797279358, "step": 6014 }, { "epoch": 1.8517121969988457, "grad_norm": 5.1875, "learning_rate": 2.115353525439334e-06, "loss": 0.9201962947845459, "step": 6016 }, { "epoch": 1.8523278183916891, "grad_norm": 9.3125, "learning_rate": 2.114404765323548e-06, "loss": 1.3194632530212402, "step": 6018 }, { "epoch": 1.8529434397845326, "grad_norm": 6.40625, "learning_rate": 2.113459866410845e-06, "loss": 1.2764129638671875, "step": 6020 }, { "epoch": 1.8535590611773758, "grad_norm": 13.875, "learning_rate": 2.112518829640193e-06, "loss": 1.3906527757644653, "step": 6022 }, { "epoch": 1.8541746825702194, "grad_norm": 4.5625, "learning_rate": 2.111581655946722e-06, "loss": 1.3976833820343018, "step": 6024 }, { "epoch": 1.8547903039630627, "grad_norm": 6.84375, "learning_rate": 2.1106483462617205e-06, "loss": 1.3194999694824219, "step": 6026 }, { "epoch": 1.855405925355906, "grad_norm": 4.1875, "learning_rate": 2.1097189015126414e-06, "loss": 1.1536383628845215, "step": 6028 }, { "epoch": 1.8560215467487495, "grad_norm": 10.6875, "learning_rate": 2.108793322623093e-06, "loss": 1.3191182613372803, "step": 6030 }, { "epoch": 1.856637168141593, "grad_norm": 4.15625, "learning_rate": 2.107871610512845e-06, "loss": 1.1951905488967896, "step": 6032 }, { "epoch": 1.8572527895344364, "grad_norm": 83.5, "learning_rate": 2.1069537660978223e-06, "loss": 0.7205085158348083, "step": 6034 }, { "epoch": 1.8578684109272796, "grad_norm": 7.5625, "learning_rate": 2.1060397902901083e-06, "loss": 1.0426700115203857, "step": 6036 }, { "epoch": 1.8584840323201233, "grad_norm": 5.34375, "learning_rate": 2.105129683997941e-06, "loss": 1.239465594291687, "step": 6038 }, { "epoch": 1.8590996537129665, "grad_norm": 4.40625, "learning_rate": 2.104223448125714e-06, "loss": 0.7743887305259705, "step": 6040 }, { "epoch": 1.85971527510581, "grad_norm": 13.5625, "learning_rate": 2.103321083573973e-06, "loss": 1.2444900274276733, "step": 6042 }, { "epoch": 1.8603308964986534, "grad_norm": 7.46875, "learning_rate": 2.102422591239419e-06, "loss": 0.8321576118469238, "step": 6044 }, { "epoch": 1.8609465178914966, "grad_norm": 6.625, "learning_rate": 2.1015279720149035e-06, "loss": 1.2588664293289185, "step": 6046 }, { "epoch": 1.8615621392843402, "grad_norm": 5.875, "learning_rate": 2.1006372267894296e-06, "loss": 1.2898892164230347, "step": 6048 }, { "epoch": 1.8621777606771834, "grad_norm": 4.78125, "learning_rate": 2.0997503564481504e-06, "loss": 0.6708816885948181, "step": 6050 }, { "epoch": 1.862793382070027, "grad_norm": 3.796875, "learning_rate": 2.09886736187237e-06, "loss": 1.221728801727295, "step": 6052 }, { "epoch": 1.8634090034628703, "grad_norm": 14.875, "learning_rate": 2.097988243939539e-06, "loss": 1.2509214878082275, "step": 6054 }, { "epoch": 1.8640246248557137, "grad_norm": 6.1875, "learning_rate": 2.097113003523257e-06, "loss": 1.3385615348815918, "step": 6056 }, { "epoch": 1.8646402462485572, "grad_norm": 7.71875, "learning_rate": 2.0962416414932697e-06, "loss": 1.2880332469940186, "step": 6058 }, { "epoch": 1.8652558676414004, "grad_norm": 7.8125, "learning_rate": 2.095374158715469e-06, "loss": 1.2843137979507446, "step": 6060 }, { "epoch": 1.865871489034244, "grad_norm": 5.1875, "learning_rate": 2.094510556051893e-06, "loss": 1.2483960390090942, "step": 6062 }, { "epoch": 1.8664871104270873, "grad_norm": 11.875, "learning_rate": 2.0936508343607214e-06, "loss": 1.257112741470337, "step": 6064 }, { "epoch": 1.8671027318199307, "grad_norm": 6.4375, "learning_rate": 2.0927949944962804e-06, "loss": 1.3845338821411133, "step": 6066 }, { "epoch": 1.8677183532127741, "grad_norm": 5.0625, "learning_rate": 2.091943037309036e-06, "loss": 1.1420466899871826, "step": 6068 }, { "epoch": 1.8683339746056176, "grad_norm": 11.75, "learning_rate": 2.091094963645598e-06, "loss": 1.4840701818466187, "step": 6070 }, { "epoch": 1.868949595998461, "grad_norm": 5.125, "learning_rate": 2.0902507743487163e-06, "loss": 1.1129378080368042, "step": 6072 }, { "epoch": 1.8695652173913042, "grad_norm": 4.9375, "learning_rate": 2.0894104702572803e-06, "loss": 1.253161907196045, "step": 6074 }, { "epoch": 1.8701808387841479, "grad_norm": 6.3125, "learning_rate": 2.0885740522063187e-06, "loss": 0.9732499122619629, "step": 6076 }, { "epoch": 1.870796460176991, "grad_norm": 5.90625, "learning_rate": 2.0877415210269993e-06, "loss": 1.1398284435272217, "step": 6078 }, { "epoch": 1.8714120815698345, "grad_norm": 4.4375, "learning_rate": 2.0869128775466275e-06, "loss": 1.1115602254867554, "step": 6080 }, { "epoch": 1.872027702962678, "grad_norm": 3.546875, "learning_rate": 2.0860881225886444e-06, "loss": 1.1072742938995361, "step": 6082 }, { "epoch": 1.8726433243555214, "grad_norm": 6.28125, "learning_rate": 2.085267256972627e-06, "loss": 1.196256160736084, "step": 6084 }, { "epoch": 1.8732589457483648, "grad_norm": 7.15625, "learning_rate": 2.084450281514289e-06, "loss": 1.621668815612793, "step": 6086 }, { "epoch": 1.873874567141208, "grad_norm": 10.8125, "learning_rate": 2.0836371970254758e-06, "loss": 1.4544686079025269, "step": 6088 }, { "epoch": 1.8744901885340517, "grad_norm": 8.6875, "learning_rate": 2.082828004314168e-06, "loss": 1.3470609188079834, "step": 6090 }, { "epoch": 1.875105809926895, "grad_norm": 6.75, "learning_rate": 2.0820227041844803e-06, "loss": 1.568597674369812, "step": 6092 }, { "epoch": 1.8757214313197383, "grad_norm": 11.4375, "learning_rate": 2.0812212974366554e-06, "loss": 1.6319973468780518, "step": 6094 }, { "epoch": 1.8763370527125818, "grad_norm": 15.6875, "learning_rate": 2.08042378486707e-06, "loss": 1.4720542430877686, "step": 6096 }, { "epoch": 1.8769526741054252, "grad_norm": 10.0625, "learning_rate": 2.07963016726823e-06, "loss": 1.479348063468933, "step": 6098 }, { "epoch": 1.8775682954982686, "grad_norm": 12.375, "learning_rate": 2.0788404454287714e-06, "loss": 1.5378873348236084, "step": 6100 }, { "epoch": 1.8781839168911119, "grad_norm": 8.25, "learning_rate": 2.0780546201334583e-06, "loss": 1.0985162258148193, "step": 6102 }, { "epoch": 1.8787995382839555, "grad_norm": 7.375, "learning_rate": 2.0772726921631826e-06, "loss": 0.8501262664794922, "step": 6104 }, { "epoch": 1.8794151596767987, "grad_norm": 4.25, "learning_rate": 2.0764946622949642e-06, "loss": 1.1900298595428467, "step": 6106 }, { "epoch": 1.8800307810696422, "grad_norm": 5.78125, "learning_rate": 2.075720531301948e-06, "loss": 1.2045283317565918, "step": 6108 }, { "epoch": 1.8806464024624856, "grad_norm": 8.5, "learning_rate": 2.074950299953406e-06, "loss": 1.4412344694137573, "step": 6110 }, { "epoch": 1.8812620238553288, "grad_norm": 13.0625, "learning_rate": 2.0741839690147347e-06, "loss": 1.4701406955718994, "step": 6112 }, { "epoch": 1.8818776452481725, "grad_norm": 7.34375, "learning_rate": 2.0734215392474533e-06, "loss": 1.0646079778671265, "step": 6114 }, { "epoch": 1.8824932666410157, "grad_norm": 8.75, "learning_rate": 2.072663011409206e-06, "loss": 0.7244550585746765, "step": 6116 }, { "epoch": 1.8831088880338593, "grad_norm": 6.15625, "learning_rate": 2.0719083862537585e-06, "loss": 1.5345789194107056, "step": 6118 }, { "epoch": 1.8837245094267026, "grad_norm": 9.8125, "learning_rate": 2.071157664531e-06, "loss": 1.5183554887771606, "step": 6120 }, { "epoch": 1.884340130819546, "grad_norm": 1.6875, "learning_rate": 2.0704108469869377e-06, "loss": 0.9273104071617126, "step": 6122 }, { "epoch": 1.8849557522123894, "grad_norm": 11.5625, "learning_rate": 2.0696679343637018e-06, "loss": 1.1478441953659058, "step": 6124 }, { "epoch": 1.8855713736052326, "grad_norm": 5.03125, "learning_rate": 2.068928927399541e-06, "loss": 1.2363404035568237, "step": 6126 }, { "epoch": 1.8861869949980763, "grad_norm": 4.4375, "learning_rate": 2.0681938268288236e-06, "loss": 1.2083735466003418, "step": 6128 }, { "epoch": 1.8868026163909195, "grad_norm": 4.53125, "learning_rate": 2.067462633382035e-06, "loss": 1.207137107849121, "step": 6130 }, { "epoch": 1.887418237783763, "grad_norm": 2.140625, "learning_rate": 2.066735347785779e-06, "loss": 1.2943766117095947, "step": 6132 }, { "epoch": 1.8880338591766064, "grad_norm": 7.75, "learning_rate": 2.066011970762775e-06, "loss": 1.2611300945281982, "step": 6134 }, { "epoch": 1.8886494805694498, "grad_norm": 6.59375, "learning_rate": 2.0652925030318594e-06, "loss": 0.7345150709152222, "step": 6136 }, { "epoch": 1.8892651019622932, "grad_norm": 4.375, "learning_rate": 2.064576945307983e-06, "loss": 1.110431432723999, "step": 6138 }, { "epoch": 1.8898807233551365, "grad_norm": 9.25, "learning_rate": 2.0638652983022124e-06, "loss": 1.096685767173767, "step": 6140 }, { "epoch": 1.8904963447479801, "grad_norm": 2.421875, "learning_rate": 2.0631575627217263e-06, "loss": 1.16090989112854, "step": 6142 }, { "epoch": 1.8911119661408233, "grad_norm": 35.5, "learning_rate": 2.062453739269818e-06, "loss": 0.8789357542991638, "step": 6144 }, { "epoch": 1.8917275875336668, "grad_norm": 4.90625, "learning_rate": 2.0617538286458915e-06, "loss": 1.34991455078125, "step": 6146 }, { "epoch": 1.8923432089265102, "grad_norm": 6.53125, "learning_rate": 2.061057831545465e-06, "loss": 1.434483289718628, "step": 6148 }, { "epoch": 1.8929588303193536, "grad_norm": 4.09375, "learning_rate": 2.060365748660166e-06, "loss": 1.3135393857955933, "step": 6150 }, { "epoch": 1.893574451712197, "grad_norm": 6.6875, "learning_rate": 2.059677580677733e-06, "loss": 1.2441500425338745, "step": 6152 }, { "epoch": 1.8941900731050403, "grad_norm": 8.75, "learning_rate": 2.0589933282820133e-06, "loss": 1.339614987373352, "step": 6154 }, { "epoch": 1.894805694497884, "grad_norm": 15.8125, "learning_rate": 2.0583129921529644e-06, "loss": 1.2056138515472412, "step": 6156 }, { "epoch": 1.8954213158907272, "grad_norm": 11.0, "learning_rate": 2.057636572966652e-06, "loss": 1.2353260517120361, "step": 6158 }, { "epoch": 1.8960369372835706, "grad_norm": 10.5625, "learning_rate": 2.0569640713952478e-06, "loss": 1.705105185508728, "step": 6160 }, { "epoch": 1.896652558676414, "grad_norm": 9.5, "learning_rate": 2.0562954881070313e-06, "loss": 0.937542200088501, "step": 6162 }, { "epoch": 1.8972681800692575, "grad_norm": 5.03125, "learning_rate": 2.055630823766391e-06, "loss": 1.455617904663086, "step": 6164 }, { "epoch": 1.897883801462101, "grad_norm": 10.5, "learning_rate": 2.054970079033817e-06, "loss": 1.343193769454956, "step": 6166 }, { "epoch": 1.898499422854944, "grad_norm": 5.84375, "learning_rate": 2.0543132545659065e-06, "loss": 1.2717845439910889, "step": 6168 }, { "epoch": 1.8991150442477878, "grad_norm": 5.53125, "learning_rate": 2.053660351015361e-06, "loss": 1.3608722686767578, "step": 6170 }, { "epoch": 1.899730665640631, "grad_norm": 3.875, "learning_rate": 2.0530113690309854e-06, "loss": 0.9390138983726501, "step": 6172 }, { "epoch": 1.9003462870334744, "grad_norm": 17.5, "learning_rate": 2.052366309257687e-06, "loss": 1.403334140777588, "step": 6174 }, { "epoch": 1.9009619084263178, "grad_norm": 9.5625, "learning_rate": 2.0517251723364767e-06, "loss": 1.7505857944488525, "step": 6176 }, { "epoch": 1.901577529819161, "grad_norm": 7.40625, "learning_rate": 2.0510879589044663e-06, "loss": 1.120890498161316, "step": 6178 }, { "epoch": 1.9021931512120047, "grad_norm": 9.0, "learning_rate": 2.05045466959487e-06, "loss": 1.4072320461273193, "step": 6180 }, { "epoch": 1.902808772604848, "grad_norm": 6.3125, "learning_rate": 2.049825305037e-06, "loss": 1.454264760017395, "step": 6182 }, { "epoch": 1.9034243939976914, "grad_norm": 4.1875, "learning_rate": 2.049199865856271e-06, "loss": 1.0654551982879639, "step": 6184 }, { "epoch": 1.9040400153905348, "grad_norm": 23.75, "learning_rate": 2.048578352674196e-06, "loss": 1.544224500656128, "step": 6186 }, { "epoch": 1.9046556367833782, "grad_norm": 5.15625, "learning_rate": 2.0479607661083867e-06, "loss": 1.1244772672653198, "step": 6188 }, { "epoch": 1.9052712581762217, "grad_norm": 5.96875, "learning_rate": 2.047347106772552e-06, "loss": 1.4134089946746826, "step": 6190 }, { "epoch": 1.9058868795690649, "grad_norm": 9.9375, "learning_rate": 2.0467373752764986e-06, "loss": 1.2867887020111084, "step": 6192 }, { "epoch": 1.9065025009619085, "grad_norm": 4.90625, "learning_rate": 2.046131572226132e-06, "loss": 1.3925960063934326, "step": 6194 }, { "epoch": 1.9071181223547518, "grad_norm": 8.1875, "learning_rate": 2.045529698223451e-06, "loss": 1.4497476816177368, "step": 6196 }, { "epoch": 1.9077337437475952, "grad_norm": 5.0, "learning_rate": 2.0449317538665515e-06, "loss": 1.4063258171081543, "step": 6198 }, { "epoch": 1.9083493651404386, "grad_norm": 9.8125, "learning_rate": 2.044337739749625e-06, "loss": 1.3043848276138306, "step": 6200 }, { "epoch": 1.908964986533282, "grad_norm": 4.84375, "learning_rate": 2.0437476564629553e-06, "loss": 0.6216869950294495, "step": 6202 }, { "epoch": 1.9095806079261255, "grad_norm": 8.1875, "learning_rate": 2.043161504592922e-06, "loss": 0.8327115774154663, "step": 6204 }, { "epoch": 1.9101962293189687, "grad_norm": 5.65625, "learning_rate": 2.0425792847219973e-06, "loss": 1.537729024887085, "step": 6206 }, { "epoch": 1.9108118507118124, "grad_norm": 5.5, "learning_rate": 2.042000997428747e-06, "loss": 1.1706664562225342, "step": 6208 }, { "epoch": 1.9114274721046556, "grad_norm": 3.625, "learning_rate": 2.041426643287827e-06, "loss": 1.3486610651016235, "step": 6210 }, { "epoch": 1.912043093497499, "grad_norm": 18.0, "learning_rate": 2.040856222869986e-06, "loss": 1.4879320859909058, "step": 6212 }, { "epoch": 1.9126587148903424, "grad_norm": 16.25, "learning_rate": 2.040289736742064e-06, "loss": 1.281280517578125, "step": 6214 }, { "epoch": 1.9132743362831859, "grad_norm": 2.609375, "learning_rate": 2.039727185466991e-06, "loss": 1.127787470817566, "step": 6216 }, { "epoch": 1.9138899576760293, "grad_norm": 7.4375, "learning_rate": 2.0391685696037864e-06, "loss": 1.3760292530059814, "step": 6218 }, { "epoch": 1.9145055790688725, "grad_norm": 6.25, "learning_rate": 2.03861388970756e-06, "loss": 1.2867217063903809, "step": 6220 }, { "epoch": 1.9151212004617162, "grad_norm": 8.25, "learning_rate": 2.0380631463295085e-06, "loss": 1.3626055717468262, "step": 6222 }, { "epoch": 1.9157368218545594, "grad_norm": 4.78125, "learning_rate": 2.0375163400169186e-06, "loss": 1.468051791191101, "step": 6224 }, { "epoch": 1.9163524432474028, "grad_norm": 15.25, "learning_rate": 2.036973471313164e-06, "loss": 1.2753353118896484, "step": 6226 }, { "epoch": 1.9169680646402463, "grad_norm": 7.34375, "learning_rate": 2.0364345407577057e-06, "loss": 1.407792568206787, "step": 6228 }, { "epoch": 1.9175836860330895, "grad_norm": 1.9921875, "learning_rate": 2.0358995488860912e-06, "loss": 0.9969472885131836, "step": 6230 }, { "epoch": 1.9181993074259331, "grad_norm": 4.53125, "learning_rate": 2.035368496229953e-06, "loss": 1.012083888053894, "step": 6232 }, { "epoch": 1.9188149288187764, "grad_norm": 7.125, "learning_rate": 2.0348413833170113e-06, "loss": 1.195415735244751, "step": 6234 }, { "epoch": 1.91943055021162, "grad_norm": 5.53125, "learning_rate": 2.034318210671068e-06, "loss": 1.1292438507080078, "step": 6236 }, { "epoch": 1.9200461716044632, "grad_norm": 7.53125, "learning_rate": 2.033798978812014e-06, "loss": 1.3102638721466064, "step": 6238 }, { "epoch": 1.9206617929973067, "grad_norm": 7.625, "learning_rate": 2.0332836882558202e-06, "loss": 1.4417779445648193, "step": 6240 }, { "epoch": 1.92127741439015, "grad_norm": 5.65625, "learning_rate": 2.032772339514543e-06, "loss": 1.0075815916061401, "step": 6242 }, { "epoch": 1.9218930357829933, "grad_norm": 4.46875, "learning_rate": 2.0322649330963197e-06, "loss": 1.2295527458190918, "step": 6244 }, { "epoch": 1.922508657175837, "grad_norm": 11.375, "learning_rate": 2.031761469505373e-06, "loss": 1.5699710845947266, "step": 6246 }, { "epoch": 1.9231242785686802, "grad_norm": 4.1875, "learning_rate": 2.0312619492420056e-06, "loss": 1.5786056518554688, "step": 6248 }, { "epoch": 1.9237398999615236, "grad_norm": 6.125, "learning_rate": 2.0307663728026015e-06, "loss": 1.5211533308029175, "step": 6250 }, { "epoch": 1.924355521354367, "grad_norm": 7.1875, "learning_rate": 2.0302747406796268e-06, "loss": 1.1912158727645874, "step": 6252 }, { "epoch": 1.9249711427472105, "grad_norm": 11.0625, "learning_rate": 2.0297870533616267e-06, "loss": 1.2701131105422974, "step": 6254 }, { "epoch": 1.925586764140054, "grad_norm": 4.59375, "learning_rate": 2.029303311333227e-06, "loss": 1.1772947311401367, "step": 6256 }, { "epoch": 1.9262023855328971, "grad_norm": 8.0, "learning_rate": 2.0288235150751333e-06, "loss": 1.4614884853363037, "step": 6258 }, { "epoch": 1.9268180069257408, "grad_norm": 24.5, "learning_rate": 2.028347665064131e-06, "loss": 1.0047273635864258, "step": 6260 }, { "epoch": 1.927433628318584, "grad_norm": 4.28125, "learning_rate": 2.0278757617730808e-06, "loss": 1.2402758598327637, "step": 6262 }, { "epoch": 1.9280492497114274, "grad_norm": 2.984375, "learning_rate": 2.0274078056709247e-06, "loss": 1.119405746459961, "step": 6264 }, { "epoch": 1.9286648711042709, "grad_norm": 5.59375, "learning_rate": 2.026943797222681e-06, "loss": 1.1718242168426514, "step": 6266 }, { "epoch": 1.9292804924971143, "grad_norm": 9.1875, "learning_rate": 2.0264837368894454e-06, "loss": 1.274290680885315, "step": 6268 }, { "epoch": 1.9298961138899577, "grad_norm": 5.65625, "learning_rate": 2.02602762512839e-06, "loss": 1.0971585512161255, "step": 6270 }, { "epoch": 1.930511735282801, "grad_norm": 12.0625, "learning_rate": 2.0255754623927635e-06, "loss": 1.0695799589157104, "step": 6272 }, { "epoch": 1.9311273566756446, "grad_norm": 6.78125, "learning_rate": 2.0251272491318906e-06, "loss": 1.2456305027008057, "step": 6274 }, { "epoch": 1.9317429780684878, "grad_norm": 7.71875, "learning_rate": 2.02468298579117e-06, "loss": 1.5815218687057495, "step": 6276 }, { "epoch": 1.9323585994613313, "grad_norm": 4.84375, "learning_rate": 2.0242426728120766e-06, "loss": 1.1808098554611206, "step": 6278 }, { "epoch": 1.9329742208541747, "grad_norm": 5.15625, "learning_rate": 2.0238063106321583e-06, "loss": 1.447092056274414, "step": 6280 }, { "epoch": 1.9335898422470181, "grad_norm": 4.4375, "learning_rate": 2.02337389968504e-06, "loss": 1.1663380861282349, "step": 6282 }, { "epoch": 1.9342054636398616, "grad_norm": 8.75, "learning_rate": 2.022945440400416e-06, "loss": 1.5715312957763672, "step": 6284 }, { "epoch": 1.9348210850327048, "grad_norm": 5.96875, "learning_rate": 2.0225209332040576e-06, "loss": 1.3347316980361938, "step": 6286 }, { "epoch": 1.9354367064255484, "grad_norm": 13.5625, "learning_rate": 2.022100378517806e-06, "loss": 1.386270523071289, "step": 6288 }, { "epoch": 1.9360523278183916, "grad_norm": 9.1875, "learning_rate": 2.021683776759576e-06, "loss": 1.545689582824707, "step": 6290 }, { "epoch": 1.936667949211235, "grad_norm": 4.40625, "learning_rate": 2.0212711283433544e-06, "loss": 1.096389651298523, "step": 6292 }, { "epoch": 1.9372835706040785, "grad_norm": 3.53125, "learning_rate": 2.0208624336791993e-06, "loss": 0.9402706623077393, "step": 6294 }, { "epoch": 1.9378991919969217, "grad_norm": 4.6875, "learning_rate": 2.020457693173239e-06, "loss": 1.2515463829040527, "step": 6296 }, { "epoch": 1.9385148133897654, "grad_norm": 3.5, "learning_rate": 2.0200569072276744e-06, "loss": 1.211615800857544, "step": 6298 }, { "epoch": 1.9391304347826086, "grad_norm": 5.3125, "learning_rate": 2.0196600762407745e-06, "loss": 1.3675752878189087, "step": 6300 }, { "epoch": 1.9397460561754523, "grad_norm": 6.8125, "learning_rate": 2.0192672006068795e-06, "loss": 1.6202188730239868, "step": 6302 }, { "epoch": 1.9403616775682955, "grad_norm": 4.25, "learning_rate": 2.0188782807163983e-06, "loss": 1.3375134468078613, "step": 6304 }, { "epoch": 1.940977298961139, "grad_norm": 32.75, "learning_rate": 2.0184933169558103e-06, "loss": 1.153170108795166, "step": 6306 }, { "epoch": 1.9415929203539823, "grad_norm": 4.96875, "learning_rate": 2.018112309707662e-06, "loss": 1.559049367904663, "step": 6308 }, { "epoch": 1.9422085417468256, "grad_norm": 4.28125, "learning_rate": 2.017735259350568e-06, "loss": 1.4401798248291016, "step": 6310 }, { "epoch": 1.9428241631396692, "grad_norm": 12.5625, "learning_rate": 2.0173621662592142e-06, "loss": 1.2643327713012695, "step": 6312 }, { "epoch": 1.9434397845325124, "grad_norm": 7.28125, "learning_rate": 2.0169930308043482e-06, "loss": 1.4889107942581177, "step": 6314 }, { "epoch": 1.9440554059253559, "grad_norm": 2.0, "learning_rate": 2.016627853352791e-06, "loss": 1.2440648078918457, "step": 6316 }, { "epoch": 1.9446710273181993, "grad_norm": 9.25, "learning_rate": 2.0162666342674265e-06, "loss": 1.2304495573043823, "step": 6318 }, { "epoch": 1.9452866487110427, "grad_norm": 11.6875, "learning_rate": 2.0159093739072054e-06, "loss": 1.1931729316711426, "step": 6320 }, { "epoch": 1.9459022701038862, "grad_norm": 6.65625, "learning_rate": 2.015556072627147e-06, "loss": 1.0525994300842285, "step": 6322 }, { "epoch": 1.9465178914967294, "grad_norm": 15.5625, "learning_rate": 2.0152067307783333e-06, "loss": 1.3192481994628906, "step": 6324 }, { "epoch": 1.947133512889573, "grad_norm": 2.90625, "learning_rate": 2.014861348707914e-06, "loss": 1.18320631980896, "step": 6326 }, { "epoch": 1.9477491342824163, "grad_norm": 5.90625, "learning_rate": 2.0145199267591025e-06, "loss": 1.2805728912353516, "step": 6328 }, { "epoch": 1.9483647556752597, "grad_norm": 5.4375, "learning_rate": 2.014182465271178e-06, "loss": 1.586653470993042, "step": 6330 }, { "epoch": 1.9489803770681031, "grad_norm": 8.25, "learning_rate": 2.0138489645794826e-06, "loss": 0.7887523174285889, "step": 6332 }, { "epoch": 1.9495959984609466, "grad_norm": 11.25, "learning_rate": 2.013519425015424e-06, "loss": 1.597000241279602, "step": 6334 }, { "epoch": 1.95021161985379, "grad_norm": 6.0, "learning_rate": 2.0131938469064734e-06, "loss": 0.9804918169975281, "step": 6336 }, { "epoch": 1.9508272412466332, "grad_norm": 5.625, "learning_rate": 2.0128722305761646e-06, "loss": 1.2448334693908691, "step": 6338 }, { "epoch": 1.9514428626394769, "grad_norm": 13.75, "learning_rate": 2.0125545763440953e-06, "loss": 1.201894998550415, "step": 6340 }, { "epoch": 1.95205848403232, "grad_norm": 9.9375, "learning_rate": 2.012240884525925e-06, "loss": 1.6245787143707275, "step": 6342 }, { "epoch": 1.9526741054251635, "grad_norm": 6.4375, "learning_rate": 2.0119311554333766e-06, "loss": 1.30740487575531, "step": 6344 }, { "epoch": 1.953289726818007, "grad_norm": 6.25, "learning_rate": 2.011625389374235e-06, "loss": 1.4148385524749756, "step": 6346 }, { "epoch": 1.9539053482108504, "grad_norm": 4.6875, "learning_rate": 2.011323586652347e-06, "loss": 1.0704916715621948, "step": 6348 }, { "epoch": 1.9545209696036938, "grad_norm": 3.0625, "learning_rate": 2.0110257475676203e-06, "loss": 1.1448662281036377, "step": 6350 }, { "epoch": 1.955136590996537, "grad_norm": 8.5, "learning_rate": 2.0107318724160245e-06, "loss": 1.4039653539657593, "step": 6352 }, { "epoch": 1.9557522123893807, "grad_norm": 8.6875, "learning_rate": 2.0104419614895896e-06, "loss": 0.9554769992828369, "step": 6354 }, { "epoch": 1.956367833782224, "grad_norm": 7.40625, "learning_rate": 2.0101560150764067e-06, "loss": 1.112485408782959, "step": 6356 }, { "epoch": 1.9569834551750673, "grad_norm": 3.921875, "learning_rate": 2.0098740334606277e-06, "loss": 1.2079534530639648, "step": 6358 }, { "epoch": 1.9575990765679108, "grad_norm": 6.5625, "learning_rate": 2.0095960169224635e-06, "loss": 1.0168330669403076, "step": 6360 }, { "epoch": 1.958214697960754, "grad_norm": 9.9375, "learning_rate": 2.0093219657381857e-06, "loss": 1.4579648971557617, "step": 6362 }, { "epoch": 1.9588303193535976, "grad_norm": 12.0625, "learning_rate": 2.0090518801801244e-06, "loss": 1.2450551986694336, "step": 6364 }, { "epoch": 1.9594459407464409, "grad_norm": 5.25, "learning_rate": 2.0087857605166704e-06, "loss": 1.1469169855117798, "step": 6366 }, { "epoch": 1.9600615621392843, "grad_norm": 1.8359375, "learning_rate": 2.0085236070122728e-06, "loss": 1.1382040977478027, "step": 6368 }, { "epoch": 1.9606771835321277, "grad_norm": 5.46875, "learning_rate": 2.008265419927439e-06, "loss": 1.0705983638763428, "step": 6370 }, { "epoch": 1.9612928049249712, "grad_norm": 9.25, "learning_rate": 2.0080111995187354e-06, "loss": 1.1481586694717407, "step": 6372 }, { "epoch": 1.9619084263178146, "grad_norm": 9.3125, "learning_rate": 2.0077609460387866e-06, "loss": 1.5089284181594849, "step": 6374 }, { "epoch": 1.9625240477106578, "grad_norm": 4.0, "learning_rate": 2.007514659736275e-06, "loss": 1.2592695951461792, "step": 6376 }, { "epoch": 1.9631396691035015, "grad_norm": 4.8125, "learning_rate": 2.007272340855941e-06, "loss": 1.014017105102539, "step": 6378 }, { "epoch": 1.9637552904963447, "grad_norm": 7.3125, "learning_rate": 2.0070339896385823e-06, "loss": 1.0303841829299927, "step": 6380 }, { "epoch": 1.964370911889188, "grad_norm": 11.4375, "learning_rate": 2.006799606321054e-06, "loss": 1.0867152214050293, "step": 6382 }, { "epoch": 1.9649865332820315, "grad_norm": 10.75, "learning_rate": 2.0065691911362674e-06, "loss": 1.5220026969909668, "step": 6384 }, { "epoch": 1.965602154674875, "grad_norm": 26.5, "learning_rate": 2.0063427443131915e-06, "loss": 1.254058837890625, "step": 6386 }, { "epoch": 1.9662177760677184, "grad_norm": 8.375, "learning_rate": 2.006120266076852e-06, "loss": 1.0503437519073486, "step": 6388 }, { "epoch": 1.9668333974605616, "grad_norm": 12.0, "learning_rate": 2.00590175664833e-06, "loss": 1.8246984481811523, "step": 6390 }, { "epoch": 1.9674490188534053, "grad_norm": 2.390625, "learning_rate": 2.0056872162447636e-06, "loss": 1.23115873336792, "step": 6392 }, { "epoch": 1.9680646402462485, "grad_norm": 5.9375, "learning_rate": 2.0054766450793462e-06, "loss": 1.1373382806777954, "step": 6394 }, { "epoch": 1.968680261639092, "grad_norm": 4.71875, "learning_rate": 2.0052700433613277e-06, "loss": 1.1563501358032227, "step": 6396 }, { "epoch": 1.9692958830319354, "grad_norm": 6.5625, "learning_rate": 2.005067411296011e-06, "loss": 1.4384610652923584, "step": 6398 }, { "epoch": 1.9699115044247788, "grad_norm": 10.8125, "learning_rate": 2.0048687490847585e-06, "loss": 1.4476401805877686, "step": 6400 }, { "epoch": 1.9705271258176222, "grad_norm": 12.5, "learning_rate": 2.004674056924984e-06, "loss": 1.4745994806289673, "step": 6402 }, { "epoch": 1.9711427472104655, "grad_norm": 12.4375, "learning_rate": 2.004483335010158e-06, "loss": 1.651023268699646, "step": 6404 }, { "epoch": 1.971758368603309, "grad_norm": 6.78125, "learning_rate": 2.0042965835298043e-06, "loss": 1.414797067642212, "step": 6406 }, { "epoch": 1.9723739899961523, "grad_norm": 4.25, "learning_rate": 2.0041138026695024e-06, "loss": 0.9318125247955322, "step": 6408 }, { "epoch": 1.9729896113889958, "grad_norm": 7.03125, "learning_rate": 2.0039349926108864e-06, "loss": 1.1416950225830078, "step": 6410 }, { "epoch": 1.9736052327818392, "grad_norm": 4.0625, "learning_rate": 2.003760153531643e-06, "loss": 1.0876226425170898, "step": 6412 }, { "epoch": 1.9742208541746824, "grad_norm": 11.1875, "learning_rate": 2.0035892856055144e-06, "loss": 1.3485053777694702, "step": 6414 }, { "epoch": 1.974836475567526, "grad_norm": 9.0, "learning_rate": 2.0034223890022954e-06, "loss": 1.4001414775848389, "step": 6416 }, { "epoch": 1.9754520969603693, "grad_norm": 5.375, "learning_rate": 2.003259463887835e-06, "loss": 1.4163827896118164, "step": 6418 }, { "epoch": 1.976067718353213, "grad_norm": 6.28125, "learning_rate": 2.0031005104240356e-06, "loss": 1.1511523723602295, "step": 6420 }, { "epoch": 1.9766833397460561, "grad_norm": 6.90625, "learning_rate": 2.002945528768853e-06, "loss": 1.307472825050354, "step": 6422 }, { "epoch": 1.9772989611388996, "grad_norm": 5.5, "learning_rate": 2.002794519076296e-06, "loss": 1.4973814487457275, "step": 6424 }, { "epoch": 1.977914582531743, "grad_norm": 4.53125, "learning_rate": 2.002647481496425e-06, "loss": 1.3886406421661377, "step": 6426 }, { "epoch": 1.9785302039245862, "grad_norm": 5.46875, "learning_rate": 2.002504416175357e-06, "loss": 1.3384922742843628, "step": 6428 }, { "epoch": 1.9791458253174299, "grad_norm": 13.6875, "learning_rate": 2.0023653232552565e-06, "loss": 2.076946973800659, "step": 6430 }, { "epoch": 1.979761446710273, "grad_norm": 10.6875, "learning_rate": 2.0022302028743457e-06, "loss": 1.7353794574737549, "step": 6432 }, { "epoch": 1.9803770681031165, "grad_norm": 8.25, "learning_rate": 2.002099055166895e-06, "loss": 1.3509681224822998, "step": 6434 }, { "epoch": 1.98099268949596, "grad_norm": 6.1875, "learning_rate": 2.00197188026323e-06, "loss": 1.244296669960022, "step": 6436 }, { "epoch": 1.9816083108888034, "grad_norm": 66.5, "learning_rate": 2.0018486782897257e-06, "loss": 1.6251654624938965, "step": 6438 }, { "epoch": 1.9822239322816468, "grad_norm": 4.96875, "learning_rate": 2.0017294493688128e-06, "loss": 0.9943356513977051, "step": 6440 }, { "epoch": 1.98283955367449, "grad_norm": 7.34375, "learning_rate": 2.00161419361897e-06, "loss": 0.9533267617225647, "step": 6442 }, { "epoch": 1.9834551750673337, "grad_norm": 4.59375, "learning_rate": 2.0015029111547304e-06, "loss": 1.4887453317642212, "step": 6444 }, { "epoch": 1.984070796460177, "grad_norm": 6.84375, "learning_rate": 2.0013956020866772e-06, "loss": 0.9633210897445679, "step": 6446 }, { "epoch": 1.9846864178530204, "grad_norm": 5.84375, "learning_rate": 2.001292266521446e-06, "loss": 1.1440017223358154, "step": 6448 }, { "epoch": 1.9853020392458638, "grad_norm": 4.09375, "learning_rate": 2.0011929045617252e-06, "loss": 1.0662113428115845, "step": 6450 }, { "epoch": 1.9859176606387072, "grad_norm": 7.84375, "learning_rate": 2.0010975163062508e-06, "loss": 1.2532540559768677, "step": 6452 }, { "epoch": 1.9865332820315507, "grad_norm": 19.125, "learning_rate": 2.001006101849813e-06, "loss": 0.6765220165252686, "step": 6454 }, { "epoch": 1.9871489034243939, "grad_norm": 3.3125, "learning_rate": 2.0009186612832533e-06, "loss": 1.2371511459350586, "step": 6456 }, { "epoch": 1.9877645248172375, "grad_norm": 3.5625, "learning_rate": 2.000835194693462e-06, "loss": 1.330336332321167, "step": 6458 }, { "epoch": 1.9883801462100807, "grad_norm": 11.9375, "learning_rate": 2.000755702163383e-06, "loss": 1.3997299671173096, "step": 6460 }, { "epoch": 1.9889957676029242, "grad_norm": 7.125, "learning_rate": 2.000680183772008e-06, "loss": 1.324434518814087, "step": 6462 }, { "epoch": 1.9896113889957676, "grad_norm": 7.0625, "learning_rate": 2.0006086395943834e-06, "loss": 1.0115548372268677, "step": 6464 }, { "epoch": 1.990227010388611, "grad_norm": 5.0, "learning_rate": 2.0005410697016033e-06, "loss": 1.1448062658309937, "step": 6466 }, { "epoch": 1.9908426317814545, "grad_norm": 14.5625, "learning_rate": 2.0004774741608126e-06, "loss": 0.9839100241661072, "step": 6468 }, { "epoch": 1.9914582531742977, "grad_norm": 6.40625, "learning_rate": 2.0004178530352093e-06, "loss": 1.489912986755371, "step": 6470 }, { "epoch": 1.9920738745671414, "grad_norm": 11.5625, "learning_rate": 2.000362206384039e-06, "loss": 1.4942373037338257, "step": 6472 }, { "epoch": 1.9926894959599846, "grad_norm": 7.53125, "learning_rate": 2.0003105342625993e-06, "loss": 1.1496572494506836, "step": 6474 }, { "epoch": 1.993305117352828, "grad_norm": 10.3125, "learning_rate": 2.0002628367222387e-06, "loss": 1.4847123622894287, "step": 6476 }, { "epoch": 1.9939207387456714, "grad_norm": 8.0625, "learning_rate": 2.0002191138103544e-06, "loss": 1.662402629852295, "step": 6478 }, { "epoch": 1.9945363601385147, "grad_norm": 11.3125, "learning_rate": 2.000179365570395e-06, "loss": 1.5024702548980713, "step": 6480 }, { "epoch": 1.9951519815313583, "grad_norm": 14.125, "learning_rate": 2.000143592041859e-06, "loss": 0.7729127407073975, "step": 6482 }, { "epoch": 1.9957676029242015, "grad_norm": 2.984375, "learning_rate": 2.0001117932602966e-06, "loss": 0.9017384648323059, "step": 6484 }, { "epoch": 1.9963832243170452, "grad_norm": 26.125, "learning_rate": 2.0000839692573048e-06, "loss": 1.2910041809082031, "step": 6486 }, { "epoch": 1.9969988457098884, "grad_norm": 1.8671875, "learning_rate": 2.000060120060535e-06, "loss": 0.4969649910926819, "step": 6488 }, { "epoch": 1.9976144671027318, "grad_norm": 8.5625, "learning_rate": 2.0000402456936858e-06, "loss": 1.0192922353744507, "step": 6490 }, { "epoch": 1.9982300884955753, "grad_norm": 15.6875, "learning_rate": 2.0000243461765068e-06, "loss": 1.1761404275894165, "step": 6492 }, { "epoch": 1.9988457098884185, "grad_norm": 85.0, "learning_rate": 2.000012421524798e-06, "loss": 1.2855005264282227, "step": 6494 }, { "epoch": 1.9994613312812621, "grad_norm": 7.09375, "learning_rate": 2.0000044717504087e-06, "loss": 1.3793665170669556, "step": 6496 }, { "epoch": 2.0, "grad_norm": 19.125, "learning_rate": 2.000000496861239e-06, "loss": 1.2610743045806885, "step": 6498 }, { "epoch": 2.0, "step": 6498, "total_flos": 2.5760029558366536e+18, "train_loss": 1.2936220749611853, "train_runtime": 22787.4373, "train_samples_per_second": 1.141, "train_steps_per_second": 0.285 } ], "logging_steps": 2, "max_steps": 6498, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5760029558366536e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }