diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22786 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 6498, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006156213928434013, + "grad_norm": 3.09375, + "learning_rate": 2.5641025641025643e-08, + "loss": 1.2494438886642456, + "step": 2 + }, + { + "epoch": 0.0012312427856868025, + "grad_norm": 3.75, + "learning_rate": 7.692307692307694e-08, + "loss": 1.9450684785842896, + "step": 4 + }, + { + "epoch": 0.001846864178530204, + "grad_norm": 4.40625, + "learning_rate": 1.282051282051282e-07, + "loss": 1.6189125776290894, + "step": 6 + }, + { + "epoch": 0.002462485571373605, + "grad_norm": 5.8125, + "learning_rate": 1.7948717948717948e-07, + "loss": 1.9541618824005127, + "step": 8 + }, + { + "epoch": 0.0030781069642170067, + "grad_norm": 7.40625, + "learning_rate": 2.307692307692308e-07, + "loss": 2.2807464599609375, + "step": 10 + }, + { + "epoch": 0.003693728357060408, + "grad_norm": 6.3125, + "learning_rate": 2.820512820512821e-07, + "loss": 1.4778738021850586, + "step": 12 + }, + { + "epoch": 0.004309349749903809, + "grad_norm": 22.0, + "learning_rate": 3.3333333333333335e-07, + "loss": 2.5894782543182373, + "step": 14 + }, + { + "epoch": 0.00492497114274721, + "grad_norm": 6.78125, + "learning_rate": 3.846153846153847e-07, + "loss": 1.8322064876556396, + "step": 16 + }, + { + "epoch": 0.005540592535590611, + "grad_norm": 25.0, + "learning_rate": 4.358974358974359e-07, + "loss": 1.6440393924713135, + "step": 18 + }, + { + "epoch": 0.0061562139284340135, + "grad_norm": 6.0625, + "learning_rate": 4.871794871794872e-07, + "loss": 1.502462387084961, + "step": 20 + }, + { + "epoch": 0.006771835321277415, + "grad_norm": 5.5, + "learning_rate": 5.384615384615386e-07, + "loss": 1.8018434047698975, + "step": 22 + }, + { + "epoch": 0.007387456714120816, + "grad_norm": 4.03125, + "learning_rate": 5.897435897435898e-07, + "loss": 1.8351409435272217, + "step": 24 + }, + { + "epoch": 0.008003078106964217, + "grad_norm": 4.65625, + "learning_rate": 6.41025641025641e-07, + "loss": 1.6938329935073853, + "step": 26 + }, + { + "epoch": 0.008618699499807618, + "grad_norm": 5.125, + "learning_rate": 6.923076923076924e-07, + "loss": 1.592248558998108, + "step": 28 + }, + { + "epoch": 0.00923432089265102, + "grad_norm": 3.796875, + "learning_rate": 7.435897435897436e-07, + "loss": 1.804097294807434, + "step": 30 + }, + { + "epoch": 0.00984994228549442, + "grad_norm": 6.65625, + "learning_rate": 7.948717948717949e-07, + "loss": 1.8579797744750977, + "step": 32 + }, + { + "epoch": 0.010465563678337822, + "grad_norm": 4.1875, + "learning_rate": 8.461538461538463e-07, + "loss": 2.209737777709961, + "step": 34 + }, + { + "epoch": 0.011081185071181223, + "grad_norm": 4.875, + "learning_rate": 8.974358974358975e-07, + "loss": 1.4839755296707153, + "step": 36 + }, + { + "epoch": 0.011696806464024625, + "grad_norm": 12.6875, + "learning_rate": 9.487179487179487e-07, + "loss": 1.8842666149139404, + "step": 38 + }, + { + "epoch": 0.012312427856868027, + "grad_norm": 5.5, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.1992511749267578, + "step": 40 + }, + { + "epoch": 0.012928049249711427, + "grad_norm": 8.3125, + "learning_rate": 1.0512820512820514e-06, + "loss": 1.2397454977035522, + "step": 42 + }, + { + "epoch": 0.01354367064255483, + "grad_norm": 15.25, + "learning_rate": 1.1025641025641026e-06, + "loss": 2.012077569961548, + "step": 44 + }, + { + "epoch": 0.01415929203539823, + "grad_norm": 3.0, + "learning_rate": 1.153846153846154e-06, + "loss": 1.4503027200698853, + "step": 46 + }, + { + "epoch": 0.014774913428241632, + "grad_norm": 9.125, + "learning_rate": 1.2051282051282053e-06, + "loss": 1.7287070751190186, + "step": 48 + }, + { + "epoch": 0.015390534821085032, + "grad_norm": 4.59375, + "learning_rate": 1.2564102564102565e-06, + "loss": 1.3663043975830078, + "step": 50 + }, + { + "epoch": 0.016006156213928435, + "grad_norm": 6.25, + "learning_rate": 1.307692307692308e-06, + "loss": 1.6556223630905151, + "step": 52 + }, + { + "epoch": 0.016621777606771835, + "grad_norm": 5.71875, + "learning_rate": 1.358974358974359e-06, + "loss": 1.983393907546997, + "step": 54 + }, + { + "epoch": 0.017237398999615235, + "grad_norm": 9.75, + "learning_rate": 1.4102564102564104e-06, + "loss": 1.5182173252105713, + "step": 56 + }, + { + "epoch": 0.01785302039245864, + "grad_norm": 4.8125, + "learning_rate": 1.4615384615384618e-06, + "loss": 1.9359819889068604, + "step": 58 + }, + { + "epoch": 0.01846864178530204, + "grad_norm": 3.453125, + "learning_rate": 1.5128205128205128e-06, + "loss": 1.8004614114761353, + "step": 60 + }, + { + "epoch": 0.01908426317814544, + "grad_norm": 5.78125, + "learning_rate": 1.5641025641025642e-06, + "loss": 1.513994812965393, + "step": 62 + }, + { + "epoch": 0.01969988457098884, + "grad_norm": 2.71875, + "learning_rate": 1.6153846153846157e-06, + "loss": 1.17235267162323, + "step": 64 + }, + { + "epoch": 0.020315505963832244, + "grad_norm": 13.4375, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.682073712348938, + "step": 66 + }, + { + "epoch": 0.020931127356675645, + "grad_norm": 2.640625, + "learning_rate": 1.717948717948718e-06, + "loss": 1.2705780267715454, + "step": 68 + }, + { + "epoch": 0.021546748749519045, + "grad_norm": 6.3125, + "learning_rate": 1.7692307692307695e-06, + "loss": 2.2972326278686523, + "step": 70 + }, + { + "epoch": 0.022162370142362445, + "grad_norm": 10.6875, + "learning_rate": 1.8205128205128205e-06, + "loss": 1.8854734897613525, + "step": 72 + }, + { + "epoch": 0.02277799153520585, + "grad_norm": 5.71875, + "learning_rate": 1.871794871794872e-06, + "loss": 1.6820818185806274, + "step": 74 + }, + { + "epoch": 0.02339361292804925, + "grad_norm": 11.0, + "learning_rate": 1.9230769230769234e-06, + "loss": 2.026367664337158, + "step": 76 + }, + { + "epoch": 0.02400923432089265, + "grad_norm": 184.0, + "learning_rate": 1.9743589743589744e-06, + "loss": 1.9582500457763672, + "step": 78 + }, + { + "epoch": 0.024624855713736054, + "grad_norm": 2.375, + "learning_rate": 2.025641025641026e-06, + "loss": 1.233195185661316, + "step": 80 + }, + { + "epoch": 0.025240477106579454, + "grad_norm": 15.875, + "learning_rate": 2.0769230769230773e-06, + "loss": 1.5411051511764526, + "step": 82 + }, + { + "epoch": 0.025856098499422855, + "grad_norm": 5.96875, + "learning_rate": 2.1282051282051283e-06, + "loss": 1.4217170476913452, + "step": 84 + }, + { + "epoch": 0.026471719892266255, + "grad_norm": 5.03125, + "learning_rate": 2.1794871794871797e-06, + "loss": 1.4688079357147217, + "step": 86 + }, + { + "epoch": 0.02708734128510966, + "grad_norm": 7.875, + "learning_rate": 2.230769230769231e-06, + "loss": 1.6037238836288452, + "step": 88 + }, + { + "epoch": 0.02770296267795306, + "grad_norm": 13.9375, + "learning_rate": 2.282051282051282e-06, + "loss": 2.0153872966766357, + "step": 90 + }, + { + "epoch": 0.02831858407079646, + "grad_norm": 4.96875, + "learning_rate": 2.3333333333333336e-06, + "loss": 1.4056789875030518, + "step": 92 + }, + { + "epoch": 0.02893420546363986, + "grad_norm": 5.0625, + "learning_rate": 2.384615384615385e-06, + "loss": 1.8487026691436768, + "step": 94 + }, + { + "epoch": 0.029549826856483264, + "grad_norm": 2.703125, + "learning_rate": 2.435897435897436e-06, + "loss": 1.5239802598953247, + "step": 96 + }, + { + "epoch": 0.030165448249326664, + "grad_norm": 6.3125, + "learning_rate": 2.4871794871794875e-06, + "loss": 1.6807596683502197, + "step": 98 + }, + { + "epoch": 0.030781069642170065, + "grad_norm": 12.9375, + "learning_rate": 2.5384615384615385e-06, + "loss": 1.1042921543121338, + "step": 100 + }, + { + "epoch": 0.03139669103501347, + "grad_norm": 7.34375, + "learning_rate": 2.5897435897435903e-06, + "loss": 1.5535751581192017, + "step": 102 + }, + { + "epoch": 0.03201231242785687, + "grad_norm": 6.28125, + "learning_rate": 2.6410256410256413e-06, + "loss": 1.560031771659851, + "step": 104 + }, + { + "epoch": 0.03262793382070027, + "grad_norm": 8.6875, + "learning_rate": 2.6923076923076923e-06, + "loss": 1.714495301246643, + "step": 106 + }, + { + "epoch": 0.03324355521354367, + "grad_norm": 2.890625, + "learning_rate": 2.743589743589744e-06, + "loss": 1.4674392938613892, + "step": 108 + }, + { + "epoch": 0.03385917660638707, + "grad_norm": 2.859375, + "learning_rate": 2.794871794871795e-06, + "loss": 1.6619113683700562, + "step": 110 + }, + { + "epoch": 0.03447479799923047, + "grad_norm": 21.625, + "learning_rate": 2.846153846153846e-06, + "loss": 1.9322985410690308, + "step": 112 + }, + { + "epoch": 0.03509041939207388, + "grad_norm": 8.5625, + "learning_rate": 2.897435897435898e-06, + "loss": 1.994782567024231, + "step": 114 + }, + { + "epoch": 0.03570604078491728, + "grad_norm": 2.8125, + "learning_rate": 2.948717948717949e-06, + "loss": 1.4228806495666504, + "step": 116 + }, + { + "epoch": 0.03632166217776068, + "grad_norm": 14.0, + "learning_rate": 3e-06, + "loss": 1.8283569812774658, + "step": 118 + }, + { + "epoch": 0.03693728357060408, + "grad_norm": 5.0, + "learning_rate": 3.051282051282052e-06, + "loss": 1.550577163696289, + "step": 120 + }, + { + "epoch": 0.03755290496344748, + "grad_norm": 4.4375, + "learning_rate": 3.102564102564103e-06, + "loss": 1.4163514375686646, + "step": 122 + }, + { + "epoch": 0.03816852635629088, + "grad_norm": 2.4375, + "learning_rate": 3.153846153846154e-06, + "loss": 1.0851366519927979, + "step": 124 + }, + { + "epoch": 0.03878414774913428, + "grad_norm": 7.96875, + "learning_rate": 3.205128205128206e-06, + "loss": 1.3987257480621338, + "step": 126 + }, + { + "epoch": 0.03939976914197768, + "grad_norm": 10.75, + "learning_rate": 3.256410256410257e-06, + "loss": 1.721047282218933, + "step": 128 + }, + { + "epoch": 0.04001539053482109, + "grad_norm": 5.375, + "learning_rate": 3.307692307692308e-06, + "loss": 1.260140299797058, + "step": 130 + }, + { + "epoch": 0.04063101192766449, + "grad_norm": 5.78125, + "learning_rate": 3.358974358974359e-06, + "loss": 1.6700891256332397, + "step": 132 + }, + { + "epoch": 0.04124663332050789, + "grad_norm": 5.375, + "learning_rate": 3.4102564102564107e-06, + "loss": 1.6653907299041748, + "step": 134 + }, + { + "epoch": 0.04186225471335129, + "grad_norm": 19.5, + "learning_rate": 3.4615384615384617e-06, + "loss": 0.9548298120498657, + "step": 136 + }, + { + "epoch": 0.04247787610619469, + "grad_norm": 18.75, + "learning_rate": 3.5128205128205127e-06, + "loss": 1.584119439125061, + "step": 138 + }, + { + "epoch": 0.04309349749903809, + "grad_norm": 9.8125, + "learning_rate": 3.5641025641025646e-06, + "loss": 1.7926617860794067, + "step": 140 + }, + { + "epoch": 0.04370911889188149, + "grad_norm": 6.03125, + "learning_rate": 3.6153846153846156e-06, + "loss": 1.367248296737671, + "step": 142 + }, + { + "epoch": 0.04432474028472489, + "grad_norm": 2.375, + "learning_rate": 3.6666666666666666e-06, + "loss": 1.4224942922592163, + "step": 144 + }, + { + "epoch": 0.0449403616775683, + "grad_norm": 5.59375, + "learning_rate": 3.7179487179487184e-06, + "loss": 1.492388367652893, + "step": 146 + }, + { + "epoch": 0.0455559830704117, + "grad_norm": 11.125, + "learning_rate": 3.7692307692307694e-06, + "loss": 1.9079093933105469, + "step": 148 + }, + { + "epoch": 0.0461716044632551, + "grad_norm": 7.03125, + "learning_rate": 3.8205128205128204e-06, + "loss": 1.3468117713928223, + "step": 150 + }, + { + "epoch": 0.0467872258560985, + "grad_norm": 12.8125, + "learning_rate": 3.871794871794872e-06, + "loss": 1.4502298831939697, + "step": 152 + }, + { + "epoch": 0.0474028472489419, + "grad_norm": 9.6875, + "learning_rate": 3.923076923076923e-06, + "loss": 1.6394906044006348, + "step": 154 + }, + { + "epoch": 0.0480184686417853, + "grad_norm": 5.9375, + "learning_rate": 3.974358974358974e-06, + "loss": 1.235146164894104, + "step": 156 + }, + { + "epoch": 0.0486340900346287, + "grad_norm": 3.546875, + "learning_rate": 4.025641025641026e-06, + "loss": 1.3920354843139648, + "step": 158 + }, + { + "epoch": 0.04924971142747211, + "grad_norm": 9.1875, + "learning_rate": 4.076923076923077e-06, + "loss": 1.631940245628357, + "step": 160 + }, + { + "epoch": 0.04986533282031551, + "grad_norm": 2.96875, + "learning_rate": 4.128205128205128e-06, + "loss": 1.311639666557312, + "step": 162 + }, + { + "epoch": 0.05048095421315891, + "grad_norm": 5.375, + "learning_rate": 4.17948717948718e-06, + "loss": 1.2662135362625122, + "step": 164 + }, + { + "epoch": 0.05109657560600231, + "grad_norm": 13.75, + "learning_rate": 4.230769230769231e-06, + "loss": 1.3407542705535889, + "step": 166 + }, + { + "epoch": 0.05171219699884571, + "grad_norm": 3.296875, + "learning_rate": 4.282051282051282e-06, + "loss": 1.7766584157943726, + "step": 168 + }, + { + "epoch": 0.05232781839168911, + "grad_norm": 4.03125, + "learning_rate": 4.333333333333334e-06, + "loss": 1.4202091693878174, + "step": 170 + }, + { + "epoch": 0.05294343978453251, + "grad_norm": 11.375, + "learning_rate": 4.384615384615385e-06, + "loss": 1.832815170288086, + "step": 172 + }, + { + "epoch": 0.05355906117737591, + "grad_norm": 6.65625, + "learning_rate": 4.435897435897436e-06, + "loss": 1.2218002080917358, + "step": 174 + }, + { + "epoch": 0.05417468257021932, + "grad_norm": 6.65625, + "learning_rate": 4.487179487179488e-06, + "loss": 1.5034165382385254, + "step": 176 + }, + { + "epoch": 0.05479030396306272, + "grad_norm": 9.75, + "learning_rate": 4.538461538461539e-06, + "loss": 1.6898847818374634, + "step": 178 + }, + { + "epoch": 0.05540592535590612, + "grad_norm": 15.3125, + "learning_rate": 4.58974358974359e-06, + "loss": 1.3020204305648804, + "step": 180 + }, + { + "epoch": 0.05602154674874952, + "grad_norm": 7.4375, + "learning_rate": 4.641025641025642e-06, + "loss": 1.4048819541931152, + "step": 182 + }, + { + "epoch": 0.05663716814159292, + "grad_norm": 4.59375, + "learning_rate": 4.692307692307693e-06, + "loss": 0.9551922082901001, + "step": 184 + }, + { + "epoch": 0.05725278953443632, + "grad_norm": 5.59375, + "learning_rate": 4.743589743589744e-06, + "loss": 1.7097375392913818, + "step": 186 + }, + { + "epoch": 0.05786841092727972, + "grad_norm": 11.8125, + "learning_rate": 4.7948717948717955e-06, + "loss": 1.3811229467391968, + "step": 188 + }, + { + "epoch": 0.05848403232012313, + "grad_norm": 6.46875, + "learning_rate": 4.8461538461538465e-06, + "loss": 1.6789026260375977, + "step": 190 + }, + { + "epoch": 0.05909965371296653, + "grad_norm": 4.3125, + "learning_rate": 4.8974358974358975e-06, + "loss": 1.3382556438446045, + "step": 192 + }, + { + "epoch": 0.05971527510580993, + "grad_norm": 1.9453125, + "learning_rate": 4.948717948717949e-06, + "loss": 1.215793251991272, + "step": 194 + }, + { + "epoch": 0.06033089649865333, + "grad_norm": 4.90625, + "learning_rate": 5e-06, + "loss": 1.4898529052734375, + "step": 196 + }, + { + "epoch": 0.06094651789149673, + "grad_norm": 3.78125, + "learning_rate": 4.999999006277585e-06, + "loss": 1.278192162513733, + "step": 198 + }, + { + "epoch": 0.06156213928434013, + "grad_norm": 2.9375, + "learning_rate": 4.9999960251113246e-06, + "loss": 1.3816601037979126, + "step": 200 + }, + { + "epoch": 0.06217776067718353, + "grad_norm": 8.625, + "learning_rate": 4.999991056504183e-06, + "loss": 1.6115779876708984, + "step": 202 + }, + { + "epoch": 0.06279338207002694, + "grad_norm": 5.46875, + "learning_rate": 4.9999841004610975e-06, + "loss": 1.047288417816162, + "step": 204 + }, + { + "epoch": 0.06340900346287033, + "grad_norm": 5.8125, + "learning_rate": 4.999975156988978e-06, + "loss": 1.6929690837860107, + "step": 206 + }, + { + "epoch": 0.06402462485571374, + "grad_norm": 9.625, + "learning_rate": 4.999964226096716e-06, + "loss": 1.6581546068191528, + "step": 208 + }, + { + "epoch": 0.06464024624855713, + "grad_norm": 6.96875, + "learning_rate": 4.999951307795171e-06, + "loss": 1.750710129737854, + "step": 210 + }, + { + "epoch": 0.06525586764140054, + "grad_norm": 8.5, + "learning_rate": 4.999936402097182e-06, + "loss": 1.7493253946304321, + "step": 212 + }, + { + "epoch": 0.06587148903424395, + "grad_norm": 10.6875, + "learning_rate": 4.999919509017559e-06, + "loss": 1.6592433452606201, + "step": 214 + }, + { + "epoch": 0.06648711042708734, + "grad_norm": 10.875, + "learning_rate": 4.99990062857309e-06, + "loss": 1.333007574081421, + "step": 216 + }, + { + "epoch": 0.06710273181993075, + "grad_norm": 7.84375, + "learning_rate": 4.999879760782537e-06, + "loss": 1.5750339031219482, + "step": 218 + }, + { + "epoch": 0.06771835321277414, + "grad_norm": 4.0625, + "learning_rate": 4.999856905666636e-06, + "loss": 1.4182538986206055, + "step": 220 + }, + { + "epoch": 0.06833397460561755, + "grad_norm": 5.3125, + "learning_rate": 4.9998320632481e-06, + "loss": 1.384655237197876, + "step": 222 + }, + { + "epoch": 0.06894959599846094, + "grad_norm": 3.125, + "learning_rate": 4.999805233551616e-06, + "loss": 1.140051007270813, + "step": 224 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 2.109375, + "learning_rate": 4.999776416603842e-06, + "loss": 1.4289488792419434, + "step": 226 + }, + { + "epoch": 0.07018083878414776, + "grad_norm": 10.0, + "learning_rate": 4.999745612433418e-06, + "loss": 1.4386173486709595, + "step": 228 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 18.5, + "learning_rate": 4.999712821070951e-06, + "loss": 1.656036615371704, + "step": 230 + }, + { + "epoch": 0.07141208156983456, + "grad_norm": 12.5625, + "learning_rate": 4.99967804254903e-06, + "loss": 1.302478313446045, + "step": 232 + }, + { + "epoch": 0.07202770296267795, + "grad_norm": 7.09375, + "learning_rate": 4.999641276902213e-06, + "loss": 1.3145049810409546, + "step": 234 + }, + { + "epoch": 0.07264332435552136, + "grad_norm": 3.734375, + "learning_rate": 4.999602524167036e-06, + "loss": 1.347992181777954, + "step": 236 + }, + { + "epoch": 0.07325894574836475, + "grad_norm": 4.5, + "learning_rate": 4.999561784382009e-06, + "loss": 1.5541445016860962, + "step": 238 + }, + { + "epoch": 0.07387456714120816, + "grad_norm": 6.1875, + "learning_rate": 4.999519057587613e-06, + "loss": 1.3125475645065308, + "step": 240 + }, + { + "epoch": 0.07449018853405155, + "grad_norm": 3.265625, + "learning_rate": 4.999474343826309e-06, + "loss": 1.1761116981506348, + "step": 242 + }, + { + "epoch": 0.07510580992689496, + "grad_norm": 4.5625, + "learning_rate": 4.999427643142531e-06, + "loss": 1.3364779949188232, + "step": 244 + }, + { + "epoch": 0.07572143131973837, + "grad_norm": 6.5625, + "learning_rate": 4.999378955582684e-06, + "loss": 1.5807875394821167, + "step": 246 + }, + { + "epoch": 0.07633705271258176, + "grad_norm": 5.40625, + "learning_rate": 4.9993282811951514e-06, + "loss": 1.500673532485962, + "step": 248 + }, + { + "epoch": 0.07695267410542517, + "grad_norm": 3.765625, + "learning_rate": 4.99927562003029e-06, + "loss": 1.0658483505249023, + "step": 250 + }, + { + "epoch": 0.07756829549826856, + "grad_norm": 8.125, + "learning_rate": 4.999220972140427e-06, + "loss": 1.478998064994812, + "step": 252 + }, + { + "epoch": 0.07818391689111197, + "grad_norm": 4.84375, + "learning_rate": 4.999164337579873e-06, + "loss": 1.6777269840240479, + "step": 254 + }, + { + "epoch": 0.07879953828395536, + "grad_norm": 9.0625, + "learning_rate": 4.999105716404901e-06, + "loss": 1.8494199514389038, + "step": 256 + }, + { + "epoch": 0.07941515967679877, + "grad_norm": 4.875, + "learning_rate": 4.999045108673769e-06, + "loss": 1.463538646697998, + "step": 258 + }, + { + "epoch": 0.08003078106964218, + "grad_norm": 1.890625, + "learning_rate": 4.998982514446702e-06, + "loss": 1.5650489330291748, + "step": 260 + }, + { + "epoch": 0.08064640246248557, + "grad_norm": 10.0, + "learning_rate": 4.9989179337859e-06, + "loss": 1.5145618915557861, + "step": 262 + }, + { + "epoch": 0.08126202385532898, + "grad_norm": 6.78125, + "learning_rate": 4.998851366755541e-06, + "loss": 1.5924221277236938, + "step": 264 + }, + { + "epoch": 0.08187764524817237, + "grad_norm": 4.75, + "learning_rate": 4.998782813421773e-06, + "loss": 1.2749375104904175, + "step": 266 + }, + { + "epoch": 0.08249326664101578, + "grad_norm": 5.0625, + "learning_rate": 4.998712273852719e-06, + "loss": 1.400267243385315, + "step": 268 + }, + { + "epoch": 0.08310888803385917, + "grad_norm": 2.109375, + "learning_rate": 4.998639748118476e-06, + "loss": 1.3081163167953491, + "step": 270 + }, + { + "epoch": 0.08372450942670258, + "grad_norm": 5.25, + "learning_rate": 4.998565236291114e-06, + "loss": 1.604859709739685, + "step": 272 + }, + { + "epoch": 0.08434013081954599, + "grad_norm": 5.03125, + "learning_rate": 4.9984887384446755e-06, + "loss": 1.1373594999313354, + "step": 274 + }, + { + "epoch": 0.08495575221238938, + "grad_norm": 2.59375, + "learning_rate": 4.998410254655181e-06, + "loss": 1.4989488124847412, + "step": 276 + }, + { + "epoch": 0.08557137360523279, + "grad_norm": 4.84375, + "learning_rate": 4.998329785000621e-06, + "loss": 1.4071130752563477, + "step": 278 + }, + { + "epoch": 0.08618699499807618, + "grad_norm": 14.0, + "learning_rate": 4.998247329560959e-06, + "loss": 1.5088939666748047, + "step": 280 + }, + { + "epoch": 0.08680261639091959, + "grad_norm": 6.34375, + "learning_rate": 4.9981628884181335e-06, + "loss": 1.5704026222229004, + "step": 282 + }, + { + "epoch": 0.08741823778376298, + "grad_norm": 17.625, + "learning_rate": 4.9980764616560555e-06, + "loss": 1.4247150421142578, + "step": 284 + }, + { + "epoch": 0.08803385917660639, + "grad_norm": 25.5, + "learning_rate": 4.997988049360608e-06, + "loss": 2.021291494369507, + "step": 286 + }, + { + "epoch": 0.08864948056944978, + "grad_norm": 12.8125, + "learning_rate": 4.99789765161965e-06, + "loss": 1.0340229272842407, + "step": 288 + }, + { + "epoch": 0.08926510196229319, + "grad_norm": 14.4375, + "learning_rate": 4.9978052685230105e-06, + "loss": 1.5912001132965088, + "step": 290 + }, + { + "epoch": 0.0898807233551366, + "grad_norm": 5.75, + "learning_rate": 4.997710900162494e-06, + "loss": 1.0831013917922974, + "step": 292 + }, + { + "epoch": 0.09049634474797999, + "grad_norm": 10.75, + "learning_rate": 4.997614546631875e-06, + "loss": 1.4437825679779053, + "step": 294 + }, + { + "epoch": 0.0911119661408234, + "grad_norm": 5.40625, + "learning_rate": 4.997516208026902e-06, + "loss": 1.5366003513336182, + "step": 296 + }, + { + "epoch": 0.09172758753366679, + "grad_norm": 40.75, + "learning_rate": 4.997415884445299e-06, + "loss": 1.1476376056671143, + "step": 298 + }, + { + "epoch": 0.0923432089265102, + "grad_norm": 8.625, + "learning_rate": 4.997313575986756e-06, + "loss": 1.4211304187774658, + "step": 300 + }, + { + "epoch": 0.09295883031935359, + "grad_norm": 6.84375, + "learning_rate": 4.997209282752943e-06, + "loss": 1.0276066064834595, + "step": 302 + }, + { + "epoch": 0.093574451712197, + "grad_norm": 18.375, + "learning_rate": 4.997103004847496e-06, + "loss": 1.928350806236267, + "step": 304 + }, + { + "epoch": 0.0941900731050404, + "grad_norm": 5.90625, + "learning_rate": 4.996994742376025e-06, + "loss": 1.7541956901550293, + "step": 306 + }, + { + "epoch": 0.0948056944978838, + "grad_norm": 2.28125, + "learning_rate": 4.996884495446116e-06, + "loss": 0.8440700769424438, + "step": 308 + }, + { + "epoch": 0.0954213158907272, + "grad_norm": 18.375, + "learning_rate": 4.996772264167321e-06, + "loss": 2.0790932178497314, + "step": 310 + }, + { + "epoch": 0.0960369372835706, + "grad_norm": 9.1875, + "learning_rate": 4.996658048651169e-06, + "loss": 1.836275577545166, + "step": 312 + }, + { + "epoch": 0.09665255867641401, + "grad_norm": 7.0625, + "learning_rate": 4.996541849011156e-06, + "loss": 1.8032808303833008, + "step": 314 + }, + { + "epoch": 0.0972681800692574, + "grad_norm": 5.5625, + "learning_rate": 4.996423665362754e-06, + "loss": 0.6839714050292969, + "step": 316 + }, + { + "epoch": 0.09788380146210081, + "grad_norm": 3.21875, + "learning_rate": 4.9963034978234035e-06, + "loss": 1.235926866531372, + "step": 318 + }, + { + "epoch": 0.09849942285494422, + "grad_norm": 4.25, + "learning_rate": 4.99618134651252e-06, + "loss": 0.9765989184379578, + "step": 320 + }, + { + "epoch": 0.09911504424778761, + "grad_norm": 2.515625, + "learning_rate": 4.996057211551485e-06, + "loss": 1.3052048683166504, + "step": 322 + }, + { + "epoch": 0.09973066564063102, + "grad_norm": 258.0, + "learning_rate": 4.995931093063656e-06, + "loss": 1.5037379264831543, + "step": 324 + }, + { + "epoch": 0.10034628703347441, + "grad_norm": 6.375, + "learning_rate": 4.99580299117436e-06, + "loss": 1.4417295455932617, + "step": 326 + }, + { + "epoch": 0.10096190842631782, + "grad_norm": 14.5625, + "learning_rate": 4.995672906010893e-06, + "loss": 1.6018487215042114, + "step": 328 + }, + { + "epoch": 0.10157752981916121, + "grad_norm": 4.96875, + "learning_rate": 4.9955408377025245e-06, + "loss": 1.2220993041992188, + "step": 330 + }, + { + "epoch": 0.10219315121200462, + "grad_norm": 5.125, + "learning_rate": 4.995406786380496e-06, + "loss": 1.2403991222381592, + "step": 332 + }, + { + "epoch": 0.10280877260484801, + "grad_norm": 7.5, + "learning_rate": 4.995270752178013e-06, + "loss": 1.5319416522979736, + "step": 334 + }, + { + "epoch": 0.10342439399769142, + "grad_norm": 2.3125, + "learning_rate": 4.995132735230258e-06, + "loss": 1.2290902137756348, + "step": 336 + }, + { + "epoch": 0.10404001539053483, + "grad_norm": 9.75, + "learning_rate": 4.994992735674382e-06, + "loss": 1.1177040338516235, + "step": 338 + }, + { + "epoch": 0.10465563678337822, + "grad_norm": 12.625, + "learning_rate": 4.994850753649506e-06, + "loss": 1.6134198904037476, + "step": 340 + }, + { + "epoch": 0.10527125817622163, + "grad_norm": 29.125, + "learning_rate": 4.99470678929672e-06, + "loss": 1.218366026878357, + "step": 342 + }, + { + "epoch": 0.10588687956906502, + "grad_norm": 7.375, + "learning_rate": 4.9945608427590834e-06, + "loss": 1.4785023927688599, + "step": 344 + }, + { + "epoch": 0.10650250096190843, + "grad_norm": 2.890625, + "learning_rate": 4.994412914181627e-06, + "loss": 1.3170098066329956, + "step": 346 + }, + { + "epoch": 0.10711812235475182, + "grad_norm": 5.4375, + "learning_rate": 4.994263003711351e-06, + "loss": 1.3708548545837402, + "step": 348 + }, + { + "epoch": 0.10773374374759523, + "grad_norm": 6.21875, + "learning_rate": 4.994111111497227e-06, + "loss": 0.997755229473114, + "step": 350 + }, + { + "epoch": 0.10834936514043864, + "grad_norm": 1.6953125, + "learning_rate": 4.993957237690191e-06, + "loss": 1.177932858467102, + "step": 352 + }, + { + "epoch": 0.10896498653328203, + "grad_norm": 8.625, + "learning_rate": 4.993801382443152e-06, + "loss": 1.588719367980957, + "step": 354 + }, + { + "epoch": 0.10958060792612544, + "grad_norm": 6.0625, + "learning_rate": 4.993643545910986e-06, + "loss": 1.4122413396835327, + "step": 356 + }, + { + "epoch": 0.11019622931896883, + "grad_norm": 5.53125, + "learning_rate": 4.99348372825054e-06, + "loss": 1.3857849836349487, + "step": 358 + }, + { + "epoch": 0.11081185071181224, + "grad_norm": 5.4375, + "learning_rate": 4.993321929620627e-06, + "loss": 1.5077461004257202, + "step": 360 + }, + { + "epoch": 0.11142747210465563, + "grad_norm": 8.875, + "learning_rate": 4.9931581501820315e-06, + "loss": 1.5170936584472656, + "step": 362 + }, + { + "epoch": 0.11204309349749904, + "grad_norm": 4.78125, + "learning_rate": 4.992992390097503e-06, + "loss": 1.5608993768692017, + "step": 364 + }, + { + "epoch": 0.11265871489034245, + "grad_norm": 11.25, + "learning_rate": 4.992824649531762e-06, + "loss": 0.5916320085525513, + "step": 366 + }, + { + "epoch": 0.11327433628318584, + "grad_norm": 11.1875, + "learning_rate": 4.992654928651496e-06, + "loss": 1.656692624092102, + "step": 368 + }, + { + "epoch": 0.11388995767602925, + "grad_norm": 25.75, + "learning_rate": 4.99248322762536e-06, + "loss": 1.4191769361495972, + "step": 370 + }, + { + "epoch": 0.11450557906887264, + "grad_norm": 2.125, + "learning_rate": 4.992309546623978e-06, + "loss": 1.0085556507110596, + "step": 372 + }, + { + "epoch": 0.11512120046171605, + "grad_norm": 11.125, + "learning_rate": 4.99213388581994e-06, + "loss": 1.6765049695968628, + "step": 374 + }, + { + "epoch": 0.11573682185455944, + "grad_norm": 3.328125, + "learning_rate": 4.991956245387805e-06, + "loss": 1.2738518714904785, + "step": 376 + }, + { + "epoch": 0.11635244324740285, + "grad_norm": 5.46875, + "learning_rate": 4.991776625504097e-06, + "loss": 1.433395504951477, + "step": 378 + }, + { + "epoch": 0.11696806464024626, + "grad_norm": 3.09375, + "learning_rate": 4.991595026347309e-06, + "loss": 1.188396692276001, + "step": 380 + }, + { + "epoch": 0.11758368603308965, + "grad_norm": 2.0625, + "learning_rate": 4.9914114480979e-06, + "loss": 1.204866647720337, + "step": 382 + }, + { + "epoch": 0.11819930742593306, + "grad_norm": 10.125, + "learning_rate": 4.991225890938296e-06, + "loss": 1.6785403490066528, + "step": 384 + }, + { + "epoch": 0.11881492881877645, + "grad_norm": 7.21875, + "learning_rate": 4.991038355052889e-06, + "loss": 1.2951123714447021, + "step": 386 + }, + { + "epoch": 0.11943055021161986, + "grad_norm": 5.46875, + "learning_rate": 4.9908488406280375e-06, + "loss": 1.6818344593048096, + "step": 388 + }, + { + "epoch": 0.12004617160446325, + "grad_norm": 6.1875, + "learning_rate": 4.990657347852067e-06, + "loss": 0.9846329092979431, + "step": 390 + }, + { + "epoch": 0.12066179299730666, + "grad_norm": 3.578125, + "learning_rate": 4.990463876915268e-06, + "loss": 1.4272600412368774, + "step": 392 + }, + { + "epoch": 0.12127741439015005, + "grad_norm": 13.5, + "learning_rate": 4.9902684280098964e-06, + "loss": 1.4261621236801147, + "step": 394 + }, + { + "epoch": 0.12189303578299346, + "grad_norm": 3.75, + "learning_rate": 4.990071001330174e-06, + "loss": 1.3102799654006958, + "step": 396 + }, + { + "epoch": 0.12250865717583687, + "grad_norm": 5.96875, + "learning_rate": 4.989871597072289e-06, + "loss": 1.2802685499191284, + "step": 398 + }, + { + "epoch": 0.12312427856868026, + "grad_norm": 4.375, + "learning_rate": 4.989670215434393e-06, + "loss": 1.4422342777252197, + "step": 400 + }, + { + "epoch": 0.12373989996152367, + "grad_norm": 5.71875, + "learning_rate": 4.989466856616604e-06, + "loss": 1.5702648162841797, + "step": 402 + }, + { + "epoch": 0.12435552135436706, + "grad_norm": 7.875, + "learning_rate": 4.989261520821004e-06, + "loss": 1.0299426317214966, + "step": 404 + }, + { + "epoch": 0.12497114274721047, + "grad_norm": 7.53125, + "learning_rate": 4.98905420825164e-06, + "loss": 1.2051361799240112, + "step": 406 + }, + { + "epoch": 0.12558676414005387, + "grad_norm": 9.6875, + "learning_rate": 4.988844919114523e-06, + "loss": 1.3921747207641602, + "step": 408 + }, + { + "epoch": 0.12620238553289725, + "grad_norm": 2.875, + "learning_rate": 4.988633653617628e-06, + "loss": 1.3282568454742432, + "step": 410 + }, + { + "epoch": 0.12681800692574066, + "grad_norm": 6.46875, + "learning_rate": 4.9884204119708946e-06, + "loss": 1.248948335647583, + "step": 412 + }, + { + "epoch": 0.12743362831858407, + "grad_norm": 5.8125, + "learning_rate": 4.988205194386225e-06, + "loss": 1.5414966344833374, + "step": 414 + }, + { + "epoch": 0.12804924971142748, + "grad_norm": 12.875, + "learning_rate": 4.987988001077487e-06, + "loss": 1.5431689023971558, + "step": 416 + }, + { + "epoch": 0.12866487110427088, + "grad_norm": 14.8125, + "learning_rate": 4.98776883226051e-06, + "loss": 0.8197199106216431, + "step": 418 + }, + { + "epoch": 0.12928049249711426, + "grad_norm": 17.0, + "learning_rate": 4.987547688153087e-06, + "loss": 1.4242963790893555, + "step": 420 + }, + { + "epoch": 0.12989611388995767, + "grad_norm": 33.25, + "learning_rate": 4.987324568974974e-06, + "loss": 1.5151054859161377, + "step": 422 + }, + { + "epoch": 0.13051173528280108, + "grad_norm": 8.125, + "learning_rate": 4.987099474947889e-06, + "loss": 1.0335174798965454, + "step": 424 + }, + { + "epoch": 0.13112735667564449, + "grad_norm": 6.6875, + "learning_rate": 4.986872406295513e-06, + "loss": 1.4056828022003174, + "step": 426 + }, + { + "epoch": 0.1317429780684879, + "grad_norm": 11.0625, + "learning_rate": 4.9866433632434895e-06, + "loss": 1.2344582080841064, + "step": 428 + }, + { + "epoch": 0.13235859946133127, + "grad_norm": 5.3125, + "learning_rate": 4.986412346019423e-06, + "loss": 1.3656508922576904, + "step": 430 + }, + { + "epoch": 0.13297422085417468, + "grad_norm": 10.6875, + "learning_rate": 4.9861793548528835e-06, + "loss": 1.4779958724975586, + "step": 432 + }, + { + "epoch": 0.1335898422470181, + "grad_norm": 27.25, + "learning_rate": 4.985944389975396e-06, + "loss": 1.868300199508667, + "step": 434 + }, + { + "epoch": 0.1342054636398615, + "grad_norm": 13.6875, + "learning_rate": 4.98570745162045e-06, + "loss": 1.876098394393921, + "step": 436 + }, + { + "epoch": 0.13482108503270487, + "grad_norm": 4.84375, + "learning_rate": 4.985468540023501e-06, + "loss": 1.430997371673584, + "step": 438 + }, + { + "epoch": 0.13543670642554828, + "grad_norm": 14.4375, + "learning_rate": 4.985227655421956e-06, + "loss": 0.9740759134292603, + "step": 440 + }, + { + "epoch": 0.1360523278183917, + "grad_norm": 9.6875, + "learning_rate": 4.984984798055189e-06, + "loss": 1.6053884029388428, + "step": 442 + }, + { + "epoch": 0.1366679492112351, + "grad_norm": 8.5, + "learning_rate": 4.984739968164534e-06, + "loss": 1.3193613290786743, + "step": 444 + }, + { + "epoch": 0.1372835706040785, + "grad_norm": 6.46875, + "learning_rate": 4.9844931659932825e-06, + "loss": 1.4540421962738037, + "step": 446 + }, + { + "epoch": 0.13789919199692188, + "grad_norm": 18.875, + "learning_rate": 4.984244391786688e-06, + "loss": 2.0545427799224854, + "step": 448 + }, + { + "epoch": 0.1385148133897653, + "grad_norm": 5.40625, + "learning_rate": 4.983993645791962e-06, + "loss": 1.3011374473571777, + "step": 450 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 8.6875, + "learning_rate": 4.9837409282582795e-06, + "loss": 1.555488109588623, + "step": 452 + }, + { + "epoch": 0.1397460561754521, + "grad_norm": 9.625, + "learning_rate": 4.983486239436768e-06, + "loss": 1.208080530166626, + "step": 454 + }, + { + "epoch": 0.1403616775682955, + "grad_norm": 7.34375, + "learning_rate": 4.983229579580519e-06, + "loss": 1.504934310913086, + "step": 456 + }, + { + "epoch": 0.1409772989611389, + "grad_norm": 23.75, + "learning_rate": 4.982970948944581e-06, + "loss": 1.4824737310409546, + "step": 458 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 6.3125, + "learning_rate": 4.98271034778596e-06, + "loss": 1.8614106178283691, + "step": 460 + }, + { + "epoch": 0.1422085417468257, + "grad_norm": 10.0625, + "learning_rate": 4.982447776363625e-06, + "loss": 1.7203400135040283, + "step": 462 + }, + { + "epoch": 0.1428241631396691, + "grad_norm": 5.71875, + "learning_rate": 4.982183234938495e-06, + "loss": 1.408145785331726, + "step": 464 + }, + { + "epoch": 0.1434397845325125, + "grad_norm": 6.625, + "learning_rate": 4.9819167237734515e-06, + "loss": 1.392209768295288, + "step": 466 + }, + { + "epoch": 0.1440554059253559, + "grad_norm": 9.5625, + "learning_rate": 4.981648243133334e-06, + "loss": 1.7749637365341187, + "step": 468 + }, + { + "epoch": 0.1446710273181993, + "grad_norm": 3.96875, + "learning_rate": 4.9813777932849365e-06, + "loss": 1.681519865989685, + "step": 470 + }, + { + "epoch": 0.14528664871104272, + "grad_norm": 5.875, + "learning_rate": 4.981105374497012e-06, + "loss": 1.4043043851852417, + "step": 472 + }, + { + "epoch": 0.14590227010388612, + "grad_norm": 6.75, + "learning_rate": 4.9808309870402685e-06, + "loss": 1.5208261013031006, + "step": 474 + }, + { + "epoch": 0.1465178914967295, + "grad_norm": 6.4375, + "learning_rate": 4.980554631187371e-06, + "loss": 1.7243291139602661, + "step": 476 + }, + { + "epoch": 0.1471335128895729, + "grad_norm": 2.34375, + "learning_rate": 4.980276307212941e-06, + "loss": 1.0319477319717407, + "step": 478 + }, + { + "epoch": 0.14774913428241632, + "grad_norm": 6.84375, + "learning_rate": 4.9799960153935555e-06, + "loss": 1.298151969909668, + "step": 480 + }, + { + "epoch": 0.14836475567525972, + "grad_norm": 4.75, + "learning_rate": 4.9797137560077456e-06, + "loss": 1.3441214561462402, + "step": 482 + }, + { + "epoch": 0.1489803770681031, + "grad_norm": 5.84375, + "learning_rate": 4.979429529335999e-06, + "loss": 1.2724394798278809, + "step": 484 + }, + { + "epoch": 0.1495959984609465, + "grad_norm": 7.875, + "learning_rate": 4.97914333566076e-06, + "loss": 1.5625994205474854, + "step": 486 + }, + { + "epoch": 0.15021161985378992, + "grad_norm": 7.125, + "learning_rate": 4.978855175266423e-06, + "loss": 1.6065152883529663, + "step": 488 + }, + { + "epoch": 0.15082724124663333, + "grad_norm": 11.625, + "learning_rate": 4.978565048439341e-06, + "loss": 1.4982917308807373, + "step": 490 + }, + { + "epoch": 0.15144286263947673, + "grad_norm": 5.9375, + "learning_rate": 4.9782729554678185e-06, + "loss": 1.2153326272964478, + "step": 492 + }, + { + "epoch": 0.1520584840323201, + "grad_norm": 9.125, + "learning_rate": 4.977978896642117e-06, + "loss": 0.9097068309783936, + "step": 494 + }, + { + "epoch": 0.15267410542516352, + "grad_norm": 7.3125, + "learning_rate": 4.9776828722544465e-06, + "loss": 1.5312962532043457, + "step": 496 + }, + { + "epoch": 0.15328972681800693, + "grad_norm": 7.75, + "learning_rate": 4.977384882598976e-06, + "loss": 1.705657958984375, + "step": 498 + }, + { + "epoch": 0.15390534821085033, + "grad_norm": 8.5625, + "learning_rate": 4.9770849279718215e-06, + "loss": 1.2420040369033813, + "step": 500 + }, + { + "epoch": 0.15452096960369374, + "grad_norm": 8.125, + "learning_rate": 4.9767830086710565e-06, + "loss": 1.4940606355667114, + "step": 502 + }, + { + "epoch": 0.15513659099653712, + "grad_norm": 7.0625, + "learning_rate": 4.976479124996705e-06, + "loss": 1.698551058769226, + "step": 504 + }, + { + "epoch": 0.15575221238938053, + "grad_norm": 4.65625, + "learning_rate": 4.976173277250742e-06, + "loss": 1.4665323495864868, + "step": 506 + }, + { + "epoch": 0.15636783378222394, + "grad_norm": 6.1875, + "learning_rate": 4.975865465737096e-06, + "loss": 1.350402593612671, + "step": 508 + }, + { + "epoch": 0.15698345517506734, + "grad_norm": 6.0, + "learning_rate": 4.9755556907616455e-06, + "loss": 1.5278868675231934, + "step": 510 + }, + { + "epoch": 0.15759907656791072, + "grad_norm": 11.6875, + "learning_rate": 4.9752439526322224e-06, + "loss": 1.794555902481079, + "step": 512 + }, + { + "epoch": 0.15821469796075413, + "grad_norm": 7.71875, + "learning_rate": 4.974930251658606e-06, + "loss": 1.5282689332962036, + "step": 514 + }, + { + "epoch": 0.15883031935359754, + "grad_norm": 11.75, + "learning_rate": 4.97461458815253e-06, + "loss": 1.35675048828125, + "step": 516 + }, + { + "epoch": 0.15944594074644095, + "grad_norm": 3.734375, + "learning_rate": 4.9742969624276735e-06, + "loss": 1.4859691858291626, + "step": 518 + }, + { + "epoch": 0.16006156213928435, + "grad_norm": 6.78125, + "learning_rate": 4.9739773747996715e-06, + "loss": 1.4229437112808228, + "step": 520 + }, + { + "epoch": 0.16067718353212773, + "grad_norm": 16.125, + "learning_rate": 4.973655825586102e-06, + "loss": 1.8158589601516724, + "step": 522 + }, + { + "epoch": 0.16129280492497114, + "grad_norm": 12.375, + "learning_rate": 4.973332315106499e-06, + "loss": 1.4468257427215576, + "step": 524 + }, + { + "epoch": 0.16190842631781455, + "grad_norm": 9.375, + "learning_rate": 4.97300684368234e-06, + "loss": 1.7638283967971802, + "step": 526 + }, + { + "epoch": 0.16252404771065795, + "grad_norm": 4.3125, + "learning_rate": 4.972679411637053e-06, + "loss": 1.5149681568145752, + "step": 528 + }, + { + "epoch": 0.16313966910350133, + "grad_norm": 6.96875, + "learning_rate": 4.972350019296017e-06, + "loss": 0.9841006994247437, + "step": 530 + }, + { + "epoch": 0.16375529049634474, + "grad_norm": 2.671875, + "learning_rate": 4.972018666986554e-06, + "loss": 1.3767181634902954, + "step": 532 + }, + { + "epoch": 0.16437091188918815, + "grad_norm": 8.75, + "learning_rate": 4.971685355037938e-06, + "loss": 1.373497724533081, + "step": 534 + }, + { + "epoch": 0.16498653328203156, + "grad_norm": 20.0, + "learning_rate": 4.971350083781387e-06, + "loss": 1.4224358797073364, + "step": 536 + }, + { + "epoch": 0.16560215467487496, + "grad_norm": 9.5625, + "learning_rate": 4.971012853550069e-06, + "loss": 1.730084776878357, + "step": 538 + }, + { + "epoch": 0.16621777606771834, + "grad_norm": 6.78125, + "learning_rate": 4.970673664679097e-06, + "loss": 1.3526341915130615, + "step": 540 + }, + { + "epoch": 0.16683339746056175, + "grad_norm": 7.75, + "learning_rate": 4.9703325175055285e-06, + "loss": 1.5470267534255981, + "step": 542 + }, + { + "epoch": 0.16744901885340516, + "grad_norm": 7.4375, + "learning_rate": 4.969989412368371e-06, + "loss": 1.348077654838562, + "step": 544 + }, + { + "epoch": 0.16806464024624856, + "grad_norm": 4.5, + "learning_rate": 4.969644349608576e-06, + "loss": 0.9325212836265564, + "step": 546 + }, + { + "epoch": 0.16868026163909197, + "grad_norm": 7.34375, + "learning_rate": 4.969297329569039e-06, + "loss": 1.2758643627166748, + "step": 548 + }, + { + "epoch": 0.16929588303193535, + "grad_norm": 4.75, + "learning_rate": 4.968948352594604e-06, + "loss": 1.388833999633789, + "step": 550 + }, + { + "epoch": 0.16991150442477876, + "grad_norm": 11.8125, + "learning_rate": 4.968597419032053e-06, + "loss": 1.6071833372116089, + "step": 552 + }, + { + "epoch": 0.17052712581762217, + "grad_norm": 10.1875, + "learning_rate": 4.96824452923012e-06, + "loss": 1.414900779724121, + "step": 554 + }, + { + "epoch": 0.17114274721046557, + "grad_norm": 20.875, + "learning_rate": 4.967889683539479e-06, + "loss": 1.6497011184692383, + "step": 556 + }, + { + "epoch": 0.17175836860330895, + "grad_norm": 6.3125, + "learning_rate": 4.9675328823127465e-06, + "loss": 1.6876095533370972, + "step": 558 + }, + { + "epoch": 0.17237398999615236, + "grad_norm": 8.25, + "learning_rate": 4.967174125904486e-06, + "loss": 1.365379810333252, + "step": 560 + }, + { + "epoch": 0.17298961138899577, + "grad_norm": 3.515625, + "learning_rate": 4.9668134146712e-06, + "loss": 0.9860933423042297, + "step": 562 + }, + { + "epoch": 0.17360523278183917, + "grad_norm": 11.5, + "learning_rate": 4.966450748971336e-06, + "loss": 1.3949452638626099, + "step": 564 + }, + { + "epoch": 0.17422085417468258, + "grad_norm": 8.625, + "learning_rate": 4.966086129165283e-06, + "loss": 0.9252087473869324, + "step": 566 + }, + { + "epoch": 0.17483647556752596, + "grad_norm": 5.15625, + "learning_rate": 4.9657195556153725e-06, + "loss": 1.239025592803955, + "step": 568 + }, + { + "epoch": 0.17545209696036937, + "grad_norm": 13.0, + "learning_rate": 4.965351028685876e-06, + "loss": 1.5434101819992065, + "step": 570 + }, + { + "epoch": 0.17606771835321278, + "grad_norm": 5.28125, + "learning_rate": 4.964980548743009e-06, + "loss": 1.3591309785842896, + "step": 572 + }, + { + "epoch": 0.17668333974605618, + "grad_norm": 6.8125, + "learning_rate": 4.964608116154922e-06, + "loss": 1.2411075830459595, + "step": 574 + }, + { + "epoch": 0.17729896113889956, + "grad_norm": 8.4375, + "learning_rate": 4.9642337312917125e-06, + "loss": 1.6506497859954834, + "step": 576 + }, + { + "epoch": 0.17791458253174297, + "grad_norm": 5.46875, + "learning_rate": 4.963857394525414e-06, + "loss": 0.9767615795135498, + "step": 578 + }, + { + "epoch": 0.17853020392458638, + "grad_norm": 6.875, + "learning_rate": 4.963479106230001e-06, + "loss": 0.9908381104469299, + "step": 580 + }, + { + "epoch": 0.17914582531742979, + "grad_norm": 43.75, + "learning_rate": 4.963098866781387e-06, + "loss": 1.3052326440811157, + "step": 582 + }, + { + "epoch": 0.1797614467102732, + "grad_norm": 15.25, + "learning_rate": 4.9627166765574255e-06, + "loss": 1.7691961526870728, + "step": 584 + }, + { + "epoch": 0.18037706810311657, + "grad_norm": 67.0, + "learning_rate": 4.962332535937906e-06, + "loss": 1.5837366580963135, + "step": 586 + }, + { + "epoch": 0.18099268949595998, + "grad_norm": 4.09375, + "learning_rate": 4.961946445304559e-06, + "loss": 1.6570383310317993, + "step": 588 + }, + { + "epoch": 0.1816083108888034, + "grad_norm": 10.875, + "learning_rate": 4.961558405041048e-06, + "loss": 1.3879214525222778, + "step": 590 + }, + { + "epoch": 0.1822239322816468, + "grad_norm": 4.0625, + "learning_rate": 4.961168415532983e-06, + "loss": 1.4827848672866821, + "step": 592 + }, + { + "epoch": 0.1828395536744902, + "grad_norm": 6.625, + "learning_rate": 4.9607764771679e-06, + "loss": 1.458146095275879, + "step": 594 + }, + { + "epoch": 0.18345517506733358, + "grad_norm": 13.375, + "learning_rate": 4.960382590335281e-06, + "loss": 1.4375436305999756, + "step": 596 + }, + { + "epoch": 0.184070796460177, + "grad_norm": 5.65625, + "learning_rate": 4.959986755426538e-06, + "loss": 0.9506613612174988, + "step": 598 + }, + { + "epoch": 0.1846864178530204, + "grad_norm": 11.375, + "learning_rate": 4.95958897283502e-06, + "loss": 0.8702154159545898, + "step": 600 + }, + { + "epoch": 0.1853020392458638, + "grad_norm": 21.0, + "learning_rate": 4.959189242956015e-06, + "loss": 1.3946590423583984, + "step": 602 + }, + { + "epoch": 0.18591766063870718, + "grad_norm": 7.3125, + "learning_rate": 4.958787566186743e-06, + "loss": 1.4666697978973389, + "step": 604 + }, + { + "epoch": 0.1865332820315506, + "grad_norm": 8.0, + "learning_rate": 4.958383942926358e-06, + "loss": 1.4276539087295532, + "step": 606 + }, + { + "epoch": 0.187148903424394, + "grad_norm": 4.25, + "learning_rate": 4.95797837357595e-06, + "loss": 1.5988221168518066, + "step": 608 + }, + { + "epoch": 0.1877645248172374, + "grad_norm": 4.84375, + "learning_rate": 4.957570858538543e-06, + "loss": 1.6277081966400146, + "step": 610 + }, + { + "epoch": 0.1883801462100808, + "grad_norm": 42.25, + "learning_rate": 4.957161398219092e-06, + "loss": 0.6431888341903687, + "step": 612 + }, + { + "epoch": 0.1889957676029242, + "grad_norm": 23.25, + "learning_rate": 4.956749993024489e-06, + "loss": 1.298551321029663, + "step": 614 + }, + { + "epoch": 0.1896113889957676, + "grad_norm": 7.78125, + "learning_rate": 4.956336643363556e-06, + "loss": 1.4090662002563477, + "step": 616 + }, + { + "epoch": 0.190227010388611, + "grad_norm": 9.625, + "learning_rate": 4.955921349647047e-06, + "loss": 1.3029241561889648, + "step": 618 + }, + { + "epoch": 0.1908426317814544, + "grad_norm": 17.375, + "learning_rate": 4.95550411228765e-06, + "loss": 0.9873400926589966, + "step": 620 + }, + { + "epoch": 0.1914582531742978, + "grad_norm": 7.5625, + "learning_rate": 4.955084931699982e-06, + "loss": 1.5364199876785278, + "step": 622 + }, + { + "epoch": 0.1920738745671412, + "grad_norm": 16.875, + "learning_rate": 4.954663808300593e-06, + "loss": 1.638574242591858, + "step": 624 + }, + { + "epoch": 0.1926894959599846, + "grad_norm": 13.1875, + "learning_rate": 4.954240742507961e-06, + "loss": 1.0367610454559326, + "step": 626 + }, + { + "epoch": 0.19330511735282802, + "grad_norm": 12.25, + "learning_rate": 4.9538157347424985e-06, + "loss": 1.7227222919464111, + "step": 628 + }, + { + "epoch": 0.19392073874567142, + "grad_norm": 8.5625, + "learning_rate": 4.953388785426544e-06, + "loss": 1.8985217809677124, + "step": 630 + }, + { + "epoch": 0.1945363601385148, + "grad_norm": 8.6875, + "learning_rate": 4.952959894984365e-06, + "loss": 1.3614797592163086, + "step": 632 + }, + { + "epoch": 0.1951519815313582, + "grad_norm": 4.375, + "learning_rate": 4.952529063842163e-06, + "loss": 1.24729323387146, + "step": 634 + }, + { + "epoch": 0.19576760292420162, + "grad_norm": 6.8125, + "learning_rate": 4.952096292428062e-06, + "loss": 1.5064369440078735, + "step": 636 + }, + { + "epoch": 0.19638322431704502, + "grad_norm": 6.53125, + "learning_rate": 4.951661581172117e-06, + "loss": 1.2572686672210693, + "step": 638 + }, + { + "epoch": 0.19699884570988843, + "grad_norm": 9.125, + "learning_rate": 4.951224930506311e-06, + "loss": 1.4917693138122559, + "step": 640 + }, + { + "epoch": 0.1976144671027318, + "grad_norm": 6.15625, + "learning_rate": 4.950786340864553e-06, + "loss": 1.6578432321548462, + "step": 642 + }, + { + "epoch": 0.19823008849557522, + "grad_norm": 12.6875, + "learning_rate": 4.95034581268268e-06, + "loss": 1.349355697631836, + "step": 644 + }, + { + "epoch": 0.19884570988841863, + "grad_norm": 6.125, + "learning_rate": 4.9499033463984535e-06, + "loss": 1.4282751083374023, + "step": 646 + }, + { + "epoch": 0.19946133128126203, + "grad_norm": 20.0, + "learning_rate": 4.9494589424515636e-06, + "loss": 1.1630656719207764, + "step": 648 + }, + { + "epoch": 0.2000769526741054, + "grad_norm": 5.65625, + "learning_rate": 4.949012601283624e-06, + "loss": 1.5436525344848633, + "step": 650 + }, + { + "epoch": 0.20069257406694882, + "grad_norm": 10.375, + "learning_rate": 4.948564323338174e-06, + "loss": 1.4936554431915283, + "step": 652 + }, + { + "epoch": 0.20130819545979223, + "grad_norm": 4.875, + "learning_rate": 4.948114109060677e-06, + "loss": 1.0556228160858154, + "step": 654 + }, + { + "epoch": 0.20192381685263563, + "grad_norm": 10.125, + "learning_rate": 4.947661958898521e-06, + "loss": 1.6761348247528076, + "step": 656 + }, + { + "epoch": 0.20253943824547904, + "grad_norm": 7.3125, + "learning_rate": 4.947207873301018e-06, + "loss": 1.4784631729125977, + "step": 658 + }, + { + "epoch": 0.20315505963832242, + "grad_norm": 4.8125, + "learning_rate": 4.946751852719403e-06, + "loss": 1.4860063791275024, + "step": 660 + }, + { + "epoch": 0.20377068103116583, + "grad_norm": 5.75, + "learning_rate": 4.946293897606833e-06, + "loss": 1.3539552688598633, + "step": 662 + }, + { + "epoch": 0.20438630242400924, + "grad_norm": 7.34375, + "learning_rate": 4.945834008418391e-06, + "loss": 1.3625863790512085, + "step": 664 + }, + { + "epoch": 0.20500192381685264, + "grad_norm": 3.578125, + "learning_rate": 4.945372185611076e-06, + "loss": 0.7780288457870483, + "step": 666 + }, + { + "epoch": 0.20561754520969602, + "grad_norm": 7.84375, + "learning_rate": 4.9449084296438135e-06, + "loss": 1.1947097778320312, + "step": 668 + }, + { + "epoch": 0.20623316660253943, + "grad_norm": 3.921875, + "learning_rate": 4.944442740977447e-06, + "loss": 1.4437352418899536, + "step": 670 + }, + { + "epoch": 0.20684878799538284, + "grad_norm": 5.8125, + "learning_rate": 4.943975120074743e-06, + "loss": 1.557558298110962, + "step": 672 + }, + { + "epoch": 0.20746440938822625, + "grad_norm": 15.8125, + "learning_rate": 4.943505567400387e-06, + "loss": 1.3439304828643799, + "step": 674 + }, + { + "epoch": 0.20808003078106965, + "grad_norm": 38.25, + "learning_rate": 4.943034083420983e-06, + "loss": 1.2202385663986206, + "step": 676 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 5.15625, + "learning_rate": 4.942560668605055e-06, + "loss": 1.6866698265075684, + "step": 678 + }, + { + "epoch": 0.20931127356675644, + "grad_norm": 9.0625, + "learning_rate": 4.942085323423048e-06, + "loss": 1.5172860622406006, + "step": 680 + }, + { + "epoch": 0.20992689495959985, + "grad_norm": 7.0625, + "learning_rate": 4.941608048347321e-06, + "loss": 1.3118488788604736, + "step": 682 + }, + { + "epoch": 0.21054251635244325, + "grad_norm": 7.28125, + "learning_rate": 4.941128843852152e-06, + "loss": 1.3376363515853882, + "step": 684 + }, + { + "epoch": 0.21115813774528666, + "grad_norm": 10.625, + "learning_rate": 4.940647710413741e-06, + "loss": 1.5807679891586304, + "step": 686 + }, + { + "epoch": 0.21177375913813004, + "grad_norm": 7.46875, + "learning_rate": 4.940164648510197e-06, + "loss": 1.8470110893249512, + "step": 688 + }, + { + "epoch": 0.21238938053097345, + "grad_norm": 7.6875, + "learning_rate": 4.939679658621552e-06, + "loss": 1.506554126739502, + "step": 690 + }, + { + "epoch": 0.21300500192381686, + "grad_norm": 11.3125, + "learning_rate": 4.9391927412297525e-06, + "loss": 1.1695079803466797, + "step": 692 + }, + { + "epoch": 0.21362062331666026, + "grad_norm": 12.0625, + "learning_rate": 4.938703896818655e-06, + "loss": 1.4596199989318848, + "step": 694 + }, + { + "epoch": 0.21423624470950364, + "grad_norm": 5.34375, + "learning_rate": 4.938213125874039e-06, + "loss": 1.5376107692718506, + "step": 696 + }, + { + "epoch": 0.21485186610234705, + "grad_norm": 4.15625, + "learning_rate": 4.937720428883594e-06, + "loss": 1.3674694299697876, + "step": 698 + }, + { + "epoch": 0.21546748749519046, + "grad_norm": 5.0, + "learning_rate": 4.937225806336921e-06, + "loss": 1.2678442001342773, + "step": 700 + }, + { + "epoch": 0.21608310888803386, + "grad_norm": 5.375, + "learning_rate": 4.93672925872554e-06, + "loss": 1.3211055994033813, + "step": 702 + }, + { + "epoch": 0.21669873028087727, + "grad_norm": 2.609375, + "learning_rate": 4.936230786542883e-06, + "loss": 1.1183050870895386, + "step": 704 + }, + { + "epoch": 0.21731435167372065, + "grad_norm": 10.375, + "learning_rate": 4.935730390284289e-06, + "loss": 1.6955947875976562, + "step": 706 + }, + { + "epoch": 0.21792997306656406, + "grad_norm": 4.75, + "learning_rate": 4.935228070447017e-06, + "loss": 1.372933030128479, + "step": 708 + }, + { + "epoch": 0.21854559445940747, + "grad_norm": 12.0, + "learning_rate": 4.934723827530231e-06, + "loss": 1.433283805847168, + "step": 710 + }, + { + "epoch": 0.21916121585225087, + "grad_norm": 5.96875, + "learning_rate": 4.934217662035008e-06, + "loss": 1.4605717658996582, + "step": 712 + }, + { + "epoch": 0.21977683724509428, + "grad_norm": 10.0, + "learning_rate": 4.9337095744643385e-06, + "loss": 1.422659158706665, + "step": 714 + }, + { + "epoch": 0.22039245863793766, + "grad_norm": 5.0, + "learning_rate": 4.933199565323119e-06, + "loss": 1.2856898307800293, + "step": 716 + }, + { + "epoch": 0.22100808003078107, + "grad_norm": 11.375, + "learning_rate": 4.932687635118157e-06, + "loss": 1.190561294555664, + "step": 718 + }, + { + "epoch": 0.22162370142362448, + "grad_norm": 6.0625, + "learning_rate": 4.9321737843581685e-06, + "loss": 1.4947484731674194, + "step": 720 + }, + { + "epoch": 0.22223932281646788, + "grad_norm": 10.0, + "learning_rate": 4.931658013553781e-06, + "loss": 1.70393705368042, + "step": 722 + }, + { + "epoch": 0.22285494420931126, + "grad_norm": 5.0625, + "learning_rate": 4.931140323217524e-06, + "loss": 0.7426916360855103, + "step": 724 + }, + { + "epoch": 0.22347056560215467, + "grad_norm": 7.8125, + "learning_rate": 4.93062071386384e-06, + "loss": 1.5670533180236816, + "step": 726 + }, + { + "epoch": 0.22408618699499808, + "grad_norm": 6.15625, + "learning_rate": 4.930099186009077e-06, + "loss": 1.7750446796417236, + "step": 728 + }, + { + "epoch": 0.22470180838784148, + "grad_norm": 53.25, + "learning_rate": 4.929575740171488e-06, + "loss": 1.3493258953094482, + "step": 730 + }, + { + "epoch": 0.2253174297806849, + "grad_norm": 5.125, + "learning_rate": 4.929050376871231e-06, + "loss": 1.225829839706421, + "step": 732 + }, + { + "epoch": 0.22593305117352827, + "grad_norm": 2.109375, + "learning_rate": 4.928523096630376e-06, + "loss": 1.150283694267273, + "step": 734 + }, + { + "epoch": 0.22654867256637168, + "grad_norm": 7.09375, + "learning_rate": 4.9279938999728886e-06, + "loss": 1.4825005531311035, + "step": 736 + }, + { + "epoch": 0.22716429395921509, + "grad_norm": 6.3125, + "learning_rate": 4.927462787424646e-06, + "loss": 1.4402482509613037, + "step": 738 + }, + { + "epoch": 0.2277799153520585, + "grad_norm": 12.0625, + "learning_rate": 4.926929759513426e-06, + "loss": 1.7994085550308228, + "step": 740 + }, + { + "epoch": 0.22839553674490187, + "grad_norm": 10.5625, + "learning_rate": 4.926394816768909e-06, + "loss": 1.7221037149429321, + "step": 742 + }, + { + "epoch": 0.22901115813774528, + "grad_norm": 4.34375, + "learning_rate": 4.925857959722682e-06, + "loss": 0.974231481552124, + "step": 744 + }, + { + "epoch": 0.2296267795305887, + "grad_norm": 7.4375, + "learning_rate": 4.92531918890823e-06, + "loss": 1.3925503492355347, + "step": 746 + }, + { + "epoch": 0.2302424009234321, + "grad_norm": 12.625, + "learning_rate": 4.924778504860943e-06, + "loss": 1.2160550355911255, + "step": 748 + }, + { + "epoch": 0.2308580223162755, + "grad_norm": 4.125, + "learning_rate": 4.92423590811811e-06, + "loss": 1.2490298748016357, + "step": 750 + }, + { + "epoch": 0.23147364370911888, + "grad_norm": 12.1875, + "learning_rate": 4.923691399218921e-06, + "loss": 1.4309897422790527, + "step": 752 + }, + { + "epoch": 0.2320892651019623, + "grad_norm": 14.75, + "learning_rate": 4.9231449787044695e-06, + "loss": 1.4852542877197266, + "step": 754 + }, + { + "epoch": 0.2327048864948057, + "grad_norm": 5.375, + "learning_rate": 4.922596647117742e-06, + "loss": 1.4291030168533325, + "step": 756 + }, + { + "epoch": 0.2333205078876491, + "grad_norm": 8.25, + "learning_rate": 4.92204640500363e-06, + "loss": 1.3657782077789307, + "step": 758 + }, + { + "epoch": 0.2339361292804925, + "grad_norm": 5.6875, + "learning_rate": 4.9214942529089215e-06, + "loss": 1.50307297706604, + "step": 760 + }, + { + "epoch": 0.2345517506733359, + "grad_norm": 5.125, + "learning_rate": 4.920940191382302e-06, + "loss": 1.1985366344451904, + "step": 762 + }, + { + "epoch": 0.2351673720661793, + "grad_norm": 6.96875, + "learning_rate": 4.920384220974355e-06, + "loss": 1.7522002458572388, + "step": 764 + }, + { + "epoch": 0.2357829934590227, + "grad_norm": 9.9375, + "learning_rate": 4.919826342237559e-06, + "loss": 1.6885839700698853, + "step": 766 + }, + { + "epoch": 0.2363986148518661, + "grad_norm": 12.4375, + "learning_rate": 4.919266555726293e-06, + "loss": 1.5652323961257935, + "step": 768 + }, + { + "epoch": 0.2370142362447095, + "grad_norm": 6.21875, + "learning_rate": 4.918704861996829e-06, + "loss": 1.2147345542907715, + "step": 770 + }, + { + "epoch": 0.2376298576375529, + "grad_norm": 6.0625, + "learning_rate": 4.918141261607335e-06, + "loss": 1.2249404191970825, + "step": 772 + }, + { + "epoch": 0.2382454790303963, + "grad_norm": 10.8125, + "learning_rate": 4.917575755117872e-06, + "loss": 1.6967936754226685, + "step": 774 + }, + { + "epoch": 0.23886110042323971, + "grad_norm": 5.78125, + "learning_rate": 4.917008343090397e-06, + "loss": 1.049770474433899, + "step": 776 + }, + { + "epoch": 0.23947672181608312, + "grad_norm": 7.25, + "learning_rate": 4.91643902608876e-06, + "loss": 1.4362744092941284, + "step": 778 + }, + { + "epoch": 0.2400923432089265, + "grad_norm": 7.3125, + "learning_rate": 4.915867804678704e-06, + "loss": 1.4061223268508911, + "step": 780 + }, + { + "epoch": 0.2407079646017699, + "grad_norm": 8.0625, + "learning_rate": 4.915294679427865e-06, + "loss": 1.106104850769043, + "step": 782 + }, + { + "epoch": 0.24132358599461332, + "grad_norm": 2.1875, + "learning_rate": 4.91471965090577e-06, + "loss": 1.4797524213790894, + "step": 784 + }, + { + "epoch": 0.24193920738745672, + "grad_norm": 11.1875, + "learning_rate": 4.914142719683839e-06, + "loss": 1.674882173538208, + "step": 786 + }, + { + "epoch": 0.2425548287803001, + "grad_norm": 6.0625, + "learning_rate": 4.913563886335379e-06, + "loss": 1.3974217176437378, + "step": 788 + }, + { + "epoch": 0.2431704501731435, + "grad_norm": 3.53125, + "learning_rate": 4.9129831514355915e-06, + "loss": 1.4340710639953613, + "step": 790 + }, + { + "epoch": 0.24378607156598692, + "grad_norm": 9.6875, + "learning_rate": 4.912400515561565e-06, + "loss": 1.3193720579147339, + "step": 792 + }, + { + "epoch": 0.24440169295883032, + "grad_norm": 5.15625, + "learning_rate": 4.911815979292278e-06, + "loss": 1.2771092653274536, + "step": 794 + }, + { + "epoch": 0.24501731435167373, + "grad_norm": 6.625, + "learning_rate": 4.911229543208598e-06, + "loss": 1.6375248432159424, + "step": 796 + }, + { + "epoch": 0.2456329357445171, + "grad_norm": 5.34375, + "learning_rate": 4.9106412078932785e-06, + "loss": 1.5073238611221313, + "step": 798 + }, + { + "epoch": 0.24624855713736052, + "grad_norm": 6.53125, + "learning_rate": 4.9100509739309635e-06, + "loss": 1.5905678272247314, + "step": 800 + }, + { + "epoch": 0.24686417853020393, + "grad_norm": 6.09375, + "learning_rate": 4.909458841908179e-06, + "loss": 1.6703548431396484, + "step": 802 + }, + { + "epoch": 0.24747979992304733, + "grad_norm": 7.5625, + "learning_rate": 4.908864812413341e-06, + "loss": 1.626749038696289, + "step": 804 + }, + { + "epoch": 0.24809542131589074, + "grad_norm": 9.375, + "learning_rate": 4.908268886036751e-06, + "loss": 1.822136402130127, + "step": 806 + }, + { + "epoch": 0.24871104270873412, + "grad_norm": 9.0625, + "learning_rate": 4.907671063370592e-06, + "loss": 1.4152077436447144, + "step": 808 + }, + { + "epoch": 0.24932666410157753, + "grad_norm": 18.75, + "learning_rate": 4.907071345008938e-06, + "loss": 1.0158774852752686, + "step": 810 + }, + { + "epoch": 0.24994228549442093, + "grad_norm": 15.9375, + "learning_rate": 4.906469731547738e-06, + "loss": 1.6534260511398315, + "step": 812 + }, + { + "epoch": 0.2505579068872643, + "grad_norm": 3.53125, + "learning_rate": 4.905866223584831e-06, + "loss": 1.1389552354812622, + "step": 814 + }, + { + "epoch": 0.25117352828010775, + "grad_norm": 4.625, + "learning_rate": 4.905260821719936e-06, + "loss": 1.2016785144805908, + "step": 816 + }, + { + "epoch": 0.25178914967295113, + "grad_norm": 16.625, + "learning_rate": 4.904653526554655e-06, + "loss": 1.0394606590270996, + "step": 818 + }, + { + "epoch": 0.2524047710657945, + "grad_norm": 4.5, + "learning_rate": 4.9040443386924694e-06, + "loss": 1.1572508811950684, + "step": 820 + }, + { + "epoch": 0.25302039245863794, + "grad_norm": 8.5, + "learning_rate": 4.903433258738744e-06, + "loss": 1.594638705253601, + "step": 822 + }, + { + "epoch": 0.2536360138514813, + "grad_norm": 3.59375, + "learning_rate": 4.9028202873007216e-06, + "loss": 1.3011562824249268, + "step": 824 + }, + { + "epoch": 0.25425163524432476, + "grad_norm": 6.84375, + "learning_rate": 4.902205424987528e-06, + "loss": 1.2228734493255615, + "step": 826 + }, + { + "epoch": 0.25486725663716814, + "grad_norm": 9.125, + "learning_rate": 4.901588672410163e-06, + "loss": 0.702058732509613, + "step": 828 + }, + { + "epoch": 0.2554828780300115, + "grad_norm": 7.40625, + "learning_rate": 4.900970030181509e-06, + "loss": 1.5469894409179688, + "step": 830 + }, + { + "epoch": 0.25609849942285495, + "grad_norm": 2.578125, + "learning_rate": 4.900349498916324e-06, + "loss": 1.3061200380325317, + "step": 832 + }, + { + "epoch": 0.25671412081569833, + "grad_norm": 9.4375, + "learning_rate": 4.899727079231244e-06, + "loss": 1.4491020441055298, + "step": 834 + }, + { + "epoch": 0.25732974220854177, + "grad_norm": 9.75, + "learning_rate": 4.899102771744781e-06, + "loss": 1.0271233320236206, + "step": 836 + }, + { + "epoch": 0.25794536360138515, + "grad_norm": 40.5, + "learning_rate": 4.898476577077325e-06, + "loss": 0.7111718654632568, + "step": 838 + }, + { + "epoch": 0.2585609849942285, + "grad_norm": 9.875, + "learning_rate": 4.897848495851137e-06, + "loss": 1.659692645072937, + "step": 840 + }, + { + "epoch": 0.25917660638707196, + "grad_norm": 6.0625, + "learning_rate": 4.897218528690357e-06, + "loss": 1.3913519382476807, + "step": 842 + }, + { + "epoch": 0.25979222777991534, + "grad_norm": 6.09375, + "learning_rate": 4.896586676220998e-06, + "loss": 1.4254862070083618, + "step": 844 + }, + { + "epoch": 0.2604078491727588, + "grad_norm": 12.5, + "learning_rate": 4.895952939070946e-06, + "loss": 1.488398790359497, + "step": 846 + }, + { + "epoch": 0.26102347056560216, + "grad_norm": 5.78125, + "learning_rate": 4.8953173178699575e-06, + "loss": 1.8268243074417114, + "step": 848 + }, + { + "epoch": 0.26163909195844554, + "grad_norm": 2.4375, + "learning_rate": 4.894679813249666e-06, + "loss": 1.1114075183868408, + "step": 850 + }, + { + "epoch": 0.26225471335128897, + "grad_norm": 13.6875, + "learning_rate": 4.8940404258435725e-06, + "loss": 1.3839600086212158, + "step": 852 + }, + { + "epoch": 0.26287033474413235, + "grad_norm": 4.46875, + "learning_rate": 4.893399156287052e-06, + "loss": 1.1785236597061157, + "step": 854 + }, + { + "epoch": 0.2634859561369758, + "grad_norm": 10.9375, + "learning_rate": 4.892756005217347e-06, + "loss": 1.7065340280532837, + "step": 856 + }, + { + "epoch": 0.26410157752981916, + "grad_norm": 8.25, + "learning_rate": 4.892110973273573e-06, + "loss": 1.713850498199463, + "step": 858 + }, + { + "epoch": 0.26471719892266254, + "grad_norm": 3.1875, + "learning_rate": 4.891464061096711e-06, + "loss": 1.5599521398544312, + "step": 860 + }, + { + "epoch": 0.265332820315506, + "grad_norm": 11.375, + "learning_rate": 4.890815269329613e-06, + "loss": 1.5649224519729614, + "step": 862 + }, + { + "epoch": 0.26594844170834936, + "grad_norm": 11.125, + "learning_rate": 4.890164598616997e-06, + "loss": 1.5326846837997437, + "step": 864 + }, + { + "epoch": 0.2665640631011928, + "grad_norm": 9.0, + "learning_rate": 4.88951204960545e-06, + "loss": 0.8126274347305298, + "step": 866 + }, + { + "epoch": 0.2671796844940362, + "grad_norm": 2.9375, + "learning_rate": 4.888857622943426e-06, + "loss": 1.1992944478988647, + "step": 868 + }, + { + "epoch": 0.26779530588687955, + "grad_norm": 14.25, + "learning_rate": 4.88820131928124e-06, + "loss": 1.210860013961792, + "step": 870 + }, + { + "epoch": 0.268410927279723, + "grad_norm": 5.84375, + "learning_rate": 4.887543139271078e-06, + "loss": 1.1094002723693848, + "step": 872 + }, + { + "epoch": 0.26902654867256637, + "grad_norm": 4.1875, + "learning_rate": 4.886883083566988e-06, + "loss": 1.062155842781067, + "step": 874 + }, + { + "epoch": 0.26964217006540975, + "grad_norm": 5.0625, + "learning_rate": 4.88622115282488e-06, + "loss": 0.9888610243797302, + "step": 876 + }, + { + "epoch": 0.2702577914582532, + "grad_norm": 17.0, + "learning_rate": 4.885557347702533e-06, + "loss": 1.4706164598464966, + "step": 878 + }, + { + "epoch": 0.27087341285109656, + "grad_norm": 5.375, + "learning_rate": 4.884891668859583e-06, + "loss": 1.4921408891677856, + "step": 880 + }, + { + "epoch": 0.27148903424394, + "grad_norm": 11.6875, + "learning_rate": 4.88422411695753e-06, + "loss": 0.9078661799430847, + "step": 882 + }, + { + "epoch": 0.2721046556367834, + "grad_norm": 7.53125, + "learning_rate": 4.883554692659736e-06, + "loss": 1.3483184576034546, + "step": 884 + }, + { + "epoch": 0.27272027702962676, + "grad_norm": 3.40625, + "learning_rate": 4.882883396631421e-06, + "loss": 1.2152024507522583, + "step": 886 + }, + { + "epoch": 0.2733358984224702, + "grad_norm": 7.46875, + "learning_rate": 4.88221022953967e-06, + "loss": 1.4284247159957886, + "step": 888 + }, + { + "epoch": 0.27395151981531357, + "grad_norm": 6.34375, + "learning_rate": 4.881535192053423e-06, + "loss": 1.623680830001831, + "step": 890 + }, + { + "epoch": 0.274567141208157, + "grad_norm": 8.0625, + "learning_rate": 4.880858284843477e-06, + "loss": 1.4158018827438354, + "step": 892 + }, + { + "epoch": 0.2751827626010004, + "grad_norm": 5.96875, + "learning_rate": 4.8801795085824945e-06, + "loss": 1.7160724401474, + "step": 894 + }, + { + "epoch": 0.27579838399384377, + "grad_norm": 8.25, + "learning_rate": 4.879498863944988e-06, + "loss": 1.4056710004806519, + "step": 896 + }, + { + "epoch": 0.2764140053866872, + "grad_norm": 11.0625, + "learning_rate": 4.87881635160733e-06, + "loss": 1.3139357566833496, + "step": 898 + }, + { + "epoch": 0.2770296267795306, + "grad_norm": 7.375, + "learning_rate": 4.878131972247747e-06, + "loss": 1.2047233581542969, + "step": 900 + }, + { + "epoch": 0.277645248172374, + "grad_norm": 7.46875, + "learning_rate": 4.8774457265463245e-06, + "loss": 1.2656543254852295, + "step": 902 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 4.59375, + "learning_rate": 4.8767576151849985e-06, + "loss": 1.1122605800628662, + "step": 904 + }, + { + "epoch": 0.2788764909580608, + "grad_norm": 5.21875, + "learning_rate": 4.876067638847561e-06, + "loss": 1.4599850177764893, + "step": 906 + }, + { + "epoch": 0.2794921123509042, + "grad_norm": 4.09375, + "learning_rate": 4.875375798219658e-06, + "loss": 1.1917310953140259, + "step": 908 + }, + { + "epoch": 0.2801077337437476, + "grad_norm": 5.4375, + "learning_rate": 4.874682093988786e-06, + "loss": 1.4234414100646973, + "step": 910 + }, + { + "epoch": 0.280723355136591, + "grad_norm": 7.59375, + "learning_rate": 4.873986526844294e-06, + "loss": 0.9316292405128479, + "step": 912 + }, + { + "epoch": 0.2813389765294344, + "grad_norm": 9.0625, + "learning_rate": 4.873289097477384e-06, + "loss": 0.9619709849357605, + "step": 914 + }, + { + "epoch": 0.2819545979222778, + "grad_norm": 3.671875, + "learning_rate": 4.872589806581106e-06, + "loss": 0.9365772604942322, + "step": 916 + }, + { + "epoch": 0.2825702193151212, + "grad_norm": 8.25, + "learning_rate": 4.871888654850362e-06, + "loss": 1.7384915351867676, + "step": 918 + }, + { + "epoch": 0.2831858407079646, + "grad_norm": 7.96875, + "learning_rate": 4.871185642981901e-06, + "loss": 1.2540651559829712, + "step": 920 + }, + { + "epoch": 0.283801462100808, + "grad_norm": 5.625, + "learning_rate": 4.870480771674324e-06, + "loss": 1.3530112504959106, + "step": 922 + }, + { + "epoch": 0.2844170834936514, + "grad_norm": 5.9375, + "learning_rate": 4.869774041628075e-06, + "loss": 0.942622184753418, + "step": 924 + }, + { + "epoch": 0.2850327048864948, + "grad_norm": 9.6875, + "learning_rate": 4.869065453545447e-06, + "loss": 1.549060583114624, + "step": 926 + }, + { + "epoch": 0.2856483262793382, + "grad_norm": 17.75, + "learning_rate": 4.868355008130583e-06, + "loss": 1.539589285850525, + "step": 928 + }, + { + "epoch": 0.2862639476721816, + "grad_norm": 8.9375, + "learning_rate": 4.867642706089466e-06, + "loss": 1.3125108480453491, + "step": 930 + }, + { + "epoch": 0.286879569065025, + "grad_norm": 7.84375, + "learning_rate": 4.866928548129927e-06, + "loss": 1.1507761478424072, + "step": 932 + }, + { + "epoch": 0.2874951904578684, + "grad_norm": 8.875, + "learning_rate": 4.866212534961641e-06, + "loss": 1.196966528892517, + "step": 934 + }, + { + "epoch": 0.2881108118507118, + "grad_norm": 11.4375, + "learning_rate": 4.865494667296126e-06, + "loss": 1.3384065628051758, + "step": 936 + }, + { + "epoch": 0.28872643324355524, + "grad_norm": 5.4375, + "learning_rate": 4.864774945846744e-06, + "loss": 1.2341539859771729, + "step": 938 + }, + { + "epoch": 0.2893420546363986, + "grad_norm": 9.75, + "learning_rate": 4.864053371328697e-06, + "loss": 1.3900071382522583, + "step": 940 + }, + { + "epoch": 0.289957676029242, + "grad_norm": 2.578125, + "learning_rate": 4.8633299444590324e-06, + "loss": 1.0347307920455933, + "step": 942 + }, + { + "epoch": 0.29057329742208543, + "grad_norm": 4.46875, + "learning_rate": 4.862604665956633e-06, + "loss": 0.9689977169036865, + "step": 944 + }, + { + "epoch": 0.2911889188149288, + "grad_norm": 9.1875, + "learning_rate": 4.8618775365422246e-06, + "loss": 1.257887363433838, + "step": 946 + }, + { + "epoch": 0.29180454020777224, + "grad_norm": 10.1875, + "learning_rate": 4.861148556938372e-06, + "loss": 1.772337794303894, + "step": 948 + }, + { + "epoch": 0.2924201616006156, + "grad_norm": 7.6875, + "learning_rate": 4.860417727869481e-06, + "loss": 1.334553837776184, + "step": 950 + }, + { + "epoch": 0.293035782993459, + "grad_norm": 34.25, + "learning_rate": 4.85968505006179e-06, + "loss": 1.46852445602417, + "step": 952 + }, + { + "epoch": 0.29365140438630244, + "grad_norm": 4.0625, + "learning_rate": 4.858950524243379e-06, + "loss": 1.1797504425048828, + "step": 954 + }, + { + "epoch": 0.2942670257791458, + "grad_norm": 4.78125, + "learning_rate": 4.858214151144161e-06, + "loss": 1.172776460647583, + "step": 956 + }, + { + "epoch": 0.29488264717198925, + "grad_norm": 5.5625, + "learning_rate": 4.857475931495888e-06, + "loss": 1.4012408256530762, + "step": 958 + }, + { + "epoch": 0.29549826856483263, + "grad_norm": 13.375, + "learning_rate": 4.8567358660321465e-06, + "loss": 1.6080305576324463, + "step": 960 + }, + { + "epoch": 0.296113889957676, + "grad_norm": 7.96875, + "learning_rate": 4.8559939554883526e-06, + "loss": 1.382482886314392, + "step": 962 + }, + { + "epoch": 0.29672951135051945, + "grad_norm": 8.25, + "learning_rate": 4.855250200601762e-06, + "loss": 1.3735034465789795, + "step": 964 + }, + { + "epoch": 0.2973451327433628, + "grad_norm": 8.5625, + "learning_rate": 4.854504602111461e-06, + "loss": 1.5223190784454346, + "step": 966 + }, + { + "epoch": 0.2979607541362062, + "grad_norm": 7.03125, + "learning_rate": 4.853757160758367e-06, + "loss": 1.059695839881897, + "step": 968 + }, + { + "epoch": 0.29857637552904964, + "grad_norm": 6.5, + "learning_rate": 4.853007877285226e-06, + "loss": 1.1238740682601929, + "step": 970 + }, + { + "epoch": 0.299191996921893, + "grad_norm": 6.84375, + "learning_rate": 4.852256752436623e-06, + "loss": 0.9654524326324463, + "step": 972 + }, + { + "epoch": 0.29980761831473646, + "grad_norm": 5.75, + "learning_rate": 4.851503786958965e-06, + "loss": 1.393660545349121, + "step": 974 + }, + { + "epoch": 0.30042323970757984, + "grad_norm": 7.03125, + "learning_rate": 4.85074898160049e-06, + "loss": 1.5199719667434692, + "step": 976 + }, + { + "epoch": 0.3010388611004232, + "grad_norm": 8.25, + "learning_rate": 4.849992337111267e-06, + "loss": 1.1787185668945312, + "step": 978 + }, + { + "epoch": 0.30165448249326665, + "grad_norm": 4.8125, + "learning_rate": 4.849233854243189e-06, + "loss": 1.5050737857818604, + "step": 980 + }, + { + "epoch": 0.30227010388611003, + "grad_norm": 9.3125, + "learning_rate": 4.848473533749979e-06, + "loss": 1.610541820526123, + "step": 982 + }, + { + "epoch": 0.30288572527895347, + "grad_norm": 4.0625, + "learning_rate": 4.847711376387182e-06, + "loss": 1.2235430479049683, + "step": 984 + }, + { + "epoch": 0.30350134667179685, + "grad_norm": 9.0, + "learning_rate": 4.846947382912173e-06, + "loss": 1.4376128911972046, + "step": 986 + }, + { + "epoch": 0.3041169680646402, + "grad_norm": 7.4375, + "learning_rate": 4.846181554084147e-06, + "loss": 1.3893898725509644, + "step": 988 + }, + { + "epoch": 0.30473258945748366, + "grad_norm": 8.1875, + "learning_rate": 4.845413890664129e-06, + "loss": 1.1377290487289429, + "step": 990 + }, + { + "epoch": 0.30534821085032704, + "grad_norm": 5.96875, + "learning_rate": 4.844644393414961e-06, + "loss": 1.4446502923965454, + "step": 992 + }, + { + "epoch": 0.3059638322431705, + "grad_norm": 12.25, + "learning_rate": 4.84387306310131e-06, + "loss": 1.6131486892700195, + "step": 994 + }, + { + "epoch": 0.30657945363601385, + "grad_norm": 6.3125, + "learning_rate": 4.843099900489664e-06, + "loss": 1.3349088430404663, + "step": 996 + }, + { + "epoch": 0.30719507502885723, + "grad_norm": 20.125, + "learning_rate": 4.842324906348333e-06, + "loss": 1.4115350246429443, + "step": 998 + }, + { + "epoch": 0.30781069642170067, + "grad_norm": 16.25, + "learning_rate": 4.841548081447445e-06, + "loss": 1.2290444374084473, + "step": 1000 + }, + { + "epoch": 0.30842631781454405, + "grad_norm": 7.625, + "learning_rate": 4.840769426558948e-06, + "loss": 1.7017638683319092, + "step": 1002 + }, + { + "epoch": 0.3090419392073875, + "grad_norm": 7.09375, + "learning_rate": 4.839988942456609e-06, + "loss": 1.12124764919281, + "step": 1004 + }, + { + "epoch": 0.30965756060023086, + "grad_norm": 57.5, + "learning_rate": 4.839206629916015e-06, + "loss": 1.723775863647461, + "step": 1006 + }, + { + "epoch": 0.31027318199307424, + "grad_norm": 2.328125, + "learning_rate": 4.838422489714564e-06, + "loss": 1.3458318710327148, + "step": 1008 + }, + { + "epoch": 0.3108888033859177, + "grad_norm": 5.3125, + "learning_rate": 4.837636522631475e-06, + "loss": 1.2959532737731934, + "step": 1010 + }, + { + "epoch": 0.31150442477876106, + "grad_norm": 5.75, + "learning_rate": 4.8368487294477815e-06, + "loss": 1.2677240371704102, + "step": 1012 + }, + { + "epoch": 0.31212004617160444, + "grad_norm": 6.5625, + "learning_rate": 4.836059110946332e-06, + "loss": 1.1831039190292358, + "step": 1014 + }, + { + "epoch": 0.3127356675644479, + "grad_norm": 16.625, + "learning_rate": 4.835267667911786e-06, + "loss": 1.4186724424362183, + "step": 1016 + }, + { + "epoch": 0.31335128895729125, + "grad_norm": 20.0, + "learning_rate": 4.83447440113062e-06, + "loss": 1.467540979385376, + "step": 1018 + }, + { + "epoch": 0.3139669103501347, + "grad_norm": 4.6875, + "learning_rate": 4.833679311391121e-06, + "loss": 1.560141921043396, + "step": 1020 + }, + { + "epoch": 0.31458253174297807, + "grad_norm": 5.9375, + "learning_rate": 4.832882399483385e-06, + "loss": 1.241237998008728, + "step": 1022 + }, + { + "epoch": 0.31519815313582145, + "grad_norm": 5.0625, + "learning_rate": 4.832083666199324e-06, + "loss": 1.0696933269500732, + "step": 1024 + }, + { + "epoch": 0.3158137745286649, + "grad_norm": 10.4375, + "learning_rate": 4.8312831123326565e-06, + "loss": 0.7319807410240173, + "step": 1026 + }, + { + "epoch": 0.31642939592150826, + "grad_norm": 5.15625, + "learning_rate": 4.83048073867891e-06, + "loss": 1.38838791847229, + "step": 1028 + }, + { + "epoch": 0.3170450173143517, + "grad_norm": 6.375, + "learning_rate": 4.829676546035422e-06, + "loss": 1.09077787399292, + "step": 1030 + }, + { + "epoch": 0.3176606387071951, + "grad_norm": 18.25, + "learning_rate": 4.828870535201336e-06, + "loss": 1.5093919038772583, + "step": 1032 + }, + { + "epoch": 0.31827626010003846, + "grad_norm": 5.6875, + "learning_rate": 4.828062706977605e-06, + "loss": 1.4667083024978638, + "step": 1034 + }, + { + "epoch": 0.3188918814928819, + "grad_norm": 17.0, + "learning_rate": 4.827253062166985e-06, + "loss": 1.5847958326339722, + "step": 1036 + }, + { + "epoch": 0.31950750288572527, + "grad_norm": 12.125, + "learning_rate": 4.826441601574035e-06, + "loss": 1.0991942882537842, + "step": 1038 + }, + { + "epoch": 0.3201231242785687, + "grad_norm": 4.59375, + "learning_rate": 4.825628326005126e-06, + "loss": 1.5162996053695679, + "step": 1040 + }, + { + "epoch": 0.3207387456714121, + "grad_norm": 6.9375, + "learning_rate": 4.824813236268425e-06, + "loss": 1.5189971923828125, + "step": 1042 + }, + { + "epoch": 0.32135436706425546, + "grad_norm": 8.3125, + "learning_rate": 4.823996333173908e-06, + "loss": 1.4960123300552368, + "step": 1044 + }, + { + "epoch": 0.3219699884570989, + "grad_norm": 7.875, + "learning_rate": 4.823177617533348e-06, + "loss": 1.262855052947998, + "step": 1046 + }, + { + "epoch": 0.3225856098499423, + "grad_norm": 4.3125, + "learning_rate": 4.822357090160321e-06, + "loss": 1.3328428268432617, + "step": 1048 + }, + { + "epoch": 0.3232012312427857, + "grad_norm": 14.375, + "learning_rate": 4.821534751870205e-06, + "loss": 1.2429380416870117, + "step": 1050 + }, + { + "epoch": 0.3238168526356291, + "grad_norm": 7.53125, + "learning_rate": 4.8207106034801735e-06, + "loss": 1.1245135068893433, + "step": 1052 + }, + { + "epoch": 0.3244324740284725, + "grad_norm": 14.0, + "learning_rate": 4.819884645809203e-06, + "loss": 0.9234024882316589, + "step": 1054 + }, + { + "epoch": 0.3250480954213159, + "grad_norm": 7.40625, + "learning_rate": 4.819056879678066e-06, + "loss": 1.314733862876892, + "step": 1056 + }, + { + "epoch": 0.3256637168141593, + "grad_norm": 4.25, + "learning_rate": 4.818227305909332e-06, + "loss": 1.3780370950698853, + "step": 1058 + }, + { + "epoch": 0.32627933820700267, + "grad_norm": 13.4375, + "learning_rate": 4.817395925327367e-06, + "loss": 1.2600730657577515, + "step": 1060 + }, + { + "epoch": 0.3268949595998461, + "grad_norm": 6.0, + "learning_rate": 4.8165627387583316e-06, + "loss": 1.6714415550231934, + "step": 1062 + }, + { + "epoch": 0.3275105809926895, + "grad_norm": 8.5, + "learning_rate": 4.815727747030184e-06, + "loss": 1.5505750179290771, + "step": 1064 + }, + { + "epoch": 0.3281262023855329, + "grad_norm": 16.375, + "learning_rate": 4.814890950972672e-06, + "loss": 1.1855148077011108, + "step": 1066 + }, + { + "epoch": 0.3287418237783763, + "grad_norm": 9.0625, + "learning_rate": 4.814052351417341e-06, + "loss": 0.9353585243225098, + "step": 1068 + }, + { + "epoch": 0.3293574451712197, + "grad_norm": 6.90625, + "learning_rate": 4.813211949197525e-06, + "loss": 1.228911280632019, + "step": 1070 + }, + { + "epoch": 0.3299730665640631, + "grad_norm": 8.25, + "learning_rate": 4.81236974514835e-06, + "loss": 1.7351125478744507, + "step": 1072 + }, + { + "epoch": 0.3305886879569065, + "grad_norm": 16.5, + "learning_rate": 4.811525740106734e-06, + "loss": 1.7127091884613037, + "step": 1074 + }, + { + "epoch": 0.3312043093497499, + "grad_norm": 19.625, + "learning_rate": 4.810679934911382e-06, + "loss": 1.5812722444534302, + "step": 1076 + }, + { + "epoch": 0.3318199307425933, + "grad_norm": 4.9375, + "learning_rate": 4.8098323304027915e-06, + "loss": 1.3842276334762573, + "step": 1078 + }, + { + "epoch": 0.3324355521354367, + "grad_norm": 2.578125, + "learning_rate": 4.808982927423246e-06, + "loss": 1.4643383026123047, + "step": 1080 + }, + { + "epoch": 0.3330511735282801, + "grad_norm": 7.875, + "learning_rate": 4.808131726816814e-06, + "loss": 1.3700556755065918, + "step": 1082 + }, + { + "epoch": 0.3336667949211235, + "grad_norm": 32.5, + "learning_rate": 4.807278729429356e-06, + "loss": 1.2572020292282104, + "step": 1084 + }, + { + "epoch": 0.33428241631396693, + "grad_norm": 11.625, + "learning_rate": 4.8064239361085115e-06, + "loss": 1.3481972217559814, + "step": 1086 + }, + { + "epoch": 0.3348980377068103, + "grad_norm": 13.875, + "learning_rate": 4.80556734770371e-06, + "loss": 1.007904291152954, + "step": 1088 + }, + { + "epoch": 0.3355136590996537, + "grad_norm": 2.546875, + "learning_rate": 4.804708965066162e-06, + "loss": 1.2376952171325684, + "step": 1090 + }, + { + "epoch": 0.33612928049249713, + "grad_norm": 18.75, + "learning_rate": 4.803848789048861e-06, + "loss": 1.1707139015197754, + "step": 1092 + }, + { + "epoch": 0.3367449018853405, + "grad_norm": 2.515625, + "learning_rate": 4.802986820506583e-06, + "loss": 0.4984327554702759, + "step": 1094 + }, + { + "epoch": 0.33736052327818394, + "grad_norm": 8.0625, + "learning_rate": 4.802123060295887e-06, + "loss": 1.3485133647918701, + "step": 1096 + }, + { + "epoch": 0.3379761446710273, + "grad_norm": 5.40625, + "learning_rate": 4.801257509275109e-06, + "loss": 1.1582012176513672, + "step": 1098 + }, + { + "epoch": 0.3385917660638707, + "grad_norm": 6.15625, + "learning_rate": 4.8003901683043675e-06, + "loss": 1.3930922746658325, + "step": 1100 + }, + { + "epoch": 0.33920738745671414, + "grad_norm": 6.25, + "learning_rate": 4.799521038245559e-06, + "loss": 1.6170856952667236, + "step": 1102 + }, + { + "epoch": 0.3398230088495575, + "grad_norm": 12.375, + "learning_rate": 4.798650119962357e-06, + "loss": 1.0825083255767822, + "step": 1104 + }, + { + "epoch": 0.3404386302424009, + "grad_norm": 13.375, + "learning_rate": 4.797777414320213e-06, + "loss": 1.1367437839508057, + "step": 1106 + }, + { + "epoch": 0.34105425163524433, + "grad_norm": 5.03125, + "learning_rate": 4.796902922186353e-06, + "loss": 1.1999157667160034, + "step": 1108 + }, + { + "epoch": 0.3416698730280877, + "grad_norm": 8.4375, + "learning_rate": 4.7960266444297794e-06, + "loss": 1.4281408786773682, + "step": 1110 + }, + { + "epoch": 0.34228549442093115, + "grad_norm": 7.03125, + "learning_rate": 4.79514858192127e-06, + "loss": 1.3028841018676758, + "step": 1112 + }, + { + "epoch": 0.3429011158137745, + "grad_norm": 15.9375, + "learning_rate": 4.794268735533377e-06, + "loss": 1.4932379722595215, + "step": 1114 + }, + { + "epoch": 0.3435167372066179, + "grad_norm": 6.90625, + "learning_rate": 4.7933871061404204e-06, + "loss": 1.7872411012649536, + "step": 1116 + }, + { + "epoch": 0.34413235859946134, + "grad_norm": 8.625, + "learning_rate": 4.792503694618495e-06, + "loss": 1.3513400554656982, + "step": 1118 + }, + { + "epoch": 0.3447479799923047, + "grad_norm": 11.5625, + "learning_rate": 4.791618501845469e-06, + "loss": 1.5094581842422485, + "step": 1120 + }, + { + "epoch": 0.34536360138514816, + "grad_norm": 3.515625, + "learning_rate": 4.790731528700977e-06, + "loss": 1.0170141458511353, + "step": 1122 + }, + { + "epoch": 0.34597922277799154, + "grad_norm": 10.375, + "learning_rate": 4.789842776066425e-06, + "loss": 1.5296138525009155, + "step": 1124 + }, + { + "epoch": 0.3465948441708349, + "grad_norm": 8.3125, + "learning_rate": 4.788952244824984e-06, + "loss": 1.539614200592041, + "step": 1126 + }, + { + "epoch": 0.34721046556367835, + "grad_norm": 4.875, + "learning_rate": 4.788059935861597e-06, + "loss": 1.3610502481460571, + "step": 1128 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 9.6875, + "learning_rate": 4.78716585006297e-06, + "loss": 1.7436330318450928, + "step": 1130 + }, + { + "epoch": 0.34844170834936516, + "grad_norm": 16.25, + "learning_rate": 4.786269988317579e-06, + "loss": 1.3264261484146118, + "step": 1132 + }, + { + "epoch": 0.34905732974220854, + "grad_norm": 11.3125, + "learning_rate": 4.785372351515659e-06, + "loss": 1.6471713781356812, + "step": 1134 + }, + { + "epoch": 0.3496729511350519, + "grad_norm": 7.34375, + "learning_rate": 4.784472940549213e-06, + "loss": 1.5563104152679443, + "step": 1136 + }, + { + "epoch": 0.35028857252789536, + "grad_norm": 2.703125, + "learning_rate": 4.7835717563120044e-06, + "loss": 1.2945740222930908, + "step": 1138 + }, + { + "epoch": 0.35090419392073874, + "grad_norm": 7.3125, + "learning_rate": 4.782668799699563e-06, + "loss": 1.3756704330444336, + "step": 1140 + }, + { + "epoch": 0.3515198153135822, + "grad_norm": 2.078125, + "learning_rate": 4.781764071609173e-06, + "loss": 1.108988642692566, + "step": 1142 + }, + { + "epoch": 0.35213543670642555, + "grad_norm": 5.21875, + "learning_rate": 4.7808575729398865e-06, + "loss": 1.2461122274398804, + "step": 1144 + }, + { + "epoch": 0.35275105809926893, + "grad_norm": 6.15625, + "learning_rate": 4.779949304592511e-06, + "loss": 1.4991414546966553, + "step": 1146 + }, + { + "epoch": 0.35336667949211237, + "grad_norm": 7.6875, + "learning_rate": 4.779039267469612e-06, + "loss": 1.4341750144958496, + "step": 1148 + }, + { + "epoch": 0.35398230088495575, + "grad_norm": 14.0, + "learning_rate": 4.778127462475513e-06, + "loss": 1.4996871948242188, + "step": 1150 + }, + { + "epoch": 0.3545979222777991, + "grad_norm": 13.9375, + "learning_rate": 4.777213890516299e-06, + "loss": 1.656551718711853, + "step": 1152 + }, + { + "epoch": 0.35521354367064256, + "grad_norm": 23.75, + "learning_rate": 4.776298552499803e-06, + "loss": 1.1717579364776611, + "step": 1154 + }, + { + "epoch": 0.35582916506348594, + "grad_norm": 8.0, + "learning_rate": 4.775381449335617e-06, + "loss": 1.55185866355896, + "step": 1156 + }, + { + "epoch": 0.3564447864563294, + "grad_norm": 19.75, + "learning_rate": 4.77446258193509e-06, + "loss": 1.2072546482086182, + "step": 1158 + }, + { + "epoch": 0.35706040784917276, + "grad_norm": 7.5, + "learning_rate": 4.773541951211318e-06, + "loss": 1.4020482301712036, + "step": 1160 + }, + { + "epoch": 0.35767602924201614, + "grad_norm": 22.0, + "learning_rate": 4.772619558079154e-06, + "loss": 1.4429304599761963, + "step": 1162 + }, + { + "epoch": 0.35829165063485957, + "grad_norm": 13.25, + "learning_rate": 4.771695403455201e-06, + "loss": 1.5435585975646973, + "step": 1164 + }, + { + "epoch": 0.35890727202770295, + "grad_norm": 11.0625, + "learning_rate": 4.770769488257812e-06, + "loss": 1.156057596206665, + "step": 1166 + }, + { + "epoch": 0.3595228934205464, + "grad_norm": 5.65625, + "learning_rate": 4.769841813407088e-06, + "loss": 1.3467957973480225, + "step": 1168 + }, + { + "epoch": 0.36013851481338977, + "grad_norm": 26.375, + "learning_rate": 4.768912379824882e-06, + "loss": 1.4512295722961426, + "step": 1170 + }, + { + "epoch": 0.36075413620623314, + "grad_norm": 3.296875, + "learning_rate": 4.767981188434791e-06, + "loss": 1.6179447174072266, + "step": 1172 + }, + { + "epoch": 0.3613697575990766, + "grad_norm": 12.9375, + "learning_rate": 4.767048240162164e-06, + "loss": 1.697928547859192, + "step": 1174 + }, + { + "epoch": 0.36198537899191996, + "grad_norm": 7.4375, + "learning_rate": 4.7661135359340915e-06, + "loss": 1.2352161407470703, + "step": 1176 + }, + { + "epoch": 0.3626010003847634, + "grad_norm": 9.0625, + "learning_rate": 4.7651770766794085e-06, + "loss": 1.5190646648406982, + "step": 1178 + }, + { + "epoch": 0.3632166217776068, + "grad_norm": 12.75, + "learning_rate": 4.764238863328696e-06, + "loss": 1.2285685539245605, + "step": 1180 + }, + { + "epoch": 0.36383224317045015, + "grad_norm": 20.5, + "learning_rate": 4.763298896814279e-06, + "loss": 1.5371445417404175, + "step": 1182 + }, + { + "epoch": 0.3644478645632936, + "grad_norm": 5.78125, + "learning_rate": 4.762357178070221e-06, + "loss": 1.3828462362289429, + "step": 1184 + }, + { + "epoch": 0.36506348595613697, + "grad_norm": 7.875, + "learning_rate": 4.761413708032332e-06, + "loss": 1.4076499938964844, + "step": 1186 + }, + { + "epoch": 0.3656791073489804, + "grad_norm": 6.53125, + "learning_rate": 4.760468487638158e-06, + "loss": 1.313516616821289, + "step": 1188 + }, + { + "epoch": 0.3662947287418238, + "grad_norm": 6.5625, + "learning_rate": 4.759521517826985e-06, + "loss": 1.4816573858261108, + "step": 1190 + }, + { + "epoch": 0.36691035013466716, + "grad_norm": 6.46875, + "learning_rate": 4.7585727995398376e-06, + "loss": 1.158787727355957, + "step": 1192 + }, + { + "epoch": 0.3675259715275106, + "grad_norm": 4.40625, + "learning_rate": 4.75762233371948e-06, + "loss": 1.4099538326263428, + "step": 1194 + }, + { + "epoch": 0.368141592920354, + "grad_norm": 8.3125, + "learning_rate": 4.756670121310411e-06, + "loss": 1.3869171142578125, + "step": 1196 + }, + { + "epoch": 0.36875721431319736, + "grad_norm": 11.875, + "learning_rate": 4.7557161632588655e-06, + "loss": 1.889589786529541, + "step": 1198 + }, + { + "epoch": 0.3693728357060408, + "grad_norm": 15.5, + "learning_rate": 4.754760460512813e-06, + "loss": 1.5803718566894531, + "step": 1200 + }, + { + "epoch": 0.36998845709888417, + "grad_norm": 2.171875, + "learning_rate": 4.753803014021956e-06, + "loss": 1.3832993507385254, + "step": 1202 + }, + { + "epoch": 0.3706040784917276, + "grad_norm": 7.84375, + "learning_rate": 4.75284382473773e-06, + "loss": 1.3015557527542114, + "step": 1204 + }, + { + "epoch": 0.371219699884571, + "grad_norm": 10.8125, + "learning_rate": 4.751882893613305e-06, + "loss": 1.5658292770385742, + "step": 1206 + }, + { + "epoch": 0.37183532127741437, + "grad_norm": 6.625, + "learning_rate": 4.75092022160358e-06, + "loss": 1.6867892742156982, + "step": 1208 + }, + { + "epoch": 0.3724509426702578, + "grad_norm": 7.65625, + "learning_rate": 4.7499558096651796e-06, + "loss": 1.3453547954559326, + "step": 1210 + }, + { + "epoch": 0.3730665640631012, + "grad_norm": 9.125, + "learning_rate": 4.748989658756467e-06, + "loss": 1.4894853830337524, + "step": 1212 + }, + { + "epoch": 0.3736821854559446, + "grad_norm": 6.0625, + "learning_rate": 4.748021769837524e-06, + "loss": 1.0069468021392822, + "step": 1214 + }, + { + "epoch": 0.374297806848788, + "grad_norm": 9.75, + "learning_rate": 4.747052143870166e-06, + "loss": 1.6321216821670532, + "step": 1216 + }, + { + "epoch": 0.3749134282416314, + "grad_norm": 7.25, + "learning_rate": 4.746080781817929e-06, + "loss": 1.0080413818359375, + "step": 1218 + }, + { + "epoch": 0.3755290496344748, + "grad_norm": 2.34375, + "learning_rate": 4.745107684646081e-06, + "loss": 1.0679363012313843, + "step": 1220 + }, + { + "epoch": 0.3761446710273182, + "grad_norm": 5.1875, + "learning_rate": 4.744132853321608e-06, + "loss": 1.5123836994171143, + "step": 1222 + }, + { + "epoch": 0.3767602924201616, + "grad_norm": 11.375, + "learning_rate": 4.743156288813223e-06, + "loss": 1.4279792308807373, + "step": 1224 + }, + { + "epoch": 0.377375913813005, + "grad_norm": 3.765625, + "learning_rate": 4.742177992091359e-06, + "loss": 1.2972075939178467, + "step": 1226 + }, + { + "epoch": 0.3779915352058484, + "grad_norm": 6.78125, + "learning_rate": 4.7411979641281724e-06, + "loss": 1.2801681756973267, + "step": 1228 + }, + { + "epoch": 0.3786071565986918, + "grad_norm": 5.21875, + "learning_rate": 4.7402162058975375e-06, + "loss": 1.3559138774871826, + "step": 1230 + }, + { + "epoch": 0.3792227779915352, + "grad_norm": 9.375, + "learning_rate": 4.7392327183750516e-06, + "loss": 1.3966338634490967, + "step": 1232 + }, + { + "epoch": 0.37983839938437863, + "grad_norm": 5.4375, + "learning_rate": 4.738247502538027e-06, + "loss": 1.4354243278503418, + "step": 1234 + }, + { + "epoch": 0.380454020777222, + "grad_norm": 3.953125, + "learning_rate": 4.737260559365494e-06, + "loss": 1.4101216793060303, + "step": 1236 + }, + { + "epoch": 0.3810696421700654, + "grad_norm": 5.59375, + "learning_rate": 4.736271889838201e-06, + "loss": 1.9176892042160034, + "step": 1238 + }, + { + "epoch": 0.3816852635629088, + "grad_norm": 6.4375, + "learning_rate": 4.735281494938612e-06, + "loss": 1.3109514713287354, + "step": 1240 + }, + { + "epoch": 0.3823008849557522, + "grad_norm": 5.1875, + "learning_rate": 4.734289375650903e-06, + "loss": 1.9060190916061401, + "step": 1242 + }, + { + "epoch": 0.3829165063485956, + "grad_norm": 5.15625, + "learning_rate": 4.733295532960966e-06, + "loss": 1.3563264608383179, + "step": 1244 + }, + { + "epoch": 0.383532127741439, + "grad_norm": 10.3125, + "learning_rate": 4.732299967856405e-06, + "loss": 1.6917022466659546, + "step": 1246 + }, + { + "epoch": 0.3841477491342824, + "grad_norm": 7.625, + "learning_rate": 4.731302681326535e-06, + "loss": 1.2438175678253174, + "step": 1248 + }, + { + "epoch": 0.38476337052712584, + "grad_norm": 6.25, + "learning_rate": 4.730303674362382e-06, + "loss": 1.493094801902771, + "step": 1250 + }, + { + "epoch": 0.3853789919199692, + "grad_norm": 52.25, + "learning_rate": 4.729302947956681e-06, + "loss": 1.2970855236053467, + "step": 1252 + }, + { + "epoch": 0.3859946133128126, + "grad_norm": 13.9375, + "learning_rate": 4.7283005031038775e-06, + "loss": 1.180719256401062, + "step": 1254 + }, + { + "epoch": 0.38661023470565603, + "grad_norm": 6.53125, + "learning_rate": 4.727296340800123e-06, + "loss": 1.365455150604248, + "step": 1256 + }, + { + "epoch": 0.3872258560984994, + "grad_norm": 5.65625, + "learning_rate": 4.726290462043275e-06, + "loss": 1.53617262840271, + "step": 1258 + }, + { + "epoch": 0.38784147749134285, + "grad_norm": 8.125, + "learning_rate": 4.725282867832899e-06, + "loss": 1.7439792156219482, + "step": 1260 + }, + { + "epoch": 0.3884570988841862, + "grad_norm": 8.5625, + "learning_rate": 4.724273559170264e-06, + "loss": 1.4724012613296509, + "step": 1262 + }, + { + "epoch": 0.3890727202770296, + "grad_norm": 39.5, + "learning_rate": 4.723262537058342e-06, + "loss": 1.6723387241363525, + "step": 1264 + }, + { + "epoch": 0.38968834166987304, + "grad_norm": 7.625, + "learning_rate": 4.722249802501807e-06, + "loss": 1.2530168294906616, + "step": 1266 + }, + { + "epoch": 0.3903039630627164, + "grad_norm": 2.75, + "learning_rate": 4.72123535650704e-06, + "loss": 1.153831124305725, + "step": 1268 + }, + { + "epoch": 0.39091958445555985, + "grad_norm": 4.75, + "learning_rate": 4.720219200082116e-06, + "loss": 1.2530832290649414, + "step": 1270 + }, + { + "epoch": 0.39153520584840323, + "grad_norm": 77.0, + "learning_rate": 4.719201334236811e-06, + "loss": 1.822005033493042, + "step": 1272 + }, + { + "epoch": 0.3921508272412466, + "grad_norm": 6.40625, + "learning_rate": 4.718181759982604e-06, + "loss": 1.5550251007080078, + "step": 1274 + }, + { + "epoch": 0.39276644863409005, + "grad_norm": 9.375, + "learning_rate": 4.7171604783326674e-06, + "loss": 1.3926851749420166, + "step": 1276 + }, + { + "epoch": 0.39338207002693343, + "grad_norm": 30.875, + "learning_rate": 4.716137490301872e-06, + "loss": 1.1721335649490356, + "step": 1278 + }, + { + "epoch": 0.39399769141977686, + "grad_norm": 3.09375, + "learning_rate": 4.715112796906784e-06, + "loss": 1.2481324672698975, + "step": 1280 + }, + { + "epoch": 0.39461331281262024, + "grad_norm": 10.4375, + "learning_rate": 4.714086399165664e-06, + "loss": 0.8791213631629944, + "step": 1282 + }, + { + "epoch": 0.3952289342054636, + "grad_norm": 5.09375, + "learning_rate": 4.713058298098467e-06, + "loss": 1.1425285339355469, + "step": 1284 + }, + { + "epoch": 0.39584455559830706, + "grad_norm": 4.6875, + "learning_rate": 4.712028494726838e-06, + "loss": 1.3065851926803589, + "step": 1286 + }, + { + "epoch": 0.39646017699115044, + "grad_norm": 6.3125, + "learning_rate": 4.7109969900741185e-06, + "loss": 1.2983765602111816, + "step": 1288 + }, + { + "epoch": 0.3970757983839938, + "grad_norm": 5.8125, + "learning_rate": 4.709963785165336e-06, + "loss": 1.4547548294067383, + "step": 1290 + }, + { + "epoch": 0.39769141977683725, + "grad_norm": 7.0, + "learning_rate": 4.708928881027209e-06, + "loss": 1.8190933465957642, + "step": 1292 + }, + { + "epoch": 0.39830704116968063, + "grad_norm": 8.375, + "learning_rate": 4.707892278688148e-06, + "loss": 1.7181881666183472, + "step": 1294 + }, + { + "epoch": 0.39892266256252407, + "grad_norm": 6.9375, + "learning_rate": 4.706853979178244e-06, + "loss": 1.382314920425415, + "step": 1296 + }, + { + "epoch": 0.39953828395536745, + "grad_norm": 5.9375, + "learning_rate": 4.705813983529282e-06, + "loss": 1.3636555671691895, + "step": 1298 + }, + { + "epoch": 0.4001539053482108, + "grad_norm": 6.96875, + "learning_rate": 4.704772292774726e-06, + "loss": 1.2776868343353271, + "step": 1300 + }, + { + "epoch": 0.40076952674105426, + "grad_norm": 5.5625, + "learning_rate": 4.703728907949729e-06, + "loss": 1.383784532546997, + "step": 1302 + }, + { + "epoch": 0.40138514813389764, + "grad_norm": 8.75, + "learning_rate": 4.702683830091127e-06, + "loss": 1.3588171005249023, + "step": 1304 + }, + { + "epoch": 0.4020007695267411, + "grad_norm": 15.5, + "learning_rate": 4.701637060237434e-06, + "loss": 1.5341674089431763, + "step": 1306 + }, + { + "epoch": 0.40261639091958445, + "grad_norm": 7.75, + "learning_rate": 4.700588599428851e-06, + "loss": 1.5815376043319702, + "step": 1308 + }, + { + "epoch": 0.40323201231242783, + "grad_norm": 3.953125, + "learning_rate": 4.699538448707258e-06, + "loss": 1.2344942092895508, + "step": 1310 + }, + { + "epoch": 0.40384763370527127, + "grad_norm": 5.03125, + "learning_rate": 4.698486609116212e-06, + "loss": 1.4477823972702026, + "step": 1312 + }, + { + "epoch": 0.40446325509811465, + "grad_norm": 7.5, + "learning_rate": 4.697433081700949e-06, + "loss": 1.0380557775497437, + "step": 1314 + }, + { + "epoch": 0.4050788764909581, + "grad_norm": 3.046875, + "learning_rate": 4.6963778675083815e-06, + "loss": 1.322392225265503, + "step": 1316 + }, + { + "epoch": 0.40569449788380146, + "grad_norm": 23.0, + "learning_rate": 4.695320967587104e-06, + "loss": 1.6188472509384155, + "step": 1318 + }, + { + "epoch": 0.40631011927664484, + "grad_norm": 4.53125, + "learning_rate": 4.694262382987377e-06, + "loss": 1.4975945949554443, + "step": 1320 + }, + { + "epoch": 0.4069257406694883, + "grad_norm": 17.0, + "learning_rate": 4.693202114761143e-06, + "loss": 1.310897946357727, + "step": 1322 + }, + { + "epoch": 0.40754136206233166, + "grad_norm": 3.21875, + "learning_rate": 4.692140163962012e-06, + "loss": 1.1289759874343872, + "step": 1324 + }, + { + "epoch": 0.4081569834551751, + "grad_norm": 6.46875, + "learning_rate": 4.69107653164527e-06, + "loss": 1.5048915147781372, + "step": 1326 + }, + { + "epoch": 0.4087726048480185, + "grad_norm": 3.25, + "learning_rate": 4.6900112188678715e-06, + "loss": 1.0909032821655273, + "step": 1328 + }, + { + "epoch": 0.40938822624086185, + "grad_norm": 9.625, + "learning_rate": 4.688944226688442e-06, + "loss": 1.2539284229278564, + "step": 1330 + }, + { + "epoch": 0.4100038476337053, + "grad_norm": 5.28125, + "learning_rate": 4.687875556167275e-06, + "loss": 1.3737448453903198, + "step": 1332 + }, + { + "epoch": 0.41061946902654867, + "grad_norm": 8.0625, + "learning_rate": 4.686805208366333e-06, + "loss": 1.6758344173431396, + "step": 1334 + }, + { + "epoch": 0.41123509041939205, + "grad_norm": 18.75, + "learning_rate": 4.685733184349245e-06, + "loss": 1.5155155658721924, + "step": 1336 + }, + { + "epoch": 0.4118507118122355, + "grad_norm": 6.40625, + "learning_rate": 4.684659485181303e-06, + "loss": 1.2852325439453125, + "step": 1338 + }, + { + "epoch": 0.41246633320507886, + "grad_norm": 9.25, + "learning_rate": 4.683584111929469e-06, + "loss": 1.2048956155776978, + "step": 1340 + }, + { + "epoch": 0.4130819545979223, + "grad_norm": 8.125, + "learning_rate": 4.682507065662363e-06, + "loss": 1.2220932245254517, + "step": 1342 + }, + { + "epoch": 0.4136975759907657, + "grad_norm": 13.3125, + "learning_rate": 4.681428347450271e-06, + "loss": 1.357061505317688, + "step": 1344 + }, + { + "epoch": 0.41431319738360906, + "grad_norm": 7.25, + "learning_rate": 4.68034795836514e-06, + "loss": 1.5958317518234253, + "step": 1346 + }, + { + "epoch": 0.4149288187764525, + "grad_norm": 7.15625, + "learning_rate": 4.679265899480577e-06, + "loss": 1.5246134996414185, + "step": 1348 + }, + { + "epoch": 0.41554444016929587, + "grad_norm": 6.25, + "learning_rate": 4.678182171871847e-06, + "loss": 1.5932855606079102, + "step": 1350 + }, + { + "epoch": 0.4161600615621393, + "grad_norm": 6.03125, + "learning_rate": 4.677096776615875e-06, + "loss": 1.343849539756775, + "step": 1352 + }, + { + "epoch": 0.4167756829549827, + "grad_norm": 4.625, + "learning_rate": 4.676009714791242e-06, + "loss": 1.0770719051361084, + "step": 1354 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 3.53125, + "learning_rate": 4.6749209874781864e-06, + "loss": 1.2976691722869873, + "step": 1356 + }, + { + "epoch": 0.4180069257406695, + "grad_norm": 9.125, + "learning_rate": 4.6738305957586e-06, + "loss": 1.6580630540847778, + "step": 1358 + }, + { + "epoch": 0.4186225471335129, + "grad_norm": 51.25, + "learning_rate": 4.672738540716032e-06, + "loss": 1.1791507005691528, + "step": 1360 + }, + { + "epoch": 0.4192381685263563, + "grad_norm": 7.375, + "learning_rate": 4.671644823435681e-06, + "loss": 1.7933979034423828, + "step": 1362 + }, + { + "epoch": 0.4198537899191997, + "grad_norm": 7.59375, + "learning_rate": 4.670549445004395e-06, + "loss": 1.090211272239685, + "step": 1364 + }, + { + "epoch": 0.4204694113120431, + "grad_norm": 4.75, + "learning_rate": 4.669452406510681e-06, + "loss": 1.3155393600463867, + "step": 1366 + }, + { + "epoch": 0.4210850327048865, + "grad_norm": 7.71875, + "learning_rate": 4.6683537090446875e-06, + "loss": 1.668478012084961, + "step": 1368 + }, + { + "epoch": 0.4217006540977299, + "grad_norm": 11.4375, + "learning_rate": 4.667253353698216e-06, + "loss": 1.3040167093276978, + "step": 1370 + }, + { + "epoch": 0.4223162754905733, + "grad_norm": 4.40625, + "learning_rate": 4.666151341564713e-06, + "loss": 1.1923213005065918, + "step": 1372 + }, + { + "epoch": 0.4229318968834167, + "grad_norm": 25.625, + "learning_rate": 4.665047673739275e-06, + "loss": 1.5164082050323486, + "step": 1374 + }, + { + "epoch": 0.4235475182762601, + "grad_norm": 6.0625, + "learning_rate": 4.66394235131864e-06, + "loss": 1.4245156049728394, + "step": 1376 + }, + { + "epoch": 0.4241631396691035, + "grad_norm": 10.1875, + "learning_rate": 4.662835375401191e-06, + "loss": 1.396084189414978, + "step": 1378 + }, + { + "epoch": 0.4247787610619469, + "grad_norm": 4.875, + "learning_rate": 4.661726747086957e-06, + "loss": 1.2419757843017578, + "step": 1380 + }, + { + "epoch": 0.4253943824547903, + "grad_norm": 8.6875, + "learning_rate": 4.660616467477604e-06, + "loss": 1.0193966627120972, + "step": 1382 + }, + { + "epoch": 0.4260100038476337, + "grad_norm": 7.5625, + "learning_rate": 4.659504537676444e-06, + "loss": 1.6151647567749023, + "step": 1384 + }, + { + "epoch": 0.4266256252404771, + "grad_norm": 3.75, + "learning_rate": 4.658390958788426e-06, + "loss": 1.2757296562194824, + "step": 1386 + }, + { + "epoch": 0.4272412466333205, + "grad_norm": 8.0625, + "learning_rate": 4.6572757319201366e-06, + "loss": 1.3928544521331787, + "step": 1388 + }, + { + "epoch": 0.4278568680261639, + "grad_norm": 2.796875, + "learning_rate": 4.656158858179805e-06, + "loss": 1.0542001724243164, + "step": 1390 + }, + { + "epoch": 0.4284724894190073, + "grad_norm": 7.0, + "learning_rate": 4.655040338677292e-06, + "loss": 1.1852567195892334, + "step": 1392 + }, + { + "epoch": 0.4290881108118507, + "grad_norm": 6.5, + "learning_rate": 4.6539201745240925e-06, + "loss": 1.287937879562378, + "step": 1394 + }, + { + "epoch": 0.4297037322046941, + "grad_norm": 12.9375, + "learning_rate": 4.652798366833344e-06, + "loss": 1.372180461883545, + "step": 1396 + }, + { + "epoch": 0.43031935359753754, + "grad_norm": 7.46875, + "learning_rate": 4.651674916719809e-06, + "loss": 1.5380756855010986, + "step": 1398 + }, + { + "epoch": 0.4309349749903809, + "grad_norm": 3.359375, + "learning_rate": 4.650549825299886e-06, + "loss": 1.2015230655670166, + "step": 1400 + }, + { + "epoch": 0.4315505963832243, + "grad_norm": 5.25, + "learning_rate": 4.649423093691603e-06, + "loss": 1.5411839485168457, + "step": 1402 + }, + { + "epoch": 0.43216621777606773, + "grad_norm": 6.21875, + "learning_rate": 4.648294723014618e-06, + "loss": 1.373792290687561, + "step": 1404 + }, + { + "epoch": 0.4327818391689111, + "grad_norm": 1.4296875, + "learning_rate": 4.647164714390219e-06, + "loss": 1.1725441217422485, + "step": 1406 + }, + { + "epoch": 0.43339746056175454, + "grad_norm": 7.0, + "learning_rate": 4.6460330689413214e-06, + "loss": 1.2541426420211792, + "step": 1408 + }, + { + "epoch": 0.4340130819545979, + "grad_norm": 34.0, + "learning_rate": 4.644899787792465e-06, + "loss": 1.3803458213806152, + "step": 1410 + }, + { + "epoch": 0.4346287033474413, + "grad_norm": 7.03125, + "learning_rate": 4.643764872069819e-06, + "loss": 1.414416790008545, + "step": 1412 + }, + { + "epoch": 0.43524432474028474, + "grad_norm": 5.09375, + "learning_rate": 4.642628322901171e-06, + "loss": 1.1327471733093262, + "step": 1414 + }, + { + "epoch": 0.4358599461331281, + "grad_norm": 28.0, + "learning_rate": 4.64149014141594e-06, + "loss": 1.6805217266082764, + "step": 1416 + }, + { + "epoch": 0.43647556752597155, + "grad_norm": 7.96875, + "learning_rate": 4.640350328745159e-06, + "loss": 1.5550297498703003, + "step": 1418 + }, + { + "epoch": 0.43709118891881493, + "grad_norm": 6.8125, + "learning_rate": 4.6392088860214865e-06, + "loss": 1.574475884437561, + "step": 1420 + }, + { + "epoch": 0.4377068103116583, + "grad_norm": 13.5, + "learning_rate": 4.638065814379201e-06, + "loss": 1.6193119287490845, + "step": 1422 + }, + { + "epoch": 0.43832243170450175, + "grad_norm": 6.0625, + "learning_rate": 4.636921114954196e-06, + "loss": 1.1327303647994995, + "step": 1424 + }, + { + "epoch": 0.4389380530973451, + "grad_norm": 5.65625, + "learning_rate": 4.635774788883986e-06, + "loss": 0.9087588787078857, + "step": 1426 + }, + { + "epoch": 0.43955367449018856, + "grad_norm": 9.1875, + "learning_rate": 4.634626837307702e-06, + "loss": 1.2922405004501343, + "step": 1428 + }, + { + "epoch": 0.44016929588303194, + "grad_norm": 4.09375, + "learning_rate": 4.633477261366087e-06, + "loss": 1.099453330039978, + "step": 1430 + }, + { + "epoch": 0.4407849172758753, + "grad_norm": 6.875, + "learning_rate": 4.632326062201502e-06, + "loss": 1.2831623554229736, + "step": 1432 + }, + { + "epoch": 0.44140053866871876, + "grad_norm": 11.5625, + "learning_rate": 4.631173240957919e-06, + "loss": 1.104215145111084, + "step": 1434 + }, + { + "epoch": 0.44201616006156214, + "grad_norm": 8.1875, + "learning_rate": 4.630018798780923e-06, + "loss": 1.4881500005722046, + "step": 1436 + }, + { + "epoch": 0.4426317814544055, + "grad_norm": 6.90625, + "learning_rate": 4.628862736817707e-06, + "loss": 1.2838746309280396, + "step": 1438 + }, + { + "epoch": 0.44324740284724895, + "grad_norm": 6.3125, + "learning_rate": 4.627705056217079e-06, + "loss": 1.402595043182373, + "step": 1440 + }, + { + "epoch": 0.44386302424009233, + "grad_norm": 5.40625, + "learning_rate": 4.626545758129449e-06, + "loss": 1.4191675186157227, + "step": 1442 + }, + { + "epoch": 0.44447864563293576, + "grad_norm": 3.375, + "learning_rate": 4.62538484370684e-06, + "loss": 1.5125514268875122, + "step": 1444 + }, + { + "epoch": 0.44509426702577914, + "grad_norm": 6.25, + "learning_rate": 4.624222314102876e-06, + "loss": 1.2079944610595703, + "step": 1446 + }, + { + "epoch": 0.4457098884186225, + "grad_norm": 4.78125, + "learning_rate": 4.623058170472792e-06, + "loss": 1.1275867223739624, + "step": 1448 + }, + { + "epoch": 0.44632550981146596, + "grad_norm": 55.75, + "learning_rate": 4.62189241397342e-06, + "loss": 1.4862993955612183, + "step": 1450 + }, + { + "epoch": 0.44694113120430934, + "grad_norm": 7.5, + "learning_rate": 4.6207250457632e-06, + "loss": 1.3993580341339111, + "step": 1452 + }, + { + "epoch": 0.4475567525971528, + "grad_norm": 3.5625, + "learning_rate": 4.619556067002173e-06, + "loss": 1.0116881132125854, + "step": 1454 + }, + { + "epoch": 0.44817237398999615, + "grad_norm": 7.875, + "learning_rate": 4.6183854788519785e-06, + "loss": 1.0309436321258545, + "step": 1456 + }, + { + "epoch": 0.44878799538283953, + "grad_norm": 12.5, + "learning_rate": 4.6172132824758565e-06, + "loss": 1.2677288055419922, + "step": 1458 + }, + { + "epoch": 0.44940361677568297, + "grad_norm": 18.0, + "learning_rate": 4.616039479038644e-06, + "loss": 1.6138582229614258, + "step": 1460 + }, + { + "epoch": 0.45001923816852635, + "grad_norm": 9.0625, + "learning_rate": 4.614864069706777e-06, + "loss": 1.326206922531128, + "step": 1462 + }, + { + "epoch": 0.4506348595613698, + "grad_norm": 11.75, + "learning_rate": 4.613687055648285e-06, + "loss": 1.2700221538543701, + "step": 1464 + }, + { + "epoch": 0.45125048095421316, + "grad_norm": 13.125, + "learning_rate": 4.6125084380327935e-06, + "loss": 1.2647291421890259, + "step": 1466 + }, + { + "epoch": 0.45186610234705654, + "grad_norm": 6.25, + "learning_rate": 4.611328218031521e-06, + "loss": 1.2361282110214233, + "step": 1468 + }, + { + "epoch": 0.4524817237399, + "grad_norm": 10.3125, + "learning_rate": 4.6101463968172795e-06, + "loss": 1.4439221620559692, + "step": 1470 + }, + { + "epoch": 0.45309734513274336, + "grad_norm": 10.5625, + "learning_rate": 4.608962975564471e-06, + "loss": 1.5618476867675781, + "step": 1472 + }, + { + "epoch": 0.4537129665255868, + "grad_norm": 13.25, + "learning_rate": 4.6077779554490875e-06, + "loss": 1.4153897762298584, + "step": 1474 + }, + { + "epoch": 0.45432858791843017, + "grad_norm": 12.125, + "learning_rate": 4.606591337648709e-06, + "loss": 1.6324784755706787, + "step": 1476 + }, + { + "epoch": 0.45494420931127355, + "grad_norm": 10.75, + "learning_rate": 4.605403123342506e-06, + "loss": 1.528695821762085, + "step": 1478 + }, + { + "epoch": 0.455559830704117, + "grad_norm": 9.125, + "learning_rate": 4.604213313711232e-06, + "loss": 1.4585542678833008, + "step": 1480 + }, + { + "epoch": 0.45617545209696037, + "grad_norm": 7.65625, + "learning_rate": 4.60302190993723e-06, + "loss": 1.4700865745544434, + "step": 1482 + }, + { + "epoch": 0.45679107348980375, + "grad_norm": 11.75, + "learning_rate": 4.601828913204421e-06, + "loss": 1.332728385925293, + "step": 1484 + }, + { + "epoch": 0.4574066948826472, + "grad_norm": 11.875, + "learning_rate": 4.600634324698317e-06, + "loss": 1.1675676107406616, + "step": 1486 + }, + { + "epoch": 0.45802231627549056, + "grad_norm": 18.75, + "learning_rate": 4.599438145606003e-06, + "loss": 0.9165405631065369, + "step": 1488 + }, + { + "epoch": 0.458637937668334, + "grad_norm": 3.859375, + "learning_rate": 4.5982403771161525e-06, + "loss": 0.7547895908355713, + "step": 1490 + }, + { + "epoch": 0.4592535590611774, + "grad_norm": 16.625, + "learning_rate": 4.597041020419012e-06, + "loss": 1.3234155178070068, + "step": 1492 + }, + { + "epoch": 0.45986918045402075, + "grad_norm": 16.875, + "learning_rate": 4.595840076706411e-06, + "loss": 1.7954037189483643, + "step": 1494 + }, + { + "epoch": 0.4604848018468642, + "grad_norm": 16.0, + "learning_rate": 4.5946375471717545e-06, + "loss": 1.4703285694122314, + "step": 1496 + }, + { + "epoch": 0.46110042323970757, + "grad_norm": 8.25, + "learning_rate": 4.593433433010021e-06, + "loss": 1.2443583011627197, + "step": 1498 + }, + { + "epoch": 0.461716044632551, + "grad_norm": 8.875, + "learning_rate": 4.592227735417768e-06, + "loss": 0.90807044506073, + "step": 1500 + }, + { + "epoch": 0.4623316660253944, + "grad_norm": 14.875, + "learning_rate": 4.591020455593123e-06, + "loss": 1.6143991947174072, + "step": 1502 + }, + { + "epoch": 0.46294728741823776, + "grad_norm": 5.71875, + "learning_rate": 4.589811594735785e-06, + "loss": 1.3481473922729492, + "step": 1504 + }, + { + "epoch": 0.4635629088110812, + "grad_norm": 13.25, + "learning_rate": 4.588601154047031e-06, + "loss": 1.9968032836914062, + "step": 1506 + }, + { + "epoch": 0.4641785302039246, + "grad_norm": 8.3125, + "learning_rate": 4.5873891347296995e-06, + "loss": 1.3899567127227783, + "step": 1508 + }, + { + "epoch": 0.464794151596768, + "grad_norm": 4.8125, + "learning_rate": 4.586175537988204e-06, + "loss": 1.2550766468048096, + "step": 1510 + }, + { + "epoch": 0.4654097729896114, + "grad_norm": 6.8125, + "learning_rate": 4.584960365028519e-06, + "loss": 1.5146225690841675, + "step": 1512 + }, + { + "epoch": 0.46602539438245477, + "grad_norm": 7.34375, + "learning_rate": 4.58374361705819e-06, + "loss": 0.8701425194740295, + "step": 1514 + }, + { + "epoch": 0.4666410157752982, + "grad_norm": 12.1875, + "learning_rate": 4.58252529528633e-06, + "loss": 0.8581297993659973, + "step": 1516 + }, + { + "epoch": 0.4672566371681416, + "grad_norm": 8.3125, + "learning_rate": 4.58130540092361e-06, + "loss": 1.3546233177185059, + "step": 1518 + }, + { + "epoch": 0.467872258560985, + "grad_norm": 22.125, + "learning_rate": 4.5800839351822665e-06, + "loss": 1.392436146736145, + "step": 1520 + }, + { + "epoch": 0.4684878799538284, + "grad_norm": 7.125, + "learning_rate": 4.578860899276097e-06, + "loss": 1.556898832321167, + "step": 1522 + }, + { + "epoch": 0.4691035013466718, + "grad_norm": 9.3125, + "learning_rate": 4.577636294420462e-06, + "loss": 1.5069950819015503, + "step": 1524 + }, + { + "epoch": 0.4697191227395152, + "grad_norm": 5.28125, + "learning_rate": 4.5764101218322765e-06, + "loss": 1.4646862745285034, + "step": 1526 + }, + { + "epoch": 0.4703347441323586, + "grad_norm": 1.984375, + "learning_rate": 4.575182382730016e-06, + "loss": 1.3124823570251465, + "step": 1528 + }, + { + "epoch": 0.470950365525202, + "grad_norm": 4.4375, + "learning_rate": 4.573953078333712e-06, + "loss": 0.9778600931167603, + "step": 1530 + }, + { + "epoch": 0.4715659869180454, + "grad_norm": 5.40625, + "learning_rate": 4.572722209864955e-06, + "loss": 1.3515981435775757, + "step": 1532 + }, + { + "epoch": 0.4721816083108888, + "grad_norm": 10.0625, + "learning_rate": 4.571489778546883e-06, + "loss": 1.7041574716567993, + "step": 1534 + }, + { + "epoch": 0.4727972297037322, + "grad_norm": 15.8125, + "learning_rate": 4.57025578560419e-06, + "loss": 1.6992871761322021, + "step": 1536 + }, + { + "epoch": 0.4734128510965756, + "grad_norm": 6.65625, + "learning_rate": 4.569020232263127e-06, + "loss": 1.851335048675537, + "step": 1538 + }, + { + "epoch": 0.474028472489419, + "grad_norm": 44.5, + "learning_rate": 4.567783119751487e-06, + "loss": 1.4812127351760864, + "step": 1540 + }, + { + "epoch": 0.4746440938822624, + "grad_norm": 5.4375, + "learning_rate": 4.566544449298618e-06, + "loss": 1.3944592475891113, + "step": 1542 + }, + { + "epoch": 0.4752597152751058, + "grad_norm": 6.1875, + "learning_rate": 4.565304222135414e-06, + "loss": 1.2519789934158325, + "step": 1544 + }, + { + "epoch": 0.47587533666794923, + "grad_norm": 1.609375, + "learning_rate": 4.5640624394943164e-06, + "loss": 1.0768710374832153, + "step": 1546 + }, + { + "epoch": 0.4764909580607926, + "grad_norm": 11.3125, + "learning_rate": 4.562819102609314e-06, + "loss": 1.4202693700790405, + "step": 1548 + }, + { + "epoch": 0.477106579453636, + "grad_norm": 7.53125, + "learning_rate": 4.5615742127159365e-06, + "loss": 1.2447857856750488, + "step": 1550 + }, + { + "epoch": 0.47772220084647943, + "grad_norm": 3.359375, + "learning_rate": 4.560327771051262e-06, + "loss": 1.0244370698928833, + "step": 1552 + }, + { + "epoch": 0.4783378222393228, + "grad_norm": 5.65625, + "learning_rate": 4.5590797788539035e-06, + "loss": 1.51302170753479, + "step": 1554 + }, + { + "epoch": 0.47895344363216624, + "grad_norm": 10.9375, + "learning_rate": 4.55783023736402e-06, + "loss": 1.6049150228500366, + "step": 1556 + }, + { + "epoch": 0.4795690650250096, + "grad_norm": 8.625, + "learning_rate": 4.556579147823311e-06, + "loss": 1.128237247467041, + "step": 1558 + }, + { + "epoch": 0.480184686417853, + "grad_norm": 9.5625, + "learning_rate": 4.55532651147501e-06, + "loss": 1.1766349077224731, + "step": 1560 + }, + { + "epoch": 0.48080030781069644, + "grad_norm": 7.875, + "learning_rate": 4.554072329563891e-06, + "loss": 1.5755650997161865, + "step": 1562 + }, + { + "epoch": 0.4814159292035398, + "grad_norm": 11.75, + "learning_rate": 4.552816603336262e-06, + "loss": 1.2936866283416748, + "step": 1564 + }, + { + "epoch": 0.48203155059638325, + "grad_norm": 2.796875, + "learning_rate": 4.551559334039966e-06, + "loss": 1.2639069557189941, + "step": 1566 + }, + { + "epoch": 0.48264717198922663, + "grad_norm": 9.625, + "learning_rate": 4.550300522924383e-06, + "loss": 1.2861605882644653, + "step": 1568 + }, + { + "epoch": 0.48326279338207, + "grad_norm": 8.0, + "learning_rate": 4.549040171240416e-06, + "loss": 1.325995922088623, + "step": 1570 + }, + { + "epoch": 0.48387841477491345, + "grad_norm": 8.6875, + "learning_rate": 4.54777828024051e-06, + "loss": 1.4679720401763916, + "step": 1572 + }, + { + "epoch": 0.4844940361677568, + "grad_norm": 5.9375, + "learning_rate": 4.546514851178631e-06, + "loss": 1.2052301168441772, + "step": 1574 + }, + { + "epoch": 0.4851096575606002, + "grad_norm": 1.6328125, + "learning_rate": 4.545249885310278e-06, + "loss": 1.1089370250701904, + "step": 1576 + }, + { + "epoch": 0.48572527895344364, + "grad_norm": 5.71875, + "learning_rate": 4.543983383892477e-06, + "loss": 1.4348812103271484, + "step": 1578 + }, + { + "epoch": 0.486340900346287, + "grad_norm": 3.28125, + "learning_rate": 4.542715348183776e-06, + "loss": 1.1429481506347656, + "step": 1580 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 8.125, + "learning_rate": 4.541445779444252e-06, + "loss": 1.3008825778961182, + "step": 1582 + }, + { + "epoch": 0.48757214313197383, + "grad_norm": 5.40625, + "learning_rate": 4.540174678935506e-06, + "loss": 1.3641581535339355, + "step": 1584 + }, + { + "epoch": 0.4881877645248172, + "grad_norm": 4.40625, + "learning_rate": 4.538902047920657e-06, + "loss": 1.3861111402511597, + "step": 1586 + }, + { + "epoch": 0.48880338591766065, + "grad_norm": 8.5625, + "learning_rate": 4.537627887664346e-06, + "loss": 1.1937060356140137, + "step": 1588 + }, + { + "epoch": 0.48941900731050403, + "grad_norm": 7.3125, + "learning_rate": 4.536352199432737e-06, + "loss": 1.2857470512390137, + "step": 1590 + }, + { + "epoch": 0.49003462870334746, + "grad_norm": 11.875, + "learning_rate": 4.535074984493508e-06, + "loss": 1.3743950128555298, + "step": 1592 + }, + { + "epoch": 0.49065025009619084, + "grad_norm": 7.09375, + "learning_rate": 4.533796244115858e-06, + "loss": 1.0143524408340454, + "step": 1594 + }, + { + "epoch": 0.4912658714890342, + "grad_norm": 3.234375, + "learning_rate": 4.532515979570498e-06, + "loss": 0.5565921664237976, + "step": 1596 + }, + { + "epoch": 0.49188149288187766, + "grad_norm": 7.03125, + "learning_rate": 4.5312341921296565e-06, + "loss": 1.3684699535369873, + "step": 1598 + }, + { + "epoch": 0.49249711427472104, + "grad_norm": 16.25, + "learning_rate": 4.5299508830670745e-06, + "loss": 1.6425446271896362, + "step": 1600 + }, + { + "epoch": 0.4931127356675645, + "grad_norm": 23.0, + "learning_rate": 4.528666053658005e-06, + "loss": 1.9633190631866455, + "step": 1602 + }, + { + "epoch": 0.49372835706040785, + "grad_norm": 7.78125, + "learning_rate": 4.5273797051792114e-06, + "loss": 1.4236207008361816, + "step": 1604 + }, + { + "epoch": 0.49434397845325123, + "grad_norm": 9.0, + "learning_rate": 4.526091838908968e-06, + "loss": 1.4245315790176392, + "step": 1606 + }, + { + "epoch": 0.49495959984609467, + "grad_norm": 4.0625, + "learning_rate": 4.524802456127054e-06, + "loss": 1.3921014070510864, + "step": 1608 + }, + { + "epoch": 0.49557522123893805, + "grad_norm": 16.0, + "learning_rate": 4.523511558114762e-06, + "loss": 1.4808584451675415, + "step": 1610 + }, + { + "epoch": 0.4961908426317815, + "grad_norm": 4.875, + "learning_rate": 4.522219146154883e-06, + "loss": 1.4472503662109375, + "step": 1612 + }, + { + "epoch": 0.49680646402462486, + "grad_norm": 3.484375, + "learning_rate": 4.520925221531716e-06, + "loss": 1.0601378679275513, + "step": 1614 + }, + { + "epoch": 0.49742208541746824, + "grad_norm": 6.8125, + "learning_rate": 4.519629785531063e-06, + "loss": 1.198182463645935, + "step": 1616 + }, + { + "epoch": 0.4980377068103117, + "grad_norm": 3.390625, + "learning_rate": 4.518332839440231e-06, + "loss": 1.1204509735107422, + "step": 1618 + }, + { + "epoch": 0.49865332820315506, + "grad_norm": 16.625, + "learning_rate": 4.517034384548019e-06, + "loss": 1.464831829071045, + "step": 1620 + }, + { + "epoch": 0.49926894959599843, + "grad_norm": 4.78125, + "learning_rate": 4.515734422144734e-06, + "loss": 1.3328585624694824, + "step": 1622 + }, + { + "epoch": 0.49988457098884187, + "grad_norm": 4.71875, + "learning_rate": 4.514432953522178e-06, + "loss": 1.0713456869125366, + "step": 1624 + }, + { + "epoch": 0.5005001923816853, + "grad_norm": 6.0, + "learning_rate": 4.513129979973648e-06, + "loss": 1.3135950565338135, + "step": 1626 + }, + { + "epoch": 0.5011158137745286, + "grad_norm": 7.6875, + "learning_rate": 4.51182550279394e-06, + "loss": 1.5405608415603638, + "step": 1628 + }, + { + "epoch": 0.5017314351673721, + "grad_norm": 7.9375, + "learning_rate": 4.5105195232793405e-06, + "loss": 1.6978867053985596, + "step": 1630 + }, + { + "epoch": 0.5023470565602155, + "grad_norm": 2.96875, + "learning_rate": 4.509212042727632e-06, + "loss": 0.8523469567298889, + "step": 1632 + }, + { + "epoch": 0.5029626779530588, + "grad_norm": 7.125, + "learning_rate": 4.5079030624380845e-06, + "loss": 1.5460319519042969, + "step": 1634 + }, + { + "epoch": 0.5035782993459023, + "grad_norm": 12.4375, + "learning_rate": 4.5065925837114645e-06, + "loss": 1.2278990745544434, + "step": 1636 + }, + { + "epoch": 0.5041939207387457, + "grad_norm": 8.8125, + "learning_rate": 4.5052806078500225e-06, + "loss": 1.4185572862625122, + "step": 1638 + }, + { + "epoch": 0.504809542131589, + "grad_norm": 6.0625, + "learning_rate": 4.503967136157498e-06, + "loss": 1.1045608520507812, + "step": 1640 + }, + { + "epoch": 0.5054251635244325, + "grad_norm": 18.125, + "learning_rate": 4.502652169939117e-06, + "loss": 1.389453649520874, + "step": 1642 + }, + { + "epoch": 0.5060407849172759, + "grad_norm": 12.9375, + "learning_rate": 4.501335710501592e-06, + "loss": 1.7360384464263916, + "step": 1644 + }, + { + "epoch": 0.5066564063101193, + "grad_norm": 4.28125, + "learning_rate": 4.500017759153118e-06, + "loss": 0.5589945316314697, + "step": 1646 + }, + { + "epoch": 0.5072720277029626, + "grad_norm": 5.8125, + "learning_rate": 4.498698317203373e-06, + "loss": 1.0301040410995483, + "step": 1648 + }, + { + "epoch": 0.5078876490958061, + "grad_norm": 5.625, + "learning_rate": 4.497377385963514e-06, + "loss": 1.1001111268997192, + "step": 1650 + }, + { + "epoch": 0.5085032704886495, + "grad_norm": 3.6875, + "learning_rate": 4.496054966746183e-06, + "loss": 1.3102152347564697, + "step": 1652 + }, + { + "epoch": 0.5091188918814928, + "grad_norm": 4.84375, + "learning_rate": 4.494731060865496e-06, + "loss": 1.5156065225601196, + "step": 1654 + }, + { + "epoch": 0.5097345132743363, + "grad_norm": 3.859375, + "learning_rate": 4.493405669637048e-06, + "loss": 1.0802805423736572, + "step": 1656 + }, + { + "epoch": 0.5103501346671797, + "grad_norm": 8.5625, + "learning_rate": 4.49207879437791e-06, + "loss": 1.690042495727539, + "step": 1658 + }, + { + "epoch": 0.510965756060023, + "grad_norm": 5.34375, + "learning_rate": 4.490750436406628e-06, + "loss": 1.1154403686523438, + "step": 1660 + }, + { + "epoch": 0.5115813774528665, + "grad_norm": 13.8125, + "learning_rate": 4.489420597043221e-06, + "loss": 0.5433551669120789, + "step": 1662 + }, + { + "epoch": 0.5121969988457099, + "grad_norm": 3.96875, + "learning_rate": 4.48808927760918e-06, + "loss": 1.4064877033233643, + "step": 1664 + }, + { + "epoch": 0.5128126202385533, + "grad_norm": 8.5625, + "learning_rate": 4.486756479427467e-06, + "loss": 1.0362461805343628, + "step": 1666 + }, + { + "epoch": 0.5134282416313967, + "grad_norm": 6.375, + "learning_rate": 4.485422203822515e-06, + "loss": 1.279341220855713, + "step": 1668 + }, + { + "epoch": 0.5140438630242401, + "grad_norm": 3.6875, + "learning_rate": 4.484086452120221e-06, + "loss": 0.9909439086914062, + "step": 1670 + }, + { + "epoch": 0.5146594844170835, + "grad_norm": 19.875, + "learning_rate": 4.482749225647952e-06, + "loss": 1.3306505680084229, + "step": 1672 + }, + { + "epoch": 0.5152751058099269, + "grad_norm": 5.5625, + "learning_rate": 4.481410525734541e-06, + "loss": 1.437589406967163, + "step": 1674 + }, + { + "epoch": 0.5158907272027703, + "grad_norm": 10.5625, + "learning_rate": 4.480070353710283e-06, + "loss": 1.6570290327072144, + "step": 1676 + }, + { + "epoch": 0.5165063485956137, + "grad_norm": 5.875, + "learning_rate": 4.478728710906938e-06, + "loss": 1.4835489988327026, + "step": 1678 + }, + { + "epoch": 0.517121969988457, + "grad_norm": 8.125, + "learning_rate": 4.4773855986577255e-06, + "loss": 1.515626311302185, + "step": 1680 + }, + { + "epoch": 0.5177375913813005, + "grad_norm": 8.375, + "learning_rate": 4.476041018297327e-06, + "loss": 1.5490896701812744, + "step": 1682 + }, + { + "epoch": 0.5183532127741439, + "grad_norm": 26.75, + "learning_rate": 4.474694971161882e-06, + "loss": 1.4817088842391968, + "step": 1684 + }, + { + "epoch": 0.5189688341669872, + "grad_norm": 7.0, + "learning_rate": 4.473347458588987e-06, + "loss": 1.4445440769195557, + "step": 1686 + }, + { + "epoch": 0.5195844555598307, + "grad_norm": 10.625, + "learning_rate": 4.471998481917698e-06, + "loss": 1.2884917259216309, + "step": 1688 + }, + { + "epoch": 0.5202000769526741, + "grad_norm": 5.71875, + "learning_rate": 4.47064804248852e-06, + "loss": 1.2980061769485474, + "step": 1690 + }, + { + "epoch": 0.5208156983455176, + "grad_norm": 3.53125, + "learning_rate": 4.4692961416434156e-06, + "loss": 1.294404149055481, + "step": 1692 + }, + { + "epoch": 0.5214313197383609, + "grad_norm": 5.0625, + "learning_rate": 4.467942780725801e-06, + "loss": 1.3953012228012085, + "step": 1694 + }, + { + "epoch": 0.5220469411312043, + "grad_norm": 7.90625, + "learning_rate": 4.46658796108054e-06, + "loss": 1.4300891160964966, + "step": 1696 + }, + { + "epoch": 0.5226625625240477, + "grad_norm": 6.65625, + "learning_rate": 4.465231684053947e-06, + "loss": 1.6064602136611938, + "step": 1698 + }, + { + "epoch": 0.5232781839168911, + "grad_norm": 15.625, + "learning_rate": 4.463873950993786e-06, + "loss": 1.0882776975631714, + "step": 1700 + }, + { + "epoch": 0.5238938053097345, + "grad_norm": 6.1875, + "learning_rate": 4.462514763249265e-06, + "loss": 1.3740888833999634, + "step": 1702 + }, + { + "epoch": 0.5245094267025779, + "grad_norm": 2.421875, + "learning_rate": 4.46115412217104e-06, + "loss": 1.3197846412658691, + "step": 1704 + }, + { + "epoch": 0.5251250480954213, + "grad_norm": 4.71875, + "learning_rate": 4.459792029111211e-06, + "loss": 1.0752228498458862, + "step": 1706 + }, + { + "epoch": 0.5257406694882647, + "grad_norm": 2.171875, + "learning_rate": 4.45842848542332e-06, + "loss": 1.2539163827896118, + "step": 1708 + }, + { + "epoch": 0.5263562908811081, + "grad_norm": 17.375, + "learning_rate": 4.457063492462352e-06, + "loss": 1.708069920539856, + "step": 1710 + }, + { + "epoch": 0.5269719122739516, + "grad_norm": 12.8125, + "learning_rate": 4.4556970515847305e-06, + "loss": 1.1730300188064575, + "step": 1712 + }, + { + "epoch": 0.5275875336667949, + "grad_norm": 4.65625, + "learning_rate": 4.454329164148317e-06, + "loss": 1.1715214252471924, + "step": 1714 + }, + { + "epoch": 0.5282031550596383, + "grad_norm": 10.5625, + "learning_rate": 4.452959831512414e-06, + "loss": 1.459154486656189, + "step": 1716 + }, + { + "epoch": 0.5288187764524818, + "grad_norm": 2.875, + "learning_rate": 4.451589055037757e-06, + "loss": 1.113134503364563, + "step": 1718 + }, + { + "epoch": 0.5294343978453251, + "grad_norm": 6.1875, + "learning_rate": 4.4502168360865175e-06, + "loss": 0.9972245097160339, + "step": 1720 + }, + { + "epoch": 0.5300500192381685, + "grad_norm": 7.84375, + "learning_rate": 4.448843176022299e-06, + "loss": 1.3292319774627686, + "step": 1722 + }, + { + "epoch": 0.530665640631012, + "grad_norm": 7.78125, + "learning_rate": 4.44746807621014e-06, + "loss": 1.3687920570373535, + "step": 1724 + }, + { + "epoch": 0.5312812620238553, + "grad_norm": 5.84375, + "learning_rate": 4.44609153801651e-06, + "loss": 1.1176486015319824, + "step": 1726 + }, + { + "epoch": 0.5318968834166987, + "grad_norm": 6.53125, + "learning_rate": 4.4447135628093e-06, + "loss": 1.3572094440460205, + "step": 1728 + }, + { + "epoch": 0.5325125048095422, + "grad_norm": 7.40625, + "learning_rate": 4.44333415195784e-06, + "loss": 1.3096528053283691, + "step": 1730 + }, + { + "epoch": 0.5331281262023856, + "grad_norm": 9.0, + "learning_rate": 4.441953306832879e-06, + "loss": 1.611821174621582, + "step": 1732 + }, + { + "epoch": 0.5337437475952289, + "grad_norm": 5.71875, + "learning_rate": 4.440571028806594e-06, + "loss": 1.4841302633285522, + "step": 1734 + }, + { + "epoch": 0.5343593689880723, + "grad_norm": 10.8125, + "learning_rate": 4.439187319252586e-06, + "loss": 1.7697718143463135, + "step": 1736 + }, + { + "epoch": 0.5349749903809158, + "grad_norm": 13.8125, + "learning_rate": 4.437802179545879e-06, + "loss": 1.5109689235687256, + "step": 1738 + }, + { + "epoch": 0.5355906117737591, + "grad_norm": 3.5, + "learning_rate": 4.436415611062916e-06, + "loss": 1.1347957849502563, + "step": 1740 + }, + { + "epoch": 0.5362062331666025, + "grad_norm": 26.125, + "learning_rate": 4.435027615181563e-06, + "loss": 1.394513487815857, + "step": 1742 + }, + { + "epoch": 0.536821854559446, + "grad_norm": 2.921875, + "learning_rate": 4.4336381932811e-06, + "loss": 1.1014000177383423, + "step": 1744 + }, + { + "epoch": 0.5374374759522893, + "grad_norm": 26.375, + "learning_rate": 4.43224734674223e-06, + "loss": 1.4138752222061157, + "step": 1746 + }, + { + "epoch": 0.5380530973451327, + "grad_norm": 4.71875, + "learning_rate": 4.4308550769470645e-06, + "loss": 1.3574260473251343, + "step": 1748 + }, + { + "epoch": 0.5386687187379762, + "grad_norm": 9.5, + "learning_rate": 4.429461385279136e-06, + "loss": 1.3197760581970215, + "step": 1750 + }, + { + "epoch": 0.5392843401308195, + "grad_norm": 5.375, + "learning_rate": 4.428066273123387e-06, + "loss": 1.2813074588775635, + "step": 1752 + }, + { + "epoch": 0.5398999615236629, + "grad_norm": 3.234375, + "learning_rate": 4.4266697418661705e-06, + "loss": 1.2030224800109863, + "step": 1754 + }, + { + "epoch": 0.5405155829165064, + "grad_norm": 4.65625, + "learning_rate": 4.425271792895252e-06, + "loss": 1.0384302139282227, + "step": 1756 + }, + { + "epoch": 0.5411312043093498, + "grad_norm": 10.3125, + "learning_rate": 4.423872427599804e-06, + "loss": 1.326210618019104, + "step": 1758 + }, + { + "epoch": 0.5417468257021931, + "grad_norm": 6.96875, + "learning_rate": 4.422471647370406e-06, + "loss": 1.1778022050857544, + "step": 1760 + }, + { + "epoch": 0.5423624470950366, + "grad_norm": 6.78125, + "learning_rate": 4.421069453599049e-06, + "loss": 1.474500060081482, + "step": 1762 + }, + { + "epoch": 0.54297806848788, + "grad_norm": 8.3125, + "learning_rate": 4.41966584767912e-06, + "loss": 1.3914190530776978, + "step": 1764 + }, + { + "epoch": 0.5435936898807233, + "grad_norm": 11.6875, + "learning_rate": 4.418260831005415e-06, + "loss": 1.714091181755066, + "step": 1766 + }, + { + "epoch": 0.5442093112735668, + "grad_norm": 8.5, + "learning_rate": 4.4168544049741304e-06, + "loss": 1.2844468355178833, + "step": 1768 + }, + { + "epoch": 0.5448249326664102, + "grad_norm": 7.1875, + "learning_rate": 4.415446570982864e-06, + "loss": 1.6642802953720093, + "step": 1770 + }, + { + "epoch": 0.5454405540592535, + "grad_norm": 4.90625, + "learning_rate": 4.414037330430611e-06, + "loss": 1.248421549797058, + "step": 1772 + }, + { + "epoch": 0.546056175452097, + "grad_norm": 13.4375, + "learning_rate": 4.412626684717768e-06, + "loss": 1.2882202863693237, + "step": 1774 + }, + { + "epoch": 0.5466717968449404, + "grad_norm": 6.8125, + "learning_rate": 4.4112146352461216e-06, + "loss": 1.4664686918258667, + "step": 1776 + }, + { + "epoch": 0.5472874182377838, + "grad_norm": 7.78125, + "learning_rate": 4.409801183418858e-06, + "loss": 1.3767423629760742, + "step": 1778 + }, + { + "epoch": 0.5479030396306271, + "grad_norm": 4.71875, + "learning_rate": 4.408386330640559e-06, + "loss": 1.5045976638793945, + "step": 1780 + }, + { + "epoch": 0.5485186610234706, + "grad_norm": 15.25, + "learning_rate": 4.40697007831719e-06, + "loss": 1.7616403102874756, + "step": 1782 + }, + { + "epoch": 0.549134282416314, + "grad_norm": 9.1875, + "learning_rate": 4.4055524278561175e-06, + "loss": 1.5928730964660645, + "step": 1784 + }, + { + "epoch": 0.5497499038091573, + "grad_norm": 8.1875, + "learning_rate": 4.40413338066609e-06, + "loss": 1.1425762176513672, + "step": 1786 + }, + { + "epoch": 0.5503655252020008, + "grad_norm": 9.25, + "learning_rate": 4.402712938157249e-06, + "loss": 1.3747527599334717, + "step": 1788 + }, + { + "epoch": 0.5509811465948442, + "grad_norm": 9.4375, + "learning_rate": 4.401291101741116e-06, + "loss": 1.4497945308685303, + "step": 1790 + }, + { + "epoch": 0.5515967679876875, + "grad_norm": 7.15625, + "learning_rate": 4.399867872830607e-06, + "loss": 1.469414472579956, + "step": 1792 + }, + { + "epoch": 0.552212389380531, + "grad_norm": 9.0, + "learning_rate": 4.398443252840011e-06, + "loss": 1.0230164527893066, + "step": 1794 + }, + { + "epoch": 0.5528280107733744, + "grad_norm": 2.59375, + "learning_rate": 4.397017243185008e-06, + "loss": 1.3341656923294067, + "step": 1796 + }, + { + "epoch": 0.5534436321662177, + "grad_norm": 6.5625, + "learning_rate": 4.395589845282656e-06, + "loss": 1.4266735315322876, + "step": 1798 + }, + { + "epoch": 0.5540592535590612, + "grad_norm": 5.96875, + "learning_rate": 4.3941610605513905e-06, + "loss": 1.1735732555389404, + "step": 1800 + }, + { + "epoch": 0.5546748749519046, + "grad_norm": 6.65625, + "learning_rate": 4.392730890411029e-06, + "loss": 1.1153186559677124, + "step": 1802 + }, + { + "epoch": 0.555290496344748, + "grad_norm": 10.375, + "learning_rate": 4.391299336282761e-06, + "loss": 1.5208882093429565, + "step": 1804 + }, + { + "epoch": 0.5559061177375914, + "grad_norm": 7.875, + "learning_rate": 4.389866399589157e-06, + "loss": 0.9873666763305664, + "step": 1806 + }, + { + "epoch": 0.5565217391304348, + "grad_norm": 11.0, + "learning_rate": 4.388432081754155e-06, + "loss": 1.7669349908828735, + "step": 1808 + }, + { + "epoch": 0.5571373605232782, + "grad_norm": 3.71875, + "learning_rate": 4.386996384203072e-06, + "loss": 1.1138397455215454, + "step": 1810 + }, + { + "epoch": 0.5577529819161215, + "grad_norm": 9.4375, + "learning_rate": 4.3855593083625904e-06, + "loss": 1.7766603231430054, + "step": 1812 + }, + { + "epoch": 0.558368603308965, + "grad_norm": 15.0625, + "learning_rate": 4.384120855660765e-06, + "loss": 1.8187227249145508, + "step": 1814 + }, + { + "epoch": 0.5589842247018084, + "grad_norm": 8.875, + "learning_rate": 4.382681027527021e-06, + "loss": 1.538252830505371, + "step": 1816 + }, + { + "epoch": 0.5595998460946517, + "grad_norm": 8.75, + "learning_rate": 4.381239825392144e-06, + "loss": 1.093695044517517, + "step": 1818 + }, + { + "epoch": 0.5602154674874952, + "grad_norm": 3.9375, + "learning_rate": 4.379797250688292e-06, + "loss": 1.2730540037155151, + "step": 1820 + }, + { + "epoch": 0.5608310888803386, + "grad_norm": 8.3125, + "learning_rate": 4.378353304848982e-06, + "loss": 1.4347320795059204, + "step": 1822 + }, + { + "epoch": 0.561446710273182, + "grad_norm": 4.71875, + "learning_rate": 4.376907989309097e-06, + "loss": 1.4746718406677246, + "step": 1824 + }, + { + "epoch": 0.5620623316660254, + "grad_norm": 12.75, + "learning_rate": 4.375461305504879e-06, + "loss": 1.1002285480499268, + "step": 1826 + }, + { + "epoch": 0.5626779530588688, + "grad_norm": 17.625, + "learning_rate": 4.374013254873929e-06, + "loss": 1.2012693881988525, + "step": 1828 + }, + { + "epoch": 0.5632935744517122, + "grad_norm": 6.28125, + "learning_rate": 4.372563838855207e-06, + "loss": 1.7264009714126587, + "step": 1830 + }, + { + "epoch": 0.5639091958445556, + "grad_norm": 18.5, + "learning_rate": 4.3711130588890315e-06, + "loss": 1.62944757938385, + "step": 1832 + }, + { + "epoch": 0.564524817237399, + "grad_norm": 11.1875, + "learning_rate": 4.369660916417076e-06, + "loss": 1.4099031686782837, + "step": 1834 + }, + { + "epoch": 0.5651404386302424, + "grad_norm": 4.46875, + "learning_rate": 4.3682074128823645e-06, + "loss": 1.4762779474258423, + "step": 1836 + }, + { + "epoch": 0.5657560600230858, + "grad_norm": 34.5, + "learning_rate": 4.3667525497292776e-06, + "loss": 1.6303919553756714, + "step": 1838 + }, + { + "epoch": 0.5663716814159292, + "grad_norm": 6.65625, + "learning_rate": 4.365296328403546e-06, + "loss": 1.3313677310943604, + "step": 1840 + }, + { + "epoch": 0.5669873028087726, + "grad_norm": 5.75, + "learning_rate": 4.363838750352247e-06, + "loss": 1.5207209587097168, + "step": 1842 + }, + { + "epoch": 0.567602924201616, + "grad_norm": 9.1875, + "learning_rate": 4.362379817023811e-06, + "loss": 1.0308501720428467, + "step": 1844 + }, + { + "epoch": 0.5682185455944594, + "grad_norm": 3.15625, + "learning_rate": 4.3609195298680115e-06, + "loss": 1.0928391218185425, + "step": 1846 + }, + { + "epoch": 0.5688341669873028, + "grad_norm": 10.0, + "learning_rate": 4.3594578903359695e-06, + "loss": 1.54830801486969, + "step": 1848 + }, + { + "epoch": 0.5694497883801463, + "grad_norm": 5.625, + "learning_rate": 4.357994899880149e-06, + "loss": 1.4137128591537476, + "step": 1850 + }, + { + "epoch": 0.5700654097729896, + "grad_norm": 5.8125, + "learning_rate": 4.356530559954356e-06, + "loss": 1.3005894422531128, + "step": 1852 + }, + { + "epoch": 0.570681031165833, + "grad_norm": 10.875, + "learning_rate": 4.355064872013737e-06, + "loss": 1.2449814081192017, + "step": 1854 + }, + { + "epoch": 0.5712966525586765, + "grad_norm": 6.90625, + "learning_rate": 4.353597837514779e-06, + "loss": 0.9704883098602295, + "step": 1856 + }, + { + "epoch": 0.5719122739515198, + "grad_norm": 6.59375, + "learning_rate": 4.3521294579153096e-06, + "loss": 1.3039305210113525, + "step": 1858 + }, + { + "epoch": 0.5725278953443632, + "grad_norm": 8.125, + "learning_rate": 4.350659734674488e-06, + "loss": 1.180039882659912, + "step": 1860 + }, + { + "epoch": 0.5731435167372066, + "grad_norm": 5.1875, + "learning_rate": 4.3491886692528115e-06, + "loss": 1.2943824529647827, + "step": 1862 + }, + { + "epoch": 0.57375913813005, + "grad_norm": 6.125, + "learning_rate": 4.347716263112112e-06, + "loss": 1.1885600090026855, + "step": 1864 + }, + { + "epoch": 0.5743747595228934, + "grad_norm": 4.71875, + "learning_rate": 4.346242517715551e-06, + "loss": 1.2636488676071167, + "step": 1866 + }, + { + "epoch": 0.5749903809157368, + "grad_norm": 4.65625, + "learning_rate": 4.344767434527623e-06, + "loss": 1.3801287412643433, + "step": 1868 + }, + { + "epoch": 0.5756060023085803, + "grad_norm": 4.03125, + "learning_rate": 4.343291015014152e-06, + "loss": 1.2369672060012817, + "step": 1870 + }, + { + "epoch": 0.5762216237014236, + "grad_norm": 18.0, + "learning_rate": 4.341813260642288e-06, + "loss": 0.8730614185333252, + "step": 1872 + }, + { + "epoch": 0.576837245094267, + "grad_norm": 5.375, + "learning_rate": 4.340334172880508e-06, + "loss": 1.412117838859558, + "step": 1874 + }, + { + "epoch": 0.5774528664871105, + "grad_norm": 3.109375, + "learning_rate": 4.338853753198618e-06, + "loss": 1.1440041065216064, + "step": 1876 + }, + { + "epoch": 0.5780684878799538, + "grad_norm": 8.625, + "learning_rate": 4.33737200306774e-06, + "loss": 1.1688601970672607, + "step": 1878 + }, + { + "epoch": 0.5786841092727972, + "grad_norm": 17.0, + "learning_rate": 4.3358889239603245e-06, + "loss": 1.5140353441238403, + "step": 1880 + }, + { + "epoch": 0.5792997306656407, + "grad_norm": 5.0625, + "learning_rate": 4.33440451735014e-06, + "loss": 1.4316186904907227, + "step": 1882 + }, + { + "epoch": 0.579915352058484, + "grad_norm": 26.125, + "learning_rate": 4.332918784712276e-06, + "loss": 1.2686431407928467, + "step": 1884 + }, + { + "epoch": 0.5805309734513274, + "grad_norm": 4.96875, + "learning_rate": 4.331431727523136e-06, + "loss": 1.2316128015518188, + "step": 1886 + }, + { + "epoch": 0.5811465948441709, + "grad_norm": 18.375, + "learning_rate": 4.3299433472604445e-06, + "loss": 1.5370372533798218, + "step": 1888 + }, + { + "epoch": 0.5817622162370142, + "grad_norm": 6.59375, + "learning_rate": 4.3284536454032356e-06, + "loss": 1.5379806756973267, + "step": 1890 + }, + { + "epoch": 0.5823778376298576, + "grad_norm": 9.5625, + "learning_rate": 4.326962623431862e-06, + "loss": 1.0020148754119873, + "step": 1892 + }, + { + "epoch": 0.582993459022701, + "grad_norm": 5.375, + "learning_rate": 4.325470282827984e-06, + "loss": 1.5552363395690918, + "step": 1894 + }, + { + "epoch": 0.5836090804155445, + "grad_norm": 7.34375, + "learning_rate": 4.323976625074574e-06, + "loss": 1.3486781120300293, + "step": 1896 + }, + { + "epoch": 0.5842247018083878, + "grad_norm": 7.25, + "learning_rate": 4.3224816516559145e-06, + "loss": 1.2770202159881592, + "step": 1898 + }, + { + "epoch": 0.5848403232012312, + "grad_norm": 7.6875, + "learning_rate": 4.320985364057593e-06, + "loss": 1.44130277633667, + "step": 1900 + }, + { + "epoch": 0.5854559445940747, + "grad_norm": 12.375, + "learning_rate": 4.3194877637665035e-06, + "loss": 1.6760141849517822, + "step": 1902 + }, + { + "epoch": 0.586071565986918, + "grad_norm": 6.21875, + "learning_rate": 4.317988852270845e-06, + "loss": 1.637210726737976, + "step": 1904 + }, + { + "epoch": 0.5866871873797614, + "grad_norm": 13.25, + "learning_rate": 4.3164886310601224e-06, + "loss": 1.471998929977417, + "step": 1906 + }, + { + "epoch": 0.5873028087726049, + "grad_norm": 10.1875, + "learning_rate": 4.3149871016251365e-06, + "loss": 1.327508568763733, + "step": 1908 + }, + { + "epoch": 0.5879184301654482, + "grad_norm": 4.65625, + "learning_rate": 4.31348426545799e-06, + "loss": 1.1353042125701904, + "step": 1910 + }, + { + "epoch": 0.5885340515582916, + "grad_norm": 8.75, + "learning_rate": 4.311980124052087e-06, + "loss": 1.2621204853057861, + "step": 1912 + }, + { + "epoch": 0.5891496729511351, + "grad_norm": 5.09375, + "learning_rate": 4.310474678902126e-06, + "loss": 1.4534045457839966, + "step": 1914 + }, + { + "epoch": 0.5897652943439785, + "grad_norm": 1.546875, + "learning_rate": 4.3089679315041e-06, + "loss": 1.2747136354446411, + "step": 1916 + }, + { + "epoch": 0.5903809157368218, + "grad_norm": 5.90625, + "learning_rate": 4.307459883355299e-06, + "loss": 1.384915828704834, + "step": 1918 + }, + { + "epoch": 0.5909965371296653, + "grad_norm": 15.6875, + "learning_rate": 4.305950535954305e-06, + "loss": 0.9512431621551514, + "step": 1920 + }, + { + "epoch": 0.5916121585225087, + "grad_norm": 9.9375, + "learning_rate": 4.30443989080099e-06, + "loss": 1.3275504112243652, + "step": 1922 + }, + { + "epoch": 0.592227779915352, + "grad_norm": 8.5, + "learning_rate": 4.3029279493965135e-06, + "loss": 1.2675621509552002, + "step": 1924 + }, + { + "epoch": 0.5928434013081955, + "grad_norm": 5.03125, + "learning_rate": 4.301414713243328e-06, + "loss": 1.1729919910430908, + "step": 1926 + }, + { + "epoch": 0.5934590227010389, + "grad_norm": 9.5, + "learning_rate": 4.299900183845171e-06, + "loss": 1.3558121919631958, + "step": 1928 + }, + { + "epoch": 0.5940746440938822, + "grad_norm": 6.1875, + "learning_rate": 4.2983843627070625e-06, + "loss": 1.3227899074554443, + "step": 1930 + }, + { + "epoch": 0.5946902654867257, + "grad_norm": 12.5, + "learning_rate": 4.2968672513353075e-06, + "loss": 1.5395879745483398, + "step": 1932 + }, + { + "epoch": 0.5953058868795691, + "grad_norm": 13.75, + "learning_rate": 4.295348851237494e-06, + "loss": 1.1591770648956299, + "step": 1934 + }, + { + "epoch": 0.5959215082724124, + "grad_norm": 3.9375, + "learning_rate": 4.293829163922491e-06, + "loss": 1.4132144451141357, + "step": 1936 + }, + { + "epoch": 0.5965371296652558, + "grad_norm": 6.21875, + "learning_rate": 4.2923081909004475e-06, + "loss": 1.6093425750732422, + "step": 1938 + }, + { + "epoch": 0.5971527510580993, + "grad_norm": 7.15625, + "learning_rate": 4.290785933682785e-06, + "loss": 1.4642449617385864, + "step": 1940 + }, + { + "epoch": 0.5977683724509427, + "grad_norm": 6.0, + "learning_rate": 4.289262393782206e-06, + "loss": 1.2696763277053833, + "step": 1942 + }, + { + "epoch": 0.598383993843786, + "grad_norm": 36.0, + "learning_rate": 4.287737572712687e-06, + "loss": 1.4310896396636963, + "step": 1944 + }, + { + "epoch": 0.5989996152366295, + "grad_norm": 2.703125, + "learning_rate": 4.2862114719894754e-06, + "loss": 1.1763420104980469, + "step": 1946 + }, + { + "epoch": 0.5996152366294729, + "grad_norm": 9.375, + "learning_rate": 4.284684093129093e-06, + "loss": 0.9990206956863403, + "step": 1948 + }, + { + "epoch": 0.6002308580223162, + "grad_norm": 17.0, + "learning_rate": 4.283155437649331e-06, + "loss": 1.5044660568237305, + "step": 1950 + }, + { + "epoch": 0.6008464794151597, + "grad_norm": 9.6875, + "learning_rate": 4.281625507069247e-06, + "loss": 1.063808798789978, + "step": 1952 + }, + { + "epoch": 0.6014621008080031, + "grad_norm": 5.65625, + "learning_rate": 4.280094302909168e-06, + "loss": 1.3041024208068848, + "step": 1954 + }, + { + "epoch": 0.6020777222008464, + "grad_norm": 6.90625, + "learning_rate": 4.2785618266906844e-06, + "loss": 1.295802116394043, + "step": 1956 + }, + { + "epoch": 0.6026933435936899, + "grad_norm": 3.125, + "learning_rate": 4.277028079936656e-06, + "loss": 1.0778144598007202, + "step": 1958 + }, + { + "epoch": 0.6033089649865333, + "grad_norm": 56.0, + "learning_rate": 4.2754930641711974e-06, + "loss": 1.7069801092147827, + "step": 1960 + }, + { + "epoch": 0.6039245863793767, + "grad_norm": 8.5, + "learning_rate": 4.27395678091969e-06, + "loss": 1.3479996919631958, + "step": 1962 + }, + { + "epoch": 0.6045402077722201, + "grad_norm": 10.75, + "learning_rate": 4.272419231708773e-06, + "loss": 1.3411500453948975, + "step": 1964 + }, + { + "epoch": 0.6051558291650635, + "grad_norm": 13.0, + "learning_rate": 4.270880418066342e-06, + "loss": 1.144321322441101, + "step": 1966 + }, + { + "epoch": 0.6057714505579069, + "grad_norm": 8.875, + "learning_rate": 4.2693403415215525e-06, + "loss": 0.7488104104995728, + "step": 1968 + }, + { + "epoch": 0.6063870719507503, + "grad_norm": 12.375, + "learning_rate": 4.267799003604812e-06, + "loss": 1.4354453086853027, + "step": 1970 + }, + { + "epoch": 0.6070026933435937, + "grad_norm": 4.8125, + "learning_rate": 4.266256405847784e-06, + "loss": 1.4058129787445068, + "step": 1972 + }, + { + "epoch": 0.6076183147364371, + "grad_norm": 2.640625, + "learning_rate": 4.264712549783381e-06, + "loss": 0.9840666055679321, + "step": 1974 + }, + { + "epoch": 0.6082339361292805, + "grad_norm": 7.625, + "learning_rate": 4.263167436945767e-06, + "loss": 1.3485808372497559, + "step": 1976 + }, + { + "epoch": 0.6088495575221239, + "grad_norm": 14.875, + "learning_rate": 4.261621068870355e-06, + "loss": 0.8841054439544678, + "step": 1978 + }, + { + "epoch": 0.6094651789149673, + "grad_norm": 39.5, + "learning_rate": 4.260073447093808e-06, + "loss": 1.401408076286316, + "step": 1980 + }, + { + "epoch": 0.6100808003078106, + "grad_norm": 6.46875, + "learning_rate": 4.258524573154031e-06, + "loss": 1.2707502841949463, + "step": 1982 + }, + { + "epoch": 0.6106964217006541, + "grad_norm": 10.5, + "learning_rate": 4.256974448590174e-06, + "loss": 1.7859660387039185, + "step": 1984 + }, + { + "epoch": 0.6113120430934975, + "grad_norm": 8.6875, + "learning_rate": 4.255423074942632e-06, + "loss": 1.7836737632751465, + "step": 1986 + }, + { + "epoch": 0.611927664486341, + "grad_norm": 7.75, + "learning_rate": 4.25387045375304e-06, + "loss": 1.2297873497009277, + "step": 1988 + }, + { + "epoch": 0.6125432858791843, + "grad_norm": 6.5625, + "learning_rate": 4.252316586564269e-06, + "loss": 1.423917531967163, + "step": 1990 + }, + { + "epoch": 0.6131589072720277, + "grad_norm": 3.546875, + "learning_rate": 4.250761474920437e-06, + "loss": 1.277387261390686, + "step": 1992 + }, + { + "epoch": 0.6137745286648711, + "grad_norm": 6.53125, + "learning_rate": 4.249205120366888e-06, + "loss": 1.3519625663757324, + "step": 1994 + }, + { + "epoch": 0.6143901500577145, + "grad_norm": 7.625, + "learning_rate": 4.2476475244502105e-06, + "loss": 1.0225456953048706, + "step": 1996 + }, + { + "epoch": 0.6150057714505579, + "grad_norm": 5.0, + "learning_rate": 4.246088688718221e-06, + "loss": 1.5308113098144531, + "step": 1998 + }, + { + "epoch": 0.6156213928434013, + "grad_norm": 11.0, + "learning_rate": 4.244528614719968e-06, + "loss": 1.7866837978363037, + "step": 2000 + }, + { + "epoch": 0.6162370142362447, + "grad_norm": 13.875, + "learning_rate": 4.242967304005734e-06, + "loss": 1.0780322551727295, + "step": 2002 + }, + { + "epoch": 0.6168526356290881, + "grad_norm": 7.78125, + "learning_rate": 4.241404758127029e-06, + "loss": 1.3198527097702026, + "step": 2004 + }, + { + "epoch": 0.6174682570219315, + "grad_norm": 8.4375, + "learning_rate": 4.239840978636588e-06, + "loss": 1.4212534427642822, + "step": 2006 + }, + { + "epoch": 0.618083878414775, + "grad_norm": 18.5, + "learning_rate": 4.238275967088375e-06, + "loss": 1.2522720098495483, + "step": 2008 + }, + { + "epoch": 0.6186994998076183, + "grad_norm": 4.4375, + "learning_rate": 4.2367097250375744e-06, + "loss": 1.012687087059021, + "step": 2010 + }, + { + "epoch": 0.6193151212004617, + "grad_norm": 4.53125, + "learning_rate": 4.2351422540406005e-06, + "loss": 1.2856965065002441, + "step": 2012 + }, + { + "epoch": 0.6199307425933052, + "grad_norm": 5.625, + "learning_rate": 4.23357355565508e-06, + "loss": 1.5035991668701172, + "step": 2014 + }, + { + "epoch": 0.6205463639861485, + "grad_norm": 4.4375, + "learning_rate": 4.232003631439868e-06, + "loss": 1.4920026063919067, + "step": 2016 + }, + { + "epoch": 0.6211619853789919, + "grad_norm": 4.90625, + "learning_rate": 4.2304324829550306e-06, + "loss": 1.1564490795135498, + "step": 2018 + }, + { + "epoch": 0.6217776067718354, + "grad_norm": 5.5, + "learning_rate": 4.228860111761852e-06, + "loss": 1.3548294305801392, + "step": 2020 + }, + { + "epoch": 0.6223932281646787, + "grad_norm": 19.75, + "learning_rate": 4.2272865194228355e-06, + "loss": 1.7174577713012695, + "step": 2022 + }, + { + "epoch": 0.6230088495575221, + "grad_norm": 7.375, + "learning_rate": 4.225711707501694e-06, + "loss": 0.9778303503990173, + "step": 2024 + }, + { + "epoch": 0.6236244709503656, + "grad_norm": 4.59375, + "learning_rate": 4.224135677563354e-06, + "loss": 1.320866584777832, + "step": 2026 + }, + { + "epoch": 0.6242400923432089, + "grad_norm": 5.34375, + "learning_rate": 4.22255843117395e-06, + "loss": 1.6332377195358276, + "step": 2028 + }, + { + "epoch": 0.6248557137360523, + "grad_norm": 10.1875, + "learning_rate": 4.220979969900828e-06, + "loss": 1.790048360824585, + "step": 2030 + }, + { + "epoch": 0.6254713351288957, + "grad_norm": 10.6875, + "learning_rate": 4.219400295312542e-06, + "loss": 1.036562204360962, + "step": 2032 + }, + { + "epoch": 0.6260869565217392, + "grad_norm": 52.75, + "learning_rate": 4.217819408978848e-06, + "loss": 1.2087514400482178, + "step": 2034 + }, + { + "epoch": 0.6267025779145825, + "grad_norm": 11.9375, + "learning_rate": 4.216237312470709e-06, + "loss": 1.2638986110687256, + "step": 2036 + }, + { + "epoch": 0.6273181993074259, + "grad_norm": 7.9375, + "learning_rate": 4.214654007360289e-06, + "loss": 1.248311161994934, + "step": 2038 + }, + { + "epoch": 0.6279338207002694, + "grad_norm": 12.4375, + "learning_rate": 4.213069495220955e-06, + "loss": 1.0299032926559448, + "step": 2040 + }, + { + "epoch": 0.6285494420931127, + "grad_norm": 5.0625, + "learning_rate": 4.211483777627272e-06, + "loss": 0.9252605438232422, + "step": 2042 + }, + { + "epoch": 0.6291650634859561, + "grad_norm": 3.609375, + "learning_rate": 4.2098968561550025e-06, + "loss": 1.118775486946106, + "step": 2044 + }, + { + "epoch": 0.6297806848787996, + "grad_norm": 41.25, + "learning_rate": 4.208308732381106e-06, + "loss": 1.6422300338745117, + "step": 2046 + }, + { + "epoch": 0.6303963062716429, + "grad_norm": 7.28125, + "learning_rate": 4.206719407883737e-06, + "loss": 1.361843228340149, + "step": 2048 + }, + { + "epoch": 0.6310119276644863, + "grad_norm": 5.875, + "learning_rate": 4.205128884242243e-06, + "loss": 1.3892865180969238, + "step": 2050 + }, + { + "epoch": 0.6316275490573298, + "grad_norm": 27.125, + "learning_rate": 4.203537163037163e-06, + "loss": 1.3998996019363403, + "step": 2052 + }, + { + "epoch": 0.6322431704501732, + "grad_norm": 2.71875, + "learning_rate": 4.201944245850224e-06, + "loss": 1.0297900438308716, + "step": 2054 + }, + { + "epoch": 0.6328587918430165, + "grad_norm": 6.78125, + "learning_rate": 4.200350134264347e-06, + "loss": 1.2010695934295654, + "step": 2056 + }, + { + "epoch": 0.63347441323586, + "grad_norm": 9.5, + "learning_rate": 4.198754829863635e-06, + "loss": 1.0788944959640503, + "step": 2058 + }, + { + "epoch": 0.6340900346287034, + "grad_norm": 10.875, + "learning_rate": 4.197158334233376e-06, + "loss": 1.6973694562911987, + "step": 2060 + }, + { + "epoch": 0.6347056560215467, + "grad_norm": 8.5, + "learning_rate": 4.195560648960046e-06, + "loss": 1.5456275939941406, + "step": 2062 + }, + { + "epoch": 0.6353212774143902, + "grad_norm": 8.0625, + "learning_rate": 4.1939617756313e-06, + "loss": 1.54824960231781, + "step": 2064 + }, + { + "epoch": 0.6359368988072336, + "grad_norm": 3.453125, + "learning_rate": 4.192361715835973e-06, + "loss": 1.167006254196167, + "step": 2066 + }, + { + "epoch": 0.6365525202000769, + "grad_norm": 14.0625, + "learning_rate": 4.190760471164081e-06, + "loss": 1.243059754371643, + "step": 2068 + }, + { + "epoch": 0.6371681415929203, + "grad_norm": 4.15625, + "learning_rate": 4.189158043206818e-06, + "loss": 1.1235977411270142, + "step": 2070 + }, + { + "epoch": 0.6377837629857638, + "grad_norm": 5.53125, + "learning_rate": 4.187554433556552e-06, + "loss": 1.3937187194824219, + "step": 2072 + }, + { + "epoch": 0.6383993843786071, + "grad_norm": 9.375, + "learning_rate": 4.185949643806824e-06, + "loss": 1.7005552053451538, + "step": 2074 + }, + { + "epoch": 0.6390150057714505, + "grad_norm": 5.71875, + "learning_rate": 4.184343675552351e-06, + "loss": 1.7017693519592285, + "step": 2076 + }, + { + "epoch": 0.639630627164294, + "grad_norm": 23.125, + "learning_rate": 4.18273653038902e-06, + "loss": 1.5986844301223755, + "step": 2078 + }, + { + "epoch": 0.6402462485571374, + "grad_norm": 5.84375, + "learning_rate": 4.1811282099138865e-06, + "loss": 1.2742788791656494, + "step": 2080 + }, + { + "epoch": 0.6408618699499807, + "grad_norm": 8.375, + "learning_rate": 4.179518715725175e-06, + "loss": 1.2799839973449707, + "step": 2082 + }, + { + "epoch": 0.6414774913428242, + "grad_norm": 5.46875, + "learning_rate": 4.177908049422276e-06, + "loss": 1.5190964937210083, + "step": 2084 + }, + { + "epoch": 0.6420931127356676, + "grad_norm": 8.6875, + "learning_rate": 4.176296212605744e-06, + "loss": 1.379951000213623, + "step": 2086 + }, + { + "epoch": 0.6427087341285109, + "grad_norm": 8.25, + "learning_rate": 4.174683206877298e-06, + "loss": 1.459817886352539, + "step": 2088 + }, + { + "epoch": 0.6433243555213544, + "grad_norm": 4.8125, + "learning_rate": 4.173069033839818e-06, + "loss": 1.223576307296753, + "step": 2090 + }, + { + "epoch": 0.6439399769141978, + "grad_norm": 5.84375, + "learning_rate": 4.171453695097344e-06, + "loss": 1.3411672115325928, + "step": 2092 + }, + { + "epoch": 0.6445555983070411, + "grad_norm": 14.25, + "learning_rate": 4.169837192255073e-06, + "loss": 1.7932548522949219, + "step": 2094 + }, + { + "epoch": 0.6451712196998846, + "grad_norm": 8.125, + "learning_rate": 4.168219526919361e-06, + "loss": 1.269776701927185, + "step": 2096 + }, + { + "epoch": 0.645786841092728, + "grad_norm": 4.90625, + "learning_rate": 4.16660070069772e-06, + "loss": 1.080666184425354, + "step": 2098 + }, + { + "epoch": 0.6464024624855714, + "grad_norm": 11.625, + "learning_rate": 4.164980715198812e-06, + "loss": 1.3051639795303345, + "step": 2100 + }, + { + "epoch": 0.6470180838784148, + "grad_norm": 14.4375, + "learning_rate": 4.1633595720324525e-06, + "loss": 1.7639557123184204, + "step": 2102 + }, + { + "epoch": 0.6476337052712582, + "grad_norm": 8.3125, + "learning_rate": 4.161737272809609e-06, + "loss": 1.2831711769104004, + "step": 2104 + }, + { + "epoch": 0.6482493266641016, + "grad_norm": 8.375, + "learning_rate": 4.160113819142398e-06, + "loss": 1.4829983711242676, + "step": 2106 + }, + { + "epoch": 0.648864948056945, + "grad_norm": 8.5625, + "learning_rate": 4.158489212644078e-06, + "loss": 1.9377659559249878, + "step": 2108 + }, + { + "epoch": 0.6494805694497884, + "grad_norm": 11.0, + "learning_rate": 4.1568634549290585e-06, + "loss": 1.1541517972946167, + "step": 2110 + }, + { + "epoch": 0.6500961908426318, + "grad_norm": 22.625, + "learning_rate": 4.155236547612893e-06, + "loss": 1.2363619804382324, + "step": 2112 + }, + { + "epoch": 0.6507118122354751, + "grad_norm": 21.625, + "learning_rate": 4.153608492312273e-06, + "loss": 1.6787185668945312, + "step": 2114 + }, + { + "epoch": 0.6513274336283186, + "grad_norm": 10.5, + "learning_rate": 4.151979290645037e-06, + "loss": 1.7561014890670776, + "step": 2116 + }, + { + "epoch": 0.651943055021162, + "grad_norm": 2.96875, + "learning_rate": 4.150348944230157e-06, + "loss": 1.3189289569854736, + "step": 2118 + }, + { + "epoch": 0.6525586764140053, + "grad_norm": 5.5, + "learning_rate": 4.148717454687744e-06, + "loss": 1.1471681594848633, + "step": 2120 + }, + { + "epoch": 0.6531742978068488, + "grad_norm": 4.90625, + "learning_rate": 4.147084823639048e-06, + "loss": 1.2217907905578613, + "step": 2122 + }, + { + "epoch": 0.6537899191996922, + "grad_norm": 6.0, + "learning_rate": 4.1454510527064495e-06, + "loss": 1.4032762050628662, + "step": 2124 + }, + { + "epoch": 0.6544055405925356, + "grad_norm": 6.78125, + "learning_rate": 4.143816143513463e-06, + "loss": 1.2402032613754272, + "step": 2126 + }, + { + "epoch": 0.655021161985379, + "grad_norm": 5.90625, + "learning_rate": 4.1421800976847355e-06, + "loss": 1.4848300218582153, + "step": 2128 + }, + { + "epoch": 0.6556367833782224, + "grad_norm": 4.625, + "learning_rate": 4.140542916846042e-06, + "loss": 1.312753677368164, + "step": 2130 + }, + { + "epoch": 0.6562524047710658, + "grad_norm": 11.875, + "learning_rate": 4.138904602624286e-06, + "loss": 1.695920705795288, + "step": 2132 + }, + { + "epoch": 0.6568680261639092, + "grad_norm": 3.9375, + "learning_rate": 4.137265156647496e-06, + "loss": 1.3408093452453613, + "step": 2134 + }, + { + "epoch": 0.6574836475567526, + "grad_norm": 3.78125, + "learning_rate": 4.135624580544829e-06, + "loss": 1.3905094861984253, + "step": 2136 + }, + { + "epoch": 0.658099268949596, + "grad_norm": 4.53125, + "learning_rate": 4.13398287594656e-06, + "loss": 1.229650616645813, + "step": 2138 + }, + { + "epoch": 0.6587148903424394, + "grad_norm": 11.625, + "learning_rate": 4.132340044484086e-06, + "loss": 1.1174551248550415, + "step": 2140 + }, + { + "epoch": 0.6593305117352828, + "grad_norm": 4.90625, + "learning_rate": 4.130696087789929e-06, + "loss": 1.40761399269104, + "step": 2142 + }, + { + "epoch": 0.6599461331281262, + "grad_norm": 7.125, + "learning_rate": 4.129051007497723e-06, + "loss": 1.3790582418441772, + "step": 2144 + }, + { + "epoch": 0.6605617545209697, + "grad_norm": 11.0625, + "learning_rate": 4.127404805242224e-06, + "loss": 1.678246259689331, + "step": 2146 + }, + { + "epoch": 0.661177375913813, + "grad_norm": 6.84375, + "learning_rate": 4.1257574826592975e-06, + "loss": 1.2218875885009766, + "step": 2148 + }, + { + "epoch": 0.6617929973066564, + "grad_norm": 13.375, + "learning_rate": 4.124109041385925e-06, + "loss": 1.3439253568649292, + "step": 2150 + }, + { + "epoch": 0.6624086186994999, + "grad_norm": 7.9375, + "learning_rate": 4.122459483060201e-06, + "loss": 1.3088505268096924, + "step": 2152 + }, + { + "epoch": 0.6630242400923432, + "grad_norm": 18.0, + "learning_rate": 4.1208088093213275e-06, + "loss": 1.2833305597305298, + "step": 2154 + }, + { + "epoch": 0.6636398614851866, + "grad_norm": 8.0625, + "learning_rate": 4.119157021809616e-06, + "loss": 1.722816824913025, + "step": 2156 + }, + { + "epoch": 0.66425548287803, + "grad_norm": 3.453125, + "learning_rate": 4.1175041221664855e-06, + "loss": 1.346421718597412, + "step": 2158 + }, + { + "epoch": 0.6648711042708734, + "grad_norm": 8.6875, + "learning_rate": 4.11585011203446e-06, + "loss": 1.4551198482513428, + "step": 2160 + }, + { + "epoch": 0.6654867256637168, + "grad_norm": 4.4375, + "learning_rate": 4.114194993057163e-06, + "loss": 1.3853923082351685, + "step": 2162 + }, + { + "epoch": 0.6661023470565602, + "grad_norm": 9.0, + "learning_rate": 4.112538766879328e-06, + "loss": 1.7009466886520386, + "step": 2164 + }, + { + "epoch": 0.6667179684494036, + "grad_norm": 6.96875, + "learning_rate": 4.110881435146782e-06, + "loss": 1.2280793190002441, + "step": 2166 + }, + { + "epoch": 0.667333589842247, + "grad_norm": 12.4375, + "learning_rate": 4.109222999506452e-06, + "loss": 1.4504725933074951, + "step": 2168 + }, + { + "epoch": 0.6679492112350904, + "grad_norm": 11.375, + "learning_rate": 4.107563461606362e-06, + "loss": 1.5431996583938599, + "step": 2170 + }, + { + "epoch": 0.6685648326279339, + "grad_norm": 5.28125, + "learning_rate": 4.105902823095634e-06, + "loss": 1.3769593238830566, + "step": 2172 + }, + { + "epoch": 0.6691804540207772, + "grad_norm": 19.0, + "learning_rate": 4.104241085624482e-06, + "loss": 1.5673210620880127, + "step": 2174 + }, + { + "epoch": 0.6697960754136206, + "grad_norm": 17.75, + "learning_rate": 4.102578250844209e-06, + "loss": 1.2605724334716797, + "step": 2176 + }, + { + "epoch": 0.6704116968064641, + "grad_norm": 6.4375, + "learning_rate": 4.100914320407213e-06, + "loss": 1.0668892860412598, + "step": 2178 + }, + { + "epoch": 0.6710273181993074, + "grad_norm": 12.6875, + "learning_rate": 4.099249295966981e-06, + "loss": 1.7679181098937988, + "step": 2180 + }, + { + "epoch": 0.6716429395921508, + "grad_norm": 9.8125, + "learning_rate": 4.0975831791780815e-06, + "loss": 1.7594194412231445, + "step": 2182 + }, + { + "epoch": 0.6722585609849943, + "grad_norm": 5.4375, + "learning_rate": 4.0959159716961735e-06, + "loss": 1.4177989959716797, + "step": 2184 + }, + { + "epoch": 0.6728741823778376, + "grad_norm": 7.25, + "learning_rate": 4.094247675177999e-06, + "loss": 1.2712677717208862, + "step": 2186 + }, + { + "epoch": 0.673489803770681, + "grad_norm": 4.90625, + "learning_rate": 4.0925782912813815e-06, + "loss": 1.2475125789642334, + "step": 2188 + }, + { + "epoch": 0.6741054251635245, + "grad_norm": 10.6875, + "learning_rate": 4.0909078216652245e-06, + "loss": 1.1614550352096558, + "step": 2190 + }, + { + "epoch": 0.6747210465563679, + "grad_norm": 6.84375, + "learning_rate": 4.089236267989512e-06, + "loss": 1.6680519580841064, + "step": 2192 + }, + { + "epoch": 0.6753366679492112, + "grad_norm": 4.25, + "learning_rate": 4.0875636319153025e-06, + "loss": 1.6825952529907227, + "step": 2194 + }, + { + "epoch": 0.6759522893420546, + "grad_norm": 10.125, + "learning_rate": 4.085889915104735e-06, + "loss": 0.9246540069580078, + "step": 2196 + }, + { + "epoch": 0.6765679107348981, + "grad_norm": 4.34375, + "learning_rate": 4.084215119221016e-06, + "loss": 0.7624496221542358, + "step": 2198 + }, + { + "epoch": 0.6771835321277414, + "grad_norm": 20.5, + "learning_rate": 4.0825392459284305e-06, + "loss": 1.0339101552963257, + "step": 2200 + }, + { + "epoch": 0.6777991535205848, + "grad_norm": 7.3125, + "learning_rate": 4.08086229689233e-06, + "loss": 1.7177927494049072, + "step": 2202 + }, + { + "epoch": 0.6784147749134283, + "grad_norm": 4.09375, + "learning_rate": 4.079184273779138e-06, + "loss": 0.9616536498069763, + "step": 2204 + }, + { + "epoch": 0.6790303963062716, + "grad_norm": 8.625, + "learning_rate": 4.07750517825634e-06, + "loss": 0.8436191082000732, + "step": 2206 + }, + { + "epoch": 0.679646017699115, + "grad_norm": 7.15625, + "learning_rate": 4.075825011992495e-06, + "loss": 1.8311975002288818, + "step": 2208 + }, + { + "epoch": 0.6802616390919585, + "grad_norm": 4.75, + "learning_rate": 4.074143776657219e-06, + "loss": 1.3282256126403809, + "step": 2210 + }, + { + "epoch": 0.6808772604848018, + "grad_norm": 4.375, + "learning_rate": 4.072461473921196e-06, + "loss": 0.8911463022232056, + "step": 2212 + }, + { + "epoch": 0.6814928818776452, + "grad_norm": 3.484375, + "learning_rate": 4.070778105456164e-06, + "loss": 1.3046122789382935, + "step": 2214 + }, + { + "epoch": 0.6821085032704887, + "grad_norm": 10.4375, + "learning_rate": 4.0690936729349275e-06, + "loss": 1.3092163801193237, + "step": 2216 + }, + { + "epoch": 0.6827241246633321, + "grad_norm": 4.78125, + "learning_rate": 4.067408178031342e-06, + "loss": 1.1632335186004639, + "step": 2218 + }, + { + "epoch": 0.6833397460561754, + "grad_norm": 2.28125, + "learning_rate": 4.0657216224203255e-06, + "loss": 1.214794635772705, + "step": 2220 + }, + { + "epoch": 0.6839553674490189, + "grad_norm": 5.3125, + "learning_rate": 4.064034007777843e-06, + "loss": 1.2219361066818237, + "step": 2222 + }, + { + "epoch": 0.6845709888418623, + "grad_norm": 4.375, + "learning_rate": 4.062345335780915e-06, + "loss": 1.2587751150131226, + "step": 2224 + }, + { + "epoch": 0.6851866102347056, + "grad_norm": 7.34375, + "learning_rate": 4.0606556081076145e-06, + "loss": 1.2284146547317505, + "step": 2226 + }, + { + "epoch": 0.685802231627549, + "grad_norm": 8.8125, + "learning_rate": 4.058964826437059e-06, + "loss": 1.3370146751403809, + "step": 2228 + }, + { + "epoch": 0.6864178530203925, + "grad_norm": 12.9375, + "learning_rate": 4.057272992449419e-06, + "loss": 1.803123116493225, + "step": 2230 + }, + { + "epoch": 0.6870334744132358, + "grad_norm": 20.5, + "learning_rate": 4.0555801078259085e-06, + "loss": 1.3987561464309692, + "step": 2232 + }, + { + "epoch": 0.6876490958060792, + "grad_norm": 19.375, + "learning_rate": 4.0538861742487815e-06, + "loss": 1.7791513204574585, + "step": 2234 + }, + { + "epoch": 0.6882647171989227, + "grad_norm": 8.4375, + "learning_rate": 4.052191193401339e-06, + "loss": 1.7824699878692627, + "step": 2236 + }, + { + "epoch": 0.6888803385917661, + "grad_norm": 7.03125, + "learning_rate": 4.050495166967922e-06, + "loss": 1.3275808095932007, + "step": 2238 + }, + { + "epoch": 0.6894959599846094, + "grad_norm": 5.4375, + "learning_rate": 4.04879809663391e-06, + "loss": 1.5000485181808472, + "step": 2240 + }, + { + "epoch": 0.6901115813774529, + "grad_norm": 5.65625, + "learning_rate": 4.04709998408572e-06, + "loss": 1.3786470890045166, + "step": 2242 + }, + { + "epoch": 0.6907272027702963, + "grad_norm": 4.25, + "learning_rate": 4.045400831010804e-06, + "loss": 1.2372782230377197, + "step": 2244 + }, + { + "epoch": 0.6913428241631396, + "grad_norm": 7.125, + "learning_rate": 4.043700639097649e-06, + "loss": 1.7251200675964355, + "step": 2246 + }, + { + "epoch": 0.6919584455559831, + "grad_norm": 8.75, + "learning_rate": 4.0419994100357725e-06, + "loss": 1.489999771118164, + "step": 2248 + }, + { + "epoch": 0.6925740669488265, + "grad_norm": 4.65625, + "learning_rate": 4.0402971455157255e-06, + "loss": 1.1909527778625488, + "step": 2250 + }, + { + "epoch": 0.6931896883416698, + "grad_norm": 12.1875, + "learning_rate": 4.038593847229088e-06, + "loss": 1.633176326751709, + "step": 2252 + }, + { + "epoch": 0.6938053097345133, + "grad_norm": 9.125, + "learning_rate": 4.036889516868461e-06, + "loss": 0.6954819560050964, + "step": 2254 + }, + { + "epoch": 0.6944209311273567, + "grad_norm": 2.40625, + "learning_rate": 4.035184156127478e-06, + "loss": 1.3334051370620728, + "step": 2256 + }, + { + "epoch": 0.6950365525202, + "grad_norm": 8.25, + "learning_rate": 4.0334777667007966e-06, + "loss": 1.3189637660980225, + "step": 2258 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 7.15625, + "learning_rate": 4.031770350284091e-06, + "loss": 1.2316111326217651, + "step": 2260 + }, + { + "epoch": 0.6962677953058869, + "grad_norm": 11.1875, + "learning_rate": 4.03006190857406e-06, + "loss": 1.5647934675216675, + "step": 2262 + }, + { + "epoch": 0.6968834166987303, + "grad_norm": 8.9375, + "learning_rate": 4.028352443268422e-06, + "loss": 1.5584977865219116, + "step": 2264 + }, + { + "epoch": 0.6974990380915737, + "grad_norm": 2.734375, + "learning_rate": 4.026641956065908e-06, + "loss": 1.2405673265457153, + "step": 2266 + }, + { + "epoch": 0.6981146594844171, + "grad_norm": 10.9375, + "learning_rate": 4.02493044866627e-06, + "loss": 1.471353530883789, + "step": 2268 + }, + { + "epoch": 0.6987302808772605, + "grad_norm": 7.125, + "learning_rate": 4.023217922770272e-06, + "loss": 1.5434600114822388, + "step": 2270 + }, + { + "epoch": 0.6993459022701038, + "grad_norm": 6.0625, + "learning_rate": 4.021504380079686e-06, + "loss": 1.395086646080017, + "step": 2272 + }, + { + "epoch": 0.6999615236629473, + "grad_norm": 8.375, + "learning_rate": 4.0197898222973e-06, + "loss": 1.3828070163726807, + "step": 2274 + }, + { + "epoch": 0.7005771450557907, + "grad_norm": 2.828125, + "learning_rate": 4.018074251126908e-06, + "loss": 1.3579838275909424, + "step": 2276 + }, + { + "epoch": 0.701192766448634, + "grad_norm": 13.1875, + "learning_rate": 4.016357668273313e-06, + "loss": 1.3393607139587402, + "step": 2278 + }, + { + "epoch": 0.7018083878414775, + "grad_norm": 5.34375, + "learning_rate": 4.014640075442318e-06, + "loss": 1.1327935457229614, + "step": 2280 + }, + { + "epoch": 0.7024240092343209, + "grad_norm": 5.75, + "learning_rate": 4.012921474340738e-06, + "loss": 1.2218248844146729, + "step": 2282 + }, + { + "epoch": 0.7030396306271643, + "grad_norm": 8.0, + "learning_rate": 4.011201866676383e-06, + "loss": 0.902611494064331, + "step": 2284 + }, + { + "epoch": 0.7036552520200077, + "grad_norm": 6.21875, + "learning_rate": 4.009481254158066e-06, + "loss": 1.4177908897399902, + "step": 2286 + }, + { + "epoch": 0.7042708734128511, + "grad_norm": 3.6875, + "learning_rate": 4.007759638495599e-06, + "loss": 1.1517624855041504, + "step": 2288 + }, + { + "epoch": 0.7048864948056945, + "grad_norm": 4.65625, + "learning_rate": 4.006037021399789e-06, + "loss": 1.2392499446868896, + "step": 2290 + }, + { + "epoch": 0.7055021161985379, + "grad_norm": 3.75, + "learning_rate": 4.004313404582439e-06, + "loss": 1.0375635623931885, + "step": 2292 + }, + { + "epoch": 0.7061177375913813, + "grad_norm": 7.21875, + "learning_rate": 4.002588789756349e-06, + "loss": 1.2339847087860107, + "step": 2294 + }, + { + "epoch": 0.7067333589842247, + "grad_norm": 6.59375, + "learning_rate": 4.000863178635301e-06, + "loss": 1.436223030090332, + "step": 2296 + }, + { + "epoch": 0.7073489803770681, + "grad_norm": 7.71875, + "learning_rate": 3.9991365729340805e-06, + "loss": 1.2261691093444824, + "step": 2298 + }, + { + "epoch": 0.7079646017699115, + "grad_norm": 5.1875, + "learning_rate": 3.997408974368449e-06, + "loss": 1.1778173446655273, + "step": 2300 + }, + { + "epoch": 0.7085802231627549, + "grad_norm": 6.25, + "learning_rate": 3.995680384655162e-06, + "loss": 1.211751103401184, + "step": 2302 + }, + { + "epoch": 0.7091958445555983, + "grad_norm": 18.0, + "learning_rate": 3.993950805511959e-06, + "loss": 1.642501950263977, + "step": 2304 + }, + { + "epoch": 0.7098114659484417, + "grad_norm": 15.8125, + "learning_rate": 3.992220238657559e-06, + "loss": 1.3975833654403687, + "step": 2306 + }, + { + "epoch": 0.7104270873412851, + "grad_norm": 2.203125, + "learning_rate": 3.990488685811667e-06, + "loss": 1.1813315153121948, + "step": 2308 + }, + { + "epoch": 0.7110427087341286, + "grad_norm": 4.8125, + "learning_rate": 3.9887561486949655e-06, + "loss": 1.1163524389266968, + "step": 2310 + }, + { + "epoch": 0.7116583301269719, + "grad_norm": 6.3125, + "learning_rate": 3.987022629029115e-06, + "loss": 1.407476782798767, + "step": 2312 + }, + { + "epoch": 0.7122739515198153, + "grad_norm": 4.8125, + "learning_rate": 3.985288128536753e-06, + "loss": 1.3963619470596313, + "step": 2314 + }, + { + "epoch": 0.7128895729126588, + "grad_norm": 4.8125, + "learning_rate": 3.983552648941492e-06, + "loss": 1.281212568283081, + "step": 2316 + }, + { + "epoch": 0.7135051943055021, + "grad_norm": 6.0625, + "learning_rate": 3.981816191967917e-06, + "loss": 1.3652002811431885, + "step": 2318 + }, + { + "epoch": 0.7141208156983455, + "grad_norm": 11.5625, + "learning_rate": 3.980078759341582e-06, + "loss": 1.2505300045013428, + "step": 2320 + }, + { + "epoch": 0.714736437091189, + "grad_norm": 6.90625, + "learning_rate": 3.978340352789017e-06, + "loss": 0.7349134683609009, + "step": 2322 + }, + { + "epoch": 0.7153520584840323, + "grad_norm": 6.71875, + "learning_rate": 3.976600974037711e-06, + "loss": 1.4582650661468506, + "step": 2324 + }, + { + "epoch": 0.7159676798768757, + "grad_norm": 14.5, + "learning_rate": 3.974860624816126e-06, + "loss": 1.3325326442718506, + "step": 2326 + }, + { + "epoch": 0.7165833012697191, + "grad_norm": 33.25, + "learning_rate": 3.973119306853687e-06, + "loss": 1.6099287271499634, + "step": 2328 + }, + { + "epoch": 0.7171989226625626, + "grad_norm": 7.5625, + "learning_rate": 3.971377021880779e-06, + "loss": 1.5254729986190796, + "step": 2330 + }, + { + "epoch": 0.7178145440554059, + "grad_norm": 12.0625, + "learning_rate": 3.96963377162875e-06, + "loss": 1.2081190347671509, + "step": 2332 + }, + { + "epoch": 0.7184301654482493, + "grad_norm": 6.8125, + "learning_rate": 3.967889557829907e-06, + "loss": 1.290348768234253, + "step": 2334 + }, + { + "epoch": 0.7190457868410928, + "grad_norm": 9.375, + "learning_rate": 3.966144382217514e-06, + "loss": 1.1293039321899414, + "step": 2336 + }, + { + "epoch": 0.7196614082339361, + "grad_norm": 6.78125, + "learning_rate": 3.964398246525791e-06, + "loss": 1.4362313747406006, + "step": 2338 + }, + { + "epoch": 0.7202770296267795, + "grad_norm": 6.8125, + "learning_rate": 3.962651152489914e-06, + "loss": 1.1427749395370483, + "step": 2340 + }, + { + "epoch": 0.720892651019623, + "grad_norm": 18.0, + "learning_rate": 3.960903101846009e-06, + "loss": 1.441338062286377, + "step": 2342 + }, + { + "epoch": 0.7215082724124663, + "grad_norm": 16.5, + "learning_rate": 3.959154096331149e-06, + "loss": 1.6988006830215454, + "step": 2344 + }, + { + "epoch": 0.7221238938053097, + "grad_norm": 6.28125, + "learning_rate": 3.957404137683366e-06, + "loss": 1.1266911029815674, + "step": 2346 + }, + { + "epoch": 0.7227395151981532, + "grad_norm": 5.40625, + "learning_rate": 3.955653227641629e-06, + "loss": 1.1880162954330444, + "step": 2348 + }, + { + "epoch": 0.7233551365909965, + "grad_norm": 6.6875, + "learning_rate": 3.953901367945858e-06, + "loss": 1.274377703666687, + "step": 2350 + }, + { + "epoch": 0.7239707579838399, + "grad_norm": 4.65625, + "learning_rate": 3.952148560336916e-06, + "loss": 1.4765647649765015, + "step": 2352 + }, + { + "epoch": 0.7245863793766834, + "grad_norm": 4.0625, + "learning_rate": 3.950394806556607e-06, + "loss": 1.2762149572372437, + "step": 2354 + }, + { + "epoch": 0.7252020007695268, + "grad_norm": 9.8125, + "learning_rate": 3.948640108347673e-06, + "loss": 1.3984206914901733, + "step": 2356 + }, + { + "epoch": 0.7258176221623701, + "grad_norm": 3.34375, + "learning_rate": 3.9468844674537995e-06, + "loss": 1.2570399045944214, + "step": 2358 + }, + { + "epoch": 0.7264332435552135, + "grad_norm": 9.1875, + "learning_rate": 3.945127885619605e-06, + "loss": 1.921069860458374, + "step": 2360 + }, + { + "epoch": 0.727048864948057, + "grad_norm": 6.21875, + "learning_rate": 3.943370364590646e-06, + "loss": 1.4534980058670044, + "step": 2362 + }, + { + "epoch": 0.7276644863409003, + "grad_norm": 13.5625, + "learning_rate": 3.941611906113409e-06, + "loss": 1.3451404571533203, + "step": 2364 + }, + { + "epoch": 0.7282801077337437, + "grad_norm": 4.65625, + "learning_rate": 3.939852511935313e-06, + "loss": 1.1856927871704102, + "step": 2366 + }, + { + "epoch": 0.7288957291265872, + "grad_norm": 9.5, + "learning_rate": 3.938092183804709e-06, + "loss": 1.5575098991394043, + "step": 2368 + }, + { + "epoch": 0.7295113505194305, + "grad_norm": 5.875, + "learning_rate": 3.936330923470874e-06, + "loss": 1.4615497589111328, + "step": 2370 + }, + { + "epoch": 0.7301269719122739, + "grad_norm": 11.5, + "learning_rate": 3.934568732684011e-06, + "loss": 1.7046332359313965, + "step": 2372 + }, + { + "epoch": 0.7307425933051174, + "grad_norm": 5.28125, + "learning_rate": 3.932805613195249e-06, + "loss": 1.2465029954910278, + "step": 2374 + }, + { + "epoch": 0.7313582146979608, + "grad_norm": 9.125, + "learning_rate": 3.9310415667566405e-06, + "loss": 1.5415079593658447, + "step": 2376 + }, + { + "epoch": 0.7319738360908041, + "grad_norm": 21.25, + "learning_rate": 3.929276595121157e-06, + "loss": 1.7812328338623047, + "step": 2378 + }, + { + "epoch": 0.7325894574836476, + "grad_norm": 8.1875, + "learning_rate": 3.927510700042689e-06, + "loss": 1.452361822128296, + "step": 2380 + }, + { + "epoch": 0.733205078876491, + "grad_norm": 12.5625, + "learning_rate": 3.9257438832760485e-06, + "loss": 1.593746304512024, + "step": 2382 + }, + { + "epoch": 0.7338207002693343, + "grad_norm": 6.0625, + "learning_rate": 3.923976146576961e-06, + "loss": 1.1884487867355347, + "step": 2384 + }, + { + "epoch": 0.7344363216621778, + "grad_norm": 6.53125, + "learning_rate": 3.922207491702064e-06, + "loss": 1.1821532249450684, + "step": 2386 + }, + { + "epoch": 0.7350519430550212, + "grad_norm": 6.15625, + "learning_rate": 3.9204379204089095e-06, + "loss": 1.617356538772583, + "step": 2388 + }, + { + "epoch": 0.7356675644478645, + "grad_norm": 4.90625, + "learning_rate": 3.918667434455962e-06, + "loss": 1.1489228010177612, + "step": 2390 + }, + { + "epoch": 0.736283185840708, + "grad_norm": 4.5, + "learning_rate": 3.916896035602592e-06, + "loss": 1.4210864305496216, + "step": 2392 + }, + { + "epoch": 0.7368988072335514, + "grad_norm": 4.9375, + "learning_rate": 3.915123725609079e-06, + "loss": 1.2937191724777222, + "step": 2394 + }, + { + "epoch": 0.7375144286263947, + "grad_norm": 4.40625, + "learning_rate": 3.913350506236606e-06, + "loss": 1.2571167945861816, + "step": 2396 + }, + { + "epoch": 0.7381300500192381, + "grad_norm": 6.625, + "learning_rate": 3.9115763792472615e-06, + "loss": 1.1669660806655884, + "step": 2398 + }, + { + "epoch": 0.7387456714120816, + "grad_norm": 3.875, + "learning_rate": 3.909801346404035e-06, + "loss": 1.3945196866989136, + "step": 2400 + }, + { + "epoch": 0.739361292804925, + "grad_norm": 5.71875, + "learning_rate": 3.908025409470817e-06, + "loss": 1.3932634592056274, + "step": 2402 + }, + { + "epoch": 0.7399769141977683, + "grad_norm": 5.28125, + "learning_rate": 3.906248570212395e-06, + "loss": 1.3155955076217651, + "step": 2404 + }, + { + "epoch": 0.7405925355906118, + "grad_norm": 7.03125, + "learning_rate": 3.904470830394455e-06, + "loss": 1.3457392454147339, + "step": 2406 + }, + { + "epoch": 0.7412081569834552, + "grad_norm": 9.75, + "learning_rate": 3.902692191783576e-06, + "loss": 1.2566583156585693, + "step": 2408 + }, + { + "epoch": 0.7418237783762985, + "grad_norm": 11.25, + "learning_rate": 3.9009126561472325e-06, + "loss": 1.4173073768615723, + "step": 2410 + }, + { + "epoch": 0.742439399769142, + "grad_norm": 3.953125, + "learning_rate": 3.899132225253786e-06, + "loss": 1.030098557472229, + "step": 2412 + }, + { + "epoch": 0.7430550211619854, + "grad_norm": 5.65625, + "learning_rate": 3.897350900872494e-06, + "loss": 1.1613000631332397, + "step": 2414 + }, + { + "epoch": 0.7436706425548287, + "grad_norm": 8.0625, + "learning_rate": 3.895568684773496e-06, + "loss": 1.7477487325668335, + "step": 2416 + }, + { + "epoch": 0.7442862639476722, + "grad_norm": 3.765625, + "learning_rate": 3.893785578727821e-06, + "loss": 1.2887831926345825, + "step": 2418 + }, + { + "epoch": 0.7449018853405156, + "grad_norm": 5.34375, + "learning_rate": 3.892001584507382e-06, + "loss": 1.3257797956466675, + "step": 2420 + }, + { + "epoch": 0.745517506733359, + "grad_norm": 2.890625, + "learning_rate": 3.890216703884974e-06, + "loss": 1.1927638053894043, + "step": 2422 + }, + { + "epoch": 0.7461331281262024, + "grad_norm": 4.4375, + "learning_rate": 3.888430938634272e-06, + "loss": 1.2174712419509888, + "step": 2424 + }, + { + "epoch": 0.7467487495190458, + "grad_norm": 36.75, + "learning_rate": 3.886644290529831e-06, + "loss": 0.8827616572380066, + "step": 2426 + }, + { + "epoch": 0.7473643709118892, + "grad_norm": 6.4375, + "learning_rate": 3.884856761347084e-06, + "loss": 1.3602244853973389, + "step": 2428 + }, + { + "epoch": 0.7479799923047326, + "grad_norm": 6.5, + "learning_rate": 3.883068352862338e-06, + "loss": 0.8709210157394409, + "step": 2430 + }, + { + "epoch": 0.748595613697576, + "grad_norm": 7.09375, + "learning_rate": 3.8812790668527755e-06, + "loss": 1.2259773015975952, + "step": 2432 + }, + { + "epoch": 0.7492112350904194, + "grad_norm": 6.15625, + "learning_rate": 3.879488905096449e-06, + "loss": 1.3588147163391113, + "step": 2434 + }, + { + "epoch": 0.7498268564832627, + "grad_norm": 7.34375, + "learning_rate": 3.877697869372284e-06, + "loss": 1.464431643486023, + "step": 2436 + }, + { + "epoch": 0.7504424778761062, + "grad_norm": 4.875, + "learning_rate": 3.8759059614600705e-06, + "loss": 1.271799087524414, + "step": 2438 + }, + { + "epoch": 0.7510580992689496, + "grad_norm": 2.625, + "learning_rate": 3.87411318314047e-06, + "loss": 1.2482190132141113, + "step": 2440 + }, + { + "epoch": 0.7516737206617929, + "grad_norm": 13.0, + "learning_rate": 3.872319536195006e-06, + "loss": 1.4509402513504028, + "step": 2442 + }, + { + "epoch": 0.7522893420546364, + "grad_norm": 9.9375, + "learning_rate": 3.870525022406064e-06, + "loss": 1.517437219619751, + "step": 2444 + }, + { + "epoch": 0.7529049634474798, + "grad_norm": 5.75, + "learning_rate": 3.8687296435568945e-06, + "loss": 1.2650505304336548, + "step": 2446 + }, + { + "epoch": 0.7535205848403232, + "grad_norm": 14.125, + "learning_rate": 3.866933401431604e-06, + "loss": 1.5364501476287842, + "step": 2448 + }, + { + "epoch": 0.7541362062331666, + "grad_norm": 1.8984375, + "learning_rate": 3.865136297815161e-06, + "loss": 1.1213115453720093, + "step": 2450 + }, + { + "epoch": 0.75475182762601, + "grad_norm": 4.3125, + "learning_rate": 3.863338334493384e-06, + "loss": 1.0022122859954834, + "step": 2452 + }, + { + "epoch": 0.7553674490188534, + "grad_norm": 15.8125, + "learning_rate": 3.8615395132529536e-06, + "loss": 1.7348800897598267, + "step": 2454 + }, + { + "epoch": 0.7559830704116968, + "grad_norm": 3.109375, + "learning_rate": 3.859739835881394e-06, + "loss": 1.3278826475143433, + "step": 2456 + }, + { + "epoch": 0.7565986918045402, + "grad_norm": 5.125, + "learning_rate": 3.85793930416709e-06, + "loss": 1.449325680732727, + "step": 2458 + }, + { + "epoch": 0.7572143131973836, + "grad_norm": 7.96875, + "learning_rate": 3.856137919899268e-06, + "loss": 1.2966344356536865, + "step": 2460 + }, + { + "epoch": 0.757829934590227, + "grad_norm": 8.75, + "learning_rate": 3.854335684868004e-06, + "loss": 1.4409971237182617, + "step": 2462 + }, + { + "epoch": 0.7584455559830704, + "grad_norm": 7.9375, + "learning_rate": 3.852532600864218e-06, + "loss": 1.4250284433364868, + "step": 2464 + }, + { + "epoch": 0.7590611773759138, + "grad_norm": 10.0, + "learning_rate": 3.850728669679678e-06, + "loss": 1.5082494020462036, + "step": 2466 + }, + { + "epoch": 0.7596767987687573, + "grad_norm": 4.75, + "learning_rate": 3.848923893106987e-06, + "loss": 1.257767915725708, + "step": 2468 + }, + { + "epoch": 0.7602924201616006, + "grad_norm": 4.46875, + "learning_rate": 3.847118272939595e-06, + "loss": 1.1704938411712646, + "step": 2470 + }, + { + "epoch": 0.760908041554444, + "grad_norm": 9.9375, + "learning_rate": 3.845311810971787e-06, + "loss": 1.3709163665771484, + "step": 2472 + }, + { + "epoch": 0.7615236629472875, + "grad_norm": 8.75, + "learning_rate": 3.843504508998684e-06, + "loss": 1.782489538192749, + "step": 2474 + }, + { + "epoch": 0.7621392843401308, + "grad_norm": 5.90625, + "learning_rate": 3.841696368816242e-06, + "loss": 1.6315412521362305, + "step": 2476 + }, + { + "epoch": 0.7627549057329742, + "grad_norm": 13.1875, + "learning_rate": 3.839887392221252e-06, + "loss": 1.3365678787231445, + "step": 2478 + }, + { + "epoch": 0.7633705271258177, + "grad_norm": 10.375, + "learning_rate": 3.838077581011332e-06, + "loss": 1.4168919324874878, + "step": 2480 + }, + { + "epoch": 0.763986148518661, + "grad_norm": 10.5, + "learning_rate": 3.836266936984934e-06, + "loss": 1.4411951303482056, + "step": 2482 + }, + { + "epoch": 0.7646017699115044, + "grad_norm": 11.4375, + "learning_rate": 3.834455461941335e-06, + "loss": 1.7773278951644897, + "step": 2484 + }, + { + "epoch": 0.7652173913043478, + "grad_norm": 4.96875, + "learning_rate": 3.83264315768064e-06, + "loss": 1.1491798162460327, + "step": 2486 + }, + { + "epoch": 0.7658330126971912, + "grad_norm": 4.625, + "learning_rate": 3.830830026003774e-06, + "loss": 1.0899100303649902, + "step": 2488 + }, + { + "epoch": 0.7664486340900346, + "grad_norm": 12.4375, + "learning_rate": 3.829016068712486e-06, + "loss": 1.6271624565124512, + "step": 2490 + }, + { + "epoch": 0.767064255482878, + "grad_norm": 5.8125, + "learning_rate": 3.827201287609349e-06, + "loss": 1.2503747940063477, + "step": 2492 + }, + { + "epoch": 0.7676798768757215, + "grad_norm": 7.40625, + "learning_rate": 3.82538568449775e-06, + "loss": 0.8986793160438538, + "step": 2494 + }, + { + "epoch": 0.7682954982685648, + "grad_norm": 15.1875, + "learning_rate": 3.823569261181894e-06, + "loss": 1.471342921257019, + "step": 2496 + }, + { + "epoch": 0.7689111196614082, + "grad_norm": 9.0, + "learning_rate": 3.821752019466803e-06, + "loss": 1.291356086730957, + "step": 2498 + }, + { + "epoch": 0.7695267410542517, + "grad_norm": 5.0625, + "learning_rate": 3.819933961158308e-06, + "loss": 1.318943738937378, + "step": 2500 + }, + { + "epoch": 0.770142362447095, + "grad_norm": 6.0, + "learning_rate": 3.818115088063058e-06, + "loss": 1.4840826988220215, + "step": 2502 + }, + { + "epoch": 0.7707579838399384, + "grad_norm": 4.6875, + "learning_rate": 3.816295401988507e-06, + "loss": 1.2456152439117432, + "step": 2504 + }, + { + "epoch": 0.7713736052327819, + "grad_norm": 6.28125, + "learning_rate": 3.814474904742916e-06, + "loss": 1.1569583415985107, + "step": 2506 + }, + { + "epoch": 0.7719892266256252, + "grad_norm": 14.5625, + "learning_rate": 3.812653598135355e-06, + "loss": 1.612607717514038, + "step": 2508 + }, + { + "epoch": 0.7726048480184686, + "grad_norm": 6.40625, + "learning_rate": 3.8108314839756976e-06, + "loss": 1.265571117401123, + "step": 2510 + }, + { + "epoch": 0.7732204694113121, + "grad_norm": 16.125, + "learning_rate": 3.809008564074619e-06, + "loss": 1.9577373266220093, + "step": 2512 + }, + { + "epoch": 0.7738360908041555, + "grad_norm": 5.125, + "learning_rate": 3.807184840243595e-06, + "loss": 1.4305379390716553, + "step": 2514 + }, + { + "epoch": 0.7744517121969988, + "grad_norm": 11.875, + "learning_rate": 3.8053603142949024e-06, + "loss": 1.0490630865097046, + "step": 2516 + }, + { + "epoch": 0.7750673335898423, + "grad_norm": 5.875, + "learning_rate": 3.803534988041613e-06, + "loss": 1.2258621454238892, + "step": 2518 + }, + { + "epoch": 0.7756829549826857, + "grad_norm": 7.59375, + "learning_rate": 3.8017088632975928e-06, + "loss": 1.3509862422943115, + "step": 2520 + }, + { + "epoch": 0.776298576375529, + "grad_norm": 6.90625, + "learning_rate": 3.7998819418775044e-06, + "loss": 1.5921310186386108, + "step": 2522 + }, + { + "epoch": 0.7769141977683724, + "grad_norm": 11.1875, + "learning_rate": 3.798054225596801e-06, + "loss": 1.2857153415679932, + "step": 2524 + }, + { + "epoch": 0.7775298191612159, + "grad_norm": 2.78125, + "learning_rate": 3.7962257162717242e-06, + "loss": 1.2583740949630737, + "step": 2526 + }, + { + "epoch": 0.7781454405540592, + "grad_norm": 4.28125, + "learning_rate": 3.7943964157193057e-06, + "loss": 1.31504225730896, + "step": 2528 + }, + { + "epoch": 0.7787610619469026, + "grad_norm": 11.375, + "learning_rate": 3.792566325757361e-06, + "loss": 1.482974886894226, + "step": 2530 + }, + { + "epoch": 0.7793766833397461, + "grad_norm": 7.46875, + "learning_rate": 3.790735448204492e-06, + "loss": 1.0821229219436646, + "step": 2532 + }, + { + "epoch": 0.7799923047325894, + "grad_norm": 5.75, + "learning_rate": 3.7889037848800846e-06, + "loss": 1.1284804344177246, + "step": 2534 + }, + { + "epoch": 0.7806079261254328, + "grad_norm": 1.8984375, + "learning_rate": 3.787071337604301e-06, + "loss": 1.0225565433502197, + "step": 2536 + }, + { + "epoch": 0.7812235475182763, + "grad_norm": 5.5, + "learning_rate": 3.785238108198087e-06, + "loss": 1.2461652755737305, + "step": 2538 + }, + { + "epoch": 0.7818391689111197, + "grad_norm": 8.125, + "learning_rate": 3.783404098483163e-06, + "loss": 0.9364984035491943, + "step": 2540 + }, + { + "epoch": 0.782454790303963, + "grad_norm": 3.984375, + "learning_rate": 3.781569310282026e-06, + "loss": 1.2495629787445068, + "step": 2542 + }, + { + "epoch": 0.7830704116968065, + "grad_norm": 12.25, + "learning_rate": 3.779733745417945e-06, + "loss": 0.924501895904541, + "step": 2544 + }, + { + "epoch": 0.7836860330896499, + "grad_norm": 9.4375, + "learning_rate": 3.7778974057149632e-06, + "loss": 1.6362602710723877, + "step": 2546 + }, + { + "epoch": 0.7843016544824932, + "grad_norm": 10.125, + "learning_rate": 3.776060292997893e-06, + "loss": 1.1575554609298706, + "step": 2548 + }, + { + "epoch": 0.7849172758753367, + "grad_norm": 7.78125, + "learning_rate": 3.774222409092313e-06, + "loss": 1.4146034717559814, + "step": 2550 + }, + { + "epoch": 0.7855328972681801, + "grad_norm": 10.1875, + "learning_rate": 3.772383755824569e-06, + "loss": 1.4862112998962402, + "step": 2552 + }, + { + "epoch": 0.7861485186610234, + "grad_norm": 8.625, + "learning_rate": 3.770544335021774e-06, + "loss": 1.2228214740753174, + "step": 2554 + }, + { + "epoch": 0.7867641400538669, + "grad_norm": 10.875, + "learning_rate": 3.7687041485118025e-06, + "loss": 1.2453134059906006, + "step": 2556 + }, + { + "epoch": 0.7873797614467103, + "grad_norm": 11.25, + "learning_rate": 3.7668631981232852e-06, + "loss": 1.4665039777755737, + "step": 2558 + }, + { + "epoch": 0.7879953828395537, + "grad_norm": 4.8125, + "learning_rate": 3.7650214856856192e-06, + "loss": 1.2003722190856934, + "step": 2560 + }, + { + "epoch": 0.788611004232397, + "grad_norm": 7.53125, + "learning_rate": 3.7631790130289548e-06, + "loss": 1.515622615814209, + "step": 2562 + }, + { + "epoch": 0.7892266256252405, + "grad_norm": 8.3125, + "learning_rate": 3.7613357819841968e-06, + "loss": 1.1541849374771118, + "step": 2564 + }, + { + "epoch": 0.7898422470180839, + "grad_norm": 6.8125, + "learning_rate": 3.7594917943830065e-06, + "loss": 1.406124472618103, + "step": 2566 + }, + { + "epoch": 0.7904578684109272, + "grad_norm": 4.5625, + "learning_rate": 3.757647052057796e-06, + "loss": 1.10648775100708, + "step": 2568 + }, + { + "epoch": 0.7910734898037707, + "grad_norm": 6.625, + "learning_rate": 3.755801556841726e-06, + "loss": 1.2991050481796265, + "step": 2570 + }, + { + "epoch": 0.7916891111966141, + "grad_norm": 5.78125, + "learning_rate": 3.7539553105687067e-06, + "loss": 1.4035756587982178, + "step": 2572 + }, + { + "epoch": 0.7923047325894574, + "grad_norm": 8.4375, + "learning_rate": 3.7521083150733952e-06, + "loss": 1.6464347839355469, + "step": 2574 + }, + { + "epoch": 0.7929203539823009, + "grad_norm": 7.6875, + "learning_rate": 3.7502605721911923e-06, + "loss": 1.4995614290237427, + "step": 2576 + }, + { + "epoch": 0.7935359753751443, + "grad_norm": 8.0, + "learning_rate": 3.7484120837582405e-06, + "loss": 1.1336731910705566, + "step": 2578 + }, + { + "epoch": 0.7941515967679876, + "grad_norm": 3.90625, + "learning_rate": 3.746562851611425e-06, + "loss": 0.9471673369407654, + "step": 2580 + }, + { + "epoch": 0.7947672181608311, + "grad_norm": 13.125, + "learning_rate": 3.744712877588369e-06, + "loss": 1.2849431037902832, + "step": 2582 + }, + { + "epoch": 0.7953828395536745, + "grad_norm": 5.03125, + "learning_rate": 3.7428621635274333e-06, + "loss": 1.2788987159729004, + "step": 2584 + }, + { + "epoch": 0.7959984609465179, + "grad_norm": 9.625, + "learning_rate": 3.741010711267714e-06, + "loss": 1.6069245338439941, + "step": 2586 + }, + { + "epoch": 0.7966140823393613, + "grad_norm": 7.25, + "learning_rate": 3.73915852264904e-06, + "loss": 1.4633699655532837, + "step": 2588 + }, + { + "epoch": 0.7972297037322047, + "grad_norm": 6.625, + "learning_rate": 3.7373055995119745e-06, + "loss": 1.132173776626587, + "step": 2590 + }, + { + "epoch": 0.7978453251250481, + "grad_norm": 6.59375, + "learning_rate": 3.7354519436978056e-06, + "loss": 1.3197592496871948, + "step": 2592 + }, + { + "epoch": 0.7984609465178915, + "grad_norm": 20.5, + "learning_rate": 3.733597557048555e-06, + "loss": 1.333539605140686, + "step": 2594 + }, + { + "epoch": 0.7990765679107349, + "grad_norm": 5.75, + "learning_rate": 3.731742441406969e-06, + "loss": 1.344053864479065, + "step": 2596 + }, + { + "epoch": 0.7996921893035783, + "grad_norm": 6.6875, + "learning_rate": 3.7298865986165154e-06, + "loss": 1.3256186246871948, + "step": 2598 + }, + { + "epoch": 0.8003078106964217, + "grad_norm": 10.75, + "learning_rate": 3.728030030521387e-06, + "loss": 1.4850950241088867, + "step": 2600 + }, + { + "epoch": 0.8009234320892651, + "grad_norm": 4.5, + "learning_rate": 3.7261727389664993e-06, + "loss": 1.096773386001587, + "step": 2602 + }, + { + "epoch": 0.8015390534821085, + "grad_norm": 1.96875, + "learning_rate": 3.7243147257974832e-06, + "loss": 1.0473754405975342, + "step": 2604 + }, + { + "epoch": 0.802154674874952, + "grad_norm": 7.875, + "learning_rate": 3.7224559928606862e-06, + "loss": 1.1904107332229614, + "step": 2606 + }, + { + "epoch": 0.8027702962677953, + "grad_norm": 9.25, + "learning_rate": 3.7205965420031763e-06, + "loss": 1.3250120878219604, + "step": 2608 + }, + { + "epoch": 0.8033859176606387, + "grad_norm": 5.96875, + "learning_rate": 3.718736375072729e-06, + "loss": 1.4196326732635498, + "step": 2610 + }, + { + "epoch": 0.8040015390534822, + "grad_norm": 9.75, + "learning_rate": 3.716875493917834e-06, + "loss": 0.9277944564819336, + "step": 2612 + }, + { + "epoch": 0.8046171604463255, + "grad_norm": 8.625, + "learning_rate": 3.715013900387693e-06, + "loss": 1.0608748197555542, + "step": 2614 + }, + { + "epoch": 0.8052327818391689, + "grad_norm": 5.875, + "learning_rate": 3.7131515963322106e-06, + "loss": 1.126943826675415, + "step": 2616 + }, + { + "epoch": 0.8058484032320123, + "grad_norm": 15.875, + "learning_rate": 3.7112885836020017e-06, + "loss": 1.363018274307251, + "step": 2618 + }, + { + "epoch": 0.8064640246248557, + "grad_norm": 5.21875, + "learning_rate": 3.7094248640483834e-06, + "loss": 1.227006196975708, + "step": 2620 + }, + { + "epoch": 0.8070796460176991, + "grad_norm": 8.1875, + "learning_rate": 3.7075604395233745e-06, + "loss": 1.02199125289917, + "step": 2622 + }, + { + "epoch": 0.8076952674105425, + "grad_norm": 1.9375, + "learning_rate": 3.705695311879696e-06, + "loss": 1.1358833312988281, + "step": 2624 + }, + { + "epoch": 0.8083108888033859, + "grad_norm": 6.875, + "learning_rate": 3.7038294829707675e-06, + "loss": 1.2867252826690674, + "step": 2626 + }, + { + "epoch": 0.8089265101962293, + "grad_norm": 8.5, + "learning_rate": 3.701962954650705e-06, + "loss": 1.364060401916504, + "step": 2628 + }, + { + "epoch": 0.8095421315890727, + "grad_norm": 4.78125, + "learning_rate": 3.70009572877432e-06, + "loss": 0.9407942295074463, + "step": 2630 + }, + { + "epoch": 0.8101577529819162, + "grad_norm": 2.4375, + "learning_rate": 3.6982278071971158e-06, + "loss": 1.1905866861343384, + "step": 2632 + }, + { + "epoch": 0.8107733743747595, + "grad_norm": 9.0, + "learning_rate": 3.696359191775288e-06, + "loss": 1.0922561883926392, + "step": 2634 + }, + { + "epoch": 0.8113889957676029, + "grad_norm": 7.03125, + "learning_rate": 3.694489884365722e-06, + "loss": 1.2597466707229614, + "step": 2636 + }, + { + "epoch": 0.8120046171604464, + "grad_norm": 9.875, + "learning_rate": 3.69261988682599e-06, + "loss": 1.451778769493103, + "step": 2638 + }, + { + "epoch": 0.8126202385532897, + "grad_norm": 11.0, + "learning_rate": 3.690749201014352e-06, + "loss": 1.3754864931106567, + "step": 2640 + }, + { + "epoch": 0.8132358599461331, + "grad_norm": 9.5, + "learning_rate": 3.6888778287897477e-06, + "loss": 1.460006833076477, + "step": 2642 + }, + { + "epoch": 0.8138514813389766, + "grad_norm": 7.78125, + "learning_rate": 3.6870057720118036e-06, + "loss": 1.3695088624954224, + "step": 2644 + }, + { + "epoch": 0.8144671027318199, + "grad_norm": 4.625, + "learning_rate": 3.6851330325408242e-06, + "loss": 1.2939337491989136, + "step": 2646 + }, + { + "epoch": 0.8150827241246633, + "grad_norm": 3.6875, + "learning_rate": 3.6832596122377926e-06, + "loss": 1.1832215785980225, + "step": 2648 + }, + { + "epoch": 0.8156983455175068, + "grad_norm": 4.84375, + "learning_rate": 3.681385512964368e-06, + "loss": 1.4975953102111816, + "step": 2650 + }, + { + "epoch": 0.8163139669103502, + "grad_norm": 5.6875, + "learning_rate": 3.6795107365828862e-06, + "loss": 1.2492905855178833, + "step": 2652 + }, + { + "epoch": 0.8169295883031935, + "grad_norm": 49.75, + "learning_rate": 3.6776352849563534e-06, + "loss": 1.4812604188919067, + "step": 2654 + }, + { + "epoch": 0.817545209696037, + "grad_norm": 9.875, + "learning_rate": 3.6757591599484476e-06, + "loss": 1.4885751008987427, + "step": 2656 + }, + { + "epoch": 0.8181608310888804, + "grad_norm": 6.09375, + "learning_rate": 3.6738823634235175e-06, + "loss": 1.0009148120880127, + "step": 2658 + }, + { + "epoch": 0.8187764524817237, + "grad_norm": 5.1875, + "learning_rate": 3.6720048972465773e-06, + "loss": 1.2781193256378174, + "step": 2660 + }, + { + "epoch": 0.8193920738745671, + "grad_norm": 7.875, + "learning_rate": 3.670126763283307e-06, + "loss": 1.4532701969146729, + "step": 2662 + }, + { + "epoch": 0.8200076952674106, + "grad_norm": 4.9375, + "learning_rate": 3.66824796340005e-06, + "loss": 1.2384744882583618, + "step": 2664 + }, + { + "epoch": 0.8206233166602539, + "grad_norm": 7.75, + "learning_rate": 3.666368499463814e-06, + "loss": 1.1035149097442627, + "step": 2666 + }, + { + "epoch": 0.8212389380530973, + "grad_norm": 9.6875, + "learning_rate": 3.664488373342261e-06, + "loss": 1.0949159860610962, + "step": 2668 + }, + { + "epoch": 0.8218545594459408, + "grad_norm": 7.96875, + "learning_rate": 3.662607586903717e-06, + "loss": 1.260567545890808, + "step": 2670 + }, + { + "epoch": 0.8224701808387841, + "grad_norm": 5.65625, + "learning_rate": 3.6607261420171614e-06, + "loss": 1.4338582754135132, + "step": 2672 + }, + { + "epoch": 0.8230858022316275, + "grad_norm": 7.21875, + "learning_rate": 3.6588440405522275e-06, + "loss": 1.106650710105896, + "step": 2674 + }, + { + "epoch": 0.823701423624471, + "grad_norm": 7.9375, + "learning_rate": 3.6569612843792015e-06, + "loss": 1.488775610923767, + "step": 2676 + }, + { + "epoch": 0.8243170450173144, + "grad_norm": 4.8125, + "learning_rate": 3.655077875369022e-06, + "loss": 1.230544090270996, + "step": 2678 + }, + { + "epoch": 0.8249326664101577, + "grad_norm": 4.625, + "learning_rate": 3.653193815393272e-06, + "loss": 1.0270192623138428, + "step": 2680 + }, + { + "epoch": 0.8255482878030012, + "grad_norm": 8.5, + "learning_rate": 3.6513091063241878e-06, + "loss": 1.27272367477417, + "step": 2682 + }, + { + "epoch": 0.8261639091958446, + "grad_norm": 10.6875, + "learning_rate": 3.649423750034643e-06, + "loss": 1.2475188970565796, + "step": 2684 + }, + { + "epoch": 0.8267795305886879, + "grad_norm": 6.71875, + "learning_rate": 3.647537748398162e-06, + "loss": 1.383508563041687, + "step": 2686 + }, + { + "epoch": 0.8273951519815314, + "grad_norm": 2.59375, + "learning_rate": 3.645651103288904e-06, + "loss": 1.3146872520446777, + "step": 2688 + }, + { + "epoch": 0.8280107733743748, + "grad_norm": 8.375, + "learning_rate": 3.6437638165816725e-06, + "loss": 1.4075785875320435, + "step": 2690 + }, + { + "epoch": 0.8286263947672181, + "grad_norm": 4.8125, + "learning_rate": 3.641875890151906e-06, + "loss": 1.0752105712890625, + "step": 2692 + }, + { + "epoch": 0.8292420161600615, + "grad_norm": 14.6875, + "learning_rate": 3.63998732587568e-06, + "loss": 1.566037654876709, + "step": 2694 + }, + { + "epoch": 0.829857637552905, + "grad_norm": 9.5, + "learning_rate": 3.638098125629701e-06, + "loss": 1.4545506238937378, + "step": 2696 + }, + { + "epoch": 0.8304732589457484, + "grad_norm": 9.4375, + "learning_rate": 3.636208291291312e-06, + "loss": 1.1511316299438477, + "step": 2698 + }, + { + "epoch": 0.8310888803385917, + "grad_norm": 9.125, + "learning_rate": 3.6343178247384827e-06, + "loss": 1.2352555990219116, + "step": 2700 + }, + { + "epoch": 0.8317045017314352, + "grad_norm": 4.71875, + "learning_rate": 3.6324267278498114e-06, + "loss": 1.4677183628082275, + "step": 2702 + }, + { + "epoch": 0.8323201231242786, + "grad_norm": 1.609375, + "learning_rate": 3.630535002504526e-06, + "loss": 1.2390086650848389, + "step": 2704 + }, + { + "epoch": 0.8329357445171219, + "grad_norm": 36.25, + "learning_rate": 3.6286426505824734e-06, + "loss": 1.2680785655975342, + "step": 2706 + }, + { + "epoch": 0.8335513659099654, + "grad_norm": 15.75, + "learning_rate": 3.6267496739641272e-06, + "loss": 1.656466007232666, + "step": 2708 + }, + { + "epoch": 0.8341669873028088, + "grad_norm": 14.125, + "learning_rate": 3.6248560745305818e-06, + "loss": 1.5022876262664795, + "step": 2710 + }, + { + "epoch": 0.8347826086956521, + "grad_norm": 8.75, + "learning_rate": 3.622961854163548e-06, + "loss": 1.4779717922210693, + "step": 2712 + }, + { + "epoch": 0.8353982300884956, + "grad_norm": 2.125, + "learning_rate": 3.6210670147453555e-06, + "loss": 0.9939274191856384, + "step": 2714 + }, + { + "epoch": 0.836013851481339, + "grad_norm": 14.75, + "learning_rate": 3.6191715581589482e-06, + "loss": 1.1958377361297607, + "step": 2716 + }, + { + "epoch": 0.8366294728741823, + "grad_norm": 8.9375, + "learning_rate": 3.6172754862878844e-06, + "loss": 1.7432280778884888, + "step": 2718 + }, + { + "epoch": 0.8372450942670258, + "grad_norm": 7.53125, + "learning_rate": 3.6153788010163336e-06, + "loss": 1.4366165399551392, + "step": 2720 + }, + { + "epoch": 0.8378607156598692, + "grad_norm": 6.25, + "learning_rate": 3.6134815042290737e-06, + "loss": 1.3425190448760986, + "step": 2722 + }, + { + "epoch": 0.8384763370527126, + "grad_norm": 23.875, + "learning_rate": 3.611583597811491e-06, + "loss": 1.2663666009902954, + "step": 2724 + }, + { + "epoch": 0.839091958445556, + "grad_norm": 4.0625, + "learning_rate": 3.609685083649579e-06, + "loss": 1.2251818180084229, + "step": 2726 + }, + { + "epoch": 0.8397075798383994, + "grad_norm": 11.125, + "learning_rate": 3.6077859636299316e-06, + "loss": 1.2505325078964233, + "step": 2728 + }, + { + "epoch": 0.8403232012312428, + "grad_norm": 3.578125, + "learning_rate": 3.60588623963975e-06, + "loss": 1.2615528106689453, + "step": 2730 + }, + { + "epoch": 0.8409388226240861, + "grad_norm": 4.78125, + "learning_rate": 3.6039859135668287e-06, + "loss": 1.3801602125167847, + "step": 2732 + }, + { + "epoch": 0.8415544440169296, + "grad_norm": 12.125, + "learning_rate": 3.602084987299567e-06, + "loss": 1.6640632152557373, + "step": 2734 + }, + { + "epoch": 0.842170065409773, + "grad_norm": 9.0, + "learning_rate": 3.6001834627269573e-06, + "loss": 1.6788702011108398, + "step": 2736 + }, + { + "epoch": 0.8427856868026163, + "grad_norm": 8.125, + "learning_rate": 3.5982813417385876e-06, + "loss": 1.5393545627593994, + "step": 2738 + }, + { + "epoch": 0.8434013081954598, + "grad_norm": 2.375, + "learning_rate": 3.596378626224636e-06, + "loss": 1.3888500928878784, + "step": 2740 + }, + { + "epoch": 0.8440169295883032, + "grad_norm": 6.71875, + "learning_rate": 3.594475318075876e-06, + "loss": 1.542112112045288, + "step": 2742 + }, + { + "epoch": 0.8446325509811466, + "grad_norm": 5.65625, + "learning_rate": 3.592571419183667e-06, + "loss": 1.5727214813232422, + "step": 2744 + }, + { + "epoch": 0.84524817237399, + "grad_norm": 8.3125, + "learning_rate": 3.5906669314399555e-06, + "loss": 1.481203556060791, + "step": 2746 + }, + { + "epoch": 0.8458637937668334, + "grad_norm": 11.625, + "learning_rate": 3.5887618567372752e-06, + "loss": 1.3226168155670166, + "step": 2748 + }, + { + "epoch": 0.8464794151596768, + "grad_norm": 15.6875, + "learning_rate": 3.5868561969687387e-06, + "loss": 1.7433969974517822, + "step": 2750 + }, + { + "epoch": 0.8470950365525202, + "grad_norm": 5.0, + "learning_rate": 3.584949954028045e-06, + "loss": 1.3892580270767212, + "step": 2752 + }, + { + "epoch": 0.8477106579453636, + "grad_norm": 6.5625, + "learning_rate": 3.583043129809469e-06, + "loss": 1.2613940238952637, + "step": 2754 + }, + { + "epoch": 0.848326279338207, + "grad_norm": 2.4375, + "learning_rate": 3.581135726207867e-06, + "loss": 0.9564993381500244, + "step": 2756 + }, + { + "epoch": 0.8489419007310504, + "grad_norm": 7.90625, + "learning_rate": 3.5792277451186665e-06, + "loss": 1.3259934186935425, + "step": 2758 + }, + { + "epoch": 0.8495575221238938, + "grad_norm": 6.03125, + "learning_rate": 3.577319188437872e-06, + "loss": 1.374593734741211, + "step": 2760 + }, + { + "epoch": 0.8501731435167372, + "grad_norm": 7.625, + "learning_rate": 3.5754100580620587e-06, + "loss": 1.2887630462646484, + "step": 2762 + }, + { + "epoch": 0.8507887649095806, + "grad_norm": 6.25, + "learning_rate": 3.573500355888372e-06, + "loss": 1.3124195337295532, + "step": 2764 + }, + { + "epoch": 0.851404386302424, + "grad_norm": 2.125, + "learning_rate": 3.5715900838145267e-06, + "loss": 1.0512268543243408, + "step": 2766 + }, + { + "epoch": 0.8520200076952674, + "grad_norm": 7.53125, + "learning_rate": 3.569679243738803e-06, + "loss": 1.4116840362548828, + "step": 2768 + }, + { + "epoch": 0.8526356290881109, + "grad_norm": 6.15625, + "learning_rate": 3.5677678375600443e-06, + "loss": 1.2163076400756836, + "step": 2770 + }, + { + "epoch": 0.8532512504809542, + "grad_norm": 15.0, + "learning_rate": 3.5658558671776577e-06, + "loss": 1.4438374042510986, + "step": 2772 + }, + { + "epoch": 0.8538668718737976, + "grad_norm": 9.1875, + "learning_rate": 3.5639433344916117e-06, + "loss": 1.4797570705413818, + "step": 2774 + }, + { + "epoch": 0.854482493266641, + "grad_norm": 10.375, + "learning_rate": 3.5620302414024345e-06, + "loss": 1.3446606397628784, + "step": 2776 + }, + { + "epoch": 0.8550981146594844, + "grad_norm": 3.296875, + "learning_rate": 3.560116589811207e-06, + "loss": 1.3919823169708252, + "step": 2778 + }, + { + "epoch": 0.8557137360523278, + "grad_norm": 56.25, + "learning_rate": 3.5582023816195687e-06, + "loss": 1.4049403667449951, + "step": 2780 + }, + { + "epoch": 0.8563293574451712, + "grad_norm": 6.4375, + "learning_rate": 3.5562876187297125e-06, + "loss": 1.478960394859314, + "step": 2782 + }, + { + "epoch": 0.8569449788380146, + "grad_norm": 1.4453125, + "learning_rate": 3.554372303044379e-06, + "loss": 1.004663109779358, + "step": 2784 + }, + { + "epoch": 0.857560600230858, + "grad_norm": 35.5, + "learning_rate": 3.5524564364668602e-06, + "loss": 1.060371994972229, + "step": 2786 + }, + { + "epoch": 0.8581762216237014, + "grad_norm": 4.0625, + "learning_rate": 3.550540020900998e-06, + "loss": 1.1833809614181519, + "step": 2788 + }, + { + "epoch": 0.8587918430165449, + "grad_norm": 7.5625, + "learning_rate": 3.548623058251176e-06, + "loss": 1.2524820566177368, + "step": 2790 + }, + { + "epoch": 0.8594074644093882, + "grad_norm": 5.90625, + "learning_rate": 3.5467055504223225e-06, + "loss": 0.8956518173217773, + "step": 2792 + }, + { + "epoch": 0.8600230858022316, + "grad_norm": 6.125, + "learning_rate": 3.5447874993199095e-06, + "loss": 1.0373610258102417, + "step": 2794 + }, + { + "epoch": 0.8606387071950751, + "grad_norm": 10.3125, + "learning_rate": 3.542868906849947e-06, + "loss": 1.2199324369430542, + "step": 2796 + }, + { + "epoch": 0.8612543285879184, + "grad_norm": 4.4375, + "learning_rate": 3.5409497749189814e-06, + "loss": 1.2490025758743286, + "step": 2798 + }, + { + "epoch": 0.8618699499807618, + "grad_norm": 6.96875, + "learning_rate": 3.539030105434099e-06, + "loss": 1.2936158180236816, + "step": 2800 + }, + { + "epoch": 0.8624855713736053, + "grad_norm": 6.1875, + "learning_rate": 3.5371099003029184e-06, + "loss": 0.8811333179473877, + "step": 2802 + }, + { + "epoch": 0.8631011927664486, + "grad_norm": 7.3125, + "learning_rate": 3.535189161433591e-06, + "loss": 1.2990635633468628, + "step": 2804 + }, + { + "epoch": 0.863716814159292, + "grad_norm": 7.65625, + "learning_rate": 3.5332678907347963e-06, + "loss": 1.4486134052276611, + "step": 2806 + }, + { + "epoch": 0.8643324355521355, + "grad_norm": 8.5, + "learning_rate": 3.531346090115745e-06, + "loss": 1.3083335161209106, + "step": 2808 + }, + { + "epoch": 0.8649480569449788, + "grad_norm": 2.359375, + "learning_rate": 3.5294237614861738e-06, + "loss": 1.341057300567627, + "step": 2810 + }, + { + "epoch": 0.8655636783378222, + "grad_norm": 5.3125, + "learning_rate": 3.5275009067563413e-06, + "loss": 1.33603835105896, + "step": 2812 + }, + { + "epoch": 0.8661792997306657, + "grad_norm": 4.78125, + "learning_rate": 3.5255775278370363e-06, + "loss": 1.279923677444458, + "step": 2814 + }, + { + "epoch": 0.8667949211235091, + "grad_norm": 5.125, + "learning_rate": 3.5236536266395594e-06, + "loss": 1.374291181564331, + "step": 2816 + }, + { + "epoch": 0.8674105425163524, + "grad_norm": 6.3125, + "learning_rate": 3.521729205075736e-06, + "loss": 1.5705500841140747, + "step": 2818 + }, + { + "epoch": 0.8680261639091958, + "grad_norm": 14.6875, + "learning_rate": 3.5198042650579043e-06, + "loss": 1.232345700263977, + "step": 2820 + }, + { + "epoch": 0.8686417853020393, + "grad_norm": 3.59375, + "learning_rate": 3.5178788084989244e-06, + "loss": 1.432027816772461, + "step": 2822 + }, + { + "epoch": 0.8692574066948826, + "grad_norm": 4.46875, + "learning_rate": 3.5159528373121645e-06, + "loss": 1.2847026586532593, + "step": 2824 + }, + { + "epoch": 0.869873028087726, + "grad_norm": 3.46875, + "learning_rate": 3.5140263534115038e-06, + "loss": 1.088910460472107, + "step": 2826 + }, + { + "epoch": 0.8704886494805695, + "grad_norm": 3.5, + "learning_rate": 3.512099358711333e-06, + "loss": 1.2364428043365479, + "step": 2828 + }, + { + "epoch": 0.8711042708734128, + "grad_norm": 9.4375, + "learning_rate": 3.5101718551265505e-06, + "loss": 1.2363696098327637, + "step": 2830 + }, + { + "epoch": 0.8717198922662562, + "grad_norm": 4.78125, + "learning_rate": 3.50824384457256e-06, + "loss": 1.062808871269226, + "step": 2832 + }, + { + "epoch": 0.8723355136590997, + "grad_norm": 11.625, + "learning_rate": 3.5063153289652685e-06, + "loss": 1.218300700187683, + "step": 2834 + }, + { + "epoch": 0.8729511350519431, + "grad_norm": 8.5, + "learning_rate": 3.5043863102210853e-06, + "loss": 1.5316327810287476, + "step": 2836 + }, + { + "epoch": 0.8735667564447864, + "grad_norm": 44.5, + "learning_rate": 3.5024567902569196e-06, + "loss": 1.658814549446106, + "step": 2838 + }, + { + "epoch": 0.8741823778376299, + "grad_norm": 24.375, + "learning_rate": 3.500526770990177e-06, + "loss": 1.4052304029464722, + "step": 2840 + }, + { + "epoch": 0.8747979992304733, + "grad_norm": 17.75, + "learning_rate": 3.4985962543387632e-06, + "loss": 1.5683777332305908, + "step": 2842 + }, + { + "epoch": 0.8754136206233166, + "grad_norm": 8.4375, + "learning_rate": 3.4966652422210746e-06, + "loss": 1.5750112533569336, + "step": 2844 + }, + { + "epoch": 0.8760292420161601, + "grad_norm": 11.4375, + "learning_rate": 3.4947337365560023e-06, + "loss": 1.8130897283554077, + "step": 2846 + }, + { + "epoch": 0.8766448634090035, + "grad_norm": 7.5, + "learning_rate": 3.4928017392629265e-06, + "loss": 1.3907134532928467, + "step": 2848 + }, + { + "epoch": 0.8772604848018468, + "grad_norm": 7.03125, + "learning_rate": 3.4908692522617147e-06, + "loss": 1.6184890270233154, + "step": 2850 + }, + { + "epoch": 0.8778761061946903, + "grad_norm": 8.3125, + "learning_rate": 3.4889362774727244e-06, + "loss": 1.3463777303695679, + "step": 2852 + }, + { + "epoch": 0.8784917275875337, + "grad_norm": 6.59375, + "learning_rate": 3.487002816816796e-06, + "loss": 1.1038285493850708, + "step": 2854 + }, + { + "epoch": 0.8791073489803771, + "grad_norm": 4.59375, + "learning_rate": 3.4850688722152498e-06, + "loss": 1.0783321857452393, + "step": 2856 + }, + { + "epoch": 0.8797229703732204, + "grad_norm": 6.46875, + "learning_rate": 3.4831344455898937e-06, + "loss": 1.2051217555999756, + "step": 2858 + }, + { + "epoch": 0.8803385917660639, + "grad_norm": 8.875, + "learning_rate": 3.4811995388630093e-06, + "loss": 1.3849332332611084, + "step": 2860 + }, + { + "epoch": 0.8809542131589073, + "grad_norm": 11.1875, + "learning_rate": 3.4792641539573558e-06, + "loss": 1.4876724481582642, + "step": 2862 + }, + { + "epoch": 0.8815698345517506, + "grad_norm": 15.375, + "learning_rate": 3.4773282927961693e-06, + "loss": 1.4236429929733276, + "step": 2864 + }, + { + "epoch": 0.8821854559445941, + "grad_norm": 17.125, + "learning_rate": 3.475391957303159e-06, + "loss": 0.7241360545158386, + "step": 2866 + }, + { + "epoch": 0.8828010773374375, + "grad_norm": 31.375, + "learning_rate": 3.4734551494025047e-06, + "loss": 1.4327094554901123, + "step": 2868 + }, + { + "epoch": 0.8834166987302808, + "grad_norm": 7.46875, + "learning_rate": 3.4715178710188552e-06, + "loss": 1.6214407682418823, + "step": 2870 + }, + { + "epoch": 0.8840323201231243, + "grad_norm": 11.875, + "learning_rate": 3.469580124077328e-06, + "loss": 1.151062250137329, + "step": 2872 + }, + { + "epoch": 0.8846479415159677, + "grad_norm": 4.6875, + "learning_rate": 3.4676419105035054e-06, + "loss": 1.0868035554885864, + "step": 2874 + }, + { + "epoch": 0.885263562908811, + "grad_norm": 9.3125, + "learning_rate": 3.465703232223433e-06, + "loss": 1.3283675909042358, + "step": 2876 + }, + { + "epoch": 0.8858791843016545, + "grad_norm": 7.28125, + "learning_rate": 3.4637640911636206e-06, + "loss": 1.2539609670639038, + "step": 2878 + }, + { + "epoch": 0.8864948056944979, + "grad_norm": 3.40625, + "learning_rate": 3.4618244892510346e-06, + "loss": 1.2640516757965088, + "step": 2880 + }, + { + "epoch": 0.8871104270873413, + "grad_norm": 5.4375, + "learning_rate": 3.4598844284131017e-06, + "loss": 1.2733051776885986, + "step": 2882 + }, + { + "epoch": 0.8877260484801847, + "grad_norm": 9.4375, + "learning_rate": 3.4579439105777034e-06, + "loss": 1.0956273078918457, + "step": 2884 + }, + { + "epoch": 0.8883416698730281, + "grad_norm": 5.09375, + "learning_rate": 3.4560029376731765e-06, + "loss": 1.4090516567230225, + "step": 2886 + }, + { + "epoch": 0.8889572912658715, + "grad_norm": 7.125, + "learning_rate": 3.454061511628308e-06, + "loss": 1.0756299495697021, + "step": 2888 + }, + { + "epoch": 0.8895729126587149, + "grad_norm": 13.8125, + "learning_rate": 3.4521196343723377e-06, + "loss": 0.6679868102073669, + "step": 2890 + }, + { + "epoch": 0.8901885340515583, + "grad_norm": 2.765625, + "learning_rate": 3.450177307834952e-06, + "loss": 1.3705148696899414, + "step": 2892 + }, + { + "epoch": 0.8908041554444017, + "grad_norm": 5.03125, + "learning_rate": 3.448234533946284e-06, + "loss": 1.2735440731048584, + "step": 2894 + }, + { + "epoch": 0.891419776837245, + "grad_norm": 12.1875, + "learning_rate": 3.4462913146369124e-06, + "loss": 1.1582732200622559, + "step": 2896 + }, + { + "epoch": 0.8920353982300885, + "grad_norm": 28.25, + "learning_rate": 3.4443476518378583e-06, + "loss": 1.429880976676941, + "step": 2898 + }, + { + "epoch": 0.8926510196229319, + "grad_norm": 6.96875, + "learning_rate": 3.4424035474805808e-06, + "loss": 1.5774238109588623, + "step": 2900 + }, + { + "epoch": 0.8932666410157754, + "grad_norm": 5.25, + "learning_rate": 3.440459003496982e-06, + "loss": 1.1342121362686157, + "step": 2902 + }, + { + "epoch": 0.8938822624086187, + "grad_norm": 7.28125, + "learning_rate": 3.4385140218193978e-06, + "loss": 1.3799785375595093, + "step": 2904 + }, + { + "epoch": 0.8944978838014621, + "grad_norm": 6.46875, + "learning_rate": 3.4365686043806014e-06, + "loss": 1.3847746849060059, + "step": 2906 + }, + { + "epoch": 0.8951135051943055, + "grad_norm": 9.0625, + "learning_rate": 3.4346227531137954e-06, + "loss": 1.256567120552063, + "step": 2908 + }, + { + "epoch": 0.8957291265871489, + "grad_norm": 21.25, + "learning_rate": 3.4326764699526184e-06, + "loss": 1.6475443840026855, + "step": 2910 + }, + { + "epoch": 0.8963447479799923, + "grad_norm": 7.9375, + "learning_rate": 3.4307297568311337e-06, + "loss": 1.379146933555603, + "step": 2912 + }, + { + "epoch": 0.8969603693728357, + "grad_norm": 8.625, + "learning_rate": 3.428782615683835e-06, + "loss": 1.2929694652557373, + "step": 2914 + }, + { + "epoch": 0.8975759907656791, + "grad_norm": 13.25, + "learning_rate": 3.4268350484456385e-06, + "loss": 1.4918608665466309, + "step": 2916 + }, + { + "epoch": 0.8981916121585225, + "grad_norm": 2.421875, + "learning_rate": 3.4248870570518875e-06, + "loss": 1.288706660270691, + "step": 2918 + }, + { + "epoch": 0.8988072335513659, + "grad_norm": 5.1875, + "learning_rate": 3.4229386434383438e-06, + "loss": 1.5433827638626099, + "step": 2920 + }, + { + "epoch": 0.8994228549442093, + "grad_norm": 5.03125, + "learning_rate": 3.4209898095411894e-06, + "loss": 1.0797992944717407, + "step": 2922 + }, + { + "epoch": 0.9000384763370527, + "grad_norm": 26.875, + "learning_rate": 3.4190405572970242e-06, + "loss": 1.0770173072814941, + "step": 2924 + }, + { + "epoch": 0.9006540977298961, + "grad_norm": 6.375, + "learning_rate": 3.4170908886428644e-06, + "loss": 1.7204927206039429, + "step": 2926 + }, + { + "epoch": 0.9012697191227396, + "grad_norm": 5.09375, + "learning_rate": 3.4151408055161385e-06, + "loss": 1.4469215869903564, + "step": 2928 + }, + { + "epoch": 0.9018853405155829, + "grad_norm": 4.46875, + "learning_rate": 3.413190309854688e-06, + "loss": 1.4087347984313965, + "step": 2930 + }, + { + "epoch": 0.9025009619084263, + "grad_norm": 13.125, + "learning_rate": 3.4112394035967656e-06, + "loss": 1.660951852798462, + "step": 2932 + }, + { + "epoch": 0.9031165833012698, + "grad_norm": 2.671875, + "learning_rate": 3.40928808868103e-06, + "loss": 1.0870805978775024, + "step": 2934 + }, + { + "epoch": 0.9037322046941131, + "grad_norm": 4.375, + "learning_rate": 3.407336367046545e-06, + "loss": 1.313590168952942, + "step": 2936 + }, + { + "epoch": 0.9043478260869565, + "grad_norm": 5.8125, + "learning_rate": 3.405384240632782e-06, + "loss": 1.594503402709961, + "step": 2938 + }, + { + "epoch": 0.9049634474798, + "grad_norm": 4.71875, + "learning_rate": 3.4034317113796125e-06, + "loss": 1.244261384010315, + "step": 2940 + }, + { + "epoch": 0.9055790688726433, + "grad_norm": 17.375, + "learning_rate": 3.4014787812273063e-06, + "loss": 1.5862045288085938, + "step": 2942 + }, + { + "epoch": 0.9061946902654867, + "grad_norm": 12.125, + "learning_rate": 3.3995254521165376e-06, + "loss": 1.4014017581939697, + "step": 2944 + }, + { + "epoch": 0.9068103116583301, + "grad_norm": 6.375, + "learning_rate": 3.397571725988371e-06, + "loss": 1.2643262147903442, + "step": 2946 + }, + { + "epoch": 0.9074259330511736, + "grad_norm": 9.9375, + "learning_rate": 3.3956176047842683e-06, + "loss": 1.7110437154769897, + "step": 2948 + }, + { + "epoch": 0.9080415544440169, + "grad_norm": 7.21875, + "learning_rate": 3.393663090446083e-06, + "loss": 1.3050544261932373, + "step": 2950 + }, + { + "epoch": 0.9086571758368603, + "grad_norm": 20.375, + "learning_rate": 3.391708184916061e-06, + "loss": 0.8747286796569824, + "step": 2952 + }, + { + "epoch": 0.9092727972297038, + "grad_norm": 5.9375, + "learning_rate": 3.389752890136835e-06, + "loss": 0.7631102800369263, + "step": 2954 + }, + { + "epoch": 0.9098884186225471, + "grad_norm": 22.0, + "learning_rate": 3.3877972080514255e-06, + "loss": 1.5913681983947754, + "step": 2956 + }, + { + "epoch": 0.9105040400153905, + "grad_norm": 4.78125, + "learning_rate": 3.385841140603238e-06, + "loss": 1.36069655418396, + "step": 2958 + }, + { + "epoch": 0.911119661408234, + "grad_norm": 5.53125, + "learning_rate": 3.3838846897360595e-06, + "loss": 1.1376392841339111, + "step": 2960 + }, + { + "epoch": 0.9117352828010773, + "grad_norm": 12.75, + "learning_rate": 3.3819278573940595e-06, + "loss": 1.6261056661605835, + "step": 2962 + }, + { + "epoch": 0.9123509041939207, + "grad_norm": 6.9375, + "learning_rate": 3.3799706455217875e-06, + "loss": 1.4007984399795532, + "step": 2964 + }, + { + "epoch": 0.9129665255867642, + "grad_norm": 4.28125, + "learning_rate": 3.3780130560641666e-06, + "loss": 1.3082530498504639, + "step": 2966 + }, + { + "epoch": 0.9135821469796075, + "grad_norm": 11.0625, + "learning_rate": 3.376055090966499e-06, + "loss": 1.2865114212036133, + "step": 2968 + }, + { + "epoch": 0.9141977683724509, + "grad_norm": 16.625, + "learning_rate": 3.3740967521744584e-06, + "loss": 1.3429794311523438, + "step": 2970 + }, + { + "epoch": 0.9148133897652944, + "grad_norm": 5.90625, + "learning_rate": 3.372138041634088e-06, + "loss": 1.3486827611923218, + "step": 2972 + }, + { + "epoch": 0.9154290111581378, + "grad_norm": 27.5, + "learning_rate": 3.3701789612918047e-06, + "loss": 1.6032177209854126, + "step": 2974 + }, + { + "epoch": 0.9160446325509811, + "grad_norm": 5.25, + "learning_rate": 3.3682195130943897e-06, + "loss": 1.3208991289138794, + "step": 2976 + }, + { + "epoch": 0.9166602539438246, + "grad_norm": 22.875, + "learning_rate": 3.3662596989889906e-06, + "loss": 1.5932797193527222, + "step": 2978 + }, + { + "epoch": 0.917275875336668, + "grad_norm": 2.9375, + "learning_rate": 3.364299520923118e-06, + "loss": 1.0427111387252808, + "step": 2980 + }, + { + "epoch": 0.9178914967295113, + "grad_norm": 3.578125, + "learning_rate": 3.3623389808446468e-06, + "loss": 1.0028395652770996, + "step": 2982 + }, + { + "epoch": 0.9185071181223547, + "grad_norm": 7.75, + "learning_rate": 3.360378080701807e-06, + "loss": 1.156231164932251, + "step": 2984 + }, + { + "epoch": 0.9191227395151982, + "grad_norm": 1.9140625, + "learning_rate": 3.3584168224431902e-06, + "loss": 1.1786229610443115, + "step": 2986 + }, + { + "epoch": 0.9197383609080415, + "grad_norm": 5.9375, + "learning_rate": 3.3564552080177438e-06, + "loss": 1.3133583068847656, + "step": 2988 + }, + { + "epoch": 0.9203539823008849, + "grad_norm": 5.53125, + "learning_rate": 3.354493239374766e-06, + "loss": 1.6019172668457031, + "step": 2990 + }, + { + "epoch": 0.9209696036937284, + "grad_norm": 6.15625, + "learning_rate": 3.35253091846391e-06, + "loss": 1.4596740007400513, + "step": 2992 + }, + { + "epoch": 0.9215852250865718, + "grad_norm": 5.65625, + "learning_rate": 3.350568247235178e-06, + "loss": 0.813117265701294, + "step": 2994 + }, + { + "epoch": 0.9222008464794151, + "grad_norm": 9.125, + "learning_rate": 3.348605227638921e-06, + "loss": 1.4932818412780762, + "step": 2996 + }, + { + "epoch": 0.9228164678722586, + "grad_norm": 9.625, + "learning_rate": 3.3466418616258345e-06, + "loss": 1.745734453201294, + "step": 2998 + }, + { + "epoch": 0.923432089265102, + "grad_norm": 6.65625, + "learning_rate": 3.3446781511469606e-06, + "loss": 1.5422130823135376, + "step": 3000 + }, + { + "epoch": 0.9240477106579453, + "grad_norm": 3.0, + "learning_rate": 3.342714098153681e-06, + "loss": 1.240844964981079, + "step": 3002 + }, + { + "epoch": 0.9246633320507888, + "grad_norm": 7.0625, + "learning_rate": 3.34074970459772e-06, + "loss": 1.431497573852539, + "step": 3004 + }, + { + "epoch": 0.9252789534436322, + "grad_norm": 3.390625, + "learning_rate": 3.3387849724311383e-06, + "loss": 1.3990793228149414, + "step": 3006 + }, + { + "epoch": 0.9258945748364755, + "grad_norm": 6.3125, + "learning_rate": 3.336819903606337e-06, + "loss": 1.4542325735092163, + "step": 3008 + }, + { + "epoch": 0.926510196229319, + "grad_norm": 1.6328125, + "learning_rate": 3.3348545000760468e-06, + "loss": 1.1801506280899048, + "step": 3010 + }, + { + "epoch": 0.9271258176221624, + "grad_norm": 2.75, + "learning_rate": 3.332888763793334e-06, + "loss": 1.095503807067871, + "step": 3012 + }, + { + "epoch": 0.9277414390150057, + "grad_norm": 6.40625, + "learning_rate": 3.3309226967115936e-06, + "loss": 1.2318484783172607, + "step": 3014 + }, + { + "epoch": 0.9283570604078492, + "grad_norm": 3.828125, + "learning_rate": 3.3289563007845525e-06, + "loss": 1.251956820487976, + "step": 3016 + }, + { + "epoch": 0.9289726818006926, + "grad_norm": 7.5625, + "learning_rate": 3.326989577966262e-06, + "loss": 1.3468469381332397, + "step": 3018 + }, + { + "epoch": 0.929588303193536, + "grad_norm": 8.75, + "learning_rate": 3.3250225302110973e-06, + "loss": 1.019060730934143, + "step": 3020 + }, + { + "epoch": 0.9302039245863793, + "grad_norm": 3.046875, + "learning_rate": 3.323055159473759e-06, + "loss": 1.1843026876449585, + "step": 3022 + }, + { + "epoch": 0.9308195459792228, + "grad_norm": 7.59375, + "learning_rate": 3.3210874677092675e-06, + "loss": 1.3412996530532837, + "step": 3024 + }, + { + "epoch": 0.9314351673720662, + "grad_norm": 11.1875, + "learning_rate": 3.319119456872961e-06, + "loss": 1.5219874382019043, + "step": 3026 + }, + { + "epoch": 0.9320507887649095, + "grad_norm": 6.0625, + "learning_rate": 3.3171511289204973e-06, + "loss": 1.341783881187439, + "step": 3028 + }, + { + "epoch": 0.932666410157753, + "grad_norm": 7.65625, + "learning_rate": 3.3151824858078474e-06, + "loss": 1.3534069061279297, + "step": 3030 + }, + { + "epoch": 0.9332820315505964, + "grad_norm": 5.375, + "learning_rate": 3.313213529491297e-06, + "loss": 1.3870600461959839, + "step": 3032 + }, + { + "epoch": 0.9338976529434397, + "grad_norm": 9.1875, + "learning_rate": 3.311244261927441e-06, + "loss": 1.5487487316131592, + "step": 3034 + }, + { + "epoch": 0.9345132743362832, + "grad_norm": 6.1875, + "learning_rate": 3.309274685073185e-06, + "loss": 1.4522387981414795, + "step": 3036 + }, + { + "epoch": 0.9351288957291266, + "grad_norm": 11.9375, + "learning_rate": 3.307304800885741e-06, + "loss": 1.3280080556869507, + "step": 3038 + }, + { + "epoch": 0.93574451712197, + "grad_norm": 10.375, + "learning_rate": 3.3053346113226287e-06, + "loss": 1.4234414100646973, + "step": 3040 + }, + { + "epoch": 0.9363601385148134, + "grad_norm": 5.09375, + "learning_rate": 3.3033641183416686e-06, + "loss": 1.592652440071106, + "step": 3042 + }, + { + "epoch": 0.9369757599076568, + "grad_norm": 4.21875, + "learning_rate": 3.301393323900984e-06, + "loss": 1.0118399858474731, + "step": 3044 + }, + { + "epoch": 0.9375913813005002, + "grad_norm": 3.59375, + "learning_rate": 3.2994222299589986e-06, + "loss": 1.1533851623535156, + "step": 3046 + }, + { + "epoch": 0.9382070026933436, + "grad_norm": 5.1875, + "learning_rate": 3.2974508384744303e-06, + "loss": 1.3363091945648193, + "step": 3048 + }, + { + "epoch": 0.938822624086187, + "grad_norm": 5.4375, + "learning_rate": 3.295479151406296e-06, + "loss": 1.2613701820373535, + "step": 3050 + }, + { + "epoch": 0.9394382454790304, + "grad_norm": 12.5, + "learning_rate": 3.293507170713906e-06, + "loss": 1.6866613626480103, + "step": 3052 + }, + { + "epoch": 0.9400538668718738, + "grad_norm": 7.875, + "learning_rate": 3.2915348983568612e-06, + "loss": 1.5392214059829712, + "step": 3054 + }, + { + "epoch": 0.9406694882647172, + "grad_norm": 5.53125, + "learning_rate": 3.2895623362950533e-06, + "loss": 1.2921439409255981, + "step": 3056 + }, + { + "epoch": 0.9412851096575606, + "grad_norm": 21.875, + "learning_rate": 3.2875894864886604e-06, + "loss": 1.4320118427276611, + "step": 3058 + }, + { + "epoch": 0.941900731050404, + "grad_norm": 5.625, + "learning_rate": 3.2856163508981486e-06, + "loss": 1.557847023010254, + "step": 3060 + }, + { + "epoch": 0.9425163524432474, + "grad_norm": 5.625, + "learning_rate": 3.283642931484266e-06, + "loss": 1.168736457824707, + "step": 3062 + }, + { + "epoch": 0.9431319738360908, + "grad_norm": 6.46875, + "learning_rate": 3.281669230208045e-06, + "loss": 1.7805376052856445, + "step": 3064 + }, + { + "epoch": 0.9437475952289343, + "grad_norm": 8.375, + "learning_rate": 3.2796952490307953e-06, + "loss": 1.4829258918762207, + "step": 3066 + }, + { + "epoch": 0.9443632166217776, + "grad_norm": 4.4375, + "learning_rate": 3.2777209899141084e-06, + "loss": 1.155414342880249, + "step": 3068 + }, + { + "epoch": 0.944978838014621, + "grad_norm": 5.90625, + "learning_rate": 3.275746454819847e-06, + "loss": 1.3559682369232178, + "step": 3070 + }, + { + "epoch": 0.9455944594074644, + "grad_norm": 8.9375, + "learning_rate": 3.273771645710153e-06, + "loss": 1.1242122650146484, + "step": 3072 + }, + { + "epoch": 0.9462100808003078, + "grad_norm": 3.109375, + "learning_rate": 3.2717965645474382e-06, + "loss": 1.1673762798309326, + "step": 3074 + }, + { + "epoch": 0.9468257021931512, + "grad_norm": 4.375, + "learning_rate": 3.269821213294384e-06, + "loss": 1.437583088874817, + "step": 3076 + }, + { + "epoch": 0.9474413235859946, + "grad_norm": 4.875, + "learning_rate": 3.2678455939139418e-06, + "loss": 1.2453110218048096, + "step": 3078 + }, + { + "epoch": 0.948056944978838, + "grad_norm": 12.5, + "learning_rate": 3.2658697083693302e-06, + "loss": 1.655014991760254, + "step": 3080 + }, + { + "epoch": 0.9486725663716814, + "grad_norm": 9.375, + "learning_rate": 3.263893558624028e-06, + "loss": 1.2827417850494385, + "step": 3082 + }, + { + "epoch": 0.9492881877645248, + "grad_norm": 7.53125, + "learning_rate": 3.2619171466417823e-06, + "loss": 0.9656928777694702, + "step": 3084 + }, + { + "epoch": 0.9499038091573683, + "grad_norm": 5.3125, + "learning_rate": 3.259940474386597e-06, + "loss": 1.532065987586975, + "step": 3086 + }, + { + "epoch": 0.9505194305502116, + "grad_norm": 8.5, + "learning_rate": 3.2579635438227354e-06, + "loss": 1.1672122478485107, + "step": 3088 + }, + { + "epoch": 0.951135051943055, + "grad_norm": 2.265625, + "learning_rate": 3.2559863569147167e-06, + "loss": 1.1641114950180054, + "step": 3090 + }, + { + "epoch": 0.9517506733358985, + "grad_norm": 16.25, + "learning_rate": 3.2540089156273185e-06, + "loss": 1.7468979358673096, + "step": 3092 + }, + { + "epoch": 0.9523662947287418, + "grad_norm": 13.6875, + "learning_rate": 3.2520312219255656e-06, + "loss": 1.3798656463623047, + "step": 3094 + }, + { + "epoch": 0.9529819161215852, + "grad_norm": 11.0, + "learning_rate": 3.250053277774739e-06, + "loss": 1.3403314352035522, + "step": 3096 + }, + { + "epoch": 0.9535975375144287, + "grad_norm": 5.375, + "learning_rate": 3.2480750851403652e-06, + "loss": 1.4658020734786987, + "step": 3098 + }, + { + "epoch": 0.954213158907272, + "grad_norm": 5.84375, + "learning_rate": 3.2460966459882182e-06, + "loss": 1.1230003833770752, + "step": 3100 + }, + { + "epoch": 0.9548287803001154, + "grad_norm": 3.9375, + "learning_rate": 3.2441179622843178e-06, + "loss": 1.2341731786727905, + "step": 3102 + }, + { + "epoch": 0.9554444016929589, + "grad_norm": 11.375, + "learning_rate": 3.242139035994926e-06, + "loss": 1.2085893154144287, + "step": 3104 + }, + { + "epoch": 0.9560600230858022, + "grad_norm": 3.265625, + "learning_rate": 3.240159869086546e-06, + "loss": 1.1262381076812744, + "step": 3106 + }, + { + "epoch": 0.9566756444786456, + "grad_norm": 4.28125, + "learning_rate": 3.2381804635259208e-06, + "loss": 1.32058846950531, + "step": 3108 + }, + { + "epoch": 0.957291265871489, + "grad_norm": 4.78125, + "learning_rate": 3.236200821280031e-06, + "loss": 1.0688804388046265, + "step": 3110 + }, + { + "epoch": 0.9579068872643325, + "grad_norm": 21.375, + "learning_rate": 3.2342209443160895e-06, + "loss": 1.5210665464401245, + "step": 3112 + }, + { + "epoch": 0.9585225086571758, + "grad_norm": 31.25, + "learning_rate": 3.2322408346015453e-06, + "loss": 1.36966073513031, + "step": 3114 + }, + { + "epoch": 0.9591381300500192, + "grad_norm": 5.875, + "learning_rate": 3.230260494104078e-06, + "loss": 1.208855390548706, + "step": 3116 + }, + { + "epoch": 0.9597537514428627, + "grad_norm": 53.0, + "learning_rate": 3.2282799247915964e-06, + "loss": 1.295619010925293, + "step": 3118 + }, + { + "epoch": 0.960369372835706, + "grad_norm": 4.15625, + "learning_rate": 3.2262991286322366e-06, + "loss": 1.0792986154556274, + "step": 3120 + }, + { + "epoch": 0.9609849942285494, + "grad_norm": 4.125, + "learning_rate": 3.2243181075943595e-06, + "loss": 1.2362124919891357, + "step": 3122 + }, + { + "epoch": 0.9616006156213929, + "grad_norm": 46.75, + "learning_rate": 3.2223368636465513e-06, + "loss": 1.3643451929092407, + "step": 3124 + }, + { + "epoch": 0.9622162370142362, + "grad_norm": 9.6875, + "learning_rate": 3.2203553987576175e-06, + "loss": 1.571152687072754, + "step": 3126 + }, + { + "epoch": 0.9628318584070796, + "grad_norm": 28.0, + "learning_rate": 3.2183737148965845e-06, + "loss": 1.1599127054214478, + "step": 3128 + }, + { + "epoch": 0.9634474797999231, + "grad_norm": 6.15625, + "learning_rate": 3.216391814032696e-06, + "loss": 0.9567220211029053, + "step": 3130 + }, + { + "epoch": 0.9640631011927665, + "grad_norm": 3.0, + "learning_rate": 3.2144096981354113e-06, + "loss": 1.0243805646896362, + "step": 3132 + }, + { + "epoch": 0.9646787225856098, + "grad_norm": 11.8125, + "learning_rate": 3.2124273691744032e-06, + "loss": 1.4391425848007202, + "step": 3134 + }, + { + "epoch": 0.9652943439784533, + "grad_norm": 3.765625, + "learning_rate": 3.2104448291195567e-06, + "loss": 1.2657476663589478, + "step": 3136 + }, + { + "epoch": 0.9659099653712967, + "grad_norm": 4.125, + "learning_rate": 3.208462079940966e-06, + "loss": 1.4858558177947998, + "step": 3138 + }, + { + "epoch": 0.96652558676414, + "grad_norm": 17.625, + "learning_rate": 3.2064791236089344e-06, + "loss": 1.4508737325668335, + "step": 3140 + }, + { + "epoch": 0.9671412081569835, + "grad_norm": 4.75, + "learning_rate": 3.2044959620939685e-06, + "loss": 1.6149027347564697, + "step": 3142 + }, + { + "epoch": 0.9677568295498269, + "grad_norm": 7.4375, + "learning_rate": 3.2025125973667815e-06, + "loss": 1.2314261198043823, + "step": 3144 + }, + { + "epoch": 0.9683724509426702, + "grad_norm": 5.65625, + "learning_rate": 3.2005290313982864e-06, + "loss": 1.1191548109054565, + "step": 3146 + }, + { + "epoch": 0.9689880723355137, + "grad_norm": 15.75, + "learning_rate": 3.1985452661595984e-06, + "loss": 1.3416482210159302, + "step": 3148 + }, + { + "epoch": 0.9696036937283571, + "grad_norm": 13.3125, + "learning_rate": 3.1965613036220283e-06, + "loss": 1.5457309484481812, + "step": 3150 + }, + { + "epoch": 0.9702193151212004, + "grad_norm": 9.125, + "learning_rate": 3.1945771457570855e-06, + "loss": 1.552435040473938, + "step": 3152 + }, + { + "epoch": 0.9708349365140438, + "grad_norm": 7.65625, + "learning_rate": 3.192592794536471e-06, + "loss": 1.7783514261245728, + "step": 3154 + }, + { + "epoch": 0.9714505579068873, + "grad_norm": 6.875, + "learning_rate": 3.1906082519320793e-06, + "loss": 1.5031394958496094, + "step": 3156 + }, + { + "epoch": 0.9720661792997307, + "grad_norm": 2.9375, + "learning_rate": 3.1886235199159955e-06, + "loss": 1.2081122398376465, + "step": 3158 + }, + { + "epoch": 0.972681800692574, + "grad_norm": 4.3125, + "learning_rate": 3.186638600460491e-06, + "loss": 1.0296821594238281, + "step": 3160 + }, + { + "epoch": 0.9732974220854175, + "grad_norm": 8.375, + "learning_rate": 3.1846534955380257e-06, + "loss": 1.1985293626785278, + "step": 3162 + }, + { + "epoch": 0.9739130434782609, + "grad_norm": 7.78125, + "learning_rate": 3.1826682071212435e-06, + "loss": 1.211530089378357, + "step": 3164 + }, + { + "epoch": 0.9745286648711042, + "grad_norm": 5.375, + "learning_rate": 3.1806827371829687e-06, + "loss": 1.4879037141799927, + "step": 3166 + }, + { + "epoch": 0.9751442862639477, + "grad_norm": 8.125, + "learning_rate": 3.1786970876962076e-06, + "loss": 1.6099568605422974, + "step": 3168 + }, + { + "epoch": 0.9757599076567911, + "grad_norm": 9.1875, + "learning_rate": 3.1767112606341454e-06, + "loss": 1.311849594116211, + "step": 3170 + }, + { + "epoch": 0.9763755290496344, + "grad_norm": 6.34375, + "learning_rate": 3.1747252579701415e-06, + "loss": 1.1136656999588013, + "step": 3172 + }, + { + "epoch": 0.9769911504424779, + "grad_norm": 15.0625, + "learning_rate": 3.1727390816777326e-06, + "loss": 1.661421775817871, + "step": 3174 + }, + { + "epoch": 0.9776067718353213, + "grad_norm": 17.875, + "learning_rate": 3.1707527337306266e-06, + "loss": 1.4666205644607544, + "step": 3176 + }, + { + "epoch": 0.9782223932281647, + "grad_norm": 9.625, + "learning_rate": 3.168766216102701e-06, + "loss": 1.4527794122695923, + "step": 3178 + }, + { + "epoch": 0.9788380146210081, + "grad_norm": 11.625, + "learning_rate": 3.166779530768004e-06, + "loss": 1.731745719909668, + "step": 3180 + }, + { + "epoch": 0.9794536360138515, + "grad_norm": 14.375, + "learning_rate": 3.164792679700748e-06, + "loss": 1.977335810661316, + "step": 3182 + }, + { + "epoch": 0.9800692574066949, + "grad_norm": 30.125, + "learning_rate": 3.1628056648753127e-06, + "loss": 1.8837709426879883, + "step": 3184 + }, + { + "epoch": 0.9806848787995383, + "grad_norm": 24.75, + "learning_rate": 3.1608184882662386e-06, + "loss": 1.3388452529907227, + "step": 3186 + }, + { + "epoch": 0.9813005001923817, + "grad_norm": 11.5, + "learning_rate": 3.158831151848228e-06, + "loss": 1.4896000623703003, + "step": 3188 + }, + { + "epoch": 0.9819161215852251, + "grad_norm": 22.875, + "learning_rate": 3.1568436575961412e-06, + "loss": 1.3206768035888672, + "step": 3190 + }, + { + "epoch": 0.9825317429780684, + "grad_norm": 7.21875, + "learning_rate": 3.1548560074849965e-06, + "loss": 0.7416938543319702, + "step": 3192 + }, + { + "epoch": 0.9831473643709119, + "grad_norm": 8.8125, + "learning_rate": 3.152868203489965e-06, + "loss": 1.5659494400024414, + "step": 3194 + }, + { + "epoch": 0.9837629857637553, + "grad_norm": 3.765625, + "learning_rate": 3.150880247586374e-06, + "loss": 1.1568924188613892, + "step": 3196 + }, + { + "epoch": 0.9843786071565986, + "grad_norm": 3.875, + "learning_rate": 3.1488921417496985e-06, + "loss": 1.079269528388977, + "step": 3198 + }, + { + "epoch": 0.9849942285494421, + "grad_norm": 5.625, + "learning_rate": 3.1469038879555647e-06, + "loss": 1.287787914276123, + "step": 3200 + }, + { + "epoch": 0.9856098499422855, + "grad_norm": 13.3125, + "learning_rate": 3.144915488179744e-06, + "loss": 1.1777808666229248, + "step": 3202 + }, + { + "epoch": 0.986225471335129, + "grad_norm": 3.78125, + "learning_rate": 3.1429269443981537e-06, + "loss": 0.975529134273529, + "step": 3204 + }, + { + "epoch": 0.9868410927279723, + "grad_norm": 10.4375, + "learning_rate": 3.1409382585868553e-06, + "loss": 1.20067298412323, + "step": 3206 + }, + { + "epoch": 0.9874567141208157, + "grad_norm": 7.125, + "learning_rate": 3.1389494327220506e-06, + "loss": 1.0966123342514038, + "step": 3208 + }, + { + "epoch": 0.9880723355136591, + "grad_norm": 6.4375, + "learning_rate": 3.1369604687800804e-06, + "loss": 1.4545329809188843, + "step": 3210 + }, + { + "epoch": 0.9886879569065025, + "grad_norm": 2.875, + "learning_rate": 3.1349713687374213e-06, + "loss": 1.449361801147461, + "step": 3212 + }, + { + "epoch": 0.9893035782993459, + "grad_norm": 2.59375, + "learning_rate": 3.1329821345706877e-06, + "loss": 1.2503468990325928, + "step": 3214 + }, + { + "epoch": 0.9899191996921893, + "grad_norm": 8.125, + "learning_rate": 3.1309927682566266e-06, + "loss": 1.3865764141082764, + "step": 3216 + }, + { + "epoch": 0.9905348210850327, + "grad_norm": 4.34375, + "learning_rate": 3.1290032717721143e-06, + "loss": 1.2038789987564087, + "step": 3218 + }, + { + "epoch": 0.9911504424778761, + "grad_norm": 25.375, + "learning_rate": 3.1270136470941604e-06, + "loss": 1.167089819908142, + "step": 3220 + }, + { + "epoch": 0.9917660638707195, + "grad_norm": 24.875, + "learning_rate": 3.1250238961998972e-06, + "loss": 1.6324681043624878, + "step": 3222 + }, + { + "epoch": 0.992381685263563, + "grad_norm": 12.0625, + "learning_rate": 3.1230340210665866e-06, + "loss": 1.3044756650924683, + "step": 3224 + }, + { + "epoch": 0.9929973066564063, + "grad_norm": 6.1875, + "learning_rate": 3.121044023671611e-06, + "loss": 1.5603355169296265, + "step": 3226 + }, + { + "epoch": 0.9936129280492497, + "grad_norm": 14.375, + "learning_rate": 3.1190539059924756e-06, + "loss": 1.5595190525054932, + "step": 3228 + }, + { + "epoch": 0.9942285494420932, + "grad_norm": 11.0, + "learning_rate": 3.117063670006806e-06, + "loss": 1.5781952142715454, + "step": 3230 + }, + { + "epoch": 0.9948441708349365, + "grad_norm": 5.46875, + "learning_rate": 3.115073317692342e-06, + "loss": 1.336469292640686, + "step": 3232 + }, + { + "epoch": 0.9954597922277799, + "grad_norm": 3.265625, + "learning_rate": 3.113082851026944e-06, + "loss": 0.8571915626525879, + "step": 3234 + }, + { + "epoch": 0.9960754136206234, + "grad_norm": 4.84375, + "learning_rate": 3.1110922719885817e-06, + "loss": 0.9757181406021118, + "step": 3236 + }, + { + "epoch": 0.9966910350134667, + "grad_norm": 6.34375, + "learning_rate": 3.109101582555338e-06, + "loss": 0.931984543800354, + "step": 3238 + }, + { + "epoch": 0.9973066564063101, + "grad_norm": 4.03125, + "learning_rate": 3.1071107847054074e-06, + "loss": 0.8089004755020142, + "step": 3240 + }, + { + "epoch": 0.9979222777991535, + "grad_norm": 4.6875, + "learning_rate": 3.1051198804170877e-06, + "loss": 1.2749488353729248, + "step": 3242 + }, + { + "epoch": 0.9985378991919969, + "grad_norm": 4.34375, + "learning_rate": 3.103128871668787e-06, + "loss": 1.2046691179275513, + "step": 3244 + }, + { + "epoch": 0.9991535205848403, + "grad_norm": 25.625, + "learning_rate": 3.1011377604390147e-06, + "loss": 1.3801567554473877, + "step": 3246 + }, + { + "epoch": 0.9997691419776837, + "grad_norm": 54.0, + "learning_rate": 3.099146548706383e-06, + "loss": 1.3879741430282593, + "step": 3248 + }, + { + "epoch": 1.0003078106964216, + "grad_norm": 2.9375, + "learning_rate": 3.0971552384496028e-06, + "loss": 1.3940231800079346, + "step": 3250 + }, + { + "epoch": 1.000923432089265, + "grad_norm": 10.5625, + "learning_rate": 3.0951638316474853e-06, + "loss": 1.43914794921875, + "step": 3252 + }, + { + "epoch": 1.0015390534821085, + "grad_norm": 6.75, + "learning_rate": 3.0931723302789352e-06, + "loss": 1.196732997894287, + "step": 3254 + }, + { + "epoch": 1.002154674874952, + "grad_norm": 9.3125, + "learning_rate": 3.0911807363229505e-06, + "loss": 1.4213953018188477, + "step": 3256 + }, + { + "epoch": 1.0027702962677953, + "grad_norm": 12.6875, + "learning_rate": 3.0891890517586254e-06, + "loss": 1.6695702075958252, + "step": 3258 + }, + { + "epoch": 1.0033859176606388, + "grad_norm": 3.8125, + "learning_rate": 3.0871972785651395e-06, + "loss": 1.1721190214157104, + "step": 3260 + }, + { + "epoch": 1.0040015390534822, + "grad_norm": 10.4375, + "learning_rate": 3.085205418721764e-06, + "loss": 0.7390797138214111, + "step": 3262 + }, + { + "epoch": 1.0046171604463254, + "grad_norm": 8.75, + "learning_rate": 3.083213474207854e-06, + "loss": 1.515286922454834, + "step": 3264 + }, + { + "epoch": 1.0052327818391689, + "grad_norm": 3.3125, + "learning_rate": 3.081221447002849e-06, + "loss": 1.2244067192077637, + "step": 3266 + }, + { + "epoch": 1.0058484032320123, + "grad_norm": 24.875, + "learning_rate": 3.0792293390862715e-06, + "loss": 1.3070162534713745, + "step": 3268 + }, + { + "epoch": 1.0064640246248557, + "grad_norm": 7.59375, + "learning_rate": 3.0772371524377242e-06, + "loss": 1.2169034481048584, + "step": 3270 + }, + { + "epoch": 1.0070796460176992, + "grad_norm": 9.0, + "learning_rate": 3.0752448890368865e-06, + "loss": 1.5556094646453857, + "step": 3272 + }, + { + "epoch": 1.0076952674105426, + "grad_norm": 7.9375, + "learning_rate": 3.0732525508635157e-06, + "loss": 1.0534472465515137, + "step": 3274 + }, + { + "epoch": 1.0083108888033858, + "grad_norm": 6.09375, + "learning_rate": 3.071260139897445e-06, + "loss": 1.1789206266403198, + "step": 3276 + }, + { + "epoch": 1.0089265101962293, + "grad_norm": 7.78125, + "learning_rate": 3.069267658118574e-06, + "loss": 1.4941965341567993, + "step": 3278 + }, + { + "epoch": 1.0095421315890727, + "grad_norm": 28.125, + "learning_rate": 3.0672751075068796e-06, + "loss": 1.2050433158874512, + "step": 3280 + }, + { + "epoch": 1.0101577529819161, + "grad_norm": 14.625, + "learning_rate": 3.0652824900424015e-06, + "loss": 1.8228259086608887, + "step": 3282 + }, + { + "epoch": 1.0107733743747596, + "grad_norm": 4.4375, + "learning_rate": 3.063289807705251e-06, + "loss": 1.3229200839996338, + "step": 3284 + }, + { + "epoch": 1.011388995767603, + "grad_norm": 6.21875, + "learning_rate": 3.061297062475599e-06, + "loss": 1.2523341178894043, + "step": 3286 + }, + { + "epoch": 1.0120046171604464, + "grad_norm": 2.234375, + "learning_rate": 3.059304256333682e-06, + "loss": 1.1175116300582886, + "step": 3288 + }, + { + "epoch": 1.0126202385532896, + "grad_norm": 2.5625, + "learning_rate": 3.0573113912597967e-06, + "loss": 1.0994659662246704, + "step": 3290 + }, + { + "epoch": 1.013235859946133, + "grad_norm": 8.25, + "learning_rate": 3.0553184692342967e-06, + "loss": 1.1365960836410522, + "step": 3292 + }, + { + "epoch": 1.0138514813389765, + "grad_norm": 6.15625, + "learning_rate": 3.0533254922375942e-06, + "loss": 1.5383667945861816, + "step": 3294 + }, + { + "epoch": 1.01446710273182, + "grad_norm": 25.375, + "learning_rate": 3.051332462250155e-06, + "loss": 1.1995936632156372, + "step": 3296 + }, + { + "epoch": 1.0150827241246634, + "grad_norm": 5.3125, + "learning_rate": 3.049339381252497e-06, + "loss": 1.3785426616668701, + "step": 3298 + }, + { + "epoch": 1.0156983455175068, + "grad_norm": 3.0625, + "learning_rate": 3.04734625122519e-06, + "loss": 1.074662446975708, + "step": 3300 + }, + { + "epoch": 1.01631396691035, + "grad_norm": 6.25, + "learning_rate": 3.045353074148851e-06, + "loss": 1.5231975317001343, + "step": 3302 + }, + { + "epoch": 1.0169295883031935, + "grad_norm": 3.578125, + "learning_rate": 3.0433598520041462e-06, + "loss": 1.3498982191085815, + "step": 3304 + }, + { + "epoch": 1.017545209696037, + "grad_norm": 8.4375, + "learning_rate": 3.041366586771786e-06, + "loss": 1.6604855060577393, + "step": 3306 + }, + { + "epoch": 1.0181608310888803, + "grad_norm": 9.3125, + "learning_rate": 3.0393732804325193e-06, + "loss": 1.5140495300292969, + "step": 3308 + }, + { + "epoch": 1.0187764524817238, + "grad_norm": 2.5, + "learning_rate": 3.037379934967142e-06, + "loss": 1.2019877433776855, + "step": 3310 + }, + { + "epoch": 1.0193920738745672, + "grad_norm": 3.34375, + "learning_rate": 3.0353865523564854e-06, + "loss": 1.2346773147583008, + "step": 3312 + }, + { + "epoch": 1.0200076952674106, + "grad_norm": 5.5625, + "learning_rate": 3.0333931345814177e-06, + "loss": 1.1609855890274048, + "step": 3314 + }, + { + "epoch": 1.0206233166602539, + "grad_norm": 4.34375, + "learning_rate": 3.031399683622844e-06, + "loss": 1.006929874420166, + "step": 3316 + }, + { + "epoch": 1.0212389380530973, + "grad_norm": 20.5, + "learning_rate": 3.029406201461702e-06, + "loss": 1.6011276245117188, + "step": 3318 + }, + { + "epoch": 1.0218545594459407, + "grad_norm": 9.4375, + "learning_rate": 3.0274126900789575e-06, + "loss": 1.4379968643188477, + "step": 3320 + }, + { + "epoch": 1.0224701808387842, + "grad_norm": 15.25, + "learning_rate": 3.0254191514556084e-06, + "loss": 1.412289023399353, + "step": 3322 + }, + { + "epoch": 1.0230858022316276, + "grad_norm": 5.84375, + "learning_rate": 3.023425587572678e-06, + "loss": 1.3547412157058716, + "step": 3324 + }, + { + "epoch": 1.023701423624471, + "grad_norm": 7.65625, + "learning_rate": 3.0214320004112176e-06, + "loss": 1.49954354763031, + "step": 3326 + }, + { + "epoch": 1.0243170450173142, + "grad_norm": 27.75, + "learning_rate": 3.019438391952297e-06, + "loss": 1.1961394548416138, + "step": 3328 + }, + { + "epoch": 1.0249326664101577, + "grad_norm": 5.1875, + "learning_rate": 3.017444764177011e-06, + "loss": 1.1155706644058228, + "step": 3330 + }, + { + "epoch": 1.0255482878030011, + "grad_norm": 6.28125, + "learning_rate": 3.0154511190664713e-06, + "loss": 1.260684847831726, + "step": 3332 + }, + { + "epoch": 1.0261639091958445, + "grad_norm": 2.5, + "learning_rate": 3.0134574586018085e-06, + "loss": 0.9367880821228027, + "step": 3334 + }, + { + "epoch": 1.026779530588688, + "grad_norm": 12.375, + "learning_rate": 3.011463784764168e-06, + "loss": 1.618381381034851, + "step": 3336 + }, + { + "epoch": 1.0273951519815314, + "grad_norm": 11.25, + "learning_rate": 3.009470099534707e-06, + "loss": 1.3859292268753052, + "step": 3338 + }, + { + "epoch": 1.0280107733743749, + "grad_norm": 3.546875, + "learning_rate": 3.0074764048945963e-06, + "loss": 1.3100966215133667, + "step": 3340 + }, + { + "epoch": 1.028626394767218, + "grad_norm": 14.8125, + "learning_rate": 3.005482702825014e-06, + "loss": 1.727941632270813, + "step": 3342 + }, + { + "epoch": 1.0292420161600615, + "grad_norm": 7.34375, + "learning_rate": 3.0034889953071466e-06, + "loss": 1.530333161354065, + "step": 3344 + }, + { + "epoch": 1.029857637552905, + "grad_norm": 10.25, + "learning_rate": 3.0014952843221874e-06, + "loss": 1.1521104574203491, + "step": 3346 + }, + { + "epoch": 1.0304732589457484, + "grad_norm": 2.78125, + "learning_rate": 2.9995015718513296e-06, + "loss": 1.0660818815231323, + "step": 3348 + }, + { + "epoch": 1.0310888803385918, + "grad_norm": 2.265625, + "learning_rate": 2.9975078598757723e-06, + "loss": 1.217354416847229, + "step": 3350 + }, + { + "epoch": 1.0317045017314352, + "grad_norm": 6.0, + "learning_rate": 2.9955141503767093e-06, + "loss": 1.3975917100906372, + "step": 3352 + }, + { + "epoch": 1.0323201231242787, + "grad_norm": 5.5625, + "learning_rate": 2.9935204453353363e-06, + "loss": 1.3113981485366821, + "step": 3354 + }, + { + "epoch": 1.032935744517122, + "grad_norm": 4.90625, + "learning_rate": 2.9915267467328414e-06, + "loss": 1.3454499244689941, + "step": 3356 + }, + { + "epoch": 1.0335513659099653, + "grad_norm": 13.25, + "learning_rate": 2.9895330565504088e-06, + "loss": 1.5505820512771606, + "step": 3358 + }, + { + "epoch": 1.0341669873028088, + "grad_norm": 8.8125, + "learning_rate": 2.9875393767692117e-06, + "loss": 1.083367109298706, + "step": 3360 + }, + { + "epoch": 1.0347826086956522, + "grad_norm": 27.0, + "learning_rate": 2.9855457093704166e-06, + "loss": 1.8957247734069824, + "step": 3362 + }, + { + "epoch": 1.0353982300884956, + "grad_norm": 9.375, + "learning_rate": 2.9835520563351737e-06, + "loss": 1.5325194597244263, + "step": 3364 + }, + { + "epoch": 1.036013851481339, + "grad_norm": 6.0, + "learning_rate": 2.98155841964462e-06, + "loss": 1.1251282691955566, + "step": 3366 + }, + { + "epoch": 1.0366294728741823, + "grad_norm": 4.78125, + "learning_rate": 2.9795648012798795e-06, + "loss": 1.5618937015533447, + "step": 3368 + }, + { + "epoch": 1.0372450942670257, + "grad_norm": 12.0625, + "learning_rate": 2.9775712032220526e-06, + "loss": 1.2356228828430176, + "step": 3370 + }, + { + "epoch": 1.0378607156598691, + "grad_norm": 6.875, + "learning_rate": 2.975577627452225e-06, + "loss": 1.0679433345794678, + "step": 3372 + }, + { + "epoch": 1.0384763370527126, + "grad_norm": 1.796875, + "learning_rate": 2.973584075951456e-06, + "loss": 1.0582077503204346, + "step": 3374 + }, + { + "epoch": 1.039091958445556, + "grad_norm": 14.8125, + "learning_rate": 2.9715905507007837e-06, + "loss": 1.3411349058151245, + "step": 3376 + }, + { + "epoch": 1.0397075798383995, + "grad_norm": 6.15625, + "learning_rate": 2.969597053681217e-06, + "loss": 1.0146530866622925, + "step": 3378 + }, + { + "epoch": 1.0403232012312429, + "grad_norm": 6.46875, + "learning_rate": 2.9676035868737397e-06, + "loss": 1.305014967918396, + "step": 3380 + }, + { + "epoch": 1.040938822624086, + "grad_norm": 48.75, + "learning_rate": 2.965610152259304e-06, + "loss": 1.4220374822616577, + "step": 3382 + }, + { + "epoch": 1.0415544440169295, + "grad_norm": 1.796875, + "learning_rate": 2.9636167518188308e-06, + "loss": 1.1082732677459717, + "step": 3384 + }, + { + "epoch": 1.042170065409773, + "grad_norm": 2.78125, + "learning_rate": 2.961623387533208e-06, + "loss": 0.9837604761123657, + "step": 3386 + }, + { + "epoch": 1.0427856868026164, + "grad_norm": 6.875, + "learning_rate": 2.959630061383285e-06, + "loss": 1.675440788269043, + "step": 3388 + }, + { + "epoch": 1.0434013081954598, + "grad_norm": 2.6875, + "learning_rate": 2.957636775349874e-06, + "loss": 1.466201663017273, + "step": 3390 + }, + { + "epoch": 1.0440169295883033, + "grad_norm": 6.84375, + "learning_rate": 2.9556435314137495e-06, + "loss": 1.3465406894683838, + "step": 3392 + }, + { + "epoch": 1.0446325509811465, + "grad_norm": 4.34375, + "learning_rate": 2.953650331555642e-06, + "loss": 1.222961664199829, + "step": 3394 + }, + { + "epoch": 1.04524817237399, + "grad_norm": 8.8125, + "learning_rate": 2.9516571777562387e-06, + "loss": 1.6121528148651123, + "step": 3396 + }, + { + "epoch": 1.0458637937668334, + "grad_norm": 6.8125, + "learning_rate": 2.949664071996182e-06, + "loss": 1.0301077365875244, + "step": 3398 + }, + { + "epoch": 1.0464794151596768, + "grad_norm": 6.90625, + "learning_rate": 2.947671016256066e-06, + "loss": 1.1865458488464355, + "step": 3400 + }, + { + "epoch": 1.0470950365525202, + "grad_norm": 10.8125, + "learning_rate": 2.945678012516433e-06, + "loss": 1.6517291069030762, + "step": 3402 + }, + { + "epoch": 1.0477106579453637, + "grad_norm": 3.96875, + "learning_rate": 2.943685062757777e-06, + "loss": 1.078000783920288, + "step": 3404 + }, + { + "epoch": 1.048326279338207, + "grad_norm": 8.75, + "learning_rate": 2.941692168960536e-06, + "loss": 1.5187592506408691, + "step": 3406 + }, + { + "epoch": 1.0489419007310503, + "grad_norm": 20.375, + "learning_rate": 2.9396993331050944e-06, + "loss": 1.0346647500991821, + "step": 3408 + }, + { + "epoch": 1.0495575221238937, + "grad_norm": 6.15625, + "learning_rate": 2.937706557171777e-06, + "loss": 1.3847370147705078, + "step": 3410 + }, + { + "epoch": 1.0501731435167372, + "grad_norm": 3.0625, + "learning_rate": 2.9357138431408493e-06, + "loss": 0.9305753707885742, + "step": 3412 + }, + { + "epoch": 1.0507887649095806, + "grad_norm": 32.5, + "learning_rate": 2.933721192992518e-06, + "loss": 0.8851282000541687, + "step": 3414 + }, + { + "epoch": 1.051404386302424, + "grad_norm": 9.3125, + "learning_rate": 2.9317286087069225e-06, + "loss": 1.888558268547058, + "step": 3416 + }, + { + "epoch": 1.0520200076952675, + "grad_norm": 4.65625, + "learning_rate": 2.9297360922641393e-06, + "loss": 1.3558225631713867, + "step": 3418 + }, + { + "epoch": 1.052635629088111, + "grad_norm": 3.21875, + "learning_rate": 2.9277436456441755e-06, + "loss": 1.1919571161270142, + "step": 3420 + }, + { + "epoch": 1.0532512504809541, + "grad_norm": 6.53125, + "learning_rate": 2.92575127082697e-06, + "loss": 1.3299031257629395, + "step": 3422 + }, + { + "epoch": 1.0538668718737976, + "grad_norm": 1.484375, + "learning_rate": 2.9237589697923917e-06, + "loss": 1.3148071765899658, + "step": 3424 + }, + { + "epoch": 1.054482493266641, + "grad_norm": 6.0, + "learning_rate": 2.921766744520235e-06, + "loss": 1.4010940790176392, + "step": 3426 + }, + { + "epoch": 1.0550981146594844, + "grad_norm": 3.4375, + "learning_rate": 2.919774596990217e-06, + "loss": 1.2768487930297852, + "step": 3428 + }, + { + "epoch": 1.0557137360523279, + "grad_norm": 5.125, + "learning_rate": 2.917782529181981e-06, + "loss": 1.5137887001037598, + "step": 3430 + }, + { + "epoch": 1.0563293574451713, + "grad_norm": 9.3125, + "learning_rate": 2.9157905430750884e-06, + "loss": 0.9090896844863892, + "step": 3432 + }, + { + "epoch": 1.0569449788380145, + "grad_norm": 7.28125, + "learning_rate": 2.9137986406490205e-06, + "loss": 1.3498903512954712, + "step": 3434 + }, + { + "epoch": 1.057560600230858, + "grad_norm": 7.90625, + "learning_rate": 2.9118068238831755e-06, + "loss": 0.933256983757019, + "step": 3436 + }, + { + "epoch": 1.0581762216237014, + "grad_norm": 5.3125, + "learning_rate": 2.909815094756867e-06, + "loss": 1.6633082628250122, + "step": 3438 + }, + { + "epoch": 1.0587918430165448, + "grad_norm": 4.625, + "learning_rate": 2.907823455249321e-06, + "loss": 1.3229087591171265, + "step": 3440 + }, + { + "epoch": 1.0594074644093883, + "grad_norm": 2.515625, + "learning_rate": 2.9058319073396725e-06, + "loss": 1.1567513942718506, + "step": 3442 + }, + { + "epoch": 1.0600230858022317, + "grad_norm": 7.8125, + "learning_rate": 2.9038404530069687e-06, + "loss": 1.1566039323806763, + "step": 3444 + }, + { + "epoch": 1.060638707195075, + "grad_norm": 7.875, + "learning_rate": 2.9018490942301625e-06, + "loss": 1.2508800029754639, + "step": 3446 + }, + { + "epoch": 1.0612543285879183, + "grad_norm": 10.75, + "learning_rate": 2.899857832988112e-06, + "loss": 1.4512087106704712, + "step": 3448 + }, + { + "epoch": 1.0618699499807618, + "grad_norm": 6.03125, + "learning_rate": 2.897866671259577e-06, + "loss": 1.2040528059005737, + "step": 3450 + }, + { + "epoch": 1.0624855713736052, + "grad_norm": 6.125, + "learning_rate": 2.8958756110232212e-06, + "loss": 1.1625609397888184, + "step": 3452 + }, + { + "epoch": 1.0631011927664487, + "grad_norm": 10.875, + "learning_rate": 2.893884654257604e-06, + "loss": 1.39517343044281, + "step": 3454 + }, + { + "epoch": 1.063716814159292, + "grad_norm": 12.8125, + "learning_rate": 2.8918938029411836e-06, + "loss": 1.4427934885025024, + "step": 3456 + }, + { + "epoch": 1.0643324355521355, + "grad_norm": 4.1875, + "learning_rate": 2.889903059052315e-06, + "loss": 1.4380152225494385, + "step": 3458 + }, + { + "epoch": 1.0649480569449787, + "grad_norm": 11.875, + "learning_rate": 2.8879124245692456e-06, + "loss": 1.8976993560791016, + "step": 3460 + }, + { + "epoch": 1.0655636783378222, + "grad_norm": 6.96875, + "learning_rate": 2.8859219014701112e-06, + "loss": 1.3139724731445312, + "step": 3462 + }, + { + "epoch": 1.0661792997306656, + "grad_norm": 7.65625, + "learning_rate": 2.883931491732942e-06, + "loss": 1.3236113786697388, + "step": 3464 + }, + { + "epoch": 1.066794921123509, + "grad_norm": 11.375, + "learning_rate": 2.8819411973356497e-06, + "loss": 1.1875003576278687, + "step": 3466 + }, + { + "epoch": 1.0674105425163525, + "grad_norm": 9.5625, + "learning_rate": 2.8799510202560366e-06, + "loss": 1.552634358406067, + "step": 3468 + }, + { + "epoch": 1.068026163909196, + "grad_norm": 6.34375, + "learning_rate": 2.8779609624717854e-06, + "loss": 1.2890095710754395, + "step": 3470 + }, + { + "epoch": 1.0686417853020393, + "grad_norm": 6.4375, + "learning_rate": 2.8759710259604616e-06, + "loss": 1.207531452178955, + "step": 3472 + }, + { + "epoch": 1.0692574066948826, + "grad_norm": 5.6875, + "learning_rate": 2.8739812126995094e-06, + "loss": 1.124848484992981, + "step": 3474 + }, + { + "epoch": 1.069873028087726, + "grad_norm": 1.1953125, + "learning_rate": 2.871991524666251e-06, + "loss": 1.2248820066452026, + "step": 3476 + }, + { + "epoch": 1.0704886494805694, + "grad_norm": 6.125, + "learning_rate": 2.8700019638378846e-06, + "loss": 1.2643013000488281, + "step": 3478 + }, + { + "epoch": 1.0711042708734129, + "grad_norm": 4.21875, + "learning_rate": 2.86801253219148e-06, + "loss": 1.5682262182235718, + "step": 3480 + }, + { + "epoch": 1.0717198922662563, + "grad_norm": 5.625, + "learning_rate": 2.8660232317039804e-06, + "loss": 1.1708942651748657, + "step": 3482 + }, + { + "epoch": 1.0723355136590997, + "grad_norm": 6.8125, + "learning_rate": 2.8640340643521996e-06, + "loss": 1.252043604850769, + "step": 3484 + }, + { + "epoch": 1.0729511350519432, + "grad_norm": 5.65625, + "learning_rate": 2.862045032112817e-06, + "loss": 1.1808435916900635, + "step": 3486 + }, + { + "epoch": 1.0735667564447864, + "grad_norm": 8.0625, + "learning_rate": 2.860056136962377e-06, + "loss": 1.2189979553222656, + "step": 3488 + }, + { + "epoch": 1.0741823778376298, + "grad_norm": 7.8125, + "learning_rate": 2.858067380877292e-06, + "loss": 1.2846750020980835, + "step": 3490 + }, + { + "epoch": 1.0747979992304733, + "grad_norm": 14.4375, + "learning_rate": 2.856078765833832e-06, + "loss": 1.0122082233428955, + "step": 3492 + }, + { + "epoch": 1.0754136206233167, + "grad_norm": 10.5, + "learning_rate": 2.8540902938081285e-06, + "loss": 1.2930790185928345, + "step": 3494 + }, + { + "epoch": 1.0760292420161601, + "grad_norm": 5.59375, + "learning_rate": 2.8521019667761697e-06, + "loss": 1.3262078762054443, + "step": 3496 + }, + { + "epoch": 1.0766448634090036, + "grad_norm": 5.1875, + "learning_rate": 2.8501137867138017e-06, + "loss": 1.196105718612671, + "step": 3498 + }, + { + "epoch": 1.0772604848018468, + "grad_norm": 8.1875, + "learning_rate": 2.8481257555967244e-06, + "loss": 1.1585220098495483, + "step": 3500 + }, + { + "epoch": 1.0778761061946902, + "grad_norm": 4.65625, + "learning_rate": 2.8461378754004886e-06, + "loss": 1.36484956741333, + "step": 3502 + }, + { + "epoch": 1.0784917275875336, + "grad_norm": 2.59375, + "learning_rate": 2.844150148100495e-06, + "loss": 1.497213363647461, + "step": 3504 + }, + { + "epoch": 1.079107348980377, + "grad_norm": 5.09375, + "learning_rate": 2.8421625756719923e-06, + "loss": 1.8112924098968506, + "step": 3506 + }, + { + "epoch": 1.0797229703732205, + "grad_norm": 9.8125, + "learning_rate": 2.840175160090076e-06, + "loss": 1.5275667905807495, + "step": 3508 + }, + { + "epoch": 1.080338591766064, + "grad_norm": 12.3125, + "learning_rate": 2.8381879033296856e-06, + "loss": 1.0221048593521118, + "step": 3510 + }, + { + "epoch": 1.0809542131589072, + "grad_norm": 6.3125, + "learning_rate": 2.8362008073656033e-06, + "loss": 1.7415841817855835, + "step": 3512 + }, + { + "epoch": 1.0815698345517506, + "grad_norm": 1.703125, + "learning_rate": 2.83421387417245e-06, + "loss": 1.1348897218704224, + "step": 3514 + }, + { + "epoch": 1.082185455944594, + "grad_norm": 2.59375, + "learning_rate": 2.8322271057246864e-06, + "loss": 1.2618352174758911, + "step": 3516 + }, + { + "epoch": 1.0828010773374375, + "grad_norm": 8.3125, + "learning_rate": 2.830240503996609e-06, + "loss": 1.410486102104187, + "step": 3518 + }, + { + "epoch": 1.083416698730281, + "grad_norm": 11.25, + "learning_rate": 2.8282540709623473e-06, + "loss": 1.1378185749053955, + "step": 3520 + }, + { + "epoch": 1.0840323201231243, + "grad_norm": 9.375, + "learning_rate": 2.8262678085958657e-06, + "loss": 1.0309453010559082, + "step": 3522 + }, + { + "epoch": 1.0846479415159678, + "grad_norm": 14.6875, + "learning_rate": 2.8242817188709563e-06, + "loss": 1.4624661207199097, + "step": 3524 + }, + { + "epoch": 1.085263562908811, + "grad_norm": 9.6875, + "learning_rate": 2.8222958037612423e-06, + "loss": 1.2220213413238525, + "step": 3526 + }, + { + "epoch": 1.0858791843016544, + "grad_norm": 5.375, + "learning_rate": 2.8203100652401714e-06, + "loss": 1.385746955871582, + "step": 3528 + }, + { + "epoch": 1.0864948056944979, + "grad_norm": 18.625, + "learning_rate": 2.818324505281017e-06, + "loss": 1.4827823638916016, + "step": 3530 + }, + { + "epoch": 1.0871104270873413, + "grad_norm": 1.984375, + "learning_rate": 2.8163391258568738e-06, + "loss": 1.048915982246399, + "step": 3532 + }, + { + "epoch": 1.0877260484801847, + "grad_norm": 9.6875, + "learning_rate": 2.8143539289406583e-06, + "loss": 1.7094111442565918, + "step": 3534 + }, + { + "epoch": 1.0883416698730282, + "grad_norm": 6.46875, + "learning_rate": 2.8123689165051042e-06, + "loss": 1.5769238471984863, + "step": 3536 + }, + { + "epoch": 1.0889572912658716, + "grad_norm": 5.96875, + "learning_rate": 2.810384090522765e-06, + "loss": 0.83470618724823, + "step": 3538 + }, + { + "epoch": 1.0895729126587148, + "grad_norm": 6.625, + "learning_rate": 2.808399452966004e-06, + "loss": 1.711340308189392, + "step": 3540 + }, + { + "epoch": 1.0901885340515582, + "grad_norm": 2.671875, + "learning_rate": 2.8064150058070026e-06, + "loss": 0.7356083393096924, + "step": 3542 + }, + { + "epoch": 1.0908041554444017, + "grad_norm": 38.5, + "learning_rate": 2.804430751017749e-06, + "loss": 1.528712272644043, + "step": 3544 + }, + { + "epoch": 1.0914197768372451, + "grad_norm": 5.03125, + "learning_rate": 2.802446690570042e-06, + "loss": 1.3249646425247192, + "step": 3546 + }, + { + "epoch": 1.0920353982300885, + "grad_norm": 6.34375, + "learning_rate": 2.8004628264354873e-06, + "loss": 0.8770416378974915, + "step": 3548 + }, + { + "epoch": 1.092651019622932, + "grad_norm": 7.53125, + "learning_rate": 2.7984791605854935e-06, + "loss": 1.2947643995285034, + "step": 3550 + }, + { + "epoch": 1.0932666410157752, + "grad_norm": 8.5, + "learning_rate": 2.796495694991276e-06, + "loss": 1.35930597782135, + "step": 3552 + }, + { + "epoch": 1.0938822624086186, + "grad_norm": 6.78125, + "learning_rate": 2.794512431623847e-06, + "loss": 1.733473300933838, + "step": 3554 + }, + { + "epoch": 1.094497883801462, + "grad_norm": 10.125, + "learning_rate": 2.7925293724540226e-06, + "loss": 1.0471633672714233, + "step": 3556 + }, + { + "epoch": 1.0951135051943055, + "grad_norm": 13.4375, + "learning_rate": 2.7905465194524106e-06, + "loss": 1.462620735168457, + "step": 3558 + }, + { + "epoch": 1.095729126587149, + "grad_norm": 10.0, + "learning_rate": 2.788563874589417e-06, + "loss": 2.0687508583068848, + "step": 3560 + }, + { + "epoch": 1.0963447479799924, + "grad_norm": 6.375, + "learning_rate": 2.7865814398352412e-06, + "loss": 1.5122491121292114, + "step": 3562 + }, + { + "epoch": 1.0969603693728358, + "grad_norm": 5.53125, + "learning_rate": 2.7845992171598724e-06, + "loss": 0.8604429960250854, + "step": 3564 + }, + { + "epoch": 1.097575990765679, + "grad_norm": 19.625, + "learning_rate": 2.7826172085330895e-06, + "loss": 1.1481788158416748, + "step": 3566 + }, + { + "epoch": 1.0981916121585225, + "grad_norm": 6.96875, + "learning_rate": 2.780635415924461e-06, + "loss": 0.8968583941459656, + "step": 3568 + }, + { + "epoch": 1.098807233551366, + "grad_norm": 4.9375, + "learning_rate": 2.7786538413033347e-06, + "loss": 1.1648143529891968, + "step": 3570 + }, + { + "epoch": 1.0994228549442093, + "grad_norm": 8.25, + "learning_rate": 2.7766724866388496e-06, + "loss": 1.365343689918518, + "step": 3572 + }, + { + "epoch": 1.1000384763370528, + "grad_norm": 1.8203125, + "learning_rate": 2.7746913538999197e-06, + "loss": 0.9039756655693054, + "step": 3574 + }, + { + "epoch": 1.1006540977298962, + "grad_norm": 5.3125, + "learning_rate": 2.772710445055242e-06, + "loss": 1.5619524717330933, + "step": 3576 + }, + { + "epoch": 1.1012697191227394, + "grad_norm": 2.390625, + "learning_rate": 2.7707297620732897e-06, + "loss": 1.3508001565933228, + "step": 3578 + }, + { + "epoch": 1.1018853405155828, + "grad_norm": 3.984375, + "learning_rate": 2.7687493069223128e-06, + "loss": 1.1135735511779785, + "step": 3580 + }, + { + "epoch": 1.1025009619084263, + "grad_norm": 4.84375, + "learning_rate": 2.766769081570333e-06, + "loss": 1.3249479532241821, + "step": 3582 + }, + { + "epoch": 1.1031165833012697, + "grad_norm": 6.21875, + "learning_rate": 2.7647890879851447e-06, + "loss": 1.39180326461792, + "step": 3584 + }, + { + "epoch": 1.1037322046941132, + "grad_norm": 3.46875, + "learning_rate": 2.7628093281343127e-06, + "loss": 0.8148363828659058, + "step": 3586 + }, + { + "epoch": 1.1043478260869566, + "grad_norm": 4.3125, + "learning_rate": 2.760829803985167e-06, + "loss": 1.3193633556365967, + "step": 3588 + }, + { + "epoch": 1.1049634474798, + "grad_norm": 5.03125, + "learning_rate": 2.7588505175048074e-06, + "loss": 1.305018424987793, + "step": 3590 + }, + { + "epoch": 1.1055790688726432, + "grad_norm": 4.90625, + "learning_rate": 2.7568714706600935e-06, + "loss": 1.3810641765594482, + "step": 3592 + }, + { + "epoch": 1.1061946902654867, + "grad_norm": 5.71875, + "learning_rate": 2.75489266541765e-06, + "loss": 1.2623786926269531, + "step": 3594 + }, + { + "epoch": 1.10681031165833, + "grad_norm": 16.5, + "learning_rate": 2.7529141037438584e-06, + "loss": 1.2916265726089478, + "step": 3596 + }, + { + "epoch": 1.1074259330511735, + "grad_norm": 3.0, + "learning_rate": 2.7509357876048604e-06, + "loss": 1.1904457807540894, + "step": 3598 + }, + { + "epoch": 1.108041554444017, + "grad_norm": 3.53125, + "learning_rate": 2.7489577189665535e-06, + "loss": 0.8278931975364685, + "step": 3600 + }, + { + "epoch": 1.1086571758368604, + "grad_norm": 5.25, + "learning_rate": 2.7469798997945886e-06, + "loss": 1.2530715465545654, + "step": 3602 + }, + { + "epoch": 1.1092727972297038, + "grad_norm": 4.96875, + "learning_rate": 2.7450023320543685e-06, + "loss": 1.4102144241333008, + "step": 3604 + }, + { + "epoch": 1.109888418622547, + "grad_norm": 7.15625, + "learning_rate": 2.743025017711047e-06, + "loss": 1.2377433776855469, + "step": 3606 + }, + { + "epoch": 1.1105040400153905, + "grad_norm": 7.03125, + "learning_rate": 2.7410479587295272e-06, + "loss": 1.302953839302063, + "step": 3608 + }, + { + "epoch": 1.111119661408234, + "grad_norm": 10.875, + "learning_rate": 2.7390711570744542e-06, + "loss": 1.2824311256408691, + "step": 3610 + }, + { + "epoch": 1.1117352828010774, + "grad_norm": 17.0, + "learning_rate": 2.7370946147102216e-06, + "loss": 1.6581602096557617, + "step": 3612 + }, + { + "epoch": 1.1123509041939208, + "grad_norm": 3.78125, + "learning_rate": 2.7351183336009633e-06, + "loss": 1.1298326253890991, + "step": 3614 + }, + { + "epoch": 1.1129665255867642, + "grad_norm": 5.71875, + "learning_rate": 2.733142315710555e-06, + "loss": 0.7118703126907349, + "step": 3616 + }, + { + "epoch": 1.1135821469796074, + "grad_norm": 5.5625, + "learning_rate": 2.7311665630026086e-06, + "loss": 1.3887100219726562, + "step": 3618 + }, + { + "epoch": 1.1141977683724509, + "grad_norm": 1.78125, + "learning_rate": 2.7291910774404764e-06, + "loss": 1.3764194250106812, + "step": 3620 + }, + { + "epoch": 1.1148133897652943, + "grad_norm": 17.25, + "learning_rate": 2.727215860987239e-06, + "loss": 1.3574413061141968, + "step": 3622 + }, + { + "epoch": 1.1154290111581378, + "grad_norm": 7.75, + "learning_rate": 2.725240915605716e-06, + "loss": 1.3049046993255615, + "step": 3624 + }, + { + "epoch": 1.1160446325509812, + "grad_norm": 5.0625, + "learning_rate": 2.7232662432584546e-06, + "loss": 1.1837899684906006, + "step": 3626 + }, + { + "epoch": 1.1166602539438246, + "grad_norm": 5.75, + "learning_rate": 2.7212918459077293e-06, + "loss": 1.251051425933838, + "step": 3628 + }, + { + "epoch": 1.1172758753366678, + "grad_norm": 5.6875, + "learning_rate": 2.7193177255155447e-06, + "loss": 1.1709699630737305, + "step": 3630 + }, + { + "epoch": 1.1178914967295113, + "grad_norm": 6.1875, + "learning_rate": 2.717343884043628e-06, + "loss": 1.2499865293502808, + "step": 3632 + }, + { + "epoch": 1.1185071181223547, + "grad_norm": 3.640625, + "learning_rate": 2.71537032345343e-06, + "loss": 1.274937629699707, + "step": 3634 + }, + { + "epoch": 1.1191227395151981, + "grad_norm": 10.5, + "learning_rate": 2.713397045706122e-06, + "loss": 1.678328514099121, + "step": 3636 + }, + { + "epoch": 1.1197383609080416, + "grad_norm": 12.8125, + "learning_rate": 2.7114240527625935e-06, + "loss": 0.8977378010749817, + "step": 3638 + }, + { + "epoch": 1.120353982300885, + "grad_norm": 13.5, + "learning_rate": 2.7094513465834528e-06, + "loss": 1.4107129573822021, + "step": 3640 + }, + { + "epoch": 1.1209696036937284, + "grad_norm": 6.03125, + "learning_rate": 2.7074789291290214e-06, + "loss": 1.2703056335449219, + "step": 3642 + }, + { + "epoch": 1.1215852250865717, + "grad_norm": 6.0, + "learning_rate": 2.7055068023593356e-06, + "loss": 1.2788171768188477, + "step": 3644 + }, + { + "epoch": 1.122200846479415, + "grad_norm": 2.484375, + "learning_rate": 2.703534968234142e-06, + "loss": 1.0924688577651978, + "step": 3646 + }, + { + "epoch": 1.1228164678722585, + "grad_norm": 4.09375, + "learning_rate": 2.7015634287128955e-06, + "loss": 1.3006528615951538, + "step": 3648 + }, + { + "epoch": 1.123432089265102, + "grad_norm": 11.5625, + "learning_rate": 2.6995921857547604e-06, + "loss": 1.451360821723938, + "step": 3650 + }, + { + "epoch": 1.1240477106579454, + "grad_norm": 7.71875, + "learning_rate": 2.697621241318603e-06, + "loss": 0.8371405005455017, + "step": 3652 + }, + { + "epoch": 1.1246633320507888, + "grad_norm": 3.28125, + "learning_rate": 2.6956505973629965e-06, + "loss": 1.2515637874603271, + "step": 3654 + }, + { + "epoch": 1.1252789534436323, + "grad_norm": 7.53125, + "learning_rate": 2.6936802558462136e-06, + "loss": 1.2885754108428955, + "step": 3656 + }, + { + "epoch": 1.1258945748364755, + "grad_norm": 6.375, + "learning_rate": 2.6917102187262266e-06, + "loss": 1.4919085502624512, + "step": 3658 + }, + { + "epoch": 1.126510196229319, + "grad_norm": 4.03125, + "learning_rate": 2.689740487960707e-06, + "loss": 1.0740493535995483, + "step": 3660 + }, + { + "epoch": 1.1271258176221624, + "grad_norm": 11.375, + "learning_rate": 2.687771065507019e-06, + "loss": 1.2783704996109009, + "step": 3662 + }, + { + "epoch": 1.1277414390150058, + "grad_norm": 7.34375, + "learning_rate": 2.6858019533222215e-06, + "loss": 1.3667179346084595, + "step": 3664 + }, + { + "epoch": 1.1283570604078492, + "grad_norm": 11.1875, + "learning_rate": 2.6838331533630658e-06, + "loss": 1.2072407007217407, + "step": 3666 + }, + { + "epoch": 1.1289726818006927, + "grad_norm": 10.25, + "learning_rate": 2.6818646675859926e-06, + "loss": 0.8582851886749268, + "step": 3668 + }, + { + "epoch": 1.129588303193536, + "grad_norm": 14.8125, + "learning_rate": 2.679896497947131e-06, + "loss": 1.3421087265014648, + "step": 3670 + }, + { + "epoch": 1.1302039245863793, + "grad_norm": 8.1875, + "learning_rate": 2.677928646402296e-06, + "loss": 1.018212080001831, + "step": 3672 + }, + { + "epoch": 1.1308195459792227, + "grad_norm": 4.53125, + "learning_rate": 2.6759611149069826e-06, + "loss": 1.347084641456604, + "step": 3674 + }, + { + "epoch": 1.1314351673720662, + "grad_norm": 8.3125, + "learning_rate": 2.6739939054163734e-06, + "loss": 1.146074891090393, + "step": 3676 + }, + { + "epoch": 1.1320507887649096, + "grad_norm": 6.65625, + "learning_rate": 2.672027019885328e-06, + "loss": 1.2172248363494873, + "step": 3678 + }, + { + "epoch": 1.132666410157753, + "grad_norm": 6.0, + "learning_rate": 2.6700604602683856e-06, + "loss": 1.3072665929794312, + "step": 3680 + }, + { + "epoch": 1.1332820315505965, + "grad_norm": 9.375, + "learning_rate": 2.6680942285197586e-06, + "loss": 1.6006169319152832, + "step": 3682 + }, + { + "epoch": 1.1338976529434397, + "grad_norm": 18.125, + "learning_rate": 2.666128326593337e-06, + "loss": 1.7242209911346436, + "step": 3684 + }, + { + "epoch": 1.1345132743362831, + "grad_norm": 8.375, + "learning_rate": 2.664162756442682e-06, + "loss": 1.8018231391906738, + "step": 3686 + }, + { + "epoch": 1.1351288957291266, + "grad_norm": 5.46875, + "learning_rate": 2.6621975200210238e-06, + "loss": 0.7340916395187378, + "step": 3688 + }, + { + "epoch": 1.13574451712197, + "grad_norm": 7.0, + "learning_rate": 2.660232619281261e-06, + "loss": 1.3796991109848022, + "step": 3690 + }, + { + "epoch": 1.1363601385148134, + "grad_norm": 19.0, + "learning_rate": 2.6582680561759615e-06, + "loss": 1.5105079412460327, + "step": 3692 + }, + { + "epoch": 1.1369757599076569, + "grad_norm": 15.9375, + "learning_rate": 2.656303832657354e-06, + "loss": 1.0867021083831787, + "step": 3694 + }, + { + "epoch": 1.1375913813005, + "grad_norm": 14.875, + "learning_rate": 2.6543399506773333e-06, + "loss": 1.6368738412857056, + "step": 3696 + }, + { + "epoch": 1.1382070026933435, + "grad_norm": 8.5625, + "learning_rate": 2.652376412187452e-06, + "loss": 1.5982671976089478, + "step": 3698 + }, + { + "epoch": 1.138822624086187, + "grad_norm": 12.3125, + "learning_rate": 2.650413219138921e-06, + "loss": 1.488477110862732, + "step": 3700 + }, + { + "epoch": 1.1394382454790304, + "grad_norm": 4.375, + "learning_rate": 2.648450373482612e-06, + "loss": 1.094407320022583, + "step": 3702 + }, + { + "epoch": 1.1400538668718738, + "grad_norm": 18.625, + "learning_rate": 2.646487877169045e-06, + "loss": 1.3227121829986572, + "step": 3704 + }, + { + "epoch": 1.1406694882647173, + "grad_norm": 8.4375, + "learning_rate": 2.6445257321483998e-06, + "loss": 1.2866860628128052, + "step": 3706 + }, + { + "epoch": 1.1412851096575607, + "grad_norm": 9.1875, + "learning_rate": 2.6425639403705028e-06, + "loss": 1.722301721572876, + "step": 3708 + }, + { + "epoch": 1.141900731050404, + "grad_norm": 7.09375, + "learning_rate": 2.6406025037848316e-06, + "loss": 1.615889310836792, + "step": 3710 + }, + { + "epoch": 1.1425163524432473, + "grad_norm": 10.5, + "learning_rate": 2.6386414243405068e-06, + "loss": 1.5408185720443726, + "step": 3712 + }, + { + "epoch": 1.1431319738360908, + "grad_norm": 18.25, + "learning_rate": 2.6366807039863e-06, + "loss": 1.4188480377197266, + "step": 3714 + }, + { + "epoch": 1.1437475952289342, + "grad_norm": 6.15625, + "learning_rate": 2.6347203446706214e-06, + "loss": 1.2408636808395386, + "step": 3716 + }, + { + "epoch": 1.1443632166217776, + "grad_norm": 13.1875, + "learning_rate": 2.632760348341524e-06, + "loss": 1.8462144136428833, + "step": 3718 + }, + { + "epoch": 1.144978838014621, + "grad_norm": 5.40625, + "learning_rate": 2.6308007169467003e-06, + "loss": 1.3214315176010132, + "step": 3720 + }, + { + "epoch": 1.1455944594074645, + "grad_norm": 4.71875, + "learning_rate": 2.6288414524334803e-06, + "loss": 1.3338682651519775, + "step": 3722 + }, + { + "epoch": 1.1462100808003077, + "grad_norm": 28.5, + "learning_rate": 2.6268825567488297e-06, + "loss": 1.6675822734832764, + "step": 3724 + }, + { + "epoch": 1.1468257021931512, + "grad_norm": 6.75, + "learning_rate": 2.6249240318393454e-06, + "loss": 1.1487302780151367, + "step": 3726 + }, + { + "epoch": 1.1474413235859946, + "grad_norm": 3.625, + "learning_rate": 2.622965879651258e-06, + "loss": 1.1260499954223633, + "step": 3728 + }, + { + "epoch": 1.148056944978838, + "grad_norm": 3.5625, + "learning_rate": 2.6210081021304278e-06, + "loss": 1.261810302734375, + "step": 3730 + }, + { + "epoch": 1.1486725663716815, + "grad_norm": 9.5, + "learning_rate": 2.619050701222342e-06, + "loss": 1.1598726511001587, + "step": 3732 + }, + { + "epoch": 1.149288187764525, + "grad_norm": 5.3125, + "learning_rate": 2.617093678872114e-06, + "loss": 1.3789466619491577, + "step": 3734 + }, + { + "epoch": 1.1499038091573683, + "grad_norm": 5.625, + "learning_rate": 2.6151370370244807e-06, + "loss": 1.467581033706665, + "step": 3736 + }, + { + "epoch": 1.1505194305502116, + "grad_norm": 3.125, + "learning_rate": 2.6131807776238007e-06, + "loss": 1.3655937910079956, + "step": 3738 + }, + { + "epoch": 1.151135051943055, + "grad_norm": 4.65625, + "learning_rate": 2.6112249026140515e-06, + "loss": 1.4911147356033325, + "step": 3740 + }, + { + "epoch": 1.1517506733358984, + "grad_norm": 4.46875, + "learning_rate": 2.609269413938832e-06, + "loss": 0.998041033744812, + "step": 3742 + }, + { + "epoch": 1.1523662947287419, + "grad_norm": 6.875, + "learning_rate": 2.6073143135413546e-06, + "loss": 0.9422230124473572, + "step": 3744 + }, + { + "epoch": 1.1529819161215853, + "grad_norm": 7.75, + "learning_rate": 2.6053596033644463e-06, + "loss": 1.4808106422424316, + "step": 3746 + }, + { + "epoch": 1.1535975375144285, + "grad_norm": 3.625, + "learning_rate": 2.603405285350546e-06, + "loss": 1.4457093477249146, + "step": 3748 + }, + { + "epoch": 1.154213158907272, + "grad_norm": 4.96875, + "learning_rate": 2.601451361441705e-06, + "loss": 1.3998022079467773, + "step": 3750 + }, + { + "epoch": 1.1548287803001154, + "grad_norm": 42.25, + "learning_rate": 2.5994978335795784e-06, + "loss": 1.4046134948730469, + "step": 3752 + }, + { + "epoch": 1.1554444016929588, + "grad_norm": 6.78125, + "learning_rate": 2.5975447037054325e-06, + "loss": 1.610612392425537, + "step": 3754 + }, + { + "epoch": 1.1560600230858022, + "grad_norm": 3.671875, + "learning_rate": 2.595591973760135e-06, + "loss": 1.1340110301971436, + "step": 3756 + }, + { + "epoch": 1.1566756444786457, + "grad_norm": 16.375, + "learning_rate": 2.5936396456841597e-06, + "loss": 1.4932774305343628, + "step": 3758 + }, + { + "epoch": 1.1572912658714891, + "grad_norm": 10.3125, + "learning_rate": 2.5916877214175774e-06, + "loss": 1.3877044916152954, + "step": 3760 + }, + { + "epoch": 1.1579068872643323, + "grad_norm": 5.59375, + "learning_rate": 2.5897362029000583e-06, + "loss": 1.7278505563735962, + "step": 3762 + }, + { + "epoch": 1.1585225086571758, + "grad_norm": 6.28125, + "learning_rate": 2.5877850920708714e-06, + "loss": 1.4104070663452148, + "step": 3764 + }, + { + "epoch": 1.1591381300500192, + "grad_norm": 50.5, + "learning_rate": 2.585834390868878e-06, + "loss": 1.2621724605560303, + "step": 3766 + }, + { + "epoch": 1.1597537514428626, + "grad_norm": 7.5, + "learning_rate": 2.5838841012325344e-06, + "loss": 1.3192962408065796, + "step": 3768 + }, + { + "epoch": 1.160369372835706, + "grad_norm": 7.09375, + "learning_rate": 2.581934225099887e-06, + "loss": 1.565580129623413, + "step": 3770 + }, + { + "epoch": 1.1609849942285495, + "grad_norm": 5.96875, + "learning_rate": 2.579984764408572e-06, + "loss": 1.4773286581039429, + "step": 3772 + }, + { + "epoch": 1.161600615621393, + "grad_norm": 18.625, + "learning_rate": 2.578035721095811e-06, + "loss": 1.5876959562301636, + "step": 3774 + }, + { + "epoch": 1.1622162370142362, + "grad_norm": 9.875, + "learning_rate": 2.5760870970984132e-06, + "loss": 1.556991696357727, + "step": 3776 + }, + { + "epoch": 1.1628318584070796, + "grad_norm": 4.78125, + "learning_rate": 2.5741388943527684e-06, + "loss": 1.2104475498199463, + "step": 3778 + }, + { + "epoch": 1.163447479799923, + "grad_norm": 12.5, + "learning_rate": 2.572191114794851e-06, + "loss": 1.2030988931655884, + "step": 3780 + }, + { + "epoch": 1.1640631011927665, + "grad_norm": 8.4375, + "learning_rate": 2.5702437603602125e-06, + "loss": 1.2617757320404053, + "step": 3782 + }, + { + "epoch": 1.16467872258561, + "grad_norm": 36.0, + "learning_rate": 2.568296832983982e-06, + "loss": 1.4930431842803955, + "step": 3784 + }, + { + "epoch": 1.1652943439784533, + "grad_norm": 8.625, + "learning_rate": 2.5663503346008663e-06, + "loss": 1.2744383811950684, + "step": 3786 + }, + { + "epoch": 1.1659099653712968, + "grad_norm": 3.1875, + "learning_rate": 2.564404267145144e-06, + "loss": 1.3536694049835205, + "step": 3788 + }, + { + "epoch": 1.16652558676414, + "grad_norm": 11.75, + "learning_rate": 2.562458632550665e-06, + "loss": 1.6043957471847534, + "step": 3790 + }, + { + "epoch": 1.1671412081569834, + "grad_norm": 6.15625, + "learning_rate": 2.5605134327508506e-06, + "loss": 1.3562557697296143, + "step": 3792 + }, + { + "epoch": 1.1677568295498268, + "grad_norm": 11.125, + "learning_rate": 2.5585686696786903e-06, + "loss": 0.7872848510742188, + "step": 3794 + }, + { + "epoch": 1.1683724509426703, + "grad_norm": 7.21875, + "learning_rate": 2.5566243452667374e-06, + "loss": 1.2110083103179932, + "step": 3796 + }, + { + "epoch": 1.1689880723355137, + "grad_norm": 5.15625, + "learning_rate": 2.554680461447111e-06, + "loss": 1.2281920909881592, + "step": 3798 + }, + { + "epoch": 1.1696036937283572, + "grad_norm": 13.375, + "learning_rate": 2.5527370201514924e-06, + "loss": 1.279605507850647, + "step": 3800 + }, + { + "epoch": 1.1702193151212004, + "grad_norm": 8.1875, + "learning_rate": 2.550794023311124e-06, + "loss": 1.4011292457580566, + "step": 3802 + }, + { + "epoch": 1.1708349365140438, + "grad_norm": 9.4375, + "learning_rate": 2.5488514728568026e-06, + "loss": 1.5559638738632202, + "step": 3804 + }, + { + "epoch": 1.1714505579068872, + "grad_norm": 11.625, + "learning_rate": 2.5469093707188854e-06, + "loss": 1.8137891292572021, + "step": 3806 + }, + { + "epoch": 1.1720661792997307, + "grad_norm": 7.96875, + "learning_rate": 2.5449677188272825e-06, + "loss": 1.4091744422912598, + "step": 3808 + }, + { + "epoch": 1.172681800692574, + "grad_norm": 12.5625, + "learning_rate": 2.5430265191114587e-06, + "loss": 0.9213383197784424, + "step": 3810 + }, + { + "epoch": 1.1732974220854175, + "grad_norm": 6.625, + "learning_rate": 2.541085773500426e-06, + "loss": 1.2011784315109253, + "step": 3812 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 6.71875, + "learning_rate": 2.539145483922747e-06, + "loss": 0.9825503826141357, + "step": 3814 + }, + { + "epoch": 1.1745286648711042, + "grad_norm": 27.375, + "learning_rate": 2.5372056523065304e-06, + "loss": 1.07127845287323, + "step": 3816 + }, + { + "epoch": 1.1751442862639476, + "grad_norm": 13.9375, + "learning_rate": 2.5352662805794313e-06, + "loss": 1.530601143836975, + "step": 3818 + }, + { + "epoch": 1.175759907656791, + "grad_norm": 5.125, + "learning_rate": 2.533327370668647e-06, + "loss": 1.3024001121520996, + "step": 3820 + }, + { + "epoch": 1.1763755290496345, + "grad_norm": 16.5, + "learning_rate": 2.531388924500915e-06, + "loss": 1.3389467000961304, + "step": 3822 + }, + { + "epoch": 1.176991150442478, + "grad_norm": 10.5, + "learning_rate": 2.5294509440025127e-06, + "loss": 1.2247848510742188, + "step": 3824 + }, + { + "epoch": 1.1776067718353214, + "grad_norm": 6.25, + "learning_rate": 2.5275134310992554e-06, + "loss": 1.303011417388916, + "step": 3826 + }, + { + "epoch": 1.1782223932281646, + "grad_norm": 5.5, + "learning_rate": 2.5255763877164933e-06, + "loss": 0.8400078415870667, + "step": 3828 + }, + { + "epoch": 1.178838014621008, + "grad_norm": 11.9375, + "learning_rate": 2.5236398157791085e-06, + "loss": 1.3081486225128174, + "step": 3830 + }, + { + "epoch": 1.1794536360138514, + "grad_norm": 10.3125, + "learning_rate": 2.521703717211518e-06, + "loss": 1.2455041408538818, + "step": 3832 + }, + { + "epoch": 1.1800692574066949, + "grad_norm": 16.0, + "learning_rate": 2.519768093937664e-06, + "loss": 1.7230615615844727, + "step": 3834 + }, + { + "epoch": 1.1806848787995383, + "grad_norm": 7.59375, + "learning_rate": 2.5178329478810198e-06, + "loss": 1.3537473678588867, + "step": 3836 + }, + { + "epoch": 1.1813005001923818, + "grad_norm": 4.28125, + "learning_rate": 2.5158982809645838e-06, + "loss": 1.4008030891418457, + "step": 3838 + }, + { + "epoch": 1.1819161215852252, + "grad_norm": 7.75, + "learning_rate": 2.5139640951108777e-06, + "loss": 1.5628221035003662, + "step": 3840 + }, + { + "epoch": 1.1825317429780684, + "grad_norm": 6.6875, + "learning_rate": 2.512030392241945e-06, + "loss": 1.3440524339675903, + "step": 3842 + }, + { + "epoch": 1.1831473643709118, + "grad_norm": 5.78125, + "learning_rate": 2.5100971742793502e-06, + "loss": 1.3246959447860718, + "step": 3844 + }, + { + "epoch": 1.1837629857637553, + "grad_norm": 4.09375, + "learning_rate": 2.508164443144174e-06, + "loss": 1.1068414449691772, + "step": 3846 + }, + { + "epoch": 1.1843786071565987, + "grad_norm": 3.90625, + "learning_rate": 2.506232200757016e-06, + "loss": 0.8754851818084717, + "step": 3848 + }, + { + "epoch": 1.1849942285494421, + "grad_norm": 4.9375, + "learning_rate": 2.5043004490379887e-06, + "loss": 0.9590293169021606, + "step": 3850 + }, + { + "epoch": 1.1856098499422856, + "grad_norm": 14.5, + "learning_rate": 2.502369189906716e-06, + "loss": 1.3972516059875488, + "step": 3852 + }, + { + "epoch": 1.186225471335129, + "grad_norm": 8.75, + "learning_rate": 2.5004384252823353e-06, + "loss": 1.3978736400604248, + "step": 3854 + }, + { + "epoch": 1.1868410927279722, + "grad_norm": 7.5, + "learning_rate": 2.498508157083489e-06, + "loss": 1.71484375, + "step": 3856 + }, + { + "epoch": 1.1874567141208157, + "grad_norm": 6.875, + "learning_rate": 2.4965783872283275e-06, + "loss": 1.3648632764816284, + "step": 3858 + }, + { + "epoch": 1.188072335513659, + "grad_norm": 10.625, + "learning_rate": 2.4946491176345077e-06, + "loss": 1.0106247663497925, + "step": 3860 + }, + { + "epoch": 1.1886879569065025, + "grad_norm": 5.0, + "learning_rate": 2.4927203502191873e-06, + "loss": 0.8573707938194275, + "step": 3862 + }, + { + "epoch": 1.189303578299346, + "grad_norm": 72.5, + "learning_rate": 2.4907920868990266e-06, + "loss": 1.3709218502044678, + "step": 3864 + }, + { + "epoch": 1.1899191996921894, + "grad_norm": 13.5, + "learning_rate": 2.4888643295901834e-06, + "loss": 1.093931794166565, + "step": 3866 + }, + { + "epoch": 1.1905348210850326, + "grad_norm": 4.75, + "learning_rate": 2.4869370802083135e-06, + "loss": 0.9337255358695984, + "step": 3868 + }, + { + "epoch": 1.191150442477876, + "grad_norm": 6.21875, + "learning_rate": 2.485010340668567e-06, + "loss": 1.3781424760818481, + "step": 3870 + }, + { + "epoch": 1.1917660638707195, + "grad_norm": 6.5, + "learning_rate": 2.4830841128855894e-06, + "loss": 1.520790934562683, + "step": 3872 + }, + { + "epoch": 1.192381685263563, + "grad_norm": 6.25, + "learning_rate": 2.4811583987735157e-06, + "loss": 1.0379527807235718, + "step": 3874 + }, + { + "epoch": 1.1929973066564064, + "grad_norm": 9.875, + "learning_rate": 2.4792332002459717e-06, + "loss": 1.5793462991714478, + "step": 3876 + }, + { + "epoch": 1.1936129280492498, + "grad_norm": 14.6875, + "learning_rate": 2.4773085192160697e-06, + "loss": 1.8597513437271118, + "step": 3878 + }, + { + "epoch": 1.194228549442093, + "grad_norm": 7.15625, + "learning_rate": 2.4753843575964094e-06, + "loss": 1.4552783966064453, + "step": 3880 + }, + { + "epoch": 1.1948441708349364, + "grad_norm": 4.9375, + "learning_rate": 2.473460717299072e-06, + "loss": 1.150854468345642, + "step": 3882 + }, + { + "epoch": 1.1954597922277799, + "grad_norm": 7.5625, + "learning_rate": 2.4715376002356225e-06, + "loss": 1.5145831108093262, + "step": 3884 + }, + { + "epoch": 1.1960754136206233, + "grad_norm": 17.0, + "learning_rate": 2.4696150083171057e-06, + "loss": 1.0157501697540283, + "step": 3886 + }, + { + "epoch": 1.1966910350134667, + "grad_norm": 10.4375, + "learning_rate": 2.4676929434540444e-06, + "loss": 1.3922085762023926, + "step": 3888 + }, + { + "epoch": 1.1973066564063102, + "grad_norm": 13.5625, + "learning_rate": 2.4657714075564374e-06, + "loss": 1.631951093673706, + "step": 3890 + }, + { + "epoch": 1.1979222777991536, + "grad_norm": 5.84375, + "learning_rate": 2.463850402533758e-06, + "loss": 1.3009285926818848, + "step": 3892 + }, + { + "epoch": 1.1985378991919968, + "grad_norm": 14.125, + "learning_rate": 2.4619299302949517e-06, + "loss": 1.3850191831588745, + "step": 3894 + }, + { + "epoch": 1.1991535205848403, + "grad_norm": 7.0625, + "learning_rate": 2.4600099927484345e-06, + "loss": 0.9943208694458008, + "step": 3896 + }, + { + "epoch": 1.1997691419776837, + "grad_norm": 12.0, + "learning_rate": 2.458090591802092e-06, + "loss": 1.3799011707305908, + "step": 3898 + }, + { + "epoch": 1.2003847633705271, + "grad_norm": 4.40625, + "learning_rate": 2.456171729363276e-06, + "loss": 1.3535175323486328, + "step": 3900 + }, + { + "epoch": 1.2010003847633706, + "grad_norm": 6.21875, + "learning_rate": 2.4542534073388026e-06, + "loss": 1.3181215524673462, + "step": 3902 + }, + { + "epoch": 1.201616006156214, + "grad_norm": 8.5, + "learning_rate": 2.4523356276349515e-06, + "loss": 1.240586280822754, + "step": 3904 + }, + { + "epoch": 1.2022316275490574, + "grad_norm": 8.0625, + "learning_rate": 2.4504183921574648e-06, + "loss": 1.71377694606781, + "step": 3906 + }, + { + "epoch": 1.2028472489419006, + "grad_norm": 32.25, + "learning_rate": 2.44850170281154e-06, + "loss": 1.4551841020584106, + "step": 3908 + }, + { + "epoch": 1.203462870334744, + "grad_norm": 10.6875, + "learning_rate": 2.446585561501836e-06, + "loss": 1.301320195198059, + "step": 3910 + }, + { + "epoch": 1.2040784917275875, + "grad_norm": 4.875, + "learning_rate": 2.4446699701324643e-06, + "loss": 1.2411322593688965, + "step": 3912 + }, + { + "epoch": 1.204694113120431, + "grad_norm": 5.65625, + "learning_rate": 2.4427549306069918e-06, + "loss": 0.7906562089920044, + "step": 3914 + }, + { + "epoch": 1.2053097345132744, + "grad_norm": 7.71875, + "learning_rate": 2.4408404448284352e-06, + "loss": 1.1022000312805176, + "step": 3916 + }, + { + "epoch": 1.2059253559061178, + "grad_norm": 6.53125, + "learning_rate": 2.4389265146992637e-06, + "loss": 1.2730300426483154, + "step": 3918 + }, + { + "epoch": 1.2065409772989613, + "grad_norm": 7.53125, + "learning_rate": 2.437013142121391e-06, + "loss": 1.5247621536254883, + "step": 3920 + }, + { + "epoch": 1.2071565986918045, + "grad_norm": 9.125, + "learning_rate": 2.435100328996179e-06, + "loss": 1.2567329406738281, + "step": 3922 + }, + { + "epoch": 1.207772220084648, + "grad_norm": 4.0, + "learning_rate": 2.433188077224432e-06, + "loss": 1.0988311767578125, + "step": 3924 + }, + { + "epoch": 1.2083878414774913, + "grad_norm": 5.15625, + "learning_rate": 2.431276388706398e-06, + "loss": 1.4548678398132324, + "step": 3926 + }, + { + "epoch": 1.2090034628703348, + "grad_norm": 10.625, + "learning_rate": 2.429365265341766e-06, + "loss": 1.5173746347427368, + "step": 3928 + }, + { + "epoch": 1.2096190842631782, + "grad_norm": 5.8125, + "learning_rate": 2.4274547090296614e-06, + "loss": 1.312397837638855, + "step": 3930 + }, + { + "epoch": 1.2102347056560214, + "grad_norm": 4.125, + "learning_rate": 2.4255447216686455e-06, + "loss": 1.3173680305480957, + "step": 3932 + }, + { + "epoch": 1.2108503270488649, + "grad_norm": 3.6875, + "learning_rate": 2.4236353051567172e-06, + "loss": 1.3009153604507446, + "step": 3934 + }, + { + "epoch": 1.2114659484417083, + "grad_norm": 8.125, + "learning_rate": 2.4217264613913053e-06, + "loss": 1.756670594215393, + "step": 3936 + }, + { + "epoch": 1.2120815698345517, + "grad_norm": 9.8125, + "learning_rate": 2.4198181922692714e-06, + "loss": 1.6502559185028076, + "step": 3938 + }, + { + "epoch": 1.2126971912273952, + "grad_norm": 7.21875, + "learning_rate": 2.417910499686905e-06, + "loss": 1.3588635921478271, + "step": 3940 + }, + { + "epoch": 1.2133128126202386, + "grad_norm": 12.3125, + "learning_rate": 2.4160033855399235e-06, + "loss": 1.100928783416748, + "step": 3942 + }, + { + "epoch": 1.213928434013082, + "grad_norm": 4.78125, + "learning_rate": 2.4140968517234682e-06, + "loss": 1.347513198852539, + "step": 3944 + }, + { + "epoch": 1.2145440554059252, + "grad_norm": 6.6875, + "learning_rate": 2.4121909001321054e-06, + "loss": 1.5707118511199951, + "step": 3946 + }, + { + "epoch": 1.2151596767987687, + "grad_norm": 3.171875, + "learning_rate": 2.4102855326598205e-06, + "loss": 1.0723353624343872, + "step": 3948 + }, + { + "epoch": 1.2157752981916121, + "grad_norm": 5.78125, + "learning_rate": 2.408380751200021e-06, + "loss": 1.1773736476898193, + "step": 3950 + }, + { + "epoch": 1.2163909195844556, + "grad_norm": 6.0625, + "learning_rate": 2.4064765576455307e-06, + "loss": 1.230358600616455, + "step": 3952 + }, + { + "epoch": 1.217006540977299, + "grad_norm": 7.90625, + "learning_rate": 2.40457295388859e-06, + "loss": 1.334424614906311, + "step": 3954 + }, + { + "epoch": 1.2176221623701424, + "grad_norm": 8.25, + "learning_rate": 2.402669941820852e-06, + "loss": 1.5765841007232666, + "step": 3956 + }, + { + "epoch": 1.2182377837629859, + "grad_norm": 10.375, + "learning_rate": 2.4007675233333816e-06, + "loss": 1.255386471748352, + "step": 3958 + }, + { + "epoch": 1.218853405155829, + "grad_norm": 8.4375, + "learning_rate": 2.398865700316656e-06, + "loss": 1.4202911853790283, + "step": 3960 + }, + { + "epoch": 1.2194690265486725, + "grad_norm": 8.1875, + "learning_rate": 2.3969644746605584e-06, + "loss": 1.4272559881210327, + "step": 3962 + }, + { + "epoch": 1.220084647941516, + "grad_norm": 2.046875, + "learning_rate": 2.39506384825438e-06, + "loss": 1.2267391681671143, + "step": 3964 + }, + { + "epoch": 1.2207002693343594, + "grad_norm": 8.375, + "learning_rate": 2.3931638229868163e-06, + "loss": 1.056898593902588, + "step": 3966 + }, + { + "epoch": 1.2213158907272028, + "grad_norm": 8.5625, + "learning_rate": 2.391264400745964e-06, + "loss": 1.5538334846496582, + "step": 3968 + }, + { + "epoch": 1.2219315121200462, + "grad_norm": 8.625, + "learning_rate": 2.389365583419323e-06, + "loss": 1.3304157257080078, + "step": 3970 + }, + { + "epoch": 1.2225471335128897, + "grad_norm": 6.71875, + "learning_rate": 2.3874673728937886e-06, + "loss": 1.3482346534729004, + "step": 3972 + }, + { + "epoch": 1.223162754905733, + "grad_norm": 7.65625, + "learning_rate": 2.3855697710556562e-06, + "loss": 0.9510328769683838, + "step": 3974 + }, + { + "epoch": 1.2237783762985763, + "grad_norm": 7.6875, + "learning_rate": 2.3836727797906157e-06, + "loss": 1.4577893018722534, + "step": 3976 + }, + { + "epoch": 1.2243939976914198, + "grad_norm": 5.6875, + "learning_rate": 2.381776400983749e-06, + "loss": 1.333723545074463, + "step": 3978 + }, + { + "epoch": 1.2250096190842632, + "grad_norm": 10.625, + "learning_rate": 2.3798806365195305e-06, + "loss": 1.211458444595337, + "step": 3980 + }, + { + "epoch": 1.2256252404771066, + "grad_norm": 2.34375, + "learning_rate": 2.377985488281825e-06, + "loss": 1.0996417999267578, + "step": 3982 + }, + { + "epoch": 1.22624086186995, + "grad_norm": 3.9375, + "learning_rate": 2.3760909581538818e-06, + "loss": 1.1678493022918701, + "step": 3984 + }, + { + "epoch": 1.2268564832627935, + "grad_norm": 14.9375, + "learning_rate": 2.374197048018339e-06, + "loss": 1.5703445672988892, + "step": 3986 + }, + { + "epoch": 1.2274721046556367, + "grad_norm": 6.25, + "learning_rate": 2.372303759757218e-06, + "loss": 1.2629894018173218, + "step": 3988 + }, + { + "epoch": 1.2280877260484802, + "grad_norm": 16.75, + "learning_rate": 2.3704110952519206e-06, + "loss": 2.015996217727661, + "step": 3990 + }, + { + "epoch": 1.2287033474413236, + "grad_norm": 17.0, + "learning_rate": 2.3685190563832307e-06, + "loss": 0.9846193790435791, + "step": 3992 + }, + { + "epoch": 1.229318968834167, + "grad_norm": 3.828125, + "learning_rate": 2.36662764503131e-06, + "loss": 1.3436334133148193, + "step": 3994 + }, + { + "epoch": 1.2299345902270105, + "grad_norm": 6.625, + "learning_rate": 2.3647368630756964e-06, + "loss": 1.0975793600082397, + "step": 3996 + }, + { + "epoch": 1.2305502116198537, + "grad_norm": 5.875, + "learning_rate": 2.3628467123953015e-06, + "loss": 1.2426040172576904, + "step": 3998 + }, + { + "epoch": 1.231165833012697, + "grad_norm": 4.90625, + "learning_rate": 2.3609571948684107e-06, + "loss": 1.2495640516281128, + "step": 4000 + }, + { + "epoch": 1.2317814544055405, + "grad_norm": 9.125, + "learning_rate": 2.35906831237268e-06, + "loss": 1.39968740940094, + "step": 4002 + }, + { + "epoch": 1.232397075798384, + "grad_norm": 13.125, + "learning_rate": 2.3571800667851343e-06, + "loss": 1.3630619049072266, + "step": 4004 + }, + { + "epoch": 1.2330126971912274, + "grad_norm": 5.09375, + "learning_rate": 2.355292459982165e-06, + "loss": 1.2856032848358154, + "step": 4006 + }, + { + "epoch": 1.2336283185840708, + "grad_norm": 6.8125, + "learning_rate": 2.3534054938395313e-06, + "loss": 1.4433751106262207, + "step": 4008 + }, + { + "epoch": 1.2342439399769143, + "grad_norm": 5.5, + "learning_rate": 2.351519170232352e-06, + "loss": 1.1141834259033203, + "step": 4010 + }, + { + "epoch": 1.2348595613697575, + "grad_norm": 7.71875, + "learning_rate": 2.3496334910351086e-06, + "loss": 1.4758944511413574, + "step": 4012 + }, + { + "epoch": 1.235475182762601, + "grad_norm": 12.25, + "learning_rate": 2.3477484581216435e-06, + "loss": 1.576067328453064, + "step": 4014 + }, + { + "epoch": 1.2360908041554444, + "grad_norm": 9.125, + "learning_rate": 2.345864073365157e-06, + "loss": 1.7607650756835938, + "step": 4016 + }, + { + "epoch": 1.2367064255482878, + "grad_norm": 4.9375, + "learning_rate": 2.3439803386382033e-06, + "loss": 1.2492669820785522, + "step": 4018 + }, + { + "epoch": 1.2373220469411312, + "grad_norm": 3.890625, + "learning_rate": 2.3420972558126933e-06, + "loss": 1.0438222885131836, + "step": 4020 + }, + { + "epoch": 1.2379376683339747, + "grad_norm": 11.6875, + "learning_rate": 2.3402148267598875e-06, + "loss": 1.4356199502944946, + "step": 4022 + }, + { + "epoch": 1.238553289726818, + "grad_norm": 2.890625, + "learning_rate": 2.3383330533503973e-06, + "loss": 1.433849573135376, + "step": 4024 + }, + { + "epoch": 1.2391689111196613, + "grad_norm": 11.0625, + "learning_rate": 2.3364519374541838e-06, + "loss": 1.0784897804260254, + "step": 4026 + }, + { + "epoch": 1.2397845325125048, + "grad_norm": 5.78125, + "learning_rate": 2.334571480940554e-06, + "loss": 1.433074712753296, + "step": 4028 + }, + { + "epoch": 1.2404001539053482, + "grad_norm": 10.125, + "learning_rate": 2.3326916856781603e-06, + "loss": 0.9229840636253357, + "step": 4030 + }, + { + "epoch": 1.2410157752981916, + "grad_norm": 9.125, + "learning_rate": 2.330812553534996e-06, + "loss": 1.4485481977462769, + "step": 4032 + }, + { + "epoch": 1.241631396691035, + "grad_norm": 5.0625, + "learning_rate": 2.3289340863783993e-06, + "loss": 1.4929358959197998, + "step": 4034 + }, + { + "epoch": 1.2422470180838785, + "grad_norm": 7.0, + "learning_rate": 2.327056286075042e-06, + "loss": 1.5561566352844238, + "step": 4036 + }, + { + "epoch": 1.242862639476722, + "grad_norm": 11.875, + "learning_rate": 2.325179154490938e-06, + "loss": 1.411828637123108, + "step": 4038 + }, + { + "epoch": 1.2434782608695651, + "grad_norm": 4.84375, + "learning_rate": 2.3233026934914347e-06, + "loss": 1.2632527351379395, + "step": 4040 + }, + { + "epoch": 1.2440938822624086, + "grad_norm": 10.8125, + "learning_rate": 2.3214269049412142e-06, + "loss": 1.2041977643966675, + "step": 4042 + }, + { + "epoch": 1.244709503655252, + "grad_norm": 6.9375, + "learning_rate": 2.3195517907042884e-06, + "loss": 1.3839242458343506, + "step": 4044 + }, + { + "epoch": 1.2453251250480954, + "grad_norm": 5.375, + "learning_rate": 2.317677352644001e-06, + "loss": 1.4752936363220215, + "step": 4046 + }, + { + "epoch": 1.2459407464409389, + "grad_norm": 7.34375, + "learning_rate": 2.315803592623024e-06, + "loss": 1.6179171800613403, + "step": 4048 + }, + { + "epoch": 1.2465563678337823, + "grad_norm": 4.46875, + "learning_rate": 2.3139305125033533e-06, + "loss": 1.3558285236358643, + "step": 4050 + }, + { + "epoch": 1.2471719892266255, + "grad_norm": 5.28125, + "learning_rate": 2.3120581141463107e-06, + "loss": 1.619982123374939, + "step": 4052 + }, + { + "epoch": 1.247787610619469, + "grad_norm": 10.0, + "learning_rate": 2.3101863994125417e-06, + "loss": 1.641546368598938, + "step": 4054 + }, + { + "epoch": 1.2484032320123124, + "grad_norm": 5.25, + "learning_rate": 2.30831537016201e-06, + "loss": 1.7766023874282837, + "step": 4056 + }, + { + "epoch": 1.2490188534051558, + "grad_norm": 18.625, + "learning_rate": 2.3064450282539993e-06, + "loss": 1.1997973918914795, + "step": 4058 + }, + { + "epoch": 1.2496344747979993, + "grad_norm": 12.0625, + "learning_rate": 2.3045753755471114e-06, + "loss": 1.1165738105773926, + "step": 4060 + }, + { + "epoch": 1.2502500961908427, + "grad_norm": 5.46875, + "learning_rate": 2.3027064138992604e-06, + "loss": 1.2666813135147095, + "step": 4062 + }, + { + "epoch": 1.250865717583686, + "grad_norm": 26.25, + "learning_rate": 2.3008381451676764e-06, + "loss": 1.0904207229614258, + "step": 4064 + }, + { + "epoch": 1.2514813389765294, + "grad_norm": 13.125, + "learning_rate": 2.2989705712089004e-06, + "loss": 1.314873456954956, + "step": 4066 + }, + { + "epoch": 1.2520969603693728, + "grad_norm": 5.4375, + "learning_rate": 2.2971036938787816e-06, + "loss": 0.7683751583099365, + "step": 4068 + }, + { + "epoch": 1.2527125817622162, + "grad_norm": 4.40625, + "learning_rate": 2.2952375150324785e-06, + "loss": 1.3286861181259155, + "step": 4070 + }, + { + "epoch": 1.2533282031550597, + "grad_norm": 6.0, + "learning_rate": 2.293372036524454e-06, + "loss": 1.5416184663772583, + "step": 4072 + }, + { + "epoch": 1.253943824547903, + "grad_norm": 10.0625, + "learning_rate": 2.2915072602084778e-06, + "loss": 1.1479551792144775, + "step": 4074 + }, + { + "epoch": 1.2545594459407465, + "grad_norm": 17.0, + "learning_rate": 2.2896431879376177e-06, + "loss": 0.9122674465179443, + "step": 4076 + }, + { + "epoch": 1.2551750673335897, + "grad_norm": 4.28125, + "learning_rate": 2.2877798215642445e-06, + "loss": 0.7844657897949219, + "step": 4078 + }, + { + "epoch": 1.2557906887264332, + "grad_norm": 4.78125, + "learning_rate": 2.285917162940028e-06, + "loss": 1.523891568183899, + "step": 4080 + }, + { + "epoch": 1.2564063101192766, + "grad_norm": 7.0, + "learning_rate": 2.2840552139159335e-06, + "loss": 1.2284789085388184, + "step": 4082 + }, + { + "epoch": 1.25702193151212, + "grad_norm": 8.0625, + "learning_rate": 2.2821939763422217e-06, + "loss": 1.3383231163024902, + "step": 4084 + }, + { + "epoch": 1.2576375529049635, + "grad_norm": 4.6875, + "learning_rate": 2.2803334520684456e-06, + "loss": 0.7896304726600647, + "step": 4086 + }, + { + "epoch": 1.258253174297807, + "grad_norm": 10.875, + "learning_rate": 2.2784736429434505e-06, + "loss": 0.9124029874801636, + "step": 4088 + }, + { + "epoch": 1.2588687956906504, + "grad_norm": 25.125, + "learning_rate": 2.276614550815369e-06, + "loss": 1.7797210216522217, + "step": 4090 + }, + { + "epoch": 1.2594844170834936, + "grad_norm": 6.84375, + "learning_rate": 2.274756177531624e-06, + "loss": 1.1753852367401123, + "step": 4092 + }, + { + "epoch": 1.260100038476337, + "grad_norm": 7.09375, + "learning_rate": 2.2728985249389225e-06, + "loss": 1.2321572303771973, + "step": 4094 + }, + { + "epoch": 1.2607156598691804, + "grad_norm": 6.5625, + "learning_rate": 2.2710415948832557e-06, + "loss": 1.5728049278259277, + "step": 4096 + }, + { + "epoch": 1.2613312812620239, + "grad_norm": 6.53125, + "learning_rate": 2.2691853892098957e-06, + "loss": 1.596179485321045, + "step": 4098 + }, + { + "epoch": 1.2619469026548673, + "grad_norm": 6.71875, + "learning_rate": 2.267329909763397e-06, + "loss": 1.1732280254364014, + "step": 4100 + }, + { + "epoch": 1.2625625240477105, + "grad_norm": 4.0625, + "learning_rate": 2.26547515838759e-06, + "loss": 1.041813850402832, + "step": 4102 + }, + { + "epoch": 1.2631781454405542, + "grad_norm": 19.875, + "learning_rate": 2.263621136925583e-06, + "loss": 1.3980480432510376, + "step": 4104 + }, + { + "epoch": 1.2637937668333974, + "grad_norm": 7.5, + "learning_rate": 2.261767847219758e-06, + "loss": 1.753159761428833, + "step": 4106 + }, + { + "epoch": 1.2644093882262408, + "grad_norm": 8.625, + "learning_rate": 2.2599152911117726e-06, + "loss": 1.6130338907241821, + "step": 4108 + }, + { + "epoch": 1.2650250096190843, + "grad_norm": 3.34375, + "learning_rate": 2.2580634704425513e-06, + "loss": 1.5305721759796143, + "step": 4110 + }, + { + "epoch": 1.2656406310119277, + "grad_norm": 6.25, + "learning_rate": 2.2562123870522914e-06, + "loss": 1.4050418138504028, + "step": 4112 + }, + { + "epoch": 1.2662562524047711, + "grad_norm": 9.8125, + "learning_rate": 2.254362042780454e-06, + "loss": 1.247245192527771, + "step": 4114 + }, + { + "epoch": 1.2668718737976143, + "grad_norm": 5.875, + "learning_rate": 2.2525124394657694e-06, + "loss": 0.8592467308044434, + "step": 4116 + }, + { + "epoch": 1.267487495190458, + "grad_norm": 7.09375, + "learning_rate": 2.2506635789462287e-06, + "loss": 1.0305540561676025, + "step": 4118 + }, + { + "epoch": 1.2681031165833012, + "grad_norm": 3.09375, + "learning_rate": 2.2488154630590876e-06, + "loss": 1.1216397285461426, + "step": 4120 + }, + { + "epoch": 1.2687187379761447, + "grad_norm": 4.59375, + "learning_rate": 2.2469680936408584e-06, + "loss": 1.134533405303955, + "step": 4122 + }, + { + "epoch": 1.269334359368988, + "grad_norm": 7.1875, + "learning_rate": 2.2451214725273143e-06, + "loss": 0.668088436126709, + "step": 4124 + }, + { + "epoch": 1.2699499807618315, + "grad_norm": 7.8125, + "learning_rate": 2.2432756015534853e-06, + "loss": 1.3163663148880005, + "step": 4126 + }, + { + "epoch": 1.270565602154675, + "grad_norm": 10.0, + "learning_rate": 2.2414304825536526e-06, + "loss": 1.5193448066711426, + "step": 4128 + }, + { + "epoch": 1.2711812235475182, + "grad_norm": 4.53125, + "learning_rate": 2.239586117361354e-06, + "loss": 1.1491132974624634, + "step": 4130 + }, + { + "epoch": 1.2717968449403616, + "grad_norm": 7.71875, + "learning_rate": 2.237742507809375e-06, + "loss": 0.9073981046676636, + "step": 4132 + }, + { + "epoch": 1.272412466333205, + "grad_norm": 5.90625, + "learning_rate": 2.2358996557297534e-06, + "loss": 1.2663335800170898, + "step": 4134 + }, + { + "epoch": 1.2730280877260485, + "grad_norm": 5.8125, + "learning_rate": 2.2340575629537713e-06, + "loss": 1.2100262641906738, + "step": 4136 + }, + { + "epoch": 1.273643709118892, + "grad_norm": 8.5625, + "learning_rate": 2.232216231311959e-06, + "loss": 1.367043375968933, + "step": 4138 + }, + { + "epoch": 1.2742593305117353, + "grad_norm": 6.4375, + "learning_rate": 2.2303756626340875e-06, + "loss": 1.49073326587677, + "step": 4140 + }, + { + "epoch": 1.2748749519045788, + "grad_norm": 5.1875, + "learning_rate": 2.228535858749172e-06, + "loss": 1.3855880498886108, + "step": 4142 + }, + { + "epoch": 1.275490573297422, + "grad_norm": 11.25, + "learning_rate": 2.2266968214854664e-06, + "loss": 1.4641410112380981, + "step": 4144 + }, + { + "epoch": 1.2761061946902654, + "grad_norm": 5.21875, + "learning_rate": 2.2248585526704635e-06, + "loss": 1.4227370023727417, + "step": 4146 + }, + { + "epoch": 1.2767218160831089, + "grad_norm": 6.0, + "learning_rate": 2.223021054130892e-06, + "loss": 1.0786595344543457, + "step": 4148 + }, + { + "epoch": 1.2773374374759523, + "grad_norm": 13.9375, + "learning_rate": 2.221184327692717e-06, + "loss": 1.286320447921753, + "step": 4150 + }, + { + "epoch": 1.2779530588687957, + "grad_norm": 2.046875, + "learning_rate": 2.2193483751811324e-06, + "loss": 1.0497817993164062, + "step": 4152 + }, + { + "epoch": 1.2785686802616392, + "grad_norm": 5.53125, + "learning_rate": 2.2175131984205664e-06, + "loss": 1.2185951471328735, + "step": 4154 + }, + { + "epoch": 1.2791843016544826, + "grad_norm": 7.34375, + "learning_rate": 2.2156787992346752e-06, + "loss": 1.302861213684082, + "step": 4156 + }, + { + "epoch": 1.2797999230473258, + "grad_norm": 7.875, + "learning_rate": 2.2138451794463423e-06, + "loss": 1.4149022102355957, + "step": 4158 + }, + { + "epoch": 1.2804155444401693, + "grad_norm": 5.6875, + "learning_rate": 2.2120123408776765e-06, + "loss": 0.7368856072425842, + "step": 4160 + }, + { + "epoch": 1.2810311658330127, + "grad_norm": 2.53125, + "learning_rate": 2.2101802853500118e-06, + "loss": 1.0423181056976318, + "step": 4162 + }, + { + "epoch": 1.2816467872258561, + "grad_norm": 11.5, + "learning_rate": 2.2083490146839023e-06, + "loss": 0.8842816352844238, + "step": 4164 + }, + { + "epoch": 1.2822624086186996, + "grad_norm": 7.0, + "learning_rate": 2.206518530699122e-06, + "loss": 1.50794517993927, + "step": 4166 + }, + { + "epoch": 1.2828780300115428, + "grad_norm": 7.625, + "learning_rate": 2.2046888352146632e-06, + "loss": 1.2277774810791016, + "step": 4168 + }, + { + "epoch": 1.2834936514043864, + "grad_norm": 6.34375, + "learning_rate": 2.2028599300487372e-06, + "loss": 1.3741124868392944, + "step": 4170 + }, + { + "epoch": 1.2841092727972296, + "grad_norm": 5.96875, + "learning_rate": 2.2010318170187676e-06, + "loss": 0.7409383654594421, + "step": 4172 + }, + { + "epoch": 1.284724894190073, + "grad_norm": 6.46875, + "learning_rate": 2.199204497941391e-06, + "loss": 1.3880950212478638, + "step": 4174 + }, + { + "epoch": 1.2853405155829165, + "grad_norm": 13.8125, + "learning_rate": 2.1973779746324556e-06, + "loss": 1.5435235500335693, + "step": 4176 + }, + { + "epoch": 1.28595613697576, + "grad_norm": 7.34375, + "learning_rate": 2.195552248907018e-06, + "loss": 1.1817162036895752, + "step": 4178 + }, + { + "epoch": 1.2865717583686034, + "grad_norm": 5.75, + "learning_rate": 2.1937273225793422e-06, + "loss": 1.2965961694717407, + "step": 4180 + }, + { + "epoch": 1.2871873797614466, + "grad_norm": 6.78125, + "learning_rate": 2.1919031974628995e-06, + "loss": 1.1692346334457397, + "step": 4182 + }, + { + "epoch": 1.2878030011542902, + "grad_norm": 7.9375, + "learning_rate": 2.190079875370364e-06, + "loss": 1.1079033613204956, + "step": 4184 + }, + { + "epoch": 1.2884186225471335, + "grad_norm": 3.515625, + "learning_rate": 2.1882573581136112e-06, + "loss": 1.3430814743041992, + "step": 4186 + }, + { + "epoch": 1.289034243939977, + "grad_norm": 5.75, + "learning_rate": 2.1864356475037175e-06, + "loss": 1.2273108959197998, + "step": 4188 + }, + { + "epoch": 1.2896498653328203, + "grad_norm": 3.65625, + "learning_rate": 2.184614745350958e-06, + "loss": 1.1358712911605835, + "step": 4190 + }, + { + "epoch": 1.2902654867256638, + "grad_norm": 4.75, + "learning_rate": 2.1827946534648035e-06, + "loss": 1.0054996013641357, + "step": 4192 + }, + { + "epoch": 1.2908811081185072, + "grad_norm": 2.4375, + "learning_rate": 2.1809753736539195e-06, + "loss": 0.9101351499557495, + "step": 4194 + }, + { + "epoch": 1.2914967295113504, + "grad_norm": 9.4375, + "learning_rate": 2.1791569077261663e-06, + "loss": 1.3053233623504639, + "step": 4196 + }, + { + "epoch": 1.2921123509041939, + "grad_norm": 9.125, + "learning_rate": 2.177339257488594e-06, + "loss": 1.711419939994812, + "step": 4198 + }, + { + "epoch": 1.2927279722970373, + "grad_norm": 4.625, + "learning_rate": 2.1755224247474416e-06, + "loss": 1.2489417791366577, + "step": 4200 + }, + { + "epoch": 1.2933435936898807, + "grad_norm": 8.9375, + "learning_rate": 2.1737064113081376e-06, + "loss": 1.293163776397705, + "step": 4202 + }, + { + "epoch": 1.2939592150827242, + "grad_norm": 4.34375, + "learning_rate": 2.1718912189752945e-06, + "loss": 1.2553480863571167, + "step": 4204 + }, + { + "epoch": 1.2945748364755676, + "grad_norm": 5.40625, + "learning_rate": 2.1700768495527097e-06, + "loss": 1.1565001010894775, + "step": 4206 + }, + { + "epoch": 1.295190457868411, + "grad_norm": 4.65625, + "learning_rate": 2.168263304843363e-06, + "loss": 1.3635306358337402, + "step": 4208 + }, + { + "epoch": 1.2958060792612542, + "grad_norm": 11.625, + "learning_rate": 2.1664505866494143e-06, + "loss": 1.366041898727417, + "step": 4210 + }, + { + "epoch": 1.2964217006540977, + "grad_norm": 7.625, + "learning_rate": 2.1646386967722024e-06, + "loss": 1.3417686223983765, + "step": 4212 + }, + { + "epoch": 1.297037322046941, + "grad_norm": 8.9375, + "learning_rate": 2.1628276370122435e-06, + "loss": 1.6105479001998901, + "step": 4214 + }, + { + "epoch": 1.2976529434397845, + "grad_norm": 9.0, + "learning_rate": 2.161017409169227e-06, + "loss": 1.046107530593872, + "step": 4216 + }, + { + "epoch": 1.298268564832628, + "grad_norm": 7.65625, + "learning_rate": 2.159208015042018e-06, + "loss": 1.0018730163574219, + "step": 4218 + }, + { + "epoch": 1.2988841862254714, + "grad_norm": 11.125, + "learning_rate": 2.1573994564286504e-06, + "loss": 0.7739481925964355, + "step": 4220 + }, + { + "epoch": 1.2994998076183149, + "grad_norm": 6.1875, + "learning_rate": 2.1555917351263315e-06, + "loss": 1.2665600776672363, + "step": 4222 + }, + { + "epoch": 1.300115429011158, + "grad_norm": 14.4375, + "learning_rate": 2.153784852931433e-06, + "loss": 1.6424885988235474, + "step": 4224 + }, + { + "epoch": 1.3007310504040015, + "grad_norm": 8.3125, + "learning_rate": 2.1519788116394944e-06, + "loss": 0.8370161056518555, + "step": 4226 + }, + { + "epoch": 1.301346671796845, + "grad_norm": 7.5, + "learning_rate": 2.1501736130452215e-06, + "loss": 1.579601526260376, + "step": 4228 + }, + { + "epoch": 1.3019622931896884, + "grad_norm": 20.125, + "learning_rate": 2.148369258942477e-06, + "loss": 1.3582897186279297, + "step": 4230 + }, + { + "epoch": 1.3025779145825318, + "grad_norm": 7.96875, + "learning_rate": 2.146565751124291e-06, + "loss": 1.4349735975265503, + "step": 4232 + }, + { + "epoch": 1.303193535975375, + "grad_norm": 7.21875, + "learning_rate": 2.1447630913828486e-06, + "loss": 1.290523648262024, + "step": 4234 + }, + { + "epoch": 1.3038091573682187, + "grad_norm": 5.375, + "learning_rate": 2.1429612815094917e-06, + "loss": 1.2350491285324097, + "step": 4236 + }, + { + "epoch": 1.3044247787610619, + "grad_norm": 7.5625, + "learning_rate": 2.141160323294722e-06, + "loss": 1.2998807430267334, + "step": 4238 + }, + { + "epoch": 1.3050404001539053, + "grad_norm": 23.5, + "learning_rate": 2.1393602185281895e-06, + "loss": 1.1882212162017822, + "step": 4240 + }, + { + "epoch": 1.3056560215467488, + "grad_norm": 11.1875, + "learning_rate": 2.1375609689987018e-06, + "loss": 1.3439292907714844, + "step": 4242 + }, + { + "epoch": 1.3062716429395922, + "grad_norm": 5.4375, + "learning_rate": 2.1357625764942096e-06, + "loss": 1.469037652015686, + "step": 4244 + }, + { + "epoch": 1.3068872643324356, + "grad_norm": 5.5, + "learning_rate": 2.133965042801819e-06, + "loss": 1.5019780397415161, + "step": 4246 + }, + { + "epoch": 1.3075028857252788, + "grad_norm": 13.5, + "learning_rate": 2.1321683697077774e-06, + "loss": 0.697911262512207, + "step": 4248 + }, + { + "epoch": 1.3081185071181223, + "grad_norm": 11.875, + "learning_rate": 2.1303725589974797e-06, + "loss": 1.7564258575439453, + "step": 4250 + }, + { + "epoch": 1.3087341285109657, + "grad_norm": 5.625, + "learning_rate": 2.1285776124554644e-06, + "loss": 1.2818973064422607, + "step": 4252 + }, + { + "epoch": 1.3093497499038091, + "grad_norm": 13.3125, + "learning_rate": 2.126783531865409e-06, + "loss": 1.2938495874404907, + "step": 4254 + }, + { + "epoch": 1.3099653712966526, + "grad_norm": 4.21875, + "learning_rate": 2.124990319010132e-06, + "loss": 1.5566442012786865, + "step": 4256 + }, + { + "epoch": 1.310580992689496, + "grad_norm": 19.125, + "learning_rate": 2.123197975671589e-06, + "loss": 1.201784610748291, + "step": 4258 + }, + { + "epoch": 1.3111966140823395, + "grad_norm": 4.25, + "learning_rate": 2.121406503630871e-06, + "loss": 1.2029253244400024, + "step": 4260 + }, + { + "epoch": 1.3118122354751827, + "grad_norm": 7.0, + "learning_rate": 2.1196159046682058e-06, + "loss": 1.1904261112213135, + "step": 4262 + }, + { + "epoch": 1.312427856868026, + "grad_norm": 33.25, + "learning_rate": 2.1178261805629495e-06, + "loss": 1.1313016414642334, + "step": 4264 + }, + { + "epoch": 1.3130434782608695, + "grad_norm": 7.5, + "learning_rate": 2.1160373330935937e-06, + "loss": 1.395995020866394, + "step": 4266 + }, + { + "epoch": 1.313659099653713, + "grad_norm": 10.375, + "learning_rate": 2.114249364037754e-06, + "loss": 1.5595605373382568, + "step": 4268 + }, + { + "epoch": 1.3142747210465564, + "grad_norm": 3.890625, + "learning_rate": 2.112462275172176e-06, + "loss": 1.3686449527740479, + "step": 4270 + }, + { + "epoch": 1.3148903424393998, + "grad_norm": 17.0, + "learning_rate": 2.110676068272731e-06, + "loss": 0.719721794128418, + "step": 4272 + }, + { + "epoch": 1.3155059638322433, + "grad_norm": 5.28125, + "learning_rate": 2.1088907451144105e-06, + "loss": 1.0529125928878784, + "step": 4274 + }, + { + "epoch": 1.3161215852250865, + "grad_norm": 4.625, + "learning_rate": 2.107106307471332e-06, + "loss": 0.9316014051437378, + "step": 4276 + }, + { + "epoch": 1.31673720661793, + "grad_norm": 5.03125, + "learning_rate": 2.1053227571167316e-06, + "loss": 0.9757555723190308, + "step": 4278 + }, + { + "epoch": 1.3173528280107734, + "grad_norm": 30.875, + "learning_rate": 2.1035400958229617e-06, + "loss": 1.6467084884643555, + "step": 4280 + }, + { + "epoch": 1.3179684494036168, + "grad_norm": 32.75, + "learning_rate": 2.1017583253614936e-06, + "loss": 1.2693678140640259, + "step": 4282 + }, + { + "epoch": 1.3185840707964602, + "grad_norm": 20.25, + "learning_rate": 2.099977447502912e-06, + "loss": 1.5373982191085815, + "step": 4284 + }, + { + "epoch": 1.3191996921893034, + "grad_norm": 6.0, + "learning_rate": 2.0981974640169155e-06, + "loss": 1.1194545030593872, + "step": 4286 + }, + { + "epoch": 1.319815313582147, + "grad_norm": 12.4375, + "learning_rate": 2.0964183766723143e-06, + "loss": 1.5202691555023193, + "step": 4288 + }, + { + "epoch": 1.3204309349749903, + "grad_norm": 13.4375, + "learning_rate": 2.094640187237026e-06, + "loss": 1.4231841564178467, + "step": 4290 + }, + { + "epoch": 1.3210465563678337, + "grad_norm": 10.5, + "learning_rate": 2.0928628974780784e-06, + "loss": 1.7791295051574707, + "step": 4292 + }, + { + "epoch": 1.3216621777606772, + "grad_norm": 2.875, + "learning_rate": 2.0910865091616044e-06, + "loss": 0.8141826391220093, + "step": 4294 + }, + { + "epoch": 1.3222777991535206, + "grad_norm": 8.5625, + "learning_rate": 2.08931102405284e-06, + "loss": 1.2941914796829224, + "step": 4296 + }, + { + "epoch": 1.322893420546364, + "grad_norm": 11.8125, + "learning_rate": 2.087536443916124e-06, + "loss": 1.2962119579315186, + "step": 4298 + }, + { + "epoch": 1.3235090419392073, + "grad_norm": 7.75, + "learning_rate": 2.0857627705148985e-06, + "loss": 0.9499344825744629, + "step": 4300 + }, + { + "epoch": 1.324124663332051, + "grad_norm": 5.90625, + "learning_rate": 2.083990005611701e-06, + "loss": 1.2333036661148071, + "step": 4302 + }, + { + "epoch": 1.3247402847248941, + "grad_norm": 7.25, + "learning_rate": 2.082218150968167e-06, + "loss": 0.8718281984329224, + "step": 4304 + }, + { + "epoch": 1.3253559061177376, + "grad_norm": 13.625, + "learning_rate": 2.080447208345031e-06, + "loss": 1.2827845811843872, + "step": 4306 + }, + { + "epoch": 1.325971527510581, + "grad_norm": 14.25, + "learning_rate": 2.078677179502115e-06, + "loss": 1.189026951789856, + "step": 4308 + }, + { + "epoch": 1.3265871489034244, + "grad_norm": 16.25, + "learning_rate": 2.076908066198339e-06, + "loss": 1.5428333282470703, + "step": 4310 + }, + { + "epoch": 1.3272027702962679, + "grad_norm": 11.3125, + "learning_rate": 2.0751398701917092e-06, + "loss": 1.5201754570007324, + "step": 4312 + }, + { + "epoch": 1.327818391689111, + "grad_norm": 9.1875, + "learning_rate": 2.073372593239321e-06, + "loss": 1.4332178831100464, + "step": 4314 + }, + { + "epoch": 1.3284340130819545, + "grad_norm": 7.3125, + "learning_rate": 2.0716062370973587e-06, + "loss": 0.520851731300354, + "step": 4316 + }, + { + "epoch": 1.329049634474798, + "grad_norm": 4.1875, + "learning_rate": 2.069840803521089e-06, + "loss": 1.1611192226409912, + "step": 4318 + }, + { + "epoch": 1.3296652558676414, + "grad_norm": 11.9375, + "learning_rate": 2.0680762942648646e-06, + "loss": 1.5519795417785645, + "step": 4320 + }, + { + "epoch": 1.3302808772604848, + "grad_norm": 8.75, + "learning_rate": 2.0663127110821144e-06, + "loss": 1.5584685802459717, + "step": 4322 + }, + { + "epoch": 1.3308964986533283, + "grad_norm": 9.5625, + "learning_rate": 2.0645500557253544e-06, + "loss": 1.6258420944213867, + "step": 4324 + }, + { + "epoch": 1.3315121200461717, + "grad_norm": 7.03125, + "learning_rate": 2.062788329946172e-06, + "loss": 1.5503873825073242, + "step": 4326 + }, + { + "epoch": 1.332127741439015, + "grad_norm": 9.0, + "learning_rate": 2.0610275354952338e-06, + "loss": 1.4425421953201294, + "step": 4328 + }, + { + "epoch": 1.3327433628318583, + "grad_norm": 10.125, + "learning_rate": 2.059267674122283e-06, + "loss": 1.1959125995635986, + "step": 4330 + }, + { + "epoch": 1.3333589842247018, + "grad_norm": 2.09375, + "learning_rate": 2.057508747576131e-06, + "loss": 1.3274942636489868, + "step": 4332 + }, + { + "epoch": 1.3339746056175452, + "grad_norm": 6.53125, + "learning_rate": 2.0557507576046632e-06, + "loss": 1.3199305534362793, + "step": 4334 + }, + { + "epoch": 1.3345902270103887, + "grad_norm": 39.5, + "learning_rate": 2.0539937059548336e-06, + "loss": 1.1806223392486572, + "step": 4336 + }, + { + "epoch": 1.335205848403232, + "grad_norm": 3.75, + "learning_rate": 2.0522375943726634e-06, + "loss": 1.0360240936279297, + "step": 4338 + }, + { + "epoch": 1.3358214697960755, + "grad_norm": 3.0625, + "learning_rate": 2.050482424603242e-06, + "loss": 1.1422436237335205, + "step": 4340 + }, + { + "epoch": 1.3364370911889187, + "grad_norm": 6.3125, + "learning_rate": 2.0487281983907185e-06, + "loss": 0.6544202566146851, + "step": 4342 + }, + { + "epoch": 1.3370527125817622, + "grad_norm": 9.4375, + "learning_rate": 2.0469749174783072e-06, + "loss": 0.6959512829780579, + "step": 4344 + }, + { + "epoch": 1.3376683339746056, + "grad_norm": 7.75, + "learning_rate": 2.045222583608285e-06, + "loss": 1.5423005819320679, + "step": 4346 + }, + { + "epoch": 1.338283955367449, + "grad_norm": 5.96875, + "learning_rate": 2.0434711985219823e-06, + "loss": 1.20880126953125, + "step": 4348 + }, + { + "epoch": 1.3388995767602925, + "grad_norm": 6.21875, + "learning_rate": 2.041720763959791e-06, + "loss": 1.2663626670837402, + "step": 4350 + }, + { + "epoch": 1.3395151981531357, + "grad_norm": 10.625, + "learning_rate": 2.0399712816611573e-06, + "loss": 1.7219278812408447, + "step": 4352 + }, + { + "epoch": 1.3401308195459793, + "grad_norm": 3.0, + "learning_rate": 2.0382227533645813e-06, + "loss": 0.7353307008743286, + "step": 4354 + }, + { + "epoch": 1.3407464409388226, + "grad_norm": 6.15625, + "learning_rate": 2.0364751808076142e-06, + "loss": 1.160996437072754, + "step": 4356 + }, + { + "epoch": 1.341362062331666, + "grad_norm": 7.71875, + "learning_rate": 2.034728565726858e-06, + "loss": 1.242622971534729, + "step": 4358 + }, + { + "epoch": 1.3419776837245094, + "grad_norm": 18.875, + "learning_rate": 2.032982909857964e-06, + "loss": 1.3442938327789307, + "step": 4360 + }, + { + "epoch": 1.3425933051173529, + "grad_norm": 12.125, + "learning_rate": 2.0312382149356276e-06, + "loss": 1.2941884994506836, + "step": 4362 + }, + { + "epoch": 1.3432089265101963, + "grad_norm": 15.3125, + "learning_rate": 2.0294944826935937e-06, + "loss": 1.6418509483337402, + "step": 4364 + }, + { + "epoch": 1.3438245479030395, + "grad_norm": 4.84375, + "learning_rate": 2.027751714864647e-06, + "loss": 1.308324933052063, + "step": 4366 + }, + { + "epoch": 1.3444401692958832, + "grad_norm": 6.40625, + "learning_rate": 2.0260099131806137e-06, + "loss": 1.695947289466858, + "step": 4368 + }, + { + "epoch": 1.3450557906887264, + "grad_norm": 2.09375, + "learning_rate": 2.024269079372365e-06, + "loss": 1.0717357397079468, + "step": 4370 + }, + { + "epoch": 1.3456714120815698, + "grad_norm": 7.1875, + "learning_rate": 2.0225292151698016e-06, + "loss": 1.1823691129684448, + "step": 4372 + }, + { + "epoch": 1.3462870334744133, + "grad_norm": 19.125, + "learning_rate": 2.0207903223018686e-06, + "loss": 1.3260829448699951, + "step": 4374 + }, + { + "epoch": 1.3469026548672567, + "grad_norm": 4.84375, + "learning_rate": 2.019052402496542e-06, + "loss": 1.543877363204956, + "step": 4376 + }, + { + "epoch": 1.3475182762601001, + "grad_norm": 6.8125, + "learning_rate": 2.017315457480832e-06, + "loss": 1.5688681602478027, + "step": 4378 + }, + { + "epoch": 1.3481338976529433, + "grad_norm": 4.375, + "learning_rate": 2.0155794889807802e-06, + "loss": 1.315077781677246, + "step": 4380 + }, + { + "epoch": 1.3487495190457868, + "grad_norm": 9.75, + "learning_rate": 2.0138444987214556e-06, + "loss": 1.5093774795532227, + "step": 4382 + }, + { + "epoch": 1.3493651404386302, + "grad_norm": 11.75, + "learning_rate": 2.0121104884269598e-06, + "loss": 1.6181879043579102, + "step": 4384 + }, + { + "epoch": 1.3499807618314736, + "grad_norm": 7.0625, + "learning_rate": 2.0103774598204144e-06, + "loss": 1.4826487302780151, + "step": 4386 + }, + { + "epoch": 1.350596383224317, + "grad_norm": 7.90625, + "learning_rate": 2.008645414623971e-06, + "loss": 1.058173418045044, + "step": 4388 + }, + { + "epoch": 1.3512120046171605, + "grad_norm": 13.0625, + "learning_rate": 2.006914354558801e-06, + "loss": 1.2639877796173096, + "step": 4390 + }, + { + "epoch": 1.351827626010004, + "grad_norm": 4.75, + "learning_rate": 2.0051842813450977e-06, + "loss": 1.0501971244812012, + "step": 4392 + }, + { + "epoch": 1.3524432474028472, + "grad_norm": 4.75, + "learning_rate": 2.003455196702074e-06, + "loss": 1.299783706665039, + "step": 4394 + }, + { + "epoch": 1.3530588687956906, + "grad_norm": 9.9375, + "learning_rate": 2.0017271023479595e-06, + "loss": 1.511252522468567, + "step": 4396 + }, + { + "epoch": 1.353674490188534, + "grad_norm": 6.78125, + "learning_rate": 2.0000000000000008e-06, + "loss": 1.2938201427459717, + "step": 4398 + }, + { + "epoch": 1.3542901115813775, + "grad_norm": 7.28125, + "learning_rate": 1.9982738913744574e-06, + "loss": 1.4771069288253784, + "step": 4400 + }, + { + "epoch": 1.354905732974221, + "grad_norm": 2.984375, + "learning_rate": 1.9965487781866026e-06, + "loss": 1.3068078756332397, + "step": 4402 + }, + { + "epoch": 1.3555213543670643, + "grad_norm": 34.25, + "learning_rate": 1.9948246621507204e-06, + "loss": 1.4712064266204834, + "step": 4404 + }, + { + "epoch": 1.3561369757599078, + "grad_norm": 6.84375, + "learning_rate": 1.993101544980103e-06, + "loss": 1.3041491508483887, + "step": 4406 + }, + { + "epoch": 1.356752597152751, + "grad_norm": 4.5, + "learning_rate": 1.9913794283870513e-06, + "loss": 1.106006145477295, + "step": 4408 + }, + { + "epoch": 1.3573682185455944, + "grad_norm": 7.75, + "learning_rate": 1.9896583140828707e-06, + "loss": 1.4679481983184814, + "step": 4410 + }, + { + "epoch": 1.3579838399384379, + "grad_norm": 9.5625, + "learning_rate": 1.987938203777871e-06, + "loss": 1.508618950843811, + "step": 4412 + }, + { + "epoch": 1.3585994613312813, + "grad_norm": 8.125, + "learning_rate": 1.9862190991813642e-06, + "loss": 1.4362380504608154, + "step": 4414 + }, + { + "epoch": 1.3592150827241247, + "grad_norm": 5.71875, + "learning_rate": 1.984501002001663e-06, + "loss": 0.9908819198608398, + "step": 4416 + }, + { + "epoch": 1.359830704116968, + "grad_norm": 5.3125, + "learning_rate": 1.9827839139460793e-06, + "loss": 1.3242170810699463, + "step": 4418 + }, + { + "epoch": 1.3604463255098116, + "grad_norm": 10.125, + "learning_rate": 1.981067836720923e-06, + "loss": 1.7057260274887085, + "step": 4420 + }, + { + "epoch": 1.3610619469026548, + "grad_norm": 7.6875, + "learning_rate": 1.979352772031497e-06, + "loss": 1.3802309036254883, + "step": 4422 + }, + { + "epoch": 1.3616775682954982, + "grad_norm": 6.84375, + "learning_rate": 1.9776387215821e-06, + "loss": 1.4686613082885742, + "step": 4424 + }, + { + "epoch": 1.3622931896883417, + "grad_norm": 5.625, + "learning_rate": 1.9759256870760226e-06, + "loss": 1.37690269947052, + "step": 4426 + }, + { + "epoch": 1.3629088110811851, + "grad_norm": 13.0625, + "learning_rate": 1.9742136702155452e-06, + "loss": 1.2001221179962158, + "step": 4428 + }, + { + "epoch": 1.3635244324740285, + "grad_norm": 19.75, + "learning_rate": 1.9725026727019368e-06, + "loss": 1.3925925493240356, + "step": 4430 + }, + { + "epoch": 1.3641400538668718, + "grad_norm": 9.375, + "learning_rate": 1.970792696235456e-06, + "loss": 1.4831708669662476, + "step": 4432 + }, + { + "epoch": 1.3647556752597152, + "grad_norm": 6.03125, + "learning_rate": 1.9690837425153433e-06, + "loss": 1.2565829753875732, + "step": 4434 + }, + { + "epoch": 1.3653712966525586, + "grad_norm": 6.71875, + "learning_rate": 1.9673758132398245e-06, + "loss": 1.4044508934020996, + "step": 4436 + }, + { + "epoch": 1.365986918045402, + "grad_norm": 7.53125, + "learning_rate": 1.9656689101061076e-06, + "loss": 1.3536463975906372, + "step": 4438 + }, + { + "epoch": 1.3666025394382455, + "grad_norm": 4.0625, + "learning_rate": 1.963963034810379e-06, + "loss": 1.1673328876495361, + "step": 4440 + }, + { + "epoch": 1.367218160831089, + "grad_norm": 6.34375, + "learning_rate": 1.9622581890478066e-06, + "loss": 1.3062219619750977, + "step": 4442 + }, + { + "epoch": 1.3678337822239324, + "grad_norm": 10.875, + "learning_rate": 1.9605543745125343e-06, + "loss": 1.3968011140823364, + "step": 4444 + }, + { + "epoch": 1.3684494036167756, + "grad_norm": 10.0625, + "learning_rate": 1.9588515928976793e-06, + "loss": 1.5252107381820679, + "step": 4446 + }, + { + "epoch": 1.369065025009619, + "grad_norm": 9.0, + "learning_rate": 1.957149845895336e-06, + "loss": 1.6871984004974365, + "step": 4448 + }, + { + "epoch": 1.3696806464024625, + "grad_norm": 11.625, + "learning_rate": 1.9554491351965654e-06, + "loss": 1.533086895942688, + "step": 4450 + }, + { + "epoch": 1.370296267795306, + "grad_norm": 4.09375, + "learning_rate": 1.9537494624914046e-06, + "loss": 1.1465895175933838, + "step": 4452 + }, + { + "epoch": 1.3709118891881493, + "grad_norm": 5.5625, + "learning_rate": 1.9520508294688558e-06, + "loss": 1.3533809185028076, + "step": 4454 + }, + { + "epoch": 1.3715275105809928, + "grad_norm": 18.125, + "learning_rate": 1.950353237816887e-06, + "loss": 1.780302882194519, + "step": 4456 + }, + { + "epoch": 1.3721431319738362, + "grad_norm": 4.21875, + "learning_rate": 1.9486566892224355e-06, + "loss": 1.293370246887207, + "step": 4458 + }, + { + "epoch": 1.3727587533666794, + "grad_norm": 8.375, + "learning_rate": 1.9469611853713984e-06, + "loss": 1.4877686500549316, + "step": 4460 + }, + { + "epoch": 1.3733743747595228, + "grad_norm": 12.625, + "learning_rate": 1.945266727948637e-06, + "loss": 0.8963188529014587, + "step": 4462 + }, + { + "epoch": 1.3739899961523663, + "grad_norm": 8.625, + "learning_rate": 1.9435733186379694e-06, + "loss": 1.6254758834838867, + "step": 4464 + }, + { + "epoch": 1.3746056175452097, + "grad_norm": 38.5, + "learning_rate": 1.941880959122177e-06, + "loss": 0.9384620785713196, + "step": 4466 + }, + { + "epoch": 1.3752212389380531, + "grad_norm": 12.25, + "learning_rate": 1.9401896510829935e-06, + "loss": 1.291838526725769, + "step": 4468 + }, + { + "epoch": 1.3758368603308964, + "grad_norm": 12.6875, + "learning_rate": 1.93849939620111e-06, + "loss": 1.2295382022857666, + "step": 4470 + }, + { + "epoch": 1.37645248172374, + "grad_norm": 9.625, + "learning_rate": 1.9368101961561712e-06, + "loss": 1.405733346939087, + "step": 4472 + }, + { + "epoch": 1.3770681031165832, + "grad_norm": 4.5, + "learning_rate": 1.935122052626773e-06, + "loss": 1.1982272863388062, + "step": 4474 + }, + { + "epoch": 1.3776837245094267, + "grad_norm": 19.875, + "learning_rate": 1.933434967290461e-06, + "loss": 1.320880651473999, + "step": 4476 + }, + { + "epoch": 1.37829934590227, + "grad_norm": 6.28125, + "learning_rate": 1.9317489418237303e-06, + "loss": 1.209782361984253, + "step": 4478 + }, + { + "epoch": 1.3789149672951135, + "grad_norm": 5.09375, + "learning_rate": 1.930063977902021e-06, + "loss": 1.2707037925720215, + "step": 4480 + }, + { + "epoch": 1.379530588687957, + "grad_norm": 6.40625, + "learning_rate": 1.928380077199721e-06, + "loss": 1.4870203733444214, + "step": 4482 + }, + { + "epoch": 1.3801462100808002, + "grad_norm": 65.0, + "learning_rate": 1.926697241390159e-06, + "loss": 1.3282127380371094, + "step": 4484 + }, + { + "epoch": 1.3807618314736438, + "grad_norm": 12.3125, + "learning_rate": 1.9250154721456075e-06, + "loss": 1.694354772567749, + "step": 4486 + }, + { + "epoch": 1.381377452866487, + "grad_norm": 9.375, + "learning_rate": 1.9233347711372794e-06, + "loss": 1.4913123846054077, + "step": 4488 + }, + { + "epoch": 1.3819930742593305, + "grad_norm": 7.9375, + "learning_rate": 1.9216551400353213e-06, + "loss": 1.6237177848815918, + "step": 4490 + }, + { + "epoch": 1.382608695652174, + "grad_norm": 4.96875, + "learning_rate": 1.9199765805088237e-06, + "loss": 1.5483479499816895, + "step": 4492 + }, + { + "epoch": 1.3832243170450174, + "grad_norm": 22.875, + "learning_rate": 1.9182990942258074e-06, + "loss": 1.4631401300430298, + "step": 4494 + }, + { + "epoch": 1.3838399384378608, + "grad_norm": 5.8125, + "learning_rate": 1.9166226828532285e-06, + "loss": 1.4134818315505981, + "step": 4496 + }, + { + "epoch": 1.384455559830704, + "grad_norm": 6.25, + "learning_rate": 1.9149473480569747e-06, + "loss": 1.2620103359222412, + "step": 4498 + }, + { + "epoch": 1.3850711812235474, + "grad_norm": 11.375, + "learning_rate": 1.913273091501863e-06, + "loss": 1.4693878889083862, + "step": 4500 + }, + { + "epoch": 1.3856868026163909, + "grad_norm": 7.0, + "learning_rate": 1.9115999148516408e-06, + "loss": 1.0056512355804443, + "step": 4502 + }, + { + "epoch": 1.3863024240092343, + "grad_norm": 8.5, + "learning_rate": 1.9099278197689796e-06, + "loss": 1.149704098701477, + "step": 4504 + }, + { + "epoch": 1.3869180454020777, + "grad_norm": 12.3125, + "learning_rate": 1.9082568079154797e-06, + "loss": 1.523463249206543, + "step": 4506 + }, + { + "epoch": 1.3875336667949212, + "grad_norm": 22.25, + "learning_rate": 1.906586880951662e-06, + "loss": 1.2891063690185547, + "step": 4508 + }, + { + "epoch": 1.3881492881877646, + "grad_norm": 14.25, + "learning_rate": 1.9049180405369693e-06, + "loss": 1.7021067142486572, + "step": 4510 + }, + { + "epoch": 1.3887649095806078, + "grad_norm": 14.875, + "learning_rate": 1.9032502883297683e-06, + "loss": 1.4896409511566162, + "step": 4512 + }, + { + "epoch": 1.3893805309734513, + "grad_norm": 11.9375, + "learning_rate": 1.9015836259873399e-06, + "loss": 1.4434374570846558, + "step": 4514 + }, + { + "epoch": 1.3899961523662947, + "grad_norm": 3.84375, + "learning_rate": 1.8999180551658844e-06, + "loss": 1.1028622388839722, + "step": 4516 + }, + { + "epoch": 1.3906117737591381, + "grad_norm": 11.625, + "learning_rate": 1.898253577520516e-06, + "loss": 1.107566237449646, + "step": 4518 + }, + { + "epoch": 1.3912273951519816, + "grad_norm": 8.6875, + "learning_rate": 1.8965901947052648e-06, + "loss": 1.5406841039657593, + "step": 4520 + }, + { + "epoch": 1.391843016544825, + "grad_norm": 8.0625, + "learning_rate": 1.8949279083730713e-06, + "loss": 1.5343679189682007, + "step": 4522 + }, + { + "epoch": 1.3924586379376684, + "grad_norm": 12.125, + "learning_rate": 1.8932667201757853e-06, + "loss": 1.6217793226242065, + "step": 4524 + }, + { + "epoch": 1.3930742593305117, + "grad_norm": 6.375, + "learning_rate": 1.8916066317641692e-06, + "loss": 1.1800013780593872, + "step": 4526 + }, + { + "epoch": 1.393689880723355, + "grad_norm": 2.703125, + "learning_rate": 1.8899476447878875e-06, + "loss": 1.1012816429138184, + "step": 4528 + }, + { + "epoch": 1.3943055021161985, + "grad_norm": 11.5, + "learning_rate": 1.8882897608955147e-06, + "loss": 1.1068960428237915, + "step": 4530 + }, + { + "epoch": 1.394921123509042, + "grad_norm": 9.3125, + "learning_rate": 1.8866329817345264e-06, + "loss": 0.9407857656478882, + "step": 4532 + }, + { + "epoch": 1.3955367449018854, + "grad_norm": 6.34375, + "learning_rate": 1.8849773089513002e-06, + "loss": 1.0099589824676514, + "step": 4534 + }, + { + "epoch": 1.3961523662947286, + "grad_norm": 8.75, + "learning_rate": 1.8833227441911173e-06, + "loss": 1.339590072631836, + "step": 4536 + }, + { + "epoch": 1.3967679876875723, + "grad_norm": 6.84375, + "learning_rate": 1.8816692890981535e-06, + "loss": 1.2085269689559937, + "step": 4538 + }, + { + "epoch": 1.3973836090804155, + "grad_norm": 9.1875, + "learning_rate": 1.8800169453154873e-06, + "loss": 1.6690895557403564, + "step": 4540 + }, + { + "epoch": 1.397999230473259, + "grad_norm": 7.875, + "learning_rate": 1.8783657144850873e-06, + "loss": 1.625824213027954, + "step": 4542 + }, + { + "epoch": 1.3986148518661023, + "grad_norm": 7.625, + "learning_rate": 1.876715598247818e-06, + "loss": 1.6058636903762817, + "step": 4544 + }, + { + "epoch": 1.3992304732589458, + "grad_norm": 7.125, + "learning_rate": 1.875066598243439e-06, + "loss": 1.2708394527435303, + "step": 4546 + }, + { + "epoch": 1.3998460946517892, + "grad_norm": 5.09375, + "learning_rate": 1.8734187161105971e-06, + "loss": 1.3050729036331177, + "step": 4548 + }, + { + "epoch": 1.4004617160446324, + "grad_norm": 6.46875, + "learning_rate": 1.8717719534868305e-06, + "loss": 1.233088493347168, + "step": 4550 + }, + { + "epoch": 1.401077337437476, + "grad_norm": 3.859375, + "learning_rate": 1.8701263120085644e-06, + "loss": 1.26491379737854, + "step": 4552 + }, + { + "epoch": 1.4016929588303193, + "grad_norm": 7.625, + "learning_rate": 1.8684817933111092e-06, + "loss": 1.4451459646224976, + "step": 4554 + }, + { + "epoch": 1.4023085802231627, + "grad_norm": 6.78125, + "learning_rate": 1.8668383990286595e-06, + "loss": 1.622502326965332, + "step": 4556 + }, + { + "epoch": 1.4029242016160062, + "grad_norm": 11.25, + "learning_rate": 1.8651961307942927e-06, + "loss": 1.3570857048034668, + "step": 4558 + }, + { + "epoch": 1.4035398230088496, + "grad_norm": 9.8125, + "learning_rate": 1.8635549902399693e-06, + "loss": 1.2812453508377075, + "step": 4560 + }, + { + "epoch": 1.404155444401693, + "grad_norm": 8.75, + "learning_rate": 1.8619149789965262e-06, + "loss": 1.1815876960754395, + "step": 4562 + }, + { + "epoch": 1.4047710657945363, + "grad_norm": 188.0, + "learning_rate": 1.860276098693679e-06, + "loss": 1.2427574396133423, + "step": 4564 + }, + { + "epoch": 1.4053866871873797, + "grad_norm": 6.84375, + "learning_rate": 1.858638350960022e-06, + "loss": 1.0153923034667969, + "step": 4566 + }, + { + "epoch": 1.4060023085802231, + "grad_norm": 13.875, + "learning_rate": 1.8570017374230186e-06, + "loss": 1.691391110420227, + "step": 4568 + }, + { + "epoch": 1.4066179299730666, + "grad_norm": 9.4375, + "learning_rate": 1.8553662597090108e-06, + "loss": 1.2465100288391113, + "step": 4570 + }, + { + "epoch": 1.40723355136591, + "grad_norm": 11.0625, + "learning_rate": 1.8537319194432079e-06, + "loss": 1.3901227712631226, + "step": 4572 + }, + { + "epoch": 1.4078491727587534, + "grad_norm": 6.53125, + "learning_rate": 1.852098718249692e-06, + "loss": 1.0444080829620361, + "step": 4574 + }, + { + "epoch": 1.4084647941515969, + "grad_norm": 6.8125, + "learning_rate": 1.8504666577514107e-06, + "loss": 1.3018534183502197, + "step": 4576 + }, + { + "epoch": 1.40908041554444, + "grad_norm": 4.4375, + "learning_rate": 1.8488357395701795e-06, + "loss": 1.1676541566848755, + "step": 4578 + }, + { + "epoch": 1.4096960369372835, + "grad_norm": 3.9375, + "learning_rate": 1.847205965326678e-06, + "loss": 1.2985882759094238, + "step": 4580 + }, + { + "epoch": 1.410311658330127, + "grad_norm": 18.0, + "learning_rate": 1.845577336640449e-06, + "loss": 1.411516785621643, + "step": 4582 + }, + { + "epoch": 1.4109272797229704, + "grad_norm": 7.21875, + "learning_rate": 1.8439498551298984e-06, + "loss": 1.4975998401641846, + "step": 4584 + }, + { + "epoch": 1.4115429011158138, + "grad_norm": 6.34375, + "learning_rate": 1.8423235224122909e-06, + "loss": 1.2311623096466064, + "step": 4586 + }, + { + "epoch": 1.4121585225086573, + "grad_norm": 3.28125, + "learning_rate": 1.8406983401037487e-06, + "loss": 1.2641805410385132, + "step": 4588 + }, + { + "epoch": 1.4127741439015007, + "grad_norm": 3.609375, + "learning_rate": 1.8390743098192543e-06, + "loss": 1.3353376388549805, + "step": 4590 + }, + { + "epoch": 1.413389765294344, + "grad_norm": 8.875, + "learning_rate": 1.8374514331726396e-06, + "loss": 1.0704381465911865, + "step": 4592 + }, + { + "epoch": 1.4140053866871873, + "grad_norm": 7.0, + "learning_rate": 1.8358297117765958e-06, + "loss": 1.4866266250610352, + "step": 4594 + }, + { + "epoch": 1.4146210080800308, + "grad_norm": 6.0625, + "learning_rate": 1.8342091472426637e-06, + "loss": 1.5926576852798462, + "step": 4596 + }, + { + "epoch": 1.4152366294728742, + "grad_norm": 9.625, + "learning_rate": 1.8325897411812333e-06, + "loss": 1.5305733680725098, + "step": 4598 + }, + { + "epoch": 1.4158522508657176, + "grad_norm": 7.0625, + "learning_rate": 1.830971495201546e-06, + "loss": 1.4690890312194824, + "step": 4600 + }, + { + "epoch": 1.4164678722585609, + "grad_norm": 12.875, + "learning_rate": 1.829354410911688e-06, + "loss": 0.9163476228713989, + "step": 4602 + }, + { + "epoch": 1.4170834936514045, + "grad_norm": 5.125, + "learning_rate": 1.8277384899185946e-06, + "loss": 1.257507085800171, + "step": 4604 + }, + { + "epoch": 1.4176991150442477, + "grad_norm": 10.5625, + "learning_rate": 1.8261237338280393e-06, + "loss": 1.5716406106948853, + "step": 4606 + }, + { + "epoch": 1.4183147364370912, + "grad_norm": 6.71875, + "learning_rate": 1.824510144244644e-06, + "loss": 1.0980907678604126, + "step": 4608 + }, + { + "epoch": 1.4189303578299346, + "grad_norm": 7.78125, + "learning_rate": 1.822897722771868e-06, + "loss": 1.5042476654052734, + "step": 4610 + }, + { + "epoch": 1.419545979222778, + "grad_norm": 5.46875, + "learning_rate": 1.8212864710120096e-06, + "loss": 1.3456066846847534, + "step": 4612 + }, + { + "epoch": 1.4201616006156215, + "grad_norm": 13.125, + "learning_rate": 1.8196763905662077e-06, + "loss": 0.7732136845588684, + "step": 4614 + }, + { + "epoch": 1.4207772220084647, + "grad_norm": 9.5625, + "learning_rate": 1.8180674830344343e-06, + "loss": 1.8712453842163086, + "step": 4616 + }, + { + "epoch": 1.4213928434013081, + "grad_norm": 8.4375, + "learning_rate": 1.816459750015497e-06, + "loss": 1.4007591009140015, + "step": 4618 + }, + { + "epoch": 1.4220084647941515, + "grad_norm": 10.625, + "learning_rate": 1.8148531931070365e-06, + "loss": 1.0826690196990967, + "step": 4620 + }, + { + "epoch": 1.422624086186995, + "grad_norm": 11.375, + "learning_rate": 1.813247813905523e-06, + "loss": 1.226710557937622, + "step": 4622 + }, + { + "epoch": 1.4232397075798384, + "grad_norm": 6.5625, + "learning_rate": 1.81164361400626e-06, + "loss": 1.5129951238632202, + "step": 4624 + }, + { + "epoch": 1.4238553289726819, + "grad_norm": 7.25, + "learning_rate": 1.8100405950033744e-06, + "loss": 1.3884327411651611, + "step": 4626 + }, + { + "epoch": 1.4244709503655253, + "grad_norm": 5.0, + "learning_rate": 1.8084387584898244e-06, + "loss": 1.3586499691009521, + "step": 4628 + }, + { + "epoch": 1.4250865717583685, + "grad_norm": 7.15625, + "learning_rate": 1.8068381060573903e-06, + "loss": 1.124431848526001, + "step": 4630 + }, + { + "epoch": 1.425702193151212, + "grad_norm": 16.25, + "learning_rate": 1.8052386392966756e-06, + "loss": 1.155695915222168, + "step": 4632 + }, + { + "epoch": 1.4263178145440554, + "grad_norm": 5.59375, + "learning_rate": 1.8036403597971064e-06, + "loss": 1.4743564128875732, + "step": 4634 + }, + { + "epoch": 1.4269334359368988, + "grad_norm": 3.984375, + "learning_rate": 1.8020432691469289e-06, + "loss": 1.2493964433670044, + "step": 4636 + }, + { + "epoch": 1.4275490573297422, + "grad_norm": 4.6875, + "learning_rate": 1.8004473689332082e-06, + "loss": 1.23715078830719, + "step": 4638 + }, + { + "epoch": 1.4281646787225857, + "grad_norm": 8.875, + "learning_rate": 1.7988526607418264e-06, + "loss": 1.0521396398544312, + "step": 4640 + }, + { + "epoch": 1.4287803001154291, + "grad_norm": 7.90625, + "learning_rate": 1.7972591461574801e-06, + "loss": 1.257058024406433, + "step": 4642 + }, + { + "epoch": 1.4293959215082723, + "grad_norm": 4.21875, + "learning_rate": 1.7956668267636806e-06, + "loss": 0.9944857358932495, + "step": 4644 + }, + { + "epoch": 1.4300115429011158, + "grad_norm": 10.6875, + "learning_rate": 1.7940757041427512e-06, + "loss": 1.5173877477645874, + "step": 4646 + }, + { + "epoch": 1.4306271642939592, + "grad_norm": 6.0, + "learning_rate": 1.7924857798758265e-06, + "loss": 1.4341071844100952, + "step": 4648 + }, + { + "epoch": 1.4312427856868026, + "grad_norm": 8.375, + "learning_rate": 1.7908970555428504e-06, + "loss": 1.187394142150879, + "step": 4650 + }, + { + "epoch": 1.431858407079646, + "grad_norm": 4.46875, + "learning_rate": 1.789309532722572e-06, + "loss": 1.5108386278152466, + "step": 4652 + }, + { + "epoch": 1.4324740284724893, + "grad_norm": 8.875, + "learning_rate": 1.7877232129925506e-06, + "loss": 1.2611860036849976, + "step": 4654 + }, + { + "epoch": 1.433089649865333, + "grad_norm": 11.5, + "learning_rate": 1.7861380979291464e-06, + "loss": 1.0800515413284302, + "step": 4656 + }, + { + "epoch": 1.4337052712581762, + "grad_norm": 5.8125, + "learning_rate": 1.7845541891075245e-06, + "loss": 1.4038598537445068, + "step": 4658 + }, + { + "epoch": 1.4343208926510196, + "grad_norm": 7.0, + "learning_rate": 1.7829714881016489e-06, + "loss": 1.3155494928359985, + "step": 4660 + }, + { + "epoch": 1.434936514043863, + "grad_norm": 6.75, + "learning_rate": 1.781389996484287e-06, + "loss": 1.3487207889556885, + "step": 4662 + }, + { + "epoch": 1.4355521354367065, + "grad_norm": 8.3125, + "learning_rate": 1.779809715827002e-06, + "loss": 1.2048066854476929, + "step": 4664 + }, + { + "epoch": 1.43616775682955, + "grad_norm": 7.53125, + "learning_rate": 1.7782306477001533e-06, + "loss": 1.5318856239318848, + "step": 4666 + }, + { + "epoch": 1.436783378222393, + "grad_norm": 8.1875, + "learning_rate": 1.776652793672898e-06, + "loss": 1.6808677911758423, + "step": 4668 + }, + { + "epoch": 1.4373989996152368, + "grad_norm": 14.9375, + "learning_rate": 1.775076155313183e-06, + "loss": 1.43800950050354, + "step": 4670 + }, + { + "epoch": 1.43801462100808, + "grad_norm": 4.28125, + "learning_rate": 1.7735007341877505e-06, + "loss": 1.326431393623352, + "step": 4672 + }, + { + "epoch": 1.4386302424009234, + "grad_norm": 26.25, + "learning_rate": 1.7719265318621314e-06, + "loss": 0.7720835208892822, + "step": 4674 + }, + { + "epoch": 1.4392458637937668, + "grad_norm": 13.125, + "learning_rate": 1.7703535499006455e-06, + "loss": 1.4564543962478638, + "step": 4676 + }, + { + "epoch": 1.4398614851866103, + "grad_norm": 3.984375, + "learning_rate": 1.7687817898664012e-06, + "loss": 0.8778886795043945, + "step": 4678 + }, + { + "epoch": 1.4404771065794537, + "grad_norm": 12.0, + "learning_rate": 1.7672112533212904e-06, + "loss": 1.2717019319534302, + "step": 4680 + }, + { + "epoch": 1.441092727972297, + "grad_norm": 13.25, + "learning_rate": 1.7656419418259923e-06, + "loss": 1.0154832601547241, + "step": 4682 + }, + { + "epoch": 1.4417083493651404, + "grad_norm": 10.875, + "learning_rate": 1.764073856939965e-06, + "loss": 1.366729974746704, + "step": 4684 + }, + { + "epoch": 1.4423239707579838, + "grad_norm": 8.5, + "learning_rate": 1.7625070002214502e-06, + "loss": 1.3453766107559204, + "step": 4686 + }, + { + "epoch": 1.4429395921508272, + "grad_norm": 2.96875, + "learning_rate": 1.7609413732274694e-06, + "loss": 1.1809775829315186, + "step": 4688 + }, + { + "epoch": 1.4435552135436707, + "grad_norm": 14.125, + "learning_rate": 1.7593769775138196e-06, + "loss": 1.6814355850219727, + "step": 4690 + }, + { + "epoch": 1.444170834936514, + "grad_norm": 7.9375, + "learning_rate": 1.7578138146350776e-06, + "loss": 1.379990577697754, + "step": 4692 + }, + { + "epoch": 1.4447864563293575, + "grad_norm": 5.0, + "learning_rate": 1.7562518861445923e-06, + "loss": 1.1842437982559204, + "step": 4694 + }, + { + "epoch": 1.4454020777222008, + "grad_norm": 2.546875, + "learning_rate": 1.7546911935944878e-06, + "loss": 1.024581789970398, + "step": 4696 + }, + { + "epoch": 1.4460176991150442, + "grad_norm": 6.21875, + "learning_rate": 1.7531317385356587e-06, + "loss": 1.291495680809021, + "step": 4698 + }, + { + "epoch": 1.4466333205078876, + "grad_norm": 3.78125, + "learning_rate": 1.7515735225177698e-06, + "loss": 1.3858778476715088, + "step": 4700 + }, + { + "epoch": 1.447248941900731, + "grad_norm": 2.46875, + "learning_rate": 1.7500165470892571e-06, + "loss": 1.2054401636123657, + "step": 4702 + }, + { + "epoch": 1.4478645632935745, + "grad_norm": 6.84375, + "learning_rate": 1.7484608137973207e-06, + "loss": 1.1113160848617554, + "step": 4704 + }, + { + "epoch": 1.448480184686418, + "grad_norm": 4.84375, + "learning_rate": 1.7469063241879272e-06, + "loss": 1.0340756177902222, + "step": 4706 + }, + { + "epoch": 1.4490958060792614, + "grad_norm": 5.90625, + "learning_rate": 1.74535307980581e-06, + "loss": 1.1889235973358154, + "step": 4708 + }, + { + "epoch": 1.4497114274721046, + "grad_norm": 7.65625, + "learning_rate": 1.7438010821944602e-06, + "loss": 1.5387898683547974, + "step": 4710 + }, + { + "epoch": 1.450327048864948, + "grad_norm": 1.78125, + "learning_rate": 1.742250332896134e-06, + "loss": 1.1450103521347046, + "step": 4712 + }, + { + "epoch": 1.4509426702577914, + "grad_norm": 5.9375, + "learning_rate": 1.7407008334518451e-06, + "loss": 1.2448246479034424, + "step": 4714 + }, + { + "epoch": 1.4515582916506349, + "grad_norm": 4.0, + "learning_rate": 1.7391525854013668e-06, + "loss": 1.2321335077285767, + "step": 4716 + }, + { + "epoch": 1.4521739130434783, + "grad_norm": 9.9375, + "learning_rate": 1.7376055902832273e-06, + "loss": 1.2992148399353027, + "step": 4718 + }, + { + "epoch": 1.4527895344363215, + "grad_norm": 9.25, + "learning_rate": 1.7360598496347105e-06, + "loss": 1.4072332382202148, + "step": 4720 + }, + { + "epoch": 1.4534051558291652, + "grad_norm": 16.5, + "learning_rate": 1.7345153649918533e-06, + "loss": 1.4186739921569824, + "step": 4722 + }, + { + "epoch": 1.4540207772220084, + "grad_norm": 7.9375, + "learning_rate": 1.7329721378894443e-06, + "loss": 1.5055826902389526, + "step": 4724 + }, + { + "epoch": 1.4546363986148518, + "grad_norm": 9.1875, + "learning_rate": 1.731430169861024e-06, + "loss": 1.5261921882629395, + "step": 4726 + }, + { + "epoch": 1.4552520200076953, + "grad_norm": 6.78125, + "learning_rate": 1.7298894624388796e-06, + "loss": 1.4818764925003052, + "step": 4728 + }, + { + "epoch": 1.4558676414005387, + "grad_norm": 16.375, + "learning_rate": 1.7283500171540468e-06, + "loss": 1.5456936359405518, + "step": 4730 + }, + { + "epoch": 1.4564832627933821, + "grad_norm": 7.15625, + "learning_rate": 1.7268118355363074e-06, + "loss": 1.3593312501907349, + "step": 4732 + }, + { + "epoch": 1.4570988841862254, + "grad_norm": 10.625, + "learning_rate": 1.7252749191141866e-06, + "loss": 1.1384668350219727, + "step": 4734 + }, + { + "epoch": 1.457714505579069, + "grad_norm": 5.1875, + "learning_rate": 1.7237392694149527e-06, + "loss": 1.149019479751587, + "step": 4736 + }, + { + "epoch": 1.4583301269719122, + "grad_norm": 7.46875, + "learning_rate": 1.7222048879646147e-06, + "loss": 0.5427249670028687, + "step": 4738 + }, + { + "epoch": 1.4589457483647557, + "grad_norm": 12.1875, + "learning_rate": 1.7206717762879228e-06, + "loss": 0.9779499173164368, + "step": 4740 + }, + { + "epoch": 1.459561369757599, + "grad_norm": 6.8125, + "learning_rate": 1.7191399359083642e-06, + "loss": 1.6622265577316284, + "step": 4742 + }, + { + "epoch": 1.4601769911504425, + "grad_norm": 9.5625, + "learning_rate": 1.717609368348162e-06, + "loss": 1.4094470739364624, + "step": 4744 + }, + { + "epoch": 1.460792612543286, + "grad_norm": 7.8125, + "learning_rate": 1.716080075128278e-06, + "loss": 1.5001131296157837, + "step": 4746 + }, + { + "epoch": 1.4614082339361292, + "grad_norm": 12.1875, + "learning_rate": 1.7145520577684015e-06, + "loss": 0.9509897232055664, + "step": 4748 + }, + { + "epoch": 1.4620238553289726, + "grad_norm": 11.6875, + "learning_rate": 1.7130253177869607e-06, + "loss": 1.2298247814178467, + "step": 4750 + }, + { + "epoch": 1.462639476721816, + "grad_norm": 9.0, + "learning_rate": 1.7114998567011105e-06, + "loss": 1.5896788835525513, + "step": 4752 + }, + { + "epoch": 1.4632550981146595, + "grad_norm": 12.5625, + "learning_rate": 1.7099756760267345e-06, + "loss": 1.5959491729736328, + "step": 4754 + }, + { + "epoch": 1.463870719507503, + "grad_norm": 8.75, + "learning_rate": 1.7084527772784466e-06, + "loss": 1.5655696392059326, + "step": 4756 + }, + { + "epoch": 1.4644863409003464, + "grad_norm": 5.40625, + "learning_rate": 1.7069311619695852e-06, + "loss": 1.3257312774658203, + "step": 4758 + }, + { + "epoch": 1.4651019622931898, + "grad_norm": 9.0, + "learning_rate": 1.7054108316122136e-06, + "loss": 1.3359758853912354, + "step": 4760 + }, + { + "epoch": 1.465717583686033, + "grad_norm": 4.90625, + "learning_rate": 1.7038917877171179e-06, + "loss": 1.2051633596420288, + "step": 4762 + }, + { + "epoch": 1.4663332050788764, + "grad_norm": 2.984375, + "learning_rate": 1.7023740317938053e-06, + "loss": 0.8368760347366333, + "step": 4764 + }, + { + "epoch": 1.4669488264717199, + "grad_norm": 6.0625, + "learning_rate": 1.700857565350505e-06, + "loss": 0.9664328694343567, + "step": 4766 + }, + { + "epoch": 1.4675644478645633, + "grad_norm": 5.53125, + "learning_rate": 1.6993423898941632e-06, + "loss": 1.2796266078948975, + "step": 4768 + }, + { + "epoch": 1.4681800692574067, + "grad_norm": 11.4375, + "learning_rate": 1.6978285069304444e-06, + "loss": 1.601299524307251, + "step": 4770 + }, + { + "epoch": 1.4687956906502502, + "grad_norm": 15.875, + "learning_rate": 1.6963159179637274e-06, + "loss": 1.381443738937378, + "step": 4772 + }, + { + "epoch": 1.4694113120430936, + "grad_norm": 7.75, + "learning_rate": 1.6948046244971062e-06, + "loss": 1.4764504432678223, + "step": 4774 + }, + { + "epoch": 1.4700269334359368, + "grad_norm": 6.1875, + "learning_rate": 1.6932946280323865e-06, + "loss": 1.3972898721694946, + "step": 4776 + }, + { + "epoch": 1.4706425548287803, + "grad_norm": 2.390625, + "learning_rate": 1.6917859300700848e-06, + "loss": 1.0846296548843384, + "step": 4778 + }, + { + "epoch": 1.4712581762216237, + "grad_norm": 6.59375, + "learning_rate": 1.69027853210943e-06, + "loss": 1.0266474485397339, + "step": 4780 + }, + { + "epoch": 1.4718737976144671, + "grad_norm": 6.09375, + "learning_rate": 1.6887724356483564e-06, + "loss": 1.5290355682373047, + "step": 4782 + }, + { + "epoch": 1.4724894190073106, + "grad_norm": 5.6875, + "learning_rate": 1.6872676421835055e-06, + "loss": 1.6265225410461426, + "step": 4784 + }, + { + "epoch": 1.4731050404001538, + "grad_norm": 8.8125, + "learning_rate": 1.6857641532102254e-06, + "loss": 1.7117468118667603, + "step": 4786 + }, + { + "epoch": 1.4737206617929974, + "grad_norm": 8.125, + "learning_rate": 1.6842619702225643e-06, + "loss": 1.8459373712539673, + "step": 4788 + }, + { + "epoch": 1.4743362831858406, + "grad_norm": 3.90625, + "learning_rate": 1.682761094713278e-06, + "loss": 1.2410180568695068, + "step": 4790 + }, + { + "epoch": 1.474951904578684, + "grad_norm": 5.625, + "learning_rate": 1.6812615281738178e-06, + "loss": 1.2673044204711914, + "step": 4792 + }, + { + "epoch": 1.4755675259715275, + "grad_norm": 8.5625, + "learning_rate": 1.6797632720943385e-06, + "loss": 1.2248817682266235, + "step": 4794 + }, + { + "epoch": 1.476183147364371, + "grad_norm": 6.6875, + "learning_rate": 1.6782663279636902e-06, + "loss": 1.089085340499878, + "step": 4796 + }, + { + "epoch": 1.4767987687572144, + "grad_norm": 4.4375, + "learning_rate": 1.6767706972694192e-06, + "loss": 1.323860764503479, + "step": 4798 + }, + { + "epoch": 1.4774143901500576, + "grad_norm": 13.0, + "learning_rate": 1.6752763814977679e-06, + "loss": 1.0628877878189087, + "step": 4800 + }, + { + "epoch": 1.478030011542901, + "grad_norm": 6.4375, + "learning_rate": 1.67378338213367e-06, + "loss": 1.3006656169891357, + "step": 4802 + }, + { + "epoch": 1.4786456329357445, + "grad_norm": 6.5, + "learning_rate": 1.6722917006607548e-06, + "loss": 1.4505863189697266, + "step": 4804 + }, + { + "epoch": 1.479261254328588, + "grad_norm": 5.0, + "learning_rate": 1.6708013385613378e-06, + "loss": 1.3453786373138428, + "step": 4806 + }, + { + "epoch": 1.4798768757214313, + "grad_norm": 8.0625, + "learning_rate": 1.6693122973164255e-06, + "loss": 0.9350771903991699, + "step": 4808 + }, + { + "epoch": 1.4804924971142748, + "grad_norm": 12.375, + "learning_rate": 1.6678245784057124e-06, + "loss": 1.6242949962615967, + "step": 4810 + }, + { + "epoch": 1.4811081185071182, + "grad_norm": 4.21875, + "learning_rate": 1.666338183307577e-06, + "loss": 1.2193092107772827, + "step": 4812 + }, + { + "epoch": 1.4817237398999614, + "grad_norm": 5.96875, + "learning_rate": 1.6648531134990845e-06, + "loss": 1.388013482093811, + "step": 4814 + }, + { + "epoch": 1.4823393612928049, + "grad_norm": 6.15625, + "learning_rate": 1.6633693704559816e-06, + "loss": 1.156988263130188, + "step": 4816 + }, + { + "epoch": 1.4829549826856483, + "grad_norm": 6.28125, + "learning_rate": 1.6618869556526962e-06, + "loss": 1.3346011638641357, + "step": 4818 + }, + { + "epoch": 1.4835706040784917, + "grad_norm": 5.96875, + "learning_rate": 1.6604058705623383e-06, + "loss": 1.282492756843567, + "step": 4820 + }, + { + "epoch": 1.4841862254713352, + "grad_norm": 3.125, + "learning_rate": 1.6589261166566945e-06, + "loss": 1.2600009441375732, + "step": 4822 + }, + { + "epoch": 1.4848018468641786, + "grad_norm": 6.46875, + "learning_rate": 1.6574476954062312e-06, + "loss": 1.3078112602233887, + "step": 4824 + }, + { + "epoch": 1.485417468257022, + "grad_norm": 6.78125, + "learning_rate": 1.6559706082800859e-06, + "loss": 1.066993236541748, + "step": 4826 + }, + { + "epoch": 1.4860330896498652, + "grad_norm": 4.375, + "learning_rate": 1.6544948567460755e-06, + "loss": 1.383507490158081, + "step": 4828 + }, + { + "epoch": 1.4866487110427087, + "grad_norm": 2.328125, + "learning_rate": 1.6530204422706867e-06, + "loss": 1.0866873264312744, + "step": 4830 + }, + { + "epoch": 1.4872643324355521, + "grad_norm": 4.0625, + "learning_rate": 1.6515473663190774e-06, + "loss": 1.2688915729522705, + "step": 4832 + }, + { + "epoch": 1.4878799538283956, + "grad_norm": 8.625, + "learning_rate": 1.6500756303550775e-06, + "loss": 1.510236144065857, + "step": 4834 + }, + { + "epoch": 1.488495575221239, + "grad_norm": 2.28125, + "learning_rate": 1.6486052358411831e-06, + "loss": 1.077452301979065, + "step": 4836 + }, + { + "epoch": 1.4891111966140824, + "grad_norm": 7.5625, + "learning_rate": 1.6471361842385586e-06, + "loss": 1.2230280637741089, + "step": 4838 + }, + { + "epoch": 1.4897268180069259, + "grad_norm": 9.5625, + "learning_rate": 1.6456684770070336e-06, + "loss": 1.319644570350647, + "step": 4840 + }, + { + "epoch": 1.490342439399769, + "grad_norm": 15.0625, + "learning_rate": 1.6442021156051009e-06, + "loss": 1.1388092041015625, + "step": 4842 + }, + { + "epoch": 1.4909580607926125, + "grad_norm": 4.28125, + "learning_rate": 1.6427371014899175e-06, + "loss": 0.6261699795722961, + "step": 4844 + }, + { + "epoch": 1.491573682185456, + "grad_norm": 31.5, + "learning_rate": 1.6412734361173e-06, + "loss": 0.9949028491973877, + "step": 4846 + }, + { + "epoch": 1.4921893035782994, + "grad_norm": 8.875, + "learning_rate": 1.6398111209417266e-06, + "loss": 1.4906691312789917, + "step": 4848 + }, + { + "epoch": 1.4928049249711428, + "grad_norm": 8.0, + "learning_rate": 1.638350157416333e-06, + "loss": 1.877355694770813, + "step": 4850 + }, + { + "epoch": 1.493420546363986, + "grad_norm": 14.125, + "learning_rate": 1.6368905469929091e-06, + "loss": 1.703723669052124, + "step": 4852 + }, + { + "epoch": 1.4940361677568297, + "grad_norm": 7.03125, + "learning_rate": 1.6354322911219045e-06, + "loss": 1.234142780303955, + "step": 4854 + }, + { + "epoch": 1.494651789149673, + "grad_norm": 11.0625, + "learning_rate": 1.6339753912524196e-06, + "loss": 1.5459946393966675, + "step": 4856 + }, + { + "epoch": 1.4952674105425163, + "grad_norm": 12.0, + "learning_rate": 1.6325198488322095e-06, + "loss": 1.1419613361358643, + "step": 4858 + }, + { + "epoch": 1.4958830319353598, + "grad_norm": 5.25, + "learning_rate": 1.631065665307679e-06, + "loss": 1.4642760753631592, + "step": 4860 + }, + { + "epoch": 1.4964986533282032, + "grad_norm": 2.78125, + "learning_rate": 1.6296128421238822e-06, + "loss": 1.274398684501648, + "step": 4862 + }, + { + "epoch": 1.4971142747210466, + "grad_norm": 4.21875, + "learning_rate": 1.6281613807245228e-06, + "loss": 1.0630048513412476, + "step": 4864 + }, + { + "epoch": 1.4977298961138898, + "grad_norm": 27.875, + "learning_rate": 1.6267112825519498e-06, + "loss": 1.2358163595199585, + "step": 4866 + }, + { + "epoch": 1.4983455175067333, + "grad_norm": 5.4375, + "learning_rate": 1.6252625490471591e-06, + "loss": 1.0745879411697388, + "step": 4868 + }, + { + "epoch": 1.4989611388995767, + "grad_norm": 6.84375, + "learning_rate": 1.6238151816497896e-06, + "loss": 1.559770107269287, + "step": 4870 + }, + { + "epoch": 1.4995767602924202, + "grad_norm": 2.15625, + "learning_rate": 1.622369181798122e-06, + "loss": 0.9814578890800476, + "step": 4872 + }, + { + "epoch": 1.5001923816852636, + "grad_norm": 4.96875, + "learning_rate": 1.6209245509290794e-06, + "loss": 1.20391845703125, + "step": 4874 + }, + { + "epoch": 1.5008080030781068, + "grad_norm": 4.96875, + "learning_rate": 1.6194812904782236e-06, + "loss": 1.2740033864974976, + "step": 4876 + }, + { + "epoch": 1.5014236244709505, + "grad_norm": 9.125, + "learning_rate": 1.6180394018797552e-06, + "loss": 1.7918224334716797, + "step": 4878 + }, + { + "epoch": 1.5020392458637937, + "grad_norm": 3.046875, + "learning_rate": 1.61659888656651e-06, + "loss": 1.2199079990386963, + "step": 4880 + }, + { + "epoch": 1.5026548672566373, + "grad_norm": 20.75, + "learning_rate": 1.6151597459699622e-06, + "loss": 0.8411628007888794, + "step": 4882 + }, + { + "epoch": 1.5032704886494805, + "grad_norm": 6.6875, + "learning_rate": 1.613721981520217e-06, + "loss": 1.7471990585327148, + "step": 4884 + }, + { + "epoch": 1.503886110042324, + "grad_norm": 7.09375, + "learning_rate": 1.6122855946460128e-06, + "loss": 1.0529389381408691, + "step": 4886 + }, + { + "epoch": 1.5045017314351674, + "grad_norm": 5.4375, + "learning_rate": 1.6108505867747215e-06, + "loss": 1.3521735668182373, + "step": 4888 + }, + { + "epoch": 1.5051173528280106, + "grad_norm": 8.8125, + "learning_rate": 1.6094169593323395e-06, + "loss": 1.0746639966964722, + "step": 4890 + }, + { + "epoch": 1.5057329742208543, + "grad_norm": 16.125, + "learning_rate": 1.6079847137434967e-06, + "loss": 1.5944452285766602, + "step": 4892 + }, + { + "epoch": 1.5063485956136975, + "grad_norm": 5.78125, + "learning_rate": 1.6065538514314472e-06, + "loss": 1.073503017425537, + "step": 4894 + }, + { + "epoch": 1.506964217006541, + "grad_norm": 81.0, + "learning_rate": 1.60512437381807e-06, + "loss": 0.8253582119941711, + "step": 4896 + }, + { + "epoch": 1.5075798383993844, + "grad_norm": 3.984375, + "learning_rate": 1.6036962823238703e-06, + "loss": 0.8447735905647278, + "step": 4898 + }, + { + "epoch": 1.5081954597922278, + "grad_norm": 4.78125, + "learning_rate": 1.6022695783679736e-06, + "loss": 1.2586779594421387, + "step": 4900 + }, + { + "epoch": 1.5088110811850712, + "grad_norm": 10.375, + "learning_rate": 1.6008442633681298e-06, + "loss": 1.3773995637893677, + "step": 4902 + }, + { + "epoch": 1.5094267025779144, + "grad_norm": 3.046875, + "learning_rate": 1.5994203387407036e-06, + "loss": 1.2577307224273682, + "step": 4904 + }, + { + "epoch": 1.510042323970758, + "grad_norm": 8.1875, + "learning_rate": 1.5979978059006819e-06, + "loss": 1.2247735261917114, + "step": 4906 + }, + { + "epoch": 1.5106579453636013, + "grad_norm": 4.90625, + "learning_rate": 1.5965766662616677e-06, + "loss": 1.536833643913269, + "step": 4908 + }, + { + "epoch": 1.5112735667564448, + "grad_norm": 10.8125, + "learning_rate": 1.5951569212358787e-06, + "loss": 0.6794060468673706, + "step": 4910 + }, + { + "epoch": 1.5118891881492882, + "grad_norm": 17.625, + "learning_rate": 1.5937385722341481e-06, + "loss": 0.9972232580184937, + "step": 4912 + }, + { + "epoch": 1.5125048095421316, + "grad_norm": 2.828125, + "learning_rate": 1.5923216206659213e-06, + "loss": 1.1143596172332764, + "step": 4914 + }, + { + "epoch": 1.513120430934975, + "grad_norm": 3.421875, + "learning_rate": 1.590906067939254e-06, + "loss": 1.128780722618103, + "step": 4916 + }, + { + "epoch": 1.5137360523278183, + "grad_norm": 2.5625, + "learning_rate": 1.589491915460813e-06, + "loss": 1.1181021928787231, + "step": 4918 + }, + { + "epoch": 1.514351673720662, + "grad_norm": 4.875, + "learning_rate": 1.5880791646358728e-06, + "loss": 1.110456943511963, + "step": 4920 + }, + { + "epoch": 1.5149672951135051, + "grad_norm": 5.90625, + "learning_rate": 1.5866678168683167e-06, + "loss": 1.3714308738708496, + "step": 4922 + }, + { + "epoch": 1.5155829165063486, + "grad_norm": 39.5, + "learning_rate": 1.5852578735606317e-06, + "loss": 1.3347901105880737, + "step": 4924 + }, + { + "epoch": 1.516198537899192, + "grad_norm": 6.03125, + "learning_rate": 1.58384933611391e-06, + "loss": 1.6708590984344482, + "step": 4926 + }, + { + "epoch": 1.5168141592920354, + "grad_norm": 7.5, + "learning_rate": 1.5824422059278486e-06, + "loss": 1.5403391122817993, + "step": 4928 + }, + { + "epoch": 1.5174297806848789, + "grad_norm": 22.25, + "learning_rate": 1.5810364844007414e-06, + "loss": 1.3507730960845947, + "step": 4930 + }, + { + "epoch": 1.518045402077722, + "grad_norm": 8.3125, + "learning_rate": 1.5796321729294875e-06, + "loss": 1.4804326295852661, + "step": 4932 + }, + { + "epoch": 1.5186610234705658, + "grad_norm": 13.5625, + "learning_rate": 1.5782292729095815e-06, + "loss": 1.4317504167556763, + "step": 4934 + }, + { + "epoch": 1.519276644863409, + "grad_norm": 7.65625, + "learning_rate": 1.576827785735118e-06, + "loss": 1.2551989555358887, + "step": 4936 + }, + { + "epoch": 1.5198922662562524, + "grad_norm": 5.84375, + "learning_rate": 1.5754277127987852e-06, + "loss": 1.3449933528900146, + "step": 4938 + }, + { + "epoch": 1.5205078876490958, + "grad_norm": 11.6875, + "learning_rate": 1.5740290554918675e-06, + "loss": 1.3214181661605835, + "step": 4940 + }, + { + "epoch": 1.521123509041939, + "grad_norm": 7.34375, + "learning_rate": 1.5726318152042413e-06, + "loss": 1.3067471981048584, + "step": 4942 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 5.53125, + "learning_rate": 1.5712359933243754e-06, + "loss": 1.39982008934021, + "step": 4944 + }, + { + "epoch": 1.522354751827626, + "grad_norm": 8.6875, + "learning_rate": 1.5698415912393306e-06, + "loss": 1.399191975593567, + "step": 4946 + }, + { + "epoch": 1.5229703732204696, + "grad_norm": 3.203125, + "learning_rate": 1.5684486103347549e-06, + "loss": 1.3007683753967285, + "step": 4948 + }, + { + "epoch": 1.5235859946133128, + "grad_norm": 5.40625, + "learning_rate": 1.5670570519948836e-06, + "loss": 1.1364392042160034, + "step": 4950 + }, + { + "epoch": 1.5242016160061562, + "grad_norm": 21.875, + "learning_rate": 1.565666917602541e-06, + "loss": 1.522515892982483, + "step": 4952 + }, + { + "epoch": 1.5248172373989997, + "grad_norm": 5.1875, + "learning_rate": 1.5642782085391345e-06, + "loss": 1.0475881099700928, + "step": 4954 + }, + { + "epoch": 1.5254328587918429, + "grad_norm": 6.3125, + "learning_rate": 1.5628909261846547e-06, + "loss": 1.2147290706634521, + "step": 4956 + }, + { + "epoch": 1.5260484801846865, + "grad_norm": 7.71875, + "learning_rate": 1.5615050719176758e-06, + "loss": 1.2593631744384766, + "step": 4958 + }, + { + "epoch": 1.5266641015775297, + "grad_norm": 6.21875, + "learning_rate": 1.560120647115351e-06, + "loss": 1.6543095111846924, + "step": 4960 + }, + { + "epoch": 1.5272797229703732, + "grad_norm": 5.375, + "learning_rate": 1.5587376531534162e-06, + "loss": 1.0033334493637085, + "step": 4962 + }, + { + "epoch": 1.5278953443632166, + "grad_norm": 11.3125, + "learning_rate": 1.557356091406182e-06, + "loss": 1.2959232330322266, + "step": 4964 + }, + { + "epoch": 1.52851096575606, + "grad_norm": 5.90625, + "learning_rate": 1.5559759632465388e-06, + "loss": 1.2834677696228027, + "step": 4966 + }, + { + "epoch": 1.5291265871489035, + "grad_norm": 5.875, + "learning_rate": 1.554597270045949e-06, + "loss": 1.1852271556854248, + "step": 4968 + }, + { + "epoch": 1.5297422085417467, + "grad_norm": 11.5625, + "learning_rate": 1.553220013174452e-06, + "loss": 1.0119701623916626, + "step": 4970 + }, + { + "epoch": 1.5303578299345904, + "grad_norm": 9.0625, + "learning_rate": 1.551844194000659e-06, + "loss": 1.247909665107727, + "step": 4972 + }, + { + "epoch": 1.5309734513274336, + "grad_norm": 1.7890625, + "learning_rate": 1.5504698138917515e-06, + "loss": 1.0428611040115356, + "step": 4974 + }, + { + "epoch": 1.531589072720277, + "grad_norm": 5.28125, + "learning_rate": 1.5490968742134826e-06, + "loss": 1.3514962196350098, + "step": 4976 + }, + { + "epoch": 1.5322046941131204, + "grad_norm": 7.46875, + "learning_rate": 1.5477253763301734e-06, + "loss": 1.2463090419769287, + "step": 4978 + }, + { + "epoch": 1.5328203155059639, + "grad_norm": 7.25, + "learning_rate": 1.5463553216047114e-06, + "loss": 1.3994954824447632, + "step": 4980 + }, + { + "epoch": 1.5334359368988073, + "grad_norm": 10.0, + "learning_rate": 1.5449867113985512e-06, + "loss": 1.6348204612731934, + "step": 4982 + }, + { + "epoch": 1.5340515582916505, + "grad_norm": 5.25, + "learning_rate": 1.5436195470717104e-06, + "loss": 1.3202322721481323, + "step": 4984 + }, + { + "epoch": 1.5346671796844942, + "grad_norm": 8.5, + "learning_rate": 1.5422538299827725e-06, + "loss": 1.6668038368225098, + "step": 4986 + }, + { + "epoch": 1.5352828010773374, + "grad_norm": 16.75, + "learning_rate": 1.5408895614888798e-06, + "loss": 1.4328434467315674, + "step": 4988 + }, + { + "epoch": 1.5358984224701808, + "grad_norm": 8.625, + "learning_rate": 1.5395267429457371e-06, + "loss": 1.0568163394927979, + "step": 4990 + }, + { + "epoch": 1.5365140438630243, + "grad_norm": 2.984375, + "learning_rate": 1.5381653757076082e-06, + "loss": 1.1498777866363525, + "step": 4992 + }, + { + "epoch": 1.5371296652558677, + "grad_norm": 4.15625, + "learning_rate": 1.5368054611273133e-06, + "loss": 1.2708563804626465, + "step": 4994 + }, + { + "epoch": 1.5377452866487111, + "grad_norm": 9.0, + "learning_rate": 1.5354470005562306e-06, + "loss": 1.5392816066741943, + "step": 4996 + }, + { + "epoch": 1.5383609080415543, + "grad_norm": 6.15625, + "learning_rate": 1.5340899953442923e-06, + "loss": 1.0745151042938232, + "step": 4998 + }, + { + "epoch": 1.538976529434398, + "grad_norm": 13.4375, + "learning_rate": 1.5327344468399852e-06, + "loss": 1.365796685218811, + "step": 5000 + }, + { + "epoch": 1.5395921508272412, + "grad_norm": 8.25, + "learning_rate": 1.5313803563903485e-06, + "loss": 1.3695905208587646, + "step": 5002 + }, + { + "epoch": 1.5402077722200846, + "grad_norm": 4.65625, + "learning_rate": 1.5300277253409715e-06, + "loss": 0.8892248272895813, + "step": 5004 + }, + { + "epoch": 1.540823393612928, + "grad_norm": 10.3125, + "learning_rate": 1.5286765550359958e-06, + "loss": 1.1660937070846558, + "step": 5006 + }, + { + "epoch": 1.5414390150057713, + "grad_norm": 2.1875, + "learning_rate": 1.527326846818107e-06, + "loss": 1.207476258277893, + "step": 5008 + }, + { + "epoch": 1.542054636398615, + "grad_norm": 5.96875, + "learning_rate": 1.525978602028542e-06, + "loss": 1.2643166780471802, + "step": 5010 + }, + { + "epoch": 1.5426702577914582, + "grad_norm": 5.9375, + "learning_rate": 1.5246318220070818e-06, + "loss": 1.335673451423645, + "step": 5012 + }, + { + "epoch": 1.5432858791843016, + "grad_norm": 8.875, + "learning_rate": 1.5232865080920512e-06, + "loss": 1.5163029432296753, + "step": 5014 + }, + { + "epoch": 1.543901500577145, + "grad_norm": 5.15625, + "learning_rate": 1.5219426616203198e-06, + "loss": 1.4841883182525635, + "step": 5016 + }, + { + "epoch": 1.5445171219699885, + "grad_norm": 8.5625, + "learning_rate": 1.5206002839272973e-06, + "loss": 1.6200695037841797, + "step": 5018 + }, + { + "epoch": 1.545132743362832, + "grad_norm": 5.21875, + "learning_rate": 1.5192593763469346e-06, + "loss": 1.3461923599243164, + "step": 5020 + }, + { + "epoch": 1.5457483647556751, + "grad_norm": 5.78125, + "learning_rate": 1.5179199402117214e-06, + "loss": 1.234081745147705, + "step": 5022 + }, + { + "epoch": 1.5463639861485188, + "grad_norm": 8.9375, + "learning_rate": 1.516581976852686e-06, + "loss": 1.3456507921218872, + "step": 5024 + }, + { + "epoch": 1.546979607541362, + "grad_norm": 7.53125, + "learning_rate": 1.5152454875993921e-06, + "loss": 1.0912538766860962, + "step": 5026 + }, + { + "epoch": 1.5475952289342054, + "grad_norm": 10.1875, + "learning_rate": 1.513910473779939e-06, + "loss": 1.7042229175567627, + "step": 5028 + }, + { + "epoch": 1.5482108503270489, + "grad_norm": 7.625, + "learning_rate": 1.5125769367209603e-06, + "loss": 1.3107324838638306, + "step": 5030 + }, + { + "epoch": 1.5488264717198923, + "grad_norm": 5.28125, + "learning_rate": 1.5112448777476216e-06, + "loss": 1.7281067371368408, + "step": 5032 + }, + { + "epoch": 1.5494420931127357, + "grad_norm": 6.75, + "learning_rate": 1.5099142981836193e-06, + "loss": 1.3627159595489502, + "step": 5034 + }, + { + "epoch": 1.550057714505579, + "grad_norm": 9.75, + "learning_rate": 1.5085851993511807e-06, + "loss": 1.335439682006836, + "step": 5036 + }, + { + "epoch": 1.5506733358984226, + "grad_norm": 6.9375, + "learning_rate": 1.5072575825710601e-06, + "loss": 1.2400782108306885, + "step": 5038 + }, + { + "epoch": 1.5512889572912658, + "grad_norm": 6.46875, + "learning_rate": 1.5059314491625413e-06, + "loss": 1.5848368406295776, + "step": 5040 + }, + { + "epoch": 1.5519045786841092, + "grad_norm": 8.0, + "learning_rate": 1.5046068004434318e-06, + "loss": 1.104585886001587, + "step": 5042 + }, + { + "epoch": 1.5525202000769527, + "grad_norm": 8.5, + "learning_rate": 1.5032836377300663e-06, + "loss": 1.2151025533676147, + "step": 5044 + }, + { + "epoch": 1.5531358214697961, + "grad_norm": 18.25, + "learning_rate": 1.501961962337299e-06, + "loss": 1.3426923751831055, + "step": 5046 + }, + { + "epoch": 1.5537514428626396, + "grad_norm": 8.3125, + "learning_rate": 1.5006417755785096e-06, + "loss": 1.0586085319519043, + "step": 5048 + }, + { + "epoch": 1.5543670642554828, + "grad_norm": 7.96875, + "learning_rate": 1.4993230787655978e-06, + "loss": 1.2460078001022339, + "step": 5050 + }, + { + "epoch": 1.5549826856483264, + "grad_norm": 15.125, + "learning_rate": 1.4980058732089807e-06, + "loss": 1.0223263502120972, + "step": 5052 + }, + { + "epoch": 1.5555983070411696, + "grad_norm": 16.75, + "learning_rate": 1.4966901602175965e-06, + "loss": 1.5279709100723267, + "step": 5054 + }, + { + "epoch": 1.556213928434013, + "grad_norm": 10.5625, + "learning_rate": 1.495375941098898e-06, + "loss": 1.2279974222183228, + "step": 5056 + }, + { + "epoch": 1.5568295498268565, + "grad_norm": 4.84375, + "learning_rate": 1.4940632171588544e-06, + "loss": 1.3564412593841553, + "step": 5058 + }, + { + "epoch": 1.5574451712196997, + "grad_norm": 7.875, + "learning_rate": 1.4927519897019482e-06, + "loss": 1.4334050416946411, + "step": 5060 + }, + { + "epoch": 1.5580607926125434, + "grad_norm": 39.75, + "learning_rate": 1.491442260031176e-06, + "loss": 1.7179346084594727, + "step": 5062 + }, + { + "epoch": 1.5586764140053866, + "grad_norm": 15.3125, + "learning_rate": 1.490134029448046e-06, + "loss": 1.6199084520339966, + "step": 5064 + }, + { + "epoch": 1.5592920353982302, + "grad_norm": 2.515625, + "learning_rate": 1.4888272992525758e-06, + "loss": 1.2121710777282715, + "step": 5066 + }, + { + "epoch": 1.5599076567910735, + "grad_norm": 5.0, + "learning_rate": 1.487522070743292e-06, + "loss": 1.2717112302780151, + "step": 5068 + }, + { + "epoch": 1.560523278183917, + "grad_norm": 3.296875, + "learning_rate": 1.486218345217231e-06, + "loss": 1.2327752113342285, + "step": 5070 + }, + { + "epoch": 1.5611388995767603, + "grad_norm": 9.3125, + "learning_rate": 1.484916123969932e-06, + "loss": 1.564252257347107, + "step": 5072 + }, + { + "epoch": 1.5617545209696035, + "grad_norm": 2.953125, + "learning_rate": 1.4836154082954428e-06, + "loss": 1.0845710039138794, + "step": 5074 + }, + { + "epoch": 1.5623701423624472, + "grad_norm": 2.546875, + "learning_rate": 1.4823161994863134e-06, + "loss": 1.1228383779525757, + "step": 5076 + }, + { + "epoch": 1.5629857637552904, + "grad_norm": 19.625, + "learning_rate": 1.4810184988335965e-06, + "loss": 1.4887943267822266, + "step": 5078 + }, + { + "epoch": 1.5636013851481338, + "grad_norm": 9.625, + "learning_rate": 1.479722307626847e-06, + "loss": 1.802656888961792, + "step": 5080 + }, + { + "epoch": 1.5642170065409773, + "grad_norm": 32.5, + "learning_rate": 1.4784276271541188e-06, + "loss": 1.276349663734436, + "step": 5082 + }, + { + "epoch": 1.5648326279338207, + "grad_norm": 5.15625, + "learning_rate": 1.4771344587019644e-06, + "loss": 1.297951579093933, + "step": 5084 + }, + { + "epoch": 1.5654482493266642, + "grad_norm": 6.1875, + "learning_rate": 1.4758428035554345e-06, + "loss": 1.5146312713623047, + "step": 5086 + }, + { + "epoch": 1.5660638707195074, + "grad_norm": 3.59375, + "learning_rate": 1.4745526629980766e-06, + "loss": 1.4900462627410889, + "step": 5088 + }, + { + "epoch": 1.566679492112351, + "grad_norm": 10.75, + "learning_rate": 1.4732640383119312e-06, + "loss": 1.4104520082473755, + "step": 5090 + }, + { + "epoch": 1.5672951135051942, + "grad_norm": 8.5, + "learning_rate": 1.4719769307775337e-06, + "loss": 1.2928396463394165, + "step": 5092 + }, + { + "epoch": 1.5679107348980377, + "grad_norm": 2.40625, + "learning_rate": 1.4706913416739123e-06, + "loss": 0.8257898092269897, + "step": 5094 + }, + { + "epoch": 1.568526356290881, + "grad_norm": 9.875, + "learning_rate": 1.4694072722785857e-06, + "loss": 1.2909157276153564, + "step": 5096 + }, + { + "epoch": 1.5691419776837245, + "grad_norm": 4.15625, + "learning_rate": 1.4681247238675622e-06, + "loss": 1.4911627769470215, + "step": 5098 + }, + { + "epoch": 1.569757599076568, + "grad_norm": 5.9375, + "learning_rate": 1.4668436977153383e-06, + "loss": 1.3358662128448486, + "step": 5100 + }, + { + "epoch": 1.5703732204694112, + "grad_norm": 1.921875, + "learning_rate": 1.4655641950948993e-06, + "loss": 0.9619826078414917, + "step": 5102 + }, + { + "epoch": 1.5709888418622548, + "grad_norm": 9.25, + "learning_rate": 1.4642862172777154e-06, + "loss": 1.237134337425232, + "step": 5104 + }, + { + "epoch": 1.571604463255098, + "grad_norm": 5.4375, + "learning_rate": 1.463009765533741e-06, + "loss": 1.2212834358215332, + "step": 5106 + }, + { + "epoch": 1.5722200846479415, + "grad_norm": 6.90625, + "learning_rate": 1.4617348411314167e-06, + "loss": 1.2365351915359497, + "step": 5108 + }, + { + "epoch": 1.572835706040785, + "grad_norm": 10.25, + "learning_rate": 1.4604614453376613e-06, + "loss": 1.1786137819290161, + "step": 5110 + }, + { + "epoch": 1.5734513274336284, + "grad_norm": 15.375, + "learning_rate": 1.459189579417878e-06, + "loss": 1.056581735610962, + "step": 5112 + }, + { + "epoch": 1.5740669488264718, + "grad_norm": 5.03125, + "learning_rate": 1.4579192446359483e-06, + "loss": 1.3243074417114258, + "step": 5114 + }, + { + "epoch": 1.574682570219315, + "grad_norm": 10.375, + "learning_rate": 1.4566504422542316e-06, + "loss": 1.2739485502243042, + "step": 5116 + }, + { + "epoch": 1.5752981916121587, + "grad_norm": 5.0, + "learning_rate": 1.4553831735335667e-06, + "loss": 1.2523506879806519, + "step": 5118 + }, + { + "epoch": 1.5759138130050019, + "grad_norm": 5.5, + "learning_rate": 1.4541174397332659e-06, + "loss": 1.0985594987869263, + "step": 5120 + }, + { + "epoch": 1.5765294343978453, + "grad_norm": 5.9375, + "learning_rate": 1.4528532421111175e-06, + "loss": 1.0287668704986572, + "step": 5122 + }, + { + "epoch": 1.5771450557906888, + "grad_norm": 8.75, + "learning_rate": 1.451590581923383e-06, + "loss": 1.2971521615982056, + "step": 5124 + }, + { + "epoch": 1.577760677183532, + "grad_norm": 4.65625, + "learning_rate": 1.4503294604247953e-06, + "loss": 0.7459732890129089, + "step": 5126 + }, + { + "epoch": 1.5783762985763756, + "grad_norm": 139.0, + "learning_rate": 1.449069878868561e-06, + "loss": 1.7319278717041016, + "step": 5128 + }, + { + "epoch": 1.5789919199692188, + "grad_norm": 7.125, + "learning_rate": 1.4478118385063526e-06, + "loss": 1.4584087133407593, + "step": 5130 + }, + { + "epoch": 1.5796075413620625, + "grad_norm": 42.75, + "learning_rate": 1.4465553405883146e-06, + "loss": 1.3035287857055664, + "step": 5132 + }, + { + "epoch": 1.5802231627549057, + "grad_norm": 17.625, + "learning_rate": 1.4453003863630564e-06, + "loss": 1.0673809051513672, + "step": 5134 + }, + { + "epoch": 1.5808387841477491, + "grad_norm": 6.15625, + "learning_rate": 1.4440469770776538e-06, + "loss": 1.2335408926010132, + "step": 5136 + }, + { + "epoch": 1.5814544055405926, + "grad_norm": 4.0625, + "learning_rate": 1.4427951139776483e-06, + "loss": 1.5894789695739746, + "step": 5138 + }, + { + "epoch": 1.5820700269334358, + "grad_norm": 15.75, + "learning_rate": 1.4415447983070435e-06, + "loss": 1.413337230682373, + "step": 5140 + }, + { + "epoch": 1.5826856483262794, + "grad_norm": 7.5, + "learning_rate": 1.4402960313083072e-06, + "loss": 1.2019569873809814, + "step": 5142 + }, + { + "epoch": 1.5833012697191227, + "grad_norm": 10.25, + "learning_rate": 1.4390488142223668e-06, + "loss": 1.3251562118530273, + "step": 5144 + }, + { + "epoch": 1.583916891111966, + "grad_norm": 9.25, + "learning_rate": 1.437803148288609e-06, + "loss": 1.3022871017456055, + "step": 5146 + }, + { + "epoch": 1.5845325125048095, + "grad_norm": 6.0, + "learning_rate": 1.436559034744882e-06, + "loss": 1.4235812425613403, + "step": 5148 + }, + { + "epoch": 1.585148133897653, + "grad_norm": 8.125, + "learning_rate": 1.4353164748274867e-06, + "loss": 1.2802785634994507, + "step": 5150 + }, + { + "epoch": 1.5857637552904964, + "grad_norm": 10.9375, + "learning_rate": 1.4340754697711848e-06, + "loss": 1.7972906827926636, + "step": 5152 + }, + { + "epoch": 1.5863793766833396, + "grad_norm": 8.6875, + "learning_rate": 1.4328360208091893e-06, + "loss": 1.5309021472930908, + "step": 5154 + }, + { + "epoch": 1.5869949980761833, + "grad_norm": 9.25, + "learning_rate": 1.4315981291731698e-06, + "loss": 1.2634186744689941, + "step": 5156 + }, + { + "epoch": 1.5876106194690265, + "grad_norm": 4.65625, + "learning_rate": 1.4303617960932467e-06, + "loss": 1.1108946800231934, + "step": 5158 + }, + { + "epoch": 1.58822624086187, + "grad_norm": 18.375, + "learning_rate": 1.4291270227979912e-06, + "loss": 1.176788091659546, + "step": 5160 + }, + { + "epoch": 1.5888418622547134, + "grad_norm": 7.6875, + "learning_rate": 1.4278938105144257e-06, + "loss": 1.330367922782898, + "step": 5162 + }, + { + "epoch": 1.5894574836475568, + "grad_norm": 7.40625, + "learning_rate": 1.42666216046802e-06, + "loss": 1.5494861602783203, + "step": 5164 + }, + { + "epoch": 1.5900731050404002, + "grad_norm": 4.5, + "learning_rate": 1.425432073882694e-06, + "loss": 1.1721563339233398, + "step": 5166 + }, + { + "epoch": 1.5906887264332434, + "grad_norm": 14.1875, + "learning_rate": 1.4242035519808113e-06, + "loss": 1.1466803550720215, + "step": 5168 + }, + { + "epoch": 1.591304347826087, + "grad_norm": 7.65625, + "learning_rate": 1.4229765959831813e-06, + "loss": 1.0815727710723877, + "step": 5170 + }, + { + "epoch": 1.5919199692189303, + "grad_norm": 6.875, + "learning_rate": 1.4217512071090587e-06, + "loss": 1.200536847114563, + "step": 5172 + }, + { + "epoch": 1.5925355906117737, + "grad_norm": 5.375, + "learning_rate": 1.4205273865761393e-06, + "loss": 1.2767208814620972, + "step": 5174 + }, + { + "epoch": 1.5931512120046172, + "grad_norm": 12.25, + "learning_rate": 1.4193051356005608e-06, + "loss": 1.1916471719741821, + "step": 5176 + }, + { + "epoch": 1.5937668333974606, + "grad_norm": 6.96875, + "learning_rate": 1.418084455396902e-06, + "loss": 1.357552170753479, + "step": 5178 + }, + { + "epoch": 1.594382454790304, + "grad_norm": 5.15625, + "learning_rate": 1.416865347178179e-06, + "loss": 1.1372745037078857, + "step": 5180 + }, + { + "epoch": 1.5949980761831473, + "grad_norm": 4.3125, + "learning_rate": 1.4156478121558484e-06, + "loss": 1.5234270095825195, + "step": 5182 + }, + { + "epoch": 1.595613697575991, + "grad_norm": 9.25, + "learning_rate": 1.4144318515398012e-06, + "loss": 1.21889328956604, + "step": 5184 + }, + { + "epoch": 1.5962293189688341, + "grad_norm": 5.90625, + "learning_rate": 1.4132174665383658e-06, + "loss": 1.5588219165802002, + "step": 5186 + }, + { + "epoch": 1.5968449403616776, + "grad_norm": 16.125, + "learning_rate": 1.4120046583583019e-06, + "loss": 1.4573081731796265, + "step": 5188 + }, + { + "epoch": 1.597460561754521, + "grad_norm": 5.0, + "learning_rate": 1.4107934282048056e-06, + "loss": 1.3332809209823608, + "step": 5190 + }, + { + "epoch": 1.5980761831473642, + "grad_norm": 4.5, + "learning_rate": 1.4095837772815033e-06, + "loss": 1.2771632671356201, + "step": 5192 + }, + { + "epoch": 1.5986918045402079, + "grad_norm": 12.0625, + "learning_rate": 1.4083757067904513e-06, + "loss": 1.2745952606201172, + "step": 5194 + }, + { + "epoch": 1.599307425933051, + "grad_norm": 1.640625, + "learning_rate": 1.4071692179321378e-06, + "loss": 1.1325695514678955, + "step": 5196 + }, + { + "epoch": 1.5999230473258945, + "grad_norm": 6.3125, + "learning_rate": 1.405964311905477e-06, + "loss": 1.1452646255493164, + "step": 5198 + }, + { + "epoch": 1.600538668718738, + "grad_norm": 8.375, + "learning_rate": 1.4047609899078107e-06, + "loss": 1.3691991567611694, + "step": 5200 + }, + { + "epoch": 1.6011542901115814, + "grad_norm": 21.375, + "learning_rate": 1.4035592531349079e-06, + "loss": 1.2523620128631592, + "step": 5202 + }, + { + "epoch": 1.6017699115044248, + "grad_norm": 5.6875, + "learning_rate": 1.4023591027809601e-06, + "loss": 1.025434136390686, + "step": 5204 + }, + { + "epoch": 1.602385532897268, + "grad_norm": 9.375, + "learning_rate": 1.4011605400385847e-06, + "loss": 1.1921247243881226, + "step": 5206 + }, + { + "epoch": 1.6030011542901117, + "grad_norm": 7.0, + "learning_rate": 1.3999635660988199e-06, + "loss": 1.3524107933044434, + "step": 5208 + }, + { + "epoch": 1.603616775682955, + "grad_norm": 6.1875, + "learning_rate": 1.3987681821511255e-06, + "loss": 1.6408205032348633, + "step": 5210 + }, + { + "epoch": 1.6042323970757983, + "grad_norm": 4.09375, + "learning_rate": 1.3975743893833823e-06, + "loss": 1.2043273448944092, + "step": 5212 + }, + { + "epoch": 1.6048480184686418, + "grad_norm": 11.75, + "learning_rate": 1.3963821889818868e-06, + "loss": 1.4415565729141235, + "step": 5214 + }, + { + "epoch": 1.6054636398614852, + "grad_norm": 8.875, + "learning_rate": 1.3951915821313572e-06, + "loss": 0.6851305365562439, + "step": 5216 + }, + { + "epoch": 1.6060792612543286, + "grad_norm": 5.21875, + "learning_rate": 1.394002570014925e-06, + "loss": 1.123796820640564, + "step": 5218 + }, + { + "epoch": 1.6066948826471719, + "grad_norm": 9.1875, + "learning_rate": 1.3928151538141393e-06, + "loss": 1.3693852424621582, + "step": 5220 + }, + { + "epoch": 1.6073105040400155, + "grad_norm": 6.8125, + "learning_rate": 1.3916293347089618e-06, + "loss": 1.1036242246627808, + "step": 5222 + }, + { + "epoch": 1.6079261254328587, + "grad_norm": 7.625, + "learning_rate": 1.3904451138777666e-06, + "loss": 1.0903420448303223, + "step": 5224 + }, + { + "epoch": 1.6085417468257022, + "grad_norm": 2.5625, + "learning_rate": 1.3892624924973425e-06, + "loss": 1.148220419883728, + "step": 5226 + }, + { + "epoch": 1.6091573682185456, + "grad_norm": 6.625, + "learning_rate": 1.3880814717428844e-06, + "loss": 0.9726516008377075, + "step": 5228 + }, + { + "epoch": 1.609772989611389, + "grad_norm": 6.90625, + "learning_rate": 1.386902052788001e-06, + "loss": 1.5604653358459473, + "step": 5230 + }, + { + "epoch": 1.6103886110042325, + "grad_norm": 11.5625, + "learning_rate": 1.3857242368047065e-06, + "loss": 1.1536065340042114, + "step": 5232 + }, + { + "epoch": 1.6110042323970757, + "grad_norm": 9.6875, + "learning_rate": 1.3845480249634227e-06, + "loss": 2.0095927715301514, + "step": 5234 + }, + { + "epoch": 1.6116198537899193, + "grad_norm": 5.65625, + "learning_rate": 1.383373418432979e-06, + "loss": 1.5020904541015625, + "step": 5236 + }, + { + "epoch": 1.6122354751827626, + "grad_norm": 6.65625, + "learning_rate": 1.382200418380607e-06, + "loss": 1.2842357158660889, + "step": 5238 + }, + { + "epoch": 1.612851096575606, + "grad_norm": 4.5, + "learning_rate": 1.381029025971944e-06, + "loss": 1.2769551277160645, + "step": 5240 + }, + { + "epoch": 1.6134667179684494, + "grad_norm": 25.5, + "learning_rate": 1.3798592423710278e-06, + "loss": 1.1915051937103271, + "step": 5242 + }, + { + "epoch": 1.6140823393612926, + "grad_norm": 10.0625, + "learning_rate": 1.3786910687402998e-06, + "loss": 1.229771614074707, + "step": 5244 + }, + { + "epoch": 1.6146979607541363, + "grad_norm": 7.6875, + "learning_rate": 1.3775245062405996e-06, + "loss": 1.2317193746566772, + "step": 5246 + }, + { + "epoch": 1.6153135821469795, + "grad_norm": 7.40625, + "learning_rate": 1.3763595560311663e-06, + "loss": 1.4626492261886597, + "step": 5248 + }, + { + "epoch": 1.6159292035398232, + "grad_norm": 5.71875, + "learning_rate": 1.3751962192696378e-06, + "loss": 1.6538395881652832, + "step": 5250 + }, + { + "epoch": 1.6165448249326664, + "grad_norm": 7.1875, + "learning_rate": 1.3740344971120478e-06, + "loss": 0.9849225282669067, + "step": 5252 + }, + { + "epoch": 1.6171604463255098, + "grad_norm": 3.90625, + "learning_rate": 1.372874390712825e-06, + "loss": 1.276718020439148, + "step": 5254 + }, + { + "epoch": 1.6177760677183533, + "grad_norm": 10.5625, + "learning_rate": 1.3717159012247938e-06, + "loss": 1.4531131982803345, + "step": 5256 + }, + { + "epoch": 1.6183916891111965, + "grad_norm": 16.375, + "learning_rate": 1.3705590297991705e-06, + "loss": 0.8134285807609558, + "step": 5258 + }, + { + "epoch": 1.6190073105040401, + "grad_norm": 11.5625, + "learning_rate": 1.3694037775855651e-06, + "loss": 1.3254494667053223, + "step": 5260 + }, + { + "epoch": 1.6196229318968833, + "grad_norm": 21.625, + "learning_rate": 1.3682501457319764e-06, + "loss": 1.3920022249221802, + "step": 5262 + }, + { + "epoch": 1.6202385532897268, + "grad_norm": 12.3125, + "learning_rate": 1.3670981353847955e-06, + "loss": 1.470127820968628, + "step": 5264 + }, + { + "epoch": 1.6208541746825702, + "grad_norm": 5.125, + "learning_rate": 1.3659477476888006e-06, + "loss": 1.1842358112335205, + "step": 5266 + }, + { + "epoch": 1.6214697960754136, + "grad_norm": 12.5625, + "learning_rate": 1.3647989837871565e-06, + "loss": 1.2555248737335205, + "step": 5268 + }, + { + "epoch": 1.622085417468257, + "grad_norm": 7.5, + "learning_rate": 1.3636518448214172e-06, + "loss": 1.632139801979065, + "step": 5270 + }, + { + "epoch": 1.6227010388611003, + "grad_norm": 7.59375, + "learning_rate": 1.362506331931519e-06, + "loss": 1.117124080657959, + "step": 5272 + }, + { + "epoch": 1.623316660253944, + "grad_norm": 5.4375, + "learning_rate": 1.3613624462557857e-06, + "loss": 1.1123738288879395, + "step": 5274 + }, + { + "epoch": 1.6239322816467872, + "grad_norm": 12.75, + "learning_rate": 1.3602201889309204e-06, + "loss": 1.6312143802642822, + "step": 5276 + }, + { + "epoch": 1.6245479030396306, + "grad_norm": 8.8125, + "learning_rate": 1.3590795610920108e-06, + "loss": 1.4402046203613281, + "step": 5278 + }, + { + "epoch": 1.625163524432474, + "grad_norm": 6.15625, + "learning_rate": 1.3579405638725238e-06, + "loss": 1.1443254947662354, + "step": 5280 + }, + { + "epoch": 1.6257791458253175, + "grad_norm": 7.09375, + "learning_rate": 1.356803198404306e-06, + "loss": 1.3337159156799316, + "step": 5282 + }, + { + "epoch": 1.626394767218161, + "grad_norm": 4.5, + "learning_rate": 1.355667465817584e-06, + "loss": 1.227888584136963, + "step": 5284 + }, + { + "epoch": 1.627010388611004, + "grad_norm": 6.3125, + "learning_rate": 1.3545333672409605e-06, + "loss": 1.1797523498535156, + "step": 5286 + }, + { + "epoch": 1.6276260100038478, + "grad_norm": 9.1875, + "learning_rate": 1.353400903801414e-06, + "loss": 1.4626578092575073, + "step": 5288 + }, + { + "epoch": 1.628241631396691, + "grad_norm": 11.3125, + "learning_rate": 1.3522700766243e-06, + "loss": 0.43031102418899536, + "step": 5290 + }, + { + "epoch": 1.6288572527895344, + "grad_norm": 4.53125, + "learning_rate": 1.3511408868333453e-06, + "loss": 1.227097511291504, + "step": 5292 + }, + { + "epoch": 1.6294728741823779, + "grad_norm": 2.984375, + "learning_rate": 1.3500133355506523e-06, + "loss": 1.1872293949127197, + "step": 5294 + }, + { + "epoch": 1.6300884955752213, + "grad_norm": 8.0, + "learning_rate": 1.3488874238966931e-06, + "loss": 1.690487027168274, + "step": 5296 + }, + { + "epoch": 1.6307041169680647, + "grad_norm": 5.40625, + "learning_rate": 1.3477631529903124e-06, + "loss": 1.1481996774673462, + "step": 5298 + }, + { + "epoch": 1.631319738360908, + "grad_norm": 8.3125, + "learning_rate": 1.346640523948723e-06, + "loss": 1.4720221757888794, + "step": 5300 + }, + { + "epoch": 1.6319353597537516, + "grad_norm": 2.609375, + "learning_rate": 1.345519537887506e-06, + "loss": 1.1942338943481445, + "step": 5302 + }, + { + "epoch": 1.6325509811465948, + "grad_norm": 10.9375, + "learning_rate": 1.344400195920611e-06, + "loss": 1.0284016132354736, + "step": 5304 + }, + { + "epoch": 1.6331666025394382, + "grad_norm": 51.25, + "learning_rate": 1.3432824991603525e-06, + "loss": 1.1344106197357178, + "step": 5306 + }, + { + "epoch": 1.6337822239322817, + "grad_norm": 16.875, + "learning_rate": 1.3421664487174116e-06, + "loss": 1.4245868921279907, + "step": 5308 + }, + { + "epoch": 1.6343978453251249, + "grad_norm": 7.53125, + "learning_rate": 1.3410520457008325e-06, + "loss": 1.54558265209198, + "step": 5310 + }, + { + "epoch": 1.6350134667179685, + "grad_norm": 9.1875, + "learning_rate": 1.3399392912180214e-06, + "loss": 1.724228024482727, + "step": 5312 + }, + { + "epoch": 1.6356290881108118, + "grad_norm": 3.109375, + "learning_rate": 1.3388281863747494e-06, + "loss": 1.1608362197875977, + "step": 5314 + }, + { + "epoch": 1.6362447095036554, + "grad_norm": 6.09375, + "learning_rate": 1.3377187322751448e-06, + "loss": 1.1420176029205322, + "step": 5316 + }, + { + "epoch": 1.6368603308964986, + "grad_norm": 5.34375, + "learning_rate": 1.336610930021697e-06, + "loss": 1.0587470531463623, + "step": 5318 + }, + { + "epoch": 1.637475952289342, + "grad_norm": 5.90625, + "learning_rate": 1.3355047807152543e-06, + "loss": 1.3292616605758667, + "step": 5320 + }, + { + "epoch": 1.6380915736821855, + "grad_norm": 10.8125, + "learning_rate": 1.3344002854550222e-06, + "loss": 1.4881070852279663, + "step": 5322 + }, + { + "epoch": 1.6387071950750287, + "grad_norm": 11.125, + "learning_rate": 1.3332974453385628e-06, + "loss": 1.8732441663742065, + "step": 5324 + }, + { + "epoch": 1.6393228164678724, + "grad_norm": 8.6875, + "learning_rate": 1.3321962614617914e-06, + "loss": 1.35651695728302, + "step": 5326 + }, + { + "epoch": 1.6399384378607156, + "grad_norm": 15.25, + "learning_rate": 1.3310967349189815e-06, + "loss": 1.5227655172348022, + "step": 5328 + }, + { + "epoch": 1.640554059253559, + "grad_norm": 2.90625, + "learning_rate": 1.329998866802755e-06, + "loss": 1.1766257286071777, + "step": 5330 + }, + { + "epoch": 1.6411696806464025, + "grad_norm": 9.8125, + "learning_rate": 1.3289026582040892e-06, + "loss": 1.5235556364059448, + "step": 5332 + }, + { + "epoch": 1.6417853020392459, + "grad_norm": 5.40625, + "learning_rate": 1.3278081102123111e-06, + "loss": 1.3055951595306396, + "step": 5334 + }, + { + "epoch": 1.6424009234320893, + "grad_norm": 16.5, + "learning_rate": 1.3267152239150971e-06, + "loss": 1.365351676940918, + "step": 5336 + }, + { + "epoch": 1.6430165448249325, + "grad_norm": 199.0, + "learning_rate": 1.3256240003984736e-06, + "loss": 1.2809425592422485, + "step": 5338 + }, + { + "epoch": 1.6436321662177762, + "grad_norm": 6.3125, + "learning_rate": 1.3245344407468133e-06, + "loss": 1.3115761280059814, + "step": 5340 + }, + { + "epoch": 1.6442477876106194, + "grad_norm": 6.5, + "learning_rate": 1.3234465460428363e-06, + "loss": 1.4750884771347046, + "step": 5342 + }, + { + "epoch": 1.6448634090034628, + "grad_norm": 8.75, + "learning_rate": 1.322360317367608e-06, + "loss": 1.533599853515625, + "step": 5344 + }, + { + "epoch": 1.6454790303963063, + "grad_norm": 5.59375, + "learning_rate": 1.3212757558005374e-06, + "loss": 1.107212781906128, + "step": 5346 + }, + { + "epoch": 1.6460946517891497, + "grad_norm": 7.09375, + "learning_rate": 1.3201928624193785e-06, + "loss": 1.17499840259552, + "step": 5348 + }, + { + "epoch": 1.6467102731819931, + "grad_norm": 10.0625, + "learning_rate": 1.3191116383002265e-06, + "loss": 1.496663212776184, + "step": 5350 + }, + { + "epoch": 1.6473258945748364, + "grad_norm": 4.78125, + "learning_rate": 1.3180320845175181e-06, + "loss": 1.499389410018921, + "step": 5352 + }, + { + "epoch": 1.64794151596768, + "grad_norm": 15.625, + "learning_rate": 1.31695420214403e-06, + "loss": 1.3854055404663086, + "step": 5354 + }, + { + "epoch": 1.6485571373605232, + "grad_norm": 11.8125, + "learning_rate": 1.3158779922508782e-06, + "loss": 1.507443904876709, + "step": 5356 + }, + { + "epoch": 1.6491727587533667, + "grad_norm": 7.96875, + "learning_rate": 1.3148034559075169e-06, + "loss": 1.6484109163284302, + "step": 5358 + }, + { + "epoch": 1.64978838014621, + "grad_norm": 12.875, + "learning_rate": 1.3137305941817354e-06, + "loss": 1.4092316627502441, + "step": 5360 + }, + { + "epoch": 1.6504040015390535, + "grad_norm": 13.625, + "learning_rate": 1.3126594081396627e-06, + "loss": 1.1647193431854248, + "step": 5362 + }, + { + "epoch": 1.651019622931897, + "grad_norm": 8.8125, + "learning_rate": 1.3115898988457586e-06, + "loss": 1.6195112466812134, + "step": 5364 + }, + { + "epoch": 1.6516352443247402, + "grad_norm": 8.125, + "learning_rate": 1.3105220673628195e-06, + "loss": 1.6777485609054565, + "step": 5366 + }, + { + "epoch": 1.6522508657175838, + "grad_norm": 9.3125, + "learning_rate": 1.3094559147519733e-06, + "loss": 1.0617992877960205, + "step": 5368 + }, + { + "epoch": 1.652866487110427, + "grad_norm": 7.09375, + "learning_rate": 1.3083914420726787e-06, + "loss": 1.3714184761047363, + "step": 5370 + }, + { + "epoch": 1.6534821085032705, + "grad_norm": 31.75, + "learning_rate": 1.3073286503827275e-06, + "loss": 1.0258792638778687, + "step": 5372 + }, + { + "epoch": 1.654097729896114, + "grad_norm": 3.265625, + "learning_rate": 1.3062675407382389e-06, + "loss": 1.311981439590454, + "step": 5374 + }, + { + "epoch": 1.6547133512889571, + "grad_norm": 25.625, + "learning_rate": 1.3052081141936618e-06, + "loss": 1.4438797235488892, + "step": 5376 + }, + { + "epoch": 1.6553289726818008, + "grad_norm": 11.5, + "learning_rate": 1.3041503718017715e-06, + "loss": 1.2950350046157837, + "step": 5378 + }, + { + "epoch": 1.655944594074644, + "grad_norm": 4.96875, + "learning_rate": 1.303094314613671e-06, + "loss": 1.5233476161956787, + "step": 5380 + }, + { + "epoch": 1.6565602154674874, + "grad_norm": 7.15625, + "learning_rate": 1.3020399436787876e-06, + "loss": 1.4304616451263428, + "step": 5382 + }, + { + "epoch": 1.6571758368603309, + "grad_norm": 7.21875, + "learning_rate": 1.3009872600448725e-06, + "loss": 1.5034236907958984, + "step": 5384 + }, + { + "epoch": 1.6577914582531743, + "grad_norm": 7.25, + "learning_rate": 1.2999362647580027e-06, + "loss": 1.215216040611267, + "step": 5386 + }, + { + "epoch": 1.6584070796460177, + "grad_norm": 9.1875, + "learning_rate": 1.2988869588625746e-06, + "loss": 1.0233787298202515, + "step": 5388 + }, + { + "epoch": 1.659022701038861, + "grad_norm": 7.46875, + "learning_rate": 1.297839343401307e-06, + "loss": 1.4407703876495361, + "step": 5390 + }, + { + "epoch": 1.6596383224317046, + "grad_norm": 9.6875, + "learning_rate": 1.2967934194152399e-06, + "loss": 1.1850214004516602, + "step": 5392 + }, + { + "epoch": 1.6602539438245478, + "grad_norm": 12.375, + "learning_rate": 1.2957491879437306e-06, + "loss": 1.499040126800537, + "step": 5394 + }, + { + "epoch": 1.6608695652173913, + "grad_norm": 9.25, + "learning_rate": 1.2947066500244554e-06, + "loss": 1.5880337953567505, + "step": 5396 + }, + { + "epoch": 1.6614851866102347, + "grad_norm": 7.78125, + "learning_rate": 1.2936658066934077e-06, + "loss": 1.1170679330825806, + "step": 5398 + }, + { + "epoch": 1.6621008080030781, + "grad_norm": 7.03125, + "learning_rate": 1.2926266589848965e-06, + "loss": 1.1979840993881226, + "step": 5400 + }, + { + "epoch": 1.6627164293959216, + "grad_norm": 2.25, + "learning_rate": 1.2915892079315465e-06, + "loss": 1.1809914112091064, + "step": 5402 + }, + { + "epoch": 1.6633320507887648, + "grad_norm": 9.5, + "learning_rate": 1.2905534545642958e-06, + "loss": 1.5309373140335083, + "step": 5404 + }, + { + "epoch": 1.6639476721816084, + "grad_norm": 5.40625, + "learning_rate": 1.2895193999123966e-06, + "loss": 1.4166136980056763, + "step": 5406 + }, + { + "epoch": 1.6645632935744517, + "grad_norm": 6.90625, + "learning_rate": 1.2884870450034112e-06, + "loss": 1.5316441059112549, + "step": 5408 + }, + { + "epoch": 1.665178914967295, + "grad_norm": 4.1875, + "learning_rate": 1.2874563908632142e-06, + "loss": 1.3547664880752563, + "step": 5410 + }, + { + "epoch": 1.6657945363601385, + "grad_norm": 11.375, + "learning_rate": 1.28642743851599e-06, + "loss": 1.549019455909729, + "step": 5412 + }, + { + "epoch": 1.666410157752982, + "grad_norm": 9.1875, + "learning_rate": 1.2854001889842305e-06, + "loss": 1.3322649002075195, + "step": 5414 + }, + { + "epoch": 1.6670257791458254, + "grad_norm": 6.40625, + "learning_rate": 1.2843746432887382e-06, + "loss": 1.2669012546539307, + "step": 5416 + }, + { + "epoch": 1.6676414005386686, + "grad_norm": 9.0, + "learning_rate": 1.2833508024486197e-06, + "loss": 1.4787484407424927, + "step": 5418 + }, + { + "epoch": 1.6682570219315123, + "grad_norm": 6.40625, + "learning_rate": 1.282328667481289e-06, + "loss": 1.5382004976272583, + "step": 5420 + }, + { + "epoch": 1.6688726433243555, + "grad_norm": 11.0, + "learning_rate": 1.2813082394024646e-06, + "loss": 1.46946382522583, + "step": 5422 + }, + { + "epoch": 1.669488264717199, + "grad_norm": 2.859375, + "learning_rate": 1.280289519226168e-06, + "loss": 1.4135793447494507, + "step": 5424 + }, + { + "epoch": 1.6701038861100423, + "grad_norm": 580.0, + "learning_rate": 1.2792725079647253e-06, + "loss": 0.8567907214164734, + "step": 5426 + }, + { + "epoch": 1.6707195075028856, + "grad_norm": 11.25, + "learning_rate": 1.2782572066287626e-06, + "loss": 1.6358792781829834, + "step": 5428 + }, + { + "epoch": 1.6713351288957292, + "grad_norm": 4.71875, + "learning_rate": 1.2772436162272084e-06, + "loss": 1.5351207256317139, + "step": 5430 + }, + { + "epoch": 1.6719507502885724, + "grad_norm": 11.625, + "learning_rate": 1.2762317377672905e-06, + "loss": 1.840936541557312, + "step": 5432 + }, + { + "epoch": 1.672566371681416, + "grad_norm": 4.6875, + "learning_rate": 1.2752215722545334e-06, + "loss": 1.1363348960876465, + "step": 5434 + }, + { + "epoch": 1.6731819930742593, + "grad_norm": 6.6875, + "learning_rate": 1.2742131206927624e-06, + "loss": 1.3856546878814697, + "step": 5436 + }, + { + "epoch": 1.6737976144671027, + "grad_norm": 5.875, + "learning_rate": 1.273206384084098e-06, + "loss": 1.0931546688079834, + "step": 5438 + }, + { + "epoch": 1.6744132358599462, + "grad_norm": 11.3125, + "learning_rate": 1.2722013634289579e-06, + "loss": 1.5303289890289307, + "step": 5440 + }, + { + "epoch": 1.6750288572527894, + "grad_norm": 5.59375, + "learning_rate": 1.2711980597260532e-06, + "loss": 1.5187264680862427, + "step": 5442 + }, + { + "epoch": 1.675644478645633, + "grad_norm": 7.4375, + "learning_rate": 1.2701964739723883e-06, + "loss": 1.1173781156539917, + "step": 5444 + }, + { + "epoch": 1.6762601000384763, + "grad_norm": 6.8125, + "learning_rate": 1.2691966071632634e-06, + "loss": 0.7528840899467468, + "step": 5446 + }, + { + "epoch": 1.6768757214313197, + "grad_norm": 6.21875, + "learning_rate": 1.2681984602922659e-06, + "loss": 0.8180416822433472, + "step": 5448 + }, + { + "epoch": 1.6774913428241631, + "grad_norm": 7.78125, + "learning_rate": 1.2672020343512788e-06, + "loss": 1.5359752178192139, + "step": 5450 + }, + { + "epoch": 1.6781069642170066, + "grad_norm": 2.265625, + "learning_rate": 1.2662073303304726e-06, + "loss": 1.2845362424850464, + "step": 5452 + }, + { + "epoch": 1.67872258560985, + "grad_norm": 12.5, + "learning_rate": 1.265214349218306e-06, + "loss": 0.669722318649292, + "step": 5454 + }, + { + "epoch": 1.6793382070026932, + "grad_norm": 7.65625, + "learning_rate": 1.2642230920015279e-06, + "loss": 1.5199944972991943, + "step": 5456 + }, + { + "epoch": 1.6799538283955369, + "grad_norm": 13.875, + "learning_rate": 1.2632335596651717e-06, + "loss": 1.471770167350769, + "step": 5458 + }, + { + "epoch": 1.68056944978838, + "grad_norm": 23.0, + "learning_rate": 1.2622457531925586e-06, + "loss": 1.2522399425506592, + "step": 5460 + }, + { + "epoch": 1.6811850711812235, + "grad_norm": 13.625, + "learning_rate": 1.2612596735652935e-06, + "loss": 1.019469976425171, + "step": 5462 + }, + { + "epoch": 1.681800692574067, + "grad_norm": 6.09375, + "learning_rate": 1.2602753217632662e-06, + "loss": 1.0745410919189453, + "step": 5464 + }, + { + "epoch": 1.6824163139669104, + "grad_norm": 1.9453125, + "learning_rate": 1.2592926987646492e-06, + "loss": 1.2746351957321167, + "step": 5466 + }, + { + "epoch": 1.6830319353597538, + "grad_norm": 8.0625, + "learning_rate": 1.2583118055458965e-06, + "loss": 1.1866888999938965, + "step": 5468 + }, + { + "epoch": 1.683647556752597, + "grad_norm": 1.9765625, + "learning_rate": 1.2573326430817443e-06, + "loss": 1.1329030990600586, + "step": 5470 + }, + { + "epoch": 1.6842631781454407, + "grad_norm": 6.71875, + "learning_rate": 1.256355212345208e-06, + "loss": 1.3046042919158936, + "step": 5472 + }, + { + "epoch": 1.684878799538284, + "grad_norm": 4.6875, + "learning_rate": 1.2553795143075825e-06, + "loss": 1.1448999643325806, + "step": 5474 + }, + { + "epoch": 1.6854944209311273, + "grad_norm": 4.84375, + "learning_rate": 1.2544055499384408e-06, + "loss": 1.2929960489273071, + "step": 5476 + }, + { + "epoch": 1.6861100423239708, + "grad_norm": 8.75, + "learning_rate": 1.2534333202056326e-06, + "loss": 1.6156351566314697, + "step": 5478 + }, + { + "epoch": 1.6867256637168142, + "grad_norm": 8.4375, + "learning_rate": 1.252462826075285e-06, + "loss": 1.4472776651382446, + "step": 5480 + }, + { + "epoch": 1.6873412851096576, + "grad_norm": 13.75, + "learning_rate": 1.2514940685117996e-06, + "loss": 1.7093762159347534, + "step": 5482 + }, + { + "epoch": 1.6879569065025009, + "grad_norm": 24.0, + "learning_rate": 1.2505270484778532e-06, + "loss": 1.6886651515960693, + "step": 5484 + }, + { + "epoch": 1.6885725278953445, + "grad_norm": 3.359375, + "learning_rate": 1.2495617669343943e-06, + "loss": 1.4793897867202759, + "step": 5486 + }, + { + "epoch": 1.6891881492881877, + "grad_norm": 14.125, + "learning_rate": 1.2485982248406445e-06, + "loss": 1.4237476587295532, + "step": 5488 + }, + { + "epoch": 1.6898037706810312, + "grad_norm": 5.96875, + "learning_rate": 1.2476364231540982e-06, + "loss": 1.5307400226593018, + "step": 5490 + }, + { + "epoch": 1.6904193920738746, + "grad_norm": 4.96875, + "learning_rate": 1.2466763628305189e-06, + "loss": 1.1641037464141846, + "step": 5492 + }, + { + "epoch": 1.6910350134667178, + "grad_norm": 13.1875, + "learning_rate": 1.24571804482394e-06, + "loss": 1.4488732814788818, + "step": 5494 + }, + { + "epoch": 1.6916506348595615, + "grad_norm": 8.6875, + "learning_rate": 1.2447614700866639e-06, + "loss": 1.524038553237915, + "step": 5496 + }, + { + "epoch": 1.6922662562524047, + "grad_norm": 7.5, + "learning_rate": 1.24380663956926e-06, + "loss": 1.4217767715454102, + "step": 5498 + }, + { + "epoch": 1.6928818776452483, + "grad_norm": 7.65625, + "learning_rate": 1.2428535542205651e-06, + "loss": 1.250489592552185, + "step": 5500 + }, + { + "epoch": 1.6934974990380915, + "grad_norm": 4.3125, + "learning_rate": 1.2419022149876808e-06, + "loss": 1.3538804054260254, + "step": 5502 + }, + { + "epoch": 1.694113120430935, + "grad_norm": 10.8125, + "learning_rate": 1.240952622815975e-06, + "loss": 0.9905145168304443, + "step": 5504 + }, + { + "epoch": 1.6947287418237784, + "grad_norm": 3.203125, + "learning_rate": 1.2400047786490783e-06, + "loss": 1.1671013832092285, + "step": 5506 + }, + { + "epoch": 1.6953443632166216, + "grad_norm": 8.25, + "learning_rate": 1.2390586834288846e-06, + "loss": 1.2432751655578613, + "step": 5508 + }, + { + "epoch": 1.6959599846094653, + "grad_norm": 2.90625, + "learning_rate": 1.238114338095551e-06, + "loss": 1.2975242137908936, + "step": 5510 + }, + { + "epoch": 1.6965756060023085, + "grad_norm": 5.4375, + "learning_rate": 1.2371717435874926e-06, + "loss": 1.561279535293579, + "step": 5512 + }, + { + "epoch": 1.697191227395152, + "grad_norm": 6.5, + "learning_rate": 1.2362309008413887e-06, + "loss": 1.4640417098999023, + "step": 5514 + }, + { + "epoch": 1.6978068487879954, + "grad_norm": 8.875, + "learning_rate": 1.2352918107921744e-06, + "loss": 1.2020208835601807, + "step": 5516 + }, + { + "epoch": 1.6984224701808388, + "grad_norm": 8.5, + "learning_rate": 1.2343544743730454e-06, + "loss": 1.6358736753463745, + "step": 5518 + }, + { + "epoch": 1.6990380915736822, + "grad_norm": 9.5625, + "learning_rate": 1.233418892515454e-06, + "loss": 1.4060251712799072, + "step": 5520 + }, + { + "epoch": 1.6996537129665255, + "grad_norm": 3.984375, + "learning_rate": 1.232485066149108e-06, + "loss": 1.262081503868103, + "step": 5522 + }, + { + "epoch": 1.7002693343593691, + "grad_norm": 11.0, + "learning_rate": 1.2315529962019722e-06, + "loss": 1.615898847579956, + "step": 5524 + }, + { + "epoch": 1.7008849557522123, + "grad_norm": 3.515625, + "learning_rate": 1.230622683600265e-06, + "loss": 1.0624699592590332, + "step": 5526 + }, + { + "epoch": 1.7015005771450558, + "grad_norm": 13.1875, + "learning_rate": 1.2296941292684595e-06, + "loss": 1.3232251405715942, + "step": 5528 + }, + { + "epoch": 1.7021161985378992, + "grad_norm": 2.4375, + "learning_rate": 1.2287673341292808e-06, + "loss": 1.1552577018737793, + "step": 5530 + }, + { + "epoch": 1.7027318199307426, + "grad_norm": 12.25, + "learning_rate": 1.2278422991037051e-06, + "loss": 1.103263258934021, + "step": 5532 + }, + { + "epoch": 1.703347441323586, + "grad_norm": 4.5625, + "learning_rate": 1.2269190251109619e-06, + "loss": 1.0746948719024658, + "step": 5534 + }, + { + "epoch": 1.7039630627164293, + "grad_norm": 6.9375, + "learning_rate": 1.2259975130685285e-06, + "loss": 1.2396745681762695, + "step": 5536 + }, + { + "epoch": 1.704578684109273, + "grad_norm": 5.6875, + "learning_rate": 1.2250777638921318e-06, + "loss": 1.2555797100067139, + "step": 5538 + }, + { + "epoch": 1.7051943055021161, + "grad_norm": 5.75, + "learning_rate": 1.2241597784957477e-06, + "loss": 1.088400959968567, + "step": 5540 + }, + { + "epoch": 1.7058099268949596, + "grad_norm": 3.453125, + "learning_rate": 1.2232435577915982e-06, + "loss": 1.0315688848495483, + "step": 5542 + }, + { + "epoch": 1.706425548287803, + "grad_norm": 3.0, + "learning_rate": 1.2223291026901534e-06, + "loss": 1.2652621269226074, + "step": 5544 + }, + { + "epoch": 1.7070411696806465, + "grad_norm": 5.875, + "learning_rate": 1.2214164141001266e-06, + "loss": 1.4031015634536743, + "step": 5546 + }, + { + "epoch": 1.7076567910734899, + "grad_norm": 7.53125, + "learning_rate": 1.2205054929284784e-06, + "loss": 1.2202022075653076, + "step": 5548 + }, + { + "epoch": 1.708272412466333, + "grad_norm": 4.28125, + "learning_rate": 1.21959634008041e-06, + "loss": 1.0692622661590576, + "step": 5550 + }, + { + "epoch": 1.7088880338591768, + "grad_norm": 7.6875, + "learning_rate": 1.2186889564593678e-06, + "loss": 1.356179118156433, + "step": 5552 + }, + { + "epoch": 1.70950365525202, + "grad_norm": 11.1875, + "learning_rate": 1.2177833429670395e-06, + "loss": 1.7683215141296387, + "step": 5554 + }, + { + "epoch": 1.7101192766448634, + "grad_norm": 4.65625, + "learning_rate": 1.2168795005033524e-06, + "loss": 1.0441101789474487, + "step": 5556 + }, + { + "epoch": 1.7107348980377068, + "grad_norm": 5.4375, + "learning_rate": 1.2159774299664765e-06, + "loss": 1.1631641387939453, + "step": 5558 + }, + { + "epoch": 1.71135051943055, + "grad_norm": 14.0, + "learning_rate": 1.2150771322528187e-06, + "loss": 1.212846040725708, + "step": 5560 + }, + { + "epoch": 1.7119661408233937, + "grad_norm": 9.8125, + "learning_rate": 1.2141786082570248e-06, + "loss": 1.3858622312545776, + "step": 5562 + }, + { + "epoch": 1.712581762216237, + "grad_norm": 6.40625, + "learning_rate": 1.2132818588719788e-06, + "loss": 1.2670592069625854, + "step": 5564 + }, + { + "epoch": 1.7131973836090806, + "grad_norm": 18.625, + "learning_rate": 1.2123868849888e-06, + "loss": 1.259639859199524, + "step": 5566 + }, + { + "epoch": 1.7138130050019238, + "grad_norm": 12.9375, + "learning_rate": 1.2114936874968452e-06, + "loss": 1.3705947399139404, + "step": 5568 + }, + { + "epoch": 1.7144286263947672, + "grad_norm": 6.875, + "learning_rate": 1.210602267283703e-06, + "loss": 0.7072739601135254, + "step": 5570 + }, + { + "epoch": 1.7150442477876107, + "grad_norm": 4.875, + "learning_rate": 1.2097126252351992e-06, + "loss": 1.2308495044708252, + "step": 5572 + }, + { + "epoch": 1.7156598691804539, + "grad_norm": 5.25, + "learning_rate": 1.2088247622353907e-06, + "loss": 1.469330906867981, + "step": 5574 + }, + { + "epoch": 1.7162754905732975, + "grad_norm": 7.28125, + "learning_rate": 1.2079386791665664e-06, + "loss": 1.3243225812911987, + "step": 5576 + }, + { + "epoch": 1.7168911119661407, + "grad_norm": 9.3125, + "learning_rate": 1.2070543769092475e-06, + "loss": 1.68306565284729, + "step": 5578 + }, + { + "epoch": 1.7175067333589842, + "grad_norm": 8.25, + "learning_rate": 1.206171856342184e-06, + "loss": 1.2183438539505005, + "step": 5580 + }, + { + "epoch": 1.7181223547518276, + "grad_norm": 13.3125, + "learning_rate": 1.205291118342357e-06, + "loss": 1.412492275238037, + "step": 5582 + }, + { + "epoch": 1.718737976144671, + "grad_norm": 17.25, + "learning_rate": 1.2044121637849762e-06, + "loss": 1.0876752138137817, + "step": 5584 + }, + { + "epoch": 1.7193535975375145, + "grad_norm": 7.15625, + "learning_rate": 1.203534993543477e-06, + "loss": 1.2739953994750977, + "step": 5586 + }, + { + "epoch": 1.7199692189303577, + "grad_norm": 4.625, + "learning_rate": 1.202659608489525e-06, + "loss": 1.3260688781738281, + "step": 5588 + }, + { + "epoch": 1.7205848403232014, + "grad_norm": 6.5, + "learning_rate": 1.2017860094930084e-06, + "loss": 0.9359119534492493, + "step": 5590 + }, + { + "epoch": 1.7212004617160446, + "grad_norm": 14.4375, + "learning_rate": 1.2009141974220428e-06, + "loss": 1.80499267578125, + "step": 5592 + }, + { + "epoch": 1.721816083108888, + "grad_norm": 6.84375, + "learning_rate": 1.2000441731429669e-06, + "loss": 1.2378640174865723, + "step": 5594 + }, + { + "epoch": 1.7224317045017314, + "grad_norm": 3.90625, + "learning_rate": 1.1991759375203437e-06, + "loss": 1.1640088558197021, + "step": 5596 + }, + { + "epoch": 1.7230473258945749, + "grad_norm": 4.4375, + "learning_rate": 1.1983094914169586e-06, + "loss": 1.1311296224594116, + "step": 5598 + }, + { + "epoch": 1.7236629472874183, + "grad_norm": 12.8125, + "learning_rate": 1.197444835693818e-06, + "loss": 1.6203620433807373, + "step": 5600 + }, + { + "epoch": 1.7242785686802615, + "grad_norm": 12.8125, + "learning_rate": 1.19658197121015e-06, + "loss": 1.2333106994628906, + "step": 5602 + }, + { + "epoch": 1.7248941900731052, + "grad_norm": 7.5, + "learning_rate": 1.1957208988234025e-06, + "loss": 1.2552940845489502, + "step": 5604 + }, + { + "epoch": 1.7255098114659484, + "grad_norm": 9.8125, + "learning_rate": 1.1948616193892421e-06, + "loss": 1.3899599313735962, + "step": 5606 + }, + { + "epoch": 1.7261254328587918, + "grad_norm": 8.25, + "learning_rate": 1.1940041337615544e-06, + "loss": 1.5031278133392334, + "step": 5608 + }, + { + "epoch": 1.7267410542516353, + "grad_norm": 11.625, + "learning_rate": 1.1931484427924415e-06, + "loss": 1.7559168338775635, + "step": 5610 + }, + { + "epoch": 1.7273566756444787, + "grad_norm": 4.59375, + "learning_rate": 1.1922945473322233e-06, + "loss": 1.2270822525024414, + "step": 5612 + }, + { + "epoch": 1.7279722970373221, + "grad_norm": 9.0625, + "learning_rate": 1.1914424482294347e-06, + "loss": 1.2980186939239502, + "step": 5614 + }, + { + "epoch": 1.7285879184301653, + "grad_norm": 14.8125, + "learning_rate": 1.1905921463308256e-06, + "loss": 1.246907114982605, + "step": 5616 + }, + { + "epoch": 1.729203539823009, + "grad_norm": 20.125, + "learning_rate": 1.18974364248136e-06, + "loss": 1.6529814004898071, + "step": 5618 + }, + { + "epoch": 1.7298191612158522, + "grad_norm": 14.0, + "learning_rate": 1.1888969375242153e-06, + "loss": 1.4205410480499268, + "step": 5620 + }, + { + "epoch": 1.7304347826086957, + "grad_norm": 3.140625, + "learning_rate": 1.1880520323007811e-06, + "loss": 1.503267765045166, + "step": 5622 + }, + { + "epoch": 1.731050404001539, + "grad_norm": 5.625, + "learning_rate": 1.1872089276506584e-06, + "loss": 1.4263373613357544, + "step": 5624 + }, + { + "epoch": 1.7316660253943823, + "grad_norm": 9.375, + "learning_rate": 1.1863676244116604e-06, + "loss": 1.579979419708252, + "step": 5626 + }, + { + "epoch": 1.732281646787226, + "grad_norm": 6.8125, + "learning_rate": 1.1855281234198073e-06, + "loss": 1.5714635848999023, + "step": 5628 + }, + { + "epoch": 1.7328972681800692, + "grad_norm": 10.5625, + "learning_rate": 1.1846904255093312e-06, + "loss": 1.6432576179504395, + "step": 5630 + }, + { + "epoch": 1.7335128895729126, + "grad_norm": 4.9375, + "learning_rate": 1.183854531512671e-06, + "loss": 1.2803030014038086, + "step": 5632 + }, + { + "epoch": 1.734128510965756, + "grad_norm": 2.484375, + "learning_rate": 1.1830204422604728e-06, + "loss": 1.0546574592590332, + "step": 5634 + }, + { + "epoch": 1.7347441323585995, + "grad_norm": 19.375, + "learning_rate": 1.1821881585815907e-06, + "loss": 1.4187873601913452, + "step": 5636 + }, + { + "epoch": 1.735359753751443, + "grad_norm": 7.5625, + "learning_rate": 1.1813576813030831e-06, + "loss": 1.383895754814148, + "step": 5638 + }, + { + "epoch": 1.7359753751442861, + "grad_norm": 6.1875, + "learning_rate": 1.180529011250214e-06, + "loss": 1.2424262762069702, + "step": 5640 + }, + { + "epoch": 1.7365909965371298, + "grad_norm": 4.90625, + "learning_rate": 1.1797021492464514e-06, + "loss": 1.362465739250183, + "step": 5642 + }, + { + "epoch": 1.737206617929973, + "grad_norm": 26.625, + "learning_rate": 1.1788770961134662e-06, + "loss": 1.3541010618209839, + "step": 5644 + }, + { + "epoch": 1.7378222393228164, + "grad_norm": 9.875, + "learning_rate": 1.1780538526711329e-06, + "loss": 0.9746760725975037, + "step": 5646 + }, + { + "epoch": 1.7384378607156599, + "grad_norm": 9.9375, + "learning_rate": 1.1772324197375267e-06, + "loss": 1.4916549921035767, + "step": 5648 + }, + { + "epoch": 1.7390534821085033, + "grad_norm": 4.28125, + "learning_rate": 1.1764127981289234e-06, + "loss": 1.221276044845581, + "step": 5650 + }, + { + "epoch": 1.7396691035013467, + "grad_norm": 9.1875, + "learning_rate": 1.1755949886598006e-06, + "loss": 1.383480191230774, + "step": 5652 + }, + { + "epoch": 1.74028472489419, + "grad_norm": 5.15625, + "learning_rate": 1.1747789921428324e-06, + "loss": 1.2294129133224487, + "step": 5654 + }, + { + "epoch": 1.7409003462870336, + "grad_norm": 3.921875, + "learning_rate": 1.1739648093888938e-06, + "loss": 1.397127628326416, + "step": 5656 + }, + { + "epoch": 1.7415159676798768, + "grad_norm": 8.1875, + "learning_rate": 1.1731524412070562e-06, + "loss": 1.2268887758255005, + "step": 5658 + }, + { + "epoch": 1.7421315890727203, + "grad_norm": 5.59375, + "learning_rate": 1.1723418884045881e-06, + "loss": 1.2938385009765625, + "step": 5660 + }, + { + "epoch": 1.7427472104655637, + "grad_norm": 3.625, + "learning_rate": 1.171533151786954e-06, + "loss": 1.0179165601730347, + "step": 5662 + }, + { + "epoch": 1.7433628318584071, + "grad_norm": 30.25, + "learning_rate": 1.1707262321578134e-06, + "loss": 1.429439663887024, + "step": 5664 + }, + { + "epoch": 1.7439784532512506, + "grad_norm": 7.96875, + "learning_rate": 1.1699211303190212e-06, + "loss": 1.526271104812622, + "step": 5666 + }, + { + "epoch": 1.7445940746440938, + "grad_norm": 3.4375, + "learning_rate": 1.169117847070624e-06, + "loss": 1.0915848016738892, + "step": 5668 + }, + { + "epoch": 1.7452096960369374, + "grad_norm": 9.125, + "learning_rate": 1.1683163832108626e-06, + "loss": 1.369112491607666, + "step": 5670 + }, + { + "epoch": 1.7458253174297806, + "grad_norm": 4.84375, + "learning_rate": 1.1675167395361705e-06, + "loss": 1.1842763423919678, + "step": 5672 + }, + { + "epoch": 1.746440938822624, + "grad_norm": 5.4375, + "learning_rate": 1.1667189168411706e-06, + "loss": 1.0369771718978882, + "step": 5674 + }, + { + "epoch": 1.7470565602154675, + "grad_norm": 13.625, + "learning_rate": 1.1659229159186779e-06, + "loss": 1.2211787700653076, + "step": 5676 + }, + { + "epoch": 1.7476721816083107, + "grad_norm": 5.46875, + "learning_rate": 1.165128737559696e-06, + "loss": 0.7121513485908508, + "step": 5678 + }, + { + "epoch": 1.7482878030011544, + "grad_norm": 3.421875, + "learning_rate": 1.1643363825534173e-06, + "loss": 1.1748327016830444, + "step": 5680 + }, + { + "epoch": 1.7489034243939976, + "grad_norm": 5.65625, + "learning_rate": 1.1635458516872234e-06, + "loss": 1.2920325994491577, + "step": 5682 + }, + { + "epoch": 1.7495190457868413, + "grad_norm": 8.8125, + "learning_rate": 1.1627571457466824e-06, + "loss": 1.368028998374939, + "step": 5684 + }, + { + "epoch": 1.7501346671796845, + "grad_norm": 11.375, + "learning_rate": 1.161970265515549e-06, + "loss": 1.3619840145111084, + "step": 5686 + }, + { + "epoch": 1.750750288572528, + "grad_norm": 5.25, + "learning_rate": 1.1611852117757634e-06, + "loss": 1.3654226064682007, + "step": 5688 + }, + { + "epoch": 1.7513659099653713, + "grad_norm": 6.09375, + "learning_rate": 1.1604019853074518e-06, + "loss": 1.1195334196090698, + "step": 5690 + }, + { + "epoch": 1.7519815313582145, + "grad_norm": 23.75, + "learning_rate": 1.1596205868889238e-06, + "loss": 1.4777530431747437, + "step": 5692 + }, + { + "epoch": 1.7525971527510582, + "grad_norm": 6.375, + "learning_rate": 1.1588410172966719e-06, + "loss": 1.3182849884033203, + "step": 5694 + }, + { + "epoch": 1.7532127741439014, + "grad_norm": 9.125, + "learning_rate": 1.1580632773053722e-06, + "loss": 1.2617470026016235, + "step": 5696 + }, + { + "epoch": 1.7538283955367449, + "grad_norm": 5.0, + "learning_rate": 1.1572873676878822e-06, + "loss": 1.5043034553527832, + "step": 5698 + }, + { + "epoch": 1.7544440169295883, + "grad_norm": 1.8203125, + "learning_rate": 1.156513289215241e-06, + "loss": 0.9596728086471558, + "step": 5700 + }, + { + "epoch": 1.7550596383224317, + "grad_norm": 5.09375, + "learning_rate": 1.1557410426566678e-06, + "loss": 1.29399573802948, + "step": 5702 + }, + { + "epoch": 1.7556752597152752, + "grad_norm": 6.25, + "learning_rate": 1.154970628779561e-06, + "loss": 1.644959568977356, + "step": 5704 + }, + { + "epoch": 1.7562908811081184, + "grad_norm": 14.25, + "learning_rate": 1.1542020483494982e-06, + "loss": 1.3602919578552246, + "step": 5706 + }, + { + "epoch": 1.756906502500962, + "grad_norm": 5.46875, + "learning_rate": 1.1534353021302347e-06, + "loss": 1.2901757955551147, + "step": 5708 + }, + { + "epoch": 1.7575221238938052, + "grad_norm": 8.25, + "learning_rate": 1.1526703908837043e-06, + "loss": 1.4418219327926636, + "step": 5710 + }, + { + "epoch": 1.7581377452866487, + "grad_norm": 15.75, + "learning_rate": 1.1519073153700156e-06, + "loss": 1.4010531902313232, + "step": 5712 + }, + { + "epoch": 1.7587533666794921, + "grad_norm": 5.40625, + "learning_rate": 1.1511460763474543e-06, + "loss": 1.2491612434387207, + "step": 5714 + }, + { + "epoch": 1.7593689880723355, + "grad_norm": 5.375, + "learning_rate": 1.1503866745724807e-06, + "loss": 1.45999014377594, + "step": 5716 + }, + { + "epoch": 1.759984609465179, + "grad_norm": 2.859375, + "learning_rate": 1.1496291107997288e-06, + "loss": 1.1590230464935303, + "step": 5718 + }, + { + "epoch": 1.7606002308580222, + "grad_norm": 9.75, + "learning_rate": 1.1488733857820073e-06, + "loss": 1.1593949794769287, + "step": 5720 + }, + { + "epoch": 1.7612158522508659, + "grad_norm": 22.375, + "learning_rate": 1.1481195002702968e-06, + "loss": 1.744612455368042, + "step": 5722 + }, + { + "epoch": 1.761831473643709, + "grad_norm": 8.125, + "learning_rate": 1.1473674550137503e-06, + "loss": 1.607894778251648, + "step": 5724 + }, + { + "epoch": 1.7624470950365525, + "grad_norm": 5.71875, + "learning_rate": 1.1466172507596923e-06, + "loss": 1.4263869524002075, + "step": 5726 + }, + { + "epoch": 1.763062716429396, + "grad_norm": 5.4375, + "learning_rate": 1.1458688882536168e-06, + "loss": 1.3822015523910522, + "step": 5728 + }, + { + "epoch": 1.7636783378222394, + "grad_norm": 15.75, + "learning_rate": 1.14512236823919e-06, + "loss": 1.2835092544555664, + "step": 5730 + }, + { + "epoch": 1.7642939592150828, + "grad_norm": 11.1875, + "learning_rate": 1.1443776914582434e-06, + "loss": 1.523394227027893, + "step": 5732 + }, + { + "epoch": 1.764909580607926, + "grad_norm": 4.09375, + "learning_rate": 1.1436348586507807e-06, + "loss": 1.4853382110595703, + "step": 5734 + }, + { + "epoch": 1.7655252020007697, + "grad_norm": 2.25, + "learning_rate": 1.1428938705549704e-06, + "loss": 1.1671732664108276, + "step": 5736 + }, + { + "epoch": 1.766140823393613, + "grad_norm": 7.15625, + "learning_rate": 1.1421547279071499e-06, + "loss": 1.3251278400421143, + "step": 5738 + }, + { + "epoch": 1.7667564447864563, + "grad_norm": 7.53125, + "learning_rate": 1.1414174314418217e-06, + "loss": 1.5456947088241577, + "step": 5740 + }, + { + "epoch": 1.7673720661792998, + "grad_norm": 5.90625, + "learning_rate": 1.1406819818916533e-06, + "loss": 0.7835991978645325, + "step": 5742 + }, + { + "epoch": 1.767987687572143, + "grad_norm": 15.25, + "learning_rate": 1.1399483799874777e-06, + "loss": 1.0862616300582886, + "step": 5744 + }, + { + "epoch": 1.7686033089649866, + "grad_norm": 12.75, + "learning_rate": 1.139216626458291e-06, + "loss": 1.527071475982666, + "step": 5746 + }, + { + "epoch": 1.7692189303578298, + "grad_norm": 6.0625, + "learning_rate": 1.1384867220312541e-06, + "loss": 1.3800554275512695, + "step": 5748 + }, + { + "epoch": 1.7698345517506735, + "grad_norm": 6.15625, + "learning_rate": 1.1377586674316887e-06, + "loss": 1.2057721614837646, + "step": 5750 + }, + { + "epoch": 1.7704501731435167, + "grad_norm": 4.46875, + "learning_rate": 1.137032463383079e-06, + "loss": 1.502542495727539, + "step": 5752 + }, + { + "epoch": 1.7710657945363601, + "grad_norm": 6.3125, + "learning_rate": 1.1363081106070709e-06, + "loss": 1.1586823463439941, + "step": 5754 + }, + { + "epoch": 1.7716814159292036, + "grad_norm": 9.625, + "learning_rate": 1.1355856098234693e-06, + "loss": 1.2189154624938965, + "step": 5756 + }, + { + "epoch": 1.7722970373220468, + "grad_norm": 7.9375, + "learning_rate": 1.1348649617502395e-06, + "loss": 1.298841118812561, + "step": 5758 + }, + { + "epoch": 1.7729126587148905, + "grad_norm": 10.875, + "learning_rate": 1.1341461671035059e-06, + "loss": 1.798980474472046, + "step": 5760 + }, + { + "epoch": 1.7735282801077337, + "grad_norm": 10.875, + "learning_rate": 1.1334292265975506e-06, + "loss": 1.6718307733535767, + "step": 5762 + }, + { + "epoch": 1.774143901500577, + "grad_norm": 3.65625, + "learning_rate": 1.1327141409448134e-06, + "loss": 1.12022864818573, + "step": 5764 + }, + { + "epoch": 1.7747595228934205, + "grad_norm": 4.84375, + "learning_rate": 1.132000910855891e-06, + "loss": 1.088094711303711, + "step": 5766 + }, + { + "epoch": 1.775375144286264, + "grad_norm": 3.21875, + "learning_rate": 1.131289537039537e-06, + "loss": 1.2651698589324951, + "step": 5768 + }, + { + "epoch": 1.7759907656791074, + "grad_norm": 18.25, + "learning_rate": 1.1305800202026581e-06, + "loss": 1.527762770652771, + "step": 5770 + }, + { + "epoch": 1.7766063870719506, + "grad_norm": 5.46875, + "learning_rate": 1.1298723610503178e-06, + "loss": 1.3321083784103394, + "step": 5772 + }, + { + "epoch": 1.7772220084647943, + "grad_norm": 29.75, + "learning_rate": 1.129166560285733e-06, + "loss": 1.3004841804504395, + "step": 5774 + }, + { + "epoch": 1.7778376298576375, + "grad_norm": 6.53125, + "learning_rate": 1.1284626186102733e-06, + "loss": 1.1199957132339478, + "step": 5776 + }, + { + "epoch": 1.778453251250481, + "grad_norm": 9.5625, + "learning_rate": 1.1277605367234617e-06, + "loss": 1.4505853652954102, + "step": 5778 + }, + { + "epoch": 1.7790688726433244, + "grad_norm": 3.625, + "learning_rate": 1.1270603153229725e-06, + "loss": 1.1563801765441895, + "step": 5780 + }, + { + "epoch": 1.7796844940361678, + "grad_norm": 3.5, + "learning_rate": 1.1263619551046315e-06, + "loss": 1.1516625881195068, + "step": 5782 + }, + { + "epoch": 1.7803001154290112, + "grad_norm": 2.234375, + "learning_rate": 1.1256654567624151e-06, + "loss": 1.0001325607299805, + "step": 5784 + }, + { + "epoch": 1.7809157368218544, + "grad_norm": 3.140625, + "learning_rate": 1.1249708209884485e-06, + "loss": 1.1107304096221924, + "step": 5786 + }, + { + "epoch": 1.781531358214698, + "grad_norm": 7.71875, + "learning_rate": 1.124278048473008e-06, + "loss": 1.2107503414154053, + "step": 5788 + }, + { + "epoch": 1.7821469796075413, + "grad_norm": 5.875, + "learning_rate": 1.1235871399045157e-06, + "loss": 1.021038293838501, + "step": 5790 + }, + { + "epoch": 1.7827626010003848, + "grad_norm": 2.234375, + "learning_rate": 1.1228980959695447e-06, + "loss": 1.0064464807510376, + "step": 5792 + }, + { + "epoch": 1.7833782223932282, + "grad_norm": 5.375, + "learning_rate": 1.1222109173528127e-06, + "loss": 1.3229267597198486, + "step": 5794 + }, + { + "epoch": 1.7839938437860716, + "grad_norm": 5.90625, + "learning_rate": 1.1215256047371837e-06, + "loss": 1.33932363986969, + "step": 5796 + }, + { + "epoch": 1.784609465178915, + "grad_norm": 5.46875, + "learning_rate": 1.120842158803669e-06, + "loss": 1.2903056144714355, + "step": 5798 + }, + { + "epoch": 1.7852250865717583, + "grad_norm": 5.1875, + "learning_rate": 1.120160580231424e-06, + "loss": 1.3245118856430054, + "step": 5800 + }, + { + "epoch": 1.785840707964602, + "grad_norm": 2.6875, + "learning_rate": 1.1194808696977487e-06, + "loss": 1.2954384088516235, + "step": 5802 + }, + { + "epoch": 1.7864563293574451, + "grad_norm": 2.921875, + "learning_rate": 1.1188030278780867e-06, + "loss": 1.1626046895980835, + "step": 5804 + }, + { + "epoch": 1.7870719507502886, + "grad_norm": 5.71875, + "learning_rate": 1.118127055446024e-06, + "loss": 1.4342396259307861, + "step": 5806 + }, + { + "epoch": 1.787687572143132, + "grad_norm": 5.46875, + "learning_rate": 1.1174529530732908e-06, + "loss": 1.3454452753067017, + "step": 5808 + }, + { + "epoch": 1.7883031935359752, + "grad_norm": 5.09375, + "learning_rate": 1.1167807214297562e-06, + "loss": 1.3604456186294556, + "step": 5810 + }, + { + "epoch": 1.7889188149288189, + "grad_norm": 5.09375, + "learning_rate": 1.116110361183433e-06, + "loss": 1.27693772315979, + "step": 5812 + }, + { + "epoch": 1.789534436321662, + "grad_norm": 4.34375, + "learning_rate": 1.115441873000473e-06, + "loss": 1.1668188571929932, + "step": 5814 + }, + { + "epoch": 1.7901500577145055, + "grad_norm": 16.625, + "learning_rate": 1.1147752575451674e-06, + "loss": 1.3752187490463257, + "step": 5816 + }, + { + "epoch": 1.790765679107349, + "grad_norm": 3.796875, + "learning_rate": 1.1141105154799475e-06, + "loss": 1.0731788873672485, + "step": 5818 + }, + { + "epoch": 1.7913813005001924, + "grad_norm": 5.4375, + "learning_rate": 1.113447647465382e-06, + "loss": 1.426667332649231, + "step": 5820 + }, + { + "epoch": 1.7919969218930358, + "grad_norm": 7.375, + "learning_rate": 1.112786654160178e-06, + "loss": 1.4523508548736572, + "step": 5822 + }, + { + "epoch": 1.792612543285879, + "grad_norm": 9.875, + "learning_rate": 1.112127536221179e-06, + "loss": 1.5422025918960571, + "step": 5824 + }, + { + "epoch": 1.7932281646787227, + "grad_norm": 15.0, + "learning_rate": 1.1114702943033656e-06, + "loss": 1.28928542137146, + "step": 5826 + }, + { + "epoch": 1.793843786071566, + "grad_norm": 2.28125, + "learning_rate": 1.1108149290598537e-06, + "loss": 1.0345327854156494, + "step": 5828 + }, + { + "epoch": 1.7944594074644094, + "grad_norm": 2.671875, + "learning_rate": 1.1101614411418945e-06, + "loss": 1.075919270515442, + "step": 5830 + }, + { + "epoch": 1.7950750288572528, + "grad_norm": 5.53125, + "learning_rate": 1.1095098311988735e-06, + "loss": 1.2990070581436157, + "step": 5832 + }, + { + "epoch": 1.7956906502500962, + "grad_norm": 7.5, + "learning_rate": 1.1088600998783101e-06, + "loss": 1.3440529108047485, + "step": 5834 + }, + { + "epoch": 1.7963062716429397, + "grad_norm": 9.25, + "learning_rate": 1.1082122478258572e-06, + "loss": 1.5774719715118408, + "step": 5836 + }, + { + "epoch": 1.7969218930357829, + "grad_norm": 6.875, + "learning_rate": 1.1075662756852996e-06, + "loss": 1.2371692657470703, + "step": 5838 + }, + { + "epoch": 1.7975375144286265, + "grad_norm": 3.515625, + "learning_rate": 1.106922184098554e-06, + "loss": 1.199177861213684, + "step": 5840 + }, + { + "epoch": 1.7981531358214697, + "grad_norm": 8.0625, + "learning_rate": 1.106279973705669e-06, + "loss": 1.2665684223175049, + "step": 5842 + }, + { + "epoch": 1.7987687572143132, + "grad_norm": 6.65625, + "learning_rate": 1.1056396451448233e-06, + "loss": 1.4166505336761475, + "step": 5844 + }, + { + "epoch": 1.7993843786071566, + "grad_norm": 7.15625, + "learning_rate": 1.1050011990523263e-06, + "loss": 1.285638451576233, + "step": 5846 + }, + { + "epoch": 1.8, + "grad_norm": 6.6875, + "learning_rate": 1.1043646360626159e-06, + "loss": 1.3939177989959717, + "step": 5848 + }, + { + "epoch": 1.8006156213928435, + "grad_norm": 8.5625, + "learning_rate": 1.1037299568082583e-06, + "loss": 1.106064796447754, + "step": 5850 + }, + { + "epoch": 1.8012312427856867, + "grad_norm": 10.875, + "learning_rate": 1.1030971619199496e-06, + "loss": 1.2404805421829224, + "step": 5852 + }, + { + "epoch": 1.8018468641785303, + "grad_norm": 5.28125, + "learning_rate": 1.1024662520265113e-06, + "loss": 0.9673604369163513, + "step": 5854 + }, + { + "epoch": 1.8024624855713736, + "grad_norm": 4.6875, + "learning_rate": 1.1018372277548934e-06, + "loss": 1.3252668380737305, + "step": 5856 + }, + { + "epoch": 1.803078106964217, + "grad_norm": 9.75, + "learning_rate": 1.1012100897301712e-06, + "loss": 1.380995512008667, + "step": 5858 + }, + { + "epoch": 1.8036937283570604, + "grad_norm": 10.1875, + "learning_rate": 1.1005848385755457e-06, + "loss": 1.1895322799682617, + "step": 5860 + }, + { + "epoch": 1.8043093497499036, + "grad_norm": 7.84375, + "learning_rate": 1.0999614749123433e-06, + "loss": 0.9498265981674194, + "step": 5862 + }, + { + "epoch": 1.8049249711427473, + "grad_norm": 2.40625, + "learning_rate": 1.099339999360014e-06, + "loss": 0.9431395530700684, + "step": 5864 + }, + { + "epoch": 1.8055405925355905, + "grad_norm": 5.5625, + "learning_rate": 1.0987204125361324e-06, + "loss": 1.2742674350738525, + "step": 5866 + }, + { + "epoch": 1.8061562139284342, + "grad_norm": 5.53125, + "learning_rate": 1.0981027150563954e-06, + "loss": 1.5245552062988281, + "step": 5868 + }, + { + "epoch": 1.8067718353212774, + "grad_norm": 7.125, + "learning_rate": 1.0974869075346228e-06, + "loss": 0.723529040813446, + "step": 5870 + }, + { + "epoch": 1.8073874567141208, + "grad_norm": 4.59375, + "learning_rate": 1.0968729905827575e-06, + "loss": 1.2482393980026245, + "step": 5872 + }, + { + "epoch": 1.8080030781069643, + "grad_norm": 3.859375, + "learning_rate": 1.0962609648108607e-06, + "loss": 1.2178765535354614, + "step": 5874 + }, + { + "epoch": 1.8086186994998075, + "grad_norm": 7.59375, + "learning_rate": 1.0956508308271174e-06, + "loss": 1.0812054872512817, + "step": 5876 + }, + { + "epoch": 1.8092343208926511, + "grad_norm": 4.9375, + "learning_rate": 1.0950425892378309e-06, + "loss": 1.3730570077896118, + "step": 5878 + }, + { + "epoch": 1.8098499422854943, + "grad_norm": 65.0, + "learning_rate": 1.094436240647425e-06, + "loss": 0.9377802610397339, + "step": 5880 + }, + { + "epoch": 1.8104655636783378, + "grad_norm": 10.75, + "learning_rate": 1.0938317856584415e-06, + "loss": 1.1239988803863525, + "step": 5882 + }, + { + "epoch": 1.8110811850711812, + "grad_norm": 16.125, + "learning_rate": 1.0932292248715407e-06, + "loss": 1.2952073812484741, + "step": 5884 + }, + { + "epoch": 1.8116968064640246, + "grad_norm": 6.03125, + "learning_rate": 1.0926285588855016e-06, + "loss": 1.3250057697296143, + "step": 5886 + }, + { + "epoch": 1.812312427856868, + "grad_norm": 17.625, + "learning_rate": 1.0920297882972183e-06, + "loss": 1.3514528274536133, + "step": 5888 + }, + { + "epoch": 1.8129280492497113, + "grad_norm": 6.5625, + "learning_rate": 1.0914329137017032e-06, + "loss": 1.3845181465148926, + "step": 5890 + }, + { + "epoch": 1.813543670642555, + "grad_norm": 39.5, + "learning_rate": 1.0908379356920838e-06, + "loss": 1.256075382232666, + "step": 5892 + }, + { + "epoch": 1.8141592920353982, + "grad_norm": 7.46875, + "learning_rate": 1.0902448548596034e-06, + "loss": 1.3791804313659668, + "step": 5894 + }, + { + "epoch": 1.8147749134282416, + "grad_norm": 4.90625, + "learning_rate": 1.089653671793619e-06, + "loss": 1.1116917133331299, + "step": 5896 + }, + { + "epoch": 1.815390534821085, + "grad_norm": 8.375, + "learning_rate": 1.0890643870816033e-06, + "loss": 1.268815279006958, + "step": 5898 + }, + { + "epoch": 1.8160061562139285, + "grad_norm": 3.171875, + "learning_rate": 1.0884770013091416e-06, + "loss": 1.3997559547424316, + "step": 5900 + }, + { + "epoch": 1.816621777606772, + "grad_norm": 7.03125, + "learning_rate": 1.0878915150599318e-06, + "loss": 1.3274210691452026, + "step": 5902 + }, + { + "epoch": 1.8172373989996151, + "grad_norm": 6.8125, + "learning_rate": 1.087307928915785e-06, + "loss": 1.410146713256836, + "step": 5904 + }, + { + "epoch": 1.8178530203924588, + "grad_norm": 22.375, + "learning_rate": 1.0867262434566237e-06, + "loss": 1.1526761054992676, + "step": 5906 + }, + { + "epoch": 1.818468641785302, + "grad_norm": 8.75, + "learning_rate": 1.0861464592604818e-06, + "loss": 1.2998844385147095, + "step": 5908 + }, + { + "epoch": 1.8190842631781454, + "grad_norm": 6.15625, + "learning_rate": 1.0855685769035044e-06, + "loss": 1.3243539333343506, + "step": 5910 + }, + { + "epoch": 1.8196998845709889, + "grad_norm": 4.375, + "learning_rate": 1.0849925969599454e-06, + "loss": 1.346010446548462, + "step": 5912 + }, + { + "epoch": 1.8203155059638323, + "grad_norm": 3.265625, + "learning_rate": 1.0844185200021695e-06, + "loss": 1.0687695741653442, + "step": 5914 + }, + { + "epoch": 1.8209311273566757, + "grad_norm": 10.4375, + "learning_rate": 1.0838463466006496e-06, + "loss": 0.9370536804199219, + "step": 5916 + }, + { + "epoch": 1.821546748749519, + "grad_norm": 5.375, + "learning_rate": 1.0832760773239668e-06, + "loss": 1.3170430660247803, + "step": 5918 + }, + { + "epoch": 1.8221623701423626, + "grad_norm": 5.3125, + "learning_rate": 1.0827077127388114e-06, + "loss": 1.3774001598358154, + "step": 5920 + }, + { + "epoch": 1.8227779915352058, + "grad_norm": 14.0625, + "learning_rate": 1.0821412534099794e-06, + "loss": 0.8619877696037292, + "step": 5922 + }, + { + "epoch": 1.8233936129280492, + "grad_norm": 9.6875, + "learning_rate": 1.0815766999003744e-06, + "loss": 1.6785545349121094, + "step": 5924 + }, + { + "epoch": 1.8240092343208927, + "grad_norm": 4.90625, + "learning_rate": 1.0810140527710057e-06, + "loss": 1.270500898361206, + "step": 5926 + }, + { + "epoch": 1.824624855713736, + "grad_norm": 4.4375, + "learning_rate": 1.0804533125809879e-06, + "loss": 1.0367622375488281, + "step": 5928 + }, + { + "epoch": 1.8252404771065796, + "grad_norm": 11.6875, + "learning_rate": 1.0798944798875425e-06, + "loss": 1.118377923965454, + "step": 5930 + }, + { + "epoch": 1.8258560984994228, + "grad_norm": 6.40625, + "learning_rate": 1.0793375552459925e-06, + "loss": 1.2934290170669556, + "step": 5932 + }, + { + "epoch": 1.8264717198922664, + "grad_norm": 8.3125, + "learning_rate": 1.0787825392097673e-06, + "loss": 1.301269292831421, + "step": 5934 + }, + { + "epoch": 1.8270873412851096, + "grad_norm": 5.09375, + "learning_rate": 1.0782294323303987e-06, + "loss": 1.43231999874115, + "step": 5936 + }, + { + "epoch": 1.827702962677953, + "grad_norm": 8.125, + "learning_rate": 1.0776782351575212e-06, + "loss": 1.0624133348464966, + "step": 5938 + }, + { + "epoch": 1.8283185840707965, + "grad_norm": 5.09375, + "learning_rate": 1.077128948238872e-06, + "loss": 1.4891644716262817, + "step": 5940 + }, + { + "epoch": 1.8289342054636397, + "grad_norm": 7.03125, + "learning_rate": 1.0765815721202895e-06, + "loss": 1.1424224376678467, + "step": 5942 + }, + { + "epoch": 1.8295498268564834, + "grad_norm": 6.53125, + "learning_rate": 1.076036107345714e-06, + "loss": 1.5409581661224365, + "step": 5944 + }, + { + "epoch": 1.8301654482493266, + "grad_norm": 4.625, + "learning_rate": 1.0754925544571858e-06, + "loss": 1.2086446285247803, + "step": 5946 + }, + { + "epoch": 1.83078106964217, + "grad_norm": 6.375, + "learning_rate": 1.0749509139948455e-06, + "loss": 1.3200281858444214, + "step": 5948 + }, + { + "epoch": 1.8313966910350135, + "grad_norm": 9.0, + "learning_rate": 1.074411186496934e-06, + "loss": 1.3422338962554932, + "step": 5950 + }, + { + "epoch": 1.832012312427857, + "grad_norm": 7.0, + "learning_rate": 1.0738733724997896e-06, + "loss": 1.2960294485092163, + "step": 5952 + }, + { + "epoch": 1.8326279338207003, + "grad_norm": 3.8125, + "learning_rate": 1.0733374725378508e-06, + "loss": 1.1552178859710693, + "step": 5954 + }, + { + "epoch": 1.8332435552135435, + "grad_norm": 8.0625, + "learning_rate": 1.0728034871436536e-06, + "loss": 1.386427879333496, + "step": 5956 + }, + { + "epoch": 1.8338591766063872, + "grad_norm": 5.40625, + "learning_rate": 1.0722714168478306e-06, + "loss": 1.6475698947906494, + "step": 5958 + }, + { + "epoch": 1.8344747979992304, + "grad_norm": 5.875, + "learning_rate": 1.0717412621791123e-06, + "loss": 1.4369161128997803, + "step": 5960 + }, + { + "epoch": 1.8350904193920738, + "grad_norm": 3.578125, + "learning_rate": 1.0712130236643257e-06, + "loss": 1.274042010307312, + "step": 5962 + }, + { + "epoch": 1.8357060407849173, + "grad_norm": 21.125, + "learning_rate": 1.0706867018283924e-06, + "loss": 0.9764050245285034, + "step": 5964 + }, + { + "epoch": 1.8363216621777607, + "grad_norm": 8.125, + "learning_rate": 1.070162297194331e-06, + "loss": 1.4333478212356567, + "step": 5966 + }, + { + "epoch": 1.8369372835706042, + "grad_norm": 16.125, + "learning_rate": 1.0696398102832534e-06, + "loss": 1.6606907844543457, + "step": 5968 + }, + { + "epoch": 1.8375529049634474, + "grad_norm": 5.125, + "learning_rate": 1.0691192416143673e-06, + "loss": 1.4241994619369507, + "step": 5970 + }, + { + "epoch": 1.838168526356291, + "grad_norm": 6.28125, + "learning_rate": 1.068600591704973e-06, + "loss": 1.156654953956604, + "step": 5972 + }, + { + "epoch": 1.8387841477491342, + "grad_norm": 4.75, + "learning_rate": 1.0680838610704645e-06, + "loss": 1.4421087503433228, + "step": 5974 + }, + { + "epoch": 1.8393997691419777, + "grad_norm": 2.90625, + "learning_rate": 1.0675690502243288e-06, + "loss": 1.1113789081573486, + "step": 5976 + }, + { + "epoch": 1.840015390534821, + "grad_norm": 7.75, + "learning_rate": 1.0670561596781454e-06, + "loss": 1.331964373588562, + "step": 5978 + }, + { + "epoch": 1.8406310119276645, + "grad_norm": 3.8125, + "learning_rate": 1.0665451899415843e-06, + "loss": 1.075302004814148, + "step": 5980 + }, + { + "epoch": 1.841246633320508, + "grad_norm": 9.0, + "learning_rate": 1.0660361415224077e-06, + "loss": 1.4577410221099854, + "step": 5982 + }, + { + "epoch": 1.8418622547133512, + "grad_norm": 29.75, + "learning_rate": 1.0655290149264688e-06, + "loss": 1.7311087846755981, + "step": 5984 + }, + { + "epoch": 1.8424778761061948, + "grad_norm": 8.0625, + "learning_rate": 1.0650238106577104e-06, + "loss": 1.6321951150894165, + "step": 5986 + }, + { + "epoch": 1.843093497499038, + "grad_norm": 17.0, + "learning_rate": 1.0645205292181662e-06, + "loss": 1.5196198225021362, + "step": 5988 + }, + { + "epoch": 1.8437091188918815, + "grad_norm": 7.0, + "learning_rate": 1.0640191711079568e-06, + "loss": 1.3114598989486694, + "step": 5990 + }, + { + "epoch": 1.844324740284725, + "grad_norm": 6.25, + "learning_rate": 1.063519736825294e-06, + "loss": 1.561679482460022, + "step": 5992 + }, + { + "epoch": 1.8449403616775681, + "grad_norm": 5.625, + "learning_rate": 1.0630222268664764e-06, + "loss": 1.591888666152954, + "step": 5994 + }, + { + "epoch": 1.8455559830704118, + "grad_norm": 19.375, + "learning_rate": 1.062526641725891e-06, + "loss": 1.102703332901001, + "step": 5996 + }, + { + "epoch": 1.846171604463255, + "grad_norm": 6.3125, + "learning_rate": 1.0620329818960116e-06, + "loss": 1.6592758893966675, + "step": 5998 + }, + { + "epoch": 1.8467872258560984, + "grad_norm": 8.4375, + "learning_rate": 1.0615412478673996e-06, + "loss": 1.662277340888977, + "step": 6000 + }, + { + "epoch": 1.8474028472489419, + "grad_norm": 9.3125, + "learning_rate": 1.0610514401287015e-06, + "loss": 1.3304259777069092, + "step": 6002 + }, + { + "epoch": 1.8480184686417853, + "grad_norm": 6.1875, + "learning_rate": 1.0605635591666505e-06, + "loss": 0.9376093149185181, + "step": 6004 + }, + { + "epoch": 1.8486340900346288, + "grad_norm": 3.671875, + "learning_rate": 1.0600776054660646e-06, + "loss": 1.0420424938201904, + "step": 6006 + }, + { + "epoch": 1.849249711427472, + "grad_norm": 13.0625, + "learning_rate": 1.0595935795098474e-06, + "loss": 1.5878580808639526, + "step": 6008 + }, + { + "epoch": 1.8498653328203156, + "grad_norm": 6.09375, + "learning_rate": 1.0591114817789861e-06, + "loss": 1.1772234439849854, + "step": 6010 + }, + { + "epoch": 1.8504809542131588, + "grad_norm": 6.625, + "learning_rate": 1.058631312752552e-06, + "loss": 1.3105149269104004, + "step": 6012 + }, + { + "epoch": 1.8510965756060023, + "grad_norm": 5.4375, + "learning_rate": 1.0581530729076997e-06, + "loss": 1.240427851676941, + "step": 6014 + }, + { + "epoch": 1.8517121969988457, + "grad_norm": 2.703125, + "learning_rate": 1.057676762719667e-06, + "loss": 0.9596577882766724, + "step": 6016 + }, + { + "epoch": 1.8523278183916891, + "grad_norm": 2.734375, + "learning_rate": 1.057202382661774e-06, + "loss": 1.3915760517120361, + "step": 6018 + }, + { + "epoch": 1.8529434397845326, + "grad_norm": 8.6875, + "learning_rate": 1.0567299332054225e-06, + "loss": 1.3262848854064941, + "step": 6020 + }, + { + "epoch": 1.8535590611773758, + "grad_norm": 33.25, + "learning_rate": 1.0562594148200966e-06, + "loss": 1.464792251586914, + "step": 6022 + }, + { + "epoch": 1.8541746825702194, + "grad_norm": 7.5, + "learning_rate": 1.055790827973361e-06, + "loss": 1.4374839067459106, + "step": 6024 + }, + { + "epoch": 1.8547903039630627, + "grad_norm": 6.59375, + "learning_rate": 1.0553241731308602e-06, + "loss": 1.3735730648040771, + "step": 6026 + }, + { + "epoch": 1.855405925355906, + "grad_norm": 8.5, + "learning_rate": 1.0548594507563207e-06, + "loss": 1.1841590404510498, + "step": 6028 + }, + { + "epoch": 1.8560215467487495, + "grad_norm": 12.0, + "learning_rate": 1.0543966613115464e-06, + "loss": 1.6395710706710815, + "step": 6030 + }, + { + "epoch": 1.856637168141593, + "grad_norm": 5.03125, + "learning_rate": 1.0539358052564224e-06, + "loss": 1.2497833967208862, + "step": 6032 + }, + { + "epoch": 1.8572527895344364, + "grad_norm": 36.0, + "learning_rate": 1.0534768830489111e-06, + "loss": 0.7703664898872375, + "step": 6034 + }, + { + "epoch": 1.8578684109272796, + "grad_norm": 16.875, + "learning_rate": 1.0530198951450542e-06, + "loss": 1.0920758247375488, + "step": 6036 + }, + { + "epoch": 1.8584840323201233, + "grad_norm": 10.1875, + "learning_rate": 1.0525648419989705e-06, + "loss": 1.3005291223526, + "step": 6038 + }, + { + "epoch": 1.8590996537129665, + "grad_norm": 4.875, + "learning_rate": 1.052111724062857e-06, + "loss": 0.813774049282074, + "step": 6040 + }, + { + "epoch": 1.85971527510581, + "grad_norm": 7.84375, + "learning_rate": 1.0516605417869865e-06, + "loss": 1.2903823852539062, + "step": 6042 + }, + { + "epoch": 1.8603308964986534, + "grad_norm": 4.75, + "learning_rate": 1.0512112956197094e-06, + "loss": 0.8724620938301086, + "step": 6044 + }, + { + "epoch": 1.8609465178914966, + "grad_norm": 8.8125, + "learning_rate": 1.0507639860074517e-06, + "loss": 1.3100684881210327, + "step": 6046 + }, + { + "epoch": 1.8615621392843402, + "grad_norm": 5.375, + "learning_rate": 1.0503186133947148e-06, + "loss": 1.3518569469451904, + "step": 6048 + }, + { + "epoch": 1.8621777606771834, + "grad_norm": 3.734375, + "learning_rate": 1.0498751782240752e-06, + "loss": 0.7228596210479736, + "step": 6050 + }, + { + "epoch": 1.862793382070027, + "grad_norm": 5.0625, + "learning_rate": 1.049433680936185e-06, + "loss": 1.2877094745635986, + "step": 6052 + }, + { + "epoch": 1.8634090034628703, + "grad_norm": 7.78125, + "learning_rate": 1.0489941219697695e-06, + "loss": 1.3102952241897583, + "step": 6054 + }, + { + "epoch": 1.8640246248557137, + "grad_norm": 8.6875, + "learning_rate": 1.0485565017616286e-06, + "loss": 1.3873236179351807, + "step": 6056 + }, + { + "epoch": 1.8646402462485572, + "grad_norm": 6.09375, + "learning_rate": 1.0481208207466349e-06, + "loss": 1.362139344215393, + "step": 6058 + }, + { + "epoch": 1.8652558676414004, + "grad_norm": 6.03125, + "learning_rate": 1.0476870793577346e-06, + "loss": 1.3270050287246704, + "step": 6060 + }, + { + "epoch": 1.865871489034244, + "grad_norm": 8.375, + "learning_rate": 1.0472552780259464e-06, + "loss": 1.2958656549453735, + "step": 6062 + }, + { + "epoch": 1.8664871104270873, + "grad_norm": 6.34375, + "learning_rate": 1.0468254171803607e-06, + "loss": 1.2908275127410889, + "step": 6064 + }, + { + "epoch": 1.8671027318199307, + "grad_norm": 5.625, + "learning_rate": 1.0463974972481402e-06, + "loss": 1.4341435432434082, + "step": 6066 + }, + { + "epoch": 1.8677183532127741, + "grad_norm": 3.375, + "learning_rate": 1.045971518654518e-06, + "loss": 1.2812824249267578, + "step": 6068 + }, + { + "epoch": 1.8683339746056176, + "grad_norm": 11.25, + "learning_rate": 1.045547481822799e-06, + "loss": 1.5339080095291138, + "step": 6070 + }, + { + "epoch": 1.868949595998461, + "grad_norm": 6.40625, + "learning_rate": 1.0451253871743582e-06, + "loss": 1.1604589223861694, + "step": 6072 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 6.625, + "learning_rate": 1.0447052351286401e-06, + "loss": 1.3172587156295776, + "step": 6074 + }, + { + "epoch": 1.8701808387841479, + "grad_norm": 5.3125, + "learning_rate": 1.0442870261031593e-06, + "loss": 1.0107141733169556, + "step": 6076 + }, + { + "epoch": 1.870796460176991, + "grad_norm": 7.46875, + "learning_rate": 1.0438707605134996e-06, + "loss": 1.1750514507293701, + "step": 6078 + }, + { + "epoch": 1.8714120815698345, + "grad_norm": 5.6875, + "learning_rate": 1.0434564387733138e-06, + "loss": 1.1378748416900635, + "step": 6080 + }, + { + "epoch": 1.872027702962678, + "grad_norm": 4.34375, + "learning_rate": 1.0430440612943222e-06, + "loss": 1.150460958480835, + "step": 6082 + }, + { + "epoch": 1.8726433243555214, + "grad_norm": 7.84375, + "learning_rate": 1.0426336284863136e-06, + "loss": 1.2356613874435425, + "step": 6084 + }, + { + "epoch": 1.8732589457483648, + "grad_norm": 12.5, + "learning_rate": 1.0422251407571444e-06, + "loss": 1.6873862743377686, + "step": 6086 + }, + { + "epoch": 1.873874567141208, + "grad_norm": 8.0, + "learning_rate": 1.0418185985127379e-06, + "loss": 1.527343988418579, + "step": 6088 + }, + { + "epoch": 1.8744901885340517, + "grad_norm": 23.25, + "learning_rate": 1.041414002157084e-06, + "loss": 1.393473505973816, + "step": 6090 + }, + { + "epoch": 1.875105809926895, + "grad_norm": 7.1875, + "learning_rate": 1.0410113520922402e-06, + "loss": 1.6080451011657715, + "step": 6092 + }, + { + "epoch": 1.8757214313197383, + "grad_norm": 28.0, + "learning_rate": 1.0406106487183277e-06, + "loss": 1.6712785959243774, + "step": 6094 + }, + { + "epoch": 1.8763370527125818, + "grad_norm": 9.375, + "learning_rate": 1.040211892433535e-06, + "loss": 1.5237807035446167, + "step": 6096 + }, + { + "epoch": 1.8769526741054252, + "grad_norm": 11.5, + "learning_rate": 1.039815083634115e-06, + "loss": 1.5383855104446411, + "step": 6098 + }, + { + "epoch": 1.8775682954982686, + "grad_norm": 16.125, + "learning_rate": 1.0394202227143857e-06, + "loss": 1.5905861854553223, + "step": 6100 + }, + { + "epoch": 1.8781839168911119, + "grad_norm": 16.625, + "learning_rate": 1.0390273100667291e-06, + "loss": 1.1382654905319214, + "step": 6102 + }, + { + "epoch": 1.8787995382839555, + "grad_norm": 12.25, + "learning_rate": 1.0386363460815913e-06, + "loss": 0.9064276218414307, + "step": 6104 + }, + { + "epoch": 1.8794151596767987, + "grad_norm": 3.078125, + "learning_rate": 1.0382473311474821e-06, + "loss": 1.2223496437072754, + "step": 6106 + }, + { + "epoch": 1.8800307810696422, + "grad_norm": 111.0, + "learning_rate": 1.037860265650974e-06, + "loss": 1.2195074558258057, + "step": 6108 + }, + { + "epoch": 1.8806464024624856, + "grad_norm": 8.8125, + "learning_rate": 1.037475149976703e-06, + "loss": 1.5098378658294678, + "step": 6110 + }, + { + "epoch": 1.8812620238553288, + "grad_norm": 9.6875, + "learning_rate": 1.0370919845073674e-06, + "loss": 1.5166021585464478, + "step": 6112 + }, + { + "epoch": 1.8818776452481725, + "grad_norm": 5.5625, + "learning_rate": 1.0367107696237266e-06, + "loss": 1.114842176437378, + "step": 6114 + }, + { + "epoch": 1.8824932666410157, + "grad_norm": 6.4375, + "learning_rate": 1.036331505704603e-06, + "loss": 0.7568085789680481, + "step": 6116 + }, + { + "epoch": 1.8831088880338593, + "grad_norm": 11.0, + "learning_rate": 1.0359541931268793e-06, + "loss": 1.5966717004776, + "step": 6118 + }, + { + "epoch": 1.8837245094267026, + "grad_norm": 7.3125, + "learning_rate": 1.0355788322655e-06, + "loss": 1.569027304649353, + "step": 6120 + }, + { + "epoch": 1.884340130819546, + "grad_norm": 2.609375, + "learning_rate": 1.0352054234934688e-06, + "loss": 0.9419485330581665, + "step": 6122 + }, + { + "epoch": 1.8849557522123894, + "grad_norm": 7.03125, + "learning_rate": 1.0348339671818509e-06, + "loss": 1.1914021968841553, + "step": 6124 + }, + { + "epoch": 1.8855713736052326, + "grad_norm": 4.09375, + "learning_rate": 1.0344644636997705e-06, + "loss": 1.2729462385177612, + "step": 6126 + }, + { + "epoch": 1.8861869949980763, + "grad_norm": 6.65625, + "learning_rate": 1.0340969134144118e-06, + "loss": 1.2701356410980225, + "step": 6128 + }, + { + "epoch": 1.8868026163909195, + "grad_norm": 7.15625, + "learning_rate": 1.0337313166910176e-06, + "loss": 1.2504490613937378, + "step": 6130 + }, + { + "epoch": 1.887418237783763, + "grad_norm": 5.125, + "learning_rate": 1.0333676738928895e-06, + "loss": 1.3305485248565674, + "step": 6132 + }, + { + "epoch": 1.8880338591766064, + "grad_norm": 8.8125, + "learning_rate": 1.0330059853813875e-06, + "loss": 1.276931881904602, + "step": 6134 + }, + { + "epoch": 1.8886494805694498, + "grad_norm": 6.125, + "learning_rate": 1.0326462515159297e-06, + "loss": 0.7779605388641357, + "step": 6136 + }, + { + "epoch": 1.8892651019622932, + "grad_norm": 6.0625, + "learning_rate": 1.0322884726539915e-06, + "loss": 1.1487722396850586, + "step": 6138 + }, + { + "epoch": 1.8898807233551365, + "grad_norm": 7.21875, + "learning_rate": 1.0319326491511062e-06, + "loss": 1.1358317136764526, + "step": 6140 + }, + { + "epoch": 1.8904963447479801, + "grad_norm": 2.375, + "learning_rate": 1.0315787813608631e-06, + "loss": 1.1888786554336548, + "step": 6142 + }, + { + "epoch": 1.8911119661408233, + "grad_norm": 9.6875, + "learning_rate": 1.031226869634909e-06, + "loss": 0.9452347159385681, + "step": 6144 + }, + { + "epoch": 1.8917275875336668, + "grad_norm": 5.09375, + "learning_rate": 1.0308769143229458e-06, + "loss": 1.399794578552246, + "step": 6146 + }, + { + "epoch": 1.8923432089265102, + "grad_norm": 22.25, + "learning_rate": 1.0305289157727326e-06, + "loss": 1.4802740812301636, + "step": 6148 + }, + { + "epoch": 1.8929588303193536, + "grad_norm": 14.625, + "learning_rate": 1.030182874330083e-06, + "loss": 1.3717149496078491, + "step": 6150 + }, + { + "epoch": 1.893574451712197, + "grad_norm": 3.34375, + "learning_rate": 1.0298387903388665e-06, + "loss": 1.2769005298614502, + "step": 6152 + }, + { + "epoch": 1.8941900731050403, + "grad_norm": 9.75, + "learning_rate": 1.0294966641410067e-06, + "loss": 1.3929970264434814, + "step": 6154 + }, + { + "epoch": 1.894805694497884, + "grad_norm": 8.0, + "learning_rate": 1.0291564960764822e-06, + "loss": 1.286952018737793, + "step": 6156 + }, + { + "epoch": 1.8954213158907272, + "grad_norm": 8.8125, + "learning_rate": 1.028818286483326e-06, + "loss": 1.2915070056915283, + "step": 6158 + }, + { + "epoch": 1.8960369372835706, + "grad_norm": 7.375, + "learning_rate": 1.0284820356976239e-06, + "loss": 1.7561068534851074, + "step": 6160 + }, + { + "epoch": 1.896652558676414, + "grad_norm": 12.9375, + "learning_rate": 1.0281477440535157e-06, + "loss": 0.9700071215629578, + "step": 6162 + }, + { + "epoch": 1.8972681800692575, + "grad_norm": 5.5625, + "learning_rate": 1.0278154118831956e-06, + "loss": 1.5087077617645264, + "step": 6164 + }, + { + "epoch": 1.897883801462101, + "grad_norm": 6.6875, + "learning_rate": 1.0274850395169086e-06, + "loss": 1.482177972793579, + "step": 6166 + }, + { + "epoch": 1.898499422854944, + "grad_norm": 5.6875, + "learning_rate": 1.0271566272829532e-06, + "loss": 1.3078364133834839, + "step": 6168 + }, + { + "epoch": 1.8991150442477878, + "grad_norm": 3.953125, + "learning_rate": 1.0268301755076806e-06, + "loss": 1.3884791135787964, + "step": 6170 + }, + { + "epoch": 1.899730665640631, + "grad_norm": 6.34375, + "learning_rate": 1.0265056845154927e-06, + "loss": 0.9594384431838989, + "step": 6172 + }, + { + "epoch": 1.9003462870334744, + "grad_norm": 11.1875, + "learning_rate": 1.0261831546288435e-06, + "loss": 1.4539716243743896, + "step": 6174 + }, + { + "epoch": 1.9009619084263178, + "grad_norm": 9.0625, + "learning_rate": 1.0258625861682383e-06, + "loss": 1.788408637046814, + "step": 6176 + }, + { + "epoch": 1.901577529819161, + "grad_norm": 4.8125, + "learning_rate": 1.0255439794522332e-06, + "loss": 1.148524284362793, + "step": 6178 + }, + { + "epoch": 1.9021931512120047, + "grad_norm": 8.0625, + "learning_rate": 1.025227334797435e-06, + "loss": 1.4598939418792725, + "step": 6180 + }, + { + "epoch": 1.902808772604848, + "grad_norm": 4.625, + "learning_rate": 1.0249126525185e-06, + "loss": 1.4986612796783447, + "step": 6182 + }, + { + "epoch": 1.9034243939976914, + "grad_norm": 7.84375, + "learning_rate": 1.0245999329281356e-06, + "loss": 1.09617280960083, + "step": 6184 + }, + { + "epoch": 1.9040400153905348, + "grad_norm": 7.1875, + "learning_rate": 1.024289176337098e-06, + "loss": 1.5821504592895508, + "step": 6186 + }, + { + "epoch": 1.9046556367833782, + "grad_norm": 4.21875, + "learning_rate": 1.0239803830541933e-06, + "loss": 1.1723260879516602, + "step": 6188 + }, + { + "epoch": 1.9052712581762217, + "grad_norm": 6.5625, + "learning_rate": 1.023673553386276e-06, + "loss": 1.4746092557907104, + "step": 6190 + }, + { + "epoch": 1.9058868795690649, + "grad_norm": 19.75, + "learning_rate": 1.0233686876382493e-06, + "loss": 1.318878412246704, + "step": 6192 + }, + { + "epoch": 1.9065025009619085, + "grad_norm": 5.5, + "learning_rate": 1.023065786113066e-06, + "loss": 1.4316555261611938, + "step": 6194 + }, + { + "epoch": 1.9071181223547518, + "grad_norm": 8.375, + "learning_rate": 1.0227648491117256e-06, + "loss": 1.5165979862213135, + "step": 6196 + }, + { + "epoch": 1.9077337437475952, + "grad_norm": 7.28125, + "learning_rate": 1.0224658769332758e-06, + "loss": 1.47892427444458, + "step": 6198 + }, + { + "epoch": 1.9083493651404386, + "grad_norm": 17.5, + "learning_rate": 1.0221688698748124e-06, + "loss": 1.3471444845199585, + "step": 6200 + }, + { + "epoch": 1.908964986533282, + "grad_norm": 7.125, + "learning_rate": 1.0218738282314776e-06, + "loss": 0.6739861369132996, + "step": 6202 + }, + { + "epoch": 1.9095806079261255, + "grad_norm": 8.5, + "learning_rate": 1.021580752296461e-06, + "loss": 0.8699933886528015, + "step": 6204 + }, + { + "epoch": 1.9101962293189687, + "grad_norm": 5.75, + "learning_rate": 1.0212896423609986e-06, + "loss": 1.5890634059906006, + "step": 6206 + }, + { + "epoch": 1.9108118507118124, + "grad_norm": 7.3125, + "learning_rate": 1.0210004987143736e-06, + "loss": 1.2174679040908813, + "step": 6208 + }, + { + "epoch": 1.9114274721046556, + "grad_norm": 4.25, + "learning_rate": 1.0207133216439136e-06, + "loss": 1.385654091835022, + "step": 6210 + }, + { + "epoch": 1.912043093497499, + "grad_norm": 4.3125, + "learning_rate": 1.020428111434993e-06, + "loss": 1.5168359279632568, + "step": 6212 + }, + { + "epoch": 1.9126587148903424, + "grad_norm": 8.6875, + "learning_rate": 1.020144868371032e-06, + "loss": 1.3468291759490967, + "step": 6214 + }, + { + "epoch": 1.9132743362831859, + "grad_norm": 2.671875, + "learning_rate": 1.0198635927334954e-06, + "loss": 1.1638237237930298, + "step": 6216 + }, + { + "epoch": 1.9138899576760293, + "grad_norm": 49.5, + "learning_rate": 1.0195842848018932e-06, + "loss": 1.425129771232605, + "step": 6218 + }, + { + "epoch": 1.9145055790688725, + "grad_norm": 6.15625, + "learning_rate": 1.01930694485378e-06, + "loss": 1.3448690176010132, + "step": 6220 + }, + { + "epoch": 1.9151212004617162, + "grad_norm": 6.28125, + "learning_rate": 1.0190315731647542e-06, + "loss": 1.4132188558578491, + "step": 6222 + }, + { + "epoch": 1.9157368218545594, + "grad_norm": 6.625, + "learning_rate": 1.0187581700084593e-06, + "loss": 1.5360431671142578, + "step": 6224 + }, + { + "epoch": 1.9163524432474028, + "grad_norm": 5.375, + "learning_rate": 1.018486735656582e-06, + "loss": 1.2933526039123535, + "step": 6226 + }, + { + "epoch": 1.9169680646402463, + "grad_norm": 25.0, + "learning_rate": 1.0182172703788529e-06, + "loss": 1.4225658178329468, + "step": 6228 + }, + { + "epoch": 1.9175836860330895, + "grad_norm": 1.8828125, + "learning_rate": 1.0179497744430456e-06, + "loss": 1.0137361288070679, + "step": 6230 + }, + { + "epoch": 1.9181993074259331, + "grad_norm": 6.625, + "learning_rate": 1.0176842481149765e-06, + "loss": 1.0409231185913086, + "step": 6232 + }, + { + "epoch": 1.9188149288187764, + "grad_norm": 7.34375, + "learning_rate": 1.0174206916585056e-06, + "loss": 1.2320098876953125, + "step": 6234 + }, + { + "epoch": 1.91943055021162, + "grad_norm": 4.0, + "learning_rate": 1.017159105335534e-06, + "loss": 1.1500188112258911, + "step": 6236 + }, + { + "epoch": 1.9200461716044632, + "grad_norm": 8.875, + "learning_rate": 1.016899489406007e-06, + "loss": 1.364874005317688, + "step": 6238 + }, + { + "epoch": 1.9206617929973067, + "grad_norm": 7.1875, + "learning_rate": 1.0166418441279101e-06, + "loss": 1.4855751991271973, + "step": 6240 + }, + { + "epoch": 1.92127741439015, + "grad_norm": 3.015625, + "learning_rate": 1.0163861697572714e-06, + "loss": 1.0191508531570435, + "step": 6242 + }, + { + "epoch": 1.9218930357829933, + "grad_norm": 6.15625, + "learning_rate": 1.0161324665481598e-06, + "loss": 1.259505033493042, + "step": 6244 + }, + { + "epoch": 1.922508657175837, + "grad_norm": 7.78125, + "learning_rate": 1.0158807347526865e-06, + "loss": 1.6142199039459229, + "step": 6246 + }, + { + "epoch": 1.9231242785686802, + "grad_norm": 4.75, + "learning_rate": 1.0156309746210028e-06, + "loss": 1.6033637523651123, + "step": 6248 + }, + { + "epoch": 1.9237398999615236, + "grad_norm": 5.90625, + "learning_rate": 1.0153831864013008e-06, + "loss": 1.5636179447174072, + "step": 6250 + }, + { + "epoch": 1.924355521354367, + "grad_norm": 14.4375, + "learning_rate": 1.0151373703398134e-06, + "loss": 1.2434797286987305, + "step": 6252 + }, + { + "epoch": 1.9249711427472105, + "grad_norm": 10.75, + "learning_rate": 1.0148935266808134e-06, + "loss": 1.4169031381607056, + "step": 6254 + }, + { + "epoch": 1.925586764140054, + "grad_norm": 6.25, + "learning_rate": 1.0146516556666135e-06, + "loss": 1.2062106132507324, + "step": 6256 + }, + { + "epoch": 1.9262023855328971, + "grad_norm": 6.6875, + "learning_rate": 1.0144117575375667e-06, + "loss": 1.5046076774597168, + "step": 6258 + }, + { + "epoch": 1.9268180069257408, + "grad_norm": 3.15625, + "learning_rate": 1.0141738325320656e-06, + "loss": 1.0233358144760132, + "step": 6260 + }, + { + "epoch": 1.927433628318584, + "grad_norm": 5.625, + "learning_rate": 1.0139378808865404e-06, + "loss": 1.2588615417480469, + "step": 6262 + }, + { + "epoch": 1.9280492497114274, + "grad_norm": 2.828125, + "learning_rate": 1.0137039028354624e-06, + "loss": 1.157249093055725, + "step": 6264 + }, + { + "epoch": 1.9286648711042709, + "grad_norm": 7.96875, + "learning_rate": 1.0134718986113406e-06, + "loss": 1.1970008611679077, + "step": 6266 + }, + { + "epoch": 1.9292804924971143, + "grad_norm": 5.125, + "learning_rate": 1.0132418684447227e-06, + "loss": 1.3345950841903687, + "step": 6268 + }, + { + "epoch": 1.9298961138899577, + "grad_norm": 8.0, + "learning_rate": 1.013013812564195e-06, + "loss": 1.1379836797714233, + "step": 6270 + }, + { + "epoch": 1.930511735282801, + "grad_norm": 13.8125, + "learning_rate": 1.0127877311963818e-06, + "loss": 1.1116348505020142, + "step": 6272 + }, + { + "epoch": 1.9311273566756446, + "grad_norm": 4.28125, + "learning_rate": 1.0125636245659453e-06, + "loss": 1.2857990264892578, + "step": 6274 + }, + { + "epoch": 1.9317429780684878, + "grad_norm": 5.34375, + "learning_rate": 1.012341492895585e-06, + "loss": 1.5975061655044556, + "step": 6276 + }, + { + "epoch": 1.9323585994613313, + "grad_norm": 3.796875, + "learning_rate": 1.0121213364060383e-06, + "loss": 1.213213324546814, + "step": 6278 + }, + { + "epoch": 1.9329742208541747, + "grad_norm": 7.71875, + "learning_rate": 1.0119031553160791e-06, + "loss": 1.5098799467086792, + "step": 6280 + }, + { + "epoch": 1.9335898422470181, + "grad_norm": 7.375, + "learning_rate": 1.01168694984252e-06, + "loss": 1.2202961444854736, + "step": 6282 + }, + { + "epoch": 1.9342054636398616, + "grad_norm": 6.40625, + "learning_rate": 1.011472720200208e-06, + "loss": 1.6116353273391724, + "step": 6284 + }, + { + "epoch": 1.9348210850327048, + "grad_norm": 5.28125, + "learning_rate": 1.0112604666020288e-06, + "loss": 1.3997621536254883, + "step": 6286 + }, + { + "epoch": 1.9354367064255484, + "grad_norm": 15.6875, + "learning_rate": 1.011050189258903e-06, + "loss": 1.4360119104385376, + "step": 6288 + }, + { + "epoch": 1.9360523278183916, + "grad_norm": 11.1875, + "learning_rate": 1.010841888379788e-06, + "loss": 1.6121647357940674, + "step": 6290 + }, + { + "epoch": 1.936667949211235, + "grad_norm": 3.203125, + "learning_rate": 1.0106355641716772e-06, + "loss": 1.1204249858856201, + "step": 6292 + }, + { + "epoch": 1.9372835706040785, + "grad_norm": 5.53125, + "learning_rate": 1.0104312168395996e-06, + "loss": 0.9558459520339966, + "step": 6294 + }, + { + "epoch": 1.9378991919969217, + "grad_norm": 6.40625, + "learning_rate": 1.0102288465866196e-06, + "loss": 1.29323148727417, + "step": 6296 + }, + { + "epoch": 1.9385148133897654, + "grad_norm": 4.28125, + "learning_rate": 1.0100284536138372e-06, + "loss": 1.245251178741455, + "step": 6298 + }, + { + "epoch": 1.9391304347826086, + "grad_norm": 18.125, + "learning_rate": 1.0098300381203873e-06, + "loss": 1.4215116500854492, + "step": 6300 + }, + { + "epoch": 1.9397460561754523, + "grad_norm": 8.5, + "learning_rate": 1.0096336003034398e-06, + "loss": 1.6721007823944092, + "step": 6302 + }, + { + "epoch": 1.9403616775682955, + "grad_norm": 4.5, + "learning_rate": 1.0094391403581991e-06, + "loss": 1.408481478691101, + "step": 6304 + }, + { + "epoch": 1.940977298961139, + "grad_norm": 7.9375, + "learning_rate": 1.0092466584779052e-06, + "loss": 1.1964179277420044, + "step": 6306 + }, + { + "epoch": 1.9415929203539823, + "grad_norm": 6.25, + "learning_rate": 1.009056154853831e-06, + "loss": 1.6166949272155762, + "step": 6308 + }, + { + "epoch": 1.9422085417468256, + "grad_norm": 7.78125, + "learning_rate": 1.008867629675284e-06, + "loss": 1.4972113370895386, + "step": 6310 + }, + { + "epoch": 1.9428241631396692, + "grad_norm": 13.1875, + "learning_rate": 1.0086810831296071e-06, + "loss": 1.3295609951019287, + "step": 6312 + }, + { + "epoch": 1.9434397845325124, + "grad_norm": 6.28125, + "learning_rate": 1.0084965154021741e-06, + "loss": 1.528801679611206, + "step": 6314 + }, + { + "epoch": 1.9440554059253559, + "grad_norm": 3.109375, + "learning_rate": 1.0083139266763955e-06, + "loss": 1.281317949295044, + "step": 6316 + }, + { + "epoch": 1.9446710273181993, + "grad_norm": 4.84375, + "learning_rate": 1.0081333171337132e-06, + "loss": 1.282319188117981, + "step": 6318 + }, + { + "epoch": 1.9452866487110427, + "grad_norm": 7.65625, + "learning_rate": 1.0079546869536027e-06, + "loss": 1.2659920454025269, + "step": 6320 + }, + { + "epoch": 1.9459022701038862, + "grad_norm": 5.0, + "learning_rate": 1.0077780363135736e-06, + "loss": 1.0919814109802246, + "step": 6322 + }, + { + "epoch": 1.9465178914967294, + "grad_norm": 10.625, + "learning_rate": 1.0076033653891667e-06, + "loss": 1.3609809875488281, + "step": 6324 + }, + { + "epoch": 1.947133512889573, + "grad_norm": 2.6875, + "learning_rate": 1.007430674353957e-06, + "loss": 1.2212693691253662, + "step": 6326 + }, + { + "epoch": 1.9477491342824163, + "grad_norm": 5.96875, + "learning_rate": 1.0072599633795512e-06, + "loss": 1.3360657691955566, + "step": 6328 + }, + { + "epoch": 1.9483647556752597, + "grad_norm": 5.375, + "learning_rate": 1.007091232635589e-06, + "loss": 1.6322662830352783, + "step": 6330 + }, + { + "epoch": 1.9489803770681031, + "grad_norm": 9.8125, + "learning_rate": 1.0069244822897413e-06, + "loss": 0.840678334236145, + "step": 6332 + }, + { + "epoch": 1.9495959984609466, + "grad_norm": 12.625, + "learning_rate": 1.006759712507712e-06, + "loss": 1.6254369020462036, + "step": 6334 + }, + { + "epoch": 1.95021161985379, + "grad_norm": 9.0, + "learning_rate": 1.0065969234532367e-06, + "loss": 0.9958906769752502, + "step": 6336 + }, + { + "epoch": 1.9508272412466332, + "grad_norm": 5.90625, + "learning_rate": 1.0064361152880823e-06, + "loss": 1.294455885887146, + "step": 6338 + }, + { + "epoch": 1.9514428626394769, + "grad_norm": 10.875, + "learning_rate": 1.0062772881720476e-06, + "loss": 1.2550309896469116, + "step": 6340 + }, + { + "epoch": 1.95205848403232, + "grad_norm": 5.125, + "learning_rate": 1.0061204422629625e-06, + "loss": 1.6748965978622437, + "step": 6342 + }, + { + "epoch": 1.9526741054251635, + "grad_norm": 6.8125, + "learning_rate": 1.0059655777166883e-06, + "loss": 1.3717496395111084, + "step": 6344 + }, + { + "epoch": 1.953289726818007, + "grad_norm": 10.375, + "learning_rate": 1.0058126946871174e-06, + "loss": 1.4777202606201172, + "step": 6346 + }, + { + "epoch": 1.9539053482108504, + "grad_norm": 3.78125, + "learning_rate": 1.0056617933261735e-06, + "loss": 1.1079200506210327, + "step": 6348 + }, + { + "epoch": 1.9545209696036938, + "grad_norm": 3.0, + "learning_rate": 1.0055128737838101e-06, + "loss": 1.182327389717102, + "step": 6350 + }, + { + "epoch": 1.955136590996537, + "grad_norm": 23.625, + "learning_rate": 1.0053659362080123e-06, + "loss": 1.4486178159713745, + "step": 6352 + }, + { + "epoch": 1.9557522123893807, + "grad_norm": 13.1875, + "learning_rate": 1.0052209807447948e-06, + "loss": 1.0059131383895874, + "step": 6354 + }, + { + "epoch": 1.956367833782224, + "grad_norm": 5.3125, + "learning_rate": 1.0050780075382033e-06, + "loss": 1.1534645557403564, + "step": 6356 + }, + { + "epoch": 1.9569834551750673, + "grad_norm": 13.0, + "learning_rate": 1.0049370167303138e-06, + "loss": 1.2540004253387451, + "step": 6358 + }, + { + "epoch": 1.9575990765679108, + "grad_norm": 7.1875, + "learning_rate": 1.0047980084612318e-06, + "loss": 1.0319916009902954, + "step": 6360 + }, + { + "epoch": 1.958214697960754, + "grad_norm": 12.3125, + "learning_rate": 1.0046609828690929e-06, + "loss": 1.5139015913009644, + "step": 6362 + }, + { + "epoch": 1.9588303193535976, + "grad_norm": 6.5625, + "learning_rate": 1.0045259400900622e-06, + "loss": 1.390751838684082, + "step": 6364 + }, + { + "epoch": 1.9594459407464409, + "grad_norm": 5.125, + "learning_rate": 1.0043928802583352e-06, + "loss": 1.1808812618255615, + "step": 6366 + }, + { + "epoch": 1.9600615621392843, + "grad_norm": 2.3125, + "learning_rate": 1.0042618035061364e-06, + "loss": 1.1729049682617188, + "step": 6368 + }, + { + "epoch": 1.9606771835321277, + "grad_norm": 6.125, + "learning_rate": 1.0041327099637196e-06, + "loss": 1.105539083480835, + "step": 6370 + }, + { + "epoch": 1.9612928049249712, + "grad_norm": 7.96875, + "learning_rate": 1.0040055997593677e-06, + "loss": 1.1965091228485107, + "step": 6372 + }, + { + "epoch": 1.9619084263178146, + "grad_norm": 10.625, + "learning_rate": 1.0038804730193933e-06, + "loss": 1.5604122877120972, + "step": 6374 + }, + { + "epoch": 1.9625240477106578, + "grad_norm": 5.75, + "learning_rate": 1.0037573298681375e-06, + "loss": 1.297991156578064, + "step": 6376 + }, + { + "epoch": 1.9631396691035015, + "grad_norm": 12.6875, + "learning_rate": 1.0036361704279705e-06, + "loss": 1.061846375465393, + "step": 6378 + }, + { + "epoch": 1.9637552904963447, + "grad_norm": 6.53125, + "learning_rate": 1.0035169948192912e-06, + "loss": 1.0523725748062134, + "step": 6380 + }, + { + "epoch": 1.964370911889188, + "grad_norm": 5.25, + "learning_rate": 1.003399803160527e-06, + "loss": 1.1259384155273438, + "step": 6382 + }, + { + "epoch": 1.9649865332820315, + "grad_norm": 8.875, + "learning_rate": 1.0032845955681337e-06, + "loss": 1.567067265510559, + "step": 6384 + }, + { + "epoch": 1.965602154674875, + "grad_norm": 17.0, + "learning_rate": 1.0031713721565957e-06, + "loss": 1.315956950187683, + "step": 6386 + }, + { + "epoch": 1.9662177760677184, + "grad_norm": 9.3125, + "learning_rate": 1.003060133038426e-06, + "loss": 1.0785685777664185, + "step": 6388 + }, + { + "epoch": 1.9668333974605616, + "grad_norm": 10.0, + "learning_rate": 1.002950878324165e-06, + "loss": 1.875533103942871, + "step": 6390 + }, + { + "epoch": 1.9674490188534053, + "grad_norm": 6.09375, + "learning_rate": 1.0028436081223818e-06, + "loss": 1.2702431678771973, + "step": 6392 + }, + { + "epoch": 1.9680646402462485, + "grad_norm": 38.5, + "learning_rate": 1.0027383225396731e-06, + "loss": 1.201972484588623, + "step": 6394 + }, + { + "epoch": 1.968680261639092, + "grad_norm": 4.6875, + "learning_rate": 1.0026350216806638e-06, + "loss": 1.2024223804473877, + "step": 6396 + }, + { + "epoch": 1.9692958830319354, + "grad_norm": 9.9375, + "learning_rate": 1.0025337056480055e-06, + "loss": 1.4981609582901, + "step": 6398 + }, + { + "epoch": 1.9699115044247788, + "grad_norm": 5.96875, + "learning_rate": 1.0024343745423792e-06, + "loss": 1.516045093536377, + "step": 6400 + }, + { + "epoch": 1.9705271258176222, + "grad_norm": 10.375, + "learning_rate": 1.002337028462492e-06, + "loss": 1.532149076461792, + "step": 6402 + }, + { + "epoch": 1.9711427472104655, + "grad_norm": 8.875, + "learning_rate": 1.002241667505079e-06, + "loss": 1.6822595596313477, + "step": 6404 + }, + { + "epoch": 1.971758368603309, + "grad_norm": 6.25, + "learning_rate": 1.0021482917649021e-06, + "loss": 1.4750088453292847, + "step": 6406 + }, + { + "epoch": 1.9723739899961523, + "grad_norm": 4.78125, + "learning_rate": 1.0020569013347512e-06, + "loss": 0.9572700262069702, + "step": 6408 + }, + { + "epoch": 1.9729896113889958, + "grad_norm": 2.71875, + "learning_rate": 1.0019674963054432e-06, + "loss": 1.1694449186325073, + "step": 6410 + }, + { + "epoch": 1.9736052327818392, + "grad_norm": 61.25, + "learning_rate": 1.0018800767658216e-06, + "loss": 1.1208440065383911, + "step": 6412 + }, + { + "epoch": 1.9742208541746824, + "grad_norm": 11.0625, + "learning_rate": 1.0017946428027572e-06, + "loss": 1.4088279008865356, + "step": 6414 + }, + { + "epoch": 1.974836475567526, + "grad_norm": 10.1875, + "learning_rate": 1.0017111945011477e-06, + "loss": 1.4532158374786377, + "step": 6416 + }, + { + "epoch": 1.9754520969603693, + "grad_norm": 6.03125, + "learning_rate": 1.0016297319439175e-06, + "loss": 1.4767062664031982, + "step": 6418 + }, + { + "epoch": 1.976067718353213, + "grad_norm": 3.625, + "learning_rate": 1.0015502552120178e-06, + "loss": 1.2021182775497437, + "step": 6420 + }, + { + "epoch": 1.9766833397460561, + "grad_norm": 8.5625, + "learning_rate": 1.0014727643844265e-06, + "loss": 1.348162293434143, + "step": 6422 + }, + { + "epoch": 1.9772989611388996, + "grad_norm": 9.4375, + "learning_rate": 1.001397259538148e-06, + "loss": 1.5533214807510376, + "step": 6424 + }, + { + "epoch": 1.977914582531743, + "grad_norm": 6.125, + "learning_rate": 1.0013237407482126e-06, + "loss": 1.439581274986267, + "step": 6426 + }, + { + "epoch": 1.9785302039245862, + "grad_norm": 6.28125, + "learning_rate": 1.0012522080876784e-06, + "loss": 1.3912233114242554, + "step": 6428 + }, + { + "epoch": 1.9791458253174299, + "grad_norm": 8.5, + "learning_rate": 1.0011826616276283e-06, + "loss": 2.146826982498169, + "step": 6430 + }, + { + "epoch": 1.979761446710273, + "grad_norm": 18.625, + "learning_rate": 1.0011151014371728e-06, + "loss": 1.7773079872131348, + "step": 6432 + }, + { + "epoch": 1.9803770681031165, + "grad_norm": 6.3125, + "learning_rate": 1.0010495275834475e-06, + "loss": 1.4938530921936035, + "step": 6434 + }, + { + "epoch": 1.98099268949596, + "grad_norm": 6.71875, + "learning_rate": 1.000985940131615e-06, + "loss": 1.3083902597427368, + "step": 6436 + }, + { + "epoch": 1.9816083108888034, + "grad_norm": 8.125, + "learning_rate": 1.0009243391448629e-06, + "loss": 1.6811070442199707, + "step": 6438 + }, + { + "epoch": 1.9822239322816468, + "grad_norm": 5.71875, + "learning_rate": 1.0008647246844064e-06, + "loss": 1.0163898468017578, + "step": 6440 + }, + { + "epoch": 1.98283955367449, + "grad_norm": 9.4375, + "learning_rate": 1.000807096809485e-06, + "loss": 0.9855446219444275, + "step": 6442 + }, + { + "epoch": 1.9834551750673337, + "grad_norm": 6.3125, + "learning_rate": 1.0007514555773652e-06, + "loss": 1.5436961650848389, + "step": 6444 + }, + { + "epoch": 1.984070796460177, + "grad_norm": 10.375, + "learning_rate": 1.0006978010433386e-06, + "loss": 0.9845457673072815, + "step": 6446 + }, + { + "epoch": 1.9846864178530204, + "grad_norm": 7.09375, + "learning_rate": 1.000646133260723e-06, + "loss": 1.182098388671875, + "step": 6448 + }, + { + "epoch": 1.9853020392458638, + "grad_norm": 2.453125, + "learning_rate": 1.0005964522808626e-06, + "loss": 1.1003978252410889, + "step": 6450 + }, + { + "epoch": 1.9859176606387072, + "grad_norm": 5.25, + "learning_rate": 1.0005487581531254e-06, + "loss": 1.3184493780136108, + "step": 6452 + }, + { + "epoch": 1.9865332820315507, + "grad_norm": 18.375, + "learning_rate": 1.0005030509249064e-06, + "loss": 0.7050902247428894, + "step": 6454 + }, + { + "epoch": 1.9871489034243939, + "grad_norm": 3.359375, + "learning_rate": 1.0004593306416267e-06, + "loss": 1.2746435403823853, + "step": 6456 + }, + { + "epoch": 1.9877645248172375, + "grad_norm": 8.75, + "learning_rate": 1.000417597346731e-06, + "loss": 1.3945417404174805, + "step": 6458 + }, + { + "epoch": 1.9883801462100807, + "grad_norm": 12.1875, + "learning_rate": 1.0003778510816915e-06, + "loss": 1.4266530275344849, + "step": 6460 + }, + { + "epoch": 1.9889957676029242, + "grad_norm": 8.25, + "learning_rate": 1.000340091886004e-06, + "loss": 1.3669017553329468, + "step": 6462 + }, + { + "epoch": 1.9896113889957676, + "grad_norm": 9.1875, + "learning_rate": 1.0003043197971917e-06, + "loss": 1.1172243356704712, + "step": 6464 + }, + { + "epoch": 1.990227010388611, + "grad_norm": 6.75, + "learning_rate": 1.0002705348508016e-06, + "loss": 1.4020445346832275, + "step": 6466 + }, + { + "epoch": 1.9908426317814545, + "grad_norm": 5.90625, + "learning_rate": 1.0002387370804063e-06, + "loss": 1.021568775177002, + "step": 6468 + }, + { + "epoch": 1.9914582531742977, + "grad_norm": 9.25, + "learning_rate": 1.0002089265176046e-06, + "loss": 1.5302304029464722, + "step": 6470 + }, + { + "epoch": 1.9920738745671414, + "grad_norm": 8.9375, + "learning_rate": 1.0001811031920195e-06, + "loss": 1.5604091882705688, + "step": 6472 + }, + { + "epoch": 1.9926894959599846, + "grad_norm": 7.96875, + "learning_rate": 1.0001552671312996e-06, + "loss": 1.1880793571472168, + "step": 6474 + }, + { + "epoch": 1.993305117352828, + "grad_norm": 7.5, + "learning_rate": 1.0001314183611194e-06, + "loss": 1.5420341491699219, + "step": 6476 + }, + { + "epoch": 1.9939207387456714, + "grad_norm": 9.0, + "learning_rate": 1.0001095569051772e-06, + "loss": 1.7072868347167969, + "step": 6478 + }, + { + "epoch": 1.9945363601385147, + "grad_norm": 13.0, + "learning_rate": 1.0000896827851974e-06, + "loss": 1.5550557374954224, + "step": 6480 + }, + { + "epoch": 1.9951519815313583, + "grad_norm": 14.6875, + "learning_rate": 1.0000717960209295e-06, + "loss": 0.8384658694267273, + "step": 6482 + }, + { + "epoch": 1.9957676029242015, + "grad_norm": 3.34375, + "learning_rate": 1.0000558966301483e-06, + "loss": 0.9205080270767212, + "step": 6484 + }, + { + "epoch": 1.9963832243170452, + "grad_norm": 16.875, + "learning_rate": 1.0000419846286524e-06, + "loss": 1.325616478919983, + "step": 6486 + }, + { + "epoch": 1.9969988457098884, + "grad_norm": 3.046875, + "learning_rate": 1.0000300600302676e-06, + "loss": 0.5225828886032104, + "step": 6488 + }, + { + "epoch": 1.9976144671027318, + "grad_norm": 7.6875, + "learning_rate": 1.0000201228468429e-06, + "loss": 1.0502115488052368, + "step": 6490 + }, + { + "epoch": 1.9982300884955753, + "grad_norm": 5.71875, + "learning_rate": 1.0000121730882534e-06, + "loss": 1.224287748336792, + "step": 6492 + }, + { + "epoch": 1.9988457098884185, + "grad_norm": 17.75, + "learning_rate": 1.000006210762399e-06, + "loss": 1.3089892864227295, + "step": 6494 + }, + { + "epoch": 1.9994613312812621, + "grad_norm": 9.125, + "learning_rate": 1.0000022358752043e-06, + "loss": 1.4170578718185425, + "step": 6496 + }, + { + "epoch": 2.0, + "grad_norm": 7.375, + "learning_rate": 1.0000002484306195e-06, + "loss": 1.3172667026519775, + "step": 6498 + }, + { + "epoch": 2.0, + "step": 6498, + "total_flos": 2.5760029558366536e+18, + "train_loss": 1.3367212960888106, + "train_runtime": 21551.3774, + "train_samples_per_second": 1.206, + "train_steps_per_second": 0.302 + } + ], + "logging_steps": 2, + "max_steps": 6498, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 9999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.5760029558366536e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}