diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22786 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 6498, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006156213928434013, + "grad_norm": 2.703125, + "learning_rate": 5.1282051282051286e-08, + "loss": 1.2488996982574463, + "step": 2 + }, + { + "epoch": 0.0012312427856868025, + "grad_norm": 3.59375, + "learning_rate": 1.5384615384615387e-07, + "loss": 1.944703459739685, + "step": 4 + }, + { + "epoch": 0.001846864178530204, + "grad_norm": 5.8125, + "learning_rate": 2.564102564102564e-07, + "loss": 1.6205146312713623, + "step": 6 + }, + { + "epoch": 0.002462485571373605, + "grad_norm": 4.875, + "learning_rate": 3.5897435897435896e-07, + "loss": 1.951253056526184, + "step": 8 + }, + { + "epoch": 0.0030781069642170067, + "grad_norm": 14.5, + "learning_rate": 4.615384615384616e-07, + "loss": 2.2729759216308594, + "step": 10 + }, + { + "epoch": 0.003693728357060408, + "grad_norm": 8.75, + "learning_rate": 5.641025641025642e-07, + "loss": 1.4744157791137695, + "step": 12 + }, + { + "epoch": 0.004309349749903809, + "grad_norm": 27.125, + "learning_rate": 6.666666666666667e-07, + "loss": 2.574921131134033, + "step": 14 + }, + { + "epoch": 0.00492497114274721, + "grad_norm": 8.25, + "learning_rate": 7.692307692307694e-07, + "loss": 1.8204889297485352, + "step": 16 + }, + { + "epoch": 0.005540592535590611, + "grad_norm": 15.6875, + "learning_rate": 8.717948717948718e-07, + "loss": 1.6300557851791382, + "step": 18 + }, + { + "epoch": 0.0061562139284340135, + "grad_norm": 8.5, + "learning_rate": 9.743589743589745e-07, + "loss": 1.4897385835647583, + "step": 20 + }, + { + "epoch": 0.006771835321277415, + "grad_norm": 4.125, + "learning_rate": 1.076923076923077e-06, + "loss": 1.7789047956466675, + "step": 22 + }, + { + "epoch": 0.007387456714120816, + "grad_norm": 4.15625, + "learning_rate": 1.1794871794871795e-06, + "loss": 1.8086273670196533, + "step": 24 + }, + { + "epoch": 0.008003078106964217, + "grad_norm": 5.5, + "learning_rate": 1.282051282051282e-06, + "loss": 1.6509721279144287, + "step": 26 + }, + { + "epoch": 0.008618699499807618, + "grad_norm": 3.484375, + "learning_rate": 1.3846153846153848e-06, + "loss": 1.568681001663208, + "step": 28 + }, + { + "epoch": 0.00923432089265102, + "grad_norm": 3.625, + "learning_rate": 1.4871794871794873e-06, + "loss": 1.7773399353027344, + "step": 30 + }, + { + "epoch": 0.00984994228549442, + "grad_norm": 6.65625, + "learning_rate": 1.5897435897435897e-06, + "loss": 1.8189994096755981, + "step": 32 + }, + { + "epoch": 0.010465563678337822, + "grad_norm": 4.75, + "learning_rate": 1.6923076923076926e-06, + "loss": 2.1563496589660645, + "step": 34 + }, + { + "epoch": 0.011081185071181223, + "grad_norm": 5.0, + "learning_rate": 1.794871794871795e-06, + "loss": 1.4601736068725586, + "step": 36 + }, + { + "epoch": 0.011696806464024625, + "grad_norm": 20.5, + "learning_rate": 1.8974358974358975e-06, + "loss": 1.8413267135620117, + "step": 38 + }, + { + "epoch": 0.012312427856868027, + "grad_norm": 2.515625, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1767711639404297, + "step": 40 + }, + { + "epoch": 0.012928049249711427, + "grad_norm": 7.28125, + "learning_rate": 2.1025641025641028e-06, + "loss": 1.2139570713043213, + "step": 42 + }, + { + "epoch": 0.01354367064255483, + "grad_norm": 10.8125, + "learning_rate": 2.2051282051282052e-06, + "loss": 1.9730910062789917, + "step": 44 + }, + { + "epoch": 0.01415929203539823, + "grad_norm": 6.03125, + "learning_rate": 2.307692307692308e-06, + "loss": 1.4171947240829468, + "step": 46 + }, + { + "epoch": 0.014774913428241632, + "grad_norm": 8.1875, + "learning_rate": 2.4102564102564105e-06, + "loss": 1.6850796937942505, + "step": 48 + }, + { + "epoch": 0.015390534821085032, + "grad_norm": 2.296875, + "learning_rate": 2.512820512820513e-06, + "loss": 1.3291962146759033, + "step": 50 + }, + { + "epoch": 0.016006156213928435, + "grad_norm": 6.90625, + "learning_rate": 2.615384615384616e-06, + "loss": 1.6199973821640015, + "step": 52 + }, + { + "epoch": 0.016621777606771835, + "grad_norm": 6.46875, + "learning_rate": 2.717948717948718e-06, + "loss": 1.920493721961975, + "step": 54 + }, + { + "epoch": 0.017237398999615235, + "grad_norm": 6.53125, + "learning_rate": 2.8205128205128207e-06, + "loss": 1.4693753719329834, + "step": 56 + }, + { + "epoch": 0.01785302039245864, + "grad_norm": 3.46875, + "learning_rate": 2.9230769230769236e-06, + "loss": 1.869727611541748, + "step": 58 + }, + { + "epoch": 0.01846864178530204, + "grad_norm": 8.625, + "learning_rate": 3.0256410256410256e-06, + "loss": 1.7403851747512817, + "step": 60 + }, + { + "epoch": 0.01908426317814544, + "grad_norm": 6.03125, + "learning_rate": 3.1282051282051284e-06, + "loss": 1.4627933502197266, + "step": 62 + }, + { + "epoch": 0.01969988457098884, + "grad_norm": 3.0625, + "learning_rate": 3.2307692307692313e-06, + "loss": 1.1403491497039795, + "step": 64 + }, + { + "epoch": 0.020315505963832244, + "grad_norm": 10.4375, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.5978504419326782, + "step": 66 + }, + { + "epoch": 0.020931127356675645, + "grad_norm": 4.1875, + "learning_rate": 3.435897435897436e-06, + "loss": 1.2248575687408447, + "step": 68 + }, + { + "epoch": 0.021546748749519045, + "grad_norm": 7.5, + "learning_rate": 3.538461538461539e-06, + "loss": 2.153047800064087, + "step": 70 + }, + { + "epoch": 0.022162370142362445, + "grad_norm": 8.75, + "learning_rate": 3.641025641025641e-06, + "loss": 1.7931642532348633, + "step": 72 + }, + { + "epoch": 0.02277799153520585, + "grad_norm": 6.125, + "learning_rate": 3.743589743589744e-06, + "loss": 1.591568946838379, + "step": 74 + }, + { + "epoch": 0.02339361292804925, + "grad_norm": 13.5625, + "learning_rate": 3.846153846153847e-06, + "loss": 1.8861238956451416, + "step": 76 + }, + { + "epoch": 0.02400923432089265, + "grad_norm": 30.5, + "learning_rate": 3.948717948717949e-06, + "loss": 1.8302340507507324, + "step": 78 + }, + { + "epoch": 0.024624855713736054, + "grad_norm": 6.84375, + "learning_rate": 4.051282051282052e-06, + "loss": 1.1091889142990112, + "step": 80 + }, + { + "epoch": 0.025240477106579454, + "grad_norm": 4.15625, + "learning_rate": 4.1538461538461545e-06, + "loss": 1.4548031091690063, + "step": 82 + }, + { + "epoch": 0.025856098499422855, + "grad_norm": 8.0625, + "learning_rate": 4.2564102564102566e-06, + "loss": 1.3143142461776733, + "step": 84 + }, + { + "epoch": 0.026471719892266255, + "grad_norm": 4.34375, + "learning_rate": 4.358974358974359e-06, + "loss": 1.3913114070892334, + "step": 86 + }, + { + "epoch": 0.02708734128510966, + "grad_norm": 4.4375, + "learning_rate": 4.461538461538462e-06, + "loss": 1.53354012966156, + "step": 88 + }, + { + "epoch": 0.02770296267795306, + "grad_norm": 7.0625, + "learning_rate": 4.564102564102564e-06, + "loss": 1.8740226030349731, + "step": 90 + }, + { + "epoch": 0.02831858407079646, + "grad_norm": 5.6875, + "learning_rate": 4.666666666666667e-06, + "loss": 1.359494924545288, + "step": 92 + }, + { + "epoch": 0.02893420546363986, + "grad_norm": 6.78125, + "learning_rate": 4.76923076923077e-06, + "loss": 1.7467714548110962, + "step": 94 + }, + { + "epoch": 0.029549826856483264, + "grad_norm": 3.828125, + "learning_rate": 4.871794871794872e-06, + "loss": 1.4591333866119385, + "step": 96 + }, + { + "epoch": 0.030165448249326664, + "grad_norm": 6.6875, + "learning_rate": 4.974358974358975e-06, + "loss": 1.5871689319610596, + "step": 98 + }, + { + "epoch": 0.030781069642170065, + "grad_norm": 3.46875, + "learning_rate": 5.076923076923077e-06, + "loss": 1.078829288482666, + "step": 100 + }, + { + "epoch": 0.03139669103501347, + "grad_norm": 4.4375, + "learning_rate": 5.179487179487181e-06, + "loss": 1.4992876052856445, + "step": 102 + }, + { + "epoch": 0.03201231242785687, + "grad_norm": 10.1875, + "learning_rate": 5.282051282051283e-06, + "loss": 1.4937949180603027, + "step": 104 + }, + { + "epoch": 0.03262793382070027, + "grad_norm": 7.40625, + "learning_rate": 5.384615384615385e-06, + "loss": 1.634779930114746, + "step": 106 + }, + { + "epoch": 0.03324355521354367, + "grad_norm": 5.5625, + "learning_rate": 5.487179487179488e-06, + "loss": 1.4202882051467896, + "step": 108 + }, + { + "epoch": 0.03385917660638707, + "grad_norm": 4.625, + "learning_rate": 5.58974358974359e-06, + "loss": 1.5589418411254883, + "step": 110 + }, + { + "epoch": 0.03447479799923047, + "grad_norm": 12.4375, + "learning_rate": 5.692307692307692e-06, + "loss": 1.8365751504898071, + "step": 112 + }, + { + "epoch": 0.03509041939207388, + "grad_norm": 7.1875, + "learning_rate": 5.794871794871796e-06, + "loss": 1.9069055318832397, + "step": 114 + }, + { + "epoch": 0.03570604078491728, + "grad_norm": 2.796875, + "learning_rate": 5.897435897435898e-06, + "loss": 1.3717936277389526, + "step": 116 + }, + { + "epoch": 0.03632166217776068, + "grad_norm": 29.125, + "learning_rate": 6e-06, + "loss": 1.7499637603759766, + "step": 118 + }, + { + "epoch": 0.03693728357060408, + "grad_norm": 3.953125, + "learning_rate": 6.102564102564104e-06, + "loss": 1.4855519533157349, + "step": 120 + }, + { + "epoch": 0.03755290496344748, + "grad_norm": 4.65625, + "learning_rate": 6.205128205128206e-06, + "loss": 1.3522869348526, + "step": 122 + }, + { + "epoch": 0.03816852635629088, + "grad_norm": 4.59375, + "learning_rate": 6.307692307692308e-06, + "loss": 1.0647168159484863, + "step": 124 + }, + { + "epoch": 0.03878414774913428, + "grad_norm": 4.8125, + "learning_rate": 6.410256410256412e-06, + "loss": 1.3522032499313354, + "step": 126 + }, + { + "epoch": 0.03939976914197768, + "grad_norm": 11.375, + "learning_rate": 6.512820512820514e-06, + "loss": 1.6329436302185059, + "step": 128 + }, + { + "epoch": 0.04001539053482109, + "grad_norm": 8.3125, + "learning_rate": 6.615384615384616e-06, + "loss": 1.1570396423339844, + "step": 130 + }, + { + "epoch": 0.04063101192766449, + "grad_norm": 6.21875, + "learning_rate": 6.717948717948718e-06, + "loss": 1.609910011291504, + "step": 132 + }, + { + "epoch": 0.04124663332050789, + "grad_norm": 6.5625, + "learning_rate": 6.820512820512821e-06, + "loss": 1.589626669883728, + "step": 134 + }, + { + "epoch": 0.04186225471335129, + "grad_norm": 1.7578125, + "learning_rate": 6.923076923076923e-06, + "loss": 0.9378024339675903, + "step": 136 + }, + { + "epoch": 0.04247787610619469, + "grad_norm": 9.1875, + "learning_rate": 7.025641025641025e-06, + "loss": 1.545324444770813, + "step": 138 + }, + { + "epoch": 0.04309349749903809, + "grad_norm": 8.625, + "learning_rate": 7.128205128205129e-06, + "loss": 1.7356501817703247, + "step": 140 + }, + { + "epoch": 0.04370911889188149, + "grad_norm": 5.15625, + "learning_rate": 7.230769230769231e-06, + "loss": 1.3267698287963867, + "step": 142 + }, + { + "epoch": 0.04432474028472489, + "grad_norm": 3.828125, + "learning_rate": 7.333333333333333e-06, + "loss": 1.3805948495864868, + "step": 144 + }, + { + "epoch": 0.0449403616775683, + "grad_norm": 5.1875, + "learning_rate": 7.435897435897437e-06, + "loss": 1.43031644821167, + "step": 146 + }, + { + "epoch": 0.0455559830704117, + "grad_norm": 4.96875, + "learning_rate": 7.538461538461539e-06, + "loss": 1.8443446159362793, + "step": 148 + }, + { + "epoch": 0.0461716044632551, + "grad_norm": 6.09375, + "learning_rate": 7.641025641025641e-06, + "loss": 1.2642335891723633, + "step": 150 + }, + { + "epoch": 0.0467872258560985, + "grad_norm": 30.125, + "learning_rate": 7.743589743589745e-06, + "loss": 1.3919174671173096, + "step": 152 + }, + { + "epoch": 0.0474028472489419, + "grad_norm": 9.125, + "learning_rate": 7.846153846153847e-06, + "loss": 1.5884499549865723, + "step": 154 + }, + { + "epoch": 0.0480184686417853, + "grad_norm": 5.875, + "learning_rate": 7.948717948717949e-06, + "loss": 1.1845054626464844, + "step": 156 + }, + { + "epoch": 0.0486340900346287, + "grad_norm": 13.5, + "learning_rate": 8.051282051282052e-06, + "loss": 1.3628064393997192, + "step": 158 + }, + { + "epoch": 0.04924971142747211, + "grad_norm": 9.6875, + "learning_rate": 8.153846153846154e-06, + "loss": 1.5696605443954468, + "step": 160 + }, + { + "epoch": 0.04986533282031551, + "grad_norm": 3.25, + "learning_rate": 8.256410256410256e-06, + "loss": 1.2648931741714478, + "step": 162 + }, + { + "epoch": 0.05048095421315891, + "grad_norm": 11.875, + "learning_rate": 8.35897435897436e-06, + "loss": 1.2277356386184692, + "step": 164 + }, + { + "epoch": 0.05109657560600231, + "grad_norm": 18.625, + "learning_rate": 8.461538461538462e-06, + "loss": 1.2374329566955566, + "step": 166 + }, + { + "epoch": 0.05171219699884571, + "grad_norm": 3.171875, + "learning_rate": 8.564102564102564e-06, + "loss": 1.7571760416030884, + "step": 168 + }, + { + "epoch": 0.05232781839168911, + "grad_norm": 2.484375, + "learning_rate": 8.666666666666668e-06, + "loss": 1.3766837120056152, + "step": 170 + }, + { + "epoch": 0.05294343978453251, + "grad_norm": 14.375, + "learning_rate": 8.76923076923077e-06, + "loss": 1.7967778444290161, + "step": 172 + }, + { + "epoch": 0.05355906117737591, + "grad_norm": 4.3125, + "learning_rate": 8.871794871794872e-06, + "loss": 1.1553382873535156, + "step": 174 + }, + { + "epoch": 0.05417468257021932, + "grad_norm": 4.5625, + "learning_rate": 8.974358974358976e-06, + "loss": 1.4610005617141724, + "step": 176 + }, + { + "epoch": 0.05479030396306272, + "grad_norm": 15.4375, + "learning_rate": 9.076923076923078e-06, + "loss": 1.636716604232788, + "step": 178 + }, + { + "epoch": 0.05540592535590612, + "grad_norm": 6.6875, + "learning_rate": 9.17948717948718e-06, + "loss": 1.2763687372207642, + "step": 180 + }, + { + "epoch": 0.05602154674874952, + "grad_norm": 5.65625, + "learning_rate": 9.282051282051283e-06, + "loss": 1.361682653427124, + "step": 182 + }, + { + "epoch": 0.05663716814159292, + "grad_norm": 5.46875, + "learning_rate": 9.384615384615385e-06, + "loss": 0.9292706251144409, + "step": 184 + }, + { + "epoch": 0.05725278953443632, + "grad_norm": 7.28125, + "learning_rate": 9.487179487179487e-06, + "loss": 1.634671926498413, + "step": 186 + }, + { + "epoch": 0.05786841092727972, + "grad_norm": 9.625, + "learning_rate": 9.589743589743591e-06, + "loss": 1.34225594997406, + "step": 188 + }, + { + "epoch": 0.05848403232012313, + "grad_norm": 15.625, + "learning_rate": 9.692307692307693e-06, + "loss": 1.5954524278640747, + "step": 190 + }, + { + "epoch": 0.05909965371296653, + "grad_norm": 3.0, + "learning_rate": 9.794871794871795e-06, + "loss": 1.2905179262161255, + "step": 192 + }, + { + "epoch": 0.05971527510580993, + "grad_norm": 4.5, + "learning_rate": 9.897435897435899e-06, + "loss": 1.197383165359497, + "step": 194 + }, + { + "epoch": 0.06033089649865333, + "grad_norm": 5.6875, + "learning_rate": 1e-05, + "loss": 1.425607442855835, + "step": 196 + }, + { + "epoch": 0.06094651789149673, + "grad_norm": 2.875, + "learning_rate": 9.99999801255517e-06, + "loss": 1.2564163208007812, + "step": 198 + }, + { + "epoch": 0.06156213928434013, + "grad_norm": 12.875, + "learning_rate": 9.999992050222649e-06, + "loss": 1.3558177947998047, + "step": 200 + }, + { + "epoch": 0.06217776067718353, + "grad_norm": 5.125, + "learning_rate": 9.999982113008366e-06, + "loss": 1.5833901166915894, + "step": 202 + }, + { + "epoch": 0.06279338207002694, + "grad_norm": 2.21875, + "learning_rate": 9.999968200922195e-06, + "loss": 1.0304534435272217, + "step": 204 + }, + { + "epoch": 0.06340900346287033, + "grad_norm": 5.53125, + "learning_rate": 9.999950313977957e-06, + "loss": 1.6564018726348877, + "step": 206 + }, + { + "epoch": 0.06402462485571374, + "grad_norm": 5.625, + "learning_rate": 9.999928452193432e-06, + "loss": 1.6364089250564575, + "step": 208 + }, + { + "epoch": 0.06464024624855713, + "grad_norm": 6.84375, + "learning_rate": 9.999902615590342e-06, + "loss": 1.6970021724700928, + "step": 210 + }, + { + "epoch": 0.06525586764140054, + "grad_norm": 7.46875, + "learning_rate": 9.999872804194363e-06, + "loss": 1.6989271640777588, + "step": 212 + }, + { + "epoch": 0.06587148903424395, + "grad_norm": 22.375, + "learning_rate": 9.999839018035117e-06, + "loss": 1.6279175281524658, + "step": 214 + }, + { + "epoch": 0.06648711042708734, + "grad_norm": 26.25, + "learning_rate": 9.99980125714618e-06, + "loss": 1.2584187984466553, + "step": 216 + }, + { + "epoch": 0.06710273181993075, + "grad_norm": 5.96875, + "learning_rate": 9.999759521565074e-06, + "loss": 1.5264304876327515, + "step": 218 + }, + { + "epoch": 0.06771835321277414, + "grad_norm": 6.375, + "learning_rate": 9.999713811333272e-06, + "loss": 1.37717604637146, + "step": 220 + }, + { + "epoch": 0.06833397460561755, + "grad_norm": 2.5, + "learning_rate": 9.9996641264962e-06, + "loss": 1.3600963354110718, + "step": 222 + }, + { + "epoch": 0.06894959599846094, + "grad_norm": 6.6875, + "learning_rate": 9.999610467103231e-06, + "loss": 1.1031935214996338, + "step": 224 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 2.515625, + "learning_rate": 9.999552833207684e-06, + "loss": 1.3930033445358276, + "step": 226 + }, + { + "epoch": 0.07018083878414776, + "grad_norm": 5.6875, + "learning_rate": 9.999491224866836e-06, + "loss": 1.3915252685546875, + "step": 228 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 21.0, + "learning_rate": 9.999425642141903e-06, + "loss": 1.6080518960952759, + "step": 230 + }, + { + "epoch": 0.07141208156983456, + "grad_norm": 4.40625, + "learning_rate": 9.99935608509806e-06, + "loss": 1.2827638387680054, + "step": 232 + }, + { + "epoch": 0.07202770296267795, + "grad_norm": 4.625, + "learning_rate": 9.999282553804425e-06, + "loss": 1.281058669090271, + "step": 234 + }, + { + "epoch": 0.07264332435552136, + "grad_norm": 3.265625, + "learning_rate": 9.999205048334073e-06, + "loss": 1.315993070602417, + "step": 236 + }, + { + "epoch": 0.07325894574836475, + "grad_norm": 5.34375, + "learning_rate": 9.999123568764018e-06, + "loss": 1.48131263256073, + "step": 238 + }, + { + "epoch": 0.07387456714120816, + "grad_norm": 6.1875, + "learning_rate": 9.999038115175226e-06, + "loss": 1.273392677307129, + "step": 240 + }, + { + "epoch": 0.07449018853405155, + "grad_norm": 2.71875, + "learning_rate": 9.998948687652619e-06, + "loss": 1.138009786605835, + "step": 242 + }, + { + "epoch": 0.07510580992689496, + "grad_norm": 5.40625, + "learning_rate": 9.998855286285061e-06, + "loss": 1.2859021425247192, + "step": 244 + }, + { + "epoch": 0.07572143131973837, + "grad_norm": 15.0625, + "learning_rate": 9.998757911165368e-06, + "loss": 1.5143579244613647, + "step": 246 + }, + { + "epoch": 0.07633705271258176, + "grad_norm": 4.9375, + "learning_rate": 9.998656562390303e-06, + "loss": 1.4446684122085571, + "step": 248 + }, + { + "epoch": 0.07695267410542517, + "grad_norm": 1.71875, + "learning_rate": 9.99855124006058e-06, + "loss": 1.0485321283340454, + "step": 250 + }, + { + "epoch": 0.07756829549826856, + "grad_norm": 6.09375, + "learning_rate": 9.998441944280854e-06, + "loss": 1.4379631280899048, + "step": 252 + }, + { + "epoch": 0.07818391689111197, + "grad_norm": 4.5, + "learning_rate": 9.998328675159746e-06, + "loss": 1.640031099319458, + "step": 254 + }, + { + "epoch": 0.07879953828395536, + "grad_norm": 28.25, + "learning_rate": 9.998211432809803e-06, + "loss": 1.8261964321136475, + "step": 256 + }, + { + "epoch": 0.07941515967679877, + "grad_norm": 3.921875, + "learning_rate": 9.998090217347537e-06, + "loss": 1.4264521598815918, + "step": 258 + }, + { + "epoch": 0.08003078106964218, + "grad_norm": 1.5625, + "learning_rate": 9.997965028893404e-06, + "loss": 1.5476711988449097, + "step": 260 + }, + { + "epoch": 0.08064640246248557, + "grad_norm": 11.0625, + "learning_rate": 9.9978358675718e-06, + "loss": 1.4974404573440552, + "step": 262 + }, + { + "epoch": 0.08126202385532898, + "grad_norm": 5.46875, + "learning_rate": 9.997702733511082e-06, + "loss": 1.5387169122695923, + "step": 264 + }, + { + "epoch": 0.08187764524817237, + "grad_norm": 4.0625, + "learning_rate": 9.997565626843546e-06, + "loss": 1.240181803703308, + "step": 266 + }, + { + "epoch": 0.08249326664101578, + "grad_norm": 5.21875, + "learning_rate": 9.997424547705438e-06, + "loss": 1.3706841468811035, + "step": 268 + }, + { + "epoch": 0.08310888803385917, + "grad_norm": 5.8125, + "learning_rate": 9.997279496236952e-06, + "loss": 1.2704806327819824, + "step": 270 + }, + { + "epoch": 0.08372450942670258, + "grad_norm": 6.53125, + "learning_rate": 9.997130472582228e-06, + "loss": 1.5347249507904053, + "step": 272 + }, + { + "epoch": 0.08434013081954599, + "grad_norm": 4.65625, + "learning_rate": 9.996977476889351e-06, + "loss": 1.072393536567688, + "step": 274 + }, + { + "epoch": 0.08495575221238938, + "grad_norm": 5.96875, + "learning_rate": 9.996820509310363e-06, + "loss": 1.465416669845581, + "step": 276 + }, + { + "epoch": 0.08557137360523279, + "grad_norm": 4.78125, + "learning_rate": 9.996659570001242e-06, + "loss": 1.3557649850845337, + "step": 278 + }, + { + "epoch": 0.08618699499807618, + "grad_norm": 14.875, + "learning_rate": 9.996494659121919e-06, + "loss": 1.492985486984253, + "step": 280 + }, + { + "epoch": 0.08680261639091959, + "grad_norm": 8.6875, + "learning_rate": 9.996325776836267e-06, + "loss": 1.5023902654647827, + "step": 282 + }, + { + "epoch": 0.08741823778376298, + "grad_norm": 15.125, + "learning_rate": 9.996152923312111e-06, + "loss": 1.3942416906356812, + "step": 284 + }, + { + "epoch": 0.08803385917660639, + "grad_norm": 8.3125, + "learning_rate": 9.995976098721216e-06, + "loss": 1.9787180423736572, + "step": 286 + }, + { + "epoch": 0.08864948056944978, + "grad_norm": 8.8125, + "learning_rate": 9.9957953032393e-06, + "loss": 0.9891330599784851, + "step": 288 + }, + { + "epoch": 0.08926510196229319, + "grad_norm": 11.4375, + "learning_rate": 9.995610537046021e-06, + "loss": 1.5660462379455566, + "step": 290 + }, + { + "epoch": 0.0898807233551366, + "grad_norm": 5.3125, + "learning_rate": 9.995421800324987e-06, + "loss": 1.0548125505447388, + "step": 292 + }, + { + "epoch": 0.09049634474797999, + "grad_norm": 5.875, + "learning_rate": 9.99522909326375e-06, + "loss": 1.4109697341918945, + "step": 294 + }, + { + "epoch": 0.0911119661408234, + "grad_norm": 4.375, + "learning_rate": 9.995032416053804e-06, + "loss": 1.4787043333053589, + "step": 296 + }, + { + "epoch": 0.09172758753366679, + "grad_norm": 12.25, + "learning_rate": 9.994831768890598e-06, + "loss": 1.09103524684906, + "step": 298 + }, + { + "epoch": 0.0923432089265102, + "grad_norm": 8.3125, + "learning_rate": 9.994627151973513e-06, + "loss": 1.40105140209198, + "step": 300 + }, + { + "epoch": 0.09295883031935359, + "grad_norm": 2.171875, + "learning_rate": 9.994418565505885e-06, + "loss": 1.0116194486618042, + "step": 302 + }, + { + "epoch": 0.093574451712197, + "grad_norm": 22.375, + "learning_rate": 9.994206009694991e-06, + "loss": 1.894078016281128, + "step": 304 + }, + { + "epoch": 0.0941900731050404, + "grad_norm": 12.3125, + "learning_rate": 9.99398948475205e-06, + "loss": 1.7190589904785156, + "step": 306 + }, + { + "epoch": 0.0948056944978838, + "grad_norm": 3.75, + "learning_rate": 9.993768990892232e-06, + "loss": 0.814287543296814, + "step": 308 + }, + { + "epoch": 0.0954213158907272, + "grad_norm": 9.6875, + "learning_rate": 9.993544528334641e-06, + "loss": 2.03491473197937, + "step": 310 + }, + { + "epoch": 0.0960369372835706, + "grad_norm": 11.6875, + "learning_rate": 9.993316097302337e-06, + "loss": 1.7918344736099243, + "step": 312 + }, + { + "epoch": 0.09665255867641401, + "grad_norm": 8.875, + "learning_rate": 9.993083698022313e-06, + "loss": 1.7452021837234497, + "step": 314 + }, + { + "epoch": 0.0972681800692574, + "grad_norm": 1.7578125, + "learning_rate": 9.992847330725507e-06, + "loss": 0.6417583227157593, + "step": 316 + }, + { + "epoch": 0.09788380146210081, + "grad_norm": 4.125, + "learning_rate": 9.992606995646807e-06, + "loss": 1.199703335762024, + "step": 318 + }, + { + "epoch": 0.09849942285494422, + "grad_norm": 15.75, + "learning_rate": 9.99236269302504e-06, + "loss": 0.9593316316604614, + "step": 320 + }, + { + "epoch": 0.09911504424778761, + "grad_norm": 3.078125, + "learning_rate": 9.99211442310297e-06, + "loss": 1.2619590759277344, + "step": 322 + }, + { + "epoch": 0.09973066564063102, + "grad_norm": 6.65625, + "learning_rate": 9.991862186127312e-06, + "loss": 1.4878219366073608, + "step": 324 + }, + { + "epoch": 0.10034628703347441, + "grad_norm": 21.875, + "learning_rate": 9.99160598234872e-06, + "loss": 1.4164918661117554, + "step": 326 + }, + { + "epoch": 0.10096190842631782, + "grad_norm": 32.5, + "learning_rate": 9.991345812021786e-06, + "loss": 1.5547345876693726, + "step": 328 + }, + { + "epoch": 0.10157752981916121, + "grad_norm": 6.875, + "learning_rate": 9.991081675405049e-06, + "loss": 1.1818456649780273, + "step": 330 + }, + { + "epoch": 0.10219315121200462, + "grad_norm": 6.5625, + "learning_rate": 9.990813572760992e-06, + "loss": 1.2009780406951904, + "step": 332 + }, + { + "epoch": 0.10280877260484801, + "grad_norm": 9.75, + "learning_rate": 9.990541504356027e-06, + "loss": 1.4748668670654297, + "step": 334 + }, + { + "epoch": 0.10342439399769142, + "grad_norm": 3.90625, + "learning_rate": 9.990265470460516e-06, + "loss": 1.1837290525436401, + "step": 336 + }, + { + "epoch": 0.10404001539053483, + "grad_norm": 9.0, + "learning_rate": 9.989985471348765e-06, + "loss": 1.0815367698669434, + "step": 338 + }, + { + "epoch": 0.10465563678337822, + "grad_norm": 16.875, + "learning_rate": 9.989701507299013e-06, + "loss": 1.576779842376709, + "step": 340 + }, + { + "epoch": 0.10527125817622163, + "grad_norm": 6.34375, + "learning_rate": 9.98941357859344e-06, + "loss": 1.180511474609375, + "step": 342 + }, + { + "epoch": 0.10588687956906502, + "grad_norm": 28.875, + "learning_rate": 9.989121685518167e-06, + "loss": 1.410712718963623, + "step": 344 + }, + { + "epoch": 0.10650250096190843, + "grad_norm": 3.375, + "learning_rate": 9.988825828363254e-06, + "loss": 1.2783693075180054, + "step": 346 + }, + { + "epoch": 0.10711812235475182, + "grad_norm": 3.234375, + "learning_rate": 9.988526007422703e-06, + "loss": 1.327851414680481, + "step": 348 + }, + { + "epoch": 0.10773374374759523, + "grad_norm": 8.0625, + "learning_rate": 9.988222222994455e-06, + "loss": 0.9698463082313538, + "step": 350 + }, + { + "epoch": 0.10834936514043864, + "grad_norm": 2.828125, + "learning_rate": 9.987914475380382e-06, + "loss": 1.1605567932128906, + "step": 352 + }, + { + "epoch": 0.10896498653328203, + "grad_norm": 6.375, + "learning_rate": 9.987602764886304e-06, + "loss": 1.5284254550933838, + "step": 354 + }, + { + "epoch": 0.10958060792612544, + "grad_norm": 5.71875, + "learning_rate": 9.987287091821973e-06, + "loss": 1.3579685688018799, + "step": 356 + }, + { + "epoch": 0.11019622931896883, + "grad_norm": 6.125, + "learning_rate": 9.98696745650108e-06, + "loss": 1.316291093826294, + "step": 358 + }, + { + "epoch": 0.11081185071181224, + "grad_norm": 6.1875, + "learning_rate": 9.986643859241255e-06, + "loss": 1.4510215520858765, + "step": 360 + }, + { + "epoch": 0.11142747210465563, + "grad_norm": 10.3125, + "learning_rate": 9.986316300364063e-06, + "loss": 1.5002473592758179, + "step": 362 + }, + { + "epoch": 0.11204309349749904, + "grad_norm": 6.375, + "learning_rate": 9.985984780195006e-06, + "loss": 1.5161786079406738, + "step": 364 + }, + { + "epoch": 0.11265871489034245, + "grad_norm": 5.53125, + "learning_rate": 9.985649299063524e-06, + "loss": 0.5424119830131531, + "step": 366 + }, + { + "epoch": 0.11327433628318584, + "grad_norm": 12.5, + "learning_rate": 9.985309857302992e-06, + "loss": 1.6288212537765503, + "step": 368 + }, + { + "epoch": 0.11388995767602925, + "grad_norm": 10.125, + "learning_rate": 9.98496645525072e-06, + "loss": 1.3971303701400757, + "step": 370 + }, + { + "epoch": 0.11450557906887264, + "grad_norm": 2.21875, + "learning_rate": 9.984619093247956e-06, + "loss": 0.9907740354537964, + "step": 372 + }, + { + "epoch": 0.11512120046171605, + "grad_norm": 9.625, + "learning_rate": 9.98426777163988e-06, + "loss": 1.6520015001296997, + "step": 374 + }, + { + "epoch": 0.11573682185455944, + "grad_norm": 3.84375, + "learning_rate": 9.98391249077561e-06, + "loss": 1.2411595582962036, + "step": 376 + }, + { + "epoch": 0.11635244324740285, + "grad_norm": 9.5, + "learning_rate": 9.983553251008194e-06, + "loss": 1.3712900876998901, + "step": 378 + }, + { + "epoch": 0.11696806464024626, + "grad_norm": 3.3125, + "learning_rate": 9.983190052694618e-06, + "loss": 1.1593055725097656, + "step": 380 + }, + { + "epoch": 0.11758368603308965, + "grad_norm": 1.8359375, + "learning_rate": 9.9828228961958e-06, + "loss": 1.1706119775772095, + "step": 382 + }, + { + "epoch": 0.11819930742593306, + "grad_norm": 11.9375, + "learning_rate": 9.982451781876592e-06, + "loss": 1.6324158906936646, + "step": 384 + }, + { + "epoch": 0.11881492881877645, + "grad_norm": 6.5625, + "learning_rate": 9.982076710105778e-06, + "loss": 1.252874493598938, + "step": 386 + }, + { + "epoch": 0.11943055021161986, + "grad_norm": 6.28125, + "learning_rate": 9.981697681256075e-06, + "loss": 1.670320749282837, + "step": 388 + }, + { + "epoch": 0.12004617160446325, + "grad_norm": 6.28125, + "learning_rate": 9.981314695704134e-06, + "loss": 0.9480584859848022, + "step": 390 + }, + { + "epoch": 0.12066179299730666, + "grad_norm": 9.6875, + "learning_rate": 9.980927753830536e-06, + "loss": 1.4103999137878418, + "step": 392 + }, + { + "epoch": 0.12127741439015005, + "grad_norm": 3.34375, + "learning_rate": 9.980536856019793e-06, + "loss": 1.3862167596817017, + "step": 394 + }, + { + "epoch": 0.12189303578299346, + "grad_norm": 4.0625, + "learning_rate": 9.980142002660349e-06, + "loss": 1.2710118293762207, + "step": 396 + }, + { + "epoch": 0.12250865717583687, + "grad_norm": 3.890625, + "learning_rate": 9.979743194144578e-06, + "loss": 1.2509794235229492, + "step": 398 + }, + { + "epoch": 0.12312427856868026, + "grad_norm": 4.15625, + "learning_rate": 9.979340430868786e-06, + "loss": 1.3792119026184082, + "step": 400 + }, + { + "epoch": 0.12373989996152367, + "grad_norm": 5.90625, + "learning_rate": 9.978933713233208e-06, + "loss": 1.5315032005310059, + "step": 402 + }, + { + "epoch": 0.12435552135436706, + "grad_norm": 6.375, + "learning_rate": 9.978523041642007e-06, + "loss": 0.9869703054428101, + "step": 404 + }, + { + "epoch": 0.12497114274721047, + "grad_norm": 8.875, + "learning_rate": 9.97810841650328e-06, + "loss": 1.1880018711090088, + "step": 406 + }, + { + "epoch": 0.12558676414005387, + "grad_norm": 11.3125, + "learning_rate": 9.977689838229045e-06, + "loss": 1.3651795387268066, + "step": 408 + }, + { + "epoch": 0.12620238553289725, + "grad_norm": 10.9375, + "learning_rate": 9.977267307235255e-06, + "loss": 1.3049819469451904, + "step": 410 + }, + { + "epoch": 0.12681800692574066, + "grad_norm": 4.5, + "learning_rate": 9.976840823941789e-06, + "loss": 1.2135716676712036, + "step": 412 + }, + { + "epoch": 0.12743362831858407, + "grad_norm": 6.59375, + "learning_rate": 9.97641038877245e-06, + "loss": 1.504198670387268, + "step": 414 + }, + { + "epoch": 0.12804924971142748, + "grad_norm": 8.1875, + "learning_rate": 9.975976002154974e-06, + "loss": 1.4882495403289795, + "step": 416 + }, + { + "epoch": 0.12866487110427088, + "grad_norm": 10.1875, + "learning_rate": 9.97553766452102e-06, + "loss": 0.7723729610443115, + "step": 418 + }, + { + "epoch": 0.12928049249711426, + "grad_norm": 6.25, + "learning_rate": 9.975095376306174e-06, + "loss": 1.3737043142318726, + "step": 420 + }, + { + "epoch": 0.12989611388995767, + "grad_norm": 9.1875, + "learning_rate": 9.974649137949947e-06, + "loss": 1.471574068069458, + "step": 422 + }, + { + "epoch": 0.13051173528280108, + "grad_norm": 6.875, + "learning_rate": 9.974198949895778e-06, + "loss": 0.9776772856712341, + "step": 424 + }, + { + "epoch": 0.13112735667564449, + "grad_norm": 16.25, + "learning_rate": 9.973744812591027e-06, + "loss": 1.3625441789627075, + "step": 426 + }, + { + "epoch": 0.1317429780684879, + "grad_norm": 6.53125, + "learning_rate": 9.973286726486979e-06, + "loss": 1.225606918334961, + "step": 428 + }, + { + "epoch": 0.13235859946133127, + "grad_norm": 6.34375, + "learning_rate": 9.972824692038846e-06, + "loss": 1.3208441734313965, + "step": 430 + }, + { + "epoch": 0.13297422085417468, + "grad_norm": 12.1875, + "learning_rate": 9.972358709705767e-06, + "loss": 1.4286389350891113, + "step": 432 + }, + { + "epoch": 0.1335898422470181, + "grad_norm": 19.0, + "learning_rate": 9.971888779950791e-06, + "loss": 1.8515408039093018, + "step": 434 + }, + { + "epoch": 0.1342054636398615, + "grad_norm": 10.1875, + "learning_rate": 9.9714149032409e-06, + "loss": 1.8427278995513916, + "step": 436 + }, + { + "epoch": 0.13482108503270487, + "grad_norm": 8.3125, + "learning_rate": 9.970937080047001e-06, + "loss": 1.406190037727356, + "step": 438 + }, + { + "epoch": 0.13543670642554828, + "grad_norm": 5.34375, + "learning_rate": 9.970455310843911e-06, + "loss": 0.9209026098251343, + "step": 440 + }, + { + "epoch": 0.1360523278183917, + "grad_norm": 15.3125, + "learning_rate": 9.969969596110378e-06, + "loss": 1.5718461275100708, + "step": 442 + }, + { + "epoch": 0.1366679492112351, + "grad_norm": 2.890625, + "learning_rate": 9.969479936329067e-06, + "loss": 1.2917200326919556, + "step": 444 + }, + { + "epoch": 0.1372835706040785, + "grad_norm": 7.5, + "learning_rate": 9.968986331986565e-06, + "loss": 1.4151887893676758, + "step": 446 + }, + { + "epoch": 0.13789919199692188, + "grad_norm": 14.75, + "learning_rate": 9.968488783573376e-06, + "loss": 1.9977552890777588, + "step": 448 + }, + { + "epoch": 0.1385148133897653, + "grad_norm": 3.921875, + "learning_rate": 9.967987291583924e-06, + "loss": 1.2807127237319946, + "step": 450 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 20.875, + "learning_rate": 9.967481856516559e-06, + "loss": 1.5343042612075806, + "step": 452 + }, + { + "epoch": 0.1397460561754521, + "grad_norm": 7.6875, + "learning_rate": 9.966972478873536e-06, + "loss": 1.1871122121810913, + "step": 454 + }, + { + "epoch": 0.1403616775682955, + "grad_norm": 7.96875, + "learning_rate": 9.966459159161038e-06, + "loss": 1.477895736694336, + "step": 456 + }, + { + "epoch": 0.1409772989611389, + "grad_norm": 6.53125, + "learning_rate": 9.965941897889162e-06, + "loss": 1.4448455572128296, + "step": 458 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 6.59375, + "learning_rate": 9.96542069557192e-06, + "loss": 1.834481954574585, + "step": 460 + }, + { + "epoch": 0.1422085417468257, + "grad_norm": 10.5625, + "learning_rate": 9.96489555272725e-06, + "loss": 1.6769435405731201, + "step": 462 + }, + { + "epoch": 0.1428241631396691, + "grad_norm": 5.34375, + "learning_rate": 9.96436646987699e-06, + "loss": 1.369468092918396, + "step": 464 + }, + { + "epoch": 0.1434397845325125, + "grad_norm": 6.78125, + "learning_rate": 9.963833447546903e-06, + "loss": 1.3754758834838867, + "step": 466 + }, + { + "epoch": 0.1440554059253559, + "grad_norm": 7.34375, + "learning_rate": 9.963296486266667e-06, + "loss": 1.7363370656967163, + "step": 468 + }, + { + "epoch": 0.1446710273181993, + "grad_norm": 5.125, + "learning_rate": 9.962755586569873e-06, + "loss": 1.634676218032837, + "step": 470 + }, + { + "epoch": 0.14528664871104272, + "grad_norm": 8.0625, + "learning_rate": 9.962210748994023e-06, + "loss": 1.3580645322799683, + "step": 472 + }, + { + "epoch": 0.14590227010388612, + "grad_norm": 4.0, + "learning_rate": 9.961661974080537e-06, + "loss": 1.469011902809143, + "step": 474 + }, + { + "epoch": 0.1465178914967295, + "grad_norm": 8.25, + "learning_rate": 9.961109262374742e-06, + "loss": 1.6970473527908325, + "step": 476 + }, + { + "epoch": 0.1471335128895729, + "grad_norm": 2.546875, + "learning_rate": 9.960552614425882e-06, + "loss": 1.0189458131790161, + "step": 478 + }, + { + "epoch": 0.14774913428241632, + "grad_norm": 11.0625, + "learning_rate": 9.959992030787111e-06, + "loss": 1.264059066772461, + "step": 480 + }, + { + "epoch": 0.14836475567525972, + "grad_norm": 4.3125, + "learning_rate": 9.959427512015491e-06, + "loss": 1.3091117143630981, + "step": 482 + }, + { + "epoch": 0.1489803770681031, + "grad_norm": 14.375, + "learning_rate": 9.958859058671999e-06, + "loss": 1.2481650114059448, + "step": 484 + }, + { + "epoch": 0.1495959984609465, + "grad_norm": 5.28125, + "learning_rate": 9.95828667132152e-06, + "loss": 1.513774037361145, + "step": 486 + }, + { + "epoch": 0.15021161985378992, + "grad_norm": 11.375, + "learning_rate": 9.957710350532846e-06, + "loss": 1.5790436267852783, + "step": 488 + }, + { + "epoch": 0.15082724124663333, + "grad_norm": 10.5625, + "learning_rate": 9.957130096878682e-06, + "loss": 1.4887685775756836, + "step": 490 + }, + { + "epoch": 0.15144286263947673, + "grad_norm": 3.546875, + "learning_rate": 9.956545910935637e-06, + "loss": 1.1842014789581299, + "step": 492 + }, + { + "epoch": 0.1520584840323201, + "grad_norm": 6.65625, + "learning_rate": 9.955957793284234e-06, + "loss": 0.8832247257232666, + "step": 494 + }, + { + "epoch": 0.15267410542516352, + "grad_norm": 8.0625, + "learning_rate": 9.955365744508893e-06, + "loss": 1.473832130432129, + "step": 496 + }, + { + "epoch": 0.15328972681800693, + "grad_norm": 10.0625, + "learning_rate": 9.954769765197952e-06, + "loss": 1.6755919456481934, + "step": 498 + }, + { + "epoch": 0.15390534821085033, + "grad_norm": 4.5, + "learning_rate": 9.954169855943643e-06, + "loss": 1.207249402999878, + "step": 500 + }, + { + "epoch": 0.15452096960369374, + "grad_norm": 7.1875, + "learning_rate": 9.953566017342113e-06, + "loss": 1.4508755207061768, + "step": 502 + }, + { + "epoch": 0.15513659099653712, + "grad_norm": 7.59375, + "learning_rate": 9.95295824999341e-06, + "loss": 1.6686768531799316, + "step": 504 + }, + { + "epoch": 0.15575221238938053, + "grad_norm": 7.28125, + "learning_rate": 9.952346554501485e-06, + "loss": 1.4300891160964966, + "step": 506 + }, + { + "epoch": 0.15636783378222394, + "grad_norm": 9.375, + "learning_rate": 9.951730931474192e-06, + "loss": 1.3210822343826294, + "step": 508 + }, + { + "epoch": 0.15698345517506734, + "grad_norm": 6.03125, + "learning_rate": 9.951111381523291e-06, + "loss": 1.4945731163024902, + "step": 510 + }, + { + "epoch": 0.15759907656791072, + "grad_norm": 11.875, + "learning_rate": 9.950487905264445e-06, + "loss": 1.7546916007995605, + "step": 512 + }, + { + "epoch": 0.15821469796075413, + "grad_norm": 8.0625, + "learning_rate": 9.949860503317213e-06, + "loss": 1.4959115982055664, + "step": 514 + }, + { + "epoch": 0.15883031935359754, + "grad_norm": 7.28125, + "learning_rate": 9.94922917630506e-06, + "loss": 1.3217384815216064, + "step": 516 + }, + { + "epoch": 0.15944594074644095, + "grad_norm": 9.625, + "learning_rate": 9.948593924855347e-06, + "loss": 1.45122492313385, + "step": 518 + }, + { + "epoch": 0.16006156213928435, + "grad_norm": 16.375, + "learning_rate": 9.947954749599343e-06, + "loss": 1.3676655292510986, + "step": 520 + }, + { + "epoch": 0.16067718353212773, + "grad_norm": 11.4375, + "learning_rate": 9.947311651172205e-06, + "loss": 1.78343665599823, + "step": 522 + }, + { + "epoch": 0.16129280492497114, + "grad_norm": 8.6875, + "learning_rate": 9.946664630212998e-06, + "loss": 1.3944776058197021, + "step": 524 + }, + { + "epoch": 0.16190842631781455, + "grad_norm": 8.6875, + "learning_rate": 9.94601368736468e-06, + "loss": 1.7285417318344116, + "step": 526 + }, + { + "epoch": 0.16252404771065795, + "grad_norm": 8.9375, + "learning_rate": 9.945358823274107e-06, + "loss": 1.4867370128631592, + "step": 528 + }, + { + "epoch": 0.16313966910350133, + "grad_norm": 3.96875, + "learning_rate": 9.944700038592034e-06, + "loss": 0.9708921909332275, + "step": 530 + }, + { + "epoch": 0.16375529049634474, + "grad_norm": 3.125, + "learning_rate": 9.944037333973109e-06, + "loss": 1.3556715250015259, + "step": 532 + }, + { + "epoch": 0.16437091188918815, + "grad_norm": 5.625, + "learning_rate": 9.943370710075877e-06, + "loss": 1.361598253250122, + "step": 534 + }, + { + "epoch": 0.16498653328203156, + "grad_norm": 8.5, + "learning_rate": 9.942700167562774e-06, + "loss": 1.3754247426986694, + "step": 536 + }, + { + "epoch": 0.16560215467487496, + "grad_norm": 15.75, + "learning_rate": 9.942025707100139e-06, + "loss": 1.7027689218521118, + "step": 538 + }, + { + "epoch": 0.16621777606771834, + "grad_norm": 6.375, + "learning_rate": 9.941347329358193e-06, + "loss": 1.3407535552978516, + "step": 540 + }, + { + "epoch": 0.16683339746056175, + "grad_norm": 5.03125, + "learning_rate": 9.940665035011057e-06, + "loss": 1.505167007446289, + "step": 542 + }, + { + "epoch": 0.16744901885340516, + "grad_norm": 6.46875, + "learning_rate": 9.939978824736742e-06, + "loss": 1.297150731086731, + "step": 544 + }, + { + "epoch": 0.16806464024624856, + "grad_norm": 7.09375, + "learning_rate": 9.939288699217152e-06, + "loss": 0.9072936773300171, + "step": 546 + }, + { + "epoch": 0.16868026163909197, + "grad_norm": 7.625, + "learning_rate": 9.938594659138078e-06, + "loss": 1.2618991136550903, + "step": 548 + }, + { + "epoch": 0.16929588303193535, + "grad_norm": 39.25, + "learning_rate": 9.937896705189207e-06, + "loss": 1.3387863636016846, + "step": 550 + }, + { + "epoch": 0.16991150442477876, + "grad_norm": 15.8125, + "learning_rate": 9.937194838064106e-06, + "loss": 1.590695858001709, + "step": 552 + }, + { + "epoch": 0.17052712581762217, + "grad_norm": 6.96875, + "learning_rate": 9.93648905846024e-06, + "loss": 1.4013009071350098, + "step": 554 + }, + { + "epoch": 0.17114274721046557, + "grad_norm": 9.9375, + "learning_rate": 9.935779367078958e-06, + "loss": 1.6209547519683838, + "step": 556 + }, + { + "epoch": 0.17175836860330895, + "grad_norm": 5.1875, + "learning_rate": 9.935065764625493e-06, + "loss": 1.6528769731521606, + "step": 558 + }, + { + "epoch": 0.17237398999615236, + "grad_norm": 12.25, + "learning_rate": 9.934348251808972e-06, + "loss": 1.3280056715011597, + "step": 560 + }, + { + "epoch": 0.17298961138899577, + "grad_norm": 5.5, + "learning_rate": 9.9336268293424e-06, + "loss": 0.9575151801109314, + "step": 562 + }, + { + "epoch": 0.17360523278183917, + "grad_norm": 7.9375, + "learning_rate": 9.932901497942672e-06, + "loss": 1.3487428426742554, + "step": 564 + }, + { + "epoch": 0.17422085417468258, + "grad_norm": 2.828125, + "learning_rate": 9.932172258330566e-06, + "loss": 0.9193823337554932, + "step": 566 + }, + { + "epoch": 0.17483647556752596, + "grad_norm": 4.4375, + "learning_rate": 9.931439111230745e-06, + "loss": 1.2267365455627441, + "step": 568 + }, + { + "epoch": 0.17545209696036937, + "grad_norm": 8.8125, + "learning_rate": 9.930702057371752e-06, + "loss": 1.499516487121582, + "step": 570 + }, + { + "epoch": 0.17606771835321278, + "grad_norm": 7.96875, + "learning_rate": 9.929961097486018e-06, + "loss": 1.337449312210083, + "step": 572 + }, + { + "epoch": 0.17668333974605618, + "grad_norm": 3.921875, + "learning_rate": 9.929216232309845e-06, + "loss": 1.2182761430740356, + "step": 574 + }, + { + "epoch": 0.17729896113889956, + "grad_norm": 8.375, + "learning_rate": 9.928467462583425e-06, + "loss": 1.60722815990448, + "step": 576 + }, + { + "epoch": 0.17791458253174297, + "grad_norm": 3.515625, + "learning_rate": 9.927714789050828e-06, + "loss": 0.9616989493370056, + "step": 578 + }, + { + "epoch": 0.17853020392458638, + "grad_norm": 6.78125, + "learning_rate": 9.926958212460002e-06, + "loss": 0.9722899794578552, + "step": 580 + }, + { + "epoch": 0.17914582531742979, + "grad_norm": 75.0, + "learning_rate": 9.926197733562774e-06, + "loss": 1.2725977897644043, + "step": 582 + }, + { + "epoch": 0.1797614467102732, + "grad_norm": 27.125, + "learning_rate": 9.925433353114851e-06, + "loss": 1.7348570823669434, + "step": 584 + }, + { + "epoch": 0.18037706810311657, + "grad_norm": 10.5, + "learning_rate": 9.924665071875812e-06, + "loss": 1.5623992681503296, + "step": 586 + }, + { + "epoch": 0.18099268949595998, + "grad_norm": 8.25, + "learning_rate": 9.923892890609118e-06, + "loss": 1.6208124160766602, + "step": 588 + }, + { + "epoch": 0.1816083108888034, + "grad_norm": 6.875, + "learning_rate": 9.923116810082096e-06, + "loss": 1.3482056856155396, + "step": 590 + }, + { + "epoch": 0.1822239322816468, + "grad_norm": 3.15625, + "learning_rate": 9.922336831065966e-06, + "loss": 1.4589314460754395, + "step": 592 + }, + { + "epoch": 0.1828395536744902, + "grad_norm": 6.5, + "learning_rate": 9.9215529543358e-06, + "loss": 1.4144929647445679, + "step": 594 + }, + { + "epoch": 0.18345517506733358, + "grad_norm": 4.03125, + "learning_rate": 9.920765180670562e-06, + "loss": 1.389893651008606, + "step": 596 + }, + { + "epoch": 0.184070796460177, + "grad_norm": 2.875, + "learning_rate": 9.919973510853076e-06, + "loss": 0.9373916387557983, + "step": 598 + }, + { + "epoch": 0.1846864178530204, + "grad_norm": 7.4375, + "learning_rate": 9.91917794567004e-06, + "loss": 0.8495765924453735, + "step": 600 + }, + { + "epoch": 0.1853020392458638, + "grad_norm": 11.25, + "learning_rate": 9.91837848591203e-06, + "loss": 1.3704453706741333, + "step": 602 + }, + { + "epoch": 0.18591766063870718, + "grad_norm": 17.625, + "learning_rate": 9.917575132373485e-06, + "loss": 1.4358397722244263, + "step": 604 + }, + { + "epoch": 0.1865332820315506, + "grad_norm": 6.0, + "learning_rate": 9.916767885852716e-06, + "loss": 1.389540195465088, + "step": 606 + }, + { + "epoch": 0.187148903424394, + "grad_norm": 4.90625, + "learning_rate": 9.9159567471519e-06, + "loss": 1.5843164920806885, + "step": 608 + }, + { + "epoch": 0.1877645248172374, + "grad_norm": 5.03125, + "learning_rate": 9.915141717077087e-06, + "loss": 1.5799970626831055, + "step": 610 + }, + { + "epoch": 0.1883801462100808, + "grad_norm": 15.375, + "learning_rate": 9.914322796438185e-06, + "loss": 0.6345373392105103, + "step": 612 + }, + { + "epoch": 0.1889957676029242, + "grad_norm": 5.90625, + "learning_rate": 9.913499986048979e-06, + "loss": 1.2795158624649048, + "step": 614 + }, + { + "epoch": 0.1896113889957676, + "grad_norm": 14.125, + "learning_rate": 9.912673286727112e-06, + "loss": 1.3732866048812866, + "step": 616 + }, + { + "epoch": 0.190227010388611, + "grad_norm": 16.5, + "learning_rate": 9.911842699294095e-06, + "loss": 1.2570321559906006, + "step": 618 + }, + { + "epoch": 0.1908426317814544, + "grad_norm": 5.53125, + "learning_rate": 9.9110082245753e-06, + "loss": 0.9575303792953491, + "step": 620 + }, + { + "epoch": 0.1914582531742978, + "grad_norm": 12.25, + "learning_rate": 9.910169863399964e-06, + "loss": 1.4983577728271484, + "step": 622 + }, + { + "epoch": 0.1920738745671412, + "grad_norm": 33.75, + "learning_rate": 9.909327616601185e-06, + "loss": 1.6086472272872925, + "step": 624 + }, + { + "epoch": 0.1926894959599846, + "grad_norm": 14.8125, + "learning_rate": 9.908481485015922e-06, + "loss": 0.9734293818473816, + "step": 626 + }, + { + "epoch": 0.19330511735282802, + "grad_norm": 10.5625, + "learning_rate": 9.907631469484997e-06, + "loss": 1.6982768774032593, + "step": 628 + }, + { + "epoch": 0.19392073874567142, + "grad_norm": 11.25, + "learning_rate": 9.906777570853088e-06, + "loss": 1.8723676204681396, + "step": 630 + }, + { + "epoch": 0.1945363601385148, + "grad_norm": 7.28125, + "learning_rate": 9.90591978996873e-06, + "loss": 1.3249781131744385, + "step": 632 + }, + { + "epoch": 0.1951519815313582, + "grad_norm": 3.8125, + "learning_rate": 9.905058127684326e-06, + "loss": 1.2242438793182373, + "step": 634 + }, + { + "epoch": 0.19576760292420162, + "grad_norm": 5.90625, + "learning_rate": 9.904192584856123e-06, + "loss": 1.4721521139144897, + "step": 636 + }, + { + "epoch": 0.19638322431704502, + "grad_norm": 10.875, + "learning_rate": 9.903323162344234e-06, + "loss": 1.2291452884674072, + "step": 638 + }, + { + "epoch": 0.19699884570988843, + "grad_norm": 10.8125, + "learning_rate": 9.902449861012622e-06, + "loss": 1.4630839824676514, + "step": 640 + }, + { + "epoch": 0.1976144671027318, + "grad_norm": 6.59375, + "learning_rate": 9.901572681729106e-06, + "loss": 1.6249059438705444, + "step": 642 + }, + { + "epoch": 0.19823008849557522, + "grad_norm": 7.75, + "learning_rate": 9.90069162536536e-06, + "loss": 1.314393162727356, + "step": 644 + }, + { + "epoch": 0.19884570988841863, + "grad_norm": 6.8125, + "learning_rate": 9.899806692796907e-06, + "loss": 1.394429326057434, + "step": 646 + }, + { + "epoch": 0.19946133128126203, + "grad_norm": 12.125, + "learning_rate": 9.898917884903127e-06, + "loss": 1.1286661624908447, + "step": 648 + }, + { + "epoch": 0.2000769526741054, + "grad_norm": 5.84375, + "learning_rate": 9.898025202567247e-06, + "loss": 1.5071682929992676, + "step": 650 + }, + { + "epoch": 0.20069257406694882, + "grad_norm": 8.4375, + "learning_rate": 9.897128646676349e-06, + "loss": 1.4699369668960571, + "step": 652 + }, + { + "epoch": 0.20130819545979223, + "grad_norm": 4.90625, + "learning_rate": 9.896228218121353e-06, + "loss": 1.037987470626831, + "step": 654 + }, + { + "epoch": 0.20192381685263563, + "grad_norm": 11.8125, + "learning_rate": 9.895323917797042e-06, + "loss": 1.6456223726272583, + "step": 656 + }, + { + "epoch": 0.20253943824547904, + "grad_norm": 97.0, + "learning_rate": 9.894415746602035e-06, + "loss": 1.4587563276290894, + "step": 658 + }, + { + "epoch": 0.20315505963832242, + "grad_norm": 5.875, + "learning_rate": 9.893503705438806e-06, + "loss": 1.4608798027038574, + "step": 660 + }, + { + "epoch": 0.20377068103116583, + "grad_norm": 3.0, + "learning_rate": 9.892587795213666e-06, + "loss": 1.3375227451324463, + "step": 662 + }, + { + "epoch": 0.20438630242400924, + "grad_norm": 15.625, + "learning_rate": 9.891668016836782e-06, + "loss": 1.3512498140335083, + "step": 664 + }, + { + "epoch": 0.20500192381685264, + "grad_norm": 6.78125, + "learning_rate": 9.890744371222152e-06, + "loss": 0.7749507427215576, + "step": 666 + }, + { + "epoch": 0.20561754520969602, + "grad_norm": 6.25, + "learning_rate": 9.889816859287627e-06, + "loss": 1.158947229385376, + "step": 668 + }, + { + "epoch": 0.20623316660253943, + "grad_norm": 4.59375, + "learning_rate": 9.888885481954895e-06, + "loss": 1.391932487487793, + "step": 670 + }, + { + "epoch": 0.20684878799538284, + "grad_norm": 5.09375, + "learning_rate": 9.887950240149486e-06, + "loss": 1.5135208368301392, + "step": 672 + }, + { + "epoch": 0.20746440938822625, + "grad_norm": 7.375, + "learning_rate": 9.887011134800774e-06, + "loss": 1.2985072135925293, + "step": 674 + }, + { + "epoch": 0.20808003078106965, + "grad_norm": 8.5625, + "learning_rate": 9.886068166841966e-06, + "loss": 1.1860077381134033, + "step": 676 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 6.40625, + "learning_rate": 9.88512133721011e-06, + "loss": 1.6630322933197021, + "step": 678 + }, + { + "epoch": 0.20931127356675644, + "grad_norm": 13.0, + "learning_rate": 9.884170646846096e-06, + "loss": 1.4792208671569824, + "step": 680 + }, + { + "epoch": 0.20992689495959985, + "grad_norm": 14.6875, + "learning_rate": 9.883216096694641e-06, + "loss": 1.2808235883712769, + "step": 682 + }, + { + "epoch": 0.21054251635244325, + "grad_norm": 3.9375, + "learning_rate": 9.882257687704304e-06, + "loss": 1.296074628829956, + "step": 684 + }, + { + "epoch": 0.21115813774528666, + "grad_norm": 8.875, + "learning_rate": 9.881295420827482e-06, + "loss": 1.5456552505493164, + "step": 686 + }, + { + "epoch": 0.21177375913813004, + "grad_norm": 7.96875, + "learning_rate": 9.880329297020394e-06, + "loss": 1.8220405578613281, + "step": 688 + }, + { + "epoch": 0.21238938053097345, + "grad_norm": 3.90625, + "learning_rate": 9.879359317243104e-06, + "loss": 1.4817025661468506, + "step": 690 + }, + { + "epoch": 0.21300500192381686, + "grad_norm": 10.125, + "learning_rate": 9.878385482459505e-06, + "loss": 1.1662178039550781, + "step": 692 + }, + { + "epoch": 0.21362062331666026, + "grad_norm": 11.375, + "learning_rate": 9.87740779363731e-06, + "loss": 1.4008355140686035, + "step": 694 + }, + { + "epoch": 0.21423624470950364, + "grad_norm": 8.8125, + "learning_rate": 9.876426251748079e-06, + "loss": 1.5103132724761963, + "step": 696 + }, + { + "epoch": 0.21485186610234705, + "grad_norm": 4.0, + "learning_rate": 9.875440857767187e-06, + "loss": 1.3361725807189941, + "step": 698 + }, + { + "epoch": 0.21546748749519046, + "grad_norm": 4.0, + "learning_rate": 9.874451612673841e-06, + "loss": 1.2430728673934937, + "step": 700 + }, + { + "epoch": 0.21608310888803386, + "grad_norm": 4.75, + "learning_rate": 9.87345851745108e-06, + "loss": 1.2919254302978516, + "step": 702 + }, + { + "epoch": 0.21669873028087727, + "grad_norm": 6.65625, + "learning_rate": 9.872461573085766e-06, + "loss": 1.1002001762390137, + "step": 704 + }, + { + "epoch": 0.21731435167372065, + "grad_norm": 11.1875, + "learning_rate": 9.871460780568578e-06, + "loss": 1.653430700302124, + "step": 706 + }, + { + "epoch": 0.21792997306656406, + "grad_norm": 57.25, + "learning_rate": 9.870456140894033e-06, + "loss": 1.343963384628296, + "step": 708 + }, + { + "epoch": 0.21854559445940747, + "grad_norm": 12.5625, + "learning_rate": 9.869447655060463e-06, + "loss": 1.4002281427383423, + "step": 710 + }, + { + "epoch": 0.21916121585225087, + "grad_norm": 6.75, + "learning_rate": 9.868435324070016e-06, + "loss": 1.4317114353179932, + "step": 712 + }, + { + "epoch": 0.21977683724509428, + "grad_norm": 8.0625, + "learning_rate": 9.867419148928677e-06, + "loss": 1.3845818042755127, + "step": 714 + }, + { + "epoch": 0.22039245863793766, + "grad_norm": 6.0625, + "learning_rate": 9.866399130646238e-06, + "loss": 1.2754722833633423, + "step": 716 + }, + { + "epoch": 0.22100808003078107, + "grad_norm": 10.5625, + "learning_rate": 9.865375270236314e-06, + "loss": 1.173915147781372, + "step": 718 + }, + { + "epoch": 0.22162370142362448, + "grad_norm": 6.65625, + "learning_rate": 9.864347568716337e-06, + "loss": 1.476589322090149, + "step": 720 + }, + { + "epoch": 0.22223932281646788, + "grad_norm": 10.375, + "learning_rate": 9.863316027107561e-06, + "loss": 1.672022819519043, + "step": 722 + }, + { + "epoch": 0.22285494420931126, + "grad_norm": 12.25, + "learning_rate": 9.862280646435048e-06, + "loss": 0.7103063464164734, + "step": 724 + }, + { + "epoch": 0.22347056560215467, + "grad_norm": 5.875, + "learning_rate": 9.86124142772768e-06, + "loss": 1.5016530752182007, + "step": 726 + }, + { + "epoch": 0.22408618699499808, + "grad_norm": 6.78125, + "learning_rate": 9.860198372018153e-06, + "loss": 1.6842578649520874, + "step": 728 + }, + { + "epoch": 0.22470180838784148, + "grad_norm": 22.375, + "learning_rate": 9.859151480342975e-06, + "loss": 1.2881923913955688, + "step": 730 + }, + { + "epoch": 0.2253174297806849, + "grad_norm": 4.53125, + "learning_rate": 9.858100753742463e-06, + "loss": 1.1919798851013184, + "step": 732 + }, + { + "epoch": 0.22593305117352827, + "grad_norm": 5.59375, + "learning_rate": 9.857046193260751e-06, + "loss": 1.1353979110717773, + "step": 734 + }, + { + "epoch": 0.22654867256637168, + "grad_norm": 8.9375, + "learning_rate": 9.855987799945777e-06, + "loss": 1.460937261581421, + "step": 736 + }, + { + "epoch": 0.22716429395921509, + "grad_norm": 7.65625, + "learning_rate": 9.854925574849292e-06, + "loss": 1.3833016157150269, + "step": 738 + }, + { + "epoch": 0.2277799153520585, + "grad_norm": 30.125, + "learning_rate": 9.853859519026852e-06, + "loss": 1.7745245695114136, + "step": 740 + }, + { + "epoch": 0.22839553674490187, + "grad_norm": 7.53125, + "learning_rate": 9.852789633537818e-06, + "loss": 1.6859065294265747, + "step": 742 + }, + { + "epoch": 0.22901115813774528, + "grad_norm": 6.34375, + "learning_rate": 9.851715919445364e-06, + "loss": 0.9433289170265198, + "step": 744 + }, + { + "epoch": 0.2296267795305887, + "grad_norm": 9.125, + "learning_rate": 9.85063837781646e-06, + "loss": 1.363451361656189, + "step": 746 + }, + { + "epoch": 0.2302424009234321, + "grad_norm": 8.1875, + "learning_rate": 9.849557009721885e-06, + "loss": 1.1893389225006104, + "step": 748 + }, + { + "epoch": 0.2308580223162755, + "grad_norm": 3.84375, + "learning_rate": 9.84847181623622e-06, + "loss": 1.209572672843933, + "step": 750 + }, + { + "epoch": 0.23147364370911888, + "grad_norm": 12.125, + "learning_rate": 9.847382798437843e-06, + "loss": 1.3902331590652466, + "step": 752 + }, + { + "epoch": 0.2320892651019623, + "grad_norm": 5.875, + "learning_rate": 9.846289957408939e-06, + "loss": 1.4305294752120972, + "step": 754 + }, + { + "epoch": 0.2327048864948057, + "grad_norm": 5.28125, + "learning_rate": 9.845193294235484e-06, + "loss": 1.384616732597351, + "step": 756 + }, + { + "epoch": 0.2333205078876491, + "grad_norm": 5.40625, + "learning_rate": 9.84409281000726e-06, + "loss": 1.3366835117340088, + "step": 758 + }, + { + "epoch": 0.2339361292804925, + "grad_norm": 4.09375, + "learning_rate": 9.842988505817843e-06, + "loss": 1.4528403282165527, + "step": 760 + }, + { + "epoch": 0.2345517506733359, + "grad_norm": 4.78125, + "learning_rate": 9.841880382764604e-06, + "loss": 1.1686084270477295, + "step": 762 + }, + { + "epoch": 0.2351673720661793, + "grad_norm": 6.125, + "learning_rate": 9.84076844194871e-06, + "loss": 1.7286490201950073, + "step": 764 + }, + { + "epoch": 0.2357829934590227, + "grad_norm": 11.4375, + "learning_rate": 9.839652684475118e-06, + "loss": 1.6478850841522217, + "step": 766 + }, + { + "epoch": 0.2363986148518661, + "grad_norm": 6.5625, + "learning_rate": 9.838533111452586e-06, + "loss": 1.5273628234863281, + "step": 768 + }, + { + "epoch": 0.2370142362447095, + "grad_norm": 10.0, + "learning_rate": 9.837409723993658e-06, + "loss": 1.1861430406570435, + "step": 770 + }, + { + "epoch": 0.2376298576375529, + "grad_norm": 4.5625, + "learning_rate": 9.83628252321467e-06, + "loss": 1.1912150382995605, + "step": 772 + }, + { + "epoch": 0.2382454790303963, + "grad_norm": 18.5, + "learning_rate": 9.835151510235744e-06, + "loss": 1.6655570268630981, + "step": 774 + }, + { + "epoch": 0.23886110042323971, + "grad_norm": 5.0, + "learning_rate": 9.834016686180794e-06, + "loss": 1.0292619466781616, + "step": 776 + }, + { + "epoch": 0.23947672181608312, + "grad_norm": 6.90625, + "learning_rate": 9.83287805217752e-06, + "loss": 1.4104214906692505, + "step": 778 + }, + { + "epoch": 0.2400923432089265, + "grad_norm": 12.875, + "learning_rate": 9.831735609357408e-06, + "loss": 1.3935370445251465, + "step": 780 + }, + { + "epoch": 0.2407079646017699, + "grad_norm": 11.1875, + "learning_rate": 9.83058935885573e-06, + "loss": 1.1317634582519531, + "step": 782 + }, + { + "epoch": 0.24132358599461332, + "grad_norm": 2.84375, + "learning_rate": 9.82943930181154e-06, + "loss": 1.459189772605896, + "step": 784 + }, + { + "epoch": 0.24193920738745672, + "grad_norm": 8.625, + "learning_rate": 9.828285439367678e-06, + "loss": 1.6490992307662964, + "step": 786 + }, + { + "epoch": 0.2425548287803001, + "grad_norm": 34.75, + "learning_rate": 9.827127772670758e-06, + "loss": 1.3461780548095703, + "step": 788 + }, + { + "epoch": 0.2431704501731435, + "grad_norm": 6.21875, + "learning_rate": 9.825966302871183e-06, + "loss": 1.4193799495697021, + "step": 790 + }, + { + "epoch": 0.24378607156598692, + "grad_norm": 5.28125, + "learning_rate": 9.82480103112313e-06, + "loss": 1.273714542388916, + "step": 792 + }, + { + "epoch": 0.24440169295883032, + "grad_norm": 11.0625, + "learning_rate": 9.823631958584556e-06, + "loss": 1.255758285522461, + "step": 794 + }, + { + "epoch": 0.24501731435167373, + "grad_norm": 5.9375, + "learning_rate": 9.822459086417195e-06, + "loss": 1.5950769186019897, + "step": 796 + }, + { + "epoch": 0.2456329357445171, + "grad_norm": 6.96875, + "learning_rate": 9.821282415786557e-06, + "loss": 1.4654479026794434, + "step": 798 + }, + { + "epoch": 0.24624855713736052, + "grad_norm": 10.625, + "learning_rate": 9.820101947861927e-06, + "loss": 1.5545507669448853, + "step": 800 + }, + { + "epoch": 0.24686417853020393, + "grad_norm": 6.75, + "learning_rate": 9.818917683816358e-06, + "loss": 1.6426501274108887, + "step": 802 + }, + { + "epoch": 0.24747979992304733, + "grad_norm": 7.40625, + "learning_rate": 9.817729624826681e-06, + "loss": 1.5924813747406006, + "step": 804 + }, + { + "epoch": 0.24809542131589074, + "grad_norm": 16.25, + "learning_rate": 9.816537772073502e-06, + "loss": 1.791450023651123, + "step": 806 + }, + { + "epoch": 0.24871104270873412, + "grad_norm": 14.375, + "learning_rate": 9.815342126741185e-06, + "loss": 1.4027529954910278, + "step": 808 + }, + { + "epoch": 0.24932666410157753, + "grad_norm": 14.625, + "learning_rate": 9.814142690017875e-06, + "loss": 0.960854709148407, + "step": 810 + }, + { + "epoch": 0.24994228549442093, + "grad_norm": 5.9375, + "learning_rate": 9.812939463095476e-06, + "loss": 1.6002954244613647, + "step": 812 + }, + { + "epoch": 0.2505579068872643, + "grad_norm": 2.90625, + "learning_rate": 9.811732447169662e-06, + "loss": 1.1075432300567627, + "step": 814 + }, + { + "epoch": 0.25117352828010775, + "grad_norm": 4.9375, + "learning_rate": 9.810521643439872e-06, + "loss": 1.1781002283096313, + "step": 816 + }, + { + "epoch": 0.25178914967295113, + "grad_norm": 20.75, + "learning_rate": 9.80930705310931e-06, + "loss": 1.0230952501296997, + "step": 818 + }, + { + "epoch": 0.2524047710657945, + "grad_norm": 1.7265625, + "learning_rate": 9.808088677384939e-06, + "loss": 1.1417490243911743, + "step": 820 + }, + { + "epoch": 0.25302039245863794, + "grad_norm": 10.6875, + "learning_rate": 9.806866517477487e-06, + "loss": 1.560872197151184, + "step": 822 + }, + { + "epoch": 0.2536360138514813, + "grad_norm": 3.0625, + "learning_rate": 9.805640574601443e-06, + "loss": 1.2669354677200317, + "step": 824 + }, + { + "epoch": 0.25425163524432476, + "grad_norm": 5.03125, + "learning_rate": 9.804410849975056e-06, + "loss": 1.2113425731658936, + "step": 826 + }, + { + "epoch": 0.25486725663716814, + "grad_norm": 8.75, + "learning_rate": 9.803177344820326e-06, + "loss": 0.6720989942550659, + "step": 828 + }, + { + "epoch": 0.2554828780300115, + "grad_norm": 8.3125, + "learning_rate": 9.801940060363018e-06, + "loss": 1.5107386112213135, + "step": 830 + }, + { + "epoch": 0.25609849942285495, + "grad_norm": 2.140625, + "learning_rate": 9.800698997832647e-06, + "loss": 1.2716776132583618, + "step": 832 + }, + { + "epoch": 0.25671412081569833, + "grad_norm": 6.46875, + "learning_rate": 9.799454158462487e-06, + "loss": 1.4024817943572998, + "step": 834 + }, + { + "epoch": 0.25732974220854177, + "grad_norm": 12.25, + "learning_rate": 9.798205543489562e-06, + "loss": 1.0063973665237427, + "step": 836 + }, + { + "epoch": 0.25794536360138515, + "grad_norm": 9.5625, + "learning_rate": 9.79695315415465e-06, + "loss": 0.7014191150665283, + "step": 838 + }, + { + "epoch": 0.2585609849942285, + "grad_norm": 19.625, + "learning_rate": 9.795696991702274e-06, + "loss": 1.6371238231658936, + "step": 840 + }, + { + "epoch": 0.25917660638707196, + "grad_norm": 7.03125, + "learning_rate": 9.794437057380714e-06, + "loss": 1.376318097114563, + "step": 842 + }, + { + "epoch": 0.25979222777991534, + "grad_norm": 4.5, + "learning_rate": 9.793173352441996e-06, + "loss": 1.3747721910476685, + "step": 844 + }, + { + "epoch": 0.2604078491727588, + "grad_norm": 9.125, + "learning_rate": 9.791905878141891e-06, + "loss": 1.455966830253601, + "step": 846 + }, + { + "epoch": 0.26102347056560216, + "grad_norm": 149.0, + "learning_rate": 9.790634635739915e-06, + "loss": 1.793878197669983, + "step": 848 + }, + { + "epoch": 0.26163909195844554, + "grad_norm": 37.5, + "learning_rate": 9.789359626499332e-06, + "loss": 1.0845096111297607, + "step": 850 + }, + { + "epoch": 0.26225471335128897, + "grad_norm": 9.9375, + "learning_rate": 9.788080851687145e-06, + "loss": 1.3753288984298706, + "step": 852 + }, + { + "epoch": 0.26287033474413235, + "grad_norm": 5.15625, + "learning_rate": 9.786798312574104e-06, + "loss": 1.1450377702713013, + "step": 854 + }, + { + "epoch": 0.2634859561369758, + "grad_norm": 11.625, + "learning_rate": 9.785512010434695e-06, + "loss": 1.6636018753051758, + "step": 856 + }, + { + "epoch": 0.26410157752981916, + "grad_norm": 8.5, + "learning_rate": 9.784221946547146e-06, + "loss": 1.6864784955978394, + "step": 858 + }, + { + "epoch": 0.26471719892266254, + "grad_norm": 6.1875, + "learning_rate": 9.782928122193423e-06, + "loss": 1.5488227605819702, + "step": 860 + }, + { + "epoch": 0.265332820315506, + "grad_norm": 7.78125, + "learning_rate": 9.781630538659226e-06, + "loss": 1.5389184951782227, + "step": 862 + }, + { + "epoch": 0.26594844170834936, + "grad_norm": 13.125, + "learning_rate": 9.780329197233995e-06, + "loss": 1.5071114301681519, + "step": 864 + }, + { + "epoch": 0.2665640631011928, + "grad_norm": 496.0, + "learning_rate": 9.7790240992109e-06, + "loss": 0.7931236624717712, + "step": 866 + }, + { + "epoch": 0.2671796844940362, + "grad_norm": 4.21875, + "learning_rate": 9.777715245886852e-06, + "loss": 1.1507668495178223, + "step": 868 + }, + { + "epoch": 0.26779530588687955, + "grad_norm": 5.8125, + "learning_rate": 9.77640263856248e-06, + "loss": 1.1638729572296143, + "step": 870 + }, + { + "epoch": 0.268410927279723, + "grad_norm": 4.34375, + "learning_rate": 9.775086278542156e-06, + "loss": 1.0706069469451904, + "step": 872 + }, + { + "epoch": 0.26902654867256637, + "grad_norm": 4.65625, + "learning_rate": 9.773766167133976e-06, + "loss": 1.0477044582366943, + "step": 874 + }, + { + "epoch": 0.26964217006540975, + "grad_norm": 5.59375, + "learning_rate": 9.77244230564976e-06, + "loss": 0.9538878798484802, + "step": 876 + }, + { + "epoch": 0.2702577914582532, + "grad_norm": 9.625, + "learning_rate": 9.771114695405066e-06, + "loss": 1.4235448837280273, + "step": 878 + }, + { + "epoch": 0.27087341285109656, + "grad_norm": 7.21875, + "learning_rate": 9.769783337719166e-06, + "loss": 1.4780068397521973, + "step": 880 + }, + { + "epoch": 0.27148903424394, + "grad_norm": 26.0, + "learning_rate": 9.76844823391506e-06, + "loss": 0.8643165230751038, + "step": 882 + }, + { + "epoch": 0.2721046556367834, + "grad_norm": 9.75, + "learning_rate": 9.767109385319472e-06, + "loss": 1.3260691165924072, + "step": 884 + }, + { + "epoch": 0.27272027702962676, + "grad_norm": 11.125, + "learning_rate": 9.765766793262843e-06, + "loss": 1.1863996982574463, + "step": 886 + }, + { + "epoch": 0.2733358984224702, + "grad_norm": 7.0, + "learning_rate": 9.76442045907934e-06, + "loss": 1.3834737539291382, + "step": 888 + }, + { + "epoch": 0.27395151981531357, + "grad_norm": 14.1875, + "learning_rate": 9.763070384106845e-06, + "loss": 1.594760775566101, + "step": 890 + }, + { + "epoch": 0.274567141208157, + "grad_norm": 23.625, + "learning_rate": 9.761716569686954e-06, + "loss": 1.3638572692871094, + "step": 892 + }, + { + "epoch": 0.2751827626010004, + "grad_norm": 5.21875, + "learning_rate": 9.760359017164989e-06, + "loss": 1.6811443567276, + "step": 894 + }, + { + "epoch": 0.27579838399384377, + "grad_norm": 8.1875, + "learning_rate": 9.758997727889977e-06, + "loss": 1.3500550985336304, + "step": 896 + }, + { + "epoch": 0.2764140053866872, + "grad_norm": 5.875, + "learning_rate": 9.75763270321466e-06, + "loss": 1.2751091718673706, + "step": 898 + }, + { + "epoch": 0.2770296267795306, + "grad_norm": 5.75, + "learning_rate": 9.756263944495495e-06, + "loss": 1.177546501159668, + "step": 900 + }, + { + "epoch": 0.277645248172374, + "grad_norm": 5.40625, + "learning_rate": 9.754891453092649e-06, + "loss": 1.2334669828414917, + "step": 902 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 3.71875, + "learning_rate": 9.753515230369997e-06, + "loss": 1.0920450687408447, + "step": 904 + }, + { + "epoch": 0.2788764909580608, + "grad_norm": 5.15625, + "learning_rate": 9.752135277695122e-06, + "loss": 1.4145758152008057, + "step": 906 + }, + { + "epoch": 0.2794921123509042, + "grad_norm": 5.6875, + "learning_rate": 9.750751596439316e-06, + "loss": 1.1625487804412842, + "step": 908 + }, + { + "epoch": 0.2801077337437476, + "grad_norm": 8.375, + "learning_rate": 9.749364187977572e-06, + "loss": 1.4057106971740723, + "step": 910 + }, + { + "epoch": 0.280723355136591, + "grad_norm": 16.125, + "learning_rate": 9.747973053688589e-06, + "loss": 0.9219164252281189, + "step": 912 + }, + { + "epoch": 0.2813389765294344, + "grad_norm": 6.71875, + "learning_rate": 9.746578194954767e-06, + "loss": 0.9533755779266357, + "step": 914 + }, + { + "epoch": 0.2819545979222778, + "grad_norm": 2.953125, + "learning_rate": 9.745179613162213e-06, + "loss": 0.9482744932174683, + "step": 916 + }, + { + "epoch": 0.2825702193151212, + "grad_norm": 9.75, + "learning_rate": 9.743777309700724e-06, + "loss": 1.7051759958267212, + "step": 918 + }, + { + "epoch": 0.2831858407079646, + "grad_norm": 6.03125, + "learning_rate": 9.742371285963802e-06, + "loss": 1.216848611831665, + "step": 920 + }, + { + "epoch": 0.283801462100808, + "grad_norm": 9.0625, + "learning_rate": 9.740961543348648e-06, + "loss": 1.3270268440246582, + "step": 922 + }, + { + "epoch": 0.2844170834936514, + "grad_norm": 4.5, + "learning_rate": 9.73954808325615e-06, + "loss": 0.9035344123840332, + "step": 924 + }, + { + "epoch": 0.2850327048864948, + "grad_norm": 55.25, + "learning_rate": 9.738130907090895e-06, + "loss": 1.51797616481781, + "step": 926 + }, + { + "epoch": 0.2856483262793382, + "grad_norm": 12.375, + "learning_rate": 9.736710016261166e-06, + "loss": 1.508449673652649, + "step": 928 + }, + { + "epoch": 0.2862639476721816, + "grad_norm": 9.6875, + "learning_rate": 9.735285412178931e-06, + "loss": 1.2803070545196533, + "step": 930 + }, + { + "epoch": 0.286879569065025, + "grad_norm": 11.25, + "learning_rate": 9.733857096259854e-06, + "loss": 1.1159873008728027, + "step": 932 + }, + { + "epoch": 0.2874951904578684, + "grad_norm": 13.75, + "learning_rate": 9.732425069923282e-06, + "loss": 1.1685740947723389, + "step": 934 + }, + { + "epoch": 0.2881108118507118, + "grad_norm": 7.09375, + "learning_rate": 9.730989334592252e-06, + "loss": 1.297022819519043, + "step": 936 + }, + { + "epoch": 0.28872643324355524, + "grad_norm": 3.609375, + "learning_rate": 9.729549891693487e-06, + "loss": 1.2184970378875732, + "step": 938 + }, + { + "epoch": 0.2893420546363986, + "grad_norm": 5.0625, + "learning_rate": 9.728106742657394e-06, + "loss": 1.3542568683624268, + "step": 940 + }, + { + "epoch": 0.289957676029242, + "grad_norm": 2.296875, + "learning_rate": 9.726659888918065e-06, + "loss": 1.019122838973999, + "step": 942 + }, + { + "epoch": 0.29057329742208543, + "grad_norm": 9.0, + "learning_rate": 9.725209331913266e-06, + "loss": 0.9411934614181519, + "step": 944 + }, + { + "epoch": 0.2911889188149288, + "grad_norm": 7.34375, + "learning_rate": 9.723755073084449e-06, + "loss": 1.2213878631591797, + "step": 946 + }, + { + "epoch": 0.29180454020777224, + "grad_norm": 8.5, + "learning_rate": 9.722297113876744e-06, + "loss": 1.7289658784866333, + "step": 948 + }, + { + "epoch": 0.2924201616006156, + "grad_norm": 5.3125, + "learning_rate": 9.720835455738961e-06, + "loss": 1.3057657480239868, + "step": 950 + }, + { + "epoch": 0.293035782993459, + "grad_norm": 25.625, + "learning_rate": 9.71937010012358e-06, + "loss": 1.4629443883895874, + "step": 952 + }, + { + "epoch": 0.29365140438630244, + "grad_norm": 4.34375, + "learning_rate": 9.717901048486758e-06, + "loss": 1.1471151113510132, + "step": 954 + }, + { + "epoch": 0.2942670257791458, + "grad_norm": 3.609375, + "learning_rate": 9.716428302288323e-06, + "loss": 1.144550085067749, + "step": 956 + }, + { + "epoch": 0.29488264717198925, + "grad_norm": 5.78125, + "learning_rate": 9.714951862991777e-06, + "loss": 1.3617162704467773, + "step": 958 + }, + { + "epoch": 0.29549826856483263, + "grad_norm": 7.3125, + "learning_rate": 9.713471732064293e-06, + "loss": 1.593535304069519, + "step": 960 + }, + { + "epoch": 0.296113889957676, + "grad_norm": 13.5625, + "learning_rate": 9.711987910976705e-06, + "loss": 1.3249685764312744, + "step": 962 + }, + { + "epoch": 0.29672951135051945, + "grad_norm": 5.96875, + "learning_rate": 9.710500401203525e-06, + "loss": 1.3375357389450073, + "step": 964 + }, + { + "epoch": 0.2973451327433628, + "grad_norm": 4.9375, + "learning_rate": 9.709009204222923e-06, + "loss": 1.483339786529541, + "step": 966 + }, + { + "epoch": 0.2979607541362062, + "grad_norm": 10.875, + "learning_rate": 9.707514321516734e-06, + "loss": 1.0293675661087036, + "step": 968 + }, + { + "epoch": 0.29857637552904964, + "grad_norm": 5.5625, + "learning_rate": 9.706015754570452e-06, + "loss": 1.0801160335540771, + "step": 970 + }, + { + "epoch": 0.299191996921893, + "grad_norm": 5.25, + "learning_rate": 9.704513504873247e-06, + "loss": 0.9405486583709717, + "step": 972 + }, + { + "epoch": 0.29980761831473646, + "grad_norm": 5.65625, + "learning_rate": 9.70300757391793e-06, + "loss": 1.3500492572784424, + "step": 974 + }, + { + "epoch": 0.30042323970757984, + "grad_norm": 26.875, + "learning_rate": 9.70149796320098e-06, + "loss": 1.480463981628418, + "step": 976 + }, + { + "epoch": 0.3010388611004232, + "grad_norm": 5.25, + "learning_rate": 9.699984674222534e-06, + "loss": 1.1579872369766235, + "step": 978 + }, + { + "epoch": 0.30165448249326665, + "grad_norm": 6.78125, + "learning_rate": 9.698467708486379e-06, + "loss": 1.4822840690612793, + "step": 980 + }, + { + "epoch": 0.30227010388611003, + "grad_norm": 9.1875, + "learning_rate": 9.696947067499958e-06, + "loss": 1.5733311176300049, + "step": 982 + }, + { + "epoch": 0.30288572527895347, + "grad_norm": 3.640625, + "learning_rate": 9.695422752774364e-06, + "loss": 1.190136432647705, + "step": 984 + }, + { + "epoch": 0.30350134667179685, + "grad_norm": 6.8125, + "learning_rate": 9.693894765824345e-06, + "loss": 1.3894709348678589, + "step": 986 + }, + { + "epoch": 0.3041169680646402, + "grad_norm": 7.25, + "learning_rate": 9.692363108168294e-06, + "loss": 1.3427432775497437, + "step": 988 + }, + { + "epoch": 0.30473258945748366, + "grad_norm": 9.4375, + "learning_rate": 9.690827781328259e-06, + "loss": 1.1161162853240967, + "step": 990 + }, + { + "epoch": 0.30534821085032704, + "grad_norm": 11.3125, + "learning_rate": 9.689288786829922e-06, + "loss": 1.4082365036010742, + "step": 992 + }, + { + "epoch": 0.3059638322431705, + "grad_norm": 11.0625, + "learning_rate": 9.68774612620262e-06, + "loss": 1.6027635335922241, + "step": 994 + }, + { + "epoch": 0.30657945363601385, + "grad_norm": 6.53125, + "learning_rate": 9.686199800979328e-06, + "loss": 1.2858593463897705, + "step": 996 + }, + { + "epoch": 0.30719507502885723, + "grad_norm": 4.28125, + "learning_rate": 9.684649812696665e-06, + "loss": 1.340716004371643, + "step": 998 + }, + { + "epoch": 0.30781069642170067, + "grad_norm": 12.5, + "learning_rate": 9.68309616289489e-06, + "loss": 1.1984376907348633, + "step": 1000 + }, + { + "epoch": 0.30842631781454405, + "grad_norm": 5.4375, + "learning_rate": 9.681538853117896e-06, + "loss": 1.6487531661987305, + "step": 1002 + }, + { + "epoch": 0.3090419392073875, + "grad_norm": 4.46875, + "learning_rate": 9.679977884913219e-06, + "loss": 1.0971587896347046, + "step": 1004 + }, + { + "epoch": 0.30965756060023086, + "grad_norm": 7.28125, + "learning_rate": 9.67841325983203e-06, + "loss": 1.6865112781524658, + "step": 1006 + }, + { + "epoch": 0.31027318199307424, + "grad_norm": 2.109375, + "learning_rate": 9.676844979429127e-06, + "loss": 1.3177874088287354, + "step": 1008 + }, + { + "epoch": 0.3108888033859177, + "grad_norm": 8.125, + "learning_rate": 9.67527304526295e-06, + "loss": 1.2586731910705566, + "step": 1010 + }, + { + "epoch": 0.31150442477876106, + "grad_norm": 5.625, + "learning_rate": 9.673697458895563e-06, + "loss": 1.2238794565200806, + "step": 1012 + }, + { + "epoch": 0.31212004617160444, + "grad_norm": 4.96875, + "learning_rate": 9.672118221892663e-06, + "loss": 1.1530698537826538, + "step": 1014 + }, + { + "epoch": 0.3127356675644479, + "grad_norm": 7.0, + "learning_rate": 9.670535335823572e-06, + "loss": 1.3727072477340698, + "step": 1016 + }, + { + "epoch": 0.31335128895729125, + "grad_norm": 8.4375, + "learning_rate": 9.66894880226124e-06, + "loss": 1.4163241386413574, + "step": 1018 + }, + { + "epoch": 0.3139669103501347, + "grad_norm": 7.84375, + "learning_rate": 9.667358622782242e-06, + "loss": 1.5218753814697266, + "step": 1020 + }, + { + "epoch": 0.31458253174297807, + "grad_norm": 11.875, + "learning_rate": 9.66576479896677e-06, + "loss": 1.2111762762069702, + "step": 1022 + }, + { + "epoch": 0.31519815313582145, + "grad_norm": 6.78125, + "learning_rate": 9.664167332398649e-06, + "loss": 1.0470290184020996, + "step": 1024 + }, + { + "epoch": 0.3158137745286649, + "grad_norm": 11.25, + "learning_rate": 9.662566224665313e-06, + "loss": 0.7057480812072754, + "step": 1026 + }, + { + "epoch": 0.31642939592150826, + "grad_norm": 6.125, + "learning_rate": 9.66096147735782e-06, + "loss": 1.3530975580215454, + "step": 1028 + }, + { + "epoch": 0.3170450173143517, + "grad_norm": 12.625, + "learning_rate": 9.659353092070844e-06, + "loss": 1.084864854812622, + "step": 1030 + }, + { + "epoch": 0.3176606387071951, + "grad_norm": 9.5625, + "learning_rate": 9.657741070402673e-06, + "loss": 1.4985663890838623, + "step": 1032 + }, + { + "epoch": 0.31827626010003846, + "grad_norm": 4.75, + "learning_rate": 9.65612541395521e-06, + "loss": 1.4467542171478271, + "step": 1034 + }, + { + "epoch": 0.3188918814928819, + "grad_norm": 9.8125, + "learning_rate": 9.65450612433397e-06, + "loss": 1.5565028190612793, + "step": 1036 + }, + { + "epoch": 0.31950750288572527, + "grad_norm": 17.125, + "learning_rate": 9.65288320314807e-06, + "loss": 1.085126519203186, + "step": 1038 + }, + { + "epoch": 0.3201231242785687, + "grad_norm": 4.53125, + "learning_rate": 9.651256652010252e-06, + "loss": 1.497908115386963, + "step": 1040 + }, + { + "epoch": 0.3207387456714121, + "grad_norm": 12.9375, + "learning_rate": 9.64962647253685e-06, + "loss": 1.504968285560608, + "step": 1042 + }, + { + "epoch": 0.32135436706425546, + "grad_norm": 7.03125, + "learning_rate": 9.647992666347816e-06, + "loss": 1.4675335884094238, + "step": 1044 + }, + { + "epoch": 0.3219699884570989, + "grad_norm": 7.46875, + "learning_rate": 9.646355235066696e-06, + "loss": 1.2435938119888306, + "step": 1046 + }, + { + "epoch": 0.3225856098499423, + "grad_norm": 4.21875, + "learning_rate": 9.644714180320642e-06, + "loss": 1.3065686225891113, + "step": 1048 + }, + { + "epoch": 0.3232012312427857, + "grad_norm": 9.625, + "learning_rate": 9.64306950374041e-06, + "loss": 1.2061657905578613, + "step": 1050 + }, + { + "epoch": 0.3238168526356291, + "grad_norm": 10.0625, + "learning_rate": 9.641421206960347e-06, + "loss": 1.10911226272583, + "step": 1052 + }, + { + "epoch": 0.3244324740284725, + "grad_norm": 11.5, + "learning_rate": 9.639769291618406e-06, + "loss": 0.9055336117744446, + "step": 1054 + }, + { + "epoch": 0.3250480954213159, + "grad_norm": 4.21875, + "learning_rate": 9.638113759356132e-06, + "loss": 1.2694463729858398, + "step": 1056 + }, + { + "epoch": 0.3256637168141593, + "grad_norm": 3.25, + "learning_rate": 9.636454611818665e-06, + "loss": 1.3356084823608398, + "step": 1058 + }, + { + "epoch": 0.32627933820700267, + "grad_norm": 7.84375, + "learning_rate": 9.634791850654735e-06, + "loss": 1.2268829345703125, + "step": 1060 + }, + { + "epoch": 0.3268949595998461, + "grad_norm": 5.21875, + "learning_rate": 9.633125477516663e-06, + "loss": 1.638374924659729, + "step": 1062 + }, + { + "epoch": 0.3275105809926895, + "grad_norm": 8.5, + "learning_rate": 9.631455494060369e-06, + "loss": 1.517810344696045, + "step": 1064 + }, + { + "epoch": 0.3281262023855329, + "grad_norm": 44.75, + "learning_rate": 9.629781901945345e-06, + "loss": 1.184501051902771, + "step": 1066 + }, + { + "epoch": 0.3287418237783763, + "grad_norm": 6.59375, + "learning_rate": 9.628104702834681e-06, + "loss": 0.9123966693878174, + "step": 1068 + }, + { + "epoch": 0.3293574451712197, + "grad_norm": 11.3125, + "learning_rate": 9.62642389839505e-06, + "loss": 1.200285792350769, + "step": 1070 + }, + { + "epoch": 0.3299730665640631, + "grad_norm": 18.5, + "learning_rate": 9.6247394902967e-06, + "loss": 1.7133913040161133, + "step": 1072 + }, + { + "epoch": 0.3305886879569065, + "grad_norm": 11.75, + "learning_rate": 9.623051480213468e-06, + "loss": 1.683318018913269, + "step": 1074 + }, + { + "epoch": 0.3312043093497499, + "grad_norm": 6.125, + "learning_rate": 9.621359869822764e-06, + "loss": 1.5562039613723755, + "step": 1076 + }, + { + "epoch": 0.3318199307425933, + "grad_norm": 8.875, + "learning_rate": 9.619664660805583e-06, + "loss": 1.3498424291610718, + "step": 1078 + }, + { + "epoch": 0.3324355521354367, + "grad_norm": 2.71875, + "learning_rate": 9.617965854846492e-06, + "loss": 1.4478331804275513, + "step": 1080 + }, + { + "epoch": 0.3330511735282801, + "grad_norm": 9.75, + "learning_rate": 9.616263453633628e-06, + "loss": 1.3443846702575684, + "step": 1082 + }, + { + "epoch": 0.3336667949211235, + "grad_norm": 5.125, + "learning_rate": 9.614557458858712e-06, + "loss": 1.2382532358169556, + "step": 1084 + }, + { + "epoch": 0.33428241631396693, + "grad_norm": 4.125, + "learning_rate": 9.612847872217023e-06, + "loss": 1.3172005414962769, + "step": 1086 + }, + { + "epoch": 0.3348980377068103, + "grad_norm": 14.1875, + "learning_rate": 9.61113469540742e-06, + "loss": 0.9626773595809937, + "step": 1088 + }, + { + "epoch": 0.3355136590996537, + "grad_norm": 2.328125, + "learning_rate": 9.609417930132324e-06, + "loss": 1.2234991788864136, + "step": 1090 + }, + { + "epoch": 0.33612928049249713, + "grad_norm": 9.5, + "learning_rate": 9.607697578097721e-06, + "loss": 1.1397364139556885, + "step": 1092 + }, + { + "epoch": 0.3367449018853405, + "grad_norm": 2.796875, + "learning_rate": 9.605973641013166e-06, + "loss": 0.4958764910697937, + "step": 1094 + }, + { + "epoch": 0.33736052327818394, + "grad_norm": 6.375, + "learning_rate": 9.604246120591774e-06, + "loss": 1.32682204246521, + "step": 1096 + }, + { + "epoch": 0.3379761446710273, + "grad_norm": 12.375, + "learning_rate": 9.602515018550217e-06, + "loss": 1.1351960897445679, + "step": 1098 + }, + { + "epoch": 0.3385917660638707, + "grad_norm": 6.40625, + "learning_rate": 9.600780336608735e-06, + "loss": 1.3413037061691284, + "step": 1100 + }, + { + "epoch": 0.33920738745671414, + "grad_norm": 7.59375, + "learning_rate": 9.599042076491118e-06, + "loss": 1.577768087387085, + "step": 1102 + }, + { + "epoch": 0.3398230088495575, + "grad_norm": 23.625, + "learning_rate": 9.597300239924714e-06, + "loss": 1.0736544132232666, + "step": 1104 + }, + { + "epoch": 0.3404386302424009, + "grad_norm": 9.125, + "learning_rate": 9.595554828640426e-06, + "loss": 1.1022412776947021, + "step": 1106 + }, + { + "epoch": 0.34105425163524433, + "grad_norm": 5.15625, + "learning_rate": 9.593805844372706e-06, + "loss": 1.1736547946929932, + "step": 1108 + }, + { + "epoch": 0.3416698730280877, + "grad_norm": 9.0625, + "learning_rate": 9.592053288859559e-06, + "loss": 1.372632622718811, + "step": 1110 + }, + { + "epoch": 0.34228549442093115, + "grad_norm": 5.96875, + "learning_rate": 9.59029716384254e-06, + "loss": 1.2505266666412354, + "step": 1112 + }, + { + "epoch": 0.3429011158137745, + "grad_norm": 6.5, + "learning_rate": 9.588537471066755e-06, + "loss": 1.4534019231796265, + "step": 1114 + }, + { + "epoch": 0.3435167372066179, + "grad_norm": 9.0, + "learning_rate": 9.586774212280841e-06, + "loss": 1.7671773433685303, + "step": 1116 + }, + { + "epoch": 0.34413235859946134, + "grad_norm": 6.875, + "learning_rate": 9.58500738923699e-06, + "loss": 1.3340649604797363, + "step": 1118 + }, + { + "epoch": 0.3447479799923047, + "grad_norm": 4.90625, + "learning_rate": 9.583237003690939e-06, + "loss": 1.4870283603668213, + "step": 1120 + }, + { + "epoch": 0.34536360138514816, + "grad_norm": 6.40625, + "learning_rate": 9.581463057401954e-06, + "loss": 1.010940432548523, + "step": 1122 + }, + { + "epoch": 0.34597922277799154, + "grad_norm": 5.75, + "learning_rate": 9.57968555213285e-06, + "loss": 1.4298595190048218, + "step": 1124 + }, + { + "epoch": 0.3465948441708349, + "grad_norm": 20.625, + "learning_rate": 9.577904489649968e-06, + "loss": 1.5162783861160278, + "step": 1126 + }, + { + "epoch": 0.34721046556367835, + "grad_norm": 4.625, + "learning_rate": 9.576119871723194e-06, + "loss": 1.3182761669158936, + "step": 1128 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 11.375, + "learning_rate": 9.57433170012594e-06, + "loss": 1.7118732929229736, + "step": 1130 + }, + { + "epoch": 0.34844170834936516, + "grad_norm": 7.71875, + "learning_rate": 9.572539976635158e-06, + "loss": 1.2914398908615112, + "step": 1132 + }, + { + "epoch": 0.34905732974220854, + "grad_norm": 8.375, + "learning_rate": 9.570744703031319e-06, + "loss": 1.6061583757400513, + "step": 1134 + }, + { + "epoch": 0.3496729511350519, + "grad_norm": 6.21875, + "learning_rate": 9.568945881098426e-06, + "loss": 1.5233328342437744, + "step": 1136 + }, + { + "epoch": 0.35028857252789536, + "grad_norm": 2.0, + "learning_rate": 9.567143512624009e-06, + "loss": 1.2799041271209717, + "step": 1138 + }, + { + "epoch": 0.35090419392073874, + "grad_norm": 7.8125, + "learning_rate": 9.565337599399126e-06, + "loss": 1.3228352069854736, + "step": 1140 + }, + { + "epoch": 0.3515198153135822, + "grad_norm": 2.65625, + "learning_rate": 9.563528143218346e-06, + "loss": 1.082732915878296, + "step": 1142 + }, + { + "epoch": 0.35213543670642555, + "grad_norm": 7.40625, + "learning_rate": 9.561715145879773e-06, + "loss": 1.2160236835479736, + "step": 1144 + }, + { + "epoch": 0.35275105809926893, + "grad_norm": 19.75, + "learning_rate": 9.559898609185023e-06, + "loss": 1.4509515762329102, + "step": 1146 + }, + { + "epoch": 0.35336667949211237, + "grad_norm": 5.59375, + "learning_rate": 9.558078534939223e-06, + "loss": 1.3905575275421143, + "step": 1148 + }, + { + "epoch": 0.35398230088495575, + "grad_norm": 6.34375, + "learning_rate": 9.556254924951026e-06, + "loss": 1.4463965892791748, + "step": 1150 + }, + { + "epoch": 0.3545979222777991, + "grad_norm": 10.75, + "learning_rate": 9.554427781032597e-06, + "loss": 1.6196813583374023, + "step": 1152 + }, + { + "epoch": 0.35521354367064256, + "grad_norm": 16.625, + "learning_rate": 9.552597104999606e-06, + "loss": 1.157198429107666, + "step": 1154 + }, + { + "epoch": 0.35582916506348594, + "grad_norm": 6.3125, + "learning_rate": 9.550762898671235e-06, + "loss": 1.513045310974121, + "step": 1156 + }, + { + "epoch": 0.3564447864563294, + "grad_norm": 5.21875, + "learning_rate": 9.54892516387018e-06, + "loss": 1.1897473335266113, + "step": 1158 + }, + { + "epoch": 0.35706040784917276, + "grad_norm": 11.3125, + "learning_rate": 9.547083902422636e-06, + "loss": 1.3680248260498047, + "step": 1160 + }, + { + "epoch": 0.35767602924201614, + "grad_norm": 9.0625, + "learning_rate": 9.545239116158308e-06, + "loss": 1.395264744758606, + "step": 1162 + }, + { + "epoch": 0.35829165063485957, + "grad_norm": 6.0, + "learning_rate": 9.543390806910403e-06, + "loss": 1.5195459127426147, + "step": 1164 + }, + { + "epoch": 0.35890727202770295, + "grad_norm": 10.0625, + "learning_rate": 9.541538976515624e-06, + "loss": 1.1251592636108398, + "step": 1166 + }, + { + "epoch": 0.3595228934205464, + "grad_norm": 5.65625, + "learning_rate": 9.539683626814176e-06, + "loss": 1.3194764852523804, + "step": 1168 + }, + { + "epoch": 0.36013851481338977, + "grad_norm": 12.3125, + "learning_rate": 9.537824759649763e-06, + "loss": 1.413508415222168, + "step": 1170 + }, + { + "epoch": 0.36075413620623314, + "grad_norm": 5.0, + "learning_rate": 9.535962376869582e-06, + "loss": 1.5969830751419067, + "step": 1172 + }, + { + "epoch": 0.3613697575990766, + "grad_norm": 16.375, + "learning_rate": 9.534096480324329e-06, + "loss": 1.6726967096328735, + "step": 1174 + }, + { + "epoch": 0.36198537899191996, + "grad_norm": 5.03125, + "learning_rate": 9.532227071868183e-06, + "loss": 1.2074344158172607, + "step": 1176 + }, + { + "epoch": 0.3626010003847634, + "grad_norm": 8.0625, + "learning_rate": 9.530354153358817e-06, + "loss": 1.5075865983963013, + "step": 1178 + }, + { + "epoch": 0.3632166217776068, + "grad_norm": 7.875, + "learning_rate": 9.528477726657393e-06, + "loss": 1.1990852355957031, + "step": 1180 + }, + { + "epoch": 0.36383224317045015, + "grad_norm": 8.8125, + "learning_rate": 9.526597793628558e-06, + "loss": 1.5207868814468384, + "step": 1182 + }, + { + "epoch": 0.3644478645632936, + "grad_norm": 17.75, + "learning_rate": 9.524714356140443e-06, + "loss": 1.3340822458267212, + "step": 1184 + }, + { + "epoch": 0.36506348595613697, + "grad_norm": 7.40625, + "learning_rate": 9.522827416064664e-06, + "loss": 1.3735941648483276, + "step": 1186 + }, + { + "epoch": 0.3656791073489804, + "grad_norm": 5.59375, + "learning_rate": 9.520936975276316e-06, + "loss": 1.2616640329360962, + "step": 1188 + }, + { + "epoch": 0.3662947287418238, + "grad_norm": 8.5625, + "learning_rate": 9.51904303565397e-06, + "loss": 1.4502698183059692, + "step": 1190 + }, + { + "epoch": 0.36691035013466716, + "grad_norm": 7.3125, + "learning_rate": 9.517145599079675e-06, + "loss": 1.1244027614593506, + "step": 1192 + }, + { + "epoch": 0.3675259715275106, + "grad_norm": 8.3125, + "learning_rate": 9.51524466743896e-06, + "loss": 1.3728070259094238, + "step": 1194 + }, + { + "epoch": 0.368141592920354, + "grad_norm": 13.875, + "learning_rate": 9.513340242620823e-06, + "loss": 1.3410530090332031, + "step": 1196 + }, + { + "epoch": 0.36875721431319736, + "grad_norm": 8.3125, + "learning_rate": 9.511432326517731e-06, + "loss": 1.8403198719024658, + "step": 1198 + }, + { + "epoch": 0.3693728357060408, + "grad_norm": 16.25, + "learning_rate": 9.509520921025626e-06, + "loss": 1.5507339239120483, + "step": 1200 + }, + { + "epoch": 0.36998845709888417, + "grad_norm": 2.765625, + "learning_rate": 9.507606028043912e-06, + "loss": 1.3664857149124146, + "step": 1202 + }, + { + "epoch": 0.3706040784917276, + "grad_norm": 8.25, + "learning_rate": 9.50568764947546e-06, + "loss": 1.2702953815460205, + "step": 1204 + }, + { + "epoch": 0.371219699884571, + "grad_norm": 16.75, + "learning_rate": 9.50376578722661e-06, + "loss": 1.535370945930481, + "step": 1206 + }, + { + "epoch": 0.37183532127741437, + "grad_norm": 8.25, + "learning_rate": 9.50184044320716e-06, + "loss": 1.6450167894363403, + "step": 1208 + }, + { + "epoch": 0.3724509426702578, + "grad_norm": 9.125, + "learning_rate": 9.499911619330359e-06, + "loss": 1.3015737533569336, + "step": 1210 + }, + { + "epoch": 0.3730665640631012, + "grad_norm": 6.53125, + "learning_rate": 9.497979317512933e-06, + "loss": 1.4610661268234253, + "step": 1212 + }, + { + "epoch": 0.3736821854559446, + "grad_norm": 6.65625, + "learning_rate": 9.496043539675048e-06, + "loss": 0.986057460308075, + "step": 1214 + }, + { + "epoch": 0.374297806848788, + "grad_norm": 15.0, + "learning_rate": 9.494104287740332e-06, + "loss": 1.6070356369018555, + "step": 1216 + }, + { + "epoch": 0.3749134282416314, + "grad_norm": 4.9375, + "learning_rate": 9.492161563635857e-06, + "loss": 0.9645931720733643, + "step": 1218 + }, + { + "epoch": 0.3755290496344748, + "grad_norm": 3.015625, + "learning_rate": 9.490215369292162e-06, + "loss": 1.0386302471160889, + "step": 1220 + }, + { + "epoch": 0.3761446710273182, + "grad_norm": 5.78125, + "learning_rate": 9.488265706643216e-06, + "loss": 1.4799411296844482, + "step": 1222 + }, + { + "epoch": 0.3767602924201616, + "grad_norm": 5.53125, + "learning_rate": 9.486312577626446e-06, + "loss": 1.4030064344406128, + "step": 1224 + }, + { + "epoch": 0.377375913813005, + "grad_norm": 4.96875, + "learning_rate": 9.484355984182718e-06, + "loss": 1.2825661897659302, + "step": 1226 + }, + { + "epoch": 0.3779915352058484, + "grad_norm": 5.46875, + "learning_rate": 9.482395928256345e-06, + "loss": 1.2394254207611084, + "step": 1228 + }, + { + "epoch": 0.3786071565986918, + "grad_norm": 5.625, + "learning_rate": 9.480432411795075e-06, + "loss": 1.317694902420044, + "step": 1230 + }, + { + "epoch": 0.3792227779915352, + "grad_norm": 15.6875, + "learning_rate": 9.478465436750103e-06, + "loss": 1.3715949058532715, + "step": 1232 + }, + { + "epoch": 0.37983839938437863, + "grad_norm": 5.875, + "learning_rate": 9.476495005076054e-06, + "loss": 1.4082399606704712, + "step": 1234 + }, + { + "epoch": 0.380454020777222, + "grad_norm": 6.6875, + "learning_rate": 9.474521118730988e-06, + "loss": 1.38877534866333, + "step": 1236 + }, + { + "epoch": 0.3810696421700654, + "grad_norm": 5.46875, + "learning_rate": 9.472543779676402e-06, + "loss": 1.8902709484100342, + "step": 1238 + }, + { + "epoch": 0.3816852635629088, + "grad_norm": 6.40625, + "learning_rate": 9.470562989877224e-06, + "loss": 1.2837282419204712, + "step": 1240 + }, + { + "epoch": 0.3823008849557522, + "grad_norm": 4.625, + "learning_rate": 9.468578751301806e-06, + "loss": 1.8792657852172852, + "step": 1242 + }, + { + "epoch": 0.3829165063485956, + "grad_norm": 4.875, + "learning_rate": 9.466591065921932e-06, + "loss": 1.319744348526001, + "step": 1244 + }, + { + "epoch": 0.383532127741439, + "grad_norm": 5.03125, + "learning_rate": 9.46459993571281e-06, + "loss": 1.664703607559204, + "step": 1246 + }, + { + "epoch": 0.3841477491342824, + "grad_norm": 5.96875, + "learning_rate": 9.46260536265307e-06, + "loss": 1.2072725296020508, + "step": 1248 + }, + { + "epoch": 0.38476337052712584, + "grad_norm": 6.90625, + "learning_rate": 9.460607348724763e-06, + "loss": 1.4186588525772095, + "step": 1250 + }, + { + "epoch": 0.3853789919199692, + "grad_norm": 22.875, + "learning_rate": 9.458605895913362e-06, + "loss": 1.2559492588043213, + "step": 1252 + }, + { + "epoch": 0.3859946133128126, + "grad_norm": 2.34375, + "learning_rate": 9.456601006207755e-06, + "loss": 1.1489924192428589, + "step": 1254 + }, + { + "epoch": 0.38661023470565603, + "grad_norm": 10.1875, + "learning_rate": 9.454592681600246e-06, + "loss": 1.3234691619873047, + "step": 1256 + }, + { + "epoch": 0.3872258560984994, + "grad_norm": 6.34375, + "learning_rate": 9.45258092408655e-06, + "loss": 1.5079069137573242, + "step": 1258 + }, + { + "epoch": 0.38784147749134285, + "grad_norm": 7.34375, + "learning_rate": 9.450565735665797e-06, + "loss": 1.5823462009429932, + "step": 1260 + }, + { + "epoch": 0.3884570988841862, + "grad_norm": 6.09375, + "learning_rate": 9.448547118340528e-06, + "loss": 1.4394303560256958, + "step": 1262 + }, + { + "epoch": 0.3890727202770296, + "grad_norm": 6.28125, + "learning_rate": 9.446525074116684e-06, + "loss": 1.6312642097473145, + "step": 1264 + }, + { + "epoch": 0.38968834166987304, + "grad_norm": 2.421875, + "learning_rate": 9.444499605003614e-06, + "loss": 1.2395602464675903, + "step": 1266 + }, + { + "epoch": 0.3903039630627164, + "grad_norm": 3.0, + "learning_rate": 9.44247071301408e-06, + "loss": 1.1280606985092163, + "step": 1268 + }, + { + "epoch": 0.39091958445555985, + "grad_norm": 6.59375, + "learning_rate": 9.440438400164232e-06, + "loss": 1.2219340801239014, + "step": 1270 + }, + { + "epoch": 0.39153520584840323, + "grad_norm": 8.4375, + "learning_rate": 9.438402668473623e-06, + "loss": 1.7627100944519043, + "step": 1272 + }, + { + "epoch": 0.3921508272412466, + "grad_norm": 13.25, + "learning_rate": 9.436363519965209e-06, + "loss": 1.527239203453064, + "step": 1274 + }, + { + "epoch": 0.39276644863409005, + "grad_norm": 6.3125, + "learning_rate": 9.434320956665335e-06, + "loss": 1.3796216249465942, + "step": 1276 + }, + { + "epoch": 0.39338207002693343, + "grad_norm": 12.5, + "learning_rate": 9.432274980603745e-06, + "loss": 1.1514739990234375, + "step": 1278 + }, + { + "epoch": 0.39399769141977686, + "grad_norm": 3.84375, + "learning_rate": 9.430225593813567e-06, + "loss": 1.2350151538848877, + "step": 1280 + }, + { + "epoch": 0.39461331281262024, + "grad_norm": 16.25, + "learning_rate": 9.428172798331328e-06, + "loss": 0.8400089740753174, + "step": 1282 + }, + { + "epoch": 0.3952289342054636, + "grad_norm": 4.0625, + "learning_rate": 9.426116596196933e-06, + "loss": 1.1161513328552246, + "step": 1284 + }, + { + "epoch": 0.39584455559830706, + "grad_norm": 4.40625, + "learning_rate": 9.424056989453677e-06, + "loss": 1.2691034078598022, + "step": 1286 + }, + { + "epoch": 0.39646017699115044, + "grad_norm": 6.28125, + "learning_rate": 9.421993980148237e-06, + "loss": 1.256693720817566, + "step": 1288 + }, + { + "epoch": 0.3970757983839938, + "grad_norm": 5.0625, + "learning_rate": 9.419927570330672e-06, + "loss": 1.4276587963104248, + "step": 1290 + }, + { + "epoch": 0.39769141977683725, + "grad_norm": 6.3125, + "learning_rate": 9.417857762054418e-06, + "loss": 1.7910493612289429, + "step": 1292 + }, + { + "epoch": 0.39830704116968063, + "grad_norm": 6.03125, + "learning_rate": 9.415784557376296e-06, + "loss": 1.6821799278259277, + "step": 1294 + }, + { + "epoch": 0.39892266256252407, + "grad_norm": 12.1875, + "learning_rate": 9.413707958356489e-06, + "loss": 1.3325204849243164, + "step": 1296 + }, + { + "epoch": 0.39953828395536745, + "grad_norm": 6.28125, + "learning_rate": 9.411627967058563e-06, + "loss": 1.311694860458374, + "step": 1298 + }, + { + "epoch": 0.4001539053482108, + "grad_norm": 5.875, + "learning_rate": 9.409544585549452e-06, + "loss": 1.2477080821990967, + "step": 1300 + }, + { + "epoch": 0.40076952674105426, + "grad_norm": 5.53125, + "learning_rate": 9.407457815899458e-06, + "loss": 1.3540740013122559, + "step": 1302 + }, + { + "epoch": 0.40138514813389764, + "grad_norm": 9.1875, + "learning_rate": 9.405367660182254e-06, + "loss": 1.3317627906799316, + "step": 1304 + }, + { + "epoch": 0.4020007695267411, + "grad_norm": 7.8125, + "learning_rate": 9.403274120474867e-06, + "loss": 1.508737564086914, + "step": 1306 + }, + { + "epoch": 0.40261639091958445, + "grad_norm": 7.3125, + "learning_rate": 9.401177198857703e-06, + "loss": 1.5539381504058838, + "step": 1308 + }, + { + "epoch": 0.40323201231242783, + "grad_norm": 7.5, + "learning_rate": 9.399076897414517e-06, + "loss": 1.196575403213501, + "step": 1310 + }, + { + "epoch": 0.40384763370527127, + "grad_norm": 4.34375, + "learning_rate": 9.396973218232424e-06, + "loss": 1.4205857515335083, + "step": 1312 + }, + { + "epoch": 0.40446325509811465, + "grad_norm": 22.125, + "learning_rate": 9.394866163401897e-06, + "loss": 1.0114387273788452, + "step": 1314 + }, + { + "epoch": 0.4050788764909581, + "grad_norm": 3.78125, + "learning_rate": 9.392755735016763e-06, + "loss": 1.302337646484375, + "step": 1316 + }, + { + "epoch": 0.40569449788380146, + "grad_norm": 20.125, + "learning_rate": 9.390641935174208e-06, + "loss": 1.5426905155181885, + "step": 1318 + }, + { + "epoch": 0.40631011927664484, + "grad_norm": 56.75, + "learning_rate": 9.388524765974754e-06, + "loss": 1.4665982723236084, + "step": 1320 + }, + { + "epoch": 0.4069257406694883, + "grad_norm": 6.5625, + "learning_rate": 9.386404229522286e-06, + "loss": 1.2628440856933594, + "step": 1322 + }, + { + "epoch": 0.40754136206233166, + "grad_norm": 4.15625, + "learning_rate": 9.384280327924024e-06, + "loss": 1.117812991142273, + "step": 1324 + }, + { + "epoch": 0.4081569834551751, + "grad_norm": 5.03125, + "learning_rate": 9.38215306329054e-06, + "loss": 1.4815458059310913, + "step": 1326 + }, + { + "epoch": 0.4087726048480185, + "grad_norm": 4.59375, + "learning_rate": 9.380022437735743e-06, + "loss": 1.057960033416748, + "step": 1328 + }, + { + "epoch": 0.40938822624086185, + "grad_norm": 17.625, + "learning_rate": 9.377888453376885e-06, + "loss": 1.2156786918640137, + "step": 1330 + }, + { + "epoch": 0.4100038476337053, + "grad_norm": 4.21875, + "learning_rate": 9.37575111233455e-06, + "loss": 1.3469533920288086, + "step": 1332 + }, + { + "epoch": 0.41061946902654867, + "grad_norm": 7.90625, + "learning_rate": 9.373610416732667e-06, + "loss": 1.6464571952819824, + "step": 1334 + }, + { + "epoch": 0.41123509041939205, + "grad_norm": 8.25, + "learning_rate": 9.37146636869849e-06, + "loss": 1.4672791957855225, + "step": 1336 + }, + { + "epoch": 0.4118507118122355, + "grad_norm": 9.5, + "learning_rate": 9.369318970362606e-06, + "loss": 1.2561655044555664, + "step": 1338 + }, + { + "epoch": 0.41246633320507886, + "grad_norm": 19.375, + "learning_rate": 9.367168223858937e-06, + "loss": 1.1853914260864258, + "step": 1340 + }, + { + "epoch": 0.4130819545979223, + "grad_norm": 8.5, + "learning_rate": 9.365014131324726e-06, + "loss": 1.1935577392578125, + "step": 1342 + }, + { + "epoch": 0.4136975759907657, + "grad_norm": 8.5, + "learning_rate": 9.362856694900542e-06, + "loss": 1.3152433633804321, + "step": 1344 + }, + { + "epoch": 0.41431319738360906, + "grad_norm": 7.1875, + "learning_rate": 9.36069591673028e-06, + "loss": 1.5555031299591064, + "step": 1346 + }, + { + "epoch": 0.4149288187764525, + "grad_norm": 19.0, + "learning_rate": 9.358531798961154e-06, + "loss": 1.4917670488357544, + "step": 1348 + }, + { + "epoch": 0.41554444016929587, + "grad_norm": 4.8125, + "learning_rate": 9.356364343743694e-06, + "loss": 1.564723253250122, + "step": 1350 + }, + { + "epoch": 0.4161600615621393, + "grad_norm": 6.5625, + "learning_rate": 9.35419355323175e-06, + "loss": 1.31098210811615, + "step": 1352 + }, + { + "epoch": 0.4167756829549827, + "grad_norm": 3.9375, + "learning_rate": 9.352019429582485e-06, + "loss": 1.0555250644683838, + "step": 1354 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 3.0625, + "learning_rate": 9.349841974956373e-06, + "loss": 1.2879685163497925, + "step": 1356 + }, + { + "epoch": 0.4180069257406695, + "grad_norm": 7.34375, + "learning_rate": 9.3476611915172e-06, + "loss": 1.627746820449829, + "step": 1358 + }, + { + "epoch": 0.4186225471335129, + "grad_norm": 14.75, + "learning_rate": 9.345477081432065e-06, + "loss": 1.1410566568374634, + "step": 1360 + }, + { + "epoch": 0.4192381685263563, + "grad_norm": 14.6875, + "learning_rate": 9.343289646871361e-06, + "loss": 1.6584223508834839, + "step": 1362 + }, + { + "epoch": 0.4198537899191997, + "grad_norm": 4.46875, + "learning_rate": 9.34109889000879e-06, + "loss": 1.063903570175171, + "step": 1364 + }, + { + "epoch": 0.4204694113120431, + "grad_norm": 5.46875, + "learning_rate": 9.338904813021361e-06, + "loss": 1.2660856246948242, + "step": 1366 + }, + { + "epoch": 0.4210850327048865, + "grad_norm": 8.9375, + "learning_rate": 9.336707418089375e-06, + "loss": 1.5954651832580566, + "step": 1368 + }, + { + "epoch": 0.4217006540977299, + "grad_norm": 20.75, + "learning_rate": 9.334506707396432e-06, + "loss": 1.2890225648880005, + "step": 1370 + }, + { + "epoch": 0.4223162754905733, + "grad_norm": 35.25, + "learning_rate": 9.332302683129427e-06, + "loss": 1.1560983657836914, + "step": 1372 + }, + { + "epoch": 0.4229318968834167, + "grad_norm": 10.8125, + "learning_rate": 9.33009534747855e-06, + "loss": 1.4955801963806152, + "step": 1374 + }, + { + "epoch": 0.4235475182762601, + "grad_norm": 11.75, + "learning_rate": 9.32788470263728e-06, + "loss": 1.3701071739196777, + "step": 1376 + }, + { + "epoch": 0.4241631396691035, + "grad_norm": 8.0625, + "learning_rate": 9.325670750802382e-06, + "loss": 1.3674092292785645, + "step": 1378 + }, + { + "epoch": 0.4247787610619469, + "grad_norm": 3.375, + "learning_rate": 9.323453494173913e-06, + "loss": 1.222502589225769, + "step": 1380 + }, + { + "epoch": 0.4253943824547903, + "grad_norm": 8.875, + "learning_rate": 9.321232934955208e-06, + "loss": 0.9671890735626221, + "step": 1382 + }, + { + "epoch": 0.4260100038476337, + "grad_norm": 20.375, + "learning_rate": 9.319009075352888e-06, + "loss": 1.5922764539718628, + "step": 1384 + }, + { + "epoch": 0.4266256252404771, + "grad_norm": 3.390625, + "learning_rate": 9.316781917576851e-06, + "loss": 1.2351106405258179, + "step": 1386 + }, + { + "epoch": 0.4272412466333205, + "grad_norm": 5.0625, + "learning_rate": 9.314551463840273e-06, + "loss": 1.357588529586792, + "step": 1388 + }, + { + "epoch": 0.4278568680261639, + "grad_norm": 5.125, + "learning_rate": 9.31231771635961e-06, + "loss": 1.0194324254989624, + "step": 1390 + }, + { + "epoch": 0.4284724894190073, + "grad_norm": 3.71875, + "learning_rate": 9.310080677354583e-06, + "loss": 1.1744673252105713, + "step": 1392 + }, + { + "epoch": 0.4290881108118507, + "grad_norm": 6.75, + "learning_rate": 9.307840349048185e-06, + "loss": 1.2617883682250977, + "step": 1394 + }, + { + "epoch": 0.4297037322046941, + "grad_norm": 17.875, + "learning_rate": 9.305596733666688e-06, + "loss": 1.3517186641693115, + "step": 1396 + }, + { + "epoch": 0.43031935359753754, + "grad_norm": 7.46875, + "learning_rate": 9.303349833439619e-06, + "loss": 1.5301835536956787, + "step": 1398 + }, + { + "epoch": 0.4309349749903809, + "grad_norm": 2.359375, + "learning_rate": 9.301099650599771e-06, + "loss": 1.1756891012191772, + "step": 1400 + }, + { + "epoch": 0.4315505963832243, + "grad_norm": 17.5, + "learning_rate": 9.298846187383206e-06, + "loss": 1.5164425373077393, + "step": 1402 + }, + { + "epoch": 0.43216621777606773, + "grad_norm": 13.3125, + "learning_rate": 9.296589446029235e-06, + "loss": 1.3366045951843262, + "step": 1404 + }, + { + "epoch": 0.4327818391689111, + "grad_norm": 6.75, + "learning_rate": 9.294329428780437e-06, + "loss": 1.1451746225357056, + "step": 1406 + }, + { + "epoch": 0.43339746056175454, + "grad_norm": 5.21875, + "learning_rate": 9.292066137882643e-06, + "loss": 1.2165199518203735, + "step": 1408 + }, + { + "epoch": 0.4340130819545979, + "grad_norm": 4.40625, + "learning_rate": 9.28979957558493e-06, + "loss": 1.3512365818023682, + "step": 1410 + }, + { + "epoch": 0.4346287033474413, + "grad_norm": 5.84375, + "learning_rate": 9.287529744139638e-06, + "loss": 1.3853118419647217, + "step": 1412 + }, + { + "epoch": 0.43524432474028474, + "grad_norm": 3.46875, + "learning_rate": 9.285256645802343e-06, + "loss": 1.1237252950668335, + "step": 1414 + }, + { + "epoch": 0.4358599461331281, + "grad_norm": 12.3125, + "learning_rate": 9.28298028283188e-06, + "loss": 1.6480573415756226, + "step": 1416 + }, + { + "epoch": 0.43647556752597155, + "grad_norm": 10.1875, + "learning_rate": 9.280700657490319e-06, + "loss": 1.5300313234329224, + "step": 1418 + }, + { + "epoch": 0.43709118891881493, + "grad_norm": 9.0625, + "learning_rate": 9.278417772042973e-06, + "loss": 1.5395290851593018, + "step": 1420 + }, + { + "epoch": 0.4377068103116583, + "grad_norm": 11.4375, + "learning_rate": 9.276131628758402e-06, + "loss": 1.599255084991455, + "step": 1422 + }, + { + "epoch": 0.43832243170450175, + "grad_norm": 5.96875, + "learning_rate": 9.273842229908392e-06, + "loss": 1.0969022512435913, + "step": 1424 + }, + { + "epoch": 0.4389380530973451, + "grad_norm": 5.1875, + "learning_rate": 9.271549577767972e-06, + "loss": 0.8788780570030212, + "step": 1426 + }, + { + "epoch": 0.43955367449018856, + "grad_norm": 18.5, + "learning_rate": 9.269253674615404e-06, + "loss": 1.2673485279083252, + "step": 1428 + }, + { + "epoch": 0.44016929588303194, + "grad_norm": 3.421875, + "learning_rate": 9.266954522732174e-06, + "loss": 1.0636610984802246, + "step": 1430 + }, + { + "epoch": 0.4407849172758753, + "grad_norm": 6.21875, + "learning_rate": 9.264652124403004e-06, + "loss": 1.2438650131225586, + "step": 1432 + }, + { + "epoch": 0.44140053866871876, + "grad_norm": 8.6875, + "learning_rate": 9.262346481915838e-06, + "loss": 1.0668020248413086, + "step": 1434 + }, + { + "epoch": 0.44201616006156214, + "grad_norm": 7.65625, + "learning_rate": 9.260037597561846e-06, + "loss": 1.4651720523834229, + "step": 1436 + }, + { + "epoch": 0.4426317814544055, + "grad_norm": 6.96875, + "learning_rate": 9.257725473635414e-06, + "loss": 1.2487192153930664, + "step": 1438 + }, + { + "epoch": 0.44324740284724895, + "grad_norm": 6.6875, + "learning_rate": 9.255410112434158e-06, + "loss": 1.3854714632034302, + "step": 1440 + }, + { + "epoch": 0.44386302424009233, + "grad_norm": 5.71875, + "learning_rate": 9.253091516258899e-06, + "loss": 1.4076898097991943, + "step": 1442 + }, + { + "epoch": 0.44447864563293576, + "grad_norm": 2.171875, + "learning_rate": 9.25076968741368e-06, + "loss": 1.4974058866500854, + "step": 1444 + }, + { + "epoch": 0.44509426702577914, + "grad_norm": 4.3125, + "learning_rate": 9.248444628205753e-06, + "loss": 1.1812409162521362, + "step": 1446 + }, + { + "epoch": 0.4457098884186225, + "grad_norm": 6.90625, + "learning_rate": 9.246116340945584e-06, + "loss": 1.0853921175003052, + "step": 1448 + }, + { + "epoch": 0.44632550981146596, + "grad_norm": 5.59375, + "learning_rate": 9.24378482794684e-06, + "loss": 1.4460300207138062, + "step": 1450 + }, + { + "epoch": 0.44694113120430934, + "grad_norm": 6.46875, + "learning_rate": 9.2414500915264e-06, + "loss": 1.3900498151779175, + "step": 1452 + }, + { + "epoch": 0.4475567525971528, + "grad_norm": 4.9375, + "learning_rate": 9.239112134004346e-06, + "loss": 0.9988191723823547, + "step": 1454 + }, + { + "epoch": 0.44817237398999615, + "grad_norm": 10.125, + "learning_rate": 9.236770957703957e-06, + "loss": 1.0114213228225708, + "step": 1456 + }, + { + "epoch": 0.44878799538283953, + "grad_norm": 5.65625, + "learning_rate": 9.234426564951713e-06, + "loss": 1.2418731451034546, + "step": 1458 + }, + { + "epoch": 0.44940361677568297, + "grad_norm": 19.25, + "learning_rate": 9.232078958077289e-06, + "loss": 1.515458106994629, + "step": 1460 + }, + { + "epoch": 0.45001923816852635, + "grad_norm": 4.8125, + "learning_rate": 9.229728139413553e-06, + "loss": 1.2873460054397583, + "step": 1462 + }, + { + "epoch": 0.4506348595613698, + "grad_norm": 12.75, + "learning_rate": 9.22737411129657e-06, + "loss": 1.2370492219924927, + "step": 1464 + }, + { + "epoch": 0.45125048095421316, + "grad_norm": 6.75, + "learning_rate": 9.225016876065587e-06, + "loss": 1.2360658645629883, + "step": 1466 + }, + { + "epoch": 0.45186610234705654, + "grad_norm": 7.4375, + "learning_rate": 9.222656436063043e-06, + "loss": 1.2146968841552734, + "step": 1468 + }, + { + "epoch": 0.4524817237399, + "grad_norm": 6.46875, + "learning_rate": 9.220292793634559e-06, + "loss": 1.3954319953918457, + "step": 1470 + }, + { + "epoch": 0.45309734513274336, + "grad_norm": 6.0625, + "learning_rate": 9.217925951128941e-06, + "loss": 1.5174306631088257, + "step": 1472 + }, + { + "epoch": 0.4537129665255868, + "grad_norm": 10.5, + "learning_rate": 9.215555910898175e-06, + "loss": 1.3878395557403564, + "step": 1474 + }, + { + "epoch": 0.45432858791843017, + "grad_norm": 9.8125, + "learning_rate": 9.213182675297418e-06, + "loss": 1.5955039262771606, + "step": 1476 + }, + { + "epoch": 0.45494420931127355, + "grad_norm": 6.84375, + "learning_rate": 9.210806246685012e-06, + "loss": 1.4851185083389282, + "step": 1478 + }, + { + "epoch": 0.455559830704117, + "grad_norm": 9.1875, + "learning_rate": 9.208426627422464e-06, + "loss": 1.4243017435073853, + "step": 1480 + }, + { + "epoch": 0.45617545209696037, + "grad_norm": 5.75, + "learning_rate": 9.20604381987446e-06, + "loss": 1.4351035356521606, + "step": 1482 + }, + { + "epoch": 0.45679107348980375, + "grad_norm": 7.0, + "learning_rate": 9.203657826408842e-06, + "loss": 1.302568793296814, + "step": 1484 + }, + { + "epoch": 0.4574066948826472, + "grad_norm": 6.75, + "learning_rate": 9.201268649396634e-06, + "loss": 1.1568458080291748, + "step": 1486 + }, + { + "epoch": 0.45802231627549056, + "grad_norm": 9.8125, + "learning_rate": 9.198876291212006e-06, + "loss": 0.8684755563735962, + "step": 1488 + }, + { + "epoch": 0.458637937668334, + "grad_norm": 3.71875, + "learning_rate": 9.196480754232305e-06, + "loss": 0.7467790246009827, + "step": 1490 + }, + { + "epoch": 0.4592535590611774, + "grad_norm": 11.5, + "learning_rate": 9.194082040838025e-06, + "loss": 1.2954021692276, + "step": 1492 + }, + { + "epoch": 0.45986918045402075, + "grad_norm": 11.3125, + "learning_rate": 9.191680153412823e-06, + "loss": 1.7717770338058472, + "step": 1494 + }, + { + "epoch": 0.4604848018468642, + "grad_norm": 10.8125, + "learning_rate": 9.189275094343509e-06, + "loss": 1.44906747341156, + "step": 1496 + }, + { + "epoch": 0.46110042323970757, + "grad_norm": 12.5625, + "learning_rate": 9.186866866020042e-06, + "loss": 1.2336671352386475, + "step": 1498 + }, + { + "epoch": 0.461716044632551, + "grad_norm": 5.25, + "learning_rate": 9.184455470835536e-06, + "loss": 0.8941473364830017, + "step": 1500 + }, + { + "epoch": 0.4623316660253944, + "grad_norm": 10.75, + "learning_rate": 9.182040911186246e-06, + "loss": 1.598117470741272, + "step": 1502 + }, + { + "epoch": 0.46294728741823776, + "grad_norm": 5.5, + "learning_rate": 9.17962318947157e-06, + "loss": 1.3159825801849365, + "step": 1504 + }, + { + "epoch": 0.4635629088110812, + "grad_norm": 13.875, + "learning_rate": 9.177202308094062e-06, + "loss": 1.9779479503631592, + "step": 1506 + }, + { + "epoch": 0.4641785302039246, + "grad_norm": 8.1875, + "learning_rate": 9.174778269459399e-06, + "loss": 1.3620253801345825, + "step": 1508 + }, + { + "epoch": 0.464794151596768, + "grad_norm": 5.96875, + "learning_rate": 9.172351075976407e-06, + "loss": 1.2268351316452026, + "step": 1510 + }, + { + "epoch": 0.4654097729896114, + "grad_norm": 6.46875, + "learning_rate": 9.169920730057038e-06, + "loss": 1.475582242012024, + "step": 1512 + }, + { + "epoch": 0.46602539438245477, + "grad_norm": 4.6875, + "learning_rate": 9.16748723411638e-06, + "loss": 0.8650484681129456, + "step": 1514 + }, + { + "epoch": 0.4666410157752982, + "grad_norm": 20.375, + "learning_rate": 9.16505059057266e-06, + "loss": 0.8505802750587463, + "step": 1516 + }, + { + "epoch": 0.4672566371681416, + "grad_norm": 6.0625, + "learning_rate": 9.16261080184722e-06, + "loss": 1.3201478719711304, + "step": 1518 + }, + { + "epoch": 0.467872258560985, + "grad_norm": 7.9375, + "learning_rate": 9.160167870364533e-06, + "loss": 1.345275640487671, + "step": 1520 + }, + { + "epoch": 0.4684878799538284, + "grad_norm": 8.6875, + "learning_rate": 9.157721798552194e-06, + "loss": 1.5291322469711304, + "step": 1522 + }, + { + "epoch": 0.4691035013466718, + "grad_norm": 9.8125, + "learning_rate": 9.155272588840924e-06, + "loss": 1.465450406074524, + "step": 1524 + }, + { + "epoch": 0.4697191227395152, + "grad_norm": 4.625, + "learning_rate": 9.152820243664553e-06, + "loss": 1.4166549444198608, + "step": 1526 + }, + { + "epoch": 0.4703347441323586, + "grad_norm": 2.21875, + "learning_rate": 9.150364765460032e-06, + "loss": 1.282368779182434, + "step": 1528 + }, + { + "epoch": 0.470950365525202, + "grad_norm": 2.65625, + "learning_rate": 9.147906156667425e-06, + "loss": 0.9684574007987976, + "step": 1530 + }, + { + "epoch": 0.4715659869180454, + "grad_norm": 8.125, + "learning_rate": 9.14544441972991e-06, + "loss": 1.324642300605774, + "step": 1532 + }, + { + "epoch": 0.4721816083108888, + "grad_norm": 8.9375, + "learning_rate": 9.142979557093766e-06, + "loss": 1.6652965545654297, + "step": 1534 + }, + { + "epoch": 0.4727972297037322, + "grad_norm": 16.875, + "learning_rate": 9.14051157120838e-06, + "loss": 1.6565061807632446, + "step": 1536 + }, + { + "epoch": 0.4734128510965756, + "grad_norm": 6.75, + "learning_rate": 9.138040464526254e-06, + "loss": 1.8133372068405151, + "step": 1538 + }, + { + "epoch": 0.474028472489419, + "grad_norm": 3.984375, + "learning_rate": 9.135566239502974e-06, + "loss": 1.4453431367874146, + "step": 1540 + }, + { + "epoch": 0.4746440938822624, + "grad_norm": 4.5, + "learning_rate": 9.133088898597236e-06, + "loss": 1.37595796585083, + "step": 1542 + }, + { + "epoch": 0.4752597152751058, + "grad_norm": 4.1875, + "learning_rate": 9.130608444270828e-06, + "loss": 1.2220466136932373, + "step": 1544 + }, + { + "epoch": 0.47587533666794923, + "grad_norm": 2.71875, + "learning_rate": 9.128124878988633e-06, + "loss": 1.0569881200790405, + "step": 1546 + }, + { + "epoch": 0.4764909580607926, + "grad_norm": 9.375, + "learning_rate": 9.125638205218628e-06, + "loss": 1.4060856103897095, + "step": 1548 + }, + { + "epoch": 0.477106579453636, + "grad_norm": 6.1875, + "learning_rate": 9.123148425431873e-06, + "loss": 1.219241976737976, + "step": 1550 + }, + { + "epoch": 0.47772220084647943, + "grad_norm": 5.1875, + "learning_rate": 9.120655542102524e-06, + "loss": 1.014161467552185, + "step": 1552 + }, + { + "epoch": 0.4783378222393228, + "grad_norm": 6.875, + "learning_rate": 9.118159557707807e-06, + "loss": 1.4548730850219727, + "step": 1554 + }, + { + "epoch": 0.47895344363216624, + "grad_norm": 11.4375, + "learning_rate": 9.11566047472804e-06, + "loss": 1.5502562522888184, + "step": 1556 + }, + { + "epoch": 0.4795690650250096, + "grad_norm": 6.53125, + "learning_rate": 9.113158295646623e-06, + "loss": 1.093739628791809, + "step": 1558 + }, + { + "epoch": 0.480184686417853, + "grad_norm": 9.875, + "learning_rate": 9.11065302295002e-06, + "loss": 1.1581438779830933, + "step": 1560 + }, + { + "epoch": 0.48080030781069644, + "grad_norm": 5.25, + "learning_rate": 9.108144659127782e-06, + "loss": 1.5227125883102417, + "step": 1562 + }, + { + "epoch": 0.4814159292035398, + "grad_norm": 6.5625, + "learning_rate": 9.105633206672524e-06, + "loss": 1.2725529670715332, + "step": 1564 + }, + { + "epoch": 0.48203155059638325, + "grad_norm": 4.75, + "learning_rate": 9.103118668079932e-06, + "loss": 1.2470329999923706, + "step": 1566 + }, + { + "epoch": 0.48264717198922663, + "grad_norm": 8.875, + "learning_rate": 9.100601045848765e-06, + "loss": 1.2510249614715576, + "step": 1568 + }, + { + "epoch": 0.48326279338207, + "grad_norm": 5.25, + "learning_rate": 9.098080342480832e-06, + "loss": 1.2870439291000366, + "step": 1570 + }, + { + "epoch": 0.48387841477491345, + "grad_norm": 6.84375, + "learning_rate": 9.09555656048102e-06, + "loss": 1.453609824180603, + "step": 1572 + }, + { + "epoch": 0.4844940361677568, + "grad_norm": 11.3125, + "learning_rate": 9.093029702357262e-06, + "loss": 1.162030577659607, + "step": 1574 + }, + { + "epoch": 0.4851096575606002, + "grad_norm": 1.9921875, + "learning_rate": 9.090499770620556e-06, + "loss": 1.0766057968139648, + "step": 1576 + }, + { + "epoch": 0.48572527895344364, + "grad_norm": 8.6875, + "learning_rate": 9.087966767784953e-06, + "loss": 1.3970556259155273, + "step": 1578 + }, + { + "epoch": 0.486340900346287, + "grad_norm": 10.6875, + "learning_rate": 9.085430696367553e-06, + "loss": 1.1191117763519287, + "step": 1580 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 5.5625, + "learning_rate": 9.082891558888505e-06, + "loss": 1.2700603008270264, + "step": 1582 + }, + { + "epoch": 0.48757214313197383, + "grad_norm": 10.25, + "learning_rate": 9.080349357871013e-06, + "loss": 1.3424956798553467, + "step": 1584 + }, + { + "epoch": 0.4881877645248172, + "grad_norm": 3.234375, + "learning_rate": 9.077804095841314e-06, + "loss": 1.348867416381836, + "step": 1586 + }, + { + "epoch": 0.48880338591766065, + "grad_norm": 4.875, + "learning_rate": 9.075255775328692e-06, + "loss": 1.1686131954193115, + "step": 1588 + }, + { + "epoch": 0.48941900731050403, + "grad_norm": 30.125, + "learning_rate": 9.072704398865473e-06, + "loss": 1.240150809288025, + "step": 1590 + }, + { + "epoch": 0.49003462870334746, + "grad_norm": 15.4375, + "learning_rate": 9.070149968987017e-06, + "loss": 1.333054780960083, + "step": 1592 + }, + { + "epoch": 0.49065025009619084, + "grad_norm": 7.34375, + "learning_rate": 9.067592488231716e-06, + "loss": 0.98194819688797, + "step": 1594 + }, + { + "epoch": 0.4912658714890342, + "grad_norm": 3.1875, + "learning_rate": 9.065031959140995e-06, + "loss": 0.5497357249259949, + "step": 1596 + }, + { + "epoch": 0.49188149288187766, + "grad_norm": 5.59375, + "learning_rate": 9.062468384259313e-06, + "loss": 1.3398869037628174, + "step": 1598 + }, + { + "epoch": 0.49249711427472104, + "grad_norm": 9.125, + "learning_rate": 9.059901766134149e-06, + "loss": 1.6062138080596924, + "step": 1600 + }, + { + "epoch": 0.4931127356675645, + "grad_norm": 12.5, + "learning_rate": 9.05733210731601e-06, + "loss": 1.9328994750976562, + "step": 1602 + }, + { + "epoch": 0.49372835706040785, + "grad_norm": 6.34375, + "learning_rate": 9.054759410358423e-06, + "loss": 1.378353476524353, + "step": 1604 + }, + { + "epoch": 0.49434397845325123, + "grad_norm": 5.28125, + "learning_rate": 9.052183677817936e-06, + "loss": 1.3997539281845093, + "step": 1606 + }, + { + "epoch": 0.49495959984609467, + "grad_norm": 3.203125, + "learning_rate": 9.049604912254108e-06, + "loss": 1.3598434925079346, + "step": 1608 + }, + { + "epoch": 0.49557522123893805, + "grad_norm": 13.3125, + "learning_rate": 9.047023116229523e-06, + "loss": 1.4434458017349243, + "step": 1610 + }, + { + "epoch": 0.4961908426317815, + "grad_norm": 4.375, + "learning_rate": 9.044438292309766e-06, + "loss": 1.3952776193618774, + "step": 1612 + }, + { + "epoch": 0.49680646402462486, + "grad_norm": 3.15625, + "learning_rate": 9.041850443063431e-06, + "loss": 1.047157883644104, + "step": 1614 + }, + { + "epoch": 0.49742208541746824, + "grad_norm": 8.8125, + "learning_rate": 9.039259571062126e-06, + "loss": 1.1723214387893677, + "step": 1616 + }, + { + "epoch": 0.4980377068103117, + "grad_norm": 2.90625, + "learning_rate": 9.036665678880462e-06, + "loss": 1.1033647060394287, + "step": 1618 + }, + { + "epoch": 0.49865332820315506, + "grad_norm": 11.0625, + "learning_rate": 9.034068769096038e-06, + "loss": 1.4442481994628906, + "step": 1620 + }, + { + "epoch": 0.49926894959599843, + "grad_norm": 4.625, + "learning_rate": 9.031468844289467e-06, + "loss": 1.2997276782989502, + "step": 1622 + }, + { + "epoch": 0.49988457098884187, + "grad_norm": 3.046875, + "learning_rate": 9.028865907044356e-06, + "loss": 1.0517377853393555, + "step": 1624 + }, + { + "epoch": 0.5005001923816853, + "grad_norm": 4.78125, + "learning_rate": 9.026259959947296e-06, + "loss": 1.2825373411178589, + "step": 1626 + }, + { + "epoch": 0.5011158137745286, + "grad_norm": 9.1875, + "learning_rate": 9.02365100558788e-06, + "loss": 1.5244855880737305, + "step": 1628 + }, + { + "epoch": 0.5017314351673721, + "grad_norm": 5.84375, + "learning_rate": 9.021039046558681e-06, + "loss": 1.6843534708023071, + "step": 1630 + }, + { + "epoch": 0.5023470565602155, + "grad_norm": 3.296875, + "learning_rate": 9.018424085455264e-06, + "loss": 0.8393566012382507, + "step": 1632 + }, + { + "epoch": 0.5029626779530588, + "grad_norm": 4.84375, + "learning_rate": 9.015806124876169e-06, + "loss": 1.4292811155319214, + "step": 1634 + }, + { + "epoch": 0.5035782993459023, + "grad_norm": 10.3125, + "learning_rate": 9.013185167422929e-06, + "loss": 1.1995810270309448, + "step": 1636 + }, + { + "epoch": 0.5041939207387457, + "grad_norm": 6.71875, + "learning_rate": 9.010561215700045e-06, + "loss": 1.3768254518508911, + "step": 1638 + }, + { + "epoch": 0.504809542131589, + "grad_norm": 8.1875, + "learning_rate": 9.007934272314996e-06, + "loss": 1.08784019947052, + "step": 1640 + }, + { + "epoch": 0.5054251635244325, + "grad_norm": 17.0, + "learning_rate": 9.005304339878234e-06, + "loss": 1.3579922914505005, + "step": 1642 + }, + { + "epoch": 0.5060407849172759, + "grad_norm": 11.6875, + "learning_rate": 9.002671421003185e-06, + "loss": 1.7261128425598145, + "step": 1644 + }, + { + "epoch": 0.5066564063101193, + "grad_norm": 7.3125, + "learning_rate": 9.000035518306236e-06, + "loss": 0.5321837663650513, + "step": 1646 + }, + { + "epoch": 0.5072720277029626, + "grad_norm": 68.5, + "learning_rate": 8.997396634406746e-06, + "loss": 1.0127274990081787, + "step": 1648 + }, + { + "epoch": 0.5078876490958061, + "grad_norm": 7.1875, + "learning_rate": 8.994754771927029e-06, + "loss": 1.0840795040130615, + "step": 1650 + }, + { + "epoch": 0.5085032704886495, + "grad_norm": 8.625, + "learning_rate": 8.992109933492366e-06, + "loss": 1.2792012691497803, + "step": 1652 + }, + { + "epoch": 0.5091188918814928, + "grad_norm": 5.21875, + "learning_rate": 8.989462121730991e-06, + "loss": 1.4806631803512573, + "step": 1654 + }, + { + "epoch": 0.5097345132743363, + "grad_norm": 3.34375, + "learning_rate": 8.986811339274095e-06, + "loss": 1.0648256540298462, + "step": 1656 + }, + { + "epoch": 0.5103501346671797, + "grad_norm": 7.4375, + "learning_rate": 8.98415758875582e-06, + "loss": 1.6581193208694458, + "step": 1658 + }, + { + "epoch": 0.510965756060023, + "grad_norm": 5.25, + "learning_rate": 8.981500872813256e-06, + "loss": 1.0713742971420288, + "step": 1660 + }, + { + "epoch": 0.5115813774528665, + "grad_norm": 11.5, + "learning_rate": 8.978841194086443e-06, + "loss": 0.5658085942268372, + "step": 1662 + }, + { + "epoch": 0.5121969988457099, + "grad_norm": 4.53125, + "learning_rate": 8.97617855521836e-06, + "loss": 1.364528775215149, + "step": 1664 + }, + { + "epoch": 0.5128126202385533, + "grad_norm": 4.78125, + "learning_rate": 8.973512958854934e-06, + "loss": 1.0248429775238037, + "step": 1666 + }, + { + "epoch": 0.5134282416313967, + "grad_norm": 5.15625, + "learning_rate": 8.97084440764503e-06, + "loss": 1.2535791397094727, + "step": 1668 + }, + { + "epoch": 0.5140438630242401, + "grad_norm": 2.84375, + "learning_rate": 8.968172904240441e-06, + "loss": 0.9831995964050293, + "step": 1670 + }, + { + "epoch": 0.5146594844170835, + "grad_norm": 12.3125, + "learning_rate": 8.965498451295904e-06, + "loss": 1.2839998006820679, + "step": 1672 + }, + { + "epoch": 0.5152751058099269, + "grad_norm": 8.6875, + "learning_rate": 8.962821051469082e-06, + "loss": 1.3942573070526123, + "step": 1674 + }, + { + "epoch": 0.5158907272027703, + "grad_norm": 13.75, + "learning_rate": 8.960140707420566e-06, + "loss": 1.6228309869766235, + "step": 1676 + }, + { + "epoch": 0.5165063485956137, + "grad_norm": 5.0625, + "learning_rate": 8.957457421813876e-06, + "loss": 1.4481265544891357, + "step": 1678 + }, + { + "epoch": 0.517121969988457, + "grad_norm": 7.5625, + "learning_rate": 8.954771197315451e-06, + "loss": 1.4853836297988892, + "step": 1680 + }, + { + "epoch": 0.5177375913813005, + "grad_norm": 8.75, + "learning_rate": 8.952082036594653e-06, + "loss": 1.552171230316162, + "step": 1682 + }, + { + "epoch": 0.5183532127741439, + "grad_norm": 6.4375, + "learning_rate": 8.949389942323763e-06, + "loss": 1.4480795860290527, + "step": 1684 + }, + { + "epoch": 0.5189688341669872, + "grad_norm": 4.625, + "learning_rate": 8.946694917177974e-06, + "loss": 1.430462121963501, + "step": 1686 + }, + { + "epoch": 0.5195844555598307, + "grad_norm": 8.0625, + "learning_rate": 8.943996963835396e-06, + "loss": 1.272444486618042, + "step": 1688 + }, + { + "epoch": 0.5202000769526741, + "grad_norm": 4.96875, + "learning_rate": 8.94129608497704e-06, + "loss": 1.265394926071167, + "step": 1690 + }, + { + "epoch": 0.5208156983455176, + "grad_norm": 10.8125, + "learning_rate": 8.938592283286831e-06, + "loss": 1.2681784629821777, + "step": 1692 + }, + { + "epoch": 0.5214313197383609, + "grad_norm": 10.1875, + "learning_rate": 8.935885561451602e-06, + "loss": 1.3533116579055786, + "step": 1694 + }, + { + "epoch": 0.5220469411312043, + "grad_norm": 8.5, + "learning_rate": 8.93317592216108e-06, + "loss": 1.3701342344284058, + "step": 1696 + }, + { + "epoch": 0.5226625625240477, + "grad_norm": 8.25, + "learning_rate": 8.930463368107894e-06, + "loss": 1.5623503923416138, + "step": 1698 + }, + { + "epoch": 0.5232781839168911, + "grad_norm": 6.53125, + "learning_rate": 8.927747901987572e-06, + "loss": 1.0573383569717407, + "step": 1700 + }, + { + "epoch": 0.5238938053097345, + "grad_norm": 33.75, + "learning_rate": 8.92502952649853e-06, + "loss": 1.3457276821136475, + "step": 1702 + }, + { + "epoch": 0.5245094267025779, + "grad_norm": 5.25, + "learning_rate": 8.92230824434208e-06, + "loss": 1.3082480430603027, + "step": 1704 + }, + { + "epoch": 0.5251250480954213, + "grad_norm": 2.90625, + "learning_rate": 8.919584058222422e-06, + "loss": 1.063995361328125, + "step": 1706 + }, + { + "epoch": 0.5257406694882647, + "grad_norm": 2.21875, + "learning_rate": 8.91685697084664e-06, + "loss": 1.228017807006836, + "step": 1708 + }, + { + "epoch": 0.5263562908811081, + "grad_norm": 7.5625, + "learning_rate": 8.914126984924705e-06, + "loss": 1.6891307830810547, + "step": 1710 + }, + { + "epoch": 0.5269719122739516, + "grad_norm": 6.15625, + "learning_rate": 8.911394103169461e-06, + "loss": 1.14736807346344, + "step": 1712 + }, + { + "epoch": 0.5275875336667949, + "grad_norm": 3.890625, + "learning_rate": 8.908658328296635e-06, + "loss": 1.1494313478469849, + "step": 1714 + }, + { + "epoch": 0.5282031550596383, + "grad_norm": 6.75, + "learning_rate": 8.905919663024829e-06, + "loss": 1.4376049041748047, + "step": 1716 + }, + { + "epoch": 0.5288187764524818, + "grad_norm": 2.9375, + "learning_rate": 8.903178110075514e-06, + "loss": 1.0928840637207031, + "step": 1718 + }, + { + "epoch": 0.5294343978453251, + "grad_norm": 4.03125, + "learning_rate": 8.900433672173035e-06, + "loss": 0.9837021827697754, + "step": 1720 + }, + { + "epoch": 0.5300500192381685, + "grad_norm": 4.40625, + "learning_rate": 8.897686352044599e-06, + "loss": 1.3075284957885742, + "step": 1722 + }, + { + "epoch": 0.530665640631012, + "grad_norm": 4.71875, + "learning_rate": 8.89493615242028e-06, + "loss": 1.307611107826233, + "step": 1724 + }, + { + "epoch": 0.5312812620238553, + "grad_norm": 10.9375, + "learning_rate": 8.89218307603302e-06, + "loss": 1.0998104810714722, + "step": 1726 + }, + { + "epoch": 0.5318968834166987, + "grad_norm": 11.625, + "learning_rate": 8.8894271256186e-06, + "loss": 1.3269050121307373, + "step": 1728 + }, + { + "epoch": 0.5325125048095422, + "grad_norm": 7.125, + "learning_rate": 8.88666830391568e-06, + "loss": 1.2515839338302612, + "step": 1730 + }, + { + "epoch": 0.5331281262023856, + "grad_norm": 10.1875, + "learning_rate": 8.883906613665758e-06, + "loss": 1.5673224925994873, + "step": 1732 + }, + { + "epoch": 0.5337437475952289, + "grad_norm": 6.46875, + "learning_rate": 8.881142057613187e-06, + "loss": 1.440687656402588, + "step": 1734 + }, + { + "epoch": 0.5343593689880723, + "grad_norm": 18.75, + "learning_rate": 8.878374638505172e-06, + "loss": 1.611922025680542, + "step": 1736 + }, + { + "epoch": 0.5349749903809158, + "grad_norm": 7.375, + "learning_rate": 8.875604359091759e-06, + "loss": 1.4720699787139893, + "step": 1738 + }, + { + "epoch": 0.5355906117737591, + "grad_norm": 4.75, + "learning_rate": 8.872831222125833e-06, + "loss": 1.1215943098068237, + "step": 1740 + }, + { + "epoch": 0.5362062331666025, + "grad_norm": 7.59375, + "learning_rate": 8.870055230363126e-06, + "loss": 1.3788470029830933, + "step": 1742 + }, + { + "epoch": 0.536821854559446, + "grad_norm": 4.09375, + "learning_rate": 8.8672763865622e-06, + "loss": 1.083932876586914, + "step": 1744 + }, + { + "epoch": 0.5374374759522893, + "grad_norm": 6.15625, + "learning_rate": 8.86449469348446e-06, + "loss": 1.3741451501846313, + "step": 1746 + }, + { + "epoch": 0.5380530973451327, + "grad_norm": 4.0625, + "learning_rate": 8.861710153894129e-06, + "loss": 1.334261417388916, + "step": 1748 + }, + { + "epoch": 0.5386687187379762, + "grad_norm": 4.1875, + "learning_rate": 8.858922770558272e-06, + "loss": 1.2975589036941528, + "step": 1750 + }, + { + "epoch": 0.5392843401308195, + "grad_norm": 24.25, + "learning_rate": 8.856132546246774e-06, + "loss": 1.2593014240264893, + "step": 1752 + }, + { + "epoch": 0.5398999615236629, + "grad_norm": 3.265625, + "learning_rate": 8.853339483732341e-06, + "loss": 1.1915488243103027, + "step": 1754 + }, + { + "epoch": 0.5405155829165064, + "grad_norm": 6.46875, + "learning_rate": 8.850543585790504e-06, + "loss": 1.025390625, + "step": 1756 + }, + { + "epoch": 0.5411312043093498, + "grad_norm": 10.0625, + "learning_rate": 8.847744855199607e-06, + "loss": 1.3022722005844116, + "step": 1758 + }, + { + "epoch": 0.5417468257021931, + "grad_norm": 14.1875, + "learning_rate": 8.844943294740813e-06, + "loss": 1.1693408489227295, + "step": 1760 + }, + { + "epoch": 0.5423624470950366, + "grad_norm": 5.5625, + "learning_rate": 8.842138907198098e-06, + "loss": 1.4360636472702026, + "step": 1762 + }, + { + "epoch": 0.54297806848788, + "grad_norm": 13.75, + "learning_rate": 8.83933169535824e-06, + "loss": 1.373488187789917, + "step": 1764 + }, + { + "epoch": 0.5435936898807233, + "grad_norm": 9.6875, + "learning_rate": 8.83652166201083e-06, + "loss": 1.6782146692276, + "step": 1766 + }, + { + "epoch": 0.5442093112735668, + "grad_norm": 4.28125, + "learning_rate": 8.833708809948261e-06, + "loss": 1.2512364387512207, + "step": 1768 + }, + { + "epoch": 0.5448249326664102, + "grad_norm": 9.25, + "learning_rate": 8.830893141965729e-06, + "loss": 1.6331942081451416, + "step": 1770 + }, + { + "epoch": 0.5454405540592535, + "grad_norm": 4.21875, + "learning_rate": 8.828074660861223e-06, + "loss": 1.2096216678619385, + "step": 1772 + }, + { + "epoch": 0.546056175452097, + "grad_norm": 5.21875, + "learning_rate": 8.825253369435536e-06, + "loss": 1.2483621835708618, + "step": 1774 + }, + { + "epoch": 0.5466717968449404, + "grad_norm": 9.5625, + "learning_rate": 8.822429270492243e-06, + "loss": 1.4440934658050537, + "step": 1776 + }, + { + "epoch": 0.5472874182377838, + "grad_norm": 7.8125, + "learning_rate": 8.819602366837716e-06, + "loss": 1.355756402015686, + "step": 1778 + }, + { + "epoch": 0.5479030396306271, + "grad_norm": 4.375, + "learning_rate": 8.816772661281117e-06, + "loss": 1.4682282209396362, + "step": 1780 + }, + { + "epoch": 0.5485186610234706, + "grad_norm": 9.4375, + "learning_rate": 8.81394015663438e-06, + "loss": 1.7345459461212158, + "step": 1782 + }, + { + "epoch": 0.549134282416314, + "grad_norm": 10.6875, + "learning_rate": 8.811104855712235e-06, + "loss": 1.5893970727920532, + "step": 1784 + }, + { + "epoch": 0.5497499038091573, + "grad_norm": 6.6875, + "learning_rate": 8.80826676133218e-06, + "loss": 1.135110855102539, + "step": 1786 + }, + { + "epoch": 0.5503655252020008, + "grad_norm": 7.6875, + "learning_rate": 8.805425876314497e-06, + "loss": 1.3346829414367676, + "step": 1788 + }, + { + "epoch": 0.5509811465948442, + "grad_norm": 9.1875, + "learning_rate": 8.802582203482232e-06, + "loss": 1.416990876197815, + "step": 1790 + }, + { + "epoch": 0.5515967679876875, + "grad_norm": 4.65625, + "learning_rate": 8.799735745661214e-06, + "loss": 1.4351859092712402, + "step": 1792 + }, + { + "epoch": 0.552212389380531, + "grad_norm": 5.375, + "learning_rate": 8.796886505680022e-06, + "loss": 1.0165706872940063, + "step": 1794 + }, + { + "epoch": 0.5528280107733744, + "grad_norm": 4.625, + "learning_rate": 8.794034486370015e-06, + "loss": 1.3028804063796997, + "step": 1796 + }, + { + "epoch": 0.5534436321662177, + "grad_norm": 8.3125, + "learning_rate": 8.791179690565312e-06, + "loss": 1.4025111198425293, + "step": 1798 + }, + { + "epoch": 0.5540592535590612, + "grad_norm": 4.96875, + "learning_rate": 8.788322121102781e-06, + "loss": 1.1301255226135254, + "step": 1800 + }, + { + "epoch": 0.5546748749519046, + "grad_norm": 5.21875, + "learning_rate": 8.785461780822058e-06, + "loss": 1.0889443159103394, + "step": 1802 + }, + { + "epoch": 0.555290496344748, + "grad_norm": 8.5625, + "learning_rate": 8.782598672565521e-06, + "loss": 1.5021328926086426, + "step": 1804 + }, + { + "epoch": 0.5559061177375914, + "grad_norm": 6.28125, + "learning_rate": 8.779732799178314e-06, + "loss": 0.9383888840675354, + "step": 1806 + }, + { + "epoch": 0.5565217391304348, + "grad_norm": 7.5625, + "learning_rate": 8.77686416350831e-06, + "loss": 1.7545676231384277, + "step": 1808 + }, + { + "epoch": 0.5571373605232782, + "grad_norm": 5.09375, + "learning_rate": 8.773992768406144e-06, + "loss": 1.0912437438964844, + "step": 1810 + }, + { + "epoch": 0.5577529819161215, + "grad_norm": 8.1875, + "learning_rate": 8.771118616725181e-06, + "loss": 1.7606732845306396, + "step": 1812 + }, + { + "epoch": 0.558368603308965, + "grad_norm": 14.25, + "learning_rate": 8.76824171132153e-06, + "loss": 1.8077706098556519, + "step": 1814 + }, + { + "epoch": 0.5589842247018084, + "grad_norm": 34.5, + "learning_rate": 8.765362055054042e-06, + "loss": 1.521872878074646, + "step": 1816 + }, + { + "epoch": 0.5595998460946517, + "grad_norm": 5.75, + "learning_rate": 8.762479650784287e-06, + "loss": 1.0878723859786987, + "step": 1818 + }, + { + "epoch": 0.5602154674874952, + "grad_norm": 6.09375, + "learning_rate": 8.759594501376584e-06, + "loss": 1.2345068454742432, + "step": 1820 + }, + { + "epoch": 0.5608310888803386, + "grad_norm": 6.625, + "learning_rate": 8.756706609697965e-06, + "loss": 1.4004822969436646, + "step": 1822 + }, + { + "epoch": 0.561446710273182, + "grad_norm": 20.625, + "learning_rate": 8.753815978618194e-06, + "loss": 1.4569531679153442, + "step": 1824 + }, + { + "epoch": 0.5620623316660254, + "grad_norm": 12.25, + "learning_rate": 8.750922611009757e-06, + "loss": 1.0861998796463013, + "step": 1826 + }, + { + "epoch": 0.5626779530588688, + "grad_norm": 7.0625, + "learning_rate": 8.748026509747858e-06, + "loss": 1.1484061479568481, + "step": 1828 + }, + { + "epoch": 0.5632935744517122, + "grad_norm": 7.84375, + "learning_rate": 8.745127677710415e-06, + "loss": 1.6220837831497192, + "step": 1830 + }, + { + "epoch": 0.5639091958445556, + "grad_norm": 6.3125, + "learning_rate": 8.742226117778063e-06, + "loss": 1.5962282419204712, + "step": 1832 + }, + { + "epoch": 0.564524817237399, + "grad_norm": 12.875, + "learning_rate": 8.739321832834151e-06, + "loss": 1.4062249660491943, + "step": 1834 + }, + { + "epoch": 0.5651404386302424, + "grad_norm": 6.34375, + "learning_rate": 8.736414825764729e-06, + "loss": 1.4309604167938232, + "step": 1836 + }, + { + "epoch": 0.5657560600230858, + "grad_norm": 10.1875, + "learning_rate": 8.733505099458555e-06, + "loss": 1.6085896492004395, + "step": 1838 + }, + { + "epoch": 0.5663716814159292, + "grad_norm": 6.875, + "learning_rate": 8.730592656807091e-06, + "loss": 1.260807991027832, + "step": 1840 + }, + { + "epoch": 0.5669873028087726, + "grad_norm": 10.3125, + "learning_rate": 8.727677500704494e-06, + "loss": 1.4040920734405518, + "step": 1842 + }, + { + "epoch": 0.567602924201616, + "grad_norm": 12.0625, + "learning_rate": 8.724759634047622e-06, + "loss": 1.0131279230117798, + "step": 1844 + }, + { + "epoch": 0.5682185455944594, + "grad_norm": 5.59375, + "learning_rate": 8.721839059736023e-06, + "loss": 1.0781059265136719, + "step": 1846 + }, + { + "epoch": 0.5688341669873028, + "grad_norm": 12.1875, + "learning_rate": 8.718915780671939e-06, + "loss": 1.4911892414093018, + "step": 1848 + }, + { + "epoch": 0.5694497883801463, + "grad_norm": 5.8125, + "learning_rate": 8.715989799760298e-06, + "loss": 1.3792434930801392, + "step": 1850 + }, + { + "epoch": 0.5700654097729896, + "grad_norm": 5.65625, + "learning_rate": 8.713061119908713e-06, + "loss": 1.2725658416748047, + "step": 1852 + }, + { + "epoch": 0.570681031165833, + "grad_norm": 5.8125, + "learning_rate": 8.710129744027474e-06, + "loss": 1.2276039123535156, + "step": 1854 + }, + { + "epoch": 0.5712966525586765, + "grad_norm": 4.21875, + "learning_rate": 8.707195675029558e-06, + "loss": 0.955919086933136, + "step": 1856 + }, + { + "epoch": 0.5719122739515198, + "grad_norm": 4.53125, + "learning_rate": 8.704258915830619e-06, + "loss": 1.2571704387664795, + "step": 1858 + }, + { + "epoch": 0.5725278953443632, + "grad_norm": 10.1875, + "learning_rate": 8.701319469348975e-06, + "loss": 1.1578973531723022, + "step": 1860 + }, + { + "epoch": 0.5731435167372066, + "grad_norm": 4.21875, + "learning_rate": 8.698377338505623e-06, + "loss": 1.2620946168899536, + "step": 1862 + }, + { + "epoch": 0.57375913813005, + "grad_norm": 5.5, + "learning_rate": 8.695432526224223e-06, + "loss": 1.1545274257659912, + "step": 1864 + }, + { + "epoch": 0.5743747595228934, + "grad_norm": 5.34375, + "learning_rate": 8.692485035431103e-06, + "loss": 1.2381047010421753, + "step": 1866 + }, + { + "epoch": 0.5749903809157368, + "grad_norm": 10.0625, + "learning_rate": 8.689534869055247e-06, + "loss": 1.3293325901031494, + "step": 1868 + }, + { + "epoch": 0.5756060023085803, + "grad_norm": 3.859375, + "learning_rate": 8.686582030028304e-06, + "loss": 1.2085543870925903, + "step": 1870 + }, + { + "epoch": 0.5762216237014236, + "grad_norm": 8.3125, + "learning_rate": 8.683626521284576e-06, + "loss": 0.848010241985321, + "step": 1872 + }, + { + "epoch": 0.576837245094267, + "grad_norm": 4.46875, + "learning_rate": 8.680668345761016e-06, + "loss": 1.362102746963501, + "step": 1874 + }, + { + "epoch": 0.5774528664871105, + "grad_norm": 6.90625, + "learning_rate": 8.677707506397235e-06, + "loss": 1.1126528978347778, + "step": 1876 + }, + { + "epoch": 0.5780684878799538, + "grad_norm": 7.8125, + "learning_rate": 8.67474400613548e-06, + "loss": 1.150965929031372, + "step": 1878 + }, + { + "epoch": 0.5786841092727972, + "grad_norm": 4.125, + "learning_rate": 8.671777847920649e-06, + "loss": 1.4929723739624023, + "step": 1880 + }, + { + "epoch": 0.5792997306656407, + "grad_norm": 5.84375, + "learning_rate": 8.66880903470028e-06, + "loss": 1.4086570739746094, + "step": 1882 + }, + { + "epoch": 0.579915352058484, + "grad_norm": 12.25, + "learning_rate": 8.665837569424552e-06, + "loss": 1.2356774806976318, + "step": 1884 + }, + { + "epoch": 0.5805309734513274, + "grad_norm": 4.1875, + "learning_rate": 8.662863455046272e-06, + "loss": 1.2308299541473389, + "step": 1886 + }, + { + "epoch": 0.5811465948441709, + "grad_norm": 13.3125, + "learning_rate": 8.659886694520889e-06, + "loss": 1.505770206451416, + "step": 1888 + }, + { + "epoch": 0.5817622162370142, + "grad_norm": 5.59375, + "learning_rate": 8.656907290806471e-06, + "loss": 1.5034568309783936, + "step": 1890 + }, + { + "epoch": 0.5823778376298576, + "grad_norm": 7.40625, + "learning_rate": 8.653925246863724e-06, + "loss": 0.9744303822517395, + "step": 1892 + }, + { + "epoch": 0.582993459022701, + "grad_norm": 4.40625, + "learning_rate": 8.650940565655968e-06, + "loss": 1.5263757705688477, + "step": 1894 + }, + { + "epoch": 0.5836090804155445, + "grad_norm": 5.5, + "learning_rate": 8.647953250149149e-06, + "loss": 1.3015506267547607, + "step": 1896 + }, + { + "epoch": 0.5842247018083878, + "grad_norm": 7.4375, + "learning_rate": 8.644963303311829e-06, + "loss": 1.2412558794021606, + "step": 1898 + }, + { + "epoch": 0.5848403232012312, + "grad_norm": 4.09375, + "learning_rate": 8.641970728115186e-06, + "loss": 1.366623044013977, + "step": 1900 + }, + { + "epoch": 0.5854559445940747, + "grad_norm": 14.4375, + "learning_rate": 8.638975527533007e-06, + "loss": 1.654881238937378, + "step": 1902 + }, + { + "epoch": 0.586071565986918, + "grad_norm": 5.34375, + "learning_rate": 8.63597770454169e-06, + "loss": 1.6023765802383423, + "step": 1904 + }, + { + "epoch": 0.5866871873797614, + "grad_norm": 9.0, + "learning_rate": 8.632977262120245e-06, + "loss": 1.4414507150650024, + "step": 1906 + }, + { + "epoch": 0.5873028087726049, + "grad_norm": 9.3125, + "learning_rate": 8.629974203250273e-06, + "loss": 1.3019405603408813, + "step": 1908 + }, + { + "epoch": 0.5879184301654482, + "grad_norm": 7.28125, + "learning_rate": 8.62696853091598e-06, + "loss": 1.1021331548690796, + "step": 1910 + }, + { + "epoch": 0.5885340515582916, + "grad_norm": 8.1875, + "learning_rate": 8.623960248104175e-06, + "loss": 1.2331730127334595, + "step": 1912 + }, + { + "epoch": 0.5891496729511351, + "grad_norm": 5.75, + "learning_rate": 8.620949357804252e-06, + "loss": 1.4081820249557495, + "step": 1914 + }, + { + "epoch": 0.5897652943439785, + "grad_norm": 4.90625, + "learning_rate": 8.6179358630082e-06, + "loss": 1.2616064548492432, + "step": 1916 + }, + { + "epoch": 0.5903809157368218, + "grad_norm": 8.375, + "learning_rate": 8.614919766710598e-06, + "loss": 1.3621723651885986, + "step": 1918 + }, + { + "epoch": 0.5909965371296653, + "grad_norm": 14.875, + "learning_rate": 8.61190107190861e-06, + "loss": 0.9340348243713379, + "step": 1920 + }, + { + "epoch": 0.5916121585225087, + "grad_norm": 4.65625, + "learning_rate": 8.60887978160198e-06, + "loss": 1.278018593788147, + "step": 1922 + }, + { + "epoch": 0.592227779915352, + "grad_norm": 4.53125, + "learning_rate": 8.605855898793027e-06, + "loss": 1.2426376342773438, + "step": 1924 + }, + { + "epoch": 0.5928434013081955, + "grad_norm": 3.109375, + "learning_rate": 8.602829426486657e-06, + "loss": 1.1372792720794678, + "step": 1926 + }, + { + "epoch": 0.5934590227010389, + "grad_norm": 5.3125, + "learning_rate": 8.599800367690342e-06, + "loss": 1.3143759965896606, + "step": 1928 + }, + { + "epoch": 0.5940746440938822, + "grad_norm": 3.8125, + "learning_rate": 8.596768725414125e-06, + "loss": 1.2842066287994385, + "step": 1930 + }, + { + "epoch": 0.5946902654867257, + "grad_norm": 6.34375, + "learning_rate": 8.593734502670615e-06, + "loss": 1.5189852714538574, + "step": 1932 + }, + { + "epoch": 0.5953058868795691, + "grad_norm": 8.875, + "learning_rate": 8.590697702474988e-06, + "loss": 1.1181970834732056, + "step": 1934 + }, + { + "epoch": 0.5959215082724124, + "grad_norm": 6.09375, + "learning_rate": 8.587658327844982e-06, + "loss": 1.3686904907226562, + "step": 1936 + }, + { + "epoch": 0.5965371296652558, + "grad_norm": 8.5, + "learning_rate": 8.584616381800895e-06, + "loss": 1.596742033958435, + "step": 1938 + }, + { + "epoch": 0.5971527510580993, + "grad_norm": 5.5625, + "learning_rate": 8.58157186736557e-06, + "loss": 1.4269849061965942, + "step": 1940 + }, + { + "epoch": 0.5977683724509427, + "grad_norm": 4.5, + "learning_rate": 8.578524787564412e-06, + "loss": 1.2365959882736206, + "step": 1942 + }, + { + "epoch": 0.598383993843786, + "grad_norm": 15.25, + "learning_rate": 8.575475145425373e-06, + "loss": 1.385693907737732, + "step": 1944 + }, + { + "epoch": 0.5989996152366295, + "grad_norm": 4.5, + "learning_rate": 8.572422943978951e-06, + "loss": 1.1528739929199219, + "step": 1946 + }, + { + "epoch": 0.5996152366294729, + "grad_norm": 3.25, + "learning_rate": 8.569368186258187e-06, + "loss": 0.9854751825332642, + "step": 1948 + }, + { + "epoch": 0.6002308580223162, + "grad_norm": 15.75, + "learning_rate": 8.566310875298662e-06, + "loss": 1.4587029218673706, + "step": 1950 + }, + { + "epoch": 0.6008464794151597, + "grad_norm": 12.0625, + "learning_rate": 8.563251014138493e-06, + "loss": 1.0503976345062256, + "step": 1952 + }, + { + "epoch": 0.6014621008080031, + "grad_norm": 5.28125, + "learning_rate": 8.560188605818335e-06, + "loss": 1.2880537509918213, + "step": 1954 + }, + { + "epoch": 0.6020777222008464, + "grad_norm": 22.25, + "learning_rate": 8.557123653381369e-06, + "loss": 1.2823199033737183, + "step": 1956 + }, + { + "epoch": 0.6026933435936899, + "grad_norm": 3.640625, + "learning_rate": 8.554056159873311e-06, + "loss": 1.0577878952026367, + "step": 1958 + }, + { + "epoch": 0.6033089649865333, + "grad_norm": 10.1875, + "learning_rate": 8.550986128342395e-06, + "loss": 1.6685055494308472, + "step": 1960 + }, + { + "epoch": 0.6039245863793767, + "grad_norm": 3.75, + "learning_rate": 8.54791356183938e-06, + "loss": 1.2891515493392944, + "step": 1962 + }, + { + "epoch": 0.6045402077722201, + "grad_norm": 8.8125, + "learning_rate": 8.544838463417547e-06, + "loss": 1.311370849609375, + "step": 1964 + }, + { + "epoch": 0.6051558291650635, + "grad_norm": 10.3125, + "learning_rate": 8.541760836132684e-06, + "loss": 1.1342506408691406, + "step": 1966 + }, + { + "epoch": 0.6057714505579069, + "grad_norm": 8.9375, + "learning_rate": 8.538680683043105e-06, + "loss": 0.7439443469047546, + "step": 1968 + }, + { + "epoch": 0.6063870719507503, + "grad_norm": 6.28125, + "learning_rate": 8.535598007209624e-06, + "loss": 1.3986562490463257, + "step": 1970 + }, + { + "epoch": 0.6070026933435937, + "grad_norm": 4.625, + "learning_rate": 8.532512811695567e-06, + "loss": 1.389469027519226, + "step": 1972 + }, + { + "epoch": 0.6076183147364371, + "grad_norm": 1.875, + "learning_rate": 8.529425099566761e-06, + "loss": 0.9762718677520752, + "step": 1974 + }, + { + "epoch": 0.6082339361292805, + "grad_norm": 7.09375, + "learning_rate": 8.526334873891533e-06, + "loss": 1.321108102798462, + "step": 1976 + }, + { + "epoch": 0.6088495575221239, + "grad_norm": 39.5, + "learning_rate": 8.52324213774071e-06, + "loss": 0.8753465414047241, + "step": 1978 + }, + { + "epoch": 0.6094651789149673, + "grad_norm": 9.6875, + "learning_rate": 8.520146894187616e-06, + "loss": 1.3895344734191895, + "step": 1980 + }, + { + "epoch": 0.6100808003078106, + "grad_norm": 3.875, + "learning_rate": 8.517049146308063e-06, + "loss": 1.2270596027374268, + "step": 1982 + }, + { + "epoch": 0.6106964217006541, + "grad_norm": 59.0, + "learning_rate": 8.513948897180348e-06, + "loss": 1.7473394870758057, + "step": 1984 + }, + { + "epoch": 0.6113120430934975, + "grad_norm": 9.75, + "learning_rate": 8.510846149885264e-06, + "loss": 1.770764946937561, + "step": 1986 + }, + { + "epoch": 0.611927664486341, + "grad_norm": 7.96875, + "learning_rate": 8.50774090750608e-06, + "loss": 1.2302086353302002, + "step": 1988 + }, + { + "epoch": 0.6125432858791843, + "grad_norm": 7.0, + "learning_rate": 8.504633173128539e-06, + "loss": 1.3983111381530762, + "step": 1990 + }, + { + "epoch": 0.6131589072720277, + "grad_norm": 7.15625, + "learning_rate": 8.501522949840873e-06, + "loss": 1.2710623741149902, + "step": 1992 + }, + { + "epoch": 0.6137745286648711, + "grad_norm": 5.9375, + "learning_rate": 8.498410240733776e-06, + "loss": 1.2992455959320068, + "step": 1994 + }, + { + "epoch": 0.6143901500577145, + "grad_norm": 9.25, + "learning_rate": 8.495295048900421e-06, + "loss": 1.0018309354782104, + "step": 1996 + }, + { + "epoch": 0.6150057714505579, + "grad_norm": 16.625, + "learning_rate": 8.492177377436442e-06, + "loss": 1.5174227952957153, + "step": 1998 + }, + { + "epoch": 0.6156213928434013, + "grad_norm": 7.75, + "learning_rate": 8.489057229439937e-06, + "loss": 1.758395791053772, + "step": 2000 + }, + { + "epoch": 0.6162370142362447, + "grad_norm": 13.625, + "learning_rate": 8.485934608011469e-06, + "loss": 1.062825322151184, + "step": 2002 + }, + { + "epoch": 0.6168526356290881, + "grad_norm": 7.375, + "learning_rate": 8.482809516254058e-06, + "loss": 1.2867090702056885, + "step": 2004 + }, + { + "epoch": 0.6174682570219315, + "grad_norm": 10.8125, + "learning_rate": 8.479681957273177e-06, + "loss": 1.3977320194244385, + "step": 2006 + }, + { + "epoch": 0.618083878414775, + "grad_norm": 8.125, + "learning_rate": 8.47655193417675e-06, + "loss": 1.230679988861084, + "step": 2008 + }, + { + "epoch": 0.6186994998076183, + "grad_norm": 7.59375, + "learning_rate": 8.473419450075149e-06, + "loss": 0.9919013381004333, + "step": 2010 + }, + { + "epoch": 0.6193151212004617, + "grad_norm": 6.0, + "learning_rate": 8.470284508081201e-06, + "loss": 1.2583743333816528, + "step": 2012 + }, + { + "epoch": 0.6199307425933052, + "grad_norm": 14.0625, + "learning_rate": 8.46714711131016e-06, + "loss": 1.474548578262329, + "step": 2014 + }, + { + "epoch": 0.6205463639861485, + "grad_norm": 11.875, + "learning_rate": 8.464007262879736e-06, + "loss": 1.4689182043075562, + "step": 2016 + }, + { + "epoch": 0.6211619853789919, + "grad_norm": 21.125, + "learning_rate": 8.460864965910061e-06, + "loss": 1.1271132230758667, + "step": 2018 + }, + { + "epoch": 0.6217776067718354, + "grad_norm": 8.9375, + "learning_rate": 8.457720223523704e-06, + "loss": 1.3200182914733887, + "step": 2020 + }, + { + "epoch": 0.6223932281646787, + "grad_norm": 7.6875, + "learning_rate": 8.454573038845671e-06, + "loss": 1.6913087368011475, + "step": 2022 + }, + { + "epoch": 0.6230088495575221, + "grad_norm": 8.1875, + "learning_rate": 8.451423415003387e-06, + "loss": 0.9711453914642334, + "step": 2024 + }, + { + "epoch": 0.6236244709503656, + "grad_norm": 7.3125, + "learning_rate": 8.448271355126707e-06, + "loss": 1.281950831413269, + "step": 2026 + }, + { + "epoch": 0.6242400923432089, + "grad_norm": 4.59375, + "learning_rate": 8.4451168623479e-06, + "loss": 1.5933538675308228, + "step": 2028 + }, + { + "epoch": 0.6248557137360523, + "grad_norm": 13.4375, + "learning_rate": 8.441959939801657e-06, + "loss": 1.6639219522476196, + "step": 2030 + }, + { + "epoch": 0.6254713351288957, + "grad_norm": 16.5, + "learning_rate": 8.438800590625084e-06, + "loss": 1.0111881494522095, + "step": 2032 + }, + { + "epoch": 0.6260869565217392, + "grad_norm": 6.59375, + "learning_rate": 8.435638817957696e-06, + "loss": 1.194913387298584, + "step": 2034 + }, + { + "epoch": 0.6267025779145825, + "grad_norm": 6.1875, + "learning_rate": 8.432474624941418e-06, + "loss": 1.225528359413147, + "step": 2036 + }, + { + "epoch": 0.6273181993074259, + "grad_norm": 5.75, + "learning_rate": 8.429308014720578e-06, + "loss": 1.2120106220245361, + "step": 2038 + }, + { + "epoch": 0.6279338207002694, + "grad_norm": 10.625, + "learning_rate": 8.42613899044191e-06, + "loss": 1.016963243484497, + "step": 2040 + }, + { + "epoch": 0.6285494420931127, + "grad_norm": 6.78125, + "learning_rate": 8.422967555254544e-06, + "loss": 0.8804018497467041, + "step": 2042 + }, + { + "epoch": 0.6291650634859561, + "grad_norm": 2.828125, + "learning_rate": 8.419793712310005e-06, + "loss": 1.086695671081543, + "step": 2044 + }, + { + "epoch": 0.6297806848787996, + "grad_norm": 8.25, + "learning_rate": 8.416617464762213e-06, + "loss": 1.6183313131332397, + "step": 2046 + }, + { + "epoch": 0.6303963062716429, + "grad_norm": 4.1875, + "learning_rate": 8.413438815767474e-06, + "loss": 1.333935260772705, + "step": 2048 + }, + { + "epoch": 0.6310119276644863, + "grad_norm": 9.75, + "learning_rate": 8.410257768484486e-06, + "loss": 1.3460040092468262, + "step": 2050 + }, + { + "epoch": 0.6316275490573298, + "grad_norm": 6.5625, + "learning_rate": 8.407074326074325e-06, + "loss": 1.3743432760238647, + "step": 2052 + }, + { + "epoch": 0.6322431704501732, + "grad_norm": 3.921875, + "learning_rate": 8.403888491700449e-06, + "loss": 1.0204885005950928, + "step": 2054 + }, + { + "epoch": 0.6328587918430165, + "grad_norm": 10.125, + "learning_rate": 8.400700268528695e-06, + "loss": 1.1670832633972168, + "step": 2056 + }, + { + "epoch": 0.63347441323586, + "grad_norm": 5.40625, + "learning_rate": 8.39750965972727e-06, + "loss": 1.0655497312545776, + "step": 2058 + }, + { + "epoch": 0.6340900346287034, + "grad_norm": 14.75, + "learning_rate": 8.394316668466753e-06, + "loss": 1.6687726974487305, + "step": 2060 + }, + { + "epoch": 0.6347056560215467, + "grad_norm": 9.25, + "learning_rate": 8.391121297920093e-06, + "loss": 1.5025804042816162, + "step": 2062 + }, + { + "epoch": 0.6353212774143902, + "grad_norm": 4.6875, + "learning_rate": 8.3879235512626e-06, + "loss": 1.5211477279663086, + "step": 2064 + }, + { + "epoch": 0.6359368988072336, + "grad_norm": 3.46875, + "learning_rate": 8.384723431671947e-06, + "loss": 1.1468708515167236, + "step": 2066 + }, + { + "epoch": 0.6365525202000769, + "grad_norm": 8.8125, + "learning_rate": 8.381520942328163e-06, + "loss": 1.2303088903427124, + "step": 2068 + }, + { + "epoch": 0.6371681415929203, + "grad_norm": 5.84375, + "learning_rate": 8.378316086413637e-06, + "loss": 1.098226547241211, + "step": 2070 + }, + { + "epoch": 0.6377837629857638, + "grad_norm": 4.5625, + "learning_rate": 8.375108867113104e-06, + "loss": 1.3485264778137207, + "step": 2072 + }, + { + "epoch": 0.6383993843786071, + "grad_norm": 7.40625, + "learning_rate": 8.371899287613648e-06, + "loss": 1.6760691404342651, + "step": 2074 + }, + { + "epoch": 0.6390150057714505, + "grad_norm": 4.09375, + "learning_rate": 8.368687351104702e-06, + "loss": 1.6598718166351318, + "step": 2076 + }, + { + "epoch": 0.639630627164294, + "grad_norm": 16.0, + "learning_rate": 8.36547306077804e-06, + "loss": 1.560227394104004, + "step": 2078 + }, + { + "epoch": 0.6402462485571374, + "grad_norm": 4.25, + "learning_rate": 8.362256419827773e-06, + "loss": 1.228755235671997, + "step": 2080 + }, + { + "epoch": 0.6408618699499807, + "grad_norm": 7.28125, + "learning_rate": 8.35903743145035e-06, + "loss": 1.2501813173294067, + "step": 2082 + }, + { + "epoch": 0.6414774913428242, + "grad_norm": 3.828125, + "learning_rate": 8.355816098844551e-06, + "loss": 1.4830968379974365, + "step": 2084 + }, + { + "epoch": 0.6420931127356676, + "grad_norm": 7.5625, + "learning_rate": 8.352592425211488e-06, + "loss": 1.3437997102737427, + "step": 2086 + }, + { + "epoch": 0.6427087341285109, + "grad_norm": 9.3125, + "learning_rate": 8.349366413754595e-06, + "loss": 1.3608304262161255, + "step": 2088 + }, + { + "epoch": 0.6433243555213544, + "grad_norm": 3.78125, + "learning_rate": 8.346138067679635e-06, + "loss": 1.2023274898529053, + "step": 2090 + }, + { + "epoch": 0.6439399769141978, + "grad_norm": 16.125, + "learning_rate": 8.342907390194687e-06, + "loss": 1.297385811805725, + "step": 2092 + }, + { + "epoch": 0.6445555983070411, + "grad_norm": 8.5, + "learning_rate": 8.339674384510145e-06, + "loss": 1.7631044387817383, + "step": 2094 + }, + { + "epoch": 0.6451712196998846, + "grad_norm": 4.375, + "learning_rate": 8.336439053838722e-06, + "loss": 1.2460544109344482, + "step": 2096 + }, + { + "epoch": 0.645786841092728, + "grad_norm": 13.0, + "learning_rate": 8.33320140139544e-06, + "loss": 1.054074764251709, + "step": 2098 + }, + { + "epoch": 0.6464024624855714, + "grad_norm": 8.75, + "learning_rate": 8.329961430397623e-06, + "loss": 1.279534101486206, + "step": 2100 + }, + { + "epoch": 0.6470180838784148, + "grad_norm": 10.75, + "learning_rate": 8.326719144064905e-06, + "loss": 1.72566819190979, + "step": 2102 + }, + { + "epoch": 0.6476337052712582, + "grad_norm": 5.78125, + "learning_rate": 8.323474545619219e-06, + "loss": 1.2224308252334595, + "step": 2104 + }, + { + "epoch": 0.6482493266641016, + "grad_norm": 7.71875, + "learning_rate": 8.320227638284795e-06, + "loss": 1.363440752029419, + "step": 2106 + }, + { + "epoch": 0.648864948056945, + "grad_norm": 9.3125, + "learning_rate": 8.316978425288157e-06, + "loss": 1.9052454233169556, + "step": 2108 + }, + { + "epoch": 0.6494805694497884, + "grad_norm": 25.125, + "learning_rate": 8.313726909858117e-06, + "loss": 1.117143988609314, + "step": 2110 + }, + { + "epoch": 0.6500961908426318, + "grad_norm": 38.75, + "learning_rate": 8.310473095225786e-06, + "loss": 1.2457895278930664, + "step": 2112 + }, + { + "epoch": 0.6507118122354751, + "grad_norm": 16.25, + "learning_rate": 8.307216984624547e-06, + "loss": 1.6599624156951904, + "step": 2114 + }, + { + "epoch": 0.6513274336283186, + "grad_norm": 9.375, + "learning_rate": 8.303958581290074e-06, + "loss": 1.7203136682510376, + "step": 2116 + }, + { + "epoch": 0.651943055021162, + "grad_norm": 7.71875, + "learning_rate": 8.300697888460314e-06, + "loss": 1.2997355461120605, + "step": 2118 + }, + { + "epoch": 0.6525586764140053, + "grad_norm": 4.9375, + "learning_rate": 8.297434909375488e-06, + "loss": 1.1495037078857422, + "step": 2120 + }, + { + "epoch": 0.6531742978068488, + "grad_norm": 2.921875, + "learning_rate": 8.294169647278097e-06, + "loss": 1.1950782537460327, + "step": 2122 + }, + { + "epoch": 0.6537899191996922, + "grad_norm": 7.34375, + "learning_rate": 8.290902105412899e-06, + "loss": 1.379351258277893, + "step": 2124 + }, + { + "epoch": 0.6544055405925356, + "grad_norm": 19.125, + "learning_rate": 8.287632287026925e-06, + "loss": 1.2099511623382568, + "step": 2126 + }, + { + "epoch": 0.655021161985379, + "grad_norm": 8.1875, + "learning_rate": 8.284360195369471e-06, + "loss": 1.4267417192459106, + "step": 2128 + }, + { + "epoch": 0.6556367833782224, + "grad_norm": 4.0, + "learning_rate": 8.281085833692083e-06, + "loss": 1.3050733804702759, + "step": 2130 + }, + { + "epoch": 0.6562524047710658, + "grad_norm": 9.3125, + "learning_rate": 8.277809205248572e-06, + "loss": 1.6474896669387817, + "step": 2132 + }, + { + "epoch": 0.6568680261639092, + "grad_norm": 5.4375, + "learning_rate": 8.274530313294992e-06, + "loss": 1.317122459411621, + "step": 2134 + }, + { + "epoch": 0.6574836475567526, + "grad_norm": 2.453125, + "learning_rate": 8.271249161089658e-06, + "loss": 1.3636703491210938, + "step": 2136 + }, + { + "epoch": 0.658099268949596, + "grad_norm": 5.125, + "learning_rate": 8.26796575189312e-06, + "loss": 1.190585970878601, + "step": 2138 + }, + { + "epoch": 0.6587148903424394, + "grad_norm": 8.75, + "learning_rate": 8.264680088968173e-06, + "loss": 1.0943142175674438, + "step": 2140 + }, + { + "epoch": 0.6593305117352828, + "grad_norm": 4.4375, + "learning_rate": 8.261392175579859e-06, + "loss": 1.3861849308013916, + "step": 2142 + }, + { + "epoch": 0.6599461331281262, + "grad_norm": 28.25, + "learning_rate": 8.258102014995446e-06, + "loss": 1.3402506113052368, + "step": 2144 + }, + { + "epoch": 0.6605617545209697, + "grad_norm": 16.375, + "learning_rate": 8.254809610484449e-06, + "loss": 1.638556718826294, + "step": 2146 + }, + { + "epoch": 0.661177375913813, + "grad_norm": 30.25, + "learning_rate": 8.251514965318595e-06, + "loss": 1.2110763788223267, + "step": 2148 + }, + { + "epoch": 0.6617929973066564, + "grad_norm": 6.71875, + "learning_rate": 8.24821808277185e-06, + "loss": 1.3014509677886963, + "step": 2150 + }, + { + "epoch": 0.6624086186994999, + "grad_norm": 6.78125, + "learning_rate": 8.244918966120402e-06, + "loss": 1.265984296798706, + "step": 2152 + }, + { + "epoch": 0.6630242400923432, + "grad_norm": 10.1875, + "learning_rate": 8.241617618642655e-06, + "loss": 1.253446102142334, + "step": 2154 + }, + { + "epoch": 0.6636398614851866, + "grad_norm": 10.5625, + "learning_rate": 8.238314043619233e-06, + "loss": 1.6335866451263428, + "step": 2156 + }, + { + "epoch": 0.66425548287803, + "grad_norm": 3.59375, + "learning_rate": 8.235008244332971e-06, + "loss": 1.3307693004608154, + "step": 2158 + }, + { + "epoch": 0.6648711042708734, + "grad_norm": 4.96875, + "learning_rate": 8.23170022406892e-06, + "loss": 1.4052999019622803, + "step": 2160 + }, + { + "epoch": 0.6654867256637168, + "grad_norm": 4.46875, + "learning_rate": 8.228389986114326e-06, + "loss": 1.3534044027328491, + "step": 2162 + }, + { + "epoch": 0.6661023470565602, + "grad_norm": 11.375, + "learning_rate": 8.225077533758656e-06, + "loss": 1.6899948120117188, + "step": 2164 + }, + { + "epoch": 0.6667179684494036, + "grad_norm": 8.6875, + "learning_rate": 8.221762870293564e-06, + "loss": 1.1940737962722778, + "step": 2166 + }, + { + "epoch": 0.667333589842247, + "grad_norm": 21.25, + "learning_rate": 8.218445999012903e-06, + "loss": 1.4204351902008057, + "step": 2168 + }, + { + "epoch": 0.6679492112350904, + "grad_norm": 5.75, + "learning_rate": 8.215126923212724e-06, + "loss": 1.5102076530456543, + "step": 2170 + }, + { + "epoch": 0.6685648326279339, + "grad_norm": 32.25, + "learning_rate": 8.211805646191268e-06, + "loss": 1.3515056371688843, + "step": 2172 + }, + { + "epoch": 0.6691804540207772, + "grad_norm": 18.0, + "learning_rate": 8.208482171248964e-06, + "loss": 1.5586285591125488, + "step": 2174 + }, + { + "epoch": 0.6697960754136206, + "grad_norm": 4.34375, + "learning_rate": 8.205156501688418e-06, + "loss": 1.2292786836624146, + "step": 2176 + }, + { + "epoch": 0.6704116968064641, + "grad_norm": 14.9375, + "learning_rate": 8.201828640814426e-06, + "loss": 1.0528019666671753, + "step": 2178 + }, + { + "epoch": 0.6710273181993074, + "grad_norm": 10.5, + "learning_rate": 8.198498591933961e-06, + "loss": 1.7368333339691162, + "step": 2180 + }, + { + "epoch": 0.6716429395921508, + "grad_norm": 9.6875, + "learning_rate": 8.195166358356163e-06, + "loss": 1.732184648513794, + "step": 2182 + }, + { + "epoch": 0.6722585609849943, + "grad_norm": 6.1875, + "learning_rate": 8.191831943392347e-06, + "loss": 1.375901699066162, + "step": 2184 + }, + { + "epoch": 0.6728741823778376, + "grad_norm": 4.90625, + "learning_rate": 8.188495350355998e-06, + "loss": 1.2327485084533691, + "step": 2186 + }, + { + "epoch": 0.673489803770681, + "grad_norm": 6.875, + "learning_rate": 8.185156582562763e-06, + "loss": 1.2183061838150024, + "step": 2188 + }, + { + "epoch": 0.6741054251635245, + "grad_norm": 4.5, + "learning_rate": 8.181815643330449e-06, + "loss": 1.133251428604126, + "step": 2190 + }, + { + "epoch": 0.6747210465563679, + "grad_norm": 8.875, + "learning_rate": 8.178472535979023e-06, + "loss": 1.6299774646759033, + "step": 2192 + }, + { + "epoch": 0.6753366679492112, + "grad_norm": 8.5, + "learning_rate": 8.175127263830605e-06, + "loss": 1.6542222499847412, + "step": 2194 + }, + { + "epoch": 0.6759522893420546, + "grad_norm": 9.875, + "learning_rate": 8.17177983020947e-06, + "loss": 0.9074614644050598, + "step": 2196 + }, + { + "epoch": 0.6765679107348981, + "grad_norm": 4.71875, + "learning_rate": 8.168430238442033e-06, + "loss": 0.7373267412185669, + "step": 2198 + }, + { + "epoch": 0.6771835321277414, + "grad_norm": 7.125, + "learning_rate": 8.165078491856861e-06, + "loss": 1.0262969732284546, + "step": 2200 + }, + { + "epoch": 0.6777991535205848, + "grad_norm": 8.0625, + "learning_rate": 8.16172459378466e-06, + "loss": 1.5758912563323975, + "step": 2202 + }, + { + "epoch": 0.6784147749134283, + "grad_norm": 4.96875, + "learning_rate": 8.158368547558276e-06, + "loss": 0.9562698006629944, + "step": 2204 + }, + { + "epoch": 0.6790303963062716, + "grad_norm": 6.15625, + "learning_rate": 8.15501035651268e-06, + "loss": 0.8322404623031616, + "step": 2206 + }, + { + "epoch": 0.679646017699115, + "grad_norm": 12.6875, + "learning_rate": 8.15165002398499e-06, + "loss": 1.8041938543319702, + "step": 2208 + }, + { + "epoch": 0.6802616390919585, + "grad_norm": 8.125, + "learning_rate": 8.148287553314438e-06, + "loss": 1.2884726524353027, + "step": 2210 + }, + { + "epoch": 0.6808772604848018, + "grad_norm": 6.9375, + "learning_rate": 8.144922947842391e-06, + "loss": 0.8856427669525146, + "step": 2212 + }, + { + "epoch": 0.6814928818776452, + "grad_norm": 3.09375, + "learning_rate": 8.141556210912328e-06, + "loss": 1.3027023077011108, + "step": 2214 + }, + { + "epoch": 0.6821085032704887, + "grad_norm": 15.1875, + "learning_rate": 8.138187345869855e-06, + "loss": 1.2902532815933228, + "step": 2216 + }, + { + "epoch": 0.6827241246633321, + "grad_norm": 3.671875, + "learning_rate": 8.134816356062684e-06, + "loss": 1.1476560831069946, + "step": 2218 + }, + { + "epoch": 0.6833397460561754, + "grad_norm": 1.6875, + "learning_rate": 8.131443244840651e-06, + "loss": 1.194162130355835, + "step": 2220 + }, + { + "epoch": 0.6839553674490189, + "grad_norm": 6.21875, + "learning_rate": 8.128068015555686e-06, + "loss": 1.206117033958435, + "step": 2222 + }, + { + "epoch": 0.6845709888418623, + "grad_norm": 4.1875, + "learning_rate": 8.12469067156183e-06, + "loss": 1.229992389678955, + "step": 2224 + }, + { + "epoch": 0.6851866102347056, + "grad_norm": 5.625, + "learning_rate": 8.121311216215229e-06, + "loss": 1.1885910034179688, + "step": 2226 + }, + { + "epoch": 0.685802231627549, + "grad_norm": 7.21875, + "learning_rate": 8.117929652874119e-06, + "loss": 1.3053927421569824, + "step": 2228 + }, + { + "epoch": 0.6864178530203925, + "grad_norm": 7.15625, + "learning_rate": 8.114545984898838e-06, + "loss": 1.7367584705352783, + "step": 2230 + }, + { + "epoch": 0.6870334744132358, + "grad_norm": 9.875, + "learning_rate": 8.111160215651817e-06, + "loss": 1.3689879179000854, + "step": 2232 + }, + { + "epoch": 0.6876490958060792, + "grad_norm": 7.03125, + "learning_rate": 8.107772348497563e-06, + "loss": 1.7468862533569336, + "step": 2234 + }, + { + "epoch": 0.6882647171989227, + "grad_norm": 8.75, + "learning_rate": 8.104382386802678e-06, + "loss": 1.7702608108520508, + "step": 2236 + }, + { + "epoch": 0.6888803385917661, + "grad_norm": 5.6875, + "learning_rate": 8.100990333935845e-06, + "loss": 1.2906309366226196, + "step": 2238 + }, + { + "epoch": 0.6894959599846094, + "grad_norm": 5.6875, + "learning_rate": 8.09759619326782e-06, + "loss": 1.4636949300765991, + "step": 2240 + }, + { + "epoch": 0.6901115813774529, + "grad_norm": 6.15625, + "learning_rate": 8.09419996817144e-06, + "loss": 1.3649095296859741, + "step": 2242 + }, + { + "epoch": 0.6907272027702963, + "grad_norm": 56.5, + "learning_rate": 8.090801662021609e-06, + "loss": 1.213357925415039, + "step": 2244 + }, + { + "epoch": 0.6913428241631396, + "grad_norm": 13.75, + "learning_rate": 8.087401278195297e-06, + "loss": 1.6974331140518188, + "step": 2246 + }, + { + "epoch": 0.6919584455559831, + "grad_norm": 7.65625, + "learning_rate": 8.083998820071545e-06, + "loss": 1.4499943256378174, + "step": 2248 + }, + { + "epoch": 0.6925740669488265, + "grad_norm": 4.25, + "learning_rate": 8.080594291031451e-06, + "loss": 1.1680039167404175, + "step": 2250 + }, + { + "epoch": 0.6931896883416698, + "grad_norm": 6.3125, + "learning_rate": 8.077187694458175e-06, + "loss": 1.6170227527618408, + "step": 2252 + }, + { + "epoch": 0.6938053097345133, + "grad_norm": 10.5625, + "learning_rate": 8.073779033736922e-06, + "loss": 0.6868647336959839, + "step": 2254 + }, + { + "epoch": 0.6944209311273567, + "grad_norm": 7.34375, + "learning_rate": 8.070368312254956e-06, + "loss": 1.337400197982788, + "step": 2256 + }, + { + "epoch": 0.6950365525202, + "grad_norm": 4.71875, + "learning_rate": 8.066955533401593e-06, + "loss": 1.283442497253418, + "step": 2258 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 8.4375, + "learning_rate": 8.063540700568182e-06, + "loss": 1.2001821994781494, + "step": 2260 + }, + { + "epoch": 0.6962677953058869, + "grad_norm": 12.25, + "learning_rate": 8.06012381714812e-06, + "loss": 1.5367563962936401, + "step": 2262 + }, + { + "epoch": 0.6968834166987303, + "grad_norm": 9.625, + "learning_rate": 8.056704886536844e-06, + "loss": 1.5206408500671387, + "step": 2264 + }, + { + "epoch": 0.6974990380915737, + "grad_norm": 3.328125, + "learning_rate": 8.053283912131817e-06, + "loss": 1.2205138206481934, + "step": 2266 + }, + { + "epoch": 0.6981146594844171, + "grad_norm": 11.375, + "learning_rate": 8.04986089733254e-06, + "loss": 1.4563145637512207, + "step": 2268 + }, + { + "epoch": 0.6987302808772605, + "grad_norm": 4.75, + "learning_rate": 8.046435845540543e-06, + "loss": 1.5146596431732178, + "step": 2270 + }, + { + "epoch": 0.6993459022701038, + "grad_norm": 8.0625, + "learning_rate": 8.043008760159372e-06, + "loss": 1.3689314126968384, + "step": 2272 + }, + { + "epoch": 0.6999615236629473, + "grad_norm": 14.9375, + "learning_rate": 8.0395796445946e-06, + "loss": 1.366464614868164, + "step": 2274 + }, + { + "epoch": 0.7005771450557907, + "grad_norm": 2.40625, + "learning_rate": 8.036148502253816e-06, + "loss": 1.3394603729248047, + "step": 2276 + }, + { + "epoch": 0.701192766448634, + "grad_norm": 8.8125, + "learning_rate": 8.032715336546627e-06, + "loss": 1.3192006349563599, + "step": 2278 + }, + { + "epoch": 0.7018083878414775, + "grad_norm": 2.734375, + "learning_rate": 8.029280150884637e-06, + "loss": 1.1054191589355469, + "step": 2280 + }, + { + "epoch": 0.7024240092343209, + "grad_norm": 5.9375, + "learning_rate": 8.025842948681477e-06, + "loss": 1.1914633512496948, + "step": 2282 + }, + { + "epoch": 0.7030396306271643, + "grad_norm": 6.71875, + "learning_rate": 8.022403733352767e-06, + "loss": 0.8835414052009583, + "step": 2284 + }, + { + "epoch": 0.7036552520200077, + "grad_norm": 7.0625, + "learning_rate": 8.018962508316132e-06, + "loss": 1.3785141706466675, + "step": 2286 + }, + { + "epoch": 0.7042708734128511, + "grad_norm": 2.3125, + "learning_rate": 8.015519276991199e-06, + "loss": 1.1339272260665894, + "step": 2288 + }, + { + "epoch": 0.7048864948056945, + "grad_norm": 15.125, + "learning_rate": 8.012074042799578e-06, + "loss": 1.2178083658218384, + "step": 2290 + }, + { + "epoch": 0.7055021161985379, + "grad_norm": 4.1875, + "learning_rate": 8.008626809164878e-06, + "loss": 1.0197231769561768, + "step": 2292 + }, + { + "epoch": 0.7061177375913813, + "grad_norm": 6.21875, + "learning_rate": 8.005177579512698e-06, + "loss": 1.201414942741394, + "step": 2294 + }, + { + "epoch": 0.7067333589842247, + "grad_norm": 6.5, + "learning_rate": 8.001726357270602e-06, + "loss": 1.4018722772598267, + "step": 2296 + }, + { + "epoch": 0.7073489803770681, + "grad_norm": 7.84375, + "learning_rate": 7.998273145868161e-06, + "loss": 1.2003297805786133, + "step": 2298 + }, + { + "epoch": 0.7079646017699115, + "grad_norm": 3.765625, + "learning_rate": 7.994817948736898e-06, + "loss": 1.143210530281067, + "step": 2300 + }, + { + "epoch": 0.7085802231627549, + "grad_norm": 6.5625, + "learning_rate": 7.991360769310324e-06, + "loss": 1.1977264881134033, + "step": 2302 + }, + { + "epoch": 0.7091958445555983, + "grad_norm": 34.75, + "learning_rate": 7.987901611023918e-06, + "loss": 1.6158286333084106, + "step": 2304 + }, + { + "epoch": 0.7098114659484417, + "grad_norm": 17.75, + "learning_rate": 7.984440477315118e-06, + "loss": 1.3872449398040771, + "step": 2306 + }, + { + "epoch": 0.7104270873412851, + "grad_norm": 14.125, + "learning_rate": 7.980977371623335e-06, + "loss": 1.1497303247451782, + "step": 2308 + }, + { + "epoch": 0.7110427087341286, + "grad_norm": 5.46875, + "learning_rate": 7.977512297389931e-06, + "loss": 1.0877271890640259, + "step": 2310 + }, + { + "epoch": 0.7116583301269719, + "grad_norm": 8.0625, + "learning_rate": 7.97404525805823e-06, + "loss": 1.3802058696746826, + "step": 2312 + }, + { + "epoch": 0.7122739515198153, + "grad_norm": 4.0, + "learning_rate": 7.970576257073506e-06, + "loss": 1.307868480682373, + "step": 2314 + }, + { + "epoch": 0.7128895729126588, + "grad_norm": 4.5, + "learning_rate": 7.967105297882984e-06, + "loss": 1.2434139251708984, + "step": 2316 + }, + { + "epoch": 0.7135051943055021, + "grad_norm": 9.0, + "learning_rate": 7.963632383935834e-06, + "loss": 1.3490757942199707, + "step": 2318 + }, + { + "epoch": 0.7141208156983455, + "grad_norm": 9.375, + "learning_rate": 7.960157518683164e-06, + "loss": 1.2088630199432373, + "step": 2320 + }, + { + "epoch": 0.714736437091189, + "grad_norm": 9.625, + "learning_rate": 7.956680705578033e-06, + "loss": 0.7232799530029297, + "step": 2322 + }, + { + "epoch": 0.7153520584840323, + "grad_norm": 6.25, + "learning_rate": 7.953201948075423e-06, + "loss": 1.441251277923584, + "step": 2324 + }, + { + "epoch": 0.7159676798768757, + "grad_norm": 8.875, + "learning_rate": 7.949721249632251e-06, + "loss": 1.3019921779632568, + "step": 2326 + }, + { + "epoch": 0.7165833012697191, + "grad_norm": 7.625, + "learning_rate": 7.946238613707374e-06, + "loss": 1.57804274559021, + "step": 2328 + }, + { + "epoch": 0.7171989226625626, + "grad_norm": 13.25, + "learning_rate": 7.942754043761558e-06, + "loss": 1.4858185052871704, + "step": 2330 + }, + { + "epoch": 0.7178145440554059, + "grad_norm": 3.609375, + "learning_rate": 7.9392675432575e-06, + "loss": 1.1838147640228271, + "step": 2332 + }, + { + "epoch": 0.7184301654482493, + "grad_norm": 7.5, + "learning_rate": 7.935779115659813e-06, + "loss": 1.2579776048660278, + "step": 2334 + }, + { + "epoch": 0.7190457868410928, + "grad_norm": 6.0, + "learning_rate": 7.932288764435028e-06, + "loss": 1.0854580402374268, + "step": 2336 + }, + { + "epoch": 0.7196614082339361, + "grad_norm": 4.9375, + "learning_rate": 7.928796493051582e-06, + "loss": 1.3975675106048584, + "step": 2338 + }, + { + "epoch": 0.7202770296267795, + "grad_norm": 4.75, + "learning_rate": 7.925302304979827e-06, + "loss": 1.115556001663208, + "step": 2340 + }, + { + "epoch": 0.720892651019623, + "grad_norm": 9.4375, + "learning_rate": 7.921806203692017e-06, + "loss": 1.4209973812103271, + "step": 2342 + }, + { + "epoch": 0.7215082724124663, + "grad_norm": 7.90625, + "learning_rate": 7.918308192662298e-06, + "loss": 1.6648054122924805, + "step": 2344 + }, + { + "epoch": 0.7221238938053097, + "grad_norm": 7.5, + "learning_rate": 7.914808275366733e-06, + "loss": 1.1019439697265625, + "step": 2346 + }, + { + "epoch": 0.7227395151981532, + "grad_norm": 4.125, + "learning_rate": 7.911306455283258e-06, + "loss": 1.1492326259613037, + "step": 2348 + }, + { + "epoch": 0.7233551365909965, + "grad_norm": 6.65625, + "learning_rate": 7.907802735891716e-06, + "loss": 1.2494227886199951, + "step": 2350 + }, + { + "epoch": 0.7239707579838399, + "grad_norm": 5.78125, + "learning_rate": 7.904297120673831e-06, + "loss": 1.4483420848846436, + "step": 2352 + }, + { + "epoch": 0.7245863793766834, + "grad_norm": 2.453125, + "learning_rate": 7.900789613113214e-06, + "loss": 1.2527090311050415, + "step": 2354 + }, + { + "epoch": 0.7252020007695268, + "grad_norm": 7.59375, + "learning_rate": 7.897280216695346e-06, + "loss": 1.3328126668930054, + "step": 2356 + }, + { + "epoch": 0.7258176221623701, + "grad_norm": 4.3125, + "learning_rate": 7.893768934907599e-06, + "loss": 1.2253814935684204, + "step": 2358 + }, + { + "epoch": 0.7264332435552135, + "grad_norm": 24.625, + "learning_rate": 7.89025577123921e-06, + "loss": 1.8969019651412964, + "step": 2360 + }, + { + "epoch": 0.727048864948057, + "grad_norm": 5.8125, + "learning_rate": 7.886740729181292e-06, + "loss": 1.4229713678359985, + "step": 2362 + }, + { + "epoch": 0.7276644863409003, + "grad_norm": 4.5625, + "learning_rate": 7.883223812226817e-06, + "loss": 1.3291300535202026, + "step": 2364 + }, + { + "epoch": 0.7282801077337437, + "grad_norm": 4.625, + "learning_rate": 7.879705023870626e-06, + "loss": 1.1539807319641113, + "step": 2366 + }, + { + "epoch": 0.7288957291265872, + "grad_norm": 8.4375, + "learning_rate": 7.876184367609418e-06, + "loss": 1.536054015159607, + "step": 2368 + }, + { + "epoch": 0.7295113505194305, + "grad_norm": 5.125, + "learning_rate": 7.872661846941747e-06, + "loss": 1.4130350351333618, + "step": 2370 + }, + { + "epoch": 0.7301269719122739, + "grad_norm": 9.4375, + "learning_rate": 7.869137465368023e-06, + "loss": 1.6681222915649414, + "step": 2372 + }, + { + "epoch": 0.7307425933051174, + "grad_norm": 6.28125, + "learning_rate": 7.865611226390499e-06, + "loss": 1.2161335945129395, + "step": 2374 + }, + { + "epoch": 0.7313582146979608, + "grad_norm": 10.875, + "learning_rate": 7.862083133513281e-06, + "loss": 1.5017954111099243, + "step": 2376 + }, + { + "epoch": 0.7319738360908041, + "grad_norm": 7.78125, + "learning_rate": 7.858553190242314e-06, + "loss": 1.7430108785629272, + "step": 2378 + }, + { + "epoch": 0.7325894574836476, + "grad_norm": 6.75, + "learning_rate": 7.855021400085378e-06, + "loss": 1.3919011354446411, + "step": 2380 + }, + { + "epoch": 0.733205078876491, + "grad_norm": 6.0, + "learning_rate": 7.851487766552097e-06, + "loss": 1.5471138954162598, + "step": 2382 + }, + { + "epoch": 0.7338207002693343, + "grad_norm": 6.5625, + "learning_rate": 7.847952293153923e-06, + "loss": 1.1629184484481812, + "step": 2384 + }, + { + "epoch": 0.7344363216621778, + "grad_norm": 7.3125, + "learning_rate": 7.844414983404128e-06, + "loss": 1.168907880783081, + "step": 2386 + }, + { + "epoch": 0.7350519430550212, + "grad_norm": 6.5, + "learning_rate": 7.840875840817819e-06, + "loss": 1.5863640308380127, + "step": 2388 + }, + { + "epoch": 0.7356675644478645, + "grad_norm": 4.28125, + "learning_rate": 7.837334868911923e-06, + "loss": 1.121029257774353, + "step": 2390 + }, + { + "epoch": 0.736283185840708, + "grad_norm": 4.5, + "learning_rate": 7.833792071205184e-06, + "loss": 1.3915820121765137, + "step": 2392 + }, + { + "epoch": 0.7368988072335514, + "grad_norm": 4.375, + "learning_rate": 7.830247451218158e-06, + "loss": 1.2680091857910156, + "step": 2394 + }, + { + "epoch": 0.7375144286263947, + "grad_norm": 5.6875, + "learning_rate": 7.826701012473213e-06, + "loss": 1.2384248971939087, + "step": 2396 + }, + { + "epoch": 0.7381300500192381, + "grad_norm": 4.65625, + "learning_rate": 7.823152758494523e-06, + "loss": 1.1445600986480713, + "step": 2398 + }, + { + "epoch": 0.7387456714120816, + "grad_norm": 12.75, + "learning_rate": 7.81960269280807e-06, + "loss": 1.365366816520691, + "step": 2400 + }, + { + "epoch": 0.739361292804925, + "grad_norm": 5.59375, + "learning_rate": 7.816050818941634e-06, + "loss": 1.353005051612854, + "step": 2402 + }, + { + "epoch": 0.7399769141977683, + "grad_norm": 5.875, + "learning_rate": 7.81249714042479e-06, + "loss": 1.2653348445892334, + "step": 2404 + }, + { + "epoch": 0.7405925355906118, + "grad_norm": 7.03125, + "learning_rate": 7.80894166078891e-06, + "loss": 1.3015531301498413, + "step": 2406 + }, + { + "epoch": 0.7412081569834552, + "grad_norm": 3.6875, + "learning_rate": 7.805384383567152e-06, + "loss": 1.2278153896331787, + "step": 2408 + }, + { + "epoch": 0.7418237783762985, + "grad_norm": 9.6875, + "learning_rate": 7.801825312294465e-06, + "loss": 1.3797082901000977, + "step": 2410 + }, + { + "epoch": 0.742439399769142, + "grad_norm": 2.578125, + "learning_rate": 7.798264450507573e-06, + "loss": 1.002259373664856, + "step": 2412 + }, + { + "epoch": 0.7430550211619854, + "grad_norm": 13.0625, + "learning_rate": 7.794701801744989e-06, + "loss": 1.1419209241867065, + "step": 2414 + }, + { + "epoch": 0.7436706425548287, + "grad_norm": 8.1875, + "learning_rate": 7.791137369546992e-06, + "loss": 1.7288269996643066, + "step": 2416 + }, + { + "epoch": 0.7442862639476722, + "grad_norm": 2.4375, + "learning_rate": 7.787571157455643e-06, + "loss": 1.265411615371704, + "step": 2418 + }, + { + "epoch": 0.7449018853405156, + "grad_norm": 5.5, + "learning_rate": 7.784003169014764e-06, + "loss": 1.294354796409607, + "step": 2420 + }, + { + "epoch": 0.745517506733359, + "grad_norm": 4.625, + "learning_rate": 7.780433407769948e-06, + "loss": 1.1728662252426147, + "step": 2422 + }, + { + "epoch": 0.7461331281262024, + "grad_norm": 5.3125, + "learning_rate": 7.776861877268544e-06, + "loss": 1.1810388565063477, + "step": 2424 + }, + { + "epoch": 0.7467487495190458, + "grad_norm": 9.0, + "learning_rate": 7.773288581059661e-06, + "loss": 0.8759303689002991, + "step": 2426 + }, + { + "epoch": 0.7473643709118892, + "grad_norm": 5.71875, + "learning_rate": 7.769713522694167e-06, + "loss": 1.3331336975097656, + "step": 2428 + }, + { + "epoch": 0.7479799923047326, + "grad_norm": 4.5, + "learning_rate": 7.766136705724675e-06, + "loss": 0.8516298532485962, + "step": 2430 + }, + { + "epoch": 0.748595613697576, + "grad_norm": 20.875, + "learning_rate": 7.762558133705551e-06, + "loss": 1.1931649446487427, + "step": 2432 + }, + { + "epoch": 0.7492112350904194, + "grad_norm": 3.953125, + "learning_rate": 7.758977810192898e-06, + "loss": 1.3147087097167969, + "step": 2434 + }, + { + "epoch": 0.7498268564832627, + "grad_norm": 12.0, + "learning_rate": 7.755395738744567e-06, + "loss": 1.4341905117034912, + "step": 2436 + }, + { + "epoch": 0.7504424778761062, + "grad_norm": 4.46875, + "learning_rate": 7.751811922920141e-06, + "loss": 1.2278903722763062, + "step": 2438 + }, + { + "epoch": 0.7510580992689496, + "grad_norm": 7.25, + "learning_rate": 7.74822636628094e-06, + "loss": 1.236400842666626, + "step": 2440 + }, + { + "epoch": 0.7516737206617929, + "grad_norm": 19.875, + "learning_rate": 7.744639072390013e-06, + "loss": 1.4345285892486572, + "step": 2442 + }, + { + "epoch": 0.7522893420546364, + "grad_norm": 7.78125, + "learning_rate": 7.741050044812128e-06, + "loss": 1.4369693994522095, + "step": 2444 + }, + { + "epoch": 0.7529049634474798, + "grad_norm": 3.859375, + "learning_rate": 7.737459287113789e-06, + "loss": 1.2332228422164917, + "step": 2446 + }, + { + "epoch": 0.7535205848403232, + "grad_norm": 7.25, + "learning_rate": 7.733866802863207e-06, + "loss": 1.5218693017959595, + "step": 2448 + }, + { + "epoch": 0.7541362062331666, + "grad_norm": 1.5703125, + "learning_rate": 7.730272595630322e-06, + "loss": 1.0981626510620117, + "step": 2450 + }, + { + "epoch": 0.75475182762601, + "grad_norm": 3.484375, + "learning_rate": 7.726676668986769e-06, + "loss": 0.9859358668327332, + "step": 2452 + }, + { + "epoch": 0.7553674490188534, + "grad_norm": 7.90625, + "learning_rate": 7.723079026505907e-06, + "loss": 1.6938118934631348, + "step": 2454 + }, + { + "epoch": 0.7559830704116968, + "grad_norm": 3.78125, + "learning_rate": 7.719479671762788e-06, + "loss": 1.2973098754882812, + "step": 2456 + }, + { + "epoch": 0.7565986918045402, + "grad_norm": 5.40625, + "learning_rate": 7.71587860833418e-06, + "loss": 1.4085596799850464, + "step": 2458 + }, + { + "epoch": 0.7572143131973836, + "grad_norm": 6.90625, + "learning_rate": 7.712275839798536e-06, + "loss": 1.2574015855789185, + "step": 2460 + }, + { + "epoch": 0.757829934590227, + "grad_norm": 5.0, + "learning_rate": 7.708671369736007e-06, + "loss": 1.3973033428192139, + "step": 2462 + }, + { + "epoch": 0.7584455559830704, + "grad_norm": 6.96875, + "learning_rate": 7.705065201728436e-06, + "loss": 1.380937099456787, + "step": 2464 + }, + { + "epoch": 0.7590611773759138, + "grad_norm": 6.8125, + "learning_rate": 7.701457339359356e-06, + "loss": 1.4758228063583374, + "step": 2466 + }, + { + "epoch": 0.7596767987687573, + "grad_norm": 3.71875, + "learning_rate": 7.697847786213974e-06, + "loss": 1.2328314781188965, + "step": 2468 + }, + { + "epoch": 0.7602924201616006, + "grad_norm": 2.859375, + "learning_rate": 7.69423654587919e-06, + "loss": 1.1552143096923828, + "step": 2470 + }, + { + "epoch": 0.760908041554444, + "grad_norm": 6.875, + "learning_rate": 7.690623621943574e-06, + "loss": 1.1700533628463745, + "step": 2472 + }, + { + "epoch": 0.7615236629472875, + "grad_norm": 24.0, + "learning_rate": 7.687009017997369e-06, + "loss": 1.7394222021102905, + "step": 2474 + }, + { + "epoch": 0.7621392843401308, + "grad_norm": 5.40625, + "learning_rate": 7.683392737632484e-06, + "loss": 1.5715559720993042, + "step": 2476 + }, + { + "epoch": 0.7627549057329742, + "grad_norm": 4.21875, + "learning_rate": 7.679774784442504e-06, + "loss": 1.3047338724136353, + "step": 2478 + }, + { + "epoch": 0.7633705271258177, + "grad_norm": 6.03125, + "learning_rate": 7.676155162022664e-06, + "loss": 1.3770108222961426, + "step": 2480 + }, + { + "epoch": 0.763986148518661, + "grad_norm": 7.65625, + "learning_rate": 7.672533873969867e-06, + "loss": 1.4164502620697021, + "step": 2482 + }, + { + "epoch": 0.7646017699115044, + "grad_norm": 9.4375, + "learning_rate": 7.66891092388267e-06, + "loss": 1.746399164199829, + "step": 2484 + }, + { + "epoch": 0.7652173913043478, + "grad_norm": 5.375, + "learning_rate": 7.66528631536128e-06, + "loss": 1.126002311706543, + "step": 2486 + }, + { + "epoch": 0.7658330126971912, + "grad_norm": 3.953125, + "learning_rate": 7.661660052007547e-06, + "loss": 1.0803347826004028, + "step": 2488 + }, + { + "epoch": 0.7664486340900346, + "grad_norm": 6.84375, + "learning_rate": 7.658032137424973e-06, + "loss": 1.6090704202651978, + "step": 2490 + }, + { + "epoch": 0.767064255482878, + "grad_norm": 5.625, + "learning_rate": 7.654402575218698e-06, + "loss": 1.215574026107788, + "step": 2492 + }, + { + "epoch": 0.7676798768757215, + "grad_norm": 8.6875, + "learning_rate": 7.6507713689955e-06, + "loss": 0.8813379406929016, + "step": 2494 + }, + { + "epoch": 0.7682954982685648, + "grad_norm": 7.65625, + "learning_rate": 7.647138522363788e-06, + "loss": 1.4518318176269531, + "step": 2496 + }, + { + "epoch": 0.7689111196614082, + "grad_norm": 7.21875, + "learning_rate": 7.643504038933607e-06, + "loss": 1.2638100385665894, + "step": 2498 + }, + { + "epoch": 0.7695267410542517, + "grad_norm": 2.109375, + "learning_rate": 7.639867922316616e-06, + "loss": 1.2956639528274536, + "step": 2500 + }, + { + "epoch": 0.770142362447095, + "grad_norm": 4.78125, + "learning_rate": 7.636230176126116e-06, + "loss": 1.4407836198806763, + "step": 2502 + }, + { + "epoch": 0.7707579838399384, + "grad_norm": 5.96875, + "learning_rate": 7.632590803977014e-06, + "loss": 1.2232799530029297, + "step": 2504 + }, + { + "epoch": 0.7713736052327819, + "grad_norm": 6.0625, + "learning_rate": 7.628949809485832e-06, + "loss": 1.1274707317352295, + "step": 2506 + }, + { + "epoch": 0.7719892266256252, + "grad_norm": 7.625, + "learning_rate": 7.62530719627071e-06, + "loss": 1.583906888961792, + "step": 2508 + }, + { + "epoch": 0.7726048480184686, + "grad_norm": 5.1875, + "learning_rate": 7.621662967951395e-06, + "loss": 1.2432018518447876, + "step": 2510 + }, + { + "epoch": 0.7732204694113121, + "grad_norm": 7.90625, + "learning_rate": 7.618017128149238e-06, + "loss": 1.9361577033996582, + "step": 2512 + }, + { + "epoch": 0.7738360908041555, + "grad_norm": 3.671875, + "learning_rate": 7.61436968048719e-06, + "loss": 1.390897512435913, + "step": 2514 + }, + { + "epoch": 0.7744517121969988, + "grad_norm": 3.78125, + "learning_rate": 7.610720628589805e-06, + "loss": 1.0258307456970215, + "step": 2516 + }, + { + "epoch": 0.7750673335898423, + "grad_norm": 4.8125, + "learning_rate": 7.607069976083226e-06, + "loss": 1.1963353157043457, + "step": 2518 + }, + { + "epoch": 0.7756829549826857, + "grad_norm": 5.8125, + "learning_rate": 7.6034177265951855e-06, + "loss": 1.3238426446914673, + "step": 2520 + }, + { + "epoch": 0.776298576375529, + "grad_norm": 4.71875, + "learning_rate": 7.599763883755009e-06, + "loss": 1.54075026512146, + "step": 2522 + }, + { + "epoch": 0.7769141977683724, + "grad_norm": 4.6875, + "learning_rate": 7.596108451193602e-06, + "loss": 1.1980767250061035, + "step": 2524 + }, + { + "epoch": 0.7775298191612159, + "grad_norm": 4.46875, + "learning_rate": 7.5924514325434484e-06, + "loss": 1.0477863550186157, + "step": 2526 + }, + { + "epoch": 0.7781454405540592, + "grad_norm": 4.25, + "learning_rate": 7.5887928314386115e-06, + "loss": 1.2851420640945435, + "step": 2528 + }, + { + "epoch": 0.7787610619469026, + "grad_norm": 4.09375, + "learning_rate": 7.585132651514722e-06, + "loss": 1.459242343902588, + "step": 2530 + }, + { + "epoch": 0.7793766833397461, + "grad_norm": 7.40625, + "learning_rate": 7.581470896408984e-06, + "loss": 1.0518429279327393, + "step": 2532 + }, + { + "epoch": 0.7799923047325894, + "grad_norm": 3.671875, + "learning_rate": 7.577807569760169e-06, + "loss": 1.1042135953903198, + "step": 2534 + }, + { + "epoch": 0.7806079261254328, + "grad_norm": 6.96875, + "learning_rate": 7.574142675208602e-06, + "loss": 1.0141704082489014, + "step": 2536 + }, + { + "epoch": 0.7812235475182763, + "grad_norm": 4.625, + "learning_rate": 7.570476216396174e-06, + "loss": 1.2268452644348145, + "step": 2538 + }, + { + "epoch": 0.7818391689111197, + "grad_norm": 9.6875, + "learning_rate": 7.566808196966326e-06, + "loss": 0.9167345762252808, + "step": 2540 + }, + { + "epoch": 0.782454790303963, + "grad_norm": 2.5, + "learning_rate": 7.563138620564052e-06, + "loss": 1.2149308919906616, + "step": 2542 + }, + { + "epoch": 0.7830704116968065, + "grad_norm": 7.3125, + "learning_rate": 7.55946749083589e-06, + "loss": 0.9064674973487854, + "step": 2544 + }, + { + "epoch": 0.7836860330896499, + "grad_norm": 5.03125, + "learning_rate": 7.5557948114299265e-06, + "loss": 1.6139088869094849, + "step": 2546 + }, + { + "epoch": 0.7843016544824932, + "grad_norm": 12.5, + "learning_rate": 7.552120585995786e-06, + "loss": 1.1430484056472778, + "step": 2548 + }, + { + "epoch": 0.7849172758753367, + "grad_norm": 10.375, + "learning_rate": 7.548444818184626e-06, + "loss": 1.3700685501098633, + "step": 2550 + }, + { + "epoch": 0.7855328972681801, + "grad_norm": 5.5, + "learning_rate": 7.544767511649138e-06, + "loss": 1.3238214254379272, + "step": 2552 + }, + { + "epoch": 0.7861485186610234, + "grad_norm": 5.625, + "learning_rate": 7.541088670043548e-06, + "loss": 1.196365237236023, + "step": 2554 + }, + { + "epoch": 0.7867641400538669, + "grad_norm": 9.75, + "learning_rate": 7.537408297023605e-06, + "loss": 1.2222591638565063, + "step": 2556 + }, + { + "epoch": 0.7873797614467103, + "grad_norm": 8.0625, + "learning_rate": 7.5337263962465704e-06, + "loss": 1.451030969619751, + "step": 2558 + }, + { + "epoch": 0.7879953828395537, + "grad_norm": 3.5625, + "learning_rate": 7.5300429713712385e-06, + "loss": 1.190792441368103, + "step": 2560 + }, + { + "epoch": 0.788611004232397, + "grad_norm": 8.8125, + "learning_rate": 7.5263580260579096e-06, + "loss": 1.4873063564300537, + "step": 2562 + }, + { + "epoch": 0.7892266256252405, + "grad_norm": 4.0625, + "learning_rate": 7.5226715639683936e-06, + "loss": 1.1327518224716187, + "step": 2564 + }, + { + "epoch": 0.7898422470180839, + "grad_norm": 6.375, + "learning_rate": 7.518983588766013e-06, + "loss": 1.3757460117340088, + "step": 2566 + }, + { + "epoch": 0.7904578684109272, + "grad_norm": 3.109375, + "learning_rate": 7.515294104115592e-06, + "loss": 1.0891473293304443, + "step": 2568 + }, + { + "epoch": 0.7910734898037707, + "grad_norm": 5.1875, + "learning_rate": 7.511603113683452e-06, + "loss": 1.2719448804855347, + "step": 2570 + }, + { + "epoch": 0.7916891111966141, + "grad_norm": 5.96875, + "learning_rate": 7.507910621137413e-06, + "loss": 1.379211187362671, + "step": 2572 + }, + { + "epoch": 0.7923047325894574, + "grad_norm": 7.65625, + "learning_rate": 7.5042166301467904e-06, + "loss": 1.6094646453857422, + "step": 2574 + }, + { + "epoch": 0.7929203539823009, + "grad_norm": 6.5625, + "learning_rate": 7.500521144382385e-06, + "loss": 1.4890835285186768, + "step": 2576 + }, + { + "epoch": 0.7935359753751443, + "grad_norm": 13.625, + "learning_rate": 7.496824167516481e-06, + "loss": 1.1069023609161377, + "step": 2578 + }, + { + "epoch": 0.7941515967679876, + "grad_norm": 3.5, + "learning_rate": 7.49312570322285e-06, + "loss": 0.9310998916625977, + "step": 2580 + }, + { + "epoch": 0.7947672181608311, + "grad_norm": 7.71875, + "learning_rate": 7.489425755176738e-06, + "loss": 1.2464100122451782, + "step": 2582 + }, + { + "epoch": 0.7953828395536745, + "grad_norm": 5.03125, + "learning_rate": 7.4857243270548666e-06, + "loss": 1.2356469631195068, + "step": 2584 + }, + { + "epoch": 0.7959984609465179, + "grad_norm": 7.625, + "learning_rate": 7.482021422535428e-06, + "loss": 1.5616161823272705, + "step": 2586 + }, + { + "epoch": 0.7966140823393613, + "grad_norm": 24.25, + "learning_rate": 7.47831704529808e-06, + "loss": 1.4321647882461548, + "step": 2588 + }, + { + "epoch": 0.7972297037322047, + "grad_norm": 5.0625, + "learning_rate": 7.474611199023949e-06, + "loss": 1.0948270559310913, + "step": 2590 + }, + { + "epoch": 0.7978453251250481, + "grad_norm": 5.9375, + "learning_rate": 7.470903887395611e-06, + "loss": 1.3025152683258057, + "step": 2592 + }, + { + "epoch": 0.7984609465178915, + "grad_norm": 9.25, + "learning_rate": 7.46719511409711e-06, + "loss": 1.2904844284057617, + "step": 2594 + }, + { + "epoch": 0.7990765679107349, + "grad_norm": 6.71875, + "learning_rate": 7.463484882813938e-06, + "loss": 1.301687240600586, + "step": 2596 + }, + { + "epoch": 0.7996921893035783, + "grad_norm": 11.625, + "learning_rate": 7.459773197233031e-06, + "loss": 1.275160551071167, + "step": 2598 + }, + { + "epoch": 0.8003078106964217, + "grad_norm": 8.625, + "learning_rate": 7.456060061042774e-06, + "loss": 1.452974557876587, + "step": 2600 + }, + { + "epoch": 0.8009234320892651, + "grad_norm": 3.3125, + "learning_rate": 7.452345477932999e-06, + "loss": 1.0892770290374756, + "step": 2602 + }, + { + "epoch": 0.8015390534821085, + "grad_norm": 8.75, + "learning_rate": 7.4486294515949665e-06, + "loss": 1.018752932548523, + "step": 2604 + }, + { + "epoch": 0.802154674874952, + "grad_norm": 27.375, + "learning_rate": 7.4449119857213725e-06, + "loss": 1.1587097644805908, + "step": 2606 + }, + { + "epoch": 0.8027702962677953, + "grad_norm": 6.125, + "learning_rate": 7.441193084006353e-06, + "loss": 1.2957143783569336, + "step": 2608 + }, + { + "epoch": 0.8033859176606387, + "grad_norm": 9.875, + "learning_rate": 7.437472750145458e-06, + "loss": 1.380386471748352, + "step": 2610 + }, + { + "epoch": 0.8040015390534822, + "grad_norm": 12.875, + "learning_rate": 7.433750987835668e-06, + "loss": 0.9111630916595459, + "step": 2612 + }, + { + "epoch": 0.8046171604463255, + "grad_norm": 1.9765625, + "learning_rate": 7.430027800775386e-06, + "loss": 1.0486061573028564, + "step": 2614 + }, + { + "epoch": 0.8052327818391689, + "grad_norm": 7.71875, + "learning_rate": 7.426303192664421e-06, + "loss": 1.111664056777954, + "step": 2616 + }, + { + "epoch": 0.8058484032320123, + "grad_norm": 5.21875, + "learning_rate": 7.422577167204003e-06, + "loss": 1.3210314512252808, + "step": 2618 + }, + { + "epoch": 0.8064640246248557, + "grad_norm": 5.34375, + "learning_rate": 7.418849728096767e-06, + "loss": 1.2019422054290771, + "step": 2620 + }, + { + "epoch": 0.8070796460176991, + "grad_norm": 41.5, + "learning_rate": 7.415120879046749e-06, + "loss": 0.9851787090301514, + "step": 2622 + }, + { + "epoch": 0.8076952674105425, + "grad_norm": 2.6875, + "learning_rate": 7.411390623759392e-06, + "loss": 1.1129413843154907, + "step": 2624 + }, + { + "epoch": 0.8083108888033859, + "grad_norm": 6.53125, + "learning_rate": 7.407658965941535e-06, + "loss": 1.2529385089874268, + "step": 2626 + }, + { + "epoch": 0.8089265101962293, + "grad_norm": 12.0625, + "learning_rate": 7.40392590930141e-06, + "loss": 1.3294597864151, + "step": 2628 + }, + { + "epoch": 0.8095421315890727, + "grad_norm": 3.625, + "learning_rate": 7.40019145754864e-06, + "loss": 0.9272560477256775, + "step": 2630 + }, + { + "epoch": 0.8101577529819162, + "grad_norm": 1.7265625, + "learning_rate": 7.3964556143942315e-06, + "loss": 1.1824629306793213, + "step": 2632 + }, + { + "epoch": 0.8107733743747595, + "grad_norm": 11.0625, + "learning_rate": 7.392718383550576e-06, + "loss": 1.0783371925354004, + "step": 2634 + }, + { + "epoch": 0.8113889957676029, + "grad_norm": 6.3125, + "learning_rate": 7.388979768731444e-06, + "loss": 1.2361860275268555, + "step": 2636 + }, + { + "epoch": 0.8120046171604464, + "grad_norm": 6.15625, + "learning_rate": 7.38523977365198e-06, + "loss": 1.3848330974578857, + "step": 2638 + }, + { + "epoch": 0.8126202385532897, + "grad_norm": 6.59375, + "learning_rate": 7.381498402028704e-06, + "loss": 1.2543951272964478, + "step": 2640 + }, + { + "epoch": 0.8132358599461331, + "grad_norm": 6.3125, + "learning_rate": 7.377755657579495e-06, + "loss": 1.4167609214782715, + "step": 2642 + }, + { + "epoch": 0.8138514813389766, + "grad_norm": 5.375, + "learning_rate": 7.374011544023607e-06, + "loss": 1.3318917751312256, + "step": 2644 + }, + { + "epoch": 0.8144671027318199, + "grad_norm": 7.375, + "learning_rate": 7.3702660650816485e-06, + "loss": 1.0573267936706543, + "step": 2646 + }, + { + "epoch": 0.8150827241246633, + "grad_norm": 3.328125, + "learning_rate": 7.366519224475585e-06, + "loss": 1.1513400077819824, + "step": 2648 + }, + { + "epoch": 0.8156983455175068, + "grad_norm": 7.09375, + "learning_rate": 7.362771025928736e-06, + "loss": 1.4746935367584229, + "step": 2650 + }, + { + "epoch": 0.8163139669103502, + "grad_norm": 5.65625, + "learning_rate": 7.3590214731657724e-06, + "loss": 1.2213988304138184, + "step": 2652 + }, + { + "epoch": 0.8169295883031935, + "grad_norm": 8.6875, + "learning_rate": 7.355270569912707e-06, + "loss": 1.4360812902450562, + "step": 2654 + }, + { + "epoch": 0.817545209696037, + "grad_norm": 6.90625, + "learning_rate": 7.351518319896895e-06, + "loss": 1.447126030921936, + "step": 2656 + }, + { + "epoch": 0.8181608310888804, + "grad_norm": 6.40625, + "learning_rate": 7.347764726847035e-06, + "loss": 0.9661370515823364, + "step": 2658 + }, + { + "epoch": 0.8187764524817237, + "grad_norm": 24.5, + "learning_rate": 7.3440097944931545e-06, + "loss": 1.2369790077209473, + "step": 2660 + }, + { + "epoch": 0.8193920738745671, + "grad_norm": 7.90625, + "learning_rate": 7.340253526566614e-06, + "loss": 1.4130780696868896, + "step": 2662 + }, + { + "epoch": 0.8200076952674106, + "grad_norm": 3.953125, + "learning_rate": 7.3364959268001e-06, + "loss": 1.210066318511963, + "step": 2664 + }, + { + "epoch": 0.8206233166602539, + "grad_norm": 18.375, + "learning_rate": 7.332736998927628e-06, + "loss": 1.090823769569397, + "step": 2666 + }, + { + "epoch": 0.8212389380530973, + "grad_norm": 5.5625, + "learning_rate": 7.328976746684522e-06, + "loss": 1.0669875144958496, + "step": 2668 + }, + { + "epoch": 0.8218545594459408, + "grad_norm": 6.21875, + "learning_rate": 7.325215173807434e-06, + "loss": 1.2229068279266357, + "step": 2670 + }, + { + "epoch": 0.8224701808387841, + "grad_norm": 11.1875, + "learning_rate": 7.321452284034323e-06, + "loss": 1.389937400817871, + "step": 2672 + }, + { + "epoch": 0.8230858022316275, + "grad_norm": 13.25, + "learning_rate": 7.317688081104455e-06, + "loss": 1.0739669799804688, + "step": 2674 + }, + { + "epoch": 0.823701423624471, + "grad_norm": 6.625, + "learning_rate": 7.313922568758403e-06, + "loss": 1.4765902757644653, + "step": 2676 + }, + { + "epoch": 0.8243170450173144, + "grad_norm": 3.78125, + "learning_rate": 7.310155750738044e-06, + "loss": 1.188515543937683, + "step": 2678 + }, + { + "epoch": 0.8249326664101577, + "grad_norm": 6.96875, + "learning_rate": 7.306387630786544e-06, + "loss": 1.0044671297073364, + "step": 2680 + }, + { + "epoch": 0.8255482878030012, + "grad_norm": 6.90625, + "learning_rate": 7.3026182126483755e-06, + "loss": 1.2412277460098267, + "step": 2682 + }, + { + "epoch": 0.8261639091958446, + "grad_norm": 4.09375, + "learning_rate": 7.298847500069286e-06, + "loss": 1.2116367816925049, + "step": 2684 + }, + { + "epoch": 0.8267795305886879, + "grad_norm": 5.28125, + "learning_rate": 7.295075496796324e-06, + "loss": 1.3458802700042725, + "step": 2686 + }, + { + "epoch": 0.8273951519815314, + "grad_norm": 5.25, + "learning_rate": 7.291302206577808e-06, + "loss": 1.289811611175537, + "step": 2688 + }, + { + "epoch": 0.8280107733743748, + "grad_norm": 6.90625, + "learning_rate": 7.287527633163345e-06, + "loss": 1.3732936382293701, + "step": 2690 + }, + { + "epoch": 0.8286263947672181, + "grad_norm": 3.984375, + "learning_rate": 7.283751780303812e-06, + "loss": 1.0649974346160889, + "step": 2692 + }, + { + "epoch": 0.8292420161600615, + "grad_norm": 12.75, + "learning_rate": 7.27997465175136e-06, + "loss": 1.5276716947555542, + "step": 2694 + }, + { + "epoch": 0.829857637552905, + "grad_norm": 10.0625, + "learning_rate": 7.276196251259402e-06, + "loss": 1.4253476858139038, + "step": 2696 + }, + { + "epoch": 0.8304732589457484, + "grad_norm": 6.5625, + "learning_rate": 7.272416582582624e-06, + "loss": 1.1293399333953857, + "step": 2698 + }, + { + "epoch": 0.8310888803385917, + "grad_norm": 3.921875, + "learning_rate": 7.268635649476965e-06, + "loss": 1.224055528640747, + "step": 2700 + }, + { + "epoch": 0.8317045017314352, + "grad_norm": 3.828125, + "learning_rate": 7.264853455699623e-06, + "loss": 1.432799220085144, + "step": 2702 + }, + { + "epoch": 0.8323201231242786, + "grad_norm": 5.3125, + "learning_rate": 7.261070005009052e-06, + "loss": 1.2237067222595215, + "step": 2704 + }, + { + "epoch": 0.8329357445171219, + "grad_norm": 9.4375, + "learning_rate": 7.257285301164947e-06, + "loss": 1.2335262298583984, + "step": 2706 + }, + { + "epoch": 0.8335513659099654, + "grad_norm": 10.5, + "learning_rate": 7.2534993479282545e-06, + "loss": 1.619771957397461, + "step": 2708 + }, + { + "epoch": 0.8341669873028088, + "grad_norm": 10.0, + "learning_rate": 7.2497121490611636e-06, + "loss": 1.460617184638977, + "step": 2710 + }, + { + "epoch": 0.8347826086956521, + "grad_norm": 10.8125, + "learning_rate": 7.245923708327096e-06, + "loss": 1.450473427772522, + "step": 2712 + }, + { + "epoch": 0.8353982300884956, + "grad_norm": 2.234375, + "learning_rate": 7.242134029490711e-06, + "loss": 0.9833471179008484, + "step": 2714 + }, + { + "epoch": 0.836013851481339, + "grad_norm": 6.6875, + "learning_rate": 7.2383431163178965e-06, + "loss": 1.161080241203308, + "step": 2716 + }, + { + "epoch": 0.8366294728741823, + "grad_norm": 8.25, + "learning_rate": 7.234550972575769e-06, + "loss": 1.712884545326233, + "step": 2718 + }, + { + "epoch": 0.8372450942670258, + "grad_norm": 5.71875, + "learning_rate": 7.230757602032667e-06, + "loss": 1.40744149684906, + "step": 2720 + }, + { + "epoch": 0.8378607156598692, + "grad_norm": 3.140625, + "learning_rate": 7.2269630084581475e-06, + "loss": 1.321396827697754, + "step": 2722 + }, + { + "epoch": 0.8384763370527126, + "grad_norm": 5.3125, + "learning_rate": 7.223167195622982e-06, + "loss": 1.2216382026672363, + "step": 2724 + }, + { + "epoch": 0.839091958445556, + "grad_norm": 52.0, + "learning_rate": 7.219370167299158e-06, + "loss": 1.2050942182540894, + "step": 2726 + }, + { + "epoch": 0.8397075798383994, + "grad_norm": 12.125, + "learning_rate": 7.215571927259863e-06, + "loss": 1.2433619499206543, + "step": 2728 + }, + { + "epoch": 0.8403232012312428, + "grad_norm": 2.953125, + "learning_rate": 7.2117724792795e-06, + "loss": 1.2390086650848389, + "step": 2730 + }, + { + "epoch": 0.8409388226240861, + "grad_norm": 11.375, + "learning_rate": 7.207971827133657e-06, + "loss": 1.348872184753418, + "step": 2732 + }, + { + "epoch": 0.8415544440169296, + "grad_norm": 23.375, + "learning_rate": 7.204169974599134e-06, + "loss": 1.6377098560333252, + "step": 2734 + }, + { + "epoch": 0.842170065409773, + "grad_norm": 6.5, + "learning_rate": 7.200366925453915e-06, + "loss": 1.6480786800384521, + "step": 2736 + }, + { + "epoch": 0.8427856868026163, + "grad_norm": 5.96875, + "learning_rate": 7.196562683477175e-06, + "loss": 1.4472780227661133, + "step": 2738 + }, + { + "epoch": 0.8434013081954598, + "grad_norm": 2.5625, + "learning_rate": 7.192757252449272e-06, + "loss": 1.3245518207550049, + "step": 2740 + }, + { + "epoch": 0.8440169295883032, + "grad_norm": 5.59375, + "learning_rate": 7.188950636151752e-06, + "loss": 1.510989785194397, + "step": 2742 + }, + { + "epoch": 0.8446325509811466, + "grad_norm": 10.625, + "learning_rate": 7.185142838367334e-06, + "loss": 1.5210520029067993, + "step": 2744 + }, + { + "epoch": 0.84524817237399, + "grad_norm": 8.375, + "learning_rate": 7.181333862879911e-06, + "loss": 1.4478716850280762, + "step": 2746 + }, + { + "epoch": 0.8458637937668334, + "grad_norm": 8.5, + "learning_rate": 7.1775237134745505e-06, + "loss": 1.3034485578536987, + "step": 2748 + }, + { + "epoch": 0.8464794151596768, + "grad_norm": 8.25, + "learning_rate": 7.173712393937477e-06, + "loss": 1.708869457244873, + "step": 2750 + }, + { + "epoch": 0.8470950365525202, + "grad_norm": 5.90625, + "learning_rate": 7.16989990805609e-06, + "loss": 1.3472611904144287, + "step": 2752 + }, + { + "epoch": 0.8477106579453636, + "grad_norm": 3.015625, + "learning_rate": 7.166086259618938e-06, + "loss": 1.2394222021102905, + "step": 2754 + }, + { + "epoch": 0.848326279338207, + "grad_norm": 2.890625, + "learning_rate": 7.162271452415734e-06, + "loss": 0.943736732006073, + "step": 2756 + }, + { + "epoch": 0.8489419007310504, + "grad_norm": 41.5, + "learning_rate": 7.158455490237333e-06, + "loss": 1.3057875633239746, + "step": 2758 + }, + { + "epoch": 0.8495575221238938, + "grad_norm": 8.6875, + "learning_rate": 7.154638376875744e-06, + "loss": 1.333833932876587, + "step": 2760 + }, + { + "epoch": 0.8501731435167372, + "grad_norm": 38.25, + "learning_rate": 7.150820116124117e-06, + "loss": 1.2368322610855103, + "step": 2762 + }, + { + "epoch": 0.8507887649095806, + "grad_norm": 8.0625, + "learning_rate": 7.147000711776744e-06, + "loss": 1.2598323822021484, + "step": 2764 + }, + { + "epoch": 0.851404386302424, + "grad_norm": 2.125, + "learning_rate": 7.1431801676290535e-06, + "loss": 1.039093017578125, + "step": 2766 + }, + { + "epoch": 0.8520200076952674, + "grad_norm": 5.53125, + "learning_rate": 7.139358487477606e-06, + "loss": 1.363373041152954, + "step": 2768 + }, + { + "epoch": 0.8526356290881109, + "grad_norm": 8.375, + "learning_rate": 7.1355356751200886e-06, + "loss": 1.186076283454895, + "step": 2770 + }, + { + "epoch": 0.8532512504809542, + "grad_norm": 6.46875, + "learning_rate": 7.131711734355315e-06, + "loss": 1.3930425643920898, + "step": 2772 + }, + { + "epoch": 0.8538668718737976, + "grad_norm": 8.875, + "learning_rate": 7.1278866689832235e-06, + "loss": 1.4591971635818481, + "step": 2774 + }, + { + "epoch": 0.854482493266641, + "grad_norm": 4.75, + "learning_rate": 7.124060482804869e-06, + "loss": 1.313432216644287, + "step": 2776 + }, + { + "epoch": 0.8550981146594844, + "grad_norm": 9.5625, + "learning_rate": 7.120233179622414e-06, + "loss": 1.3527673482894897, + "step": 2778 + }, + { + "epoch": 0.8557137360523278, + "grad_norm": 11.3125, + "learning_rate": 7.1164047632391375e-06, + "loss": 1.3231563568115234, + "step": 2780 + }, + { + "epoch": 0.8563293574451712, + "grad_norm": 7.75, + "learning_rate": 7.112575237459425e-06, + "loss": 1.3130462169647217, + "step": 2782 + }, + { + "epoch": 0.8569449788380146, + "grad_norm": 3.21875, + "learning_rate": 7.108744606088758e-06, + "loss": 0.9800470471382141, + "step": 2784 + }, + { + "epoch": 0.857560600230858, + "grad_norm": 6.0, + "learning_rate": 7.1049128729337205e-06, + "loss": 1.008358359336853, + "step": 2786 + }, + { + "epoch": 0.8581762216237014, + "grad_norm": 5.0, + "learning_rate": 7.101080041801996e-06, + "loss": 1.1537256240844727, + "step": 2788 + }, + { + "epoch": 0.8587918430165449, + "grad_norm": 8.5625, + "learning_rate": 7.097246116502352e-06, + "loss": 1.2312843799591064, + "step": 2790 + }, + { + "epoch": 0.8594074644093882, + "grad_norm": 14.625, + "learning_rate": 7.093411100844645e-06, + "loss": 0.8555831909179688, + "step": 2792 + }, + { + "epoch": 0.8600230858022316, + "grad_norm": 6.6875, + "learning_rate": 7.089574998639819e-06, + "loss": 1.0235496759414673, + "step": 2794 + }, + { + "epoch": 0.8606387071950751, + "grad_norm": 13.9375, + "learning_rate": 7.085737813699894e-06, + "loss": 1.1901612281799316, + "step": 2796 + }, + { + "epoch": 0.8612543285879184, + "grad_norm": 8.4375, + "learning_rate": 7.081899549837963e-06, + "loss": 1.1888796091079712, + "step": 2798 + }, + { + "epoch": 0.8618699499807618, + "grad_norm": 6.125, + "learning_rate": 7.078060210868198e-06, + "loss": 1.2371288537979126, + "step": 2800 + }, + { + "epoch": 0.8624855713736053, + "grad_norm": 4.6875, + "learning_rate": 7.074219800605837e-06, + "loss": 0.8463555574417114, + "step": 2802 + }, + { + "epoch": 0.8631011927664486, + "grad_norm": 7.8125, + "learning_rate": 7.070378322867182e-06, + "loss": 1.2541903257369995, + "step": 2804 + }, + { + "epoch": 0.863716814159292, + "grad_norm": 7.15625, + "learning_rate": 7.066535781469593e-06, + "loss": 1.4080777168273926, + "step": 2806 + }, + { + "epoch": 0.8643324355521355, + "grad_norm": 8.5625, + "learning_rate": 7.06269218023149e-06, + "loss": 1.2658737897872925, + "step": 2808 + }, + { + "epoch": 0.8649480569449788, + "grad_norm": 3.171875, + "learning_rate": 7.0588475229723475e-06, + "loss": 1.2996368408203125, + "step": 2810 + }, + { + "epoch": 0.8655636783378222, + "grad_norm": 7.34375, + "learning_rate": 7.055001813512683e-06, + "loss": 1.2913126945495605, + "step": 2812 + }, + { + "epoch": 0.8661792997306657, + "grad_norm": 4.21875, + "learning_rate": 7.051155055674073e-06, + "loss": 1.2469276189804077, + "step": 2814 + }, + { + "epoch": 0.8667949211235091, + "grad_norm": 4.0625, + "learning_rate": 7.047307253279119e-06, + "loss": 1.3468555212020874, + "step": 2816 + }, + { + "epoch": 0.8674105425163524, + "grad_norm": 8.3125, + "learning_rate": 7.043458410151472e-06, + "loss": 1.4589338302612305, + "step": 2818 + }, + { + "epoch": 0.8680261639091958, + "grad_norm": 7.28125, + "learning_rate": 7.039608530115809e-06, + "loss": 1.2077659368515015, + "step": 2820 + }, + { + "epoch": 0.8686417853020393, + "grad_norm": 3.859375, + "learning_rate": 7.035757616997849e-06, + "loss": 1.411242127418518, + "step": 2822 + }, + { + "epoch": 0.8692574066948826, + "grad_norm": 4.5625, + "learning_rate": 7.031905674624329e-06, + "loss": 1.2462449073791504, + "step": 2824 + }, + { + "epoch": 0.869873028087726, + "grad_norm": 2.328125, + "learning_rate": 7.0280527068230076e-06, + "loss": 1.0618919134140015, + "step": 2826 + }, + { + "epoch": 0.8704886494805695, + "grad_norm": 4.09375, + "learning_rate": 7.024198717422666e-06, + "loss": 1.2162715196609497, + "step": 2828 + }, + { + "epoch": 0.8711042708734128, + "grad_norm": 7.78125, + "learning_rate": 7.020343710253101e-06, + "loss": 1.230750322341919, + "step": 2830 + }, + { + "epoch": 0.8717198922662562, + "grad_norm": 6.84375, + "learning_rate": 7.01648768914512e-06, + "loss": 1.0324935913085938, + "step": 2832 + }, + { + "epoch": 0.8723355136590997, + "grad_norm": 4.125, + "learning_rate": 7.012630657930537e-06, + "loss": 1.205783724784851, + "step": 2834 + }, + { + "epoch": 0.8729511350519431, + "grad_norm": 7.75, + "learning_rate": 7.008772620442171e-06, + "loss": 1.4939160346984863, + "step": 2836 + }, + { + "epoch": 0.8735667564447864, + "grad_norm": 9.0625, + "learning_rate": 7.004913580513839e-06, + "loss": 1.623045802116394, + "step": 2838 + }, + { + "epoch": 0.8741823778376299, + "grad_norm": 10.625, + "learning_rate": 7.001053541980354e-06, + "loss": 1.3726472854614258, + "step": 2840 + }, + { + "epoch": 0.8747979992304733, + "grad_norm": 18.0, + "learning_rate": 6.9971925086775264e-06, + "loss": 1.5557606220245361, + "step": 2842 + }, + { + "epoch": 0.8754136206233166, + "grad_norm": 11.5625, + "learning_rate": 6.993330484442149e-06, + "loss": 1.5380194187164307, + "step": 2844 + }, + { + "epoch": 0.8760292420161601, + "grad_norm": 7.375, + "learning_rate": 6.989467473112005e-06, + "loss": 1.790273904800415, + "step": 2846 + }, + { + "epoch": 0.8766448634090035, + "grad_norm": 7.59375, + "learning_rate": 6.985603478525853e-06, + "loss": 1.3569434881210327, + "step": 2848 + }, + { + "epoch": 0.8772604848018468, + "grad_norm": 5.4375, + "learning_rate": 6.9817385045234294e-06, + "loss": 1.58054780960083, + "step": 2850 + }, + { + "epoch": 0.8778761061946903, + "grad_norm": 4.5625, + "learning_rate": 6.977872554945449e-06, + "loss": 1.3338651657104492, + "step": 2852 + }, + { + "epoch": 0.8784917275875337, + "grad_norm": 6.03125, + "learning_rate": 6.974005633633592e-06, + "loss": 1.0849279165267944, + "step": 2854 + }, + { + "epoch": 0.8791073489803771, + "grad_norm": 4.125, + "learning_rate": 6.9701377444304995e-06, + "loss": 1.0520415306091309, + "step": 2856 + }, + { + "epoch": 0.8797229703732204, + "grad_norm": 21.875, + "learning_rate": 6.9662688911797874e-06, + "loss": 1.1952751874923706, + "step": 2858 + }, + { + "epoch": 0.8803385917660639, + "grad_norm": 7.34375, + "learning_rate": 6.962399077726019e-06, + "loss": 1.3586761951446533, + "step": 2860 + }, + { + "epoch": 0.8809542131589073, + "grad_norm": 8.0, + "learning_rate": 6.9585283079147116e-06, + "loss": 1.4400031566619873, + "step": 2862 + }, + { + "epoch": 0.8815698345517506, + "grad_norm": 7.0625, + "learning_rate": 6.954656585592339e-06, + "loss": 1.38856840133667, + "step": 2864 + }, + { + "epoch": 0.8821854559445941, + "grad_norm": 12.875, + "learning_rate": 6.950783914606318e-06, + "loss": 0.7037546038627625, + "step": 2866 + }, + { + "epoch": 0.8828010773374375, + "grad_norm": 8.75, + "learning_rate": 6.946910298805009e-06, + "loss": 1.418529748916626, + "step": 2868 + }, + { + "epoch": 0.8834166987302808, + "grad_norm": 7.15625, + "learning_rate": 6.9430357420377104e-06, + "loss": 1.5819289684295654, + "step": 2870 + }, + { + "epoch": 0.8840323201231243, + "grad_norm": 3.171875, + "learning_rate": 6.939160248154656e-06, + "loss": 1.130005121231079, + "step": 2872 + }, + { + "epoch": 0.8846479415159677, + "grad_norm": 4.40625, + "learning_rate": 6.935283821007011e-06, + "loss": 1.0710422992706299, + "step": 2874 + }, + { + "epoch": 0.885263562908811, + "grad_norm": 9.625, + "learning_rate": 6.931406464446866e-06, + "loss": 1.295141339302063, + "step": 2876 + }, + { + "epoch": 0.8858791843016545, + "grad_norm": 4.625, + "learning_rate": 6.927528182327241e-06, + "loss": 1.2207305431365967, + "step": 2878 + }, + { + "epoch": 0.8864948056944979, + "grad_norm": 3.125, + "learning_rate": 6.923648978502069e-06, + "loss": 1.2300981283187866, + "step": 2880 + }, + { + "epoch": 0.8871104270873413, + "grad_norm": 6.875, + "learning_rate": 6.9197688568262035e-06, + "loss": 1.2342026233673096, + "step": 2882 + }, + { + "epoch": 0.8877260484801847, + "grad_norm": 5.25, + "learning_rate": 6.915887821155407e-06, + "loss": 1.0839194059371948, + "step": 2884 + }, + { + "epoch": 0.8883416698730281, + "grad_norm": 4.65625, + "learning_rate": 6.912005875346353e-06, + "loss": 1.3776112794876099, + "step": 2886 + }, + { + "epoch": 0.8889572912658715, + "grad_norm": 17.5, + "learning_rate": 6.908123023256616e-06, + "loss": 1.059165120124817, + "step": 2888 + }, + { + "epoch": 0.8895729126587149, + "grad_norm": 9.8125, + "learning_rate": 6.904239268744675e-06, + "loss": 0.6640627980232239, + "step": 2890 + }, + { + "epoch": 0.8901885340515583, + "grad_norm": 2.328125, + "learning_rate": 6.900354615669904e-06, + "loss": 1.352664589881897, + "step": 2892 + }, + { + "epoch": 0.8908041554444017, + "grad_norm": 4.15625, + "learning_rate": 6.896469067892568e-06, + "loss": 1.2407547235488892, + "step": 2894 + }, + { + "epoch": 0.891419776837245, + "grad_norm": 13.75, + "learning_rate": 6.892582629273825e-06, + "loss": 1.150381326675415, + "step": 2896 + }, + { + "epoch": 0.8920353982300885, + "grad_norm": 12.25, + "learning_rate": 6.8886953036757165e-06, + "loss": 1.401031494140625, + "step": 2898 + }, + { + "epoch": 0.8926510196229319, + "grad_norm": 8.3125, + "learning_rate": 6.8848070949611616e-06, + "loss": 1.51918625831604, + "step": 2900 + }, + { + "epoch": 0.8932666410157754, + "grad_norm": 9.875, + "learning_rate": 6.880918006993964e-06, + "loss": 1.1139386892318726, + "step": 2902 + }, + { + "epoch": 0.8938822624086187, + "grad_norm": 7.15625, + "learning_rate": 6.8770280436387956e-06, + "loss": 1.3600401878356934, + "step": 2904 + }, + { + "epoch": 0.8944978838014621, + "grad_norm": 9.3125, + "learning_rate": 6.873137208761203e-06, + "loss": 1.346145749092102, + "step": 2906 + }, + { + "epoch": 0.8951135051943055, + "grad_norm": 6.40625, + "learning_rate": 6.869245506227591e-06, + "loss": 1.2260481119155884, + "step": 2908 + }, + { + "epoch": 0.8957291265871489, + "grad_norm": 22.875, + "learning_rate": 6.865352939905237e-06, + "loss": 1.6141583919525146, + "step": 2910 + }, + { + "epoch": 0.8963447479799923, + "grad_norm": 7.6875, + "learning_rate": 6.861459513662267e-06, + "loss": 1.3436152935028076, + "step": 2912 + }, + { + "epoch": 0.8969603693728357, + "grad_norm": 16.625, + "learning_rate": 6.85756523136767e-06, + "loss": 1.2726424932479858, + "step": 2914 + }, + { + "epoch": 0.8975759907656791, + "grad_norm": 13.1875, + "learning_rate": 6.853670096891277e-06, + "loss": 1.37815260887146, + "step": 2916 + }, + { + "epoch": 0.8981916121585225, + "grad_norm": 4.375, + "learning_rate": 6.849774114103775e-06, + "loss": 1.2725327014923096, + "step": 2918 + }, + { + "epoch": 0.8988072335513659, + "grad_norm": 11.0625, + "learning_rate": 6.8458772868766875e-06, + "loss": 1.5259203910827637, + "step": 2920 + }, + { + "epoch": 0.8994228549442093, + "grad_norm": 4.5, + "learning_rate": 6.841979619082379e-06, + "loss": 1.0570274591445923, + "step": 2922 + }, + { + "epoch": 0.9000384763370527, + "grad_norm": 9.3125, + "learning_rate": 6.8380811145940485e-06, + "loss": 1.0440081357955933, + "step": 2924 + }, + { + "epoch": 0.9006540977298961, + "grad_norm": 7.25, + "learning_rate": 6.834181777285729e-06, + "loss": 1.6796268224716187, + "step": 2926 + }, + { + "epoch": 0.9012697191227396, + "grad_norm": 2.109375, + "learning_rate": 6.830281611032277e-06, + "loss": 1.435825228691101, + "step": 2928 + }, + { + "epoch": 0.9018853405155829, + "grad_norm": 4.40625, + "learning_rate": 6.826380619709376e-06, + "loss": 1.38742196559906, + "step": 2930 + }, + { + "epoch": 0.9025009619084263, + "grad_norm": 9.9375, + "learning_rate": 6.822478807193531e-06, + "loss": 1.6338295936584473, + "step": 2932 + }, + { + "epoch": 0.9031165833012698, + "grad_norm": 11.8125, + "learning_rate": 6.81857617736206e-06, + "loss": 1.0583865642547607, + "step": 2934 + }, + { + "epoch": 0.9037322046941131, + "grad_norm": 7.21875, + "learning_rate": 6.81467273409309e-06, + "loss": 1.2847459316253662, + "step": 2936 + }, + { + "epoch": 0.9043478260869565, + "grad_norm": 14.875, + "learning_rate": 6.810768481265564e-06, + "loss": 1.5531105995178223, + "step": 2938 + }, + { + "epoch": 0.9049634474798, + "grad_norm": 3.53125, + "learning_rate": 6.806863422759225e-06, + "loss": 1.2093020677566528, + "step": 2940 + }, + { + "epoch": 0.9055790688726433, + "grad_norm": 6.09375, + "learning_rate": 6.802957562454613e-06, + "loss": 1.5628454685211182, + "step": 2942 + }, + { + "epoch": 0.9061946902654867, + "grad_norm": 9.875, + "learning_rate": 6.799050904233075e-06, + "loss": 1.3811249732971191, + "step": 2944 + }, + { + "epoch": 0.9068103116583301, + "grad_norm": 6.9375, + "learning_rate": 6.795143451976742e-06, + "loss": 1.2331663370132446, + "step": 2946 + }, + { + "epoch": 0.9074259330511736, + "grad_norm": 12.9375, + "learning_rate": 6.7912352095685366e-06, + "loss": 1.6550911664962769, + "step": 2948 + }, + { + "epoch": 0.9080415544440169, + "grad_norm": 6.4375, + "learning_rate": 6.787326180892166e-06, + "loss": 1.2516546249389648, + "step": 2950 + }, + { + "epoch": 0.9086571758368603, + "grad_norm": 4.5625, + "learning_rate": 6.783416369832122e-06, + "loss": 0.8436126708984375, + "step": 2952 + }, + { + "epoch": 0.9092727972297038, + "grad_norm": 12.5625, + "learning_rate": 6.77950578027367e-06, + "loss": 0.747209906578064, + "step": 2954 + }, + { + "epoch": 0.9098884186225471, + "grad_norm": 13.9375, + "learning_rate": 6.775594416102851e-06, + "loss": 1.5576605796813965, + "step": 2956 + }, + { + "epoch": 0.9105040400153905, + "grad_norm": 4.46875, + "learning_rate": 6.771682281206476e-06, + "loss": 1.3183671236038208, + "step": 2958 + }, + { + "epoch": 0.911119661408234, + "grad_norm": 6.40625, + "learning_rate": 6.767769379472119e-06, + "loss": 1.1161308288574219, + "step": 2960 + }, + { + "epoch": 0.9117352828010773, + "grad_norm": 10.125, + "learning_rate": 6.763855714788119e-06, + "loss": 1.6109819412231445, + "step": 2962 + }, + { + "epoch": 0.9123509041939207, + "grad_norm": 7.375, + "learning_rate": 6.759941291043575e-06, + "loss": 1.3613532781600952, + "step": 2964 + }, + { + "epoch": 0.9129665255867642, + "grad_norm": 5.625, + "learning_rate": 6.756026112128333e-06, + "loss": 1.263413667678833, + "step": 2966 + }, + { + "epoch": 0.9135821469796075, + "grad_norm": 5.59375, + "learning_rate": 6.752110181932998e-06, + "loss": 1.2548730373382568, + "step": 2968 + }, + { + "epoch": 0.9141977683724509, + "grad_norm": 4.78125, + "learning_rate": 6.748193504348917e-06, + "loss": 1.312070369720459, + "step": 2970 + }, + { + "epoch": 0.9148133897652944, + "grad_norm": 7.03125, + "learning_rate": 6.744276083268176e-06, + "loss": 1.305625081062317, + "step": 2972 + }, + { + "epoch": 0.9154290111581378, + "grad_norm": 16.125, + "learning_rate": 6.7403579225836094e-06, + "loss": 1.5548710823059082, + "step": 2974 + }, + { + "epoch": 0.9160446325509811, + "grad_norm": 4.5625, + "learning_rate": 6.736439026188779e-06, + "loss": 1.302093505859375, + "step": 2976 + }, + { + "epoch": 0.9166602539438246, + "grad_norm": 32.5, + "learning_rate": 6.732519397977981e-06, + "loss": 1.5850815773010254, + "step": 2978 + }, + { + "epoch": 0.917275875336668, + "grad_norm": 2.4375, + "learning_rate": 6.728599041846236e-06, + "loss": 1.028442144393921, + "step": 2980 + }, + { + "epoch": 0.9178914967295113, + "grad_norm": 2.75, + "learning_rate": 6.7246779616892936e-06, + "loss": 0.9915659427642822, + "step": 2982 + }, + { + "epoch": 0.9185071181223547, + "grad_norm": 4.03125, + "learning_rate": 6.720756161403614e-06, + "loss": 1.1359045505523682, + "step": 2984 + }, + { + "epoch": 0.9191227395151982, + "grad_norm": 1.4453125, + "learning_rate": 6.7168336448863805e-06, + "loss": 1.156688928604126, + "step": 2986 + }, + { + "epoch": 0.9197383609080415, + "grad_norm": 8.0, + "learning_rate": 6.7129104160354875e-06, + "loss": 1.278592824935913, + "step": 2988 + }, + { + "epoch": 0.9203539823008849, + "grad_norm": 7.34375, + "learning_rate": 6.708986478749532e-06, + "loss": 1.583968997001648, + "step": 2990 + }, + { + "epoch": 0.9209696036937284, + "grad_norm": 6.71875, + "learning_rate": 6.70506183692782e-06, + "loss": 1.4367609024047852, + "step": 2992 + }, + { + "epoch": 0.9215852250865718, + "grad_norm": 5.34375, + "learning_rate": 6.701136494470356e-06, + "loss": 0.7980353236198425, + "step": 2994 + }, + { + "epoch": 0.9222008464794151, + "grad_norm": 8.75, + "learning_rate": 6.697210455277842e-06, + "loss": 1.4646888971328735, + "step": 2996 + }, + { + "epoch": 0.9228164678722586, + "grad_norm": 9.375, + "learning_rate": 6.693283723251669e-06, + "loss": 1.732331395149231, + "step": 2998 + }, + { + "epoch": 0.923432089265102, + "grad_norm": 13.3125, + "learning_rate": 6.689356302293921e-06, + "loss": 1.5182671546936035, + "step": 3000 + }, + { + "epoch": 0.9240477106579453, + "grad_norm": 13.125, + "learning_rate": 6.685428196307362e-06, + "loss": 1.2069380283355713, + "step": 3002 + }, + { + "epoch": 0.9246633320507888, + "grad_norm": 5.78125, + "learning_rate": 6.68149940919544e-06, + "loss": 1.3300877809524536, + "step": 3004 + }, + { + "epoch": 0.9252789534436322, + "grad_norm": 25.125, + "learning_rate": 6.677569944862277e-06, + "loss": 1.355804204940796, + "step": 3006 + }, + { + "epoch": 0.9258945748364755, + "grad_norm": 8.5625, + "learning_rate": 6.673639807212674e-06, + "loss": 1.4251302480697632, + "step": 3008 + }, + { + "epoch": 0.926510196229319, + "grad_norm": 1.8046875, + "learning_rate": 6.6697090001520936e-06, + "loss": 1.1559216976165771, + "step": 3010 + }, + { + "epoch": 0.9271258176221624, + "grad_norm": 3.59375, + "learning_rate": 6.665777527586668e-06, + "loss": 1.0845048427581787, + "step": 3012 + }, + { + "epoch": 0.9277414390150057, + "grad_norm": 10.125, + "learning_rate": 6.661845393423187e-06, + "loss": 1.2049851417541504, + "step": 3014 + }, + { + "epoch": 0.9283570604078492, + "grad_norm": 9.6875, + "learning_rate": 6.657912601569105e-06, + "loss": 1.232240080833435, + "step": 3016 + }, + { + "epoch": 0.9289726818006926, + "grad_norm": 6.4375, + "learning_rate": 6.653979155932524e-06, + "loss": 1.3066433668136597, + "step": 3018 + }, + { + "epoch": 0.929588303193536, + "grad_norm": 22.5, + "learning_rate": 6.6500450604221945e-06, + "loss": 1.005613923072815, + "step": 3020 + }, + { + "epoch": 0.9302039245863793, + "grad_norm": 19.75, + "learning_rate": 6.646110318947518e-06, + "loss": 1.1477642059326172, + "step": 3022 + }, + { + "epoch": 0.9308195459792228, + "grad_norm": 7.375, + "learning_rate": 6.642174935418535e-06, + "loss": 1.2949566841125488, + "step": 3024 + }, + { + "epoch": 0.9314351673720662, + "grad_norm": 10.8125, + "learning_rate": 6.638238913745922e-06, + "loss": 1.5063673257827759, + "step": 3026 + }, + { + "epoch": 0.9320507887649095, + "grad_norm": 6.71875, + "learning_rate": 6.634302257840995e-06, + "loss": 1.325649619102478, + "step": 3028 + }, + { + "epoch": 0.932666410157753, + "grad_norm": 8.375, + "learning_rate": 6.630364971615695e-06, + "loss": 1.3196182250976562, + "step": 3030 + }, + { + "epoch": 0.9332820315505964, + "grad_norm": 7.125, + "learning_rate": 6.626427058982594e-06, + "loss": 1.3414075374603271, + "step": 3032 + }, + { + "epoch": 0.9338976529434397, + "grad_norm": 17.5, + "learning_rate": 6.622488523854882e-06, + "loss": 1.5244091749191284, + "step": 3034 + }, + { + "epoch": 0.9345132743362832, + "grad_norm": 6.53125, + "learning_rate": 6.61854937014637e-06, + "loss": 1.4249980449676514, + "step": 3036 + }, + { + "epoch": 0.9351288957291266, + "grad_norm": 8.8125, + "learning_rate": 6.614609601771482e-06, + "loss": 1.284740924835205, + "step": 3038 + }, + { + "epoch": 0.93574451712197, + "grad_norm": 6.96875, + "learning_rate": 6.6106692226452574e-06, + "loss": 1.396584153175354, + "step": 3040 + }, + { + "epoch": 0.9363601385148134, + "grad_norm": 5.6875, + "learning_rate": 6.606728236683337e-06, + "loss": 1.5623862743377686, + "step": 3042 + }, + { + "epoch": 0.9369757599076568, + "grad_norm": 2.15625, + "learning_rate": 6.602786647801968e-06, + "loss": 0.997999906539917, + "step": 3044 + }, + { + "epoch": 0.9375913813005002, + "grad_norm": 4.8125, + "learning_rate": 6.598844459917997e-06, + "loss": 1.1285858154296875, + "step": 3046 + }, + { + "epoch": 0.9382070026933436, + "grad_norm": 7.53125, + "learning_rate": 6.594901676948861e-06, + "loss": 1.3015860319137573, + "step": 3048 + }, + { + "epoch": 0.938822624086187, + "grad_norm": 6.375, + "learning_rate": 6.590958302812592e-06, + "loss": 1.2325267791748047, + "step": 3050 + }, + { + "epoch": 0.9394382454790304, + "grad_norm": 10.125, + "learning_rate": 6.587014341427812e-06, + "loss": 1.6582579612731934, + "step": 3052 + }, + { + "epoch": 0.9400538668718738, + "grad_norm": 11.1875, + "learning_rate": 6.5830697967137225e-06, + "loss": 1.4930779933929443, + "step": 3054 + }, + { + "epoch": 0.9406694882647172, + "grad_norm": 4.03125, + "learning_rate": 6.579124672590107e-06, + "loss": 1.2360354661941528, + "step": 3056 + }, + { + "epoch": 0.9412851096575606, + "grad_norm": 9.0625, + "learning_rate": 6.575178972977321e-06, + "loss": 1.4100964069366455, + "step": 3058 + }, + { + "epoch": 0.941900731050404, + "grad_norm": 5.0625, + "learning_rate": 6.571232701796297e-06, + "loss": 1.505408763885498, + "step": 3060 + }, + { + "epoch": 0.9425163524432474, + "grad_norm": 4.78125, + "learning_rate": 6.567285862968532e-06, + "loss": 1.143608808517456, + "step": 3062 + }, + { + "epoch": 0.9431319738360908, + "grad_norm": 21.5, + "learning_rate": 6.56333846041609e-06, + "loss": 1.7274588346481323, + "step": 3064 + }, + { + "epoch": 0.9437475952289343, + "grad_norm": 6.125, + "learning_rate": 6.559390498061591e-06, + "loss": 1.4496876001358032, + "step": 3066 + }, + { + "epoch": 0.9443632166217776, + "grad_norm": 4.125, + "learning_rate": 6.555441979828217e-06, + "loss": 1.1227610111236572, + "step": 3068 + }, + { + "epoch": 0.944978838014621, + "grad_norm": 6.0625, + "learning_rate": 6.551492909639694e-06, + "loss": 1.3069566488265991, + "step": 3070 + }, + { + "epoch": 0.9455944594074644, + "grad_norm": 9.0, + "learning_rate": 6.547543291420306e-06, + "loss": 1.0902361869812012, + "step": 3072 + }, + { + "epoch": 0.9462100808003078, + "grad_norm": 17.375, + "learning_rate": 6.5435931290948765e-06, + "loss": 1.1386152505874634, + "step": 3074 + }, + { + "epoch": 0.9468257021931512, + "grad_norm": 8.0, + "learning_rate": 6.539642426588768e-06, + "loss": 1.3928558826446533, + "step": 3076 + }, + { + "epoch": 0.9474413235859946, + "grad_norm": 31.625, + "learning_rate": 6.5356911878278835e-06, + "loss": 1.2120009660720825, + "step": 3078 + }, + { + "epoch": 0.948056944978838, + "grad_norm": 25.5, + "learning_rate": 6.5317394167386605e-06, + "loss": 1.620686411857605, + "step": 3080 + }, + { + "epoch": 0.9486725663716814, + "grad_norm": 5.96875, + "learning_rate": 6.527787117248056e-06, + "loss": 1.2502431869506836, + "step": 3082 + }, + { + "epoch": 0.9492881877645248, + "grad_norm": 7.75, + "learning_rate": 6.5238342932835645e-06, + "loss": 0.956054151058197, + "step": 3084 + }, + { + "epoch": 0.9499038091573683, + "grad_norm": 2.40625, + "learning_rate": 6.519880948773194e-06, + "loss": 1.5191279649734497, + "step": 3086 + }, + { + "epoch": 0.9505194305502116, + "grad_norm": 4.84375, + "learning_rate": 6.515927087645471e-06, + "loss": 1.1445995569229126, + "step": 3088 + }, + { + "epoch": 0.951135051943055, + "grad_norm": 4.125, + "learning_rate": 6.511972713829433e-06, + "loss": 1.129223108291626, + "step": 3090 + }, + { + "epoch": 0.9517506733358985, + "grad_norm": 13.625, + "learning_rate": 6.508017831254637e-06, + "loss": 1.7071363925933838, + "step": 3092 + }, + { + "epoch": 0.9523662947287418, + "grad_norm": 4.28125, + "learning_rate": 6.504062443851131e-06, + "loss": 1.3347673416137695, + "step": 3094 + }, + { + "epoch": 0.9529819161215852, + "grad_norm": 4.53125, + "learning_rate": 6.500106555549478e-06, + "loss": 1.29487943649292, + "step": 3096 + }, + { + "epoch": 0.9535975375144287, + "grad_norm": 4.75, + "learning_rate": 6.4961501702807305e-06, + "loss": 1.4279407262802124, + "step": 3098 + }, + { + "epoch": 0.954213158907272, + "grad_norm": 8.875, + "learning_rate": 6.4921932919764365e-06, + "loss": 1.0926623344421387, + "step": 3100 + }, + { + "epoch": 0.9548287803001154, + "grad_norm": 5.65625, + "learning_rate": 6.4882359245686355e-06, + "loss": 1.2280659675598145, + "step": 3102 + }, + { + "epoch": 0.9554444016929589, + "grad_norm": 10.125, + "learning_rate": 6.484278071989852e-06, + "loss": 1.1809242963790894, + "step": 3104 + }, + { + "epoch": 0.9560600230858022, + "grad_norm": 3.265625, + "learning_rate": 6.480319738173092e-06, + "loss": 1.0962247848510742, + "step": 3106 + }, + { + "epoch": 0.9566756444786456, + "grad_norm": 8.125, + "learning_rate": 6.4763609270518416e-06, + "loss": 1.2776129245758057, + "step": 3108 + }, + { + "epoch": 0.957291265871489, + "grad_norm": 6.75, + "learning_rate": 6.472401642560062e-06, + "loss": 1.0463416576385498, + "step": 3110 + }, + { + "epoch": 0.9579068872643325, + "grad_norm": 15.6875, + "learning_rate": 6.468441888632179e-06, + "loss": 1.5015335083007812, + "step": 3112 + }, + { + "epoch": 0.9585225086571758, + "grad_norm": 7.0, + "learning_rate": 6.4644816692030905e-06, + "loss": 1.3311984539031982, + "step": 3114 + }, + { + "epoch": 0.9591381300500192, + "grad_norm": 5.1875, + "learning_rate": 6.460520988208156e-06, + "loss": 1.1144503355026245, + "step": 3116 + }, + { + "epoch": 0.9597537514428627, + "grad_norm": 4.78125, + "learning_rate": 6.456559849583193e-06, + "loss": 1.2613763809204102, + "step": 3118 + }, + { + "epoch": 0.960369372835706, + "grad_norm": 27.5, + "learning_rate": 6.452598257264473e-06, + "loss": 1.0620733499526978, + "step": 3120 + }, + { + "epoch": 0.9609849942285494, + "grad_norm": 4.09375, + "learning_rate": 6.448636215188719e-06, + "loss": 1.211551308631897, + "step": 3122 + }, + { + "epoch": 0.9616006156213929, + "grad_norm": 4.1875, + "learning_rate": 6.444673727293103e-06, + "loss": 1.3237829208374023, + "step": 3124 + }, + { + "epoch": 0.9622162370142362, + "grad_norm": 11.5625, + "learning_rate": 6.440710797515235e-06, + "loss": 1.543803095817566, + "step": 3126 + }, + { + "epoch": 0.9628318584070796, + "grad_norm": 10.0, + "learning_rate": 6.436747429793169e-06, + "loss": 1.1492283344268799, + "step": 3128 + }, + { + "epoch": 0.9634474797999231, + "grad_norm": 5.25, + "learning_rate": 6.432783628065392e-06, + "loss": 0.9545158743858337, + "step": 3130 + }, + { + "epoch": 0.9640631011927665, + "grad_norm": 2.734375, + "learning_rate": 6.4288193962708225e-06, + "loss": 1.005800485610962, + "step": 3132 + }, + { + "epoch": 0.9646787225856098, + "grad_norm": 10.75, + "learning_rate": 6.4248547383488065e-06, + "loss": 1.4155142307281494, + "step": 3134 + }, + { + "epoch": 0.9652943439784533, + "grad_norm": 9.125, + "learning_rate": 6.420889658239113e-06, + "loss": 1.2309889793395996, + "step": 3136 + }, + { + "epoch": 0.9659099653712967, + "grad_norm": 7.28125, + "learning_rate": 6.416924159881932e-06, + "loss": 1.452165126800537, + "step": 3138 + }, + { + "epoch": 0.96652558676414, + "grad_norm": 11.75, + "learning_rate": 6.412958247217869e-06, + "loss": 1.4262676239013672, + "step": 3140 + }, + { + "epoch": 0.9671412081569835, + "grad_norm": 4.0625, + "learning_rate": 6.408991924187937e-06, + "loss": 1.578167200088501, + "step": 3142 + }, + { + "epoch": 0.9677568295498269, + "grad_norm": 4.75, + "learning_rate": 6.405025194733563e-06, + "loss": 1.1947578191757202, + "step": 3144 + }, + { + "epoch": 0.9683724509426702, + "grad_norm": 8.5, + "learning_rate": 6.401058062796573e-06, + "loss": 1.0839043855667114, + "step": 3146 + }, + { + "epoch": 0.9689880723355137, + "grad_norm": 5.8125, + "learning_rate": 6.397090532319197e-06, + "loss": 1.2948286533355713, + "step": 3148 + }, + { + "epoch": 0.9696036937283571, + "grad_norm": 5.46875, + "learning_rate": 6.393122607244057e-06, + "loss": 1.5003513097763062, + "step": 3150 + }, + { + "epoch": 0.9702193151212004, + "grad_norm": 5.9375, + "learning_rate": 6.389154291514171e-06, + "loss": 1.5157687664031982, + "step": 3152 + }, + { + "epoch": 0.9708349365140438, + "grad_norm": 12.3125, + "learning_rate": 6.385185589072942e-06, + "loss": 1.7549535036087036, + "step": 3154 + }, + { + "epoch": 0.9714505579068873, + "grad_norm": 7.53125, + "learning_rate": 6.3812165038641585e-06, + "loss": 1.477494716644287, + "step": 3156 + }, + { + "epoch": 0.9720661792997307, + "grad_norm": 2.9375, + "learning_rate": 6.377247039831991e-06, + "loss": 1.1848636865615845, + "step": 3158 + }, + { + "epoch": 0.972681800692574, + "grad_norm": 6.125, + "learning_rate": 6.373277200920982e-06, + "loss": 1.0180506706237793, + "step": 3160 + }, + { + "epoch": 0.9732974220854175, + "grad_norm": 9.5625, + "learning_rate": 6.3693069910760515e-06, + "loss": 1.167291283607483, + "step": 3162 + }, + { + "epoch": 0.9739130434782609, + "grad_norm": 7.5625, + "learning_rate": 6.365336414242487e-06, + "loss": 1.1879794597625732, + "step": 3164 + }, + { + "epoch": 0.9745286648711042, + "grad_norm": 6.78125, + "learning_rate": 6.361365474365937e-06, + "loss": 1.449440598487854, + "step": 3166 + }, + { + "epoch": 0.9751442862639477, + "grad_norm": 7.9375, + "learning_rate": 6.357394175392415e-06, + "loss": 1.5806314945220947, + "step": 3168 + }, + { + "epoch": 0.9757599076567911, + "grad_norm": 9.375, + "learning_rate": 6.353422521268291e-06, + "loss": 1.2685093879699707, + "step": 3170 + }, + { + "epoch": 0.9763755290496344, + "grad_norm": 9.8125, + "learning_rate": 6.349450515940283e-06, + "loss": 1.0854675769805908, + "step": 3172 + }, + { + "epoch": 0.9769911504424779, + "grad_norm": 11.0625, + "learning_rate": 6.345478163355465e-06, + "loss": 1.622758150100708, + "step": 3174 + }, + { + "epoch": 0.9776067718353213, + "grad_norm": 9.4375, + "learning_rate": 6.341505467461253e-06, + "loss": 1.4243215322494507, + "step": 3176 + }, + { + "epoch": 0.9782223932281647, + "grad_norm": 4.5625, + "learning_rate": 6.337532432205402e-06, + "loss": 1.414436936378479, + "step": 3178 + }, + { + "epoch": 0.9788380146210081, + "grad_norm": 10.75, + "learning_rate": 6.333559061536008e-06, + "loss": 1.6876072883605957, + "step": 3180 + }, + { + "epoch": 0.9794536360138515, + "grad_norm": 13.3125, + "learning_rate": 6.329585359401496e-06, + "loss": 1.9350146055221558, + "step": 3182 + }, + { + "epoch": 0.9800692574066949, + "grad_norm": 12.5625, + "learning_rate": 6.325611329750625e-06, + "loss": 1.7809089422225952, + "step": 3184 + }, + { + "epoch": 0.9806848787995383, + "grad_norm": 11.0, + "learning_rate": 6.321636976532477e-06, + "loss": 1.300473690032959, + "step": 3186 + }, + { + "epoch": 0.9813005001923817, + "grad_norm": 8.5625, + "learning_rate": 6.317662303696456e-06, + "loss": 1.4619872570037842, + "step": 3188 + }, + { + "epoch": 0.9819161215852251, + "grad_norm": 4.53125, + "learning_rate": 6.3136873151922825e-06, + "loss": 1.2975380420684814, + "step": 3190 + }, + { + "epoch": 0.9825317429780684, + "grad_norm": 10.625, + "learning_rate": 6.309712014969993e-06, + "loss": 0.7392944693565369, + "step": 3192 + }, + { + "epoch": 0.9831473643709119, + "grad_norm": 10.375, + "learning_rate": 6.30573640697993e-06, + "loss": 1.530669927597046, + "step": 3194 + }, + { + "epoch": 0.9837629857637553, + "grad_norm": 3.25, + "learning_rate": 6.301760495172748e-06, + "loss": 1.13042414188385, + "step": 3196 + }, + { + "epoch": 0.9843786071565986, + "grad_norm": 5.1875, + "learning_rate": 6.297784283499397e-06, + "loss": 1.0704107284545898, + "step": 3198 + }, + { + "epoch": 0.9849942285494421, + "grad_norm": 5.625, + "learning_rate": 6.293807775911129e-06, + "loss": 1.2520568370819092, + "step": 3200 + }, + { + "epoch": 0.9856098499422855, + "grad_norm": 9.875, + "learning_rate": 6.289830976359488e-06, + "loss": 1.1383583545684814, + "step": 3202 + }, + { + "epoch": 0.986225471335129, + "grad_norm": 2.421875, + "learning_rate": 6.285853888796307e-06, + "loss": 0.9404821991920471, + "step": 3204 + }, + { + "epoch": 0.9868410927279723, + "grad_norm": 6.78125, + "learning_rate": 6.2818765171737106e-06, + "loss": 1.1734299659729004, + "step": 3206 + }, + { + "epoch": 0.9874567141208157, + "grad_norm": 6.8125, + "learning_rate": 6.277898865444101e-06, + "loss": 1.0691970586776733, + "step": 3208 + }, + { + "epoch": 0.9880723355136591, + "grad_norm": 7.625, + "learning_rate": 6.273920937560161e-06, + "loss": 1.4263228178024292, + "step": 3210 + }, + { + "epoch": 0.9886879569065025, + "grad_norm": 6.46875, + "learning_rate": 6.269942737474843e-06, + "loss": 1.433255672454834, + "step": 3212 + }, + { + "epoch": 0.9893035782993459, + "grad_norm": 3.109375, + "learning_rate": 6.265964269141375e-06, + "loss": 1.2142915725708008, + "step": 3214 + }, + { + "epoch": 0.9899191996921893, + "grad_norm": 6.90625, + "learning_rate": 6.261985536513253e-06, + "loss": 1.1429392099380493, + "step": 3216 + }, + { + "epoch": 0.9905348210850327, + "grad_norm": 8.125, + "learning_rate": 6.258006543544229e-06, + "loss": 1.1579214334487915, + "step": 3218 + }, + { + "epoch": 0.9911504424778761, + "grad_norm": 9.0625, + "learning_rate": 6.254027294188321e-06, + "loss": 1.15116548538208, + "step": 3220 + }, + { + "epoch": 0.9917660638707195, + "grad_norm": 11.0625, + "learning_rate": 6.2500477923997945e-06, + "loss": 1.5957486629486084, + "step": 3222 + }, + { + "epoch": 0.992381685263563, + "grad_norm": 11.8125, + "learning_rate": 6.246068042133173e-06, + "loss": 1.2969751358032227, + "step": 3224 + }, + { + "epoch": 0.9929973066564063, + "grad_norm": 6.28125, + "learning_rate": 6.242088047343222e-06, + "loss": 1.5214343070983887, + "step": 3226 + }, + { + "epoch": 0.9936129280492497, + "grad_norm": 11.3125, + "learning_rate": 6.238107811984951e-06, + "loss": 1.5400235652923584, + "step": 3228 + }, + { + "epoch": 0.9942285494420932, + "grad_norm": 8.0, + "learning_rate": 6.234127340013612e-06, + "loss": 1.5471324920654297, + "step": 3230 + }, + { + "epoch": 0.9948441708349365, + "grad_norm": 5.34375, + "learning_rate": 6.230146635384684e-06, + "loss": 1.3047486543655396, + "step": 3232 + }, + { + "epoch": 0.9954597922277799, + "grad_norm": 2.40625, + "learning_rate": 6.226165702053888e-06, + "loss": 0.83452308177948, + "step": 3234 + }, + { + "epoch": 0.9960754136206234, + "grad_norm": 3.09375, + "learning_rate": 6.222184543977163e-06, + "loss": 0.9607036113739014, + "step": 3236 + }, + { + "epoch": 0.9966910350134667, + "grad_norm": 4.59375, + "learning_rate": 6.218203165110676e-06, + "loss": 0.9204300045967102, + "step": 3238 + }, + { + "epoch": 0.9973066564063101, + "grad_norm": 7.40625, + "learning_rate": 6.214221569410815e-06, + "loss": 0.8008885383605957, + "step": 3240 + }, + { + "epoch": 0.9979222777991535, + "grad_norm": 5.1875, + "learning_rate": 6.2102397608341755e-06, + "loss": 1.2449378967285156, + "step": 3242 + }, + { + "epoch": 0.9985378991919969, + "grad_norm": 4.90625, + "learning_rate": 6.206257743337574e-06, + "loss": 1.1880314350128174, + "step": 3244 + }, + { + "epoch": 0.9991535205848403, + "grad_norm": 5.90625, + "learning_rate": 6.202275520878029e-06, + "loss": 1.374240517616272, + "step": 3246 + }, + { + "epoch": 0.9997691419776837, + "grad_norm": 7.03125, + "learning_rate": 6.198293097412766e-06, + "loss": 1.3699828386306763, + "step": 3248 + }, + { + "epoch": 1.0003078106964216, + "grad_norm": 1.28125, + "learning_rate": 6.1943104768992055e-06, + "loss": 1.3666250705718994, + "step": 3250 + }, + { + "epoch": 1.000923432089265, + "grad_norm": 7.125, + "learning_rate": 6.190327663294971e-06, + "loss": 1.4202823638916016, + "step": 3252 + }, + { + "epoch": 1.0015390534821085, + "grad_norm": 3.703125, + "learning_rate": 6.1863446605578705e-06, + "loss": 1.1655609607696533, + "step": 3254 + }, + { + "epoch": 1.002154674874952, + "grad_norm": 10.5625, + "learning_rate": 6.182361472645901e-06, + "loss": 1.410911202430725, + "step": 3256 + }, + { + "epoch": 1.0027702962677953, + "grad_norm": 10.0625, + "learning_rate": 6.178378103517251e-06, + "loss": 1.6366688013076782, + "step": 3258 + }, + { + "epoch": 1.0033859176606388, + "grad_norm": 3.59375, + "learning_rate": 6.174394557130279e-06, + "loss": 1.1457003355026245, + "step": 3260 + }, + { + "epoch": 1.0040015390534822, + "grad_norm": 18.0, + "learning_rate": 6.170410837443528e-06, + "loss": 0.724550187587738, + "step": 3262 + }, + { + "epoch": 1.0046171604463254, + "grad_norm": 3.515625, + "learning_rate": 6.166426948415708e-06, + "loss": 1.4719600677490234, + "step": 3264 + }, + { + "epoch": 1.0052327818391689, + "grad_norm": 6.34375, + "learning_rate": 6.162442894005698e-06, + "loss": 1.1914829015731812, + "step": 3266 + }, + { + "epoch": 1.0058484032320123, + "grad_norm": 7.09375, + "learning_rate": 6.158458678172543e-06, + "loss": 1.26486337184906, + "step": 3268 + }, + { + "epoch": 1.0064640246248557, + "grad_norm": 7.78125, + "learning_rate": 6.1544743048754484e-06, + "loss": 1.1825785636901855, + "step": 3270 + }, + { + "epoch": 1.0070796460176992, + "grad_norm": 10.375, + "learning_rate": 6.150489778073773e-06, + "loss": 1.5313773155212402, + "step": 3272 + }, + { + "epoch": 1.0076952674105426, + "grad_norm": 12.6875, + "learning_rate": 6.146505101727031e-06, + "loss": 1.0273815393447876, + "step": 3274 + }, + { + "epoch": 1.0083108888033858, + "grad_norm": 4.15625, + "learning_rate": 6.14252027979489e-06, + "loss": 1.1481852531433105, + "step": 3276 + }, + { + "epoch": 1.0089265101962293, + "grad_norm": 10.1875, + "learning_rate": 6.138535316237148e-06, + "loss": 1.4595853090286255, + "step": 3278 + }, + { + "epoch": 1.0095421315890727, + "grad_norm": 8.1875, + "learning_rate": 6.134550215013759e-06, + "loss": 1.178184151649475, + "step": 3280 + }, + { + "epoch": 1.0101577529819161, + "grad_norm": 9.5, + "learning_rate": 6.130564980084803e-06, + "loss": 1.7836490869522095, + "step": 3282 + }, + { + "epoch": 1.0107733743747596, + "grad_norm": 3.859375, + "learning_rate": 6.126579615410502e-06, + "loss": 1.275104284286499, + "step": 3284 + }, + { + "epoch": 1.011388995767603, + "grad_norm": 11.125, + "learning_rate": 6.122594124951198e-06, + "loss": 1.2140474319458008, + "step": 3286 + }, + { + "epoch": 1.0120046171604464, + "grad_norm": 3.375, + "learning_rate": 6.118608512667364e-06, + "loss": 1.1134839057922363, + "step": 3288 + }, + { + "epoch": 1.0126202385532896, + "grad_norm": 7.5625, + "learning_rate": 6.1146227825195934e-06, + "loss": 1.0847816467285156, + "step": 3290 + }, + { + "epoch": 1.013235859946133, + "grad_norm": 4.9375, + "learning_rate": 6.110636938468593e-06, + "loss": 1.0978894233703613, + "step": 3292 + }, + { + "epoch": 1.0138514813389765, + "grad_norm": 6.5, + "learning_rate": 6.1066509844751884e-06, + "loss": 1.5050498247146606, + "step": 3294 + }, + { + "epoch": 1.01446710273182, + "grad_norm": 3.671875, + "learning_rate": 6.10266492450031e-06, + "loss": 1.1623247861862183, + "step": 3296 + }, + { + "epoch": 1.0150827241246634, + "grad_norm": 4.75, + "learning_rate": 6.098678762504994e-06, + "loss": 1.3516665697097778, + "step": 3298 + }, + { + "epoch": 1.0156983455175068, + "grad_norm": 1.7109375, + "learning_rate": 6.09469250245038e-06, + "loss": 1.0650442838668823, + "step": 3300 + }, + { + "epoch": 1.01631396691035, + "grad_norm": 5.75, + "learning_rate": 6.090706148297702e-06, + "loss": 1.4884039163589478, + "step": 3302 + }, + { + "epoch": 1.0169295883031935, + "grad_norm": 3.375, + "learning_rate": 6.0867197040082925e-06, + "loss": 1.312299132347107, + "step": 3304 + }, + { + "epoch": 1.017545209696037, + "grad_norm": 11.625, + "learning_rate": 6.082733173543572e-06, + "loss": 1.6194450855255127, + "step": 3306 + }, + { + "epoch": 1.0181608310888803, + "grad_norm": 9.6875, + "learning_rate": 6.078746560865039e-06, + "loss": 1.4919713735580444, + "step": 3308 + }, + { + "epoch": 1.0187764524817238, + "grad_norm": 2.421875, + "learning_rate": 6.074759869934284e-06, + "loss": 1.1863977909088135, + "step": 3310 + }, + { + "epoch": 1.0193920738745672, + "grad_norm": 2.9375, + "learning_rate": 6.070773104712971e-06, + "loss": 1.208958625793457, + "step": 3312 + }, + { + "epoch": 1.0200076952674106, + "grad_norm": 5.3125, + "learning_rate": 6.0667862691628355e-06, + "loss": 1.1294976472854614, + "step": 3314 + }, + { + "epoch": 1.0206233166602539, + "grad_norm": 2.796875, + "learning_rate": 6.062799367245688e-06, + "loss": 0.984968364238739, + "step": 3316 + }, + { + "epoch": 1.0212389380530973, + "grad_norm": 32.75, + "learning_rate": 6.058812402923404e-06, + "loss": 1.56447172164917, + "step": 3318 + }, + { + "epoch": 1.0218545594459407, + "grad_norm": 6.125, + "learning_rate": 6.054825380157915e-06, + "loss": 1.4007642269134521, + "step": 3320 + }, + { + "epoch": 1.0224701808387842, + "grad_norm": 4.90625, + "learning_rate": 6.050838302911217e-06, + "loss": 1.3601939678192139, + "step": 3322 + }, + { + "epoch": 1.0230858022316276, + "grad_norm": 4.625, + "learning_rate": 6.046851175145356e-06, + "loss": 1.3050976991653442, + "step": 3324 + }, + { + "epoch": 1.023701423624471, + "grad_norm": 7.65625, + "learning_rate": 6.042864000822435e-06, + "loss": 1.4357898235321045, + "step": 3326 + }, + { + "epoch": 1.0243170450173142, + "grad_norm": 14.1875, + "learning_rate": 6.038876783904594e-06, + "loss": 1.1699950695037842, + "step": 3328 + }, + { + "epoch": 1.0249326664101577, + "grad_norm": 4.9375, + "learning_rate": 6.034889528354022e-06, + "loss": 1.083859920501709, + "step": 3330 + }, + { + "epoch": 1.0255482878030011, + "grad_norm": 5.53125, + "learning_rate": 6.030902238132943e-06, + "loss": 1.2051880359649658, + "step": 3332 + }, + { + "epoch": 1.0261639091958445, + "grad_norm": 3.265625, + "learning_rate": 6.026914917203617e-06, + "loss": 0.9221932888031006, + "step": 3334 + }, + { + "epoch": 1.026779530588688, + "grad_norm": 47.75, + "learning_rate": 6.022927569528336e-06, + "loss": 1.5843613147735596, + "step": 3336 + }, + { + "epoch": 1.0273951519815314, + "grad_norm": 17.5, + "learning_rate": 6.018940199069414e-06, + "loss": 1.3591049909591675, + "step": 3338 + }, + { + "epoch": 1.0280107733743749, + "grad_norm": 6.9375, + "learning_rate": 6.014952809789193e-06, + "loss": 1.2899994850158691, + "step": 3340 + }, + { + "epoch": 1.028626394767218, + "grad_norm": 7.25, + "learning_rate": 6.010965405650028e-06, + "loss": 1.698613166809082, + "step": 3342 + }, + { + "epoch": 1.0292420161600615, + "grad_norm": 7.21875, + "learning_rate": 6.006977990614293e-06, + "loss": 1.4977262020111084, + "step": 3344 + }, + { + "epoch": 1.029857637552905, + "grad_norm": 7.34375, + "learning_rate": 6.002990568644375e-06, + "loss": 1.124574065208435, + "step": 3346 + }, + { + "epoch": 1.0304732589457484, + "grad_norm": 2.25, + "learning_rate": 5.999003143702659e-06, + "loss": 1.0379462242126465, + "step": 3348 + }, + { + "epoch": 1.0310888803385918, + "grad_norm": 3.28125, + "learning_rate": 5.9950157197515445e-06, + "loss": 1.1994037628173828, + "step": 3350 + }, + { + "epoch": 1.0317045017314352, + "grad_norm": 8.4375, + "learning_rate": 5.9910283007534185e-06, + "loss": 1.3688772916793823, + "step": 3352 + }, + { + "epoch": 1.0323201231242787, + "grad_norm": 5.65625, + "learning_rate": 5.9870408906706725e-06, + "loss": 1.2714589834213257, + "step": 3354 + }, + { + "epoch": 1.032935744517122, + "grad_norm": 4.5625, + "learning_rate": 5.983053493465683e-06, + "loss": 1.300389051437378, + "step": 3356 + }, + { + "epoch": 1.0335513659099653, + "grad_norm": 12.5, + "learning_rate": 5.9790661131008175e-06, + "loss": 1.529327630996704, + "step": 3358 + }, + { + "epoch": 1.0341669873028088, + "grad_norm": 4.84375, + "learning_rate": 5.975078753538423e-06, + "loss": 1.053821086883545, + "step": 3360 + }, + { + "epoch": 1.0347826086956522, + "grad_norm": 12.9375, + "learning_rate": 5.971091418740833e-06, + "loss": 1.8510454893112183, + "step": 3362 + }, + { + "epoch": 1.0353982300884956, + "grad_norm": 6.03125, + "learning_rate": 5.9671041126703475e-06, + "loss": 1.4777189493179321, + "step": 3364 + }, + { + "epoch": 1.036013851481339, + "grad_norm": 8.5625, + "learning_rate": 5.96311683928924e-06, + "loss": 1.08958899974823, + "step": 3366 + }, + { + "epoch": 1.0366294728741823, + "grad_norm": 3.78125, + "learning_rate": 5.959129602559759e-06, + "loss": 1.5156233310699463, + "step": 3368 + }, + { + "epoch": 1.0372450942670257, + "grad_norm": 4.625, + "learning_rate": 5.955142406444105e-06, + "loss": 1.18940007686615, + "step": 3370 + }, + { + "epoch": 1.0378607156598691, + "grad_norm": 3.390625, + "learning_rate": 5.95115525490445e-06, + "loss": 1.036544919013977, + "step": 3372 + }, + { + "epoch": 1.0384763370527126, + "grad_norm": 1.75, + "learning_rate": 5.947168151902912e-06, + "loss": 1.041877269744873, + "step": 3374 + }, + { + "epoch": 1.039091958445556, + "grad_norm": 16.625, + "learning_rate": 5.943181101401567e-06, + "loss": 1.3080252408981323, + "step": 3376 + }, + { + "epoch": 1.0397075798383995, + "grad_norm": 4.8125, + "learning_rate": 5.939194107362434e-06, + "loss": 0.9550781846046448, + "step": 3378 + }, + { + "epoch": 1.0403232012312429, + "grad_norm": 5.0, + "learning_rate": 5.9352071737474795e-06, + "loss": 1.2598165273666382, + "step": 3380 + }, + { + "epoch": 1.040938822624086, + "grad_norm": 7.75, + "learning_rate": 5.931220304518608e-06, + "loss": 1.376179575920105, + "step": 3382 + }, + { + "epoch": 1.0415544440169295, + "grad_norm": 1.6796875, + "learning_rate": 5.9272335036376615e-06, + "loss": 1.0743067264556885, + "step": 3384 + }, + { + "epoch": 1.042170065409773, + "grad_norm": 2.875, + "learning_rate": 5.923246775066416e-06, + "loss": 0.9674758315086365, + "step": 3386 + }, + { + "epoch": 1.0427856868026164, + "grad_norm": 6.03125, + "learning_rate": 5.91926012276657e-06, + "loss": 1.6355992555618286, + "step": 3388 + }, + { + "epoch": 1.0434013081954598, + "grad_norm": 3.40625, + "learning_rate": 5.915273550699748e-06, + "loss": 1.4389272928237915, + "step": 3390 + }, + { + "epoch": 1.0440169295883033, + "grad_norm": 5.8125, + "learning_rate": 5.911287062827499e-06, + "loss": 1.3074742555618286, + "step": 3392 + }, + { + "epoch": 1.0446325509811465, + "grad_norm": 4.9375, + "learning_rate": 5.907300663111284e-06, + "loss": 1.1949514150619507, + "step": 3394 + }, + { + "epoch": 1.04524817237399, + "grad_norm": 11.9375, + "learning_rate": 5.903314355512477e-06, + "loss": 1.5670204162597656, + "step": 3396 + }, + { + "epoch": 1.0458637937668334, + "grad_norm": 6.375, + "learning_rate": 5.899328143992364e-06, + "loss": 0.9783201813697815, + "step": 3398 + }, + { + "epoch": 1.0464794151596768, + "grad_norm": 5.15625, + "learning_rate": 5.895342032512132e-06, + "loss": 1.126775860786438, + "step": 3400 + }, + { + "epoch": 1.0470950365525202, + "grad_norm": 13.5, + "learning_rate": 5.891356025032866e-06, + "loss": 1.6031320095062256, + "step": 3402 + }, + { + "epoch": 1.0477106579453637, + "grad_norm": 3.375, + "learning_rate": 5.887370125515554e-06, + "loss": 1.0438646078109741, + "step": 3404 + }, + { + "epoch": 1.048326279338207, + "grad_norm": 14.375, + "learning_rate": 5.883384337921072e-06, + "loss": 1.4825729131698608, + "step": 3406 + }, + { + "epoch": 1.0489419007310503, + "grad_norm": 12.1875, + "learning_rate": 5.879398666210189e-06, + "loss": 0.9457087516784668, + "step": 3408 + }, + { + "epoch": 1.0495575221238937, + "grad_norm": 5.15625, + "learning_rate": 5.875413114343554e-06, + "loss": 1.3323965072631836, + "step": 3410 + }, + { + "epoch": 1.0501731435167372, + "grad_norm": 6.09375, + "learning_rate": 5.871427686281699e-06, + "loss": 0.9091135859489441, + "step": 3412 + }, + { + "epoch": 1.0507887649095806, + "grad_norm": 40.0, + "learning_rate": 5.867442385985036e-06, + "loss": 0.8349736928939819, + "step": 3414 + }, + { + "epoch": 1.051404386302424, + "grad_norm": 24.625, + "learning_rate": 5.863457217413845e-06, + "loss": 1.8638713359832764, + "step": 3416 + }, + { + "epoch": 1.0520200076952675, + "grad_norm": 11.125, + "learning_rate": 5.859472184528279e-06, + "loss": 1.314063549041748, + "step": 3418 + }, + { + "epoch": 1.052635629088111, + "grad_norm": 1.8515625, + "learning_rate": 5.855487291288351e-06, + "loss": 1.1677442789077759, + "step": 3420 + }, + { + "epoch": 1.0532512504809541, + "grad_norm": 7.84375, + "learning_rate": 5.85150254165394e-06, + "loss": 1.2674967050552368, + "step": 3422 + }, + { + "epoch": 1.0538668718737976, + "grad_norm": 6.375, + "learning_rate": 5.847517939584783e-06, + "loss": 1.268509864807129, + "step": 3424 + }, + { + "epoch": 1.054482493266641, + "grad_norm": 4.25, + "learning_rate": 5.84353348904047e-06, + "loss": 1.3440495729446411, + "step": 3426 + }, + { + "epoch": 1.0550981146594844, + "grad_norm": 11.4375, + "learning_rate": 5.839549193980434e-06, + "loss": 1.240549087524414, + "step": 3428 + }, + { + "epoch": 1.0557137360523279, + "grad_norm": 6.03125, + "learning_rate": 5.835565058363962e-06, + "loss": 1.4629731178283691, + "step": 3430 + }, + { + "epoch": 1.0563293574451713, + "grad_norm": 13.375, + "learning_rate": 5.831581086150177e-06, + "loss": 0.8581249117851257, + "step": 3432 + }, + { + "epoch": 1.0569449788380145, + "grad_norm": 14.25, + "learning_rate": 5.827597281298041e-06, + "loss": 1.3019016981124878, + "step": 3434 + }, + { + "epoch": 1.057560600230858, + "grad_norm": 11.0, + "learning_rate": 5.823613647766351e-06, + "loss": 0.8576952815055847, + "step": 3436 + }, + { + "epoch": 1.0581762216237014, + "grad_norm": 7.0, + "learning_rate": 5.819630189513734e-06, + "loss": 1.5954262018203735, + "step": 3438 + }, + { + "epoch": 1.0587918430165448, + "grad_norm": 3.484375, + "learning_rate": 5.815646910498642e-06, + "loss": 1.2628898620605469, + "step": 3440 + }, + { + "epoch": 1.0594074644093883, + "grad_norm": 2.234375, + "learning_rate": 5.811663814679345e-06, + "loss": 1.136056900024414, + "step": 3442 + }, + { + "epoch": 1.0600230858022317, + "grad_norm": 8.5, + "learning_rate": 5.807680906013937e-06, + "loss": 1.1236200332641602, + "step": 3444 + }, + { + "epoch": 1.060638707195075, + "grad_norm": 7.03125, + "learning_rate": 5.803698188460325e-06, + "loss": 1.1918787956237793, + "step": 3446 + }, + { + "epoch": 1.0612543285879183, + "grad_norm": 8.5625, + "learning_rate": 5.799715665976224e-06, + "loss": 1.402172327041626, + "step": 3448 + }, + { + "epoch": 1.0618699499807618, + "grad_norm": 7.03125, + "learning_rate": 5.795733342519154e-06, + "loss": 1.1775987148284912, + "step": 3450 + }, + { + "epoch": 1.0624855713736052, + "grad_norm": 25.375, + "learning_rate": 5.7917512220464424e-06, + "loss": 1.1295747756958008, + "step": 3452 + }, + { + "epoch": 1.0631011927664487, + "grad_norm": 7.03125, + "learning_rate": 5.787769308515208e-06, + "loss": 1.3732116222381592, + "step": 3454 + }, + { + "epoch": 1.063716814159292, + "grad_norm": 8.9375, + "learning_rate": 5.783787605882367e-06, + "loss": 1.401436448097229, + "step": 3456 + }, + { + "epoch": 1.0643324355521355, + "grad_norm": 6.53125, + "learning_rate": 5.77980611810463e-06, + "loss": 1.3957934379577637, + "step": 3458 + }, + { + "epoch": 1.0649480569449787, + "grad_norm": 8.1875, + "learning_rate": 5.775824849138491e-06, + "loss": 1.8235782384872437, + "step": 3460 + }, + { + "epoch": 1.0655636783378222, + "grad_norm": 5.65625, + "learning_rate": 5.7718438029402225e-06, + "loss": 1.2653405666351318, + "step": 3462 + }, + { + "epoch": 1.0661792997306656, + "grad_norm": 10.5, + "learning_rate": 5.767862983465884e-06, + "loss": 1.286799669265747, + "step": 3464 + }, + { + "epoch": 1.066794921123509, + "grad_norm": 10.9375, + "learning_rate": 5.763882394671299e-06, + "loss": 1.1525325775146484, + "step": 3466 + }, + { + "epoch": 1.0674105425163525, + "grad_norm": 10.1875, + "learning_rate": 5.759902040512073e-06, + "loss": 1.480642318725586, + "step": 3468 + }, + { + "epoch": 1.068026163909196, + "grad_norm": 5.59375, + "learning_rate": 5.755921924943571e-06, + "loss": 1.250403642654419, + "step": 3470 + }, + { + "epoch": 1.0686417853020393, + "grad_norm": 5.3125, + "learning_rate": 5.751942051920923e-06, + "loss": 1.1772561073303223, + "step": 3472 + }, + { + "epoch": 1.0692574066948826, + "grad_norm": 7.9375, + "learning_rate": 5.747962425399019e-06, + "loss": 1.0843044519424438, + "step": 3474 + }, + { + "epoch": 1.069873028087726, + "grad_norm": 1.8671875, + "learning_rate": 5.743983049332502e-06, + "loss": 1.2011973857879639, + "step": 3476 + }, + { + "epoch": 1.0704886494805694, + "grad_norm": 5.8125, + "learning_rate": 5.740003927675769e-06, + "loss": 1.196240782737732, + "step": 3478 + }, + { + "epoch": 1.0711042708734129, + "grad_norm": 6.15625, + "learning_rate": 5.73602506438296e-06, + "loss": 1.5370745658874512, + "step": 3480 + }, + { + "epoch": 1.0717198922662563, + "grad_norm": 7.3125, + "learning_rate": 5.732046463407961e-06, + "loss": 1.1346544027328491, + "step": 3482 + }, + { + "epoch": 1.0723355136590997, + "grad_norm": 7.34375, + "learning_rate": 5.728068128704399e-06, + "loss": 1.1971721649169922, + "step": 3484 + }, + { + "epoch": 1.0729511350519432, + "grad_norm": 5.90625, + "learning_rate": 5.724090064225634e-06, + "loss": 1.1389195919036865, + "step": 3486 + }, + { + "epoch": 1.0735667564447864, + "grad_norm": 5.625, + "learning_rate": 5.720112273924754e-06, + "loss": 1.1738673448562622, + "step": 3488 + }, + { + "epoch": 1.0741823778376298, + "grad_norm": 4.25, + "learning_rate": 5.716134761754584e-06, + "loss": 1.225260853767395, + "step": 3490 + }, + { + "epoch": 1.0747979992304733, + "grad_norm": 10.3125, + "learning_rate": 5.712157531667664e-06, + "loss": 0.9891336560249329, + "step": 3492 + }, + { + "epoch": 1.0754136206233167, + "grad_norm": 4.5, + "learning_rate": 5.708180587616257e-06, + "loss": 1.2247391939163208, + "step": 3494 + }, + { + "epoch": 1.0760292420161601, + "grad_norm": 5.8125, + "learning_rate": 5.704203933552339e-06, + "loss": 1.2745413780212402, + "step": 3496 + }, + { + "epoch": 1.0766448634090036, + "grad_norm": 4.34375, + "learning_rate": 5.7002275734276034e-06, + "loss": 1.155380368232727, + "step": 3498 + }, + { + "epoch": 1.0772604848018468, + "grad_norm": 5.8125, + "learning_rate": 5.696251511193449e-06, + "loss": 1.122201681137085, + "step": 3500 + }, + { + "epoch": 1.0778761061946902, + "grad_norm": 5.40625, + "learning_rate": 5.692275750800977e-06, + "loss": 1.2920207977294922, + "step": 3502 + }, + { + "epoch": 1.0784917275875336, + "grad_norm": 51.25, + "learning_rate": 5.68830029620099e-06, + "loss": 1.4547216892242432, + "step": 3504 + }, + { + "epoch": 1.079107348980377, + "grad_norm": 7.0625, + "learning_rate": 5.6843251513439845e-06, + "loss": 1.7656255960464478, + "step": 3506 + }, + { + "epoch": 1.0797229703732205, + "grad_norm": 8.1875, + "learning_rate": 5.680350320180152e-06, + "loss": 1.4713743925094604, + "step": 3508 + }, + { + "epoch": 1.080338591766064, + "grad_norm": 13.75, + "learning_rate": 5.676375806659371e-06, + "loss": 0.9524730443954468, + "step": 3510 + }, + { + "epoch": 1.0809542131589072, + "grad_norm": 18.375, + "learning_rate": 5.6724016147312065e-06, + "loss": 1.7022722959518433, + "step": 3512 + }, + { + "epoch": 1.0815698345517506, + "grad_norm": 5.84375, + "learning_rate": 5.6684277483449e-06, + "loss": 1.0908372402191162, + "step": 3514 + }, + { + "epoch": 1.082185455944594, + "grad_norm": 2.875, + "learning_rate": 5.664454211449373e-06, + "loss": 1.2222366333007812, + "step": 3516 + }, + { + "epoch": 1.0828010773374375, + "grad_norm": 5.0, + "learning_rate": 5.660481007993218e-06, + "loss": 1.3541144132614136, + "step": 3518 + }, + { + "epoch": 1.083416698730281, + "grad_norm": 8.875, + "learning_rate": 5.656508141924695e-06, + "loss": 1.100854754447937, + "step": 3520 + }, + { + "epoch": 1.0840323201231243, + "grad_norm": 5.6875, + "learning_rate": 5.6525356171917314e-06, + "loss": 0.957051157951355, + "step": 3522 + }, + { + "epoch": 1.0846479415159678, + "grad_norm": 6.96875, + "learning_rate": 5.648563437741913e-06, + "loss": 1.4000749588012695, + "step": 3524 + }, + { + "epoch": 1.085263562908811, + "grad_norm": 4.90625, + "learning_rate": 5.6445916075224845e-06, + "loss": 1.1802750825881958, + "step": 3526 + }, + { + "epoch": 1.0858791843016544, + "grad_norm": 9.875, + "learning_rate": 5.640620130480343e-06, + "loss": 1.3265138864517212, + "step": 3528 + }, + { + "epoch": 1.0864948056944979, + "grad_norm": 6.4375, + "learning_rate": 5.636649010562034e-06, + "loss": 1.4276715517044067, + "step": 3530 + }, + { + "epoch": 1.0871104270873413, + "grad_norm": 5.15625, + "learning_rate": 5.6326782517137475e-06, + "loss": 1.0193065404891968, + "step": 3532 + }, + { + "epoch": 1.0877260484801847, + "grad_norm": 10.8125, + "learning_rate": 5.628707857881317e-06, + "loss": 1.513476014137268, + "step": 3534 + }, + { + "epoch": 1.0883416698730282, + "grad_norm": 4.59375, + "learning_rate": 5.6247378330102085e-06, + "loss": 1.5175204277038574, + "step": 3536 + }, + { + "epoch": 1.0889572912658716, + "grad_norm": 3.8125, + "learning_rate": 5.62076818104553e-06, + "loss": 0.7854060530662537, + "step": 3538 + }, + { + "epoch": 1.0895729126587148, + "grad_norm": 7.71875, + "learning_rate": 5.616798905932008e-06, + "loss": 1.6784924268722534, + "step": 3540 + }, + { + "epoch": 1.0901885340515582, + "grad_norm": 1.7265625, + "learning_rate": 5.612830011614005e-06, + "loss": 0.6873236298561096, + "step": 3542 + }, + { + "epoch": 1.0908041554444017, + "grad_norm": 10.1875, + "learning_rate": 5.608861502035498e-06, + "loss": 1.4695472717285156, + "step": 3544 + }, + { + "epoch": 1.0914197768372451, + "grad_norm": 7.1875, + "learning_rate": 5.604893381140084e-06, + "loss": 1.2640148401260376, + "step": 3546 + }, + { + "epoch": 1.0920353982300885, + "grad_norm": 8.1875, + "learning_rate": 5.600925652870975e-06, + "loss": 0.8067486882209778, + "step": 3548 + }, + { + "epoch": 1.092651019622932, + "grad_norm": 5.75, + "learning_rate": 5.596958321170987e-06, + "loss": 1.2527005672454834, + "step": 3550 + }, + { + "epoch": 1.0932666410157752, + "grad_norm": 8.8125, + "learning_rate": 5.592991389982552e-06, + "loss": 1.3179057836532593, + "step": 3552 + }, + { + "epoch": 1.0938822624086186, + "grad_norm": 9.6875, + "learning_rate": 5.589024863247694e-06, + "loss": 1.688755989074707, + "step": 3554 + }, + { + "epoch": 1.094497883801462, + "grad_norm": 11.875, + "learning_rate": 5.585058744908045e-06, + "loss": 1.0170738697052002, + "step": 3556 + }, + { + "epoch": 1.0951135051943055, + "grad_norm": 10.75, + "learning_rate": 5.581093038904821e-06, + "loss": 1.4297583103179932, + "step": 3558 + }, + { + "epoch": 1.095729126587149, + "grad_norm": 10.25, + "learning_rate": 5.577127749178834e-06, + "loss": 1.9947293996810913, + "step": 3560 + }, + { + "epoch": 1.0963447479799924, + "grad_norm": 8.4375, + "learning_rate": 5.5731628796704825e-06, + "loss": 1.440248727798462, + "step": 3562 + }, + { + "epoch": 1.0969603693728358, + "grad_norm": 12.3125, + "learning_rate": 5.569198434319745e-06, + "loss": 0.8194329142570496, + "step": 3564 + }, + { + "epoch": 1.097575990765679, + "grad_norm": 6.96875, + "learning_rate": 5.565234417066179e-06, + "loss": 1.1109315156936646, + "step": 3566 + }, + { + "epoch": 1.0981916121585225, + "grad_norm": 7.96875, + "learning_rate": 5.561270831848922e-06, + "loss": 0.8463287949562073, + "step": 3568 + }, + { + "epoch": 1.098807233551366, + "grad_norm": 8.4375, + "learning_rate": 5.557307682606669e-06, + "loss": 1.1162123680114746, + "step": 3570 + }, + { + "epoch": 1.0994228549442093, + "grad_norm": 12.5625, + "learning_rate": 5.553344973277699e-06, + "loss": 1.2072829008102417, + "step": 3572 + }, + { + "epoch": 1.1000384763370528, + "grad_norm": 1.1484375, + "learning_rate": 5.5493827077998395e-06, + "loss": 0.888592004776001, + "step": 3574 + }, + { + "epoch": 1.1006540977298962, + "grad_norm": 11.875, + "learning_rate": 5.545420890110484e-06, + "loss": 1.5153419971466064, + "step": 3576 + }, + { + "epoch": 1.1012697191227394, + "grad_norm": 2.328125, + "learning_rate": 5.541459524146579e-06, + "loss": 1.3337959051132202, + "step": 3578 + }, + { + "epoch": 1.1018853405155828, + "grad_norm": 3.5625, + "learning_rate": 5.5374986138446255e-06, + "loss": 1.0780278444290161, + "step": 3580 + }, + { + "epoch": 1.1025009619084263, + "grad_norm": 4.5, + "learning_rate": 5.533538163140666e-06, + "loss": 1.2636305093765259, + "step": 3582 + }, + { + "epoch": 1.1031165833012697, + "grad_norm": 4.15625, + "learning_rate": 5.5295781759702895e-06, + "loss": 1.325244426727295, + "step": 3584 + }, + { + "epoch": 1.1037322046941132, + "grad_norm": 7.28125, + "learning_rate": 5.5256186562686255e-06, + "loss": 0.8051739931106567, + "step": 3586 + }, + { + "epoch": 1.1043478260869566, + "grad_norm": 4.78125, + "learning_rate": 5.521659607970334e-06, + "loss": 1.2588369846343994, + "step": 3588 + }, + { + "epoch": 1.1049634474798, + "grad_norm": 2.546875, + "learning_rate": 5.517701035009615e-06, + "loss": 1.2784472703933716, + "step": 3590 + }, + { + "epoch": 1.1055790688726432, + "grad_norm": 7.0625, + "learning_rate": 5.513742941320187e-06, + "loss": 1.315181016921997, + "step": 3592 + }, + { + "epoch": 1.1061946902654867, + "grad_norm": 4.65625, + "learning_rate": 5.5097853308353e-06, + "loss": 1.1988142728805542, + "step": 3594 + }, + { + "epoch": 1.10681031165833, + "grad_norm": 10.25, + "learning_rate": 5.505828207487717e-06, + "loss": 1.2534257173538208, + "step": 3596 + }, + { + "epoch": 1.1074259330511735, + "grad_norm": 5.78125, + "learning_rate": 5.501871575209721e-06, + "loss": 1.1419970989227295, + "step": 3598 + }, + { + "epoch": 1.108041554444017, + "grad_norm": 2.859375, + "learning_rate": 5.497915437933107e-06, + "loss": 0.7719895839691162, + "step": 3600 + }, + { + "epoch": 1.1086571758368604, + "grad_norm": 5.59375, + "learning_rate": 5.493959799589177e-06, + "loss": 1.2032313346862793, + "step": 3602 + }, + { + "epoch": 1.1092727972297038, + "grad_norm": 3.796875, + "learning_rate": 5.490004664108737e-06, + "loss": 1.3593542575836182, + "step": 3604 + }, + { + "epoch": 1.109888418622547, + "grad_norm": 4.4375, + "learning_rate": 5.486050035422094e-06, + "loss": 1.1797250509262085, + "step": 3606 + }, + { + "epoch": 1.1105040400153905, + "grad_norm": 5.78125, + "learning_rate": 5.4820959174590545e-06, + "loss": 1.2517540454864502, + "step": 3608 + }, + { + "epoch": 1.111119661408234, + "grad_norm": 9.5, + "learning_rate": 5.4781423141489085e-06, + "loss": 1.248819351196289, + "step": 3610 + }, + { + "epoch": 1.1117352828010774, + "grad_norm": 21.0, + "learning_rate": 5.474189229420443e-06, + "loss": 1.6034396886825562, + "step": 3612 + }, + { + "epoch": 1.1123509041939208, + "grad_norm": 4.125, + "learning_rate": 5.470236667201927e-06, + "loss": 1.0933693647384644, + "step": 3614 + }, + { + "epoch": 1.1129665255867642, + "grad_norm": 7.28125, + "learning_rate": 5.46628463142111e-06, + "loss": 0.682421863079071, + "step": 3616 + }, + { + "epoch": 1.1135821469796074, + "grad_norm": 8.5625, + "learning_rate": 5.462333126005217e-06, + "loss": 1.3613861799240112, + "step": 3618 + }, + { + "epoch": 1.1141977683724509, + "grad_norm": 6.34375, + "learning_rate": 5.458382154880953e-06, + "loss": 1.3445461988449097, + "step": 3620 + }, + { + "epoch": 1.1148133897652943, + "grad_norm": 17.75, + "learning_rate": 5.454431721974478e-06, + "loss": 1.3255103826522827, + "step": 3622 + }, + { + "epoch": 1.1154290111581378, + "grad_norm": 5.96875, + "learning_rate": 5.450481831211432e-06, + "loss": 1.2539777755737305, + "step": 3624 + }, + { + "epoch": 1.1160446325509812, + "grad_norm": 6.9375, + "learning_rate": 5.446532486516909e-06, + "loss": 1.1418551206588745, + "step": 3626 + }, + { + "epoch": 1.1166602539438246, + "grad_norm": 5.8125, + "learning_rate": 5.4425836918154585e-06, + "loss": 1.2032052278518677, + "step": 3628 + }, + { + "epoch": 1.1172758753366678, + "grad_norm": 7.3125, + "learning_rate": 5.438635451031089e-06, + "loss": 1.1165993213653564, + "step": 3630 + }, + { + "epoch": 1.1178914967295113, + "grad_norm": 7.40625, + "learning_rate": 5.434687768087256e-06, + "loss": 1.2143586874008179, + "step": 3632 + }, + { + "epoch": 1.1185071181223547, + "grad_norm": 4.6875, + "learning_rate": 5.43074064690686e-06, + "loss": 1.2352961301803589, + "step": 3634 + }, + { + "epoch": 1.1191227395151981, + "grad_norm": 12.0, + "learning_rate": 5.426794091412244e-06, + "loss": 1.598303198814392, + "step": 3636 + }, + { + "epoch": 1.1197383609080416, + "grad_norm": 32.5, + "learning_rate": 5.422848105525187e-06, + "loss": 0.8510976433753967, + "step": 3638 + }, + { + "epoch": 1.120353982300885, + "grad_norm": 15.8125, + "learning_rate": 5.4189026931669056e-06, + "loss": 1.3632495403289795, + "step": 3640 + }, + { + "epoch": 1.1209696036937284, + "grad_norm": 6.15625, + "learning_rate": 5.414957858258043e-06, + "loss": 1.2300214767456055, + "step": 3642 + }, + { + "epoch": 1.1215852250865717, + "grad_norm": 5.34375, + "learning_rate": 5.411013604718671e-06, + "loss": 1.2433948516845703, + "step": 3644 + }, + { + "epoch": 1.122200846479415, + "grad_norm": 1.8203125, + "learning_rate": 5.407069936468284e-06, + "loss": 1.080621361732483, + "step": 3646 + }, + { + "epoch": 1.1228164678722585, + "grad_norm": 4.09375, + "learning_rate": 5.403126857425791e-06, + "loss": 1.2390270233154297, + "step": 3648 + }, + { + "epoch": 1.123432089265102, + "grad_norm": 9.5625, + "learning_rate": 5.399184371509521e-06, + "loss": 1.39670991897583, + "step": 3650 + }, + { + "epoch": 1.1240477106579454, + "grad_norm": 7.25, + "learning_rate": 5.395242482637206e-06, + "loss": 0.806347131729126, + "step": 3652 + }, + { + "epoch": 1.1246633320507888, + "grad_norm": 3.40625, + "learning_rate": 5.391301194725993e-06, + "loss": 1.2208232879638672, + "step": 3654 + }, + { + "epoch": 1.1252789534436323, + "grad_norm": 5.53125, + "learning_rate": 5.387360511692427e-06, + "loss": 1.2304521799087524, + "step": 3656 + }, + { + "epoch": 1.1258945748364755, + "grad_norm": 5.25, + "learning_rate": 5.383420437452453e-06, + "loss": 1.4465998411178589, + "step": 3658 + }, + { + "epoch": 1.126510196229319, + "grad_norm": 3.734375, + "learning_rate": 5.379480975921414e-06, + "loss": 1.0305050611495972, + "step": 3660 + }, + { + "epoch": 1.1271258176221624, + "grad_norm": 8.1875, + "learning_rate": 5.375542131014038e-06, + "loss": 1.223438024520874, + "step": 3662 + }, + { + "epoch": 1.1277414390150058, + "grad_norm": 5.4375, + "learning_rate": 5.371603906644443e-06, + "loss": 1.3073363304138184, + "step": 3664 + }, + { + "epoch": 1.1283570604078492, + "grad_norm": 6.875, + "learning_rate": 5.3676663067261315e-06, + "loss": 1.14788818359375, + "step": 3666 + }, + { + "epoch": 1.1289726818006927, + "grad_norm": 6.71875, + "learning_rate": 5.363729335171985e-06, + "loss": 0.8086817264556885, + "step": 3668 + }, + { + "epoch": 1.129588303193536, + "grad_norm": 12.5625, + "learning_rate": 5.359792995894262e-06, + "loss": 1.2843672037124634, + "step": 3670 + }, + { + "epoch": 1.1302039245863793, + "grad_norm": 8.5625, + "learning_rate": 5.355857292804592e-06, + "loss": 0.9466153383255005, + "step": 3672 + }, + { + "epoch": 1.1308195459792227, + "grad_norm": 4.5, + "learning_rate": 5.351922229813965e-06, + "loss": 1.2944337129592896, + "step": 3674 + }, + { + "epoch": 1.1314351673720662, + "grad_norm": 3.40625, + "learning_rate": 5.347987810832747e-06, + "loss": 1.0941487550735474, + "step": 3676 + }, + { + "epoch": 1.1320507887649096, + "grad_norm": 4.09375, + "learning_rate": 5.344054039770656e-06, + "loss": 1.1674392223358154, + "step": 3678 + }, + { + "epoch": 1.132666410157753, + "grad_norm": 10.0, + "learning_rate": 5.340120920536771e-06, + "loss": 1.2395570278167725, + "step": 3680 + }, + { + "epoch": 1.1332820315505965, + "grad_norm": 8.1875, + "learning_rate": 5.336188457039517e-06, + "loss": 1.541485071182251, + "step": 3682 + }, + { + "epoch": 1.1338976529434397, + "grad_norm": 10.875, + "learning_rate": 5.332256653186674e-06, + "loss": 1.6763718128204346, + "step": 3684 + }, + { + "epoch": 1.1345132743362831, + "grad_norm": 6.875, + "learning_rate": 5.328325512885364e-06, + "loss": 1.7518420219421387, + "step": 3686 + }, + { + "epoch": 1.1351288957291266, + "grad_norm": 19.5, + "learning_rate": 5.3243950400420476e-06, + "loss": 0.7111321687698364, + "step": 3688 + }, + { + "epoch": 1.13574451712197, + "grad_norm": 5.8125, + "learning_rate": 5.320465238562522e-06, + "loss": 1.3289068937301636, + "step": 3690 + }, + { + "epoch": 1.1363601385148134, + "grad_norm": 7.9375, + "learning_rate": 5.316536112351923e-06, + "loss": 1.4628081321716309, + "step": 3692 + }, + { + "epoch": 1.1369757599076569, + "grad_norm": 3.765625, + "learning_rate": 5.312607665314708e-06, + "loss": 1.062358021736145, + "step": 3694 + }, + { + "epoch": 1.1375913813005, + "grad_norm": 10.25, + "learning_rate": 5.308679901354667e-06, + "loss": 1.35207200050354, + "step": 3696 + }, + { + "epoch": 1.1382070026933435, + "grad_norm": 5.15625, + "learning_rate": 5.304752824374904e-06, + "loss": 1.5505430698394775, + "step": 3698 + }, + { + "epoch": 1.138822624086187, + "grad_norm": 42.5, + "learning_rate": 5.300826438277842e-06, + "loss": 1.450918436050415, + "step": 3700 + }, + { + "epoch": 1.1394382454790304, + "grad_norm": 3.734375, + "learning_rate": 5.296900746965224e-06, + "loss": 1.0654526948928833, + "step": 3702 + }, + { + "epoch": 1.1400538668718738, + "grad_norm": 8.5, + "learning_rate": 5.29297575433809e-06, + "loss": 1.2837327718734741, + "step": 3704 + }, + { + "epoch": 1.1406694882647173, + "grad_norm": 4.84375, + "learning_rate": 5.2890514642967995e-06, + "loss": 1.2365937232971191, + "step": 3706 + }, + { + "epoch": 1.1412851096575607, + "grad_norm": 11.25, + "learning_rate": 5.2851278807410055e-06, + "loss": 1.6589319705963135, + "step": 3708 + }, + { + "epoch": 1.141900731050404, + "grad_norm": 5.59375, + "learning_rate": 5.281205007569663e-06, + "loss": 1.5612269639968872, + "step": 3710 + }, + { + "epoch": 1.1425163524432473, + "grad_norm": 4.4375, + "learning_rate": 5.2772828486810135e-06, + "loss": 1.5178145170211792, + "step": 3712 + }, + { + "epoch": 1.1431319738360908, + "grad_norm": 16.625, + "learning_rate": 5.2733614079726e-06, + "loss": 1.3569532632827759, + "step": 3714 + }, + { + "epoch": 1.1437475952289342, + "grad_norm": 9.25, + "learning_rate": 5.269440689341243e-06, + "loss": 1.2014302015304565, + "step": 3716 + }, + { + "epoch": 1.1443632166217776, + "grad_norm": 19.5, + "learning_rate": 5.265520696683048e-06, + "loss": 1.7885704040527344, + "step": 3718 + }, + { + "epoch": 1.144978838014621, + "grad_norm": 8.3125, + "learning_rate": 5.2616014338934005e-06, + "loss": 1.2593146562576294, + "step": 3720 + }, + { + "epoch": 1.1455944594074645, + "grad_norm": 4.75, + "learning_rate": 5.2576829048669606e-06, + "loss": 1.2763214111328125, + "step": 3722 + }, + { + "epoch": 1.1462100808003077, + "grad_norm": 10.1875, + "learning_rate": 5.253765113497659e-06, + "loss": 1.6070430278778076, + "step": 3724 + }, + { + "epoch": 1.1468257021931512, + "grad_norm": 3.828125, + "learning_rate": 5.249848063678691e-06, + "loss": 1.1164813041687012, + "step": 3726 + }, + { + "epoch": 1.1474413235859946, + "grad_norm": 3.125, + "learning_rate": 5.245931759302516e-06, + "loss": 1.0933690071105957, + "step": 3728 + }, + { + "epoch": 1.148056944978838, + "grad_norm": 4.28125, + "learning_rate": 5.2420162042608555e-06, + "loss": 1.2221918106079102, + "step": 3730 + }, + { + "epoch": 1.1486725663716815, + "grad_norm": 4.96875, + "learning_rate": 5.238101402444684e-06, + "loss": 1.1146072149276733, + "step": 3732 + }, + { + "epoch": 1.149288187764525, + "grad_norm": 8.75, + "learning_rate": 5.234187357744228e-06, + "loss": 1.342393159866333, + "step": 3734 + }, + { + "epoch": 1.1499038091573683, + "grad_norm": 4.5, + "learning_rate": 5.230274074048961e-06, + "loss": 1.3994978666305542, + "step": 3736 + }, + { + "epoch": 1.1505194305502116, + "grad_norm": 1.90625, + "learning_rate": 5.226361555247601e-06, + "loss": 1.3354339599609375, + "step": 3738 + }, + { + "epoch": 1.151135051943055, + "grad_norm": 6.4375, + "learning_rate": 5.222449805228103e-06, + "loss": 1.4526199102401733, + "step": 3740 + }, + { + "epoch": 1.1517506733358984, + "grad_norm": 5.71875, + "learning_rate": 5.218538827877664e-06, + "loss": 0.9815947413444519, + "step": 3742 + }, + { + "epoch": 1.1523662947287419, + "grad_norm": 7.46875, + "learning_rate": 5.214628627082709e-06, + "loss": 0.8592740297317505, + "step": 3744 + }, + { + "epoch": 1.1529819161215853, + "grad_norm": 5.78125, + "learning_rate": 5.2107192067288925e-06, + "loss": 1.4137502908706665, + "step": 3746 + }, + { + "epoch": 1.1535975375144285, + "grad_norm": 2.96875, + "learning_rate": 5.206810570701092e-06, + "loss": 1.4178833961486816, + "step": 3748 + }, + { + "epoch": 1.154213158907272, + "grad_norm": 4.71875, + "learning_rate": 5.20290272288341e-06, + "loss": 1.3415439128875732, + "step": 3750 + }, + { + "epoch": 1.1548287803001154, + "grad_norm": 8.375, + "learning_rate": 5.198995667159157e-06, + "loss": 1.3343955278396606, + "step": 3752 + }, + { + "epoch": 1.1554444016929588, + "grad_norm": 9.6875, + "learning_rate": 5.195089407410865e-06, + "loss": 1.5473103523254395, + "step": 3754 + }, + { + "epoch": 1.1560600230858022, + "grad_norm": 4.9375, + "learning_rate": 5.19118394752027e-06, + "loss": 1.0742156505584717, + "step": 3756 + }, + { + "epoch": 1.1566756444786457, + "grad_norm": 8.5, + "learning_rate": 5.187279291368319e-06, + "loss": 1.4407182931900024, + "step": 3758 + }, + { + "epoch": 1.1572912658714891, + "grad_norm": 33.5, + "learning_rate": 5.183375442835155e-06, + "loss": 1.318236231803894, + "step": 3760 + }, + { + "epoch": 1.1579068872643323, + "grad_norm": 9.6875, + "learning_rate": 5.1794724058001165e-06, + "loss": 1.6564728021621704, + "step": 3762 + }, + { + "epoch": 1.1585225086571758, + "grad_norm": 3.203125, + "learning_rate": 5.175570184141743e-06, + "loss": 1.357529640197754, + "step": 3764 + }, + { + "epoch": 1.1591381300500192, + "grad_norm": 4.65625, + "learning_rate": 5.171668781737756e-06, + "loss": 1.2191883325576782, + "step": 3766 + }, + { + "epoch": 1.1597537514428626, + "grad_norm": 7.78125, + "learning_rate": 5.167768202465069e-06, + "loss": 1.2753338813781738, + "step": 3768 + }, + { + "epoch": 1.160369372835706, + "grad_norm": 9.5625, + "learning_rate": 5.163868450199774e-06, + "loss": 1.4886462688446045, + "step": 3770 + }, + { + "epoch": 1.1609849942285495, + "grad_norm": 6.34375, + "learning_rate": 5.159969528817144e-06, + "loss": 1.4246331453323364, + "step": 3772 + }, + { + "epoch": 1.161600615621393, + "grad_norm": 8.0, + "learning_rate": 5.156071442191622e-06, + "loss": 1.5249407291412354, + "step": 3774 + }, + { + "epoch": 1.1622162370142362, + "grad_norm": 12.8125, + "learning_rate": 5.1521741941968265e-06, + "loss": 1.5107097625732422, + "step": 3776 + }, + { + "epoch": 1.1628318584070796, + "grad_norm": 2.09375, + "learning_rate": 5.148277788705537e-06, + "loss": 1.1826157569885254, + "step": 3778 + }, + { + "epoch": 1.163447479799923, + "grad_norm": 9.3125, + "learning_rate": 5.144382229589702e-06, + "loss": 1.1656603813171387, + "step": 3780 + }, + { + "epoch": 1.1640631011927665, + "grad_norm": 6.53125, + "learning_rate": 5.140487520720425e-06, + "loss": 1.2253952026367188, + "step": 3782 + }, + { + "epoch": 1.16467872258561, + "grad_norm": 15.0625, + "learning_rate": 5.136593665967964e-06, + "loss": 1.449235200881958, + "step": 3784 + }, + { + "epoch": 1.1652943439784533, + "grad_norm": 22.75, + "learning_rate": 5.1327006692017325e-06, + "loss": 1.1967664957046509, + "step": 3786 + }, + { + "epoch": 1.1659099653712968, + "grad_norm": 1.8359375, + "learning_rate": 5.128808534290288e-06, + "loss": 1.3245983123779297, + "step": 3788 + }, + { + "epoch": 1.16652558676414, + "grad_norm": 7.09375, + "learning_rate": 5.12491726510133e-06, + "loss": 1.5706483125686646, + "step": 3790 + }, + { + "epoch": 1.1671412081569834, + "grad_norm": 5.78125, + "learning_rate": 5.121026865501701e-06, + "loss": 1.29669189453125, + "step": 3792 + }, + { + "epoch": 1.1677568295498268, + "grad_norm": 21.5, + "learning_rate": 5.117137339357381e-06, + "loss": 0.7407981753349304, + "step": 3794 + }, + { + "epoch": 1.1683724509426703, + "grad_norm": 8.5, + "learning_rate": 5.113248690533475e-06, + "loss": 1.147154688835144, + "step": 3796 + }, + { + "epoch": 1.1689880723355137, + "grad_norm": 6.09375, + "learning_rate": 5.109360922894222e-06, + "loss": 1.1887047290802002, + "step": 3798 + }, + { + "epoch": 1.1696036937283572, + "grad_norm": 9.375, + "learning_rate": 5.105474040302985e-06, + "loss": 1.2298659086227417, + "step": 3800 + }, + { + "epoch": 1.1702193151212004, + "grad_norm": 5.375, + "learning_rate": 5.101588046622248e-06, + "loss": 1.3768701553344727, + "step": 3802 + }, + { + "epoch": 1.1708349365140438, + "grad_norm": 6.0, + "learning_rate": 5.097702945713605e-06, + "loss": 1.4939483404159546, + "step": 3804 + }, + { + "epoch": 1.1714505579068872, + "grad_norm": 20.0, + "learning_rate": 5.093818741437771e-06, + "loss": 1.765651822090149, + "step": 3806 + }, + { + "epoch": 1.1720661792997307, + "grad_norm": 6.59375, + "learning_rate": 5.089935437654565e-06, + "loss": 1.3605940341949463, + "step": 3808 + }, + { + "epoch": 1.172681800692574, + "grad_norm": 14.4375, + "learning_rate": 5.0860530382229175e-06, + "loss": 0.8839163184165955, + "step": 3810 + }, + { + "epoch": 1.1732974220854175, + "grad_norm": 9.375, + "learning_rate": 5.082171547000852e-06, + "loss": 1.1597310304641724, + "step": 3812 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 6.40625, + "learning_rate": 5.078290967845494e-06, + "loss": 0.9128301739692688, + "step": 3814 + }, + { + "epoch": 1.1745286648711042, + "grad_norm": 6.6875, + "learning_rate": 5.074411304613061e-06, + "loss": 1.056254267692566, + "step": 3816 + }, + { + "epoch": 1.1751442862639476, + "grad_norm": 13.875, + "learning_rate": 5.0705325611588626e-06, + "loss": 1.498201847076416, + "step": 3818 + }, + { + "epoch": 1.175759907656791, + "grad_norm": 8.3125, + "learning_rate": 5.066654741337294e-06, + "loss": 1.243221402168274, + "step": 3820 + }, + { + "epoch": 1.1763755290496345, + "grad_norm": 5.625, + "learning_rate": 5.06277784900183e-06, + "loss": 1.3041177988052368, + "step": 3822 + }, + { + "epoch": 1.176991150442478, + "grad_norm": 81.5, + "learning_rate": 5.0589018880050255e-06, + "loss": 1.1544214487075806, + "step": 3824 + }, + { + "epoch": 1.1776067718353214, + "grad_norm": 23.125, + "learning_rate": 5.055026862198511e-06, + "loss": 1.2737535238265991, + "step": 3826 + }, + { + "epoch": 1.1782223932281646, + "grad_norm": 5.15625, + "learning_rate": 5.051152775432987e-06, + "loss": 0.8115167021751404, + "step": 3828 + }, + { + "epoch": 1.178838014621008, + "grad_norm": 10.3125, + "learning_rate": 5.047279631558217e-06, + "loss": 1.2570488452911377, + "step": 3830 + }, + { + "epoch": 1.1794536360138514, + "grad_norm": 16.25, + "learning_rate": 5.043407434423036e-06, + "loss": 1.199446201324463, + "step": 3832 + }, + { + "epoch": 1.1800692574066949, + "grad_norm": 54.25, + "learning_rate": 5.039536187875328e-06, + "loss": 1.6809020042419434, + "step": 3834 + }, + { + "epoch": 1.1806848787995383, + "grad_norm": 9.8125, + "learning_rate": 5.0356658957620395e-06, + "loss": 1.2956881523132324, + "step": 3836 + }, + { + "epoch": 1.1813005001923818, + "grad_norm": 4.15625, + "learning_rate": 5.0317965619291676e-06, + "loss": 1.3684914112091064, + "step": 3838 + }, + { + "epoch": 1.1819161215852252, + "grad_norm": 7.25, + "learning_rate": 5.0279281902217555e-06, + "loss": 1.5077707767486572, + "step": 3840 + }, + { + "epoch": 1.1825317429780684, + "grad_norm": 5.90625, + "learning_rate": 5.02406078448389e-06, + "loss": 1.291996955871582, + "step": 3842 + }, + { + "epoch": 1.1831473643709118, + "grad_norm": 47.0, + "learning_rate": 5.0201943485587004e-06, + "loss": 1.2624106407165527, + "step": 3844 + }, + { + "epoch": 1.1837629857637553, + "grad_norm": 4.53125, + "learning_rate": 5.016328886288348e-06, + "loss": 1.0628662109375, + "step": 3846 + }, + { + "epoch": 1.1843786071565987, + "grad_norm": 8.625, + "learning_rate": 5.012464401514032e-06, + "loss": 0.8396918773651123, + "step": 3848 + }, + { + "epoch": 1.1849942285494421, + "grad_norm": 5.09375, + "learning_rate": 5.0086008980759775e-06, + "loss": 0.8910311460494995, + "step": 3850 + }, + { + "epoch": 1.1856098499422856, + "grad_norm": 6.8125, + "learning_rate": 5.004738379813432e-06, + "loss": 1.350878119468689, + "step": 3852 + }, + { + "epoch": 1.186225471335129, + "grad_norm": 4.8125, + "learning_rate": 5.000876850564671e-06, + "loss": 1.3546310663223267, + "step": 3854 + }, + { + "epoch": 1.1868410927279722, + "grad_norm": 8.9375, + "learning_rate": 4.997016314166978e-06, + "loss": 1.653855800628662, + "step": 3856 + }, + { + "epoch": 1.1874567141208157, + "grad_norm": 9.9375, + "learning_rate": 4.993156774456655e-06, + "loss": 1.289168119430542, + "step": 3858 + }, + { + "epoch": 1.188072335513659, + "grad_norm": 7.25, + "learning_rate": 4.989298235269015e-06, + "loss": 0.9712845683097839, + "step": 3860 + }, + { + "epoch": 1.1886879569065025, + "grad_norm": 2.484375, + "learning_rate": 4.985440700438375e-06, + "loss": 0.8265883326530457, + "step": 3862 + }, + { + "epoch": 1.189303578299346, + "grad_norm": 8.5625, + "learning_rate": 4.981584173798053e-06, + "loss": 1.333146333694458, + "step": 3864 + }, + { + "epoch": 1.1899191996921894, + "grad_norm": 20.375, + "learning_rate": 4.977728659180367e-06, + "loss": 1.0070924758911133, + "step": 3866 + }, + { + "epoch": 1.1905348210850326, + "grad_norm": 3.796875, + "learning_rate": 4.973874160416627e-06, + "loss": 0.8928322792053223, + "step": 3868 + }, + { + "epoch": 1.191150442477876, + "grad_norm": 6.65625, + "learning_rate": 4.970020681337134e-06, + "loss": 1.2990491390228271, + "step": 3870 + }, + { + "epoch": 1.1917660638707195, + "grad_norm": 5.84375, + "learning_rate": 4.966168225771179e-06, + "loss": 1.4741895198822021, + "step": 3872 + }, + { + "epoch": 1.192381685263563, + "grad_norm": 6.125, + "learning_rate": 4.962316797547031e-06, + "loss": 0.9886192083358765, + "step": 3874 + }, + { + "epoch": 1.1929973066564064, + "grad_norm": 8.5625, + "learning_rate": 4.958466400491943e-06, + "loss": 1.517229437828064, + "step": 3876 + }, + { + "epoch": 1.1936129280492498, + "grad_norm": 11.1875, + "learning_rate": 4.954617038432139e-06, + "loss": 1.8023860454559326, + "step": 3878 + }, + { + "epoch": 1.194228549442093, + "grad_norm": 5.78125, + "learning_rate": 4.950768715192819e-06, + "loss": 1.4020851850509644, + "step": 3880 + }, + { + "epoch": 1.1948441708349364, + "grad_norm": 4.78125, + "learning_rate": 4.946921434598144e-06, + "loss": 1.0979114770889282, + "step": 3882 + }, + { + "epoch": 1.1954597922277799, + "grad_norm": 6.75, + "learning_rate": 4.943075200471245e-06, + "loss": 1.4677519798278809, + "step": 3884 + }, + { + "epoch": 1.1960754136206233, + "grad_norm": 6.0625, + "learning_rate": 4.939230016634211e-06, + "loss": 0.9861754179000854, + "step": 3886 + }, + { + "epoch": 1.1966910350134667, + "grad_norm": 27.25, + "learning_rate": 4.935385886908089e-06, + "loss": 1.3366788625717163, + "step": 3888 + }, + { + "epoch": 1.1973066564063102, + "grad_norm": 9.875, + "learning_rate": 4.931542815112875e-06, + "loss": 1.57736337184906, + "step": 3890 + }, + { + "epoch": 1.1979222777991536, + "grad_norm": 6.5625, + "learning_rate": 4.927700805067516e-06, + "loss": 1.2437076568603516, + "step": 3892 + }, + { + "epoch": 1.1985378991919968, + "grad_norm": 23.625, + "learning_rate": 4.9238598605899035e-06, + "loss": 1.3512110710144043, + "step": 3894 + }, + { + "epoch": 1.1991535205848403, + "grad_norm": 7.0, + "learning_rate": 4.920019985496869e-06, + "loss": 0.9439166188240051, + "step": 3896 + }, + { + "epoch": 1.1997691419776837, + "grad_norm": 7.09375, + "learning_rate": 4.916181183604184e-06, + "loss": 1.318813681602478, + "step": 3898 + }, + { + "epoch": 1.2003847633705271, + "grad_norm": 4.6875, + "learning_rate": 4.912343458726552e-06, + "loss": 1.2898008823394775, + "step": 3900 + }, + { + "epoch": 1.2010003847633706, + "grad_norm": 10.0, + "learning_rate": 4.908506814677605e-06, + "loss": 1.2824454307556152, + "step": 3902 + }, + { + "epoch": 1.201616006156214, + "grad_norm": 4.4375, + "learning_rate": 4.904671255269903e-06, + "loss": 1.2138501405715942, + "step": 3904 + }, + { + "epoch": 1.2022316275490574, + "grad_norm": 11.375, + "learning_rate": 4.9008367843149296e-06, + "loss": 1.675101637840271, + "step": 3906 + }, + { + "epoch": 1.2028472489419006, + "grad_norm": 11.4375, + "learning_rate": 4.89700340562308e-06, + "loss": 1.4074642658233643, + "step": 3908 + }, + { + "epoch": 1.203462870334744, + "grad_norm": 4.40625, + "learning_rate": 4.893171123003672e-06, + "loss": 1.2531607151031494, + "step": 3910 + }, + { + "epoch": 1.2040784917275875, + "grad_norm": 5.40625, + "learning_rate": 4.889339940264929e-06, + "loss": 1.2239080667495728, + "step": 3912 + }, + { + "epoch": 1.204694113120431, + "grad_norm": 5.4375, + "learning_rate": 4.8855098612139835e-06, + "loss": 0.7669562101364136, + "step": 3914 + }, + { + "epoch": 1.2053097345132744, + "grad_norm": 6.8125, + "learning_rate": 4.8816808896568705e-06, + "loss": 1.0785932540893555, + "step": 3916 + }, + { + "epoch": 1.2059253559061178, + "grad_norm": 6.78125, + "learning_rate": 4.877853029398527e-06, + "loss": 1.1975194215774536, + "step": 3918 + }, + { + "epoch": 1.2065409772989613, + "grad_norm": 7.59375, + "learning_rate": 4.874026284242782e-06, + "loss": 1.4719704389572144, + "step": 3920 + }, + { + "epoch": 1.2071565986918045, + "grad_norm": 5.03125, + "learning_rate": 4.870200657992358e-06, + "loss": 1.1891669034957886, + "step": 3922 + }, + { + "epoch": 1.207772220084648, + "grad_norm": 7.71875, + "learning_rate": 4.866376154448864e-06, + "loss": 1.0619840621948242, + "step": 3924 + }, + { + "epoch": 1.2083878414774913, + "grad_norm": 6.8125, + "learning_rate": 4.862552777412796e-06, + "loss": 1.3883061408996582, + "step": 3926 + }, + { + "epoch": 1.2090034628703348, + "grad_norm": 7.8125, + "learning_rate": 4.858730530683532e-06, + "loss": 1.4634684324264526, + "step": 3928 + }, + { + "epoch": 1.2096190842631782, + "grad_norm": 6.59375, + "learning_rate": 4.854909418059323e-06, + "loss": 1.2528977394104004, + "step": 3930 + }, + { + "epoch": 1.2102347056560214, + "grad_norm": 66.0, + "learning_rate": 4.851089443337291e-06, + "loss": 1.2767219543457031, + "step": 3932 + }, + { + "epoch": 1.2108503270488649, + "grad_norm": 4.6875, + "learning_rate": 4.8472706103134344e-06, + "loss": 1.2410752773284912, + "step": 3934 + }, + { + "epoch": 1.2114659484417083, + "grad_norm": 9.875, + "learning_rate": 4.8434529227826106e-06, + "loss": 1.6885144710540771, + "step": 3936 + }, + { + "epoch": 1.2120815698345517, + "grad_norm": 9.25, + "learning_rate": 4.839636384538543e-06, + "loss": 1.6024225950241089, + "step": 3938 + }, + { + "epoch": 1.2126971912273952, + "grad_norm": 8.0625, + "learning_rate": 4.83582099937381e-06, + "loss": 1.3082659244537354, + "step": 3940 + }, + { + "epoch": 1.2133128126202386, + "grad_norm": 4.53125, + "learning_rate": 4.832006771079847e-06, + "loss": 1.046728253364563, + "step": 3942 + }, + { + "epoch": 1.213928434013082, + "grad_norm": 5.40625, + "learning_rate": 4.8281937034469364e-06, + "loss": 1.2913883924484253, + "step": 3944 + }, + { + "epoch": 1.2145440554059252, + "grad_norm": 9.375, + "learning_rate": 4.824381800264211e-06, + "loss": 1.5368022918701172, + "step": 3946 + }, + { + "epoch": 1.2151596767987687, + "grad_norm": 5.65625, + "learning_rate": 4.820571065319641e-06, + "loss": 1.0252327919006348, + "step": 3948 + }, + { + "epoch": 1.2157752981916121, + "grad_norm": 6.125, + "learning_rate": 4.816761502400042e-06, + "loss": 1.1273882389068604, + "step": 3950 + }, + { + "epoch": 1.2163909195844556, + "grad_norm": 4.21875, + "learning_rate": 4.8129531152910615e-06, + "loss": 1.1979624032974243, + "step": 3952 + }, + { + "epoch": 1.217006540977299, + "grad_norm": 18.125, + "learning_rate": 4.80914590777718e-06, + "loss": 1.2820459604263306, + "step": 3954 + }, + { + "epoch": 1.2176221623701424, + "grad_norm": 4.9375, + "learning_rate": 4.805339883641704e-06, + "loss": 1.5242979526519775, + "step": 3956 + }, + { + "epoch": 1.2182377837629859, + "grad_norm": 5.21875, + "learning_rate": 4.801535046666763e-06, + "loss": 1.2126457691192627, + "step": 3958 + }, + { + "epoch": 1.218853405155829, + "grad_norm": 10.1875, + "learning_rate": 4.797731400633312e-06, + "loss": 1.3734692335128784, + "step": 3960 + }, + { + "epoch": 1.2194690265486725, + "grad_norm": 10.875, + "learning_rate": 4.793928949321117e-06, + "loss": 1.368640661239624, + "step": 3962 + }, + { + "epoch": 1.220084647941516, + "grad_norm": 2.140625, + "learning_rate": 4.79012769650876e-06, + "loss": 1.1736152172088623, + "step": 3964 + }, + { + "epoch": 1.2207002693343594, + "grad_norm": 9.625, + "learning_rate": 4.786327645973633e-06, + "loss": 1.005432367324829, + "step": 3966 + }, + { + "epoch": 1.2213158907272028, + "grad_norm": 9.875, + "learning_rate": 4.782528801491928e-06, + "loss": 1.5107418298721313, + "step": 3968 + }, + { + "epoch": 1.2219315121200462, + "grad_norm": 7.8125, + "learning_rate": 4.778731166838646e-06, + "loss": 1.278852939605713, + "step": 3970 + }, + { + "epoch": 1.2225471335128897, + "grad_norm": 6.0, + "learning_rate": 4.774934745787577e-06, + "loss": 1.2914077043533325, + "step": 3972 + }, + { + "epoch": 1.223162754905733, + "grad_norm": 5.21875, + "learning_rate": 4.7711395421113124e-06, + "loss": 0.9130945801734924, + "step": 3974 + }, + { + "epoch": 1.2237783762985763, + "grad_norm": 50.75, + "learning_rate": 4.767345559581231e-06, + "loss": 1.1317358016967773, + "step": 3976 + }, + { + "epoch": 1.2243939976914198, + "grad_norm": 7.4375, + "learning_rate": 4.763552801967498e-06, + "loss": 1.285693645477295, + "step": 3978 + }, + { + "epoch": 1.2250096190842632, + "grad_norm": 2.96875, + "learning_rate": 4.759761273039061e-06, + "loss": 1.1612380743026733, + "step": 3980 + }, + { + "epoch": 1.2256252404771066, + "grad_norm": 2.53125, + "learning_rate": 4.75597097656365e-06, + "loss": 1.0672385692596436, + "step": 3982 + }, + { + "epoch": 1.22624086186995, + "grad_norm": 47.25, + "learning_rate": 4.7521819163077635e-06, + "loss": 1.1390371322631836, + "step": 3984 + }, + { + "epoch": 1.2268564832627935, + "grad_norm": 7.6875, + "learning_rate": 4.748394096036678e-06, + "loss": 1.5185624361038208, + "step": 3986 + }, + { + "epoch": 1.2274721046556367, + "grad_norm": 7.15625, + "learning_rate": 4.744607519514436e-06, + "loss": 1.2074450254440308, + "step": 3988 + }, + { + "epoch": 1.2280877260484802, + "grad_norm": 10.625, + "learning_rate": 4.740822190503841e-06, + "loss": 1.9707778692245483, + "step": 3990 + }, + { + "epoch": 1.2287033474413236, + "grad_norm": 12.4375, + "learning_rate": 4.737038112766461e-06, + "loss": 0.9399597644805908, + "step": 3992 + }, + { + "epoch": 1.229318968834167, + "grad_norm": 3.609375, + "learning_rate": 4.73325529006262e-06, + "loss": 1.2759641408920288, + "step": 3994 + }, + { + "epoch": 1.2299345902270105, + "grad_norm": 6.90625, + "learning_rate": 4.729473726151393e-06, + "loss": 1.0565696954727173, + "step": 3996 + }, + { + "epoch": 1.2305502116198537, + "grad_norm": 5.6875, + "learning_rate": 4.725693424790603e-06, + "loss": 1.1895898580551147, + "step": 3998 + }, + { + "epoch": 1.231165833012697, + "grad_norm": 7.625, + "learning_rate": 4.721914389736821e-06, + "loss": 1.195793867111206, + "step": 4000 + }, + { + "epoch": 1.2317814544055405, + "grad_norm": 12.8125, + "learning_rate": 4.71813662474536e-06, + "loss": 1.334660530090332, + "step": 4002 + }, + { + "epoch": 1.232397075798384, + "grad_norm": 6.25, + "learning_rate": 4.7143601335702686e-06, + "loss": 1.3096296787261963, + "step": 4004 + }, + { + "epoch": 1.2330126971912274, + "grad_norm": 9.3125, + "learning_rate": 4.71058491996433e-06, + "loss": 1.246321201324463, + "step": 4006 + }, + { + "epoch": 1.2336283185840708, + "grad_norm": 6.46875, + "learning_rate": 4.706810987679063e-06, + "loss": 1.372193694114685, + "step": 4008 + }, + { + "epoch": 1.2342439399769143, + "grad_norm": 5.96875, + "learning_rate": 4.703038340464704e-06, + "loss": 1.0236767530441284, + "step": 4010 + }, + { + "epoch": 1.2348595613697575, + "grad_norm": 6.375, + "learning_rate": 4.699266982070217e-06, + "loss": 1.4130743741989136, + "step": 4012 + }, + { + "epoch": 1.235475182762601, + "grad_norm": 7.65625, + "learning_rate": 4.695496916243287e-06, + "loss": 1.526268482208252, + "step": 4014 + }, + { + "epoch": 1.2360908041554444, + "grad_norm": 10.375, + "learning_rate": 4.691728146730314e-06, + "loss": 1.710693597793579, + "step": 4016 + }, + { + "epoch": 1.2367064255482878, + "grad_norm": 5.625, + "learning_rate": 4.6879606772764066e-06, + "loss": 1.1987316608428955, + "step": 4018 + }, + { + "epoch": 1.2373220469411312, + "grad_norm": 2.46875, + "learning_rate": 4.6841945116253865e-06, + "loss": 1.0062463283538818, + "step": 4020 + }, + { + "epoch": 1.2379376683339747, + "grad_norm": 8.0625, + "learning_rate": 4.680429653519775e-06, + "loss": 1.3855282068252563, + "step": 4022 + }, + { + "epoch": 1.238553289726818, + "grad_norm": 6.46875, + "learning_rate": 4.6766661067007945e-06, + "loss": 1.396597981452942, + "step": 4024 + }, + { + "epoch": 1.2391689111196613, + "grad_norm": 12.0, + "learning_rate": 4.6729038749083675e-06, + "loss": 1.0071481466293335, + "step": 4026 + }, + { + "epoch": 1.2397845325125048, + "grad_norm": 8.5, + "learning_rate": 4.669142961881108e-06, + "loss": 1.3875513076782227, + "step": 4028 + }, + { + "epoch": 1.2404001539053482, + "grad_norm": 8.3125, + "learning_rate": 4.665383371356321e-06, + "loss": 0.9031148552894592, + "step": 4030 + }, + { + "epoch": 1.2410157752981916, + "grad_norm": 6.09375, + "learning_rate": 4.661625107069992e-06, + "loss": 1.4057817459106445, + "step": 4032 + }, + { + "epoch": 1.241631396691035, + "grad_norm": 5.3125, + "learning_rate": 4.657868172756799e-06, + "loss": 1.468703269958496, + "step": 4034 + }, + { + "epoch": 1.2422470180838785, + "grad_norm": 6.90625, + "learning_rate": 4.654112572150084e-06, + "loss": 1.4883900880813599, + "step": 4036 + }, + { + "epoch": 1.242862639476722, + "grad_norm": 12.0, + "learning_rate": 4.650358308981876e-06, + "loss": 1.3661744594573975, + "step": 4038 + }, + { + "epoch": 1.2434782608695651, + "grad_norm": 5.34375, + "learning_rate": 4.6466053869828695e-06, + "loss": 1.223865270614624, + "step": 4040 + }, + { + "epoch": 1.2440938822624086, + "grad_norm": 12.25, + "learning_rate": 4.6428538098824284e-06, + "loss": 1.1496392488479614, + "step": 4042 + }, + { + "epoch": 1.244709503655252, + "grad_norm": 8.0, + "learning_rate": 4.639103581408577e-06, + "loss": 1.3361393213272095, + "step": 4044 + }, + { + "epoch": 1.2453251250480954, + "grad_norm": 19.125, + "learning_rate": 4.635354705288002e-06, + "loss": 1.4294644594192505, + "step": 4046 + }, + { + "epoch": 1.2459407464409389, + "grad_norm": 9.25, + "learning_rate": 4.631607185246048e-06, + "loss": 1.5690128803253174, + "step": 4048 + }, + { + "epoch": 1.2465563678337823, + "grad_norm": 5.65625, + "learning_rate": 4.6278610250067065e-06, + "loss": 1.3025476932525635, + "step": 4050 + }, + { + "epoch": 1.2471719892266255, + "grad_norm": 5.5625, + "learning_rate": 4.624116228292621e-06, + "loss": 1.5564523935317993, + "step": 4052 + }, + { + "epoch": 1.247787610619469, + "grad_norm": 7.25, + "learning_rate": 4.620372798825083e-06, + "loss": 1.5863494873046875, + "step": 4054 + }, + { + "epoch": 1.2484032320123124, + "grad_norm": 9.875, + "learning_rate": 4.61663074032402e-06, + "loss": 1.7407833337783813, + "step": 4056 + }, + { + "epoch": 1.2490188534051558, + "grad_norm": 53.5, + "learning_rate": 4.6128900565079985e-06, + "loss": 1.1799893379211426, + "step": 4058 + }, + { + "epoch": 1.2496344747979993, + "grad_norm": 14.1875, + "learning_rate": 4.609150751094223e-06, + "loss": 1.0459473133087158, + "step": 4060 + }, + { + "epoch": 1.2502500961908427, + "grad_norm": 4.0625, + "learning_rate": 4.605412827798521e-06, + "loss": 1.190739393234253, + "step": 4062 + }, + { + "epoch": 1.250865717583686, + "grad_norm": 17.625, + "learning_rate": 4.601676290335353e-06, + "loss": 1.0495179891586304, + "step": 4064 + }, + { + "epoch": 1.2514813389765294, + "grad_norm": 17.25, + "learning_rate": 4.597941142417801e-06, + "loss": 1.2779337167739868, + "step": 4066 + }, + { + "epoch": 1.2520969603693728, + "grad_norm": 2.96875, + "learning_rate": 4.594207387757563e-06, + "loss": 0.7309108376502991, + "step": 4068 + }, + { + "epoch": 1.2527125817622162, + "grad_norm": 4.875, + "learning_rate": 4.590475030064957e-06, + "loss": 1.3036484718322754, + "step": 4070 + }, + { + "epoch": 1.2533282031550597, + "grad_norm": 7.125, + "learning_rate": 4.586744073048908e-06, + "loss": 1.4698227643966675, + "step": 4072 + }, + { + "epoch": 1.253943824547903, + "grad_norm": 9.5, + "learning_rate": 4.5830145204169555e-06, + "loss": 1.1250114440917969, + "step": 4074 + }, + { + "epoch": 1.2545594459407465, + "grad_norm": 2.921875, + "learning_rate": 4.5792863758752355e-06, + "loss": 0.8555299639701843, + "step": 4076 + }, + { + "epoch": 1.2551750673335897, + "grad_norm": 4.0625, + "learning_rate": 4.575559643128489e-06, + "loss": 0.7287892699241638, + "step": 4078 + }, + { + "epoch": 1.2557906887264332, + "grad_norm": 5.9375, + "learning_rate": 4.571834325880056e-06, + "loss": 1.4676601886749268, + "step": 4080 + }, + { + "epoch": 1.2564063101192766, + "grad_norm": 7.78125, + "learning_rate": 4.568110427831867e-06, + "loss": 1.1823492050170898, + "step": 4082 + }, + { + "epoch": 1.25702193151212, + "grad_norm": 4.90625, + "learning_rate": 4.5643879526844435e-06, + "loss": 1.2526085376739502, + "step": 4084 + }, + { + "epoch": 1.2576375529049635, + "grad_norm": 7.875, + "learning_rate": 4.560666904136891e-06, + "loss": 0.753780722618103, + "step": 4086 + }, + { + "epoch": 1.258253174297807, + "grad_norm": 23.625, + "learning_rate": 4.556947285886901e-06, + "loss": 0.8512752056121826, + "step": 4088 + }, + { + "epoch": 1.2588687956906504, + "grad_norm": 20.75, + "learning_rate": 4.553229101630738e-06, + "loss": 1.7275090217590332, + "step": 4090 + }, + { + "epoch": 1.2594844170834936, + "grad_norm": 6.65625, + "learning_rate": 4.549512355063248e-06, + "loss": 1.0966256856918335, + "step": 4092 + }, + { + "epoch": 1.260100038476337, + "grad_norm": 6.4375, + "learning_rate": 4.545797049877845e-06, + "loss": 1.1546615362167358, + "step": 4094 + }, + { + "epoch": 1.2607156598691804, + "grad_norm": 7.0, + "learning_rate": 4.5420831897665115e-06, + "loss": 1.4722708463668823, + "step": 4096 + }, + { + "epoch": 1.2613312812620239, + "grad_norm": 9.125, + "learning_rate": 4.538370778419791e-06, + "loss": 1.5684009790420532, + "step": 4098 + }, + { + "epoch": 1.2619469026548673, + "grad_norm": 5.75, + "learning_rate": 4.534659819526794e-06, + "loss": 1.1395797729492188, + "step": 4100 + }, + { + "epoch": 1.2625625240477105, + "grad_norm": 3.71875, + "learning_rate": 4.53095031677518e-06, + "loss": 0.9826986193656921, + "step": 4102 + }, + { + "epoch": 1.2631781454405542, + "grad_norm": 8.0625, + "learning_rate": 4.527242273851166e-06, + "loss": 1.339382529258728, + "step": 4104 + }, + { + "epoch": 1.2637937668333974, + "grad_norm": 13.875, + "learning_rate": 4.523535694439516e-06, + "loss": 1.7129334211349487, + "step": 4106 + }, + { + "epoch": 1.2644093882262408, + "grad_norm": 7.53125, + "learning_rate": 4.519830582223545e-06, + "loss": 1.558545470237732, + "step": 4108 + }, + { + "epoch": 1.2650250096190843, + "grad_norm": 8.8125, + "learning_rate": 4.516126940885103e-06, + "loss": 1.5102803707122803, + "step": 4110 + }, + { + "epoch": 1.2656406310119277, + "grad_norm": 3.9375, + "learning_rate": 4.512424774104583e-06, + "loss": 1.365267038345337, + "step": 4112 + }, + { + "epoch": 1.2662562524047711, + "grad_norm": 8.4375, + "learning_rate": 4.508724085560908e-06, + "loss": 1.2057257890701294, + "step": 4114 + }, + { + "epoch": 1.2668718737976143, + "grad_norm": 5.34375, + "learning_rate": 4.505024878931539e-06, + "loss": 0.8204096555709839, + "step": 4116 + }, + { + "epoch": 1.267487495190458, + "grad_norm": 7.09375, + "learning_rate": 4.501327157892457e-06, + "loss": 0.9797679781913757, + "step": 4118 + }, + { + "epoch": 1.2681031165833012, + "grad_norm": 3.828125, + "learning_rate": 4.497630926118175e-06, + "loss": 1.0729451179504395, + "step": 4120 + }, + { + "epoch": 1.2687187379761447, + "grad_norm": 12.25, + "learning_rate": 4.493936187281717e-06, + "loss": 1.0971177816390991, + "step": 4122 + }, + { + "epoch": 1.269334359368988, + "grad_norm": 3.4375, + "learning_rate": 4.490242945054629e-06, + "loss": 0.6305558085441589, + "step": 4124 + }, + { + "epoch": 1.2699499807618315, + "grad_norm": 9.6875, + "learning_rate": 4.486551203106971e-06, + "loss": 1.2655073404312134, + "step": 4126 + }, + { + "epoch": 1.270565602154675, + "grad_norm": 9.75, + "learning_rate": 4.482860965107305e-06, + "loss": 1.462432861328125, + "step": 4128 + }, + { + "epoch": 1.2711812235475182, + "grad_norm": 19.25, + "learning_rate": 4.479172234722708e-06, + "loss": 1.0850521326065063, + "step": 4130 + }, + { + "epoch": 1.2717968449403616, + "grad_norm": 7.0, + "learning_rate": 4.47548501561875e-06, + "loss": 0.8474297523498535, + "step": 4132 + }, + { + "epoch": 1.272412466333205, + "grad_norm": 7.9375, + "learning_rate": 4.471799311459507e-06, + "loss": 1.212317943572998, + "step": 4134 + }, + { + "epoch": 1.2730280877260485, + "grad_norm": 5.5, + "learning_rate": 4.468115125907543e-06, + "loss": 1.1656124591827393, + "step": 4136 + }, + { + "epoch": 1.273643709118892, + "grad_norm": 5.8125, + "learning_rate": 4.464432462623918e-06, + "loss": 1.2986111640930176, + "step": 4138 + }, + { + "epoch": 1.2742593305117353, + "grad_norm": 7.875, + "learning_rate": 4.460751325268175e-06, + "loss": 1.3736796379089355, + "step": 4140 + }, + { + "epoch": 1.2748749519045788, + "grad_norm": 4.34375, + "learning_rate": 4.457071717498344e-06, + "loss": 1.3222219944000244, + "step": 4142 + }, + { + "epoch": 1.275490573297422, + "grad_norm": 6.0625, + "learning_rate": 4.453393642970933e-06, + "loss": 1.3862910270690918, + "step": 4144 + }, + { + "epoch": 1.2761061946902654, + "grad_norm": 5.25, + "learning_rate": 4.449717105340927e-06, + "loss": 1.3663526773452759, + "step": 4146 + }, + { + "epoch": 1.2767218160831089, + "grad_norm": 6.46875, + "learning_rate": 4.446042108261784e-06, + "loss": 1.046976089477539, + "step": 4148 + }, + { + "epoch": 1.2773374374759523, + "grad_norm": 10.1875, + "learning_rate": 4.442368655385434e-06, + "loss": 1.2302095890045166, + "step": 4150 + }, + { + "epoch": 1.2779530588687957, + "grad_norm": 6.90625, + "learning_rate": 4.438696750362265e-06, + "loss": 1.021092176437378, + "step": 4152 + }, + { + "epoch": 1.2785686802616392, + "grad_norm": 4.25, + "learning_rate": 4.435026396841133e-06, + "loss": 1.173077940940857, + "step": 4154 + }, + { + "epoch": 1.2791843016544826, + "grad_norm": 8.4375, + "learning_rate": 4.4313575984693505e-06, + "loss": 1.241083025932312, + "step": 4156 + }, + { + "epoch": 1.2797999230473258, + "grad_norm": 10.3125, + "learning_rate": 4.4276903588926846e-06, + "loss": 1.3768047094345093, + "step": 4158 + }, + { + "epoch": 1.2804155444401693, + "grad_norm": 6.75, + "learning_rate": 4.424024681755353e-06, + "loss": 0.6591784358024597, + "step": 4160 + }, + { + "epoch": 1.2810311658330127, + "grad_norm": 2.125, + "learning_rate": 4.4203605707000236e-06, + "loss": 1.0183517932891846, + "step": 4162 + }, + { + "epoch": 1.2816467872258561, + "grad_norm": 18.125, + "learning_rate": 4.4166980293678045e-06, + "loss": 0.8171372413635254, + "step": 4164 + }, + { + "epoch": 1.2822624086186996, + "grad_norm": 6.59375, + "learning_rate": 4.413037061398244e-06, + "loss": 1.4593392610549927, + "step": 4166 + }, + { + "epoch": 1.2828780300115428, + "grad_norm": 21.375, + "learning_rate": 4.4093776704293265e-06, + "loss": 1.1947382688522339, + "step": 4168 + }, + { + "epoch": 1.2834936514043864, + "grad_norm": 5.9375, + "learning_rate": 4.4057198600974745e-06, + "loss": 1.3039485216140747, + "step": 4170 + }, + { + "epoch": 1.2841092727972296, + "grad_norm": 3.171875, + "learning_rate": 4.402063634037535e-06, + "loss": 0.7116605043411255, + "step": 4172 + }, + { + "epoch": 1.284724894190073, + "grad_norm": 10.0625, + "learning_rate": 4.398408995882782e-06, + "loss": 1.313848614692688, + "step": 4174 + }, + { + "epoch": 1.2853405155829165, + "grad_norm": 14.625, + "learning_rate": 4.394755949264911e-06, + "loss": 1.5144187211990356, + "step": 4176 + }, + { + "epoch": 1.28595613697576, + "grad_norm": 21.25, + "learning_rate": 4.391104497814036e-06, + "loss": 1.0581088066101074, + "step": 4178 + }, + { + "epoch": 1.2865717583686034, + "grad_norm": 21.25, + "learning_rate": 4.3874546451586845e-06, + "loss": 1.2269830703735352, + "step": 4180 + }, + { + "epoch": 1.2871873797614466, + "grad_norm": 5.90625, + "learning_rate": 4.383806394925799e-06, + "loss": 1.0916420221328735, + "step": 4182 + }, + { + "epoch": 1.2878030011542902, + "grad_norm": 14.6875, + "learning_rate": 4.380159750740728e-06, + "loss": 1.0353869199752808, + "step": 4184 + }, + { + "epoch": 1.2884186225471335, + "grad_norm": 2.34375, + "learning_rate": 4.3765147162272225e-06, + "loss": 1.2946035861968994, + "step": 4186 + }, + { + "epoch": 1.289034243939977, + "grad_norm": 6.4375, + "learning_rate": 4.372871295007435e-06, + "loss": 1.1765669584274292, + "step": 4188 + }, + { + "epoch": 1.2896498653328203, + "grad_norm": 2.984375, + "learning_rate": 4.369229490701916e-06, + "loss": 1.099198341369629, + "step": 4190 + }, + { + "epoch": 1.2902654867256638, + "grad_norm": 4.84375, + "learning_rate": 4.365589306929607e-06, + "loss": 0.9606583118438721, + "step": 4192 + }, + { + "epoch": 1.2908811081185072, + "grad_norm": 2.609375, + "learning_rate": 4.361950747307839e-06, + "loss": 0.8651901483535767, + "step": 4194 + }, + { + "epoch": 1.2914967295113504, + "grad_norm": 6.53125, + "learning_rate": 4.358313815452333e-06, + "loss": 1.2352056503295898, + "step": 4196 + }, + { + "epoch": 1.2921123509041939, + "grad_norm": 8.3125, + "learning_rate": 4.354678514977188e-06, + "loss": 1.6388479471206665, + "step": 4198 + }, + { + "epoch": 1.2927279722970373, + "grad_norm": 4.25, + "learning_rate": 4.351044849494883e-06, + "loss": 1.20128333568573, + "step": 4200 + }, + { + "epoch": 1.2933435936898807, + "grad_norm": 8.5, + "learning_rate": 4.347412822616275e-06, + "loss": 1.2727470397949219, + "step": 4202 + }, + { + "epoch": 1.2939592150827242, + "grad_norm": 4.625, + "learning_rate": 4.343782437950589e-06, + "loss": 1.2012213468551636, + "step": 4204 + }, + { + "epoch": 1.2945748364755676, + "grad_norm": 4.0625, + "learning_rate": 4.3401536991054194e-06, + "loss": 1.129618525505066, + "step": 4206 + }, + { + "epoch": 1.295190457868411, + "grad_norm": 4.125, + "learning_rate": 4.336526609686726e-06, + "loss": 1.3237369060516357, + "step": 4208 + }, + { + "epoch": 1.2958060792612542, + "grad_norm": 15.125, + "learning_rate": 4.3329011732988285e-06, + "loss": 1.2727644443511963, + "step": 4210 + }, + { + "epoch": 1.2964217006540977, + "grad_norm": 12.6875, + "learning_rate": 4.329277393544405e-06, + "loss": 1.2533864974975586, + "step": 4212 + }, + { + "epoch": 1.297037322046941, + "grad_norm": 8.4375, + "learning_rate": 4.325655274024487e-06, + "loss": 1.5394253730773926, + "step": 4214 + }, + { + "epoch": 1.2976529434397845, + "grad_norm": 4.03125, + "learning_rate": 4.322034818338454e-06, + "loss": 0.9984561204910278, + "step": 4216 + }, + { + "epoch": 1.298268564832628, + "grad_norm": 8.0625, + "learning_rate": 4.318416030084036e-06, + "loss": 0.9267721176147461, + "step": 4218 + }, + { + "epoch": 1.2988841862254714, + "grad_norm": 5.4375, + "learning_rate": 4.314798912857301e-06, + "loss": 0.7328625917434692, + "step": 4220 + }, + { + "epoch": 1.2994998076183149, + "grad_norm": 6.6875, + "learning_rate": 4.311183470252663e-06, + "loss": 1.1941163539886475, + "step": 4222 + }, + { + "epoch": 1.300115429011158, + "grad_norm": 10.75, + "learning_rate": 4.307569705862866e-06, + "loss": 1.5790549516677856, + "step": 4224 + }, + { + "epoch": 1.3007310504040015, + "grad_norm": 29.25, + "learning_rate": 4.303957623278989e-06, + "loss": 0.7785030603408813, + "step": 4226 + }, + { + "epoch": 1.301346671796845, + "grad_norm": 7.0, + "learning_rate": 4.300347226090443e-06, + "loss": 1.5293329954147339, + "step": 4228 + }, + { + "epoch": 1.3019622931896884, + "grad_norm": 10.75, + "learning_rate": 4.296738517884954e-06, + "loss": 1.3094733953475952, + "step": 4230 + }, + { + "epoch": 1.3025779145825318, + "grad_norm": 4.8125, + "learning_rate": 4.293131502248582e-06, + "loss": 1.3940355777740479, + "step": 4232 + }, + { + "epoch": 1.303193535975375, + "grad_norm": 5.90625, + "learning_rate": 4.289526182765697e-06, + "loss": 1.2179944515228271, + "step": 4234 + }, + { + "epoch": 1.3038091573682187, + "grad_norm": 4.1875, + "learning_rate": 4.285922563018983e-06, + "loss": 1.1753324270248413, + "step": 4236 + }, + { + "epoch": 1.3044247787610619, + "grad_norm": 25.625, + "learning_rate": 4.282320646589444e-06, + "loss": 1.2323254346847534, + "step": 4238 + }, + { + "epoch": 1.3050404001539053, + "grad_norm": 6.6875, + "learning_rate": 4.278720437056379e-06, + "loss": 1.1362353563308716, + "step": 4240 + }, + { + "epoch": 1.3056560215467488, + "grad_norm": 8.8125, + "learning_rate": 4.2751219379974035e-06, + "loss": 1.307444453239441, + "step": 4242 + }, + { + "epoch": 1.3062716429395922, + "grad_norm": 2.5625, + "learning_rate": 4.271525152988419e-06, + "loss": 1.4350255727767944, + "step": 4244 + }, + { + "epoch": 1.3068872643324356, + "grad_norm": 22.5, + "learning_rate": 4.267930085603638e-06, + "loss": 1.1926822662353516, + "step": 4246 + }, + { + "epoch": 1.3075028857252788, + "grad_norm": 14.5, + "learning_rate": 4.264336739415555e-06, + "loss": 0.6354160308837891, + "step": 4248 + }, + { + "epoch": 1.3081185071181223, + "grad_norm": 13.6875, + "learning_rate": 4.260745117994959e-06, + "loss": 1.6849381923675537, + "step": 4250 + }, + { + "epoch": 1.3087341285109657, + "grad_norm": 5.5, + "learning_rate": 4.257155224910929e-06, + "loss": 1.223006010055542, + "step": 4252 + }, + { + "epoch": 1.3093497499038091, + "grad_norm": 13.5, + "learning_rate": 4.253567063730818e-06, + "loss": 1.2167497873306274, + "step": 4254 + }, + { + "epoch": 1.3099653712966526, + "grad_norm": 5.4375, + "learning_rate": 4.249980638020264e-06, + "loss": 1.4918882846832275, + "step": 4256 + }, + { + "epoch": 1.310580992689496, + "grad_norm": 4.53125, + "learning_rate": 4.246395951343178e-06, + "loss": 1.132615566253662, + "step": 4258 + }, + { + "epoch": 1.3111966140823395, + "grad_norm": 4.75, + "learning_rate": 4.242813007261742e-06, + "loss": 1.1479458808898926, + "step": 4260 + }, + { + "epoch": 1.3118122354751827, + "grad_norm": 8.5, + "learning_rate": 4.2392318093364115e-06, + "loss": 1.1298502683639526, + "step": 4262 + }, + { + "epoch": 1.312427856868026, + "grad_norm": 6.5625, + "learning_rate": 4.235652361125899e-06, + "loss": 1.0727702379226685, + "step": 4264 + }, + { + "epoch": 1.3130434782608695, + "grad_norm": 9.375, + "learning_rate": 4.232074666187187e-06, + "loss": 1.3204834461212158, + "step": 4266 + }, + { + "epoch": 1.313659099653713, + "grad_norm": 10.0, + "learning_rate": 4.228498728075508e-06, + "loss": 1.491713523864746, + "step": 4268 + }, + { + "epoch": 1.3142747210465564, + "grad_norm": 3.359375, + "learning_rate": 4.224924550344352e-06, + "loss": 1.342588186264038, + "step": 4270 + }, + { + "epoch": 1.3148903424393998, + "grad_norm": 28.5, + "learning_rate": 4.221352136545462e-06, + "loss": 0.6640535593032837, + "step": 4272 + }, + { + "epoch": 1.3155059638322433, + "grad_norm": 5.96875, + "learning_rate": 4.217781490228821e-06, + "loss": 0.9824773073196411, + "step": 4274 + }, + { + "epoch": 1.3161215852250865, + "grad_norm": 5.25, + "learning_rate": 4.214212614942664e-06, + "loss": 0.8653430938720703, + "step": 4276 + }, + { + "epoch": 1.31673720661793, + "grad_norm": 6.0, + "learning_rate": 4.210645514233463e-06, + "loss": 0.9413928389549255, + "step": 4278 + }, + { + "epoch": 1.3173528280107734, + "grad_norm": 19.125, + "learning_rate": 4.207080191645923e-06, + "loss": 1.6054041385650635, + "step": 4280 + }, + { + "epoch": 1.3179684494036168, + "grad_norm": 15.875, + "learning_rate": 4.203516650722987e-06, + "loss": 1.221308946609497, + "step": 4282 + }, + { + "epoch": 1.3185840707964602, + "grad_norm": 8.4375, + "learning_rate": 4.199954895005824e-06, + "loss": 1.490601658821106, + "step": 4284 + }, + { + "epoch": 1.3191996921893034, + "grad_norm": 5.40625, + "learning_rate": 4.196394928033831e-06, + "loss": 1.0584721565246582, + "step": 4286 + }, + { + "epoch": 1.319815313582147, + "grad_norm": 20.875, + "learning_rate": 4.192836753344629e-06, + "loss": 1.4903297424316406, + "step": 4288 + }, + { + "epoch": 1.3204309349749903, + "grad_norm": 8.5625, + "learning_rate": 4.189280374474052e-06, + "loss": 1.3928847312927246, + "step": 4290 + }, + { + "epoch": 1.3210465563678337, + "grad_norm": 11.875, + "learning_rate": 4.185725794956157e-06, + "loss": 1.7273478507995605, + "step": 4292 + }, + { + "epoch": 1.3216621777606772, + "grad_norm": 4.53125, + "learning_rate": 4.182173018323209e-06, + "loss": 0.7521393895149231, + "step": 4294 + }, + { + "epoch": 1.3222777991535206, + "grad_norm": 6.0625, + "learning_rate": 4.17862204810568e-06, + "loss": 1.2451914548873901, + "step": 4296 + }, + { + "epoch": 1.322893420546364, + "grad_norm": 5.25, + "learning_rate": 4.175072887832248e-06, + "loss": 1.2428488731384277, + "step": 4298 + }, + { + "epoch": 1.3235090419392073, + "grad_norm": 6.875, + "learning_rate": 4.171525541029797e-06, + "loss": 0.8714505434036255, + "step": 4300 + }, + { + "epoch": 1.324124663332051, + "grad_norm": 6.53125, + "learning_rate": 4.167980011223402e-06, + "loss": 1.1614984273910522, + "step": 4302 + }, + { + "epoch": 1.3247402847248941, + "grad_norm": 5.25, + "learning_rate": 4.164436301936334e-06, + "loss": 0.8371998071670532, + "step": 4304 + }, + { + "epoch": 1.3253559061177376, + "grad_norm": 4.9375, + "learning_rate": 4.160894416690062e-06, + "loss": 1.2088501453399658, + "step": 4306 + }, + { + "epoch": 1.325971527510581, + "grad_norm": 7.71875, + "learning_rate": 4.15735435900423e-06, + "loss": 1.1465224027633667, + "step": 4308 + }, + { + "epoch": 1.3265871489034244, + "grad_norm": 6.625, + "learning_rate": 4.153816132396678e-06, + "loss": 1.4936845302581787, + "step": 4310 + }, + { + "epoch": 1.3272027702962679, + "grad_norm": 10.9375, + "learning_rate": 4.1502797403834184e-06, + "loss": 1.4611656665802002, + "step": 4312 + }, + { + "epoch": 1.327818391689111, + "grad_norm": 6.0, + "learning_rate": 4.146745186478642e-06, + "loss": 1.3357412815093994, + "step": 4314 + }, + { + "epoch": 1.3284340130819545, + "grad_norm": 4.5625, + "learning_rate": 4.143212474194717e-06, + "loss": 0.46920517086982727, + "step": 4316 + }, + { + "epoch": 1.329049634474798, + "grad_norm": 3.03125, + "learning_rate": 4.139681607042178e-06, + "loss": 1.1236724853515625, + "step": 4318 + }, + { + "epoch": 1.3296652558676414, + "grad_norm": 9.0, + "learning_rate": 4.136152588529729e-06, + "loss": 1.4975790977478027, + "step": 4320 + }, + { + "epoch": 1.3302808772604848, + "grad_norm": 11.625, + "learning_rate": 4.132625422164229e-06, + "loss": 1.5210164785385132, + "step": 4322 + }, + { + "epoch": 1.3308964986533283, + "grad_norm": 7.34375, + "learning_rate": 4.129100111450709e-06, + "loss": 1.5824909210205078, + "step": 4324 + }, + { + "epoch": 1.3315121200461717, + "grad_norm": 4.75, + "learning_rate": 4.125576659892344e-06, + "loss": 1.4773908853530884, + "step": 4326 + }, + { + "epoch": 1.332127741439015, + "grad_norm": 81.5, + "learning_rate": 4.1220550709904676e-06, + "loss": 1.4020800590515137, + "step": 4328 + }, + { + "epoch": 1.3327433628318583, + "grad_norm": 4.8125, + "learning_rate": 4.118535348244566e-06, + "loss": 1.1659955978393555, + "step": 4330 + }, + { + "epoch": 1.3333589842247018, + "grad_norm": 3.75, + "learning_rate": 4.115017495152262e-06, + "loss": 1.2958264350891113, + "step": 4332 + }, + { + "epoch": 1.3339746056175452, + "grad_norm": 4.59375, + "learning_rate": 4.1115015152093264e-06, + "loss": 1.2718833684921265, + "step": 4334 + }, + { + "epoch": 1.3345902270103887, + "grad_norm": 10.5625, + "learning_rate": 4.107987411909667e-06, + "loss": 1.120107889175415, + "step": 4336 + }, + { + "epoch": 1.335205848403232, + "grad_norm": 12.125, + "learning_rate": 4.104475188745327e-06, + "loss": 0.9704511761665344, + "step": 4338 + }, + { + "epoch": 1.3358214697960755, + "grad_norm": 14.5, + "learning_rate": 4.100964849206484e-06, + "loss": 1.0948693752288818, + "step": 4340 + }, + { + "epoch": 1.3364370911889187, + "grad_norm": 2.1875, + "learning_rate": 4.097456396781437e-06, + "loss": 0.613154411315918, + "step": 4342 + }, + { + "epoch": 1.3370527125817622, + "grad_norm": 14.75, + "learning_rate": 4.0939498349566145e-06, + "loss": 0.6655212640762329, + "step": 4344 + }, + { + "epoch": 1.3376683339746056, + "grad_norm": 5.125, + "learning_rate": 4.09044516721657e-06, + "loss": 1.4865241050720215, + "step": 4346 + }, + { + "epoch": 1.338283955367449, + "grad_norm": 4.40625, + "learning_rate": 4.0869423970439646e-06, + "loss": 1.161873459815979, + "step": 4348 + }, + { + "epoch": 1.3388995767602925, + "grad_norm": 9.8125, + "learning_rate": 4.083441527919582e-06, + "loss": 1.1888126134872437, + "step": 4350 + }, + { + "epoch": 1.3395151981531357, + "grad_norm": 8.8125, + "learning_rate": 4.079942563322315e-06, + "loss": 1.6624459028244019, + "step": 4352 + }, + { + "epoch": 1.3401308195459793, + "grad_norm": 3.671875, + "learning_rate": 4.0764455067291625e-06, + "loss": 0.6965327262878418, + "step": 4354 + }, + { + "epoch": 1.3407464409388226, + "grad_norm": 8.3125, + "learning_rate": 4.0729503616152284e-06, + "loss": 1.100224256515503, + "step": 4356 + }, + { + "epoch": 1.341362062331666, + "grad_norm": 4.9375, + "learning_rate": 4.069457131453716e-06, + "loss": 1.1976124048233032, + "step": 4358 + }, + { + "epoch": 1.3419776837245094, + "grad_norm": 5.03125, + "learning_rate": 4.065965819715928e-06, + "loss": 1.2772959470748901, + "step": 4360 + }, + { + "epoch": 1.3425933051173529, + "grad_norm": 4.15625, + "learning_rate": 4.062476429871255e-06, + "loss": 1.228773832321167, + "step": 4362 + }, + { + "epoch": 1.3432089265101963, + "grad_norm": 9.625, + "learning_rate": 4.058988965387187e-06, + "loss": 1.5692869424819946, + "step": 4364 + }, + { + "epoch": 1.3438245479030395, + "grad_norm": 4.71875, + "learning_rate": 4.055503429729294e-06, + "loss": 1.2732508182525635, + "step": 4366 + }, + { + "epoch": 1.3444401692958832, + "grad_norm": 55.75, + "learning_rate": 4.052019826361227e-06, + "loss": 1.653706669807434, + "step": 4368 + }, + { + "epoch": 1.3450557906887264, + "grad_norm": 2.5625, + "learning_rate": 4.04853815874473e-06, + "loss": 1.0349485874176025, + "step": 4370 + }, + { + "epoch": 1.3456714120815698, + "grad_norm": 4.53125, + "learning_rate": 4.045058430339603e-06, + "loss": 1.0696982145309448, + "step": 4372 + }, + { + "epoch": 1.3462870334744133, + "grad_norm": 4.125, + "learning_rate": 4.041580644603737e-06, + "loss": 1.13498854637146, + "step": 4374 + }, + { + "epoch": 1.3469026548672567, + "grad_norm": 5.5, + "learning_rate": 4.038104804993084e-06, + "loss": 1.483115315437317, + "step": 4376 + }, + { + "epoch": 1.3475182762601001, + "grad_norm": 7.75, + "learning_rate": 4.034630914961664e-06, + "loss": 1.5009520053863525, + "step": 4378 + }, + { + "epoch": 1.3481338976529433, + "grad_norm": 5.9375, + "learning_rate": 4.0311589779615605e-06, + "loss": 1.265244960784912, + "step": 4380 + }, + { + "epoch": 1.3487495190457868, + "grad_norm": 13.5, + "learning_rate": 4.027688997442911e-06, + "loss": 1.444014072418213, + "step": 4382 + }, + { + "epoch": 1.3493651404386302, + "grad_norm": 9.3125, + "learning_rate": 4.0242209768539195e-06, + "loss": 1.5586994886398315, + "step": 4384 + }, + { + "epoch": 1.3499807618314736, + "grad_norm": 6.21875, + "learning_rate": 4.020754919640829e-06, + "loss": 1.4298980236053467, + "step": 4386 + }, + { + "epoch": 1.350596383224317, + "grad_norm": 5.6875, + "learning_rate": 4.017290829247942e-06, + "loss": 1.0092744827270508, + "step": 4388 + }, + { + "epoch": 1.3512120046171605, + "grad_norm": 13.125, + "learning_rate": 4.013828709117602e-06, + "loss": 1.1938982009887695, + "step": 4390 + }, + { + "epoch": 1.351827626010004, + "grad_norm": 5.84375, + "learning_rate": 4.010368562690195e-06, + "loss": 1.0090004205703735, + "step": 4392 + }, + { + "epoch": 1.3524432474028472, + "grad_norm": 8.75, + "learning_rate": 4.006910393404148e-06, + "loss": 1.2579269409179688, + "step": 4394 + }, + { + "epoch": 1.3530588687956906, + "grad_norm": 10.5625, + "learning_rate": 4.003454204695919e-06, + "loss": 1.4525524377822876, + "step": 4396 + }, + { + "epoch": 1.353674490188534, + "grad_norm": 5.03125, + "learning_rate": 4.0000000000000015e-06, + "loss": 1.2327172756195068, + "step": 4398 + }, + { + "epoch": 1.3542901115813775, + "grad_norm": 6.875, + "learning_rate": 3.996547782748915e-06, + "loss": 1.3894731998443604, + "step": 4400 + }, + { + "epoch": 1.354905732974221, + "grad_norm": 3.453125, + "learning_rate": 3.993097556373205e-06, + "loss": 1.2647353410720825, + "step": 4402 + }, + { + "epoch": 1.3555213543670643, + "grad_norm": 16.375, + "learning_rate": 3.989649324301441e-06, + "loss": 1.43026864528656, + "step": 4404 + }, + { + "epoch": 1.3561369757599078, + "grad_norm": 6.625, + "learning_rate": 3.986203089960206e-06, + "loss": 1.2500108480453491, + "step": 4406 + }, + { + "epoch": 1.356752597152751, + "grad_norm": 5.15625, + "learning_rate": 3.982758856774103e-06, + "loss": 1.0625942945480347, + "step": 4408 + }, + { + "epoch": 1.3573682185455944, + "grad_norm": 12.0625, + "learning_rate": 3.979316628165741e-06, + "loss": 1.4211559295654297, + "step": 4410 + }, + { + "epoch": 1.3579838399384379, + "grad_norm": 8.625, + "learning_rate": 3.975876407555742e-06, + "loss": 1.446379542350769, + "step": 4412 + }, + { + "epoch": 1.3585994613312813, + "grad_norm": 9.6875, + "learning_rate": 3.9724381983627285e-06, + "loss": 1.3842189311981201, + "step": 4414 + }, + { + "epoch": 1.3592150827241247, + "grad_norm": 6.78125, + "learning_rate": 3.969002004003326e-06, + "loss": 0.9346789717674255, + "step": 4416 + }, + { + "epoch": 1.359830704116968, + "grad_norm": 3.0, + "learning_rate": 3.965567827892159e-06, + "loss": 1.2848070859909058, + "step": 4418 + }, + { + "epoch": 1.3604463255098116, + "grad_norm": 12.75, + "learning_rate": 3.962135673441846e-06, + "loss": 1.651962161064148, + "step": 4420 + }, + { + "epoch": 1.3610619469026548, + "grad_norm": 7.84375, + "learning_rate": 3.958705544062994e-06, + "loss": 1.3421986103057861, + "step": 4422 + }, + { + "epoch": 1.3616775682954982, + "grad_norm": 5.59375, + "learning_rate": 3.9552774431642e-06, + "loss": 1.4327430725097656, + "step": 4424 + }, + { + "epoch": 1.3622931896883417, + "grad_norm": 4.65625, + "learning_rate": 3.951851374152045e-06, + "loss": 1.3108030557632446, + "step": 4426 + }, + { + "epoch": 1.3629088110811851, + "grad_norm": 3.390625, + "learning_rate": 3.9484273404310905e-06, + "loss": 1.1684361696243286, + "step": 4428 + }, + { + "epoch": 1.3635244324740285, + "grad_norm": 5.03125, + "learning_rate": 3.9450053454038735e-06, + "loss": 1.3234820365905762, + "step": 4430 + }, + { + "epoch": 1.3641400538668718, + "grad_norm": 5.84375, + "learning_rate": 3.941585392470912e-06, + "loss": 1.429876446723938, + "step": 4432 + }, + { + "epoch": 1.3647556752597152, + "grad_norm": 5.8125, + "learning_rate": 3.938167485030687e-06, + "loss": 1.1931172609329224, + "step": 4434 + }, + { + "epoch": 1.3653712966525586, + "grad_norm": 10.5625, + "learning_rate": 3.934751626479649e-06, + "loss": 1.3414031267166138, + "step": 4436 + }, + { + "epoch": 1.365986918045402, + "grad_norm": 8.125, + "learning_rate": 3.931337820212215e-06, + "loss": 1.2865818738937378, + "step": 4438 + }, + { + "epoch": 1.3666025394382455, + "grad_norm": 5.1875, + "learning_rate": 3.927926069620758e-06, + "loss": 1.122363567352295, + "step": 4440 + }, + { + "epoch": 1.367218160831089, + "grad_norm": 4.1875, + "learning_rate": 3.924516378095613e-06, + "loss": 1.2529959678649902, + "step": 4442 + }, + { + "epoch": 1.3678337822239324, + "grad_norm": 6.75, + "learning_rate": 3.921108749025069e-06, + "loss": 1.335496425628662, + "step": 4444 + }, + { + "epoch": 1.3684494036167756, + "grad_norm": 9.5, + "learning_rate": 3.917703185795359e-06, + "loss": 1.4739428758621216, + "step": 4446 + }, + { + "epoch": 1.369065025009619, + "grad_norm": 15.0625, + "learning_rate": 3.914299691790672e-06, + "loss": 1.5854828357696533, + "step": 4448 + }, + { + "epoch": 1.3696806464024625, + "grad_norm": 8.875, + "learning_rate": 3.910898270393131e-06, + "loss": 1.4780482053756714, + "step": 4450 + }, + { + "epoch": 1.370296267795306, + "grad_norm": 14.5625, + "learning_rate": 3.907498924982809e-06, + "loss": 1.1311975717544556, + "step": 4452 + }, + { + "epoch": 1.3709118891881493, + "grad_norm": 5.46875, + "learning_rate": 3.9041016589377115e-06, + "loss": 1.292945146560669, + "step": 4454 + }, + { + "epoch": 1.3715275105809928, + "grad_norm": 16.875, + "learning_rate": 3.900706475633774e-06, + "loss": 1.7364095449447632, + "step": 4456 + }, + { + "epoch": 1.3721431319738362, + "grad_norm": 4.46875, + "learning_rate": 3.897313378444871e-06, + "loss": 1.2401243448257446, + "step": 4458 + }, + { + "epoch": 1.3727587533666794, + "grad_norm": 12.8125, + "learning_rate": 3.893922370742797e-06, + "loss": 1.4392495155334473, + "step": 4460 + }, + { + "epoch": 1.3733743747595228, + "grad_norm": 5.46875, + "learning_rate": 3.890533455897274e-06, + "loss": 0.848065197467804, + "step": 4462 + }, + { + "epoch": 1.3739899961523663, + "grad_norm": 19.75, + "learning_rate": 3.887146637275939e-06, + "loss": 1.5929031372070312, + "step": 4464 + }, + { + "epoch": 1.3746056175452097, + "grad_norm": 31.25, + "learning_rate": 3.883761918244354e-06, + "loss": 0.9019598960876465, + "step": 4466 + }, + { + "epoch": 1.3752212389380531, + "grad_norm": 7.28125, + "learning_rate": 3.880379302165987e-06, + "loss": 1.2336246967315674, + "step": 4468 + }, + { + "epoch": 1.3758368603308964, + "grad_norm": 10.0, + "learning_rate": 3.87699879240222e-06, + "loss": 1.1775295734405518, + "step": 4470 + }, + { + "epoch": 1.37645248172374, + "grad_norm": 8.1875, + "learning_rate": 3.8736203923123425e-06, + "loss": 1.3640468120574951, + "step": 4472 + }, + { + "epoch": 1.3770681031165832, + "grad_norm": 4.65625, + "learning_rate": 3.870244105253546e-06, + "loss": 1.1681087017059326, + "step": 4474 + }, + { + "epoch": 1.3776837245094267, + "grad_norm": 8.625, + "learning_rate": 3.866869934580922e-06, + "loss": 1.2777169942855835, + "step": 4476 + }, + { + "epoch": 1.37829934590227, + "grad_norm": 7.75, + "learning_rate": 3.8634978836474605e-06, + "loss": 1.1488378047943115, + "step": 4478 + }, + { + "epoch": 1.3789149672951135, + "grad_norm": 10.375, + "learning_rate": 3.860127955804042e-06, + "loss": 1.2367651462554932, + "step": 4480 + }, + { + "epoch": 1.379530588687957, + "grad_norm": 8.1875, + "learning_rate": 3.856760154399442e-06, + "loss": 1.4411249160766602, + "step": 4482 + }, + { + "epoch": 1.3801462100808002, + "grad_norm": 11.25, + "learning_rate": 3.853394482780318e-06, + "loss": 1.2920490503311157, + "step": 4484 + }, + { + "epoch": 1.3807618314736438, + "grad_norm": 13.4375, + "learning_rate": 3.850030944291215e-06, + "loss": 1.6774686574935913, + "step": 4486 + }, + { + "epoch": 1.381377452866487, + "grad_norm": 5.65625, + "learning_rate": 3.846669542274559e-06, + "loss": 1.4606332778930664, + "step": 4488 + }, + { + "epoch": 1.3819930742593305, + "grad_norm": 6.5625, + "learning_rate": 3.843310280070643e-06, + "loss": 1.5854536294937134, + "step": 4490 + }, + { + "epoch": 1.382608695652174, + "grad_norm": 8.1875, + "learning_rate": 3.839953161017647e-06, + "loss": 1.4916932582855225, + "step": 4492 + }, + { + "epoch": 1.3832243170450174, + "grad_norm": 10.1875, + "learning_rate": 3.836598188451615e-06, + "loss": 1.4063359498977661, + "step": 4494 + }, + { + "epoch": 1.3838399384378608, + "grad_norm": 4.46875, + "learning_rate": 3.833245365706457e-06, + "loss": 1.3658607006072998, + "step": 4496 + }, + { + "epoch": 1.384455559830704, + "grad_norm": 9.1875, + "learning_rate": 3.829894696113949e-06, + "loss": 1.2148422002792358, + "step": 4498 + }, + { + "epoch": 1.3850711812235474, + "grad_norm": 12.625, + "learning_rate": 3.826546183003726e-06, + "loss": 1.2281498908996582, + "step": 4500 + }, + { + "epoch": 1.3856868026163909, + "grad_norm": 5.1875, + "learning_rate": 3.8231998297032815e-06, + "loss": 0.9534695744514465, + "step": 4502 + }, + { + "epoch": 1.3863024240092343, + "grad_norm": 11.125, + "learning_rate": 3.819855639537959e-06, + "loss": 1.0877275466918945, + "step": 4504 + }, + { + "epoch": 1.3869180454020777, + "grad_norm": 10.75, + "learning_rate": 3.816513615830959e-06, + "loss": 1.4760278463363647, + "step": 4506 + }, + { + "epoch": 1.3875336667949212, + "grad_norm": 7.0625, + "learning_rate": 3.813173761903324e-06, + "loss": 1.136398434638977, + "step": 4508 + }, + { + "epoch": 1.3881492881877646, + "grad_norm": 21.375, + "learning_rate": 3.8098360810739386e-06, + "loss": 1.4807792901992798, + "step": 4510 + }, + { + "epoch": 1.3887649095806078, + "grad_norm": 9.75, + "learning_rate": 3.8065005766595366e-06, + "loss": 1.4393525123596191, + "step": 4512 + }, + { + "epoch": 1.3893805309734513, + "grad_norm": 7.40625, + "learning_rate": 3.8031672519746797e-06, + "loss": 1.3941676616668701, + "step": 4514 + }, + { + "epoch": 1.3899961523662947, + "grad_norm": 3.515625, + "learning_rate": 3.7998361103317688e-06, + "loss": 1.0671101808547974, + "step": 4516 + }, + { + "epoch": 1.3906117737591381, + "grad_norm": 5.15625, + "learning_rate": 3.796507155041032e-06, + "loss": 1.0656688213348389, + "step": 4518 + }, + { + "epoch": 1.3912273951519816, + "grad_norm": 11.0, + "learning_rate": 3.7931803894105296e-06, + "loss": 1.498962163925171, + "step": 4520 + }, + { + "epoch": 1.391843016544825, + "grad_norm": 23.375, + "learning_rate": 3.7898558167461426e-06, + "loss": 1.4071496725082397, + "step": 4522 + }, + { + "epoch": 1.3924586379376684, + "grad_norm": 5.875, + "learning_rate": 3.7865334403515706e-06, + "loss": 1.5745182037353516, + "step": 4524 + }, + { + "epoch": 1.3930742593305117, + "grad_norm": 6.65625, + "learning_rate": 3.7832132635283385e-06, + "loss": 1.134566307067871, + "step": 4526 + }, + { + "epoch": 1.393689880723355, + "grad_norm": 32.5, + "learning_rate": 3.779895289575775e-06, + "loss": 1.036484718322754, + "step": 4528 + }, + { + "epoch": 1.3943055021161985, + "grad_norm": 4.34375, + "learning_rate": 3.7765795217910294e-06, + "loss": 1.0628092288970947, + "step": 4530 + }, + { + "epoch": 1.394921123509042, + "grad_norm": 5.40625, + "learning_rate": 3.7732659634690528e-06, + "loss": 0.8714097738265991, + "step": 4532 + }, + { + "epoch": 1.3955367449018854, + "grad_norm": 11.1875, + "learning_rate": 3.7699546179026003e-06, + "loss": 0.9510615468025208, + "step": 4534 + }, + { + "epoch": 1.3961523662947286, + "grad_norm": 6.46875, + "learning_rate": 3.7666454883822345e-06, + "loss": 1.2836421728134155, + "step": 4536 + }, + { + "epoch": 1.3967679876875723, + "grad_norm": 6.28125, + "learning_rate": 3.763338578196307e-06, + "loss": 1.164069652557373, + "step": 4538 + }, + { + "epoch": 1.3973836090804155, + "grad_norm": 24.125, + "learning_rate": 3.7600338906309747e-06, + "loss": 1.5945087671279907, + "step": 4540 + }, + { + "epoch": 1.397999230473259, + "grad_norm": 7.71875, + "learning_rate": 3.7567314289701746e-06, + "loss": 1.5566492080688477, + "step": 4542 + }, + { + "epoch": 1.3986148518661023, + "grad_norm": 6.28125, + "learning_rate": 3.753431196495636e-06, + "loss": 1.5426421165466309, + "step": 4544 + }, + { + "epoch": 1.3992304732589458, + "grad_norm": 6.25, + "learning_rate": 3.750133196486878e-06, + "loss": 1.209553837776184, + "step": 4546 + }, + { + "epoch": 1.3998460946517892, + "grad_norm": 5.84375, + "learning_rate": 3.7468374322211943e-06, + "loss": 1.249995470046997, + "step": 4548 + }, + { + "epoch": 1.4004617160446324, + "grad_norm": 16.0, + "learning_rate": 3.743543906973661e-06, + "loss": 1.1695905923843384, + "step": 4550 + }, + { + "epoch": 1.401077337437476, + "grad_norm": 8.0625, + "learning_rate": 3.740252624017129e-06, + "loss": 1.2239853143692017, + "step": 4552 + }, + { + "epoch": 1.4016929588303193, + "grad_norm": 5.65625, + "learning_rate": 3.7369635866222183e-06, + "loss": 1.3880513906478882, + "step": 4554 + }, + { + "epoch": 1.4023085802231627, + "grad_norm": 4.96875, + "learning_rate": 3.733676798057319e-06, + "loss": 1.5688056945800781, + "step": 4556 + }, + { + "epoch": 1.4029242016160062, + "grad_norm": 6.71875, + "learning_rate": 3.7303922615885855e-06, + "loss": 1.2916425466537476, + "step": 4558 + }, + { + "epoch": 1.4035398230088496, + "grad_norm": 12.5625, + "learning_rate": 3.7271099804799387e-06, + "loss": 1.238887071609497, + "step": 4560 + }, + { + "epoch": 1.404155444401693, + "grad_norm": 8.0, + "learning_rate": 3.7238299579930525e-06, + "loss": 1.1047825813293457, + "step": 4562 + }, + { + "epoch": 1.4047710657945363, + "grad_norm": 10.4375, + "learning_rate": 3.720552197387358e-06, + "loss": 1.199622392654419, + "step": 4564 + }, + { + "epoch": 1.4053866871873797, + "grad_norm": 6.40625, + "learning_rate": 3.717276701920044e-06, + "loss": 0.9500876665115356, + "step": 4566 + }, + { + "epoch": 1.4060023085802231, + "grad_norm": 12.9375, + "learning_rate": 3.7140034748460373e-06, + "loss": 1.5336376428604126, + "step": 4568 + }, + { + "epoch": 1.4066179299730666, + "grad_norm": 8.4375, + "learning_rate": 3.7107325194180216e-06, + "loss": 1.1892744302749634, + "step": 4570 + }, + { + "epoch": 1.40723355136591, + "grad_norm": 17.375, + "learning_rate": 3.7074638388864157e-06, + "loss": 1.3420631885528564, + "step": 4572 + }, + { + "epoch": 1.4078491727587534, + "grad_norm": 9.5, + "learning_rate": 3.704197436499384e-06, + "loss": 1.0091776847839355, + "step": 4574 + }, + { + "epoch": 1.4084647941515969, + "grad_norm": 4.375, + "learning_rate": 3.7009333155028215e-06, + "loss": 1.2374558448791504, + "step": 4576 + }, + { + "epoch": 1.40908041554444, + "grad_norm": 4.28125, + "learning_rate": 3.697671479140359e-06, + "loss": 1.1223832368850708, + "step": 4578 + }, + { + "epoch": 1.4096960369372835, + "grad_norm": 4.15625, + "learning_rate": 3.694411930653356e-06, + "loss": 1.2537848949432373, + "step": 4580 + }, + { + "epoch": 1.410311658330127, + "grad_norm": 8.1875, + "learning_rate": 3.691154673280898e-06, + "loss": 1.3645119667053223, + "step": 4582 + }, + { + "epoch": 1.4109272797229704, + "grad_norm": 7.28125, + "learning_rate": 3.6878997102597967e-06, + "loss": 1.4384486675262451, + "step": 4584 + }, + { + "epoch": 1.4115429011158138, + "grad_norm": 14.0625, + "learning_rate": 3.6846470448245817e-06, + "loss": 1.1669893264770508, + "step": 4586 + }, + { + "epoch": 1.4121585225086573, + "grad_norm": 2.375, + "learning_rate": 3.6813966802074975e-06, + "loss": 1.2316890954971313, + "step": 4588 + }, + { + "epoch": 1.4127741439015007, + "grad_norm": 15.5625, + "learning_rate": 3.6781486196385085e-06, + "loss": 1.3015397787094116, + "step": 4590 + }, + { + "epoch": 1.413389765294344, + "grad_norm": 5.75, + "learning_rate": 3.674902866345279e-06, + "loss": 1.0278640985488892, + "step": 4592 + }, + { + "epoch": 1.4140053866871873, + "grad_norm": 4.875, + "learning_rate": 3.6716594235531915e-06, + "loss": 1.4170969724655151, + "step": 4594 + }, + { + "epoch": 1.4146210080800308, + "grad_norm": 5.625, + "learning_rate": 3.6684182944853274e-06, + "loss": 1.532051920890808, + "step": 4596 + }, + { + "epoch": 1.4152366294728742, + "grad_norm": 8.75, + "learning_rate": 3.6651794823624665e-06, + "loss": 1.4891656637191772, + "step": 4598 + }, + { + "epoch": 1.4158522508657176, + "grad_norm": 10.625, + "learning_rate": 3.661942990403092e-06, + "loss": 1.4055567979812622, + "step": 4600 + }, + { + "epoch": 1.4164678722585609, + "grad_norm": 8.0625, + "learning_rate": 3.658708821823376e-06, + "loss": 0.8534173965454102, + "step": 4602 + }, + { + "epoch": 1.4170834936514045, + "grad_norm": 4.15625, + "learning_rate": 3.655476979837189e-06, + "loss": 1.2280539274215698, + "step": 4604 + }, + { + "epoch": 1.4176991150442477, + "grad_norm": 8.0, + "learning_rate": 3.6522474676560786e-06, + "loss": 1.5194727182388306, + "step": 4606 + }, + { + "epoch": 1.4183147364370912, + "grad_norm": 5.8125, + "learning_rate": 3.649020288489288e-06, + "loss": 1.046549916267395, + "step": 4608 + }, + { + "epoch": 1.4189303578299346, + "grad_norm": 10.875, + "learning_rate": 3.645795445543736e-06, + "loss": 1.0877692699432373, + "step": 4610 + }, + { + "epoch": 1.419545979222778, + "grad_norm": 4.34375, + "learning_rate": 3.6425729420240193e-06, + "loss": 1.3161126375198364, + "step": 4612 + }, + { + "epoch": 1.4201616006156215, + "grad_norm": 5.46875, + "learning_rate": 3.6393527811324154e-06, + "loss": 0.7252700328826904, + "step": 4614 + }, + { + "epoch": 1.4207772220084647, + "grad_norm": 8.25, + "learning_rate": 3.6361349660688687e-06, + "loss": 1.6500828266143799, + "step": 4616 + }, + { + "epoch": 1.4213928434013081, + "grad_norm": 9.0, + "learning_rate": 3.632919500030994e-06, + "loss": 1.3527953624725342, + "step": 4618 + }, + { + "epoch": 1.4220084647941515, + "grad_norm": 5.125, + "learning_rate": 3.629706386214073e-06, + "loss": 1.0250791311264038, + "step": 4620 + }, + { + "epoch": 1.422624086186995, + "grad_norm": 17.375, + "learning_rate": 3.626495627811046e-06, + "loss": 1.1802740097045898, + "step": 4622 + }, + { + "epoch": 1.4232397075798384, + "grad_norm": 5.0625, + "learning_rate": 3.62328722801252e-06, + "loss": 1.4389358758926392, + "step": 4624 + }, + { + "epoch": 1.4238553289726819, + "grad_norm": 4.40625, + "learning_rate": 3.6200811900067488e-06, + "loss": 1.3220165967941284, + "step": 4626 + }, + { + "epoch": 1.4244709503655253, + "grad_norm": 10.5625, + "learning_rate": 3.616877516979649e-06, + "loss": 1.3158611059188843, + "step": 4628 + }, + { + "epoch": 1.4250865717583685, + "grad_norm": 15.0, + "learning_rate": 3.6136762121147805e-06, + "loss": 1.0538830757141113, + "step": 4630 + }, + { + "epoch": 1.425702193151212, + "grad_norm": 9.8125, + "learning_rate": 3.610477278593351e-06, + "loss": 1.0830562114715576, + "step": 4632 + }, + { + "epoch": 1.4263178145440554, + "grad_norm": 5.59375, + "learning_rate": 3.607280719594213e-06, + "loss": 1.409989833831787, + "step": 4634 + }, + { + "epoch": 1.4269334359368988, + "grad_norm": 3.84375, + "learning_rate": 3.6040865382938578e-06, + "loss": 1.2030140161514282, + "step": 4636 + }, + { + "epoch": 1.4275490573297422, + "grad_norm": 3.96875, + "learning_rate": 3.6008947378664164e-06, + "loss": 1.1751364469528198, + "step": 4638 + }, + { + "epoch": 1.4281646787225857, + "grad_norm": 6.21875, + "learning_rate": 3.597705321483653e-06, + "loss": 1.0235944986343384, + "step": 4640 + }, + { + "epoch": 1.4287803001154291, + "grad_norm": 12.375, + "learning_rate": 3.5945182923149602e-06, + "loss": 1.2032253742218018, + "step": 4642 + }, + { + "epoch": 1.4293959215082723, + "grad_norm": 3.15625, + "learning_rate": 3.5913336535273613e-06, + "loss": 0.9244052171707153, + "step": 4644 + }, + { + "epoch": 1.4300115429011158, + "grad_norm": 8.875, + "learning_rate": 3.5881514082855023e-06, + "loss": 1.4916614294052124, + "step": 4646 + }, + { + "epoch": 1.4306271642939592, + "grad_norm": 7.34375, + "learning_rate": 3.584971559751653e-06, + "loss": 1.3980257511138916, + "step": 4648 + }, + { + "epoch": 1.4312427856868026, + "grad_norm": 37.25, + "learning_rate": 3.5817941110857008e-06, + "loss": 1.134635090827942, + "step": 4650 + }, + { + "epoch": 1.431858407079646, + "grad_norm": 4.6875, + "learning_rate": 3.578619065445144e-06, + "loss": 1.467618465423584, + "step": 4652 + }, + { + "epoch": 1.4324740284724893, + "grad_norm": 5.90625, + "learning_rate": 3.5754464259851013e-06, + "loss": 1.207594871520996, + "step": 4654 + }, + { + "epoch": 1.433089649865333, + "grad_norm": 10.5625, + "learning_rate": 3.572276195858293e-06, + "loss": 1.0639207363128662, + "step": 4656 + }, + { + "epoch": 1.4337052712581762, + "grad_norm": 4.9375, + "learning_rate": 3.569108378215049e-06, + "loss": 1.3432626724243164, + "step": 4658 + }, + { + "epoch": 1.4343208926510196, + "grad_norm": 7.3125, + "learning_rate": 3.5659429762032977e-06, + "loss": 1.2566211223602295, + "step": 4660 + }, + { + "epoch": 1.434936514043863, + "grad_norm": 4.21875, + "learning_rate": 3.562779992968574e-06, + "loss": 1.310718297958374, + "step": 4662 + }, + { + "epoch": 1.4355521354367065, + "grad_norm": 13.875, + "learning_rate": 3.559619431654004e-06, + "loss": 1.1663079261779785, + "step": 4664 + }, + { + "epoch": 1.43616775682955, + "grad_norm": 4.9375, + "learning_rate": 3.5564612954003066e-06, + "loss": 1.4862658977508545, + "step": 4666 + }, + { + "epoch": 1.436783378222393, + "grad_norm": 24.75, + "learning_rate": 3.553305587345796e-06, + "loss": 1.6377918720245361, + "step": 4668 + }, + { + "epoch": 1.4373989996152368, + "grad_norm": 6.6875, + "learning_rate": 3.550152310626366e-06, + "loss": 1.3760124444961548, + "step": 4670 + }, + { + "epoch": 1.43801462100808, + "grad_norm": 3.359375, + "learning_rate": 3.547001468375501e-06, + "loss": 1.28596830368042, + "step": 4672 + }, + { + "epoch": 1.4386302424009234, + "grad_norm": 18.625, + "learning_rate": 3.543853063724263e-06, + "loss": 0.7132194638252258, + "step": 4674 + }, + { + "epoch": 1.4392458637937668, + "grad_norm": 10.0, + "learning_rate": 3.540707099801291e-06, + "loss": 1.4063074588775635, + "step": 4676 + }, + { + "epoch": 1.4398614851866103, + "grad_norm": 2.484375, + "learning_rate": 3.5375635797328024e-06, + "loss": 0.8440459966659546, + "step": 4678 + }, + { + "epoch": 1.4404771065794537, + "grad_norm": 4.5, + "learning_rate": 3.534422506642581e-06, + "loss": 1.209274172782898, + "step": 4680 + }, + { + "epoch": 1.441092727972297, + "grad_norm": 7.34375, + "learning_rate": 3.5312838836519846e-06, + "loss": 0.9432491660118103, + "step": 4682 + }, + { + "epoch": 1.4417083493651404, + "grad_norm": 16.0, + "learning_rate": 3.52814771387993e-06, + "loss": 1.304944634437561, + "step": 4684 + }, + { + "epoch": 1.4423239707579838, + "grad_norm": 5.09375, + "learning_rate": 3.5250140004429005e-06, + "loss": 1.2836129665374756, + "step": 4686 + }, + { + "epoch": 1.4429395921508272, + "grad_norm": 2.65625, + "learning_rate": 3.521882746454939e-06, + "loss": 1.123637080192566, + "step": 4688 + }, + { + "epoch": 1.4435552135436707, + "grad_norm": 9.375, + "learning_rate": 3.518753955027639e-06, + "loss": 1.6363736391067505, + "step": 4690 + }, + { + "epoch": 1.444170834936514, + "grad_norm": 9.1875, + "learning_rate": 3.5156276292701552e-06, + "loss": 1.3315503597259521, + "step": 4692 + }, + { + "epoch": 1.4447864563293575, + "grad_norm": 3.484375, + "learning_rate": 3.5125037722891846e-06, + "loss": 1.1604689359664917, + "step": 4694 + }, + { + "epoch": 1.4454020777222008, + "grad_norm": 3.125, + "learning_rate": 3.5093823871889756e-06, + "loss": 0.9855441451072693, + "step": 4696 + }, + { + "epoch": 1.4460176991150442, + "grad_norm": 5.40625, + "learning_rate": 3.5062634770713174e-06, + "loss": 1.2205106019973755, + "step": 4698 + }, + { + "epoch": 1.4466333205078876, + "grad_norm": 3.3125, + "learning_rate": 3.5031470450355396e-06, + "loss": 1.3333935737609863, + "step": 4700 + }, + { + "epoch": 1.447248941900731, + "grad_norm": 4.03125, + "learning_rate": 3.5000330941785142e-06, + "loss": 1.1714081764221191, + "step": 4702 + }, + { + "epoch": 1.4478645632935745, + "grad_norm": 6.25, + "learning_rate": 3.4969216275946414e-06, + "loss": 1.0127254724502563, + "step": 4704 + }, + { + "epoch": 1.448480184686418, + "grad_norm": 6.09375, + "learning_rate": 3.4938126483758544e-06, + "loss": 0.9646738767623901, + "step": 4706 + }, + { + "epoch": 1.4490958060792614, + "grad_norm": 5.28125, + "learning_rate": 3.49070615961162e-06, + "loss": 1.1320888996124268, + "step": 4708 + }, + { + "epoch": 1.4497114274721046, + "grad_norm": 10.5, + "learning_rate": 3.4876021643889203e-06, + "loss": 1.3203264474868774, + "step": 4710 + }, + { + "epoch": 1.450327048864948, + "grad_norm": 2.625, + "learning_rate": 3.484500665792268e-06, + "loss": 1.0886530876159668, + "step": 4712 + }, + { + "epoch": 1.4509426702577914, + "grad_norm": 28.75, + "learning_rate": 3.4814016669036903e-06, + "loss": 1.164709210395813, + "step": 4714 + }, + { + "epoch": 1.4515582916506349, + "grad_norm": 3.90625, + "learning_rate": 3.4783051708027337e-06, + "loss": 1.1819281578063965, + "step": 4716 + }, + { + "epoch": 1.4521739130434783, + "grad_norm": 9.0625, + "learning_rate": 3.4752111805664547e-06, + "loss": 1.2423570156097412, + "step": 4718 + }, + { + "epoch": 1.4527895344363215, + "grad_norm": 7.03125, + "learning_rate": 3.472119699269421e-06, + "loss": 1.3394665718078613, + "step": 4720 + }, + { + "epoch": 1.4534051558291652, + "grad_norm": 7.96875, + "learning_rate": 3.4690307299837065e-06, + "loss": 1.3577899932861328, + "step": 4722 + }, + { + "epoch": 1.4540207772220084, + "grad_norm": 6.40625, + "learning_rate": 3.4659442757788886e-06, + "loss": 1.440277338027954, + "step": 4724 + }, + { + "epoch": 1.4546363986148518, + "grad_norm": 4.8125, + "learning_rate": 3.462860339722048e-06, + "loss": 1.4500181674957275, + "step": 4726 + }, + { + "epoch": 1.4552520200076953, + "grad_norm": 5.5625, + "learning_rate": 3.459778924877759e-06, + "loss": 1.429072618484497, + "step": 4728 + }, + { + "epoch": 1.4558676414005387, + "grad_norm": 6.125, + "learning_rate": 3.4567000343080936e-06, + "loss": 1.4825221300125122, + "step": 4730 + }, + { + "epoch": 1.4564832627933821, + "grad_norm": 5.65625, + "learning_rate": 3.4536236710726147e-06, + "loss": 1.3014346361160278, + "step": 4732 + }, + { + "epoch": 1.4570988841862254, + "grad_norm": 4.65625, + "learning_rate": 3.450549838228373e-06, + "loss": 1.0785235166549683, + "step": 4734 + }, + { + "epoch": 1.457714505579069, + "grad_norm": 5.03125, + "learning_rate": 3.4474785388299054e-06, + "loss": 1.0648795366287231, + "step": 4736 + }, + { + "epoch": 1.4583301269719122, + "grad_norm": 6.40625, + "learning_rate": 3.4444097759292294e-06, + "loss": 0.4583609700202942, + "step": 4738 + }, + { + "epoch": 1.4589457483647557, + "grad_norm": 4.1875, + "learning_rate": 3.4413435525758456e-06, + "loss": 0.9563672542572021, + "step": 4740 + }, + { + "epoch": 1.459561369757599, + "grad_norm": 17.5, + "learning_rate": 3.4382798718167283e-06, + "loss": 1.5865509510040283, + "step": 4742 + }, + { + "epoch": 1.4601769911504425, + "grad_norm": 4.53125, + "learning_rate": 3.435218736696324e-06, + "loss": 1.3797905445098877, + "step": 4744 + }, + { + "epoch": 1.460792612543286, + "grad_norm": 18.25, + "learning_rate": 3.432160150256556e-06, + "loss": 1.4692882299423218, + "step": 4746 + }, + { + "epoch": 1.4614082339361292, + "grad_norm": 6.0625, + "learning_rate": 3.429104115536803e-06, + "loss": 0.8994933366775513, + "step": 4748 + }, + { + "epoch": 1.4620238553289726, + "grad_norm": 21.25, + "learning_rate": 3.4260506355739214e-06, + "loss": 1.1846752166748047, + "step": 4750 + }, + { + "epoch": 1.462639476721816, + "grad_norm": 7.5, + "learning_rate": 3.422999713402221e-06, + "loss": 1.5342363119125366, + "step": 4752 + }, + { + "epoch": 1.4632550981146595, + "grad_norm": 17.125, + "learning_rate": 3.419951352053469e-06, + "loss": 1.5448050498962402, + "step": 4754 + }, + { + "epoch": 1.463870719507503, + "grad_norm": 5.0625, + "learning_rate": 3.416905554556893e-06, + "loss": 1.5295370817184448, + "step": 4756 + }, + { + "epoch": 1.4644863409003464, + "grad_norm": 7.71875, + "learning_rate": 3.4138623239391705e-06, + "loss": 1.2686611413955688, + "step": 4758 + }, + { + "epoch": 1.4651019622931898, + "grad_norm": 8.375, + "learning_rate": 3.4108216632244272e-06, + "loss": 1.2924885749816895, + "step": 4760 + }, + { + "epoch": 1.465717583686033, + "grad_norm": 6.0625, + "learning_rate": 3.4077835754342357e-06, + "loss": 1.1660832166671753, + "step": 4762 + }, + { + "epoch": 1.4663332050788764, + "grad_norm": 4.71875, + "learning_rate": 3.4047480635876106e-06, + "loss": 0.7886714339256287, + "step": 4764 + }, + { + "epoch": 1.4669488264717199, + "grad_norm": 6.125, + "learning_rate": 3.40171513070101e-06, + "loss": 0.928934633731842, + "step": 4766 + }, + { + "epoch": 1.4675644478645633, + "grad_norm": 4.09375, + "learning_rate": 3.3986847797883265e-06, + "loss": 1.2280499935150146, + "step": 4768 + }, + { + "epoch": 1.4681800692574067, + "grad_norm": 12.625, + "learning_rate": 3.395657013860889e-06, + "loss": 1.5266685485839844, + "step": 4770 + }, + { + "epoch": 1.4687956906502502, + "grad_norm": 8.4375, + "learning_rate": 3.392631835927455e-06, + "loss": 1.3424854278564453, + "step": 4772 + }, + { + "epoch": 1.4694113120430936, + "grad_norm": 5.5, + "learning_rate": 3.3896092489942123e-06, + "loss": 1.4064112901687622, + "step": 4774 + }, + { + "epoch": 1.4700269334359368, + "grad_norm": 11.625, + "learning_rate": 3.386589256064773e-06, + "loss": 1.328787922859192, + "step": 4776 + }, + { + "epoch": 1.4706425548287803, + "grad_norm": 4.6875, + "learning_rate": 3.3835718601401696e-06, + "loss": 1.0625183582305908, + "step": 4778 + }, + { + "epoch": 1.4712581762216237, + "grad_norm": 6.78125, + "learning_rate": 3.38055706421886e-06, + "loss": 0.9996272921562195, + "step": 4780 + }, + { + "epoch": 1.4718737976144671, + "grad_norm": 6.625, + "learning_rate": 3.3775448712967128e-06, + "loss": 1.4594124555587769, + "step": 4782 + }, + { + "epoch": 1.4724894190073106, + "grad_norm": 13.75, + "learning_rate": 3.374535284367011e-06, + "loss": 1.557099461555481, + "step": 4784 + }, + { + "epoch": 1.4731050404001538, + "grad_norm": 8.6875, + "learning_rate": 3.371528306420451e-06, + "loss": 1.6163135766983032, + "step": 4786 + }, + { + "epoch": 1.4737206617929974, + "grad_norm": 9.375, + "learning_rate": 3.3685239404451286e-06, + "loss": 1.7769767045974731, + "step": 4788 + }, + { + "epoch": 1.4743362831858406, + "grad_norm": 3.15625, + "learning_rate": 3.365522189426556e-06, + "loss": 1.1799372434616089, + "step": 4790 + }, + { + "epoch": 1.474951904578684, + "grad_norm": 6.71875, + "learning_rate": 3.3625230563476356e-06, + "loss": 1.2087534666061401, + "step": 4792 + }, + { + "epoch": 1.4755675259715275, + "grad_norm": 7.59375, + "learning_rate": 3.359526544188677e-06, + "loss": 1.1725058555603027, + "step": 4794 + }, + { + "epoch": 1.476183147364371, + "grad_norm": 3.46875, + "learning_rate": 3.3565326559273803e-06, + "loss": 1.0747044086456299, + "step": 4796 + }, + { + "epoch": 1.4767987687572144, + "grad_norm": 5.46875, + "learning_rate": 3.3535413945388385e-06, + "loss": 1.2907452583312988, + "step": 4798 + }, + { + "epoch": 1.4774143901500576, + "grad_norm": 96.5, + "learning_rate": 3.3505527629955357e-06, + "loss": 0.9903527498245239, + "step": 4800 + }, + { + "epoch": 1.478030011542901, + "grad_norm": 4.9375, + "learning_rate": 3.34756676426734e-06, + "loss": 1.253095030784607, + "step": 4802 + }, + { + "epoch": 1.4786456329357445, + "grad_norm": 5.625, + "learning_rate": 3.3445834013215095e-06, + "loss": 1.3294130563735962, + "step": 4804 + }, + { + "epoch": 1.479261254328588, + "grad_norm": 2.84375, + "learning_rate": 3.3416026771226756e-06, + "loss": 1.3141107559204102, + "step": 4806 + }, + { + "epoch": 1.4798768757214313, + "grad_norm": 8.0625, + "learning_rate": 3.338624594632851e-06, + "loss": 0.8612028956413269, + "step": 4808 + }, + { + "epoch": 1.4804924971142748, + "grad_norm": 8.375, + "learning_rate": 3.335649156811425e-06, + "loss": 1.5777348279953003, + "step": 4810 + }, + { + "epoch": 1.4811081185071182, + "grad_norm": 3.515625, + "learning_rate": 3.332676366615154e-06, + "loss": 1.1648386716842651, + "step": 4812 + }, + { + "epoch": 1.4817237398999614, + "grad_norm": 8.5625, + "learning_rate": 3.329706226998169e-06, + "loss": 1.3452742099761963, + "step": 4814 + }, + { + "epoch": 1.4823393612928049, + "grad_norm": 6.03125, + "learning_rate": 3.3267387409119633e-06, + "loss": 1.1355098485946655, + "step": 4816 + }, + { + "epoch": 1.4829549826856483, + "grad_norm": 8.9375, + "learning_rate": 3.3237739113053924e-06, + "loss": 1.2803765535354614, + "step": 4818 + }, + { + "epoch": 1.4835706040784917, + "grad_norm": 7.46875, + "learning_rate": 3.3208117411246766e-06, + "loss": 1.2495794296264648, + "step": 4820 + }, + { + "epoch": 1.4841862254713352, + "grad_norm": 5.0625, + "learning_rate": 3.317852233313389e-06, + "loss": 1.2289323806762695, + "step": 4822 + }, + { + "epoch": 1.4848018468641786, + "grad_norm": 3.890625, + "learning_rate": 3.3148953908124624e-06, + "loss": 1.246183156967163, + "step": 4824 + }, + { + "epoch": 1.485417468257022, + "grad_norm": 6.34375, + "learning_rate": 3.3119412165601717e-06, + "loss": 1.047895073890686, + "step": 4826 + }, + { + "epoch": 1.4860330896498652, + "grad_norm": 7.1875, + "learning_rate": 3.308989713492151e-06, + "loss": 1.32063627243042, + "step": 4828 + }, + { + "epoch": 1.4866487110427087, + "grad_norm": 3.796875, + "learning_rate": 3.3060408845413733e-06, + "loss": 1.0678014755249023, + "step": 4830 + }, + { + "epoch": 1.4872643324355521, + "grad_norm": 5.15625, + "learning_rate": 3.3030947326381548e-06, + "loss": 1.2184605598449707, + "step": 4832 + }, + { + "epoch": 1.4878799538283956, + "grad_norm": 12.625, + "learning_rate": 3.300151260710155e-06, + "loss": 1.4712916612625122, + "step": 4834 + }, + { + "epoch": 1.488495575221239, + "grad_norm": 14.5, + "learning_rate": 3.2972104716823663e-06, + "loss": 1.0407633781433105, + "step": 4836 + }, + { + "epoch": 1.4891111966140824, + "grad_norm": 5.4375, + "learning_rate": 3.2942723684771172e-06, + "loss": 1.1654162406921387, + "step": 4838 + }, + { + "epoch": 1.4897268180069259, + "grad_norm": 8.3125, + "learning_rate": 3.2913369540140673e-06, + "loss": 1.262948989868164, + "step": 4840 + }, + { + "epoch": 1.490342439399769, + "grad_norm": 7.34375, + "learning_rate": 3.2884042312102017e-06, + "loss": 1.097400188446045, + "step": 4842 + }, + { + "epoch": 1.4909580607926125, + "grad_norm": 3.0, + "learning_rate": 3.285474202979835e-06, + "loss": 0.5754028558731079, + "step": 4844 + }, + { + "epoch": 1.491573682185456, + "grad_norm": 4.28125, + "learning_rate": 3.2825468722346e-06, + "loss": 0.969556450843811, + "step": 4846 + }, + { + "epoch": 1.4921893035782994, + "grad_norm": 8.1875, + "learning_rate": 3.2796222418834533e-06, + "loss": 1.434523105621338, + "step": 4848 + }, + { + "epoch": 1.4928049249711428, + "grad_norm": 9.3125, + "learning_rate": 3.276700314832666e-06, + "loss": 1.8160494565963745, + "step": 4850 + }, + { + "epoch": 1.493420546363986, + "grad_norm": 17.5, + "learning_rate": 3.2737810939858183e-06, + "loss": 1.635191559791565, + "step": 4852 + }, + { + "epoch": 1.4940361677568297, + "grad_norm": 5.4375, + "learning_rate": 3.270864582243809e-06, + "loss": 1.1667835712432861, + "step": 4854 + }, + { + "epoch": 1.494651789149673, + "grad_norm": 9.1875, + "learning_rate": 3.267950782504839e-06, + "loss": 1.4574627876281738, + "step": 4856 + }, + { + "epoch": 1.4952674105425163, + "grad_norm": 5.78125, + "learning_rate": 3.265039697664419e-06, + "loss": 1.0753768682479858, + "step": 4858 + }, + { + "epoch": 1.4958830319353598, + "grad_norm": 5.3125, + "learning_rate": 3.262131330615358e-06, + "loss": 1.4011366367340088, + "step": 4860 + }, + { + "epoch": 1.4964986533282032, + "grad_norm": 1.7421875, + "learning_rate": 3.2592256842477644e-06, + "loss": 1.22893226146698, + "step": 4862 + }, + { + "epoch": 1.4971142747210466, + "grad_norm": 7.375, + "learning_rate": 3.2563227614490456e-06, + "loss": 1.0237904787063599, + "step": 4864 + }, + { + "epoch": 1.4977298961138898, + "grad_norm": 5.78125, + "learning_rate": 3.2534225651038997e-06, + "loss": 1.1823903322219849, + "step": 4866 + }, + { + "epoch": 1.4983455175067333, + "grad_norm": 9.5625, + "learning_rate": 3.2505250980943182e-06, + "loss": 1.0303336381912231, + "step": 4868 + }, + { + "epoch": 1.4989611388995767, + "grad_norm": 5.0, + "learning_rate": 3.2476303632995792e-06, + "loss": 1.4868566989898682, + "step": 4870 + }, + { + "epoch": 1.4995767602924202, + "grad_norm": 8.25, + "learning_rate": 3.244738363596244e-06, + "loss": 0.9418045282363892, + "step": 4872 + }, + { + "epoch": 1.5001923816852636, + "grad_norm": 5.84375, + "learning_rate": 3.241849101858159e-06, + "loss": 1.1360383033752441, + "step": 4874 + }, + { + "epoch": 1.5008080030781068, + "grad_norm": 33.0, + "learning_rate": 3.238962580956447e-06, + "loss": 1.2348039150238037, + "step": 4876 + }, + { + "epoch": 1.5014236244709505, + "grad_norm": 7.0625, + "learning_rate": 3.2360788037595104e-06, + "loss": 1.745518445968628, + "step": 4878 + }, + { + "epoch": 1.5020392458637937, + "grad_norm": 1.8515625, + "learning_rate": 3.23319777313302e-06, + "loss": 1.1881951093673706, + "step": 4880 + }, + { + "epoch": 1.5026548672566373, + "grad_norm": 10.625, + "learning_rate": 3.2303194919399244e-06, + "loss": 0.7165201902389526, + "step": 4882 + }, + { + "epoch": 1.5032704886494805, + "grad_norm": 6.40625, + "learning_rate": 3.227443963040434e-06, + "loss": 1.5470805168151855, + "step": 4884 + }, + { + "epoch": 1.503886110042324, + "grad_norm": 5.375, + "learning_rate": 3.2245711892920256e-06, + "loss": 0.9742276668548584, + "step": 4886 + }, + { + "epoch": 1.5045017314351674, + "grad_norm": 5.65625, + "learning_rate": 3.221701173549443e-06, + "loss": 1.2734863758087158, + "step": 4888 + }, + { + "epoch": 1.5051173528280106, + "grad_norm": 12.75, + "learning_rate": 3.218833918664679e-06, + "loss": 1.0193525552749634, + "step": 4890 + }, + { + "epoch": 1.5057329742208543, + "grad_norm": 16.875, + "learning_rate": 3.2159694274869935e-06, + "loss": 1.5467280149459839, + "step": 4892 + }, + { + "epoch": 1.5063485956136975, + "grad_norm": 4.9375, + "learning_rate": 3.2131077028628945e-06, + "loss": 1.0171399116516113, + "step": 4894 + }, + { + "epoch": 1.506964217006541, + "grad_norm": 4.71875, + "learning_rate": 3.21024874763614e-06, + "loss": 0.7832403182983398, + "step": 4896 + }, + { + "epoch": 1.5075798383993844, + "grad_norm": 2.921875, + "learning_rate": 3.2073925646477406e-06, + "loss": 0.8320021629333496, + "step": 4898 + }, + { + "epoch": 1.5081954597922278, + "grad_norm": 4.28125, + "learning_rate": 3.2045391567359473e-06, + "loss": 1.2064433097839355, + "step": 4900 + }, + { + "epoch": 1.5088110811850712, + "grad_norm": 9.9375, + "learning_rate": 3.2016885267362595e-06, + "loss": 1.3419796228408813, + "step": 4902 + }, + { + "epoch": 1.5094267025779144, + "grad_norm": 3.234375, + "learning_rate": 3.198840677481407e-06, + "loss": 1.22185480594635, + "step": 4904 + }, + { + "epoch": 1.510042323970758, + "grad_norm": 6.8125, + "learning_rate": 3.1959956118013637e-06, + "loss": 1.1904466152191162, + "step": 4906 + }, + { + "epoch": 1.5106579453636013, + "grad_norm": 4.75, + "learning_rate": 3.1931533325233354e-06, + "loss": 1.4791064262390137, + "step": 4908 + }, + { + "epoch": 1.5112735667564448, + "grad_norm": 31.5, + "learning_rate": 3.1903138424717573e-06, + "loss": 0.6052420139312744, + "step": 4910 + }, + { + "epoch": 1.5118891881492882, + "grad_norm": 10.375, + "learning_rate": 3.1874771444682962e-06, + "loss": 0.9311878085136414, + "step": 4912 + }, + { + "epoch": 1.5125048095421316, + "grad_norm": 12.25, + "learning_rate": 3.1846432413318425e-06, + "loss": 1.0832551717758179, + "step": 4914 + }, + { + "epoch": 1.513120430934975, + "grad_norm": 4.4375, + "learning_rate": 3.181812135878508e-06, + "loss": 1.0968680381774902, + "step": 4916 + }, + { + "epoch": 1.5137360523278183, + "grad_norm": 1.828125, + "learning_rate": 3.178983830921626e-06, + "loss": 1.0732853412628174, + "step": 4918 + }, + { + "epoch": 1.514351673720662, + "grad_norm": 3.9375, + "learning_rate": 3.1761583292717456e-06, + "loss": 1.0559816360473633, + "step": 4920 + }, + { + "epoch": 1.5149672951135051, + "grad_norm": 4.1875, + "learning_rate": 3.1733356337366334e-06, + "loss": 1.3119699954986572, + "step": 4922 + }, + { + "epoch": 1.5155829165063486, + "grad_norm": 7.40625, + "learning_rate": 3.1705157471212634e-06, + "loss": 1.2674381732940674, + "step": 4924 + }, + { + "epoch": 1.516198537899192, + "grad_norm": 9.6875, + "learning_rate": 3.16769867222782e-06, + "loss": 1.6064198017120361, + "step": 4926 + }, + { + "epoch": 1.5168141592920354, + "grad_norm": 8.6875, + "learning_rate": 3.164884411855697e-06, + "loss": 1.4921178817749023, + "step": 4928 + }, + { + "epoch": 1.5174297806848789, + "grad_norm": 13.0625, + "learning_rate": 3.162072968801483e-06, + "loss": 1.308781623840332, + "step": 4930 + }, + { + "epoch": 1.518045402077722, + "grad_norm": 4.65625, + "learning_rate": 3.159264345858975e-06, + "loss": 1.440758228302002, + "step": 4932 + }, + { + "epoch": 1.5186610234705658, + "grad_norm": 8.8125, + "learning_rate": 3.156458545819163e-06, + "loss": 1.3631476163864136, + "step": 4934 + }, + { + "epoch": 1.519276644863409, + "grad_norm": 6.09375, + "learning_rate": 3.153655571470236e-06, + "loss": 1.188265085220337, + "step": 4936 + }, + { + "epoch": 1.5198922662562524, + "grad_norm": 5.625, + "learning_rate": 3.1508554255975705e-06, + "loss": 1.2975990772247314, + "step": 4938 + }, + { + "epoch": 1.5205078876490958, + "grad_norm": 5.03125, + "learning_rate": 3.148058110983735e-06, + "loss": 1.2463330030441284, + "step": 4940 + }, + { + "epoch": 1.521123509041939, + "grad_norm": 7.71875, + "learning_rate": 3.1452636304084827e-06, + "loss": 1.2518240213394165, + "step": 4942 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 23.125, + "learning_rate": 3.142471986648751e-06, + "loss": 1.3494226932525635, + "step": 4944 + }, + { + "epoch": 1.522354751827626, + "grad_norm": 6.15625, + "learning_rate": 3.1396831824786612e-06, + "loss": 1.3387173414230347, + "step": 4946 + }, + { + "epoch": 1.5229703732204696, + "grad_norm": 3.96875, + "learning_rate": 3.1368972206695097e-06, + "loss": 1.245314359664917, + "step": 4948 + }, + { + "epoch": 1.5235859946133128, + "grad_norm": 6.46875, + "learning_rate": 3.134114103989767e-06, + "loss": 1.087040901184082, + "step": 4950 + }, + { + "epoch": 1.5242016160061562, + "grad_norm": 9.625, + "learning_rate": 3.131333835205082e-06, + "loss": 1.4781947135925293, + "step": 4952 + }, + { + "epoch": 1.5248172373989997, + "grad_norm": 4.15625, + "learning_rate": 3.128556417078269e-06, + "loss": 1.0316975116729736, + "step": 4954 + }, + { + "epoch": 1.5254328587918429, + "grad_norm": 5.84375, + "learning_rate": 3.1257818523693094e-06, + "loss": 1.1803417205810547, + "step": 4956 + }, + { + "epoch": 1.5260484801846865, + "grad_norm": 3.515625, + "learning_rate": 3.1230101438353516e-06, + "loss": 1.2382619380950928, + "step": 4958 + }, + { + "epoch": 1.5266641015775297, + "grad_norm": 11.6875, + "learning_rate": 3.120241294230702e-06, + "loss": 1.5910042524337769, + "step": 4960 + }, + { + "epoch": 1.5272797229703732, + "grad_norm": 4.9375, + "learning_rate": 3.1174753063068324e-06, + "loss": 0.9350588321685791, + "step": 4962 + }, + { + "epoch": 1.5278953443632166, + "grad_norm": 11.9375, + "learning_rate": 3.114712182812364e-06, + "loss": 1.262253999710083, + "step": 4964 + }, + { + "epoch": 1.52851096575606, + "grad_norm": 11.875, + "learning_rate": 3.1119519264930777e-06, + "loss": 1.2304760217666626, + "step": 4966 + }, + { + "epoch": 1.5291265871489035, + "grad_norm": 7.125, + "learning_rate": 3.109194540091898e-06, + "loss": 1.1309937238693237, + "step": 4968 + }, + { + "epoch": 1.5297422085417467, + "grad_norm": 14.5, + "learning_rate": 3.106440026348904e-06, + "loss": 0.9601719975471497, + "step": 4970 + }, + { + "epoch": 1.5303578299345904, + "grad_norm": 6.6875, + "learning_rate": 3.103688388001318e-06, + "loss": 1.187595248222351, + "step": 4972 + }, + { + "epoch": 1.5309734513274336, + "grad_norm": 1.5703125, + "learning_rate": 3.100939627783503e-06, + "loss": 0.9698360562324524, + "step": 4974 + }, + { + "epoch": 1.531589072720277, + "grad_norm": 7.4375, + "learning_rate": 3.098193748426965e-06, + "loss": 1.3194845914840698, + "step": 4976 + }, + { + "epoch": 1.5322046941131204, + "grad_norm": 8.5625, + "learning_rate": 3.095450752660347e-06, + "loss": 1.1873408555984497, + "step": 4978 + }, + { + "epoch": 1.5328203155059639, + "grad_norm": 11.875, + "learning_rate": 3.0927106432094228e-06, + "loss": 1.3280301094055176, + "step": 4980 + }, + { + "epoch": 1.5334359368988073, + "grad_norm": 9.8125, + "learning_rate": 3.0899734227971025e-06, + "loss": 1.564595103263855, + "step": 4982 + }, + { + "epoch": 1.5340515582916505, + "grad_norm": 5.6875, + "learning_rate": 3.087239094143421e-06, + "loss": 1.1651568412780762, + "step": 4984 + }, + { + "epoch": 1.5346671796844942, + "grad_norm": 10.125, + "learning_rate": 3.084507659965545e-06, + "loss": 1.458910346031189, + "step": 4986 + }, + { + "epoch": 1.5352828010773374, + "grad_norm": 9.375, + "learning_rate": 3.0817791229777595e-06, + "loss": 1.3746843338012695, + "step": 4988 + }, + { + "epoch": 1.5358984224701808, + "grad_norm": 5.96875, + "learning_rate": 3.0790534858914742e-06, + "loss": 1.043514609336853, + "step": 4990 + }, + { + "epoch": 1.5365140438630243, + "grad_norm": 2.46875, + "learning_rate": 3.0763307514152163e-06, + "loss": 1.1240226030349731, + "step": 4992 + }, + { + "epoch": 1.5371296652558677, + "grad_norm": 4.40625, + "learning_rate": 3.0736109222546267e-06, + "loss": 1.2111371755599976, + "step": 4994 + }, + { + "epoch": 1.5377452866487111, + "grad_norm": 9.1875, + "learning_rate": 3.0708940011124613e-06, + "loss": 1.4809215068817139, + "step": 4996 + }, + { + "epoch": 1.5383609080415543, + "grad_norm": 4.125, + "learning_rate": 3.0681799906885846e-06, + "loss": 1.032042145729065, + "step": 4998 + }, + { + "epoch": 1.538976529434398, + "grad_norm": 3.171875, + "learning_rate": 3.0654688936799704e-06, + "loss": 1.322123646736145, + "step": 5000 + }, + { + "epoch": 1.5395921508272412, + "grad_norm": 9.9375, + "learning_rate": 3.062760712780697e-06, + "loss": 1.3321819305419922, + "step": 5002 + }, + { + "epoch": 1.5402077722200846, + "grad_norm": 4.375, + "learning_rate": 3.060055450681943e-06, + "loss": 0.8392131924629211, + "step": 5004 + }, + { + "epoch": 1.540823393612928, + "grad_norm": 5.46875, + "learning_rate": 3.0573531100719915e-06, + "loss": 1.1302207708358765, + "step": 5006 + }, + { + "epoch": 1.5414390150057713, + "grad_norm": 1.875, + "learning_rate": 3.054653693636214e-06, + "loss": 1.157600998878479, + "step": 5008 + }, + { + "epoch": 1.542054636398615, + "grad_norm": 5.21875, + "learning_rate": 3.051957204057084e-06, + "loss": 1.2203869819641113, + "step": 5010 + }, + { + "epoch": 1.5426702577914582, + "grad_norm": 33.0, + "learning_rate": 3.0492636440141637e-06, + "loss": 1.2869846820831299, + "step": 5012 + }, + { + "epoch": 1.5432858791843016, + "grad_norm": 3.640625, + "learning_rate": 3.0465730161841023e-06, + "loss": 1.4599010944366455, + "step": 5014 + }, + { + "epoch": 1.543901500577145, + "grad_norm": 5.75, + "learning_rate": 3.0438853232406395e-06, + "loss": 1.4534536600112915, + "step": 5016 + }, + { + "epoch": 1.5445171219699885, + "grad_norm": 14.9375, + "learning_rate": 3.0412005678545947e-06, + "loss": 1.5622611045837402, + "step": 5018 + }, + { + "epoch": 1.545132743362832, + "grad_norm": 4.8125, + "learning_rate": 3.038518752693869e-06, + "loss": 1.2805486917495728, + "step": 5020 + }, + { + "epoch": 1.5457483647556751, + "grad_norm": 5.21875, + "learning_rate": 3.035839880423443e-06, + "loss": 1.184311866760254, + "step": 5022 + }, + { + "epoch": 1.5463639861485188, + "grad_norm": 6.15625, + "learning_rate": 3.033163953705372e-06, + "loss": 1.2769352197647095, + "step": 5024 + }, + { + "epoch": 1.546979607541362, + "grad_norm": 7.1875, + "learning_rate": 3.0304909751987842e-06, + "loss": 1.059838056564331, + "step": 5026 + }, + { + "epoch": 1.5475952289342054, + "grad_norm": 9.625, + "learning_rate": 3.027820947559878e-06, + "loss": 1.6449565887451172, + "step": 5028 + }, + { + "epoch": 1.5482108503270489, + "grad_norm": 6.5, + "learning_rate": 3.0251538734419205e-06, + "loss": 1.2422211170196533, + "step": 5030 + }, + { + "epoch": 1.5488264717198923, + "grad_norm": 6.625, + "learning_rate": 3.0224897554952433e-06, + "loss": 1.6606228351593018, + "step": 5032 + }, + { + "epoch": 1.5494420931127357, + "grad_norm": 4.3125, + "learning_rate": 3.0198285963672386e-06, + "loss": 1.3177484273910522, + "step": 5034 + }, + { + "epoch": 1.550057714505579, + "grad_norm": 9.25, + "learning_rate": 3.0171703987023615e-06, + "loss": 1.2722313404083252, + "step": 5036 + }, + { + "epoch": 1.5506733358984226, + "grad_norm": 9.125, + "learning_rate": 3.0145151651421202e-06, + "loss": 1.1697152853012085, + "step": 5038 + }, + { + "epoch": 1.5512889572912658, + "grad_norm": 6.6875, + "learning_rate": 3.0118628983250826e-06, + "loss": 1.511368751525879, + "step": 5040 + }, + { + "epoch": 1.5519045786841092, + "grad_norm": 8.5, + "learning_rate": 3.0092136008868635e-06, + "loss": 1.0446054935455322, + "step": 5042 + }, + { + "epoch": 1.5525202000769527, + "grad_norm": 7.84375, + "learning_rate": 3.0065672754601326e-06, + "loss": 1.1255905628204346, + "step": 5044 + }, + { + "epoch": 1.5531358214697961, + "grad_norm": 12.3125, + "learning_rate": 3.003923924674598e-06, + "loss": 1.307901382446289, + "step": 5046 + }, + { + "epoch": 1.5537514428626396, + "grad_norm": 9.0625, + "learning_rate": 3.0012835511570193e-06, + "loss": 1.0257079601287842, + "step": 5048 + }, + { + "epoch": 1.5543670642554828, + "grad_norm": 4.96875, + "learning_rate": 2.9986461575311955e-06, + "loss": 1.1753787994384766, + "step": 5050 + }, + { + "epoch": 1.5549826856483264, + "grad_norm": 14.4375, + "learning_rate": 2.9960117464179615e-06, + "loss": 0.9505599737167358, + "step": 5052 + }, + { + "epoch": 1.5555983070411696, + "grad_norm": 5.9375, + "learning_rate": 2.993380320435193e-06, + "loss": 1.4326746463775635, + "step": 5054 + }, + { + "epoch": 1.556213928434013, + "grad_norm": 8.625, + "learning_rate": 2.990751882197796e-06, + "loss": 1.1694228649139404, + "step": 5056 + }, + { + "epoch": 1.5568295498268565, + "grad_norm": 6.375, + "learning_rate": 2.9881264343177087e-06, + "loss": 1.2966336011886597, + "step": 5058 + }, + { + "epoch": 1.5574451712196997, + "grad_norm": 13.75, + "learning_rate": 2.9855039794038964e-06, + "loss": 1.4063342809677124, + "step": 5060 + }, + { + "epoch": 1.5580607926125434, + "grad_norm": 21.75, + "learning_rate": 2.982884520062352e-06, + "loss": 1.6873064041137695, + "step": 5062 + }, + { + "epoch": 1.5586764140053866, + "grad_norm": 18.25, + "learning_rate": 2.980268058896092e-06, + "loss": 1.5693508386611938, + "step": 5064 + }, + { + "epoch": 1.5592920353982302, + "grad_norm": 51.25, + "learning_rate": 2.9776545985051515e-06, + "loss": 1.1830573081970215, + "step": 5066 + }, + { + "epoch": 1.5599076567910735, + "grad_norm": 7.0625, + "learning_rate": 2.975044141486584e-06, + "loss": 1.215011477470398, + "step": 5068 + }, + { + "epoch": 1.560523278183917, + "grad_norm": 3.84375, + "learning_rate": 2.972436690434462e-06, + "loss": 1.1971906423568726, + "step": 5070 + }, + { + "epoch": 1.5611388995767603, + "grad_norm": 10.6875, + "learning_rate": 2.969832247939864e-06, + "loss": 1.50948965549469, + "step": 5072 + }, + { + "epoch": 1.5617545209696035, + "grad_norm": 4.5625, + "learning_rate": 2.9672308165908857e-06, + "loss": 1.0662416219711304, + "step": 5074 + }, + { + "epoch": 1.5623701423624472, + "grad_norm": 3.46875, + "learning_rate": 2.9646323989726267e-06, + "loss": 1.0881036520004272, + "step": 5076 + }, + { + "epoch": 1.5629857637552904, + "grad_norm": 16.5, + "learning_rate": 2.962036997667193e-06, + "loss": 1.2012338638305664, + "step": 5078 + }, + { + "epoch": 1.5636013851481338, + "grad_norm": 9.625, + "learning_rate": 2.959444615253694e-06, + "loss": 1.7494795322418213, + "step": 5080 + }, + { + "epoch": 1.5642170065409773, + "grad_norm": 19.125, + "learning_rate": 2.9568552543082375e-06, + "loss": 1.1780784130096436, + "step": 5082 + }, + { + "epoch": 1.5648326279338207, + "grad_norm": 4.53125, + "learning_rate": 2.954268917403929e-06, + "loss": 1.244818925857544, + "step": 5084 + }, + { + "epoch": 1.5654482493266642, + "grad_norm": 5.09375, + "learning_rate": 2.951685607110869e-06, + "loss": 1.466505765914917, + "step": 5086 + }, + { + "epoch": 1.5660638707195074, + "grad_norm": 3.78125, + "learning_rate": 2.949105325996153e-06, + "loss": 1.434507131576538, + "step": 5088 + }, + { + "epoch": 1.566679492112351, + "grad_norm": 7.6875, + "learning_rate": 2.9465280766238625e-06, + "loss": 1.105286955833435, + "step": 5090 + }, + { + "epoch": 1.5672951135051942, + "grad_norm": 5.90625, + "learning_rate": 2.9439538615550674e-06, + "loss": 1.214867353439331, + "step": 5092 + }, + { + "epoch": 1.5679107348980377, + "grad_norm": 4.125, + "learning_rate": 2.9413826833478246e-06, + "loss": 0.799168586730957, + "step": 5094 + }, + { + "epoch": 1.568526356290881, + "grad_norm": 35.5, + "learning_rate": 2.9388145445571715e-06, + "loss": 1.2401793003082275, + "step": 5096 + }, + { + "epoch": 1.5691419776837245, + "grad_norm": 5.8125, + "learning_rate": 2.9362494477351245e-06, + "loss": 1.4166289567947388, + "step": 5098 + }, + { + "epoch": 1.569757599076568, + "grad_norm": 4.8125, + "learning_rate": 2.9336873954306765e-06, + "loss": 1.2858507633209229, + "step": 5100 + }, + { + "epoch": 1.5703732204694112, + "grad_norm": 5.28125, + "learning_rate": 2.9311283901897985e-06, + "loss": 0.9255216121673584, + "step": 5102 + }, + { + "epoch": 1.5709888418622548, + "grad_norm": 6.40625, + "learning_rate": 2.928572434555431e-06, + "loss": 1.2053285837173462, + "step": 5104 + }, + { + "epoch": 1.571604463255098, + "grad_norm": 5.5, + "learning_rate": 2.926019531067482e-06, + "loss": 1.1776148080825806, + "step": 5106 + }, + { + "epoch": 1.5722200846479415, + "grad_norm": 3.953125, + "learning_rate": 2.9234696822628334e-06, + "loss": 1.183230996131897, + "step": 5108 + }, + { + "epoch": 1.572835706040785, + "grad_norm": 7.1875, + "learning_rate": 2.9209228906753225e-06, + "loss": 1.1474194526672363, + "step": 5110 + }, + { + "epoch": 1.5734513274336284, + "grad_norm": 20.25, + "learning_rate": 2.918379158835756e-06, + "loss": 1.014983057975769, + "step": 5112 + }, + { + "epoch": 1.5740669488264718, + "grad_norm": 4.59375, + "learning_rate": 2.9158384892718966e-06, + "loss": 1.2621625661849976, + "step": 5114 + }, + { + "epoch": 1.574682570219315, + "grad_norm": 4.34375, + "learning_rate": 2.9133008845084632e-06, + "loss": 1.2142401933670044, + "step": 5116 + }, + { + "epoch": 1.5752981916121587, + "grad_norm": 3.421875, + "learning_rate": 2.9107663470671334e-06, + "loss": 1.1952993869781494, + "step": 5118 + }, + { + "epoch": 1.5759138130050019, + "grad_norm": 19.25, + "learning_rate": 2.9082348794665317e-06, + "loss": 1.0763602256774902, + "step": 5120 + }, + { + "epoch": 1.5765294343978453, + "grad_norm": 5.125, + "learning_rate": 2.905706484222235e-06, + "loss": 0.9360074400901794, + "step": 5122 + }, + { + "epoch": 1.5771450557906888, + "grad_norm": 4.40625, + "learning_rate": 2.903181163846766e-06, + "loss": 1.23157799243927, + "step": 5124 + }, + { + "epoch": 1.577760677183532, + "grad_norm": 5.03125, + "learning_rate": 2.9006589208495907e-06, + "loss": 0.7338969707489014, + "step": 5126 + }, + { + "epoch": 1.5783762985763756, + "grad_norm": 29.625, + "learning_rate": 2.898139757737122e-06, + "loss": 1.6849936246871948, + "step": 5128 + }, + { + "epoch": 1.5789919199692188, + "grad_norm": 11.5, + "learning_rate": 2.895623677012705e-06, + "loss": 1.4091507196426392, + "step": 5130 + }, + { + "epoch": 1.5796075413620625, + "grad_norm": 10.4375, + "learning_rate": 2.8931106811766292e-06, + "loss": 1.269464135169983, + "step": 5132 + }, + { + "epoch": 1.5802231627549057, + "grad_norm": 10.5, + "learning_rate": 2.890600772726113e-06, + "loss": 0.9996485710144043, + "step": 5134 + }, + { + "epoch": 1.5808387841477491, + "grad_norm": 8.625, + "learning_rate": 2.8880939541553075e-06, + "loss": 1.2088068723678589, + "step": 5136 + }, + { + "epoch": 1.5814544055405926, + "grad_norm": 13.5, + "learning_rate": 2.8855902279552966e-06, + "loss": 1.5215330123901367, + "step": 5138 + }, + { + "epoch": 1.5820700269334358, + "grad_norm": 18.0, + "learning_rate": 2.883089596614087e-06, + "loss": 1.3437632322311401, + "step": 5140 + }, + { + "epoch": 1.5826856483262794, + "grad_norm": 10.8125, + "learning_rate": 2.8805920626166144e-06, + "loss": 1.1248289346694946, + "step": 5142 + }, + { + "epoch": 1.5833012697191227, + "grad_norm": 11.5625, + "learning_rate": 2.8780976284447337e-06, + "loss": 1.271138310432434, + "step": 5144 + }, + { + "epoch": 1.583916891111966, + "grad_norm": 8.6875, + "learning_rate": 2.875606296577218e-06, + "loss": 1.2558194398880005, + "step": 5146 + }, + { + "epoch": 1.5845325125048095, + "grad_norm": 8.875, + "learning_rate": 2.873118069489764e-06, + "loss": 1.2829164266586304, + "step": 5148 + }, + { + "epoch": 1.585148133897653, + "grad_norm": 12.125, + "learning_rate": 2.8706329496549734e-06, + "loss": 1.261286735534668, + "step": 5150 + }, + { + "epoch": 1.5857637552904964, + "grad_norm": 13.5, + "learning_rate": 2.8681509395423695e-06, + "loss": 1.7474175691604614, + "step": 5152 + }, + { + "epoch": 1.5863793766833396, + "grad_norm": 9.25, + "learning_rate": 2.8656720416183786e-06, + "loss": 1.4634180068969727, + "step": 5154 + }, + { + "epoch": 1.5869949980761833, + "grad_norm": 3.375, + "learning_rate": 2.8631962583463396e-06, + "loss": 1.2069108486175537, + "step": 5156 + }, + { + "epoch": 1.5876106194690265, + "grad_norm": 3.125, + "learning_rate": 2.8607235921864934e-06, + "loss": 1.0664864778518677, + "step": 5158 + }, + { + "epoch": 1.58822624086187, + "grad_norm": 3.921875, + "learning_rate": 2.8582540455959824e-06, + "loss": 1.1400189399719238, + "step": 5160 + }, + { + "epoch": 1.5888418622547134, + "grad_norm": 12.25, + "learning_rate": 2.8557876210288513e-06, + "loss": 1.2755979299545288, + "step": 5162 + }, + { + "epoch": 1.5894574836475568, + "grad_norm": 8.3125, + "learning_rate": 2.85332432093604e-06, + "loss": 1.5039992332458496, + "step": 5164 + }, + { + "epoch": 1.5900731050404002, + "grad_norm": 2.859375, + "learning_rate": 2.850864147765388e-06, + "loss": 1.158522367477417, + "step": 5166 + }, + { + "epoch": 1.5906887264332434, + "grad_norm": 6.5625, + "learning_rate": 2.8484071039616227e-06, + "loss": 1.0927605628967285, + "step": 5168 + }, + { + "epoch": 1.591304347826087, + "grad_norm": 7.5625, + "learning_rate": 2.8459531919663626e-06, + "loss": 1.0280814170837402, + "step": 5170 + }, + { + "epoch": 1.5919199692189303, + "grad_norm": 4.21875, + "learning_rate": 2.8435024142181174e-06, + "loss": 1.1364541053771973, + "step": 5172 + }, + { + "epoch": 1.5925355906117737, + "grad_norm": 4.71875, + "learning_rate": 2.8410547731522787e-06, + "loss": 1.2341628074645996, + "step": 5174 + }, + { + "epoch": 1.5931512120046172, + "grad_norm": 13.875, + "learning_rate": 2.8386102712011215e-06, + "loss": 1.1459120512008667, + "step": 5176 + }, + { + "epoch": 1.5937668333974606, + "grad_norm": 6.3125, + "learning_rate": 2.836168910793804e-06, + "loss": 1.302492380142212, + "step": 5178 + }, + { + "epoch": 1.594382454790304, + "grad_norm": 5.84375, + "learning_rate": 2.833730694356358e-06, + "loss": 1.0967020988464355, + "step": 5180 + }, + { + "epoch": 1.5949980761831473, + "grad_norm": 5.46875, + "learning_rate": 2.831295624311697e-06, + "loss": 1.4645941257476807, + "step": 5182 + }, + { + "epoch": 1.595613697575991, + "grad_norm": 6.4375, + "learning_rate": 2.8288637030796023e-06, + "loss": 1.1216599941253662, + "step": 5184 + }, + { + "epoch": 1.5962293189688341, + "grad_norm": 6.5625, + "learning_rate": 2.8264349330767316e-06, + "loss": 1.520884394645691, + "step": 5186 + }, + { + "epoch": 1.5968449403616776, + "grad_norm": 10.6875, + "learning_rate": 2.8240093167166037e-06, + "loss": 1.397294044494629, + "step": 5188 + }, + { + "epoch": 1.597460561754521, + "grad_norm": 5.4375, + "learning_rate": 2.821586856409611e-06, + "loss": 1.2669717073440552, + "step": 5190 + }, + { + "epoch": 1.5980761831473642, + "grad_norm": 4.65625, + "learning_rate": 2.8191675545630066e-06, + "loss": 1.2159026861190796, + "step": 5192 + }, + { + "epoch": 1.5986918045402079, + "grad_norm": 7.4375, + "learning_rate": 2.8167514135809025e-06, + "loss": 1.2182968854904175, + "step": 5194 + }, + { + "epoch": 1.599307425933051, + "grad_norm": 15.3125, + "learning_rate": 2.8143384358642757e-06, + "loss": 1.1196813583374023, + "step": 5196 + }, + { + "epoch": 1.5999230473258945, + "grad_norm": 6.4375, + "learning_rate": 2.811928623810954e-06, + "loss": 1.0934268236160278, + "step": 5198 + }, + { + "epoch": 1.600538668718738, + "grad_norm": 5.6875, + "learning_rate": 2.8095219798156213e-06, + "loss": 1.3068904876708984, + "step": 5200 + }, + { + "epoch": 1.6011542901115814, + "grad_norm": 7.09375, + "learning_rate": 2.8071185062698158e-06, + "loss": 1.1975951194763184, + "step": 5202 + }, + { + "epoch": 1.6017699115044248, + "grad_norm": 5.34375, + "learning_rate": 2.8047182055619203e-06, + "loss": 1.0088521242141724, + "step": 5204 + }, + { + "epoch": 1.602385532897268, + "grad_norm": 2.78125, + "learning_rate": 2.8023210800771694e-06, + "loss": 1.170898199081421, + "step": 5206 + }, + { + "epoch": 1.6030011542901117, + "grad_norm": 5.46875, + "learning_rate": 2.7999271321976397e-06, + "loss": 1.3110368251800537, + "step": 5208 + }, + { + "epoch": 1.603616775682955, + "grad_norm": 13.1875, + "learning_rate": 2.797536364302251e-06, + "loss": 1.5783711671829224, + "step": 5210 + }, + { + "epoch": 1.6042323970757983, + "grad_norm": 2.96875, + "learning_rate": 2.7951487787667646e-06, + "loss": 1.162759780883789, + "step": 5212 + }, + { + "epoch": 1.6048480184686418, + "grad_norm": 10.1875, + "learning_rate": 2.7927643779637736e-06, + "loss": 1.3883463144302368, + "step": 5214 + }, + { + "epoch": 1.6054636398614852, + "grad_norm": 7.4375, + "learning_rate": 2.7903831642627144e-06, + "loss": 0.6480385661125183, + "step": 5216 + }, + { + "epoch": 1.6060792612543286, + "grad_norm": 8.0625, + "learning_rate": 2.78800514002985e-06, + "loss": 1.0798194408416748, + "step": 5218 + }, + { + "epoch": 1.6066948826471719, + "grad_norm": 6.28125, + "learning_rate": 2.7856303076282786e-06, + "loss": 1.295226812362671, + "step": 5220 + }, + { + "epoch": 1.6073105040400155, + "grad_norm": 8.0, + "learning_rate": 2.7832586694179235e-06, + "loss": 1.0225231647491455, + "step": 5222 + }, + { + "epoch": 1.6079261254328587, + "grad_norm": 10.4375, + "learning_rate": 2.780890227755533e-06, + "loss": 1.0516223907470703, + "step": 5224 + }, + { + "epoch": 1.6085417468257022, + "grad_norm": 2.9375, + "learning_rate": 2.778524984994685e-06, + "loss": 1.1044272184371948, + "step": 5226 + }, + { + "epoch": 1.6091573682185456, + "grad_norm": 2.984375, + "learning_rate": 2.776162943485769e-06, + "loss": 0.9470773339271545, + "step": 5228 + }, + { + "epoch": 1.609772989611389, + "grad_norm": 7.03125, + "learning_rate": 2.773804105576002e-06, + "loss": 1.5163894891738892, + "step": 5230 + }, + { + "epoch": 1.6103886110042325, + "grad_norm": 16.875, + "learning_rate": 2.771448473609413e-06, + "loss": 1.079423427581787, + "step": 5232 + }, + { + "epoch": 1.6110042323970757, + "grad_norm": 8.875, + "learning_rate": 2.7690960499268453e-06, + "loss": 1.9689688682556152, + "step": 5234 + }, + { + "epoch": 1.6116198537899193, + "grad_norm": 5.6875, + "learning_rate": 2.766746836865958e-06, + "loss": 1.467721700668335, + "step": 5236 + }, + { + "epoch": 1.6122354751827626, + "grad_norm": 5.65625, + "learning_rate": 2.764400836761214e-06, + "loss": 1.2351285219192505, + "step": 5238 + }, + { + "epoch": 1.612851096575606, + "grad_norm": 3.4375, + "learning_rate": 2.762058051943888e-06, + "loss": 1.2483223676681519, + "step": 5240 + }, + { + "epoch": 1.6134667179684494, + "grad_norm": 5.09375, + "learning_rate": 2.7597184847420557e-06, + "loss": 1.146704077720642, + "step": 5242 + }, + { + "epoch": 1.6140823393612926, + "grad_norm": 5.375, + "learning_rate": 2.7573821374805997e-06, + "loss": 1.1703463792800903, + "step": 5244 + }, + { + "epoch": 1.6146979607541363, + "grad_norm": 7.1875, + "learning_rate": 2.7550490124811992e-06, + "loss": 1.1746296882629395, + "step": 5246 + }, + { + "epoch": 1.6153135821469795, + "grad_norm": 5.5625, + "learning_rate": 2.7527191120623325e-06, + "loss": 1.4364252090454102, + "step": 5248 + }, + { + "epoch": 1.6159292035398232, + "grad_norm": 5.8125, + "learning_rate": 2.7503924385392757e-06, + "loss": 1.6019375324249268, + "step": 5250 + }, + { + "epoch": 1.6165448249326664, + "grad_norm": 6.71875, + "learning_rate": 2.7480689942240957e-06, + "loss": 0.9271045923233032, + "step": 5252 + }, + { + "epoch": 1.6171604463255098, + "grad_norm": 3.65625, + "learning_rate": 2.74574878142565e-06, + "loss": 1.2441916465759277, + "step": 5254 + }, + { + "epoch": 1.6177760677183533, + "grad_norm": 9.8125, + "learning_rate": 2.7434318024495875e-06, + "loss": 1.3981280326843262, + "step": 5256 + }, + { + "epoch": 1.6183916891111965, + "grad_norm": 20.75, + "learning_rate": 2.741118059598341e-06, + "loss": 0.778854489326477, + "step": 5258 + }, + { + "epoch": 1.6190073105040401, + "grad_norm": 6.3125, + "learning_rate": 2.7388075551711302e-06, + "loss": 1.2868058681488037, + "step": 5260 + }, + { + "epoch": 1.6196229318968833, + "grad_norm": 12.75, + "learning_rate": 2.736500291463953e-06, + "loss": 1.343778133392334, + "step": 5262 + }, + { + "epoch": 1.6202385532897268, + "grad_norm": 15.8125, + "learning_rate": 2.734196270769591e-06, + "loss": 1.4329502582550049, + "step": 5264 + }, + { + "epoch": 1.6208541746825702, + "grad_norm": 2.546875, + "learning_rate": 2.7318954953776013e-06, + "loss": 1.150846004486084, + "step": 5266 + }, + { + "epoch": 1.6214697960754136, + "grad_norm": 6.03125, + "learning_rate": 2.729597967574313e-06, + "loss": 1.2093613147735596, + "step": 5268 + }, + { + "epoch": 1.622085417468257, + "grad_norm": 12.125, + "learning_rate": 2.7273036896428343e-06, + "loss": 1.6008520126342773, + "step": 5270 + }, + { + "epoch": 1.6227010388611003, + "grad_norm": 6.9375, + "learning_rate": 2.725012663863038e-06, + "loss": 1.0350145101547241, + "step": 5272 + }, + { + "epoch": 1.623316660253944, + "grad_norm": 11.0, + "learning_rate": 2.7227248925115713e-06, + "loss": 1.0717015266418457, + "step": 5274 + }, + { + "epoch": 1.6239322816467872, + "grad_norm": 7.75, + "learning_rate": 2.720440377861841e-06, + "loss": 1.5751841068267822, + "step": 5276 + }, + { + "epoch": 1.6245479030396306, + "grad_norm": 9.4375, + "learning_rate": 2.7181591221840215e-06, + "loss": 1.2236948013305664, + "step": 5278 + }, + { + "epoch": 1.625163524432474, + "grad_norm": 5.9375, + "learning_rate": 2.7158811277450476e-06, + "loss": 1.093230962753296, + "step": 5280 + }, + { + "epoch": 1.6257791458253175, + "grad_norm": 7.90625, + "learning_rate": 2.713606396808612e-06, + "loss": 1.3141717910766602, + "step": 5282 + }, + { + "epoch": 1.626394767218161, + "grad_norm": 56.75, + "learning_rate": 2.711334931635168e-06, + "loss": 1.1603947877883911, + "step": 5284 + }, + { + "epoch": 1.627010388611004, + "grad_norm": 4.84375, + "learning_rate": 2.709066734481921e-06, + "loss": 1.1331205368041992, + "step": 5286 + }, + { + "epoch": 1.6276260100038478, + "grad_norm": 11.5, + "learning_rate": 2.706801807602828e-06, + "loss": 1.4139946699142456, + "step": 5288 + }, + { + "epoch": 1.628241631396691, + "grad_norm": 7.9375, + "learning_rate": 2.7045401532486e-06, + "loss": 0.35363346338272095, + "step": 5290 + }, + { + "epoch": 1.6288572527895344, + "grad_norm": 5.375, + "learning_rate": 2.7022817736666905e-06, + "loss": 1.1643867492675781, + "step": 5292 + }, + { + "epoch": 1.6294728741823779, + "grad_norm": 5.9375, + "learning_rate": 2.7000266711013046e-06, + "loss": 1.1620657444000244, + "step": 5294 + }, + { + "epoch": 1.6300884955752213, + "grad_norm": 9.375, + "learning_rate": 2.6977748477933863e-06, + "loss": 1.6389256715774536, + "step": 5296 + }, + { + "epoch": 1.6307041169680647, + "grad_norm": 4.03125, + "learning_rate": 2.6955263059806247e-06, + "loss": 1.0903058052062988, + "step": 5298 + }, + { + "epoch": 1.631319738360908, + "grad_norm": 8.75, + "learning_rate": 2.693281047897446e-06, + "loss": 1.423241138458252, + "step": 5300 + }, + { + "epoch": 1.6319353597537516, + "grad_norm": 4.40625, + "learning_rate": 2.691039075775012e-06, + "loss": 1.153511881828308, + "step": 5302 + }, + { + "epoch": 1.6325509811465948, + "grad_norm": 3.0, + "learning_rate": 2.688800391841222e-06, + "loss": 1.011278748512268, + "step": 5304 + }, + { + "epoch": 1.6331666025394382, + "grad_norm": 2.203125, + "learning_rate": 2.686564998320705e-06, + "loss": 1.1042957305908203, + "step": 5306 + }, + { + "epoch": 1.6337822239322817, + "grad_norm": 12.125, + "learning_rate": 2.684332897434823e-06, + "loss": 1.385214924812317, + "step": 5308 + }, + { + "epoch": 1.6343978453251249, + "grad_norm": 8.375, + "learning_rate": 2.682104091401665e-06, + "loss": 1.499825358390808, + "step": 5310 + }, + { + "epoch": 1.6350134667179685, + "grad_norm": 8.5, + "learning_rate": 2.679878582436043e-06, + "loss": 1.6597236394882202, + "step": 5312 + }, + { + "epoch": 1.6356290881108118, + "grad_norm": 6.03125, + "learning_rate": 2.6776563727494987e-06, + "loss": 1.1139764785766602, + "step": 5314 + }, + { + "epoch": 1.6362447095036554, + "grad_norm": 6.40625, + "learning_rate": 2.6754374645502896e-06, + "loss": 1.1137142181396484, + "step": 5316 + }, + { + "epoch": 1.6368603308964986, + "grad_norm": 7.15625, + "learning_rate": 2.673221860043394e-06, + "loss": 1.0289076566696167, + "step": 5318 + }, + { + "epoch": 1.637475952289342, + "grad_norm": 4.875, + "learning_rate": 2.6710095614305085e-06, + "loss": 1.285740852355957, + "step": 5320 + }, + { + "epoch": 1.6380915736821855, + "grad_norm": 26.25, + "learning_rate": 2.6688005709100445e-06, + "loss": 1.420121669769287, + "step": 5322 + }, + { + "epoch": 1.6387071950750287, + "grad_norm": 17.375, + "learning_rate": 2.6665948906771257e-06, + "loss": 1.826535940170288, + "step": 5324 + }, + { + "epoch": 1.6393228164678724, + "grad_norm": 12.625, + "learning_rate": 2.6643925229235827e-06, + "loss": 1.2961101531982422, + "step": 5326 + }, + { + "epoch": 1.6399384378607156, + "grad_norm": 10.375, + "learning_rate": 2.662193469837963e-06, + "loss": 1.4630703926086426, + "step": 5328 + }, + { + "epoch": 1.640554059253559, + "grad_norm": 4.0, + "learning_rate": 2.65999773360551e-06, + "loss": 1.136149525642395, + "step": 5330 + }, + { + "epoch": 1.6411696806464025, + "grad_norm": 10.625, + "learning_rate": 2.6578053164081784e-06, + "loss": 1.469642162322998, + "step": 5332 + }, + { + "epoch": 1.6417853020392459, + "grad_norm": 4.9375, + "learning_rate": 2.6556162204246223e-06, + "loss": 1.253054141998291, + "step": 5334 + }, + { + "epoch": 1.6424009234320893, + "grad_norm": 17.875, + "learning_rate": 2.6534304478301942e-06, + "loss": 1.297611951828003, + "step": 5336 + }, + { + "epoch": 1.6430165448249325, + "grad_norm": 4.09375, + "learning_rate": 2.6512480007969472e-06, + "loss": 1.1495213508605957, + "step": 5338 + }, + { + "epoch": 1.6436321662177762, + "grad_norm": 4.71875, + "learning_rate": 2.6490688814936265e-06, + "loss": 1.2631639242172241, + "step": 5340 + }, + { + "epoch": 1.6442477876106194, + "grad_norm": 6.0625, + "learning_rate": 2.6468930920856727e-06, + "loss": 1.4238358736038208, + "step": 5342 + }, + { + "epoch": 1.6448634090034628, + "grad_norm": 6.21875, + "learning_rate": 2.644720634735216e-06, + "loss": 1.4778305292129517, + "step": 5344 + }, + { + "epoch": 1.6454790303963063, + "grad_norm": 3.703125, + "learning_rate": 2.6425515116010748e-06, + "loss": 1.061782717704773, + "step": 5346 + }, + { + "epoch": 1.6460946517891497, + "grad_norm": 7.90625, + "learning_rate": 2.640385724838757e-06, + "loss": 1.1082851886749268, + "step": 5348 + }, + { + "epoch": 1.6467102731819931, + "grad_norm": 8.8125, + "learning_rate": 2.638223276600453e-06, + "loss": 1.4387338161468506, + "step": 5350 + }, + { + "epoch": 1.6473258945748364, + "grad_norm": 6.28125, + "learning_rate": 2.6360641690350362e-06, + "loss": 1.4443461894989014, + "step": 5352 + }, + { + "epoch": 1.64794151596768, + "grad_norm": 12.375, + "learning_rate": 2.63390840428806e-06, + "loss": 1.0974818468093872, + "step": 5354 + }, + { + "epoch": 1.6485571373605232, + "grad_norm": 16.25, + "learning_rate": 2.6317559845017564e-06, + "loss": 1.4540315866470337, + "step": 5356 + }, + { + "epoch": 1.6491727587533667, + "grad_norm": 13.5625, + "learning_rate": 2.6296069118150337e-06, + "loss": 1.5910497903823853, + "step": 5358 + }, + { + "epoch": 1.64978838014621, + "grad_norm": 15.4375, + "learning_rate": 2.627461188363471e-06, + "loss": 1.3330596685409546, + "step": 5360 + }, + { + "epoch": 1.6504040015390535, + "grad_norm": 15.3125, + "learning_rate": 2.6253188162793254e-06, + "loss": 1.1511274576187134, + "step": 5362 + }, + { + "epoch": 1.651019622931897, + "grad_norm": 58.5, + "learning_rate": 2.6231797976915173e-06, + "loss": 1.5621612071990967, + "step": 5364 + }, + { + "epoch": 1.6516352443247402, + "grad_norm": 8.1875, + "learning_rate": 2.621044134725639e-06, + "loss": 1.622155785560608, + "step": 5366 + }, + { + "epoch": 1.6522508657175838, + "grad_norm": 6.21875, + "learning_rate": 2.6189118295039465e-06, + "loss": 1.0462428331375122, + "step": 5368 + }, + { + "epoch": 1.652866487110427, + "grad_norm": 9.0625, + "learning_rate": 2.6167828841453575e-06, + "loss": 1.3204021453857422, + "step": 5370 + }, + { + "epoch": 1.6534821085032705, + "grad_norm": 16.5, + "learning_rate": 2.614657300765455e-06, + "loss": 1.005763053894043, + "step": 5372 + }, + { + "epoch": 1.654097729896114, + "grad_norm": 2.484375, + "learning_rate": 2.6125350814764777e-06, + "loss": 1.2809065580368042, + "step": 5374 + }, + { + "epoch": 1.6547133512889571, + "grad_norm": 9.125, + "learning_rate": 2.6104162283873236e-06, + "loss": 1.371181607246399, + "step": 5376 + }, + { + "epoch": 1.6553289726818008, + "grad_norm": 9.6875, + "learning_rate": 2.608300743603543e-06, + "loss": 1.2377533912658691, + "step": 5378 + }, + { + "epoch": 1.655944594074644, + "grad_norm": 8.75, + "learning_rate": 2.606188629227342e-06, + "loss": 1.4686627388000488, + "step": 5380 + }, + { + "epoch": 1.6565602154674874, + "grad_norm": 5.46875, + "learning_rate": 2.604079887357575e-06, + "loss": 1.3642234802246094, + "step": 5382 + }, + { + "epoch": 1.6571758368603309, + "grad_norm": 7.9375, + "learning_rate": 2.601974520089745e-06, + "loss": 1.4514119625091553, + "step": 5384 + }, + { + "epoch": 1.6577914582531743, + "grad_norm": 6.46875, + "learning_rate": 2.5998725295160053e-06, + "loss": 1.166435956954956, + "step": 5386 + }, + { + "epoch": 1.6584070796460177, + "grad_norm": 3.453125, + "learning_rate": 2.5977739177251492e-06, + "loss": 0.9771444797515869, + "step": 5388 + }, + { + "epoch": 1.659022701038861, + "grad_norm": 7.96875, + "learning_rate": 2.595678686802614e-06, + "loss": 1.4036625623703003, + "step": 5390 + }, + { + "epoch": 1.6596383224317046, + "grad_norm": 6.03125, + "learning_rate": 2.5935868388304797e-06, + "loss": 1.1340692043304443, + "step": 5392 + }, + { + "epoch": 1.6602539438245478, + "grad_norm": 8.125, + "learning_rate": 2.5914983758874612e-06, + "loss": 1.44871187210083, + "step": 5394 + }, + { + "epoch": 1.6608695652173913, + "grad_norm": 9.375, + "learning_rate": 2.5894133000489108e-06, + "loss": 1.542242169380188, + "step": 5396 + }, + { + "epoch": 1.6614851866102347, + "grad_norm": 9.1875, + "learning_rate": 2.5873316133868154e-06, + "loss": 1.0620455741882324, + "step": 5398 + }, + { + "epoch": 1.6621008080030781, + "grad_norm": 5.28125, + "learning_rate": 2.585253317969793e-06, + "loss": 1.1398513317108154, + "step": 5400 + }, + { + "epoch": 1.6627164293959216, + "grad_norm": 28.125, + "learning_rate": 2.583178415863093e-06, + "loss": 1.1372501850128174, + "step": 5402 + }, + { + "epoch": 1.6633320507887648, + "grad_norm": 8.8125, + "learning_rate": 2.5811069091285916e-06, + "loss": 1.3599408864974976, + "step": 5404 + }, + { + "epoch": 1.6639476721816084, + "grad_norm": 4.875, + "learning_rate": 2.5790387998247933e-06, + "loss": 1.3852657079696655, + "step": 5406 + }, + { + "epoch": 1.6645632935744517, + "grad_norm": 6.5625, + "learning_rate": 2.5769740900068223e-06, + "loss": 1.4934041500091553, + "step": 5408 + }, + { + "epoch": 1.665178914967295, + "grad_norm": 6.78125, + "learning_rate": 2.5749127817264284e-06, + "loss": 1.2885091304779053, + "step": 5410 + }, + { + "epoch": 1.6657945363601385, + "grad_norm": 5.46875, + "learning_rate": 2.57285487703198e-06, + "loss": 1.5227521657943726, + "step": 5412 + }, + { + "epoch": 1.666410157752982, + "grad_norm": 8.25, + "learning_rate": 2.570800377968461e-06, + "loss": 1.2954890727996826, + "step": 5414 + }, + { + "epoch": 1.6670257791458254, + "grad_norm": 5.40625, + "learning_rate": 2.5687492865774765e-06, + "loss": 1.2142372131347656, + "step": 5416 + }, + { + "epoch": 1.6676414005386686, + "grad_norm": 5.5625, + "learning_rate": 2.5667016048972394e-06, + "loss": 1.4201480150222778, + "step": 5418 + }, + { + "epoch": 1.6682570219315123, + "grad_norm": 9.6875, + "learning_rate": 2.564657334962578e-06, + "loss": 1.475806713104248, + "step": 5420 + }, + { + "epoch": 1.6688726433243555, + "grad_norm": 7.75, + "learning_rate": 2.562616478804929e-06, + "loss": 1.4380114078521729, + "step": 5422 + }, + { + "epoch": 1.669488264717199, + "grad_norm": 3.921875, + "learning_rate": 2.560579038452336e-06, + "loss": 1.3787497282028198, + "step": 5424 + }, + { + "epoch": 1.6701038861100423, + "grad_norm": 9.8125, + "learning_rate": 2.5585450159294506e-06, + "loss": 0.8029026985168457, + "step": 5426 + }, + { + "epoch": 1.6707195075028856, + "grad_norm": 20.0, + "learning_rate": 2.556514413257525e-06, + "loss": 1.5744322538375854, + "step": 5428 + }, + { + "epoch": 1.6713351288957292, + "grad_norm": 3.96875, + "learning_rate": 2.5544872324544168e-06, + "loss": 1.4971494674682617, + "step": 5430 + }, + { + "epoch": 1.6719507502885724, + "grad_norm": 11.75, + "learning_rate": 2.552463475534581e-06, + "loss": 1.7740942239761353, + "step": 5432 + }, + { + "epoch": 1.672566371681416, + "grad_norm": 4.6875, + "learning_rate": 2.5504431445090668e-06, + "loss": 1.09224534034729, + "step": 5434 + }, + { + "epoch": 1.6731819930742593, + "grad_norm": 4.625, + "learning_rate": 2.5484262413855247e-06, + "loss": 1.319622278213501, + "step": 5436 + }, + { + "epoch": 1.6737976144671027, + "grad_norm": 3.59375, + "learning_rate": 2.546412768168196e-06, + "loss": 1.0411967039108276, + "step": 5438 + }, + { + "epoch": 1.6744132358599462, + "grad_norm": 6.90625, + "learning_rate": 2.5444027268579157e-06, + "loss": 1.4855449199676514, + "step": 5440 + }, + { + "epoch": 1.6750288572527894, + "grad_norm": 8.125, + "learning_rate": 2.5423961194521064e-06, + "loss": 1.4647443294525146, + "step": 5442 + }, + { + "epoch": 1.675644478645633, + "grad_norm": 5.8125, + "learning_rate": 2.5403929479447765e-06, + "loss": 1.0761570930480957, + "step": 5444 + }, + { + "epoch": 1.6762601000384763, + "grad_norm": 5.9375, + "learning_rate": 2.538393214326527e-06, + "loss": 0.6961817741394043, + "step": 5446 + }, + { + "epoch": 1.6768757214313197, + "grad_norm": 9.375, + "learning_rate": 2.5363969205845317e-06, + "loss": 0.7719630599021912, + "step": 5448 + }, + { + "epoch": 1.6774913428241631, + "grad_norm": 5.375, + "learning_rate": 2.5344040687025577e-06, + "loss": 1.4532030820846558, + "step": 5450 + }, + { + "epoch": 1.6781069642170066, + "grad_norm": 4.4375, + "learning_rate": 2.5324146606609452e-06, + "loss": 1.089078664779663, + "step": 5452 + }, + { + "epoch": 1.67872258560985, + "grad_norm": 8.5625, + "learning_rate": 2.530428698436612e-06, + "loss": 0.639711320400238, + "step": 5454 + }, + { + "epoch": 1.6793382070026932, + "grad_norm": 8.375, + "learning_rate": 2.5284461840030557e-06, + "loss": 1.4705522060394287, + "step": 5456 + }, + { + "epoch": 1.6799538283955369, + "grad_norm": 5.65625, + "learning_rate": 2.5264671193303434e-06, + "loss": 1.4255211353302002, + "step": 5458 + }, + { + "epoch": 1.68056944978838, + "grad_norm": 24.375, + "learning_rate": 2.524491506385117e-06, + "loss": 1.188090205192566, + "step": 5460 + }, + { + "epoch": 1.6811850711812235, + "grad_norm": 11.8125, + "learning_rate": 2.522519347130587e-06, + "loss": 1.0061157941818237, + "step": 5462 + }, + { + "epoch": 1.681800692574067, + "grad_norm": 5.59375, + "learning_rate": 2.5205506435265325e-06, + "loss": 1.0411052703857422, + "step": 5464 + }, + { + "epoch": 1.6824163139669104, + "grad_norm": 2.40625, + "learning_rate": 2.5185853975292984e-06, + "loss": 1.256507396697998, + "step": 5466 + }, + { + "epoch": 1.6830319353597538, + "grad_norm": 4.96875, + "learning_rate": 2.516623611091793e-06, + "loss": 1.1471688747406006, + "step": 5468 + }, + { + "epoch": 1.683647556752597, + "grad_norm": 5.34375, + "learning_rate": 2.5146652861634887e-06, + "loss": 1.1057099103927612, + "step": 5470 + }, + { + "epoch": 1.6842631781454407, + "grad_norm": 20.75, + "learning_rate": 2.512710424690416e-06, + "loss": 1.255007028579712, + "step": 5472 + }, + { + "epoch": 1.684878799538284, + "grad_norm": 3.46875, + "learning_rate": 2.510759028615165e-06, + "loss": 1.100907564163208, + "step": 5474 + }, + { + "epoch": 1.6854944209311273, + "grad_norm": 10.8125, + "learning_rate": 2.5088110998768817e-06, + "loss": 1.236773133277893, + "step": 5476 + }, + { + "epoch": 1.6861100423239708, + "grad_norm": 10.4375, + "learning_rate": 2.506866640411265e-06, + "loss": 1.5754953622817993, + "step": 5478 + }, + { + "epoch": 1.6867256637168142, + "grad_norm": 4.78125, + "learning_rate": 2.50492565215057e-06, + "loss": 1.3212549686431885, + "step": 5480 + }, + { + "epoch": 1.6873412851096576, + "grad_norm": 13.0, + "learning_rate": 2.5029881370235993e-06, + "loss": 1.6552295684814453, + "step": 5482 + }, + { + "epoch": 1.6879569065025009, + "grad_norm": 8.875, + "learning_rate": 2.5010540969557064e-06, + "loss": 1.6298810243606567, + "step": 5484 + }, + { + "epoch": 1.6885725278953445, + "grad_norm": 5.125, + "learning_rate": 2.4991235338687886e-06, + "loss": 1.4609882831573486, + "step": 5486 + }, + { + "epoch": 1.6891881492881877, + "grad_norm": 12.4375, + "learning_rate": 2.497196449681289e-06, + "loss": 1.3491747379302979, + "step": 5488 + }, + { + "epoch": 1.6898037706810312, + "grad_norm": 5.1875, + "learning_rate": 2.4952728463081964e-06, + "loss": 1.4812688827514648, + "step": 5490 + }, + { + "epoch": 1.6904193920738746, + "grad_norm": 6.5, + "learning_rate": 2.4933527256610377e-06, + "loss": 1.118019461631775, + "step": 5492 + }, + { + "epoch": 1.6910350134667178, + "grad_norm": 9.9375, + "learning_rate": 2.49143608964788e-06, + "loss": 1.4161806106567383, + "step": 5494 + }, + { + "epoch": 1.6916506348595615, + "grad_norm": 14.1875, + "learning_rate": 2.4895229401733278e-06, + "loss": 1.4623079299926758, + "step": 5496 + }, + { + "epoch": 1.6922662562524047, + "grad_norm": 9.5, + "learning_rate": 2.48761327913852e-06, + "loss": 1.365538239479065, + "step": 5498 + }, + { + "epoch": 1.6928818776452483, + "grad_norm": 19.0, + "learning_rate": 2.4857071084411302e-06, + "loss": 1.220156192779541, + "step": 5500 + }, + { + "epoch": 1.6934974990380915, + "grad_norm": 4.59375, + "learning_rate": 2.4838044299753615e-06, + "loss": 1.311576008796692, + "step": 5502 + }, + { + "epoch": 1.694113120430935, + "grad_norm": 18.125, + "learning_rate": 2.48190524563195e-06, + "loss": 0.9696519374847412, + "step": 5504 + }, + { + "epoch": 1.6947287418237784, + "grad_norm": 3.375, + "learning_rate": 2.4800095572981567e-06, + "loss": 1.153354287147522, + "step": 5506 + }, + { + "epoch": 1.6953443632166216, + "grad_norm": 14.0, + "learning_rate": 2.4781173668577692e-06, + "loss": 1.186468482017517, + "step": 5508 + }, + { + "epoch": 1.6959599846094653, + "grad_norm": 3.796875, + "learning_rate": 2.476228676191102e-06, + "loss": 1.2508482933044434, + "step": 5510 + }, + { + "epoch": 1.6965756060023085, + "grad_norm": 6.375, + "learning_rate": 2.474343487174985e-06, + "loss": 1.5084669589996338, + "step": 5512 + }, + { + "epoch": 1.697191227395152, + "grad_norm": 7.875, + "learning_rate": 2.4724618016827775e-06, + "loss": 1.4095842838287354, + "step": 5514 + }, + { + "epoch": 1.6978068487879954, + "grad_norm": 8.5625, + "learning_rate": 2.470583621584349e-06, + "loss": 1.1798251867294312, + "step": 5516 + }, + { + "epoch": 1.6984224701808388, + "grad_norm": 5.75, + "learning_rate": 2.468708948746091e-06, + "loss": 1.5921574831008911, + "step": 5518 + }, + { + "epoch": 1.6990380915736822, + "grad_norm": 2.9375, + "learning_rate": 2.466837785030908e-06, + "loss": 1.3569403886795044, + "step": 5520 + }, + { + "epoch": 1.6996537129665255, + "grad_norm": 2.546875, + "learning_rate": 2.464970132298216e-06, + "loss": 1.2199639081954956, + "step": 5522 + }, + { + "epoch": 1.7002693343593691, + "grad_norm": 8.25, + "learning_rate": 2.4631059924039444e-06, + "loss": 1.5668450593948364, + "step": 5524 + }, + { + "epoch": 1.7008849557522123, + "grad_norm": 7.0625, + "learning_rate": 2.46124536720053e-06, + "loss": 1.0412358045578003, + "step": 5526 + }, + { + "epoch": 1.7015005771450558, + "grad_norm": 7.0, + "learning_rate": 2.459388258536919e-06, + "loss": 1.2781226634979248, + "step": 5528 + }, + { + "epoch": 1.7021161985378992, + "grad_norm": 4.6875, + "learning_rate": 2.4575346682585616e-06, + "loss": 1.128685474395752, + "step": 5530 + }, + { + "epoch": 1.7027318199307426, + "grad_norm": 8.625, + "learning_rate": 2.4556845982074103e-06, + "loss": 1.0571131706237793, + "step": 5532 + }, + { + "epoch": 1.703347441323586, + "grad_norm": 4.75, + "learning_rate": 2.4538380502219238e-06, + "loss": 1.0286558866500854, + "step": 5534 + }, + { + "epoch": 1.7039630627164293, + "grad_norm": 5.53125, + "learning_rate": 2.451995026137057e-06, + "loss": 1.1925334930419922, + "step": 5536 + }, + { + "epoch": 1.704578684109273, + "grad_norm": 7.46875, + "learning_rate": 2.4501555277842636e-06, + "loss": 1.2129243612289429, + "step": 5538 + }, + { + "epoch": 1.7051943055021161, + "grad_norm": 4.09375, + "learning_rate": 2.4483195569914954e-06, + "loss": 1.0670663118362427, + "step": 5540 + }, + { + "epoch": 1.7058099268949596, + "grad_norm": 6.40625, + "learning_rate": 2.4464871155831963e-06, + "loss": 1.0114057064056396, + "step": 5542 + }, + { + "epoch": 1.706425548287803, + "grad_norm": 11.5, + "learning_rate": 2.4446582053803068e-06, + "loss": 1.2254160642623901, + "step": 5544 + }, + { + "epoch": 1.7070411696806465, + "grad_norm": 5.25, + "learning_rate": 2.442832828200253e-06, + "loss": 1.3474894762039185, + "step": 5546 + }, + { + "epoch": 1.7076567910734899, + "grad_norm": 4.1875, + "learning_rate": 2.4410109858569567e-06, + "loss": 1.1840492486953735, + "step": 5548 + }, + { + "epoch": 1.708272412466333, + "grad_norm": 4.6875, + "learning_rate": 2.43919268016082e-06, + "loss": 1.0255926847457886, + "step": 5550 + }, + { + "epoch": 1.7088880338591768, + "grad_norm": 70.5, + "learning_rate": 2.4373779129187356e-06, + "loss": 1.3053051233291626, + "step": 5552 + }, + { + "epoch": 1.70950365525202, + "grad_norm": 12.5625, + "learning_rate": 2.435566685934079e-06, + "loss": 1.7308895587921143, + "step": 5554 + }, + { + "epoch": 1.7101192766448634, + "grad_norm": 5.6875, + "learning_rate": 2.433759001006705e-06, + "loss": 0.9967052340507507, + "step": 5556 + }, + { + "epoch": 1.7107348980377068, + "grad_norm": 4.0, + "learning_rate": 2.431954859932953e-06, + "loss": 1.1225043535232544, + "step": 5558 + }, + { + "epoch": 1.71135051943055, + "grad_norm": 10.0, + "learning_rate": 2.4301542645056373e-06, + "loss": 1.176830530166626, + "step": 5560 + }, + { + "epoch": 1.7119661408233937, + "grad_norm": 8.25, + "learning_rate": 2.4283572165140496e-06, + "loss": 1.2773972749710083, + "step": 5562 + }, + { + "epoch": 1.712581762216237, + "grad_norm": 5.15625, + "learning_rate": 2.4265637177439577e-06, + "loss": 1.1938509941101074, + "step": 5564 + }, + { + "epoch": 1.7131973836090806, + "grad_norm": 5.5625, + "learning_rate": 2.4247737699776e-06, + "loss": 1.229737401008606, + "step": 5566 + }, + { + "epoch": 1.7138130050019238, + "grad_norm": 5.125, + "learning_rate": 2.4229873749936904e-06, + "loss": 1.314058542251587, + "step": 5568 + }, + { + "epoch": 1.7144286263947672, + "grad_norm": 119.5, + "learning_rate": 2.421204534567406e-06, + "loss": 0.6797417402267456, + "step": 5570 + }, + { + "epoch": 1.7150442477876107, + "grad_norm": 4.1875, + "learning_rate": 2.4194252504703985e-06, + "loss": 1.1942250728607178, + "step": 5572 + }, + { + "epoch": 1.7156598691804539, + "grad_norm": 4.1875, + "learning_rate": 2.4176495244707814e-06, + "loss": 1.420361876487732, + "step": 5574 + }, + { + "epoch": 1.7162754905732975, + "grad_norm": 15.0625, + "learning_rate": 2.415877358333133e-06, + "loss": 1.2674133777618408, + "step": 5576 + }, + { + "epoch": 1.7168911119661407, + "grad_norm": 10.375, + "learning_rate": 2.414108753818495e-06, + "loss": 1.6348576545715332, + "step": 5578 + }, + { + "epoch": 1.7175067333589842, + "grad_norm": 5.125, + "learning_rate": 2.412343712684368e-06, + "loss": 1.1598114967346191, + "step": 5580 + }, + { + "epoch": 1.7181223547518276, + "grad_norm": 10.8125, + "learning_rate": 2.410582236684714e-06, + "loss": 1.364866018295288, + "step": 5582 + }, + { + "epoch": 1.718737976144671, + "grad_norm": 13.0625, + "learning_rate": 2.4088243275699523e-06, + "loss": 1.0493624210357666, + "step": 5584 + }, + { + "epoch": 1.7193535975375145, + "grad_norm": 5.0, + "learning_rate": 2.407069987086954e-06, + "loss": 1.2027814388275146, + "step": 5586 + }, + { + "epoch": 1.7199692189303577, + "grad_norm": 14.5, + "learning_rate": 2.40531921697905e-06, + "loss": 1.262241005897522, + "step": 5588 + }, + { + "epoch": 1.7205848403232014, + "grad_norm": 3.4375, + "learning_rate": 2.4035720189860167e-06, + "loss": 0.9158905148506165, + "step": 5590 + }, + { + "epoch": 1.7212004617160446, + "grad_norm": 12.6875, + "learning_rate": 2.4018283948440856e-06, + "loss": 1.7495272159576416, + "step": 5592 + }, + { + "epoch": 1.721816083108888, + "grad_norm": 8.8125, + "learning_rate": 2.4000883462859337e-06, + "loss": 1.157536268234253, + "step": 5594 + }, + { + "epoch": 1.7224317045017314, + "grad_norm": 4.1875, + "learning_rate": 2.3983518750406874e-06, + "loss": 1.124690055847168, + "step": 5596 + }, + { + "epoch": 1.7230473258945749, + "grad_norm": 4.3125, + "learning_rate": 2.396618982833917e-06, + "loss": 1.0826674699783325, + "step": 5598 + }, + { + "epoch": 1.7236629472874183, + "grad_norm": 17.625, + "learning_rate": 2.394889671387636e-06, + "loss": 1.5495002269744873, + "step": 5600 + }, + { + "epoch": 1.7242785686802615, + "grad_norm": 9.25, + "learning_rate": 2.3931639424203e-06, + "loss": 1.1986749172210693, + "step": 5602 + }, + { + "epoch": 1.7248941900731052, + "grad_norm": 11.125, + "learning_rate": 2.391441797646805e-06, + "loss": 1.1907917261123657, + "step": 5604 + }, + { + "epoch": 1.7255098114659484, + "grad_norm": 4.3125, + "learning_rate": 2.3897232387784842e-06, + "loss": 1.3200044631958008, + "step": 5606 + }, + { + "epoch": 1.7261254328587918, + "grad_norm": 15.75, + "learning_rate": 2.3880082675231088e-06, + "loss": 1.4564788341522217, + "step": 5608 + }, + { + "epoch": 1.7267410542516353, + "grad_norm": 8.5625, + "learning_rate": 2.386296885584883e-06, + "loss": 1.6941089630126953, + "step": 5610 + }, + { + "epoch": 1.7273566756444787, + "grad_norm": 7.03125, + "learning_rate": 2.3845890946644466e-06, + "loss": 1.1814099550247192, + "step": 5612 + }, + { + "epoch": 1.7279722970373221, + "grad_norm": 3.0, + "learning_rate": 2.3828848964588694e-06, + "loss": 1.255849838256836, + "step": 5614 + }, + { + "epoch": 1.7285879184301653, + "grad_norm": 6.34375, + "learning_rate": 2.3811842926616513e-06, + "loss": 1.195947289466858, + "step": 5616 + }, + { + "epoch": 1.729203539823009, + "grad_norm": 9.9375, + "learning_rate": 2.37948728496272e-06, + "loss": 1.5996789932250977, + "step": 5618 + }, + { + "epoch": 1.7298191612158522, + "grad_norm": 11.75, + "learning_rate": 2.3777938750484306e-06, + "loss": 1.3444377183914185, + "step": 5620 + }, + { + "epoch": 1.7304347826086957, + "grad_norm": 3.078125, + "learning_rate": 2.3761040646015623e-06, + "loss": 1.450816035270691, + "step": 5622 + }, + { + "epoch": 1.731050404001539, + "grad_norm": 11.0, + "learning_rate": 2.374417855301317e-06, + "loss": 1.3607956171035767, + "step": 5624 + }, + { + "epoch": 1.7316660253943823, + "grad_norm": 7.59375, + "learning_rate": 2.372735248823321e-06, + "loss": 1.529814600944519, + "step": 5626 + }, + { + "epoch": 1.732281646787226, + "grad_norm": 6.25, + "learning_rate": 2.3710562468396146e-06, + "loss": 1.4916486740112305, + "step": 5628 + }, + { + "epoch": 1.7328972681800692, + "grad_norm": 10.1875, + "learning_rate": 2.3693808510186625e-06, + "loss": 1.5742865800857544, + "step": 5630 + }, + { + "epoch": 1.7335128895729126, + "grad_norm": 3.78125, + "learning_rate": 2.367709063025342e-06, + "loss": 1.2341521978378296, + "step": 5632 + }, + { + "epoch": 1.734128510965756, + "grad_norm": 1.8671875, + "learning_rate": 2.3660408845209455e-06, + "loss": 1.0239232778549194, + "step": 5634 + }, + { + "epoch": 1.7347441323585995, + "grad_norm": 8.875, + "learning_rate": 2.3643763171631815e-06, + "loss": 1.3741108179092407, + "step": 5636 + }, + { + "epoch": 1.735359753751443, + "grad_norm": 5.0625, + "learning_rate": 2.3627153626061663e-06, + "loss": 1.338080883026123, + "step": 5638 + }, + { + "epoch": 1.7359753751442861, + "grad_norm": 6.03125, + "learning_rate": 2.361058022500428e-06, + "loss": 1.20331609249115, + "step": 5640 + }, + { + "epoch": 1.7365909965371298, + "grad_norm": 4.0625, + "learning_rate": 2.359404298492903e-06, + "loss": 1.3246911764144897, + "step": 5642 + }, + { + "epoch": 1.737206617929973, + "grad_norm": 15.8125, + "learning_rate": 2.3577541922269324e-06, + "loss": 1.3114361763000488, + "step": 5644 + }, + { + "epoch": 1.7378222393228164, + "grad_norm": 102.0, + "learning_rate": 2.3561077053422658e-06, + "loss": 0.9264519214630127, + "step": 5646 + }, + { + "epoch": 1.7384378607156599, + "grad_norm": 12.125, + "learning_rate": 2.3544648394750535e-06, + "loss": 1.4408466815948486, + "step": 5648 + }, + { + "epoch": 1.7390534821085033, + "grad_norm": 5.34375, + "learning_rate": 2.352825596257847e-06, + "loss": 1.1803123950958252, + "step": 5650 + }, + { + "epoch": 1.7396691035013467, + "grad_norm": 8.625, + "learning_rate": 2.351189977319601e-06, + "loss": 1.317006230354309, + "step": 5652 + }, + { + "epoch": 1.74028472489419, + "grad_norm": 5.8125, + "learning_rate": 2.349557984285665e-06, + "loss": 1.1664330959320068, + "step": 5654 + }, + { + "epoch": 1.7409003462870336, + "grad_norm": 3.96875, + "learning_rate": 2.3479296187777877e-06, + "loss": 1.3429275751113892, + "step": 5656 + }, + { + "epoch": 1.7415159676798768, + "grad_norm": 6.84375, + "learning_rate": 2.3463048824141123e-06, + "loss": 1.1863704919815063, + "step": 5658 + }, + { + "epoch": 1.7421315890727203, + "grad_norm": 5.5625, + "learning_rate": 2.3446837768091763e-06, + "loss": 1.2393282651901245, + "step": 5660 + }, + { + "epoch": 1.7427472104655637, + "grad_norm": 5.0, + "learning_rate": 2.343066303573908e-06, + "loss": 0.9789924621582031, + "step": 5662 + }, + { + "epoch": 1.7433628318584071, + "grad_norm": 8.75, + "learning_rate": 2.341452464315627e-06, + "loss": 1.4149452447891235, + "step": 5664 + }, + { + "epoch": 1.7439784532512506, + "grad_norm": 6.5625, + "learning_rate": 2.3398422606380424e-06, + "loss": 1.4665279388427734, + "step": 5666 + }, + { + "epoch": 1.7445940746440938, + "grad_norm": 3.328125, + "learning_rate": 2.338235694141248e-06, + "loss": 1.0495976209640503, + "step": 5668 + }, + { + "epoch": 1.7452096960369374, + "grad_norm": 5.96875, + "learning_rate": 2.3366327664217253e-06, + "loss": 1.3150982856750488, + "step": 5670 + }, + { + "epoch": 1.7458253174297806, + "grad_norm": 4.15625, + "learning_rate": 2.335033479072341e-06, + "loss": 1.1453654766082764, + "step": 5672 + }, + { + "epoch": 1.746440938822624, + "grad_norm": 4.875, + "learning_rate": 2.3334378336823413e-06, + "loss": 1.0010666847229004, + "step": 5674 + }, + { + "epoch": 1.7470565602154675, + "grad_norm": 17.5, + "learning_rate": 2.3318458318373558e-06, + "loss": 1.1657469272613525, + "step": 5676 + }, + { + "epoch": 1.7476721816083107, + "grad_norm": 3.609375, + "learning_rate": 2.330257475119392e-06, + "loss": 0.6801499128341675, + "step": 5678 + }, + { + "epoch": 1.7482878030011544, + "grad_norm": 33.75, + "learning_rate": 2.3286727651068346e-06, + "loss": 1.1293959617614746, + "step": 5680 + }, + { + "epoch": 1.7489034243939976, + "grad_norm": 15.8125, + "learning_rate": 2.327091703374447e-06, + "loss": 1.2393264770507812, + "step": 5682 + }, + { + "epoch": 1.7495190457868413, + "grad_norm": 6.9375, + "learning_rate": 2.325514291493365e-06, + "loss": 1.3086076974868774, + "step": 5684 + }, + { + "epoch": 1.7501346671796845, + "grad_norm": 9.0, + "learning_rate": 2.323940531031098e-06, + "loss": 1.3145592212677002, + "step": 5686 + }, + { + "epoch": 1.750750288572528, + "grad_norm": 2.609375, + "learning_rate": 2.322370423551527e-06, + "loss": 1.3300237655639648, + "step": 5688 + }, + { + "epoch": 1.7513659099653713, + "grad_norm": 10.125, + "learning_rate": 2.3208039706149037e-06, + "loss": 1.1004767417907715, + "step": 5690 + }, + { + "epoch": 1.7519815313582145, + "grad_norm": 19.125, + "learning_rate": 2.3192411737778476e-06, + "loss": 1.3882306814193726, + "step": 5692 + }, + { + "epoch": 1.7525971527510582, + "grad_norm": 6.96875, + "learning_rate": 2.3176820345933437e-06, + "loss": 1.197304368019104, + "step": 5694 + }, + { + "epoch": 1.7532127741439014, + "grad_norm": 18.125, + "learning_rate": 2.3161265546107443e-06, + "loss": 1.220046877861023, + "step": 5696 + }, + { + "epoch": 1.7538283955367449, + "grad_norm": 5.0, + "learning_rate": 2.3145747353757643e-06, + "loss": 1.4614460468292236, + "step": 5698 + }, + { + "epoch": 1.7544440169295883, + "grad_norm": 3.46875, + "learning_rate": 2.313026578430482e-06, + "loss": 0.9256251454353333, + "step": 5700 + }, + { + "epoch": 1.7550596383224317, + "grad_norm": 14.75, + "learning_rate": 2.3114820853133356e-06, + "loss": 1.2474175691604614, + "step": 5702 + }, + { + "epoch": 1.7556752597152752, + "grad_norm": 6.59375, + "learning_rate": 2.309941257559122e-06, + "loss": 1.5943374633789062, + "step": 5704 + }, + { + "epoch": 1.7562908811081184, + "grad_norm": 5.75, + "learning_rate": 2.3084040966989964e-06, + "loss": 1.3186969757080078, + "step": 5706 + }, + { + "epoch": 1.756906502500962, + "grad_norm": 26.375, + "learning_rate": 2.3068706042604694e-06, + "loss": 1.2382252216339111, + "step": 5708 + }, + { + "epoch": 1.7575221238938052, + "grad_norm": 7.1875, + "learning_rate": 2.3053407817674087e-06, + "loss": 1.3816118240356445, + "step": 5710 + }, + { + "epoch": 1.7581377452866487, + "grad_norm": 12.0625, + "learning_rate": 2.3038146307400313e-06, + "loss": 1.339174509048462, + "step": 5712 + }, + { + "epoch": 1.7587533666794921, + "grad_norm": 7.28125, + "learning_rate": 2.3022921526949087e-06, + "loss": 1.2081241607666016, + "step": 5714 + }, + { + "epoch": 1.7593689880723355, + "grad_norm": 4.84375, + "learning_rate": 2.3007733491449615e-06, + "loss": 1.3891562223434448, + "step": 5716 + }, + { + "epoch": 1.759984609465179, + "grad_norm": 2.984375, + "learning_rate": 2.2992582215994576e-06, + "loss": 1.126984715461731, + "step": 5718 + }, + { + "epoch": 1.7606002308580222, + "grad_norm": 4.34375, + "learning_rate": 2.2977467715640147e-06, + "loss": 1.0940567255020142, + "step": 5720 + }, + { + "epoch": 1.7612158522508659, + "grad_norm": 17.5, + "learning_rate": 2.2962390005405935e-06, + "loss": 1.4153151512145996, + "step": 5722 + }, + { + "epoch": 1.761831473643709, + "grad_norm": 8.3125, + "learning_rate": 2.2947349100275007e-06, + "loss": 1.535245418548584, + "step": 5724 + }, + { + "epoch": 1.7624470950365525, + "grad_norm": 4.53125, + "learning_rate": 2.2932345015193845e-06, + "loss": 1.3496298789978027, + "step": 5726 + }, + { + "epoch": 1.763062716429396, + "grad_norm": 7.125, + "learning_rate": 2.2917377765072336e-06, + "loss": 1.3143893480300903, + "step": 5728 + }, + { + "epoch": 1.7636783378222394, + "grad_norm": 8.0625, + "learning_rate": 2.29024473647838e-06, + "loss": 1.2238497734069824, + "step": 5730 + }, + { + "epoch": 1.7642939592150828, + "grad_norm": 6.03125, + "learning_rate": 2.288755382916487e-06, + "loss": 1.4694186449050903, + "step": 5732 + }, + { + "epoch": 1.764909580607926, + "grad_norm": 3.109375, + "learning_rate": 2.2872697173015614e-06, + "loss": 1.4567644596099854, + "step": 5734 + }, + { + "epoch": 1.7655252020007697, + "grad_norm": 3.0625, + "learning_rate": 2.2857877411099407e-06, + "loss": 1.1318411827087402, + "step": 5736 + }, + { + "epoch": 1.766140823393613, + "grad_norm": 5.8125, + "learning_rate": 2.2843094558142998e-06, + "loss": 1.2868903875350952, + "step": 5738 + }, + { + "epoch": 1.7667564447864563, + "grad_norm": 10.0, + "learning_rate": 2.2828348628836434e-06, + "loss": 1.4868735074996948, + "step": 5740 + }, + { + "epoch": 1.7673720661792998, + "grad_norm": 10.5, + "learning_rate": 2.2813639637833065e-06, + "loss": 0.7236781120300293, + "step": 5742 + }, + { + "epoch": 1.767987687572143, + "grad_norm": 6.28125, + "learning_rate": 2.2798967599749554e-06, + "loss": 1.0289052724838257, + "step": 5744 + }, + { + "epoch": 1.7686033089649866, + "grad_norm": 8.0625, + "learning_rate": 2.278433252916582e-06, + "loss": 1.4637351036071777, + "step": 5746 + }, + { + "epoch": 1.7692189303578298, + "grad_norm": 7.71875, + "learning_rate": 2.2769734440625083e-06, + "loss": 1.3271539211273193, + "step": 5748 + }, + { + "epoch": 1.7698345517506735, + "grad_norm": 9.0, + "learning_rate": 2.2755173348633773e-06, + "loss": 1.1616591215133667, + "step": 5750 + }, + { + "epoch": 1.7704501731435167, + "grad_norm": 8.9375, + "learning_rate": 2.274064926766158e-06, + "loss": 1.4358797073364258, + "step": 5752 + }, + { + "epoch": 1.7710657945363601, + "grad_norm": 4.8125, + "learning_rate": 2.2726162212141417e-06, + "loss": 1.1068189144134521, + "step": 5754 + }, + { + "epoch": 1.7716814159292036, + "grad_norm": 6.625, + "learning_rate": 2.2711712196469386e-06, + "loss": 1.1925125122070312, + "step": 5756 + }, + { + "epoch": 1.7722970373220468, + "grad_norm": 2.1875, + "learning_rate": 2.269729923500479e-06, + "loss": 1.2517881393432617, + "step": 5758 + }, + { + "epoch": 1.7729126587148905, + "grad_norm": 14.4375, + "learning_rate": 2.2682923342070118e-06, + "loss": 1.7617454528808594, + "step": 5760 + }, + { + "epoch": 1.7735282801077337, + "grad_norm": 9.625, + "learning_rate": 2.266858453195101e-06, + "loss": 1.614905834197998, + "step": 5762 + }, + { + "epoch": 1.774143901500577, + "grad_norm": 6.78125, + "learning_rate": 2.2654282818896268e-06, + "loss": 1.064343810081482, + "step": 5764 + }, + { + "epoch": 1.7747595228934205, + "grad_norm": 4.25, + "learning_rate": 2.264001821711782e-06, + "loss": 1.0498532056808472, + "step": 5766 + }, + { + "epoch": 1.775375144286264, + "grad_norm": 3.03125, + "learning_rate": 2.262579074079074e-06, + "loss": 1.2279499769210815, + "step": 5768 + }, + { + "epoch": 1.7759907656791074, + "grad_norm": 9.125, + "learning_rate": 2.2611600404053162e-06, + "loss": 1.4600101709365845, + "step": 5770 + }, + { + "epoch": 1.7766063870719506, + "grad_norm": 7.84375, + "learning_rate": 2.2597447221006355e-06, + "loss": 1.2678983211517334, + "step": 5772 + }, + { + "epoch": 1.7772220084647943, + "grad_norm": 9.375, + "learning_rate": 2.258333120571466e-06, + "loss": 0.8515924215316772, + "step": 5774 + }, + { + "epoch": 1.7778376298576375, + "grad_norm": 3.859375, + "learning_rate": 2.2569252372205465e-06, + "loss": 1.0808274745941162, + "step": 5776 + }, + { + "epoch": 1.778453251250481, + "grad_norm": 10.1875, + "learning_rate": 2.2555210734469233e-06, + "loss": 1.4102704524993896, + "step": 5778 + }, + { + "epoch": 1.7790688726433244, + "grad_norm": 3.640625, + "learning_rate": 2.254120630645945e-06, + "loss": 1.1184544563293457, + "step": 5780 + }, + { + "epoch": 1.7796844940361678, + "grad_norm": 2.75, + "learning_rate": 2.252723910209263e-06, + "loss": 1.1054328680038452, + "step": 5782 + }, + { + "epoch": 1.7803001154290112, + "grad_norm": 5.625, + "learning_rate": 2.2513309135248302e-06, + "loss": 0.9634500741958618, + "step": 5784 + }, + { + "epoch": 1.7809157368218544, + "grad_norm": 3.75, + "learning_rate": 2.249941641976897e-06, + "loss": 1.0928064584732056, + "step": 5786 + }, + { + "epoch": 1.781531358214698, + "grad_norm": 18.125, + "learning_rate": 2.248556096946016e-06, + "loss": 1.1610251665115356, + "step": 5788 + }, + { + "epoch": 1.7821469796075413, + "grad_norm": 4.34375, + "learning_rate": 2.2471742798090315e-06, + "loss": 0.9581281542778015, + "step": 5790 + }, + { + "epoch": 1.7827626010003848, + "grad_norm": 5.59375, + "learning_rate": 2.2457961919390893e-06, + "loss": 0.9840832948684692, + "step": 5792 + }, + { + "epoch": 1.7833782223932282, + "grad_norm": 10.125, + "learning_rate": 2.2444218347056253e-06, + "loss": 1.285952091217041, + "step": 5794 + }, + { + "epoch": 1.7839938437860716, + "grad_norm": 6.0625, + "learning_rate": 2.2430512094743674e-06, + "loss": 1.3057432174682617, + "step": 5796 + }, + { + "epoch": 1.784609465178915, + "grad_norm": 4.78125, + "learning_rate": 2.241684317607338e-06, + "loss": 1.2441459894180298, + "step": 5798 + }, + { + "epoch": 1.7852250865717583, + "grad_norm": 5.0625, + "learning_rate": 2.240321160462848e-06, + "loss": 1.260965347290039, + "step": 5800 + }, + { + "epoch": 1.785840707964602, + "grad_norm": 3.796875, + "learning_rate": 2.2389617393954974e-06, + "loss": 1.0719890594482422, + "step": 5802 + }, + { + "epoch": 1.7864563293574451, + "grad_norm": 3.359375, + "learning_rate": 2.2376060557561734e-06, + "loss": 1.1214145421981812, + "step": 5804 + }, + { + "epoch": 1.7870719507502886, + "grad_norm": 5.25, + "learning_rate": 2.236254110892048e-06, + "loss": 1.4030085802078247, + "step": 5806 + }, + { + "epoch": 1.787687572143132, + "grad_norm": 3.203125, + "learning_rate": 2.2349059061465816e-06, + "loss": 1.3156555891036987, + "step": 5808 + }, + { + "epoch": 1.7883031935359752, + "grad_norm": 5.40625, + "learning_rate": 2.2335614428595125e-06, + "loss": 1.3215585947036743, + "step": 5810 + }, + { + "epoch": 1.7889188149288189, + "grad_norm": 3.390625, + "learning_rate": 2.232220722366866e-06, + "loss": 1.2306294441223145, + "step": 5812 + }, + { + "epoch": 1.789534436321662, + "grad_norm": 6.5, + "learning_rate": 2.230883746000946e-06, + "loss": 1.1097067594528198, + "step": 5814 + }, + { + "epoch": 1.7901500577145055, + "grad_norm": 6.6875, + "learning_rate": 2.2295505150903348e-06, + "loss": 1.2976242303848267, + "step": 5816 + }, + { + "epoch": 1.790765679107349, + "grad_norm": 3.15625, + "learning_rate": 2.228221030959895e-06, + "loss": 1.0502114295959473, + "step": 5818 + }, + { + "epoch": 1.7913813005001924, + "grad_norm": 6.71875, + "learning_rate": 2.226895294930764e-06, + "loss": 1.392374038696289, + "step": 5820 + }, + { + "epoch": 1.7919969218930358, + "grad_norm": 6.78125, + "learning_rate": 2.225573308320356e-06, + "loss": 1.3847123384475708, + "step": 5822 + }, + { + "epoch": 1.792612543285879, + "grad_norm": 18.625, + "learning_rate": 2.224255072442358e-06, + "loss": 1.499535322189331, + "step": 5824 + }, + { + "epoch": 1.7932281646787227, + "grad_norm": 3.765625, + "learning_rate": 2.222940588606731e-06, + "loss": 1.2504417896270752, + "step": 5826 + }, + { + "epoch": 1.793843786071566, + "grad_norm": 3.03125, + "learning_rate": 2.2216298581197075e-06, + "loss": 1.0057941675186157, + "step": 5828 + }, + { + "epoch": 1.7944594074644094, + "grad_norm": 3.40625, + "learning_rate": 2.220322882283789e-06, + "loss": 1.0351872444152832, + "step": 5830 + }, + { + "epoch": 1.7950750288572528, + "grad_norm": 3.65625, + "learning_rate": 2.219019662397747e-06, + "loss": 1.2333468198776245, + "step": 5832 + }, + { + "epoch": 1.7956906502500962, + "grad_norm": 16.5, + "learning_rate": 2.2177201997566203e-06, + "loss": 1.2922084331512451, + "step": 5834 + }, + { + "epoch": 1.7963062716429397, + "grad_norm": 6.1875, + "learning_rate": 2.2164244956517144e-06, + "loss": 1.5321375131607056, + "step": 5836 + }, + { + "epoch": 1.7969218930357829, + "grad_norm": 11.9375, + "learning_rate": 2.215132551370599e-06, + "loss": 1.1942152976989746, + "step": 5838 + }, + { + "epoch": 1.7975375144286265, + "grad_norm": 6.03125, + "learning_rate": 2.213844368197108e-06, + "loss": 1.1645042896270752, + "step": 5840 + }, + { + "epoch": 1.7981531358214697, + "grad_norm": 5.53125, + "learning_rate": 2.212559947411338e-06, + "loss": 1.2141934633255005, + "step": 5842 + }, + { + "epoch": 1.7987687572143132, + "grad_norm": 8.6875, + "learning_rate": 2.2112792902896467e-06, + "loss": 1.35050368309021, + "step": 5844 + }, + { + "epoch": 1.7993843786071566, + "grad_norm": 13.5, + "learning_rate": 2.2100023981046526e-06, + "loss": 1.2252672910690308, + "step": 5846 + }, + { + "epoch": 1.8, + "grad_norm": 5.84375, + "learning_rate": 2.2087292721252317e-06, + "loss": 1.3437782526016235, + "step": 5848 + }, + { + "epoch": 1.8006156213928435, + "grad_norm": 6.9375, + "learning_rate": 2.2074599136165165e-06, + "loss": 1.0520520210266113, + "step": 5850 + }, + { + "epoch": 1.8012312427856867, + "grad_norm": 6.9375, + "learning_rate": 2.2061943238398992e-06, + "loss": 1.1961694955825806, + "step": 5852 + }, + { + "epoch": 1.8018468641785303, + "grad_norm": 6.0625, + "learning_rate": 2.2049325040530226e-06, + "loss": 0.9422978162765503, + "step": 5854 + }, + { + "epoch": 1.8024624855713736, + "grad_norm": 6.375, + "learning_rate": 2.2036744555097867e-06, + "loss": 1.279676079750061, + "step": 5856 + }, + { + "epoch": 1.803078106964217, + "grad_norm": 9.9375, + "learning_rate": 2.2024201794603424e-06, + "loss": 1.333317756652832, + "step": 5858 + }, + { + "epoch": 1.8036937283570604, + "grad_norm": 6.03125, + "learning_rate": 2.2011696771510914e-06, + "loss": 1.1424082517623901, + "step": 5860 + }, + { + "epoch": 1.8043093497499036, + "grad_norm": 4.90625, + "learning_rate": 2.1999229498246865e-06, + "loss": 0.9212673902511597, + "step": 5862 + }, + { + "epoch": 1.8049249711427473, + "grad_norm": 2.75, + "learning_rate": 2.198679998720028e-06, + "loss": 0.9288144707679749, + "step": 5864 + }, + { + "epoch": 1.8055405925355905, + "grad_norm": 9.9375, + "learning_rate": 2.1974408250722647e-06, + "loss": 1.2315566539764404, + "step": 5866 + }, + { + "epoch": 1.8061562139284342, + "grad_norm": 16.25, + "learning_rate": 2.1962054301127907e-06, + "loss": 1.463548183441162, + "step": 5868 + }, + { + "epoch": 1.8067718353212774, + "grad_norm": 9.8125, + "learning_rate": 2.1949738150692455e-06, + "loss": 0.6639575958251953, + "step": 5870 + }, + { + "epoch": 1.8073874567141208, + "grad_norm": 6.0, + "learning_rate": 2.193745981165515e-06, + "loss": 1.1918666362762451, + "step": 5872 + }, + { + "epoch": 1.8080030781069643, + "grad_norm": 7.875, + "learning_rate": 2.1925219296217213e-06, + "loss": 1.178608775138855, + "step": 5874 + }, + { + "epoch": 1.8086186994998075, + "grad_norm": 7.4375, + "learning_rate": 2.1913016616542348e-06, + "loss": 1.0431655645370483, + "step": 5876 + }, + { + "epoch": 1.8092343208926511, + "grad_norm": 14.625, + "learning_rate": 2.1900851784756618e-06, + "loss": 1.3192921876907349, + "step": 5878 + }, + { + "epoch": 1.8098499422854943, + "grad_norm": 4.53125, + "learning_rate": 2.18887248129485e-06, + "loss": 0.8913655877113342, + "step": 5880 + }, + { + "epoch": 1.8104655636783378, + "grad_norm": 3.421875, + "learning_rate": 2.187663571316883e-06, + "loss": 1.0939395427703857, + "step": 5882 + }, + { + "epoch": 1.8110811850711812, + "grad_norm": 9.6875, + "learning_rate": 2.1864584497430813e-06, + "loss": 1.2626577615737915, + "step": 5884 + }, + { + "epoch": 1.8116968064640246, + "grad_norm": 12.125, + "learning_rate": 2.185257117771003e-06, + "loss": 1.2554960250854492, + "step": 5886 + }, + { + "epoch": 1.812312427856868, + "grad_norm": 12.3125, + "learning_rate": 2.1840595765944366e-06, + "loss": 1.126465916633606, + "step": 5888 + }, + { + "epoch": 1.8129280492497113, + "grad_norm": 8.125, + "learning_rate": 2.1828658274034063e-06, + "loss": 1.2984936237335205, + "step": 5890 + }, + { + "epoch": 1.813543670642555, + "grad_norm": 4.09375, + "learning_rate": 2.1816758713841676e-06, + "loss": 1.214903473854065, + "step": 5892 + }, + { + "epoch": 1.8141592920353982, + "grad_norm": 7.96875, + "learning_rate": 2.1804897097192067e-06, + "loss": 1.0102453231811523, + "step": 5894 + }, + { + "epoch": 1.8147749134282416, + "grad_norm": 6.3125, + "learning_rate": 2.179307343587238e-06, + "loss": 1.0699224472045898, + "step": 5896 + }, + { + "epoch": 1.815390534821085, + "grad_norm": 10.625, + "learning_rate": 2.1781287741632067e-06, + "loss": 1.2344567775726318, + "step": 5898 + }, + { + "epoch": 1.8160061562139285, + "grad_norm": 6.90625, + "learning_rate": 2.176954002618283e-06, + "loss": 1.3633794784545898, + "step": 5900 + }, + { + "epoch": 1.816621777606772, + "grad_norm": 7.84375, + "learning_rate": 2.1757830301198637e-06, + "loss": 1.2593004703521729, + "step": 5902 + }, + { + "epoch": 1.8172373989996151, + "grad_norm": 7.375, + "learning_rate": 2.17461585783157e-06, + "loss": 1.3392139673233032, + "step": 5904 + }, + { + "epoch": 1.8178530203924588, + "grad_norm": 13.0625, + "learning_rate": 2.1734524869132475e-06, + "loss": 1.0816066265106201, + "step": 5906 + }, + { + "epoch": 1.818468641785302, + "grad_norm": 5.75, + "learning_rate": 2.1722929185209637e-06, + "loss": 1.2291300296783447, + "step": 5908 + }, + { + "epoch": 1.8190842631781454, + "grad_norm": 6.53125, + "learning_rate": 2.1711371538070088e-06, + "loss": 1.2727948427200317, + "step": 5910 + }, + { + "epoch": 1.8196998845709889, + "grad_norm": 7.3125, + "learning_rate": 2.169985193919891e-06, + "loss": 1.287793517112732, + "step": 5912 + }, + { + "epoch": 1.8203155059638323, + "grad_norm": 3.0, + "learning_rate": 2.168837040004339e-06, + "loss": 1.0403543710708618, + "step": 5914 + }, + { + "epoch": 1.8209311273566757, + "grad_norm": 9.5625, + "learning_rate": 2.167692693201299e-06, + "loss": 0.8734148740768433, + "step": 5916 + }, + { + "epoch": 1.821546748749519, + "grad_norm": 7.46875, + "learning_rate": 2.1665521546479336e-06, + "loss": 1.2621644735336304, + "step": 5918 + }, + { + "epoch": 1.8221623701423626, + "grad_norm": 5.625, + "learning_rate": 2.165415425477623e-06, + "loss": 1.3135576248168945, + "step": 5920 + }, + { + "epoch": 1.8227779915352058, + "grad_norm": 7.84375, + "learning_rate": 2.164282506819959e-06, + "loss": 0.8223294615745544, + "step": 5922 + }, + { + "epoch": 1.8233936129280492, + "grad_norm": 9.625, + "learning_rate": 2.163153399800749e-06, + "loss": 1.6391355991363525, + "step": 5924 + }, + { + "epoch": 1.8240092343208927, + "grad_norm": 48.0, + "learning_rate": 2.1620281055420113e-06, + "loss": 1.230574607849121, + "step": 5926 + }, + { + "epoch": 1.824624855713736, + "grad_norm": 3.09375, + "learning_rate": 2.1609066251619757e-06, + "loss": 1.003134846687317, + "step": 5928 + }, + { + "epoch": 1.8252404771065796, + "grad_norm": 3.703125, + "learning_rate": 2.159788959775085e-06, + "loss": 1.0853413343429565, + "step": 5930 + }, + { + "epoch": 1.8258560984994228, + "grad_norm": 12.125, + "learning_rate": 2.158675110491985e-06, + "loss": 1.2373244762420654, + "step": 5932 + }, + { + "epoch": 1.8264717198922664, + "grad_norm": 7.21875, + "learning_rate": 2.1575650784195346e-06, + "loss": 1.2410924434661865, + "step": 5934 + }, + { + "epoch": 1.8270873412851096, + "grad_norm": 8.1875, + "learning_rate": 2.1564588646607974e-06, + "loss": 1.377103567123413, + "step": 5936 + }, + { + "epoch": 1.827702962677953, + "grad_norm": 6.09375, + "learning_rate": 2.1553564703150425e-06, + "loss": 1.0280659198760986, + "step": 5938 + }, + { + "epoch": 1.8283185840707965, + "grad_norm": 5.9375, + "learning_rate": 2.154257896477744e-06, + "loss": 1.4314674139022827, + "step": 5940 + }, + { + "epoch": 1.8289342054636397, + "grad_norm": 6.0, + "learning_rate": 2.153163144240579e-06, + "loss": 1.0865339040756226, + "step": 5942 + }, + { + "epoch": 1.8295498268564834, + "grad_norm": 11.3125, + "learning_rate": 2.152072214691428e-06, + "loss": 1.4894191026687622, + "step": 5944 + }, + { + "epoch": 1.8301654482493266, + "grad_norm": 3.625, + "learning_rate": 2.1509851089143717e-06, + "loss": 1.176236629486084, + "step": 5946 + }, + { + "epoch": 1.83078106964217, + "grad_norm": 6.15625, + "learning_rate": 2.149901827989691e-06, + "loss": 1.2651861906051636, + "step": 5948 + }, + { + "epoch": 1.8313966910350135, + "grad_norm": 7.40625, + "learning_rate": 2.148822372993868e-06, + "loss": 1.2794727087020874, + "step": 5950 + }, + { + "epoch": 1.832012312427857, + "grad_norm": 5.46875, + "learning_rate": 2.1477467449995793e-06, + "loss": 1.244179606437683, + "step": 5952 + }, + { + "epoch": 1.8326279338207003, + "grad_norm": 3.65625, + "learning_rate": 2.1466749450757016e-06, + "loss": 1.132509469985962, + "step": 5954 + }, + { + "epoch": 1.8332435552135435, + "grad_norm": 11.875, + "learning_rate": 2.145606974287307e-06, + "loss": 1.3131150007247925, + "step": 5956 + }, + { + "epoch": 1.8338591766063872, + "grad_norm": 8.5625, + "learning_rate": 2.144542833695661e-06, + "loss": 1.5854219198226929, + "step": 5958 + }, + { + "epoch": 1.8344747979992304, + "grad_norm": 12.8125, + "learning_rate": 2.1434825243582247e-06, + "loss": 1.37516450881958, + "step": 5960 + }, + { + "epoch": 1.8350904193920738, + "grad_norm": 2.578125, + "learning_rate": 2.1424260473286515e-06, + "loss": 1.2534147500991821, + "step": 5962 + }, + { + "epoch": 1.8357060407849173, + "grad_norm": 9.125, + "learning_rate": 2.141373403656785e-06, + "loss": 0.9582955241203308, + "step": 5964 + }, + { + "epoch": 1.8363216621777607, + "grad_norm": 6.1875, + "learning_rate": 2.140324594388662e-06, + "loss": 1.3661773204803467, + "step": 5966 + }, + { + "epoch": 1.8369372835706042, + "grad_norm": 15.875, + "learning_rate": 2.139279620566507e-06, + "loss": 1.6128534078598022, + "step": 5968 + }, + { + "epoch": 1.8375529049634474, + "grad_norm": 5.03125, + "learning_rate": 2.1382384832287345e-06, + "loss": 1.3733783960342407, + "step": 5970 + }, + { + "epoch": 1.838168526356291, + "grad_norm": 7.59375, + "learning_rate": 2.137201183409946e-06, + "loss": 1.1199281215667725, + "step": 5972 + }, + { + "epoch": 1.8387841477491342, + "grad_norm": 5.125, + "learning_rate": 2.136167722140929e-06, + "loss": 1.3824279308319092, + "step": 5974 + }, + { + "epoch": 1.8393997691419777, + "grad_norm": 2.453125, + "learning_rate": 2.1351381004486575e-06, + "loss": 1.0962920188903809, + "step": 5976 + }, + { + "epoch": 1.840015390534821, + "grad_norm": 9.125, + "learning_rate": 2.134112319356291e-06, + "loss": 1.3079670667648315, + "step": 5978 + }, + { + "epoch": 1.8406310119276645, + "grad_norm": 4.15625, + "learning_rate": 2.1330903798831685e-06, + "loss": 1.0482728481292725, + "step": 5980 + }, + { + "epoch": 1.841246633320508, + "grad_norm": 10.75, + "learning_rate": 2.1320722830448155e-06, + "loss": 1.4023866653442383, + "step": 5982 + }, + { + "epoch": 1.8418622547133512, + "grad_norm": 12.9375, + "learning_rate": 2.1310580298529375e-06, + "loss": 1.6925112009048462, + "step": 5984 + }, + { + "epoch": 1.8424778761061948, + "grad_norm": 13.1875, + "learning_rate": 2.130047621315421e-06, + "loss": 1.5157252550125122, + "step": 5986 + }, + { + "epoch": 1.843093497499038, + "grad_norm": 14.1875, + "learning_rate": 2.1290410584363324e-06, + "loss": 1.3847084045410156, + "step": 5988 + }, + { + "epoch": 1.8437091188918815, + "grad_norm": 7.59375, + "learning_rate": 2.1280383422159135e-06, + "loss": 1.2718254327774048, + "step": 5990 + }, + { + "epoch": 1.844324740284725, + "grad_norm": 6.875, + "learning_rate": 2.127039473650588e-06, + "loss": 1.5083742141723633, + "step": 5992 + }, + { + "epoch": 1.8449403616775681, + "grad_norm": 9.3125, + "learning_rate": 2.1260444537329527e-06, + "loss": 1.5251927375793457, + "step": 5994 + }, + { + "epoch": 1.8455559830704118, + "grad_norm": 15.5625, + "learning_rate": 2.125053283451782e-06, + "loss": 1.0528500080108643, + "step": 5996 + }, + { + "epoch": 1.846171604463255, + "grad_norm": 5.8125, + "learning_rate": 2.1240659637920232e-06, + "loss": 1.6003139019012451, + "step": 5998 + }, + { + "epoch": 1.8467872258560984, + "grad_norm": 7.96875, + "learning_rate": 2.123082495734799e-06, + "loss": 1.5982369184494019, + "step": 6000 + }, + { + "epoch": 1.8474028472489419, + "grad_norm": 6.28125, + "learning_rate": 2.122102880257403e-06, + "loss": 1.2855113744735718, + "step": 6002 + }, + { + "epoch": 1.8480184686417853, + "grad_norm": 4.40625, + "learning_rate": 2.121127118333301e-06, + "loss": 0.905225932598114, + "step": 6004 + }, + { + "epoch": 1.8486340900346288, + "grad_norm": 3.65625, + "learning_rate": 2.1201552109321293e-06, + "loss": 1.025927186012268, + "step": 6006 + }, + { + "epoch": 1.849249711427472, + "grad_norm": 7.59375, + "learning_rate": 2.119187159019695e-06, + "loss": 1.531143307685852, + "step": 6008 + }, + { + "epoch": 1.8498653328203156, + "grad_norm": 5.28125, + "learning_rate": 2.1182229635579722e-06, + "loss": 1.1162152290344238, + "step": 6010 + }, + { + "epoch": 1.8504809542131588, + "grad_norm": 6.65625, + "learning_rate": 2.117262625505104e-06, + "loss": 1.2625081539154053, + "step": 6012 + }, + { + "epoch": 1.8510965756060023, + "grad_norm": 5.15625, + "learning_rate": 2.1163061458153994e-06, + "loss": 1.179467797279358, + "step": 6014 + }, + { + "epoch": 1.8517121969988457, + "grad_norm": 5.1875, + "learning_rate": 2.115353525439334e-06, + "loss": 0.9201962947845459, + "step": 6016 + }, + { + "epoch": 1.8523278183916891, + "grad_norm": 9.3125, + "learning_rate": 2.114404765323548e-06, + "loss": 1.3194632530212402, + "step": 6018 + }, + { + "epoch": 1.8529434397845326, + "grad_norm": 6.40625, + "learning_rate": 2.113459866410845e-06, + "loss": 1.2764129638671875, + "step": 6020 + }, + { + "epoch": 1.8535590611773758, + "grad_norm": 13.875, + "learning_rate": 2.112518829640193e-06, + "loss": 1.3906527757644653, + "step": 6022 + }, + { + "epoch": 1.8541746825702194, + "grad_norm": 4.5625, + "learning_rate": 2.111581655946722e-06, + "loss": 1.3976833820343018, + "step": 6024 + }, + { + "epoch": 1.8547903039630627, + "grad_norm": 6.84375, + "learning_rate": 2.1106483462617205e-06, + "loss": 1.3194999694824219, + "step": 6026 + }, + { + "epoch": 1.855405925355906, + "grad_norm": 4.1875, + "learning_rate": 2.1097189015126414e-06, + "loss": 1.1536383628845215, + "step": 6028 + }, + { + "epoch": 1.8560215467487495, + "grad_norm": 10.6875, + "learning_rate": 2.108793322623093e-06, + "loss": 1.3191182613372803, + "step": 6030 + }, + { + "epoch": 1.856637168141593, + "grad_norm": 4.15625, + "learning_rate": 2.107871610512845e-06, + "loss": 1.1951905488967896, + "step": 6032 + }, + { + "epoch": 1.8572527895344364, + "grad_norm": 83.5, + "learning_rate": 2.1069537660978223e-06, + "loss": 0.7205085158348083, + "step": 6034 + }, + { + "epoch": 1.8578684109272796, + "grad_norm": 7.5625, + "learning_rate": 2.1060397902901083e-06, + "loss": 1.0426700115203857, + "step": 6036 + }, + { + "epoch": 1.8584840323201233, + "grad_norm": 5.34375, + "learning_rate": 2.105129683997941e-06, + "loss": 1.239465594291687, + "step": 6038 + }, + { + "epoch": 1.8590996537129665, + "grad_norm": 4.40625, + "learning_rate": 2.104223448125714e-06, + "loss": 0.7743887305259705, + "step": 6040 + }, + { + "epoch": 1.85971527510581, + "grad_norm": 13.5625, + "learning_rate": 2.103321083573973e-06, + "loss": 1.2444900274276733, + "step": 6042 + }, + { + "epoch": 1.8603308964986534, + "grad_norm": 7.46875, + "learning_rate": 2.102422591239419e-06, + "loss": 0.8321576118469238, + "step": 6044 + }, + { + "epoch": 1.8609465178914966, + "grad_norm": 6.625, + "learning_rate": 2.1015279720149035e-06, + "loss": 1.2588664293289185, + "step": 6046 + }, + { + "epoch": 1.8615621392843402, + "grad_norm": 5.875, + "learning_rate": 2.1006372267894296e-06, + "loss": 1.2898892164230347, + "step": 6048 + }, + { + "epoch": 1.8621777606771834, + "grad_norm": 4.78125, + "learning_rate": 2.0997503564481504e-06, + "loss": 0.6708816885948181, + "step": 6050 + }, + { + "epoch": 1.862793382070027, + "grad_norm": 3.796875, + "learning_rate": 2.09886736187237e-06, + "loss": 1.221728801727295, + "step": 6052 + }, + { + "epoch": 1.8634090034628703, + "grad_norm": 14.875, + "learning_rate": 2.097988243939539e-06, + "loss": 1.2509214878082275, + "step": 6054 + }, + { + "epoch": 1.8640246248557137, + "grad_norm": 6.1875, + "learning_rate": 2.097113003523257e-06, + "loss": 1.3385615348815918, + "step": 6056 + }, + { + "epoch": 1.8646402462485572, + "grad_norm": 7.71875, + "learning_rate": 2.0962416414932697e-06, + "loss": 1.2880332469940186, + "step": 6058 + }, + { + "epoch": 1.8652558676414004, + "grad_norm": 7.8125, + "learning_rate": 2.095374158715469e-06, + "loss": 1.2843137979507446, + "step": 6060 + }, + { + "epoch": 1.865871489034244, + "grad_norm": 5.1875, + "learning_rate": 2.094510556051893e-06, + "loss": 1.2483960390090942, + "step": 6062 + }, + { + "epoch": 1.8664871104270873, + "grad_norm": 11.875, + "learning_rate": 2.0936508343607214e-06, + "loss": 1.257112741470337, + "step": 6064 + }, + { + "epoch": 1.8671027318199307, + "grad_norm": 6.4375, + "learning_rate": 2.0927949944962804e-06, + "loss": 1.3845338821411133, + "step": 6066 + }, + { + "epoch": 1.8677183532127741, + "grad_norm": 5.0625, + "learning_rate": 2.091943037309036e-06, + "loss": 1.1420466899871826, + "step": 6068 + }, + { + "epoch": 1.8683339746056176, + "grad_norm": 11.75, + "learning_rate": 2.091094963645598e-06, + "loss": 1.4840701818466187, + "step": 6070 + }, + { + "epoch": 1.868949595998461, + "grad_norm": 5.125, + "learning_rate": 2.0902507743487163e-06, + "loss": 1.1129378080368042, + "step": 6072 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 4.9375, + "learning_rate": 2.0894104702572803e-06, + "loss": 1.253161907196045, + "step": 6074 + }, + { + "epoch": 1.8701808387841479, + "grad_norm": 6.3125, + "learning_rate": 2.0885740522063187e-06, + "loss": 0.9732499122619629, + "step": 6076 + }, + { + "epoch": 1.870796460176991, + "grad_norm": 5.90625, + "learning_rate": 2.0877415210269993e-06, + "loss": 1.1398284435272217, + "step": 6078 + }, + { + "epoch": 1.8714120815698345, + "grad_norm": 4.4375, + "learning_rate": 2.0869128775466275e-06, + "loss": 1.1115602254867554, + "step": 6080 + }, + { + "epoch": 1.872027702962678, + "grad_norm": 3.546875, + "learning_rate": 2.0860881225886444e-06, + "loss": 1.1072742938995361, + "step": 6082 + }, + { + "epoch": 1.8726433243555214, + "grad_norm": 6.28125, + "learning_rate": 2.085267256972627e-06, + "loss": 1.196256160736084, + "step": 6084 + }, + { + "epoch": 1.8732589457483648, + "grad_norm": 7.15625, + "learning_rate": 2.084450281514289e-06, + "loss": 1.621668815612793, + "step": 6086 + }, + { + "epoch": 1.873874567141208, + "grad_norm": 10.8125, + "learning_rate": 2.0836371970254758e-06, + "loss": 1.4544686079025269, + "step": 6088 + }, + { + "epoch": 1.8744901885340517, + "grad_norm": 8.6875, + "learning_rate": 2.082828004314168e-06, + "loss": 1.3470609188079834, + "step": 6090 + }, + { + "epoch": 1.875105809926895, + "grad_norm": 6.75, + "learning_rate": 2.0820227041844803e-06, + "loss": 1.568597674369812, + "step": 6092 + }, + { + "epoch": 1.8757214313197383, + "grad_norm": 11.4375, + "learning_rate": 2.0812212974366554e-06, + "loss": 1.6319973468780518, + "step": 6094 + }, + { + "epoch": 1.8763370527125818, + "grad_norm": 15.6875, + "learning_rate": 2.08042378486707e-06, + "loss": 1.4720542430877686, + "step": 6096 + }, + { + "epoch": 1.8769526741054252, + "grad_norm": 10.0625, + "learning_rate": 2.07963016726823e-06, + "loss": 1.479348063468933, + "step": 6098 + }, + { + "epoch": 1.8775682954982686, + "grad_norm": 12.375, + "learning_rate": 2.0788404454287714e-06, + "loss": 1.5378873348236084, + "step": 6100 + }, + { + "epoch": 1.8781839168911119, + "grad_norm": 8.25, + "learning_rate": 2.0780546201334583e-06, + "loss": 1.0985162258148193, + "step": 6102 + }, + { + "epoch": 1.8787995382839555, + "grad_norm": 7.375, + "learning_rate": 2.0772726921631826e-06, + "loss": 0.8501262664794922, + "step": 6104 + }, + { + "epoch": 1.8794151596767987, + "grad_norm": 4.25, + "learning_rate": 2.0764946622949642e-06, + "loss": 1.1900298595428467, + "step": 6106 + }, + { + "epoch": 1.8800307810696422, + "grad_norm": 5.78125, + "learning_rate": 2.075720531301948e-06, + "loss": 1.2045283317565918, + "step": 6108 + }, + { + "epoch": 1.8806464024624856, + "grad_norm": 8.5, + "learning_rate": 2.074950299953406e-06, + "loss": 1.4412344694137573, + "step": 6110 + }, + { + "epoch": 1.8812620238553288, + "grad_norm": 13.0625, + "learning_rate": 2.0741839690147347e-06, + "loss": 1.4701406955718994, + "step": 6112 + }, + { + "epoch": 1.8818776452481725, + "grad_norm": 7.34375, + "learning_rate": 2.0734215392474533e-06, + "loss": 1.0646079778671265, + "step": 6114 + }, + { + "epoch": 1.8824932666410157, + "grad_norm": 8.75, + "learning_rate": 2.072663011409206e-06, + "loss": 0.7244550585746765, + "step": 6116 + }, + { + "epoch": 1.8831088880338593, + "grad_norm": 6.15625, + "learning_rate": 2.0719083862537585e-06, + "loss": 1.5345789194107056, + "step": 6118 + }, + { + "epoch": 1.8837245094267026, + "grad_norm": 9.8125, + "learning_rate": 2.071157664531e-06, + "loss": 1.5183554887771606, + "step": 6120 + }, + { + "epoch": 1.884340130819546, + "grad_norm": 1.6875, + "learning_rate": 2.0704108469869377e-06, + "loss": 0.9273104071617126, + "step": 6122 + }, + { + "epoch": 1.8849557522123894, + "grad_norm": 11.5625, + "learning_rate": 2.0696679343637018e-06, + "loss": 1.1478441953659058, + "step": 6124 + }, + { + "epoch": 1.8855713736052326, + "grad_norm": 5.03125, + "learning_rate": 2.068928927399541e-06, + "loss": 1.2363404035568237, + "step": 6126 + }, + { + "epoch": 1.8861869949980763, + "grad_norm": 4.4375, + "learning_rate": 2.0681938268288236e-06, + "loss": 1.2083735466003418, + "step": 6128 + }, + { + "epoch": 1.8868026163909195, + "grad_norm": 4.53125, + "learning_rate": 2.067462633382035e-06, + "loss": 1.207137107849121, + "step": 6130 + }, + { + "epoch": 1.887418237783763, + "grad_norm": 2.140625, + "learning_rate": 2.066735347785779e-06, + "loss": 1.2943766117095947, + "step": 6132 + }, + { + "epoch": 1.8880338591766064, + "grad_norm": 7.75, + "learning_rate": 2.066011970762775e-06, + "loss": 1.2611300945281982, + "step": 6134 + }, + { + "epoch": 1.8886494805694498, + "grad_norm": 6.59375, + "learning_rate": 2.0652925030318594e-06, + "loss": 0.7345150709152222, + "step": 6136 + }, + { + "epoch": 1.8892651019622932, + "grad_norm": 4.375, + "learning_rate": 2.064576945307983e-06, + "loss": 1.110431432723999, + "step": 6138 + }, + { + "epoch": 1.8898807233551365, + "grad_norm": 9.25, + "learning_rate": 2.0638652983022124e-06, + "loss": 1.096685767173767, + "step": 6140 + }, + { + "epoch": 1.8904963447479801, + "grad_norm": 2.421875, + "learning_rate": 2.0631575627217263e-06, + "loss": 1.16090989112854, + "step": 6142 + }, + { + "epoch": 1.8911119661408233, + "grad_norm": 35.5, + "learning_rate": 2.062453739269818e-06, + "loss": 0.8789357542991638, + "step": 6144 + }, + { + "epoch": 1.8917275875336668, + "grad_norm": 4.90625, + "learning_rate": 2.0617538286458915e-06, + "loss": 1.34991455078125, + "step": 6146 + }, + { + "epoch": 1.8923432089265102, + "grad_norm": 6.53125, + "learning_rate": 2.061057831545465e-06, + "loss": 1.434483289718628, + "step": 6148 + }, + { + "epoch": 1.8929588303193536, + "grad_norm": 4.09375, + "learning_rate": 2.060365748660166e-06, + "loss": 1.3135393857955933, + "step": 6150 + }, + { + "epoch": 1.893574451712197, + "grad_norm": 6.6875, + "learning_rate": 2.059677580677733e-06, + "loss": 1.2441500425338745, + "step": 6152 + }, + { + "epoch": 1.8941900731050403, + "grad_norm": 8.75, + "learning_rate": 2.0589933282820133e-06, + "loss": 1.339614987373352, + "step": 6154 + }, + { + "epoch": 1.894805694497884, + "grad_norm": 15.8125, + "learning_rate": 2.0583129921529644e-06, + "loss": 1.2056138515472412, + "step": 6156 + }, + { + "epoch": 1.8954213158907272, + "grad_norm": 11.0, + "learning_rate": 2.057636572966652e-06, + "loss": 1.2353260517120361, + "step": 6158 + }, + { + "epoch": 1.8960369372835706, + "grad_norm": 10.5625, + "learning_rate": 2.0569640713952478e-06, + "loss": 1.705105185508728, + "step": 6160 + }, + { + "epoch": 1.896652558676414, + "grad_norm": 9.5, + "learning_rate": 2.0562954881070313e-06, + "loss": 0.937542200088501, + "step": 6162 + }, + { + "epoch": 1.8972681800692575, + "grad_norm": 5.03125, + "learning_rate": 2.055630823766391e-06, + "loss": 1.455617904663086, + "step": 6164 + }, + { + "epoch": 1.897883801462101, + "grad_norm": 10.5, + "learning_rate": 2.054970079033817e-06, + "loss": 1.343193769454956, + "step": 6166 + }, + { + "epoch": 1.898499422854944, + "grad_norm": 5.84375, + "learning_rate": 2.0543132545659065e-06, + "loss": 1.2717845439910889, + "step": 6168 + }, + { + "epoch": 1.8991150442477878, + "grad_norm": 5.53125, + "learning_rate": 2.053660351015361e-06, + "loss": 1.3608722686767578, + "step": 6170 + }, + { + "epoch": 1.899730665640631, + "grad_norm": 3.875, + "learning_rate": 2.0530113690309854e-06, + "loss": 0.9390138983726501, + "step": 6172 + }, + { + "epoch": 1.9003462870334744, + "grad_norm": 17.5, + "learning_rate": 2.052366309257687e-06, + "loss": 1.403334140777588, + "step": 6174 + }, + { + "epoch": 1.9009619084263178, + "grad_norm": 9.5625, + "learning_rate": 2.0517251723364767e-06, + "loss": 1.7505857944488525, + "step": 6176 + }, + { + "epoch": 1.901577529819161, + "grad_norm": 7.40625, + "learning_rate": 2.0510879589044663e-06, + "loss": 1.120890498161316, + "step": 6178 + }, + { + "epoch": 1.9021931512120047, + "grad_norm": 9.0, + "learning_rate": 2.05045466959487e-06, + "loss": 1.4072320461273193, + "step": 6180 + }, + { + "epoch": 1.902808772604848, + "grad_norm": 6.3125, + "learning_rate": 2.049825305037e-06, + "loss": 1.454264760017395, + "step": 6182 + }, + { + "epoch": 1.9034243939976914, + "grad_norm": 4.1875, + "learning_rate": 2.049199865856271e-06, + "loss": 1.0654551982879639, + "step": 6184 + }, + { + "epoch": 1.9040400153905348, + "grad_norm": 23.75, + "learning_rate": 2.048578352674196e-06, + "loss": 1.544224500656128, + "step": 6186 + }, + { + "epoch": 1.9046556367833782, + "grad_norm": 5.15625, + "learning_rate": 2.0479607661083867e-06, + "loss": 1.1244772672653198, + "step": 6188 + }, + { + "epoch": 1.9052712581762217, + "grad_norm": 5.96875, + "learning_rate": 2.047347106772552e-06, + "loss": 1.4134089946746826, + "step": 6190 + }, + { + "epoch": 1.9058868795690649, + "grad_norm": 9.9375, + "learning_rate": 2.0467373752764986e-06, + "loss": 1.2867887020111084, + "step": 6192 + }, + { + "epoch": 1.9065025009619085, + "grad_norm": 4.90625, + "learning_rate": 2.046131572226132e-06, + "loss": 1.3925960063934326, + "step": 6194 + }, + { + "epoch": 1.9071181223547518, + "grad_norm": 8.1875, + "learning_rate": 2.045529698223451e-06, + "loss": 1.4497476816177368, + "step": 6196 + }, + { + "epoch": 1.9077337437475952, + "grad_norm": 5.0, + "learning_rate": 2.0449317538665515e-06, + "loss": 1.4063258171081543, + "step": 6198 + }, + { + "epoch": 1.9083493651404386, + "grad_norm": 9.8125, + "learning_rate": 2.044337739749625e-06, + "loss": 1.3043848276138306, + "step": 6200 + }, + { + "epoch": 1.908964986533282, + "grad_norm": 4.84375, + "learning_rate": 2.0437476564629553e-06, + "loss": 0.6216869950294495, + "step": 6202 + }, + { + "epoch": 1.9095806079261255, + "grad_norm": 8.1875, + "learning_rate": 2.043161504592922e-06, + "loss": 0.8327115774154663, + "step": 6204 + }, + { + "epoch": 1.9101962293189687, + "grad_norm": 5.65625, + "learning_rate": 2.0425792847219973e-06, + "loss": 1.537729024887085, + "step": 6206 + }, + { + "epoch": 1.9108118507118124, + "grad_norm": 5.5, + "learning_rate": 2.042000997428747e-06, + "loss": 1.1706664562225342, + "step": 6208 + }, + { + "epoch": 1.9114274721046556, + "grad_norm": 3.625, + "learning_rate": 2.041426643287827e-06, + "loss": 1.3486610651016235, + "step": 6210 + }, + { + "epoch": 1.912043093497499, + "grad_norm": 18.0, + "learning_rate": 2.040856222869986e-06, + "loss": 1.4879320859909058, + "step": 6212 + }, + { + "epoch": 1.9126587148903424, + "grad_norm": 16.25, + "learning_rate": 2.040289736742064e-06, + "loss": 1.281280517578125, + "step": 6214 + }, + { + "epoch": 1.9132743362831859, + "grad_norm": 2.609375, + "learning_rate": 2.039727185466991e-06, + "loss": 1.127787470817566, + "step": 6216 + }, + { + "epoch": 1.9138899576760293, + "grad_norm": 7.4375, + "learning_rate": 2.0391685696037864e-06, + "loss": 1.3760292530059814, + "step": 6218 + }, + { + "epoch": 1.9145055790688725, + "grad_norm": 6.25, + "learning_rate": 2.03861388970756e-06, + "loss": 1.2867217063903809, + "step": 6220 + }, + { + "epoch": 1.9151212004617162, + "grad_norm": 8.25, + "learning_rate": 2.0380631463295085e-06, + "loss": 1.3626055717468262, + "step": 6222 + }, + { + "epoch": 1.9157368218545594, + "grad_norm": 4.78125, + "learning_rate": 2.0375163400169186e-06, + "loss": 1.468051791191101, + "step": 6224 + }, + { + "epoch": 1.9163524432474028, + "grad_norm": 15.25, + "learning_rate": 2.036973471313164e-06, + "loss": 1.2753353118896484, + "step": 6226 + }, + { + "epoch": 1.9169680646402463, + "grad_norm": 7.34375, + "learning_rate": 2.0364345407577057e-06, + "loss": 1.407792568206787, + "step": 6228 + }, + { + "epoch": 1.9175836860330895, + "grad_norm": 1.9921875, + "learning_rate": 2.0358995488860912e-06, + "loss": 0.9969472885131836, + "step": 6230 + }, + { + "epoch": 1.9181993074259331, + "grad_norm": 4.53125, + "learning_rate": 2.035368496229953e-06, + "loss": 1.012083888053894, + "step": 6232 + }, + { + "epoch": 1.9188149288187764, + "grad_norm": 7.125, + "learning_rate": 2.0348413833170113e-06, + "loss": 1.195415735244751, + "step": 6234 + }, + { + "epoch": 1.91943055021162, + "grad_norm": 5.53125, + "learning_rate": 2.034318210671068e-06, + "loss": 1.1292438507080078, + "step": 6236 + }, + { + "epoch": 1.9200461716044632, + "grad_norm": 7.53125, + "learning_rate": 2.033798978812014e-06, + "loss": 1.3102638721466064, + "step": 6238 + }, + { + "epoch": 1.9206617929973067, + "grad_norm": 7.625, + "learning_rate": 2.0332836882558202e-06, + "loss": 1.4417779445648193, + "step": 6240 + }, + { + "epoch": 1.92127741439015, + "grad_norm": 5.65625, + "learning_rate": 2.032772339514543e-06, + "loss": 1.0075815916061401, + "step": 6242 + }, + { + "epoch": 1.9218930357829933, + "grad_norm": 4.46875, + "learning_rate": 2.0322649330963197e-06, + "loss": 1.2295527458190918, + "step": 6244 + }, + { + "epoch": 1.922508657175837, + "grad_norm": 11.375, + "learning_rate": 2.031761469505373e-06, + "loss": 1.5699710845947266, + "step": 6246 + }, + { + "epoch": 1.9231242785686802, + "grad_norm": 4.1875, + "learning_rate": 2.0312619492420056e-06, + "loss": 1.5786056518554688, + "step": 6248 + }, + { + "epoch": 1.9237398999615236, + "grad_norm": 6.125, + "learning_rate": 2.0307663728026015e-06, + "loss": 1.5211533308029175, + "step": 6250 + }, + { + "epoch": 1.924355521354367, + "grad_norm": 7.1875, + "learning_rate": 2.0302747406796268e-06, + "loss": 1.1912158727645874, + "step": 6252 + }, + { + "epoch": 1.9249711427472105, + "grad_norm": 11.0625, + "learning_rate": 2.0297870533616267e-06, + "loss": 1.2701131105422974, + "step": 6254 + }, + { + "epoch": 1.925586764140054, + "grad_norm": 4.59375, + "learning_rate": 2.029303311333227e-06, + "loss": 1.1772947311401367, + "step": 6256 + }, + { + "epoch": 1.9262023855328971, + "grad_norm": 8.0, + "learning_rate": 2.0288235150751333e-06, + "loss": 1.4614884853363037, + "step": 6258 + }, + { + "epoch": 1.9268180069257408, + "grad_norm": 24.5, + "learning_rate": 2.028347665064131e-06, + "loss": 1.0047273635864258, + "step": 6260 + }, + { + "epoch": 1.927433628318584, + "grad_norm": 4.28125, + "learning_rate": 2.0278757617730808e-06, + "loss": 1.2402758598327637, + "step": 6262 + }, + { + "epoch": 1.9280492497114274, + "grad_norm": 2.984375, + "learning_rate": 2.0274078056709247e-06, + "loss": 1.119405746459961, + "step": 6264 + }, + { + "epoch": 1.9286648711042709, + "grad_norm": 5.59375, + "learning_rate": 2.026943797222681e-06, + "loss": 1.1718242168426514, + "step": 6266 + }, + { + "epoch": 1.9292804924971143, + "grad_norm": 9.1875, + "learning_rate": 2.0264837368894454e-06, + "loss": 1.274290680885315, + "step": 6268 + }, + { + "epoch": 1.9298961138899577, + "grad_norm": 5.65625, + "learning_rate": 2.02602762512839e-06, + "loss": 1.0971585512161255, + "step": 6270 + }, + { + "epoch": 1.930511735282801, + "grad_norm": 12.0625, + "learning_rate": 2.0255754623927635e-06, + "loss": 1.0695799589157104, + "step": 6272 + }, + { + "epoch": 1.9311273566756446, + "grad_norm": 6.78125, + "learning_rate": 2.0251272491318906e-06, + "loss": 1.2456305027008057, + "step": 6274 + }, + { + "epoch": 1.9317429780684878, + "grad_norm": 7.71875, + "learning_rate": 2.02468298579117e-06, + "loss": 1.5815218687057495, + "step": 6276 + }, + { + "epoch": 1.9323585994613313, + "grad_norm": 4.84375, + "learning_rate": 2.0242426728120766e-06, + "loss": 1.1808098554611206, + "step": 6278 + }, + { + "epoch": 1.9329742208541747, + "grad_norm": 5.15625, + "learning_rate": 2.0238063106321583e-06, + "loss": 1.447092056274414, + "step": 6280 + }, + { + "epoch": 1.9335898422470181, + "grad_norm": 4.4375, + "learning_rate": 2.02337389968504e-06, + "loss": 1.1663380861282349, + "step": 6282 + }, + { + "epoch": 1.9342054636398616, + "grad_norm": 8.75, + "learning_rate": 2.022945440400416e-06, + "loss": 1.5715312957763672, + "step": 6284 + }, + { + "epoch": 1.9348210850327048, + "grad_norm": 5.96875, + "learning_rate": 2.0225209332040576e-06, + "loss": 1.3347316980361938, + "step": 6286 + }, + { + "epoch": 1.9354367064255484, + "grad_norm": 13.5625, + "learning_rate": 2.022100378517806e-06, + "loss": 1.386270523071289, + "step": 6288 + }, + { + "epoch": 1.9360523278183916, + "grad_norm": 9.1875, + "learning_rate": 2.021683776759576e-06, + "loss": 1.545689582824707, + "step": 6290 + }, + { + "epoch": 1.936667949211235, + "grad_norm": 4.40625, + "learning_rate": 2.0212711283433544e-06, + "loss": 1.096389651298523, + "step": 6292 + }, + { + "epoch": 1.9372835706040785, + "grad_norm": 3.53125, + "learning_rate": 2.0208624336791993e-06, + "loss": 0.9402706623077393, + "step": 6294 + }, + { + "epoch": 1.9378991919969217, + "grad_norm": 4.6875, + "learning_rate": 2.020457693173239e-06, + "loss": 1.2515463829040527, + "step": 6296 + }, + { + "epoch": 1.9385148133897654, + "grad_norm": 3.5, + "learning_rate": 2.0200569072276744e-06, + "loss": 1.211615800857544, + "step": 6298 + }, + { + "epoch": 1.9391304347826086, + "grad_norm": 5.3125, + "learning_rate": 2.0196600762407745e-06, + "loss": 1.3675752878189087, + "step": 6300 + }, + { + "epoch": 1.9397460561754523, + "grad_norm": 6.8125, + "learning_rate": 2.0192672006068795e-06, + "loss": 1.6202188730239868, + "step": 6302 + }, + { + "epoch": 1.9403616775682955, + "grad_norm": 4.25, + "learning_rate": 2.0188782807163983e-06, + "loss": 1.3375134468078613, + "step": 6304 + }, + { + "epoch": 1.940977298961139, + "grad_norm": 32.75, + "learning_rate": 2.0184933169558103e-06, + "loss": 1.153170108795166, + "step": 6306 + }, + { + "epoch": 1.9415929203539823, + "grad_norm": 4.96875, + "learning_rate": 2.018112309707662e-06, + "loss": 1.559049367904663, + "step": 6308 + }, + { + "epoch": 1.9422085417468256, + "grad_norm": 4.28125, + "learning_rate": 2.017735259350568e-06, + "loss": 1.4401798248291016, + "step": 6310 + }, + { + "epoch": 1.9428241631396692, + "grad_norm": 12.5625, + "learning_rate": 2.0173621662592142e-06, + "loss": 1.2643327713012695, + "step": 6312 + }, + { + "epoch": 1.9434397845325124, + "grad_norm": 7.28125, + "learning_rate": 2.0169930308043482e-06, + "loss": 1.4889107942581177, + "step": 6314 + }, + { + "epoch": 1.9440554059253559, + "grad_norm": 2.0, + "learning_rate": 2.016627853352791e-06, + "loss": 1.2440648078918457, + "step": 6316 + }, + { + "epoch": 1.9446710273181993, + "grad_norm": 9.25, + "learning_rate": 2.0162666342674265e-06, + "loss": 1.2304495573043823, + "step": 6318 + }, + { + "epoch": 1.9452866487110427, + "grad_norm": 11.6875, + "learning_rate": 2.0159093739072054e-06, + "loss": 1.1931729316711426, + "step": 6320 + }, + { + "epoch": 1.9459022701038862, + "grad_norm": 6.65625, + "learning_rate": 2.015556072627147e-06, + "loss": 1.0525994300842285, + "step": 6322 + }, + { + "epoch": 1.9465178914967294, + "grad_norm": 15.5625, + "learning_rate": 2.0152067307783333e-06, + "loss": 1.3192481994628906, + "step": 6324 + }, + { + "epoch": 1.947133512889573, + "grad_norm": 2.90625, + "learning_rate": 2.014861348707914e-06, + "loss": 1.18320631980896, + "step": 6326 + }, + { + "epoch": 1.9477491342824163, + "grad_norm": 5.90625, + "learning_rate": 2.0145199267591025e-06, + "loss": 1.2805728912353516, + "step": 6328 + }, + { + "epoch": 1.9483647556752597, + "grad_norm": 5.4375, + "learning_rate": 2.014182465271178e-06, + "loss": 1.586653470993042, + "step": 6330 + }, + { + "epoch": 1.9489803770681031, + "grad_norm": 8.25, + "learning_rate": 2.0138489645794826e-06, + "loss": 0.7887523174285889, + "step": 6332 + }, + { + "epoch": 1.9495959984609466, + "grad_norm": 11.25, + "learning_rate": 2.013519425015424e-06, + "loss": 1.597000241279602, + "step": 6334 + }, + { + "epoch": 1.95021161985379, + "grad_norm": 6.0, + "learning_rate": 2.0131938469064734e-06, + "loss": 0.9804918169975281, + "step": 6336 + }, + { + "epoch": 1.9508272412466332, + "grad_norm": 5.625, + "learning_rate": 2.0128722305761646e-06, + "loss": 1.2448334693908691, + "step": 6338 + }, + { + "epoch": 1.9514428626394769, + "grad_norm": 13.75, + "learning_rate": 2.0125545763440953e-06, + "loss": 1.201894998550415, + "step": 6340 + }, + { + "epoch": 1.95205848403232, + "grad_norm": 9.9375, + "learning_rate": 2.012240884525925e-06, + "loss": 1.6245787143707275, + "step": 6342 + }, + { + "epoch": 1.9526741054251635, + "grad_norm": 6.4375, + "learning_rate": 2.0119311554333766e-06, + "loss": 1.30740487575531, + "step": 6344 + }, + { + "epoch": 1.953289726818007, + "grad_norm": 6.25, + "learning_rate": 2.011625389374235e-06, + "loss": 1.4148385524749756, + "step": 6346 + }, + { + "epoch": 1.9539053482108504, + "grad_norm": 4.6875, + "learning_rate": 2.011323586652347e-06, + "loss": 1.0704916715621948, + "step": 6348 + }, + { + "epoch": 1.9545209696036938, + "grad_norm": 3.0625, + "learning_rate": 2.0110257475676203e-06, + "loss": 1.1448662281036377, + "step": 6350 + }, + { + "epoch": 1.955136590996537, + "grad_norm": 8.5, + "learning_rate": 2.0107318724160245e-06, + "loss": 1.4039653539657593, + "step": 6352 + }, + { + "epoch": 1.9557522123893807, + "grad_norm": 8.6875, + "learning_rate": 2.0104419614895896e-06, + "loss": 0.9554769992828369, + "step": 6354 + }, + { + "epoch": 1.956367833782224, + "grad_norm": 7.40625, + "learning_rate": 2.0101560150764067e-06, + "loss": 1.112485408782959, + "step": 6356 + }, + { + "epoch": 1.9569834551750673, + "grad_norm": 3.921875, + "learning_rate": 2.0098740334606277e-06, + "loss": 1.2079534530639648, + "step": 6358 + }, + { + "epoch": 1.9575990765679108, + "grad_norm": 6.5625, + "learning_rate": 2.0095960169224635e-06, + "loss": 1.0168330669403076, + "step": 6360 + }, + { + "epoch": 1.958214697960754, + "grad_norm": 9.9375, + "learning_rate": 2.0093219657381857e-06, + "loss": 1.4579648971557617, + "step": 6362 + }, + { + "epoch": 1.9588303193535976, + "grad_norm": 12.0625, + "learning_rate": 2.0090518801801244e-06, + "loss": 1.2450551986694336, + "step": 6364 + }, + { + "epoch": 1.9594459407464409, + "grad_norm": 5.25, + "learning_rate": 2.0087857605166704e-06, + "loss": 1.1469169855117798, + "step": 6366 + }, + { + "epoch": 1.9600615621392843, + "grad_norm": 1.8359375, + "learning_rate": 2.0085236070122728e-06, + "loss": 1.1382040977478027, + "step": 6368 + }, + { + "epoch": 1.9606771835321277, + "grad_norm": 5.46875, + "learning_rate": 2.008265419927439e-06, + "loss": 1.0705983638763428, + "step": 6370 + }, + { + "epoch": 1.9612928049249712, + "grad_norm": 9.25, + "learning_rate": 2.0080111995187354e-06, + "loss": 1.1481586694717407, + "step": 6372 + }, + { + "epoch": 1.9619084263178146, + "grad_norm": 9.3125, + "learning_rate": 2.0077609460387866e-06, + "loss": 1.5089284181594849, + "step": 6374 + }, + { + "epoch": 1.9625240477106578, + "grad_norm": 4.0, + "learning_rate": 2.007514659736275e-06, + "loss": 1.2592695951461792, + "step": 6376 + }, + { + "epoch": 1.9631396691035015, + "grad_norm": 4.8125, + "learning_rate": 2.007272340855941e-06, + "loss": 1.014017105102539, + "step": 6378 + }, + { + "epoch": 1.9637552904963447, + "grad_norm": 7.3125, + "learning_rate": 2.0070339896385823e-06, + "loss": 1.0303841829299927, + "step": 6380 + }, + { + "epoch": 1.964370911889188, + "grad_norm": 11.4375, + "learning_rate": 2.006799606321054e-06, + "loss": 1.0867152214050293, + "step": 6382 + }, + { + "epoch": 1.9649865332820315, + "grad_norm": 10.75, + "learning_rate": 2.0065691911362674e-06, + "loss": 1.5220026969909668, + "step": 6384 + }, + { + "epoch": 1.965602154674875, + "grad_norm": 26.5, + "learning_rate": 2.0063427443131915e-06, + "loss": 1.254058837890625, + "step": 6386 + }, + { + "epoch": 1.9662177760677184, + "grad_norm": 8.375, + "learning_rate": 2.006120266076852e-06, + "loss": 1.0503437519073486, + "step": 6388 + }, + { + "epoch": 1.9668333974605616, + "grad_norm": 12.0, + "learning_rate": 2.00590175664833e-06, + "loss": 1.8246984481811523, + "step": 6390 + }, + { + "epoch": 1.9674490188534053, + "grad_norm": 2.390625, + "learning_rate": 2.0056872162447636e-06, + "loss": 1.23115873336792, + "step": 6392 + }, + { + "epoch": 1.9680646402462485, + "grad_norm": 5.9375, + "learning_rate": 2.0054766450793462e-06, + "loss": 1.1373382806777954, + "step": 6394 + }, + { + "epoch": 1.968680261639092, + "grad_norm": 4.71875, + "learning_rate": 2.0052700433613277e-06, + "loss": 1.1563501358032227, + "step": 6396 + }, + { + "epoch": 1.9692958830319354, + "grad_norm": 6.5625, + "learning_rate": 2.005067411296011e-06, + "loss": 1.4384610652923584, + "step": 6398 + }, + { + "epoch": 1.9699115044247788, + "grad_norm": 10.8125, + "learning_rate": 2.0048687490847585e-06, + "loss": 1.4476401805877686, + "step": 6400 + }, + { + "epoch": 1.9705271258176222, + "grad_norm": 12.5, + "learning_rate": 2.004674056924984e-06, + "loss": 1.4745994806289673, + "step": 6402 + }, + { + "epoch": 1.9711427472104655, + "grad_norm": 12.4375, + "learning_rate": 2.004483335010158e-06, + "loss": 1.651023268699646, + "step": 6404 + }, + { + "epoch": 1.971758368603309, + "grad_norm": 6.78125, + "learning_rate": 2.0042965835298043e-06, + "loss": 1.414797067642212, + "step": 6406 + }, + { + "epoch": 1.9723739899961523, + "grad_norm": 4.25, + "learning_rate": 2.0041138026695024e-06, + "loss": 0.9318125247955322, + "step": 6408 + }, + { + "epoch": 1.9729896113889958, + "grad_norm": 7.03125, + "learning_rate": 2.0039349926108864e-06, + "loss": 1.1416950225830078, + "step": 6410 + }, + { + "epoch": 1.9736052327818392, + "grad_norm": 4.0625, + "learning_rate": 2.003760153531643e-06, + "loss": 1.0876226425170898, + "step": 6412 + }, + { + "epoch": 1.9742208541746824, + "grad_norm": 11.1875, + "learning_rate": 2.0035892856055144e-06, + "loss": 1.3485053777694702, + "step": 6414 + }, + { + "epoch": 1.974836475567526, + "grad_norm": 9.0, + "learning_rate": 2.0034223890022954e-06, + "loss": 1.4001414775848389, + "step": 6416 + }, + { + "epoch": 1.9754520969603693, + "grad_norm": 5.375, + "learning_rate": 2.003259463887835e-06, + "loss": 1.4163827896118164, + "step": 6418 + }, + { + "epoch": 1.976067718353213, + "grad_norm": 6.28125, + "learning_rate": 2.0031005104240356e-06, + "loss": 1.1511523723602295, + "step": 6420 + }, + { + "epoch": 1.9766833397460561, + "grad_norm": 6.90625, + "learning_rate": 2.002945528768853e-06, + "loss": 1.307472825050354, + "step": 6422 + }, + { + "epoch": 1.9772989611388996, + "grad_norm": 5.5, + "learning_rate": 2.002794519076296e-06, + "loss": 1.4973814487457275, + "step": 6424 + }, + { + "epoch": 1.977914582531743, + "grad_norm": 4.53125, + "learning_rate": 2.002647481496425e-06, + "loss": 1.3886406421661377, + "step": 6426 + }, + { + "epoch": 1.9785302039245862, + "grad_norm": 5.46875, + "learning_rate": 2.002504416175357e-06, + "loss": 1.3384922742843628, + "step": 6428 + }, + { + "epoch": 1.9791458253174299, + "grad_norm": 13.6875, + "learning_rate": 2.0023653232552565e-06, + "loss": 2.076946973800659, + "step": 6430 + }, + { + "epoch": 1.979761446710273, + "grad_norm": 10.6875, + "learning_rate": 2.0022302028743457e-06, + "loss": 1.7353794574737549, + "step": 6432 + }, + { + "epoch": 1.9803770681031165, + "grad_norm": 8.25, + "learning_rate": 2.002099055166895e-06, + "loss": 1.3509681224822998, + "step": 6434 + }, + { + "epoch": 1.98099268949596, + "grad_norm": 6.1875, + "learning_rate": 2.00197188026323e-06, + "loss": 1.244296669960022, + "step": 6436 + }, + { + "epoch": 1.9816083108888034, + "grad_norm": 66.5, + "learning_rate": 2.0018486782897257e-06, + "loss": 1.6251654624938965, + "step": 6438 + }, + { + "epoch": 1.9822239322816468, + "grad_norm": 4.96875, + "learning_rate": 2.0017294493688128e-06, + "loss": 0.9943356513977051, + "step": 6440 + }, + { + "epoch": 1.98283955367449, + "grad_norm": 7.34375, + "learning_rate": 2.00161419361897e-06, + "loss": 0.9533267617225647, + "step": 6442 + }, + { + "epoch": 1.9834551750673337, + "grad_norm": 4.59375, + "learning_rate": 2.0015029111547304e-06, + "loss": 1.4887453317642212, + "step": 6444 + }, + { + "epoch": 1.984070796460177, + "grad_norm": 6.84375, + "learning_rate": 2.0013956020866772e-06, + "loss": 0.9633210897445679, + "step": 6446 + }, + { + "epoch": 1.9846864178530204, + "grad_norm": 5.84375, + "learning_rate": 2.001292266521446e-06, + "loss": 1.1440017223358154, + "step": 6448 + }, + { + "epoch": 1.9853020392458638, + "grad_norm": 4.09375, + "learning_rate": 2.0011929045617252e-06, + "loss": 1.0662113428115845, + "step": 6450 + }, + { + "epoch": 1.9859176606387072, + "grad_norm": 7.84375, + "learning_rate": 2.0010975163062508e-06, + "loss": 1.2532540559768677, + "step": 6452 + }, + { + "epoch": 1.9865332820315507, + "grad_norm": 19.125, + "learning_rate": 2.001006101849813e-06, + "loss": 0.6765220165252686, + "step": 6454 + }, + { + "epoch": 1.9871489034243939, + "grad_norm": 3.3125, + "learning_rate": 2.0009186612832533e-06, + "loss": 1.2371511459350586, + "step": 6456 + }, + { + "epoch": 1.9877645248172375, + "grad_norm": 3.5625, + "learning_rate": 2.000835194693462e-06, + "loss": 1.330336332321167, + "step": 6458 + }, + { + "epoch": 1.9883801462100807, + "grad_norm": 11.9375, + "learning_rate": 2.000755702163383e-06, + "loss": 1.3997299671173096, + "step": 6460 + }, + { + "epoch": 1.9889957676029242, + "grad_norm": 7.125, + "learning_rate": 2.000680183772008e-06, + "loss": 1.324434518814087, + "step": 6462 + }, + { + "epoch": 1.9896113889957676, + "grad_norm": 7.0625, + "learning_rate": 2.0006086395943834e-06, + "loss": 1.0115548372268677, + "step": 6464 + }, + { + "epoch": 1.990227010388611, + "grad_norm": 5.0, + "learning_rate": 2.0005410697016033e-06, + "loss": 1.1448062658309937, + "step": 6466 + }, + { + "epoch": 1.9908426317814545, + "grad_norm": 14.5625, + "learning_rate": 2.0004774741608126e-06, + "loss": 0.9839100241661072, + "step": 6468 + }, + { + "epoch": 1.9914582531742977, + "grad_norm": 6.40625, + "learning_rate": 2.0004178530352093e-06, + "loss": 1.489912986755371, + "step": 6470 + }, + { + "epoch": 1.9920738745671414, + "grad_norm": 11.5625, + "learning_rate": 2.000362206384039e-06, + "loss": 1.4942373037338257, + "step": 6472 + }, + { + "epoch": 1.9926894959599846, + "grad_norm": 7.53125, + "learning_rate": 2.0003105342625993e-06, + "loss": 1.1496572494506836, + "step": 6474 + }, + { + "epoch": 1.993305117352828, + "grad_norm": 10.3125, + "learning_rate": 2.0002628367222387e-06, + "loss": 1.4847123622894287, + "step": 6476 + }, + { + "epoch": 1.9939207387456714, + "grad_norm": 8.0625, + "learning_rate": 2.0002191138103544e-06, + "loss": 1.662402629852295, + "step": 6478 + }, + { + "epoch": 1.9945363601385147, + "grad_norm": 11.3125, + "learning_rate": 2.000179365570395e-06, + "loss": 1.5024702548980713, + "step": 6480 + }, + { + "epoch": 1.9951519815313583, + "grad_norm": 14.125, + "learning_rate": 2.000143592041859e-06, + "loss": 0.7729127407073975, + "step": 6482 + }, + { + "epoch": 1.9957676029242015, + "grad_norm": 2.984375, + "learning_rate": 2.0001117932602966e-06, + "loss": 0.9017384648323059, + "step": 6484 + }, + { + "epoch": 1.9963832243170452, + "grad_norm": 26.125, + "learning_rate": 2.0000839692573048e-06, + "loss": 1.2910041809082031, + "step": 6486 + }, + { + "epoch": 1.9969988457098884, + "grad_norm": 1.8671875, + "learning_rate": 2.000060120060535e-06, + "loss": 0.4969649910926819, + "step": 6488 + }, + { + "epoch": 1.9976144671027318, + "grad_norm": 8.5625, + "learning_rate": 2.0000402456936858e-06, + "loss": 1.0192922353744507, + "step": 6490 + }, + { + "epoch": 1.9982300884955753, + "grad_norm": 15.6875, + "learning_rate": 2.0000243461765068e-06, + "loss": 1.1761404275894165, + "step": 6492 + }, + { + "epoch": 1.9988457098884185, + "grad_norm": 85.0, + "learning_rate": 2.000012421524798e-06, + "loss": 1.2855005264282227, + "step": 6494 + }, + { + "epoch": 1.9994613312812621, + "grad_norm": 7.09375, + "learning_rate": 2.0000044717504087e-06, + "loss": 1.3793665170669556, + "step": 6496 + }, + { + "epoch": 2.0, + "grad_norm": 19.125, + "learning_rate": 2.000000496861239e-06, + "loss": 1.2610743045806885, + "step": 6498 + }, + { + "epoch": 2.0, + "step": 6498, + "total_flos": 2.5760029558366536e+18, + "train_loss": 1.2936220749611853, + "train_runtime": 22787.4373, + "train_samples_per_second": 1.141, + "train_steps_per_second": 0.285 + } + ], + "logging_steps": 2, + "max_steps": 6498, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 9999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.5760029558366536e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}